diff --git a/edu/.nbd/tickets/1f995a.md b/edu/.nbd/tickets/1f995a.md new file mode 100644 index 0000000..ed85370 --- /dev/null +++ b/edu/.nbd/tickets/1f995a.md @@ -0,0 +1,8 @@ ++++ +title = "Markov exercise: N-gram Generalization (Rust)" +priority = 7 +status = "done" +ticket_type = "task" +dependencies = [] ++++ +Write Section 8 of edu/markov.md: Exercise 4 — N-gram Generalization\n\nLearning objectives:\n- Generalize from bigrams to arbitrary-order n-gram chains\n- Use Vec<String> as a HashMap key (or a joined string)\n- Empirically compare output quality for n = 1, 2, 3, 4\n\nContent to produce:\n- Setup instructions (extend Exercise 3 project)\n- Step-by-step hints:\n 1. Modify train to use a sliding window of n words as the key\n 2. Modify generate to maintain a deque/window of the last n words\n 3. Run on the same corpus with n = 1, 2, 3, 4 and print 50 words each\n 4. Discuss observations: when does it start memorising the corpus?\n- Full reference solution\n- Stretch goal: implement character-level n-grams instead of word-level\n\nTarget: replace the stub in edu/markov.md §8 \ No newline at end of file diff --git a/edu/markov.md b/edu/markov.md index a0f09a9..573ec34 100644 --- a/edu/markov.md +++ b/edu/markov.md @@ -386,7 +386,26 @@ impl BigramModel { **Goal:** Generalize the bigram model to an *n*-gram model where each state is a window of *n* consecutive words. Compare the output quality for *n* = 1, 2, 3, and 4 on the same corpus. +#### Setup + +Extend the project from Exercise 3, or create a fresh one: + +```sh +cargo new ngram-chain +cd ngram-chain +cargo add rand +``` + +The `NgramModel` stores transitions keyed by `Vec<String>` — a window of *n* consecutive words. `Vec<String>` implements `Hash` and `Eq`, so it works directly as a `HashMap` key with no extra wrapping. 
+ +#### Starter Code + +Replace (or extend) `src/main.rs` with the following skeleton: + ```rust +use rand::Rng; +use std::collections::HashMap; + struct NgramModel { n: usize, transitions: HashMap<Vec<String>, Vec<(String, usize)>>, @@ -396,9 +415,239 @@ impl NgramModel { fn train(corpus: &str, n: usize) -> Self { todo!() } fn generate(&self, seed: Vec<String>, length: usize, rng: &mut impl Rng) -> Vec<String> { todo!() } } + +fn main() { todo!() } +``` + +#### Step 1 — Tokenize and build the transition table + +Inside `train`, convert the corpus into a flat list of lowercase words: + +```rust +let words: Vec<String> = corpus + .split_whitespace() + .map(|s| s.to_lowercase()) + .collect(); +``` + +Then iterate over **sliding windows** of size `n + 1`. The first `n` elements form the key (the context); the last element is the successor word to record: + +```rust +for window in words.windows(n + 1) { + let key: Vec<String> = window[..n].to_vec(); + let next: String = window[n].clone(); + // insert (key → next) into the transition table +} ``` -> 🚧 This section is a stub — see nbd ticket `1f995a` +Use `HashMap::entry(...).or_default()` to get-or-insert the successor list. Scan for an existing entry for `next` and increment its count, or push `(next, 1)` if it is new. + +#### Step 2 — Weighted sampling helper + +Reuse the same cumulative-weight technique from Exercise 3. The helper below accepts any slice of `(word, count)` pairs and returns a randomly chosen word with probability proportional to its count: + +```rust +fn sample<'a>(choices: &'a [(String, usize)], rng: &mut impl Rng) -> &'a str { + let total: usize = choices.iter().map(|(_, w)| w).sum(); + let mut r = rng.gen_range(0..total); + for (word, weight) in choices { + if r < *weight { + return word; + } + r -= weight; + } + &choices.last().unwrap().0 +} +``` + +#### Step 3 — Implement `NgramModel::generate` + +`generate` is called with a **seed** — a `Vec<String>` of exactly `n` words. 
It extends that seed word by word, up to `length` additional words, using a sliding window to track the current context: + +1. Copy the seed into `output` and into a `VecDeque<String>` called `window`. +2. Each step: collect `window` into a `Vec<String>` to use as the lookup key. +3. If the key is missing from `self.transitions`, stop early — the chain has reached a dead end. +4. Sample the next word, push it onto `output`, pop the oldest word from `window`, push the new word. + +```rust +use std::collections::VecDeque; + +// inside generate: +let mut window: VecDeque<String> = VecDeque::from(seed.clone()); +let mut output = seed; + +for _ in 0..length { + let key: Vec<String> = window.iter().cloned().collect(); + match self.transitions.get(&key) { + None => break, + Some(choices) => { + let next = sample(choices, rng).to_string(); + output.push(next.clone()); + window.pop_front(); + window.push_back(next); + } + } +} +output +``` + +#### Step 4 — Compare *n* = 1, 2, 3, 4 + +In `main`, train one model per value of `n` and generate 50 additional words from each. Use a fixed RNG seed for reproducibility. The short Alice corpus below is enough to observe the trend; swap in a larger public-domain text (e.g., the first chapter of *Alice's Adventures in Wonderland* from Project Gutenberg) for more interesting output. 
+ +```rust +const CORPUS: &str = + "alice was beginning to get very tired of sitting by her sister on the \ + bank and of having nothing to do once or twice she had peeped into the \ + book her sister was reading but it had no pictures or conversations in \ + it and what is the use of a book thought alice without pictures or \ + conversations alice was beginning to get very tired of sitting"; + +fn main() { + use rand::{SeedableRng, rngs::SmallRng}; + + for n in 1..=4 { + let model = NgramModel::train(CORPUS, n); + let mut rng = SmallRng::seed_from_u64(42); + + // seed = the first n words of the corpus + let seed: Vec<String> = CORPUS + .split_whitespace() + .take(n) + .map(|s| s.to_lowercase()) + .collect(); + + let words = model.generate(seed, 50, &mut rng); + println!("n={}: {}", n, words.join(" ")); + println!(); + } +} +``` + +Expected observations: + +- **n = 1:** one word of context, which makes this equivalent to a bigram model; every adjacent pair appeared in the corpus, so individual transitions feel plausible, but the topic shifts erratically and longer runs read like word soup. +- **n = 2:** two words of context (a trigram model in conventional terminology); every generated three-word sequence appeared in the corpus, so longer coherent stretches emerge. +- **n = 3:** three words of context; most contexts now have a single continuation, and you will start to recognise verbatim phrases from the corpus. +- **n = 4:** on a small corpus, most 4-word contexts appear only once, leaving the model no real choice but to reproduce the training text nearly verbatim. Try a larger corpus to see *n* = 4 produce novel output. + +#### Step 5 — Memorisation vs novelty + +The pattern above is the central tension in all *n*-gram language models: + +- **Small *n*:** short context → many plausible continuations → high novelty, low coherence. +- **Large *n*:** long context → typically a unique continuation → low novelty, high local fidelity to the corpus. + +The corpus size determines the crossover point. For a paragraph-sized text, *n* = 2 is usually the maximum useful order. 
For a novel-length corpus, *n* = 4 or 5 can produce readable, novel output without simply transcribing the source. + +#### Stretch Goal — Character-level *n*-grams + +Swap words for individual characters: tokenize with `.chars()` instead of `.split_whitespace()`. The model and sampling logic are unchanged — only the "token" definition shifts from words to characters: + +```rust +// Character-level tokenization for train: +let chars: Vec<String> = corpus.chars().map(|c| c.to_string()).collect(); +// Replace `words` with `chars` and proceed identically. +``` + +Generate 200 characters at *n* = 3, 5, and 8. You will see the model progress from random letter sequences, to plausible letter clusters and syllables, to recognisable words and phrases as *n* grows. This also demonstrates that the *n*-gram approach is domain-agnostic: the same code works for words, characters, DNA bases, MIDI note sequences, or any discrete token stream. + +#### Reference Solution + +<details>
+<summary>Show full solution</summary> + +```rust +use rand::{Rng, SeedableRng, rngs::SmallRng}; +use std::collections::{HashMap, VecDeque}; + +struct NgramModel { + n: usize, + transitions: HashMap<Vec<String>, Vec<(String, usize)>>, +} + +impl NgramModel { + fn train(corpus: &str, n: usize) -> Self { + let words: Vec<String> = corpus + .split_whitespace() + .map(|s| s.to_lowercase()) + .collect(); + + let mut transitions: HashMap<Vec<String>, Vec<(String, usize)>> = HashMap::new(); + + for window in words.windows(n + 1) { + let key: Vec<String> = window[..n].to_vec(); + let next = window[n].clone(); + let entry = transitions.entry(key).or_default(); + if let Some(pair) = entry.iter_mut().find(|(w, _)| w == &next) { + pair.1 += 1; + } else { + entry.push((next, 1)); + } + } + + NgramModel { n, transitions } + } + + fn generate(&self, seed: Vec<String>, length: usize, rng: &mut impl Rng) -> Vec<String> { + assert_eq!(seed.len(), self.n, "seed must have exactly n words"); + let mut window: VecDeque<String> = VecDeque::from(seed.clone()); + let mut output = seed; + + for _ in 0..length { + let key: Vec<String> = window.iter().cloned().collect(); + match self.transitions.get(&key) { + None => break, + Some(choices) => { + let next = sample(choices, rng).to_string(); + output.push(next.clone()); + window.pop_front(); + window.push_back(next); + } + } + } + output + } +} + +fn sample<'a>(choices: &'a [(String, usize)], rng: &mut impl Rng) -> &'a str { + let total: usize = choices.iter().map(|(_, w)| w).sum(); + let mut r = rng.gen_range(0..total); + for (word, weight) in choices { + if r < *weight { + return word; + } + r -= weight; + } + &choices.last().unwrap().0 +} + +const CORPUS: &str = + "alice was beginning to get very tired of sitting by her sister on the \ + bank and of having nothing to do once or twice she had peeped into the \ + book her sister was reading but it had no pictures or conversations in \ + it and what is the use of a book thought alice without pictures or \ + conversations alice was beginning to get very tired of sitting"; + +fn 
main() { + for n in 1..=4 { + let model = NgramModel::train(CORPUS, n); + let mut rng = SmallRng::seed_from_u64(42); + + let seed: Vec<String> = CORPUS + .split_whitespace() + .take(n) + .map(|s| s.to_lowercase()) + .collect(); + + let words = model.generate(seed, 50, &mut rng); + println!("n={}: {}", n, words.join(" ")); + println!(); + } +} +``` + +</details>
---