diff --git a/edu/.nbd/tickets/1ef9f4.md b/edu/.nbd/tickets/1ef9f4.md index edb6308..94152ff 100644 --- a/edu/.nbd/tickets/1ef9f4.md +++ b/edu/.nbd/tickets/1ef9f4.md @@ -1,7 +1,7 @@ +++ title = "§10 Exercise 3: Semantic Document Search" priority = 5 -status = "todo" +status = "done" ticket_type = "task" dependencies = [] +++ diff --git a/edu/src/vector-db.md b/edu/src/vector-db.md index f9be761..f45b642 100644 --- a/edu/src/vector-db.md +++ b/edu/src/vector-db.md @@ -900,7 +900,281 @@ async fn embed_texts(texts: Vec) -> anyhow::Result>> { ### 10. Exercise 3 — Semantic Document Search -**Goal:** Build a complete semantic search pipeline: embed a small corpus of text documents, store the embeddings in Turso, then accept a natural-language query, embed it, and return the top-*k* most relevant documents using vector similarity — all without any keyword matching. 🚧 Full content tracked in [nbd:1ef9f4]. +**Goal:** Embed a corpus of 15 short text passages with fastembed-rs, store the embeddings in Turso, then accept a natural-language query, embed it, and return the top-5 most semantically relevant passages — with no keyword matching. + +#### Setup + +Create a new project (or extend your existing `vec-demo` crate). Your `Cargo.toml` dependencies: + +```toml +[dependencies] +libsql = "0.9" +fastembed = "4" +tokio = { version = "1", features = ["full"] } +serde_json = "1" +``` + +The table schema uses `F32_BLOB(384)` because BGE-Small-EN-v1.5 produces 384-dimensional embeddings: + +```sql +CREATE TABLE IF NOT EXISTS docs ( + id INTEGER PRIMARY KEY, + passage TEXT NOT NULL, + embedding F32_BLOB(384) NOT NULL +) +``` + +#### Corpus + +Use these 15 passages spanning three topics. + +**Rust programming (5):** + +1. "Rust uses an ownership system to guarantee memory safety without a garbage collector." +2. "The borrow checker enforces that references do not outlive the data they point to." +3. 
"Cargo is Rust's build system and package manager, used to manage dependencies and run tests." +4. "Rust's trait system enables zero-cost abstractions and compile-time polymorphism." +5. "Async Rust uses futures and the tokio runtime to handle concurrent I/O efficiently." + +**Astronomy (5):** + +1. "A black hole is a region of spacetime where gravity is so strong that nothing can escape." +2. "The Milky Way galaxy contains an estimated 100 to 400 billion stars." +3. "Neutron stars are the collapsed cores of massive stars, with densities exceeding atomic nuclei." +4. "The cosmic microwave background is the thermal radiation left over from the early universe." +5. "Exoplanets are planets outside our solar system, detected via transit photometry or radial velocity." + +**Cooking (5):** + +1. "Maillard reaction gives browned foods their distinctive flavour through amino acid and sugar reactions." +2. "Sous vide cooking involves sealing food in vacuum bags and cooking at precise low temperatures." +3. "Emulsification combines two immiscible liquids, such as oil and water, using an emulsifier like lecithin." +4. "Fermentation converts sugars to acids or alcohol using microorganisms, used in bread, beer, and yogurt." +5. "Knife skills — julienne, brunoise, chiffonade — determine the surface area and cooking time of vegetables." + +#### Step 1 — Embed the corpus + +Use `fastembed::TextEmbedding` with the default model (BGE-Small-EN-v1.5) to embed all 15 passages in a single `model.embed()` call. This returns a `Vec<Vec<f32>>` — one 384-dimensional vector per passage. + +```rust +use fastembed::{TextEmbedding, InitOptions, EmbeddingModel}; + +let model = TextEmbedding::try_new(InitOptions { + model_name: EmbeddingModel::BGESmallENV15, + show_download_progress: true, + ..Default::default() +})?; + +let embeddings = model.embed(passages.clone(), None)?; +``` + +#### Step 2 — Insert into Turso + +Loop over the passages and their corresponding embeddings. 
Convert each `Vec<f32>` to a JSON string so it can be passed to the `vector(?)` SQL function. Use `INSERT OR IGNORE` so re-runs are idempotent. + +```rust +fn vec_to_json(v: &[f32]) -> String { + let parts: Vec<String> = v.iter().map(|x| format!("{x}")).collect(); + format!("[{}]", parts.join(",")) +} + +for (i, (passage, emb)) in passages.iter().zip(embeddings.iter()).enumerate() { + let json = vec_to_json(emb); + conn.execute( + "INSERT OR IGNORE INTO docs (id, passage, embedding) VALUES (?, ?, vector(?))", + libsql::params![i as i64, passage.as_str(), json.as_str()], + ) + .await?; +} +``` + +#### Step 3 — Embed the query and search + +Embed the query string the same way you embedded the corpus — using `model.embed()` with a single-element `Vec`. Then run `vector_top_k('docs_idx', vector(?), 5)` and join back to the `docs` table to retrieve the passage text and cosine distance. The `docs_idx` vector index must already exist: create it once, right after the table, with `CREATE INDEX IF NOT EXISTS docs_idx ON docs (libsql_vector_idx(embedding))`. + +```rust +let query = "memory safety in systems programming"; +let q_emb = model.embed(vec![query.to_string()], None)?; +let q_json = vec_to_json(&q_emb[0]); + +let mut rows = conn + .query( + "SELECT d.passage, v.distance + FROM vector_top_k('docs_idx', vector(?), 5) AS v + JOIN docs AS d ON d.rowid = v.id + ORDER BY v.distance", + libsql::params![q_json.as_str()], + ) + .await?; +``` + +#### Step 4 — Run three queries and verify + +Run the following queries and confirm the results cluster by topic: + +| Query | Expected top results | +|---|---| +| `"memory safety in systems programming"` | Rust passages | +| `"stars and galaxies"` | Astronomy passages | +| `"fermentation and cooking techniques"` | Cooking passages | + +Print each result ranked by distance, showing the passage text and the cosine distance score: + +```rust +let queries = vec![ + "memory safety in systems programming", + "stars and galaxies", + "fermentation and cooking techniques", +]; + +for query in &queries { + println!("\n=== Query: \"{query}\" ===\n"); + let q_emb = model.embed(vec![query.to_string()], None)?; + let 
q_json = vec_to_json(&q_emb[0]); + + let mut rows = conn + .query( + "SELECT d.passage, v.distance + FROM vector_top_k('docs_idx', vector(?), 5) AS v + JOIN docs AS d ON d.rowid = v.id + ORDER BY v.distance", + libsql::params![q_json.as_str()], + ) + .await?; + + let mut rank = 1; + while let Some(row) = rows.next().await? { + let passage: String = row.get(0)?; + let distance: f64 = row.get(1)?; + println!(" {rank}. [{distance:.4}] {passage}"); + rank += 1; + } +} +``` + +#### Reference Solution + +<details>
+<summary>Show full solution</summary> + +```rust +// src/main.rs — Semantic Document Search (Exercise 3) + +use fastembed::{EmbeddingModel, InitOptions, TextEmbedding}; +use libsql::Builder; + +fn vec_to_json(v: &[f32]) -> String { + let parts: Vec<String> = v.iter().map(|x| format!("{x}")).collect(); + format!("[{}]", parts.join(",")) +} + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + // ── 1. Connect to Turso (local file) ── + let db = Builder::new_local("semantic_search.db").build().await?; + let conn = db.connect()?; + + // ── 2. Create the docs table ── + conn.execute( + "CREATE TABLE IF NOT EXISTS docs ( + id INTEGER PRIMARY KEY, + passage TEXT NOT NULL, + embedding F32_BLOB(384) NOT NULL + )", + (), + ) + .await?; + + // ── 3. Create the vector index ── + conn.execute( + "CREATE INDEX IF NOT EXISTS docs_idx ON docs (libsql_vector_idx(embedding))", + (), + ) + .await?; + + // ── 4. Define the corpus ── + let passages: Vec<String> = vec![ + // Rust programming + "Rust uses an ownership system to guarantee memory safety without a garbage collector.", + "The borrow checker enforces that references do not outlive the data they point to.", + "Cargo is Rust's build system and package manager, used to manage dependencies and run tests.", + "Rust's trait system enables zero-cost abstractions and compile-time polymorphism.", + "Async Rust uses futures and the tokio runtime to handle concurrent I/O efficiently.", + // Astronomy + "A black hole is a region of spacetime where gravity is so strong that nothing can escape.", + "The Milky Way galaxy contains an estimated 100 to 400 billion stars.", + "Neutron stars are the collapsed cores of massive stars, with densities exceeding atomic nuclei.", + "The cosmic microwave background is the thermal radiation left over from the early universe.", + "Exoplanets are planets outside our solar system, detected via transit photometry or radial velocity.", + // Cooking + "Maillard reaction gives browned foods their 
distinctive flavour through amino acid and sugar reactions.", + "Sous vide cooking involves sealing food in vacuum bags and cooking at precise low temperatures.", + "Emulsification combines two immiscible liquids, such as oil and water, using an emulsifier like lecithin.", + "Fermentation converts sugars to acids or alcohol using microorganisms, used in bread, beer, and yogurt.", + "Knife skills — julienne, brunoise, chiffonade — determine the surface area and cooking time of vegetables.", + ] + .into_iter() + .map(String::from) + .collect(); + + // ── 5. Embed the corpus ── + let model = TextEmbedding::try_new(InitOptions { + model_name: EmbeddingModel::BGESmallENV15, + show_download_progress: true, + ..Default::default() + })?; + + let embeddings = model.embed(passages.clone(), None)?; + + // ── 6. Insert passages + embeddings ── + for (i, (passage, emb)) in passages.iter().zip(embeddings.iter()).enumerate() { + let json = vec_to_json(emb); + conn.execute( + "INSERT OR IGNORE INTO docs (id, passage, embedding) VALUES (?, ?, vector(?))", + libsql::params![i as i64, passage.as_str(), json.as_str()], + ) + .await?; + } + + println!("Inserted {} passages.\n", passages.len()); + + // ── 7. Run three queries ── + let queries = vec![ + "memory safety in systems programming", + "stars and galaxies", + "fermentation and cooking techniques", + ]; + + for query in &queries { + println!("=== Query: \"{query}\" ===\n"); + let q_emb = model.embed(vec![query.to_string()], None)?; + let q_json = vec_to_json(&q_emb[0]); + + let mut rows = conn + .query( + "SELECT d.passage, v.distance + FROM vector_top_k('docs_idx', vector(?), 5) AS v + JOIN docs AS d ON d.rowid = v.id + ORDER BY v.distance", + libsql::params![q_json.as_str()], + ) + .await?; + + let mut rank = 1; + while let Some(row) = rows.next().await? { + let passage: String = row.get(0)?; + let distance: f64 = row.get(1)?; + println!(" {rank}. 
[{distance:.4}] {passage}"); + rank += 1; + } + println!(); + } + + Ok(()) +} +``` + +</details>
---