From 60c9fb67a802b93dac5a08c54c85b9b1d6998ea0 Mon Sep 17 00:00:00 2001 From: Elijah Voigt Date: Tue, 10 Mar 2026 14:05:45 -0700 Subject: [PATCH] =?UTF-8?q?docs(edu):=20write=20=C2=A711=20exercise=204=20?= =?UTF-8?q?recommendation=20engine=20for=20vector-db=20course=20[e8be9a]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 --- edu/src/vector-db.md | 279 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 278 insertions(+), 1 deletion(-) diff --git a/edu/src/vector-db.md b/edu/src/vector-db.md index f45b642..6a81ed8 100644 --- a/edu/src/vector-db.md +++ b/edu/src/vector-db.md @@ -1180,7 +1180,284 @@ async fn main() -> Result<(), Box> { ### 11. Exercise 4 — Recommendation Engine -**Goal:** Implement item-based collaborative filtering using vector similarity. Store item feature vectors (or learned item embeddings) in Turso, then given a target item, retrieve the *k* most similar items as recommendations. 🚧 Full content tracked in [nbd:e8be9a]. +**Goal:** Build an item-based recommendation engine. Store item feature vectors in Turso, then given a target item, find the *k* most similar items using KNN and exclude the query item from the results. + +We will use hand-crafted 5-dimensional feature vectors for a product catalogue (no fastembed dependency — this keeps the focus on the recommendation logic itself). The five dimensions represent affinity scores for: **[electronics, clothing, sports, food, books]**. 
+ +**Catalogue (10 items):** + +| id | name | embedding | +|---|---|---| +| 1 | Laptop | `[0.95, 0.0, 0.1, 0.0, 0.2]` | +| 2 | Mechanical Keyboard | `[0.85, 0.0, 0.0, 0.0, 0.1]` | +| 3 | USB-C Hub | `[0.9, 0.0, 0.0, 0.0, 0.0]` | +| 4 | Running Shoes | `[0.0, 0.6, 0.9, 0.0, 0.0]` | +| 5 | Yoga Mat | `[0.0, 0.2, 0.95, 0.0, 0.0]` | +| 6 | Water Bottle | `[0.1, 0.1, 0.7, 0.0, 0.0]` | +| 7 | T-Shirt | `[0.0, 0.95, 0.1, 0.0, 0.0]` | +| 8 | Cookbook | `[0.0, 0.0, 0.0, 0.6, 0.9]` | +| 9 | Protein Bar | `[0.0, 0.0, 0.3, 0.95, 0.0]` | +| 10 | Novel | `[0.0, 0.0, 0.0, 0.1, 0.95]` | + +#### Step 1 — Schema + +Create a `products` table and a DiskANN-backed vector index: + +```rust +conn.execute( + "CREATE TABLE IF NOT EXISTS products ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + embedding F32_BLOB(5) NOT NULL + )", + (), +) +.await?; + +conn.execute( + "CREATE INDEX IF NOT EXISTS products_idx + ON products (libsql_vector_idx(embedding))", + (), +) +.await?; +``` + +#### Step 2 — Insert items + +Use the same pattern as Exercise 1: format each `Vec<f32>` as a JSON array string and insert with `INSERT OR IGNORE`: + +```rust +let products: Vec<(i64, &str, Vec<f32>)> = vec![ + (1, "Laptop", vec![0.95, 0.0, 0.1, 0.0, 0.2]), + (2, "Mechanical Keyboard", vec![0.85, 0.0, 0.0, 0.0, 0.1]), + (3, "USB-C Hub", vec![0.9, 0.0, 0.0, 0.0, 0.0]), + (4, "Running Shoes", vec![0.0, 0.6, 0.9, 0.0, 0.0]), + (5, "Yoga Mat", vec![0.0, 0.2, 0.95, 0.0, 0.0]), + (6, "Water Bottle", vec![0.1, 0.1, 0.7, 0.0, 0.0]), + (7, "T-Shirt", vec![0.0, 0.95, 0.1, 0.0, 0.0]), + (8, "Cookbook", vec![0.0, 0.0, 0.0, 0.6, 0.9]), + (9, "Protein Bar", vec![0.0, 0.0, 0.3, 0.95, 0.0]), + (10, "Novel", vec![0.0, 0.0, 0.0, 0.1, 0.95]), +]; + +for (id, name, emb) in &products { + let emb_json = serde_json::to_string(emb)?; + conn.execute( + "INSERT OR IGNORE INTO products (id, name, embedding) + VALUES (?, ?, vector(?))", + libsql::params![*id, *name, emb_json.as_str()], + ) + .await?; +} +``` + +#### Step 3 — Recommend function + +Write a 
helper that retrieves recommendations for a given item: + +```rust +async fn recommend( + conn: &libsql::Connection, + item_id: i64, + k: usize, +) -> Result<Vec<(String, f64)>, Box<dyn std::error::Error>> { + // 1. Get the query item's embedding as a JSON string. + let mut stmt = conn + .prepare("SELECT vector_extract(embedding) FROM products WHERE id = ?") + .await?; + let mut rows = stmt.query(libsql::params![item_id]).await?; + let row = rows + .next() + .await? + .ok_or("item not found")?; + let query_vec: String = row.get(0)?; + + // 2. Use vector_top_k with k+1 to leave room for the query item itself; the `knn` alias keeps its `id` column distinct from `products.id`. + let sql = format!( + "SELECT products.id, products.name, + vector_distance_cos(products.embedding, vector(?1)) AS distance + FROM vector_top_k('products_idx', ?1, {limit}) AS knn + JOIN products ON products.rowid = knn.id + WHERE products.id != ?2 + ORDER BY distance + LIMIT ?3", + limit = k + 1 + ); + let mut stmt = conn.prepare(&sql).await?; + let mut rows = stmt + .query(libsql::params![query_vec.as_str(), item_id, k as i64]) + .await?; + + // 3. Collect (name, distance) pairs. + let mut results = Vec::new(); + while let Some(row) = rows.next().await? { + let name: String = row.get(1)?; + let distance: f64 = row.get(2)?; + results.push((name, distance)); + } + Ok(results) +} +``` + +The key ideas: + +1. **Retrieve the query vector** — `vector_extract` returns the stored embedding as a JSON string that can be passed straight back to `vector_top_k`. +2. **Over-fetch by one** — request `k + 1` candidates because `vector_top_k` will return the query item itself (distance ≈ 0). The `WHERE products.id != ?2` clause filters it out. +3. **Cosine distance** — `vector_distance_cos` returns a value between 0 (identical) and 2 (opposite). Lower means more similar. 
+ +#### Step 4 — Print recommendations + +Request recommendations for three representative items and verify the clusters make sense: + +```rust +let queries = vec![ + (1, "Laptop"), + (4, "Running Shoes"), + (8, "Cookbook"), +]; + +for (id, name) in &queries { + let recs = recommend(&conn, *id, 2).await?; + let rec_str: Vec<String> = recs + .iter() + .map(|(n, d)| format!("{n} ({d:.3})")) + .collect(); + println!( + "Customers who liked {name} also liked: {}", + rec_str.join(", ") + ); +} +``` + +**Expected output (distances are approximate):** + +```text +Customers who liked Laptop also liked: Mechanical Keyboard (0.009), USB-C Hub (0.027) +Customers who liked Running Shoes also liked: Yoga Mat (0.072), Water Bottle (0.107) +Customers who liked Cookbook also liked: Novel (0.114), Protein Bar (0.471) +``` + +- **Laptop** → electronics cluster (Mechanical Keyboard, USB-C Hub) +- **Running Shoes** → sports cluster (Yoga Mat, Water Bottle) +- **Cookbook** → food/books cluster (Novel, Protein Bar) + +
+<details><summary>Show full solution</summary> + +```rust +use libsql::Builder; + +/// Find the k most similar products to the given item, excluding the item itself. +async fn recommend( + conn: &libsql::Connection, + item_id: i64, + k: usize, +) -> Result<Vec<(String, f64)>, Box<dyn std::error::Error>> { + // Retrieve the query item's embedding as a JSON string. + let mut stmt = conn + .prepare("SELECT vector_extract(embedding) FROM products WHERE id = ?") + .await?; + let mut rows = stmt.query(libsql::params![item_id]).await?; + let row = rows.next().await?.ok_or("item not found")?; + let query_vec: String = row.get(0)?; + + // KNN search: fetch k+1 to account for the query item appearing in its + // own results, then filter it out (the `knn` alias disambiguates the two `id` columns). + let sql = format!( + "SELECT products.id, products.name, + vector_distance_cos(products.embedding, vector(?1)) AS distance + FROM vector_top_k('products_idx', ?1, {limit}) AS knn + JOIN products ON products.rowid = knn.id + WHERE products.id != ?2 + ORDER BY distance + LIMIT ?3", + limit = k + 1 + ); + let mut stmt = conn.prepare(&sql).await?; + let mut rows = stmt + .query(libsql::params![query_vec.as_str(), item_id, k as i64]) + .await?; + + let mut results = Vec::new(); + while let Some(row) = rows.next().await? 
{ + let name: String = row.get(1)?; + let distance: f64 = row.get(2)?; + results.push((name, distance)); + } + Ok(results) +} + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + let db = Builder::new_local(":memory:").build().await?; + let conn = db.connect()?; + + // --- Schema --- + conn.execute( + "CREATE TABLE IF NOT EXISTS products ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + embedding F32_BLOB(5) NOT NULL + )", + (), + ) + .await?; + + conn.execute( + "CREATE INDEX IF NOT EXISTS products_idx + ON products (libsql_vector_idx(embedding))", + (), + ) + .await?; + + // --- Seed data --- + let products: Vec<(i64, &str, Vec<f32>)> = vec![ + (1, "Laptop", vec![0.95, 0.0, 0.1, 0.0, 0.2]), + (2, "Mechanical Keyboard", vec![0.85, 0.0, 0.0, 0.0, 0.1]), + (3, "USB-C Hub", vec![0.9, 0.0, 0.0, 0.0, 0.0]), + (4, "Running Shoes", vec![0.0, 0.6, 0.9, 0.0, 0.0]), + (5, "Yoga Mat", vec![0.0, 0.2, 0.95, 0.0, 0.0]), + (6, "Water Bottle", vec![0.1, 0.1, 0.7, 0.0, 0.0]), + (7, "T-Shirt", vec![0.0, 0.95, 0.1, 0.0, 0.0]), + (8, "Cookbook", vec![0.0, 0.0, 0.0, 0.6, 0.9]), + (9, "Protein Bar", vec![0.0, 0.0, 0.3, 0.95, 0.0]), + (10, "Novel", vec![0.0, 0.0, 0.0, 0.1, 0.95]), + ]; + + for (id, name, emb) in &products { + let emb_json = serde_json::to_string(emb)?; + conn.execute( + "INSERT OR IGNORE INTO products (id, name, embedding) + VALUES (?, ?, vector(?))", + libsql::params![*id, *name, emb_json.as_str()], + ) + .await?; + } + + // --- Recommendations --- + let queries = vec![ + (1, "Laptop"), + (4, "Running Shoes"), + (8, "Cookbook"), + ]; + + for (id, name) in &queries { + let recs = recommend(&conn, *id, 2).await?; + let rec_str: Vec<String> = recs + .iter() + .map(|(n, d)| format!("{n} ({d:.3})")) + .collect(); + println!( + "Customers who liked {name} also liked: {}", + rec_str.join(", ") + ); + } + + Ok(()) +} +``` + +</details>
---