RAG (Retrieval-Augmented Generation)

Index documents, retrieve the most relevant context, and generate an answer with an LLM. The same example is shown twice below: first in Python, then in TypeScript.

"""Minimal RAG (Retrieval-Augmented Generation) with vctrs + OpenAI.

Index documents, retrieve relevant context, generate an answer.

Usage:
    pip install vctrs sentence-transformers openai
    export OPENAI_API_KEY=sk-...
    python rag.py "What is HNSW?"
"""

import os
import sys
from sentence_transformers import SentenceTransformer
from vctrs import Database

# Chunks of text to index. In a real app these come from PDFs, web pages, etc.
# Each string becomes one vector in the database; the raw text is stored
# alongside it as metadata so retrieval can return the original passage.
CHUNKS = [
    "HNSW (Hierarchical Navigable Small World) is a graph-based algorithm for approximate nearest neighbor search. It builds a multi-layer graph where each layer is progressively sparser. Search starts at the top layer and greedily descends.",
    "Cosine similarity measures the angle between two vectors. It ranges from -1 (opposite) to 1 (identical). Cosine distance is 1 minus cosine similarity, so 0 means identical.",
    "Vector embeddings represent text, images, or other data as dense floating-point vectors. Similar items have similar embeddings. Models like sentence-transformers produce 384-dimensional embeddings.",
    "Brute-force search compares the query against every vector in the dataset. It gives perfect recall but is O(n). For large datasets, approximate methods like HNSW are much faster.",
    "Memory-mapped files let the OS manage which pages of a file are in RAM. This enables working with datasets larger than available memory. The OS pages data in and out as needed.",
    "RAG (Retrieval-Augmented Generation) combines a retriever (vector search) with a generator (LLM). The retriever finds relevant documents, which are passed as context to the LLM to generate accurate answers.",
]


def build_index():
    """Create or load the vector database and the embedding model.

    Returns:
        A ``(db, model)`` tuple. On the first run the corpus in ``CHUNKS``
        is embedded and persisted to ``./rag_db``; later runs reuse the
        saved index and skip re-embedding.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    embedding_dim = encoder.get_sentence_embedding_dimension()
    db = Database("./rag_db", dim=embedding_dim, metric="cosine")

    # Only index when the database is empty, i.e. on the first run.
    if not len(db):
        print("Indexing chunks...")
        chunk_ids = [f"chunk{i}" for i in range(len(CHUNKS))]
        chunk_meta = [{"text": chunk} for chunk in CHUNKS]
        db.add_many(chunk_ids, encoder.encode(CHUNKS), chunk_meta)
        db.save()

    return db, encoder


def retrieve(db, model, query, k=3):
    """Embed *query* and return the text of its *k* nearest chunks.

    Each search hit is an (id, score, metadata) triple; only the stored
    ``"text"`` metadata field is returned to the caller.
    """
    query_vec = model.encode(query)
    hits = db.search(query_vec, k=k)
    return [hit[2]["text"] for hit in hits]


def generate(query, context_chunks):
    """Answer *query* with gpt-4o-mini, grounding it in *context_chunks*.

    Degrades gracefully: when OPENAI_API_KEY is unset, the retrieved
    context is printed instead of calling the API, and the function
    returns early.
    """
    if not os.environ.get("OPENAI_API_KEY"):
        print("(No OPENAI_API_KEY set — showing retrieved context only)\n")
        print(f"Query: {query}\n")
        print("Retrieved context:")
        for rank, chunk in enumerate(context_chunks, 1):
            print(f"  {rank}. {chunk}")
        return

    # Imported lazily so the openai package is only required on this path.
    from openai import OpenAI

    joined_context = "\n\n".join(context_chunks)
    completion = OpenAI().chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Answer based on this context:\n\n{joined_context}"},
            {"role": "user", "content": query},
        ],
    )
    print(f"Q: {query}\n")
    print(f"A: {completion.choices[0].message.content}")


def main():
    """CLI entry point: build the index, retrieve context, answer the query.

    The question comes from the command line; a default is used when no
    arguments are given.
    """
    args = sys.argv[1:]
    query = " ".join(args) if args else "What is HNSW and how does it work?"
    db, model = build_index()
    context = retrieve(db, model, query)
    generate(query, context)


if __name__ == "__main__":
    main()
/**
 * Minimal RAG (Retrieval-Augmented Generation) with vctrs + OpenAI.
 *
 * Index documents, retrieve relevant context, generate an answer.
 *
 * Usage:
 *   npm install vctrs @xenova/transformers openai
 *   export OPENAI_API_KEY=sk-...
 *   npx tsx rag.ts "What is HNSW?"
 */

import { VctrsDatabase } from "vctrs";
import { pipeline } from "@xenova/transformers";

// Chunks of text to index. In a real app these come from PDFs, web pages, etc.
// Each string becomes one vector in the database; the raw text is stored
// alongside it as metadata so retrieval can return the original passage.
const CHUNKS = [
  "HNSW (Hierarchical Navigable Small World) is a graph-based algorithm for approximate nearest neighbor search. It builds a multi-layer graph where each layer is progressively sparser. Search starts at the top layer and greedily descends.",
  "Cosine similarity measures the angle between two vectors. It ranges from -1 (opposite) to 1 (identical). Cosine distance is 1 minus cosine similarity, so 0 means identical.",
  "Vector embeddings represent text, images, or other data as dense floating-point vectors. Similar items have similar embeddings. Models like sentence-transformers produce 384-dimensional embeddings.",
  "Brute-force search compares the query against every vector in the dataset. It gives perfect recall but is O(n). For large datasets, approximate methods like HNSW are much faster.",
  "Memory-mapped files let the OS manage which pages of a file are in RAM. This enables working with datasets larger than available memory. The OS pages data in and out as needed.",
  "RAG (Retrieval-Augmented Generation) combines a retriever (vector search) with a generator (LLM). The retriever finds relevant documents, which are passed as context to the LLM to generate accurate answers.",
];

async function main() {
  // all-MiniLM-L6-v2 via transformers.js; mean pooling + normalization
  // matches the sentence-transformers behavior of the Python example.
  const featurize = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");

  // Embed one or more texts, returning one plain number[] per input text.
  const embed = async (input: string | string[]): Promise<number[][]> => {
    const output = await featurize(input, { pooling: "mean", normalize: true });
    const flat: number[] = Array.from(output.data as Float32Array);
    const width = output.dims[output.dims.length - 1];
    const texts = Array.isArray(input) ? input : [input];
    return texts.map((_, row) => flat.slice(row * width, (row + 1) * width));
  };

  // Probe the model once to learn the embedding dimension.
  const [probe] = await embed("test");
  const db = new VctrsDatabase("./rag_db", probe.length, "cosine");

  // First run only: embed the corpus and persist it.
  if (db.length === 0) {
    console.log("Indexing chunks...");
    const ids = CHUNKS.map((_, i) => `chunk${i}`);
    const vectors = await embed(CHUNKS);
    const metadata = CHUNKS.map((text) => ({ text }));
    db.addMany(ids, vectors, metadata);
    db.save();
  }

  const query = process.argv.slice(2).join(" ") || "What is HNSW and how does it work?";
  const [queryVec] = await embed(query);
  const contextChunks = db.search(queryVec, 3).map((hit) => hit.metadata?.text as string);

  // Without an API key, degrade to printing the retrieved context.
  if (!process.env.OPENAI_API_KEY) {
    console.log("(No OPENAI_API_KEY set — showing retrieved context only)\n");
    console.log(`Query: ${query}\n`);
    console.log("Retrieved context:");
    contextChunks.forEach((chunk, i) => console.log(`  ${i + 1}. ${chunk}`));
    return;
  }

  // Imported lazily so the openai package is only required on this path.
  const { default: OpenAI } = await import("openai");
  const client = new OpenAI();

  const context = contextChunks.join("\n\n");
  const response = await client.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [
      { role: "system", content: `Answer based on this context:\n\n${context}` },
      { role: "user", content: query },
    ],
  });

  console.log(`Q: ${query}\n`);
  console.log(`A: ${response.choices[0].message.content}`);
}

main();

Run it

Python:

pip install vctrs sentence-transformers openai
export OPENAI_API_KEY=sk-...
python examples/rag/rag.py "What is HNSW?"

Node.js:

npm install @yang-29/vctrs @xenova/transformers openai
export OPENAI_API_KEY=sk-...
npx tsx examples/rag/rag.ts "What is HNSW?"