Understanding RAG: Building Intelligent AI Apps with Your Own Data
What is Retrieval Augmented Generation?
Large language models are powerful, but they have a fundamental limitation: their knowledge is frozen at training time. They cannot access your private data, your latest documentation, or events after their cutoff. RAG (Retrieval Augmented Generation) solves this by combining retrieval from an external knowledge base with generation from an LLM.
The core insight is simple: before asking the LLM to answer a question, first retrieve the most relevant documents from your knowledge base, then augment the prompt with that context, then generate a grounded answer.
Architecture Overview
┌─────────────────────────────────────────────────────────────────┐
│ RAG Pipeline Architecture │
│ │
│ INDEXING PHASE (offline) │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────────────┐ │
│ │ Source │──▶│ Chunk │──▶│ Embed │──▶│ Vector DB │ │
│ │ Docs │ │ & Clean │ │ Model │ │ (Store) │ │
│ └──────────┘ └──────────┘ └──────────┘ └────────────┘ │
│ │
│ RETRIEVAL PHASE (online, per query) │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌────────────┐ │
│ │ User │──▶│ Embed │──▶│ Semantic │──▶│ Top-K │ │
│ │ Query │ │ Query │ │ Search │ │ Chunks │ │
│ └──────────┘ └──────────┘ └──────────┘ └────────────┘ │
│ │ │
│ GENERATION PHASE ▼ │
│ ┌──────────┐ ┌──────────────────────────────────────┐ │
│ │ Response │◀──│ LLM: Answer given [context] + [query]│ │
│ └──────────┘ └──────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────┘
Step 1: Document Ingestion and Chunking
How you chunk documents dramatically affects retrieval quality. Naive fixed-size chunking loses semantic coherence. Use semantic or recursive chunking.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
import tiktoken


class DocumentIngestionPipeline:
    """Load PDFs from a directory tree and split them into token-bounded chunks.

    Chunk sizes and overlaps are measured in ``cl100k_base`` tokens, not
    characters, because the splitter is given a token-counting length function.
    """

    def __init__(self, chunk_size=512, chunk_overlap=50):
        """Build the tokenizer and the recursive splitter.

        Args:
            chunk_size: Maximum chunk length, in tokens.
            chunk_overlap: Overlap between neighbouring chunks, in tokens.
        """
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        # Separators are tried in order: paragraph, line, sentence end, word,
        # and finally individual characters.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=self._token_length,
            separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""],
        )

    def _token_length(self, text: str) -> int:
        """Return the number of tokens the tokenizer produces for ``text``."""
        return len(self.tokenizer.encode(text))

    def load_and_chunk(self, source_dir: str) -> list[dict]:
        """Load every ``**/*.pdf`` under ``source_dir`` and chunk it.

        Returns:
            A list of ``{"text": ..., "metadata": ...}`` dicts. The metadata is
            the chunk's own metadata plus ``chunk_index`` and ``total_chunks``,
            both counted within the loaded document the chunk came from.
        """
        loader = DirectoryLoader(source_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)

        chunks = []
        for document in loader.load():
            pieces = self.splitter.split_documents([document])
            piece_count = len(pieces)
            chunks.extend(
                {
                    "text": piece.page_content,
                    "metadata": {
                        **piece.metadata,
                        "chunk_index": position,
                        "total_chunks": piece_count,
                    },
                }
                for position, piece in enumerate(pieces)
            )
        return chunks
Step 2: Generating and Storing Embeddings
import hashlib
import os

from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


class EmbeddingIndexer:
    """Embed text chunks with OpenAI and store them in a Pinecone index."""

    # The index dimension must match the embedding model's output size, so the
    # two values are kept side by side.
    EMBEDDING_MODEL = "text-embedding-3-small"
    EMBEDDING_DIMENSION = 1536

    def __init__(self, index_name: str):
        """Connect to Pinecone and create ``index_name`` if it is missing.

        Args:
            index_name: Name of the Pinecone index to use.

        Raises:
            KeyError: If the ``PINECONE_API_KEY`` environment variable is unset.
        """
        self.openai = OpenAI()
        self.pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

        if index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=index_name,
                dimension=self.EMBEDDING_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )
        self.index = self.pc.Index(index_name)

    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding per entry of ``texts``, in the same order.

        An empty ``texts`` returns ``[]`` without calling the API.
        """
        if not texts:
            return []
        response = self.openai.embeddings.create(
            model=self.EMBEDDING_MODEL,
            input=texts,
        )
        return [item.embedding for item in response.data]

    def index_chunks(self, chunks: list[dict], batch_size: int = 100):
        """Embed ``chunks`` in batches and upsert them into the index.

        Args:
            chunks: Dicts with a ``"text"`` string and a ``"metadata"`` dict.
            batch_size: Number of chunks embedded and upserted per request.
        """
        for start in range(0, len(chunks), batch_size):
            batch = chunks[start:start + batch_size]
            embeddings = self.embed_batch([chunk["text"] for chunk in batch])

            # The vector id is the MD5 of the chunk text, so re-indexing the
            # same text overwrites the existing vector instead of adding one.
            vectors = [
                {
                    "id": hashlib.md5(chunk["text"].encode()).hexdigest(),
                    "values": embedding,
                    "metadata": {"text": chunk["text"], **chunk["metadata"]},
                }
                for chunk, embedding in zip(batch, embeddings)
            ]

            self.index.upsert(vectors=vectors)
            print(f"Indexed batch {start // batch_size + 1}: {len(vectors)} chunks")
Step 3: Retrieval with Hybrid Search
Semantic search alone misses exact keyword matches. Hybrid search combines dense (semantic) + sparse (BM25) retrieval for best coverage.
from rank_bm25 import BM25Okapi
import numpy as np


class HybridRetriever:
    """Combine dense vector search with BM25 keyword search."""

    def __init__(self, vector_index, documents: list[str]):
        """Keep the vector index and build a BM25 index over ``documents``.

        Args:
            vector_index: Object exposing ``query(vector=, top_k=,
                include_metadata=)`` whose matches carry ``id``, ``score`` and
                ``metadata["text"]``.
            documents: Raw passage texts for the keyword index.
        """
        self.vector_index = vector_index
        self.documents = documents
        tokenized = [doc.lower().split() for doc in documents]
        # No BM25 index is built for an empty corpus; bm25_search then simply
        # returns no hits.
        self.bm25 = BM25Okapi(tokenized) if tokenized else None

    def semantic_search(self, query_embedding: list[float], top_k: int = 10):
        """Return the ``top_k`` vector-index matches as text/score/id dicts."""
        results = self.vector_index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
        )
        return [
            {"text": r.metadata["text"], "score": r.score, "id": r.id}
            for r in results.matches
        ]

    def bm25_search(self, query: str, top_k: int = 10):
        """Return the ``top_k`` BM25 matches as text/score/id dicts.

        The ``id`` of a BM25 hit is the passage's position in ``documents``,
        rendered as a string.
        """
        if self.bm25 is None:
            return []
        tokens = query.lower().split()
        scores = self.bm25.get_scores(tokens)
        top_indices = np.argsort(scores)[::-1][:top_k]
        return [
            {"text": self.documents[i], "score": float(scores[i]), "id": str(i)}
            for i in top_indices
        ]

    def hybrid_search(self, query: str, query_embedding: list[float],
                      alpha: float = 0.7, top_k: int = 5,
                      candidate_k: int = 20, rrf_k: int = 60):
        """Fuse semantic and BM25 rankings with weighted Reciprocal Rank Fusion.

        Args:
            query: Raw query text for BM25.
            query_embedding: Embedding of the query for the vector index.
            alpha: Weight of the semantic ranking; ``1.0`` is pure semantic,
                ``0.0`` is pure BM25.
            top_k: Number of fused results to return.
            candidate_k: Candidates requested from each retriever before
                fusion (never fewer than ``top_k``).
            rrf_k: RRF smoothing constant.

        Returns:
            Up to ``top_k`` result dicts, best first, with no passage repeated.
        """
        pool = max(candidate_k, top_k)
        weighted_rankings = (
            (alpha, self.semantic_search(query_embedding, top_k=pool)),
            (1 - alpha, self.bm25_search(query, top_k=pool)),
        )

        # Fuse on the passage text. Semantic hits carry the vector store's ids
        # while BM25 hits use the position in `documents`, so their "id" fields
        # never coincide and cannot identify the same passage across rankings.
        fused = {}
        hits = {}
        for weight, results in weighted_rankings:
            counted = set()
            for rank, result in enumerate(results):
                key = result["text"]
                if key in counted:
                    # Only a passage's best rank within one ranking counts.
                    continue
                counted.add(key)
                fused[key] = fused.get(key, 0.0) + weight / (rrf_k + rank + 1)
                # Keep the first hit seen, so a passage found by both
                # retrievers is reported with its vector-store id.
                hits.setdefault(key, result)

        ranked = sorted(fused, key=fused.get, reverse=True)[:top_k]
        return [hits[key] for key in ranked]
Step 4: Generation with Context
from anthropic import Anthropic


class RAGPipeline:
    """Answer questions by retrieving context and prompting an LLM with it."""

    SYSTEM_PROMPT = (
        "You are a precise assistant. Answer questions using ONLY the provided context.\n"
        "If the answer is not in the context, say so clearly. Cite sources by number."
    )

    def __init__(self, retriever: "HybridRetriever", embedder: "EmbeddingIndexer",
                 model: str = "claude-opus-4-7", max_tokens: int = 1024):
        """Wire the retriever, the embedder and the Anthropic client together.

        Args:
            retriever: Provides ``hybrid_search(query=, query_embedding=, top_k=)``.
            embedder: Provides ``embed_batch(texts)``.
            model: Model used for generation.
            max_tokens: Upper bound on generated tokens per answer.
        """
        self.retriever = retriever
        self.embedder = embedder
        self.model = model
        self.max_tokens = max_tokens
        self.client = Anthropic()

    def answer(self, question: str, top_k: int = 5) -> dict:
        """Retrieve context for ``question`` and generate a grounded answer.

        Args:
            question: The user's question.
            top_k: Number of chunks to retrieve and place in the prompt.

        Returns:
            A dict with ``"answer"`` (the generated text), ``"sources"`` (the
            retrieved chunks, in the order they are numbered in the prompt)
            and ``"tokens_used"`` (input plus output tokens).
        """
        query_embedding = self.embedder.embed_batch([question])[0]

        chunks = self.retriever.hybrid_search(
            query=question,
            query_embedding=query_embedding,
            top_k=top_k,
        )

        # Sources are numbered from 1 so the model can cite them by number.
        context = "\n\n---\n\n".join(
            f"[Source {number}]\n{chunk['text']}"
            for number, chunk in enumerate(chunks, start=1)
        )

        response = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            system=self.SYSTEM_PROMPT,
            messages=[{
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {question}",
            }],
        )

        usage = response.usage
        return {
            "answer": response.content[0].text,
            "sources": chunks,
            "tokens_used": usage.input_tokens + usage.output_tokens,
        }
Advanced RAG Techniques
Query Rewriting
LLM-powered query expansion improves recall significantly:
import asyncio
import json


async def rewrite_query(original_query: str, client: "Anthropic") -> list[str]:
    """Return the original query followed by model-generated rephrasings.

    Args:
        original_query: The user's search query.
        client: A synchronous Anthropic client.

    Returns:
        ``[original_query, *variants]``. Variants are the non-empty strings of
        the JSON array found in the model's reply, with duplicates removed. If
        no JSON array can be parsed, only the original query is returned.
    """
    prompt = (
        "Generate 3 alternative phrasings of this search query to improve retrieval.\n"
        "Return as JSON array of strings.\n"
        "\n"
        f"Query: {original_query}"
    )

    # The client call is blocking; run it in a worker thread so awaiting this
    # coroutine does not stall the event loop.
    response = await asyncio.to_thread(
        client.messages.create,
        model="claude-haiku-4-5-20251001",
        max_tokens=256,
        messages=[{"role": "user", "content": prompt}],
    )
    reply = response.content[0].text

    # The reply may wrap the array in a code fence or extra prose, so parse
    # only the outermost [...] span.
    start, end = reply.find("["), reply.rfind("]")
    parsed = []
    if start != -1 and end > start:
        try:
            parsed = json.loads(reply[start:end + 1])
        except json.JSONDecodeError:
            parsed = []
    if not isinstance(parsed, list):
        parsed = []

    queries = [original_query]
    for variant in parsed:
        if isinstance(variant, str) and variant.strip() and variant not in queries:
            queries.append(variant)
    return queries
Contextual Compression
Filter retrieved chunks to only include sentences relevant to the query:
def compress_context(chunks: list[str], query: str, client: "Anthropic") -> str:
    """Ask the model to keep only the sentences of ``chunks`` relevant to ``query``.

    Args:
        chunks: Retrieved passage texts.
        query: The question the passages should be filtered against.
        client: A synchronous Anthropic client.

    Returns:
        The model's extracted text, or ``""`` when ``chunks`` is empty (no
        request is made in that case).
    """
    if not chunks:
        # Nothing to compress; skip the model call.
        return ""

    passages = "\n".join(chunks)
    prompt = (
        f'From the passages below, extract ONLY the sentences relevant to: "{query}"\n'
        "\n"
        f"Passages:\n{passages}"
    )
    response = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=512,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.content[0].text
Evaluating RAG Quality
Use the RAGAS (RAG Assessment) framework:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall

# `rag_pipeline` is assumed to be an already-constructed RAGPipeline.
question = "What is the refund policy?"

# Run the pipeline once so the answer and its contexts come from the same
# retrieval; evaluating an answer against different chunks would skew the
# faithfulness and recall scores.
result = rag_pipeline.answer(question)

dataset = [
    {
        "question": question,
        "answer": result["answer"],
        "contexts": [c["text"] for c in result["sources"]],
        "ground_truth": "30-day money back guarantee",
    }
]

# NOTE(review): depending on the installed ragas version, evaluate() may
# require a dataset object (e.g. a Hugging Face `Dataset`) rather than a plain
# list of dicts - confirm against the ragas docs and convert if needed.
results = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy, context_recall],
)
print(results)  # e.g. faithfulness: 0.92, relevancy: 0.88, recall: 0.79
Production Checklist
- Chunking: 256-512 tokens with 10-15% overlap; preserve sentence boundaries
- Embedding model: `text-embedding-3-small` for cost/quality; `text-embedding-3-large` for precision
- Retrieval: Always use hybrid search (semantic + BM25) in production
- Re-ranking: Add a cross-encoder re-ranker (e.g., `cross-encoder/ms-marco-MiniLM-L-6-v2`) on top-20 results
- Observability: Log query, retrieved chunks, and final answer for every request
- Latency budget: Embedding (~50ms) + Retrieval (~20ms) + Generation (~800ms) = ~870ms p50