Your RAG system is live, but its answers are always "plausible but wrong"? Retrieved documents are irrelevant, generated answers contain hallucinations, and users keep complaining? It's not that RAG doesn't work—it's that you haven't systematically evaluated and optimized it. In 2026, the RAGAS evaluation framework plus six optimization strategies can boost your RAG accuracy from 60% to 85%+.
Background: RAG Evaluation Metrics
RAG systems have two major components to evaluate: retrieval and generation. The RAGAS framework defines core metrics:
| Metric |
Component |
Meaning |
Range |
| Context Precision |
Retrieval |
Ranking of relevant docs in retrieved results |
0-1 |
| Context Recall |
Retrieval |
Proportion of required info that was retrieved |
0-1 |
| Faithfulness |
Generation |
Consistency of generated answer with retrieved docs |
0-1 |
| Answer Relevancy |
Generation |
Relevance of answer to the question |
0-1 |
| Answer Similarity |
Generation |
Semantic similarity to reference answer |
0-1 |
Common root causes of low RAG accuracy, with their approximate share of failures:
| Root Cause |
Percentage |
Manifestation |
| Improper chunking strategy |
25% |
Key info truncated or scattered across chunks |
| Low retrieval recall |
20% |
Relevant documents not retrieved |
| Missing reranking |
15% |
Relevant docs ranked lower |
| Poor query formulation |
15% |
User question doesn't match document phrasing |
| Single retrieval method |
10% |
Vector-only search misses keyword exact matches |
| Poor prompt engineering |
15% |
Generation prompt doesn't leverage retrieved content |
Step 1: Establish Evaluation Baseline with RAGAS
# rag_evaluation.py
from ragas import evaluate
from ragas.metrics import (
context_precision,
context_recall,
faithfulness,
answer_relevancy,
)
from datasets import Dataset
# One record per evaluation sample: the user question, the retrieved
# contexts, the generated answer, and the reference (ground-truth) answer.
eval_samples = [
    {
        "question": "What is zero trust architecture?",
        "contexts": ["Zero trust architecture is a security model whose core principle is never trust, always verify..."],
        "answer": "Zero trust architecture is a security model whose core principle is never trust, always verify, not relying on network perimeters for security.",
        "ground_truth": "Zero trust architecture is a security model whose core principle is never trust, always verify; every request requires identity verification and authorization.",
    },
    {
        "question": "How does Istio implement mTLS?",
        "contexts": ["Istio manages mTLS certificate issuance and rotation automatically through Envoy Sidecar proxies..."],
        "answer": "Istio implements mTLS through Envoy Sidecar proxies, automatically managing certificate issuance, distribution, and rotation.",
        "ground_truth": "Istio implements mTLS through Envoy Sidecar proxy injection, automatically providing mutual TLS encryption and identity verification for inter-service communication.",
    },
    {
        "question": "What is the format of a SPIFFE ID?",
        "contexts": ["The SPIFFE ID format is spiffe://<trust domain>/<workload identifier>..."],
        "answer": "The SPIFFE ID format is spiffe://<trust domain>/<workload identifier>, where trust domain identifies the trust domain.",
        "ground_truth": "The SPIFFE ID format is spiffe://<trust domain>/<workload identifier>, where trust domain identifies the trust domain and path identifies the workload.",
    },
]

# Pivot the records into the column-oriented mapping passed to
# Dataset.from_dict, keeping the column order question / contexts /
# answer / ground_truth.
eval_data = {
    column: [sample[column] for sample in eval_samples]
    for column in ("question", "contexts", "answer", "ground_truth")
}

dataset = Dataset.from_dict(eval_data)

# Two retrieval-side and two generation-side metrics form the baseline.
eval_metrics = [context_precision, context_recall, faithfulness, answer_relevancy]
result = evaluate(dataset, metrics=eval_metrics)
print(result)
Optimization Strategy 1: Chunk Strategy Optimization
# chunk_optimization.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
class SmartChunker:
    """Split documents into retrieval chunks while keeping structural context.

    Two strategies are provided:

    * ``chunk_with_structure`` - heading-aware chunking; every chunk carries
      the most recent heading section as ``context``.
    * ``chunk_with_parent_child`` - two-level chunking in which small child
      chunks point back to a larger parent chunk.
    """

    def __init__(self):
        # The original separator list contained ". " twice; the redundant
        # entry is dropped.
        self.recursive_splitter = RecursiveCharacterTextSplitter(
            chunk_size=512,
            chunk_overlap=50,
            separators=["\n\n", "\n", ". ", " ", ""],
        )

    def chunk_with_structure(self, document: str, metadata: dict = None) -> list:
        """Chunk ``document`` and tag each chunk with its heading context.

        The document is cut on blank lines. A section in which any line
        starts with ``#`` is treated as a heading section and becomes the
        ``context`` of itself and of every following chunk until the next
        heading section.

        Args:
            document: Raw text (Markdown-style ``#`` headings are recognised).
            metadata: Optional metadata attached to every chunk. ``None``
                yields an empty dict per chunk.

        Returns:
            A list of dicts with keys ``content``, ``context`` and
            ``metadata``. Each chunk owns its own (shallow) copy of the
            metadata.
        """
        chunks = []
        current_context = ""
        for section in document.split("\n\n"):
            stripped = section.strip()
            if any(line.startswith("#") for line in stripped.split("\n")):
                current_context = stripped
            for piece in self.recursive_splitter.split_text(section):
                chunks.append({
                    "content": piece,
                    "context": current_context,
                    # A fresh dict per chunk: previously every chunk aliased
                    # the caller's dict, so mutating one chunk's metadata
                    # silently changed all chunks and the caller's object.
                    "metadata": dict(metadata) if metadata else {},
                })
        return chunks

    def chunk_with_parent_child(self, document: str) -> list:
        """Build a two-level parent/child chunk hierarchy.

        Args:
            document: Raw text to split.

        Returns:
            A list with one dict per parent chunk, holding ``parent_id``,
            ``parent_content`` and ``children``. Each child is a dict with
            ``child_id`` (``parent_<i>_child_<j>``) and ``content``.
        """
        parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
        child_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=30)
        hierarchy = []
        for i, parent in enumerate(parent_splitter.split_text(document)):
            children = [
                {"child_id": f"parent_{i}_child_{j}", "content": child}
                for j, child in enumerate(child_splitter.split_text(parent))
            ]
            hierarchy.append({
                "parent_id": f"parent_{i}",
                "parent_content": parent,
                "children": children,
            })
        return hierarchy
Optimization Strategy 2: Retrieval Reranking
# reranker.py
from sentence_transformers import CrossEncoder
from typing import List, Dict
class Reranker:
    """Re-order retrieved documents with a cross-encoder relevance model."""

    def __init__(self, model_name: str = "BAAI/bge-reranker-v2-m3"):
        """Load the cross-encoder named ``model_name``."""
        self.model = CrossEncoder(model_name)

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        top_k: int = 5,
        content_key: str = "content",
    ) -> List[Dict]:
        """Score every document against ``query`` and return the best ones.

        Args:
            query: The search query.
            documents: Candidate documents; each must hold its text under
                ``content_key``.
            top_k: Maximum number of documents to return. Values <= 0 give
                an empty list.
            content_key: Key under which each document stores its text.

        Returns:
            Shallow copies of the ``top_k`` highest-scoring documents in
            descending score order, each extended with a float
            ``rerank_score``. The input dicts are not modified.

        Raises:
            KeyError: If a document lacks ``content_key``.
        """
        # Nothing to rank: skip the model call entirely. Some
        # sentence-transformers releases raise on an empty batch.
        if not documents or top_k <= 0:
            return []

        pairs = [(query, doc[content_key]) for doc in documents]
        scores = self.model.predict(pairs)
        # sorted() is stable, so documents with equal scores keep their
        # incoming order.
        ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
        return [
            {**doc, "rerank_score": float(score)}
            for doc, score in ranked[:top_k]
        ]
Optimization Strategy 3: Hybrid Retrieval
# hybrid_retriever.py
from typing import List, Dict
class HybridRetriever:
    """Blend vector-store hits and keyword-store hits into one ranked list.

    ``alpha`` weights the vector score; ``1 - alpha`` weights the keyword
    score.
    """

    def __init__(self, vector_store, keyword_store, alpha: float = 0.7):
        self.vector_store = vector_store
        self.keyword_store = keyword_store
        self.alpha = alpha

    @staticmethod
    def _doc_key(doc):
        """Return the document's metadata ``id``, else a hash of its text."""
        return doc.metadata.get("id", hash(doc.page_content))

    def search(self, query: str, top_k: int = 10) -> List[Dict]:
        """Return the ``top_k`` documents with the highest hybrid score.

        Each backend is asked for ``top_k * 2`` candidates. Documents found
        by both backends are merged by id; a document missing from one
        backend gets a score of 0.0 for that side.

        Returns:
            Dicts with ``content``, ``hybrid_score``, ``vector_score`` and
            ``keyword_score``, sorted by ``hybrid_score`` descending.
        """
        fetch_size = top_k * 2
        dense_hits = self.vector_store.similarity_search_with_score(query, k=fetch_size)
        sparse_hits = self.keyword_store.search(query, top_k=fetch_size)

        merged = {}

        # NOTE(review): 1 / (1 + score) presumably treats the vector-store
        # score as a distance (lower is better) - confirm against the store.
        for hit, raw_score in dense_hits:
            merged[self._doc_key(hit)] = {
                "content": hit.page_content,
                "vector_score": 1.0 / (1.0 + raw_score),
                "keyword_score": 0.0,
            }

        # NOTE(review): the keyword score is used as returned by the keyword
        # store; verify it is on a scale comparable to vector_score.
        for hit in sparse_hits:
            key = self._doc_key(hit)
            entry = merged.get(key)
            if entry is None:
                merged[key] = {
                    "content": hit.page_content,
                    "vector_score": 0.0,
                    "keyword_score": hit.score,
                }
            else:
                entry["keyword_score"] = hit.score

        ranked = sorted(
            (
                {
                    "content": entry["content"],
                    "hybrid_score": (
                        self.alpha * entry["vector_score"]
                        + (1 - self.alpha) * entry["keyword_score"]
                    ),
                    "vector_score": entry["vector_score"],
                    "keyword_score": entry["keyword_score"],
                }
                for entry in merged.values()
            ),
            key=lambda item: item["hybrid_score"],
            reverse=True,
        )
        return ranked[:top_k]
Optimization Strategy 4: Query Rewriting
# query_rewriter.py
from openai import OpenAI
from typing import List, Dict
class QueryRewriter:
    """Use a chat model to rewrite and expand queries before retrieval."""

    # System prompt for ``rewrite``; the text is identical to the original
    # inline prompt.
    _REWRITE_SYSTEM_PROMPT = (
        "You are a query rewriting expert. Rewrite the user's natural language question for better document retrieval:\n"
        "1. Fill in omitted context (from conversation history)\n"
        "2. Convert colloquial expressions to technical terms\n"
        "3. Decompose compound questions into sub-questions\n"
        "4. Preserve the core intent of the original question\n"
        "Output only the rewritten query, no explanation."
    )

    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model

    @staticmethod
    def _message_text(response) -> str:
        """Return the first choice's stripped text, or "" if it has none."""
        # The message content can be None (e.g. a refusal), and calling
        # .strip() on it used to raise AttributeError.
        content = response.choices[0].message.content
        return content.strip() if content else ""

    def rewrite(self, query: str, history: List[Dict] = None) -> str:
        """Rewrite ``query`` for better document retrieval.

        Args:
            query: The user's original question.
            history: Optional conversation messages (dicts with ``role`` and
                ``content``); only the last three are sent to the model.

        Returns:
            The rewritten query, or the original ``query`` unchanged when
            the model returns no text.
        """
        messages = [{"role": "system", "content": self._REWRITE_SYSTEM_PROMPT}]
        if history:
            messages.extend(
                {"role": msg["role"], "content": msg["content"]}
                for msg in history[-3:]
            )
        messages.append({"role": "user", "content": f"Original query: {query}\nRewritten:"})
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.1,
            max_tokens=200,
        )
        # Fall back to the original query rather than returning an empty
        # string that would make every downstream search useless.
        return self._message_text(response) or query

    def expand_queries(self, query: str, n: int = 3) -> List[str]:
        """Generate up to ``n`` alternative phrasings of ``query``.

        Args:
            query: The user's original question.
            n: Maximum number of variants to keep.

        Returns:
            ``[query]`` followed by at most ``n`` distinct variants. Leading
            list markers ("1.", "2)", "-", "*") are stripped, and variants
            that repeat the query or each other (case-insensitively) are
            dropped.
        """
        import re

        messages = [
            {"role": "system", "content": f"Generate {n} retrieval variants from different angles for the following query, one per line."},
            {"role": "user", "content": query},
        ]
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.5,
            max_tokens=300,
        )

        seen = {query.strip().casefold()}
        variants = []
        for line in self._message_text(response).split("\n"):
            # Models often number or bullet their answers; the marker is not
            # part of the query and would pollute keyword search.
            variant = re.sub(r"^(?:[-*\u2022]|\d+[.)])\s+", "", line.strip()).strip()
            key = variant.casefold()
            if not variant or key in seen:
                continue
            seen.add(key)
            variants.append(variant)
        return [query] + variants[:max(n, 0)]
Optimization Strategy 5: Context Compression
# context_compressor.py
from typing import List
from openai import OpenAI
class ContextCompressor:
    """Condense retrieved documents to the parts relevant to a query."""

    def __init__(self, model: str = "gpt-4o-mini"):
        self.client = OpenAI()
        self.model = model

    def compress(self, query: str, documents: List[str], max_tokens: int = 2000) -> str:
        """Ask the chat model to extract query-relevant content.

        Args:
            query: The question the compressed context must serve.
            documents: Retrieved document texts. Empty or whitespace-only
                entries are ignored.
            max_tokens: Output budget, used both in the prompt and as the
                completion's ``max_tokens``.

        Returns:
            The compressed context. An empty string is returned when there
            is nothing to compress or the model returns no text.
        """
        usable = [doc for doc in documents if doc and doc.strip()]
        # With no source text the model could only invent content, so skip
        # the API call entirely.
        if not usable:
            return ""

        combined = "\n\n---\n\n".join(usable)
        system_prompt = (
            "Extract the most relevant content from the following documents for the query.\n"
            "Requirements:\n"
            "1. Keep only directly relevant information\n"
            "2. Remove redundancy and repetition\n"
            "3. Maintain factual accuracy\n"
            f"4. Output no more than {max_tokens} tokens"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Query: {query}\n\nDocuments:\n{combined}"},
            ],
            temperature=0.0,
            max_tokens=max_tokens,
        )
        # The content can be None; honour the declared ``str`` return type.
        return response.choices[0].message.content or ""
Optimization Strategy 6: Adaptive Retrieval
# adaptive_retrieval.py
class AdaptiveRetriever:
    """Pick a retrieval path per query, then rerank the candidates.

    If the rewriter leaves the query unchanged, a single hybrid search is
    run. Otherwise the query is expanded into variants and the results of
    all searches are merged before reranking.
    """

    def __init__(self, hybrid_retriever, reranker, query_rewriter):
        self.hybrid_retriever = hybrid_retriever
        self.reranker = reranker
        self.query_rewriter = query_rewriter
        # NOTE: only "simple" and "expanded" are incremented by retrieve();
        # the "rewritten" key is kept so existing readers of this dict still
        # find it.
        self.retrieval_stats = {"simple": 0, "expanded": 0, "rewritten": 0}

    def retrieve(self, query: str, top_k: int = 5) -> list:
        """Retrieve and rerank documents for ``query``.

        Args:
            query: The user's question.
            top_k: Maximum number of documents to return.

        Returns:
            The reranker's output for the merged candidates, or an empty
            list when no candidate was found.
        """
        rewritten_query = self.query_rewriter.rewrite(query)

        # Compare ignoring case and surrounding whitespace so that trivial
        # formatting differences do not trigger the expensive path.
        if rewritten_query.strip().lower() == query.strip().lower():
            results = self.hybrid_retriever.search(query, top_k=top_k * 2)
            self.retrieval_stats["simple"] += 1
        else:
            queries = list(self.query_rewriter.expand_queries(query, n=2))
            # The rewritten query was previously used only for reranking, so
            # the candidate pool never benefited from the rewrite. Search
            # with it as well; it goes last to keep the earlier ordering.
            if rewritten_query not in queries:
                queries.append(rewritten_query)

            seen = set()
            results = []
            for q in queries:
                for hit in self.hybrid_retriever.search(q, top_k=top_k):
                    content = hit["content"]
                    if content in seen:
                        continue
                    seen.add(content)
                    results.append(hit)
            self.retrieval_stats["expanded"] += 1

        # Avoid invoking the reranker model on an empty candidate list.
        if not results:
            return []
        return self.reranker.rerank(rewritten_query, results, top_k=top_k)
Pitfall Guide
| # |
Pitfall |
Symptom |
Solution |
| 1 |
One-size-fits-all chunk size |
Tables/code truncated, semantics incomplete |
Use different chunk_size by content type: text 512, code 1024, tables 256 |
| 2 |
Embedding model mismatch |
Poor retrieval for cross-language queries |
Use multilingual embeddings like bge-m3 or multilingual-e5-large |
| 3 |
Reranker too slow |
Retrieval + reranking latency exceeds 2s |
Use lightweight reranker bge-reranker-v2-m3, or cache reranking results |
| 4 |
Query rewrite deviates from intent |
Rewritten query loses core user intent |
Set temperature=0.1, keep original query as fallback |
| 5 |
Fixed hybrid alpha |
Different query types need different optimal alpha |
Dynamically adjust: keyword queries alpha=0.3, semantic queries alpha=0.8 |
Error Troubleshooting
| Error Message |
Cause |
Solution |
| ragas: openai API key not set |
OpenAI API Key not configured |
Set OPENAI_API_KEY environment variable |
| CrossEncoder: model not found |
Reranker model not downloaded |
huggingface-cli download BAAI/bge-reranker-v2-m3 |
| ChromaDB: collection not found |
Vector DB collection doesn't exist |
Create collection before inserting documents |
| Token limit exceeded |
Retrieved docs exceed LLM context window |
Use ContextCompressor, or reduce top_k |
| CUDA out of memory |
Embedding/reranker model GPU OOM |
Use CPU inference, or switch to smaller model |
| FAISS index not built |
Vector index not constructed |
Build index first: index = faiss.IndexFlatIP(dimension) |
| JSON decode error in RAGAS |
Evaluation data format incorrect |
Confirm Dataset has question/contexts/answer/ground_truth columns |
| RecursiveCharacterTextSplitter: empty chunk |
Empty document or separator mismatch |
Add empty chunk filtering, check document encoding |
| OpenAI: rate limit exceeded |
API rate limit exceeded |
Add retry logic and request interval |
| SentenceTransformer: SSL error |
HuggingFace download blocked |
Set HF mirror HF_ENDPOINT=https://hf-mirror.com |
Advanced Optimization
1. Evaluation-Driven Iterative Optimization
def optimization_loop(eval_dataset, strategies, max_iterations=5):
    """Evaluate each strategy repeatedly and report the best-scoring one.

    Args:
        eval_dataset: Dataset handed to every strategy function.
        strategies: Mapping of strategy name to a callable that takes the
            dataset and returns the dataset to evaluate.
        max_iterations: Number of rounds; every round evaluates all
            strategies.

    Returns:
        A ``(best_config, best_score)`` tuple. The score is the equally
        weighted mean of faithfulness and answer relevancy. The result is
        ``(None, 0)`` when no strategy ever scores above 0.
    """
    # NOTE(review): every round applies the same strategies to the same,
    # unmodified eval_dataset, so later rounds can only change the outcome
    # if evaluate() is non-deterministic - confirm whether cumulative
    # application of strategies was intended.
    top_score = 0
    top_strategy = None
    for round_no in range(1, max_iterations + 1):
        for name, apply_strategy in strategies.items():
            candidate = apply_strategy(eval_dataset)
            metrics = evaluate(candidate, metrics=[faithfulness, answer_relevancy])
            score = metrics['faithfulness'] * 0.5 + metrics['answer_relevancy'] * 0.5
            # Written as "not >" so that a NaN score is skipped, exactly as
            # a plain "score > best" test would skip it.
            if not score > top_score:
                continue
            top_score = score
            top_strategy = name
            print(f"Iteration {round_no}: {name} score {score:.4f} (new best)")
    return top_strategy, top_score
2. Multi-Granularity Indexing
| Index Granularity |
Chunk Size |
Purpose |
Retrieval Method |
| Paragraph-level |
512 tokens |
Precise retrieval |
Vector similarity |
| Document-level |
2000 tokens |
Context supplement |
Parent-child retrieval |
| Summary-level |
100 tokens |
Quick filtering |
Keyword matching |
Comparison Analysis
| Optimization Strategy |
Accuracy Boost |
Latency Impact |
Implementation Complexity |
Priority |
| Chunk optimization |
+10-15% |
None |
Low |
★★★★★ |
| Retrieval reranking |
+8-12% |
+50-200ms |
Low |
★★★★★ |
| Hybrid retrieval |
+5-10% |
+20ms |
Medium |
★★★★ |
| Query rewriting |
+5-8% |
+100ms |
Medium |
★★★★ |
| Context compression |
+3-5% |
+200ms |
Medium |
★★★ |
| Adaptive retrieval |
+5-10% |
+100ms |
High |
★★★ |
| RAG Evaluation Framework |
Metric Coverage |
Ease of Use |
LLM Dependency |
Open Source |
| RAGAS |
★★★★★ |
★★★★ |
Yes |
Yes |
| DeepEval |
★★★★ |
★★★★★ |
Yes |
Yes |
| TruLens |
★★★ |
★★★ |
Yes |
Yes |
| ARES |
★★★★ |
★★★ |
Yes |
Yes |
Summary: RAG optimization isn't about tweaking a single parameter—it's an "evaluate → optimize → re-evaluate" closed loop. First establish a baseline with RAGAS, then tackle each issue with the six strategies: chunk optimization fixes info truncation, reranking improves Top-K accuracy, hybrid retrieval expands recall, query rewriting bridges semantic gaps, context compression reduces noise, and adaptive retrieval selects the optimal path. Each strategy contributes roughly 3-15% accuracy improvement (see the comparison table above), combining for a relative gain of 40%+ (for example, from 60% to 85%). In 2026, RAG optimization without evaluation is like the blind men and the elephant.