Python RAG效果太差?2026年RAGAS評估+6種最佳化策略讓準確率提升40%

AI与大数据

Python RAG效果太差?2026年RAGAS評估+6種最佳化策略讓準確率提升40%

RAG系統上線了,但回答總是「似是而非」?檢索到的文件不相關,生成的答案包含幻覺,使用者投訴不斷?這不是RAG不行,而是你沒有系統評估和最佳化。2026年,RAGAS評估框架 + 6種最佳化策略,能讓你的RAG準確率從60%提升到85%+。


背景知識:RAG評估指標

RAG系統有兩大環節需要評估:檢索和生成。RAGAS框架定義了核心指標:

指標 評估環節 含義 取值範圍
Context Precision 檢索 檢索到的文件中相關文件的排名 0-1
Context Recall 檢索 答案所需資訊被檢索到的比例 0-1
Faithfulness 生成 生成答案與檢索文件的一致性 0-1
Answer Relevancy 生成 答案與問題的相關性 0-1
Answer Similarity 生成 生成答案與參考答案的語義相似度 0-1

問題分析:RAG效果差的6大根因

根因 佔比 表現
Chunk策略不當 25% 關鍵資訊被截斷或分散在多個Chunk
檢索召回率低 20% 相關文件未被檢索到
缺少重排序 15% 相關文件排在後面
Query表述不佳 15% 使用者問題與文件表述不匹配
單一檢索方式 10% 僅用向量檢索,遺漏關鍵詞精確匹配
Prompt工程差 15% 生成Prompt未充分利用檢索內容

第1步:用RAGAS建立評估基線

# rag_evaluation.py
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
)
from datasets import Dataset

# One evaluation sample per row, in the order:
# (question, retrieved contexts, generated answer, reference answer).
_samples = [
    (
        "什麼是零信任架構?",
        ["零信任架構是一種安全模型,核心原則是永不信任,始終驗證..."],
        "零信任架構是一種安全模型,核心原則是永不信任,始終驗證,不依賴網路邊界進行安全防護。",
        "零信任架構是一種安全模型,核心原則是永不信任,始終驗證,每個請求都需要身份驗證和授權。",
    ),
    (
        "Istio如何實現mTLS?",
        ["Istio透過Envoy Sidecar代理自動管理mTLS憑證的頒發和輪換..."],
        "Istio透過Envoy Sidecar代理實現mTLS,自動管理憑證的頒發、分發和輪換。",
        "Istio透過Envoy Sidecar代理注入實現mTLS,自動為服務間通訊提供雙向TLS加密和身份驗證。",
    ),
    (
        "SPIFFE ID的格式是什麼?",
        ["SPIFFE ID的格式為spiffe://<trust domain>/<workload identifier>..."],
        "SPIFFE ID格式為spiffe://<trust domain>/<workload identifier>,其中trust domain是信任域。",
        "SPIFFE ID格式為spiffe://<trust domain>/<workload identifier>,trust domain標識信任域,path標識工作負載。",
    ),
]
_columns = ("question", "contexts", "answer", "ground_truth")

# Transpose the rows into the column-oriented mapping passed to Dataset.from_dict.
eval_data = {
    column: list(values) for column, values in zip(_columns, zip(*_samples))
}

dataset = Dataset.from_dict(eval_data)

# Score both the retrieval side (context_*) and the generation side.
result = evaluate(
    dataset,
    metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
)

print(result)
# Sample output:
# {'context_precision': 0.75, 'context_recall': 0.68, 'faithfulness': 0.82, 'answer_relevancy': 0.79}

最佳化策略一:Chunk策略最佳化

# chunk_optimization.py
from langchain.text_splitter import RecursiveCharacterTextSplitter, SemanticChunker
from langchain_community.embeddings import OpenAIEmbeddings

class SmartChunker:
    """Chunk Markdown-like documents for a RAG index.

    Two strategies are offered: ``chunk_with_structure`` tags every chunk with
    the most recent heading section, and ``chunk_with_parent_child`` produces
    large parent chunks that are each subdivided into small child chunks.
    """

    def __init__(self, embeddings=None):
        """Create the chunker.

        Args:
            embeddings: Embedding model stored on the instance. When falsy, an
                ``OpenAIEmbeddings()`` is constructed instead.
        """
        # None of the methods in this class read ``self.embeddings``; it is
        # only stored here.
        self.embeddings = embeddings or OpenAIEmbeddings()
        # NOTE(review): sizes are presumably measured in characters (the
        # splitter's default length function), not tokens -- confirm.
        self.recursive_splitter = RecursiveCharacterTextSplitter(
            chunk_size=512,
            chunk_overlap=50,
            separators=["\n\n", "\n", "。", ".", " ", ""],
        )

    def chunk_with_structure(self, document: str, metadata: dict = None) -> list:
        """Split ``document`` and tag each chunk with its heading section.

        The document is cut on blank lines. A section in which any line starts
        with ``#`` is treated as a heading section: its stripped text becomes
        the ``context`` of its own chunks and of every following section until
        the next heading section. Sections before the first heading get an
        empty context.

        Args:
            document: Full document text.
            metadata: Optional mapping attached to every chunk.

        Returns:
            A list of dicts with the keys ``content``, ``context`` and
            ``metadata``. Each chunk receives its own shallow copy of
            ``metadata`` so that editing one chunk cannot affect the others or
            the caller's dict.
        """
        chunks = []
        current_context = ""
        for section in document.split("\n\n"):
            stripped = section.strip()
            if any(line.startswith("#") for line in stripped.split("\n")):
                current_context = stripped

            # Heading and body sections are split the same way; only the
            # context update above differs between them.
            for piece in self.recursive_splitter.split_text(section):
                chunks.append({
                    "content": piece,
                    "context": current_context,
                    "metadata": dict(metadata) if metadata else {},
                })

        return chunks

    def chunk_with_parent_child(self, document: str) -> list:
        """Split ``document`` into parent chunks, each with smaller children.

        Parents use ``chunk_size=1024`` / ``chunk_overlap=100`` and children
        use ``chunk_size=256`` / ``chunk_overlap=30``. In the usual
        parent/child (small-to-big) pattern the small children are what gets
        indexed and matched at retrieval time, while the larger parent is what
        is handed to the generator as context; this method only builds the
        structure and does not enforce either use.

        Args:
            document: Full document text.

        Returns:
            A list with one dict per parent, holding ``parent_id``
            (``parent_<i>``), ``parent_content`` and ``children``. Each child
            is a dict with ``child_id`` (``parent_<i>_child_<j>``) and
            ``content``.
        """
        parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=100)
        child_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=30)

        result = []
        for i, parent in enumerate(parent_splitter.split_text(document)):
            children = [
                {"child_id": f"parent_{i}_child_{j}", "content": child}
                for j, child in enumerate(child_splitter.split_text(parent))
            ]
            result.append({
                "parent_id": f"parent_{i}",
                "parent_content": parent,
                "children": children,
            })

        return result


chunker = SmartChunker()
# Read through a context manager so the file handle is closed deterministically,
# and decode explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes knowledge_base.md is UTF-8 encoded -- confirm.
with open("knowledge_base.md", encoding="utf-8") as kb_file:
    doc = kb_file.read()
structured_chunks = chunker.chunk_with_structure(doc)
parent_child_chunks = chunker.chunk_with_parent_child(doc)

print(f"結構化分塊: {len(structured_chunks)} chunks")
print(f"父子分塊: {len(parent_child_chunks)} parent chunks")

最佳化策略二:檢索重排序

# reranker.py
from sentence_transformers import CrossEncoder
from typing import List, Dict

class Reranker:
    """Re-order retrieved documents with a cross-encoder model."""

    def __init__(self, model_name: str = "BAAI/bge-reranker-v2-m3"):
        """Load the cross-encoder named ``model_name``."""
        self.model = CrossEncoder(model_name)

    def rerank(
        self,
        query: str,
        documents: List[Dict],
        top_k: int = 5,
        content_key: str = "content",
    ) -> List[Dict]:
        """Score ``documents`` against ``query`` and return the best ``top_k``.

        Args:
            query: The query every document is paired with.
            documents: Candidate documents as dicts; the text is read from
                ``content_key``.
            top_k: Maximum number of documents to return.
            content_key: Key under which each document stores its text.

        Returns:
            Shallow copies of the highest-scoring documents, best first, each
            with an added ``rerank_score`` float. The input dicts are not
            modified. An empty list is returned when there are no documents or
            ``top_k`` is not positive.

        Raises:
            KeyError: If a document has no ``content_key`` entry.
        """
        # Nothing to rank: skip the model call entirely. This avoids handing
        # an empty batch to the model, and stops a negative ``top_k`` from
        # slicing off the tail of the ranking.
        if not documents or top_k <= 0:
            return []

        pairs = [(query, doc[content_key]) for doc in documents]
        scores = self.model.predict(pairs)

        # sorted() is stable, so equally scored documents keep input order.
        ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

        return [
            {**doc, "rerank_score": float(score)}
            for doc, score in ranked[:top_k]
        ]


reranker = Reranker()

query = "如何配置Istio mTLS?"
# First-stage candidates listed as (text, retrieval score), in retrieval order.
initial_results = [
    {"content": text, "score": first_stage_score}
    for text, first_stage_score in (
        ("Istio安裝指南...", 0.85),
        ("mTLS配置步驟:1. 建立PeerAuthentication...", 0.72),
        ("Istio流量管理概述...", 0.68),
    )
]

# Keep all three candidates; only their order and the added rerank_score change.
reranked = reranker.rerank(query, initial_results, top_k=3)
for r in reranked:
    print(f"Score: {r['rerank_score']:.4f} | {r['content'][:50]}")

最佳化策略三:混合檢索

# hybrid_retriever.py
from typing import List, Dict
import numpy as np

class HybridRetriever:
    """Blend vector-search and keyword-search results into a single ranking."""

    def __init__(self, vector_store, keyword_store, alpha: float = 0.7):
        """Store the two backends and the weight given to the vector score.

        Args:
            vector_store: Object exposing ``similarity_search_with_score``.
            keyword_store: Object exposing ``search``.
            alpha: Weight of the vector score; the keyword score is weighted
                by ``1 - alpha``.
        """
        self.alpha = alpha
        self.keyword_store = keyword_store
        self.vector_store = vector_store

    def search(self, query: str, top_k: int = 10) -> List[Dict]:
        """Run both searches and return the ``top_k`` best blended hits.

        Each backend is asked for ``top_k * 2`` candidates. A raw vector score
        ``s`` is mapped to ``1 / (1 + s)``; keyword scores are used as given.
        Documents are matched across backends by ``metadata["id"]``, falling
        back to the hash of their text.

        Returns:
            Dicts with ``content``, ``hybrid_score``, ``vector_score`` and
            ``keyword_score``, sorted by ``hybrid_score`` descending.
        """
        fetch_count = top_k * 2
        vector_hits = self.vector_store.similarity_search_with_score(query, k=fetch_count)
        keyword_hits = self.keyword_store.search(query, top_k=fetch_count)

        def key_of(doc):
            return doc.metadata.get("id", hash(doc.page_content))

        merged = {}
        for doc, raw_score in vector_hits:
            merged[key_of(doc)] = {
                "content": doc.page_content,
                "vector_score": 1.0 / (1.0 + raw_score),
                "keyword_score": 0.0,
            }

        for doc in keyword_hits:
            key = key_of(doc)
            entry = merged.get(key)
            if entry is None:
                # Found by keyword search only: no vector contribution.
                merged[key] = {
                    "content": doc.page_content,
                    "vector_score": 0.0,
                    "keyword_score": doc.score,
                }
            else:
                entry["keyword_score"] = doc.score

        blended = [
            {
                "content": entry["content"],
                "hybrid_score": (
                    self.alpha * entry["vector_score"]
                    + (1 - self.alpha) * entry["keyword_score"]
                ),
                "vector_score": entry["vector_score"],
                "keyword_score": entry["keyword_score"],
            }
            for entry in merged.values()
        ]

        ranked = sorted(blended, key=lambda hit: hit["hybrid_score"], reverse=True)
        return ranked[:top_k]

最佳化策略四:Query改寫

# query_rewriter.py
from openai import OpenAI

class QueryRewriter:
    """Rewrite and expand user queries with an OpenAI chat model."""

    def __init__(self, model: str = "gpt-4o-mini"):
        """Create the OpenAI client and remember which chat model to use."""
        self.client = OpenAI()
        self.model = model

    def rewrite(self, query: str, history: List[Dict] = None) -> str:
        """Rewrite ``query`` into a form better suited to document retrieval.

        Args:
            query: The user's original question.
            history: Optional chat history as dicts with ``role`` and
                ``content`` keys; only the last three entries are sent.

        Returns:
            The stripped model output. If the model returns no text (``None``
            or only whitespace), the original ``query`` is returned so callers
            never search with an empty string.
        """
        messages = [
            {"role": "system", "content": """你是一個查詢改寫專家。將使用者的自然語言問題改寫為更適合文件檢索的形式:
1. 補充省略的上下文(根據對話歷史)
2. 將口語化表述轉為專業術語
3. 拆解複合問題為子問題
4. 保留原始問題的核心意圖
只輸出改寫後的查詢,不要解釋。"""},
        ]

        if history:
            messages.extend(
                {"role": msg["role"], "content": msg["content"]}
                for msg in history[-3:]
            )

        messages.append({"role": "user", "content": f"原始查詢: {query}\n改寫後:"})

        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.1,
            max_tokens=200,
        )

        # ``content`` may be None; guard before calling str methods on it.
        rewritten = (response.choices[0].message.content or "").strip()
        return rewritten or query

    def expand_queries(self, query: str, n: int = 3) -> List[str]:
        """Ask the model for ``n`` retrieval variants of ``query``.

        Args:
            query: The query to diversify.
            n: Number of variants requested in the prompt.

        Returns:
            ``query`` followed by the distinct, non-empty variants in model
            order. Leading list markers (``1.``, ``2)``, ``3、``, ``-``, ``*``,
            ``•``) are stripped, and variants equal to an earlier entry
            (including the original query) are dropped.
        """
        import re  # local import: the snippet's import block does not provide it

        messages = [
            {"role": "system", "content": f"為以下查詢生成{n}個不同角度的檢索變體,每行一個。"},
            {"role": "user", "content": query},
        ]

        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=0.5,
            max_tokens=300,
        )

        raw = response.choices[0].message.content or ""
        # ASCII markers must be followed by whitespace so that text such as
        # "3.5..." is not mistaken for a numbered list item.
        list_marker = re.compile(r"^(?:[-*•]\s+|\d+[.)]\s+|\d+、\s*)")

        queries = [query]
        for line in raw.splitlines():
            variant = list_marker.sub("", line.strip()).strip()
            if variant and variant not in queries:
                queries.append(variant)
        return queries


rewriter = QueryRewriter()

# A colloquial question with no context, rewritten for retrieval.
rewritten = rewriter.rewrite(query="怎麼配那個雙向認證?")
print(f"改寫後: {rewritten}")

# The returned list starts with the original query, followed by the variants.
expanded = rewriter.expand_queries(query="Istio mTLS配置", n=3)
print(f"查詢變體: {expanded}")

最佳化策略五:上下文壓縮

# context_compressor.py
from typing import List, Dict
from openai import OpenAI

class ContextCompressor:
    """Condense retrieved documents to the parts relevant to a query."""

    def __init__(self, model: str = "gpt-4o-mini"):
        """Create the OpenAI client and remember which chat model to use."""
        self.client = OpenAI()
        self.model = model

    def compress(self, query: str, documents: List[str], max_tokens: int = 2000) -> str:
        """Extract the query-relevant content from ``documents``.

        The documents are joined with a ``---`` separator and sent to the chat
        model together with the query; ``max_tokens`` is used both in the
        instruction text and as the completion limit.

        Args:
            query: The user query the extraction should focus on.
            documents: Retrieved document texts.
            max_tokens: Upper bound on the size of the compressed output.

        Returns:
            The model's extraction. An empty string is returned when there are
            no documents (no API call is made) or the model returns no text.
        """
        # Nothing to compress: do not spend an API call on an empty context.
        if not documents:
            return ""

        combined = "\n\n---\n\n".join(documents)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": f"""從以下文件中提取與查詢最相關的內容。
要求:
1. 只保留直接相關的資訊
2. 去除冗餘和重複
3. 保持事實準確性
4. 輸出不超過{max_tokens}個token"""},
                {"role": "user", "content": f"查詢: {query}\n\n文件:\n{combined}"},
            ],
            temperature=0.0,
            max_tokens=max_tokens,
        )

        # ``content`` may be None; honour the declared ``str`` return type.
        return response.choices[0].message.content or ""

最佳化策略六:自適應檢索

# adaptive_retrieval.py
class AdaptiveRetriever:
    """Choose between a single search and a multi-query search per query."""

    def __init__(self, hybrid_retriever, reranker, query_rewriter):
        """Wire together the retriever, reranker and query rewriter.

        Args:
            hybrid_retriever: Object exposing ``search(query, top_k=...)``
                that returns dicts with a ``content`` key.
            reranker: Object exposing ``rerank(query, documents, top_k=...)``.
            query_rewriter: Object exposing ``rewrite`` and ``expand_queries``.
        """
        self.hybrid_retriever = hybrid_retriever
        self.reranker = reranker
        self.query_rewriter = query_rewriter
        # Per-strategy call counters. "rewritten" is initialised here but is
        # not incremented anywhere in this class.
        self.retrieval_stats = {"simple": 0, "expanded": 0, "rewritten": 0}

    def retrieve(self, query: str, top_k: int = 5) -> List[Dict]:
        """Retrieve and rerank documents for ``query``.

        If rewriting leaves the query unchanged (ignoring case), a single
        search for ``top_k * 2`` candidates is run. Otherwise the rewritten
        query and the expansion variants of the original query are each
        searched for ``top_k`` candidates and the hits are de-duplicated by
        content. The candidates are then reranked against the rewritten query.

        Args:
            query: The user's original query.
            top_k: Number of documents to return.

        Returns:
            The reranker's output, or an empty list when no candidate was
            retrieved.
        """
        rewritten_query = self.query_rewriter.rewrite(query)

        if rewritten_query.lower() == query.lower():
            results = self.hybrid_retriever.search(query, top_k=top_k * 2)
            self.retrieval_stats["simple"] += 1
        else:
            # Search with the rewritten query too; otherwise the rewrite would
            # only influence reranking and never what is retrieved.
            search_queries = [rewritten_query]
            for variant in self.query_rewriter.expand_queries(query, n=2):
                if variant not in search_queries:
                    search_queries.append(variant)

            # De-duplicate on the text itself, not on hash(text), so that a
            # hash collision cannot drop a distinct document.
            seen = set()
            results = []
            for q in search_queries:
                for hit in self.hybrid_retriever.search(q, top_k=top_k):
                    if hit["content"] not in seen:
                        seen.add(hit["content"])
                        results.append(hit)
            self.retrieval_stats["expanded"] += 1

        # Nothing retrieved: there is nothing for the reranker to score.
        if not results:
            return []

        return self.reranker.rerank(rewritten_query, results, top_k=top_k)

避坑指南

序號 坑點 症狀 解決方案
1 Chunk size一刀切 表格/程式碼被截斷,語義不完整 按內容類型使用不同chunk_size:文字512、程式碼1024、表格256
2 嵌入模型與查詢不匹配 中文查詢英文文件,檢索效果差 使用多語言嵌入模型,如 bge-m3 或 multilingual-e5-large
3 重排序模型過慢 檢索+重排序延遲超過2秒 使用輕量級重排序模型 bge-reranker-v2-m3,或快取重排序結果
4 Query改寫偏離原意 改寫後查詢遺失使用者核心意圖 設定temperature=0.1,保留原始查詢作為備選
5 混合檢索alpha值固定 不同查詢類型最佳alpha不同 根據查詢類型動態調整:關鍵詞查詢alpha=0.3,語義查詢alpha=0.8

報錯排查

報錯資訊 原因 解決方法
ragas: openai API key not set OpenAI API Key未配置 設定 OPENAI_API_KEY 環境變數
CrossEncoder: model not found 重排序模型未下載 huggingface-cli download BAAI/bge-reranker-v2-m3
ChromaDB: collection not found 向量資料庫集合不存在 先建立集合再插入文件
Token limit exceeded 檢索文件總長度超LLM上下文視窗 使用ContextCompressor壓縮,或減少top_k
CUDA out of memory 嵌入/重排序模型GPU記憶體不足 使用CPU推理,或切換為更小的模型
FAISS index not built 向量索引未建構 index = faiss.IndexFlatIP(dimension); index.add(vectors)
JSON decode error in RAGAS 評估資料格式不正確 確認Dataset包含question/contexts/answer/ground_truth四列
RecursiveCharacterTextSplitter: empty chunk 文件內容為空或分隔符匹配異常 新增空chunk過濾,檢查文件編碼
OpenAI: rate limit exceeded API呼叫頻率超限 新增重試邏輯和請求間隔
SentenceTransformer: SSL error HuggingFace下載被牆 設定HF映象 HF_ENDPOINT=https://hf-mirror.com

進階最佳化

1. 評估驅動的迭代最佳化

def optimization_loop(eval_dataset, strategies, max_iterations=5):
    """Evaluate every strategy repeatedly and report the best-scoring one.

    Each strategy is applied to the original ``eval_dataset`` in every
    iteration. The score is the equally weighted mean of the ``faithfulness``
    and ``answer_relevancy`` results.

    Args:
        eval_dataset: Dataset handed to each strategy function.
        strategies: Mapping of strategy name to a callable that takes the
            dataset and returns the dataset to evaluate.
        max_iterations: Number of passes over all strategies.

    Returns:
        A ``(best_config, best_score)`` tuple; ``(None, 0)`` when no strategy
        scored above zero.
    """
    best_config, best_score = None, 0

    for i in range(max_iterations):
        for strategy_name, strategy_fn in strategies.items():
            result = evaluate(
                strategy_fn(eval_dataset),
                metrics=[faithfulness, answer_relevancy],
            )
            score = result['faithfulness'] * 0.5 + result['answer_relevancy'] * 0.5

            # Only a strictly better score replaces the current best.
            if not (score > best_score):
                continue

            best_config, best_score = strategy_name, score
            print(f"迭代{i+1}: {strategy_name} 得分 {score:.4f} (新最優)")

    return best_config, best_score

2. 多粒度索引

索引粒度 Chunk大小 用途 檢索方式
段落級 512 tokens 精確檢索 向量相似度
文件級 2000 tokens 上下文補充 父子檢索
摘要級 100 tokens 快速篩選 關鍵詞匹配

3. A/B測試框架

class RAGABTest:
    """Randomly route queries to one of two retriever variants and compare them."""

    def __init__(self, variant_a, variant_b):
        """Store both variants; each must expose ``retrieve(query)``."""
        self.variant_a = variant_a
        self.variant_b = variant_b
        # Collected outcomes per variant, as {"query": ..., "result": ...} dicts.
        self.results = {"a": [], "b": []}

    def run(self, queries: List[str], sample_ratio: float = 0.5):
        """Send each query to variant A with probability ``sample_ratio``, else B.

        The outcome is appended to ``self.results`` under the chosen variant.
        """
        import random

        arms = {"a": self.variant_a, "b": self.variant_b}
        for query in queries:
            if random.random() < sample_ratio:
                variant = "a"
            else:
                variant = "b"
            outcome = arms[variant].retrieve(query)
            self.results[variant].append({"query": query, "result": outcome})

    def compare(self):
        """Evaluate the collected results of both variants with ragas.

        Returns:
            A dict with the evaluation of each variant under ``variant_a`` and
            ``variant_b``.
        """
        from ragas import evaluate

        # NOTE(review): ``_to_dataset`` is not defined in this class as shown;
        # it has to be supplied (e.g. by a subclass) before compare() can run.
        report = {}
        report["variant_a"] = evaluate(self._to_dataset(self.results["a"]))
        report["variant_b"] = evaluate(self._to_dataset(self.results["b"]))
        return report

對比分析

最佳化策略 準確率提升 延遲影響 實現複雜度 推薦優先級
Chunk最佳化 +10-15% ★★★★★
檢索重排序 +8-12% +50-200ms ★★★★★
混合檢索 +5-10% +20ms ★★★★
Query改寫 +5-8% +100ms ★★★★
上下文壓縮 +3-5% +200ms ★★★
自適應檢索 +5-10% +100ms ★★★
RAG評估框架 指標覆蓋 易用性 LLM依賴 開源
RAGAS ★★★★★ ★★★★
DeepEval ★★★★ ★★★★★
TruLens ★★★ ★★★
ARES ★★★★ ★★★

總結:RAG最佳化不是調個參數就能搞定的——它是一個「評估→最佳化→再評估」的閉環過程。先用RAGAS建立基線,再用6種策略逐個擊破:Chunk最佳化解決資訊截斷、重排序提升Top-K準確率、混合檢索擴大召回、Query改寫彌合語義鴻溝、上下文壓縮減少噪聲、自適應檢索智慧選路。每種策略貢獻3-15%的準確率提升(各策略的增益並非簡單相加),組合起來可讓準確率從60%提升到85%+,相對提升約40%。2026年,不做評估的RAG最佳化,就是盲人摸象。


線上工具推薦

本站提供瀏覽器本地工具,免註冊即可試用 →

#Python#RAG#评估框架#RAGAS#检索增强#向量检索#Chunk优化#重排序