04 - RAG Evaluation Basics
This chapter introduces evaluation methods and metrics for RAG systems, helping you understand and measure how well a RAG system performs.
In [ ]:
# Install the required libraries
!pip install langchain langchain-openai chromadb numpy pandas matplotlib
In [ ]:
import numpy as np
from typing import List, Dict

def calculate_hit_rate(retrieved_docs: List[str], relevant_docs: List[str]) -> float:
    """Per-query hit: 1.0 if at least one relevant document was retrieved.

    Averaged over a query set, this gives the Hit Rate:
    Hit Rate = (queries with at least one relevant document retrieved) / (total queries)
    """
    return 1.0 if any(doc in retrieved_docs for doc in relevant_docs) else 0.0

def calculate_mrr(retrieved_docs: List[str], relevant_docs: List[str]) -> float:
    """Per-query reciprocal rank: 1 / (rank of the first relevant document).

    Averaged over a query set, this gives the Mean Reciprocal Rank (MRR).
    """
    for i, doc in enumerate(retrieved_docs, 1):
        if doc in relevant_docs:
            return 1.0 / i
    return 0.0

def calculate_precision(retrieved_docs: List[str], relevant_docs: List[str]) -> float:
    """Precision = (relevant documents retrieved) / (total documents retrieved)"""
    if not retrieved_docs:
        return 0.0
    relevant_retrieved = len(set(retrieved_docs) & set(relevant_docs))
    return relevant_retrieved / len(retrieved_docs)

def calculate_recall(retrieved_docs: List[str], relevant_docs: List[str], total_relevant: int) -> float:
    """Recall = (relevant documents retrieved) / (total relevant documents)"""
    if total_relevant == 0:
        return 0.0
    relevant_retrieved = len(set(retrieved_docs) & set(relevant_docs))
    return relevant_retrieved / total_relevant

# Example usage
retrieved = ['doc1', 'doc3', 'doc5', 'doc7']
relevant = ['doc2', 'doc3', 'doc4', 'doc7', 'doc8']

print(f"Hit Rate: {calculate_hit_rate(retrieved, relevant):.2f}")
print(f"MRR: {calculate_mrr(retrieved, relevant):.2f}")
print(f"Precision@4: {calculate_precision(retrieved, relevant):.2f}")
print(f"Recall@4: {calculate_recall(retrieved, relevant, len(relevant)):.2f}")
3.1 Faithfulness
In [ ]:
def evaluate_faithfulness(generated_answer: str, context: List[str]) -> Dict:
    """Evaluate the faithfulness of a generated answer.

    Checks whether the claims in the answer are supported by the retrieved context.
    """
    # Simplified evaluation logic
    context_text = ' '.join(context)
    # Split the answer into statements, dropping empty fragments
    # (a naive sentence split; production systems extract claims with an LLM)
    statements = [s.strip() for s in generated_answer.split('.') if s.strip()]
    supported = 0
    for statement in statements:
        # A statement counts as supported if any of its words appear in the
        # context (a deliberately loose heuristic, for illustration only)
        if any(word in context_text for word in statement.split()):
            supported += 1
    faithfulness_score = supported / len(statements) if statements else 0
    return {
        'score': faithfulness_score,
        'supported_statements': supported,
        'total_statements': len(statements)
    }

# Example
answer = "RAG combines retrieval and generation techniques. It can improve answer accuracy."
context = [
    "RAG (Retrieval-Augmented Generation) is an AI technique that combines retrieval and generation",
    "RAG can improve LLM accuracy through an external knowledge base"
]

result = evaluate_faithfulness(answer, context)
print(f"Faithfulness: {result['score']:.2f}")
3.2 Relevancy
In [ ]:
def evaluate_relevancy(query: str, answer: str) -> float:
    """Evaluate how relevant the answer is to the query.

    Simplified version: measures keyword overlap between query and answer
    (punctuation is not stripped, so matches are approximate).
    """
    query_words = set(query.lower().split())
    answer_words = set(answer.lower().split())
    overlap = len(query_words & answer_words)
    relevancy = overlap / len(query_words) if query_words else 0
    return relevancy

# Example
query = "What is RAG technology?"
answer = "RAG is a retrieval-augmented generation technology combining retrieval and generation methods."

score = evaluate_relevancy(query, answer)
print(f"Relevancy score: {score:.2f}")
In [ ]:
class SimpleRAGEvaluator:
    """A simplified RAG evaluator.

    For real projects, consider using the RAGAS library:
    !pip install ragas
    """

    def __init__(self):
        self.results = []

    def evaluate_query(
        self,
        query: str,
        retrieved_docs: List[str],
        relevant_docs: List[str],
        generated_answer: str,
        context: List[str]
    ) -> Dict:
        """Evaluate a single query."""
        # Retrieval metrics
        hit_rate = calculate_hit_rate(retrieved_docs, relevant_docs)
        mrr = calculate_mrr(retrieved_docs, relevant_docs)
        precision = calculate_precision(retrieved_docs, relevant_docs)

        # Generation metrics
        faithfulness = evaluate_faithfulness(generated_answer, context)
        relevancy = evaluate_relevancy(query, generated_answer)

        result = {
            'query': query,
            'hit_rate': hit_rate,
            'mrr': mrr,
            'precision': precision,
            'faithfulness': faithfulness['score'],
            'relevancy': relevancy
        }
        self.results.append(result)
        return result

    def get_average_scores(self) -> Dict:
        """Compute average scores across all evaluated queries."""
        if not self.results:
            return {}
        metrics = self.results[0].keys()
        averages = {}
        for metric in metrics:
            if metric == 'query':
                continue
            values = [r[metric] for r in self.results]
            averages[metric] = np.mean(values)
        return averages

# Usage example
evaluator = SimpleRAGEvaluator()

# Evaluate a few queries
test_queries = [
    {
        'query': 'What is RAG?',
        'retrieved': ['doc1', 'doc2', 'doc3'],
        'relevant': ['doc1', 'doc3'],
        'answer': 'RAG is a retrieval-augmented generation technique.',
        'context': ['RAG combines retrieval and generation', 'It improves LLM accuracy']
    },
    {
        'query': 'How do you optimize RAG?',
        'retrieved': ['doc4', 'doc5', 'doc6'],
        'relevant': ['doc4'],
        'answer': 'It can be optimized with hybrid retrieval.',
        'context': ['Hybrid retrieval combines vector search and keyword search']
    }
]

for tq in test_queries:
    result = evaluator.evaluate_query(
        tq['query'],
        tq['retrieved'],
        tq['relevant'],
        tq['answer'],
        tq['context']
    )
    print(f"\nQuery: {tq['query']}")
    print(f"Hit Rate: {result['hit_rate']:.2f}, MRR: {result['mrr']:.2f}")
    print(f"Faithfulness: {result['faithfulness']:.2f}, Relevancy: {result['relevancy']:.2f}")

print("\n=== Average Scores ===")
averages = evaluator.get_average_scores()
for metric, score in averages.items():
    print(f"{metric}: {score:.2f}")
6. Exercises
Exercise 1: Implement F1-score
def calculate_f1_score(precision: float, recall: float) -> float:
    """Compute the F1 score (the harmonic mean of precision and recall)."""
    # TODO: implement the F1-score calculation
    pass
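Hint: F1 = 2 * (Precision * Recall) / (Precision + Recall), conventionally defined as 0 when both precision and recall are 0.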
Exercise 2: Implement NDCG (Normalized Discounted Cumulative Gain)
def calculate_ndcg(retrieved_docs: List[str], relevant_docs: List[str], k: int = 10) -> float:
    """Compute NDCG@k.

    NDCG accounts for both the position of each document in the result list
    and its degree of relevance.
    """
    # TODO: implement the NDCG calculation
    pass
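Hint: with binary relevance labels (rel_i in {0, 1}), DCG@k = sum over i = 1..k of rel_i / log2(i + 1), and NDCG@k = DCG@k / IDCG@k, where IDCG@k is the DCG@k of the ideal ordering (all relevant documents ranked first).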
Exercise 3: Visualize Evaluation Results
import matplotlib.pyplot as plt

def plot_evaluation_results(results: List[Dict]):
    """Plot a comparison chart of evaluation results."""
    # TODO: create charts comparing the performance of different configurations
    pass
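One possible starting point, sketched below, is a grouped bar chart with one group per metric and one bar per evaluated result; plot_results_sketch and its layout choices are illustrative, not the reference answer:

# Illustrative sketch: one bar group per metric, one bar per result.
import matplotlib.pyplot as plt
import numpy as np

def plot_results_sketch(results: List[Dict]):
    metrics = [m for m in results[0] if m != 'query']
    x = np.arange(len(metrics))
    width = 0.8 / len(results)
    for i, r in enumerate(results):
        plt.bar(x + i * width, [r[m] for m in metrics], width, label=r['query'])
    plt.xticks(x + width * (len(results) - 1) / 2, metrics)
    plt.ylabel('Score')
    plt.legend()
    plt.show()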
See exercises/04_rag_evaluation_answers.ipynb for reference answers.
7. Summary
In this chapter you learned:
✅ Retrieval quality evaluation
- Hit Rate, MRR, Precision, Recall
✅ Generation quality evaluation
- Faithfulness, Relevancy
✅ End-to-end evaluation
- Building an evaluation framework
- Computing aggregate metrics
✅ Best practices
- Building a test dataset
- Evaluating and monitoring regularly
Next step: Chapter 5 brings together everything covered so far into a complete project!
Congratulations on completing Chapter 4! 🎉
You now have the foundations of RAG evaluation!