1. 环境准备¶
In [ ]:
Copied!
# Import the required libraries.
import numpy as np
from typing import List, Dict, Any
from pathlib import Path

# Sanity-check the environment.
print("检查环境...")
print(f"NumPy版本: {np.__version__}")

# Probe for sentence-transformers; the notebook falls back to mock data
# when it is not installed.
try:
    from sentence_transformers import SentenceTransformer
    print("✅ sentence-transformers已安装")
except ImportError:
    print("⚠️ sentence-transformers未安装")
    print(" 请运行: pip install sentence-transformers")
    print(" 将使用模拟数据演示")

print("\n环境准备完成!")
# Environment setup: verify NumPy and the optional sentence-transformers.
import numpy as np
from typing import List, Dict, Any
from pathlib import Path

print("检查环境...")
print(f"NumPy版本: {np.__version__}")

# sentence-transformers is optional; report which path the demo will take.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    print("⚠️ sentence-transformers未安装")
    print(" 请运行: pip install sentence-transformers")
    print(" 将使用模拟数据演示")
else:
    print("✅ sentence-transformers已安装")

print("\n环境准备完成!")
In [ ]:
Copied!
# Demo: converting text to vectors.
class SimpleEmbeddingModel:
    """
    Simplified embedding model (for demonstration only).

    A few known words get preset vectors; any other text is mapped to a
    random vector of the configured dimensionality.
    """

    def __init__(self, embedding_dim: int = 768):
        self.embedding_dim = embedding_dim
        # Mock setup: preset vectors for a few common words. The two fruit
        # words are centred at +0.5 per dimension and "电脑" at -0.5, so
        # related words land close together in the vector space.
        base = np.full(embedding_dim, 0.5)
        self.word_vectors = {
            "苹果": np.random.randn(embedding_dim) * 0.1 + base,
            "香蕉": np.random.randn(embedding_dim) * 0.1 + base,
            "电脑": np.random.randn(embedding_dim) * 0.1 - base,
        }

    def encode(self, text: str) -> np.ndarray:
        """
        Encode a piece of text into a vector.
        """
        # Simplified lookup: return the vector of the first preset word
        # contained in the text, otherwise fall back to a random vector.
        for word, vector in self.word_vectors.items():
            if word in text:
                return vector
        return np.random.randn(self.embedding_dim)


# Build the demo model.
model = SimpleEmbeddingModel(embedding_dim=128)
print("嵌入模型演示:")
print("-" * 50)

# Encode a few sample texts.
texts = ["苹果", "香蕉", "电脑"]
embeddings = {text: model.encode(text) for text in texts}
for text, emb in embeddings.items():
    print(f"\n文本: '{text}'")
    print(f"向量维度: {emb.shape}")
    print(f"向量前5维: {emb[:5]}")
# Demo: converting text to vectors.
class SimpleEmbeddingModel:
    """Toy embedding model used purely for demonstration."""

    def __init__(self, embedding_dim: int = 768):
        self.embedding_dim = embedding_dim
        # Preset (mock) vectors for a handful of common words; fruits share
        # an offset of +0.5 per dimension, "电脑" uses -0.5.
        self.word_vectors = {
            "苹果": np.random.randn(embedding_dim) * 0.1 + np.array([0.5] * embedding_dim),
            "香蕉": np.random.randn(embedding_dim) * 0.1 + np.array([0.5] * embedding_dim),
            "电脑": np.random.randn(embedding_dim) * 0.1 + np.array([-0.5] * embedding_dim),
        }

    def encode(self, text: str) -> np.ndarray:
        """Encode *text*: preset vector if a known word occurs, else random."""
        hit = next((w for w in self.word_vectors if w in text), None)
        if hit is not None:
            return self.word_vectors[hit]
        return np.random.randn(self.embedding_dim)


# Instantiate and demonstrate.
model = SimpleEmbeddingModel(embedding_dim=128)
print("嵌入模型演示:")
print("-" * 50)

texts = ["苹果", "香蕉", "电脑"]
embeddings = {text: model.encode(text) for text in texts}
for text, emb in embeddings.items():
    print(f"\n文本: '{text}'")
    print(f"向量维度: {emb.shape}")
    print(f"向量前5维: {emb[:5]}")
2.2 计算向量相似度¶
In [ ]:
Copied!
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Compute the cosine similarity between two vectors.

    Args:
        vec1, vec2: input vectors of equal length.

    Returns:
        Similarity score in [-1, 1]. Returns 0.0 when either vector has
        zero norm (a plain division would produce NaN plus a runtime
        warning).
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    # Guard against division by zero for degenerate (all-zero) vectors.
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return float(np.dot(vec1, vec2) / (norm1 * norm2))
# Print the pairwise similarity matrix for the demo texts.
print("相似度矩阵:")
print("-" * 50)
print(f"{'文本':<10} {'苹果':<10} {'香蕉':<10} {'电脑':<10}")
print("-" * 50)
for text1 in texts:
    cells = [text1] + [
        f"{cosine_similarity(embeddings[text1], embeddings[text2]):.3f}"
        for text2 in texts
    ]
    print(f"{cells[0]:<10} {cells[1]:<10} {cells[2]:<10} {cells[3]:<10}")

print("\n观察:")
print("- '苹果' 和 '香蕉' 相似度高(都是水果)")
print("- '电脑' 与其他相似度低(不同类别)")
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Compute the cosine similarity between two vectors.

    Args:
        vec1, vec2: input vectors of equal length.

    Returns:
        Similarity score in [-1, 1]; 0.0 for zero-norm input (avoids the
        NaN a bare division would produce).
    """
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    # Zero-norm guard: cosine is undefined for an all-zero vector.
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return float(np.dot(vec1, vec2) / (norm1 * norm2))
# Render the similarity matrix between every pair of demo texts.
print("相似度矩阵:")
print("-" * 50)
print(f"{'文本':<10} {'苹果':<10} {'香蕉':<10} {'电脑':<10}")
print("-" * 50)
for text1 in texts:
    row = [text1]
    for text2 in texts:
        score = cosine_similarity(embeddings[text1], embeddings[text2])
        row.append(f"{score:.3f}")
    print(f"{row[0]:<10} {row[1]:<10} {row[2]:<10} {row[3]:<10}")

print("\n观察:")
print("- '苹果' 和 '香蕉' 相似度高(都是水果)")
print("- '电脑' 与其他相似度低(不同类别)")
In [ ]:
Copied!
# Comparison table of mainstream embedding models.
models_comparison = [
    {
        "name": "all-MiniLM-L6-v2",
        "dim": 384,
        "speed": "快",
        "quality": "中",
        "size": "80MB",
        "use_case": "快速检索、资源受限"
    },
    {
        "name": "all-mpnet-base-v2",
        "dim": 768,
        "speed": "中",
        "quality": "高",
        "size": "420MB",
        "use_case": "高质量检索"
    },
    {
        "name": "bge-large-zh-v1.5",
        "dim": 1024,
        "speed": "中",
        "quality": "高(中文)",
        "size": "1.34GB",
        "use_case": "中文高质量检索"
    },
    {
        "name": "e5-large-v2",
        "dim": 1024,
        "speed": "中",
        "quality": "高",
        "size": "1.34GB",
        "use_case": "多语言检索"
    },
    {
        "name": "text-embedding-3-small",
        "dim": 1536,
        "speed": "API调用",
        "quality": "很高",
        "size": "-",
        "use_case": "生产环境(OpenAI)"
    }
]

print("主流嵌入模型对比:")
print("=" * 100)
print(f"{'模型名称':<25} {'维度':<8} {'速度':<8} {'质量':<10} {'大小':<10} {'适用场景':<20}")
print("=" * 100)
# NOTE: the loop variable must not be named `model` — the original rebound
# the global `model` (the SimpleEmbeddingModel instance) to the last dict,
# which breaks the later EmbeddingEvaluator / DocumentRetriever cells.
for entry in models_comparison:
    print(f"{entry['name']:<25} {entry['dim']:<8} {entry['speed']:<8} "
          f"{entry['quality']:<10} {entry['size']:<10} {entry['use_case']:<20}")
# Comparison table of mainstream embedding models.
models_comparison = [
    {
        "name": "all-MiniLM-L6-v2",
        "dim": 384,
        "speed": "快",
        "quality": "中",
        "size": "80MB",
        "use_case": "快速检索、资源受限"
    },
    {
        "name": "all-mpnet-base-v2",
        "dim": 768,
        "speed": "中",
        "quality": "高",
        "size": "420MB",
        "use_case": "高质量检索"
    },
    {
        "name": "bge-large-zh-v1.5",
        "dim": 1024,
        "speed": "中",
        "quality": "高(中文)",
        "size": "1.34GB",
        "use_case": "中文高质量检索"
    },
    {
        "name": "e5-large-v2",
        "dim": 1024,
        "speed": "中",
        "quality": "高",
        "size": "1.34GB",
        "use_case": "多语言检索"
    },
    {
        "name": "text-embedding-3-small",
        "dim": 1536,
        "speed": "API调用",
        "quality": "很高",
        "size": "-",
        "use_case": "生产环境(OpenAI)"
    }
]

print("主流嵌入模型对比:")
print("=" * 100)
print(f"{'模型名称':<25} {'维度':<8} {'速度':<8} {'质量':<10} {'大小':<10} {'适用场景':<20}")
print("=" * 100)
# Renamed loop variable: `for model in ...` clobbered the global embedding
# model instance that later cells pass to the evaluator/retriever.
for entry in models_comparison:
    print(f"{entry['name']:<25} {entry['dim']:<8} {entry['speed']:<8} "
          f"{entry['quality']:<10} {entry['size']:<10} {entry['use_case']:<20}")
3.2 模型选择决策树¶
In [ ]:
Copied!
def choose_embedding_model(
    language: str = "zh",
    quality_priority: bool = False,
    resource_limited: bool = False,
    use_api: bool = False
) -> str:
    """
    Pick a suitable embedding model for the given requirements.

    Args:
        language: primary language ("zh", "en"; anything else means multilingual).
        quality_priority: prefer retrieval quality over speed/size.
        resource_limited: running under tight memory/compute constraints.
        use_api: use a hosted API instead of a local model.

    Returns:
        The recommended model name.
    """
    if use_api:
        return "text-embedding-3-small (OpenAI)"
    if language == "zh":
        if resource_limited:
            return "bge-small-zh-v1.5"
        if quality_priority:
            return "bge-large-zh-v1.5"
        return "bge-base-zh-v1.5"
    if language == "en":
        if resource_limited:
            return "all-MiniLM-L6-v2"
        if quality_priority:
            return "all-mpnet-base-v2"
        return "all-MiniLM-L12-v2"
    # Multilingual fallback.
    return "e5-large-v2" if quality_priority else "e5-base-v2"


# Exercise the selection function.
print("模型选择示例:")
print("-" * 50)
scenarios = [
    {"language": "zh", "quality_priority": True, "resource_limited": False},
    {"language": "en", "quality_priority": False, "resource_limited": True},
    {"language": "multi", "quality_priority": True, "resource_limited": False},
]
# NOTE: bind to `recommended`, not `model` — the original rebound the global
# `model` (the embedding-model instance) to a string, breaking later cells.
for i, scenario in enumerate(scenarios, 1):
    recommended = choose_embedding_model(**scenario)
    print(f"\n场景{i}: {scenario}")
    print(f"推荐: {recommended}")
def choose_embedding_model(
    language: str = "zh",
    quality_priority: bool = False,
    resource_limited: bool = False,
    use_api: bool = False
) -> str:
    """
    Recommend an embedding model for the given constraints.

    Args:
        language: primary language ("zh", "en"; other values mean multilingual).
        quality_priority: prioritise retrieval quality.
        resource_limited: constrained memory/compute environment.
        use_api: prefer a hosted API model.

    Returns:
        The recommended model name.
    """
    if use_api:
        return "text-embedding-3-small (OpenAI)"
    if language == "zh":
        if resource_limited:
            return "bge-small-zh-v1.5"
        return "bge-large-zh-v1.5" if quality_priority else "bge-base-zh-v1.5"
    if language == "en":
        if resource_limited:
            return "all-MiniLM-L6-v2"
        return "all-mpnet-base-v2" if quality_priority else "all-MiniLM-L12-v2"
    # Multilingual default.
    return "e5-large-v2" if quality_priority else "e5-base-v2"


# Try the selector on a few scenarios.
print("模型选择示例:")
print("-" * 50)
scenarios = [
    {"language": "zh", "quality_priority": True, "resource_limited": False},
    {"language": "en", "quality_priority": False, "resource_limited": True},
    {"language": "multi", "quality_priority": True, "resource_limited": False},
]
# Renamed from `model` to avoid clobbering the global embedding-model
# instance used by later cells.
for i, scenario in enumerate(scenarios, 1):
    recommended = choose_embedding_model(**scenario)
    print(f"\n场景{i}: {scenario}")
    print(f"推荐: {recommended}")
In [ ]:
Copied!
class EmbeddingEvaluator:
    """
    Evaluates embedding quality via retrieval metrics (hit rate, MRR).
    """

    def __init__(self, model):
        self.model = model

    def evaluate_retrieval(
        self,
        queries: List[str],
        documents: List[str],
        relevant_docs: List[List[int]],
        top_k: int = 5
    ) -> Dict[str, float]:
        """
        Evaluate retrieval quality.

        Args:
            queries: list of query strings.
            documents: list of candidate documents.
            relevant_docs: relevant document indices for each query.
            top_k: number of top-ranked results considered per query.

        Returns:
            Metrics dict with "hit_rate", "mrr" and "num_queries".
        """
        # Embed every document once up front.
        doc_embeddings = [self.model.encode(doc) for doc in documents]

        hits = 0
        reciprocal_ranks = []
        for query, relevant in zip(queries, relevant_docs):
            query_emb = self.model.encode(query)
            scores = [cosine_similarity(query_emb, emb) for emb in doc_embeddings]
            # Indices of the top-k most similar documents, best first.
            ranked = np.argsort(scores)[::-1][:top_k]

            # Hit rate: did any relevant document make the top-k?
            if any(idx in relevant for idx in ranked):
                hits += 1

            # MRR: reciprocal rank of the first relevant hit (0 if none).
            rr = 0.0
            for rank, idx in enumerate(ranked, 1):
                if idx in relevant:
                    rr = 1.0 / rank
                    break
            reciprocal_ranks.append(rr)

        return {
            "hit_rate": hits / len(queries),
            "mrr": np.mean(reciprocal_ranks),
            "num_queries": len(queries)
        }
# Build the evaluator around the demo embedding model.
evaluator = EmbeddingEvaluator(model)

# Test fixtures.
test_queries = [
    "苹果好吃吗",
    "电脑配置",
]
test_docs = [
    "苹果是一种水果,味道甜美",
    "电脑是一种电子设备",
    "香蕉也是水果",
]
# Relevant document indices, one list per query.
test_relevant = [
    [0],
    [1],
]

# Run the evaluation.
results = evaluator.evaluate_retrieval(
    queries=test_queries,
    documents=test_docs,
    relevant_docs=test_relevant
)

print("评估结果:")
print("-" * 30)
for metric, value in results.items():
    formatted = f"{value:.3f}" if isinstance(value, float) else str(value)
    print(f"{metric}: {formatted}")
class EmbeddingEvaluator:
    """
    Embedding quality evaluator based on retrieval metrics.
    """

    def __init__(self, model):
        self.model = model

    def evaluate_retrieval(
        self,
        queries: List[str],
        documents: List[str],
        relevant_docs: List[List[int]],
        top_k: int = 5
    ) -> Dict[str, float]:
        """
        Score retrieval quality over a labelled query/document set.

        Args:
            queries: query strings.
            documents: candidate documents.
            relevant_docs: per-query relevant document indices.
            top_k: cutoff for the ranked list.

        Returns:
            Dict with "hit_rate", "mrr" and "num_queries".
        """
        # Pre-compute document embeddings.
        corpus = [self.model.encode(doc) for doc in documents]

        hit_count = 0
        mrr_scores = []
        for query, relevant in zip(queries, relevant_docs):
            q_vec = self.model.encode(query)
            sims = [cosine_similarity(q_vec, d_vec) for d_vec in corpus]
            top = np.argsort(sims)[::-1][:top_k]

            # Hit rate contribution.
            if any(i in relevant for i in top):
                hit_count += 1

            # MRR contribution: 1/rank of first relevant result, else 0.
            for rank, i in enumerate(top, 1):
                if i in relevant:
                    mrr_scores.append(1.0 / rank)
                    break
            else:
                mrr_scores.append(0.0)

        return {
            "hit_rate": hit_count / len(queries),
            "mrr": np.mean(mrr_scores),
            "num_queries": len(queries)
        }
# Create the evaluator.
evaluator = EmbeddingEvaluator(model)

# Evaluation fixtures: two queries, three documents.
test_queries = [
    "苹果好吃吗",
    "电脑配置",
]
test_docs = [
    "苹果是一种水果,味道甜美",
    "电脑是一种电子设备",
    "香蕉也是水果",
]
test_relevant = [
    [0],  # relevant docs for the first query
    [1],  # relevant docs for the second query
]

# Evaluate.
results = evaluator.evaluate_retrieval(
    queries=test_queries,
    documents=test_docs,
    relevant_docs=test_relevant
)

print("评估结果:")
print("-" * 30)
for metric, value in results.items():
    if isinstance(value, float):
        print(f"{metric}: {value:.3f}")
    else:
        print(f"{metric}: {value}")
In [ ]:
Copied!
class DocumentRetriever:
    """
    Embedding-based document retrieval system.
    """

    def __init__(self, model):
        self.model = model
        self.documents = []
        self.embeddings = []

    def add_documents(self, docs: List[str]):
        """
        Add documents to the index, embedding each one.
        """
        self.documents.extend(docs)
        self.embeddings.extend(self.model.encode(doc) for doc in docs)

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """
        Return the top_k indexed documents most similar to the query.
        """
        query_emb = self.model.encode(query)
        scores = [cosine_similarity(query_emb, emb) for emb in self.embeddings]
        # Rank by similarity, best first.
        ranked = np.argsort(scores)[::-1][:top_k]
        return [
            {"document": self.documents[idx], "score": scores[idx], "index": idx}
            for idx in ranked
        ]
# Build the retrieval system around the demo model.
retriever = DocumentRetriever(model)

# Index some documents.
docs = [
    "机器学习是人工智能的一个分支",
    "深度学习使用神经网络",
    "Python是一种编程语言",
    "苹果公司生产iPhone",
]
retriever.add_documents(docs)

# Try a few queries.
test_queries = [
    "什么是AI",
    "编程语言",
    "智能手机",
]
for query in test_queries:
    print(f"\n查询: {query}")
    print("-" * 40)
    for i, result in enumerate(retriever.retrieve(query, top_k=2), 1):
        print(f"\n结果 {i} (相关度: {result['score']:.3f}):")
        print(f" {result['document']}")
class DocumentRetriever:
    """
    Document retrieval system built on vector embeddings.
    """

    def __init__(self, model):
        self.model = model
        self.documents = []
        self.embeddings = []

    def add_documents(self, docs: List[str]):
        """
        Index documents: store the text and its embedding.
        """
        self.documents.extend(docs)
        for doc in docs:
            self.embeddings.append(self.model.encode(doc))

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """
        Retrieve the documents most relevant to *query*.
        """
        q_vec = self.model.encode(query)
        sims = [cosine_similarity(q_vec, d_vec) for d_vec in self.embeddings]
        results = []
        for idx in np.argsort(sims)[::-1][:top_k]:
            results.append({
                "document": self.documents[idx],
                "score": sims[idx],
                "index": idx
            })
        return results
# Set up the retriever.
retriever = DocumentRetriever(model)

# Documents to index.
docs = [
    "机器学习是人工智能的一个分支",
    "深度学习使用神经网络",
    "Python是一种编程语言",
    "苹果公司生产iPhone",
]
retriever.add_documents(docs)

# Queries to exercise retrieval with.
test_queries = [
    "什么是AI",
    "编程语言",
    "智能手机",
]
for query in test_queries:
    print(f"\n查询: {query}")
    print("-" * 40)
    results = retriever.retrieve(query, top_k=2)
    for i, result in enumerate(results, 1):
        print(f"\n结果 {i} (相关度: {result['score']:.3f}):")
        print(f" {result['document']}")
6. 练习¶
练习1:模型对比¶
尝试使用不同的嵌入模型(如果有sentence-transformers),对比它们在相同数据集上的表现。
提示:
from sentence_transformers import SentenceTransformer
# 加载不同模型
model1 = SentenceTransformer('all-MiniLM-L6-v2')
model2 = SentenceTransformer('all-mpnet-base-v2')
# 对比性能...
练习2:优化检索¶
改进DocumentRetriever类,添加以下功能:
- 缓存嵌入向量
- 批量编码
- 结果过滤阈值
练习3:多语言检索¶
使用多语言嵌入模型(如e5),实现跨语言检索功能。