当前位置：首页 > news >正文

【人工智能】text2vec-large-chinese模型搭建本地知识库

news 2025/7/8 15:13:12

本 demo 使用 text2vec-large-chinese 模型对文本进行向量化处理，然后再通过 bge-reranker-v2-m3 对检索结果进行重排序（rerank）增强

1. 对文本进行向量处理，并保存至本地

from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import faiss
import os
import pickle
import fitz  # PyMuPDF
import re

# Prefer the GPU, but fall back to the CPU so the script still runs on
# machines without CUDA instead of failing when the model is moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model and move it to the selected device
model = SentenceTransformer('/home/sky/model_data/text2vec-large-chinese').to(device)


def read_txt(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def read_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Pages are visited in document order and their text is concatenated
    without any separator.
    """
    # The context manager closes the document even if text extraction fails.
    with fitz.open(file_path) as document:
        return "".join(page.get_text() for page in document)


def preprocess_text(text):
    """Normalise ``text`` before chunking.

    Every character that is neither a word character nor whitespace is
    removed, then each run of whitespace (including newlines) is collapsed
    to a single space, and the result is stripped at both ends.
    """
    # Strip punctuation first: removing a symbol that sits between two
    # spaces would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse redundant spaces and line breaks
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


# Split the text into chunks
def split_into_chunks(text, max_chunk_size=100, *, by_chars=False):
    """Split ``text`` into consecutive fixed-size chunks.

    This is plain fixed-length chunking; a real application may want a
    semantically aware splitter instead.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of units per chunk. Must be positive.
        by_chars: When False (the default), units are whitespace-separated
            words and each chunk is those words re-joined with single spaces.
            Text without whitespace (for example Chinese) is a single "word"
            in that mode and therefore ends up in one chunk regardless of its
            length. When True, units are characters and each chunk is a plain
            slice of ``text``.

    Returns:
        A list of chunk strings; empty when ``text`` contains no units.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    if by_chars:
        return [
            text[start:start + max_chunk_size]
            for start in range(0, len(text), max_chunk_size)
        ]

    words = text.split()
    return [
        ' '.join(words[start:start + max_chunk_size])
        for start in range(0, len(words), max_chunk_size)
    ]


# Read the document content
def read_document(file_path):
    """Read a document and return its text, dispatching on the file extension.

    Args:
        file_path: Path to a ``.txt`` or ``.pdf`` file. The extension is
            matched case-insensitively.

    Returns:
        The text returned by ``read_txt`` or ``read_pdf``.

    Raises:
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    # Lower-case only for the comparison so 'REPORT.PDF' or 'notes.TXT' are
    # accepted; the original path is what gets opened.
    lowered = file_path.lower()
    if lowered.endswith('.txt'):
        return read_txt(file_path)
    if lowered.endswith('.pdf'):
        return read_pdf(file_path)
    raise ValueError("Unsupported file type")


# Generate the embeddings
def generate_embeddings(chunks, model, device):
    """Encode ``chunks`` with ``model`` and return them with their vectors.

    Args:
        chunks: The text chunks to encode.
        model: Object whose ``encode`` method accepts the chunks together
            with ``convert_to_tensor`` and ``device`` keyword arguments.
        device: Device passed through to ``model.encode``.

    Returns:
        A ``(chunks, embeddings)`` tuple, where ``embeddings`` is the encoded
        tensor copied to the CPU and converted to a NumPy array.
    """
    encoded = model.encode(chunks, convert_to_tensor=True, device=device)
    # Bring the tensor back to host memory before handing it to NumPy
    vectors = encoded.cpu().numpy()
    return chunks, vectors


# Save the embeddings in FAISS format
def save_embeddings_faiss(chunks, embeddings, output_file):
    """Persist embeddings as a FAISS index next to the pickled chunk list.

    Writes ``<output_file>.index`` (a flat L2 FAISS index) and
    ``<output_file>.pkl`` (the pickled ``chunks``), creating the parent
    directory when it does not exist yet.

    Args:
        chunks: The text chunks, one per embedding row.
        embeddings: 2-D array-like of shape ``(len(chunks), dimension)``.
        output_file: Path prefix for the two output files.

    Raises:
        ValueError: If ``embeddings`` is not a non-empty 2-D array or its row
            count differs from ``len(chunks)``.
    """
    # FAISS only accepts C-contiguous float32 matrices.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty 2-D array")
    if len(chunks) != vectors.shape[0]:
        raise ValueError("chunks and embeddings must have the same length")

    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Build the FAISS index (L2 distance)
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    # Save the FAISS index
    faiss.write_index(index, output_file + '.index')
    # Save the chunk list; row i of the index corresponds to chunks[i]
    with open(output_file + '.pkl', 'wb') as f:
        pickle.dump(chunks, f)


# Main entry point
def build_knowledge_base(file_path, output_file):
    """Build a FAISS knowledge base from one document and save it.

    Runs the pipeline read -> preprocess -> chunk -> embed -> save, using
    the module-level ``model`` and ``device``, then prints where the
    results were written.

    Args:
        file_path: Path of the document to index.
        output_file: Path prefix handed to ``save_embeddings_faiss``.
    """
    # Read the document and clean up its text
    cleaned = preprocess_text(read_document(file_path))
    # Cut the text into chunks and embed each one
    pieces = split_into_chunks(cleaned)
    pieces, vectors = generate_embeddings(pieces, model, device)
    # Store the vectors in FAISS format
    save_embeddings_faiss(pieces, vectors, output_file)
    print(f"Embeddings and FAISS index saved to {output_file}.index and {output_file}.pkl")


# Usage example
# Document to index and the path prefix for the generated files
file_path = '/home/sky/model_data/code/pdf_dir/st.txt'  # or 'path/to/your/document.txt'
output_file = '/home/sky/model_data/code/pdf_dir/xldb/textfaiss_new_index'
build_knowledge_base(file_path=file_path, output_file=output_file)

2. 普通相似度匹配

import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import torch# 检查是否有可用的GPU
# Use the GPU when one is available; otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model and move it to the selected device
model = SentenceTransformer('/home/sky/model_data/text2vec-large-chinese').to(device)
# Load the FAISS index
# NOTE(review): the indexing script in this file writes
# '.../xldb/textfaiss_new_index.*' - confirm this path is the intended index.
index = faiss.read_index('/home/sky/model_data/code/pdf_dir/xldb/textfaiss_index.index')
# Load the sentence list
# SECURITY: unpickling can execute arbitrary code; only load .pkl files that
# you generated yourself.
with open('/home/sky/model_data/code/pdf_dir/xldb/textfaiss_index.pkl', 'rb') as f:
    sentences = pickle.load(f)


# Define the query function
def search_faiss(query, sentences, index, model, top_n=5, device=None):
    """Return the stored sentences nearest to ``query`` in the FAISS index.

    Args:
        query: The query text.
        sentences: Sequence of stored sentences; position ``i`` must match
            id ``i`` in ``index``.
        index: Object with a FAISS-style ``search(vectors, k)`` method that
            returns ``(distances, ids)``.
        model: Embedding model whose ``encode`` accepts ``convert_to_tensor``
            and ``device`` keyword arguments.
        top_n: Maximum number of results to return.
        device: Device passed to ``model.encode``. ``None`` leaves the choice
            to the model (presumably the device it was loaded on - confirm
            against the sentence-transformers documentation).

    Returns:
        A list of ``(sentence, score)`` tuples in the order the index returned
        them, where ``score`` is ``1 - distance``. Assuming the index is a
        flat L2 index, ``distance`` is a squared L2 distance, so the score is
        only a "higher is closer" value and not a true cosine similarity (for
        unit-normalised vectors the cosine would be ``1 - distance / 2``).
        Fewer than ``top_n`` items are returned when the index holds fewer
        vectors.
    """
    # Embed the query
    query_embedding = model.encode([query], convert_to_tensor=True, device=device).cpu().numpy()
    # Search the FAISS index
    distances, ids = index.search(query_embedding, top_n)
    # FAISS pads missing neighbours with id -1; without the filter that would
    # silently pick sentences[-1].
    return [
        (sentences[i], 1 - distance)
        for i, distance in zip(ids[0], distances[0])
        if i >= 0
    ]


# Query example
query = '今天天气不错。'
results = search_faiss(query, sentences, index, model, top_n=5)
# Print each hit together with its score
for hit in results:
    sentence, score = hit
    print(f"句子: {sentence}，相似度: {score:.4f}")

3. 使用rerank增强，rerank模型：bge-reranker-v2-m3

import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
import torch# 检查是否有可用的GPU
# Use the GPU when one is available; otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the embedding model and move it to the selected device
model = SentenceTransformer('/home/sky/model_data/text2vec-large-chinese').to(device)
# Load the FAISS index
# NOTE(review): the indexing script in this file writes
# '.../xldb/textfaiss_new_index.*' - confirm this path is the intended index.
index = faiss.read_index('/home/sky/model_data/code/pdf_dir/xldb/textfaiss_index.index')
# Load the sentence list
# SECURITY: unpickling can execute arbitrary code; only load .pkl files that
# you generated yourself.
with open('/home/sky/model_data/code/pdf_dir/xldb/textfaiss_index.pkl', 'rb') as f:
    sentences = pickle.load(f)
# Load the re-ranking model
reranker = CrossEncoder('/home/sky/model_data/bge-reranker-v2-m3', device=device)


# Define the query function
def search_faiss(query, sentences, index, model, reranker, top_n=5, rerank_top_n=5, device=None):
    """Retrieve candidates from FAISS and re-order them with a re-ranker.

    Args:
        query: The query text.
        sentences: Sequence of stored sentences; position ``i`` must match
            id ``i`` in ``index``.
        index: Object with a FAISS-style ``search(vectors, k)`` method that
            returns ``(distances, ids)``.
        model: Embedding model whose ``encode`` accepts ``convert_to_tensor``
            and ``device`` keyword arguments.
        reranker: Object whose ``predict(pairs, batch_size=...)`` returns one
            score per ``(query, sentence)`` pair.
        top_n: Maximum number of results to return.
        rerank_top_n: Number of candidates to fetch for re-ranking. At least
            ``top_n`` candidates are always fetched so that ``top_n`` results
            can be returned.
        device: Device passed to ``model.encode``. ``None`` leaves the choice
            to the model (presumably the device it was loaded on - confirm
            against the sentence-transformers documentation).

    Returns:
        Up to ``top_n`` ``(sentence, rerank_score)`` tuples sorted by
        descending re-rank score; an empty list when the index returns no
        valid hit.
    """
    # Embed the query
    query_embedding = model.encode([query], convert_to_tensor=True, device=device).cpu().numpy()
    # Search the FAISS index; the distances are not needed because the final
    # order comes from the re-ranker alone.
    num_candidates = max(top_n, rerank_top_n)
    _, ids = index.search(query_embedding, num_candidates)
    # FAISS pads missing neighbours with id -1; without the filter that would
    # silently pick sentences[-1].
    candidates = [sentences[i] for i in ids[0] if i >= 0]
    if not candidates:
        return []

    # Score every (query, candidate) pair with the re-ranking model
    rerank_inputs = [(query, sentence) for sentence in candidates]
    rerank_scores = reranker.predict(rerank_inputs, batch_size=8)
    # Sort by re-rank score and keep the best top_n
    ranked = sorted(zip(candidates, rerank_scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


# Query example
query = '今天天气不错。'
results = search_faiss(query, sentences, index, model, reranker, top_n=5)
# Print each hit together with its score
for hit in results:
    sentence, score = hit
    print(f"句子: {sentence}，相似度: {score:.4f}")

查看全文

http://www.lryc.cn/news/484545.html