drewli20200316
/

agentV2

Model card Files Files and versions

xet

Community

drewli20200316 committed on Feb 13

Commit

635abe8

verified ·

1 Parent(s): 528ba5d

Upload vector2.py with huggingface_hub

Browse files

Files changed (1) hide show

vector2.py +307 -0

vector2.py ADDED Viewed

	@@ -0,0 +1,307 @@

import json
import os
import time
import uuid

import pandas as pd
import redis
from dotenv import load_dotenv
from langchain.embeddings.base import Embeddings
from langchain_classic.retrievers.parent_document_retriever import ParentDocumentRetriever
from langchain_core.documents import Document
from langchain_core.stores import InMemoryStore
from langchain_milvus import Milvus, BM25BuiltInFunction
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm

# Load environment variables from the .env file so API keys stay out of the source.
load_dotenv()
# ============================================================
# Redis cache helpers
# ============================================================
def get_redis_client(host="127.0.0.1", port=6379, db=0, password=None, max_connections=10):
    """Build a Redis client on top of a connection pool and ping it once.

    Args:
        host: Redis server address. Defaults to the loopback address; the
            previous hard-coded ``0.0.0.0`` is a bind wildcard, not a
            destination, and is not a portable address to connect to.
        port: Redis server port.
        db: Redis logical database index.
        password: Optional password for the server.
        max_connections: Upper bound on connections held by the pool.

    Returns:
        The ``redis.StrictRedis`` client. It is returned even when the ping
        fails (the failure is only printed), matching the original
        best-effort behavior.
    """
    pool = redis.ConnectionPool(
        host=host,
        port=port,
        db=db,
        password=password,
        max_connections=max_connections,
    )
    client = redis.StrictRedis(connection_pool=pool)
    # Probe the connection so a misconfiguration is visible immediately.
    try:
        client.ping()
    except redis.ConnectionError:
        print("无法连接到 Redis !")
    else:
        print("成功连接到 Redis !")
    return client
# Store a (question, answer) pair in Redis.
def cache_set(r, question: str, answer: str, ttl: int = 3600):
    """Write one question/answer pair into the ``qa`` hash.

    Args:
        r: Redis client exposing ``hset`` and ``expire``.
        question: Hash field under which the answer is stored.
        answer: Value to cache.
        ttl: Expiry in seconds applied to the ``qa`` key (default 3600).

    Note:
        The expiry is set on the ``qa`` key itself, not on the individual
        field, so every call resets the timeout for the whole hash.
    """
    r.hset("qa", question, answer)
    r.expire("qa", ttl)
# Look up the answer cached in Redis for a question.
def cache_get(r, question: str):
    """Return whatever ``r.hget`` yields for ``question`` in the ``qa`` hash."""
    cached_answer = r.hget("qa", question)
    return cached_answer
# ============================================================
# Embedding model: OpenAI text-embedding-3-small by default
# ============================================================
class OpenAIEmbeddings(Embeddings):
    """Custom embedding class backed by the OpenAI embeddings API.

    Args:
        model: Name of the embedding model sent with every request.
        batch_size: Maximum number of texts sent in a single request.
            Values below 1 are clamped to 1.
    """

    def __init__(self, model="text-embedding-3-small", batch_size=32):
        # The API key is read from the environment (populated via .env).
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        self.model = model
        self.batch_size = max(1, int(batch_size))

    def embed_documents(self, texts):
        """Embed ``texts`` and return one vector per text, in input order.

        Texts are sent in batches instead of one request per text, which
        removes a network round trip for every document. An empty input
        returns an empty list without calling the API.
        """
        texts = list(texts)
        embeddings = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            response = self.client.embeddings.create(
                model=self.model,
                input=batch,
            )
            # Each returned item carries the index of its input; sort on it
            # so the output order never depends on response ordering.
            ordered = sorted(response.data, key=lambda item: item.index)
            embeddings.extend(item.embedding for item in ordered)
        return embeddings

    def embed_query(self, text):
        """Embed a single query string and return its vector."""
        return self.embed_documents([text])[0]
# ============================================================
# Milvus vector store wrapper (recall path 1: JSONL text data)
# ============================================================
class Milvus_vector():
    """Builds a hybrid (dense + BM25 sparse) Milvus collection from documents."""

    def __init__(self, uri="./milvus_agent.db", collection_name="LangChainCollection"):
        self.URI = uri
        self.collection_name = collection_name
        self.embeddings = OpenAIEmbeddings()
        # Index definitions for the dense and sparse vector fields.
        self.dense_index = {
            "metric_type": "IP",
            "index_type": "IVF_FLAT",
        }
        self.sparse_index = {
            "metric_type": "BM25",
            "index_type": "SPARSE_INVERTED_INDEX"
        }

    def create_vector_store(self, docs, batch_size=5, pause_seconds=1):
        """Create the collection from ``docs`` and insert all of them.

        The first 10 documents initialise the collection; the rest are
        inserted in batches of ``batch_size``.

        Args:
            docs: Sliceable sequence of LangChain ``Document`` objects.
            batch_size: Documents per insert call after initialisation.
            pause_seconds: Sleep after every insert call (throttle).

        Returns:
            The ``Milvus`` vector store, also kept on ``self.vectorstore``.
        """
        batch_size = max(1, batch_size)
        seed_docs = docs[:10]
        self.vectorstore = Milvus.from_documents(
            documents=seed_docs,
            embedding=self.embeddings,
            builtin_function=BM25BuiltInFunction(),
            index_params=[self.dense_index, self.sparse_index],
            vector_field=["dense", "sparse"],
            connection_args={
                "uri": self.URI,
            },
            collection_name=self.collection_name,
            # Supported levels: "Strong", "Session", "Bounded", "Eventually".
            consistency_level="Bounded",
            drop_old=False,
        )
        print("已初始化创建 Milvus ‼")

        # Count what was really seeded; there may be fewer than 10 documents.
        count = len(seed_docs)

        def insert(batch):
            # Use the synchronous add_documents: aadd_documents returns a
            # coroutine, and calling it without awaiting inserts nothing.
            self.vectorstore.add_documents(batch)
            print(f"已插入 {count + len(batch)} 条数据......")
            time.sleep(pause_seconds)
            return len(batch)

        pending = []
        for doc in tqdm(docs[10:]):
            pending.append(doc)
            if len(pending) >= batch_size:
                count += insert(pending)
                pending = []
        # Flush the trailing partial batch, which used to be dropped silently.
        if pending:
            count += insert(pending)

        print(f"总共插入 {count} 条数据......")
        print("已创建 Milvus 索引完成 ‼")
        return self.vectorstore
# ============================================================
# PDF parent/child document retriever (recall path 2: PDF data)
# ============================================================
class Pdf_retriever():
    """Builds a ParentDocumentRetriever over a hybrid Milvus collection."""

    def __init__(self, uri="./pdf_agent.db", collection_name="LangChainCollection"):
        self.URI = uri
        self.collection_name = collection_name
        self.embeddings = OpenAIEmbeddings()
        # Index definitions for the dense and sparse vector fields.
        self.dense_index = {
            "metric_type": "IP",
            "index_type": "IVF_FLAT",
        }
        self.sparse_index = {
            "metric_type": "BM25",
            "index_type": "SPARSE_INVERTED_INDEX"
        }
        # NOTE(review): parent chunks live in an InMemoryStore, so they
        # presumably do not outlive this process -- confirm how the serving
        # side obtains them.
        self.docstore = InMemoryStore()
        # Child chunks are small and split on Chinese punctuation as well.
        self.child_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200,
            chunk_overlap=50,
            length_function=len,
            separators=["\n\n", "\n", "。", "！", "？", "；", "，", " ", ""]
        )
        # Parent chunks are the larger units handed to the docstore.
        self.parent_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

    def create_pdf_vector_store(self, docs, batch_size=10, pause_seconds=1):
        """Create the retriever and add every document in ``docs``.

        Args:
            docs: Iterable of LangChain ``Document`` objects.
            batch_size: Documents per ``add_documents`` call.
            pause_seconds: Sleep after every insert call (throttle).

        Returns:
            The ``ParentDocumentRetriever``, also kept on ``self.retriever``.
        """
        batch_size = max(1, batch_size)
        self.milvus_vectorstore = Milvus(
            embedding_function=self.embeddings,
            builtin_function=BM25BuiltInFunction(),
            vector_field=["dense", "sparse"],
            # Reuse the index definitions from __init__ rather than repeating
            # the same literals here.
            index_params=[self.dense_index, self.sparse_index],
            connection_args={"uri": self.URI},
            collection_name=self.collection_name,
            consistency_level="Bounded",
            drop_old=False,
        )
        # Wire the parent/child retriever on top of the vector store.
        self.retriever = ParentDocumentRetriever(
            vectorstore=self.milvus_vectorstore,
            docstore=self.docstore,
            child_splitter=self.child_splitter,
            parent_splitter=self.parent_splitter,
        )

        count = 0

        def insert(batch):
            # The synchronous add_documents is used on purpose (the original
            # author notes the retriever is not driven asynchronously here).
            self.retriever.add_documents(batch)
            print(f"已插入 {count + len(batch)} 条数据......")
            time.sleep(pause_seconds)
            return len(batch)

        pending = []
        for doc in tqdm(docs):
            pending.append(doc)
            if len(pending) >= batch_size:
                count += insert(pending)
                pending = []
        # Flush the trailing partial batch, which used to be dropped silently.
        if pending:
            count += insert(pending)

        print(f"总共插入 {count} 条数据......")
        print("基于PDF文档数据的 Milvus 索引完成 ‼")
        return self.retriever
# ============================================================
# Preprocessing: load documents from a JSONL file (recall path 1)
# ============================================================
def prepare_document(file_path=('./data/dialog.jsonl', './data/train.jsonl')):
    """Load query/response pairs from a JSONL file as LangChain Documents.

    Args:
        file_path: Either a single path, or a sequence of paths of which
            only the FIRST is read (the original behavior is preserved).
            NOTE(review): the second default path is never opened --
            confirm whether it was meant to be loaded as well.

    Returns:
        A list of ``Document`` objects whose content is
        ``query + "\\n" + response`` and whose metadata holds a fresh
        UUID under ``doc_id``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``query`` or ``response``.
    """
    # A bare string would otherwise be indexed to its first character.
    if isinstance(file_path, (str, os.PathLike)):
        first_path = file_path
    else:
        first_path = file_path[0]

    docs = []
    with open(first_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline) are not JSON; skip them.
            if not line:
                continue
            content = json.loads(line)
            prompt = content['query'] + "\n" + content['response']
            docs.append(
                Document(page_content=prompt, metadata={"doc_id": str(uuid.uuid4())})
            )
    print(f"已加载 {len(docs)} 条数据!")
    return docs
# ============================================================
# Preprocessing: load documents from the PDF extraction output (recall path 2)
# ============================================================
def prepare_pdf_document(file_path="./pdf_output/pdf_detailed_text.xlsx"):
    """Load the ``text_content`` column of an Excel file as Documents.

    Args:
        file_path: Path of the Excel workbook to read.

    Returns:
        A list of ``Document`` objects, one per row with non-empty text,
        each carrying a fresh UUID under ``doc_id``.
    """
    df = pd.read_excel(file_path)
    # Drop rows with no text at all; they would break later processing.
    df = df.dropna(subset=['text_content'])

    documents = []
    # Only one column is needed, so iterate it directly instead of iterrows().
    for raw_text in df['text_content']:
        text_content = str(raw_text).strip()
        # Whitespace-only cells survive dropna(); skip them so that no empty
        # Document is produced.
        if not text_content:
            continue
        documents.append(
            Document(
                page_content=text_content,
                metadata={"doc_id": str(uuid.uuid4())},
            )
        )
    print(f"成功加载 {len(documents)} 个文档")
    return documents
# ============================================================
# Entry point: run the data ingestion pipeline
# ============================================================
if __name__ == "__main__":
    # Ingest both data sets into the Milvus server. The collection names
    # must match the ones agent6.py uses (per the original author's note):
    #   medical_agent -> recall path 1, JSONL medical Q&A
    #   medical_pdf   -> recall path 2, PDF documents
    MILVUS_SERVER_URI = os.getenv("MILVUS_URI", "http://localhost:19530")

    # --- Recall path 1: JSONL data -> medical_agent ---
    jsonl_docs = prepare_document()
    print("预处理文档数据成功......")
    jsonl_builder = Milvus_vector(uri=MILVUS_SERVER_URI, collection_name="medical_agent")
    print("创建 Milvus 连接成功......")
    vectorstore = jsonl_builder.create_vector_store(jsonl_docs)
    print("第一路 (JSONL) 数据灌入完成 ✅")

    # --- Recall path 2: PDF data -> medical_pdf ---
    pdf_docs = prepare_pdf_document()
    print("预处理 PDF 文档数据成功......")
    pdf_builder = Pdf_retriever(uri=MILVUS_SERVER_URI, collection_name="medical_pdf")
    print("创建 PDF Milvus 连接成功......")
    retriever = pdf_builder.create_pdf_vector_store(pdf_docs)
    print("第二路 (PDF) 数据灌入完成 ✅")

    print("全部数据灌入 Milvus Server 完成, 可以启动 agent6.py 了 ✅")