| from colpali_manager import ColpaliManager |
| from milvus_manager import MilvusManager |
| from pdf_manager import PdfManager |
| import hashlib |
|
|
|
|
|
|
# Shared manager instances, created once when this module is loaded and
# reused by every Middleware instance below.
# pdf_manager: used by Middleware.index to save a PDF's pages as images.
| pdf_manager = PdfManager() |
# colpali_manager: used by Middleware.index to process page images and by
# Middleware.search to process query text.
| colpali_manager = ColpaliManager() |
|
|
|
|
|
|
class Middleware:
    """Coordinate PDF page extraction, ColPali processing and Milvus storage.

    An instance wraps one ``MilvusManager`` and uses the module-level
    ``pdf_manager`` and ``colpali_manager`` objects to index PDF pages and
    to run text queries against the indexed pages.
    """

    def __init__(self, id: str, create_collection=True):
        """Create the backing ``MilvusManager``.

        Args:
            id: Identifier forwarded to ``MilvusManager`` as its second
                argument.
            create_collection: Forwarded unchanged to ``MilvusManager``.
        """
        # NOTE(review): the suffix is a constant, so every instance uses the
        # same "milvus_0.db" file regardless of `id`. The file imports
        # hashlib but never uses it, so a per-id hash was presumably intended
        # here. Confirm before changing: a different filename would no
        # longer open data that was already written to milvus_0.db.
        hashed_id = 0
        milvus_db_name = f"milvus_{hashed_id}.db"
        self.milvus_manager = MilvusManager(milvus_db_name, id, create_collection)

    def index(self, pdf_path: str, id: str, max_pages: int, pages: list[int] = None):
        """Save a PDF's pages as images, process them and insert them.

        Args:
            pdf_path: Path of the PDF to index. ``None`` means there is
                nothing to index.
            id: Identifier passed to ``pdf_manager.save_images``.
            max_pages: Page limit passed to ``pdf_manager.save_images``.
            pages: Currently unused; kept so existing callers that pass it
                keep working.

        Returns:
            The image paths returned by ``pdf_manager.save_images``, or an
            empty list when ``pdf_path`` is ``None``.
        """
        # The previous check, `type(pdf_path) == None`, could never be true
        # because type() always returns a class, so a missing path fell
        # through to the PDF manager. Test the value itself instead.
        if pdf_path is None:
            print("no docs")
            return []

        print(f"Indexing {pdf_path}, id: {id}, max_pages: {max_pages}")

        image_paths = pdf_manager.save_images(id, pdf_path, max_pages)
        print(f"Saved {len(image_paths)} images")

        colbert_vecs = colpali_manager.process_images(image_paths)

        # Pair each saved image with the result at the same position.
        # Indexing (rather than zip) keeps a length mismatch loud: it raises
        # IndexError instead of silently dropping pages.
        images_data = [
            {
                "colbert_vecs": colbert_vecs[i],
                "filepath": path,
            }
            for i, path in enumerate(image_paths)
        ]

        print(f"Inserting {len(images_data)} images data to Milvus")
        self.milvus_manager.insert_images_data(images_data)
        print("Indexing completed")

        return image_paths

    def search(self, search_queries: list[str], topk: int = 10):
        """Run each query against Milvus and collect the results.

        Args:
            search_queries: Query strings; each one is processed and
                searched separately.
            topk: Passed to ``MilvusManager.search`` as ``topk``.

        Returns:
            A list with one entry per query, in input order; each entry is
            whatever ``MilvusManager.search`` returned for that query.
        """
        print(f"Searching for {len(search_queries)} queries with topk={topk}")

        final_res = []
        for query in search_queries:
            print(f"Searching for query: {query}")
            # process_text takes a list of texts; send one query at a time
            # and take the single result.
            query_vec = colpali_manager.process_text([query])[0]
            search_res = self.milvus_manager.search(query_vec, topk=topk)
            print(f"Search result: {len(search_res)} results for query: {query}")
            final_res.append(search_res)

        return final_res
|
|
|
|