DoAn / scripts /build_data.py

Thay đổi prompt

92c9b4d about 2 months ago

6.23 kB

	import sys
	import argparse
	from pathlib import Path
	from dotenv import find_dotenv, load_dotenv

	# Load environment variables from a .env file; usecwd=True asks find_dotenv
	# to start its search from the current working directory.
	load_dotenv(find_dotenv(usecwd=True))

	# This script lives one directory below the repository root. Put the root at
	# the front of sys.path so the `core.*` project imports that follow resolve
	# when the script is run directly.
	REPO_ROOT = Path(__file__).resolve().parents[1]
	if str(REPO_ROOT) not in sys.path:
	    sys.path.insert(0, str(REPO_ROOT))

	from core.rag.chunk import chunk_markdown_file
	from core.rag.embedding_model import EmbeddingConfig, QwenEmbeddings
	from core.rag.vector_store import ChromaConfig, ChromaVectorDB
	from core.hash_file.hash_file import HashProcessor

	# Shared module-level hasher; main() calls _hasher.get_file_hash(path) to
	# fingerprint each markdown file for change detection.
	_hasher = HashProcessor(verbose=False)


	def get_db_file_info(db: ChromaVectorDB) -> dict:
	    """Group the documents currently stored in the vector DB by source file.

	    Args:
	        db: Vector store exposing ``get_all_documents()``. Each returned item
	            is read as a mapping with an ``"id"`` entry and an optional
	            ``"metadata"`` mapping.

	    Returns:
	        A dict with two entries:

	        * ``"ids"``: maps each source name to the set of document ids that
	          belong to it.
	        * ``"hashes"``: maps each source name to the first non-empty
	          ``content_hash`` found among its documents.

	        The source name is the metadata value ``source_basename``, falling
	        back to ``source_file``. Documents lacking a source name or an id are
	        ignored.
	    """
	    file_to_ids = {}
	    file_to_hash = {}

	    for d in db.get_all_documents():
	        # "metadata" may be absent *or* explicitly None; treat both as empty
	        # instead of crashing on None.get(...).
	        meta = d.get("metadata") or {}
	        source = meta.get("source_basename") or meta.get("source_file")
	        doc_id = d.get("id")
	        if not (source and doc_id):
	            continue

	        file_to_ids.setdefault(source, set()).add(doc_id)

	        # Keep only the first hash seen for a file; later chunks of the same
	        # file do not overwrite it.
	        content_hash = meta.get("content_hash", "")
	        if content_hash:
	            file_to_hash.setdefault(source, content_hash)

	    return {"ids": file_to_ids, "hashes": file_to_hash}


	def main():
	    """Incrementally sync the markdown files under data/data_process into ChromaDB.

	    Command-line flags:
	        --force      Re-process every markdown file regardless of its stored hash.
	        --no-delete  Keep chunks whose source file no longer exists on disk.

	    Steps:
	        1. Create the embedder.
	        2. Open the Chroma collection and (unless --force) index its documents
	           by source file.
	        3. Collect all ``*.md`` files below ``REPO_ROOT/data/data_process``.
	        4. Delete chunks of files that exist in the DB but not on disk.
	        5. Chunk and upsert new or changed files, tagging every chunk with the
	           file's content hash so later runs can skip unchanged files.

	    Afterwards the BM25 cache file is removed if anything in the DB changed,
	    and a summary is printed.
	    """
	    parser = argparse.ArgumentParser(description="Build ChromaDB from markdown files")
	    parser.add_argument("--force", action="store_true", help="Rebuild all files")
	    parser.add_argument("--no-delete", action="store_true", help="Don't delete orphaned docs")
	    args = parser.parse_args()

	    print("=" * 60)
	    print("BUILD HUST RAG DATABASE")
	    print("=" * 60)

	    # Step 1: Initialize embedder
	    print("\n[1/5] Initializing embedder...")
	    emb_cfg = EmbeddingConfig()
	    emb = QwenEmbeddings(emb_cfg)
	    print(f" Model: {emb_cfg.model}")
	    print(f" API: {emb_cfg.api_base_url}")

	    # Step 2: Initialize ChromaDB
	    print("\n[2/5] Initializing ChromaDB...")
	    db_cfg = ChromaConfig()
	    db = ChromaVectorDB(embedder=emb, config=db_cfg)
	    old_count = db.count()
	    print(f" Collection: {db_cfg.collection_name}")
	    print(f" Current docs: {old_count}")

	    # Get current DB state.
	    # NOTE(review): with --force the DB is not scanned, so db_info stays empty
	    # and neither orphan cleanup (step 4) nor removal of a file's old chunks
	    # (step 5) happens in that mode -- confirm this is intended.
	    db_info = {"ids": {}, "hashes": {}}
	    if not args.force and old_count > 0:
	        print("\n Scanning documents in DB...")
	        db_info = get_db_file_info(db)
	        print(f" Found {len(db_info['ids'])} source files in DB")

	    # Step 3: Scan markdown files
	    print("\n[3/5] Scanning markdown files...")
	    root = REPO_ROOT / "data" / "data_process"
	    md_files = sorted(root.rglob("*.md"))
	    total_files = len(md_files)
	    print(f" Found {total_files} markdown files on disk")

	    # Compare files on disk vs in DB. Files are matched by basename only, so
	    # two files with the same name in different sub-directories share one key.
	    current_files = {f.name for f in md_files}
	    db_files = set(db_info["ids"].keys())

	    # Files to delete: present in DB but no longer on disk.
	    files_to_delete = db_files - current_files

	    # Step 4: Delete orphaned docs
	    deleted_count = 0
	    if files_to_delete and not args.no_delete:
	        print(f"\n[4/5] Cleaning up {len(files_to_delete)} deleted files...")
	        # Sorted only to make the log output deterministic.
	        for filename in sorted(files_to_delete):
	            doc_ids = list(db_info["ids"].get(filename, []))
	            if doc_ids:
	                db.delete_documents(doc_ids)
	                deleted_count += len(doc_ids)
	                print(f" Deleted: {filename} ({len(doc_ids)} chunks)")
	    elif files_to_delete:
	        # Orphans exist but the user asked to keep them; say so instead of
	        # claiming there is nothing to delete.
	        print(f"\n[4/5] Keeping {len(files_to_delete)} orphaned files (--no-delete)")
	    else:
	        print("\n[4/5] No files to delete")

	    # Step 5: Process markdown files (add new, update)
	    print("\n[5/5] Processing markdown files...")
	    total_added = 0
	    total_updated = 0
	    replaced_count = 0  # old chunks removed while re-indexing changed files
	    skipped = 0
	    failed = 0

	    for i, f in enumerate(md_files, 1):
	        prefix = f" [{i}/{total_files}] {f.name}:"
	        file_hash = _hasher.get_file_hash(str(f))
	        db_hash = db_info["hashes"].get(f.name, "")
	        existing_ids = db_info["ids"].get(f.name, set())

	        # Skip if hash matches (file unchanged)
	        if not args.force and db_hash == file_hash:
	            print(f"{prefix} SKIPPED (unchanged)")
	            skipped += 1
	            continue

	        # Chunk BEFORE touching the DB: if chunking fails, the file's
	        # previously indexed chunks are left intact instead of being lost.
	        try:
	            docs = chunk_markdown_file(f)
	        except Exception as e:
	            print(f"{prefix} ERROR - {e}")
	            failed += 1
	            continue

	        # The file changed: drop its old chunks before inserting new ones.
	        is_update = bool(existing_ids) and not args.force
	        if is_update:
	            db.delete_documents(list(existing_ids))
	            replaced_count += len(existing_ids)
	            print(f"{prefix} UPDATED (deleted {len(existing_ids)} old chunks)")

	        if not docs:
	            print(f"{prefix} SKIPPED (no chunks)")
	            continue

	        try:
	            # Add hash to metadata for change detection on the next run.
	            for doc in docs:
	                if hasattr(doc, "metadata"):
	                    doc.metadata["content_hash"] = file_hash
	                elif isinstance(doc, dict) and "metadata" in doc:
	                    doc["metadata"]["content_hash"] = file_hash

	            n = db.upsert_documents(docs)
	            if is_update:
	                total_updated += n
	                print(f"{prefix} +{n} new chunks")
	            else:
	                total_added += n
	                print(f"{prefix} {n} chunks")
	        except Exception as e:
	            # Per-file boundary: report and continue with the remaining files.
	            print(f"{prefix} ERROR - {e}")
	            failed += 1

	    # Summary
	    new_count = db.count()
	    # Old chunks removed during an update count as a change even when the
	    # subsequent upsert produced nothing or failed.
	    has_changes = (
	        deleted_count > 0
	        or replaced_count > 0
	        or total_updated > 0
	        or total_added > 0
	    )

	    # Delete BM25 cache if changes detected (BM25 doesn't support incremental update)
	    if has_changes:
	        bm25_cache = REPO_ROOT / "data" / "chroma" / "bm25_cache.pkl"
	        if bm25_cache.exists():
	            bm25_cache.unlink()
	            print("\n[!] Deleted BM25 cache (will auto-rebuild on next query)")

	    print(f"\n{'=' * 60}")
	    print("SUMMARY")
	    print("=" * 60)
	    print(f" Deleted (orphaned): {deleted_count} chunks")
	    print(f" Updated: {total_updated} chunks")
	    print(f" Added: {total_added} chunks")
	    print(f" Skipped: {skipped} files")
	    print(f" Failed: {failed} files")
	    print(f" DB docs: {old_count} -> {new_count} ({new_count - old_count:+d})")

	    print("\nDONE!")


	# Run the build only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()