#!/usr/bin/env python3
"""
Regenerate embeddings using semantic sentence-transformers model.
Purpose:
Completely regenerates embeddings using the semantic sentence-transformers model,
creates a new Pinecone index, and uploads the embeddings. This is a full refresh
of the vector database with semantic embeddings.
Process:
1. Loads documents and chunks them
2. Generates semantic embeddings (384-dim using all-MiniLM-L6-v2)
3. Saves to data/chunks_semantic.jsonl
4. Creates new Pinecone index with 384 dimensions
5. Uploads semantic embeddings to new index
Inputs:
None (uses sample_docs directory by default)
PINECONE_API_KEY environment variable
Outputs:
Saves embedded chunks to data/chunks_semantic.jsonl
Creates and populates new Pinecone index
Prints progress and completion messages
Environment variables required:
PINECONE_API_KEY: Your Pinecone API key
Usage:
python scripts/regenerate_with_semantic.py
"""
import json
import os
import sys
import time
from pathlib import Path

# Add project root to path so `src` imports resolve when run as a script
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from pinecone import Pinecone, ServerlessSpec

import src.config as cfg
from src.ingestion.chunker import chunk_documents
from src.ingestion.embeddings import batch_embed_chunks, get_embedding
from src.ingestion.load_docs import load_markdown_docs
def main():
    """Regenerate semantic embeddings end-to-end and rebuild the Pinecone index.

    Pipeline: load markdown docs -> chunk -> embed with all-MiniLM-L6-v2 ->
    save JSONL to data/chunks_semantic.jsonl -> drop/recreate the
    'rag-semantic-384' serverless index -> upsert all vectors -> verify count.

    Requires the PINECONE_API_KEY environment variable (read via src.config).
    """
    print("=" * 60)
    print("Regenerating Embeddings with Semantic Model")
    print("=" * 60)

    chunks = _load_and_chunk()
    embedded, actual_dim = _embed_chunks(chunks)
    _save_embeddings(chunks, embedded)
    pc, index_name = _recreate_index(actual_dim)
    _upload_vectors(pc, index_name, embedded)

    print("\n" + "=" * 60)
    # NOTE(review): "β" here and in the progress lines below looks like a
    # mojibake'd checkmark/emoji from the original encoding -- confirm glyph.
    print("β COMPLETE!")
    print("=" * 60)
    print("\nNext steps:")
    print(f"1. Update config: export PINECONE_INDEX_NAME='{index_name}'")
    print(f"2. Test search: python -c \"from src.retrieval.retriever import query_pinecone; print(query_pinecone('what is GDPR', top_k=5))\"")
    print()


def _load_and_chunk():
    """Steps 1-2: load markdown docs from sample_docs and split into chunks."""
    print("\n[1/5] Loading documents...")
    docs_dir = str(PROJECT_ROOT / "sample_docs")
    docs = load_markdown_docs(docs_dir)
    print(f" Loaded {len(docs)} documents")

    print("\n[2/5] Chunking documents...")
    chunks = chunk_documents(docs, max_tokens=300, overlap=50)
    print(f" Generated {len(chunks)} chunks")
    return chunks


def _embed_chunks(chunks):
    """Step 3: embed all chunks; return (embedded_chunks, embedding_dim).

    The dimension is derived from the first embedding rather than hard-coded,
    so a model change propagates automatically to the index spec.
    """
    print("\n[3/5] Generating semantic embeddings...")
    print(" Using model: all-MiniLM-L6-v2 (384 dimensions)")
    print(" This may take 1-2 minutes...")
    embedded = batch_embed_chunks(
        chunks,
        provider="sentence-transformers",
        model_name="all-MiniLM-L6-v2",
    )
    # Guard: an empty corpus would otherwise raise a bare IndexError below.
    if not embedded:
        raise RuntimeError("No embeddings were generated - check sample_docs input")
    actual_dim = len(embedded[0]["embedding"])
    print(f" β Generated {len(embedded)} embeddings ({actual_dim} dimensions)")
    return embedded, actual_dim


def _save_embeddings(chunks, embedded):
    """Step 4: write one JSON object per chunk to data/chunks_semantic.jsonl."""
    print("\n[4/5] Saving embeddings...")
    output_file = PROJECT_ROOT / "data" / "chunks_semantic.jsonl"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w", encoding="utf-8") as f:
        # Embedded records carry no text; merge it back from the parallel
        # chunks list (same order as produced by chunk_documents).
        for chunk, e in zip(chunks, embedded):
            record = {
                "id": f"{e['filename']}::{e['chunk_id']}",
                "filename": e["filename"],
                "chunk_id": e["chunk_id"],
                "text": chunk["text"],
                "chars": e.get("chars", 0),
                "embedding": e["embedding"],
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f" β Saved to: {output_file}")


def _recreate_index(actual_dim):
    """Step 5a: connect to Pinecone and (re)create the semantic index.

    Deletes any existing index of the same name, creates a fresh cosine
    index of `actual_dim` dimensions, and blocks until it reports ready.
    Returns (client, index_name).
    """
    print("\n[5/5] Setting up Pinecone index...")
    print(" Connecting to Pinecone...")
    pc = Pinecone(api_key=cfg.PINECONE_API_KEY)

    index_name = "rag-semantic-384"
    print(f" Creating new index: {index_name}")
    print(f" Dimension: {actual_dim}, Metric: cosine")

    # Drop a stale index first: dimensions/contents may differ from this run.
    if index_name in {idx.name for idx in pc.list_indexes()}:
        print(f" Index '{index_name}' already exists - deleting old version...")
        pc.delete_index(index_name)

    pc.create_index(
        name=index_name,
        dimension=actual_dim,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(" β Index created")

    # Serverless index creation is asynchronous; poll until ready.
    print(" Waiting for index to be ready...")
    while not pc.describe_index(index_name).status.ready:
        time.sleep(1)
    return pc, index_name


def _upload_vectors(pc, index_name, embedded):
    """Step 5b: upsert all vectors in batches of 100, then verify the count."""
    print(f"\n Uploading {len(embedded)} vectors to Pinecone...")
    index = pc.Index(index_name)

    vectors = [
        {
            "id": f"{e['filename']}::{e['chunk_id']}",
            "values": e["embedding"],
            # NOTE(review): metadata is left empty; retrieval presumably maps
            # ids back to data/chunks_semantic.jsonl -- confirm against the
            # retriever before relying on index-side metadata.
            "metadata": {},
        }
        for e in embedded
    ]

    # Upsert in batches of 100 to stay within request-size limits.
    batch_size = 100
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
        print(f" Uploaded {min(start + batch_size, len(vectors))}/{len(vectors)} vectors")

    stats = index.describe_index_stats()
    print(f" β Index now contains {stats.total_vector_count} vectors")
# Script entry point; removed trailing extraction residue ("|") after main().
if __name__ == "__main__":
    main()