File size: 5,369 Bytes
f866820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python3
"""
Regenerate embeddings using semantic sentence-transformers model.

Purpose:
    Completely regenerates embeddings using the semantic sentence-transformers model,
    creates a new Pinecone index, and uploads the embeddings. This is a full refresh
    of the vector database with semantic embeddings.

Process:
1. Loads documents and chunks them
2. Generates semantic embeddings (384-dim using all-MiniLM-L6-v2)
3. Saves to data/chunks_semantic.jsonl
4. Creates new Pinecone index with 384 dimensions
5. Uploads semantic embeddings to new index

Inputs:
    None (uses sample_docs directory by default)
    PINECONE_API_KEY environment variable

Outputs:
    Saves embedded chunks to data/chunks_semantic.jsonl
    Creates and populates new Pinecone index
    Prints progress and completion messages

Environment variables required:
    PINECONE_API_KEY: Your Pinecone API key

Usage:
    python scripts/regenerate_with_semantic.py
"""

import json
import os
import sys
import time
from pathlib import Path

# Add project root to path (must happen before the src.* imports below)
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from pinecone import Pinecone, ServerlessSpec

import src.config as cfg
from src.ingestion.chunker import chunk_documents
from src.ingestion.embeddings import batch_embed_chunks, get_embedding
from src.ingestion.load_docs import load_markdown_docs

# Name of the Pinecone index this script (re)creates.
INDEX_NAME = "rag-semantic-384"


def _load_and_chunk():
    """Load the markdown corpus from sample_docs/ and split into chunks.

    Returns:
        list[dict]: chunks as produced by chunk_documents (each has at
        least a "text" key, consumed by _save_jsonl).
    """
    print("\n[1/5] Loading documents...")
    docs_dir = str(PROJECT_ROOT / "sample_docs")
    docs = load_markdown_docs(docs_dir)
    print(f"   Loaded {len(docs)} documents")

    print("\n[2/5] Chunking documents...")
    chunks = chunk_documents(docs, max_tokens=300, overlap=50)
    print(f"   Generated {len(chunks)} chunks")
    return chunks


def _embed_chunks(chunks):
    """Embed chunks with the semantic sentence-transformers model.

    Returns:
        list[dict]: one record per chunk with "filename", "chunk_id",
        "embedding" (384-dim for all-MiniLM-L6-v2) and optionally "chars".
    """
    print("\n[3/5] Generating semantic embeddings...")
    print("   Using model: all-MiniLM-L6-v2 (384 dimensions)")
    print("   This may take 1-2 minutes...")
    return batch_embed_chunks(
        chunks,
        provider="sentence-transformers",
        model_name="all-MiniLM-L6-v2",
    )


def _save_jsonl(embedded, chunks):
    """Write one JSON object per chunk to data/chunks_semantic.jsonl.

    The embedding records do not carry the chunk text, so it is merged
    back in positionally from `chunks` (same order as `embedded`).
    """
    print("\n[4/5] Saving embeddings...")
    output_file = PROJECT_ROOT / "data" / "chunks_semantic.jsonl"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    with output_file.open("w", encoding="utf-8") as f:
        for e, chunk in zip(embedded, chunks):
            obj = {
                "id": f"{e['filename']}::{e['chunk_id']}",
                "filename": e["filename"],
                "chunk_id": e["chunk_id"],
                "text": chunk["text"],
                "chars": e.get("chars", 0),
                "embedding": e["embedding"],
            }
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")

    print(f"   βœ“ Saved to: {output_file}")


def _recreate_index(dimension):
    """Delete any pre-existing INDEX_NAME index, create a fresh one, and
    block until it is ready.

    Args:
        dimension: embedding dimensionality for the new index.

    Returns:
        A Pinecone Index handle for INDEX_NAME.
    """
    print("\n[5/5] Setting up Pinecone index...")
    print("   Connecting to Pinecone...")

    pc = Pinecone(api_key=cfg.PINECONE_API_KEY)

    print(f"   Creating new index: {INDEX_NAME}")
    print(f"   Dimension: {dimension}, Metric: cosine")

    # A dimension mismatch would make the old index unusable, so this is a
    # destructive full refresh: drop and recreate.
    existing_indexes = [idx.name for idx in pc.list_indexes()]
    if INDEX_NAME in existing_indexes:
        print(f"   Index '{INDEX_NAME}' already exists - deleting old version...")
        pc.delete_index(INDEX_NAME)

    pc.create_index(
        name=INDEX_NAME,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("   βœ“ Index created")

    # Serverless index creation is asynchronous; poll until ready.
    print("   Waiting for index to be ready...")
    while not pc.describe_index(INDEX_NAME).status.ready:
        time.sleep(1)

    return pc.Index(INDEX_NAME)


def _upload_vectors(index, embedded):
    """Upsert all embedding vectors in batches of 100 and report the count.

    NOTE(review): metadata is uploaded empty, so retrieval must join chunk
    text from data/chunks_semantic.jsonl by vector id - confirm the
    retriever actually does this.
    """
    print(f"\n   Uploading {len(embedded)} vectors to Pinecone...")

    vectors = [
        {
            "id": f"{e['filename']}::{e['chunk_id']}",
            "values": e["embedding"],
            "metadata": {},
        }
        for e in embedded
    ]

    # Pinecone recommends small upsert batches; 100 keeps requests well
    # under payload limits.
    batch_size = 100
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])
        print(f"   Uploaded {min(start + batch_size, len(vectors))}/{len(vectors)} vectors")

    stats = index.describe_index_stats()
    print(f"   βœ“ Index now contains {stats.total_vector_count} vectors")


def main():
    """Run the full regeneration pipeline.

    Steps: load + chunk docs, embed with all-MiniLM-L6-v2, save JSONL,
    recreate the Pinecone index, upload vectors, print next steps.

    Raises:
        SystemExit: if no embeddings were produced (e.g. empty corpus),
        instead of the bare IndexError the old code hit on embedded[0].
    """
    print("=" * 60)
    print("Regenerating Embeddings with Semantic Model")
    print("=" * 60)

    chunks = _load_and_chunk()
    embedded = _embed_chunks(chunks)

    if not embedded:
        raise SystemExit("No embeddings generated - is sample_docs/ empty?")

    # Read the real dimension from the data rather than hard-coding 384.
    actual_dim = len(embedded[0]["embedding"])
    print(f"   βœ“ Generated {len(embedded)} embeddings ({actual_dim} dimensions)")

    _save_jsonl(embedded, chunks)

    index = _recreate_index(actual_dim)
    _upload_vectors(index, embedded)

    print("\n" + "=" * 60)
    print("βœ… COMPLETE!")
    print("=" * 60)
    print("\nNext steps:")
    print(f"1. Update config: export PINECONE_INDEX_NAME='{INDEX_NAME}'")
    print(f"2. Test search: python -c \"from src.retrieval.retriever import query_pinecone; print(query_pinecone('what is GDPR', top_k=5))\"")
    print()


# Entry point guard: run the full regeneration pipeline only when executed
# as a script, not when imported.
if __name__ == "__main__":
    main()