Spaces:

alezsd
/

NSF-RAG-Codex

Sleeping

@@ -54,6 +54,37 @@ DEMO_EXAMPLES = [
     "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
 ]
 # ── Función principal ─────────────────────────────────────────────────────────
 def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
@@ -243,6 +274,15 @@ with gr.Blocks(
                             ],
                             value="llama-3.3-70b-versatile",
                         )
                         show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
                     btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")
@@ -275,6 +315,13 @@ with gr.Blocks(
                 outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
             )
         # ── Pestaña 2: Evaluación ─────────────────────────────────────────────
         with gr.TabItem(" Evaluation with GT"):
             gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")

     "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
 ]
+# Global state for the active vector store: the name of the embedding model
+# currently selected and the VectorStore instance built for it.
+# NOTE(review): the VectorStore.__init__ shown in this change only special-cases
+# "mpnet" and sends every other value to the e5 model — confirm that "openai"
+# is really meant to resolve to that default.
+current_embed_model = "openai"
+vs = VectorStore(embedding_model="openai")
+# NOTE(review): indexing here is unconditional, whereas cambiar_embedding only
+# indexes when count() == 0 — confirm index() is safe to repeat on a
+# collection that already holds documents.
+vs.index(all_pairs)
+def cambiar_embedding(embed_model: str):
+    global vs, corrector, current_embed_model
+    if embed_model == current_embed_model:
+        return f"ℹ Ya estás usando **{embed_model}**"
+    try:
+        current_embed_model = embed_model
+        vs = VectorStore(embedding_model=embed_model)
+        # Indexar si la colección está vacía
+        if vs.count() == 0:
+            vs.index(all_pairs)
+            msg = f" Re-indexado con **{embed_model}** · {vs.count()} docs"
+        else:
+            msg = f" Cargado índice existente **{embed_model}** · {vs.count()} docs"
+        # Recrear el corrector con el nuevo vector store
+        corrector = RAGCorrector(vs)
+        return msg
+    except Exception as e:
+        return f" Error cambiando embedding: {e}"
 # ── Función principal ─────────────────────────────────────────────────────────
 def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
                             ],
                             value="llama-3.3-70b-versatile",
                         )
+                        embedding_selector = gr.Dropdown(
+                            label="Modelo de Embedding",
+                            choices=[
+                                "openai",    # text-embedding-3-small
+                                "mpnet",     # paraphrase-multilingual-mpnet-base-v2
+                            ],
+                            value="openai",
+                        )
                         show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
                     btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")
                 outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
             )
+            embed_status = gr.Markdown()
+            embedding_selector.change(
+                fn=cambiar_embedding,
+                inputs=[embedding_selector],
+                outputs=[embed_status],
+            )
         # ── Pestaña 2: Evaluación ─────────────────────────────────────────────
         with gr.TabItem(" Evaluation with GT"):
             gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")

chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/data_level0.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6fa2d11317016f60bb3749001d253cfcda22e6a1a86ef4205cd7275136b13845
-size 167600

 version https://git-lfs.github.com/spec/v1
+oid sha256:d9a99eaf64f11b1ada2c9f30404ee828a73c3655cf45dd205b342779d17a1252
+size 1689408

chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/header.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
 size 100

 version https://git-lfs.github.com/spec/v1
+oid sha256:877f8338d53e0d5deb2685cd0c3d6ca5a63b93273c282d15324a1e3d78f4f657
 size 100

chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4fdf769d33a55186b4a6847f3f7f5e3df6e7bc0389f4c44a037f33b2d310aa89
+size 36090

chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/length.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
-size 400

 version https://git-lfs.github.com/spec/v1
+oid sha256:e11bd04e05b530050077f737f739485b559b0e6e6a51cf7fd8f02950279e3633
+size 4032

chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/link_lists.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
-size 0

 version https://git-lfs.github.com/spec/v1
+oid sha256:4ff3ccd12bb7743c17e7e4c32786b6ef7a427c87a45d5f4de37e3fcad162af84
+size 8656

chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c590488ca400e009d6b879ddc9c3ae539c905f6303551b2b661917b130e4067
+size 1676000

chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:47f6c2dc55a35a27eb2842e8ca379968e83a861a382bdb68505796e318930e07
+size 100

chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0a86285e486c9f52221296c887a10e701f082d562008e30ae6c4aaa23d06f7a
+size 35802

chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52bb6e4904c0cbddf4653e32b02524eb07937f72ea2853c840c07fc239a5656d
+size 4000

chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90805b546d1990d551d863243bbaa5b8fe363d3e37a84702e6b0de5dce1b909d
+size 8556

chroma_db/chroma.sqlite3 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:31e2501c8f9078412eff559bb7c60b28fe65d2554b1646a80c66ca1e7f993a75
-size 225280

 version https://git-lfs.github.com/spec/v1
+oid sha256:62ff941a20a9ac6009dbb3f0723d7414b3e15b74106e973ab5293013ad647c53
+size 187084800

chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a56e255663a3eab9842659c4e0eb8952791278840d2e878adeebadb63ea4417e
+size 3212000

chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b58c6fcb8aa76156f8ef447a97afa6faced3ff9db8b05b2fdbcc5c4473480cee
+size 100

chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5017d07aa4ce076a7c3d5b6a864ce713f45f097d8041507e874b4acff000f0e8
+size 35802

chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c8220828c449043847cfd4441f6ea2ce538c791e3da3f2df3eae2448f7dd808
+size 4000

chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ba26051786d261478a3bf76bab748cd2f4333823a223c555aa5b6d1943b7f22
+size 8488

vector_store.py CHANGED Viewed

@@ -29,26 +29,22 @@ COLLECTION   = "scriptorium_corpus"
 class VectorStore:
-    def __init__(self):
         self.client = chromadb.PersistentClient(path=CHROMA_PATH)
-        # Función de embedding: OpenAI si hay clave, sino local (sentencetransformers)
-        #if OPENAI_KEY:
-        #    self.ef = embedding_functions.OpenAIEmbeddingFunction(
-        #        api_key=OPENAI_KEY,
-        #        model_name=EMBED_MODEL,
-        #    )
-        #else:
-        #    # Fallback: modelo local multilingüe
-        #    self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
-        #        model_name="intfloat/multilingual-e5-small"
-        #    )
-        self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
-            model_name="intfloat/multilingual-e5-small")
         self.collection = self.client.get_or_create_collection(
-            name=COLLECTION,
             embedding_function=self.ef,
             metadata={"hnsw:space": "cosine"},
         )

 class VectorStore:
+    def __init__(self, embedding_model: str = "e5"):
         self.client = chromadb.PersistentClient(path=CHROMA_PATH)
+        if embedding_model == "mpnet":
+            self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
+                model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+            )
+            collection_name = "scriptorium_mpnet"
+        else:  # e5 por defecto — el que ya funciona
+            self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
+                model_name="intfloat/multilingual-e5-small"
+            )
+            collection_name = "scriptorium_e5"
         self.collection = self.client.get_or_create_collection(
+            name=collection_name,       # ← colección separada por modelo
             embedding_function=self.ef,
             metadata={"hnsw:space": "cosine"},
         )