alezsd committed on
Commit
7e81e3e
·
1 Parent(s): d9f9b33

multiple LLMs and multiple Embedding Vectors

Browse files
__pycache__/corpus_loader.cpython-39.pyc ADDED
Binary file (3.62 kB). View file
 
__pycache__/evaluator.cpython-39.pyc ADDED
Binary file (5.36 kB). View file
 
__pycache__/knowledge_base.cpython-39.pyc ADDED
Binary file (4.7 kB). View file
 
__pycache__/rag_corrector.cpython-39.pyc ADDED
Binary file (5.92 kB). View file
 
__pycache__/vector_store.cpython-39.pyc ADDED
Binary file (4.83 kB). View file
 
app.py CHANGED
@@ -54,6 +54,37 @@ DEMO_EXAMPLES = [
54
  "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
55
  ]
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  # ── Función principal ─────────────────────────────────────────────────────────
58
 
59
  def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
@@ -243,6 +274,15 @@ with gr.Blocks(
243
  ],
244
  value="llama-3.3-70b-versatile",
245
  )
 
 
 
 
 
 
 
 
 
246
  show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
247
  btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")
248
 
@@ -275,6 +315,13 @@ with gr.Blocks(
275
  outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
276
  )
277
 
 
 
 
 
 
 
 
278
  # ── Pestaña 2: Evaluación ─────────────────────────────────────────────
279
  with gr.TabItem(" Evaluation with GT"):
280
  gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
 
54
  "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
55
  ]
56
 
57
+ # Variable global para el vector store activo
58
+ current_embed_model = "openai"
59
+ vs = VectorStore(embedding_model="openai")
60
+ vs.index(all_pairs)
61
+
62
def cambiar_embedding(embed_model: str):
    """Switch the active embedding model and rebuild the RAG corrector.

    Builds a ``VectorStore`` for ``embed_model``, indexes ``all_pairs`` into
    it when its collection is empty, and wraps it in a new ``RAGCorrector``.
    The module-level ``vs``, ``corrector`` and ``current_embed_model`` are
    rebound only after every step has succeeded, so a failure leaves the
    previously active model, store and corrector fully in place.

    Args:
        embed_model: Key of the embedding model to activate; passed through
            unchanged to ``VectorStore(embedding_model=...)``.

    Returns:
        A status string: an informational note when ``embed_model`` is
        already active, a success message including the document count, or
        an error message when the switch failed.
    """
    global vs, corrector, current_embed_model

    # Nothing to do when the requested model is already the active one.
    if embed_model == current_embed_model:
        return f"ℹ Ya estás usando **{embed_model}**"

    try:
        # Work on locals first: rebinding the globals up front would leave
        # the app half-switched if any of the steps below raised.
        new_vs = VectorStore(embedding_model=embed_model)

        # Index only when the collection for this model is empty.
        if new_vs.count() == 0:
            new_vs.index(all_pairs)
            msg = f" Re-indexado con **{embed_model}** · {new_vs.count()} docs"
        else:
            msg = f" Cargado índice existente **{embed_model}** · {new_vs.count()} docs"

        # The corrector must be rebuilt on top of the new vector store.
        new_corrector = RAGCorrector(new_vs)
    except Exception as e:
        # UI boundary: report the failure as a status message instead of
        # raising; the previously active globals are left untouched.
        return f" Error cambiando embedding: {e}"

    # Commit the switch atomically now that every step has succeeded.
    vs = new_vs
    corrector = new_corrector
    current_embed_model = embed_model
    return msg
85
+
86
+
87
+
88
  # ── Función principal ─────────────────────────────────────────────────────────
89
 
90
  def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
 
274
  ],
275
  value="llama-3.3-70b-versatile",
276
  )
277
+ embedding_selector = gr.Dropdown(
278
+ label="Modelo de Embedding",
279
+ choices=[
280
+ "openai", # text-embedding-3-small
281
+ "mpnet", # paraphrase-multilingual-mpnet-base-v2
282
+ ],
283
+ value="openai",
284
+ )
285
+
286
  show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
287
  btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")
288
 
 
315
  outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
316
  )
317
 
318
+ embed_status = gr.Markdown()
319
+ embedding_selector.change(
320
+ fn=cambiar_embedding,
321
+ inputs=[embedding_selector],
322
+ outputs=[embed_status],
323
+ )
324
+
325
  # ── Pestaña 2: Evaluación ─────────────────────────────────────────────
326
  with gr.TabItem(" Evaluation with GT"):
327
  gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/data_level0.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fa2d11317016f60bb3749001d253cfcda22e6a1a86ef4205cd7275136b13845
3
- size 167600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9a99eaf64f11b1ada2c9f30404ee828a73c3655cf45dd205b342779d17a1252
3
+ size 1689408
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/header.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
3
  size 100
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:877f8338d53e0d5deb2685cd0c3d6ca5a63b93273c282d15324a1e3d78f4f657
3
  size 100
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fdf769d33a55186b4a6847f3f7f5e3df6e7bc0389f4c44a037f33b2d310aa89
3
+ size 36090
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/length.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a12e561363385e9dfeeab326368731c030ed4b374e7f5897ac819159d2884c5
3
- size 400
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e11bd04e05b530050077f737f739485b559b0e6e6a51cf7fd8f02950279e3633
3
+ size 4032
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/link_lists.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3
- size 0
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ff3ccd12bb7743c17e7e4c32786b6ef7a427c87a45d5f4de37e3fcad162af84
3
+ size 8656
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c590488ca400e009d6b879ddc9c3ae539c905f6303551b2b661917b130e4067
3
+ size 1676000
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f6c2dc55a35a27eb2842e8ca379968e83a861a382bdb68505796e318930e07
3
+ size 100
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a86285e486c9f52221296c887a10e701f082d562008e30ae6c4aaa23d06f7a
3
+ size 35802
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52bb6e4904c0cbddf4653e32b02524eb07937f72ea2853c840c07fc239a5656d
3
+ size 4000
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90805b546d1990d551d863243bbaa5b8fe363d3e37a84702e6b0de5dce1b909d
3
+ size 8556
chroma_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31e2501c8f9078412eff559bb7c60b28fe65d2554b1646a80c66ca1e7f993a75
3
- size 225280
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62ff941a20a9ac6009dbb3f0723d7414b3e15b74106e973ab5293013ad647c53
3
+ size 187084800
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a56e255663a3eab9842659c4e0eb8952791278840d2e878adeebadb63ea4417e
3
+ size 3212000
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b58c6fcb8aa76156f8ef447a97afa6faced3ff9db8b05b2fdbcc5c4473480cee
3
+ size 100
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5017d07aa4ce076a7c3d5b6a864ce713f45f097d8041507e874b4acff000f0e8
3
+ size 35802
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c8220828c449043847cfd4441f6ea2ce538c791e3da3f2df3eae2448f7dd808
3
+ size 4000
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ba26051786d261478a3bf76bab748cd2f4333823a223c555aa5b6d1943b7f22
3
+ size 8488
vector_store.py CHANGED
@@ -29,26 +29,22 @@ COLLECTION = "scriptorium_corpus"
29
 
30
 
31
  class VectorStore:
32
- def __init__(self):
33
  self.client = chromadb.PersistentClient(path=CHROMA_PATH)
34
 
35
- # Función de embedding: OpenAI si hay clave, sino local (sentencetransformers)
36
- #if OPENAI_KEY:
37
- # self.ef = embedding_functions.OpenAIEmbeddingFunction(
38
- # api_key=OPENAI_KEY,
39
- # model_name=EMBED_MODEL,
40
- # )
41
- #else:
42
- # # Fallback: modelo local multilingüe
43
- # self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
44
- # model_name="intfloat/multilingual-e5-small"
45
- # )
46
-
47
- self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
48
- model_name="intfloat/multilingual-e5-small")
49
 
50
  self.collection = self.client.get_or_create_collection(
51
- name=COLLECTION,
52
  embedding_function=self.ef,
53
  metadata={"hnsw:space": "cosine"},
54
  )
 
29
 
30
 
31
  class VectorStore:
32
+ def __init__(self, embedding_model: str = "e5"):
33
  self.client = chromadb.PersistentClient(path=CHROMA_PATH)
34
 
35
+ if embedding_model == "mpnet":
36
+ self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
37
+ model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
38
+ )
39
+ collection_name = "scriptorium_mpnet"
40
+ else: # e5 por defecto — el que ya funciona
41
+ self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
42
+ model_name="intfloat/multilingual-e5-small"
43
+ )
44
+ collection_name = "scriptorium_e5"
 
 
 
 
45
 
46
  self.collection = self.client.get_or_create_collection(
47
+ name=collection_name, # ← colección separada por modelo
48
  embedding_function=self.ef,
49
  metadata={"hnsw:space": "cosine"},
50
  )