Spaces:
Sleeping
Sleeping
multiple LLMs and multiple Embedding Vectors
Browse files- __pycache__/corpus_loader.cpython-39.pyc +0 -0
- __pycache__/evaluator.cpython-39.pyc +0 -0
- __pycache__/knowledge_base.cpython-39.pyc +0 -0
- __pycache__/rag_corrector.cpython-39.pyc +0 -0
- __pycache__/vector_store.cpython-39.pyc +0 -0
- app.py +47 -0
- chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/data_level0.bin +2 -2
- chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/header.bin +1 -1
- chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/index_metadata.pickle +3 -0
- chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/length.bin +2 -2
- chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/link_lists.bin +2 -2
- chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/data_level0.bin +3 -0
- chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/header.bin +3 -0
- chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/index_metadata.pickle +3 -0
- chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/length.bin +3 -0
- chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/link_lists.bin +3 -0
- chroma_db/chroma.sqlite3 +2 -2
- chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/data_level0.bin +3 -0
- chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/header.bin +3 -0
- chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/index_metadata.pickle +3 -0
- chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/length.bin +3 -0
- chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/link_lists.bin +3 -0
- vector_store.py +12 -16
__pycache__/corpus_loader.cpython-39.pyc
ADDED
|
Binary file (3.62 kB). View file
|
|
|
__pycache__/evaluator.cpython-39.pyc
ADDED
|
Binary file (5.36 kB). View file
|
|
|
__pycache__/knowledge_base.cpython-39.pyc
ADDED
|
Binary file (4.7 kB). View file
|
|
|
__pycache__/rag_corrector.cpython-39.pyc
ADDED
|
Binary file (5.92 kB). View file
|
|
|
__pycache__/vector_store.cpython-39.pyc
ADDED
|
Binary file (4.83 kB). View file
|
|
|
app.py
CHANGED
|
@@ -54,6 +54,37 @@ DEMO_EXAMPLES = [
|
|
| 54 |
"fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
|
| 55 |
]
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# ── Función principal ─────────────────────────────────────────────────────────
|
| 58 |
|
| 59 |
def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
|
|
@@ -243,6 +274,15 @@ with gr.Blocks(
|
|
| 243 |
],
|
| 244 |
value="llama-3.3-70b-versatile",
|
| 245 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
|
| 247 |
btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")
|
| 248 |
|
|
@@ -275,6 +315,13 @@ with gr.Blocks(
|
|
| 275 |
outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
|
| 276 |
)
|
| 277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
# ── Pestaña 2: Evaluación ─────────────────────────────────────────────
|
| 279 |
with gr.TabItem(" Evaluation with GT"):
|
| 280 |
gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
|
|
|
|
| 54 |
"fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
|
| 55 |
]
|
| 56 |
|
| 57 |
+
# Variable global para el vector store activo
|
| 58 |
+
current_embed_model = "openai"
|
| 59 |
+
vs = VectorStore(embedding_model="openai")
|
| 60 |
+
vs.index(all_pairs)
|
| 61 |
+
|
| 62 |
+
def cambiar_embedding(embed_model: str):
|
| 63 |
+
global vs, corrector, current_embed_model
|
| 64 |
+
|
| 65 |
+
if embed_model == current_embed_model:
|
| 66 |
+
return f"ℹ Ya estás usando **{embed_model}**"
|
| 67 |
+
|
| 68 |
+
try:
|
| 69 |
+
current_embed_model = embed_model
|
| 70 |
+
vs = VectorStore(embedding_model=embed_model)
|
| 71 |
+
|
| 72 |
+
# Indexar si la colección está vacía
|
| 73 |
+
if vs.count() == 0:
|
| 74 |
+
vs.index(all_pairs)
|
| 75 |
+
msg = f" Re-indexado con **{embed_model}** · {vs.count()} docs"
|
| 76 |
+
else:
|
| 77 |
+
msg = f" Cargado índice existente **{embed_model}** · {vs.count()} docs"
|
| 78 |
+
|
| 79 |
+
# Recrear el corrector con el nuevo vector store
|
| 80 |
+
corrector = RAGCorrector(vs)
|
| 81 |
+
return msg
|
| 82 |
+
|
| 83 |
+
except Exception as e:
|
| 84 |
+
return f" Error cambiando embedding: {e}"
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
|
| 88 |
# ── Función principal ─────────────────────────────────────────────────────────
|
| 89 |
|
| 90 |
def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
|
|
|
|
| 274 |
],
|
| 275 |
value="llama-3.3-70b-versatile",
|
| 276 |
)
|
| 277 |
+
embedding_selector = gr.Dropdown(
|
| 278 |
+
label="Modelo de Embedding",
|
| 279 |
+
choices=[
|
| 280 |
+
"openai", # text-embedding-3-small
|
| 281 |
+
"mpnet", # paraphrase-multilingual-mpnet-base-v2
|
| 282 |
+
],
|
| 283 |
+
value="openai",
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
|
| 287 |
btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")
|
| 288 |
|
|
|
|
| 315 |
outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
|
| 316 |
)
|
| 317 |
|
| 318 |
+
embed_status = gr.Markdown()
|
| 319 |
+
embedding_selector.change(
|
| 320 |
+
fn=cambiar_embedding,
|
| 321 |
+
inputs=[embedding_selector],
|
| 322 |
+
outputs=[embed_status],
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
# ── Pestaña 2: Evaluación ─────────────────────────────────────────────
|
| 326 |
with gr.TabItem(" Evaluation with GT"):
|
| 327 |
gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
|
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/data_level0.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d9a99eaf64f11b1ada2c9f30404ee828a73c3655cf45dd205b342779d17a1252
|
| 3 |
+
size 1689408
|
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/header.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 100
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:877f8338d53e0d5deb2685cd0c3d6ca5a63b93273c282d15324a1e3d78f4f657
|
| 3 |
size 100
|
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4fdf769d33a55186b4a6847f3f7f5e3df6e7bc0389f4c44a037f33b2d310aa89
|
| 3 |
+
size 36090
|
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/length.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e11bd04e05b530050077f737f739485b559b0e6e6a51cf7fd8f02950279e3633
|
| 3 |
+
size 4032
|
chroma_db/035fc15d-702a-4c76-9f8e-7d18e719c7ad/link_lists.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ff3ccd12bb7743c17e7e4c32786b6ef7a427c87a45d5f4de37e3fcad162af84
|
| 3 |
+
size 8656
|
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c590488ca400e009d6b879ddc9c3ae539c905f6303551b2b661917b130e4067
|
| 3 |
+
size 1676000
|
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47f6c2dc55a35a27eb2842e8ca379968e83a861a382bdb68505796e318930e07
|
| 3 |
+
size 100
|
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e0a86285e486c9f52221296c887a10e701f082d562008e30ae6c4aaa23d06f7a
|
| 3 |
+
size 35802
|
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52bb6e4904c0cbddf4653e32b02524eb07937f72ea2853c840c07fc239a5656d
|
| 3 |
+
size 4000
|
chroma_db/cfdfd7f0-5b59-4fee-990c-cd1950937fc0/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90805b546d1990d551d863243bbaa5b8fe363d3e37a84702e6b0de5dce1b909d
|
| 3 |
+
size 8556
|
chroma_db/chroma.sqlite3
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:62ff941a20a9ac6009dbb3f0723d7414b3e15b74106e973ab5293013ad647c53
|
| 3 |
+
size 187084800
|
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a56e255663a3eab9842659c4e0eb8952791278840d2e878adeebadb63ea4417e
|
| 3 |
+
size 3212000
|
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b58c6fcb8aa76156f8ef447a97afa6faced3ff9db8b05b2fdbcc5c4473480cee
|
| 3 |
+
size 100
|
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5017d07aa4ce076a7c3d5b6a864ce713f45f097d8041507e874b4acff000f0e8
|
| 3 |
+
size 35802
|
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c8220828c449043847cfd4441f6ea2ce538c791e3da3f2df3eae2448f7dd808
|
| 3 |
+
size 4000
|
chroma_db/ddc9588a-5d3d-41eb-91a6-dbb69b2a2822/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ba26051786d261478a3bf76bab748cd2f4333823a223c555aa5b6d1943b7f22
|
| 3 |
+
size 8488
|
vector_store.py
CHANGED
|
@@ -29,26 +29,22 @@ COLLECTION = "scriptorium_corpus"
|
|
| 29 |
|
| 30 |
|
| 31 |
class VectorStore:
|
| 32 |
-
def __init__(self):
|
| 33 |
self.client = chromadb.PersistentClient(path=CHROMA_PATH)
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
# )
|
| 46 |
-
|
| 47 |
-
self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
|
| 48 |
-
model_name="intfloat/multilingual-e5-small")
|
| 49 |
|
| 50 |
self.collection = self.client.get_or_create_collection(
|
| 51 |
-
name=
|
| 52 |
embedding_function=self.ef,
|
| 53 |
metadata={"hnsw:space": "cosine"},
|
| 54 |
)
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
class VectorStore:
|
| 32 |
+
def __init__(self, embedding_model: str = "e5"):
|
| 33 |
self.client = chromadb.PersistentClient(path=CHROMA_PATH)
|
| 34 |
|
| 35 |
+
if embedding_model == "mpnet":
|
| 36 |
+
self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
|
| 37 |
+
model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
|
| 38 |
+
)
|
| 39 |
+
collection_name = "scriptorium_mpnet"
|
| 40 |
+
else: # e5 por defecto — el que ya funciona
|
| 41 |
+
self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
|
| 42 |
+
model_name="intfloat/multilingual-e5-small"
|
| 43 |
+
)
|
| 44 |
+
collection_name = "scriptorium_e5"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
self.collection = self.client.get_or_create_collection(
|
| 47 |
+
name=collection_name, # ← colección separada por modelo
|
| 48 |
embedding_function=self.ef,
|
| 49 |
metadata={"hnsw:space": "cosine"},
|
| 50 |
)
|