Commit 0918d3a
Parent(s): 2d2dc23
refactor: cosine similarity and text splitting

app.py CHANGED
@@ -1,46 +1,58 @@
-import gradio as gr
-import spaces
-import subprocess
 import os
-import shutil
-import string
-import random
 import glob
 from pypdf import PdfReader
 from sentence_transformers import SentenceTransformer

 model_name = os.environ.get("MODEL", "Snowflake/snowflake-arctic-embed-m")
-chunk_size = int(os.environ.get("CHUNK_SIZE",
-

 model = SentenceTransformer(model_name)
-

-@spaces.GPU
-def embed(queries, chunks) -> dict[str, list[tuple[str, float]]]:
-    query_embeddings = model.encode(queries, prompt_name="query")
-    document_embeddings = model.encode(chunks)

-
-
-
-
-
-

-    return results


-
-
-
-
-        if len(text) > 0:
-            full_text += f"---- Page {idx} ----\n" + page.extract_text() + "\n\n"

-

-
     plain_text_filetypes = [
         ".txt",
         ".csv",
@@ -54,7 +66,7 @@ def convert(filename) -> str:
     ]
     # Already a plain text file that wouldn't benefit from pandoc so return the content
     if any(filename.endswith(ft) for ft in plain_text_filetypes):
-        with open(filename, "r") as f:
             return f.read()

     if filename.endswith(".pdf"):
@@ -63,75 +75,116 @@ def convert(filename) -> str:
     raise ValueError(f"Unsupported file type: {filename}")


-def
     chunks = []
-
-
-
-
     return chunks

 @spaces.GPU
-def predict(query,
     # Embed the query
     query_embedding = model.encode(query, prompt_name="query")

     # Initialize a list to store all chunks and their similarities across all documents
     all_chunks = []
-
     # Iterate through all documents
-    for
         # Calculate dot product between query and document embeddings
-        similarities = doc["embeddings"]
-
         # Add chunks and similarities to the all_chunks list
-        all_chunks.extend(

     # Sort all chunks by similarity
-    all_chunks.sort(key=lambda x: x[
-
-    # Initialize a dictionary to store relevant chunks for each document
-    relevant_chunks = {}
-
-    # Add most relevant chunks until max_characters is reached
-    total_chars = 0
-    for filename, chunk, _ in all_chunks:
-        if total_chars + len(chunk) <= max_characters:
-            if filename not in relevant_chunks:
-                relevant_chunks[filename] = []
-            relevant_chunks[filename].append(chunk)
-            total_chars += len(chunk)
-        else:
-            break

-    return



-
-
-
-
-
-
-

-    chunks = chunk_to_length(converted_doc, chunk_size)
-    embeddings = model.encode(chunks)

-
-        "chunks": chunks,
-        "embeddings": embeddings,
-    }


 gr.Interface(
     predict,
     inputs=[
         gr.Textbox(label="Query asked about the documents"),
-        gr.Number(label="
     ],
-    outputs=[gr.
-    title="
-    description="
-).launch()

@@ -1,46 +1,58 @@
 import os
 import glob
+import pickle
+from pathlib import Path
+
+import gradio as gr
+import spaces
+import numpy as np
 from pypdf import PdfReader
 from sentence_transformers import SentenceTransformer

+
 model_name = os.environ.get("MODEL", "Snowflake/snowflake-arctic-embed-m")
+chunk_size = int(os.environ.get("CHUNK_SIZE", 1000))
+default_k = int(os.environ.get("DEFAULT_K", 5))

 model = SentenceTransformer(model_name)
+docs = {}


+def extract_text_from_pdf(reader: PdfReader) -> str:
+    """Extract text from PDF pages
+
+    Parameters
+    ----------
+    reader : PdfReader
+        PDF reader
+
+    Returns
+    -------
+    str
+        Raw text
+    """
+    content = [page.extract_text().strip() for page in reader.pages]
+    return "\n\n".join(content).strip()


+def convert(filename: str) -> str:
+    """Convert file content to raw text

+    Parameters
+    ----------
+    filename : str
+        The filename or path

+    Returns
+    -------
+    str
+        The raw text

+    Raises
+    ------
+    ValueError
+        If the file type is not supported.
+    """
     plain_text_filetypes = [
         ".txt",
         ".csv",
@@ -54,7 +66,7 @@ def convert(filename) -> str:
     ]
     # Already a plain text file that wouldn't benefit from pandoc so return the content
     if any(filename.endswith(ft) for ft in plain_text_filetypes):
+        with open(filename, "r", encoding="utf-8") as f:
             return f.read()

     if filename.endswith(".pdf"):
@@ -63,75 +75,116 @@ def convert(filename) -> str:
     raise ValueError(f"Unsupported file type: {filename}")


+def generate_chunks(text: str, max_length: int) -> list[str]:
+    """Generate chunks from a file's raw text. Chunks are calculated based
+    on the `max_length` parameter and the split character (.)
+
+    Parameters
+    ----------
+    text : str
+        The raw text
+    max_length : int
+        Maximum number of characters a chunk can have. Note that chunks
+        may not have this exact length, as another component is also
+        involved in the splitting process
+
+    Returns
+    -------
+    list[str]
+        A list of chunks/nodes
+    """
+
+    segments = text.split(".")
     chunks = []
+    chunk = ""
+
+    for current_segment in segments:
+        if len(chunk) < max_length:
+            chunk += current_segment
+        else:
+            chunks.append(chunk)
+            chunk = current_segment
+    if chunk:
+        chunks.append(chunk)
     return chunks

+
 @spaces.GPU
+def predict(query: str, k: int = 5) -> str:
+    """Find k most relevant chunks based on the given query
+
+    Parameters
+    ----------
+    query : str
+        The input query
+    k : int, optional
+        Number of relevant chunks to return, by default 5
+
+    Returns
+    -------
+    str
+        The k chunks concatenated together as a single string.
+
+    Example
+    -------
+    If k=2, the returned string might look like:
+
+    "CONTEXT:\n\nchunk-1\n\nchunk-2"
+
+    """
     # Embed the query
     query_embedding = model.encode(query, prompt_name="query")

     # Initialize a list to store all chunks and their similarities across all documents
     all_chunks = []
     # Iterate through all documents
+    for doc in docs.values():
         # Calculate dot product between query and document embeddings
+        similarities = np.dot(doc["embeddings"], query_embedding) / (
+            np.linalg.norm(doc["embeddings"]) * np.linalg.norm(query_embedding)
+        )
         # Add chunks and similarities to the all_chunks list
+        all_chunks.extend(list(zip(doc["chunks"], similarities)))

     # Sort all chunks by similarity
+    all_chunks.sort(key=lambda x: x[1], reverse=True)

+    return "CONTEXT:\n\n" + "\n\n".join(chunk for chunk, _ in all_chunks[:k])


+def init():
+    """Init function

+    It will load or calculate the embeddings
+    """
+    global docs  # pylint: disable=W0603
+    embeddings_file = Path("embeddings.pickle")
+    if embeddings_file.exists():
+        with open(embeddings_file, "rb") as embeddings_pickle:
+            docs = pickle.load(embeddings_pickle)
+    else:
+        for filename in glob.glob("sources/*"):
+            converted_doc = convert(filename)
+            chunks = generate_chunks(converted_doc, chunk_size)
+            embeddings = model.encode(chunks)
+            docs[filename] = {
+                "chunks": chunks,
+                "embeddings": embeddings,
+            }
+        with open(embeddings_file, "wb") as pickle_file:
+            pickle.dump(docs, pickle_file)


+init()


 gr.Interface(
     predict,
     inputs=[
         gr.Textbox(label="Query asked about the documents"),
+        gr.Number(label="Number of relevant sources returned (k)", value=default_k),
     ],
+    outputs=[gr.Text(label="Relevant chunks")],
+    title="ContextQA tool - El Salvador",
+    description="Forked and customized RAG tool working with law documents from El Salvador",
+).launch()
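
Editor's note (not part of the commit): in the similarity block of predict above, np.linalg.norm(doc["embeddings"]) without an axis argument is the Frobenius norm of the whole chunk-embedding matrix, not the norm of each chunk vector. Ranking within a single document is unaffected, since that denominator is a per-document constant, but the scores are not true per-chunk cosine similarities and are not directly comparable across documents. A minimal per-row sketch, assuming doc_embeddings is a (num_chunks, dim) NumPy array and query_embedding is a (dim,) vector; the names are illustrative, not taken from the commit:

import numpy as np

def cosine_similarities(doc_embeddings, query_embedding):
    # Dot product of every chunk embedding (one row each) with the query: shape (num_chunks,)
    dots = doc_embeddings @ query_embedding
    # Per-row chunk norms multiplied by the query norm
    norms = np.linalg.norm(doc_embeddings, axis=1) * np.linalg.norm(query_embedding)
    return dots / norms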
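
Editor's note (not part of the commit): a quick sketch of what generate_chunks above produces. The text is split on ".", a chunk grows until it reaches max_length, and then a new chunk starts with the current segment; the periods themselves are dropped by str.split, and a chunk can end up longer than max_length because the length check happens before the segment is appended.

# Assumes generate_chunks as defined in app.py above is in scope
print(generate_chunks("One. Two. Three. Four.", max_length=10))
# ['One Two Three', ' Four']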
sources/Constitucion de la Republica.pdf
ADDED
Binary file (321 kB).
sources/GeForce-RTX-4090-GAMING-X-TRIO-24G.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:96cb2dd9797ac7dca9df67a7fd499bb45eecb15219c617bb2d73a3eec19649e6
-size 1519838
sources/Reglamento General de Transito y Seguridad Vial correcto.pdf
ADDED
Binary file (387 kB).
sources/add_your_files_here
DELETED
File without changes
sources/march19newarmouriessamplemenu.pdf
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:886365911dc9cea7d983108b532729e1a895388b27c096bc6554535073ca351a
-size 52843