Spaces:

Arxived
/

quick-spin

Sleeping

App Files Community

DrishtiSharma committed on Dec 21, 2024

Commit

08b924e

verified ·

1 Parent(s): 0c77c36

Update interim/app.py

Browse files

Files changed (1) hide show

interim/app.py +70 -57

interim/app.py CHANGED Viewed

@@ -56,6 +56,9 @@ def check_poppler_installed():
 check_poppler_installed()
 def load_docs(document_path):
     try:
         import fitz  # PyMuPDF for text extraction
@@ -71,11 +74,11 @@ def load_docs(document_path):
         doc.close()
-        # Step 2: Combine cleaned text
         full_text = "\n".join(extracted_text)
         st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
-        # Step 3: Chunk the cleaned text
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
             chunk_overlap=100,
@@ -83,9 +86,9 @@ def load_docs(document_path):
         )
         split_docs = text_splitter.create_documents([full_text])
-        # Debug: Show filtered chunks
         st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
-        for i, doc in enumerate(split_docs[:5]):  # Show first 5 chunks
             st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
         return split_docs
@@ -126,30 +129,28 @@ def already_indexed(vectordb, file_name):
     return file_name in indexed_sources
 def load_chain(file_name=None):
     loaded_patent = st.session_state.get("LOADED_PATENT")
-    # Debug: Check PERSISTED_DIRECTORY
-    st.write(f"Using Persisted Directory: {PERSISTED_DIRECTORY}")
     vectordb = Chroma(
         persist_directory=PERSISTED_DIRECTORY,
         embedding_function=HuggingFaceEmbeddings(),
     )
-    # Debug: Confirm already indexed
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
         st.write("✅ Already indexed.")
     else:
         st.write("🔄 Starting document processing and vectorstore update...")
         # Remove existing collection and load new docs
         vectordb.delete_collection()
         docs = load_docs(file_name)
-        # Debug: Verify text chunking
-        st.write(f"🔍 Number of Documents Loaded: {len(docs)}")
-        for i, doc in enumerate(docs[:5]):  # Show first 5 chunks for debugging
-            st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
         # Update vectorstore
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
@@ -160,18 +161,15 @@ def load_chain(file_name=None):
         # Save loaded patent in session state
         st.session_state["LOADED_PATENT"] = file_name
-    # Debug: Check vectorstore indexing
     indexed_docs = vectordb.get(include=["documents"])
-    st.write(f"✅ Indexed Documents in Vectorstore: {len(indexed_docs['documents'])}")
-    for i, doc in enumerate(indexed_docs["documents"][:3]):  # Show first 3 indexed docs
-        st.write(f"Indexed Doc {i + 1}: {doc[:200]}...")
-    # Test retrieval with a sample query
     retriever = vectordb.as_retriever(search_kwargs={"k": 3})
     test_query = "What is this document about?"
     results = retriever.get_relevant_documents(test_query)
-    # Debug: Verify document retrieval
     st.write("🔍 Test Retrieval Results for Query:")
     if results:
         for i, res in enumerate(results):
@@ -182,18 +180,16 @@ def load_chain(file_name=None):
     # Configure memory for conversation
     memory = ConversationBufferMemory(
         memory_key="chat_history",
-        return_messages=True,
-        input_key="question",
-        output_key="answer",
     )
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
         retriever,
-        return_source_documents=False,
-        memory=memory,
     )
 def extract_patent_number(url):
     pattern = r"/patent/([A-Z]{2}\d+)"
     match = re.search(pattern, url)
@@ -208,19 +204,36 @@ def download_pdf(patent_number):
         st.error(f"Failed to download patent PDF: {e}")
         st.stop()
-def preview_pdf(pdf_path):
-    """Generate and display the first page of the PDF as an image."""
     try:
-        doc = fitz.open(pdf_path)  # Open PDF
-        first_page = doc[0]  # Extract the first page
-        pix = first_page.get_pixmap()  # Render page to a Pixmap (image)
         temp_image_path = os.path.join(tempfile.gettempdir(), "pdf_preview.png")
-        pix.save(temp_image_path)  # Save the image temporarily
         return temp_image_path
     except Exception as e:
         st.error(f"Error generating PDF preview: {e}")
         return None
 if __name__ == "__main__":
     st.set_page_config(
         page_title="Patent Chat: Google Patents Chat Demo",
@@ -234,7 +247,7 @@ if __name__ == "__main__":
     patent_link = st.text_area(
         "Enter Google Patent Link:",
         value="https://patents.google.com/patent/US8676427B1/en",
-        height=100
     )
     # Initialize session state
@@ -259,39 +272,39 @@ if __name__ == "__main__":
         # File handling
         pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
         if not os.path.isfile(pdf_path):
-            st.write("📥 Downloading patent file...")
-            try:
-                pdf_path = download_pdf(patent_number)
-                st.write(f"✅ File downloaded: {pdf_path}")
-            except Exception as e:
-                st.error(f"Failed to download patent: {e}")
-                st.stop()
         else:
             st.write("✅ File already downloaded.")
         # Generate PDF preview only if not already displayed
         if not st.session_state.get("pdf_preview_displayed", False):
-            st.write("🖼️ Generating PDF preview...")
-            preview_image_path = preview_pdf(pdf_path)
-            if preview_image_path:
-                st.session_state.pdf_preview = preview_image_path
-                st.image(preview_image_path, caption="First Page Preview", use_container_width=True)
-                st.session_state["pdf_preview_displayed"] = True
-            else:
-                st.warning("Failed to generate PDF preview.")
-                st.session_state.pdf_preview = None
         # Load the document into the system
-        st.write("🔄 Loading document into the system...")
-        try:
-            st.session_state.chain = load_chain(pdf_path)
-            st.session_state.LOADED_PATENT = patent_number
-            st.session_state.loaded_pdf_path = pdf_path
-            st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I assist you with this patent?"}]
-            st.success("🚀 Document successfully loaded! You can now start asking questions.")
-        except Exception as e:
-            st.error(f"Failed to load the document: {e}")
-            st.stop()
     # Display previous chat messages
     if st.session_state.messages:

 check_poppler_installed()
 def load_docs(document_path):
+    """
+    Load and clean the PDF content, then split into chunks.
+    """
     try:
         import fitz  # PyMuPDF for text extraction
         doc.close()
+        # Combine all pages into one text
         full_text = "\n".join(extracted_text)
         st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
+        # Step 2: Chunk the cleaned text
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
             chunk_overlap=100,
         )
         split_docs = text_splitter.create_documents([full_text])
+        # Debug: Show total chunks count and first 3 chunks for verification
         st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
+        for i, doc in enumerate(split_docs[:3]):  # Show first 3 chunks only
             st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
         return split_docs
     return file_name in indexed_sources
 def load_chain(file_name=None):
+    """
+    Load cleaned PDF text, split into chunks, and update the vectorstore.
+    """
     loaded_patent = st.session_state.get("LOADED_PATENT")
+    # Debug: Show persist directory
+    st.write(f"🗂 Using Persisted Directory: {PERSISTED_DIRECTORY}")
     vectordb = Chroma(
         persist_directory=PERSISTED_DIRECTORY,
         embedding_function=HuggingFaceEmbeddings(),
     )
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
         st.write("✅ Already indexed.")
     else:
         st.write("🔄 Starting document processing and vectorstore update...")
         # Remove existing collection and load new docs
         vectordb.delete_collection()
         docs = load_docs(file_name)
         # Update vectorstore
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
         # Save loaded patent in session state
         st.session_state["LOADED_PATENT"] = file_name
+    # Debug: Check vectorstore indexing summary
     indexed_docs = vectordb.get(include=["documents"])
+    st.write(f"✅ Total Indexed Documents: {len(indexed_docs['documents'])}")
+    # Test retrieval with a simple query
     retriever = vectordb.as_retriever(search_kwargs={"k": 3})
     test_query = "What is this document about?"
     results = retriever.get_relevant_documents(test_query)
     st.write("🔍 Test Retrieval Results for Query:")
     if results:
         for i, res in enumerate(results):
     # Configure memory for conversation
     memory = ConversationBufferMemory(
         memory_key="chat_history",
+        return_messages=True
     )
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
         retriever,
+        memory=memory
     )
 def extract_patent_number(url):
     pattern = r"/patent/([A-Z]{2}\d+)"
     match = re.search(pattern, url)
         st.error(f"Failed to download patent PDF: {e}")
         st.stop()
+def preview_pdf(pdf_path, scale_factor=0.5):
+    """
+    Generate and display a resized preview of the first page of the PDF.
+    Args:
+        pdf_path (str): Path to the PDF file.
+        scale_factor (float): Factor to reduce the image size (default is 0.5).
+    Returns:
+        str: Path to the resized image preview.
+    """
     try:
+        # Open the PDF and extract the first page
+        doc = fitz.open(pdf_path)
+        first_page = doc[0]
+        # Apply scaling using a transformation matrix
+        matrix = fitz.Matrix(scale_factor, scale_factor)  # Scale down the image
+        pix = first_page.get_pixmap(matrix=matrix)  # Generate scaled image
+        # Save the preview image
         temp_image_path = os.path.join(tempfile.gettempdir(), "pdf_preview.png")
+        pix.save(temp_image_path)
+        doc.close()
         return temp_image_path
     except Exception as e:
         st.error(f"Error generating PDF preview: {e}")
         return None
 if __name__ == "__main__":
     st.set_page_config(
         page_title="Patent Chat: Google Patents Chat Demo",
     patent_link = st.text_area(
         "Enter Google Patent Link:",
         value="https://patents.google.com/patent/US8676427B1/en",
+        height=90
     )
     # Initialize session state
         # File handling
         pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
         if not os.path.isfile(pdf_path):
+            with st.spinner("📥 Downloading patent file..."):
+                try:
+                    pdf_path = download_pdf(patent_number)
+                    st.write(f"✅ File downloaded: {pdf_path}")
+                except Exception as e:
+                    st.error(f"Failed to download patent: {e}")
+                    st.stop()
         else:
             st.write("✅ File already downloaded.")
         # Generate PDF preview only if not already displayed
         if not st.session_state.get("pdf_preview_displayed", False):
+            with st.spinner("🖼️ Generating PDF preview..."):
+                preview_image_path = preview_pdf(pdf_path, scale_factor=0.5)
+                if preview_image_path:
+                    st.session_state.pdf_preview = preview_image_path
+                    st.image(preview_image_path, caption="First Page Preview", use_container_width=False)
+                    st.session_state["pdf_preview_displayed"] = True
+                else:
+                    st.warning("Failed to generate PDF preview.")
+                    st.session_state.pdf_preview = None
         # Load the document into the system
+        with st.spinner("🔄 Loading document into the system..."):
+            try:
+                st.session_state.chain = load_chain(pdf_path)
+                st.session_state.LOADED_PATENT = patent_number
+                st.session_state.loaded_pdf_path = pdf_path
+                st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I assist you with this patent?"}]
+                st.success("🚀 Document successfully loaded! You can now start asking questions.")
+            except Exception as e:
+                st.error(f"Failed to load the document: {e}")
+                st.stop()
     # Display previous chat messages
     if st.session_state.messages: