pkgprateek commited on
Commit
7c2156b
·
1 Parent(s): 2236760

feat: add session-based document management with 7-day auto-cleanup

Browse files

- Session isolation using gr.BrowserState with 7-day expiry
- Document list UI with CheckboxGroup for selection
- Delete functionality: samples removed from session only, uploads deleted from ChromaDB
- Fixed 7-day auto-cleanup to actually delete expired docs
- Updated security badge with honest messaging
- Session-filtered RAG retrieval using RunnableLambda

Files changed (2) hide show
  1. app/main.py +387 -37
  2. app/rag_pipeline.py +139 -17
app/main.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  from rag_pipeline import RAGPipeline
3
  from document_processor import DocumentProcessor
4
  import os
 
5
  from dotenv import load_dotenv
6
 
7
  load_dotenv()
@@ -11,9 +12,47 @@ class DocumentRagApp:
11
  def __init__(self):
12
  self.processor = DocumentProcessor()
13
  self.rag_pipeline = RAGPipeline()
14
- self.loaded_documents = []
15
 
16
- def load_samples(self, vertical):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  """Load sample documents with live progress updates"""
18
  samples = {
19
  "Legal": [
@@ -33,33 +72,49 @@ class DocumentRagApp:
33
  ],
34
  }
35
 
 
36
  try:
37
  total_chunks = 0
38
  for idx, path in enumerate(samples[vertical], 1):
39
  if os.path.exists(path):
40
- yield f"Loading document {idx}/{len(samples[vertical])}..."
 
 
 
41
  chunks = self.processor.process_txt(path)
42
 
43
- yield f"Creating smart chunks ({len(chunks)} chunks)..."
44
- self.rag_pipeline.add_documents(chunks, is_sample=True)
 
 
 
 
 
 
45
 
46
- yield f"Building search index..."
47
- self.loaded_documents.append(os.path.basename(path))
 
48
  total_chunks += len(chunks)
49
 
50
- yield f"✓ Success! Loaded {len(samples[vertical])} documents ({total_chunks} searchable chunks)"
 
 
 
51
  except Exception as e:
52
- yield f"❌ Error: {str(e)}"
53
 
54
- def process_file(self, file):
55
  """Process uploaded file with live progress updates"""
 
 
56
  if not file:
57
- yield "⚠️ Please upload a file"
58
  return
59
 
60
  try:
61
  filename = os.path.basename(file.name)
62
- yield f"Processing {filename}..."
63
 
64
  ext = os.path.splitext(file.name)[1].lower()
65
  if ext == ".pdf":
@@ -69,22 +124,35 @@ class DocumentRagApp:
69
  elif ext == ".docx":
70
  chunks = self.processor.process_docx(file.name)
71
  else:
72
- yield "❌ Unsupported format. Please upload PDF, DOCX, or TXT files."
 
 
 
73
  return
74
 
75
- yield f"✂️ Created {len(chunks)} smart chunks..."
76
 
77
- yield f"Building search index (securing with AES-256)..."
78
- self.rag_pipeline.add_documents(chunks, is_sample=False)
79
- self.loaded_documents.append(filename)
 
 
80
 
81
- yield f"✓ Success! {filename} ready for questions ({len(chunks)} searchable chunks)"
 
 
 
 
 
 
82
  except Exception as e:
83
- yield f"❌ Error: {str(e)}. Please try again or contact support."
 
 
 
84
 
85
  def switch_model(self, model_choice):
86
  """Handle model switching from UI radio button"""
87
- # Map UI choices to model keys
88
  model_map = {
89
  "GPT-OSS 120B (OpenAI) - Default": "gpt-oss-120b",
90
  "Llama 3.3 70B (Meta)": "llama-3.3-70b",
@@ -93,7 +161,7 @@ class DocumentRagApp:
93
 
94
  model_key = model_map.get(model_choice)
95
  if not model_key:
96
- return f"❌ Invalid model selection"
97
 
98
  try:
99
  display_name = self.rag_pipeline.switch_model(model_key)
@@ -101,17 +169,70 @@ class DocumentRagApp:
101
  except Exception as e:
102
  return f"❌ Error switching model: {str(e)}"
103
 
104
- def ask(self, question):
105
- if not self.loaded_documents:
 
106
  return "Please load documents first"
107
  if not question.strip():
108
  return "Please enter a question"
109
  try:
110
- result = self.rag_pipeline.query(question)
111
  return result["answer"]
112
  except Exception as e:
113
  return f"Error: {str(e)}"
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  app = DocumentRagApp()
117
 
@@ -451,9 +572,79 @@ body {
451
  color: var(--text-secondary);
452
  opacity: 0.7;
453
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  """
455
 
456
  with gr.Blocks(css=css, theme=gr.themes.Base(), title="Enterprise RAG") as demo:
 
 
 
 
457
  with gr.Column(elem_id="main-container"):
458
  # --- HERO ---
459
  gr.HTML("""
@@ -514,8 +705,8 @@ with gr.Blocks(css=css, theme=gr.themes.Base(), title="Enterprise RAG") as demo:
514
  <div class="security-badge">
515
  <div class="badge-icon">🔒</div>
516
  <div class="badge-content">
517
- <div class="badge-title">AES-256 Encrypted</div>
518
- <div class="badge-subtitle">Processed locally • Auto-deleted in 7 days</div>
519
  </div>
520
  </div>
521
  """)
@@ -547,6 +738,33 @@ with gr.Blocks(css=css, theme=gr.themes.Base(), title="Enterprise RAG") as demo:
547
  elem_classes="model-status",
548
  )
549
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
  # --- RIGHT: INTERACTION CARD (55%) ---
551
  with gr.Column(scale=11):
552
  with gr.Group(elem_classes="glass-card"):
@@ -593,41 +811,173 @@ with gr.Blocks(css=css, theme=gr.themes.Base(), title="Enterprise RAG") as demo:
593
  with gr.Row(elem_id="footer-info"):
594
  gr.HTML("""
595
  <div style="text-align: center; color: var(--text-secondary); margin-top: 3rem; padding-bottom: 2rem; font-size: 0.9rem;">
596
- <p>🔒 <strong>Secure Environment</strong>: Documents processed locally & auto-deleted after 7 days.</p>
597
  <p style="margin-top: 0.5rem; opacity: 0.6;">© 2024 Enterprise RAG Platform. Licensed under MIT.</p>
598
  </div>
599
  """)
600
 
601
- # Event Wiring with live updates (generators)
602
- load_legal.click(fn=lambda: app.load_samples("Legal"), outputs=load_status)
603
- load_research.click(fn=lambda: app.load_samples("Research"), outputs=load_status)
604
- load_finops.click(fn=lambda: app.load_samples("FinOps"), outputs=load_status)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
 
606
- process_btn.click(fn=app.process_file, inputs=file_upload, outputs=upload_status)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
607
 
608
  # Model switching
609
  model_selector.change(
610
  fn=app.switch_model, inputs=model_selector, outputs=model_status
611
  )
612
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
613
  q1.click(
614
- fn=lambda: app.ask("What are the termination conditions?"),
 
615
  outputs=answer,
616
  )
617
  q2.click(
618
- fn=lambda: app.ask("Summarize payment terms"),
 
619
  outputs=answer,
620
  )
621
  q3.click(
622
- fn=lambda: app.ask("Summarize key findings"),
 
623
  outputs=answer,
624
  )
625
  q4.click(
626
- fn=lambda: app.ask("What are the key risks mentioned?"),
 
627
  outputs=answer,
628
  )
629
 
630
- ask_btn.click(fn=app.ask, inputs=question, outputs=answer)
 
 
631
 
632
  if __name__ == "__main__":
633
  demo.launch(share=False)
 
2
  from rag_pipeline import RAGPipeline
3
  from document_processor import DocumentProcessor
4
  import os
5
+ import uuid
6
  from dotenv import load_dotenv
7
 
8
  load_dotenv()
 
12
  def __init__(self):
13
  self.processor = DocumentProcessor()
14
  self.rag_pipeline = RAGPipeline()
 
15
 
16
+ def initialize_session(self, session_data):
17
+ """
18
+ Initialize or restore a user session.
19
+ Session expires after 7 days (consistent with file deletion).
20
+
21
+ Args:
22
+ session_data: Dict with 'id' and 'created_at' or None for new session
23
+
24
+ Returns:
25
+ session_data (dict), loaded documents list, and status message
26
+ """
27
+ from datetime import datetime, timedelta
28
+
29
+ now = datetime.now()
30
+
31
+ if session_data is None:
32
+ # New user - generate session with timestamp
33
+ session_data = {"id": str(uuid.uuid4()), "created_at": now.isoformat()}
34
+ return session_data, [], ""
35
+
36
+ # Check if session has expired (older than 7 days)
37
+ try:
38
+ created_at = datetime.fromisoformat(session_data.get("created_at", ""))
39
+ if now - created_at > timedelta(days=7):
40
+ # Session expired - create new session
41
+ session_data = {"id": str(uuid.uuid4()), "created_at": now.isoformat()}
42
+ return session_data, [], "Session expired. Started fresh session."
43
+ except (ValueError, TypeError):
44
+ # Invalid timestamp - create new session
45
+ session_data = {"id": str(uuid.uuid4()), "created_at": now.isoformat()}
46
+ return session_data, [], ""
47
+
48
+ # Valid session - load existing documents
49
+ session_id = session_data.get("id")
50
+ existing_docs = self.rag_pipeline.get_documents_by_session(session_id)
51
+ doc_names = [doc["filename"] for doc in existing_docs]
52
+ status = f"✓ Restored {len(existing_docs)} documents" if existing_docs else ""
53
+ return session_data, doc_names, status
54
+
55
+ def load_samples(self, vertical, session_id, current_docs):
56
  """Load sample documents with live progress updates"""
57
  samples = {
58
  "Legal": [
 
72
  ],
73
  }
74
 
75
+ loaded_docs = list(current_docs) if current_docs else []
76
  try:
77
  total_chunks = 0
78
  for idx, path in enumerate(samples[vertical], 1):
79
  if os.path.exists(path):
80
+ yield (
81
+ f"Loading document {idx}/{len(samples[vertical])}...",
82
+ loaded_docs,
83
+ )
84
  chunks = self.processor.process_txt(path)
85
 
86
+ yield (
87
+ f"Creating smart chunks ({len(chunks)} chunks)...",
88
+ loaded_docs,
89
+ )
90
+ # Samples are global (is_sample=True), no session filtering
91
+ self.rag_pipeline.add_documents(
92
+ chunks, session_id=None, is_sample=True
93
+ )
94
 
95
+ doc_name = os.path.basename(path)
96
+ if doc_name not in loaded_docs:
97
+ loaded_docs.append(doc_name)
98
  total_chunks += len(chunks)
99
 
100
+ yield (
101
+ f"✓ Success! Loaded {len(samples[vertical])} documents ({total_chunks} searchable chunks)",
102
+ loaded_docs,
103
+ )
104
  except Exception as e:
105
+ yield f"❌ Error: {str(e)}", loaded_docs
106
 
107
+ def process_file(self, file, session_id, current_docs):
108
  """Process uploaded file with live progress updates"""
109
+ loaded_docs = list(current_docs) if current_docs else []
110
+
111
  if not file:
112
+ yield "⚠️ Please upload a file", loaded_docs
113
  return
114
 
115
  try:
116
  filename = os.path.basename(file.name)
117
+ yield f"Processing {filename}...", loaded_docs
118
 
119
  ext = os.path.splitext(file.name)[1].lower()
120
  if ext == ".pdf":
 
124
  elif ext == ".docx":
125
  chunks = self.processor.process_docx(file.name)
126
  else:
127
+ yield (
128
+ "❌ Unsupported format. Please upload PDF, DOCX, or TXT files.",
129
+ loaded_docs,
130
+ )
131
  return
132
 
133
+ yield f"✂️ Created {len(chunks)} smart chunks...", loaded_docs
134
 
135
+ yield "Building secure search index...", loaded_docs
136
+ # Pass session_id for user document isolation
137
+ self.rag_pipeline.add_documents(
138
+ chunks, session_id=session_id, is_sample=False
139
+ )
140
 
141
+ if filename not in loaded_docs:
142
+ loaded_docs.append(filename)
143
+
144
+ yield (
145
+ f"✓ Success! {filename} ready for questions ({len(chunks)} searchable chunks)",
146
+ loaded_docs,
147
+ )
148
  except Exception as e:
149
+ yield (
150
+ f"❌ Error: {str(e)}. Please try again or contact support.",
151
+ loaded_docs,
152
+ )
153
 
154
  def switch_model(self, model_choice):
155
  """Handle model switching from UI radio button"""
 
156
  model_map = {
157
  "GPT-OSS 120B (OpenAI) - Default": "gpt-oss-120b",
158
  "Llama 3.3 70B (Meta)": "llama-3.3-70b",
 
161
 
162
  model_key = model_map.get(model_choice)
163
  if not model_key:
164
+ return "❌ Invalid model selection"
165
 
166
  try:
167
  display_name = self.rag_pipeline.switch_model(model_key)
 
169
  except Exception as e:
170
  return f"❌ Error switching model: {str(e)}"
171
 
172
+ def ask(self, question, session_id, current_docs):
173
+ """Answer a question using documents from this session"""
174
+ if not current_docs:
175
  return "Please load documents first"
176
  if not question.strip():
177
  return "Please enter a question"
178
  try:
179
+ result = self.rag_pipeline.query(question, session_id=session_id)
180
  return result["answer"]
181
  except Exception as e:
182
  return f"Error: {str(e)}"
183
 
184
+ def delete_document(self, doc_to_delete, session_id, current_docs):
185
+ """
186
+ Delete a document from the session.
187
+ - Sample docs: only removed from session (not from storage)
188
+ - User docs: removed from session AND storage/ChromaDB
189
+ """
190
+ if not doc_to_delete:
191
+ return current_docs, "No document selected"
192
+
193
+ # Check if it's a sample document
194
+ sample_names = [
195
+ "service_agreement.txt",
196
+ "amendment.txt",
197
+ "nda.txt",
198
+ "llm_enterprise_survey.txt",
199
+ "rag_methodology.txt",
200
+ "vector_db_benchmark.txt",
201
+ "cloud_cost_optimization.txt",
202
+ "aws_invoice_sept2024.txt",
203
+ "kubernetes_cost_allocation.txt",
204
+ ]
205
+
206
+ is_sample = doc_to_delete in sample_names
207
+
208
+ if is_sample:
209
+ # Sample doc: just remove from this session's list (not from storage)
210
+ print(f"[DEBUG] Removing SAMPLE doc from session only: {doc_to_delete}")
211
+ updated_docs = [d for d in current_docs if d != doc_to_delete]
212
+ return updated_docs, f"✓ Removed {doc_to_delete}"
213
+ else:
214
+ # User doc: remove from session AND delete from ChromaDB
215
+ print(
216
+ f"[DEBUG] Deleting USER doc from session AND storage: {doc_to_delete}"
217
+ )
218
+ user_docs = self.rag_pipeline.get_documents_by_session(session_id)
219
+ for doc in user_docs:
220
+ if doc["filename"] == doc_to_delete:
221
+ print(f"[DEBUG] Found in storage, deleting: {doc['path']}")
222
+ success = self.rag_pipeline.delete_document(session_id, doc["path"])
223
+ if success:
224
+ print(f"[DEBUG] Successfully deleted from ChromaDB")
225
+ updated_docs = [d for d in current_docs if d != doc_to_delete]
226
+ return updated_docs, f"✓ Deleted {doc_to_delete}"
227
+ else:
228
+ print(f"[DEBUG] Failed to delete from ChromaDB")
229
+ return current_docs, f"❌ Failed to delete {doc_to_delete}"
230
+
231
+ # Document not found in storage, just remove from list
232
+ print(f"[DEBUG] Doc not in storage, just removing from session list")
233
+ updated_docs = [d for d in current_docs if d != doc_to_delete]
234
+ return updated_docs, f"✓ Removed {doc_to_delete}"
235
+
236
 
237
  app = DocumentRagApp()
238
 
 
572
  color: var(--text-secondary);
573
  opacity: 0.7;
574
  }
575
+
576
+ /* --- DOCUMENT CHECKBOX GROUP --- */
577
+ .doc-checkbox-group {
578
+ margin-top: 0.5rem !important;
579
+ margin-bottom: 0.5rem !important;
580
+ }
581
+
582
+ .doc-checkbox-group .wrap {
583
+ display: flex !important;
584
+ flex-wrap: wrap !important;
585
+ gap: 0.6rem !important;
586
+ }
587
+
588
+ .doc-checkbox-group label {
589
+ display: flex !important;
590
+ align-items: center !important;
591
+ gap: 0.5rem !important;
592
+ background: rgba(255, 255, 255, 0.08) !important;
593
+ border: 1px solid rgba(255, 255, 255, 0.12) !important;
594
+ border-radius: 100px !important;
595
+ padding: 0.4rem 0.9rem 0.4rem 0.6rem !important;
596
+ font-size: 0.8rem !important;
597
+ color: var(--text-secondary) !important;
598
+ cursor: pointer !important;
599
+ transition: all 0.15s ease !important;
600
+ }
601
+
602
+ .doc-checkbox-group label:hover {
603
+ background: rgba(255, 255, 255, 0.12) !important;
604
+ border-color: rgba(255, 255, 255, 0.2) !important;
605
+ }
606
+
607
+ .doc-checkbox-group label.selected {
608
+ background: rgba(239, 68, 68, 0.15) !important;
609
+ border-color: rgba(239, 68, 68, 0.4) !important;
610
+ color: #fca5a5 !important;
611
+ }
612
+
613
+ /* Show checkbox with custom styling */
614
+ .doc-checkbox-group input[type="checkbox"] {
615
+ appearance: none !important;
616
+ -webkit-appearance: none !important;
617
+ width: 14px !important;
618
+ height: 14px !important;
619
+ border: 1.5px solid rgba(255, 255, 255, 0.4) !important;
620
+ border-radius: 3px !important;
621
+ background: transparent !important;
622
+ cursor: pointer !important;
623
+ margin: 0 !important;
624
+ flex-shrink: 0 !important;
625
+ }
626
+
627
+ .doc-checkbox-group input[type="checkbox"]:checked {
628
+ background: #ef4444 !important;
629
+ border-color: #ef4444 !important;
630
+ }
631
+
632
+ .doc-checkbox-group input[type="checkbox"]:checked::after {
633
+ content: '✓' !important;
634
+ display: block !important;
635
+ text-align: center !important;
636
+ font-size: 10px !important;
637
+ line-height: 12px !important;
638
+ color: white !important;
639
+ font-weight: bold !important;
640
+ }
641
  """
642
 
643
  with gr.Blocks(css=css, theme=gr.themes.Base(), title="Enterprise RAG") as demo:
644
+ # Session and document state (persisted in browser localStorage)
645
+ session_state = gr.BrowserState(default_value=None, storage_key="rag_session_id")
646
+ docs_state = gr.State(value=[]) # List of loaded document names
647
+
648
  with gr.Column(elem_id="main-container"):
649
  # --- HERO ---
650
  gr.HTML("""
 
705
  <div class="security-badge">
706
  <div class="badge-icon">🔒</div>
707
  <div class="badge-content">
708
+ <div class="badge-title">Secure Transfer</div>
709
+ <div class="badge-subtitle">Files encrypted in transit • Auto-deleted in 7 days</div>
710
  </div>
711
  </div>
712
  """)
 
738
  elem_classes="model-status",
739
  )
740
 
741
+ # Divider before document list
742
+ gr.HTML(
743
+ '<div style="margin: 1rem 0; height: 1px; background: rgba(255,255,255,0.15);"></div>'
744
+ )
745
+
746
+ # Active Documents Section - using CheckboxGroup for reliable selection
747
+ gr.Markdown(
748
+ "**📄 Active Documents**", elem_classes="card-subheader"
749
+ )
750
+ doc_checkboxes = gr.CheckboxGroup(
751
+ choices=[],
752
+ value=[],
753
+ label="",
754
+ show_label=False,
755
+ elem_classes="doc-checkbox-group",
756
+ )
757
+ # Spacing before delete button
758
+ gr.HTML('<div style="height: 0.10rem;"></div>')
759
+ with gr.Row():
760
+ remove_docs_btn = gr.Button(
761
+ "🗑️ Delete Selected Documents",
762
+ size="sm",
763
+ elem_classes="query-btn",
764
+ visible=False,
765
+ )
766
+ delete_status = gr.Markdown("", elem_classes="status-message")
767
+
768
  # --- RIGHT: INTERACTION CARD (55%) ---
769
  with gr.Column(scale=11):
770
  with gr.Group(elem_classes="glass-card"):
 
811
  with gr.Row(elem_id="footer-info"):
812
  gr.HTML("""
813
  <div style="text-align: center; color: var(--text-secondary); margin-top: 3rem; padding-bottom: 2rem; font-size: 0.9rem;">
814
+ <p>🔒 <strong>Secure Environment</strong>: Documents stored securely & auto-deleted after 7 days.</p>
815
  <p style="margin-top: 0.5rem; opacity: 0.6;">© 2024 Enterprise RAG Platform. Licensed under MIT.</p>
816
  </div>
817
  """)
818
 
819
+ # --- HELPER FUNCTIONS ---
820
+
821
+ def update_doc_ui(docs):
822
+ """Update document checkboxes and remove button visibility"""
823
+ choices = docs if docs else []
824
+ show_btn = len(docs) > 0
825
+ return gr.update(choices=choices, value=[]), gr.update(visible=show_btn)
826
+
827
+ # Helper to extract session ID from session_data dict
828
+ def get_session_id(session_data):
829
+ """Extract session ID string from session data dict"""
830
+ if isinstance(session_data, dict):
831
+ return session_data.get("id")
832
+ return session_data # Backwards compatibility
833
+
834
+ # --- SESSION INITIALIZATION ---
835
+ def on_load(session_data):
836
+ """Initialize session on page load"""
837
+ new_session_data, docs, status = app.initialize_session(session_data)
838
+ checkbox_update, btn_update = update_doc_ui(docs)
839
+ return new_session_data, docs, checkbox_update, btn_update, status
840
+
841
+ demo.load(
842
+ fn=on_load,
843
+ inputs=[session_state],
844
+ outputs=[
845
+ session_state,
846
+ docs_state,
847
+ doc_checkboxes,
848
+ remove_docs_btn,
849
+ load_status,
850
+ ],
851
+ )
852
+
853
+ # --- EVENT WIRING ---
854
+
855
+ # Sample loading - create specific wrapper functions for each vertical
856
+ def load_legal_samples(session_data, current_docs):
857
+ session_id = get_session_id(session_data)
858
+ for status, docs in app.load_samples("Legal", session_id, current_docs):
859
+ checkbox_update, btn_update = update_doc_ui(docs)
860
+ yield status, docs, checkbox_update, btn_update
861
+
862
+ def load_research_samples(session_data, current_docs):
863
+ session_id = get_session_id(session_data)
864
+ for status, docs in app.load_samples("Research", session_id, current_docs):
865
+ checkbox_update, btn_update = update_doc_ui(docs)
866
+ yield status, docs, checkbox_update, btn_update
867
+
868
+ def load_finops_samples(session_data, current_docs):
869
+ session_id = get_session_id(session_data)
870
+ for status, docs in app.load_samples("FinOps", session_id, current_docs):
871
+ checkbox_update, btn_update = update_doc_ui(docs)
872
+ yield status, docs, checkbox_update, btn_update
873
+
874
+ load_legal.click(
875
+ fn=load_legal_samples,
876
+ inputs=[session_state, docs_state],
877
+ outputs=[load_status, docs_state, doc_checkboxes, remove_docs_btn],
878
+ )
879
+ load_research.click(
880
+ fn=load_research_samples,
881
+ inputs=[session_state, docs_state],
882
+ outputs=[load_status, docs_state, doc_checkboxes, remove_docs_btn],
883
+ )
884
+ load_finops.click(
885
+ fn=load_finops_samples,
886
+ inputs=[session_state, docs_state],
887
+ outputs=[load_status, docs_state, doc_checkboxes, remove_docs_btn],
888
+ )
889
 
890
+ # File upload
891
+ def process_file_wrapper(file, session_data, current_docs):
892
+ session_id = get_session_id(session_data)
893
+ for status, docs in app.process_file(file, session_id, current_docs):
894
+ checkbox_update, btn_update = update_doc_ui(docs)
895
+ yield status, docs, checkbox_update, btn_update
896
+
897
+ process_btn.click(
898
+ fn=process_file_wrapper,
899
+ inputs=[file_upload, session_state, docs_state],
900
+ outputs=[upload_status, docs_state, doc_checkboxes, remove_docs_btn],
901
+ )
902
+
903
+ # Document deletion (batch removal via checkboxes)
904
+ def remove_selected_docs(selected_docs, session_data, current_docs):
905
+ """Remove all selected documents"""
906
+ session_id = get_session_id(session_data)
907
+ if not selected_docs:
908
+ checkbox_update, btn_update = update_doc_ui(current_docs)
909
+ return current_docs, "No documents selected", checkbox_update, btn_update
910
+
911
+ messages = []
912
+ updated_docs = list(current_docs)
913
+ for doc_name in selected_docs:
914
+ updated_docs, msg = app.delete_document(doc_name, session_id, updated_docs)
915
+ messages.append(msg)
916
+
917
+ checkbox_update, btn_update = update_doc_ui(updated_docs)
918
+ status_msg = (
919
+ " / ".join(messages)
920
+ if len(messages) <= 2
921
+ else f"Removed {len(selected_docs)} documents"
922
+ )
923
+ return updated_docs, status_msg, checkbox_update, btn_update
924
+
925
+ remove_docs_btn.click(
926
+ fn=remove_selected_docs,
927
+ inputs=[doc_checkboxes, session_state, docs_state],
928
+ outputs=[docs_state, delete_status, doc_checkboxes, remove_docs_btn],
929
+ )
930
 
931
  # Model switching
932
  model_selector.change(
933
  fn=app.switch_model, inputs=model_selector, outputs=model_status
934
  )
935
 
936
+ # Question answering - explicit functions for each quick question
937
+ def ask_termination(session_data, current_docs):
938
+ session_id = get_session_id(session_data)
939
+ return app.ask("What are the termination conditions?", session_id, current_docs)
940
+
941
+ def ask_payment(session_data, current_docs):
942
+ session_id = get_session_id(session_data)
943
+ return app.ask("Summarize payment terms", session_id, current_docs)
944
+
945
+ def ask_findings(session_data, current_docs):
946
+ session_id = get_session_id(session_data)
947
+ return app.ask("Summarize key findings", session_id, current_docs)
948
+
949
+ def ask_risks(session_data, current_docs):
950
+ session_id = get_session_id(session_data)
951
+ return app.ask("What are the key risks mentioned?", session_id, current_docs)
952
+
953
+ def ask_custom(question, session_data, current_docs):
954
+ session_id = get_session_id(session_data)
955
+ return app.ask(question, session_id, current_docs)
956
+
957
  q1.click(
958
+ fn=ask_termination,
959
+ inputs=[session_state, docs_state],
960
  outputs=answer,
961
  )
962
  q2.click(
963
+ fn=ask_payment,
964
+ inputs=[session_state, docs_state],
965
  outputs=answer,
966
  )
967
  q3.click(
968
+ fn=ask_findings,
969
+ inputs=[session_state, docs_state],
970
  outputs=answer,
971
  )
972
  q4.click(
973
+ fn=ask_risks,
974
+ inputs=[session_state, docs_state],
975
  outputs=answer,
976
  )
977
 
978
+ ask_btn.click(
979
+ fn=ask_custom, inputs=[question, session_state, docs_state], outputs=answer
980
+ )
981
 
982
  if __name__ == "__main__":
983
  demo.launch(share=False)
app/rag_pipeline.py CHANGED
@@ -3,7 +3,11 @@ from langchain_huggingface import HuggingFaceEmbeddings
3
  from langchain_openai import ChatOpenAI
4
  from langchain_core.prompts import PromptTemplate
5
  from langchain_core.documents import Document
6
- from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 
 
 
 
7
  from typing import List
8
  import os
9
  from datetime import datetime, timedelta
@@ -81,6 +85,9 @@ class RAGPipeline:
81
  self.current_model = default_model
82
  self.llm = self._initialize_llm(default_model)
83
 
 
 
 
84
  # Create RAG chain
85
  self.rag_chain = self.create_rag_chain()
86
 
@@ -213,13 +220,30 @@ Answer:""",
213
  search_kwargs={"k": 4} # Retrieve top 4 most relevant chunks
214
  )
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  rag_chain = RunnableParallel(
217
  {
218
  "result": (
219
  {
220
- "context": retriever
221
  | (lambda docs: "\n\n".join([d.page_content for d in docs])),
222
- "sources": retriever
223
  | (
224
  lambda docs: ", ".join(
225
  list(
@@ -237,27 +261,42 @@ Answer:""",
237
  | prompt
238
  | self.llm
239
  ),
240
- "source_documents": retriever,
241
  }
242
  )
243
  return rag_chain
244
 
245
- def add_documents(self, documents: List[Document], is_sample: bool = False) -> None:
 
 
 
 
 
246
  """
247
  Add processed document chunks to the vector store for retrieval.
248
- Tracks upload timestamp for auto-cleanup (user docs only).
249
 
250
  Args:
251
  documents: List of Document objects with text and metadata
252
- is_sample: If True, document won't be auto-deleted (for demo samples)
 
253
  """
 
 
 
 
 
 
 
 
254
  self.vector_store.add_documents(documents)
255
- # In newer versions of langchain-chroma, persist() is no longer needed
256
- # as documents are automatically persisted when added
257
 
258
  # Track document metadata for cleanup (skip samples)
259
  if not is_sample and documents:
260
- self._track_document(documents[0].metadata.get("source", "unknown"))
 
 
 
261
 
262
  def _check_rate_limit(self) -> bool:
263
  """
@@ -303,12 +342,14 @@ Answer:""",
303
 
304
  return True
305
 
306
- def query(self, question: str):
307
  """
308
  Query the RAG system with a question, retrieves relevant context and generates answer.
 
309
 
310
  Args:
311
  question: User's question string
 
312
 
313
  Returns:
314
  dict: {
@@ -327,6 +368,9 @@ Answer:""",
327
  "Please try again later."
328
  )
329
 
 
 
 
330
  answer = self.rag_chain.invoke(question)
331
  result = answer["result"]
332
 
@@ -396,12 +440,13 @@ Answer:""",
396
 
397
  return citations
398
 
399
- def _track_document(self, source_path: str) -> None:
400
  """
401
  Track document upload timestamp for auto-cleanup.
402
 
403
  Args:
404
  source_path: Path to the uploaded document
 
405
  """
406
  # Load existing metadata
407
  if self.doc_metadata_file.exists():
@@ -410,9 +455,10 @@ Answer:""",
410
  else:
411
  metadata = {"documents": {}}
412
 
413
- # Add new document with current timestamp
414
  metadata["documents"][source_path] = {
415
  "uploaded_at": datetime.now().isoformat(),
 
416
  "is_sample": False,
417
  }
418
 
@@ -434,6 +480,7 @@ Answer:""",
434
  now = datetime.now()
435
  seven_days_ago = now - timedelta(days=7)
436
  documents_to_keep = {}
 
437
 
438
  for doc_path, doc_info in metadata.get("documents", {}).items():
439
  upload_time = datetime.fromisoformat(doc_info["uploaded_at"])
@@ -442,12 +489,87 @@ Answer:""",
442
  if upload_time > seven_days_ago or doc_info.get("is_sample", False):
443
  documents_to_keep[doc_path] = doc_info
444
  else:
445
- # Delete from vector store
446
- # Note: ChromaDB doesn't support direct deletion by metadata filter
447
- # In production, you'd implement this with collection.delete()
448
- print(f"Would delete old document: {doc_path}")
 
 
 
449
 
450
  # Update metadata file
451
  metadata["documents"] = documents_to_keep
452
  with open(self.doc_metadata_file, "w") as f:
453
  json.dump(metadata, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from langchain_openai import ChatOpenAI
4
  from langchain_core.prompts import PromptTemplate
5
  from langchain_core.documents import Document
6
+ from langchain_core.runnables import (
7
+ RunnableParallel,
8
+ RunnablePassthrough,
9
+ RunnableLambda,
10
+ )
11
  from typing import List
12
  import os
13
  from datetime import datetime, timedelta
 
85
  self.current_model = default_model
86
  self.llm = self._initialize_llm(default_model)
87
 
88
+ # Current session ID for retrieval filtering (set per-query)
89
+ self._current_session_id = None
90
+
91
  # Create RAG chain
92
  self.rag_chain = self.create_rag_chain()
93
 
 
220
  search_kwargs={"k": 4} # Retrieve top 4 most relevant chunks
221
  )
222
 
223
+ # Wrap retriever to filter by session
224
+ def session_filter(docs):
225
+ """Filter documents by current session."""
226
+ session_id = self._current_session_id
227
+ if session_id:
228
+ # Return docs matching session_id OR sample docs (is_sample=True)
229
+ return [
230
+ d
231
+ for d in docs
232
+ if d.metadata.get("session_id") == session_id
233
+ or d.metadata.get("is_sample", False)
234
+ ]
235
+ return docs
236
+
237
+ # Create session-filtered retriever as a Runnable
238
+ session_filtered_retriever = retriever | RunnableLambda(session_filter)
239
+
240
  rag_chain = RunnableParallel(
241
  {
242
  "result": (
243
  {
244
+ "context": session_filtered_retriever
245
  | (lambda docs: "\n\n".join([d.page_content for d in docs])),
246
+ "sources": session_filtered_retriever
247
  | (
248
  lambda docs: ", ".join(
249
  list(
 
261
  | prompt
262
  | self.llm
263
  ),
264
+ "source_documents": session_filtered_retriever,
265
  }
266
  )
267
  return rag_chain
268
 
269
+ def add_documents(
270
+ self,
271
+ documents: List[Document],
272
+ session_id: str = None,
273
+ is_sample: bool = False,
274
+ ) -> None:
275
  """
276
  Add processed document chunks to the vector store for retrieval.
277
+ Adds session_id and timestamp metadata for isolation and auto-cleanup.
278
 
279
  Args:
280
  documents: List of Document objects with text and metadata
281
+ session_id: User's session ID for isolation (None for samples)
282
+ is_sample: If True, document is global and won't be auto-deleted
283
  """
284
+ # Add session and timestamp metadata to each chunk
285
+ now = datetime.now().isoformat()
286
+
287
+ for doc in documents:
288
+ doc.metadata["session_id"] = session_id if not is_sample else "global"
289
+ doc.metadata["uploaded_at"] = now
290
+ doc.metadata["is_sample"] = is_sample
291
+
292
  self.vector_store.add_documents(documents)
 
 
293
 
294
  # Track document metadata for cleanup (skip samples)
295
  if not is_sample and documents:
296
+ self._track_document(
297
+ documents[0].metadata.get("source", "unknown"),
298
+ session_id=session_id,
299
+ )
300
 
301
  def _check_rate_limit(self) -> bool:
302
  """
 
342
 
343
  return True
344
 
345
+ def query(self, question: str, session_id: str = None):
346
  """
347
  Query the RAG system with a question, retrieves relevant context and generates answer.
348
+ Results are filtered to the user's session documents + global samples.
349
 
350
  Args:
351
  question: User's question string
352
+ session_id: User's session ID for filtering results
353
 
354
  Returns:
355
  dict: {
 
368
  "Please try again later."
369
  )
370
 
371
+ # Set session ID for filtered retrieval
372
+ self._current_session_id = session_id
373
+
374
  answer = self.rag_chain.invoke(question)
375
  result = answer["result"]
376
 
 
440
 
441
  return citations
442
 
443
+ def _track_document(self, source_path: str, session_id: str = None) -> None:
444
  """
445
  Track document upload timestamp for auto-cleanup.
446
 
447
  Args:
448
  source_path: Path to the uploaded document
449
+ session_id: User's session ID for the document
450
  """
451
  # Load existing metadata
452
  if self.doc_metadata_file.exists():
 
455
  else:
456
  metadata = {"documents": {}}
457
 
458
+ # Add new document with current timestamp and session
459
  metadata["documents"][source_path] = {
460
  "uploaded_at": datetime.now().isoformat(),
461
+ "session_id": session_id,
462
  "is_sample": False,
463
  }
464
 
 
480
  now = datetime.now()
481
  seven_days_ago = now - timedelta(days=7)
482
  documents_to_keep = {}
483
+ deleted_count = 0
484
 
485
  for doc_path, doc_info in metadata.get("documents", {}).items():
486
  upload_time = datetime.fromisoformat(doc_info["uploaded_at"])
 
489
  if upload_time > seven_days_ago or doc_info.get("is_sample", False):
490
  documents_to_keep[doc_path] = doc_info
491
  else:
492
+ # Actually delete from ChromaDB using source path filter
493
+ try:
494
+ self.vector_store._collection.delete(where={"source": doc_path})
495
+ deleted_count += 1
496
+ print(f"Deleted expired document: {doc_path}")
497
+ except Exception as e:
498
+ print(f"Error deleting document {doc_path}: {e}")
499
 
500
  # Update metadata file
501
  metadata["documents"] = documents_to_keep
502
  with open(self.doc_metadata_file, "w") as f:
503
  json.dump(metadata, f, indent=2)
504
+
505
+ if deleted_count > 0:
506
+ print(f"Cleanup complete: removed {deleted_count} expired documents")
507
+
508
+ def get_documents_by_session(self, session_id: str) -> List[str]:
509
+ """
510
+ Get list of document names for a given session.
511
+
512
+ Args:
513
+ session_id: User's session ID
514
+
515
+ Returns:
516
+ List[str]: List of document filenames belonging to this session
517
+ """
518
+ if not self.doc_metadata_file.exists():
519
+ return []
520
+
521
+ with open(self.doc_metadata_file, "r") as f:
522
+ metadata = json.load(f)
523
+
524
+ documents = []
525
+ for doc_path, doc_info in metadata.get("documents", {}).items():
526
+ if doc_info.get("session_id") == session_id:
527
+ # Extract just the filename
528
+ filename = doc_path.split("/")[-1] if "/" in doc_path else doc_path
529
+ documents.append(
530
+ {
531
+ "filename": filename,
532
+ "path": doc_path,
533
+ "uploaded_at": doc_info["uploaded_at"],
534
+ }
535
+ )
536
+
537
+ return documents
538
+
539
+ def delete_document(self, session_id: str, source_path: str) -> bool:
540
+ """
541
+ Delete a specific document from vector store and metadata.
542
+
543
+ Args:
544
+ session_id: User's session ID (for verification)
545
+ source_path: Full path to the document to delete
546
+
547
+ Returns:
548
+ bool: True if deleted, False if not found or not authorized
549
+ """
550
+ if not self.doc_metadata_file.exists():
551
+ return False
552
+
553
+ with open(self.doc_metadata_file, "r") as f:
554
+ metadata = json.load(f)
555
+
556
+ # Verify document belongs to this session
557
+ doc_info = metadata.get("documents", {}).get(source_path)
558
+ if not doc_info:
559
+ return False
560
+ if doc_info.get("session_id") != session_id:
561
+ return False # Not authorized to delete
562
+
563
+ # Delete from ChromaDB
564
+ try:
565
+ self.vector_store._collection.delete(where={"source": source_path})
566
+ except Exception as e:
567
+ print(f"Error deleting from ChromaDB: {e}")
568
+ return False
569
+
570
+ # Remove from metadata
571
+ del metadata["documents"][source_path]
572
+ with open(self.doc_metadata_file, "w") as f:
573
+ json.dump(metadata, f, indent=2)
574
+
575
+ return True