# FastAPI service: downloads policy documents, builds a FAISS index over text chunks,
# and asks Gemini to decide insurance claims against the retrieved clauses.
import os
import re
import json
import tempfile
from urllib.parse import urlparse

import requests
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from docx import Document
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from fastapi import FastAPI, Request

app = FastAPI()
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
def download_file(url: str, dest_dir: str) -> str:
    """Download a document from `url` into `dest_dir` and return its local path."""
    # Take the extension from the URL path so query strings (e.g. signed URLs) don't leak into it.
    url_path = urlparse(url).path
    ext = url_path.rsplit(".", 1)[-1].lower() if "." in url_path else "bin"
    local_path = os.path.join(dest_dir, f"file_{abs(hash(url))}.{ext}")
    resp = requests.get(url, stream=True, timeout=60)
    resp.raise_for_status()
    with open(local_path, "wb") as f:
        for chunk in resp.iter_content(8192):
            f.write(chunk)
    return local_path
def extract_text(file_path: str, max_pages: int = 3) -> str:
    """Extract text from a PDF (first `max_pages` pages only), DOCX, or image file."""
    ext = file_path.rsplit(".", 1)[-1].lower()
    if ext == "pdf":
        # fitz.Document is indexed page by page; iterate explicitly instead of slicing.
        with fitz.open(file_path) as doc:
            pages = range(min(max_pages, doc.page_count))
            return "\n".join(doc[i].get_text() for i in pages)
    elif ext == "docx":
        return "\n".join(p.text for p in Document(file_path).paragraphs)
    elif ext in {"jpg", "jpeg", "png"}:
        # Scanned images go through OCR.
        return pytesseract.image_to_string(Image.open(file_path))
    else:
        raise ValueError(f"Unsupported file type: {ext}")
def extract_params(text: str) -> dict:
    """Pull structured claim parameters out of a free-text query using regexes."""
    age_m = re.search(r"(\d{1,3})[- ]?year[- ]?old", text, re.IGNORECASE)
    gender_m = re.search(r"\b(male|female)\b", text, re.IGNORECASE)
    proc_m = re.search(r"(\w+(?:\s\w+)*\s(?:surgery|replacement|operation|treatment))", text, re.IGNORECASE)
    loc_m = re.search(r"(?:in|at)\s([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)", text)
    # Require "insurance"/"policy" to follow the duration closely so the patient's age
    # ("46-year-old") is not mistaken for the policy duration.
    dur_m = re.search(
        r"(\d+)[- ]?(month|year)[- ]?old\s+(?:\w+\s+){0,2}(?:insurance|policy)",
        text,
        re.IGNORECASE,
    )
    return {
        "age": int(age_m.group(1)) if age_m else None,
        "gender": gender_m.group(1).lower() if gender_m else None,
        "procedure": proc_m.group(1).strip() if proc_m else None,
        "location": loc_m.group(1).strip() if loc_m else None,
        "policy_duration": f"{dur_m.group(1)} {dur_m.group(2).lower()}s" if dur_m else None,
    }
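# Illustrative only: for a query such as
#   "46-year-old male, knee replacement surgery in Pune, 3-month-old insurance policy"
# (a made-up example, not from the source), extract_params would return roughly:
#   {"age": 46, "gender": "male", "procedure": "knee replacement surgery",
#    "location": "Pune", "policy_duration": "3 months"}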
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks
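# With the defaults above, successive chunks advance by chunk_size - overlap = 400 words,
# so they cover words[0:500], words[400:900], words[800:1300], and so on.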
def prepare_policy_index(policy_file_paths: list) -> tuple:
    all_chunks, chunk_sources = [], []
    for path in policy_file_paths:
        text = extract_text(path)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_sources.extend([os.path.basename(path)] * len(chunks))
    embeddings = embedding_model.encode(all_chunks, show_progress_bar=True)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))
    return all_chunks, chunk_sources, index
def semantic_search(query: str, chunks: list, chunk_sources: list, index, top_k: int = 3) -> list:
    query_embedding = embedding_model.encode([query])
    _distances, indices = index.search(np.array(query_embedding), top_k)
    # FAISS pads results with -1 when fewer than top_k vectors are indexed; skip those slots.
    return [(chunks[i], chunk_sources[i]) for i in indices[0] if i != -1]
def get_llm_decision_gemini(structured_json: dict, retrieved_clauses: list, gemini_api_key: str) -> str:
    genai.configure(api_key=gemini_api_key)
    llm = genai.GenerativeModel("gemini-1.5-flash")
    # Join however many clauses were retrieved instead of assuming exactly three.
    clause_text = "\n\n".join(clause for clause, _source in retrieved_clauses)
    prompt = f"""
You are an insurance claim decision model.

Claim Info:
{json.dumps(structured_json, indent=2)}

Relevant Policy Clauses:
{clause_text}

Your task is to:
1. Decide if the claim should be approved or rejected
2. Mention amount if applicable (else null)
3. Give clear justification pointing to the relevant clauses

Respond only in JSON:
{{"Decision": "...", "Amount": "...", "Justification": "..."}}
"""
    response = llm.generate_content(prompt)
    return response.text
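# Optional sketch, not part of the original flow: Gemini sometimes wraps its JSON reply
# in ```json fences, so the raw `response.text` may not parse directly. `parse_llm_json`
# is a hypothetical helper name; it falls back to the raw string when parsing fails.
def parse_llm_json(raw: str):
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence (with or without a language tag) and the closing fence.
        cleaned = re.sub(r"^```[a-zA-Z]*\s*", "", cleaned)
        cleaned = re.sub(r"\s*```$", "", cleaned)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return raw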
@app.post("/hackrx/run")
async def hackrx_run(request: Request):
    data = await request.json()
    document_urls = data.get("documents")
    questions = data.get("questions", [])

    if not document_urls:
        return {"error": "No documents provided."}

    # Accept either a single URL string or a list of URLs.
    if isinstance(document_urls, str):
        document_urls = [document_urls]

    gemini_api_key = os.environ.get("GOOGLE_API_KEY")
    if not gemini_api_key:
        return {"error": "API key not configured in environment variables."}

    with tempfile.TemporaryDirectory() as tmpdir:
        # Download every policy document, then build one FAISS index over all of them.
        policy_paths = [download_file(url, tmpdir) for url in document_urls]
        chunks, chunk_sources, index = prepare_policy_index(policy_paths)

        answers = []
        for question in questions:
            # Extract structured parameters; fall back to the raw question if nothing matched.
            structured_query = extract_params(question)
            query_text = " ".join(str(v) for v in structured_query.values() if v) or question
            retrieved_clauses = semantic_search(query_text, chunks, chunk_sources, index)
            answer = get_llm_decision_gemini(structured_query, retrieved_clauses, gemini_api_key)
            answers.append(answer)

    return {"answers": answers}
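# Minimal local-run sketch (assumes `uvicorn` is installed; the host and port values are
# arbitrary choices, not from the source). An example request body for POST /hackrx/run:
#   {"documents": "https://example.com/policy.pdf",
#    "questions": ["46-year-old male, knee replacement surgery in Pune"]}
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)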