gaurv007 committed
Commit 08897f5 · verified · 1 Parent(s): 713381e

Upload alpha_factory/infra/rag.py with huggingface_hub

Files changed (1)
  1. alpha_factory/infra/rag.py +140 -0
alpha_factory/infra/rag.py ADDED
@@ -0,0 +1,140 @@
"""
RAG System: ChromaDB + arXiv paper indexer.
Retrieves relevant academic papers for the Hypothesis Hunter.
"""
from pathlib import Path


class PaperRAG:
    """
    RAG retrieval over arXiv q-fin papers.
    Uses ChromaDB for vector storage; embeddings come from ChromaDB's default model.
    """

    def __init__(self, persist_dir: Path, collection_name: str = "qfin_papers"):
        try:
            import chromadb
            self.client = chromadb.PersistentClient(path=str(persist_dir))
            self.collection = self.client.get_or_create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"},
            )
            self._available = True
        except ImportError:
            self._available = False
            print("[WARN] chromadb not installed; RAG disabled. pip install chromadb")

    @property
    def available(self) -> bool:
        return self._available

    def index_papers(self, papers: list[dict]):
        """
        Index papers into ChromaDB.
        Each paper: {"id": "arxiv_id", "title": "...", "abstract": "...", "categories": [...]}
        """
        if not self._available:
            return

        ids = [p["id"] for p in papers]
        documents = [f"{p['title']}\n\n{p['abstract']}" for p in papers]
        metadatas = [{"title": p["title"], "categories": ",".join(p.get("categories", []))} for p in papers]

        # Batch insert (ChromaDB handles embedding automatically with its default model)
        batch_size = 100
        for i in range(0, len(ids), batch_size):
            self.collection.upsert(
                ids=ids[i:i + batch_size],
                documents=documents[i:i + batch_size],
                metadatas=metadatas[i:i + batch_size],
            )

    def retrieve(self, query: str, n_results: int = 3) -> list[str]:
        """
        Retrieve the top-N relevant paper abstracts for a given theme/query.
        Returns a list of formatted paper strings.
        """
        if not self._available or self.collection.count() == 0:
            return []

        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
        )

        papers = []
        if results["documents"] and results["documents"][0]:
            for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
                papers.append(f"[{meta.get('title', 'Unknown')}]\n{doc[:500]}")

        return papers

    def count(self) -> int:
        """Number of indexed papers."""
        if not self._available:
            return 0
        return self.collection.count()

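
# Usage sketch for PaperRAG (illustrative, not part of the uploaded module's API):
# the persist path and the paper dict below are hypothetical placeholders.
def _demo_retrieval() -> None:
    rag = PaperRAG(persist_dir=Path("./chroma_store"))  # hypothetical storage location
    if rag.available:
        rag.index_papers([{
            "id": "0000.00000",  # placeholder arXiv id
            "title": "Illustrative momentum paper",
            "abstract": "A placeholder abstract about cross-sectional momentum.",
            "categories": ["q-fin.PM"],
        }])
        for paper in rag.retrieve("momentum factor timing", n_results=1):
            print(paper)
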
async def fetch_arxiv_papers(
    categories: list[str] = ["q-fin.PM", "q-fin.ST", "q-fin.CP", "stat.AP"],
    max_results: int = 500,
    start_year: int = 2021,
) -> list[dict]:
    """
    Fetch papers from the arXiv API.
    Returns a list of {id, title, abstract, categories, published}.
    """
    import asyncio
    import aiohttp
    from xml.etree import ElementTree as ET

    base_url = "http://export.arxiv.org/api/query"
    papers = []

    for cat in categories:
        query = f"cat:{cat}"
        params = {
            "search_query": query,
            "start": 0,
            "max_results": max_results // len(categories),
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }

        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(base_url, params=params) as resp:
                    if resp.status != 200:
                        continue
                    text = await resp.text()

            # Parse the Atom XML feed
            ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
            root = ET.fromstring(text)

            for entry in root.findall("atom:entry", ns):
                arxiv_id = entry.find("atom:id", ns).text.split("/")[-1]
                title = entry.find("atom:title", ns).text.strip().replace("\n", " ")
                abstract = entry.find("atom:summary", ns).text.strip().replace("\n", " ")
                published = entry.find("atom:published", ns).text[:10]

                # Skip papers published before start_year
                if int(published[:4]) < start_year:
                    continue

                cats = [c.attrib["term"] for c in entry.findall("arxiv:primary_category", ns)]

                papers.append({
                    "id": arxiv_id,
                    "title": title,
                    "abstract": abstract,
                    "categories": cats,
                    "published": published,
                })

            # Stay under the arXiv API rate limit between category requests
            await asyncio.sleep(3)

        except Exception as e:
            print(f"[WARN] arXiv fetch failed for {cat}: {e}")

    return papers
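
# End-to-end sketch (assumed wiring, not shown in the original upload): fetch recent
# q-fin papers and build the index. Requires aiohttp and chromadb; the persist path
# and the small max_results value are illustrative choices.
if __name__ == "__main__":
    import asyncio

    async def _build_index() -> None:
        papers = await fetch_arxiv_papers(max_results=40)
        rag = PaperRAG(persist_dir=Path("./chroma_store"))  # hypothetical storage location
        if rag.available and papers:
            rag.index_papers(papers)
            print(f"Indexed {rag.count()} papers; sample query:")
            for hit in rag.retrieve("volatility forecasting"):
                print(hit[:120])

    asyncio.run(_build_index())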