gaurv007 committed on
Commit
579a6a1
·
verified ·
1 Parent(s): 9c2119c

Delete alpha_factory/infra/rag.py, alpha_factory/infra/rag.py

Browse files
Files changed (1) hide show
  1. alpha_factory/infra/rag.py +0 -140
alpha_factory/infra/rag.py DELETED
@@ -1,140 +0,0 @@
1
- """
2
- RAG System — ChromaDB + arXiv paper indexer.
3
- Retrieves relevant academic papers for the Hypothesis Hunter.
4
- """
5
- import os
6
- from pathlib import Path
7
- from typing import Optional
8
-
9
-
10
- class PaperRAG:
11
- """
12
- RAG retrieval over arXiv q-fin papers.
13
- Uses ChromaDB for vector storage + bge-small for embeddings.
14
- """
15
-
16
- def __init__(self, persist_dir: Path, collection_name: str = "qfin_papers"):
17
- try:
18
- import chromadb
19
- self.client = chromadb.PersistentClient(path=str(persist_dir))
20
- self.collection = self.client.get_or_create_collection(
21
- name=collection_name,
22
- metadata={"hnsw:space": "cosine"},
23
- )
24
- self._available = True
25
- except ImportError:
26
- self._available = False
27
- print("[WARN] chromadb not installed — RAG disabled. pip install chromadb")
28
-
29
- @property
30
- def available(self) -> bool:
31
- return self._available
32
-
33
- def index_papers(self, papers: list[dict]):
34
- """
35
- Index papers into ChromaDB.
36
- Each paper: {"id": "arxiv_id", "title": "...", "abstract": "...", "categories": [...]}
37
- """
38
- if not self._available:
39
- return
40
-
41
- ids = [p["id"] for p in papers]
42
- documents = [f"{p['title']}\n\n{p['abstract']}" for p in papers]
43
- metadatas = [{"title": p["title"], "categories": ",".join(p.get("categories", []))} for p in papers]
44
-
45
- # Batch insert (ChromaDB handles embedding automatically with default model)
46
- batch_size = 100
47
- for i in range(0, len(ids), batch_size):
48
- self.collection.upsert(
49
- ids=ids[i:i+batch_size],
50
- documents=documents[i:i+batch_size],
51
- metadatas=metadatas[i:i+batch_size],
52
- )
53
-
54
- def retrieve(self, query: str, n_results: int = 3) -> list[str]:
55
- """
56
- Retrieve top-N relevant paper abstracts for a given theme/query.
57
- Returns list of formatted paper strings.
58
- """
59
- if not self._available or self.collection.count() == 0:
60
- return []
61
-
62
- results = self.collection.query(
63
- query_texts=[query],
64
- n_results=n_results,
65
- )
66
-
67
- papers = []
68
- if results["documents"] and results["documents"][0]:
69
- for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
70
- papers.append(f"[{meta.get('title', 'Unknown')}]\n{doc[:500]}")
71
-
72
- return papers
73
-
74
- def count(self) -> int:
75
- """Number of indexed papers."""
76
- if not self._available:
77
- return 0
78
- return self.collection.count()
79
-
80
-
81
- async def fetch_arxiv_papers(
82
- categories: list[str] = ["q-fin.PM", "q-fin.ST", "q-fin.CP", "stat.AP"],
83
- max_results: int = 500,
84
- start_year: int = 2021,
85
- ) -> list[dict]:
86
- """
87
- Fetch papers from arXiv API.
88
- Returns list of {id, title, abstract, categories, published}.
89
- """
90
- import asyncio
91
- import aiohttp
92
- from xml.etree import ElementTree as ET
93
-
94
- base_url = "http://export.arxiv.org/api/query"
95
- papers = []
96
-
97
- for cat in categories:
98
- query = f"cat:{cat}"
99
- params = {
100
- "search_query": query,
101
- "start": 0,
102
- "max_results": max_results // len(categories),
103
- "sortBy": "submittedDate",
104
- "sortOrder": "descending",
105
- }
106
-
107
- try:
108
- async with aiohttp.ClientSession() as session:
109
- async with session.get(base_url, params=params) as resp:
110
- if resp.status != 200:
111
- continue
112
- text = await resp.text()
113
-
114
- # Parse XML
115
- ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
116
- root = ET.fromstring(text)
117
-
118
- for entry in root.findall("atom:entry", ns):
119
- arxiv_id = entry.find("atom:id", ns).text.split("/")[-1]
120
- title = entry.find("atom:title", ns).text.strip().replace("\n", " ")
121
- abstract = entry.find("atom:summary", ns).text.strip().replace("\n", " ")
122
- published = entry.find("atom:published", ns).text[:10]
123
-
124
- cats = [c.attrib["term"] for c in entry.findall("arxiv:primary_category", ns)]
125
-
126
- papers.append({
127
- "id": arxiv_id,
128
- "title": title,
129
- "abstract": abstract,
130
- "categories": cats,
131
- "published": published,
132
- })
133
-
134
- # Rate limit arxiv API
135
- await asyncio.sleep(3)
136
-
137
- except Exception as e:
138
- print(f"[WARN] arXiv fetch failed for {cat}: {e}")
139
-
140
- return papers