consolidate: Evo arXiv ingestion pipeline

Browse files

Files changed (1) hide show

evo/arxiv_ingestor.py +43 -0

evo/arxiv_ingestor.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""
+Agent Q3 [Evo] — arXiv Ingestor
+Fetches papers → chunks → embeds (nomic-embed-text-v1 by default, 768-dim) → stores in ChromaDB.
+"""
+import arxiv, os
+from chromadb_store import ChromaStore
+from sentence_transformers import SentenceTransformer
+EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-ai/nomic-embed-text-v1")  # model name handed to SentenceTransformer; override with the EMBED_MODEL env var
+SEARCH_QUERIES = [  # arXiv query strings; ingest() runs one search per entry
+    "prediction markets mechanism design",
+    "multi-agent LLM orchestration LangGraph",
+    "LoRA fine-tuning reinforcement learning",
+    "smart contract Solidity security",
+    "decentralized finance automated market maker",
+]
+def chunk_text(text: str, size: int = 512, overlap: int = 64) -> list[str]:
+    """Split *text* into overlapping chunks of whitespace-separated words.
+
+    Consecutive chunks start ``size - overlap`` words apart, so each chunk
+    repeats the last ``overlap`` words of the previous one. Chunking stops as
+    soon as a chunk reaches the final word, so no trailing chunk is emitted
+    that would be fully contained in the one before it.
+
+    Args:
+        text: Input text; it is split on arbitrary whitespace.
+        size: Maximum number of words per chunk. Must be positive.
+        overlap: Number of words shared between neighbouring chunks. Must be
+            smaller than ``size``. A negative value is not rejected; it
+            leaves a gap of skipped words between chunks.
+
+    Returns:
+        The chunks in order, each re-joined with single spaces. Empty or
+        whitespace-only text yields an empty list.
+
+    Raises:
+        ValueError: If ``size`` is not positive or ``overlap >= size``
+            (the window would never advance).
+    """
+    if size <= 0:
+        raise ValueError(f"size must be positive, got {size}")
+    if overlap >= size:
+        raise ValueError(
+            f"overlap ({overlap}) must be smaller than size ({size})"
+        )
+    words = text.split()
+    step = size - overlap
+    chunks = []
+    for start in range(0, len(words), step):
+        chunks.append(" ".join(words[start:start + size]))
+        # This window already covers the last word; another iteration would
+        # only repeat a suffix of it.
+        if start + size >= len(words):
+            break
+    return chunks
+def ingest(max_results: int = 20):
+    """Fetch recent arXiv papers for every search query, embed and store them.
+
+    For each entry in ``SEARCH_QUERIES`` up to ``max_results`` papers are
+    requested, sorted by submission date. A paper's title and abstract are
+    joined, split with ``chunk_text``, encoded with the ``EMBED_MODEL``
+    sentence-transformer (normalized embeddings) and handed to
+    ``ChromaStore.add`` on the ``arxiv_papers`` collection. A paper returned
+    by several queries is ingested only once per run, tagged with the first
+    query that surfaced it.
+
+    Args:
+        max_results: Maximum number of papers requested per query.
+    """
+    # NOTE(review): trust_remote_code=True allows custom code shipped with
+    # the model repository to run locally; only point EMBED_MODEL at a
+    # repository you trust.
+    embedder = SentenceTransformer(EMBED_MODEL, trust_remote_code=True)
+    store = ChromaStore(collection="arxiv_papers")
+    client = arxiv.Client()
+    # Short ids already stored in this run; the queries overlap topically, so
+    # the same paper can come back more than once.
+    seen_ids: set[str] = set()
+    for query in SEARCH_QUERIES:
+        search = arxiv.Search(
+            query=query,
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.SubmittedDate,
+        )
+        for paper in client.results(search):
+            arxiv_id = paper.get_short_id()
+            if arxiv_id in seen_ids:
+                # Avoid re-embedding and re-adding identical chunk ids.
+                continue
+            seen_ids.add(arxiv_id)
+            text = f"{paper.title}\n{paper.summary}"
+            chunks = chunk_text(text)
+            if not chunks:
+                continue
+            # NOTE(review): nomic embedding models document task prefixes
+            # such as "search_document: " — confirm whether the query side
+            # of this pipeline expects them before adding one here.
+            embeds = embedder.encode(chunks, normalize_embeddings=True).tolist()
+            ids = [f"{arxiv_id}_{i}" for i in range(len(chunks))]
+            # One dict per chunk: ``[meta] * n`` would make every chunk share
+            # a single dict object.
+            metas = [
+                {"title": paper.title, "arxiv_id": arxiv_id, "query": query}
+                for _ in chunks
+            ]
+            store.add(ids=ids, embeddings=embeds, documents=chunks, metadatas=metas)
+            print(f"Ingested: {paper.title[:60]}")
+if __name__ == "__main__":  # true only when the file is executed as a script, not when imported
+    ingest()  # one ingestion pass with the default max_results