madDegen committed on
Commit
f5d3e1c
·
verified ·
1 Parent(s): 6a55852

consolidate: Evo arXiv ingestion pipeline

Browse files
Files changed (1) hide show
  1. evo/arxiv_ingestor.py +43 -0
evo/arxiv_ingestor.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Agent Q3 [Evo] — arXiv Ingestor
3
+ Fetches arXiv paper titles and abstracts → chunks → embeds (default model nomic-embed-text-v1, 768-dim) → stores in ChromaDB.
4
+ """
5
+ import arxiv, os
6
+ from chromadb_store import ChromaStore
7
+ from sentence_transformers import SentenceTransformer
8
+
9
# Embedding model identifier; the EMBED_MODEL environment variable overrides
# the default.
EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-ai/nomic-embed-text-v1")

# arXiv search queries, processed in this order by ingest().
SEARCH_QUERIES = [
    "prediction markets mechanism design",
    "multi-agent LLM orchestration LangGraph",
    "LoRA fine-tuning reinforcement learning",
    "smart contract Solidity security",
    "decentralized finance automated market maker",
]
17
+
18
def chunk_text(text: str, size: int = 512, overlap: int = 64) -> list[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Input text. It is tokenized with ``str.split()``, so any run of
            whitespace (including newlines) becomes a single space in the
            output chunks.
        size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < size``.

    Returns:
        The chunks in order. Empty or whitespace-only text yields ``[]``.
        The final chunk may be shorter than ``size`` and can lie entirely
        inside the tail of the previous chunk.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    # A step of zero or less would never advance the window (endless loop,
    # unbounded memory); a negative overlap would silently skip words.
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    words = text.split()
    step = size - overlap
    return [
        " ".join(words[start:start + size])
        for start in range(0, len(words), step)
    ]
25
+
26
def ingest(max_results: int = 20) -> None:
    """Fetch arXiv results for each search query and store chunk embeddings.

    For every query in ``SEARCH_QUERIES`` the newest submissions are
    requested from arXiv. Each paper's title and summary are joined, split
    with ``chunk_text``, embedded, and added to the ``arxiv_papers``
    collection together with per-chunk metadata.

    Args:
        max_results: Maximum number of results requested from arXiv per query.
    """
    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code, and EMBED_MODEL can be overridden via the environment.
    # Confirm that only trusted model names are ever configured.
    embedder = SentenceTransformer(EMBED_MODEL, trust_remote_code=True)
    store = ChromaStore(collection="arxiv_papers")
    client = arxiv.Client()

    for query in SEARCH_QUERIES:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        for paper in client.results(search):
            chunks = chunk_text(f"{paper.title}\n{paper.summary}")
            if not chunks:
                # Nothing to embed; avoid handing empty batches to the
                # embedder and the store.
                continue

            short_id = paper.get_short_id()
            embeds = embedder.encode(chunks, normalize_embeddings=True).tolist()
            ids = [f"{short_id}_{i}" for i in range(len(chunks))]
            # One dict per chunk: `[meta] * n` would alias a single dict, so a
            # mutation of one entry would show up in all of them.
            metas = [
                {"title": paper.title, "arxiv_id": short_id, "query": query}
                for _ in chunks
            ]
            store.add(ids=ids, embeddings=embeds, documents=chunks, metadatas=metas)
            print(f"Ingested: {paper.title[:60]}")
41
+
42
# Script entry point: run one ingestion pass with the default result limit.
if __name__ == "__main__":
    ingest()