consolidate: Evo arXiv ingestion pipeline
Browse files- evo/arxiv_ingestor.py +43 -0
evo/arxiv_ingestor.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent Q3 [Evo] — arXiv Ingestor
|
| 3 |
+
Fetches papers → chunks → embeds (nomic-embed-text-v1 by default, 768-dim) → stores in ChromaDB.
|
| 4 |
+
"""
|
| 5 |
+
import arxiv, os
|
| 6 |
+
from chromadb_store import ChromaStore
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
|
| 9 |
+
# Model name handed to SentenceTransformer in ingest(); the EMBED_MODEL
# environment variable overrides the default.
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-ai/nomic-embed-text-v1")
# arXiv search strings; ingest() runs one search per entry and records the
# originating string in each stored chunk's "query" metadata field.
SEARCH_QUERIES = [
    "prediction markets mechanism design",
    "multi-agent LLM orchestration LangGraph",
    "LoRA fine-tuning reinforcement learning",
    "smart contract Solidity security",
    "decentralized finance automated market maker",
]
|
| 17 |
+
|
| 18 |
+
def chunk_text(text: str, size: int = 512, overlap: int = 64) -> list[str]:
    """Split *text* into overlapping windows of whitespace-separated words.

    Args:
        text: Raw text. It is tokenised with ``str.split()``, so any run of
            whitespace (including newlines) counts as one separator.
        size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < size``.

    Returns:
        The chunks in document order, each re-joined with single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``size`` is not positive, or ``overlap`` is negative
            or not smaller than ``size``.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    # overlap >= size would give a non-positive step (the window never
    # advances); a negative overlap would silently skip words between chunks.
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    words = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + size]))
        # Once a window reaches the last word, any further window would be a
        # strict suffix of this one and only add duplicate content.
        if start + size >= len(words):
            break
    return chunks
|
| 25 |
+
|
| 26 |
+
def ingest(max_results: int = 20):
    """Fetch arXiv papers for every search query and index them.

    For each entry in ``SEARCH_QUERIES`` an arXiv search sorted by submission
    date is run. Each paper's title and summary are joined, split with
    ``chunk_text``, embedded with the ``EMBED_MODEL`` sentence-transformer
    (``normalize_embeddings=True``), and added to the ``arxiv_papers``
    collection with ids of the form ``<short_id>_<chunk index>``.

    Args:
        max_results: Maximum number of papers requested per query.
    """
    # NOTE(review): trust_remote_code=True runs Python code shipped in the
    # model repository; only point EMBED_MODEL at sources you trust.
    embedder = SentenceTransformer(EMBED_MODEL, trust_remote_code=True)
    store = ChromaStore(collection="arxiv_papers")
    client = arxiv.Client()

    # A paper can match several queries. Chunk ids derive only from the paper
    # id, so a second pass would re-embed it and re-add identical ids; keep
    # the first hit and skip repeats within this run.
    seen_ids = set()

    for query in SEARCH_QUERIES:
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        for paper in client.results(search):
            short_id = paper.get_short_id()
            if short_id in seen_ids:
                continue
            seen_ids.add(short_id)

            chunks = chunk_text(f"{paper.title}\n{paper.summary}")
            if not chunks:
                # Nothing to embed or store for a paper with no text.
                continue

            embeds = embedder.encode(chunks, normalize_embeddings=True).tolist()
            ids = [f"{short_id}_{i}" for i in range(len(chunks))]
            # Build one dict per chunk so entries are not aliases of a single
            # shared object.
            metas = [
                {"title": paper.title, "arxiv_id": short_id, "query": query}
                for _ in chunks
            ]
            store.add(ids=ids, embeddings=embeds, documents=chunks, metadatas=metas)
            print(f"Ingested: {paper.title[:60]}")
|
| 41 |
+
|
| 42 |
+
# Script entry point: run one ingestion pass with the default max_results.
if __name__ == "__main__":
    ingest()
|