azlaan428 committed on
Commit
1ccdc0b
·
1 Parent(s): c2cbe98

feat: PubMed retrieval + LangChain ReAct agent scaffold

Browse files
Files changed (4) hide show
  1. .gitignore +5 -0
  2. agent/agent.py +82 -0
  3. requirements.txt +10 -0
  4. retrieval/pubmed.py +21 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .env
5
+ *.egg-info/
agent/agent.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
4
+
5
+ from langchain.agents import AgentExecutor, create_react_agent
6
+ from langchain.tools import Tool
7
+ from langchain_core.prompts import PromptTemplate
8
+ from langchain_huggingface import HuggingFacePipeline
9
+ from retrieval.pubmed import fetch_pubmed
10
+
11
+
12
def pubmed_tool_fn(query: str) -> str:
    """Search PubMed for *query* and render the hits as a single string.

    Calls ``fetch_pubmed`` with ``max_results=5``. Each hit is rendered as a
    ``[PMID <id>]`` header line followed by its abstract text, and hits are
    separated by one blank line. When the search returns nothing, a fixed
    "no abstracts" message is returned instead.
    """
    hits = fetch_pubmed(query, max_results=5)
    if hits:
        sections = []
        for hit in hits:
            sections.append(f"[PMID {hit['pmid']}]\n{hit['abstract']}")
        return "\n\n".join(sections)
    return "No abstracts found for this query."
17
+
18
# Tool wrapper that exposes pubmed_tool_fn to the agent under the name
# "PubMedSearch"; the description is the text shown to the model in {tools}.
pubmed_tool = Tool(
    name="PubMedSearch",
    description=(
        "Searches PubMed for biomedical literature. "
        "Input should be a clinical or scientific query string. "
        "Returns abstracts relevant to the query."
    ),
    func=pubmed_tool_fn,
)
27
+
28
# Raw ReAct template text. The placeholders {tools}, {tool_names}, {input}
# and {agent_scratchpad} are filled in at run time; the wording below is part
# of the program's behavior and must not be reformatted.
_REACT_TEMPLATE = """You are a biomedical research assistant. Use the tools available to answer the user's question accurately and concisely.

Tools available:
{tools}

Tool names: {tool_names}

Use EXACTLY this format:

Question: the input question you must answer
Thought: your reasoning about what to do
Action: the tool name to use (must be one of [{tool_names}])
Action Input: the input to the tool
Observation: the result of the tool
Thought: I now know the final answer
Final Answer: your comprehensive answer based on the literature

Begin!

Question: {input}
{agent_scratchpad}"""

# Prompt object handed to create_react_agent in build_agent().
REACT_PROMPT = PromptTemplate.from_template(_REACT_TEMPLATE)
49
+
50
+
51
def load_llm():
    """Build the local text-generation model and wrap it for LangChain.

    Creates a transformers ``text-generation`` pipeline for
    ``TinyLlama/TinyLlama-1.1B-Chat-v1.0`` with ``device_map="auto"`` and a
    512-token generation cap, then returns it wrapped in a
    ``HuggingFacePipeline``.
    """
    # Imported inside the function, so transformers is only loaded when a
    # model is actually requested.
    from transformers import pipeline as hf_pipeline

    pipeline_options = {
        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "device_map": "auto",
        "max_new_tokens": 512,
    }
    text_generator = hf_pipeline("text-generation", **pipeline_options)
    return HuggingFacePipeline(pipeline=text_generator)
60
+
61
+
62
def build_agent():
    """Assemble the ReAct agent and return it wrapped in an AgentExecutor.

    The agent is built from the LLM returned by ``load_llm()``, the single
    ``pubmed_tool`` and ``REACT_PROMPT``. The executor runs verbosely, stops
    after at most 5 iterations, and is configured with
    ``handle_parsing_errors=True``.
    """
    model = load_llm()
    toolbox = [pubmed_tool]
    react_agent = create_react_agent(model, toolbox, REACT_PROMPT)

    executor_options = {
        "verbose": True,
        "max_iterations": 5,
        "handle_parsing_errors": True,
    }
    return AgentExecutor(agent=react_agent, tools=toolbox, **executor_options)
73
+
74
+
75
def _run_demo():
    """Build the agent, run one hard-coded sample question, print the answer."""
    print("Loading model... (cached, should be fast)")
    agent_executor = build_agent()

    question = "What ML methods are used for epilepsy seizure detection?"
    print(f"\nQuery: {question}\n")

    result = agent_executor.invoke({"input": question})
    print("\n=== Final Response ===")
    print(result["output"])


if __name__ == "__main__":
    _run_demo()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ biopython==1.87
2
+ langchain==0.2.16
3
+ langchain-community==0.2.16
4
+ langchain-core==0.2.40
5
+ langchain-text-splitters==0.2.4
6
+ transformers>=4.40.0
7
+ accelerate>=0.30.0
8
+ sentencepiece>=0.1.99
9
+ torch
10
+ numpy==1.26.4
retrieval/pubmed.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Bio import Entrez
2
+
3
+ Entrez.email = "azlaanmohammad66@gmail.com"
4
+
5
def fetch_pubmed(query: str, max_results: int = 5) -> list[dict]:
    """Search PubMed for *query* and return the matching records as text.

    Runs an ``esearch`` for up to *max_results* IDs, then an ``efetch`` of
    those IDs in plain-text "abstract" format.

    Args:
        query: Search term passed to PubMed as-is.
        max_results: Upper bound on the number of IDs requested (``retmax``).

    Returns:
        A list of ``{"pmid": <str>, "abstract": <str>}`` dicts, one per
        fetched record. Empty when the search matches nothing.
    """
    # Function-scope import keeps this block self-contained; the module's
    # import header is left untouched.
    import re

    # Both Entrez calls return open network handles; close them even when
    # parsing fails so connections are not leaked.
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    ids = list(record["IdList"])
    if not ids:
        return []

    handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
    try:
        raw = handle.read()
    finally:
        handle.close()

    chunks = [part.strip() for part in raw.strip().split("\n\n\n") if part.strip()]

    # Pair each record with the PMID printed inside it rather than by its
    # position in the ID list: positional pairing silently mislabels or
    # drops records when the split yields a different number of chunks.
    # NOTE(review): assumes every text record carries a "PMID: <digits>"
    # line, as in NCBI's abstract text format - confirm against live output.
    pmid_line = re.compile(r"^PMID:\s*(\d+)", re.MULTILINE)
    results = []
    pending = []  # fragments seen since the last record-closing PMID line
    for chunk in chunks:
        pending.append(chunk)
        found = pmid_line.findall(chunk)
        if found:
            results.append({"pmid": found[-1], "abstract": "\n\n\n".join(pending)})
            pending = []

    if not results:
        # No PMID lines at all: keep the previous positional pairing so the
        # caller still gets the fetched text.
        return [{"pmid": pmid, "abstract": text} for pmid, text in zip(ids, chunks)]

    if pending:
        # Trailing fragments without a PMID line belong to the last record.
        results[-1]["abstract"] += "\n\n\n" + "\n\n\n".join(pending)
    return results
17
+
18
if __name__ == "__main__":
    # Manual smoke test: run one sample search and print every hit,
    # each followed by a 60-character dashed rule.
    divider = "-" * 60
    for hit in fetch_pubmed("epilepsy seizure detection machine learning"):
        print(f"PMID: {hit['pmid']}\n{hit['abstract']}\n{divider}")