| from helper import extract_html_content |
| from IPython.display import display, HTML |
| from llama_index.utils.workflow import draw_all_possible_flows |
| from llama_index.core.tools import FunctionTool |
| from llama_index.core.agent import FunctionCallingAgent |
| from llama_index.core import Settings |
| from llama_parse import LlamaParse |
| from llama_index.llms.groq import Groq |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding |
| from llama_index.core import ( |
| VectorStoreIndex, |
| StorageContext, |
| load_index_from_storage |
| ) |
| import nest_asyncio |
| from llama_index.core.workflow import ( |
| StartEvent, |
| StopEvent, |
| Workflow, |
| step, |
| Event, |
| Context |
| ) |
| import json |
| from pathlib import Path |
| from dotenv import load_dotenv |
| import os |
| import asyncio |
|
|
|
|
# Directory where the vector index is persisted between runs.
storage_dir = "./storage"

# nest_asyncio patches asyncio so an already-running event loop can be
# re-entered (the script mixes sync helpers with an async workflow).
nest_asyncio.apply()

# Pull credentials from a local .env file into the process environment,
# then read the ones this script needs.
load_dotenv()
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
LLAMA_CLOUD_BASE_URL = os.environ.get("LLAMA_CLOUD_BASE_URL")

# Shared LLM and embedding model used by every index/query engine below.
global_llm = Groq(model="llama3-70b-8192", api_key=GROQ_API_KEY)
global_embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Register the embedding model as the library-wide default as well.
Settings.embed_model = global_embed_model
|
|
# Parse the resume PDF into markdown documents with LlamaParse.
documents = LlamaParse(
    api_key=llama_cloud_api_key,
    result_type="markdown",
    content_guideline_instruction="This is a resume, gather related facts together and format it as "
                                  "bullet points with headers",
).load_data("data/fake_resume.pdf")

# Fail with an explicit message instead of an opaque IndexError on
# documents[0] when the parser produced nothing.
if not documents:
    raise RuntimeError("LlamaParse returned no documents for data/fake_resume.pdf")

print(documents[0].text)

# Embed the parsed documents into an in-memory vector index.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=global_embed_model,
)

# Sanity-check the index with a simple question. `query_engine` is kept at
# module level because query_resume() below reuses it.
query_engine = index.as_query_engine(llm=global_llm, similarity_top_k=5)
response = query_engine.query("What is this person's name and what was their most recent job?")
print(response)

# Persist the index so later runs (and the workflow) can reload it from disk.
index.storage_context.persist(persist_dir=storage_dir)
|
|
# Reload the persisted index from disk to verify the round trip.
restored_index = None

if os.path.exists(storage_dir):
    storage_context = StorageContext.from_defaults(persist_dir=storage_dir)
    restored_index = load_index_from_storage(storage_context)
else:
    print("Index not found on disk.")

# Only query when the reload succeeded; previously a missing storage
# directory left restored_index as None and crashed with AttributeError.
if restored_index is not None:
    print("\n\n Reading back the index \n")
    response = restored_index.as_query_engine(llm=global_llm, similarity_top_k=5)\
        .query("What is this person's name and what was their most recent job?")
    print(response)

print("\n\n" + "="*50, "\n\n")
|
|
|
|
def query_resume(q: str) -> str:
    """Answers questions about a specific resume."""
    # NOTE: keep the docstring above unchanged -- this function is wrapped
    # with FunctionTool.from_defaults below, which presumably uses the
    # docstring as the tool description shown to the LLM.
    #
    # q: the question text; it is embedded into a fixed prompt prefix and
    #    sent to the module-level `query_engine`.
    # Returns the `.response` attribute of the query result.
    prompt = f"This is a question about the specific resume we have in our database: {q}"
    answer = query_engine.query(prompt)
    return answer.response
|
|
|
|
# Wrap the resume lookup function as a tool the agent can call.
resume_tool = FunctionTool.from_defaults(fn=query_resume)

# Build a function-calling agent that has the resume tool available.
agent = FunctionCallingAgent.from_tools(
    llm=global_llm,
    tools=[resume_tool],
    verbose=True,
)

# Ask a question that the agent answers through the resume tool.
response = agent.chat("How many years of experience does the applicant have?")
print(response)

# Visual separator before the workflow section's output.
print("\n\n" + "=" * 50, "\n\n")
|
|
|
|
class ParseFormEvent(Event):
    """Event emitted by RAGWorkflow.set_up to trigger parsing of the form."""

    # Path of the application form file, as supplied to the workflow run.
    application_form: str
|
|
|
|
class QueryEvent(Event):
    """Event carrying one question to ask about the resume.

    One event is emitted per form field by RAGWorkflow.parse_form.
    """

    # Question text sent to the query engine.
    query: str
    # Name of the form field the question is about. It was previously
    # passed as an undeclared dynamic attribute; declaring it makes the
    # contract explicit, since ask_question always reads `ev.field`.
    field: str
|
|
|
|
class ResponseEvent(Event):
    """Event carrying the answer to one form-field question."""

    # Answer text produced by the query engine.
    response: str
    # Name of the form field the answer belongs to. It was previously
    # passed as an undeclared dynamic attribute; fill_in_application
    # always reads `r.field`, so it is declared explicitly.
    field: str
|
|
|
|
| |
class RAGWorkflow(Workflow):
    """Fill in a job application form using facts retrieved from a resume.

    Steps:
      1. set_up              -- load (or build and persist) the resume index.
      2. parse_form          -- extract the form's fields, emit one QueryEvent each.
      3. ask_question        -- answer each field's question from the resume.
      4. fill_in_application -- once all answers are in, combine them with the LLM.

    Run with ``resume_file=...`` and ``application_form=...`` keyword arguments.
    """

    storage_dir = "./storage"
    llm: Groq
    # NOTE(review): annotated as VectorStoreIndex, but set_up assigns the
    # result of index.as_query_engine(...), i.e. a query engine, not an index.
    query_engine: VectorStoreIndex

    @staticmethod
    def _parse_fields(raw_text: str) -> list:
        """Extract the ``fields`` list from the LLM's JSON reply.

        Tolerates markdown fences or prose around the JSON object by parsing
        only the span from the first ``{`` to the last ``}``.

        Raises:
            ValueError: if no JSON object with a ``fields`` list is found.
        """
        start = raw_text.find("{")
        end = raw_text.rfind("}")
        if start == -1 or end < start:
            raise ValueError(f"No JSON object found in LLM output: {raw_text!r}")
        try:
            fields = json.loads(raw_text[start:end + 1])["fields"]
        except (json.JSONDecodeError, KeyError, TypeError) as err:
            raise ValueError(
                f"LLM output is not JSON of the form {{\"fields\": [...]}}: {raw_text!r}"
            ) from err
        if not isinstance(fields, list):
            raise ValueError(f"Expected 'fields' to be a list, got {type(fields).__name__}")
        return fields

    @step
    async def set_up(self, ctx: Context, ev: StartEvent) -> ParseFormEvent:
        """Validate inputs and prepare the resume query engine.

        Loads the index from ``self.storage_dir`` when it exists; otherwise
        parses ``ev.resume_file``, builds the index and persists it.

        Raises:
            ValueError: if ``resume_file`` or ``application_form`` is missing
                or empty.
        """
        self.llm = global_llm
        self.storage_dir = storage_dir

        # getattr with a default: a kwarg that was never passed would
        # otherwise raise AttributeError before the intended ValueError.
        resume_file = getattr(ev, "resume_file", None)
        application_form = getattr(ev, "application_form", None)
        if not resume_file:
            raise ValueError("No resume file provided")
        if not application_form:
            raise ValueError("No application form provided")

        if os.path.exists(self.storage_dir):
            # Reuse the index persisted by an earlier run.
            storage_context = StorageContext.from_defaults(persist_dir=self.storage_dir)
            index = load_index_from_storage(storage_context)
        else:
            # No persisted index yet: parse the resume and build one.
            documents = LlamaParse(
                result_type="markdown",
                content_guideline_instruction="This is a resume, gather related facts together and format it as "
                                              "bullet points with headers",
            ).load_data(resume_file)
            index = VectorStoreIndex.from_documents(
                documents,
                embed_model=global_embed_model,
            )
            index.storage_context.persist(persist_dir=self.storage_dir)

        self.query_engine = index.as_query_engine(llm=self.llm, similarity_top_k=5)

        return ParseFormEvent(application_form=application_form)

    @step
    async def parse_form(self, ctx: Context, ev: ParseFormEvent) -> QueryEvent:
        """Parse the application form and fan out one QueryEvent per field.

        Events are emitted with ``ctx.send_event``; the step itself returns
        None. The number of fields is stored in the context under
        ``"total_fields"`` for fill_in_application.

        Raises:
            ValueError: if the form cannot be parsed or yields no fields.
        """
        parser = LlamaParse(
            result_type="markdown",
            content_guideline_instruction="This is a job application form. Create a list of all the fields that "
                                          "need to be filled in.",
            formatting_instruction="Return a bulleted list of the fields ONLY.",
        )

        parsed_docs = parser.load_data(ev.application_form)
        if not parsed_docs:
            raise ValueError(f"Could not parse application form: {ev.application_form}")
        result = parsed_docs[0]

        raw_json = self.llm.complete(
            f"""
            This is a parsed form.
            Convert it into a JSON object containing only the list
            of fields to be filled in, in the form {{ fields: [...] }}.
            <form>{result.text}</form>.
            Return JSON ONLY, no markdown.
            """)
        fields = self._parse_fields(raw_json.text)

        # With zero fields no ResponseEvent would ever arrive and the
        # workflow would hang until its timeout; fail explicitly instead.
        if not fields:
            raise ValueError("No fields were extracted from the application form")

        # Record the expected count BEFORE emitting events so that
        # fill_in_application can never observe a missing value.
        await ctx.set("total_fields", len(fields))

        for field in fields:
            ctx.send_event(QueryEvent(
                field=field,
                query=f"How would you answer this question about the candidate? {field}"
            ))

        return None

    @step
    async def ask_question(self, ctx: Context, ev: QueryEvent) -> ResponseEvent:
        """Answer one field's question by querying the resume index."""
        response = self.query_engine.query(
            f"This is a question about the specific resume we have in our database: {ev.query}")
        return ResponseEvent(field=ev.field, response=response.response)

    @step
    async def fill_in_application(self, ctx: Context, ev: ResponseEvent) -> StopEvent:
        """Combine all field answers into the final filled-in form.

        Returns None until ``total_fields`` ResponseEvents have been
        collected; then asks the LLM to merge them and stops the workflow
        with the LLM's result.
        """
        total_fields = await ctx.get("total_fields")

        # collect_events returns None until every expected event has arrived.
        responses = ctx.collect_events(ev, [ResponseEvent] * total_fields)
        if responses is None:
            return None

        response_list = "\n".join(
            f"Field: {r.field}\nResponse: {r.response}" for r in responses
        )

        result = self.llm.complete(f"""
            You are given a list of fields in an application form and responses to
            questions about those fields from a resume. Combine the two into a list of
            fields and succinct, factual answers to fill in those fields.

            <responses>
            {response_list}
            </responses>
        """)
        return StopEvent(result=result)
|
|
|
|
async def main():
    """Run the form-filling workflow, print its result and render its graph.

    The workflow diagram is written to ``workflows/form_parsing_workflow.html``
    next to this file and then displayed via IPython.
    """
    w = RAGWorkflow(timeout=120, verbose=False)
    result = await w.run(
        resume_file="data/fake_resume.pdf",
        application_form="data/fake_application_form.pdf"
    )
    print(result)

    workflow_file = Path(__file__).parent / "workflows" / "form_parsing_workflow.html"
    # Create the output directory first; writing the HTML fails when
    # "workflows/" does not exist yet.
    workflow_file.parent.mkdir(parents=True, exist_ok=True)
    draw_all_possible_flows(w, filename=str(workflow_file))
    html_content = extract_html_content(str(workflow_file))
    display(HTML(html_content), metadata=dict(isolated=True))


if __name__ == "__main__":
    asyncio.run(main())
|
|