| |
import os

from openai import AsyncOpenAI

# Module-level async client, shared by all TaskDecomposer instances.
# Reads OPENAI_API_KEY from the environment; if unset, api_key is None and
# the first API call (not import) will fail with an auth error.
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
class TaskDecomposer:
    """Breaks a multimodal user goal into web-searchable subtasks via an LLM.

    The input *context* dict may carry any combination of 'text',
    'image_summary', and 'video_summary' string fields; the model is asked
    to return a JSON array of subtask objects, each with 'query',
    'language', and 'modality' keys.
    """

    def __init__(self):
        # Stateless; the module-level `client` handles API access.
        pass

    async def decompose(self, context):
        """Ask the LLM to decompose *context* into executable subtasks.

        Args:
            context: dict with optional 'text', 'image_summary', and
                'video_summary' string entries.

        Returns:
            A list of subtask dicts (each with 'query', 'language',
            'modality'), or [] if the model output cannot be parsed.
        """
        base_prompt = self._build_prompt(context)
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a task planner that breaks down multimodal user goals into executable subtasks."},
                {"role": "user", "content": base_prompt},
            ],
        )
        content = response.choices[0].message.content
        return self._parse_subtasks(content)

    def _build_prompt(self, context):
        """Render whichever modality summaries are present into one prompt string."""
        description = []
        if "text" in context:
            description.append(f"Text: {context['text']}")
        if "image_summary" in context:
            description.append(f"Image summary: {context['image_summary']}")
        if "video_summary" in context:
            description.append(f"Video summary: {context['video_summary']}")

        combined = "\n".join(description)
        prompt = f"""
Given the following multimodal input, generate a list of clear, web-searchable subtasks needed to achieve the user's goal. Output the list in JSON array format, with each item as an object containing 'query', 'language', and 'modality'.

{combined}
"""
        return prompt

    def _parse_subtasks(self, llm_output):
        """Parse the model's JSON-array output; return [] on any failure.

        Robustness fixes over a plain json.loads:
          * non-string input (None, etc.) yields [] instead of raising;
          * a surrounding ```json ... ``` markdown fence — which chat
            models frequently emit — is stripped before parsing;
          * non-list JSON (e.g. a bare object) is treated as a failure,
            so decompose()'s "returns a list" contract always holds.
        """
        import json
        import re

        if not isinstance(llm_output, str):
            return []
        cleaned = llm_output.strip()
        # Strip an optional ```json ... ``` (or bare ```) fence.
        fence = re.match(r"^```(?:json)?\s*(.*?)\s*```$", cleaned, re.DOTALL)
        if fence:
            cleaned = fence.group(1)
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError:
            return []
        return parsed if isinstance(parsed, list) else []
|
|