Upload alpha_factory/infra/llm_client.py with huggingface_hub
alpha_factory/infra/llm_client.py (CHANGED, +114 -31)
```diff
@@ -1,6 +1,7 @@
 """
-LLM Client — unified interface
-
+LLM Client — unified interface supporting both Ollama (local) and HuggingFace (cloud).
+Auto-switches between providers based on ModelManager selection.
+All outputs are schema-constrained via guided JSON generation.
 """
 import asyncio
 import json
```
```diff
@@ -14,52 +15,120 @@ T = TypeVar("T", bound=BaseModel)
 
 class LLMClient:
     """
-    Async LLM client
-
+    Async LLM client supporting:
+    - Ollama (local, http://localhost:11434/v1)
+    - HuggingFace Inference API (cloud, https://router.huggingface.co/v1)
+    - vLLM (local/remote, any OpenAI-compatible endpoint)
+
+    All outputs are JSON-schema-constrained for reliability.
     """
 
-    def __init__(self, config: LLMConfig):
+    def __init__(self, config: LLMConfig, model_manager=None):
         self.config = config
-        self._client = AsyncOpenAI(
-            base_url=config.base_url,
-            api_key=config.api_key,
-        )
+        self.model_manager = model_manager
+        self._clients: dict[str, AsyncOpenAI] = {}
         self._token_count = 0
 
+    def _get_client(self, base_url: str, api_key: str = "dummy", **headers) -> AsyncOpenAI:
+        """Get or create an AsyncOpenAI client for the given endpoint."""
+        key = f"{base_url}|{api_key}"
+        if key not in self._clients:
+            self._clients[key] = AsyncOpenAI(
+                base_url=base_url,
+                api_key=api_key,
+                default_headers=headers if headers else None,
+            )
+        return self._clients[key]
+
+    def _resolve_model(self, tier: str = "mediumfish", model_override: str | None = None) -> tuple[AsyncOpenAI, str]:
+        """
+        Resolve which client + model to use for a given tier.
+        Priority: model_override > ModelManager selection > config default
+        """
+        if model_override:
+            # Direct model name — use default endpoint
+            client = self._get_client(self.config.base_url, self.config.api_key)
+            return client, model_override
+
+        if self.model_manager:
+            base_url, model_name, headers = self.model_manager.get_endpoint(tier)
+            api_key = headers.get("Authorization", "").replace("Bearer ", "") or "dummy"
+            client = self._get_client(base_url, api_key)
+            return client, model_name
+
+        # Fallback: use config defaults
+        tier_to_model = {
+            "microfish": self.config.microfish_model,
+            "tinyfish": self.config.tinyfish_model,
+            "mediumfish": self.config.mediumfish_model,
+            "bigfish": self.config.bigfish_model,
+        }
+        model = tier_to_model.get(tier, self.config.mediumfish_model)
+        client = self._get_client(self.config.base_url, self.config.api_key)
+        return client, model
+
     async def generate_json(
         self,
         prompt: str,
         schema: type[T],
+        tier: str = "mediumfish",
         model: str | None = None,
         temperature: float | None = None,
         system_prompt: str = "You are a quantitative finance expert.",
     ) -> T:
         """
         Generate a structured JSON response conforming to the given Pydantic schema.
-        Uses guided decoding via response_format
+        Uses guided decoding via response_format.
+
+        Args:
+            prompt: The user prompt
+            schema: Pydantic model class for output validation
+            tier: Model tier (microfish/tinyfish/mediumfish/bigfish)
+            model: Override model name (optional)
+            temperature: Override temperature (optional)
+            system_prompt: System message
         """
-
+        client, model_name = self._resolve_model(tier, model)
         temp = temperature or self.config.temperature_generation
-
-        # Build JSON schema for guided generation
         json_schema = schema.model_json_schema()
 
-        response = await self._client.chat.completions.create(
-            model=…,
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": prompt},
-            ],
-            temperature=temp,
-            max_tokens=self.config.max_tokens,
-            response_format={
-                "type": "json_schema",
-                "json_schema": {
-                    "name": schema.__name__,
-                    "schema": json_schema,
+        try:
+            response = await client.chat.completions.create(
+                model=model_name,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=temp,
+                max_tokens=self.config.max_tokens,
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": {
+                        "name": schema.__name__,
+                        "schema": json_schema,
+                    },
                 },
-            },
-        )
+            )
+        except Exception:
+            # Fallback: some providers don't support json_schema format
+            # Try with json_object format + schema instruction in prompt
+            schema_str = json.dumps(json_schema, indent=2)
+            augmented_prompt = (
+                f"{prompt}\n\n"
+                f"IMPORTANT: Output ONLY valid JSON matching this schema:\n"
+                f"```json\n{schema_str}\n```\n"
+                f"No other text. Just the JSON."
+            )
+            response = await client.chat.completions.create(
+                model=model_name,
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": augmented_prompt},
+                ],
+                temperature=temp,
+                max_tokens=self.config.max_tokens,
+                response_format={"type": "json_object"},
+            )
 
         content = response.choices[0].message.content
         self._token_count += response.usage.total_tokens if response.usage else 0
```
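For review context, a minimal usage sketch of the new schema-constrained path. The `LLMConfig` import path, its constructor arguments, and the `SignalIdea` schema below are hypothetical; only the attributes the diff actually reads (`base_url`, `api_key`, the `*fish_model` tier defaults, `temperature_generation`, `max_tokens`) are assumed to exist on the config:

```python
import asyncio
from pydantic import BaseModel

from alpha_factory.infra.llm_client import LLMClient
# Hypothetical: adjust to wherever LLMConfig lives in this repo.
from alpha_factory.infra.config import LLMConfig


class SignalIdea(BaseModel):
    """Hypothetical schema; generate_json validates the response against it."""
    name: str
    expression: str
    rationale: str


async def main() -> None:
    # No ModelManager passed, so _resolve_model falls back to the
    # tier_to_model map built from the config defaults.
    config = LLMConfig(base_url="http://localhost:11434/v1", api_key="dummy")
    llm = LLMClient(config)

    idea = await llm.generate_json(
        prompt="Propose one cross-sectional momentum signal.",
        schema=SignalIdea,
        tier="tinyfish",
    )
    print(idea.name, llm.tokens_used)


asyncio.run(main())
```

Resolution priority per `_resolve_model`: an explicit `model=` argument wins, then the ModelManager's endpoint for the tier, then the config tier map. If the endpoint rejects the `json_schema` response format, the `except` branch retries with `json_object` plus the schema inlined in the prompt.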
```diff
@@ -71,17 +140,18 @@ class LLMClient:
     async def generate_text(
         self,
         prompt: str,
+        tier: str = "mediumfish",
         model: str | None = None,
         temperature: float | None = None,
         system_prompt: str = "You are a quantitative finance expert.",
         max_tokens: int = 2048,
     ) -> str:
         """Generate free-text response (for memos/reports only, never for expressions)."""
-
+        client, model_name = self._resolve_model(tier, model)
         temp = temperature or self.config.temperature_critique
 
-        response = await self._client.chat.completions.create(
-            model=…,
+        response = await client.chat.completions.create(
+            model=model_name,
             messages=[
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": prompt},
```
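The free-text path goes through the same tier resolution. A short sketch, continuing inside the `main()` coroutine of the previous example:

```python
# Memos/reports only; temperature defaults to config.temperature_critique
# when no override is given.
memo = await llm.generate_text(
    prompt="Summarize the run in three bullet points.",
    tier="bigfish",
    max_tokens=512,
)
print(memo)
```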
```diff
@@ -94,6 +164,19 @@ class LLMClient:
         self._token_count += response.usage.total_tokens if response.usage else 0
         return content
 
+    async def health_check(self, tier: str = "mediumfish") -> bool:
+        """Check if the model endpoint is reachable."""
+        try:
+            client, model_name = self._resolve_model(tier)
+            response = await client.chat.completions.create(
+                model=model_name,
+                messages=[{"role": "user", "content": "Say 'ok'"}],
+                max_tokens=5,
+            )
+            return True
+        except Exception:
+            return False
+
     @property
     def tokens_used(self) -> int:
         return self._token_count
```
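`health_check` probes an endpoint with a tiny completion and reports reachability as a bool instead of raising. A sketch of guarding a pipeline with it, using the same hypothetical `llm` as above:

```python
# health_check swallows all exceptions, so this is safe to call
# against an endpoint that may be down.
if not await llm.health_check(tier="mediumfish"):
    raise SystemExit("LLM endpoint unreachable; check Ollama or the HF token")
```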