dwijverma2
/

doc-enricher

ml-intern

Model card Files Files and versions

xet

Community

dwijverma2 committed 28 days ago

Commit

e5a883e

verified ·

1 Parent(s): d2a9d6e

Add LLM client for Ollama

Browse files

Files changed (1) hide show

doc_enricher/llm_client.py +181 -0

doc_enricher/llm_client.py ADDED Viewed

	@@ -0,0 +1,181 @@

+"""
+LLM Client for paragraph classification via Ollama.
+Uses the /api/chat endpoint with JSON-constrained decoding
+for reliable structured output from Llama3.
+"""
+import json
+import logging
+import requests
+from typing import Optional
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Default Ollama endpoint (base URL only; API paths such as /api/tags and /api/chat are appended by the client)
+DEFAULT_OLLAMA_URL = "http://localhost:11434"
+# System prompt sent as the "system" message of every classification request.
+# It defines the three labels (TITLE, SECTION_HEADING, BODY), the "[index] text..."
+# input format, and the {"classifications": [...]} JSON shape the client validates.
+# This text is runtime behavior: editing it changes what the model is asked to do.
+SYSTEM_PROMPT = """You are a document structure classifier. Your task is to classify each paragraph of a document into exactly one of three categories:
+- **TITLE**: The document title. Usually appears once near the top. Short, descriptive of the entire document's topic. Not a section name.
+- **SECTION_HEADING**: A section or subsection heading. Short, labels a section of content. Typically a phrase or short sentence, not a full paragraph of prose.
+- **BODY**: Regular body text. Sentences, bullet points, paragraphs of actual content. This is the default — if in doubt, classify as BODY.
+Rules:
+1. A document usually has exactly ONE title (the first significant text). If the first paragraph is short and describes the whole document, it's likely TITLE.
+2. SECTION_HEADINGs are short (typically under 10 words) and introduce a topic. They are NOT sentences — they don't end with periods.
+3. Everything else is BODY.
+4. Consider context: a short line between two long paragraphs of prose is likely a SECTION_HEADING. A short line in a list of short lines is likely BODY.
+You will receive paragraphs in the format:
+[index] text...
+You MUST respond with valid JSON in exactly this format:
+{"classifications": [{"index": <int>, "label": "<TITLE|SECTION_HEADING|BODY>"}, ...]}
+Include an entry for EVERY paragraph index provided. Do not skip any."""
+class OllamaClassifier:
+    """Classifies paragraphs using a local Ollama LLM instance."""
+    def __init__(
+        self,
+        model: str = "llama3",
+        ollama_url: str = DEFAULT_OLLAMA_URL,
+        temperature: float = 0.0,
+        num_ctx: int = 8192,
+        timeout: int = 180,
+    ):
+        self.model = model
+        self.ollama_url = ollama_url.rstrip("/")
+        self.temperature = temperature
+        self.num_ctx = num_ctx
+        self.timeout = timeout
+        # Verify Ollama is reachable
+        self._check_connection()
+    def _check_connection(self):
+        """Check that Ollama is running and the model is available."""
+        try:
+            resp = requests.get(f"{self.ollama_url}/api/tags", timeout=5)
+            resp.raise_for_status()
+            models = [m["name"] for m in resp.json().get("models", [])]
+            # Model names may include tag like "llama3:latest"
+            model_found = any(
+                self.model in m for m in models
+            )
+            if not model_found:
+                logger.warning(
+                    f"Model '{self.model}' not found in Ollama. "
+                    f"Available: {models}. Will attempt to use it anyway "
+                    f"(Ollama may auto-pull)."
+                )
+            else:
+                logger.info(f"Ollama connected. Model '{self.model}' available.")
+        except requests.ConnectionError:
+            raise ConnectionError(
+                f"Cannot connect to Ollama at {self.ollama_url}. "
+                f"Is Ollama running? Start with: ollama serve"
+            )
+    def classify_batch(
+        self,
+        paragraphs: list[dict],
+        formatting_hints: bool = True,
+    ) -> dict:
+        """
+        Send a batch of paragraphs to the LLM for classification.
+        Args:
+            paragraphs: List of dicts with keys:
+                - index (int): Paragraph index in the original document
+                - text (str): Paragraph text
+                - style_name (str, optional): Current style name
+                - is_bold (bool, optional): Whether any run is bold
+                - avg_font_size_pt (float, optional): Average font size
+            formatting_hints: Whether to include formatting metadata in prompt
+        Returns:
+            Dict with "classifications" key containing list of
+            {"index": int, "label": str} dicts
+        """
+        # Build the user message
+        lines = []
+        for p in paragraphs:
+            line = f'[{p["index"]}] {p["text"][:300]}'
+            if formatting_hints:
+                hints = []
+                if p.get("style_name"):
+                    hints.append(f'style="{p["style_name"]}"')
+                if p.get("is_bold") is not None:
+                    hints.append(f'bold={p["is_bold"]}')
+                if p.get("avg_font_size_pt") is not None:
+                    hints.append(f'size={p["avg_font_size_pt"]:.1f}pt')
+                if hints:
+                    line += f'  ({", ".join(hints)})'
+            lines.append(line)
+        user_content = (
+            f"Classify these {len(paragraphs)} paragraphs from a document:\n\n"
+            + "\n".join(lines)
+        )
+        logger.debug(f"Sending {len(paragraphs)} paragraphs to LLM ({len(user_content)} chars)")
+        response = requests.post(
+            f"{self.ollama_url}/api/chat",
+            json={
+                "model": self.model,
+                "messages": [
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": user_content},
+                ],
+                "stream": False,
+                "format": "json",
+                "options": {
+                    "temperature": self.temperature,
+                    "num_ctx": self.num_ctx,
+                },
+            },
+            timeout=self.timeout,
+        )
+        response.raise_for_status()
+        raw_text = response.json()["message"]["content"]
+        logger.debug(f"LLM response: {raw_text[:500]}")
+        try:
+            result = json.loads(raw_text)
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse LLM JSON response: {e}\nRaw: {raw_text[:1000]}")
+            raise ValueError(f"LLM returned invalid JSON: {e}")
+        # Validate structure
+        if "classifications" not in result:
+            # Some models return a flat list or different key
+            # Try to recover
+            if isinstance(result, list):
+                result = {"classifications": result}
+            elif isinstance(result, dict) and len(result) == 1:
+                result = {"classifications": list(result.values())[0]}
+            else:
+                raise ValueError(
+                    f"LLM response missing 'classifications' key. Got keys: {list(result.keys())}"
+                )
+        # Validate each classification
+        valid_labels = {"TITLE", "SECTION_HEADING", "BODY"}
+        for item in result["classifications"]:
+            if "label" not in item or "index" not in item:
+                logger.warning(f"Malformed classification item: {item}")
+                continue
+            label = item["label"].upper().strip()
+            if label not in valid_labels:
+                logger.warning(f"Unknown label '{label}' for index {item['index']}, defaulting to BODY")
+                item["label"] = "BODY"
+            else:
+                item["label"] = label
+        return result