| """ |
| LLM-based citation relevance evaluator. |
| Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama backends. |
| """ |
| import json |
| import re |
| from dataclasses import dataclass |
| from typing import Optional, Dict, Any |
| from enum import Enum |
| import os |
|
|
| import requests |
|
|
|
|
class LLMBackend(Enum):
    """Identifiers for the supported LLM providers / serving backends.

    The lowercase value is the user-facing backend name. The member name is
    also used to derive the default API-key environment variable (e.g.
    ``OPENAI_API_KEY``) via ``backend.name`` in ``LLMEvaluator.__init__``.
    """
    OPENAI = "openai"        # OpenAI chat-completions API
    ANTHROPIC = "anthropic"  # Anthropic messages API
    GEMINI = "gemini"        # Google Gemini generateContent API
    VLLM = "vllm"            # local vLLM server (OpenAI-compatible endpoint)
    OLLAMA = "ollama"        # local Ollama server (/api/generate)
    DEEPSEEK = "deepseek"    # DeepSeek (OpenAI-compatible endpoint)
|
|
|
|
@dataclass
class EvaluationResult:
    """Outcome of a single LLM citation-relevance judgement.

    A ``relevance_score`` of 0 (outside the 1-5 rubric) together with a
    non-None ``error`` marks a failed or skipped evaluation.
    """
    entry_key: str
    relevance_score: int
    is_relevant: bool
    explanation: str
    context_used: str
    abstract_used: str
    line_number: Optional[int] = None
    file_path: Optional[str] = None
    error: Optional[str] = None

    @property
    def score_label(self) -> str:
        """Human-readable label for ``relevance_score``; "Unknown" if out of range."""
        names = (
            "Not Relevant",
            "Marginally Relevant",
            "Somewhat Relevant",
            "Relevant",
            "Highly Relevant",
        )
        # Build a {1: label, ..., 5: label} lookup; any other score maps to "Unknown".
        by_score = dict(enumerate(names, start=1))
        return by_score.get(self.relevance_score, "Unknown")
|
|
|
|
class LLMEvaluator:
    """Evaluates citation relevance using an LLM backend.

    Supports OpenAI, Anthropic, DeepSeek, Gemini, vLLM, and Ollama. The
    evaluator formats ``PROMPT_TEMPLATE`` with the citation context and the
    cited paper's abstract, asks the model for a flat JSON object
    (``relevance_score``, ``is_relevant``, ``explanation``), and parses the
    reply into an :class:`EvaluationResult`. All transport/parsing failures
    are captured in ``EvaluationResult.error`` rather than raised.
    """

    PROMPT_TEMPLATE = """You are an expert academic reviewer. Given a citation context from a LaTeX document and the cited paper's abstract, evaluate whether this citation is appropriate and relevant.

## Citation Context (from the manuscript):
{context}

## Cited Paper's Abstract:
{abstract}

## Task:
Evaluate the relevance and appropriateness of this citation. Consider:
1. Does the citation support the claim being made in the context?
2. Is the cited paper's topic related to the discussion?
3. Is this citation necessary, or could it be replaced with a more relevant one?

## Response Format:
Provide your response in the following JSON format:
{{
    "relevance_score": <1-5 integer>,
    "is_relevant": <true/false>,
    "explanation": "<brief explanation in 1-2 sentences>"
}}

Score guide:
- 1: Not relevant at all
- 2: Marginally relevant
- 3: Somewhat relevant
- 4: Relevant and appropriate
- 5: Highly relevant and essential

STRICTLY FOLLOW THE JSON FORMAT. Respond ONLY with the JSON object, no other text."""

    def __init__(
        self,
        backend: LLMBackend = LLMBackend.GEMINI,
        endpoint: Optional[str] = None,
        model: Optional[str] = None,
        api_key: Optional[str] = None
    ):
        """Configure the evaluator for one backend.

        Args:
            backend: Which API dialect to speak.
            endpoint: Override for the backend's default URL.
            model: Override for the backend's default model name.
            api_key: API key; falls back to the ``<BACKEND>_API_KEY``
                environment variable (e.g. ``OPENAI_API_KEY``).
        """
        self.backend = backend
        # Derive the env-var name from the enum member name, e.g. GEMINI_API_KEY.
        self.api_key = api_key or os.environ.get(f"{backend.name}_API_KEY")

        # (endpoint, model) defaults per backend; overridden by explicit args.
        defaults = {
            LLMBackend.OPENAI: ("https://api.openai.com/v1/chat/completions", "gpt-5-mini"),
            LLMBackend.ANTHROPIC: ("https://api.anthropic.com/v1/messages", "claude-4.5-haiku"),
            LLMBackend.DEEPSEEK: ("https://api.deepseek.com/chat/completions", "deepseek-chat"),
            LLMBackend.OLLAMA: ("http://localhost:11434/api/generate", "Qwen/qwen3-4B-Instruct-2507"),
            LLMBackend.VLLM: ("http://localhost:8000/v1/chat/completions", "Qwen/qwen3-4B-Instruct-2507"),
            LLMBackend.GEMINI: ("https://generativelanguage.googleapis.com/v1beta/models", "gemini-2.5-flash-lite"),
        }
        default_endpoint, default_model = defaults[backend]
        self.endpoint = endpoint or default_endpoint
        self.model = model or default_model

    def evaluate(self, entry_key: str, context: str, abstract: str) -> EvaluationResult:
        """Evaluate one citation and return an :class:`EvaluationResult`.

        Never raises: missing inputs, transport errors, and unparseable
        replies all produce a result with ``relevance_score=0`` and a
        populated ``error`` field.
        """
        if not context or not abstract:
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation="Missing context or abstract",
                context_used=context,
                abstract_used=abstract,
                error="Missing context or abstract for evaluation"
            )

        prompt = self.PROMPT_TEMPLATE.format(context=context, abstract=abstract)

        try:
            if self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
                response = self._call_openai_compatible(prompt)
            elif self.backend == LLMBackend.ANTHROPIC:
                response = self._call_anthropic(prompt)
            elif self.backend == LLMBackend.OLLAMA:
                response = self._call_ollama(prompt)
            elif self.backend == LLMBackend.GEMINI:
                response = self._call_gemini(prompt)
            else:
                raise ValueError(f"Unknown backend: {self.backend}")

            return self._parse_response(entry_key, response, context, abstract)

        except Exception as e:
            # Degrade to an error result so batch evaluation can continue.
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation="",
                context_used=context,
                abstract_used=abstract,
                error=str(e)
            )

    def _bearer_headers(self) -> Dict[str, str]:
        """Headers for OpenAI-compatible endpoints.

        The Authorization header is only sent when a key is configured —
        local vLLM servers typically run without auth, and the previous
        behavior of sending ``Bearer None`` is never what a server expects.
        """
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    def _call_openai_compatible(self, prompt: str) -> str:
        """Call an OpenAI-compatible chat API (OpenAI, DeepSeek, vLLM).

        Returns the assistant message content, or "" if no choices came back.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.1,
            "max_tokens": 2000,
        }
        # BUG FIX: previously "response_format": None was included in the
        # payload for DeepSeek/vLLM; OpenAI-compatible servers may reject an
        # explicit null. Only OpenAI gets the JSON-mode flag; others omit it.
        if self.backend == LLMBackend.OPENAI:
            payload["response_format"] = {"type": "json_object"}

        response = requests.post(
            self.endpoint,
            json=payload,
            headers=self._bearer_headers(),
            timeout=60
        )
        response.raise_for_status()

        data = response.json()
        choices = data.get("choices", [])
        if choices:
            return choices[0].get("message", {}).get("content", "")
        return ""

    def _call_anthropic(self, prompt: str) -> str:
        """Call the Anthropic messages API; returns the first text block or ""."""
        headers = {
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json"
        }

        payload = {
            "model": self.model,
            "max_tokens": 2000,
            "temperature": 0.1,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }

        response = requests.post(
            self.endpoint,
            json=payload,
            headers=headers,
            timeout=60
        )
        response.raise_for_status()

        data = response.json()
        content = data.get("content", [])
        if content and content[0].get("type") == "text":
            return content[0].get("text", "")
        return ""

    def _call_ollama(self, prompt: str) -> str:
        """Call a local Ollama server (non-streaming, JSON output mode)."""
        payload = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": 0.1,
                "num_predict": 2000
            },
            # Ask Ollama to constrain the model output to valid JSON.
            "format": "json"
        }

        response = requests.post(
            self.endpoint,
            json=payload,
            timeout=60
        )
        response.raise_for_status()

        return response.json().get("response", "")

    def _call_gemini(self, prompt: str) -> str:
        """Call the Gemini generateContent API; returns the first text part or ""."""
        url = f"{self.endpoint}/{self.model}:generateContent"
        if self.api_key:
            # Gemini authenticates via a ?key= query parameter, not a header.
            url += f"?key={self.api_key}"

        payload = {
            "contents": [
                {
                    "parts": [
                        {"text": prompt}
                    ]
                }
            ],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 2000,
                "responseMimeType": "application/json"
            }
        }

        response = requests.post(
            url,
            json=payload,
            timeout=60
        )
        response.raise_for_status()

        candidates = response.json().get("candidates", [])
        if candidates:
            content = candidates[0].get("content", {})
            parts = content.get("parts", [])
            if parts:
                return parts[0].get("text", "")
        return ""

    def _parse_response(self, entry_key: str, response: str, context: str, abstract: str) -> EvaluationResult:
        """Parse the model's reply into an :class:`EvaluationResult`.

        Tries to parse the whole (stripped) response as JSON first — this
        correctly handles nested braces inside the explanation — and only
        falls back to extracting the first flat ``{...}`` object when the
        model wrapped the JSON in extra text. (The old regex-first order
        could grab a partial object out of an otherwise valid reply.)
        """
        data: Any = None
        try:
            data = json.loads(response.strip())
        except json.JSONDecodeError:
            json_match = re.search(r'\{[^{}]*\}', response, re.DOTALL)
            if json_match:
                try:
                    data = json.loads(json_match.group())
                except json.JSONDecodeError:
                    pass

        # Reject non-dict JSON (e.g. a bare list/string) as well as no JSON:
        # the old code would crash on list.get and surface an AttributeError.
        if not isinstance(data, dict) or not data:
            return EvaluationResult(
                entry_key=entry_key,
                relevance_score=0,
                is_relevant=False,
                explanation=response,
                context_used=context,
                abstract_used=abstract,
                error="Failed to parse LLM response as JSON"
            )

        # Coerce the score to int; tolerate strings, floats, and None.
        try:
            relevance_score = int(data.get("relevance_score", 0))
        except (TypeError, ValueError):
            relevance_score = 0

        is_relevant = data.get("is_relevant", False)
        if isinstance(is_relevant, str):
            # Some models answer with "true"/"yes" strings instead of booleans.
            is_relevant = is_relevant.lower() in ("true", "yes", "1")

        explanation = data.get("explanation", "")

        return EvaluationResult(
            entry_key=entry_key,
            relevance_score=relevance_score,
            is_relevant=is_relevant,
            explanation=explanation,
            context_used=context,
            abstract_used=abstract
        )

    def test_connection(self) -> bool:
        """Return True if the configured backend answers a minimal request.

        Never raises; any exception (timeout, DNS, auth) yields False.
        """
        try:
            if self.backend == LLMBackend.OLLAMA:
                # /api/tags is a cheap unauthenticated liveness endpoint.
                response = requests.get(
                    self.endpoint.replace("/api/generate", "/api/tags"),
                    timeout=5
                )
                return response.status_code == 200
            elif self.backend in (LLMBackend.OPENAI, LLMBackend.DEEPSEEK, LLMBackend.VLLM):
                if "chat/completions" in self.endpoint:
                    # Cheapest possible real completion: one token.
                    payload = {
                        "model": self.model,
                        "messages": [{"role": "user", "content": "hi"}],
                        "max_tokens": 1
                    }
                    response = requests.post(
                        self.endpoint, json=payload,
                        headers=self._bearer_headers(), timeout=10
                    )
                    return response.status_code == 200
                else:
                    return False
            elif self.backend == LLMBackend.ANTHROPIC:
                headers = {
                    "x-api-key": self.api_key,
                    "anthropic-version": "2023-06-01",
                    "content-type": "application/json"
                }
                payload = {
                    "model": self.model,
                    "max_tokens": 1,
                    "messages": [{"role": "user", "content": "hi"}]
                }
                response = requests.post(self.endpoint, json=payload, headers=headers, timeout=10)
                return response.status_code == 200
            elif self.backend == LLMBackend.GEMINI:
                if not self.api_key:
                    return False
                url = f"{self.endpoint}/{self.model}:generateContent?key={self.api_key}"
                payload = {
                    "contents": [{"parts": [{"text": "test"}]}],
                    "generationConfig": {"maxOutputTokens": 10}
                }
                response = requests.post(url, json=payload, timeout=10)
                return response.status_code == 200
        except Exception:
            return False
        return False
|
|