Update main.py
Browse files
main.py
CHANGED
|
@@ -1,1028 +1,1028 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import time
|
| 3 |
-
import requests
|
| 4 |
-
import random
|
| 5 |
-
import re
|
| 6 |
-
from difflib import SequenceMatcher
|
| 7 |
-
from typing import List, Optional, Dict, Any
|
| 8 |
-
from urllib.parse import quote_plus
|
| 9 |
-
|
| 10 |
-
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 11 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
-
import uvicorn
|
| 13 |
-
from pydantic import BaseModel
|
| 14 |
-
from PyPDF2 import PdfReader
|
| 15 |
-
|
| 16 |
-
from langchain_groq import ChatGroq
|
| 17 |
-
from langchain_core.prompts import ChatPromptTemplate
|
| 18 |
-
|
| 19 |
-
# ==========================================
# 1. Environment & API Setup
# ==========================================
# BUG FIX: the three os.getenv(...) calls below were missing their closing
# parentheses, and the ChatGroq(...) call was truncated — syntax errors.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1"
# Client-side throttle: minimum gap (seconds) between Semantic Scholar calls.
SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS = 1.2
SEMANTIC_SCHOLAR_MAX_RETRIES = 4

if not GROQ_API_KEY or not SERPER_API_KEY:
    print("WARNING: GROQ_API_KEY or SERPER_API_KEY is missing!")

# NOTE(review): the original model name was truncated in this paste; the model
# is made configurable via GROQ_MODEL with a Groq-hosted default —
# confirm the intended model name.
llm = ChatGroq(model=os.getenv("GROQ_MODEL", "llama-3.1-8b-instant"))

# Basic in-memory caches to maintain API efficiency (as promised in the application).
query_cache = {}  # search query -> combined web+academic results
semantic_query_cache: Dict[str, List[Dict[str, str]]] = {}  # normalized query -> academic results
_last_semantic_scholar_call_ts = 0.0  # timestamp of the most recent S2 call (for throttling)
|
| 38 |
-
|
| 39 |
-
# ==========================================
|
| 40 |
-
# 2. Pydantic Models
|
| 41 |
-
# ==========================================
|
| 42 |
-
class MatchReport(BaseModel):
    """Per-chunk comparison result against the best-matching external source."""
    chunk_text: str  # the document chunk that was checked
    is_plagiarized: bool
    plagiarism_type: Optional[str] = None  # e.g. "Exact/Heavy Match" or "Paraphrased Match (Mosaic)"
    source_url: Optional[str] = None  # URL of the best-matching source, if any
    source_type: Optional[str] = None  # "Academic" or "Web"
    similarity_score: float  # 0.0-1.0 similarity against the best source
|
| 49 |
-
|
| 50 |
-
class PlagiarismReport(BaseModel):
    """Summary-level scan result for one uploaded document."""
    filename: str
    total_words: int
    plagiarized_words: int  # word count across chunks flagged as plagiarized
    overall_plagiarism_score: float  # percentage of the document flagged
    severity_level: str  # Low, Medium, High, Very High
    details: List[MatchReport]  # per-chunk match reports
|
| 57 |
-
|
| 58 |
-
class DetailedPlagiarismReport(BaseModel):
    """Comprehensive report generated by LLM"""
    filename: str
    scan_timestamp: str  # ISO-8601 timestamp of when the scan ran
    executive_summary: str  # first line of the LLM analysis (or a fallback sentence)
    overall_score: float  # plagiarism percentage (0-100), rounded to 2 decimals
    severity_level: str  # Low / Medium / High / Very High
    matched_sources: List[Dict[str, Any]]  # url/type/max_similarity per source
    key_findings: List[str]
    plagiarism_breakdown: Dict[str, Any]  # Types and percentages
    detailed_analysis: str  # LLM-generated detailed analysis
    affected_sections: List[Dict[str, Any]]  # Which parts are problematic
    recommendations: List[str]
    academic_integrity_risk: str  # Assessment level
|
| 72 |
-
|
| 73 |
-
# FastAPI application exposing the plagiarism-detection endpoints.
app = FastAPI(title="Pro Plagiarism Detector (Turnitin Clone)")

# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# rejected by browsers under the CORS spec; tighten origins before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 82 |
-
|
| 83 |
-
# ==========================================
|
| 84 |
-
# 3. Agent Tools: Serper & Semantic Scholar
|
| 85 |
-
# ==========================================
|
| 86 |
-
|
| 87 |
-
def _semantic_scholar_headers() -> Dict[str, str]:
    """Build request headers for Semantic Scholar, attaching the API key when configured."""
    if SEMANTIC_SCHOLAR_API_KEY:
        # Semantic Scholar expects the key in the x-api-key header.
        return {"x-api-key": SEMANTIC_SCHOLAR_API_KEY}
    return {}
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
def _semantic_scholar_get(path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """GET a Semantic Scholar Graph API path with client-side throttling and 429 backoff.

    Raises requests.HTTPError on a non-retryable HTTP error or after all retries.
    """
    global _last_semantic_scholar_call_ts
    # Drop None-valued params so they are not serialized into the query string.
    filtered_params = {k: v for k, v in (params or {}).items() if v is not None}

    for attempt in range(SEMANTIC_SCHOLAR_MAX_RETRIES):
        # Throttle: keep a minimum gap between consecutive calls.
        elapsed = time.time() - _last_semantic_scholar_call_ts
        if elapsed < SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS:
            time.sleep(SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - elapsed)

        response = requests.get(
            f"{SEMANTIC_SCHOLAR_BASE_URL}{path}",
            headers=_semantic_scholar_headers(),
            params=filtered_params,
            timeout=20,
        )
        _last_semantic_scholar_call_ts = time.time()

        # Rate-limited: honor an integer Retry-After header, otherwise
        # exponential backoff with jitter; last attempt falls through.
        if response.status_code == 429 and attempt < SEMANTIC_SCHOLAR_MAX_RETRIES - 1:
            retry_after = response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                wait_seconds = float(retry_after)
            else:
                wait_seconds = (2 ** attempt) + random.uniform(0.2, 0.7)
            time.sleep(wait_seconds)
            continue

        response.raise_for_status()
        return response.json()

    # Only reached if every attempt was a 429 that got `continue`d.
    raise requests.HTTPError("Semantic Scholar request failed after retries")
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
def _semantic_scholar_post(path: str, body: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> Any:
    """POST a JSON body to a Semantic Scholar Graph API path with throttling and 429 backoff.

    Mirrors _semantic_scholar_get; raises requests.HTTPError on failure.
    """
    global _last_semantic_scholar_call_ts
    # Drop None-valued params so they are not serialized into the query string.
    filtered_params = {k: v for k, v in (params or {}).items() if v is not None}

    for attempt in range(SEMANTIC_SCHOLAR_MAX_RETRIES):
        # Throttle: keep a minimum gap between consecutive calls.
        elapsed = time.time() - _last_semantic_scholar_call_ts
        if elapsed < SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS:
            time.sleep(SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - elapsed)

        response = requests.post(
            f"{SEMANTIC_SCHOLAR_BASE_URL}{path}",
            headers=_semantic_scholar_headers(),
            params=filtered_params,
            json=body,
            timeout=25,
        )
        _last_semantic_scholar_call_ts = time.time()

        # Rate-limited: honor an integer Retry-After header, otherwise
        # exponential backoff with jitter; last attempt falls through.
        if response.status_code == 429 and attempt < SEMANTIC_SCHOLAR_MAX_RETRIES - 1:
            retry_after = response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                wait_seconds = float(retry_after)
            else:
                wait_seconds = (2 ** attempt) + random.uniform(0.2, 0.7)
            time.sleep(wait_seconds)
            continue

        response.raise_for_status()
        return response.json()

    # Only reached if every attempt was a 429 that got `continue`d.
    raise requests.HTTPError("Semantic Scholar request failed after retries")
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
def s2_paper_autocomplete(query: str) -> Dict[str, Any]:
    """Suggest paper completions for a partial query (query capped at 100 chars)."""
    truncated = query[:100]
    return _semantic_scholar_get("/paper/autocomplete", {"query": truncated})
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
def s2_paper_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """Fetch details for up to 500 papers in a single POST request."""
    body = {"ids": ids[:500]}
    return _semantic_scholar_post("/paper/batch", body, {"fields": fields})
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
def s2_paper_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
    open_access_pdf: bool = False,
) -> Dict[str, Any]:
    """Relevance-ranked paper search; limit is clamped to the API's 1-100 window."""
    params: Dict[str, Any] = {
        "query": query,
        "fields": fields,
        "limit": max(1, min(limit, 100)),
        "offset": offset if offset > 0 else 0,
        "year": year,
        "fieldsOfStudy": fields_of_study,
    }
    if open_access_pdf:
        # Presence of the (empty) flag enables the open-access filter server-side.
        params["openAccessPdf"] = ""
    return _semantic_scholar_get("/paper/search", params)
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
def s2_paper_search_bulk(
    query: str,
    fields: Optional[str] = None,
    token: Optional[str] = None,
    sort: Optional[str] = None,
) -> Dict[str, Any]:
    """Bulk paper search; pass the continuation token from a prior page to resume."""
    params = {
        "query": query,
        "fields": fields,
        "token": token,
        "sort": sort,
    }
    return _semantic_scholar_get("/paper/search/bulk", params)
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
def s2_paper_search_match(query: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Title-match search: returns the single closest paper for a title-like query."""
    params = {"query": query, "fields": fields}
    return _semantic_scholar_get("/paper/search/match", params)
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
def s2_paper_details(paper_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch one paper record; the id is URL-encoded to survive DOI/arXiv separators."""
    encoded = quote_plus(paper_id)
    return _semantic_scholar_get(f"/paper/{encoded}", {"fields": fields})
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
def s2_paper_authors(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """List a paper's authors; limit is clamped to the API's 1-1000 window."""
    encoded = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": max(1, min(limit, 1000)),
        "offset": offset if offset > 0 else 0,
    }
    return _semantic_scholar_get(f"/paper/{encoded}/authors", params)
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
def s2_paper_citations(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    publication_date_or_year: Optional[str] = None,
) -> Dict[str, Any]:
    """Page through papers that cite the given paper, optionally date-filtered."""
    encoded = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": max(1, min(limit, 1000)),
        "offset": offset if offset > 0 else 0,
        "publicationDateOrYear": publication_date_or_year,
    }
    return _semantic_scholar_get(f"/paper/{encoded}/citations", params)
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
def s2_paper_references(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """Page through the papers a given paper cites."""
    encoded = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": max(1, min(limit, 1000)),
        "offset": offset if offset > 0 else 0,
    }
    return _semantic_scholar_get(f"/paper/{encoded}/references", params)
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
def s2_author_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """Fetch details for up to 1000 authors in a single POST request."""
    body = {"ids": ids[:1000]}
    return _semantic_scholar_post("/author/batch", body, {"fields": fields})
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
def s2_author_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """Search authors by name; limit is clamped to the API's 1-1000 window."""
    params = {
        "query": query,
        "fields": fields,
        "limit": max(1, min(limit, 1000)),
        "offset": offset if offset > 0 else 0,
    }
    return _semantic_scholar_get("/author/search", params)
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
def s2_author_details(author_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch a single author record by id (id is URL-encoded)."""
    encoded = quote_plus(author_id)
    return _semantic_scholar_get(f"/author/{encoded}", {"fields": fields})
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
def s2_author_papers(
    author_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    publication_date_or_year: Optional[str] = None,
) -> Dict[str, Any]:
    """Page through an author's papers, optionally filtered by publication date/year."""
    encoded = quote_plus(author_id)
    params = {
        "fields": fields,
        "limit": max(1, min(limit, 1000)),
        "offset": offset if offset > 0 else 0,
        "publicationDateOrYear": publication_date_or_year,
    }
    return _semantic_scholar_get(f"/author/{encoded}/papers", params)
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
def s2_snippet_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 10,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
) -> Dict[str, Any]:
    """Passage-level snippet search — the best fit for chunk-vs-source comparison."""
    params = {
        "query": query,
        "fields": fields,
        "limit": max(1, min(limit, 1000)),
        "year": year,
        "fieldsOfStudy": fields_of_study,
    }
    return _semantic_scholar_get("/snippet/search", params)
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
def build_search_query(text: str, max_terms: int = 10) -> str:
    """Collapse free text into a short keyword query.

    Drops common stopwords and 1-2 character tokens to improve search recall;
    falls back to the raw tokens when nothing survives filtering.
    """
    stopwords = {
        "the", "and", "for", "that", "with", "this", "from", "into", "our", "their",
        "were", "have", "has", "had", "been", "are", "was", "will", "would", "can",
        "could", "should", "about", "through", "using", "based", "than", "then", "also",
        "such", "these", "those", "while", "where", "when", "what", "which", "who",
    }
    tokens = re.findall(r"[A-Za-z0-9]+", text.lower())
    keywords = [t for t in tokens if len(t) > 2 and t not in stopwords]
    chosen = keywords if keywords else tokens
    return " ".join(chosen[:max_terms])
|
| 336 |
-
|
| 337 |
-
def search_google_serper(query: str) -> List[Dict]:
    """Searches the open web using Google Serper API.

    Returns up to 3 results as dicts with keys "text" (snippet), "url", and
    "source_type"; returns an empty list on any request/parse failure
    (best-effort: a search outage must not abort the whole scan).
    """
    url = "https://google.serper.dev/search"
    payload = {"q": query}
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    try:
        # BUG FIX: the request had no timeout, so a stalled connection could
        # hang the worker indefinitely; other HTTP calls in this file use
        # explicit timeouts.
        response = requests.post(url, headers=headers, json=payload, timeout=15)
        response.raise_for_status()
        data = response.json()

        results = []
        for item in data.get("organic", [])[:3]:  # Top 3 web results
            results.append({
                "text": item.get("snippet", ""),
                "url": item.get("link", ""),
                "source_type": "Web (Google)"
            })
        return results
    except Exception as e:
        print(f"Serper Error: {e}")
        return []
|
| 362 |
-
|
| 363 |
-
def search_semantic_scholar(query: str) -> List[Dict]:
    """Searches academic papers using Semantic Scholar API.

    Returns dicts with "text", "url", and "source_type"; results are cached per
    normalized keyword query. Returns [] on any failure (best-effort).
    """
    # Compact the raw chunk into keywords before hitting the API.
    prepared_query = build_search_query(query, max_terms=10)
    normalized_query = " ".join(prepared_query.split()).lower()
    if normalized_query in semantic_query_cache:
        return semantic_query_cache[normalized_query]

    try:
        results = []

        # Try snippet search first because it returns passage-level text better suited for chunk comparison.
        snippet_data = s2_snippet_search(
            query=prepared_query,
            fields="snippet.text,snippet.snippetKind",
            limit=3,
        )
        for item in snippet_data.get("data", []):
            snippet = item.get("snippet", {})
            paper = item.get("paper", {})
            snippet_text = snippet.get("text", "")
            if snippet_text:
                # Build a landing-page URL from the corpus id when available.
                corpus_id = paper.get("corpusId")
                paper_url = f"https://www.semanticscholar.org/paper/{corpus_id}" if corpus_id else None
                results.append({
                    "text": snippet_text,
                    "url": paper_url,
                    "source_type": "Academic (Semantic Scholar Snippet)",
                })

        # Keep paper abstract search as fallback/secondary source.
        data = s2_paper_search(
            query=prepared_query,
            limit=2,
            fields="title,abstract,url",
        )

        for item in data.get("data", []):
            if item.get("abstract"):  # Only keep if abstract exists to compare text
                results.append({
                    "text": item["abstract"],
                    "url": item.get("url", f"https://www.semanticscholar.org/paper/{item['paperId']}"),
                    "source_type": "Academic (Semantic Scholar)"
                })
        # Cache only on success so transient failures are retried next time.
        semantic_query_cache[normalized_query] = results
        return results
    except Exception as e:
        print(f"Semantic Scholar Error: {e}")
        return []
|
| 411 |
-
|
| 412 |
-
def aggregate_search(query: str) -> List[Dict]:
    """Combine web (Serper) and academic (Semantic Scholar) results, with caching."""
    # Trim to the first 15 words to keep the search query efficient.
    search_query = " ".join(query.split()[:15])

    cached = query_cache.get(search_query)
    if cached is not None:
        return cached

    # Run both searches: web results first, then academic ones.
    combined = search_google_serper(search_query) + search_semantic_scholar(search_query)
    query_cache[search_query] = combined

    # Pause to respect downstream rate limits.
    time.sleep(1)
    return combined
|
| 431 |
-
|
| 432 |
-
# ==========================================
|
| 433 |
-
# 4. Core Comparison Logic
|
| 434 |
-
# ==========================================
|
| 435 |
-
|
| 436 |
-
def calculate_exact_similarity(text1: str, text2: str) -> float:
    """Case-insensitive similarity ratio (difflib SequenceMatcher, 0.0-1.0)."""
    a = text1.lower()
    b = text2.lower()
    return SequenceMatcher(None, a, b).ratio()
|
| 438 |
-
|
| 439 |
-
def check_paraphrasing_with_llm(chunk: str, source_text: str) -> bool:
    """Ask the LLM whether `chunk` paraphrases `source_text`; True iff it answers YES."""
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert academic plagiarism detector. Determine if TEXT A is a direct paraphrase, stolen idea, or highly similar structure to TEXT B. Ignore generic academic phrases like 'In this paper we demonstrate'. Respond ONLY with 'YES' or 'NO'."),
        ("user", "TEXT A: {chunk}\n\nTEXT B: {source_text}")
    ])
    chain = prompt | llm
    response = chain.invoke({"chunk": chunk, "source_text": source_text})
    # Substring check tolerates replies like "YES." or "Answer: YES".
    return "YES" in response.content.upper()
|
| 447 |
-
|
| 448 |
-
def generate_detailed_report_with_llm(
    filename: str,
    match_reports: List[MatchReport],
    total_words: int,
    overall_score: float
) -> DetailedPlagiarismReport:
    """Generate a comprehensive report using LLM analysis.

    Args:
        filename: Name of the scanned document (echoed into the report).
        match_reports: Per-chunk comparison results.
        total_words: Word count of the scanned document.
        overall_score: Plagiarism percentage (0-100).

    Returns:
        DetailedPlagiarismReport combining deterministic aggregates with
        LLM-generated narrative sections.
    """
    from datetime import datetime

    # 1. Aggregate data for analysis
    plagiarized_reports = [r for r in match_reports if r.is_plagiarized]
    plagiarism_types: Dict[str, int] = {}
    sources_by_type: Dict[str, List[Dict[str, Any]]] = {"Academic": [], "Web": []}
    # BUG FIX: the previous dedup test was `report.source_url not in sources_by_type[...]`,
    # which compared a URL string against a list of dicts — it never matched, so
    # every plagiarized chunk appended a duplicate source entry. Track seen URLs
    # explicitly instead.
    seen_urls: Dict[str, set] = {"Academic": set(), "Web": set()}

    for report in plagiarized_reports:
        ptype = report.plagiarism_type or "Unknown"
        plagiarism_types[ptype] = plagiarism_types.get(ptype, 0) + 1

        if report.source_type:
            bucket = "Academic" if "Academic" in report.source_type else "Web"
            if report.source_url not in seen_urls[bucket]:
                seen_urls[bucket].add(report.source_url)
                sources_by_type[bucket].append({
                    "url": report.source_url,
                    "type": report.source_type,
                    "max_similarity": report.similarity_score
                })

    # 2. Determine severity level from the overall percentage.
    if overall_score < 15:
        severity = "Low"
        risk_level = "Minimal - Normal citation variations detected"
    elif overall_score < 30:
        severity = "Medium"
        risk_level = "Moderate - Multiple sources match detected"
    elif overall_score < 50:
        severity = "High"
        risk_level = "Significant - Substantial plagiarism detected"
    else:
        severity = "Very High"
        risk_level = "Critical - Extensive plagiarism detected"

    # 3. Use LLM to generate detailed analysis.
    # FIX: the document name line previously showed a "(unknown)" placeholder;
    # pass the actual `filename` argument through to the LLM context.
    plagiarism_context = f"""
Document: {filename}
Total Words: {total_words}
Plagiarism Score: {overall_score}%
Plagiarism Types Found: {plagiarism_types}
Academic Matches: {len(sources_by_type['Academic'])}
Web Matches: {len(sources_by_type['Web'])}

Suspicious Sections (samples):
{chr(10).join([f"- {r.chunk_text[:100]}..." for r in plagiarized_reports[:5]])}
"""

    analysis_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert academic integrity analyzer and plagiarism report generator.
Generate a professional, detailed plagiarism analysis report.
Focus on: severity assessment, academic integrity concerns, specific problem areas, and recommendations.
Be thorough but concise."""),
        ("user", """Create a detailed plagiarism analysis for this document:

{plagiarism_context}

Provide:
1. Executive Summary (2-3 sentences)
2. Key Findings (3-4 bullet points)
3. Detailed Analysis (2-3 paragraphs explaining the plagiarism pattern)
4. Recommendations (3-4 specific actions to remediate)

Format clearly with section headers.""")
    ])

    chain = analysis_prompt | llm
    llm_response = chain.invoke({"plagiarism_context": plagiarism_context})
    llm_analysis = llm_response.content

    # 4. Extract findings and recommendations from the LLM's sectioned response.
    lines = llm_analysis.split('\n')
    key_findings = []
    recommendations = []
    detailed_analysis = ""

    in_findings = False
    in_recommendations = False

    for line in lines:
        if 'Key Findings' in line:
            in_findings = True
            in_recommendations = False
        elif 'Recommendations' in line:
            in_findings = False
            in_recommendations = True
        elif 'Detailed Analysis' in line or 'Analysis' in line:
            # NOTE: any line containing "Analysis" resets both flags; remaining
            # non-bullet lines accumulate into detailed_analysis.
            in_findings = False
            in_recommendations = False
        elif in_findings and line.strip().startswith(('-', '*', '•')):
            key_findings.append(line.strip().lstrip('-*•').strip())
        elif in_recommendations and line.strip().startswith(('-', '*', '•')):
            recommendations.append(line.strip().lstrip('-*•').strip())
        elif not in_findings and not in_recommendations and line.strip():
            detailed_analysis += line + "\n"

    # Deterministic fallbacks when the LLM output did not parse into sections.
    if not key_findings:
        key_findings = [
            f"Overall plagiarism score: {overall_score}%",
            f"Primary plagiarism type: {max(plagiarism_types.keys(), key=plagiarism_types.get) if plagiarism_types else 'Not detected'}",
            f"Multiple sources detected: {len(sources_by_type['Academic']) + len(sources_by_type['Web'])} sources"
        ]

    if not recommendations:
        recommendations = [
            "Properly cite all sources according to your institution's guidelines",
            "Use quotation marks for direct quotes and provide page numbers",
            "Paraphrase content properly and cite original sources",
            "Use plagiarism detection tools during the writing process"
        ]

    # 5. Affected sections (top 10 flagged chunks, truncated snippets).
    affected_sections = []
    for i, report in enumerate(plagiarized_reports[:10]):
        affected_sections.append({
            "section_number": i + 1,
            "text_snippet": report.chunk_text[:150],
            "similarity_score": report.similarity_score,
            "plagiarism_type": report.plagiarism_type,
            "source": report.source_url,
            "source_type": report.source_type
        })

    return DetailedPlagiarismReport(
        filename=filename,
        scan_timestamp=datetime.now().isoformat(),
        executive_summary=llm_analysis.split('\n')[0] if llm_analysis else f"Document contains {overall_score}% plagiarized content",
        overall_score=round(overall_score, 2),
        severity_level=severity,
        matched_sources=sources_by_type["Academic"] + sources_by_type["Web"],
        key_findings=key_findings,
        plagiarism_breakdown={
            "total_plagiarism_percentage": round(overall_score, 2),
            "types": plagiarism_types,
            "academic_sources": len(sources_by_type["Academic"]),
            "web_sources": len(sources_by_type["Web"])
        },
        detailed_analysis=detailed_analysis or llm_analysis,
        affected_sections=affected_sections,
        recommendations=recommendations,
        academic_integrity_risk=risk_level
    )
|
| 603 |
-
|
| 604 |
-
def analyze_chunk(chunk: str) -> MatchReport:
    """Compare one document chunk against web + academic search results.

    First-match-wins: scanning stops at the first source that clears either
    the exact-similarity threshold or the LLM paraphrase check.
    """
    search_results = aggregate_search(chunk)

    best_score = 0.0
    best_url = None
    best_source_type = None
    plagiarism_type = None
    is_plagiarized = False

    for result in search_results:
        source_text = result['text']

        # 1. Math/Deterministic Check
        exact_sim = calculate_exact_similarity(chunk, source_text)

        # Track the best-scoring source even if no threshold is crossed.
        if exact_sim > best_score:
            best_score = exact_sim
            best_url = result['url']
            best_source_type = result['source_type']

        if exact_sim > 0.50:  # Lowered to 50% because we are comparing against abstracts/snippets
            is_plagiarized = True
            plagiarism_type = "Exact/Heavy Match"
            break

        # 2. Agentic Check for Mosaic Plagiarism
        elif exact_sim > 0.25:
            if check_paraphrasing_with_llm(chunk, source_text):
                is_plagiarized = True
                plagiarism_type = "Paraphrased Match (Mosaic)"
                best_url = result['url']
                best_source_type = result['source_type']
                # Floor the reported score at 0.85 for LLM-confirmed paraphrase.
                best_score = max(best_score, 0.85)
                break

    return MatchReport(
        chunk_text=chunk,
        is_plagiarized=is_plagiarized,
        plagiarism_type=plagiarism_type,
        source_url=best_url,
        source_type=best_source_type,
        similarity_score=round(best_score, 2)
    )
|
| 647 |
-
|
| 648 |
-
# ==========================================
|
| 649 |
-
# 6. Report Formatting Functions
|
| 650 |
-
# ==========================================
|
| 651 |
-
|
| 652 |
-
def format_report_json(detailed_report: DetailedPlagiarismReport) -> Dict[str, Any]:
    """Serialize the detailed report to a JSON-friendly dict.

    Keeps legacy top-level keys (overall_score, severity_level,
    academic_integrity_risk) alongside the nested "summary" for older clients.
    """
    summary = {
        "overall_plagiarism_score": detailed_report.overall_score,
        "severity_level": detailed_report.severity_level,
        "academic_integrity_risk": detailed_report.academic_integrity_risk,
    }
    payload: Dict[str, Any] = {
        "filename": detailed_report.filename,
        "scan_timestamp": detailed_report.scan_timestamp,
        # Backward-compatible top-level fields expected by existing clients.
        "overall_score": detailed_report.overall_score,
        "severity_level": detailed_report.severity_level,
        "academic_integrity_risk": detailed_report.academic_integrity_risk,
        "summary": summary,
        "executive_summary": detailed_report.executive_summary,
        "key_findings": detailed_report.key_findings,
        "plagiarism_breakdown": detailed_report.plagiarism_breakdown,
        "matched_sources": detailed_report.matched_sources,
        "affected_sections": detailed_report.affected_sections,
        "detailed_analysis": detailed_report.detailed_analysis,
        "recommendations": detailed_report.recommendations,
    }
    return payload
|
| 674 |
-
|
| 675 |
-
def format_report_text(detailed_report: DetailedPlagiarismReport) -> str:
    """Format report as plain text.

    Renders the full report as an 80-column banner-delimited plain-text
    document: summary, findings, breakdown, sources, analysis, affected
    sections (top 5) and recommendations.
    """
    report = "=" * 80 + "\n"
    report += "DETAILED PLAGIARISM DETECTION REPORT\n"
    report += "=" * 80 + "\n\n"

    report += f"FILE: {detailed_report.filename}\n"
    report += f"SCAN DATE: {detailed_report.scan_timestamp}\n"
    report += "-" * 80 + "\n\n"

    report += "SUMMARY\n"
    report += "-" * 80 + "\n"
    report += f"Overall Plagiarism Score: {detailed_report.overall_score}%\n"
    report += f"Severity Level: {detailed_report.severity_level}\n"
    report += f"Academic Integrity Risk: {detailed_report.academic_integrity_risk}\n\n"

    report += "EXECUTIVE SUMMARY\n"
    report += "-" * 80 + "\n"
    report += f"{detailed_report.executive_summary}\n\n"

    report += "KEY FINDINGS\n"
    report += "-" * 80 + "\n"
    for i, finding in enumerate(detailed_report.key_findings, 1):
        report += f"{i}. {finding}\n"
    report += "\n"

    report += "PLAGIARISM BREAKDOWN\n"
    report += "-" * 80 + "\n"
    report += f"Total Plagiarism %: {detailed_report.plagiarism_breakdown['total_plagiarism_percentage']}%\n"
    report += f"Academic Sources: {detailed_report.plagiarism_breakdown['academic_sources']}\n"
    report += f"Web Sources: {detailed_report.plagiarism_breakdown['web_sources']}\n"
    if detailed_report.plagiarism_breakdown.get('types'):
        report += "Types Detected:\n"
        for ptype, count in detailed_report.plagiarism_breakdown['types'].items():
            report += f"  - {ptype}: {count} instances\n"
    report += "\n"

    report += "MATCHED SOURCES\n"
    report += "-" * 80 + "\n"
    if detailed_report.matched_sources:
        # Cap at 10 sources to keep the text report readable.
        for i, source in enumerate(detailed_report.matched_sources[:10], 1):
            report += f"{i}. URL: {source.get('url', 'N/A')}\n"
            report += f"   Type: {source.get('type', 'N/A')}\n"
            report += f"   Similarity: {source.get('max_similarity', 'N/A')}\n\n"
    else:
        report += "No sources matched.\n\n"

    report += "DETAILED ANALYSIS\n"
    report += "-" * 80 + "\n"
    report += f"{detailed_report.detailed_analysis}\n\n"

    if detailed_report.affected_sections:
        report += "AFFECTED SECTIONS (Top Issues)\n"
        report += "-" * 80 + "\n"
        for section in detailed_report.affected_sections[:5]:
            report += f"\nSection {section['section_number']}:\n"
            report += f"Text Snippet: {section['text_snippet']}\n"
            report += f"Similarity Score: {section['similarity_score']}\n"
            report += f"Plagiarism Type: {section['plagiarism_type']}\n"
            report += f"Source: {section['source']}\n"
        report += "\n"

    report += "RECOMMENDATIONS\n"
    report += "-" * 80 + "\n"
    for i, rec in enumerate(detailed_report.recommendations, 1):
        report += f"{i}. {rec}\n"
    report += "\n"

    report += "=" * 80 + "\n"
    report += "End of Report\n"
    report += "=" * 80 + "\n"

    return report
|
| 748 |
-
|
| 749 |
-
def format_report_html(detailed_report: DetailedPlagiarismReport) -> str:
    """Render the detailed plagiarism report as a self-contained HTML page.

    Inline CSS only (no external assets). Caps output at 10 matched
    sources and 5 affected sections, mirroring the text formatter.
    """
    # NOTE: literal CSS braces are doubled ({{ }}) because this is an f-string.
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Plagiarism Detection Report - {detailed_report.filename}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }}
            .container {{ background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
            h1 {{ color: #333; border-bottom: 3px solid #2196F3; padding-bottom: 10px; }}
            h2 {{ color: #2196F3; margin-top: 30px; }}
            .summary {{ background-color: #f0f7ff; padding: 15px; border-left: 4px solid #2196F3; margin: 20px 0; }}
            .score {{ font-size: 24px; font-weight: bold; color: #d32f2f; }}
            .severity-low {{ color: #4caf50; }}
            .severity-medium {{ color: #ff9800; }}
            .severity-high {{ color: #f44336; }}
            .severity-very-high {{ color: #c41c3b; }}
            .findings {{ background-color: #fff3e0; padding: 15px; border-left: 4px solid #ff9800; }}
            .source-item {{ background-color: #f5f5f5; padding: 10px; margin: 10px 0; border-radius: 4px; }}
            .recommendation {{ background-color: #e8f5e9; padding: 10px; margin: 10px 0; border-left: 3px solid #4caf50; }}
            table {{ width: 100%; border-collapse: collapse; margin: 15px 0; }}
            th, td {{ padding: 10px; text-align: left; border-bottom: 1px solid #ddd; }}
            th {{ background-color: #2196F3; color: white; }}
            .affected-section {{ background-color: #fce4ec; padding: 15px; margin: 10px 0; border-radius: 4px; }}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>🔍 Plagiarism Detection Report</h1>

            <div class="summary">
                <p><strong>File:</strong> {detailed_report.filename}</p>
                <p><strong>Scan Date:</strong> {detailed_report.scan_timestamp}</p>
                <p><strong>Overall Plagiarism Score:</strong> <span class="score">{detailed_report.overall_score}%</span></p>
                <p><strong>Severity Level:</strong> <span class="severity-{detailed_report.severity_level.lower().replace(' ', '-')}">{detailed_report.severity_level}</span></p>
                <p><strong>Academic Integrity Risk:</strong> {detailed_report.academic_integrity_risk}</p>
            </div>

            <h2>Executive Summary</h2>
            <p>{detailed_report.executive_summary}</p>

            <h2>Key Findings</h2>
            <div class="findings">
                <ul>
                {"".join([f"<li>{finding}</li>" for finding in detailed_report.key_findings])}
                </ul>
            </div>

            <h2>Plagiarism Breakdown</h2>
            <table>
                <tr>
                    <th>Category</th>
                    <th>Value</th>
                </tr>
                <tr>
                    <td>Total Plagiarism %</td>
                    <td>{detailed_report.plagiarism_breakdown['total_plagiarism_percentage']}%</td>
                </tr>
                <tr>
                    <td>Academic Sources</td>
                    <td>{detailed_report.plagiarism_breakdown['academic_sources']}</td>
                </tr>
                <tr>
                    <td>Web Sources</td>
                    <td>{detailed_report.plagiarism_breakdown['web_sources']}</td>
                </tr>
            </table>

            <h2>Matched Sources</h2>
            {"".join([f'<div class="source-item"><strong>{source.get("type", "Unknown")}</strong><br/><a href="{source.get("url", "#")}" target="_blank">{source.get("url", "N/A")}</a><br/>Similarity: {source.get("max_similarity", "N/A")}</div>' for source in detailed_report.matched_sources[:10]])}

            <h2>Detailed Analysis</h2>
            <p>{detailed_report.detailed_analysis.replace(chr(10), "<br/>")}</p>

            {"<h2>Affected Sections (Top Issues)</h2>" + "".join([f'<div class="affected-section"><strong>Section {section["section_number"]}</strong><br/><em>Text:</em> {section["text_snippet"]}...<br/><em>Similarity:</em> {section["similarity_score"]}<br/><em>Type:</em> {section["plagiarism_type"]}</div>' for section in detailed_report.affected_sections[:5]]) if detailed_report.affected_sections else ""}

            <h2>Recommendations</h2>
            <div>
            {"".join([f'<div class="recommendation"><strong>✓</strong> {rec}</div>' for rec in detailed_report.recommendations])}
            </div>
        </div>
    </body>
    </html>
    """
    return html
|
| 837 |
-
|
| 838 |
-
# ==========================================
|
| 839 |
-
# 5. API Endpoints & Utility
|
| 840 |
-
# ==========================================
|
| 841 |
-
|
| 842 |
-
def extract_text_from_pdf(file_bytes) -> str:
    """Extract plain text from a PDF file object.

    Args:
        file_bytes: Binary file-like object readable by ``PyPDF2.PdfReader``
            (e.g. FastAPI's ``UploadFile.file``).

    Returns:
        Text of all pages that yielded any text, each followed by a
        newline. Empty string for image-only/scanned PDFs.
    """
    reader = PdfReader(file_bytes)
    parts = []
    for page in reader.pages:
        # Call extract_text() once per page; it can be expensive, so avoid
        # invoking it a second time just to filter empty pages.
        page_text = page.extract_text()
        if page_text:
            parts.append(page_text + "\n")
    return "".join(parts)
|
| 845 |
-
|
| 846 |
-
def chunk_text(text: str, words_per_chunk: int = 40, overlap: int = 10) -> List[str]:
    """Split text into overlapping word windows for per-chunk analysis.

    Args:
        text: Source text to split on whitespace.
        words_per_chunk: Target window size in words.
        overlap: Number of words shared between consecutive windows
            (defaults to the previous hard-coded value of 10).

    Returns:
        Windows of up to ``words_per_chunk`` words. Windows of 15 or
        fewer words (short tails) are dropped.
    """
    words = text.split()
    # Guard the stride: if words_per_chunk <= overlap the naive
    # "words_per_chunk - overlap" step would be zero (range() raises
    # ValueError) or negative (silently yields no chunks).
    step = max(1, words_per_chunk - overlap)
    chunks = []
    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i + words_per_chunk])
        if len(chunk.split()) > 15:
            chunks.append(chunk)
    return chunks
|
| 854 |
-
|
| 855 |
-
@app.post("/scan-paper", response_model=PlagiarismReport)
async def scan_paper(file: UploadFile = File(...)):
    """Basic scan endpoint: chunk the uploaded PDF and score each chunk.

    Raises HTTP 400 when no text can be extracted (e.g. a scanned PDF).
    Returns a PlagiarismReport with per-chunk details and a
    word-weighted overall score.
    """
    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    chunks = chunk_text(text)

    # Cap chunks for safety during testing (remove in production)
    if len(chunks) > 20:
        chunks = chunks[:20]

    detailed_reports = []
    plagiarized_word_count = 0

    for chunk in chunks:
        # analyze_chunk is defined elsewhere in this module; each call does
        # external search/LLM work, so this loop dominates request latency.
        report = analyze_chunk(chunk)
        detailed_reports.append(report)

        if report.is_plagiarized:
            plagiarized_word_count += len(chunk.split())

    # Chunks overlap (see chunk_text), so summed words can exceed the
    # document length; clamp before computing the percentage.
    plagiarized_word_count = min(plagiarized_word_count, total_words)
    overall_score = (plagiarized_word_count / total_words) * 100

    # Determine severity level
    if overall_score < 15:
        severity = "Low"
    elif overall_score < 30:
        severity = "Medium"
    elif overall_score < 50:
        severity = "High"
    else:
        severity = "Very High"

    return PlagiarismReport(
        filename=file.filename,
        total_words=total_words,
        plagiarized_words=plagiarized_word_count,
        overall_plagiarism_score=round(overall_score, 2),
        severity_level=severity,
        details=detailed_reports
    )
|
| 900 |
-
|
| 901 |
-
@app.post("/generate-detailed-report")
async def generate_detailed_report(file: UploadFile = File(...)):
    """Generate comprehensive plagiarism report with LLM analysis.

    Same scan pipeline as /scan-paper, then delegates to
    generate_detailed_report_with_llm and returns the JSON-formatted report.
    """
    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    chunks = chunk_text(text)

    # Cap chunks
    if len(chunks) > 20:
        chunks = chunks[:20]

    detailed_reports = []
    plagiarized_word_count = 0

    for chunk in chunks:
        report = analyze_chunk(chunk)
        detailed_reports.append(report)

        if report.is_plagiarized:
            plagiarized_word_count += len(chunk.split())

    # Overlapping chunks can double-count words; clamp to document length.
    plagiarized_word_count = min(plagiarized_word_count, total_words)
    overall_score = (plagiarized_word_count / total_words) * 100

    # Generate detailed report with LLM analysis
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=detailed_reports,
        total_words=total_words,
        overall_score=overall_score
    )

    return format_report_json(detailed_report)
|
| 938 |
-
|
| 939 |
-
@app.post("/report/text")
async def report_text(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as plain text.

    Same pipeline as /generate-detailed-report but rendered through
    format_report_text and returned as text/plain.
    """
    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    chunks = chunk_text(text)

    if len(chunks) > 20:
        chunks = chunks[:20]

    detailed_reports = []
    plagiarized_word_count = 0

    for chunk in chunks:
        report = analyze_chunk(chunk)
        detailed_reports.append(report)

        if report.is_plagiarized:
            plagiarized_word_count += len(chunk.split())

    # Overlapping chunks can double-count words; clamp to document length.
    plagiarized_word_count = min(plagiarized_word_count, total_words)
    overall_score = (plagiarized_word_count / total_words) * 100

    # Generate detailed report
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=detailed_reports,
        total_words=total_words,
        overall_score=overall_score
    )

    # Local import keeps the response class scoped to this endpoint.
    from fastapi.responses import PlainTextResponse
    return PlainTextResponse(format_report_text(detailed_report))
|
| 976 |
-
|
| 977 |
-
@app.post("/report/html")
async def report_html(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as HTML.

    Same pipeline as /generate-detailed-report but rendered through
    format_report_html and returned as text/html.
    """
    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    chunks = chunk_text(text)

    if len(chunks) > 20:
        chunks = chunks[:20]

    detailed_reports = []
    plagiarized_word_count = 0

    for chunk in chunks:
        report = analyze_chunk(chunk)
        detailed_reports.append(report)

        if report.is_plagiarized:
            plagiarized_word_count += len(chunk.split())

    # Overlapping chunks can double-count words; clamp to document length.
    plagiarized_word_count = min(plagiarized_word_count, total_words)
    overall_score = (plagiarized_word_count / total_words) * 100

    # Generate detailed report
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=detailed_reports,
        total_words=total_words,
        overall_score=overall_score
    )

    # Local import keeps the response class scoped to this endpoint.
    from fastapi.responses import HTMLResponse
    return HTMLResponse(format_report_html(detailed_report))
|
| 1014 |
-
|
| 1015 |
-
@app.get("/")
async def root():
    """Service index: advertises the available endpoints."""
    endpoints = {
        "scan": "/scan-paper (POST - basic scan)",
        "detailed_report": "/generate-detailed-report (POST - JSON report with LLM analysis)",
        "text_report": "/report/text (POST - plain text report)",
        "html_report": "/report/html (POST - HTML report)",
    }
    return {"message": "Pro Plagiarism Detector API", "endpoints": endpoints}
|
| 1026 |
-
|
| 1027 |
-
if __name__ == "__main__":
    # Run the API with uvicorn when executed directly (not imported).
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import requests
|
| 4 |
+
import random
|
| 5 |
+
import re
|
| 6 |
+
from difflib import SequenceMatcher
|
| 7 |
+
from typing import List, Optional, Dict, Any
|
| 8 |
+
from urllib.parse import quote_plus
|
| 9 |
+
|
| 10 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 11 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
+
import uvicorn
|
| 13 |
+
from pydantic import BaseModel
|
| 14 |
+
from PyPDF2 import PdfReader
|
| 15 |
+
|
| 16 |
+
from langchain_groq import ChatGroq
|
| 17 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 18 |
+
|
| 19 |
+
# ==========================================
|
| 20 |
+
# 1. Environment & API Setup
|
| 21 |
+
# ==========================================
|
| 22 |
+
# API credentials are read from the environment; never hard-code them.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1"
# Client-side throttle: minimum spacing between Semantic Scholar calls,
# and how many times a 429 response is retried.
SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS = 1.2
SEMANTIC_SCHOLAR_MAX_RETRIES = 4

if not GROQ_API_KEY or not SERPER_API_KEY:
    print("WARNING: GROQ_API_KEY or SERPER_API_KEY is missing!")

# Low temperature keeps the YES/NO plagiarism classification stable.
llm = ChatGroq(model="openai/gpt-oss-120b", temperature=0.1)

# Basic Memory Cache to maintain API efficiency (as promised in the application)
# NOTE(review): both caches are unbounded dicts — consider an LRU bound
# for long-running deployments.
query_cache = {}
semantic_query_cache: Dict[str, List[Dict[str, str]]] = {}
# Timestamp of the most recent Semantic Scholar request (rate limiting).
_last_semantic_scholar_call_ts = 0.0
|
| 38 |
+
|
| 39 |
+
# ==========================================
|
| 40 |
+
# 2. Pydantic Models
|
| 41 |
+
# ==========================================
|
| 42 |
+
class MatchReport(BaseModel):
    """Per-chunk result of the plagiarism analysis."""
    chunk_text: str  # the analyzed excerpt of the document
    is_plagiarized: bool
    plagiarism_type: Optional[str] = None  # analyzer's classification label; None when clean
    source_url: Optional[str] = None  # URL of the matching source, if any
    source_type: Optional[str] = None  # "Academic" or "Web"
    similarity_score: float  # similarity against the best matching source
|
| 49 |
+
|
| 50 |
+
class PlagiarismReport(BaseModel):
    """Top-level response of the basic /scan-paper endpoint."""
    filename: str
    total_words: int  # word count of the extracted PDF text
    plagiarized_words: int  # words in flagged chunks, clamped to total_words
    overall_plagiarism_score: float  # percentage (0-100), rounded to 2 decimals
    severity_level: str  # Low, Medium, High, Very High
    details: List[MatchReport]  # one entry per analyzed chunk
|
| 57 |
+
|
| 58 |
+
class DetailedPlagiarismReport(BaseModel):
    """Comprehensive report generated by LLM"""
    filename: str
    scan_timestamp: str  # when the scan ran (string, formatted by the generator)
    executive_summary: str
    overall_score: float  # percentage of plagiarized words (0-100)
    severity_level: str  # Low / Medium / High / Very High
    matched_sources: List[Dict[str, Any]]  # entries with url/type/max_similarity keys
    key_findings: List[str]
    plagiarism_breakdown: Dict[str, Any]  # Types and percentages
    detailed_analysis: str  # LLM-generated detailed analysis
    affected_sections: List[Dict[str, Any]]  # Which parts are problematic
    recommendations: List[str]
    academic_integrity_risk: str  # Assessment level
|
| 72 |
+
|
| 73 |
+
app = FastAPI(title="Pro Plagiarism Detector (Turnitin Clone)")

# Wide-open CORS so browser front-ends can call the API from anywhere.
# NOTE(review): "*" origins combined with allow_credentials=True is very
# permissive — tighten the origin list for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 82 |
+
|
| 83 |
+
# ==========================================
|
| 84 |
+
# 3. Agent Tools: Serper & Semantic Scholar
|
| 85 |
+
# ==========================================
|
| 86 |
+
|
| 87 |
+
def _semantic_scholar_headers() -> Dict[str, str]:
    """Build request headers for Semantic Scholar calls.

    Includes the API key (sent via the x-api-key header) only when one
    is configured; otherwise returns an empty header dict.
    """
    if SEMANTIC_SCHOLAR_API_KEY:
        return {"x-api-key": SEMANTIC_SCHOLAR_API_KEY}
    return {}
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _semantic_scholar_get(path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """GET a Semantic Scholar Graph API path with throttling and 429 retries.

    Enforces a module-wide minimum interval between calls, retries up to
    SEMANTIC_SCHOLAR_MAX_RETRIES times on HTTP 429 (honoring an integer
    Retry-After header, otherwise exponential backoff with jitter), and
    raises for any other HTTP error via raise_for_status().
    """
    global _last_semantic_scholar_call_ts
    # Drop None-valued params so they are not serialized as literal "None".
    filtered_params = {k: v for k, v in (params or {}).items() if v is not None}

    for attempt in range(SEMANTIC_SCHOLAR_MAX_RETRIES):
        # Client-side rate limit: sleep out the remainder of the interval.
        elapsed = time.time() - _last_semantic_scholar_call_ts
        if elapsed < SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS:
            time.sleep(SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - elapsed)

        response = requests.get(
            f"{SEMANTIC_SCHOLAR_BASE_URL}{path}",
            headers=_semantic_scholar_headers(),
            params=filtered_params,
            timeout=20,
        )
        _last_semantic_scholar_call_ts = time.time()

        if response.status_code == 429 and attempt < SEMANTIC_SCHOLAR_MAX_RETRIES - 1:
            # Honor Retry-After only when it is a plain integer; HTTP-date
            # forms fall through to the backoff branch.
            retry_after = response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                wait_seconds = float(retry_after)
            else:
                wait_seconds = (2 ** attempt) + random.uniform(0.2, 0.7)
            time.sleep(wait_seconds)
            continue

        response.raise_for_status()
        return response.json()

    # All attempts were rate-limited.
    raise requests.HTTPError("Semantic Scholar request failed after retries")
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _semantic_scholar_post(path: str, body: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> Any:
    """POST to a Semantic Scholar Graph API path with throttling and 429 retries.

    Mirrors _semantic_scholar_get: module-wide minimum call interval,
    up to SEMANTIC_SCHOLAR_MAX_RETRIES attempts on HTTP 429 (integer
    Retry-After or exponential backoff + jitter), raise_for_status()
    for other errors. `body` is sent as the JSON payload.
    """
    global _last_semantic_scholar_call_ts
    # Drop None-valued query params so they are not serialized as "None".
    filtered_params = {k: v for k, v in (params or {}).items() if v is not None}

    for attempt in range(SEMANTIC_SCHOLAR_MAX_RETRIES):
        # Client-side rate limit: sleep out the remainder of the interval.
        elapsed = time.time() - _last_semantic_scholar_call_ts
        if elapsed < SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS:
            time.sleep(SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - elapsed)

        response = requests.post(
            f"{SEMANTIC_SCHOLAR_BASE_URL}{path}",
            headers=_semantic_scholar_headers(),
            params=filtered_params,
            json=body,
            timeout=25,
        )
        _last_semantic_scholar_call_ts = time.time()

        if response.status_code == 429 and attempt < SEMANTIC_SCHOLAR_MAX_RETRIES - 1:
            retry_after = response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                wait_seconds = float(retry_after)
            else:
                wait_seconds = (2 ** attempt) + random.uniform(0.2, 0.7)
            time.sleep(wait_seconds)
            continue

        response.raise_for_status()
        return response.json()

    # All attempts were rate-limited.
    raise requests.HTTPError("Semantic Scholar request failed after retries")
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def s2_paper_autocomplete(query: str) -> Dict[str, Any]:
    """Paper title autocomplete (GET /paper/autocomplete); query capped at 100 chars."""
    truncated = query[:100]
    return _semantic_scholar_get("/paper/autocomplete", {"query": truncated})
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def s2_paper_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """Batch paper lookup (POST /paper/batch); at most 500 IDs are sent."""
    capped_ids = ids[:500]
    return _semantic_scholar_post("/paper/batch", {"ids": capped_ids}, {"fields": fields})
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def s2_paper_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
    open_access_pdf: bool = False,
) -> Dict[str, Any]:
    """Relevance search over papers (GET /paper/search); limit clamped to 1-100."""
    clamped_limit = min(max(limit, 1), 100)
    clamped_offset = max(offset, 0)
    params: Dict[str, Any] = {
        "query": query,
        "fields": fields,
        "limit": clamped_limit,
        "offset": clamped_offset,
        "year": year,
        "fieldsOfStudy": fields_of_study,
    }
    if open_access_pdf:
        # The filter is enabled by the parameter's presence (empty value).
        params["openAccessPdf"] = ""
    return _semantic_scholar_get("/paper/search", params)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def s2_paper_search_bulk(
    query: str,
    fields: Optional[str] = None,
    token: Optional[str] = None,
    sort: Optional[str] = None,
) -> Dict[str, Any]:
    """Bulk paper search (GET /paper/search/bulk); `token` continues pagination."""
    params = {"query": query, "fields": fields, "token": token, "sort": sort}
    return _semantic_scholar_get("/paper/search/bulk", params)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def s2_paper_search_match(query: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Closest-title match lookup (GET /paper/search/match)."""
    params = {"query": query, "fields": fields}
    return _semantic_scholar_get("/paper/search/match", params)
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def s2_paper_details(paper_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch one paper record; the ID is URL-encoded (IDs may contain ':' etc.)."""
    encoded_id = quote_plus(paper_id)
    return _semantic_scholar_get(f"/paper/{encoded_id}", {"fields": fields})
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def s2_paper_authors(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """List a paper's authors (GET /paper/{id}/authors); limit clamped to 1-1000."""
    encoded_id = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
    }
    return _semantic_scholar_get(f"/paper/{encoded_id}/authors", params)
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def s2_paper_citations(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    publication_date_or_year: Optional[str] = None,
) -> Dict[str, Any]:
    """List citing papers (GET /paper/{id}/citations); limit clamped to 1-1000."""
    encoded_id = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
        "publicationDateOrYear": publication_date_or_year,
    }
    return _semantic_scholar_get(f"/paper/{encoded_id}/citations", params)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def s2_paper_references(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """List a paper's references (GET /paper/{id}/references); limit clamped to 1-1000."""
    encoded_id = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
    }
    return _semantic_scholar_get(f"/paper/{encoded_id}/references", params)
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
def s2_author_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """Batch author lookup (POST /author/batch); at most 1000 IDs are sent."""
    capped_ids = ids[:1000]
    return _semantic_scholar_post("/author/batch", {"ids": capped_ids}, {"fields": fields})
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def s2_author_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """Author name search (GET /author/search); limit clamped to 1-1000."""
    params = {
        "query": query,
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
    }
    return _semantic_scholar_get("/author/search", params)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def s2_author_details(author_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch one author record; the ID is URL-encoded before use in the path."""
    encoded_id = quote_plus(author_id)
    return _semantic_scholar_get(f"/author/{encoded_id}", {"fields": fields})
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def s2_author_papers(
    author_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    publication_date_or_year: Optional[str] = None,
) -> Dict[str, Any]:
    """List an author's papers (GET /author/{id}/papers); limit clamped to 1-1000."""
    encoded_id = quote_plus(author_id)
    params = {
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
        "publicationDateOrYear": publication_date_or_year,
    }
    return _semantic_scholar_get(f"/author/{encoded_id}/papers", params)
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def s2_snippet_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 10,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
) -> Dict[str, Any]:
    """Passage-level snippet search (GET /snippet/search); limit clamped to 1-1000."""
    params = {
        "query": query,
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "year": year,
        "fieldsOfStudy": fields_of_study,
    }
    return _semantic_scholar_get("/snippet/search", params)
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def build_search_query(text: str, max_terms: int = 10) -> str:
    """Builds a compact keyword query to improve search recall and reduce noisy long queries."""
    stop_terms = {
        "the", "and", "for", "that", "with", "this", "from", "into", "our", "their",
        "were", "have", "has", "had", "been", "are", "was", "will", "would", "can",
        "could", "should", "about", "through", "using", "based", "than", "then", "also",
        "such", "these", "those", "while", "where", "when", "what", "which", "who",
    }
    tokens = re.findall(r"[A-Za-z0-9]+", text.lower())
    # Keep alphanumeric terms longer than two characters that are not stopwords;
    # if nothing survives the filter, fall back to the raw token stream.
    significant = [t for t in tokens if len(t) > 2 and t not in stop_terms]
    chosen = significant if significant else tokens
    return " ".join(chosen[:max_terms])
|
| 336 |
+
|
| 337 |
+
def search_google_serper(query: str) -> List[Dict]:
    """Searches the open web using Google Serper API.

    Returns up to three results as dicts with "text" (snippet), "url",
    and "source_type" keys; returns [] on any request or parse failure.
    """
    url = "https://google.serper.dev/search"
    payload = {"q": query}
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    try:
        # Explicit timeout so a hung Serper request cannot block the whole
        # scan pipeline indefinitely.
        response = requests.post(url, headers=headers, json=payload, timeout=15)
        response.raise_for_status()
        data = response.json()

        results = []
        for item in data.get("organic", [])[:3]:  # Top 3 web results
            results.append({
                "text": item.get("snippet", ""),
                "url": item.get("link", ""),
                "source_type": "Web (Google)"
            })
        return results
    except Exception as e:
        # Best-effort: search failures degrade to "no web matches".
        print(f"Serper Error: {e}")
        return []
|
| 362 |
+
|
| 363 |
+
def search_semantic_scholar(query: str) -> List[Dict]:
    """Searches academic papers using Semantic Scholar API.

    Two-tier search: passage-level snippets first, then paper abstracts
    as a secondary source. Results are memoized in semantic_query_cache
    keyed by the normalized keyword query; failures return [].
    """
    # Normalize to a compact keyword query so near-identical chunks hit the cache.
    prepared_query = build_search_query(query, max_terms=10)
    normalized_query = " ".join(prepared_query.split()).lower()
    if normalized_query in semantic_query_cache:
        return semantic_query_cache[normalized_query]

    try:
        results = []

        # Try snippet search first because it returns passage-level text better suited for chunk comparison.
        snippet_data = s2_snippet_search(
            query=prepared_query,
            fields="snippet.text,snippet.snippetKind",
            limit=3,
        )
        for item in snippet_data.get("data", []):
            snippet = item.get("snippet", {})
            paper = item.get("paper", {})
            snippet_text = snippet.get("text", "")
            if snippet_text:
                # Build a canonical paper URL when a corpus ID is present.
                corpus_id = paper.get("corpusId")
                paper_url = f"https://www.semanticscholar.org/paper/{corpus_id}" if corpus_id else None
                results.append({
                    "text": snippet_text,
                    "url": paper_url,
                    "source_type": "Academic (Semantic Scholar Snippet)",
                })

        # Keep paper abstract search as fallback/secondary source.
        data = s2_paper_search(
            query=prepared_query,
            limit=2,
            fields="title,abstract,url",
        )

        for item in data.get("data", []):
            if item.get("abstract"):  # Only keep if abstract exists to compare text
                results.append({
                    "text": item["abstract"],
                    "url": item.get("url", f"https://www.semanticscholar.org/paper/{item['paperId']}"),
                    "source_type": "Academic (Semantic Scholar)"
                })
        # Cache even empty result lists to avoid re-querying the same text.
        semantic_query_cache[normalized_query] = results
        return results
    except Exception as e:
        # Best-effort: API failures degrade to "no academic matches"
        # (note: the failed query is intentionally not cached).
        print(f"Semantic Scholar Error: {e}")
        return []
|
| 411 |
+
|
| 412 |
+
def aggregate_search(query: str) -> List[Dict]:
    """Combines Academic and Web sources and implements caching.

    Cache hits return immediately (no sleep); misses query both backends,
    store the combined list, then pause briefly for rate limiting.
    NOTE(review): query_cache is unbounded — consider an LRU bound.
    """
    # Use the first 15 words to make the search query efficient
    search_query = " ".join(query.split()[:15])

    if search_query in query_cache:
        return query_cache[search_query]

    # Run both searches
    web_results = search_google_serper(search_query)
    academic_results = search_semantic_scholar(search_query)

    combined = web_results + academic_results
    query_cache[search_query] = combined  # Save to cache

    # Sleep to respect rate limits
    time.sleep(1)

    return combined
|
| 431 |
+
|
| 432 |
+
# ==========================================
|
| 433 |
+
# 4. Core Comparison Logic
|
| 434 |
+
# ==========================================
|
| 435 |
+
|
| 436 |
+
def calculate_exact_similarity(text1: str, text2: str) -> float:
    """Case-insensitive similarity ratio (0.0-1.0) via difflib.SequenceMatcher."""
    left = text1.lower()
    right = text2.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
|
| 438 |
+
|
| 439 |
+
def check_paraphrasing_with_llm(chunk: str, source_text: str) -> bool:
    """Ask the Groq LLM whether `chunk` plagiarizes/paraphrases `source_text`.

    Returns True when the model's reply contains "YES" (case-insensitive
    substring check, tolerant of extra words around the verdict).
    Note: one network call per invocation; LLM errors propagate to the caller.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert academic plagiarism detector. Determine if TEXT A is a direct paraphrase, stolen idea, or highly similar structure to TEXT B. Ignore generic academic phrases like 'In this paper we demonstrate'. Respond ONLY with 'YES' or 'NO'."),
        ("user", "TEXT A: {chunk}\n\nTEXT B: {source_text}")
    ])
    chain = prompt | llm
    response = chain.invoke({"chunk": chunk, "source_text": source_text})
    return "YES" in response.content.upper()
|
| 447 |
+
|
| 448 |
+
def generate_detailed_report_with_llm(
    filename: str,
    match_reports: List[MatchReport],
    total_words: int,
    overall_score: float
) -> DetailedPlagiarismReport:
    """Generate a comprehensive report using LLM analysis.

    Aggregates per-chunk match reports, classifies severity, asks the LLM for
    a narrative analysis, then parses findings/recommendations back out of the
    LLM response (with static fallbacks when parsing yields nothing).

    Args:
        filename: Name of the scanned document.
        match_reports: Per-chunk results (one MatchReport per analyzed chunk).
        total_words: Word count of the full document.
        overall_score: Plagiarism percentage in the 0-100 range.

    Returns:
        A fully populated DetailedPlagiarismReport.
    """
    from datetime import datetime

    # 1. Aggregate data for analysis.
    plagiarized_reports = [r for r in match_reports if r.is_plagiarized]
    plagiarism_types: Dict[str, int] = {}

    # Deduplicate matched sources by URL, keeping the highest similarity seen
    # per URL. (Fixes the original membership test, which compared a URL
    # string against a list of dicts — always True — so every occurrence of a
    # source was appended again as a duplicate, and "max_similarity" was never
    # actually the maximum.)
    academic_by_url: Dict[Any, Dict[str, Any]] = {}
    web_by_url: Dict[Any, Dict[str, Any]] = {}

    for report in plagiarized_reports:
        ptype = report.plagiarism_type or "Unknown"
        plagiarism_types[ptype] = plagiarism_types.get(ptype, 0) + 1

        if report.source_type:
            bucket = academic_by_url if "Academic" in report.source_type else web_by_url
            entry = bucket.get(report.source_url)
            if entry is None:
                bucket[report.source_url] = {
                    "url": report.source_url,
                    "type": report.source_type,
                    "max_similarity": report.similarity_score,
                }
            elif (report.similarity_score or 0) > (entry["max_similarity"] or 0):
                entry["max_similarity"] = report.similarity_score

    sources_by_type = {
        "Academic": list(academic_by_url.values()),
        "Web": list(web_by_url.values()),
    }

    # 2. Determine severity level from the numeric score.
    if overall_score < 15:
        severity = "Low"
        risk_level = "Minimal - Normal citation variations detected"
    elif overall_score < 30:
        severity = "Medium"
        risk_level = "Moderate - Multiple sources match detected"
    elif overall_score < 50:
        severity = "High"
        risk_level = "Significant - Substantial plagiarism detected"
    else:
        severity = "Very High"
        risk_level = "Critical - Extensive plagiarism detected"

    # 3. Use the LLM to generate a detailed narrative analysis.
    # (Uses the filename parameter in the context; the previous version sent a
    # hard-coded placeholder instead of the actual document name.)
    plagiarism_context = f"""
Document: {filename}
Total Words: {total_words}
Plagiarism Score: {overall_score}%
Plagiarism Types Found: {plagiarism_types}
Academic Matches: {len(sources_by_type['Academic'])}
Web Matches: {len(sources_by_type['Web'])}

Suspicious Sections (samples):
{chr(10).join([f"- {r.chunk_text[:100]}..." for r in plagiarized_reports[:5]])}
"""

    analysis_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert academic integrity analyzer and plagiarism report generator.
Generate a professional, detailed plagiarism analysis report.
Focus on: severity assessment, academic integrity concerns, specific problem areas, and recommendations.
Be thorough but concise."""),
        ("user", """Create a detailed plagiarism analysis for this document:

{plagiarism_context}

Provide:
1. Executive Summary (2-3 sentences)
2. Key Findings (3-4 bullet points)
3. Detailed Analysis (2-3 paragraphs explaining the plagiarism pattern)
4. Recommendations (3-4 specific actions to remediate)

Format clearly with section headers.""")
    ])

    chain = analysis_prompt | llm
    llm_response = chain.invoke({"plagiarism_context": plagiarism_context})
    llm_analysis = llm_response.content

    # 4. Parse findings and recommendations out of the LLM response.
    key_findings: List[str] = []
    recommendations: List[str] = []
    detailed_analysis = ""

    in_findings = False
    in_recommendations = False

    for line in llm_analysis.split('\n'):
        if 'Key Findings' in line:
            in_findings = True
            in_recommendations = False
        elif 'Recommendations' in line:
            in_findings = False
            in_recommendations = True
        elif 'Detailed Analysis' in line or 'Analysis' in line:
            # Any other "...Analysis..." header closes both bullet sections.
            in_findings = False
            in_recommendations = False
        elif in_findings and line.strip().startswith(('-', '*', '•')):
            key_findings.append(line.strip().lstrip('-*•').strip())
        elif in_recommendations and line.strip().startswith(('-', '*', '•')):
            recommendations.append(line.strip().lstrip('-*•').strip())
        elif not in_findings and not in_recommendations and line.strip():
            detailed_analysis += line + "\n"

    # Static fallbacks in case the LLM output did not match the expected layout.
    if not key_findings:
        key_findings = [
            f"Overall plagiarism score: {overall_score}%",
            f"Primary plagiarism type: {max(plagiarism_types.keys(), key=plagiarism_types.get) if plagiarism_types else 'Not detected'}",
            f"Multiple sources detected: {len(sources_by_type['Academic']) + len(sources_by_type['Web'])} sources"
        ]

    if not recommendations:
        recommendations = [
            "Properly cite all sources according to your institution's guidelines",
            "Use quotation marks for direct quotes and provide page numbers",
            "Paraphrase content properly and cite original sources",
            "Use plagiarism detection tools during the writing process"
        ]

    # 5. Affected sections: top 10 flagged chunks with truncated snippets.
    affected_sections = []
    for i, report in enumerate(plagiarized_reports[:10]):
        affected_sections.append({
            "section_number": i + 1,
            "text_snippet": report.chunk_text[:150],
            "similarity_score": report.similarity_score,
            "plagiarism_type": report.plagiarism_type,
            "source": report.source_url,
            "source_type": report.source_type
        })

    return DetailedPlagiarismReport(
        filename=filename,
        scan_timestamp=datetime.now().isoformat(),
        executive_summary=llm_analysis.split('\n')[0] if llm_analysis else f"Document contains {overall_score}% plagiarized content",
        overall_score=round(overall_score, 2),
        severity_level=severity,
        matched_sources=sources_by_type["Academic"] + sources_by_type["Web"],
        key_findings=key_findings,
        plagiarism_breakdown={
            "total_plagiarism_percentage": round(overall_score, 2),
            "types": plagiarism_types,
            "academic_sources": len(sources_by_type["Academic"]),
            "web_sources": len(sources_by_type["Web"])
        },
        detailed_analysis=detailed_analysis or llm_analysis,
        affected_sections=affected_sections,
        recommendations=recommendations,
        academic_integrity_risk=risk_level
    )
|
| 603 |
+
|
| 604 |
+
def analyze_chunk(chunk: str) -> MatchReport:
    """Compare one text chunk against aggregated web/academic search results.

    Builds a MatchReport carrying the best-matching source and, when a
    threshold is crossed, the detected plagiarism type.
    """
    candidates = aggregate_search(chunk)

    top_score = 0.0
    top_url = None
    top_source_type = None
    detected_type = None
    flagged = False

    for candidate in candidates:
        candidate_text = candidate['text']

        # 1. Math/Deterministic check.
        similarity = calculate_exact_similarity(chunk, candidate_text)

        if similarity > top_score:
            top_score = similarity
            top_url = candidate['url']
            top_source_type = candidate['source_type']

        # Threshold of 50% because we compare against abstracts/snippets,
        # not full documents.
        if similarity > 0.50:
            flagged = True
            detected_type = "Exact/Heavy Match"
            break

        # 2. Agentic check for mosaic (paraphrased) plagiarism.
        if similarity > 0.25 and check_paraphrasing_with_llm(chunk, candidate_text):
            flagged = True
            detected_type = "Paraphrased Match (Mosaic)"
            top_url = candidate['url']
            top_source_type = candidate['source_type']
            top_score = max(top_score, 0.85)
            break

    return MatchReport(
        chunk_text=chunk,
        is_plagiarized=flagged,
        plagiarism_type=detected_type,
        source_url=top_url,
        source_type=top_source_type,
        similarity_score=round(top_score, 2)
    )
|
| 647 |
+
|
| 648 |
+
# ==========================================
|
| 649 |
+
# 6. Report Formatting Functions
|
| 650 |
+
# ==========================================
|
| 651 |
+
|
| 652 |
+
def format_report_json(detailed_report: DetailedPlagiarismReport) -> Dict[str, Any]:
    """Serialize a DetailedPlagiarismReport into a JSON-ready dict."""
    summary_block = {
        "overall_plagiarism_score": detailed_report.overall_score,
        "severity_level": detailed_report.severity_level,
        "academic_integrity_risk": detailed_report.academic_integrity_risk,
    }
    payload: Dict[str, Any] = {
        "filename": detailed_report.filename,
        "scan_timestamp": detailed_report.scan_timestamp,
        # Backward-compatible top-level fields expected by existing clients.
        "overall_score": detailed_report.overall_score,
        "severity_level": detailed_report.severity_level,
        "academic_integrity_risk": detailed_report.academic_integrity_risk,
        "summary": summary_block,
        "executive_summary": detailed_report.executive_summary,
        "key_findings": detailed_report.key_findings,
        "plagiarism_breakdown": detailed_report.plagiarism_breakdown,
        "matched_sources": detailed_report.matched_sources,
        "affected_sections": detailed_report.affected_sections,
        "detailed_analysis": detailed_report.detailed_analysis,
        "recommendations": detailed_report.recommendations,
    }
    return payload
|
| 674 |
+
|
| 675 |
+
def format_report_text(detailed_report: DetailedPlagiarismReport) -> str:
    """Render the detailed report as a plain-text document."""
    banner = "=" * 80
    rule = "-" * 80
    parts = []

    parts.append(banner + "\n")
    parts.append("DETAILED PLAGIARISM DETECTION REPORT\n")
    parts.append(banner + "\n\n")

    parts.append(f"FILE: {detailed_report.filename}\n")
    parts.append(f"SCAN DATE: {detailed_report.scan_timestamp}\n")
    parts.append(rule + "\n\n")

    parts.append("SUMMARY\n")
    parts.append(rule + "\n")
    parts.append(f"Overall Plagiarism Score: {detailed_report.overall_score}%\n")
    parts.append(f"Severity Level: {detailed_report.severity_level}\n")
    parts.append(f"Academic Integrity Risk: {detailed_report.academic_integrity_risk}\n\n")

    parts.append("EXECUTIVE SUMMARY\n")
    parts.append(rule + "\n")
    parts.append(f"{detailed_report.executive_summary}\n\n")

    parts.append("KEY FINDINGS\n")
    parts.append(rule + "\n")
    for idx, finding in enumerate(detailed_report.key_findings, 1):
        parts.append(f"{idx}. {finding}\n")
    parts.append("\n")

    parts.append("PLAGIARISM BREAKDOWN\n")
    parts.append(rule + "\n")
    breakdown = detailed_report.plagiarism_breakdown
    parts.append(f"Total Plagiarism %: {breakdown['total_plagiarism_percentage']}%\n")
    parts.append(f"Academic Sources: {breakdown['academic_sources']}\n")
    parts.append(f"Web Sources: {breakdown['web_sources']}\n")
    if breakdown.get('types'):
        parts.append("Types Detected:\n")
        for ptype, count in breakdown['types'].items():
            parts.append(f" - {ptype}: {count} instances\n")
    parts.append("\n")

    parts.append("MATCHED SOURCES\n")
    parts.append(rule + "\n")
    if detailed_report.matched_sources:
        # Cap at 10 sources to keep the report readable.
        for idx, source in enumerate(detailed_report.matched_sources[:10], 1):
            parts.append(f"{idx}. URL: {source.get('url', 'N/A')}\n")
            parts.append(f" Type: {source.get('type', 'N/A')}\n")
            parts.append(f" Similarity: {source.get('max_similarity', 'N/A')}\n\n")
    else:
        parts.append("No sources matched.\n\n")

    parts.append("DETAILED ANALYSIS\n")
    parts.append(rule + "\n")
    parts.append(f"{detailed_report.detailed_analysis}\n\n")

    if detailed_report.affected_sections:
        parts.append("AFFECTED SECTIONS (Top Issues)\n")
        parts.append(rule + "\n")
        # Only the top 5 sections are spelled out in text form.
        for section in detailed_report.affected_sections[:5]:
            parts.append(f"\nSection {section['section_number']}:\n")
            parts.append(f"Text Snippet: {section['text_snippet']}\n")
            parts.append(f"Similarity Score: {section['similarity_score']}\n")
            parts.append(f"Plagiarism Type: {section['plagiarism_type']}\n")
            parts.append(f"Source: {section['source']}\n")
        parts.append("\n")

    parts.append("RECOMMENDATIONS\n")
    parts.append(rule + "\n")
    for idx, rec in enumerate(detailed_report.recommendations, 1):
        parts.append(f"{idx}. {rec}\n")
    parts.append("\n")

    parts.append(banner + "\n")
    parts.append("End of Report\n")
    parts.append(banner + "\n")

    return "".join(parts)
|
| 748 |
+
|
| 749 |
+
def format_report_html(detailed_report: DetailedPlagiarismReport) -> str:
    """Format report as HTML"""
    # Single self-contained HTML document built from one f-string template.
    # CSS braces are doubled ({{ }}) so they survive f-string interpolation;
    # the embedded list comprehensions render sources (capped at 10),
    # affected sections (capped at 5), and recommendations inline.
    html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Plagiarism Detection Report - {detailed_report.filename}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }}
.container {{ background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
h1 {{ color: #333; border-bottom: 3px solid #2196F3; padding-bottom: 10px; }}
h2 {{ color: #2196F3; margin-top: 30px; }}
.summary {{ background-color: #f0f7ff; padding: 15px; border-left: 4px solid #2196F3; margin: 20px 0; }}
.score {{ font-size: 24px; font-weight: bold; color: #d32f2f; }}
.severity-low {{ color: #4caf50; }}
.severity-medium {{ color: #ff9800; }}
.severity-high {{ color: #f44336; }}
.severity-very-high {{ color: #c41c3b; }}
.findings {{ background-color: #fff3e0; padding: 15px; border-left: 4px solid #ff9800; }}
.source-item {{ background-color: #f5f5f5; padding: 10px; margin: 10px 0; border-radius: 4px; }}
.recommendation {{ background-color: #e8f5e9; padding: 10px; margin: 10px 0; border-left: 3px solid #4caf50; }}
table {{ width: 100%; border-collapse: collapse; margin: 15px 0; }}
th, td {{ padding: 10px; text-align: left; border-bottom: 1px solid #ddd; }}
th {{ background-color: #2196F3; color: white; }}
.affected-section {{ background-color: #fce4ec; padding: 15px; margin: 10px 0; border-radius: 4px; }}
</style>
</head>
<body>
<div class="container">
<h1>🔍 Plagiarism Detection Report</h1>

<div class="summary">
<p><strong>File:</strong> {detailed_report.filename}</p>
<p><strong>Scan Date:</strong> {detailed_report.scan_timestamp}</p>
<p><strong>Overall Plagiarism Score:</strong> <span class="score">{detailed_report.overall_score}%</span></p>
<p><strong>Severity Level:</strong> <span class="severity-{detailed_report.severity_level.lower().replace(' ', '-')}">{detailed_report.severity_level}</span></p>
<p><strong>Academic Integrity Risk:</strong> {detailed_report.academic_integrity_risk}</p>
</div>

<h2>Executive Summary</h2>
<p>{detailed_report.executive_summary}</p>

<h2>Key Findings</h2>
<div class="findings">
<ul>
{"".join([f"<li>{finding}</li>" for finding in detailed_report.key_findings])}
</ul>
</div>

<h2>Plagiarism Breakdown</h2>
<table>
<tr>
<th>Category</th>
<th>Value</th>
</tr>
<tr>
<td>Total Plagiarism %</td>
<td>{detailed_report.plagiarism_breakdown['total_plagiarism_percentage']}%</td>
</tr>
<tr>
<td>Academic Sources</td>
<td>{detailed_report.plagiarism_breakdown['academic_sources']}</td>
</tr>
<tr>
<td>Web Sources</td>
<td>{detailed_report.plagiarism_breakdown['web_sources']}</td>
</tr>
</table>

<h2>Matched Sources</h2>
{"".join([f'<div class="source-item"><strong>{source.get("type", "Unknown")}</strong><br/><a href="{source.get("url", "#")}" target="_blank">{source.get("url", "N/A")}</a><br/>Similarity: {source.get("max_similarity", "N/A")}</div>' for source in detailed_report.matched_sources[:10]])}

<h2>Detailed Analysis</h2>
<p>{detailed_report.detailed_analysis.replace(chr(10), "<br/>")}</p>

{"<h2>Affected Sections (Top Issues)</h2>" + "".join([f'<div class="affected-section"><strong>Section {section["section_number"]}</strong><br/><em>Text:</em> {section["text_snippet"]}...<br/><em>Similarity:</em> {section["similarity_score"]}<br/><em>Type:</em> {section["plagiarism_type"]}</div>' for section in detailed_report.affected_sections[:5]]) if detailed_report.affected_sections else ""}

<h2>Recommendations</h2>
<div>
{"".join([f'<div class="recommendation"><strong>✓</strong> {rec}</div>' for rec in detailed_report.recommendations])}
</div>
</div>
</body>
</html>
"""
    return html
|
| 837 |
+
|
| 838 |
+
# ==========================================
|
| 839 |
+
# 5. API Endpoints & Utility
|
| 840 |
+
# ==========================================
|
| 841 |
+
|
| 842 |
+
def extract_text_from_pdf(file_bytes) -> str:
    """Extract the text of every page of a PDF, newline-terminated per page.

    Pages that yield no text (e.g. scanned image pages) are skipped.
    """
    reader = PdfReader(file_bytes)
    parts = []
    for page in reader.pages:
        # Extract once per page: extract_text() is expensive, and the
        # original version called it twice (once in the filter, once for
        # the value being joined).
        page_text = page.extract_text()
        if page_text:
            parts.append(page_text + "\n")
    return "".join(parts)
|
| 845 |
+
|
| 846 |
+
def chunk_text(text: str, words_per_chunk: int = 40) -> List[str]:
    """Split *text* into overlapping word windows.

    Consecutive chunks overlap by 10 words so matches spanning a chunk
    boundary are still caught. Trailing fragments of 15 words or fewer are
    dropped because they are too short to search meaningfully.

    Args:
        text: Document text to split.
        words_per_chunk: Window size in words (default 40).

    Returns:
        List of chunk strings, possibly empty for very short input.
    """
    words = text.split()
    # Guard the stride: with words_per_chunk <= 10 the original step would be
    # zero (ValueError from range) or negative (silently empty output).
    step = max(1, words_per_chunk - 10)
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + words_per_chunk])
        if len(chunk.split()) > 15:
            chunks.append(chunk)
    return chunks
|
| 854 |
+
|
| 855 |
+
@app.post("/scan-paper", response_model=PlagiarismReport)
async def scan_paper(file: UploadFile = File(...)):
    """Basic scan: chunk the uploaded PDF, analyze each chunk, return a summary."""
    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    # Cap chunks for safety during testing (remove in production).
    chunks = chunk_text(text)[:20]

    chunk_reports = []
    flagged_words = 0
    for chunk in chunks:
        chunk_report = analyze_chunk(chunk)
        chunk_reports.append(chunk_report)
        if chunk_report.is_plagiarized:
            flagged_words += len(chunk.split())

    # Overlapping chunks can over-count, so clamp before scoring.
    flagged_words = min(flagged_words, total_words)
    score = (flagged_words / total_words) * 100

    # Map the numeric score onto a severity band.
    if score >= 50:
        severity = "Very High"
    elif score >= 30:
        severity = "High"
    elif score >= 15:
        severity = "Medium"
    else:
        severity = "Low"

    return PlagiarismReport(
        filename=file.filename,
        total_words=total_words,
        plagiarized_words=flagged_words,
        overall_plagiarism_score=round(score, 2),
        severity_level=severity,
        details=chunk_reports
    )
|
| 900 |
+
|
| 901 |
+
@app.post("/generate-detailed-report")
async def generate_detailed_report(file: UploadFile = File(...)):
    """Generate comprehensive plagiarism report with LLM analysis"""
    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    # Cap the number of chunks analyzed per request.
    chunks = chunk_text(text)[:20]

    chunk_reports = []
    flagged_words = 0
    for chunk in chunks:
        result = analyze_chunk(chunk)
        chunk_reports.append(result)
        if result.is_plagiarized:
            flagged_words += len(chunk.split())

    # Overlapping chunks can over-count, so clamp before scoring.
    flagged_words = min(flagged_words, total_words)
    score = (flagged_words / total_words) * 100

    # Delegate narrative generation to the LLM-backed report builder.
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=chunk_reports,
        total_words=total_words,
        overall_score=score
    )

    return format_report_json(detailed_report)
|
| 938 |
+
|
| 939 |
+
@app.post("/report/text")
async def report_text(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as plain text"""
    from fastapi.responses import PlainTextResponse

    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    # Cap the number of chunks analyzed per request.
    chunks = chunk_text(text)[:20]

    chunk_reports = []
    flagged_words = 0
    for chunk in chunks:
        result = analyze_chunk(chunk)
        chunk_reports.append(result)
        if result.is_plagiarized:
            flagged_words += len(chunk.split())

    # Overlapping chunks can over-count, so clamp before scoring.
    flagged_words = min(flagged_words, total_words)
    score = (flagged_words / total_words) * 100

    # Build the full report, then render it to plain text.
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=chunk_reports,
        total_words=total_words,
        overall_score=score
    )

    return PlainTextResponse(format_report_text(detailed_report))
|
| 976 |
+
|
| 977 |
+
@app.post("/report/html")
async def report_html(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as HTML"""
    from fastapi.responses import HTMLResponse

    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    # Cap the number of chunks analyzed per request.
    chunks = chunk_text(text)[:20]

    chunk_reports = []
    flagged_words = 0
    for chunk in chunks:
        result = analyze_chunk(chunk)
        chunk_reports.append(result)
        if result.is_plagiarized:
            flagged_words += len(chunk.split())

    # Overlapping chunks can over-count, so clamp before scoring.
    flagged_words = min(flagged_words, total_words)
    score = (flagged_words / total_words) * 100

    # Build the full report, then render it to an HTML page.
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=chunk_reports,
        total_words=total_words,
        overall_score=score
    )

    return HTMLResponse(format_report_html(detailed_report))
|
| 1014 |
+
|
| 1015 |
+
@app.get("/")
async def root():
    """Service index: names the API and lists the available endpoints."""
    endpoint_map = {
        "scan": "/scan-paper (POST - basic scan)",
        "detailed_report": "/generate-detailed-report (POST - JSON report with LLM analysis)",
        "text_report": "/report/text (POST - plain text report)",
        "html_report": "/report/html (POST - HTML report)"
    }
    return {
        "message": "Pro Plagiarism Detector API",
        "endpoints": endpoint_map
    }
|
| 1026 |
+
|
| 1027 |
+
if __name__ == "__main__":
    # Local development entry point; production deployments typically launch
    # the ASGI server externally instead of via this guard.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|