Manas281 committed on
Commit
95e10bf
·
verified ·
1 Parent(s): bffaf4b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +1027 -1027
main.py CHANGED
@@ -1,1028 +1,1028 @@
1
- import os
2
- import time
3
- import requests
4
- import random
5
- import re
6
- from difflib import SequenceMatcher
7
- from typing import List, Optional, Dict, Any
8
- from urllib.parse import quote_plus
9
-
10
- from fastapi import FastAPI, UploadFile, File, HTTPException
11
- from fastapi.middleware.cors import CORSMiddleware
12
- import uvicorn
13
- from pydantic import BaseModel
14
- from PyPDF2 import PdfReader
15
-
16
- from langchain_groq import ChatGroq
17
- from langchain_core.prompts import ChatPromptTemplate
18
-
19
# ==========================================
# 1. Environment & API Setup
# ==========================================
# SECURITY FIX: credentials are read strictly from the environment.
# The previous revision embedded live API keys as getenv() fallback
# defaults; keys committed to source control are considered leaked and
# must be rotated. Empty-string defaults keep downstream truthiness
# checks (e.g. the header builder) behaving the same when unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY", "")
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1"
# Minimum spacing between Semantic Scholar calls plus retry cap; keeps
# the client under the public rate limit.
SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS = 1.2
SEMANTIC_SCHOLAR_MAX_RETRIES = 4

if not GROQ_API_KEY or not SERPER_API_KEY:
    print("WARNING: GROQ_API_KEY or SERPER_API_KEY is missing!")

llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.1)

# Basic in-memory caches to maintain API efficiency (as promised in the
# application): one for aggregated queries, one for Semantic Scholar.
query_cache: Dict[str, List[Dict]] = {}
semantic_query_cache: Dict[str, List[Dict[str, str]]] = {}
# Timestamp of the last Semantic Scholar call, used for throttling.
_last_semantic_scholar_call_ts = 0.0
38
-
39
- # ==========================================
40
- # 2. Pydantic Models
41
- # ==========================================
42
class MatchReport(BaseModel):
    """Per-chunk plagiarism verdict produced by analyze_chunk()."""
    chunk_text: str
    is_plagiarized: bool
    plagiarism_type: Optional[str] = None  # e.g. "Exact/Heavy Match" or "Paraphrased Match (Mosaic)"
    source_url: Optional[str] = None       # best-matching source, if any
    source_type: Optional[str] = None      # "Academic" or "Web"
    similarity_score: float                # 0.0-1.0, rounded to 2 decimals
49
-
50
class PlagiarismReport(BaseModel):
    """Top-level scan result returned by the /scan-paper endpoint."""
    filename: str
    total_words: int
    plagiarized_words: int
    overall_plagiarism_score: float  # percentage of plagiarized words
    severity_level: str              # Low, Medium, High, Very High
    details: List[MatchReport]       # one entry per analyzed chunk
57
-
58
class DetailedPlagiarismReport(BaseModel):
    """Comprehensive report generated by LLM"""
    filename: str
    scan_timestamp: str                      # ISO-8601 timestamp of the scan
    executive_summary: str
    overall_score: float
    severity_level: str
    matched_sources: List[Dict[str, Any]]    # url/type/max_similarity per source
    key_findings: List[str]
    plagiarism_breakdown: Dict[str, Any]     # Types and percentages
    detailed_analysis: str                   # LLM-generated detailed analysis
    affected_sections: List[Dict[str, Any]]  # Which parts are problematic
    recommendations: List[str]
    academic_integrity_risk: str             # Assessment level
72
-
73
app = FastAPI(title="Pro Plagiarism Detector (Turnitin Clone)")

# CORS is wide open so any frontend origin can call the API.
# NOTE(review): allow_origins=["*"] with allow_credentials=True is
# usually rejected by browsers; restrict origins for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
82
-
83
- # ==========================================
84
- # 3. Agent Tools: Serper & Semantic Scholar
85
- # ==========================================
86
-
87
def _semantic_scholar_headers() -> Dict[str, str]:
    """Build request headers, attaching the API key when configured."""
    if SEMANTIC_SCHOLAR_API_KEY:
        # Semantic Scholar expects the key in the x-api-key header.
        return {"x-api-key": SEMANTIC_SCHOLAR_API_KEY}
    return {}
93
-
94
-
95
def _semantic_scholar_get(path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """GET a Semantic Scholar endpoint with throttling and 429 retries.

    None-valued params are dropped before sending. Raises HTTPError on
    non-retryable failures or once all retries are exhausted.
    """
    global _last_semantic_scholar_call_ts
    clean_params = {key: value for key, value in (params or {}).items() if value is not None}

    for attempt in range(SEMANTIC_SCHOLAR_MAX_RETRIES):
        # Enforce a minimum gap between consecutive API calls.
        remaining = SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - (time.time() - _last_semantic_scholar_call_ts)
        if remaining > 0:
            time.sleep(remaining)

        response = requests.get(
            f"{SEMANTIC_SCHOLAR_BASE_URL}{path}",
            headers=_semantic_scholar_headers(),
            params=clean_params,
            timeout=20,
        )
        _last_semantic_scholar_call_ts = time.time()

        if response.status_code == 429 and attempt < SEMANTIC_SCHOLAR_MAX_RETRIES - 1:
            # Honor an integer Retry-After header when present; otherwise
            # fall back to exponential backoff with jitter.
            retry_after = response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                time.sleep(float(retry_after))
            else:
                time.sleep((2 ** attempt) + random.uniform(0.2, 0.7))
            continue

        response.raise_for_status()
        return response.json()

    raise requests.HTTPError("Semantic Scholar request failed after retries")
125
-
126
-
127
def _semantic_scholar_post(path: str, body: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> Any:
    """POST a JSON body to a Semantic Scholar endpoint with throttling and 429 retries.

    Mirrors _semantic_scholar_get: drops None-valued params, spaces out
    calls, retries rate-limited responses, and raises on hard failures.
    """
    global _last_semantic_scholar_call_ts
    clean_params = {key: value for key, value in (params or {}).items() if value is not None}

    for attempt in range(SEMANTIC_SCHOLAR_MAX_RETRIES):
        # Enforce a minimum gap between consecutive API calls.
        remaining = SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - (time.time() - _last_semantic_scholar_call_ts)
        if remaining > 0:
            time.sleep(remaining)

        response = requests.post(
            f"{SEMANTIC_SCHOLAR_BASE_URL}{path}",
            headers=_semantic_scholar_headers(),
            params=clean_params,
            json=body,
            timeout=25,
        )
        _last_semantic_scholar_call_ts = time.time()

        if response.status_code == 429 and attempt < SEMANTIC_SCHOLAR_MAX_RETRIES - 1:
            # Integer Retry-After wins; otherwise exponential backoff + jitter.
            retry_after = response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                time.sleep(float(retry_after))
            else:
                time.sleep((2 ** attempt) + random.uniform(0.2, 0.7))
            continue

        response.raise_for_status()
        return response.json()

    raise requests.HTTPError("Semantic Scholar request failed after retries")
158
-
159
-
160
def s2_paper_autocomplete(query: str) -> Dict[str, Any]:
    """Suggest paper completions; the endpoint caps queries at 100 chars."""
    truncated = query[:100]
    return _semantic_scholar_get("/paper/autocomplete", {"query": truncated})
162
-
163
-
164
def s2_paper_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """Fetch paper records by id in one POST (API limit: 500 ids)."""
    payload = {"ids": ids[:500]}
    return _semantic_scholar_post("/paper/batch", payload, {"fields": fields})
166
-
167
-
168
def s2_paper_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
    open_access_pdf: bool = False,
) -> Dict[str, Any]:
    """Relevance search over papers; limit is clamped to 1..100."""
    clamped_limit = min(max(limit, 1), 100)
    params: Dict[str, Any] = {
        "query": query,
        "fields": fields,
        "limit": clamped_limit,
        "offset": max(offset, 0),
        "year": year,
        "fieldsOfStudy": fields_of_study,
    }
    if open_access_pdf:
        # The mere presence of the key (empty value) enables the filter.
        params["openAccessPdf"] = ""
    return _semantic_scholar_get("/paper/search", params)
188
-
189
-
190
def s2_paper_search_bulk(
    query: str,
    fields: Optional[str] = None,
    token: Optional[str] = None,
    sort: Optional[str] = None,
) -> Dict[str, Any]:
    """Bulk paper search; pass the continuation *token* to page through results."""
    request_params = {
        "query": query,
        "fields": fields,
        "token": token,
        "sort": sort,
    }
    return _semantic_scholar_get("/paper/search/bulk", request_params)
205
-
206
-
207
def s2_paper_search_match(query: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Find the single best title match for *query*."""
    params = {"query": query, "fields": fields}
    return _semantic_scholar_get("/paper/search/match", params)
209
-
210
-
211
def s2_paper_details(paper_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch one paper's record; the id is URL-escaped before the request."""
    escaped = quote_plus(paper_id)
    return _semantic_scholar_get(f"/paper/{escaped}", {"fields": fields})
214
-
215
-
216
def s2_paper_authors(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """List a paper's authors; limit is clamped to 1..1000."""
    escaped = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
    }
    return _semantic_scholar_get(f"/paper/{escaped}/authors", params)
227
-
228
-
229
def s2_paper_citations(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    publication_date_or_year: Optional[str] = None,
) -> Dict[str, Any]:
    """List papers citing *paper_id*; limit clamped to 1..1000."""
    escaped = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
        "publicationDateOrYear": publication_date_or_year,
    }
    return _semantic_scholar_get(f"/paper/{escaped}/citations", params)
246
-
247
-
248
def s2_paper_references(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """List papers referenced by *paper_id*; limit clamped to 1..1000."""
    escaped = quote_plus(paper_id)
    params = {
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
    }
    return _semantic_scholar_get(f"/paper/{escaped}/references", params)
259
-
260
-
261
def s2_author_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """Fetch author records by id in one POST (API limit: 1000 ids)."""
    payload = {"ids": ids[:1000]}
    return _semantic_scholar_post("/author/batch", payload, {"fields": fields})
263
-
264
-
265
def s2_author_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """Search authors by name; limit clamped to 1..1000."""
    params = {
        "query": query,
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
    }
    return _semantic_scholar_get("/author/search", params)
280
-
281
-
282
def s2_author_details(author_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch one author's record; the id is URL-escaped before the request."""
    escaped = quote_plus(author_id)
    return _semantic_scholar_get(f"/author/{escaped}", {"fields": fields})
285
-
286
-
287
def s2_author_papers(
    author_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    publication_date_or_year: Optional[str] = None,
) -> Dict[str, Any]:
    """List an author's papers; limit clamped to 1..1000."""
    escaped = quote_plus(author_id)
    params = {
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "offset": max(offset, 0),
        "publicationDateOrYear": publication_date_or_year,
    }
    return _semantic_scholar_get(f"/author/{escaped}/papers", params)
304
-
305
-
306
def s2_snippet_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 10,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
) -> Dict[str, Any]:
    """Passage-level snippet search; limit clamped to 1..1000."""
    params = {
        "query": query,
        "fields": fields,
        "limit": min(max(limit, 1), 1000),
        "year": year,
        "fieldsOfStudy": fields_of_study,
    }
    return _semantic_scholar_get("/snippet/search", params)
323
-
324
-
325
def build_search_query(text: str, max_terms: int = 10) -> str:
    """Condense *text* into a compact keyword query.

    Tokenizes case-insensitively, drops stopwords and tokens of <= 2
    characters, and joins up to *max_terms* keywords. When every token
    is filtered out, the raw tokens are used instead so the query is
    never empty for non-empty input.
    """
    stopwords = {
        "the", "and", "for", "that", "with", "this", "from", "into", "our", "their",
        "were", "have", "has", "had", "been", "are", "was", "will", "would", "can",
        "could", "should", "about", "through", "using", "based", "than", "then", "also",
        "such", "these", "those", "while", "where", "when", "what", "which", "who",
    }
    tokens = re.findall(r"[A-Za-z0-9]+", text.lower())
    keywords = [tok for tok in tokens if len(tok) > 2 and tok not in stopwords]
    chosen = keywords if keywords else tokens
    return " ".join(chosen[:max_terms])
336
-
337
def search_google_serper(query: str) -> List[Dict]:
    """Search the open web using the Google Serper API.

    Returns up to three results as dicts with "text", "url" and
    "source_type" keys. Best effort: any failure is logged and an
    empty list is returned so a search outage degrades gracefully.
    """
    url = "https://google.serper.dev/search"
    payload = {"q": query}
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    try:
        # FIX: a timeout is mandatory -- without one, a stalled Serper
        # endpoint would hang the request handler indefinitely.
        response = requests.post(url, headers=headers, json=payload, timeout=15)
        response.raise_for_status()
        data = response.json()

        results = []
        for item in data.get("organic", [])[:3]:  # Top 3 web results
            results.append({
                "text": item.get("snippet", ""),
                "url": item.get("link", ""),
                "source_type": "Web (Google)"
            })
        return results
    except Exception as e:
        # Deliberate best-effort: degrade to "no web matches" on error.
        print(f"Serper Error: {e}")
        return []
362
-
363
def search_semantic_scholar(query: str) -> List[Dict]:
    """Search academic papers via Semantic Scholar.

    Combines snippet search (passage-level text, best for chunk
    comparison) with a paper-abstract fallback. Results are memoized in
    semantic_query_cache keyed by the normalized keyword query. Returns
    [] on any failure (best effort).
    """
    prepared_query = build_search_query(query, max_terms=10)
    normalized_query = " ".join(prepared_query.split()).lower()
    if normalized_query in semantic_query_cache:
        return semantic_query_cache[normalized_query]

    try:
        results = []

        # Try snippet search first because it returns passage-level text
        # better suited for chunk comparison.
        snippet_data = s2_snippet_search(
            query=prepared_query,
            fields="snippet.text,snippet.snippetKind",
            limit=3,
        )
        for item in snippet_data.get("data", []):
            snippet = item.get("snippet", {})
            paper = item.get("paper", {})
            snippet_text = snippet.get("text", "")
            if snippet_text:
                corpus_id = paper.get("corpusId")
                paper_url = f"https://www.semanticscholar.org/paper/{corpus_id}" if corpus_id else None
                results.append({
                    "text": snippet_text,
                    "url": paper_url,
                    "source_type": "Academic (Semantic Scholar Snippet)",
                })

        # Keep paper abstract search as fallback/secondary source.
        data = s2_paper_search(
            query=prepared_query,
            limit=2,
            fields="title,abstract,url",
        )

        for item in data.get("data", []):
            if item.get("abstract"):  # Only keep if abstract exists to compare text
                # FIX: dict.get evaluates its default argument eagerly, so
                # the old f-string default raised KeyError whenever
                # 'paperId' was absent -- even when 'url' was present --
                # and the broad except below then discarded every result.
                # Build the fallback URL safely instead.
                fallback_url = f"https://www.semanticscholar.org/paper/{item.get('paperId', '')}"
                results.append({
                    "text": item["abstract"],
                    "url": item.get("url") or fallback_url,
                    "source_type": "Academic (Semantic Scholar)"
                })
        semantic_query_cache[normalized_query] = results
        return results
    except Exception as e:
        # Best-effort: degrade to "no academic matches" on error.
        print(f"Semantic Scholar Error: {e}")
        return []
411
-
412
def aggregate_search(query: str) -> List[Dict]:
    """Combine web and academic search results, memoized per query.

    Cache hits return immediately (no sleep); misses run both searches,
    store the combined list, and pause briefly for rate limits.
    """
    # Truncate to the first 15 words to keep the search query efficient.
    search_query = " ".join(query.split()[:15])

    if search_query in query_cache:
        return query_cache[search_query]

    # Run both backends and merge: web results first, academic second.
    combined = search_google_serper(search_query) + search_semantic_scholar(search_query)
    query_cache[search_query] = combined  # Save to cache

    # Sleep to respect rate limits.
    time.sleep(1)

    return combined
431
-
432
- # ==========================================
433
- # 4. Core Comparison Logic
434
- # ==========================================
435
-
436
def calculate_exact_similarity(text1: str, text2: str) -> float:
    """Case-insensitive character-level similarity ratio in [0, 1]."""
    matcher = SequenceMatcher(None, text1.lower(), text2.lower())
    return matcher.ratio()
438
-
439
def check_paraphrasing_with_llm(chunk: str, source_text: str) -> bool:
    """Ask the LLM whether *chunk* paraphrases *source_text*.

    The model is instructed to answer strictly YES/NO; any response
    containing "YES" (case-insensitive) counts as a paraphrase match.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert academic plagiarism detector. Determine if TEXT A is a direct paraphrase, stolen idea, or highly similar structure to TEXT B. Ignore generic academic phrases like 'In this paper we demonstrate'. Respond ONLY with 'YES' or 'NO'."),
        ("user", "TEXT A: {chunk}\n\nTEXT B: {source_text}")
    ])
    reply = (prompt | llm).invoke({"chunk": chunk, "source_text": source_text})
    return "YES" in reply.content.upper()
447
-
448
def generate_detailed_report_with_llm(
    filename: str,
    match_reports: List[MatchReport],
    total_words: int,
    overall_score: float
) -> DetailedPlagiarismReport:
    """Generate a comprehensive report using LLM analysis.

    Aggregates per-chunk MatchReports into source/type statistics, asks
    the LLM for a narrative analysis, parses findings/recommendations
    out of the response (with static fallbacks), and assembles the
    final DetailedPlagiarismReport.
    """
    from datetime import datetime

    # 1. Aggregate data for analysis
    plagiarized_reports = [r for r in match_reports if r.is_plagiarized]
    plagiarism_types: Dict[str, int] = {}
    sources_by_type: Dict[str, List[Dict[str, Any]]] = {"Academic": [], "Web": []}
    seen_urls: Dict[str, set] = {"Academic": set(), "Web": set()}

    for report in plagiarized_reports:
        ptype = report.plagiarism_type or "Unknown"
        plagiarism_types[ptype] = plagiarism_types.get(ptype, 0) + 1

        if report.source_type:
            bucket = "Academic" if "Academic" in report.source_type else "Web"
            # FIX: the old dedup check compared the URL string against a
            # list of dicts, so it never matched and the same source was
            # appended once per matching chunk. Track seen URLs in a set.
            if report.source_url not in seen_urls[bucket]:
                seen_urls[bucket].add(report.source_url)
                sources_by_type[bucket].append({
                    "url": report.source_url,
                    "type": report.source_type,
                    "max_similarity": report.similarity_score
                })

    # 2. Determine severity level (thresholds mirror the /scan-paper endpoint)
    if overall_score < 15:
        severity = "Low"
        risk_level = "Minimal - Normal citation variations detected"
    elif overall_score < 30:
        severity = "Medium"
        risk_level = "Moderate - Multiple sources match detected"
    elif overall_score < 50:
        severity = "High"
        risk_level = "Significant - Substantial plagiarism detected"
    else:
        severity = "Very High"
        risk_level = "Critical - Extensive plagiarism detected"

    # 3. Use LLM to generate detailed analysis
    # NOTE(review): the context labels the document "(unknown)"; this
    # looks like it was meant to interpolate {filename} -- confirm.
    plagiarism_context = f"""
Document: (unknown)
Total Words: {total_words}
Plagiarism Score: {overall_score}%
Plagiarism Types Found: {plagiarism_types}
Academic Matches: {len(sources_by_type['Academic'])}
Web Matches: {len(sources_by_type['Web'])}

Suspicious Sections (samples):
{chr(10).join([f"- {r.chunk_text[:100]}..." for r in plagiarized_reports[:5]])}
"""

    analysis_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert academic integrity analyzer and plagiarism report generator.
Generate a professional, detailed plagiarism analysis report.
Focus on: severity assessment, academic integrity concerns, specific problem areas, and recommendations.
Be thorough but concise."""),
        ("user", """Create a detailed plagiarism analysis for this document:

{plagiarism_context}

Provide:
1. Executive Summary (2-3 sentences)
2. Key Findings (3-4 bullet points)
3. Detailed Analysis (2-3 paragraphs explaining the plagiarism pattern)
4. Recommendations (3-4 specific actions to remediate)

Format clearly with section headers.""")
    ])

    chain = analysis_prompt | llm
    llm_response = chain.invoke({"plagiarism_context": plagiarism_context})
    llm_analysis = llm_response.content

    # 4. Extract findings from LLM response by scanning section headers.
    lines = llm_analysis.split('\n')
    key_findings = []
    recommendations = []
    detailed_analysis = ""

    in_findings = False
    in_recommendations = False

    for line in lines:
        if 'Key Findings' in line:
            in_findings = True
            in_recommendations = False
        elif 'Recommendations' in line:
            in_findings = False
            in_recommendations = True
        elif 'Detailed Analysis' in line or 'Analysis' in line:
            in_findings = False
            in_recommendations = False
        elif in_findings and line.strip().startswith(('-', '*', '•')):
            key_findings.append(line.strip().lstrip('-*•').strip())
        elif in_recommendations and line.strip().startswith(('-', '*', '•')):
            recommendations.append(line.strip().lstrip('-*•').strip())
        elif not in_findings and not in_recommendations and line.strip():
            detailed_analysis += line + "\n"

    # Static fallbacks when the LLM output could not be parsed.
    if not key_findings:
        key_findings = [
            f"Overall plagiarism score: {overall_score}%",
            f"Primary plagiarism type: {max(plagiarism_types.keys(), key=plagiarism_types.get) if plagiarism_types else 'Not detected'}",
            f"Multiple sources detected: {len(sources_by_type['Academic']) + len(sources_by_type['Web'])} sources"
        ]

    if not recommendations:
        recommendations = [
            "Properly cite all sources according to your institution's guidelines",
            "Use quotation marks for direct quotes and provide page numbers",
            "Paraphrase content properly and cite original sources",
            "Use plagiarism detection tools during the writing process"
        ]

    # 5. Affected sections (cap at 10 worst chunks)
    affected_sections = []
    for i, report in enumerate(plagiarized_reports[:10]):
        affected_sections.append({
            "section_number": i + 1,
            "text_snippet": report.chunk_text[:150],
            "similarity_score": report.similarity_score,
            "plagiarism_type": report.plagiarism_type,
            "source": report.source_url,
            "source_type": report.source_type
        })

    return DetailedPlagiarismReport(
        filename=filename,
        scan_timestamp=datetime.now().isoformat(),
        executive_summary=llm_analysis.split('\n')[0] if llm_analysis else f"Document contains {overall_score}% plagiarized content",
        overall_score=round(overall_score, 2),
        severity_level=severity,
        matched_sources=sources_by_type["Academic"] + sources_by_type["Web"],
        key_findings=key_findings,
        plagiarism_breakdown={
            "total_plagiarism_percentage": round(overall_score, 2),
            "types": plagiarism_types,
            "academic_sources": len(sources_by_type["Academic"]),
            "web_sources": len(sources_by_type["Web"])
        },
        detailed_analysis=detailed_analysis or llm_analysis,
        affected_sections=affected_sections,
        recommendations=recommendations,
        academic_integrity_risk=risk_level
    )
603
-
604
def analyze_chunk(chunk: str) -> MatchReport:
    """Classify one text chunk against aggregated web + academic sources.

    Tracks the highest deterministic similarity seen; flags the chunk as
    plagiarized on a strong exact match (>0.50) or on an LLM-confirmed
    paraphrase when similarity is moderate (>0.25). Stops at the first
    confirmed match.
    """
    candidates = aggregate_search(chunk)

    best_score = 0.0
    best_url = None
    best_source_type = None
    plagiarism_type = None
    is_plagiarized = False

    for candidate in candidates:
        source_text = candidate['text']

        # 1. Cheap deterministic similarity check first.
        score = calculate_exact_similarity(chunk, source_text)

        if score > best_score:
            best_score = score
            best_url = candidate['url']
            best_source_type = candidate['source_type']

        # Threshold lowered to 50% because comparison targets are
        # abstracts/snippets rather than full documents.
        if score > 0.50:
            is_plagiarized = True
            plagiarism_type = "Exact/Heavy Match"
            break

        # 2. Agentic check for mosaic plagiarism on moderate similarity.
        elif score > 0.25:
            if check_paraphrasing_with_llm(chunk, source_text):
                is_plagiarized = True
                plagiarism_type = "Paraphrased Match (Mosaic)"
                best_url = candidate['url']
                best_source_type = candidate['source_type']
                best_score = max(best_score, 0.85)
                break

    return MatchReport(
        chunk_text=chunk,
        is_plagiarized=is_plagiarized,
        plagiarism_type=plagiarism_type,
        source_url=best_url,
        source_type=best_source_type,
        similarity_score=round(best_score, 2)
    )
647
-
648
- # ==========================================
649
- # 6. Report Formatting Functions
650
- # ==========================================
651
-
652
def format_report_json(detailed_report: DetailedPlagiarismReport) -> Dict[str, Any]:
    """Serialize the detailed report into a plain JSON-able dict."""
    # Nested summary mirrors the top-level fields for newer clients.
    summary = {
        "overall_plagiarism_score": detailed_report.overall_score,
        "severity_level": detailed_report.severity_level,
        "academic_integrity_risk": detailed_report.academic_integrity_risk,
    }
    return {
        "filename": detailed_report.filename,
        "scan_timestamp": detailed_report.scan_timestamp,
        # Backward-compatible top-level fields expected by existing clients.
        "overall_score": detailed_report.overall_score,
        "severity_level": detailed_report.severity_level,
        "academic_integrity_risk": detailed_report.academic_integrity_risk,
        "summary": summary,
        "executive_summary": detailed_report.executive_summary,
        "key_findings": detailed_report.key_findings,
        "plagiarism_breakdown": detailed_report.plagiarism_breakdown,
        "matched_sources": detailed_report.matched_sources,
        "affected_sections": detailed_report.affected_sections,
        "detailed_analysis": detailed_report.detailed_analysis,
        "recommendations": detailed_report.recommendations,
    }
674
-
675
def format_report_text(detailed_report: DetailedPlagiarismReport) -> str:
    """Format report as plain text.

    Renders the full DetailedPlagiarismReport as an 80-column,
    section-delimited plain-text document (summary, findings,
    breakdown, sources, analysis, affected sections, recommendations).
    """
    report = "=" * 80 + "\n"
    report += "DETAILED PLAGIARISM DETECTION REPORT\n"
    report += "=" * 80 + "\n\n"

    report += f"FILE: {detailed_report.filename}\n"
    report += f"SCAN DATE: {detailed_report.scan_timestamp}\n"
    report += "-" * 80 + "\n\n"

    report += "SUMMARY\n"
    report += "-" * 80 + "\n"
    report += f"Overall Plagiarism Score: {detailed_report.overall_score}%\n"
    report += f"Severity Level: {detailed_report.severity_level}\n"
    report += f"Academic Integrity Risk: {detailed_report.academic_integrity_risk}\n\n"

    report += "EXECUTIVE SUMMARY\n"
    report += "-" * 80 + "\n"
    report += f"{detailed_report.executive_summary}\n\n"

    report += "KEY FINDINGS\n"
    report += "-" * 80 + "\n"
    for i, finding in enumerate(detailed_report.key_findings, 1):
        report += f"{i}. {finding}\n"
    report += "\n"

    report += "PLAGIARISM BREAKDOWN\n"
    report += "-" * 80 + "\n"
    report += f"Total Plagiarism %: {detailed_report.plagiarism_breakdown['total_plagiarism_percentage']}%\n"
    report += f"Academic Sources: {detailed_report.plagiarism_breakdown['academic_sources']}\n"
    report += f"Web Sources: {detailed_report.plagiarism_breakdown['web_sources']}\n"
    if detailed_report.plagiarism_breakdown.get('types'):
        report += "Types Detected:\n"
        for ptype, count in detailed_report.plagiarism_breakdown['types'].items():
            report += f" - {ptype}: {count} instances\n"
    report += "\n"

    # Matched sources are capped at 10 to keep the report readable.
    report += "MATCHED SOURCES\n"
    report += "-" * 80 + "\n"
    if detailed_report.matched_sources:
        for i, source in enumerate(detailed_report.matched_sources[:10], 1):
            report += f"{i}. URL: {source.get('url', 'N/A')}\n"
            report += f" Type: {source.get('type', 'N/A')}\n"
            report += f" Similarity: {source.get('max_similarity', 'N/A')}\n\n"
    else:
        report += "No sources matched.\n\n"

    report += "DETAILED ANALYSIS\n"
    report += "-" * 80 + "\n"
    report += f"{detailed_report.detailed_analysis}\n\n"

    # Only the five worst affected sections are listed.
    if detailed_report.affected_sections:
        report += "AFFECTED SECTIONS (Top Issues)\n"
        report += "-" * 80 + "\n"
        for section in detailed_report.affected_sections[:5]:
            report += f"\nSection {section['section_number']}:\n"
            report += f"Text Snippet: {section['text_snippet']}\n"
            report += f"Similarity Score: {section['similarity_score']}\n"
            report += f"Plagiarism Type: {section['plagiarism_type']}\n"
        report += "\n"

    report += "RECOMMENDATIONS\n"
    report += "-" * 80 + "\n"
    for i, rec in enumerate(detailed_report.recommendations, 1):
        report += f"{i}. {rec}\n"
    report += "\n"

    report += "=" * 80 + "\n"
    report += "End of Report\n"
    report += "=" * 80 + "\n"

    return report
748
-
749
def format_report_html(detailed_report: DetailedPlagiarismReport) -> str:
    """Format report as HTML.

    Builds a self-contained HTML page (inline CSS, no external assets)
    from the DetailedPlagiarismReport. Matched sources are capped at 10
    and affected sections at 5. Doubled braces {{ }} inside the f-string
    are CSS literals, not interpolations.
    NOTE(review): report fields are interpolated without HTML escaping;
    LLM-generated text containing markup would be rendered as-is.
    """
    html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Plagiarism Detection Report - {detailed_report.filename}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }}
.container {{ background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
h1 {{ color: #333; border-bottom: 3px solid #2196F3; padding-bottom: 10px; }}
h2 {{ color: #2196F3; margin-top: 30px; }}
.summary {{ background-color: #f0f7ff; padding: 15px; border-left: 4px solid #2196F3; margin: 20px 0; }}
.score {{ font-size: 24px; font-weight: bold; color: #d32f2f; }}
.severity-low {{ color: #4caf50; }}
.severity-medium {{ color: #ff9800; }}
.severity-high {{ color: #f44336; }}
.severity-very-high {{ color: #c41c3b; }}
.findings {{ background-color: #fff3e0; padding: 15px; border-left: 4px solid #ff9800; }}
.source-item {{ background-color: #f5f5f5; padding: 10px; margin: 10px 0; border-radius: 4px; }}
.recommendation {{ background-color: #e8f5e9; padding: 10px; margin: 10px 0; border-left: 3px solid #4caf50; }}
table {{ width: 100%; border-collapse: collapse; margin: 15px 0; }}
th, td {{ padding: 10px; text-align: left; border-bottom: 1px solid #ddd; }}
th {{ background-color: #2196F3; color: white; }}
.affected-section {{ background-color: #fce4ec; padding: 15px; margin: 10px 0; border-radius: 4px; }}
</style>
</head>
<body>
<div class="container">
<h1>🔍 Plagiarism Detection Report</h1>

<div class="summary">
<p><strong>File:</strong> {detailed_report.filename}</p>
<p><strong>Scan Date:</strong> {detailed_report.scan_timestamp}</p>
<p><strong>Overall Plagiarism Score:</strong> <span class="score">{detailed_report.overall_score}%</span></p>
<p><strong>Severity Level:</strong> <span class="severity-{detailed_report.severity_level.lower().replace(' ', '-')}">{detailed_report.severity_level}</span></p>
<p><strong>Academic Integrity Risk:</strong> {detailed_report.academic_integrity_risk}</p>
</div>

<h2>Executive Summary</h2>
<p>{detailed_report.executive_summary}</p>

<h2>Key Findings</h2>
<div class="findings">
<ul>
{"".join([f"<li>{finding}</li>" for finding in detailed_report.key_findings])}
</ul>
</div>

<h2>Plagiarism Breakdown</h2>
<table>
<tr>
<th>Category</th>
<th>Value</th>
</tr>
<tr>
<td>Total Plagiarism %</td>
<td>{detailed_report.plagiarism_breakdown['total_plagiarism_percentage']}%</td>
</tr>
<tr>
<td>Academic Sources</td>
<td>{detailed_report.plagiarism_breakdown['academic_sources']}</td>
</tr>
<tr>
<td>Web Sources</td>
<td>{detailed_report.plagiarism_breakdown['web_sources']}</td>
</tr>
</table>

<h2>Matched Sources</h2>
{"".join([f'<div class="source-item"><strong>{source.get("type", "Unknown")}</strong><br/><a href="{source.get("url", "#")}" target="_blank">{source.get("url", "N/A")}</a><br/>Similarity: {source.get("max_similarity", "N/A")}</div>' for source in detailed_report.matched_sources[:10]])}

<h2>Detailed Analysis</h2>
<p>{detailed_report.detailed_analysis.replace(chr(10), "<br/>")}</p>

{"<h2>Affected Sections (Top Issues)</h2>" + "".join([f'<div class="affected-section"><strong>Section {section["section_number"]}</strong><br/><em>Text:</em> {section["text_snippet"]}...<br/><em>Similarity:</em> {section["similarity_score"]}<br/><em>Type:</em> {section["plagiarism_type"]}</div>' for section in detailed_report.affected_sections[:5]]) if detailed_report.affected_sections else ""}

<h2>Recommendations</h2>
<div>
{"".join([f'<div class="recommendation"><strong>✓</strong> {rec}</div>' for rec in detailed_report.recommendations])}
</div>
</div>
</body>
</html>
"""
    return html
837
-
838
- # ==========================================
839
- # 5. API Endpoints & Utility
840
- # ==========================================
841
-
842
def extract_text_from_pdf(file_bytes) -> str:
    """Extract plain text from a PDF file-like object.

    Calls ``page.extract_text()`` only once per page (the original called it
    twice — once for the filter, once for the join — doubling parse work) and
    skips pages that yield no text (e.g. scanned image pages).
    """
    reader = PdfReader(file_bytes)
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(text + "\n" for text in page_texts if text)
845
-
846
def chunk_text(text: str, words_per_chunk: int = 40, overlap: int = 10,
               min_chunk_words: int = 15) -> List[str]:
    """Split text into overlapping word windows for plagiarism scanning.

    Args:
        text: Source document text.
        words_per_chunk: Target window size in words.
        overlap: Words shared between consecutive windows (previously a
            hard-coded 10).
        min_chunk_words: Windows with at most this many words are dropped
            (trailing-fragment noise; previously a hard-coded 15).

    Returns:
        List of chunk strings, each strictly longer than ``min_chunk_words``
        words.
    """
    words = text.split()
    # Guard against a non-positive stride: the original used
    # range(0, n, words_per_chunk - 10), which raised ValueError whenever
    # words_per_chunk <= 10.
    step = max(words_per_chunk - overlap, 1)
    chunks: List[str] = []
    for start in range(0, len(words), step):
        window = words[start:start + words_per_chunk]
        if len(window) > min_chunk_words:
            chunks.append(" ".join(window))
    return chunks
854
-
855
def _severity_for_score(score: float) -> str:
    """Map an overall plagiarism percentage to the severity label used by the API."""
    if score < 15:
        return "Low"
    if score < 30:
        return "Medium"
    if score < 50:
        return "High"
    return "Very High"

def _run_plagiarism_scan(file: UploadFile, max_chunks: int = 20):
    """Shared scan pipeline used by every report endpoint.

    Extracts text from the uploaded PDF, chunks it, analyzes each chunk, and
    accumulates the plagiarized word count. The original code duplicated this
    whole pipeline in four endpoints; it now lives in one place.

    Returns:
        (total_words, plagiarized_words, overall_score, match_reports)

    Raises:
        HTTPException: 400 when no text could be extracted (e.g. scanned PDF).
    """
    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    chunks = chunk_text(text)
    # Cap chunks to keep external API usage bounded.
    chunks = chunks[:max_chunks]

    reports = []
    plagiarized_word_count = 0
    for chunk in chunks:
        report = analyze_chunk(chunk)
        reports.append(report)
        if report.is_plagiarized:
            plagiarized_word_count += len(chunk.split())

    # Overlapping chunks can double-count words; never exceed the total.
    plagiarized_word_count = min(plagiarized_word_count, total_words)
    overall_score = (plagiarized_word_count / total_words) * 100
    return total_words, plagiarized_word_count, overall_score, reports

@app.post("/scan-paper", response_model=PlagiarismReport)
async def scan_paper(file: UploadFile = File(...)):
    """Basic scan: per-chunk match reports plus an overall score."""
    total_words, plagiarized_words, overall_score, reports = _run_plagiarism_scan(file)
    return PlagiarismReport(
        filename=file.filename,
        total_words=total_words,
        plagiarized_words=plagiarized_words,
        overall_plagiarism_score=round(overall_score, 2),
        severity_level=_severity_for_score(overall_score),
        details=reports,
    )

@app.post("/generate-detailed-report")
async def generate_detailed_report(file: UploadFile = File(...)):
    """Generate comprehensive plagiarism report with LLM analysis (JSON)."""
    total_words, _, overall_score, reports = _run_plagiarism_scan(file)
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=reports,
        total_words=total_words,
        overall_score=overall_score,
    )
    return format_report_json(detailed_report)

@app.post("/report/text")
async def report_text(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as plain text."""
    from fastapi.responses import PlainTextResponse

    total_words, _, overall_score, reports = _run_plagiarism_scan(file)
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=reports,
        total_words=total_words,
        overall_score=overall_score,
    )
    return PlainTextResponse(format_report_text(detailed_report))

@app.post("/report/html")
async def report_html(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as HTML."""
    from fastapi.responses import HTMLResponse

    total_words, _, overall_score, reports = _run_plagiarism_scan(file)
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=reports,
        total_words=total_words,
        overall_score=overall_score,
    )
    return HTMLResponse(format_report_html(detailed_report))

@app.get("/")
async def root():
    """Describe the service and list the available endpoints."""
    return {
        "message": "Pro Plagiarism Detector API",
        "endpoints": {
            "scan": "/scan-paper (POST - basic scan)",
            "detailed_report": "/generate-detailed-report (POST - JSON report with LLM analysis)",
            "text_report": "/report/text (POST - plain text report)",
            "html_report": "/report/html (POST - HTML report)"
        }
    }

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
+ import os
2
+ import time
3
+ import requests
4
+ import random
5
+ import re
6
+ from difflib import SequenceMatcher
7
+ from typing import List, Optional, Dict, Any
8
+ from urllib.parse import quote_plus
9
+
10
+ from fastapi import FastAPI, UploadFile, File, HTTPException
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ import uvicorn
13
+ from pydantic import BaseModel
14
+ from PyPDF2 import PdfReader
15
+
16
+ from langchain_groq import ChatGroq
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+
19
+ # ==========================================
20
+ # 1. Environment & API Setup
21
+ # ==========================================
22
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
23
+ SERPER_API_KEY = os.getenv("SERPER_API_KEY")
24
+ SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
25
+ SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1"
26
+ SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS = 1.2
27
+ SEMANTIC_SCHOLAR_MAX_RETRIES = 4
28
+
29
+ if not GROQ_API_KEY or not SERPER_API_KEY:
30
+ print("WARNING: GROQ_API_KEY or SERPER_API_KEY is missing!")
31
+
32
+ llm = ChatGroq(model="openai/gpt-oss-120b", temperature=0.1)
33
+
34
+ # Basic Memory Cache to maintain API efficiency (as promised in the application)
35
+ query_cache = {}
36
+ semantic_query_cache: Dict[str, List[Dict[str, str]]] = {}
37
+ _last_semantic_scholar_call_ts = 0.0
38
+
39
+ # ==========================================
40
+ # 2. Pydantic Models
41
+ # ==========================================
42
class MatchReport(BaseModel):
    """Per-chunk plagiarism result produced by analyze_chunk()."""
    chunk_text: str  # the analyzed text chunk itself
    is_plagiarized: bool  # True when a match crossed the similarity/LLM thresholds
    plagiarism_type: Optional[str] = None  # e.g. "Exact/Heavy Match" or "Paraphrased Match (Mosaic)"
    source_url: Optional[str] = None  # URL of the best-matching source, if any
    source_type: Optional[str] = None  # "Academic" or "Web"
    similarity_score: float  # best similarity ratio found (0.0-1.0)
49
+
50
+ class PlagiarismReport(BaseModel):
51
+ filename: str
52
+ total_words: int
53
+ plagiarized_words: int
54
+ overall_plagiarism_score: float
55
+ severity_level: str # Low, Medium, High, Very High
56
+ details: List[MatchReport]
57
+
58
+ class DetailedPlagiarismReport(BaseModel):
59
+ """Comprehensive report generated by LLM"""
60
+ filename: str
61
+ scan_timestamp: str
62
+ executive_summary: str
63
+ overall_score: float
64
+ severity_level: str
65
+ matched_sources: List[Dict[str, Any]]
66
+ key_findings: List[str]
67
+ plagiarism_breakdown: Dict[str, Any] # Types and percentages
68
+ detailed_analysis: str # LLM-generated detailed analysis
69
+ affected_sections: List[Dict[str, Any]] # Which parts are problematic
70
+ recommendations: List[str]
71
+ academic_integrity_risk: str # Assessment level
72
+
73
+ app = FastAPI(title="Pro Plagiarism Detector (Turnitin Clone)")
74
+
75
+ app.add_middleware(
76
+ CORSMiddleware,
77
+ allow_origins=["*"],
78
+ allow_credentials=True,
79
+ allow_methods=["*"],
80
+ allow_headers=["*"],
81
+ )
82
+
83
+ # ==========================================
84
+ # 3. Agent Tools: Serper & Semantic Scholar
85
+ # ==========================================
86
+
87
def _semantic_scholar_headers() -> Dict[str, str]:
    """Build request headers for Semantic Scholar calls.

    Returns a dict carrying the API key (the service reads it from the
    'x-api-key' header) or an empty dict when no key is configured.
    """
    if SEMANTIC_SCHOLAR_API_KEY:
        return {"x-api-key": SEMANTIC_SCHOLAR_API_KEY}
    return {}
93
+
94
+
95
def _semantic_scholar_get(path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Issue a rate-limited GET against the Semantic Scholar Graph API.

    Enforces a module-wide minimum interval between calls (tracked via the
    _last_semantic_scholar_call_ts global) and retries HTTP 429 responses with
    exponential backoff plus jitter, honouring a numeric Retry-After header.

    Args:
        path: Endpoint path appended to SEMANTIC_SCHOLAR_BASE_URL.
        params: Query parameters; None-valued entries are stripped.

    Returns:
        Parsed JSON body of the successful response.

    Raises:
        requests.HTTPError: on non-429 HTTP errors, or after exhausting retries.
    """
    global _last_semantic_scholar_call_ts
    # Drop None values so they are not serialized as literal "None" strings.
    filtered_params = {k: v for k, v in (params or {}).items() if v is not None}

    for attempt in range(SEMANTIC_SCHOLAR_MAX_RETRIES):
        # Throttle: sleep off whatever remains of the minimum call interval.
        elapsed = time.time() - _last_semantic_scholar_call_ts
        if elapsed < SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS:
            time.sleep(SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - elapsed)

        response = requests.get(
            f"{SEMANTIC_SCHOLAR_BASE_URL}{path}",
            headers=_semantic_scholar_headers(),
            params=filtered_params,
            timeout=20,
        )
        _last_semantic_scholar_call_ts = time.time()

        if response.status_code == 429 and attempt < SEMANTIC_SCHOLAR_MAX_RETRIES - 1:
            # Prefer the server-supplied Retry-After (integer seconds) when
            # present; otherwise back off exponentially with random jitter.
            retry_after = response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                wait_seconds = float(retry_after)
            else:
                wait_seconds = (2 ** attempt) + random.uniform(0.2, 0.7)
            time.sleep(wait_seconds)
            continue

        response.raise_for_status()
        return response.json()

    raise requests.HTTPError("Semantic Scholar request failed after retries")
125
+
126
+
127
def _semantic_scholar_post(path: str, body: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> Any:
    """Issue a rate-limited POST against the Semantic Scholar Graph API.

    Same throttling and 429-retry behaviour as _semantic_scholar_get, but
    sends ``body`` as JSON (used by the /paper/batch and /author/batch
    endpoints).

    Raises:
        requests.HTTPError: on non-429 HTTP errors, or after exhausting retries.
    """
    global _last_semantic_scholar_call_ts
    # Drop None values so they are not serialized as literal "None" strings.
    filtered_params = {k: v for k, v in (params or {}).items() if v is not None}

    for attempt in range(SEMANTIC_SCHOLAR_MAX_RETRIES):
        # Throttle: sleep off whatever remains of the minimum call interval.
        elapsed = time.time() - _last_semantic_scholar_call_ts
        if elapsed < SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS:
            time.sleep(SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - elapsed)

        response = requests.post(
            f"{SEMANTIC_SCHOLAR_BASE_URL}{path}",
            headers=_semantic_scholar_headers(),
            params=filtered_params,
            json=body,
            timeout=25,
        )
        _last_semantic_scholar_call_ts = time.time()

        if response.status_code == 429 and attempt < SEMANTIC_SCHOLAR_MAX_RETRIES - 1:
            # Prefer the server-supplied Retry-After (integer seconds) when
            # present; otherwise back off exponentially with random jitter.
            retry_after = response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                wait_seconds = float(retry_after)
            else:
                wait_seconds = (2 ** attempt) + random.uniform(0.2, 0.7)
            time.sleep(wait_seconds)
            continue

        response.raise_for_status()
        return response.json()

    raise requests.HTTPError("Semantic Scholar request failed after retries")
158
+
159
+
160
def s2_paper_autocomplete(query: str) -> Dict[str, Any]:
    """Paper-title autocomplete; the query is truncated to 100 characters."""
    return _semantic_scholar_get("/paper/autocomplete", {"query": query[:100]})
162
+
163
+
164
def s2_paper_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """Fetch multiple papers by ID in one POST; ids are truncated to 500 entries."""
    return _semantic_scholar_post("/paper/batch", {"ids": ids[:500]}, {"fields": fields})
166
+
167
+
168
def s2_paper_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
    open_access_pdf: bool = False,
) -> Dict[str, Any]:
    """Relevance-ranked paper search; limit is clamped to 1..100, offset to >= 0."""
    clamped_limit = min(max(limit, 1), 100)
    clamped_offset = max(offset, 0)
    params: Dict[str, Any] = {
        "query": query,
        "fields": fields,
        "limit": clamped_limit,
        "offset": clamped_offset,
        "year": year,
        "fieldsOfStudy": fields_of_study,
    }
    if open_access_pdf:
        # The filter is flag-like: its mere presence (empty value) activates it.
        params["openAccessPdf"] = ""
    return _semantic_scholar_get("/paper/search", params)
188
+
189
+
190
+ def s2_paper_search_bulk(
191
+ query: str,
192
+ fields: Optional[str] = None,
193
+ token: Optional[str] = None,
194
+ sort: Optional[str] = None,
195
+ ) -> Dict[str, Any]:
196
+ return _semantic_scholar_get(
197
+ "/paper/search/bulk",
198
+ {
199
+ "query": query,
200
+ "fields": fields,
201
+ "token": token,
202
+ "sort": sort,
203
+ },
204
+ )
205
+
206
+
207
def s2_paper_search_match(query: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Look up the single paper that best matches a title-like query string."""
    return _semantic_scholar_get("/paper/search/match", {"query": query, "fields": fields})
209
+
210
+
211
def s2_paper_details(paper_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch one paper's details; the ID is URL-encoded before insertion in the path."""
    safe_id = quote_plus(paper_id)
    return _semantic_scholar_get(f"/paper/{safe_id}", {"fields": fields})
214
+
215
+
216
+ def s2_paper_authors(
217
+ paper_id: str,
218
+ fields: Optional[str] = None,
219
+ limit: int = 100,
220
+ offset: int = 0,
221
+ ) -> Dict[str, Any]:
222
+ safe_id = quote_plus(paper_id)
223
+ return _semantic_scholar_get(
224
+ f"/paper/{safe_id}/authors",
225
+ {"fields": fields, "limit": min(max(limit, 1), 1000), "offset": max(offset, 0)},
226
+ )
227
+
228
+
229
+ def s2_paper_citations(
230
+ paper_id: str,
231
+ fields: Optional[str] = None,
232
+ limit: int = 100,
233
+ offset: int = 0,
234
+ publication_date_or_year: Optional[str] = None,
235
+ ) -> Dict[str, Any]:
236
+ safe_id = quote_plus(paper_id)
237
+ return _semantic_scholar_get(
238
+ f"/paper/{safe_id}/citations",
239
+ {
240
+ "fields": fields,
241
+ "limit": min(max(limit, 1), 1000),
242
+ "offset": max(offset, 0),
243
+ "publicationDateOrYear": publication_date_or_year,
244
+ },
245
+ )
246
+
247
+
248
+ def s2_paper_references(
249
+ paper_id: str,
250
+ fields: Optional[str] = None,
251
+ limit: int = 100,
252
+ offset: int = 0,
253
+ ) -> Dict[str, Any]:
254
+ safe_id = quote_plus(paper_id)
255
+ return _semantic_scholar_get(
256
+ f"/paper/{safe_id}/references",
257
+ {"fields": fields, "limit": min(max(limit, 1), 1000), "offset": max(offset, 0)},
258
+ )
259
+
260
+
261
+ def s2_author_batch(ids: List[str], fields: Optional[str] = None) -> Any:
262
+ return _semantic_scholar_post("/author/batch", {"ids": ids[:1000]}, {"fields": fields})
263
+
264
+
265
+ def s2_author_search(
266
+ query: str,
267
+ fields: Optional[str] = None,
268
+ limit: int = 100,
269
+ offset: int = 0,
270
+ ) -> Dict[str, Any]:
271
+ return _semantic_scholar_get(
272
+ "/author/search",
273
+ {
274
+ "query": query,
275
+ "fields": fields,
276
+ "limit": min(max(limit, 1), 1000),
277
+ "offset": max(offset, 0),
278
+ },
279
+ )
280
+
281
+
282
+ def s2_author_details(author_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
283
+ safe_id = quote_plus(author_id)
284
+ return _semantic_scholar_get(f"/author/{safe_id}", {"fields": fields})
285
+
286
+
287
+ def s2_author_papers(
288
+ author_id: str,
289
+ fields: Optional[str] = None,
290
+ limit: int = 100,
291
+ offset: int = 0,
292
+ publication_date_or_year: Optional[str] = None,
293
+ ) -> Dict[str, Any]:
294
+ safe_id = quote_plus(author_id)
295
+ return _semantic_scholar_get(
296
+ f"/author/{safe_id}/papers",
297
+ {
298
+ "fields": fields,
299
+ "limit": min(max(limit, 1), 1000),
300
+ "offset": max(offset, 0),
301
+ "publicationDateOrYear": publication_date_or_year,
302
+ },
303
+ )
304
+
305
+
306
+ def s2_snippet_search(
307
+ query: str,
308
+ fields: Optional[str] = None,
309
+ limit: int = 10,
310
+ year: Optional[str] = None,
311
+ fields_of_study: Optional[str] = None,
312
+ ) -> Dict[str, Any]:
313
+ return _semantic_scholar_get(
314
+ "/snippet/search",
315
+ {
316
+ "query": query,
317
+ "fields": fields,
318
+ "limit": min(max(limit, 1), 1000),
319
+ "year": year,
320
+ "fieldsOfStudy": fields_of_study,
321
+ },
322
+ )
323
+
324
+
325
def build_search_query(text: str, max_terms: int = 10) -> str:
    """Condense free text into a short keyword query.

    Lowercases the text, tokenizes on alphanumeric runs, discards stopwords
    and tokens of one or two characters, and keeps at most ``max_terms``
    keywords. When every token is filtered out, the raw tokens are used as a
    fallback so the query is never needlessly empty.
    """
    stopwords = {
        "the", "and", "for", "that", "with", "this", "from", "into", "our", "their",
        "were", "have", "has", "had", "been", "are", "was", "will", "would", "can",
        "could", "should", "about", "through", "using", "based", "than", "then", "also",
        "such", "these", "those", "while", "where", "when", "what", "which", "who",
    }
    tokens = re.findall(r"[A-Za-z0-9]+", text.lower())
    meaningful = [tok for tok in tokens if len(tok) > 2 and tok not in stopwords]
    if meaningful:
        return " ".join(meaningful[:max_terms])
    return " ".join(tokens[:max_terms])
336
+
337
def search_google_serper(query: str) -> List[Dict]:
    """Search the open web using the Google Serper API.

    Returns up to 3 organic results as dicts with 'text' (snippet), 'url',
    and 'source_type'. Any failure is logged and yields an empty list so a
    single bad lookup never aborts the scan.
    """
    url = "https://google.serper.dev/search"
    payload = {"q": query}
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    try:
        # Timeout added: the original call had none, so a stalled connection
        # could hang the whole scan pipeline indefinitely.
        response = requests.post(url, headers=headers, json=payload, timeout=15)
        response.raise_for_status()
        data = response.json()

        results = []
        for item in data.get("organic", [])[:3]:  # Top 3 web results
            results.append({
                "text": item.get("snippet", ""),
                "url": item.get("link", ""),
                "source_type": "Web (Google)"
            })
        return results
    except Exception as e:
        # Deliberate best-effort: log and degrade to "no web matches".
        print(f"Serper Error: {e}")
        return []
362
+
363
def search_semantic_scholar(query: str) -> List[Dict]:
    """Search academic sources via the Semantic Scholar API.

    Builds a compact keyword query, serves repeats from an in-memory cache,
    and combines passage-level snippet matches with paper-abstract matches.
    Failures are logged and yield an empty (uncached) result list.
    """
    prepared_query = build_search_query(query, max_terms=10)
    # Normalize whitespace/case so equivalent queries share one cache entry.
    normalized_query = " ".join(prepared_query.split()).lower()
    if normalized_query in semantic_query_cache:
        return semantic_query_cache[normalized_query]

    try:
        results = []

        # Try snippet search first because it returns passage-level text better suited for chunk comparison.
        snippet_data = s2_snippet_search(
            query=prepared_query,
            fields="snippet.text,snippet.snippetKind",
            limit=3,
        )
        for item in snippet_data.get("data", []):
            snippet = item.get("snippet", {})
            paper = item.get("paper", {})
            snippet_text = snippet.get("text", "")
            if snippet_text:
                corpus_id = paper.get("corpusId")
                # Build a stable paper URL from the corpus ID when available.
                paper_url = f"https://www.semanticscholar.org/paper/{corpus_id}" if corpus_id else None
                results.append({
                    "text": snippet_text,
                    "url": paper_url,
                    "source_type": "Academic (Semantic Scholar Snippet)",
                })

        # Keep paper abstract search as fallback/secondary source.
        data = s2_paper_search(
            query=prepared_query,
            limit=2,
            fields="title,abstract,url",
        )

        for item in data.get("data", []):
            if item.get("abstract"):  # Only keep if abstract exists to compare text
                results.append({
                    "text": item["abstract"],
                    "url": item.get("url", f"https://www.semanticscholar.org/paper/{item['paperId']}"),
                    "source_type": "Academic (Semantic Scholar)"
                })
        # Cache only successful lookups; failures fall through uncached.
        semantic_query_cache[normalized_query] = results
        return results
    except Exception as e:
        print(f"Semantic Scholar Error: {e}")
        return []
411
+
412
def aggregate_search(query: str) -> List[Dict]:
    """Combine web (Serper) and academic (Semantic Scholar) search results.

    Results are memoized per truncated query in the module-level query_cache
    to keep external API usage bounded across overlapping chunks.
    """
    # Use the first 15 words to make the search query efficient
    search_query = " ".join(query.split()[:15])

    if search_query in query_cache:
        return query_cache[search_query]

    # Run both searches
    web_results = search_google_serper(search_query)
    academic_results = search_semantic_scholar(search_query)

    combined = web_results + academic_results
    query_cache[search_query] = combined  # Save to cache

    # Sleep to respect rate limits
    time.sleep(1)

    return combined
431
+
432
+ # ==========================================
433
+ # 4. Core Comparison Logic
434
+ # ==========================================
435
+
436
def calculate_exact_similarity(text1: str, text2: str) -> float:
    """Case-insensitive similarity ratio (0.0-1.0) via difflib's SequenceMatcher."""
    left = text1.lower()
    right = text2.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
438
+
439
def check_paraphrasing_with_llm(chunk: str, source_text: str) -> bool:
    """Ask the LLM whether ``chunk`` paraphrases or closely mirrors ``source_text``.

    Returns True when the model's reply contains "YES" (case-insensitive).
    Used as the second-stage check for mosaic plagiarism after the cheap
    SequenceMatcher ratio has flagged a borderline match.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert academic plagiarism detector. Determine if TEXT A is a direct paraphrase, stolen idea, or highly similar structure to TEXT B. Ignore generic academic phrases like 'In this paper we demonstrate'. Respond ONLY with 'YES' or 'NO'."),
        ("user", "TEXT A: {chunk}\n\nTEXT B: {source_text}")
    ])
    chain = prompt | llm
    response = chain.invoke({"chunk": chunk, "source_text": source_text})
    # Substring check tolerates extra wording around the verdict.
    return "YES" in response.content.upper()
447
+
448
def generate_detailed_report_with_llm(
    filename: str,
    match_reports: List[MatchReport],
    total_words: int,
    overall_score: float
) -> DetailedPlagiarismReport:
    """Generate a comprehensive plagiarism report using LLM analysis.

    Aggregates per-chunk MatchReports into source/type statistics, asks the
    LLM for a narrative analysis, parses findings and recommendations out of
    the sectioned response, and packages everything into a
    DetailedPlagiarismReport.

    Args:
        filename: Name of the scanned document.
        match_reports: Per-chunk results from analyze_chunk().
        total_words: Word count of the whole document.
        overall_score: Plagiarism percentage (0-100).
    """
    from datetime import datetime

    # 1. Aggregate data for analysis
    plagiarized_reports = [r for r in match_reports if r.is_plagiarized]
    plagiarism_types: Dict[str, int] = {}
    sources_by_type: Dict[str, List[Dict[str, Any]]] = {"Academic": [], "Web": []}

    for report in plagiarized_reports:
        ptype = report.plagiarism_type or "Unknown"
        plagiarism_types[ptype] = plagiarism_types.get(ptype, 0) + 1

        if report.source_type:
            bucket = "Academic" if "Academic" in report.source_type else "Web"
            # Bug fix: the original tested `report.source_url not in
            # sources_by_type[...]` — comparing a URL string against a list of
            # dicts — so the check never matched and every repeated source was
            # appended again. Deduplicate on the URLs actually stored.
            if report.source_url not in {s["url"] for s in sources_by_type[bucket]}:
                sources_by_type[bucket].append({
                    "url": report.source_url,
                    "type": report.source_type,
                    "max_similarity": report.similarity_score
                })

    # 2. Determine severity level and the matching risk wording
    if overall_score < 15:
        severity = "Low"
        risk_level = "Minimal - Normal citation variations detected"
    elif overall_score < 30:
        severity = "Medium"
        risk_level = "Moderate - Multiple sources match detected"
    elif overall_score < 50:
        severity = "High"
        risk_level = "Significant - Substantial plagiarism detected"
    else:
        severity = "Very High"
        risk_level = "Critical - Extensive plagiarism detected"

    # 3. Use LLM to generate detailed analysis.
    # `filename` is now interpolated so the model sees which document it is
    # analyzing (the original context omitted it).
    plagiarism_context = f"""
Document: {filename}
Total Words: {total_words}
Plagiarism Score: {overall_score}%
Plagiarism Types Found: {plagiarism_types}
Academic Matches: {len(sources_by_type['Academic'])}
Web Matches: {len(sources_by_type['Web'])}

Suspicious Sections (samples):
{chr(10).join([f"- {r.chunk_text[:100]}..." for r in plagiarized_reports[:5]])}
"""

    analysis_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert academic integrity analyzer and plagiarism report generator.
Generate a professional, detailed plagiarism analysis report.
Focus on: severity assessment, academic integrity concerns, specific problem areas, and recommendations.
Be thorough but concise."""),
        ("user", """Create a detailed plagiarism analysis for this document:

{plagiarism_context}

Provide:
1. Executive Summary (2-3 sentences)
2. Key Findings (3-4 bullet points)
3. Detailed Analysis (2-3 paragraphs explaining the plagiarism pattern)
4. Recommendations (3-4 specific actions to remediate)

Format clearly with section headers.""")
    ])

    chain = analysis_prompt | llm
    llm_response = chain.invoke({"plagiarism_context": plagiarism_context})
    llm_analysis = llm_response.content

    # 4. Extract findings and recommendations from the sectioned LLM response.
    lines = llm_analysis.split('\n')
    key_findings: List[str] = []
    recommendations: List[str] = []
    detailed_analysis = ""

    in_findings = False
    in_recommendations = False

    for line in lines:
        if 'Key Findings' in line:
            in_findings = True
            in_recommendations = False
        elif 'Recommendations' in line:
            in_findings = False
            in_recommendations = True
        elif 'Detailed Analysis' in line or 'Analysis' in line:
            in_findings = False
            in_recommendations = False
        elif in_findings and line.strip().startswith(('-', '*', '•')):
            key_findings.append(line.strip().lstrip('-*•').strip())
        elif in_recommendations and line.strip().startswith(('-', '*', '•')):
            recommendations.append(line.strip().lstrip('-*•').strip())
        elif not in_findings and not in_recommendations and line.strip():
            detailed_analysis += line + "\n"

    # Fallbacks when the LLM response did not follow the expected structure.
    if not key_findings:
        key_findings = [
            f"Overall plagiarism score: {overall_score}%",
            f"Primary plagiarism type: {max(plagiarism_types.keys(), key=plagiarism_types.get) if plagiarism_types else 'Not detected'}",
            f"Multiple sources detected: {len(sources_by_type['Academic']) + len(sources_by_type['Web'])} sources"
        ]

    if not recommendations:
        recommendations = [
            "Properly cite all sources according to your institution's guidelines",
            "Use quotation marks for direct quotes and provide page numbers",
            "Paraphrase content properly and cite original sources",
            "Use plagiarism detection tools during the writing process"
        ]

    # 5. Affected sections (top 10 plagiarized chunks, truncated snippets).
    affected_sections = []
    for i, report in enumerate(plagiarized_reports[:10]):
        affected_sections.append({
            "section_number": i + 1,
            "text_snippet": report.chunk_text[:150],
            "similarity_score": report.similarity_score,
            "plagiarism_type": report.plagiarism_type,
            "source": report.source_url,
            "source_type": report.source_type
        })

    return DetailedPlagiarismReport(
        filename=filename,
        scan_timestamp=datetime.now().isoformat(),
        executive_summary=llm_analysis.split('\n')[0] if llm_analysis else f"Document contains {overall_score}% plagiarized content",
        overall_score=round(overall_score, 2),
        severity_level=severity,
        matched_sources=sources_by_type["Academic"] + sources_by_type["Web"],
        key_findings=key_findings,
        plagiarism_breakdown={
            "total_plagiarism_percentage": round(overall_score, 2),
            "types": plagiarism_types,
            "academic_sources": len(sources_by_type["Academic"]),
            "web_sources": len(sources_by_type["Web"])
        },
        detailed_analysis=detailed_analysis or llm_analysis,
        affected_sections=affected_sections,
        recommendations=recommendations,
        academic_integrity_risk=risk_level
    )
603
+
604
def analyze_chunk(chunk: str) -> MatchReport:
    """Analyze one text chunk against aggregated web + academic sources.

    Two-stage detection: a cheap SequenceMatcher ratio flags exact/heavy
    matches (> 0.50), and borderline matches (> 0.25) are escalated to the
    LLM paraphrase check. Stops at the first confirmed match.
    """
    search_results = aggregate_search(chunk)

    # Track the single best match seen so far, even if below thresholds.
    best_score = 0.0
    best_url = None
    best_source_type = None
    plagiarism_type = None
    is_plagiarized = False

    for result in search_results:
        source_text = result['text']

        # 1. Math/Deterministic Check
        exact_sim = calculate_exact_similarity(chunk, source_text)

        if exact_sim > best_score:
            best_score = exact_sim
            best_url = result['url']
            best_source_type = result['source_type']

        if exact_sim > 0.50:  # Lowered to 50% because we are comparing against abstracts/snippets
            is_plagiarized = True
            plagiarism_type = "Exact/Heavy Match"
            break

        # 2. Agentic Check for Mosaic Plagiarism
        elif exact_sim > 0.25:
            if check_paraphrasing_with_llm(chunk, source_text):
                is_plagiarized = True
                plagiarism_type = "Paraphrased Match (Mosaic)"
                best_url = result['url']
                best_source_type = result['source_type']
                # Floor the reported score for LLM-confirmed paraphrases.
                best_score = max(best_score, 0.85)
                break

    return MatchReport(
        chunk_text=chunk,
        is_plagiarized=is_plagiarized,
        plagiarism_type=plagiarism_type,
        source_url=best_url,
        source_type=best_source_type,
        similarity_score=round(best_score, 2)
    )
647
+
648
+ # ==========================================
649
+ # 6. Report Formatting Functions
650
+ # ==========================================
651
+
652
def format_report_json(detailed_report: DetailedPlagiarismReport) -> Dict[str, Any]:
    """Serialize a DetailedPlagiarismReport into the API's JSON response shape."""
    # Nested summary object, duplicated at top level for older clients.
    summary = {
        "overall_plagiarism_score": detailed_report.overall_score,
        "severity_level": detailed_report.severity_level,
        "academic_integrity_risk": detailed_report.academic_integrity_risk,
    }
    payload: Dict[str, Any] = {
        "filename": detailed_report.filename,
        "scan_timestamp": detailed_report.scan_timestamp,
        # Backward-compatible top-level fields expected by existing clients.
        "overall_score": detailed_report.overall_score,
        "severity_level": detailed_report.severity_level,
        "academic_integrity_risk": detailed_report.academic_integrity_risk,
        "summary": summary,
        "executive_summary": detailed_report.executive_summary,
        "key_findings": detailed_report.key_findings,
        "plagiarism_breakdown": detailed_report.plagiarism_breakdown,
        "matched_sources": detailed_report.matched_sources,
        "affected_sections": detailed_report.affected_sections,
        "detailed_analysis": detailed_report.detailed_analysis,
        "recommendations": detailed_report.recommendations,
    }
    return payload
674
+
675
def format_report_text(detailed_report: DetailedPlagiarismReport) -> str:
    """Render a DetailedPlagiarismReport as a plain-text document.

    Sections, in order: header, summary, executive summary, key findings,
    plagiarism breakdown, matched sources (top 10), detailed analysis,
    affected sections (top 5, only when present), recommendations.
    """
    report = "=" * 80 + "\n"
    report += "DETAILED PLAGIARISM DETECTION REPORT\n"
    report += "=" * 80 + "\n\n"

    report += f"FILE: {detailed_report.filename}\n"
    report += f"SCAN DATE: {detailed_report.scan_timestamp}\n"
    report += "-" * 80 + "\n\n"

    report += "SUMMARY\n"
    report += "-" * 80 + "\n"
    report += f"Overall Plagiarism Score: {detailed_report.overall_score}%\n"
    report += f"Severity Level: {detailed_report.severity_level}\n"
    report += f"Academic Integrity Risk: {detailed_report.academic_integrity_risk}\n\n"

    report += "EXECUTIVE SUMMARY\n"
    report += "-" * 80 + "\n"
    report += f"{detailed_report.executive_summary}\n\n"

    report += "KEY FINDINGS\n"
    report += "-" * 80 + "\n"
    for i, finding in enumerate(detailed_report.key_findings, 1):
        report += f"{i}. {finding}\n"
    report += "\n"

    report += "PLAGIARISM BREAKDOWN\n"
    report += "-" * 80 + "\n"
    report += f"Total Plagiarism %: {detailed_report.plagiarism_breakdown['total_plagiarism_percentage']}%\n"
    report += f"Academic Sources: {detailed_report.plagiarism_breakdown['academic_sources']}\n"
    report += f"Web Sources: {detailed_report.plagiarism_breakdown['web_sources']}\n"
    # 'types' is optional in the breakdown dict, hence the .get() guard.
    if detailed_report.plagiarism_breakdown.get('types'):
        report += "Types Detected:\n"
        for ptype, count in detailed_report.plagiarism_breakdown['types'].items():
            report += f"  - {ptype}: {count} instances\n"
    report += "\n"

    report += "MATCHED SOURCES\n"
    report += "-" * 80 + "\n"
    if detailed_report.matched_sources:
        for i, source in enumerate(detailed_report.matched_sources[:10], 1):
            report += f"{i}. URL: {source.get('url', 'N/A')}\n"
            report += f"   Type: {source.get('type', 'N/A')}\n"
            report += f"   Similarity: {source.get('max_similarity', 'N/A')}\n\n"
    else:
        report += "No sources matched.\n\n"

    report += "DETAILED ANALYSIS\n"
    report += "-" * 80 + "\n"
    report += f"{detailed_report.detailed_analysis}\n\n"

    if detailed_report.affected_sections:
        report += "AFFECTED SECTIONS (Top Issues)\n"
        report += "-" * 80 + "\n"
        for section in detailed_report.affected_sections[:5]:
            report += f"\nSection {section['section_number']}:\n"
            report += f"Text Snippet: {section['text_snippet']}\n"
            report += f"Similarity Score: {section['similarity_score']}\n"
            report += f"Plagiarism Type: {section['plagiarism_type']}\n"
            report += f"Source: {section['source']}\n"
        report += "\n"

    report += "RECOMMENDATIONS\n"
    report += "-" * 80 + "\n"
    for i, rec in enumerate(detailed_report.recommendations, 1):
        report += f"{i}. {rec}\n"
    report += "\n"

    report += "=" * 80 + "\n"
    report += "End of Report\n"
    report += "=" * 80 + "\n"

    return report
748
+
749
def format_report_html(detailed_report: "DetailedPlagiarismReport") -> str:
    """Render a DetailedPlagiarismReport as a standalone HTML page.

    The page contains an inline stylesheet, a summary panel, key findings,
    a breakdown table, the top-10 matched sources, the detailed analysis,
    the top-5 affected sections (omitted if none) and recommendations.

    NOTE(review): report fields are interpolated without HTML escaping;
    if document text may contain markup, escape it before rendering.

    Args:
        detailed_report: fully populated report object.

    Returns:
        A complete HTML document as a string.
    """
    # e.g. "Very High" -> CSS class suffix "very-high"
    severity_class = detailed_report.severity_level.lower().replace(' ', '-')
    findings_html = "".join(
        f"<li>{finding}</li>" for finding in detailed_report.key_findings
    )
    sources_html = "".join(
        f'<div class="source-item"><strong>{source.get("type", "Unknown")}</strong><br/>'
        f'<a href="{source.get("url", "#")}" target="_blank">{source.get("url", "N/A")}</a><br/>'
        f'Similarity: {source.get("max_similarity", "N/A")}</div>'
        for source in detailed_report.matched_sources[:10]
    )
    # Hoisted out of the template: backslashes are not allowed inside
    # f-string expressions before Python 3.12 (the original used chr(10)).
    analysis_html = detailed_report.detailed_analysis.replace("\n", "<br/>")
    if detailed_report.affected_sections:
        affected_html = "<h2>Affected Sections (Top Issues)</h2>" + "".join(
            f'<div class="affected-section"><strong>Section {section["section_number"]}</strong><br/>'
            f'<em>Text:</em> {section["text_snippet"]}...<br/>'
            f'<em>Similarity:</em> {section["similarity_score"]}<br/>'
            f'<em>Type:</em> {section["plagiarism_type"]}</div>'
            for section in detailed_report.affected_sections[:5]
        )
    else:
        affected_html = ""
    recommendations_html = "".join(
        f'<div class="recommendation"><strong>✓</strong> {rec}</div>'
        for rec in detailed_report.recommendations
    )
    breakdown = detailed_report.plagiarism_breakdown

    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Plagiarism Detection Report - {detailed_report.filename}</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }}
            .container {{ background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
            h1 {{ color: #333; border-bottom: 3px solid #2196F3; padding-bottom: 10px; }}
            h2 {{ color: #2196F3; margin-top: 30px; }}
            .summary {{ background-color: #f0f7ff; padding: 15px; border-left: 4px solid #2196F3; margin: 20px 0; }}
            .score {{ font-size: 24px; font-weight: bold; color: #d32f2f; }}
            .severity-low {{ color: #4caf50; }}
            .severity-medium {{ color: #ff9800; }}
            .severity-high {{ color: #f44336; }}
            .severity-very-high {{ color: #c41c3b; }}
            .findings {{ background-color: #fff3e0; padding: 15px; border-left: 4px solid #ff9800; }}
            .source-item {{ background-color: #f5f5f5; padding: 10px; margin: 10px 0; border-radius: 4px; }}
            .recommendation {{ background-color: #e8f5e9; padding: 10px; margin: 10px 0; border-left: 3px solid #4caf50; }}
            table {{ width: 100%; border-collapse: collapse; margin: 15px 0; }}
            th, td {{ padding: 10px; text-align: left; border-bottom: 1px solid #ddd; }}
            th {{ background-color: #2196F3; color: white; }}
            .affected-section {{ background-color: #fce4ec; padding: 15px; margin: 10px 0; border-radius: 4px; }}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>🔍 Plagiarism Detection Report</h1>

            <div class="summary">
                <p><strong>File:</strong> {detailed_report.filename}</p>
                <p><strong>Scan Date:</strong> {detailed_report.scan_timestamp}</p>
                <p><strong>Overall Plagiarism Score:</strong> <span class="score">{detailed_report.overall_score}%</span></p>
                <p><strong>Severity Level:</strong> <span class="severity-{severity_class}">{detailed_report.severity_level}</span></p>
                <p><strong>Academic Integrity Risk:</strong> {detailed_report.academic_integrity_risk}</p>
            </div>

            <h2>Executive Summary</h2>
            <p>{detailed_report.executive_summary}</p>

            <h2>Key Findings</h2>
            <div class="findings">
                <ul>
                    {findings_html}
                </ul>
            </div>

            <h2>Plagiarism Breakdown</h2>
            <table>
                <tr>
                    <th>Category</th>
                    <th>Value</th>
                </tr>
                <tr>
                    <td>Total Plagiarism %</td>
                    <td>{breakdown['total_plagiarism_percentage']}%</td>
                </tr>
                <tr>
                    <td>Academic Sources</td>
                    <td>{breakdown['academic_sources']}</td>
                </tr>
                <tr>
                    <td>Web Sources</td>
                    <td>{breakdown['web_sources']}</td>
                </tr>
            </table>

            <h2>Matched Sources</h2>
            {sources_html}

            <h2>Detailed Analysis</h2>
            <p>{analysis_html}</p>

            {affected_html}

            <h2>Recommendations</h2>
            <div>
                {recommendations_html}
            </div>
        </div>
    </body>
    </html>
    """
    return html
838
+ # ==========================================
839
+ # 5. API Endpoints & Utility
840
+ # ==========================================
841
+
842
def extract_text_from_pdf(file_bytes) -> str:
    """Extract the text of every page of a PDF into one string.

    Args:
        file_bytes: binary file-like object (e.g. ``UploadFile.file``).

    Returns:
        Concatenated page texts, one trailing newline per page. Pages with
        no extractable text (e.g. scanned images) are skipped; the result
        may be "" for a fully scanned PDF.
    """
    reader = PdfReader(file_bytes)
    pages = []
    for page in reader.pages:
        # extract_text() is expensive — call it once per page, not twice
        # as the previous filter-then-use pattern did.
        page_text = page.extract_text()
        if page_text:
            pages.append(page_text + "\n")
    return "".join(pages)
846
def chunk_text(text: str, words_per_chunk: int = 40) -> List[str]:
    """Split text into overlapping word windows for plagiarism checks.

    Consecutive windows overlap by 10 words (when ``words_per_chunk > 10``)
    so a match spanning a chunk boundary is not missed. Windows of 15 words
    or fewer are dropped as too short to search meaningfully.

    Args:
        text: the document text.
        words_per_chunk: window size in words (default 40).

    Returns:
        List of chunk strings; empty for short or empty input.
    """
    words = text.split()
    # Guard: words_per_chunk <= 10 would make the step non-positive,
    # and range() raises ValueError for step 0 (loops forever conceptually
    # for negative steps starting at 0). Fall back to step 1.
    step = max(1, words_per_chunk - 10)
    chunks = []
    for start in range(0, len(words), step):
        chunk = " ".join(words[start:start + words_per_chunk])
        if len(chunk.split()) > 15:
            chunks.append(chunk)
    return chunks
855
@app.post("/scan-paper", response_model=PlagiarismReport)
async def scan_paper(file: UploadFile = File(...)):
    """Basic scan: chunk the uploaded PDF, analyze each chunk, and return
    an aggregate PlagiarismReport with per-chunk match details."""
    document_text = extract_text_from_pdf(file.file)
    total_words = len(document_text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    # Cap chunks for safety during testing (remove in production)
    chunks = chunk_text(document_text)[:20]

    detailed_reports = []
    plagiarized_word_count = 0
    for chunk in chunks:
        chunk_report = analyze_chunk(chunk)
        detailed_reports.append(chunk_report)
        if chunk_report.is_plagiarized:
            plagiarized_word_count += len(chunk.split())

    # Overlapping chunks can over-count; clamp to the document size.
    plagiarized_word_count = min(plagiarized_word_count, total_words)
    overall_score = (plagiarized_word_count / total_words) * 100

    # Bucket the score into a severity label via an ordered threshold table.
    severity = "Very High"
    for upper_bound, label in ((15, "Low"), (30, "Medium"), (50, "High")):
        if overall_score < upper_bound:
            severity = label
            break

    return PlagiarismReport(
        filename=file.filename,
        total_words=total_words,
        plagiarized_words=plagiarized_word_count,
        overall_plagiarism_score=round(overall_score, 2),
        severity_level=severity,
        details=detailed_reports,
    )
901
@app.post("/generate-detailed-report")
async def generate_detailed_report(file: UploadFile = File(...)):
    """Generate comprehensive plagiarism report with LLM analysis"""
    document_text = extract_text_from_pdf(file.file)
    total_words = len(document_text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    # Cap chunks
    chunks = chunk_text(document_text)[:20]

    detailed_reports = [analyze_chunk(chunk) for chunk in chunks]
    # Count words in flagged chunks; clamp because overlapping chunks
    # can over-count relative to the document size.
    plagiarized_word_count = min(
        sum(len(chunk.split())
            for chunk, rep in zip(chunks, detailed_reports)
            if rep.is_plagiarized),
        total_words,
    )
    overall_score = (plagiarized_word_count / total_words) * 100

    # Hand the raw matches to the LLM for narrative analysis.
    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=detailed_reports,
        total_words=total_words,
        overall_score=overall_score,
    )

    return format_report_json(detailed_report)
939
@app.post("/report/text")
async def report_text(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as plain text"""
    from fastapi.responses import PlainTextResponse

    document_text = extract_text_from_pdf(file.file)
    total_words = len(document_text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    # Cap the number of analyzed chunks at 20.
    chunks = chunk_text(document_text)[:20]

    detailed_reports = [analyze_chunk(chunk) for chunk in chunks]
    # Clamp: overlapping chunks can over-count relative to the document.
    plagiarized_word_count = min(
        sum(len(chunk.split())
            for chunk, rep in zip(chunks, detailed_reports)
            if rep.is_plagiarized),
        total_words,
    )
    overall_score = (plagiarized_word_count / total_words) * 100

    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=detailed_reports,
        total_words=total_words,
        overall_score=overall_score,
    )

    return PlainTextResponse(format_report_text(detailed_report))
977
@app.post("/report/html")
async def report_html(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as HTML"""
    from fastapi.responses import HTMLResponse

    document_text = extract_text_from_pdf(file.file)
    total_words = len(document_text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    # Cap the number of analyzed chunks at 20.
    chunks = chunk_text(document_text)[:20]

    detailed_reports = [analyze_chunk(chunk) for chunk in chunks]
    # Clamp: overlapping chunks can over-count relative to the document.
    plagiarized_word_count = min(
        sum(len(chunk.split())
            for chunk, rep in zip(chunks, detailed_reports)
            if rep.is_plagiarized),
        total_words,
    )
    overall_score = (plagiarized_word_count / total_words) * 100

    detailed_report = generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=detailed_reports,
        total_words=total_words,
        overall_score=overall_score,
    )

    return HTMLResponse(format_report_html(detailed_report))
1015
@app.get("/")
async def root():
    """Service index: API name plus a map of the available endpoints."""
    endpoints = {
        "scan": "/scan-paper (POST - basic scan)",
        "detailed_report": "/generate-detailed-report (POST - JSON report with LLM analysis)",
        "text_report": "/report/text (POST - plain text report)",
        "html_report": "/report/html (POST - HTML report)",
    }
    return {"message": "Pro Plagiarism Detector API", "endpoints": endpoints}
1027
# Entry point: start a local uvicorn server only when this module is run
# directly (e.g. `python main.py`); importing the module does not start it.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)