| import fitz |
| import re |
| from typing import List |
| from transformers import pipeline |
| import torch |
|
|
| PDF_FILE_PATH = "module_b/file_2.pdf" |
|
|
|
|
def extract_nepali_sentences_from_pdf(pdf_path: str) -> List[str]:
    """
    Extract clean Nepali sentences from a searchable (text-layer) PDF.

    Opens the PDF with PyMuPDF, concatenates the text of every page,
    normalizes whitespace, splits on Devanagari danda / Latin terminal
    punctuation, and drops fragments of 5 characters or fewer.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        List of cleaned sentence strings; empty if the PDF has no
        extractable text (e.g. a scanned, image-only document).
    """
    print(f"Opening PDF: {pdf_path}")

    # Context manager guarantees the document handle is closed even if
    # text extraction raises (the previous version leaked it on error).
    with fitz.open(pdf_path) as doc:
        # Collect per-page text and join once instead of quadratic `+=`.
        full_text = "\n".join(page.get_text("text") for page in doc)

    if not full_text.strip():
        print("Warning: No text found. PDF might be scanned (image-based). Use OCR version instead.")
        return []

    # Flatten line breaks, then collapse whitespace runs to single spaces.
    text = re.sub(r'\s+', ' ', full_text.replace('\n', ' ')).strip()

    # Primary split: after danda/terminal punctuation, but only when the
    # next chunk starts with a Devanagari character (avoids splitting on
    # decimal points, abbreviations, etc.).
    sentences = re.split(r'(?<=[।.!?])\s+(?=[अ-हँ-ॿअ-ह])|(?<=[।.!?])(?=$)', text)
    if len(sentences) <= 1:
        # Fallback: plain split on terminal punctuation when the stricter
        # pattern matched nothing (e.g. non-Devanagari text).
        sentences = re.split(r'(?<=[।.!?])\s+', text)

    # Strip surrounding punctuation/whitespace; discard tiny fragments.
    cleaned = [s.strip(' ।.!?').strip() for s in sentences if len(s.strip()) > 5]

    print(f"Successfully extracted {len(cleaned)} clean sentences.\n")
    return cleaned
|
|
|
|
# --- Model setup (module-level side effect: downloads/loads the model) ---
print("Loading your model from Hugging Face...")
# Fine-tuned DistilBERT checkpoint for Nepali bias classification.
model_name = "sangy1212/distilbert-base-nepali-fine-tuned"


# Batched text-classification pipeline; runs on GPU 0 when CUDA is
# available, otherwise falls back to CPU (device=-1).
classifier = pipeline(
    "text-classification",
    model=model_name,
    tokenizer=model_name,
    device=0 if torch.cuda.is_available() else -1,
    batch_size=16
)


print("Model loaded and ready!\n")


# Maps the pipeline's raw "LABEL_n" ids to human-readable bias categories.
# NOTE(review): several names look misspelled ("religional", "appearence",
# "amiguity", "Disablity") and casing is inconsistent — confirm these match
# the label names used during fine-tuning before changing them, since the
# "neutral" string is compared against in predict_bias_on_sentences.
id_to_label = {
    "LABEL_0": "neutral",
    "LABEL_1": "gender",
    "LABEL_2": "religional",
    "LABEL_3": "caste",
    "LABEL_4": "religion",
    "LABEL_5": "appearence",
    "LABEL_6": "socialstatus",
    "LABEL_7": "amiguity",
    "LABEL_8": "political",
    "LABEL_9": "Age",
    "LABEL_10": "Disablity"
}
|
|
def predict_bias_on_sentences(
    sentences: List[str],
    confidence_threshold: float = 0.7,
    *,
    clf=None,
    label_map=None,
) -> int:
    """
    Run batch bias prediction over sentences and print formatted results.

    Args:
        sentences: Sentences to classify.
        confidence_threshold: Minimum score for a non-neutral prediction
            to be counted as detected bias.
        clf: Optional classifier callable mapping a list of sentences to a
            list of ``{'label': ..., 'score': ...}`` dicts. Defaults to the
            module-level ``classifier`` pipeline; injectable for testing.
        label_map: Optional mapping from raw label ids ("LABEL_n") to
            category names. Defaults to the module-level ``id_to_label``.

    Returns:
        Number of sentences flagged as biased (0 for empty input).
    """
    if not sentences:
        print("No sentences to analyze.")
        return 0

    # Resolve the injectable collaborators lazily so the module-level
    # globals are only touched when the caller did not supply overrides.
    if clf is None:
        clf = classifier
    if label_map is None:
        label_map = id_to_label

    print(f"Running bias detection on {len(sentences)} sentences...\n")

    results = clf(sentences)

    print("="*100)
    print("BIAS DETECTION RESULTS")
    print("="*100)

    biased_count = 0
    for sent, res in zip(sentences, results):
        # Unknown ids (e.g. if the model ships its own label names) fall
        # through to "unknown" rather than raising.
        category = label_map.get(res['label'], "unknown")
        confidence = res['score']

        # Only confident, non-neutral predictions count as detected bias.
        if category != "neutral" and confidence >= confidence_threshold:
            mark = " BIAS DETECTED"
            biased_count += 1
        else:
            mark = "✓ neutral / low confidence"

        print(f"{mark}")
        print(f" Category : {category.upper()}")
        print(f" Confidence : {confidence:.3f}")
        print(f" Sentence : {sent}")
        print("-" * 80)

    print(f"\nSummary: {biased_count}/{len(sentences)} sentences contain detectable bias (confidence ≥ {confidence_threshold})")
    return biased_count
|
|
|
|
if __name__ == "__main__":
    # Script entry point: verify the input PDF exists, extract sentences,
    # then run bias detection over them.
    pdf_file_path = PDF_FILE_PATH
    import os

    if os.path.exists(pdf_file_path):
        print(f"Using PDF file at: {pdf_file_path}\n")
    else:
        print(f"PDF file not found at: {pdf_file_path}. Please check the path.")
        # Raise SystemExit directly: the site-injected `exit()` builtin is
        # not guaranteed to exist (e.g. under `python -S` or frozen apps).
        raise SystemExit(1)

    sentences = extract_nepali_sentences_from_pdf(pdf_file_path)

    # Skip classification entirely when extraction produced nothing
    # (a warning was already printed by the extractor).
    if sentences:
        predict_bias_on_sentences(sentences, confidence_threshold=0.7)

    print("\nDone! Your bias detection is complete.")