Spaces:
Running
Running
File size: 5,597 Bytes
# ─────────────────────────────────────────────
# src/preprocessor.py  (v3)
# ─────────────────────────────────────────────
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import SPACY_MODEL, TOP_SENTENCES, MIN_SENTENCE_LENGTH
# Load the spaCy pipeline once at import time; the functions in this module
# all parse text through this shared `nlp` object.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # Loading failed: show the download command for the configured model,
    # then re-raise so the import still fails instead of continuing without
    # a pipeline.
    install_hint = f"[ERROR] Run: python -m spacy download {SPACY_MODEL}"
    print(install_hint)
    raise
# Entity labels that are allowed to become quiz answers. An entity whose
# label is not listed here is rejected by is_good_answer().
GOOD_NER_LABELS = {
    "PERSON",
    "ORG",
    "GPE",
    "LOC",
    "DATE",
    "EVENT",
    "WORK_OF_ART",
    "NORP",
    "FAC",
    "PRODUCT",
}
# Hard blacklist: lower-cased strings that must never be used as answers.
# Candidates are compared against it after lower-casing.
BLACKLIST = {
    # vague quantifiers
    "annual", "various", "many", "several", "some", "other",
    # generic size / age / degree adjectives
    "new", "old", "big", "large", "small", "high", "low",
    # number words and ordinals
    "one", "two", "three", "four", "five", "first", "second",
    # relative time words and filler adverbs
    "today", "yesterday", "now", "then", "later", "also",
    # pronouns and articles
    "he", "she", "it", "they", "we", "i", "the", "a", "an",
    # celestial bodies
    "moon", "sun", "earth",
    # too broad
    "india", "america", "china", "russia", "england", "world",
    # abbreviations make circular questions
    "isro", "nasa", "wwe", "un", "who",
}
# Labels whose entities are listed first among a sentence's answer
# candidates, because they tend to make the clearest questions.
HIGH_PRIORITY_LABELS = {
    "PERSON",
    "ORG",
    "GPE",
    "LOC",
    "EVENT",
    "WORK_OF_ART",
    "FAC",
    "PRODUCT",
}
def extract_sentences(text: str) -> list:
    """Split *text* into sentences and keep only the sufficiently long ones.

    The text is parsed with the module-level spaCy pipeline. A sentence is
    kept when it contains at least ``MIN_SENTENCE_LENGTH`` tokens that are
    neither whitespace nor punctuation.

    Returns:
        The surviving sentences as strings, stripped of surrounding
        whitespace, in document order.
    """
    def content_token_count(span) -> int:
        # Count only tokens that carry content: skip whitespace and punctuation.
        return sum(1 for tok in span if not (tok.is_space or tok.is_punct))

    return [
        span.text.strip()
        for span in nlp(text).sents
        if content_token_count(span) >= MIN_SENTENCE_LENGTH
    ]
def rank_sentences(sentences: list, top_n: int = TOP_SENTENCES) -> list:
    """Select the ``top_n`` highest-scoring sentences by summed TF-IDF weight.

    Each sentence is treated as one document. Its score is the sum of the
    TF-IDF weights of its terms (English stop words excluded).

    Args:
        sentences: Candidate sentences, in document order.
        top_n: Maximum number of sentences to keep.

    Returns:
        The input list itself when it already has ``top_n`` or fewer items.
        Otherwise the selected sentences, returned in their original order
        rather than in score order.
    """
    if len(sentences) <= top_n:
        return sentences

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no sentence
        # contains a usable term, e.g. only stop words or one-character
        # tokens. There is nothing to rank on, so keep the leading sentences
        # instead of crashing the whole pipeline.
        return sentences[:top_n]

    # Row sums of the TF-IDF matrix give one score per sentence.
    scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
    # Take the best-scoring indices, then sort them so the output follows
    # the order of the source text.
    top_indices = sorted(np.argsort(scores)[::-1][:top_n])
    return [sentences[i] for i in top_indices]
def is_good_answer(text: str, label: str) -> bool:
    """Decide whether an entity is usable as a quiz answer.

    Args:
        text: Surface text of the entity; surrounding whitespace is ignored.
        label: The entity's NER label.

    Returns:
        ``True`` only when every check below passes.
    """
    candidate = text.strip()

    # Fewer than two characters, or explicitly blacklisted (case-insensitive).
    if len(candidate) < 2 or candidate.lower() in BLACKLIST:
        return False

    # The label must be one of the accepted NER labels.
    if label not in GOOD_NER_LABELS:
        return False

    word_count = len(candidate.split())

    # Phrases longer than five words are hard to use as MCQ answers.
    if word_count > 5:
        return False

    # A lone word starting with a lowercase letter is probably not a
    # proper noun.
    lone_common_word = (
        word_count == 1
        and candidate[0].islower()
        and not candidate.isdigit()
    )
    return not lone_common_word
def extract_answer_candidates(sentence: str) -> list:
    """Collect the entities in *sentence* that qualify as quiz answers.

    The sentence is parsed with the module-level spaCy pipeline. Entities
    rejected by ``is_good_answer`` are dropped, as are case-insensitive
    repeats of an entity already collected.

    Returns:
        Entity strings with ``HIGH_PRIORITY_LABELS`` entities first, followed
        by the remaining accepted ones; each group keeps sentence order.
        The original note says only one answer per sentence (the best one)
        is ultimately used; that choice is made by the caller, not here.
    """
    priority_hits = []  # label is in HIGH_PRIORITY_LABELS
    other_hits = []     # accepted but not high priority (DATE, NORP)
    emitted = set()     # lower-cased texts already collected

    for entity in nlp(sentence).ents:
        surface = entity.text.strip()
        key = surface.lower()
        if key in emitted or not is_good_answer(surface, entity.label_):
            continue
        emitted.add(key)
        bucket = priority_hits if entity.label_ in HIGH_PRIORITY_LABELS else other_hits
        bucket.append(surface)

    return priority_hits + other_hits
def preprocess(text: str) -> dict:
    """Run the full preprocessing pipeline on raw text.

    Steps: collapse all whitespace runs to single spaces, split into
    sentences, rank them, extract answer candidates for each top sentence,
    and collect the labelled entities of the whole text.

    Args:
        text: Raw input text.

    Returns:
        A dict with the keys:
          * ``"all_sentences"``: every sentence kept by ``extract_sentences``.
          * ``"top_sentences"``: the subset chosen by ``rank_sentences``.
          * ``"sentence_answers"``: maps each top sentence with at least one
            candidate to its list of candidates.
          * ``"entities"``: list of ``{"text": ..., "label": ...}`` dicts for
            the unique accepted entities of the whole text (per the original
            comment, intended for the distractor generator).
    """
    text = re.sub(r'\s+', ' ', text).strip()
    all_sentences = extract_sentences(text)
    top_sentences = rank_sentences(all_sentences)

    # Sentences without any usable candidate are left out of the mapping.
    sentence_answers = {}
    for sent in top_sentences:
        candidates = extract_answer_candidates(sent)
        if candidates:
            sentence_answers[sent] = candidates

    # Store entities WITH their labels for the distractor generator.
    all_entities = []
    seen = set()
    for ent in nlp(text).ents:
        # Normalise once and use the same value for validation, the
        # duplicate key and the stored text. Previously the key came from the
        # unstripped text while the stripped text was stored.
        ent_text = ent.text.strip()
        key = ent_text.lower()
        if key in seen or not is_good_answer(ent_text, ent.label_):
            continue
        seen.add(key)
        all_entities.append({"text": ent_text, "label": ent.label_})

    return {
        "all_sentences": all_sentences,
        "top_sentences": top_sentences,
        "sentence_answers": sentence_answers,
        "entities": all_entities,  # list of {"text": .., "label": ..}
    }
if __name__ == "__main__":
    # Manual smoke test: run the pipeline on a short sample and print the
    # answer candidates per sentence plus all labelled entities.
    # Line breaks inside the sample do not matter, because preprocess()
    # collapses all whitespace before parsing.
    sample = """
The Indian Space Research Organisation (ISRO) was founded in 1969 by Vikram Sarabhai.
ISRO developed India's first satellite, Aryabhata, which was launched in 1975.
The Chandrayaan-1 mission in 2008 discovered water molecules on the Moon.
In 2023, Chandrayaan-3 successfully landed near the lunar south pole, making India
the fourth country to achieve a Moon landing.
The Mars Orbiter Mission, also called Mangalyaan, was launched in 2013.
"""
    result = preprocess(sample)

    # The separator in this header was a garbled character; restored to an arrow.
    print("=== SENTENCE → CANDIDATES ===")
    for sent, ans in result['sentence_answers'].items():
        print(f" Source : {sent[:75]}")
        print(f" Answers: {ans}\n")

    print("=== ALL ENTITIES (for distractors) ===")
    for e in result['entities']:
        print(f" {e['label']:15s} {e['text']}")