Spaces:

tanmmayyy
/

mcq_generator

Running

App Files Files Community

mcq_generator / src /mcq_builder.py

tanmmayyy

Initial commit — MCQ Generator with T5 + NER + WordNet

73633b5 19 days ago

raw

history blame contribute delete

7.69 kB

	# ─────────────────────────────────────────────
	# src/mcq_builder.py (v4)
	# Added strict MCQ quality validation.
	# ─────────────────────────────────────────────

	import random
	from dataclasses import dataclass
	import sys, os

	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	from config import NUM_DISTRACTORS, MAX_QUESTIONS

	from src.preprocessor import preprocess
	from src.question_generator import generate_questions
	from src.distractor_generator import get_distractors


	@dataclass
	class MCQ:
	    """One multiple-choice question together with its answer key.

	    Attributes:
	        question: The question text.
	        options: Answer choices in display order.
	        correct_index: Position of the correct answer within ``options``.
	        correct_answer: Text of the correct answer.
	        explanation: Text printed after the options by ``display``.
	    """

	    question: str
	    options: list
	    correct_index: int
	    correct_answer: str
	    explanation: str

	    def display(self):
	        """Print the question, lettered options (correct one ticked) and explanation."""
	        print(f"\nQ: {self.question}")
	        for i, opt in enumerate(self.options):
	            marker = " ✓" if i == self.correct_index else ""
	            # chr(65) is "A", so options are labelled A, B, C, ...
	            print(f" {chr(65+i)}. {opt}{marker}")
	        # Only show an ellipsis when the explanation was actually cut at 100 chars
	        suffix = "..." if len(self.explanation) > 100 else ""
	        print(f" Explanation: {self.explanation[:100]}{suffix}")


	def _acronym_matches(short: str, phrase: str) -> bool:
	    """Return True when ``short`` spells the word initials of ``phrase``.

	    Both arguments are expected to be lowercased already. Initials are
	    compared twice: once using every word and once skipping minor words
	    such as "of" or "for".
	    """
	    words = phrase.split()
	    # An acronym is a single token standing for a multi-word phrase
	    if len(short.split()) != 1 or len(words) < 2:
	        return False

	    # Ignore dots and other punctuation, e.g. "u.s.a." -> "usa"
	    letters = "".join(ch for ch in short if ch.isalnum())
	    if len(letters) < 2:
	        return False

	    cleaned = ["".join(ch for ch in w if ch.isalnum()) for w in words]
	    cleaned = [w for w in cleaned if w]
	    minor = {"a", "an", "and", "for", "in", "of", "on", "the", "to"}
	    every_initial = "".join(w[0] for w in cleaned)
	    major_initials = "".join(w[0] for w in cleaned if w not in minor)
	    return letters in (every_initial, major_initials)


	def are_too_similar(a: str, b: str) -> bool:
	    """
	    Check if two option strings are too similar to coexist in the same MCQ.

	    Comparison is case-insensitive and ignores surrounding whitespace.
	    Two strings are too similar when any of these holds:
	      * one contains the other (this also covers an exact match),
	      * one is the acronym of the other, e.g. "WWE" vs
	        "World Wrestling Entertainment" or "ISRO" vs
	        "Indian Space Research Organisation",
	      * at least 60% of the words of the shorter string occur in the other.

	    Returns True when the pair should not appear together, else False.
	    """
	    a_lower = a.lower().strip()
	    b_lower = b.lower().strip()

	    # One is a substring of the other (e.g. "WWE" in "WWE Championship");
	    # identical strings are caught here as well
	    if a_lower in b_lower or b_lower in a_lower:
	        return True

	    # Acronym vs. its expansion, in either direction
	    if _acronym_matches(a_lower, b_lower) or _acronym_matches(b_lower, a_lower):
	        return True

	    # Check word overlap ratio — if 60%+ words overlap, too similar
	    words_a = set(a_lower.split())
	    words_b = set(b_lower.split())
	    smaller = min(len(words_a), len(words_b))
	    if smaller == 0:
	        return False
	    return len(words_a & words_b) / smaller >= 0.6


	def deduplicate_options(answer: str, distractors: list) -> list:
	    """
	    Filter distractors down to ones that can sit alongside the answer.

	    Candidates are visited in the order given. A candidate is dropped when
	    are_too_similar() flags it against the correct answer or against any
	    candidate that has already been kept. The survivors are returned as a
	    new list in their original order.
	    """
	    kept = []
	    for candidate in distractors:
	        clashes = are_too_similar(candidate, answer) or any(
	            are_too_similar(candidate, earlier) for earlier in kept
	        )
	        if not clashes:
	            kept.append(candidate)
	    return kept


	def is_valid_mcq(question: str, answer: str, options: list) -> tuple:
	    """
	    Final quality gate before an MCQ is accepted.

	    Checks, in order:
	      1. the answer occurs exactly once among the options
	         (case-insensitive, surrounding whitespace ignored),
	      2. there are exactly NUM_DISTRACTORS + 1 options,
	      3. no pair of options is flagged by are_too_similar(),
	      4. at most one option is a generic placeholder,
	      5. the answer text does not occur inside the question.

	    Returns (is_valid: bool, reason: str); reason is "OK" on success and
	    otherwise describes the first check that failed.
	    """
	    # Answer must appear in options exactly once
	    answer_key = answer.lower().strip()
	    answer_count = sum(1 for o in options if o.lower().strip() == answer_key)
	    if answer_count != 1:
	        return False, f"Answer appears {answer_count} times in options"

	    # One correct answer plus the configured number of distractors; derived
	    # from NUM_DISTRACTORS so the gate stays in step with what build_mcq builds
	    expected = NUM_DISTRACTORS + 1
	    if len(options) != expected:
	        return False, f"Expected {expected} options, got {len(options)}"

	    # No two options should be too similar
	    for i, first in enumerate(options):
	        for second in options[i + 1:]:
	            if are_too_similar(first, second):
	                return False, f"Options too similar: '{first}' vs '{second}'"

	    # Generic placeholder options are a last resort — skip if more than 1
	    generic = {"None of the above", "Cannot be determined",
	               "All of the above", "Information not provided"}
	    generic_count = sum(1 for o in options if o in generic)
	    if generic_count > 1:
	        return False, "Too many generic placeholder options"

	    # The question must not give the answer away by containing it verbatim
	    if answer.lower() in question.lower():
	        return False, "Answer already present in question"

	    return True, "OK"


	def build_mcq(question: str, answer: str, distractors: list, explanation: str):
	    """Assemble a single MCQ and validate it; return the MCQ, or None on rejection."""

	    # Drop distractors that clash with the answer or with each other
	    pool = deduplicate_options(answer, distractors)
	    if not pool:
	        return None

	    # Top up with generic fillers until NUM_DISTRACTORS entries are available
	    for filler in ("None of the above", "Cannot be determined", "All of the above"):
	        if len(pool) >= NUM_DISTRACTORS:
	            break
	        if filler not in pool:
	            pool.append(filler)

	    options = [answer, *pool[:NUM_DISTRACTORS]]
	    random.shuffle(options)

	    # Quality gate: a rejected MCQ is reported on stdout and discarded
	    valid, reason = is_valid_mcq(question, answer, options)
	    if not valid:
	        print(f" [QC] Rejected MCQ — {reason}: Q='{question[:50]}'")
	        return None

	    return MCQ(
	        question=question,
	        options=options,
	        correct_index=options.index(answer),
	        correct_answer=answer,
	        explanation=explanation,
	    )


	def build_quiz(passage: str, num_questions: int = MAX_QUESTIONS) -> list:
	    """
	    Run the three-step pipeline on a passage and return the accepted MCQs.

	    preprocess() supplies the sentence/answer candidates and the entity list,
	    generate_questions() turns the candidates into question/answer pairs, and
	    each pair is given distractors and passed through build_mcq(). Collection
	    stops as soon as num_questions MCQs have been accepted. Progress is
	    printed along the way; an empty list is returned when a step yields
	    nothing.
	    """
	    print(f"\n[Pipeline] Starting for passage ({len(passage)} chars)...")

	    print("[Pipeline] Step 1/3: Preprocessing...")
	    prep = preprocess(passage)
	    sentence_answers, all_entities = prep["sentence_answers"], prep["entities"]
	    if not sentence_answers:
	        print("[Pipeline] No suitable sentences found.")
	        return []

	    print("[Pipeline] Step 2/3: Generating questions...")
	    qa_pairs = generate_questions(sentence_answers)
	    if not qa_pairs:
	        print("[Pipeline] No questions generated.")
	        return []
	    print(f"[Pipeline] {len(qa_pairs)} candidate question(s) generated.")

	    print("[Pipeline] Step 3/3: Building and validating MCQs...")
	    accepted = []
	    for pair in qa_pairs:
	        # Stop once enough questions have passed validation
	        if len(accepted) >= num_questions:
	            break

	        candidates = get_distractors(
	            answer=pair["answer"],
	            all_entities=all_entities,
	        )
	        built = build_mcq(
	            question=pair["question"],
	            answer=pair["answer"],
	            distractors=candidates,
	            explanation=pair["sentence"],
	        )
	        if built is not None:
	            accepted.append(built)

	    print(f"[Pipeline] Done. {len(accepted)} valid MCQ(s) built.")

	    if not accepted:
	        print("\n[Pipeline] NOTICE: Could not build valid MCQs from this passage.")
	        print(" This usually means the passage lacks specific named facts.")
	        print(" Try a factual passage with: people names, places, dates, organisations.")

	    return accepted


	if __name__ == "__main__":
	    # Manual smoke test: build a five-question quiz from a factual ISRO passage
	    passage = """
	The Indian Space Research Organisation (ISRO) was founded in 1969 by Vikram Sarabhai.
	It is headquartered in Bengaluru, Karnataka. ISRO developed India's first satellite,
	Aryabhata, which was launched in 1975. The Chandrayaan-1 mission in 2008 discovered
	water molecules on the Moon. In 2023, Chandrayaan-3 successfully landed near the
	lunar south pole, making India the fourth country to achieve a Moon landing.
	The Mars Orbiter Mission, also called Mangalyaan, was launched in 2013 and made
	India the first Asian country to reach Martian orbit.
	"""

	    quiz = build_quiz(passage, num_questions=5)
	    print("\n========== GENERATED QUIZ ==========")
	    for number, item in enumerate(quiz, start=1):
	        print(f"\n--- Question {number} ---")
	        item.display()