transcript-analysis / tests /test_vocabulary_analyser.py

list matched phrases (#9)

8336cbd verified 3 months ago

3.39 kB

	import textwrap
	import sys
	from pathlib import Path

	import pytest

	# Put the directory one level above this file's folder on sys.path so the
	# `measures` import that follows resolves regardless of where the test run
	# is started. The membership check avoids inserting a duplicate entry.
	PROJECT_ROOT = Path(__file__).resolve().parents[1]
	if str(PROJECT_ROOT) not in sys.path:
	    sys.path.insert(0, str(PROJECT_ROOT))

	from measures.VocabularyAnalyser import VocabularyAnalyser


	@pytest.fixture
	def glossary_file(tmp_path):
	    """Write a small glossary CSV into ``tmp_path`` and return its path.

	    The file has five four-column rows. Judging by the assertions in this
	    module, the first cell of a row is the base term and the remaining
	    cells are alternative forms of it (empty cells pad short rows).

	    Returns:
	        The path of the written ``glossary.csv`` as a ``str``.
	    """
	    # dedent strips the common leading whitespace, so the literal can be
	    # indented with the code; the backslash suppresses a leading blank row.
	    csv_content = textwrap.dedent(
	        """\
	        acute,,,
	        acute angle, acute angles,,
	        acute triangle, acute triangles,,
	        add, added, adding, adds
	        addend, addends,,
	        """
	    )
	    path = tmp_path / "glossary.csv"
	    path.write_text(csv_content, encoding="utf-8")
	    return str(path)


	class DummyUtterance:
	    """Minimal stand-in for an utterance object used by these tests.

	    It carries the speaker and text passed in, plus two result attributes
	    that start as ``None`` and are compared against expected values after
	    ``run_analysis`` has been called on the owning transcript.
	    """

	    def __init__(self, speaker, text):
	        """Store ``speaker`` and ``text``; leave the result slots unset."""
	        self.speaker = speaker
	        self.text = text
	        # Result slots: None until an analysis run assigns them.
	        self.vocabulary_terms = None
	        self.vocabulary_matches = None


	class DummyTranscript:
	    """Minimal stand-in for a transcript: a holder for a list of utterances."""

	    def __init__(self, utterances):
	        """Keep a reference to ``utterances`` (not a copy) on the instance."""
	        self.utterances = utterances


	@pytest.fixture
	def analyser(glossary_file):
	    """Return a ``VocabularyAnalyser`` built from the test glossary CSV path."""
	    return VocabularyAnalyser(glossary_file)


	def test_match_counts_base_once(analyser):
	    """Repeated, inflected and differently-cased forms yield one ``"add"``."""
	    text = "Add add ADD adding added adds"
	    assert analyser.match_one_utterance(text) == ["add"]


	def test_match_prefers_longest_phrase(analyser):
	    """``"acute angle"`` is reported on its own, without a separate ``"acute"``."""
	    text = "An acute angle appears in this proof."
	    assert analyser.match_one_utterance(text) == ["acute angle"]


	def test_match_handles_overlapping_and_distinct_terms(analyser):
	    """A phrase match and a later standalone ``"acute"`` are both reported.

	    The utterance contains "acute triangle" and, separately, a bare "acute";
	    the expected result lists both terms, with "acute" first.
	    """
	    text = (
	        "The class studied the properties of an acute triangle, then discussed an acute situation."
	    )
	    assert analyser.match_one_utterance(text) == [
	        "acute",
	        "acute triangle",
	    ]


	def test_run_analysis_adds_vocabulary_terms_and_matches(analyser):
	    """``run_analysis`` annotates every utterance and returns the same transcript.

	    Checks, per utterance, the ``vocabulary_terms`` list and the
	    ``vocabulary_matches`` mapping of term -> list of match records. In the
	    expected records ``start``/``end`` are 0-based character offsets into the
	    utterance text with an exclusive end (e.g. "add" is ``text[3:6]``), and
	    ``form`` is lower-case even where the text is capitalised.
	    """
	    transcript = DummyTranscript(
	        [
	            DummyUtterance("Teacher", "We add addends in this acute triangle."),
	            DummyUtterance("Student", "Acute angles contrast with obtuse ones."),
	            DummyUtterance("Teacher", "No glossary matches"),
	        ]
	    )

	    result = analyser.run_analysis(transcript)

	    # The transcript object passed in is the one handed back.
	    assert result is transcript
	    assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
	    assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
	    assert transcript.utterances[2].vocabulary_terms == []

	    assert transcript.utterances[0].vocabulary_matches == {
	        "acute triangle": [
	            {"form": "acute triangle", "start": 23, "end": 37},
	        ],
	        "add": [
	            {"form": "add", "start": 3, "end": 6},
	        ],
	        "addend": [
	            {"form": "addends", "start": 7, "end": 14},
	        ],
	    }
	    assert transcript.utterances[1].vocabulary_matches == {
	        "acute angle": [
	            {"form": "acute angles", "start": 0, "end": 12},
	        ]
	    }
	    # An utterance without glossary terms gets empty containers, not None.
	    assert transcript.utterances[2].vocabulary_matches == {}


	def test_vocabulary_matches_capture_multiple_occurrences(analyser):
	    """Every occurrence of a term is recorded, while the term is listed once.

	    "Add adds add." is expected to give a single ``"add"`` entry in
	    ``vocabulary_terms`` and three match records in text order, each with the
	    lower-cased matched form and its 0-based, end-exclusive offsets.
	    """
	    transcript = DummyTranscript([
	        DummyUtterance("Teacher", "Add adds add."),
	    ])

	    analyser.run_analysis(transcript)

	    matches = transcript.utterances[0].vocabulary_matches
	    assert transcript.utterances[0].vocabulary_terms == ["add"]
	    assert matches["add"] == [
	        {"form": "add", "start": 0, "end": 3},
	        {"form": "adds", "start": 4, "end": 8},
	        {"form": "add", "start": 9, "end": 12},
	    ]