"""Tests for ``VocabularyAnalyser`` using a small temporary glossary CSV."""

import sys
import textwrap
from pathlib import Path

import pytest

# The directory two levels above this file is put on ``sys.path`` before the
# ``measures`` import below, so the import order here is deliberate.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from measures.VocabularyAnalyser import VocabularyAnalyser  # noqa: E402


def _span(form, start, end):
    """Build the expected match record for ``form`` found at ``[start, end)``."""
    return {"form": form, "start": start, "end": end}


@pytest.fixture
def glossary_file(tmp_path):
    """Write a five-row glossary CSV into ``tmp_path`` and return its path as a string.

    NOTE(review): judging by the expectations in the tests below, each row is
    a base term followed by its alternative surface forms -- confirm against
    the ``VocabularyAnalyser`` loader.
    """
    csv_content = textwrap.dedent(
        """\
        acute,,,
        acute angle, acute angles,,
        acute triangle, acute triangles,,
        add, added, adding, adds
        addend, addends,,
        """
    )
    target = tmp_path / "glossary.csv"
    target.write_text(csv_content, encoding="utf-8")
    return str(target)


class DummyUtterance:
    """Minimal utterance stand-in: a speaker, a text, and two result slots."""

    def __init__(self, speaker, text):
        self.speaker = speaker
        self.text = text
        # Both slots start as None; the run_analysis tests read them back
        # after the analyser has processed the transcript.
        self.vocabulary_terms = None
        self.vocabulary_matches = None


class DummyTranscript:
    """Minimal transcript stand-in that only holds a list of utterances."""

    def __init__(self, utterances):
        self.utterances = utterances


@pytest.fixture
def analyser(glossary_file):
    """Return a ``VocabularyAnalyser`` constructed from the temporary glossary path."""
    return VocabularyAnalyser(glossary_file)


def test_match_counts_base_once(analyser):
    """Repeated and inflected forms of "add" are reported as the single term "add"."""
    found = analyser.match_one_utterance("Add add ADD adding added adds")
    assert found == ["add"]


def test_match_prefers_longest_phrase(analyser):
    """"acute angle" is reported on its own, without a separate "acute" entry."""
    found = analyser.match_one_utterance("An acute angle appears in this proof.")
    assert found == ["acute angle"]


def test_match_handles_overlapping_and_distinct_terms(analyser):
    """A phrase match and a later stand-alone "acute" are both reported."""
    text = (
        "The class studied the properties of an acute triangle, then discussed an acute situation."
    )
    expected_terms = ["acute", "acute triangle"]
    assert analyser.match_one_utterance(text) == expected_terms


def test_run_analysis_adds_vocabulary_terms_and_matches(analyser):
    """``run_analysis`` returns the same transcript with terms and matches filled in."""
    utterances = [
        DummyUtterance("Teacher", "We add addends in this acute triangle."),
        DummyUtterance("Student", "Acute angles contrast with obtuse ones."),
        DummyUtterance("Teacher", "No glossary matches"),
    ]
    transcript = DummyTranscript(utterances)

    result = analyser.run_analysis(transcript)

    # The very same transcript object is expected back, not a copy.
    assert result is transcript

    # Term lists, one per utterance.
    assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
    assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
    assert transcript.utterances[2].vocabulary_terms == []

    # Match records keyed by term; offsets index into the utterance text.
    expected_first = {
        "acute triangle": [_span("acute triangle", 23, 37)],
        "add": [_span("add", 3, 6)],
        "addend": [_span("addends", 7, 14)],
    }
    assert transcript.utterances[0].vocabulary_matches == expected_first

    expected_second = {"acute angle": [_span("acute angles", 0, 12)]}
    assert transcript.utterances[1].vocabulary_matches == expected_second

    assert transcript.utterances[2].vocabulary_matches == {}


def test_vocabulary_matches_capture_multiple_occurrences(analyser):
    """Each occurrence of a term gets its own match record, in text order."""
    transcript = DummyTranscript([DummyUtterance("Teacher", "Add adds add.")])

    analyser.run_analysis(transcript)

    matches = transcript.utterances[0].vocabulary_matches
    assert transcript.utterances[0].vocabulary_terms == ["add"]

    expected_add = [
        _span("add", 0, 3),
        _span("adds", 4, 8),
        _span("add", 9, 12),
    ]
    assert matches["add"] == expected_add