"""Tests for ``measures.VocabularyAnalyser.VocabularyAnalyser``."""

import sys
import textwrap
from pathlib import Path

import pytest

# The directory one level above this file's directory. It is put at the front
# of ``sys.path`` (only if not already present) so that the ``measures``
# package imported below resolves no matter where pytest is invoked from.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Must stay below the sys.path adjustment above.
from measures.VocabularyAnalyser import VocabularyAnalyser  # noqa: E402
|
|
|
|
@pytest.fixture
def glossary_file(tmp_path: Path) -> str:
    """Create a small glossary CSV for testing.

    Writes five four-column rows to ``glossary.csv`` inside pytest's
    per-test temporary directory and returns the file's path as a string.
    Each row presumably holds a base term followed by its inflected forms,
    with empty cells as padding -- confirm against the glossary loader.
    """
    # The backslash after the opening quotes suppresses the leading newline;
    # dedent then strips the common indentation so every row starts at
    # column 0 in the written file.
    csv_content = textwrap.dedent(
        """\
        acute,,,
        acute angle, acute angles,,
        acute triangle, acute triangles,,
        add, added, adding, adds
        addend, addends,,
        """
    )
    path = tmp_path / "glossary.csv"
    path.write_text(csv_content, encoding="utf-8")
    return str(path)
|
|
|
|
class DummyUtterance:
    """Minimal stand-in for an utterance object used by these tests.

    Stores the speaker label and the spoken text, and starts with both
    vocabulary result attributes set to ``None`` so a test can tell whether
    the analysis has populated them.
    """

    def __init__(self, speaker, text):
        self.speaker = speaker
        self.text = text
        # Result slots; the tests expect run_analysis() to fill these in.
        self.vocabulary_terms = None
        self.vocabulary_matches = None
|
|
|
|
class DummyTranscript:
    """Minimal stand-in for a transcript: a holder for utterances.

    The given ``utterances`` object is stored as-is (not copied) on the
    ``utterances`` attribute.
    """

    def __init__(self, utterances):
        self.utterances = utterances
|
|
|
|
@pytest.fixture
def analyser(glossary_file):
    """Return a ``VocabularyAnalyser`` built from the temporary glossary CSV path."""
    return VocabularyAnalyser(glossary_file)
|
|
|
|
def test_match_counts_base_once(analyser):
    """Several forms of "add", in mixed case, yield the base term exactly once."""
    text = "Add add ADD adding added adds"
    assert analyser.match_one_utterance(text) == ["add"]
|
|
|
|
def test_match_prefers_longest_phrase(analyser):
    """"acute angle" is reported alone; the shorter term "acute" inside it is not."""
    text = "An acute angle appears in this proof."
    assert analyser.match_one_utterance(text) == ["acute angle"]
|
|
|
|
def test_match_handles_overlapping_and_distinct_terms(analyser):
    """A phrase and a separate standalone use of its first word are both reported.

    "acute triangle" is expected as a phrase, and the later "acute" in
    "acute situation" is expected as its own term.
    """
    text = (
        "The class studied the properties of an acute triangle, then discussed an acute situation."
    )
    assert analyser.match_one_utterance(text) == [
        "acute",
        "acute triangle",
    ]
|
|
|
|
def test_run_analysis_adds_vocabulary_terms_and_matches(analyser):
    """run_analysis returns the same transcript with every utterance annotated.

    Each utterance is expected to receive ``vocabulary_terms`` (a list of
    base terms) and ``vocabulary_matches`` (base term -> list of
    occurrences). An utterance with no glossary hits is expected to get an
    empty list and an empty dict rather than keeping its initial ``None``.
    """
    transcript = DummyTranscript(
        [
            DummyUtterance("Teacher", "We add addends in this acute triangle."),
            DummyUtterance("Student", "Acute angles contrast with obtuse ones."),
            DummyUtterance("Teacher", "No glossary matches"),
        ]
    )

    result = analyser.run_analysis(transcript)

    # The transcript object passed in is the one handed back.
    assert result is transcript
    assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
    assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
    assert transcript.utterances[2].vocabulary_terms == []

    # In these expectations, "start"/"end" are character offsets into the
    # utterance text (text[start:end] spans the matched words) and "form" is
    # the matched surface form in lower case.
    assert transcript.utterances[0].vocabulary_matches == {
        "acute triangle": [
            {"form": "acute triangle", "start": 23, "end": 37},
        ],
        "add": [
            {"form": "add", "start": 3, "end": 6},
        ],
        "addend": [
            {"form": "addends", "start": 7, "end": 14},
        ],
    }
    assert transcript.utterances[1].vocabulary_matches == {
        "acute angle": [
            {"form": "acute angles", "start": 0, "end": 12},
        ]
    }
    assert transcript.utterances[2].vocabulary_matches == {}
|
|
|
|
def test_vocabulary_matches_capture_multiple_occurrences(analyser):
    """Repeated occurrences of one term are all recorded under its base term.

    The term list holds "add" once, while the match list holds one entry per
    occurrence, in the order the occurrences appear in the text.
    """
    transcript = DummyTranscript([
        DummyUtterance("Teacher", "Add adds add."),
    ])

    analyser.run_analysis(transcript)

    matches = transcript.utterances[0].vocabulary_matches
    assert transcript.utterances[0].vocabulary_terms == ["add"]
    assert matches["add"] == [
        {"form": "add", "start": 0, "end": 3},
        {"form": "adds", "start": 4, "end": 8},
        {"form": "add", "start": 9, "end": 12},
    ]
|
|