# Source: transcript-analysis/tests/test_vocabulary_analyser.py
# Author: ikarasz
# Commit: list matched phrases (#9)
# Revision: 8336cbd (verified)
import textwrap
import sys
from pathlib import Path
import pytest
# Directory two levels up from this file (the parent of the directory that
# contains it). It is put at the front of sys.path, once, so that the
# project-local `measures` import that follows can be resolved.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from measures.VocabularyAnalyser import VocabularyAnalyser
@pytest.fixture
def glossary_file(tmp_path):
    """Create a small glossary CSV for testing.

    Writes five comma-separated rows to ``glossary.csv`` inside the
    ``tmp_path`` directory supplied by pytest, encoded as UTF-8.

    Returns:
        The path of the written file as a ``str``.
    """
    # The backslash after the opening quotes suppresses the leading newline;
    # dedent() then strips the common indentation so each row starts at
    # column 0 in the written file.
    csv_content = textwrap.dedent(
        """\
        acute,,,
        acute angle, acute angles,,
        acute triangle, acute triangles,,
        add, added, adding, adds
        addend, addends,,
        """
    )
    path = tmp_path / "glossary.csv"
    path.write_text(csv_content, encoding="utf-8")
    return str(path)
class DummyUtterance:
    """Minimal stand-in for a transcript utterance used by these tests.

    Stores the speaker and the spoken text, and starts with
    ``vocabulary_terms`` and ``vocabulary_matches`` set to ``None``. The
    tests in this module read both attributes back after calling
    ``run_analysis``.
    """

    def __init__(self, speaker: str, text: str):
        """Store *speaker* and *text* and reset the analysis attributes."""
        self.speaker = speaker
        self.text = text
        # Placeholders; the tests expect the analyser to overwrite them.
        self.vocabulary_terms = None
        self.vocabulary_matches = None
class DummyTranscript:
    """Minimal stand-in for a transcript: a holder for utterance objects."""

    def __init__(self, utterances):
        """Keep a reference to *utterances* (stored as given, not copied)."""
        self.utterances = utterances
@pytest.fixture
def analyser(glossary_file):
    """Return a ``VocabularyAnalyser`` built from the temporary glossary CSV path."""
    return VocabularyAnalyser(glossary_file)
def test_match_counts_base_once(analyser):
    """Several casings and glossary forms of "add" yield the single term "add"."""
    text = "Add add ADD adding added adds"
    assert analyser.match_one_utterance(text) == ["add"]
def test_match_prefers_longest_phrase(analyser):
    """"acute angle" is reported on its own, without the shorter entry "acute"."""
    text = "An acute angle appears in this proof."
    assert analyser.match_one_utterance(text) == ["acute angle"]
def test_match_handles_overlapping_and_distinct_terms(analyser):
    """A phrase and a separate standalone "acute" in one text are both reported.

    The text contains "acute triangle" and, later, "acute" followed by a
    non-glossary word; the expected result lists both terms.
    """
    text = (
        "The class studied the properties of an acute triangle, then discussed an acute situation."
    )
    assert analyser.match_one_utterance(text) == [
        "acute",
        "acute triangle",
    ]
def test_run_analysis_adds_vocabulary_terms_and_matches(analyser):
    """``run_analysis`` fills in terms and match details on every utterance.

    Checks that the same transcript object is returned, that each utterance
    gets the expected ``vocabulary_terms`` list, and that
    ``vocabulary_matches`` maps each term to the matched form together with
    its start/end character offsets in the utterance text.
    """
    transcript = DummyTranscript(
        [
            DummyUtterance("Teacher", "We add addends in this acute triangle."),
            DummyUtterance("Student", "Acute angles contrast with obtuse ones."),
            DummyUtterance("Teacher", "No glossary matches"),
        ]
    )

    result = analyser.run_analysis(transcript)

    # The transcript is expected back as the very same object.
    assert result is transcript

    assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
    assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
    assert transcript.utterances[2].vocabulary_terms == []

    # Offsets index into the utterance text; "end" is exclusive
    # (e.g. text[3:6] == "add" in the first utterance).
    assert transcript.utterances[0].vocabulary_matches == {
        "acute triangle": [
            {"form": "acute triangle", "start": 23, "end": 37},
        ],
        "add": [
            {"form": "add", "start": 3, "end": 6},
        ],
        "addend": [
            {"form": "addends", "start": 7, "end": 14},
        ],
    }
    # The expected form is lower-case although the text starts with "Acute".
    assert transcript.utterances[1].vocabulary_matches == {
        "acute angle": [
            {"form": "acute angles", "start": 0, "end": 12},
        ]
    }
    assert transcript.utterances[2].vocabulary_matches == {}
def test_vocabulary_matches_capture_multiple_occurrences(analyser):
    """Every occurrence of a term is recorded, in text order, under one key.

    The term appears once in ``vocabulary_terms`` while
    ``vocabulary_matches["add"]`` holds one entry per occurrence.
    """
    transcript = DummyTranscript([
        DummyUtterance("Teacher", "Add adds add."),
    ])

    analyser.run_analysis(transcript)
    matches = transcript.utterances[0].vocabulary_matches

    assert transcript.utterances[0].vocabulary_terms == ["add"]
    assert matches["add"] == [
        {"form": "add", "start": 0, "end": 3},
        {"form": "adds", "start": 4, "end": 8},
        {"form": "add", "start": 9, "end": 12},
    ]