techfreakworm committed on
Commit
5d81907
·
unverified ·
1 Parent(s): 8122b04

feat(dialog): parse_dialog with SPEAKER A-D regex and edge-case handling

Browse files
Files changed (2) hide show
  1. server/dialog.py +40 -0
  2. tests/test_dialog_parser.py +50 -0
server/dialog.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Dialog mode: parse SPEAKER X: scripts into ordered turns and stitch
2
+ per-turn outputs into a single concatenated WAV.
3
+
4
+ Generator is in this same file but added in Task 12.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import re
9
+ from dataclasses import dataclass
10
+
11
+
12
# A turn tag is "SPEAKER <letter>:" at the start of a line, optionally
# indented. Matching is case-insensitive so that "Speaker b:" opens a new turn
# instead of being swallowed into the previous speaker's text.
_SPEAKER_RE = re.compile(
    r"^\s*SPEAKER\s+([A-D])\s*:\s*",
    re.MULTILINE | re.IGNORECASE,
)


@dataclass(frozen=True)
class DialogTurn:
    """One block of text attributed to a single speaker."""

    speaker: str  # "A" | "B" | "C" | "D" (always upper case)
    text: str


class DialogParseError(ValueError):
    """Raised when a dialog script can't be parsed into turns."""


def parse_dialog(text: str) -> list[DialogTurn]:
    """Split a ``SPEAKER X:`` script into an ordered list of turns.

    A turn starts at a line beginning with ``SPEAKER A:`` .. ``SPEAKER D:``
    (any letter case, leading whitespace allowed) and runs until the next
    such tag or the end of the script, so a turn may span several lines.
    Surrounding whitespace is stripped from each turn and turns that end up
    empty are dropped. Text before the first tag is not part of any turn.

    Args:
        text: The raw dialog script.

    Returns:
        The non-empty turns in script order, with ``speaker`` normalised to
        an upper-case letter.

    Raises:
        DialogParseError: If ``text`` is not a string, contains no speaker
            tags, or every tagged turn is empty.
    """
    if not isinstance(text, str):
        # Report bad input through the parser's own error type rather than
        # letting ``re`` raise a bare TypeError.
        raise DialogParseError("Dialog script must be a string.")

    matches = list(_SPEAKER_RE.finditer(text))
    if not matches:
        raise DialogParseError(
            "Use SPEAKER A: ... / SPEAKER B: ... lines to define turns."
        )

    # Each turn's body ends where the next tag begins; the last one runs to
    # the end of the script.
    body_ends = [m.start() for m in matches[1:]]
    body_ends.append(len(text))

    turns: list[DialogTurn] = []
    for match, body_end in zip(matches, body_ends):
        body = text[match.end():body_end].strip()
        if body:
            turns.append(DialogTurn(speaker=match.group(1).upper(), text=body))

    if not turns:
        raise DialogParseError("No non-empty speaker turns found.")
    return turns
tests/test_dialog_parser.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from server.dialog import DialogParseError, DialogTurn, parse_dialog
4
+
5
+
6
def test_simple_a_b_alternation():
    """Two single-line turns come back in script order."""
    expected = [
        DialogTurn(speaker="A", text="hi"),
        DialogTurn(speaker="B", text="hello"),
    ]
    assert parse_dialog("SPEAKER A: hi\nSPEAKER B: hello") == expected
13
+
14
+
15
def test_multi_line_turn():
    """Untagged lines belong to the turn opened by the preceding tag."""
    parsed = parse_dialog("SPEAKER A: line one\nstill A\nSPEAKER B: end.")
    first, second = parsed[0], parsed[1]
    assert (first.speaker, first.text) == ("A", "line one\nstill A")
    assert (second.speaker, second.text) == ("B", "end.")
22
+
23
+
24
def test_leading_whitespace_tolerated():
    """Indented speaker tags are still recognised."""
    script = " SPEAKER A: hi\n SPEAKER B: hello"
    speakers = [turn.speaker for turn in parse_dialog(script)]
    assert speakers == ["A", "B"]
28
+
29
+
30
def test_missing_prefix_raises():
    """A script without any speaker tag is rejected."""
    untagged = "plain text with no speakers"
    with pytest.raises(DialogParseError):
        parse_dialog(untagged)
33
+
34
+
35
def test_unknown_letter_is_ignored_so_no_match_raises():
    """A letter outside A-D is not a tag, so the script has no turns."""
    # "SPEAKER E: ..." is not matched by the tag pattern, which leaves the
    # parser with zero tags to work from.
    unsupported = "SPEAKER E: nope"
    with pytest.raises(DialogParseError):
        parse_dialog(unsupported)
39
+
40
+
41
def test_three_consecutive_a_turns():
    """Repeated tags for the same speaker stay as separate turns."""
    script = "SPEAKER A: one\nSPEAKER A: two\nSPEAKER A: three"
    texts = [turn.text for turn in parse_dialog(script)]
    assert texts == ["one", "two", "three"]
45
+
46
+
47
def test_empty_turn_is_dropped():
    """A tag with no text after it produces no turn."""
    script = "SPEAKER A: hi\nSPEAKER B:\nSPEAKER C: bye"
    speakers = [turn.speaker for turn in parse_dialog(script)]
    assert speakers == ["A", "C"]