Waqas167 committed on
Commit 190732d · verified · 1 Parent(s): bd2b21d

Update app.py

Files changed (1)
  1. app.py +258 -460
app.py CHANGED
@@ -1,460 +1,258 @@
- # import streamlit as st
- # from transformers import AutoProcessor, Wav2Vec2ForCTC
- # import torch
- # import librosa
- # import os
- # from pydub import AudioSegment
- # from moviepy.editor import VideoFileClip
- # from google import genai
- # from google.genai import types
-
- # # ----------- Configuration -----------
- # model_id = "facebook/mms-1b-l1107"
- # lang_code = "urd-script_arabic"
- # api_key = "AIzaSyBEWWn32PxVEaUsoe67GJOEpF4FQT87Kxo" # ⚠️ Replace with st.secrets for production
-
- # # ----------- Load Processor and Model -----------
- # @st.cache_resource
- # def load_model_and_processor():
- #     processor = AutoProcessor.from_pretrained(model_id, target_lang=lang_code)
- #     model = Wav2Vec2ForCTC.from_pretrained(
- #         model_id,
- #         target_lang=lang_code,
- #         ignore_mismatched_sizes=True
- #     )
- #     model.load_adapter(lang_code)
- #     return processor, model
-
- # processor, model = load_model_and_processor()
-
- # # ----------- Audio Conversion -----------
- # def get_wav_from_input(file_path, output_path="converted.wav"):
- #     ext = os.path.splitext(file_path)[-1].lower()
- #     if ext in [".mp4", ".mkv", ".avi", ".mov"]:
- #         video = VideoFileClip(file_path)
- #         video.audio.write_audiofile(output_path, fps=16000)
- #     elif ext in [".mp3", ".aac", ".flac", ".ogg", ".m4a"]:
- #         audio = AudioSegment.from_file(file_path)
- #         audio = audio.set_frame_rate(16000).set_channels(1)
- #         audio.export(output_path, format="wav")
- #     elif ext == ".wav":
- #         audio = AudioSegment.from_wav(file_path)
- #         audio.export(output_path, format="wav")
- #     else:
- #         raise ValueError("Unsupported file format.")
- #     return output_path
-
- # # ----------- Transcription -----------
- # def transcribe(file_path):
- #     wav_path = get_wav_from_input(file_path)
- #     audio, sr = librosa.load(wav_path, sr=16000)
- #     inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
- #     with torch.no_grad():
- #         logits = model(**inputs).logits
- #     pred_ids = torch.argmax(logits, dim=-1)
- #     return processor.batch_decode(pred_ids)[0]
-
- # # ----------- Gemini Analysis -----------
- # def analyze_transcript(transcript):
- #     client = genai.Client(api_key=api_key)
-
- #     system_instr = """
- # You are a speech analyst. The following transcription is in Urdu and contains no punctuation — your first task is to correct the transcript by segmenting it into grammatically correct sentences.
-
- # Then:
- # 1. Translate the corrected Urdu transcript into English.
- # 2. Determine whether the transcript involves a single speaker or multiple speakers.
- # 3. If multiple speakers are detected, perform diarization by segmenting the transcript with clear speaker labels.
-
- # ⚠️ Format the segmented transcript *exactly* like this:
-
- # **Segmented Transcript**
-
- # **Urdu:**
- # Person 01:
- # [Urdu line here]
-
- # Person 02:
- # [Urdu line here]
-
- # ...
-
- # **English:**
- # Person 01:
- # [English line here]
-
- # Person 02:
- # [English line here]
-
- # ...
-
- # After that, provide your analysis in the following format:
-
- # **Speaker-wise Analysis**
- # [One or two sentences per speaker about tone, emotion, behavior]
-
- # **Sentiment and Communication Style**
- # [Concise overall tone: e.g., friendly, formal, tense, etc.]
-
- # **Summary of Discussion**
- # [A 2–3 line summary of what the speakers talked about, in English]
- #     """
-
- #     response = client.models.generate_content(
- #         model="gemini-2.5-flash",
- #         contents=[transcript],
- #         config=types.GenerateContentConfig(
- #             system_instruction=system_instr,
- #             temperature=0.0
- #         )
- #     )
- #     return response.text
-
- # # ----------- Format Display Helper -----------
- # def format_transcript_block(text: str) -> str:
- #     lines = text.split("Person ")
- #     formatted = ""
- #     for line in lines:
- #         line = line.strip()
- #         if not line:
- #             continue
- #         if line.startswith("01:") or line.startswith("02:"):
- #             formatted += f"\n**Person {line[:2]}**:\n{line[3:].strip()}\n\n"
- #         else:
- #             formatted += f"{line.strip()}\n\n"
- #     return formatted
-
- # # ----------- Streamlit UI -----------
- # # Styled Header
- # st.markdown("""
- # <div style="text-align: left; padding-bottom: 1rem;">
- #   <h1 style='color:#1f77b4; font-size: 2.5em; font-weight: 800; margin-bottom: 0.2em;'>
- #     🎙️ Urdu Audio & Video Speech Analyzer
- #   </h1>
- #   <p style='color: #CCCCCC; font-size: 1.05em; margin-top: 0;'>
- #     Upload Urdu audio or video to get structured transcription, speaker diarization, and smart AI analysis.
- #   </p>
- # </div>
- # """, unsafe_allow_html=True)
-
- # # File Upload
- # st.markdown("### 📂 Upload an audio or video file")
- # with st.container():
- #     uploaded_file = st.file_uploader(
- #         label="",
- #         type=["mp3", "mp4", "wav", "mkv", "aac", "ogg", "m4a", "flac"],
- #         label_visibility="collapsed"
- #     )
-
- # if uploaded_file is not None:
- #     with st.spinner("⏳ Transcribing..."):
- #         file_name = uploaded_file.name
- #         temp_path = f"temp_input{os.path.splitext(file_name)[-1]}"
- #         with open(temp_path, "wb") as f:
- #             f.write(uploaded_file.read())
- #         transcript = transcribe(temp_path)
-
- #     st.markdown("### 📝 Raw Urdu Transcription")
- #     st.text(transcript)
-
- #     with st.spinner("🔍 Analyzing with Gemini..."):
- #         report = analyze_transcript(transcript)
-
- #     # Extract Segmented Urdu and English
- #     segmented_urdu = ""
- #     segmented_english = ""
- #     analysis_only = ""
-
- #     if "Urdu:" in report and "English:" in report:
- #         urdu_start = report.find("Urdu:")
- #         english_start = report.find("English:")
- #         segmented_urdu = report[urdu_start + len("Urdu:"):english_start].strip()
-
- #         english_section = report[english_start + len("English:"):].strip()
- #         if "**Speaker-wise Analysis**" in english_section:
- #             parts = english_section.split("**Speaker-wise Analysis**")
- #             segmented_english = parts[0].strip()
- #             analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
- #         else:
- #             segmented_english = english_section.strip()
- #             analysis_only = "⚠️ Could not extract structured analysis."
-
- #     # Show Segmented Transcript
- #     if segmented_urdu and segmented_english:
- #         st.markdown("### 🗣️ Segmented Transcript")
- #         col1, col2 = st.columns(2)
-
- #         with col1:
- #             st.markdown("#### Urdu")
- #             st.markdown(format_transcript_block(segmented_urdu))
-
- #         with col2:
- #             st.markdown("#### English")
- #             st.markdown(format_transcript_block(segmented_english))
-
- #     # Show Gemini Analysis Only (No transcript repeat)
- #     if analysis_only:
- #         st.markdown("### 🧠 Gemini Analysis Summary")
- #         st.markdown(analysis_only)
- # app.py
-
- # api_key = "AIzaSyBEWWn32PxVEaUsoe67GJOEpF4FQT87Kxo"
-
- import io, os, numpy as np, streamlit as st, librosa, torch, soundfile as sf
- from transformers import AutoProcessor, Wav2Vec2ForCTC
- from pydub import AudioSegment
- from moviepy.editor import VideoFileClip
- from google import genai
- from google.genai import types
-
- # ✅ programmatic Start/Stop mic (no WebRTC)
- from streamlit_mic_recorder import mic_recorder
-
- # ---------------- Config ----------------
- st.set_page_config(page_title="Urdu Speech Analyzer", page_icon="🎙️", layout="wide")
- PAGE_TITLE = "🎙️ Urdu Audio & Video Speech Analyzer"
- model_id = "facebook/mms-1b-l1107"
- lang_code = "urd-script_arabic"
- api_key = "AIzaSyBEWWn32PxVEaUsoe67GJOEpF4FQT87Kxo" # hard-coded as requested
-
- # ---------------- Model ----------------
- @st.cache_resource
- def load_model_and_processor():
-     processor = AutoProcessor.from_pretrained(model_id, target_lang=lang_code)
-     model = Wav2Vec2ForCTC.from_pretrained(
-         model_id, target_lang=lang_code, ignore_mismatched_sizes=True
-     )
-     model.load_adapter(lang_code)
-     return processor, model
-
- processor, model = load_model_and_processor()
-
- # ---------------- Helpers ----------------
- def get_wav_from_input(file_path, output_path="converted.wav"):
-     ext = os.path.splitext(file_path)[-1].lower()
-     if ext in [".mp4", ".mkv", ".avi", ".mov"]:
-         video = VideoFileClip(file_path)
-         video.audio.write_audiofile(output_path, fps=16000)
-     elif ext in [".mp3", ".aac", ".flac", ".ogg", ".m4a"]:
-         audio = AudioSegment.from_file(file_path)
-         audio = audio.set_frame_rate(16000).set_channels(1)
-         audio.export(output_path, format="wav")
-     elif ext == ".wav":
-         audio = AudioSegment.from_wav(file_path)
-         audio = audio.set_frame_rate(16000).set_channels(1)
-         audio.export(output_path, format="wav")
-     else:
-         raise ValueError("Unsupported file format.")
-     return output_path
-
- def save_wav_resampled(audio_f32: np.ndarray, sr_in: int, path: str):
-     if sr_in != 16000:
-         audio_f32 = librosa.resample(audio_f32, orig_sr=sr_in, target_sr=16000)
-     audio_f32 = librosa.util.normalize(audio_f32)
-     sf.write(path, audio_f32.astype(np.float32), 16000)
-
- def transcribe(wav_path) -> str:
-     audio, sr = librosa.load(wav_path, sr=16000, mono=True)
-     inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
-     with torch.no_grad():
-         logits = model(**inputs).logits
-     pred_ids = torch.argmax(logits, dim=-1)
-     return processor.batch_decode(pred_ids)[0]
-
- def analyze_transcript(transcript: str) -> str:
-     client = genai.Client(api_key=api_key)
-     system_instr = """
- You are a speech analyst. The following transcription is in Urdu and contains no punctuation — your first task is to correct the transcript by segmenting it into grammatically correct sentences.
-
- Then:
- 1. Translate the corrected Urdu transcript into English.
- 2. Determine whether the transcript involves a single speaker or multiple speakers.
- 3. If multiple speakers are detected, perform diarization by segmenting the transcript with clear speaker labels.
-
- ⚠️ Format the segmented transcript *exactly* like this:
-
- **Segmented Transcript**
-
- **Urdu:**
- Person 01:
- [Urdu line here]
-
- Person 02:
- [Urdu line here]
-
- ...
-
- **English:**
- Person 01:
- [English line here]
-
- Person 02:
- [English line here]
-
- ...
-
- After that, provide your analysis in the following format:
-
- **Speaker-wise Analysis**
- [One or two sentences per speaker about tone, emotion, behavior]
-
- **Sentiment and Communication Style**
- [Concise overall tone: e.g., friendly, formal, tense, etc.]
-
- **Summary of Discussion**
- [A 2–3 line summary of what the speakers talked about, in English]
- """
-     resp = client.models.generate_content(
-         model="gemini-2.5-flash",
-         contents=[transcript],
-         config=types.GenerateContentConfig(system_instruction=system_instr, temperature=0.0)
-     )
-     return resp.text
-
- def format_transcript_block(text: str) -> str:
-     lines = text.split("Person ")
-     out = ""
-     for line in lines:
-         line = line.strip()
-         if not line:
-             continue
-         if line.startswith("01:") or line.startswith("02:"):
-             out += f"\n**Person {line[:2]}**:\n{line[3:].strip()}\n\n"
-         else:
-             out += f"{line}\n\n"
-     return out
-
- # ---------------- Header ----------------
- st.markdown(f"""
- <div style="text-align: left; padding-bottom: 1rem;">
-   <h1 style='color:#1f77b4; font-size: 2.5em; font-weight: 800; margin-bottom: 0.2em;'>
-     {PAGE_TITLE}
-   </h1>
-   <p style='color: #7c8a98; font-size: 1.05em; margin-top: 0;'>
-     Record or upload Urdu speech for structured transcription, diarization, and smart AI analysis.
-   </p>
- </div>
- """, unsafe_allow_html=True)
-
- # ================= Mic: true Start/Stop + narrow Analyze =================
- st.markdown("### 🎤 Live recording")
-
- # The component renders **Start** and **Stop** buttons and keeps recording until you press Stop.
- rec = mic_recorder(
-     start_prompt="▶️ Start",
-     stop_prompt="⏹️ Stop",
-     just_once=False,  # allow multiple recordings in a session
-     key="recorder",
-     format="wav"      # returns WAV bytes
- )
-
- # `rec` returns after Stop. Different versions return bytes or a dict — handle both.
- audio_bytes, sr_in = None, 44100
- if rec is not None:
-     if isinstance(rec, dict) and "bytes" in rec:
-         audio_bytes = rec["bytes"]
-         sr_in = int(rec.get("sample_rate", 44100))
-     elif isinstance(rec, (bytes, bytearray)):
-         audio_bytes = rec
-         sr_in = 44100  # component default
-     else:
-         # fallback: try to extract .get("audio") etc if lib changes
-         audio_bytes = rec.get("audio") if isinstance(rec, dict) else None
-
- if audio_bytes:
-     st.success("Audio captured.")
-     # Convert to mono float32
-     data, sr_read = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=False)
-     if data.ndim > 1:
-         data = data.mean(axis=1)
-     if sr_read:  # prefer the rate embedded in the WAV
-         sr_in = sr_read
-
-     # Save as 16 kHz mono for the model
-     tmp_wav = "mic_recording.wav"
-     save_wav_resampled(data, sr_in, tmp_wav)
-
-     # Minimal playback (no waveform)
-     st.audio(audio_bytes, format="audio/wav")
-     st.caption(f"Duration: {data.size / sr_in:.2f} s")
-
-     # Slim Analyze button (not full width)
-     if st.button("🔍 Analyze", type="primary"):
-         with st.spinner("⏳ Transcribing & analyzing..."):
-             transcript = transcribe(tmp_wav)  # raw not displayed
-             report = analyze_transcript(transcript)
-
-         segmented_urdu = segmented_english = analysis_only = ""
-         if "Urdu:" in report and "English:" in report:
-             u0 = report.find("Urdu:")
-             e0 = report.find("English:")
-             segmented_urdu = report[u0 + len("Urdu:"):e0].strip()
-             english_section = report[e0 + len("English:"):].strip()
-             if "**Speaker-wise Analysis**" in english_section:
-                 parts = english_section.split("**Speaker-wise Analysis**")
-                 segmented_english = parts[0].strip()
-                 analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
-             else:
-                 segmented_english = english_section.strip()
-                 analysis_only = "⚠️ Could not extract structured analysis."
-
-         if segmented_urdu or segmented_english:
-             st.markdown("### 🗣️ Segmented Transcript")
-             c1, c2 = st.columns(2)
-             with c1:
-                 st.markdown("#### Urdu")
-                 st.markdown(format_transcript_block(segmented_urdu) if segmented_urdu else "_(none)_")
-             with c2:
-                 st.markdown("#### English")
-                 st.markdown(format_transcript_block(segmented_english) if segmented_english else "_(none)_")
-         if analysis_only:
-             st.markdown("### 🧠 Gemini Analysis Summary")
-             st.markdown(analysis_only)
-
- st.markdown("---")
-
- # ================= Upload (unchanged) =================
- st.markdown("### 📂 Or upload an audio/video file")
- uploaded_file = st.file_uploader(
-     label="",
-     type=["mp3", "mp4", "wav", "mkv", "aac", "ogg", "m4a", "flac"],
-     label_visibility="collapsed"
- )
- if uploaded_file is not None:
-     with st.spinner("⏳ Transcribing..."):
-         file_name = uploaded_file.name
-         temp_path = f"temp_input{os.path.splitext(file_name)[-1]}"
-         with open(temp_path, "wb") as f:
-             f.write(uploaded_file.read())
-         wav_path = get_wav_from_input(temp_path)
-         transcript = transcribe(wav_path)
-
-     with st.spinner("🔍 Analyzing with Gemini..."):
-         report = analyze_transcript(transcript)
-
-     segmented_urdu = segmented_english = analysis_only = ""
-     if "Urdu:" in report and "English:" in report:
-         u0 = report.find("Urdu:")
-         e0 = report.find("English:")
-         segmented_urdu = report[u0 + len("Urdu:"):e0].strip()
-         english_section = report[e0 + len("English:"):].strip()
-         if "**Speaker-wise Analysis**" in english_section:
-             parts = english_section.split("**Speaker-wise Analysis**")
-             segmented_english = parts[0].strip()
-             analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
-         else:
-             segmented_english = english_section.strip()
-             analysis_only = "⚠️ Could not extract structured analysis."
-
-     if segmented_urdu or segmented_english:
-         st.markdown("### 🗣️ Segmented Transcript")
-         c1, c2 = st.columns(2)
-         with c1:
-             st.markdown("#### Urdu")
-             st.markdown(format_transcript_block(segmented_urdu) if segmented_urdu else "_(none)_")
-         with c2:
-             st.markdown("#### English")
-             st.markdown(format_transcript_block(segmented_english) if segmented_english else "_(none)_")
-     if analysis_only:
-         st.markdown("### 🧠 Gemini Analysis Summary")
-         st.markdown(analysis_only)
 
+ import io, os, numpy as np, streamlit as st, librosa, torch, soundfile as sf
+ from transformers import AutoProcessor, Wav2Vec2ForCTC
+ from pydub import AudioSegment
+ from moviepy.editor import VideoFileClip
+ from google import genai
+ from google.genai import types
+
+ # programmatic Start/Stop mic (no WebRTC)
+ from streamlit_mic_recorder import mic_recorder
+
+ # ---------------- Config ----------------
+ st.set_page_config(page_title="Urdu Speech Analyzer", page_icon="🎙️", layout="wide")
+ PAGE_TITLE = "🎙️ Urdu Audio & Video Speech Analyzer"
+ model_id = "facebook/mms-1b-l1107"
+ lang_code = "urd-script_arabic"
+ api_key = "AIzaSyBEWWn32PxVEaUsoe67GJOEpF4FQT87Kxo" # hard-coded as requested
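+ # NOTE: shipping an API key in source exposes it to anyone with repo access;
+ # Streamlit's st.secrets (e.g. a hypothetical GEMINI_API_KEY entry in
+ # .streamlit/secrets.toml) is the usual production alternative.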
+
+ # ---------------- Model ----------------
+ @st.cache_resource
+ def load_model_and_processor():
+     processor = AutoProcessor.from_pretrained(model_id, target_lang=lang_code)
+     model = Wav2Vec2ForCTC.from_pretrained(
+         model_id, target_lang=lang_code, ignore_mismatched_sizes=True
+     )
+     model.load_adapter(lang_code)
+     return processor, model
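+ # Note: ignore_mismatched_sizes=True is needed because the CTC head is resized
+ # to the target language's vocabulary; load_adapter() then swaps the
+ # urd-script_arabic adapter weights onto the base MMS checkpoint.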
+
+ processor, model = load_model_and_processor()
+
+ # ---------------- Helpers ----------------
+ def get_wav_from_input(file_path, output_path="converted.wav"):
+     ext = os.path.splitext(file_path)[-1].lower()
+     if ext in [".mp4", ".mkv", ".avi", ".mov"]:
+         video = VideoFileClip(file_path)
+         video.audio.write_audiofile(output_path, fps=16000)
+     elif ext in [".mp3", ".aac", ".flac", ".ogg", ".m4a"]:
+         audio = AudioSegment.from_file(file_path)
+         audio = audio.set_frame_rate(16000).set_channels(1)
+         audio.export(output_path, format="wav")
+     elif ext == ".wav":
+         audio = AudioSegment.from_wav(file_path)
+         audio = audio.set_frame_rate(16000).set_channels(1)
+         audio.export(output_path, format="wav")
+     else:
+         raise ValueError("Unsupported file format.")
+     return output_path
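+ # Note: pydub and moviepy both rely on an ffmpeg binary being available on PATH
+ # for the compressed-audio and video branches above.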
+
+ def save_wav_resampled(audio_f32: np.ndarray, sr_in: int, path: str):
+     if sr_in != 16000:
+         audio_f32 = librosa.resample(audio_f32, orig_sr=sr_in, target_sr=16000)
+     audio_f32 = librosa.util.normalize(audio_f32)
+     sf.write(path, audio_f32.astype(np.float32), 16000)
+
+ def transcribe(wav_path) -> str:
+     audio, sr = librosa.load(wav_path, sr=16000, mono=True)
+     inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         logits = model(**inputs).logits
+     pred_ids = torch.argmax(logits, dim=-1)
+     return processor.batch_decode(pred_ids)[0]
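+ # Note: this is greedy CTC decoding (per-frame argmax; the processor collapses
+ # repeats and blanks). MMS expects 16 kHz mono input, which
+ # librosa.load(sr=16000, mono=True) guarantees here.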
+
+ def analyze_transcript(transcript: str) -> str:
+     client = genai.Client(api_key=api_key)
+     system_instr = """
+ You are a speech analyst. The following transcription is in Urdu and contains no punctuation — your first task is to correct the transcript by segmenting it into grammatically correct sentences.
+
+ Then:
+ 1. Translate the corrected Urdu transcript into English.
+ 2. Determine whether the transcript involves a single speaker or multiple speakers.
+ 3. If multiple speakers are detected, perform diarization by segmenting the transcript with clear speaker labels.
+
+ ⚠️ Format the segmented transcript *exactly* like this:
+
+ **Segmented Transcript**
+
+ **Urdu:**
+ Person 01:
+ [Urdu line here]
+
+ Person 02:
+ [Urdu line here]
+
+ ...
+
+ **English:**
+ Person 01:
+ [English line here]
+
+ Person 02:
+ [English line here]
+
+ ...
+
+ After that, provide your analysis in the following format:
+
+ **Speaker-wise Analysis**
+ [One or two sentences per speaker about tone, emotion, behavior]
+
+ **Sentiment and Communication Style**
+ [Concise overall tone: e.g., friendly, formal, tense, etc.]
+
+ **Summary of Discussion**
+ [A 2–3 line summary of what the speakers talked about, in English]
+ """
+     resp = client.models.generate_content(
+         model="gemini-2.5-flash",
+         contents=[transcript],
+         config=types.GenerateContentConfig(system_instruction=system_instr, temperature=0.0)
+     )
+     return resp.text
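+ # Note: the UI below parses this report by searching for the literal "Urdu:" and
+ # "English:" markers requested in the prompt; temperature=0.0 keeps the layout
+ # reasonably stable, and the fallback branches below guard against drift.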
+
+ def format_transcript_block(text: str) -> str:
+     lines = text.split("Person ")
+     out = ""
+     for line in lines:
+         line = line.strip()
+         if not line:
+             continue
+         if line.startswith("01:") or line.startswith("02:"):
+             out += f"\n**Person {line[:2]}**:\n{line[3:].strip()}\n\n"
+         else:
+             out += f"{line}\n\n"
+     return out
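+ # Note: assumes at most two speakers ("Person 01"/"Person 02"); any other label
+ # falls through to the plain-text else branch.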
+
+ # ---------------- Header ----------------
+ st.markdown(f"""
+ <div style="text-align: left; padding-bottom: 1rem;">
+   <h1 style='color:#1f77b4; font-size: 2.5em; font-weight: 800; margin-bottom: 0.2em;'>
+     {PAGE_TITLE}
+   </h1>
+   <p style='color: #7c8a98; font-size: 1.05em; margin-top: 0;'>
+     Record or upload Urdu speech for structured transcription, diarization, and smart AI analysis.
+   </p>
+ </div>
+ """, unsafe_allow_html=True)
+
+ # ================= Mic: true Start/Stop + narrow Analyze =================
+ st.markdown("### 🎤 Live recording")
+
+ # The component renders **Start** and **Stop** buttons and keeps recording until you press Stop.
+ rec = mic_recorder(
+     start_prompt="▶️ Start",
+     stop_prompt="⏹️ Stop",
+     just_once=False,  # allow multiple recordings in a session
+     key="recorder",
+     format="wav"      # returns WAV bytes
+ )
+
+ # `rec` is delivered after Stop. Different versions return bytes or a dict — handle both.
+ audio_bytes, sr_in = None, 44100
+ if rec is not None:
+     if isinstance(rec, dict) and "bytes" in rec:
+         audio_bytes = rec["bytes"]
+         sr_in = int(rec.get("sample_rate", 44100))
+     elif isinstance(rec, (bytes, bytearray)):
+         audio_bytes = rec
+         sr_in = 44100  # component default
+     else:
+         # fallback: try .get("audio") etc. in case the lib changes its return shape
+         audio_bytes = rec.get("audio") if isinstance(rec, dict) else None
+
+ if audio_bytes:
+     st.success("Audio captured.")
+     # Convert to mono float32
+     data, sr_read = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=False)
+     if data.ndim > 1:
+         data = data.mean(axis=1)
+     if sr_read:  # prefer the rate embedded in the WAV
+         sr_in = sr_read
+
+     # Save as 16 kHz mono for the model
+     tmp_wav = "mic_recording.wav"
+     save_wav_resampled(data, sr_in, tmp_wav)
+
+     # Minimal playback (no waveform)
+     st.audio(audio_bytes, format="audio/wav")
+     st.caption(f"Duration: {data.size / sr_in:.2f} s")
+
+     # Slim Analyze button (not full width)
+     if st.button("🔍 Analyze", type="primary"):
+         with st.spinner("⏳ Transcribing & analyzing..."):
+             transcript = transcribe(tmp_wav)  # raw not displayed
+             report = analyze_transcript(transcript)
+
+         segmented_urdu = segmented_english = analysis_only = ""
+         if "Urdu:" in report and "English:" in report:
+             u0 = report.find("Urdu:")
+             e0 = report.find("English:")
+             segmented_urdu = report[u0 + len("Urdu:"):e0].strip()
+             english_section = report[e0 + len("English:"):].strip()
+             if "**Speaker-wise Analysis**" in english_section:
+                 parts = english_section.split("**Speaker-wise Analysis**")
+                 segmented_english = parts[0].strip()
+                 analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
+             else:
+                 segmented_english = english_section.strip()
+                 analysis_only = "⚠️ Could not extract structured analysis."
+
+         if segmented_urdu or segmented_english:
+             st.markdown("### 🗣️ Segmented Transcript")
+             c1, c2 = st.columns(2)
+             with c1:
+                 st.markdown("#### Urdu")
+                 st.markdown(format_transcript_block(segmented_urdu) if segmented_urdu else "_(none)_")
+             with c2:
+                 st.markdown("#### English")
+                 st.markdown(format_transcript_block(segmented_english) if segmented_english else "_(none)_")
+         if analysis_only:
+             st.markdown("### 🧠 Gemini Analysis Summary")
+             st.markdown(analysis_only)
+
+ st.markdown("---")
+
+ # ================= Upload (unchanged) =================
+ st.markdown("### 📂 Or upload an audio/video file")
+ uploaded_file = st.file_uploader(
+     label="",
+     type=["mp3", "mp4", "wav", "mkv", "aac", "ogg", "m4a", "flac"],
+     label_visibility="collapsed"
+ )
+ if uploaded_file is not None:
+     with st.spinner("⏳ Transcribing..."):
+         file_name = uploaded_file.name
+         temp_path = f"temp_input{os.path.splitext(file_name)[-1]}"
+         with open(temp_path, "wb") as f:
+             f.write(uploaded_file.read())
+         wav_path = get_wav_from_input(temp_path)
+         transcript = transcribe(wav_path)
+
+     with st.spinner("🔍 Analyzing with Gemini..."):
+         report = analyze_transcript(transcript)
+
+     segmented_urdu = segmented_english = analysis_only = ""
+     if "Urdu:" in report and "English:" in report:
+         u0 = report.find("Urdu:")
+         e0 = report.find("English:")
+         segmented_urdu = report[u0 + len("Urdu:"):e0].strip()
+         english_section = report[e0 + len("English:"):].strip()
+         if "**Speaker-wise Analysis**" in english_section:
+             parts = english_section.split("**Speaker-wise Analysis**")
+             segmented_english = parts[0].strip()
+             analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
+         else:
+             segmented_english = english_section.strip()
+             analysis_only = "⚠️ Could not extract structured analysis."
+
+     if segmented_urdu or segmented_english:
+         st.markdown("### 🗣️ Segmented Transcript")
+         c1, c2 = st.columns(2)
+         with c1:
+             st.markdown("#### Urdu")
+             st.markdown(format_transcript_block(segmented_urdu) if segmented_urdu else "_(none)_")
+         with c2:
+             st.markdown("#### English")
+             st.markdown(format_transcript_block(segmented_english) if segmented_english else "_(none)_")
+     if analysis_only:
+         st.markdown("### 🧠 Gemini Analysis Summary")
+         st.markdown(analysis_only)