MINZO4546 committed on
Commit
ed9a034
·
verified ·
1 Parent(s): e13a882

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -174
app.py CHANGED
@@ -1,187 +1,52 @@
1
- # Copyright 2025 Xiaomi Corporation.
2
  import os
3
  import time
4
-
5
  import gradio as gr
6
- import torch
7
- from huggingface_hub import snapshot_download
8
-
9
  from src.mimo_audio.mimo_audio import MimoAudio
10
 
11
-
12
- MODEL_REPO = "XiaomiMiMo/MiMo-V2.5-ASR"
13
- TOKENIZER_REPO = "XiaomiMiMo/MiMo-Audio-Tokenizer"
14
- DOWNLOAD_ROOT = os.environ.get("MIMO_DOWNLOAD_ROOT", "assets/models")
15
-
16
- LANGUAGE_TAGS = {
17
- "Auto": "",
18
- "Chinese": "<chinese>",
19
- "English": "<english>",
20
- }
21
-
22
-
23
- def download_models():
24
- os.makedirs(DOWNLOAD_ROOT, exist_ok=True)
25
- hf_token = os.getenv("HF_TOKEN")
26
-
27
- model_path = os.path.join(DOWNLOAD_ROOT, MODEL_REPO.replace("/", "_"))
28
- tokenizer_path = os.path.join(DOWNLOAD_ROOT, TOKENIZER_REPO.replace("/", "_"))
29
-
30
- print(f"[download] {MODEL_REPO} -> {model_path}")
31
- snapshot_download(repo_id=MODEL_REPO, token=hf_token, local_dir=model_path)
32
-
33
- print(f"[download] {TOKENIZER_REPO} -> {tokenizer_path}")
34
- snapshot_download(repo_id=TOKENIZER_REPO, token=hf_token, local_dir=tokenizer_path)
35
-
36
- return model_path, tokenizer_path
37
-
38
-
39
- class ASRGenerator:
40
- def __init__(self, model):
41
- self.model = model
42
-
43
- def transcribe(self, audio_path, audio_tag=""):
44
- return self.model.asr_sft(audio_path, audio_tag=audio_tag)
45
-
46
-
47
- class MiMoV25ASRInterface:
48
  def __init__(self, model_path, tokenizer_path):
49
- device = "cuda" if torch.cuda.is_available() else "cpu"
50
- print(f"[init] device={device}")
51
- print(f"[init] model_path={model_path}")
52
- print(f"[init] tokenizer_path={tokenizer_path}")
53
-
54
  self.model = MimoAudio(model_path, tokenizer_path)
55
- self.asr_generator = ASRGenerator(self.model)
56
- print("[init] model ready")
57
-
58
- def transcribe(self, uploaded_audio, recorded_audio, language_choice):
59
- audio_path = uploaded_audio or recorded_audio
60
- if audio_path is None:
61
- return "", "❌ Error: Please upload an audio file or record from your microphone."
62
 
 
 
63
  audio_tag = LANGUAGE_TAGS.get(language_choice, "")
64
-
65
  try:
66
- print(f"Performing ASR task:")
67
- print(f" Audio: {audio_path}")
68
- print(f" Language: {language_choice} (tag='{audio_tag}')")
69
-
70
  start = time.time()
71
- transcript = self.asr_generator.transcribe(audio_path, audio_tag=audio_tag)
 
 
 
 
 
 
 
72
  elapsed = time.time() - start
73
-
74
- status_msg = (
75
- f"✅ Transcription completed in {elapsed:.2f}s\n"
76
- f"🎵 Input audio: {os.path.basename(audio_path)}\n"
77
- f"🌐 Language tag: {language_choice}"
78
- )
79
- return transcript, status_msg
80
-
81
  except Exception as e:
82
- error_msg = f"❌ Error during transcription: {str(e)}"
83
- print(error_msg)
84
- return "", error_msg
85
-
86
- def create_interface(self):
87
- with gr.Blocks(
88
- title="MiMo-V2.5-ASR Speech Recognition",
89
- theme=gr.themes.Soft(),
90
- fill_height=True,
91
- analytics_enabled=False,
92
- ) as iface:
93
- gr.Markdown("# 🎙️ MiMo-V2.5-ASR: Robust Speech Recognition")
94
- gr.Markdown(
95
- "Upload an audio file **or** record directly from your microphone. "
96
- "Supports Chinese, English, Chinese dialects, code-switch, singing, "
97
- "noisy environments, and multi-speaker scenarios."
98
- )
99
-
100
- with gr.Row():
101
- with gr.Column():
102
- uploaded_audio = gr.Audio(
103
- label="Upload Audio File",
104
- type="filepath",
105
- sources=["upload"],
106
- interactive=True,
107
- )
108
- recorded_audio = gr.Audio(
109
- label="Or Record from Microphone",
110
- type="filepath",
111
- sources=["microphone"],
112
- interactive=True,
113
- )
114
- language_choice = gr.Radio(
115
- label="Language Tag",
116
- choices=list(LANGUAGE_TAGS.keys()),
117
- value="Auto",
118
- info=(
119
- "Auto: automatic language detection (recommended for "
120
- "code-switched speech). Select Chinese or English to "
121
- "bias the model toward that language."
122
- ),
123
- )
124
- transcribe_btn = gr.Button(
125
- "🎧 Transcribe", variant="primary", size="lg"
126
- )
127
-
128
- with gr.Column():
129
- output_text = gr.Textbox(
130
- label="Transcription",
131
- lines=10,
132
- interactive=False,
133
- placeholder="Transcription result will appear here...",
134
- show_copy_button=True,
135
- )
136
- status = gr.Textbox(
137
- label="Status",
138
- lines=4,
139
- interactive=False,
140
- placeholder="Processing status will be shown here...",
141
- )
142
- with gr.Row():
143
- clear_btn = gr.Button("🗑️ Clear", size="sm")
144
-
145
- transcribe_btn.click(
146
- fn=self.transcribe,
147
- inputs=[uploaded_audio, recorded_audio, language_choice],
148
- outputs=[output_text, status],
149
- )
150
-
151
- def clear_all():
152
- return None, None, "Auto", "", ""
153
-
154
- clear_btn.click(
155
- fn=clear_all,
156
- outputs=[
157
- uploaded_audio,
158
- recorded_audio,
159
- language_choice,
160
- output_text,
161
- status,
162
- ],
163
- )
164
-
165
- return iface
166
-
167
-
168
- def main():
169
- print("🚀 Launch MiMo-V2.5-ASR demo...")
170
-
171
- model_path, tokenizer_path = download_models()
172
- interface = MiMoV25ASRInterface(model_path, tokenizer_path)
173
-
174
- iface = interface.create_interface()
175
-
176
- host = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0")
177
- port = int(os.environ.get("GRADIO_SERVER_PORT", "7898"))
178
- print(f"🌐 Launch service - {host}:{port}")
179
- iface.queue(default_concurrency_limit=4, max_size=20).launch(
180
- server_name=host,
181
- server_port=port,
182
- show_api=False,
183
- )
184
-
185
-
186
- if __name__ == "__main__":
187
- main()
 
1
+ # Updated for Inachi-Core (Elephant AI) - Text & Audio Dual Mode
2
  import os
3
  import time
 
4
  import gradio as gr
 
 
 
5
  from src.mimo_audio.mimo_audio import MimoAudio
6
 
7
class InachiProEngine:
    """Dual-mode engine that routes a request to ASR or to text chat.

    One ``MimoAudio`` instance is loaded at construction time and reused for
    every request handled by :meth:`generate`.
    """

    # Maps the UI language choice to the tag forwarded to ``asr_sft``.
    # Kept on the class so ``generate`` does not depend on a module-level
    # constant being defined elsewhere in the file.
    LANGUAGE_TAGS = {
        "Auto": "",
        "Chinese": "<chinese>",
        "English": "<english>",
    }

    def __init__(self, model_path, tokenizer_path):
        """Create the underlying ``MimoAudio`` model.

        Args:
            model_path: Location of the model weights, passed to ``MimoAudio``.
            tokenizer_path: Location of the tokenizer, passed to ``MimoAudio``.
        """
        # Load MiMo-V2.5-Pro
        self.model = MimoAudio(model_path, tokenizer_path)

    def generate(self, text_input, audio_input, language_choice):
        """Process either an audio file or a text prompt.

        Audio takes precedence: when ``audio_input`` is truthy it is sent to
        ``asr_sft``; otherwise ``text_input`` is sent to ``chat``.

        Args:
            text_input: Prompt text; used only when no audio is supplied.
            audio_input: Audio file path, or a falsy value when absent.
            language_choice: Key into ``LANGUAGE_TAGS``; unknown keys fall
                back to the empty tag.

        Returns:
            A ``(result, status)`` tuple. On failure or missing input the
            result is ``""`` and the status starts with ``"❌ Error"``.
        """
        audio_tag = self.LANGUAGE_TAGS.get(language_choice, "")

        # Reject a request with nothing to process instead of sending an
        # empty prompt to the model.
        has_text = bool(text_input and text_input.strip())
        if not audio_input and not has_text:
            return "", "❌ Error: Please provide an audio file or a text prompt."

        try:
            start = time.time()
            if audio_input:
                result = self.model.asr_sft(audio_input, audio_tag=audio_tag)
            else:
                # Text chat: the prompt is passed to the model unchanged.
                # NOTE(review): assumes MimoAudio exposes chat(text) — confirm
                # against the model class; a missing method surfaces below as
                # an error status rather than a crash.
                result = self.model.chat(text_input)

            elapsed = time.time() - start
            return result, f"🚀 Processed in {elapsed:.2f}s"
        except Exception as e:
            # UI boundary: report the failure in the status field.
            return "", f"❌ Error: {str(e)}"
30
+
31
+ # Add a Textbox to the UI
32
def create_dual_interface(engine):
    """Build the Gradio Blocks UI with audio and text inputs side by side.

    Args:
        engine: Object whose ``generate(text, audio, language)`` method is
            bound to the submit button and returns ``(response, status)``.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """
    # NOTE(review): nothing visible in this file calls this function or
    # launches the returned app — confirm an entry point exists.
    blue_theme = gr.themes.Default(primary_hue="blue")

    with gr.Blocks(theme=blue_theme) as iface:
        gr.Markdown("# 🔱 INACHI-CORE | MiMo-V2.5-Pro")

        with gr.Row():
            # Left column: request inputs and the submit button.
            with gr.Column(scale=1):
                audio_box = gr.Audio(
                    label="Audio Input (Optional)",
                    type="filepath",
                )
                prompt_box = gr.Textbox(
                    label="Message / Prompt",
                    placeholder="Type your command here...",
                )
                language_radio = gr.Radio(
                    choices=["Auto", "Chinese", "English"],
                    value="Auto",
                    label="Language Context",
                )
                run_button = gr.Button("Execute Command", variant="primary")

            # Right column: model response and status.
            with gr.Column(scale=1):
                response_box = gr.Textbox(label="Inachi Response", lines=12)
                status_label = gr.Label(label="System Heartbeat")

        # Input order matches generate(text_input, audio_input, language_choice).
        run_button.click(
            fn=engine.generate,
            inputs=[prompt_box, audio_box, language_radio],
            outputs=[response_box, status_label],
        )

    return iface