Update app.py
app.py
CHANGED
This update defines `model_name` explicitly (with a placeholder repo ID to swap for your own), guards against missing audio input, factors the 16 kHz target rate into a `target_sr` variable, expands the docstring and inline comments, and fills in the interface `description`. The updated file:
# app.py

import gradio as gr
import torch
import numpy as np
import librosa
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification

# 1. Load your fine-tuned model & feature extractor from the Hugging Face Hub or local path
# Replace "YourUsername/YourModelRepo" with the actual repo ID where your fine-tuned model is hosted
model_name = "YourUsername/YourModelRepo"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

model.eval()
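Because the `predicted_id + 3` mapping later in the file assumes the checkpoint was fine-tuned with eight labels (levels 3 through 10), a quick post-load sanity check can catch a mismatched repo early. This check is an addition, not part of the original script:

# Optional sanity check (assumption: the 8-class scheme implied by predicted_id + 3)
assert model.config.num_labels == 8, f"Expected 8 labels, got {model.config.num_labels}"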
def classify_accuracy(audio):
    """
    audio: Gradio provides a tuple (sample_rate, data) when type='numpy'.
    We'll convert to the correct format, run inference, and return the predicted level.
    """
    if audio is None:
        return "No audio provided."

    sample_rate, data = audio

    # Ensure the audio is a NumPy array
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    # Gradio delivers integer PCM (and possibly stereo); librosa and the
    # feature extractor expect mono float audio, so downmix and rescale first
    if data.ndim > 1:
        data = data.mean(axis=1)
    if np.issubdtype(data.dtype, np.integer):
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)

    # Resample if needed (model expects 16kHz)
    target_sr = 16000
    if sample_rate != target_sr:
        data = librosa.resample(data, orig_sr=sample_rate, target_sr=target_sr)
        sample_rate = target_sr

    # Convert to batch of size 1
    inputs = feature_extractor(
        data,
        sampling_rate=sample_rate,
        return_tensors="pt"
    )

    # Run the model without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_id = torch.argmax(logits, dim=-1).item()

    # Map model output (0..7) back to your desired scale (3..10) if needed
    accuracy_level = predicted_id + 3

    return f"Predicted Accuracy Level: {accuracy_level}"
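Before deploying, the handler can be exercised directly with synthetic input; the 440 Hz tone below is illustrative, not part of the original script:

# Illustrative smoke test: one second of a 440 Hz tone as int16 PCM at 16 kHz
t = np.linspace(0, 1, 16000, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
print(classify_accuracy((16000, tone)))  # prints a "Predicted Accuracy Level: ..." string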
# 2. Build Gradio Interface
title = "Speech Accuracy Classifier"
description = (
    "Upload an audio file (or record audio) on the left. "
    "The model will classify the audio's accuracy level on the right."
)

# Gradio Interface: with type="numpy", the audio component passes (sample_rate, data) to fn
demo = gr.Interface(
    fn=classify_accuracy,
    inputs=gr.Audio(source="upload", type="numpy"),  # left side: audio upload
    outputs="text",                                  # right side: classification result
    title=title,
    description=description,
    allow_flagging="never"  # disable user flagging if you prefer
)
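One compatibility caveat: `gr.Audio(source="upload")` follows the Gradio 3.x API. On Gradio 4.x the audio component takes `sources` as a list instead, so a rough equivalent under that version (an assumption about the installed Gradio, not part of the original) would be:

# Gradio 4.x sketch: 'source' was renamed 'sources' and takes a list
demo = gr.Interface(
    fn=classify_accuracy,
    inputs=gr.Audio(sources=["upload"], type="numpy"),
    outputs="text",
    title=title,
    description=description,
)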
# 3. Launch Gradio App
if __name__ == "__main__":
    demo.launch()
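If this is deployed as a Hugging Face Space, a requirements.txt roughly like the following would cover the imports; the gradio pin is an assumption chosen to match the 3.x-style `gr.Audio(source=...)` call above:

# requirements.txt (assumed pins, not from the original commit)
gradio==3.50.2
torch
transformers
librosa
numpy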