-
Notifications
You must be signed in to change notification settings - Fork 0
Add Streamlit UI for real-time SER predictions; update README and requirements #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,3 +10,4 @@ tensorflow==2.6.0 | |
| keras==2.6.0 | ||
| sounddevice==0.4.2 | ||
| soundfile==0.10.3.post1 | ||
| streamlit==1.45.1 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| import io | ||
| from pathlib import Path | ||
|
|
||
| import librosa | ||
| import numpy as np | ||
| import soundfile as sf | ||
| import streamlit as st | ||
|
|
||
| try: | ||
| from keras.models import load_model | ||
| except Exception: # fallback for some environments | ||
| from tensorflow.keras.models import load_model | ||
|
|
||
| EMOTIONS = [ | ||
| "Angry", | ||
| "Disgust", | ||
| "Fear", | ||
| "Happy", | ||
| "Neutral", | ||
| "Pleasant Surprise", | ||
| "Sad", | ||
| ] | ||
|
|
||
| MODEL_DEFAULT_PATH = "model/SER_model.h5" | ||
|
|
||
|
|
||
def _decode_with_librosa(audio_bytes: bytes):
    """Decode ``audio_bytes`` via ``librosa.load`` using a temporary file.

    ``librosa.load`` is given a real path (not a file object) so that any
    additional decoder backend available in the environment can be used.
    Returns ``(mono_float_samples, native_sample_rate)``.
    """
    import os
    import tempfile

    fd, tmp_path = tempfile.mkstemp()
    try:
        # Close the handle before decoding; some platforms refuse to reopen
        # a file that is still held open for writing.
        with os.fdopen(fd, "wb") as handle:
            handle.write(audio_bytes)
        # sr=None keeps the native rate; resampling happens in the caller.
        return librosa.load(tmp_path, sr=None, mono=True)
    finally:
        os.unlink(tmp_path)


def extract_mfcc_from_bytes(audio_bytes: bytes) -> np.ndarray:
    """Decode an audio clip and return the model input tensor of shape (1, 40, 1).

    The clip is decoded, down-mixed to mono, resampled to 22050 Hz, and a
    3-second window starting 0.5 s into the clip is taken (zero-padded at the
    end when the clip is shorter). 40 MFCCs are computed on that window and
    averaged over time.

    Args:
        audio_bytes: Raw contents of an audio file.

    Returns:
        Array of shape ``(1, 40, 1)`` holding the time-averaged MFCCs.

    Raises:
        RuntimeError: If the bytes cannot be decoded by ``soundfile`` nor by
            the ``librosa.load`` fallback.
    """
    try:
        audio, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32")
    except RuntimeError as sf_err:
        # soundfile reports unsupported/corrupt input as RuntimeError.
        # Compressed formats (e.g. mp3/m4a) are frequently not decodable by
        # libsndfile, so retry through librosa before giving up.
        try:
            audio, sr = _decode_with_librosa(audio_bytes)
        except Exception as fallback_err:
            raise RuntimeError(
                f"Could not decode audio: {sf_err}; fallback decoder failed: {fallback_err}"
            ) from fallback_err

    # Down-mix multi-channel audio (frames, channels) to mono.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    # Match notebook/GUI preprocessing as closely as possible.
    target_sr = 22050
    if sr != target_sr:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    # Use the same effective slicing strategy as existing notebook code:
    # skip the first 0.5 s, then keep 3.0 s, zero-padding short clips.
    start = int(0.5 * sr)
    end = start + int(3.0 * sr)
    if len(audio) < end:
        pad = end - len(audio)
        audio = np.pad(audio, (0, pad), mode="constant")
    sliced = audio[start:end]

    mfcc = librosa.feature.mfcc(y=sliced, sr=sr, n_mfcc=40)
    # Average each coefficient over all frames -> vector of length 40.
    mfcc_mean = np.mean(mfcc.T, axis=0)

    return mfcc_mean.reshape(1, 40, 1)
|
|
||
|
|
||
@st.cache_resource(show_spinner=False)
def get_model(model_path: str):
    """Load and return the Keras model stored at ``model_path``.

    Decorated with ``st.cache_resource`` (spinner suppressed) so the loaded
    model object can be reused instead of being reloaded on every call with
    the same path.
    """
    model = load_model(model_path)
    return model
|
|
||
|
|
||
def _render_prediction(model_path: str, audio_bytes: bytes) -> None:
    """Run the model on one uploaded clip and render the prediction widgets.

    Shows an error message (and renders nothing else) when the audio cannot
    be decoded or when the model's class count does not match ``EMOTIONS``.
    """
    with st.spinner("Extracting features and running prediction..."):
        model = get_model(model_path)
        try:
            features = extract_mfcc_from_bytes(audio_bytes)
        except RuntimeError as exc:
            # Undecodable uploads (e.g. unsupported compressed formats) are
            # reported by the decoder as RuntimeError; show a readable
            # message instead of an unhandled traceback.
            st.error(
                f"Could not decode the uploaded audio ({exc}). "
                "Try a WAV, FLAC, or OGG file."
            )
            return
        probs = model.predict(features, verbose=0)[0]

    if len(probs) != len(EMOTIONS):
        st.error(
            f"Model output has {len(probs)} classes, but UI expects {len(EMOTIONS)} emotions. "
            "Please align EMOTIONS with your training label order."
        )
        return

    pred_idx = int(np.argmax(probs))
    pred_label = EMOTIONS[pred_idx]
    pred_conf = float(probs[pred_idx])

    st.success(f"Predicted emotion: **{pred_label}** ({pred_conf:.2%})")

    st.subheader("Class probabilities")
    table_rows = [{"emotion": label, "probability": float(p)} for label, p in zip(EMOTIONS, probs)]
    st.dataframe(table_rows, use_container_width=True)
    st.bar_chart({"probability": probs}, x_label="class_index", y_label="probability")


def main() -> None:
    """Render the Streamlit page: model path input, audio upload, prediction.

    The uploader is disabled while the configured model path does not point
    to an existing file or directory. The tips footer is always rendered.
    """
    st.set_page_config(page_title="Speech Emotion Recognition", page_icon="🎙️", layout="centered")

    st.title("🎙️ Speech Emotion Recognition")
    st.caption("Upload a WAV/MP3 audio file and predict emotion using your trained SER model.")

    model_path = st.sidebar.text_input("Model path", value=MODEL_DEFAULT_PATH)
    # Path("") resolves to the current directory and would report as
    # existing, so treat a blank path as "no model" explicitly.
    model_exists = bool(model_path.strip()) and Path(model_path).exists()
    if not model_exists:
        st.warning(f"Model file not found: `{model_path}`. Upload audio is disabled until model is available.")

    uploaded = st.file_uploader(
        "Upload audio",
        type=["wav", "mp3", "ogg", "flac", "m4a"],
        disabled=not model_exists,  # make the widget agree with the warning above
    )

    if uploaded and model_exists:
        audio_bytes = uploaded.read()
        st.audio(audio_bytes)
        _render_prediction(model_path, audio_bytes)

    st.markdown("---")
    st.markdown("**Tips**: Keep audio clear and speech-focused. Very noisy clips may reduce accuracy.")
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The uploader advertises `mp3` and `m4a`, but decoding is done solely through `soundfile.read` in `extract_mfcc_from_bytes` with no fallback or error handling. In environments using the pinned `soundfile==0.10.3.post1`, these compressed formats are often not decodable (especially `m4a`), so selecting such a file can raise a runtime decode error instead of producing a prediction; either restrict allowed types to guaranteed codecs or add a robust fallback decoder path.
Useful? React with 👍 / 👎.