import pickle
from collections import deque

import streamlit as st
import cv2
import torch
import torch.nn as nn
import numpy as np
import mediapipe as mp
import PIL.Image

# NOTE(review): the non-ASCII characters in the UI strings of this section
# (page icon, title prefix, dash/arrow separators) look mis-encoded (mojibake).
# They are reproduced unchanged; restore them from the original UTF-8 source.
st.set_page_config(page_title="ASL Translator", page_icon="๐ค", layout="wide")
st.title("๐ค ASL Sign Language Translator")
st.markdown("Show your hand to the camera โ hold a sign steady to add it to the sentence.")


# -- Load model ---------------------------------------------------------------
class ASLClassifier(nn.Module):
    """Small fully connected classifier that maps a feature vector to class logits.

    Defined at module level (rather than inside ``load_model``) so it can be
    reused and tested without going through the cached loader.

    Args:
        input_dim: Length of the input feature vector. The default of 63 is
            presumably 21 hand landmarks x 3 coordinates -- TODO confirm.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=63, num_classes=28):
        super().__init__()
        # The attribute must stay named ``net``: the keys of the saved
        # state dict are derived from it and are matched on load.
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes),
        )

    def forward(self, x):
        """Return the raw (pre-softmax) logits for ``x``."""
        return self.net(x)


@st.cache_resource
def load_model():
    """Load the label encoder and the trained classifier weights from disk.

    Reads ``label_encoder.pkl`` and ``asl_model_best.pth`` from the current
    working directory. The output layer is sized from the encoder's
    ``classes_`` so the network matches the labels it was trained on.

    Returns:
        tuple: ``(model, le)`` -- the classifier switched to eval mode and the
        unpickled label encoder.

    Raises:
        FileNotFoundError: If either artefact file is missing.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only ever ship a label_encoder.pkl that comes from a trusted source.
    with open("label_encoder.pkl", "rb") as f:
        le = pickle.load(f)

    model = ASLClassifier(num_classes=len(le.classes_))
    # weights_only=True restricts unpickling to tensor data for the checkpoint.
    state_dict = torch.load("asl_model_best.pth", map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    # Eval mode disables dropout and makes batch norm use its running statistics.
    model.eval()
    return model, le


model, le = load_model()


# -- MediaPipe ----------------------------------------------------------------
@st.cache_resource
def load_hands():
    """Build the MediaPipe hand detector once and cache it.

    Returns:
        tuple: ``(hands, mp_hands)`` -- the configured ``Hands`` instance and
        the ``mp.solutions.hands`` module it was created from.
    """
    mp_hands = mp.solutions.hands
    detector = mp_hands.Hands(
        static_image_mode=False,
        max_num_hands=1,
        min_detection_confidence=0.7,
        min_tracking_confidence=0.7,
    )
    return detector, mp_hands


hands, mp_hands = load_hands()
mp_drawing = mp.solutions.drawing_utils

# -- Sidebar ------------------------------------------------------------------
st.sidebar.header("Settings")
hold_frames = st.sidebar.slider("Hold frames to confirm", 10, 40, 20)
min_confidence = st.sidebar.slider("Min confidence", 0.5, 1.0, 0.75)
st.sidebar.markdown("---")
st.sidebar.markdown("**How to use:**")
st.sidebar.markdown("- Hold a sign steady โ letter added")
st.sidebar.markdown("- Sign `del` โ delete last letter")
st.sidebar.markdown("- Sign `space` โ add space")
st.sidebar.markdown("- Click **Clear** to reset sentence")

# -- Session state
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ for key, val in [("sentence", ""), ("last_letter", ""), ("hold_count", 0)]: if key not in st.session_state: st.session_state[key] = val pred_buffer = deque(maxlen=7) # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ # TAB 1: Live webcam TAB 2: Upload image # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ tab1, tab2 = st.tabs(["Live webcam", "Upload image"]) # โโ TAB 1: Webcam โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ with tab1: col_cam, col_info = st.columns([2, 1]) with col_cam: run = st.checkbox("Start camera", value=False) FRAME_WINDOW = st.image([]) with col_info: st.markdown("### Current sign") sign_display = st.empty() conf_display = st.empty() st.markdown("### Sentence") sentence_display = st.empty() if st.button("Clear sentence"): st.session_state.sentence = "" st.session_state.last_letter = "" st.session_state.hold_count = 0 st.markdown("---") st.markdown("### Model info") st.success("99.22% accuracy") st.info("28 classes ยท AโZ + del + space") st.info("63,673 training samples") cap = None if run: cap = cv2.VideoCapture(0) cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640) cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480) while run: ret, frame = cap.read() if not ret: st.error("Camera not found.") break frame = cv2.flip(frame, 1) rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) result = hands.process(rgb) smoothed = "" confidence = 0.0 if result.multi_hand_landmarks: lm = result.multi_hand_landmarks[0].landmark wx, wy, wz = lm[0].x, lm[0].y, lm[0].z coords = [] for point in lm: coords.extend([point.x - wx, point.y - wy, point.z - wz]) x_tensor = torch.tensor([coords], dtype=torch.float32) with torch.no_grad(): logits = model(x_tensor) probs = torch.softmax(logits, dim=1) conf, pred = probs.max(dim=1) label = le.inverse_transform(pred.numpy())[0] confidence = conf.item() if confidence >= min_confidence: pred_buffer.append(label) smoothed = max(set(pred_buffer), key=pred_buffer.count) if smoothed == 
st.session_state.last_letter: st.session_state.hold_count += 1 else: st.session_state.hold_count = 0 st.session_state.last_letter = smoothed if st.session_state.hold_count == hold_frames: if smoothed == "del": st.session_state.sentence = st.session_state.sentence[:-1] elif smoothed == "space": st.session_state.sentence += " " else: st.session_state.sentence += smoothed st.session_state.hold_count = 0 mp_drawing.draw_landmarks( frame, result.multi_hand_landmarks[0], mp_hands.HAND_CONNECTIONS ) cv2.rectangle(frame, (10, 10), (300, 80), (0, 0, 0), -1) cv2.putText(frame, f"Sign: {smoothed}", (20, 45), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 120), 2) cv2.putText(frame, f"Conf: {confidence:.2f}", (20, 70), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (180, 180, 180), 1) bar_w = int((st.session_state.hold_count / hold_frames) * 250) cv2.rectangle(frame, (10, 88), (260, 102), (50, 50, 50), -1) cv2.rectangle(frame, (10, 88), (10 + bar_w, 102), (0, 255, 120), -1) else: cv2.putText(frame, "No hand detected", (20, 45), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (100, 100, 255), 2) h, w = frame.shape[:2] cv2.rectangle(frame, (0, h - 45), (w, h), (0, 0, 0), -1) cv2.putText(frame, f"{st.session_state.sentence or '...'}", (10, h - 12), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2) FRAME_WINDOW.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) sign_display.markdown( f"
Confidence: {confidence:.2%}
", unsafe_allow_html=True ) sentence_display.markdown( f"