# ASL Sign Language Translator -- Streamlit app.
#
# Tab 1 streams the local webcam, classifies the detected hand on every frame
# and appends a sign to the sentence once it has been held steady long enough.
# Tab 2 classifies a single uploaded image and shows the top predictions.
#
# Expects "label_encoder.pkl" and "asl_model_best.pth" in the working directory.

import pickle
from collections import Counter, deque

import cv2
import mediapipe as mp
import numpy as np
import PIL.Image
import streamlit as st
import torch
import torch.nn as nn

st.set_page_config(page_title="ASL Translator", page_icon="🤟", layout="wide")
st.title("🤟 ASL Sign Language Translator")
st.markdown("Show your hand to the camera — hold a sign steady to add it to the sentence.")


# -- Load model ---------------------------------------------------------------
class ASLClassifier(nn.Module):
    """Fully connected classifier over a flat hand-landmark feature vector.

    Args:
        input_dim: Length of the input feature vector (default 63).
        num_classes: Number of output logits (default 28).
    """

    def __init__(self, input_dim=63, num_classes=28):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes),
        )

    def forward(self, x):
        """Return raw class logits for the batch ``x``."""
        return self.net(x)


@st.cache_resource
def load_model():
    """Load the label encoder and the trained classifier.

    Returns:
        tuple: ``(model, le)`` -- the classifier switched to eval mode with the
        weights from ``asl_model_best.pth`` loaded on CPU, and the object
        unpickled from ``label_encoder.pkl``.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only deploy a label_encoder.pkl that you produced yourself.
    with open("label_encoder.pkl", "rb") as f:
        encoder = pickle.load(f)

    classifier = ASLClassifier(num_classes=len(encoder.classes_))
    classifier.load_state_dict(
        torch.load("asl_model_best.pth", map_location="cpu", weights_only=True)
    )
    classifier.eval()
    return classifier, encoder


model, le = load_model()


# -- MediaPipe ----------------------------------------------------------------
@st.cache_resource
def load_hands():
    """Create the video-mode MediaPipe hand tracker.

    Returns:
        tuple: ``(hands, mp_hands)`` -- a ``Hands`` instance configured for a
        single hand in non-static (video) mode, and the ``mp.solutions.hands``
        module it came from.
    """
    hands_module = mp.solutions.hands
    tracker = hands_module.Hands(
        static_image_mode=False,
        max_num_hands=1,
        min_detection_confidence=0.7,
        min_tracking_confidence=0.7,
    )
    return tracker, hands_module


hands, mp_hands = load_hands()
mp_drawing = mp.solutions.drawing_utils


# -- Shared helpers -----------------------------------------------------------
def _landmarks_to_features(landmarks):
    """Flatten landmarks into (x, y, z) offsets relative to landmark 0."""
    origin = landmarks[0]
    features = []
    for point in landmarks:
        features.extend(
            (point.x - origin.x, point.y - origin.y, point.z - origin.z)
        )
    return features


def _predict_probabilities(landmarks):
    """Run the classifier on one hand; return softmax probabilities (1 x C)."""
    x_tensor = torch.tensor(
        [_landmarks_to_features(landmarks)], dtype=torch.float32
    )
    with torch.no_grad():
        logits = model(x_tensor)
        return torch.softmax(logits, dim=1)


def _apply_confirmed_sign(sentence, sign):
    """Return ``sentence`` after applying a confirmed sign (del/space/letter)."""
    if sign == "del":
        return sentence[:-1]
    if sign == "space":
        return sentence + " "
    return sentence + sign


# -- Sidebar ------------------------------------------------------------------
st.sidebar.header("Settings")
hold_frames = st.sidebar.slider("Hold frames to confirm", 10, 40, 20)
min_confidence = st.sidebar.slider("Min confidence", 0.5, 1.0, 0.75)
st.sidebar.markdown("---")
st.sidebar.markdown("**How to use:**")
st.sidebar.markdown("- Hold a sign steady → letter added")
st.sidebar.markdown("- Sign `del` → delete last letter")
st.sidebar.markdown("- Sign `space` → add space")
st.sidebar.markdown("- Click **Clear** to reset sentence")

# -- Session state ------------------------------------------------------------
for key, val in [("sentence", ""), ("last_letter", ""), ("hold_count", 0)]:
    if key not in st.session_state:
        st.session_state[key] = val

# Sliding window of recent confident predictions, used for majority smoothing.
pred_buffer = deque(maxlen=7)

# =============================================================================
# TAB 1: Live webcam        TAB 2: Upload image
# =============================================================================
tab1, tab2 = st.tabs(["Live webcam", "Upload image"])

# -- TAB 1: Webcam ------------------------------------------------------------
with tab1:
    col_cam, col_info = st.columns([2, 1])

    with col_cam:
        run = st.checkbox("Start camera", value=False)
        FRAME_WINDOW = st.image([])

    with col_info:
        st.markdown("### Current sign")
        sign_display = st.empty()
        conf_display = st.empty()
        st.markdown("### Sentence")
        sentence_display = st.empty()
        if st.button("Clear sentence"):
            st.session_state.sentence = ""
            st.session_state.last_letter = ""
            st.session_state.hold_count = 0
        st.markdown("---")
        st.markdown("### Model info")
        st.success("99.22% accuracy")
        st.info("28 classes · A–Z + del + space")
        st.info("63,673 training samples")

    if run:
        cap = cv2.VideoCapture(0)
        # try/finally: the loop below only ends on a read failure or when the
        # script is interrupted, so the camera must be released on every exit
        # path, not just after a normal break.
        try:
            cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

            # `run` cannot change inside this script run; the loop ends via
            # `break` or when the script itself is stopped.
            while True:
                ret, frame = cap.read()
                if not ret:
                    st.error("Camera not found.")
                    break

                frame = cv2.flip(frame, 1)  # mirror the image horizontally
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                result = hands.process(rgb)

                smoothed = ""
                confidence = 0.0

                if result.multi_hand_landmarks:
                    hand = result.multi_hand_landmarks[0]
                    probs = _predict_probabilities(hand.landmark)
                    conf, pred = probs.max(dim=1)
                    label = le.inverse_transform(pred.numpy())[0]
                    confidence = conf.item()

                    # NOTE(review): SOURCE indentation is lost; the hold logic
                    # is placed inside the confidence check so low-confidence
                    # frames neither advance nor reset the hold -- confirm.
                    if confidence >= min_confidence:
                        pred_buffer.append(label)
                        # Majority vote; ties go to the label seen first in the
                        # window (deterministic, unlike max() over a set).
                        smoothed = Counter(pred_buffer).most_common(1)[0][0]

                        if smoothed == st.session_state.last_letter:
                            st.session_state.hold_count += 1
                        else:
                            st.session_state.hold_count = 0
                            st.session_state.last_letter = smoothed

                        # >= rather than ==: if the slider is lowered below the
                        # current count, the sign must still be confirmed.
                        if st.session_state.hold_count >= hold_frames:
                            st.session_state.sentence = _apply_confirmed_sign(
                                st.session_state.sentence, smoothed
                            )
                            st.session_state.hold_count = 0

                    mp_drawing.draw_landmarks(
                        frame, hand, mp_hands.HAND_CONNECTIONS
                    )
                    cv2.rectangle(frame, (10, 10), (300, 80), (0, 0, 0), -1)
                    cv2.putText(frame, f"Sign: {smoothed}", (20, 45),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 120), 2)
                    cv2.putText(frame, f"Conf: {confidence:.2f}", (20, 70),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (180, 180, 180), 1)

                    # Hold-progress bar, clamped so it never overflows its track.
                    hold_ratio = min(st.session_state.hold_count / hold_frames, 1.0)
                    bar_w = int(hold_ratio * 250)
                    cv2.rectangle(frame, (10, 88), (260, 102), (50, 50, 50), -1)
                    cv2.rectangle(frame, (10, 88), (10 + bar_w, 102), (0, 255, 120), -1)
                else:
                    cv2.putText(frame, "No hand detected", (20, 45),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (100, 100, 255), 2)

                # Sentence strip along the bottom edge of the frame.
                h, w = frame.shape[:2]
                cv2.rectangle(frame, (0, h - 45), (w, h), (0, 0, 0), -1)
                cv2.putText(frame, f"{st.session_state.sentence or '...'}",
                            (10, h - 12), cv2.FONT_HERSHEY_SIMPLEX, 0.9,
                            (255, 255, 255), 2)

                FRAME_WINDOW.image(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                # NOTE(review): only the text of these snippets is visible in
                # SOURCE; any HTML wrapper that unsafe_allow_html=True was meant
                # for could not be recovered -- restore it if it exists.
                sign_display.markdown(
                    f"\n\n{smoothed or '—'}\n\n",
                    unsafe_allow_html=True,
                )
                conf_display.markdown(
                    f"\n\nConfidence: {confidence:.2%}\n\n",
                    unsafe_allow_html=True,
                )
                sentence_display.markdown(
                    f"\n"
                    f"{st.session_state.sentence or '...'}\n",
                    unsafe_allow_html=True,
                )
        finally:
            cap.release()

# -- TAB 2: Image upload ------------------------------------------------------
with tab2:
    st.markdown("### Test with an image")
    st.markdown("Upload a photo of a hand making an ASL sign — works great for testing on Hugging Face.")

    uploaded = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])

    if uploaded is not None:
        img_pil = PIL.Image.open(uploaded).convert("RGB")
        img_rgb = np.array(img_pil)
        img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

        # Use static mode for a single image; the context manager closes the
        # detector even if process() raises.
        with mp.solutions.hands.Hands(
            static_image_mode=True,
            max_num_hands=1,
            min_detection_confidence=0.5,
        ) as hands_static:
            result = hands_static.process(img_rgb)

        col_img, col_result = st.columns([1, 1])

        with col_img:
            if result.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    img_bgr,
                    result.multi_hand_landmarks[0],
                    mp_hands.HAND_CONNECTIONS,
                )
            st.image(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB),
                     caption="Uploaded image", use_container_width=True)

        with col_result:
            if result.multi_hand_landmarks:
                probs = _predict_probabilities(
                    result.multi_hand_landmarks[0].landmark
                )
                conf, pred = probs.max(dim=1)
                label = le.inverse_transform(pred.numpy())[0]
                confidence = conf.item()

                # Top 3 predictions (fewer if the encoder has under 3 classes).
                top_k = min(3, probs.shape[1])
                top3_conf, top3_idx = probs[0].topk(top_k)
                top3_labels = le.inverse_transform(top3_idx.numpy())

                st.markdown("### Prediction")
                # NOTE(review): HTML wrapper not visible in SOURCE; see the
                # matching note in the webcam tab.
                st.markdown(
                    f"\n\n{label}\n\n",
                    unsafe_allow_html=True,
                )
                st.markdown(f"**Confidence:** {confidence:.2%}")
                st.markdown("---")
                st.markdown("**Top 3 predictions:**")
                for lbl, cf in zip(top3_labels, top3_conf):
                    score = float(cf)
                    # st.progress only accepts floats in [0.0, 1.0].
                    st.progress(min(max(score, 0.0), 1.0),
                                text=f"{lbl} — {score:.2%}")
            else:
                st.warning("No hand detected. Try a clearer image with better lighting.")
                st.markdown("**Tips:**")
                st.markdown("- Make sure your hand is clearly visible")
                st.markdown("- Good lighting helps a lot")
                st.markdown("- Try a plain background")