import os
# Restrict this process to GPU index 1. CUDA_VISIBLE_DEVICES must be set before
# any CUDA-initializing library is imported, hence this runs at the very top of
# the module. NOTE(review): hard-coding "1" here assumes a specific host layout
# — confirm this is intentional rather than configurable.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
|
|
| from gradio_client import Client, handle_file |
| from typing import Any, Dict, List, Optional, Tuple, Union |
| import requests |
| import json |
|
|
| |
| _svision_client = None |
|
|
|
|
def _get_svision_client():
    """Return the shared svision client, creating it on first use.

    Connecting to the remote Space is expensive, so the Client instance is
    built once and cached in the module-level ``_svision_client`` variable.
    """
    global _svision_client
    client = _svision_client
    if client is None:
        client = Client("VeuReu/svision")
        _svision_client = client
    return client
|
|
|
|
def extract_scenes(video_path: str, threshold: float = 240, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Invoke the remote /scenes_extraction endpoint of the VeuReu/svision Space.

    Parameters
    ----------
    video_path : str
        Path to the input video file.
    threshold : float, optional
        Scene-change detection threshold; larger values make detection less
        sensitive.
    offset_frames : int, optional
        Number of frames to include before and after each detected scene
        boundary.
    crop_ratio : float, optional
        Border-crop ratio applied before scene detection runs.

    Returns
    -------
    Any
        Whatever the remote /scenes_extraction endpoint returns.
    """
    client = _get_svision_client()
    video_payload = {"video": handle_file(video_path)}
    return client.predict(
        video_file=video_payload,
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction",
    )
|
|
|
|
def keyframes_every_second_extraction(video_path: str):
    """
    Invoke the remote /keyframes_every_second_extraction endpoint of the
    VeuReu/svision Space.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Any
        Whatever the remote /keyframes_every_second_extraction endpoint returns.
    """
    client = _get_svision_client()
    return client.predict(
        video_path={"video": handle_file(video_path)},
        api_name="/keyframes_every_second_extraction",
    )
|
|
|
|
def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Invoke the remote /add_ocr_and_faces endpoint of the VeuReu/svision Space.

    Sends an image plus its metadata and a face collection so the Space can
    run OCR, face detection, and annotation enrichment. The two structured
    arguments are serialized to JSON strings, as the endpoint expects.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.
    informacion_image : Dict[str, Any]
        Image-related metadata.
    face_col : List[Dict[str, Any]]
        Detected faces / face metadata records.

    Returns
    -------
    Dict[str, Any]
        Processed output with OCR results, face-detection data, and annotations.
    """
    client = _get_svision_client()
    return client.predict(
        image=handle_file(imagen_path),
        informacion_image=json.dumps(informacion_image),
        face_col=json.dumps(face_col),
        api_name="/add_ocr_and_faces",
    )
|
|
|
|
def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Invoke the remote /describe_images endpoint of the VeuReu/svision Space.

    Sends a single image (wrapped in the one-element list the endpoint
    expects) and returns the textual description produced for it.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.

    Returns
    -------
    str
        Description generated for the given image.
    """
    image_batch = [{"image": handle_file(imagen_path)}]
    return _get_svision_client().predict(
        images=image_batch,
        api_name="/describe_images",
    )
|
|
|
|
| def _extract_path_from_gradio_file(file_obj) -> Optional[str]: |
| """Extract file path from Gradio file object (can be dict, str, tuple, or other). |
| |
| Gradio Gallery returns different formats depending on version: |
| - List of tuples: [(path, caption), ...] |
| - List of dicts: [{"name": path, "data": None, "is_file": True}, ...] |
| - List of FileData: [FileData(path=..., url=...), ...] |
| - List of paths: [path, ...] |
| """ |
| if file_obj is None: |
| return None |
| |
| |
| if isinstance(file_obj, tuple) and len(file_obj) >= 1: |
| return _extract_path_from_gradio_file(file_obj[0]) |
| |
| |
| if isinstance(file_obj, str): |
| return file_obj |
| |
| |
| if isinstance(file_obj, dict): |
| return file_obj.get("path") or file_obj.get("url") or file_obj.get("name") or file_obj.get("image") |
| |
| |
| if hasattr(file_obj, "path") and file_obj.path: |
| return file_obj.path |
| if hasattr(file_obj, "url") and file_obj.url: |
| return file_obj.url |
| if hasattr(file_obj, "name") and file_obj.name: |
| return file_obj.name |
| |
| |
| return str(file_obj) if file_obj else None |
|
|
|
|
def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
    """
    Invoke the remote /face_image_embedding_casting endpoint to detect faces
    and fetch their embeddings.

    Delegates face processing to the svision Space instead of running
    DeepFace/face_recognition locally.

    Parameters
    ----------
    image_path : str
        Path to the input image file (a video frame).

    Returns
    -------
    List[Dict[str, Any]]
        One dict per detected face with keys 'embedding' (list of floats),
        'face_crop_path' (path string or None), and 'index'.
        An empty list is returned when no faces are found or any error occurs.
    """
    try:
        result = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding_casting"
        )

        print(f"[svision_client] Raw result type: {type(result)}, len: {len(result) if result else 0}")

        # Expect a pair: (gallery of face crops, list of embedding dicts).
        if not result or len(result) < 2:
            return []

        face_crops_raw = result[0] or []
        face_embeddings = result[1] or []

        print(f"[svision_client] face_crops_raw type: {type(face_crops_raw)}, len: {len(face_crops_raw) if isinstance(face_crops_raw, list) else 'N/A'}")
        if face_crops_raw and len(face_crops_raw) > 0:
            print(f"[svision_client] First crop type: {type(face_crops_raw[0])}, value: {str(face_crops_raw[0])[:200]}")

        faces = []
        for i, emb_dict in enumerate(face_embeddings):
            # Pair each embedding with its crop image when one exists.
            crop_path = None
            if i < len(face_crops_raw):
                raw_crop = face_crops_raw[i]
                crop_path = _extract_path_from_gradio_file(raw_crop)
                if not crop_path:
                    print(f"[svision_client] Could not extract path from crop {i}: {type(raw_crop)} = {str(raw_crop)[:100]}")

            is_record = isinstance(emb_dict, dict)
            faces.append({
                "embedding": emb_dict.get("embedding", []) if is_record else [],
                "face_crop_path": crop_path,
                "index": emb_dict.get("index", i) if is_record else i,
            })

        print(f"[svision_client] Detected {len(faces)} faces from image")
        return faces
    except Exception as e:
        # Best-effort: any remote/parsing failure degrades to "no faces".
        print(f"[svision_client] get_face_embeddings_from_image error: {e}")
        import traceback
        traceback.print_exc()
        return []
|
|
|
|
def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
    """
    Invoke the remote /face_image_embedding endpoint to obtain face
    embeddings only (no crops).

    Parameters
    ----------
    image_path : str
        Path to the input image file.

    Returns
    -------
    List[List[float]]
        One embedding vector per detected face; empty list on error or when
        the endpoint returns nothing.
    """
    try:
        embeddings = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding"
        )
    except Exception as e:
        # Best-effort: a remote failure degrades to "no embeddings".
        print(f"[svision_client] get_face_embeddings_simple error: {e}")
        return []
    return embeddings if embeddings else []
|
|