Upload feature extractor
Browse files
- feature_extraction_avhubert.py +68 -2
- preprocessor_config.json +1 -3
feature_extraction_avhubert.py
CHANGED
|
@@ -1,16 +1,28 @@
|
|
|
|
|
|
|
|
| 1 |
import cv2
|
| 2 |
import librosa
|
| 3 |
import mediapipe as mp
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
import torch
|
| 6 |
import torch.nn.functional as F
|
| 7 |
import torchvision.transforms.v2 as transforms
|
| 8 |
from numpy.typing import NDArray
|
|
|
|
| 9 |
from python_speech_features import logfbank
|
| 10 |
from transformers import FeatureExtractionMixin
|
| 11 |
from transformers.feature_extraction_utils import BatchFeature
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class AVHubertFeatureExtractor(FeatureExtractionMixin):
|
|
@@ -72,13 +84,67 @@ class AVHubertFeatureExtractor(FeatureExtractionMixin):
|
|
| 72 |
frames_np = np.stack([cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in frames_np], axis=0)
|
| 73 |
|
| 74 |
if extract_mouth:
|
| 75 |
-
frames_np = self._extract_mouth(frames_np)
|
| 76 |
|
| 77 |
return torch.from_numpy(frames_np).unsqueeze(dim=1)
|
| 78 |
|
| 79 |
def _extract_mouth(self, frames: NDArray[np.uint8]) -> NDArray[np.uint8]:
|
| 80 |
mouth_frames = []
|
| 81 |
top_idx, right_idx, bottom_idx, left_idx = self.landmark_indices
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
with mp_face_mesh.FaceMesh(
|
| 83 |
static_image_mode=self.static_image_mode,
|
| 84 |
max_num_faces=1,
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
import cv2
|
| 4 |
import librosa
|
| 5 |
import mediapipe as mp
|
| 6 |
import numpy as np
|
| 7 |
+
import requests
|
| 8 |
import torch
|
| 9 |
import torch.nn.functional as F
|
| 10 |
import torchvision.transforms.v2 as transforms
|
| 11 |
from numpy.typing import NDArray
|
| 12 |
+
from packaging.version import Version
|
| 13 |
from python_speech_features import logfbank
|
| 14 |
from transformers import FeatureExtractionMixin
|
| 15 |
from transformers.feature_extraction_utils import BatchFeature
|
| 16 |
|
| 17 |
+
# Pick the MediaPipe face-landmark API at import time. Releases up to and
# including 0.10.21 ship the legacy ``solutions.face_mesh`` module; newer
# releases expose the Tasks API (FaceLandmarker) instead.
use_legacy_mp = Version(mp.__version__) <= Version("0.10.21")
if use_legacy_mp:
    mp_face_mesh = mp.solutions.face_mesh
else:
    # Alias the Tasks-API entry points used by ``_extract_mouth``.
    BaseOptions = mp.tasks.BaseOptions
    FaceLandmarker = mp.tasks.vision.FaceLandmarker
    FaceLandmarkerOptions = mp.tasks.vision.FaceLandmarkerOptions
    VisionRunningMode = mp.tasks.vision.RunningMode
|
| 26 |
|
| 27 |
|
| 28 |
class AVHubertFeatureExtractor(FeatureExtractionMixin):
|
|
|
|
| 84 |
frames_np = np.stack([cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in frames_np], axis=0)
|
| 85 |
|
| 86 |
if extract_mouth:
|
| 87 |
+
frames_np = self._extract_mouth_legacy(frames_np) if use_legacy_mp else self._extract_mouth(frames_np)
|
| 88 |
|
| 89 |
return torch.from_numpy(frames_np).unsqueeze(dim=1)
|
| 90 |
|
| 91 |
def _extract_mouth(self, frames: NDArray[np.uint8]) -> NDArray[np.uint8]:
    """Crop a square mouth region from every frame with the MediaPipe Tasks API.

    On first use the ``face_landmarker.task`` model is downloaded into
    ``~/.cache/reazonspeech`` and reused afterwards. For each frame, face
    landmarks are detected and a square patch spanning the four mouth
    landmarks (``self.landmark_indices``) is cut out and resized to
    ``self.image_crop_size`` x ``self.image_crop_size``.

    Args:
        frames: uint8 array of frames indexed as (T, H, W, ...). Each frame is
            handed to MediaPipe as SRGB and sliced with a trailing ``:`` —
            assumes 3-channel frames; TODO confirm against the grayscale
            conversion the caller appears to perform first.

    Returns:
        uint8 array of shape (T, image_crop_size, image_crop_size). Frames
        with no detected face, or whose crop cannot be resized, become
        all-zero images.

    Raises:
        requests.HTTPError: if the one-time model download fails.
    """
    mouth_frames = []
    top_idx, right_idx, bottom_idx, left_idx = self.landmark_indices

    model_path = Path.home() / ".cache" / "reazonspeech" / "mediapipe---models--face_landmarker.task"
    model_path.parent.mkdir(parents=True, exist_ok=True)
    if not model_path.exists():
        # Fail loudly on HTTP errors and install the file atomically so an
        # interrupted or failed download is never cached as a corrupt model
        # (the previous code wrote requests.get(...).content unchecked).
        resp = requests.get(
            "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/latest/face_landmarker.task",
            timeout=60,
        )
        resp.raise_for_status()
        tmp_path = model_path.with_name(model_path.name + ".tmp")
        tmp_path.write_bytes(resp.content)
        tmp_path.replace(model_path)  # atomic rename on POSIX

    with FaceLandmarker.create_from_options(
        FaceLandmarkerOptions(
            base_options=BaseOptions(model_asset_path=model_path.as_posix()),
            running_mode=VisionRunningMode.IMAGE,
            num_faces=1,
            min_face_detection_confidence=self.min_detection_confidence,
            min_tracking_confidence=self.min_tracking_confidence,
        )
    ) as face_mesh:
        for frame in frames:
            res = face_mesh.detect(
                mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
            )
            # No face found: emit a black crop so the output keeps one entry
            # per input frame.
            if res.face_landmarks is None or len(res.face_landmarks) == 0:
                mouth_frames.append(np.zeros([self.image_crop_size, self.image_crop_size], dtype=np.uint8))
                continue

            landmarks = res.face_landmarks[0]
            top = landmarks[top_idx]
            left = landmarks[left_idx]
            right = landmarks[right_idx]
            bottom = landmarks[bottom_idx]

            # Landmark coordinates are normalized to [0, 1]; convert the
            # bounding box of the four mouth points into pixel space.
            H, W = frame.shape[:2]
            xmax = max(top.x, left.x, right.x, bottom.x)
            ymax = max(top.y, left.y, right.y, bottom.y)
            xmin = min(top.x, left.x, right.x, bottom.x)
            ymin = min(top.y, left.y, right.y, bottom.y)

            patch_size = max((xmax - xmin) * W, (ymax - ymin) * H)  # To extract square region
            half = int(patch_size / 2)
            y_center = int(ymin * H) + int(((ymax - ymin) / 2) * H)
            x_center = int(xmin * W) + int(((xmax - xmin) / 2) * W)
            # NOTE(review): near the image border ``center - half`` can go
            # negative, which Python interprets as wrap-around indexing and
            # yields an empty slice; the except branch below then substitutes
            # a black frame. Clamping to 0 would keep a real (non-square)
            # crop instead — confirm which behavior is intended.
            lip = frame[
                y_center - half : y_center + half,
                x_center - half : x_center + half,
                :,
            ]
            try:
                lip = cv2.resize(lip, (self.image_crop_size, self.image_crop_size))
            except Exception:
                # Empty/degenerate crop: fall back to a black patch.
                lip = np.zeros([self.image_crop_size, self.image_crop_size, 3], dtype=np.uint8)
            mouth_frames.append(cv2.cvtColor(lip, cv2.COLOR_RGB2GRAY))
    return np.stack(mouth_frames, axis=0)
|
| 144 |
+
|
| 145 |
+
def _extract_mouth_legacy(self, frames: NDArray[np.uint8]) -> NDArray[np.uint8]:
|
| 146 |
+
mouth_frames = []
|
| 147 |
+
top_idx, right_idx, bottom_idx, left_idx = self.landmark_indices
|
| 148 |
with mp_face_mesh.FaceMesh(
|
| 149 |
static_image_mode=self.static_image_mode,
|
| 150 |
max_num_faces=1,
|
preprocessor_config.json
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"auto_map": {
|
| 3 |
-
"AutoFeatureExtractor": "feature_extraction_avhubert.AVHubertFeatureExtractor"
|
| 4 |
-
"AutoProcessor": "processing_avhubert.AVHubertProcessor"
|
| 5 |
},
|
| 6 |
"feature_extractor_type": "AVHubertFeatureExtractor",
|
| 7 |
"image_crop_size": 88,
|
|
@@ -15,7 +14,6 @@
|
|
| 15 |
"min_detection_confidence": 0.5,
|
| 16 |
"min_tracking_confidence": 0.5,
|
| 17 |
"normalize": true,
|
| 18 |
-
"processor_class": "AVHubertProcessor",
|
| 19 |
"refine_landmarks": false,
|
| 20 |
"sr": 16000,
|
| 21 |
"stack_order_audio": 4,
|
|
|
|
| 1 |
{
|
| 2 |
"auto_map": {
|
| 3 |
+
"AutoFeatureExtractor": "feature_extraction_avhubert.AVHubertFeatureExtractor"
|
|
|
|
| 4 |
},
|
| 5 |
"feature_extractor_type": "AVHubertFeatureExtractor",
|
| 6 |
"image_crop_size": 88,
|
|
|
|
| 14 |
"min_detection_confidence": 0.5,
|
| 15 |
"min_tracking_confidence": 0.5,
|
| 16 |
"normalize": true,
|
|
|
|
| 17 |
"refine_landmarks": false,
|
| 18 |
"sr": 16000,
|
| 19 |
"stack_order_audio": 4,
|