yubo0306 commited on
Commit
7268dea
·
verified ·
1 Parent(s): ceb46c5

Upload processor

Browse files
feature_extraction_avhubert.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import librosa
3
+ import mediapipe as mp
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torchvision.transforms.v2 as transforms
8
+ from numpy.typing import NDArray
9
+ from python_speech_features import logfbank
10
+ from transformers import FeatureExtractionMixin
11
+ from transformers.feature_extraction_utils import BatchFeature
12
+
13
+ mp_face_mesh = mp.solutions.face_mesh
14
+
15
+
16
class AVHubertFeatureExtractor(FeatureExtractionMixin):
    """Feature extractor for AV-HuBERT audio-visual speech models.

    Audio is converted to stacked log mel-filterbank features; video is
    converted to normalized grayscale mouth crops (optionally extracted with
    MediaPipe FaceMesh). Either modality may be missing per sample: the missing
    one is zero-filled to the other's length, and video is resampled by
    nearest-index lookup so both modalities share the same number of time steps.
    """

    model_input_names = ["input_values", "pixel_values"]

    def __init__(
        self,
        max_sample_size: int | None = None,
        normalize: bool = True,
        stack_order_audio: int = 4,
        image_crop_size: int = 88,
        image_mean: float = 0.421,
        image_std: float = 0.165,
        sr: int = 16_000,
        static_image_mode: bool = False,
        refine_landmarks: bool = False,
        min_detection_confidence: float = 0.5,
        min_tracking_confidence: float = 0.5,
        landmark_indices: tuple[int, ...] = (5, 411, 199, 187),  # (top, right, bottom, left) of mouth
        **kwargs,
    ) -> None:
        """
        Args:
            max_sample_size: If set, truncate every sample to this many time steps.
            normalize: If True, layer-normalize the stacked audio features.
            stack_order_audio: Number of consecutive filterbank frames stacked
                into one time step (4 frames of 10 ms audio match 25 fps video).
            image_crop_size: Side length in pixels of the square mouth crop.
            image_mean: Grayscale normalization mean.
            image_std: Grayscale normalization std.
            sr: Sample rate assumed for raw-array audio input (file input is
                resampled to this rate on load).
            static_image_mode: Forwarded to MediaPipe FaceMesh.
            refine_landmarks: Forwarded to MediaPipe FaceMesh.
            min_detection_confidence: Forwarded to MediaPipe FaceMesh.
            min_tracking_confidence: Forwarded to MediaPipe FaceMesh.
            landmark_indices: FaceMesh landmark ids bounding the mouth region.
        """
        super().__init__(**kwargs)
        self.max_sample_size = max_sample_size
        self.normalize = normalize
        self.stack_order_audio = stack_order_audio
        self.image_crop_size = image_crop_size
        self.transforms = transforms.Compose(
            [
                transforms.ToImage(),
                transforms.CenterCrop(image_crop_size),
                transforms.ToDtype(torch.float32, scale=True),
                transforms.Normalize([image_mean], [image_std]),
            ]
        )
        self.sr = sr
        self.static_image_mode = static_image_mode
        self.refine_landmarks = refine_landmarks
        self.min_detection_confidence = min_detection_confidence
        self.min_tracking_confidence = min_tracking_confidence
        self.landmark_indices = landmark_indices

    def _load_video(self, video: str | NDArray[np.uint8], extract_mouth: bool = False) -> torch.FloatTensor:
        """Load a video as a (frames, 1, H, W) grayscale tensor.

        Args:
            video: Path to a video file, or a (frames, H, W, 3) numpy array.
                A numpy array must be in RGB channel order.
            extract_mouth: If True, run mouth-region extraction on the frames;
                if False, the frames are assumed to already be mouth crops and
                are only converted to grayscale.
        """
        if isinstance(video, str):
            cap = cv2.VideoCapture(video)
            try:
                frames = []
                for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
                    ret, frame = cap.read()
                    if not ret:
                        break
                    # OpenCV decodes files in BGR order.
                    if not extract_mouth:  # frames are already mouth crops
                        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
                    else:
                        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            finally:
                # Release the capture handle even if decoding fails (was leaked before).
                cap.release()
            frames_np = np.stack(frames, axis=0)
        else:
            frames_np = video
            if not extract_mouth:  # frames are already mouth crops
                # Numpy input is documented as RGB, so the grayscale conversion
                # must use RGB weights (the original used COLOR_BGR2GRAY, which
                # swaps the red/blue luma coefficients).
                frames_np = np.stack(
                    [cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) for frame in frames_np], axis=0
                )

        if extract_mouth:
            frames_np = self._extract_mouth(frames_np)

        # Add a singleton channel dimension: (frames, 1, H, W).
        return torch.from_numpy(frames_np).unsqueeze(dim=1)

    def _extract_mouth(self, frames: NDArray[np.uint8]) -> NDArray[np.uint8]:
        """Crop the mouth region from each RGB frame using FaceMesh landmarks.

        Frames where no face is detected (or the crop is degenerate) become
        all-zero images of size ``image_crop_size``.
        """
        mouth_frames = []
        top_idx, right_idx, bottom_idx, left_idx = self.landmark_indices
        with mp_face_mesh.FaceMesh(
            static_image_mode=self.static_image_mode,
            max_num_faces=1,
            refine_landmarks=self.refine_landmarks,
            min_detection_confidence=self.min_detection_confidence,
            min_tracking_confidence=self.min_tracking_confidence,
        ) as face_mesh:
            for frame in frames:
                res = face_mesh.process(frame)
                if res.multi_face_landmarks is None or len(res.multi_face_landmarks) == 0:
                    mouth_frames.append(np.zeros([self.image_crop_size, self.image_crop_size], dtype=np.uint8))
                    continue
                landmarks = res.multi_face_landmarks[0].landmark
                top = landmarks[top_idx]
                left = landmarks[left_idx]
                right = landmarks[right_idx]
                bottom = landmarks[bottom_idx]

                H, W = frame.shape[:2]
                # Landmark coordinates are normalized to [0, 1].
                xmax = max(top.x, left.x, right.x, bottom.x)
                ymax = max(top.y, left.y, right.y, bottom.y)
                xmin = min(top.x, left.x, right.x, bottom.x)
                ymin = min(top.y, left.y, right.y, bottom.y)

                patch_size = max((xmax - xmin) * W, (ymax - ymin) * H)  # To extract square region
                half = int(patch_size / 2)
                y_center = int(ymin * H) + int(((ymax - ymin) / 2) * H)
                x_center = int(xmin * W) + int(((xmax - xmin) / 2) * W)
                # Clamp the top-left corner to 0: a negative slice start would
                # silently wrap around to the other side of the image.
                lip = frame[
                    max(y_center - half, 0) : y_center + half,
                    max(x_center - half, 0) : x_center + half,
                    :,
                ]
                try:
                    lip = cv2.resize(lip, (self.image_crop_size, self.image_crop_size))
                except Exception:
                    # cv2.resize raises on empty crops; fall back to a blank frame.
                    lip = np.zeros([self.image_crop_size, self.image_crop_size, 3], dtype=np.uint8)
                mouth_frames.append(cv2.cvtColor(lip, cv2.COLOR_RGB2GRAY))
        return np.stack(mouth_frames, axis=0)

    def _load_audio(self, audio: str | NDArray[np.float32]) -> torch.FloatTensor:
        """Convert raw audio (file path or waveform array) to stacked log-filterbank features.

        Returns a (time_steps, 26 * stack_order_audio) float tensor; 26 is the
        default ``nfilt`` of ``python_speech_features.logfbank``.
        """

        def stacker(feats, stack_order):
            # Zero-pad to a multiple of stack_order, then fold consecutive
            # frames into the feature dimension.
            feat_dim = feats.shape[1]
            if len(feats) % stack_order != 0:
                res = stack_order - len(feats) % stack_order
                res = np.zeros([res, feat_dim]).astype(feats.dtype)
                feats = np.concatenate([feats, res], axis=0)
            feats = feats.reshape((-1, stack_order, feat_dim)).reshape(-1, stack_order * feat_dim)
            return feats

        sr = None
        if isinstance(audio, str):
            # Resample file input to the extractor's configured rate
            # (was hardcoded to 16_000, ignoring self.sr).
            audio, sr = librosa.load(audio, sr=self.sr)
        if sr is None:
            sr = self.sr
        fbank = logfbank(audio, samplerate=sr).astype(np.float32)
        fbank = stacker(fbank, self.stack_order_audio)
        return torch.from_numpy(fbank)

    def _align_time_steps(
        self, audio: list[torch.FloatTensor], video: list[torch.FloatTensor]
    ) -> tuple[list[torch.FloatTensor], list[torch.FloatTensor]]:
        """Resample each video to its audio's length via nearest-index lookup."""
        aligned_indices = []
        for sample_audio, sample_video in zip(audio, video):
            diff = len(sample_audio) - len(sample_video)
            if diff != 0:
                # Fractional positions into the video timeline, one per audio step.
                aligned_indices.append(
                    torch.arange(0, len(sample_audio)).float() * len(sample_video) / len(sample_audio)
                )
            else:
                aligned_indices.append(torch.arange(0, len(sample_audio)))
        return (
            audio,
            [
                sample[torch.clamp(torch.floor(indices), max=sample.shape[0] - 1).long()]
                for sample, indices in zip(video, aligned_indices)
            ],
        )

    def __call__(
        self,
        raw_audio: NDArray[np.float32] | str | list[NDArray[np.float32]] | list[str] | None = None,
        raw_video: NDArray[np.uint8] | str | list[NDArray[np.uint8]] | list[str] | None = None,
        extract_mouth: bool = False,
        **kwargs,
    ) -> BatchFeature:
        """Featurize a batch of audio/video samples.

        Each sample must provide at least one modality; the missing one is
        zero-filled. Returns a ``BatchFeature`` with ``input_values`` (audio),
        ``pixel_values`` (video) and ``padding_mask`` (1 on padded steps).
        """
        if not isinstance(raw_audio, list):
            raw_audio = [raw_audio]
        if not isinstance(raw_video, list):
            raw_video = [raw_video]

        audio = [self._load_audio(sample) if sample is not None else None for sample in raw_audio]
        video = [self._load_video(sample, extract_mouth) if sample is not None else None for sample in raw_video]
        for batch_idx in range(len(audio)):
            sample_a = audio[batch_idx]
            sample_v = video[batch_idx]
            if sample_a is None and sample_v is None:
                raise ValueError(f"Sample {batch_idx} has neither audio nor video.")
            if sample_a is None:
                # 26 = logfbank default nfilt; matches _load_audio's output width.
                sample_a = torch.zeros((sample_v.shape[0], 26 * self.stack_order_audio))
                audio[batch_idx] = sample_a
            elif sample_v is None:  # 25 fps
                sample_v = torch.zeros((sample_a.shape[0], 1, self.image_crop_size, self.image_crop_size))
                video[batch_idx] = sample_v

        audio, video = self._align_time_steps(audio, video)
        max_length = max(len(data) for data in audio)
        input_values = []
        pixel_values = []
        padding_mask = []
        for feat_audio, feat_video in zip(audio, video):
            remainder_length = max_length - len(feat_audio)
            audio_remainder = torch.zeros(
                size=(remainder_length,) + feat_audio.size()[1:],
                dtype=feat_audio.dtype,
            )
            video_remainder = torch.zeros(
                size=(remainder_length,) + feat_video.size()[1:],
                dtype=feat_video.dtype,
            )

            feat_audio = torch.cat((feat_audio, audio_remainder))
            feat_video = torch.cat((feat_video, video_remainder))
            pad_mask = torch.zeros(max_length)
            pad_mask[max_length - remainder_length :] = 1
            if self.max_sample_size:
                feat_audio = feat_audio[: self.max_sample_size]
                feat_video = feat_video[: self.max_sample_size]
                # Keep the mask the same length as the truncated features
                # (previously it stayed at max_length, breaking the batch shape).
                pad_mask = pad_mask[: self.max_sample_size]

            input_values.append(feat_audio)
            pixel_values.append(feat_video)
            padding_mask.append(pad_mask)

        input_values = torch.stack(input_values)
        batch = BatchFeature(
            {
                "input_values": (
                    F.layer_norm(input_values, input_values.shape[2:]) if self.normalize else input_values
                ),
                "pixel_values": self.transforms(torch.stack(pixel_values)),
                "padding_mask": torch.stack(padding_mask),
            }
        )
        return batch

    def to_dict(self):
        """Serialize the config, replacing the live transform pipeline with a plain description."""
        output = super().to_dict()
        output["transforms"] = self._transforms_to_dict(output["transforms"])
        return output

    def _transforms_to_dict(self, transforms: transforms.Compose):
        """Describe each transform in a Compose as a JSON-safe dict of stringified public attributes."""
        output = []
        for component in transforms.__dict__["transforms"]:
            name = component.__class__.__name__
            component_dict = {"transforms_type": name}
            for k, v in component.__dict__.items():
                if k.startswith("_"):
                    continue
                component_dict[k] = str(v)
            output.append(component_dict)
        return output
preprocessor_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoFeatureExtractor": "feature_extraction_avhubert.AVHubertFeatureExtractor",
4
+ "AutoProcessor": "processing_avhubert.AVHubertProcessor"
5
+ },
6
+ "feature_extractor_type": "AVHubertFeatureExtractor",
7
+ "image_crop_size": 88,
8
+ "landmark_indices": [
9
+ 5,
10
+ 411,
11
+ 199,
12
+ 187
13
+ ],
14
+ "max_sample_size": null,
15
+ "min_detection_confidence": 0.5,
16
+ "min_tracking_confidence": 0.5,
17
+ "normalize": true,
18
+ "processor_class": "AVHubertProcessor",
19
+ "refine_landmarks": false,
20
+ "sr": 16000,
21
+ "stack_order_audio": 4,
22
+ "static_image_mode": false,
23
+ "transforms": [
24
+ {
25
+ "training": "True",
26
+ "transforms_type": "ToImage"
27
+ },
28
+ {
29
+ "size": "(88, 88)",
30
+ "training": "True",
31
+ "transforms_type": "CenterCrop"
32
+ },
33
+ {
34
+ "dtype": "torch.float32",
35
+ "scale": "True",
36
+ "training": "True",
37
+ "transforms_type": "ToDtype"
38
+ },
39
+ {
40
+ "inplace": "False",
41
+ "mean": "[0.421]",
42
+ "std": "[0.165]",
43
+ "training": "True",
44
+ "transforms_type": "Normalize"
45
+ }
46
+ ]
47
+ }
processing_avhubert.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from contextlib import contextmanager
3
+
4
+ import numpy as np
5
+ from transformers import ProcessorMixin
6
+
7
+
8
class AVHubertProcessor(ProcessorMixin):
    r"""
    Constructs a AVHubert processor which wraps a AVHubert feature extractor and a AVHubert CTC tokenizer into a single
    processor.

    [`AVHubertProcessor`] offers all the functionalities of [`AVHubertFeatureExtractor`] and [`PreTrainedTokenizer`].
    See the docstring of [`~AVHubertProcessor.__call__`] and [`~AVHubertProcessor.decode`] for more information.

    Args:
        feature_extractor (`AVHubertFeatureExtractor`):
            An instance of [`AVHubertFeatureExtractor`]. The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
    """

    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)
        # `current_processor` is swapped to the tokenizer inside `as_target_processor`.
        self.current_processor = self.feature_extractor
        self._in_target_context_manager = False

    @staticmethod
    def _add_bos_eos(tokens: str) -> str:
        """Ensure a transcript is wrapped in exactly one leading <s> and trailing </s>."""
        if not tokens.startswith("<s>"):
            tokens = "<s>" + tokens
        if not tokens.endswith("</s>"):
            tokens = tokens + "</s>"
        return tokens

    def __call__(
        self,
        raw_audio: np.ndarray | str | list[np.ndarray] | list[str] | None = None,
        raw_video: np.ndarray | str | list[np.ndarray] | list[str] | None = None,
        text: str | list[str] | None = None,
        **kwargs,
    ):
        """
        When used in normal mode, this method forwards all its arguments to AVHubertFeatureExtractor's
        [`~AVHubertFeatureExtractor.__call__`] and returns its output. If used in the context
        [`~AVHubertProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
        [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.

        When both modalities and `text` are given, the tokenized text is also turned into
        teacher-forcing fields: `decoder_input_ids` (tokens[:-1]), `decoder_attention_mask`
        and `labels` (tokens[1:]).
        """
        # For backward compatibility
        if self._in_target_context_manager:
            return self.current_processor(raw_audio, raw_video, text)

        if raw_audio is None and raw_video is None and text is None:
            raise ValueError("You need to specify either an `raw_audio`, `raw_video` or `text` input to process.")

        inputs = None
        if raw_audio is not None or raw_video is not None:
            inputs = self.feature_extractor(raw_audio, raw_video, **kwargs)

        encodings = None
        if text is not None:
            # The shifted-label slicing below requires tensors.
            kwargs.setdefault("return_tensors", "pt")
            # Fix: batching used to be keyed off `isinstance(raw_audio, list)`,
            # so a list of transcripts paired with non-list (or None) audio was
            # wrapped a second time into a nested list. Key off `text` itself.
            if not isinstance(text, list):
                text = [text]
            text = [self._add_bos_eos(tokens) for tokens in text]
            kwargs.pop("extract_mouth", None)  # feature-extractor-only kwarg
            encodings = self.tokenizer(text, **kwargs)

        if encodings is None:
            return inputs
        if inputs is None:
            return encodings
        # Teacher forcing: decoder sees tokens[:-1], loss targets tokens[1:].
        inputs["decoder_input_ids"] = encodings["input_ids"][:, :-1].clone()
        inputs["decoder_attention_mask"] = encodings["attention_mask"][:, :-1]
        inputs["labels"] = encodings["input_ids"][:, 1:]
        return inputs

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
        AVHubert.
        """
        warnings.warn(
            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
            "your audio inputs, or in a separate call)."
        )
        self._in_target_context_manager = True
        self.current_processor = self.tokenizer
        try:
            yield
        finally:
            # Restore the feature extractor even if the with-body raised,
            # so the processor never gets stuck in target mode.
            self.current_processor = self.feature_extractor
            self._in_target_context_manager = False
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_avhubert.AVHubertProcessor"
4
+ },
5
+ "processor_class": "AVHubertProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "3000": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "3001": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3002": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "auto_map": {
37
+ "AutoProcessor": "processing_avhubert.AVHubertProcessor"
38
+ },
39
+ "bos_token": "<s>",
40
+ "clean_up_tokenization_spaces": true,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "model_max_length": 1000000000000000019884624838656,
44
+ "pad_token": "<pad>",
45
+ "padding_side": "right",
46
+ "processor_class": "AVHubertProcessor",
47
+ "tokenizer_class": "PreTrainedTokenizerFast",
48
+ "unk_token": "<unk>"
49
+ }