yubo0306 commited on
Commit
7268dea
·
verified ·
1 Parent(s): ceb46c5

Upload processor

Browse files
feature_extraction_avhubert.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import librosa
3
+ import mediapipe as mp
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torchvision.transforms.v2 as transforms
8
+ from numpy.typing import NDArray
9
+ from python_speech_features import logfbank
10
+ from transformers import FeatureExtractionMixin
11
+ from transformers.feature_extraction_utils import BatchFeature
12
+
13
+ mp_face_mesh = mp.solutions.face_mesh
14
+
15
+
16
class AVHubertFeatureExtractor(FeatureExtractionMixin):
    """Feature extractor for AV-HuBERT audio-visual speech models.

    Audio is converted to stacked log mel-filterbank features; video is
    converted to normalized grayscale mouth crops (optionally extracted with
    MediaPipe FaceMesh). Either modality may be missing per sample: the missing
    one is zero-filled to the other's length, and video is resampled by
    nearest-index lookup so both modalities share the same number of time steps.
    """

    model_input_names = ["input_values", "pixel_values"]

    def __init__(
        self,
        max_sample_size: int | None = None,
        normalize: bool = True,
        stack_order_audio: int = 4,
        image_crop_size: int = 88,
        image_mean: float = 0.421,
        image_std: float = 0.165,
        sr: int = 16_000,
        static_image_mode: bool = False,
        refine_landmarks: bool = False,
        min_detection_confidence: float = 0.5,
        min_tracking_confidence: float = 0.5,
        landmark_indices: tuple[int, ...] = (5, 411, 199, 187),  # (top, right, bottom, left) of mouth
        **kwargs,
    ) -> None:
        """
        Args:
            max_sample_size: If set, truncate every sample to this many time steps.
            normalize: If True, layer-normalize the stacked audio features.
            stack_order_audio: Number of consecutive filterbank frames stacked
                into one time step (4 frames of 10 ms audio match 25 fps video).
            image_crop_size: Side length in pixels of the square mouth crop.
            image_mean: Grayscale normalization mean.
            image_std: Grayscale normalization std.
            sr: Sample rate assumed for raw-array audio input (file input is
                resampled to this rate on load).
            static_image_mode: Forwarded to MediaPipe FaceMesh.
            refine_landmarks: Forwarded to MediaPipe FaceMesh.
            min_detection_confidence: Forwarded to MediaPipe FaceMesh.
            min_tracking_confidence: Forwarded to MediaPipe FaceMesh.
            landmark_indices: FaceMesh landmark ids bounding the mouth region.
        """
        super().__init__(**kwargs)
        self.max_sample_size = max_sample_size
        self.normalize = normalize
        self.stack_order_audio = stack_order_audio
        self.image_crop_size = image_crop_size
        self.transforms = transforms.Compose(
            [
                transforms.ToImage(),
                transforms.CenterCrop(image_crop_size),
                transforms.ToDtype(torch.float32, scale=True),
                transforms.Normalize([image_mean], [image_std]),
            ]
        )
        self.sr = sr
        self.static_image_mode = static_image_mode
        self.refine_landmarks = refine_landmarks
        self.min_detection_confidence = min_detection_confidence
        self.min_tracking_confidence = min_tracking_confidence
        self.landmark_indices = landmark_indices

    def _load_video(self, video: str | NDArray[np.uint8], extract_mouth: bool = False) -> torch.FloatTensor:
        """Load a video as a (frames, 1, H, W) grayscale tensor.

        Args:
            video: Path to a video file, or a (frames, H, W, 3) numpy array.
                A numpy array must be in RGB channel order.
            extract_mouth: If True, run mouth-region extraction on the frames;
                if False, the frames are assumed to already be mouth crops and
                are only converted to grayscale.
        """
        if isinstance(video, str):
            cap = cv2.VideoCapture(video)
            try:
                frames = []
                for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
                    ret, frame = cap.read()
                    if not ret:
                        break
                    # OpenCV decodes files in BGR order.
                    if not extract_mouth:  # frames are already mouth crops
                        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
                    else:
                        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            finally:
                # Release the capture handle even if decoding fails (was leaked before).
                cap.release()
            frames_np = np.stack(frames, axis=0)
        else:
            frames_np = video
            if not extract_mouth:  # frames are already mouth crops
                # Numpy input is documented as RGB, so the grayscale conversion
                # must use RGB weights (the original used COLOR_BGR2GRAY, which
                # swaps the red/blue luma coefficients).
                frames_np = np.stack(
                    [cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) for frame in frames_np], axis=0
                )

        if extract_mouth:
            frames_np = self._extract_mouth(frames_np)

        # Add a singleton channel dimension: (frames, 1, H, W).
        return torch.from_numpy(frames_np).unsqueeze(dim=1)

    def _extract_mouth(self, frames: NDArray[np.uint8]) -> NDArray[np.uint8]:
        """Crop the mouth region from each RGB frame using FaceMesh landmarks.

        Frames where no face is detected (or the crop is degenerate) become
        all-zero images of size ``image_crop_size``.
        """
        mouth_frames = []
        top_idx, right_idx, bottom_idx, left_idx = self.landmark_indices
        with mp_face_mesh.FaceMesh(
            static_image_mode=self.static_image_mode,
            max_num_faces=1,
            refine_landmarks=self.refine_landmarks,
            min_detection_confidence=self.min_detection_confidence,
            min_tracking_confidence=self.min_tracking_confidence,
        ) as face_mesh:
            for frame in frames:
                res = face_mesh.process(frame)
                if res.multi_face_landmarks is None or len(res.multi_face_landmarks) == 0:
                    mouth_frames.append(np.zeros([self.image_crop_size, self.image_crop_size], dtype=np.uint8))
                    continue
                landmarks = res.multi_face_landmarks[0].landmark
                top = landmarks[top_idx]
                left = landmarks[left_idx]
                right = landmarks[right_idx]
                bottom = landmarks[bottom_idx]

                H, W = frame.shape[:2]
                # Landmark coordinates are normalized to [0, 1].
                xmax = max(top.x, left.x, right.x, bottom.x)
                ymax = max(top.y, left.y, right.y, bottom.y)
                xmin = min(top.x, left.x, right.x, bottom.x)
                ymin = min(top.y, left.y, right.y, bottom.y)

                patch_size = max((xmax - xmin) * W, (ymax - ymin) * H)  # To extract square region
                half = int(patch_size / 2)
                y_center = int(ymin * H) + int(((ymax - ymin) / 2) * H)
                x_center = int(xmin * W) + int(((xmax - xmin) / 2) * W)
                # Clamp the top-left corner to 0: a negative slice start would
                # silently wrap around to the other side of the image.
                lip = frame[
                    max(y_center - half, 0) : y_center + half,
                    max(x_center - half, 0) : x_center + half,
                    :,
                ]
                try:
                    lip = cv2.resize(lip, (self.image_crop_size, self.image_crop_size))
                except Exception:
                    # cv2.resize raises on empty crops; fall back to a blank frame.
                    lip = np.zeros([self.image_crop_size, self.image_crop_size, 3], dtype=np.uint8)
                mouth_frames.append(cv2.cvtColor(lip, cv2.COLOR_RGB2GRAY))
        return np.stack(mouth_frames, axis=0)

    def _load_audio(self, audio: str | NDArray[np.float32]) -> torch.FloatTensor:
        """Convert raw audio (file path or waveform array) to stacked log-filterbank features.

        Returns a (time_steps, 26 * stack_order_audio) float tensor; 26 is the
        default ``nfilt`` of ``python_speech_features.logfbank``.
        """

        def stacker(feats, stack_order):
            # Zero-pad to a multiple of stack_order, then fold consecutive
            # frames into the feature dimension.
            feat_dim = feats.shape[1]
            if len(feats) % stack_order != 0:
                res = stack_order - len(feats) % stack_order
                res = np.zeros([res, feat_dim]).astype(feats.dtype)
                feats = np.concatenate([feats, res], axis=0)
            feats = feats.reshape((-1, stack_order, feat_dim)).reshape(-1, stack_order * feat_dim)
            return feats

        sr = None
        if isinstance(audio, str):
            # Resample file input to the extractor's configured rate
            # (was hardcoded to 16_000, ignoring self.sr).
            audio, sr = librosa.load(audio, sr=self.sr)
        if sr is None:
            sr = self.sr
        fbank = logfbank(audio, samplerate=sr).astype(np.float32)
        fbank = stacker(fbank, self.stack_order_audio)
        return torch.from_numpy(fbank)

    def _align_time_steps(
        self, audio: list[torch.FloatTensor], video: list[torch.FloatTensor]
    ) -> tuple[list[torch.FloatTensor], list[torch.FloatTensor]]:
        """Resample each video to its audio's length via nearest-index lookup."""
        aligned_indices = []
        for sample_audio, sample_video in zip(audio, video):
            diff = len(sample_audio) - len(sample_video)
            if diff != 0:
                # Fractional positions into the video timeline, one per audio step.
                aligned_indices.append(
                    torch.arange(0, len(sample_audio)).float() * len(sample_video) / len(sample_audio)
                )
            else:
                aligned_indices.append(torch.arange(0, len(sample_audio)))
        return (
            audio,
            [
                sample[torch.clamp(torch.floor(indices), max=sample.shape[0] - 1).long()]
                for sample, indices in zip(video, aligned_indices)
            ],
        )

    def __call__(
        self,
        raw_audio: NDArray[np.float32] | str | list[NDArray[np.float32]] | list[str] | None = None,
        raw_video: NDArray[np.uint8] | str | list[NDArray[np.uint8]] | list[str] | None = None,
        extract_mouth: bool = False,
        **kwargs,
    ) -> BatchFeature:
        """Featurize a batch of audio/video samples.

        Each sample must provide at least one modality; the missing one is
        zero-filled. Returns a ``BatchFeature`` with ``input_values`` (audio),
        ``pixel_values`` (video) and ``padding_mask`` (1 on padded steps).
        """
        if not isinstance(raw_audio, list):
            raw_audio = [raw_audio]
        if not isinstance(raw_video, list):
            raw_video = [raw_video]

        audio = [self._load_audio(sample) if sample is not None else None for sample in raw_audio]
        video = [self._load_video(sample, extract_mouth) if sample is not None else None for sample in raw_video]
        for batch_idx in range(len(audio)):
            sample_a = audio[batch_idx]
            sample_v = video[batch_idx]
            if sample_a is None and sample_v is None:
                raise ValueError(f"Sample {batch_idx} has neither audio nor video.")
            if sample_a is None:
                # 26 = logfbank default nfilt; matches _load_audio's output width.
                sample_a = torch.zeros((sample_v.shape[0], 26 * self.stack_order_audio))
                audio[batch_idx] = sample_a
            elif sample_v is None:  # 25 fps
                sample_v = torch.zeros((sample_a.shape[0], 1, self.image_crop_size, self.image_crop_size))
                video[batch_idx] = sample_v

        audio, video = self._align_time_steps(audio, video)
        max_length = max(len(data) for data in audio)
        input_values = []
        pixel_values = []
        padding_mask = []
        for feat_audio, feat_video in zip(audio, video):
            remainder_length = max_length - len(feat_audio)
            audio_remainder = torch.zeros(
                size=(remainder_length,) + feat_audio.size()[1:],
                dtype=feat_audio.dtype,
            )
            video_remainder = torch.zeros(
                size=(remainder_length,) + feat_video.size()[1:],
                dtype=feat_video.dtype,
            )

            feat_audio = torch.cat((feat_audio, audio_remainder))
            feat_video = torch.cat((feat_video, video_remainder))
            pad_mask = torch.zeros(max_length)
            pad_mask[max_length - remainder_length :] = 1
            if self.max_sample_size:
                feat_audio = feat_audio[: self.max_sample_size]
                feat_video = feat_video[: self.max_sample_size]
                # Keep the mask the same length as the truncated features
                # (previously it stayed at max_length, breaking the batch shape).
                pad_mask = pad_mask[: self.max_sample_size]

            input_values.append(feat_audio)
            pixel_values.append(feat_video)
            padding_mask.append(pad_mask)

        input_values = torch.stack(input_values)
        batch = BatchFeature(
            {
                "input_values": (
                    F.layer_norm(input_values, input_values.shape[2:]) if self.normalize else input_values
                ),
                "pixel_values": self.transforms(torch.stack(pixel_values)),
                "padding_mask": torch.stack(padding_mask),
            }
        )
        return batch

    def to_dict(self):
        """Serialize the config, replacing the live transform pipeline with a plain description."""
        output = super().to_dict()
        output["transforms"] = self._transforms_to_dict(output["transforms"])
        return output

    def _transforms_to_dict(self, transforms: transforms.Compose):
        """Describe each transform in a Compose as a JSON-safe dict of stringified public attributes."""
        output = []
        for component in transforms.__dict__["transforms"]:
            name = component.__class__.__name__
            component_dict = {"transforms_type": name}
            for k, v in component.__dict__.items():
                if k.startswith("_"):
                    continue
                component_dict[k] = str(v)
            output.append(component_dict)
        return output
preprocessor_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoFeatureExtractor": "feature_extraction_avhubert.AVHubertFeatureExtractor",
4
+ "AutoProcessor": "processing_avhubert.AVHubertProcessor"
5
+ },
6
+ "feature_extractor_type": "AVHubertFeatureExtractor",
7
+ "image_crop_size": 88,
8
+ "landmark_indices": [
9
+ 5,
10
+ 411,
11
+ 199,
12
+ 187
13
+ ],
14
+ "max_sample_size": null,
15
+ "min_detection_confidence": 0.5,
16
+ "min_tracking_confidence": 0.5,
17
+ "normalize": true,
18
+ "processor_class": "AVHubertProcessor",
19
+ "refine_landmarks": false,
20
+ "sr": 16000,
21
+ "stack_order_audio": 4,
22
+ "static_image_mode": false,
23
+ "transforms": [
24
+ {
25
+ "training": "True",
26
+ "transforms_type": "ToImage"
27
+ },
28
+ {
29
+ "size": "(88, 88)",
30
+ "training": "True",
31
+ "transforms_type": "CenterCrop"
32
+ },
33
+ {
34
+ "dtype": "torch.float32",
35
+ "scale": "True",
36
+ "training": "True",
37
+ "transforms_type": "ToDtype"
38
+ },
39
+ {
40
+ "inplace": "False",
41
+ "mean": "[0.421]",
42
+ "std": "[0.165]",
43
+ "training": "True",
44
+ "transforms_type": "Normalize"
45
+ }
46
+ ]
47
+ }
processing_avhubert.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from contextlib import contextmanager
3
+
4
+ import numpy as np
5
+ from transformers import ProcessorMixin
6
+
7
+
8
class AVHubertProcessor(ProcessorMixin):
    r"""
    Constructs a AVHubert processor which wraps a AVHubert feature extractor and a AVHubert CTC tokenizer into a single
    processor.

    [`AVHubertProcessor`] offers all the functionalities of [`AVHubertFeatureExtractor`] and [`PreTrainedTokenizer`].
    See the docstring of [`~AVHubertProcessor.__call__`] and [`~AVHubertProcessor.decode`] for more information.

    Args:
        feature_extractor (`AVHubertFeatureExtractor`):
            An instance of [`AVHubertFeatureExtractor`]. The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
    """

    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)
        # `current_processor` is swapped to the tokenizer inside `as_target_processor`.
        self.current_processor = self.feature_extractor
        self._in_target_context_manager = False

    @staticmethod
    def _add_bos_eos(tokens: str) -> str:
        """Ensure a transcript is wrapped in exactly one leading <s> and trailing </s>."""
        if not tokens.startswith("<s>"):
            tokens = "<s>" + tokens
        if not tokens.endswith("</s>"):
            tokens = tokens + "</s>"
        return tokens

    def __call__(
        self,
        raw_audio: np.ndarray | str | list[np.ndarray] | list[str] | None = None,
        raw_video: np.ndarray | str | list[np.ndarray] | list[str] | None = None,
        text: str | list[str] | None = None,
        **kwargs,
    ):
        """
        When used in normal mode, this method forwards all its arguments to AVHubertFeatureExtractor's
        [`~AVHubertFeatureExtractor.__call__`] and returns its output. If used in the context
        [`~AVHubertProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
        [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.

        When both modalities and `text` are given, the tokenized text is also turned into
        teacher-forcing fields: `decoder_input_ids` (tokens[:-1]), `decoder_attention_mask`
        and `labels` (tokens[1:]).
        """
        # For backward compatibility
        if self._in_target_context_manager:
            return self.current_processor(raw_audio, raw_video, text)

        if raw_audio is None and raw_video is None and text is None:
            raise ValueError("You need to specify either an `raw_audio`, `raw_video` or `text` input to process.")

        inputs = None
        if raw_audio is not None or raw_video is not None:
            inputs = self.feature_extractor(raw_audio, raw_video, **kwargs)

        encodings = None
        if text is not None:
            # The shifted-label slicing below requires tensors.
            kwargs.setdefault("return_tensors", "pt")
            # Fix: batching used to be keyed off `isinstance(raw_audio, list)`,
            # so a list of transcripts paired with non-list (or None) audio was
            # wrapped a second time into a nested list. Key off `text` itself.
            if not isinstance(text, list):
                text = [text]
            text = [self._add_bos_eos(tokens) for tokens in text]
            kwargs.pop("extract_mouth", None)  # feature-extractor-only kwarg
            encodings = self.tokenizer(text, **kwargs)

        if encodings is None:
            return inputs
        if inputs is None:
            return encodings
        # Teacher forcing: decoder sees tokens[:-1], loss targets tokens[1:].
        inputs["decoder_input_ids"] = encodings["input_ids"][:, :-1].clone()
        inputs["decoder_attention_mask"] = encodings["attention_mask"][:, :-1]
        inputs["labels"] = encodings["input_ids"][:, 1:]
        return inputs

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
        AVHubert.
        """
        warnings.warn(
            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
            "your audio inputs, or in a separate call)."
        )
        self._in_target_context_manager = True
        self.current_processor = self.tokenizer
        try:
            yield
        finally:
            # Restore the feature extractor even if the with-body raised,
            # so the processor never gets stuck in target mode.
            self.current_processor = self.feature_extractor
            self._in_target_context_manager = False
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_avhubert.AVHubertProcessor"
4
+ },
5
+ "processor_class": "AVHubertProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "3000": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "3001": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3002": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "auto_map": {
37
+ "AutoProcessor": "processing_avhubert.AVHubertProcessor"
38
+ },
39
+ "bos_token": "<s>",
40
+ "clean_up_tokenization_spaces": true,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "model_max_length": 1000000000000000019884624838656,
44
+ "pad_token": "<pad>",
45
+ "padding_side": "right",
46
+ "processor_class": "AVHubertProcessor",
47
+ "tokenizer_class": "PreTrainedTokenizerFast",
48
+ "unk_token": "<unk>"
49
+ }