| import os |
| import torch |
| import torch.nn as nn |
| import numpy as np |
| from torchvision import transforms as T |
| from torchvision.transforms.v2 import ToDtype |
| from decord import VideoReader, cpu |
| import gradio as gr |
|
|
| |
| |
| |
# Download the fine-tuned ViFi-CLIP checkpoint if it is not already on disk.
model_path = 'vifi_clip_30_epochs_k400_full_finetuned.pth'
if not os.path.exists(model_path):
    print(f"π½ Downloading model to {model_path}...")
    # NOTE(review): installing a package at runtime is fragile; prefer listing
    # gdown in requirements. Kept to preserve the original deployment behavior.
    os.system("pip install -q gdown")
    # Reuse model_path as the output target instead of repeating the filename
    # literal, so the download destination can never drift from the path
    # checked above.
    os.system(f"gdown --id 1Nx30Kbu5xnv6dPwz4I3Ivy380LCdp1Md -O {model_path}")
|
|
| |
| |
| |
def _transform(n_px=224):
    """Build the CLIP-style frame preprocessing pipeline.

    Converts uint8 frames to float32 scaled to [0, 1], resizes so the short
    side is *n_px*, center-crops to *n_px* x *n_px*, and normalizes with the
    CLIP image statistics.
    """
    clip_mean = (0.48145466, 0.4578275, 0.40821073)
    clip_std = (0.26862954, 0.26130258, 0.27577711)
    steps = [
        ToDtype(torch.float32, scale=True),
        T.Resize(n_px, antialias=True),
        T.CenterCrop(n_px),
        T.Normalize(clip_mean, clip_std),
    ]
    return T.Compose(steps)
|
|
| |
| |
| |
class ClassificationHead(nn.Module):
    """Single linear layer mapping pooled clip features to class logits."""

    def __init__(self, input_dim=512, num_classes=1):
        super().__init__()
        # One fully-connected projection; no activation — callers apply
        # sigmoid/softmax themselves.
        self.dense = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Project features of shape (..., input_dim) to (..., num_classes)."""
        logits = self.dense(x)
        return logits
|
|
| |
| |
| |
| from trainers import vificlip |
| from utils.config import get_config |
| from utils.logger import create_logger |
|
|
# Paths to the ViFi-CLIP YAML config and the trained binary-classifier head.
cfgpth = 'configs/zero_shot/train/k400/16_16_vifi_clip.yaml'
classifier_path = 'best_detector_model.pt'


# Run inference on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
class parse_option:
    """Argument namespace mimicking the ViFi-CLIP training script's CLI.

    get_config() expects an argparse-like object; this class supplies the
    same attribute set without a real command line.
    """

    def __init__(self):
        # Attribute values match the original argparse defaults used for
        # test-only inference with the downloaded checkpoint.
        defaults = {
            "config": cfgpth,
            "output": "exp",
            "resume": model_path,
            "only_test": True,
            "opts": None,
            "batch_size": None,
            "pretrained": None,
            "accumulation_steps": None,
            "local_rank": 0,
        }
        for key, value in defaults.items():
            setattr(self, key, value)
|
|
# Build the ViFi-CLIP backbone from the project config and move it to the
# selected device. class_names are placeholders — only the image encoder is
# used below, not the text branch.
args = parse_option()
config = get_config(args)
logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")
model = vificlip.returnCLIP(config, logger, class_names=["true", "false"])
model = model.float().to(device)
feature_extractor = model  # alias used by predict_video for clarity


# Load the trained binary classification head and switch it to eval mode
# (disables dropout/batch-norm updates for inference).
classifier = ClassificationHead()
classifier.load_state_dict(torch.load(classifier_path, map_location=device))
classifier.to(device)
classifier.eval()
|
|
| |
| |
| |
def predict_video(video_path, threshold=0.5):
    """Classify a video as Real or Fake.

    Samples 16 consecutive frames starting at a random offset (short videos
    are padded by repeating the last frame), encodes each frame with the
    ViFi-CLIP image encoder, mean-pools the features over time, and applies
    the binary classification head.

    Args:
        video_path: Path to a video file readable by decord.
        threshold: Sigmoid probability at or above which the clip is
            labeled "Real".

    Returns:
        A human-readable result string, or an error string on failure.
    """
    preprocess = _transform(224)
    try:
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frames = len(vr)
        # Guard empty videos explicitly instead of letting the [-1] padding
        # below fail with an opaque decord indexing error.
        if total_frames == 0:
            return "β Error: video contains no frames"
        num_frames = 16

        # Pick a random contiguous window of num_frames; pad short videos by
        # repeating the final frame.
        if total_frames > num_frames:
            start = np.random.randint(0, total_frames - num_frames)
            indices = list(range(start, start + num_frames))
        else:
            indices = list(range(total_frames))
            indices += [total_frames - 1] * (num_frames - len(indices))

        frames = vr.get_batch(indices).asnumpy()  # (frames, H, W, C) uint8
        video_tensor = torch.from_numpy(frames).permute(0, 3, 1, 2)
        video_tensor = preprocess(video_tensor).unsqueeze(0).to(device)

        # Unpack as n_frames (not "T") — the original shadowed the
        # torchvision.transforms alias imported as T.
        batch, n_frames, channels, height, width = video_tensor.shape
        # Fold time into the batch dimension so every frame passes through
        # the image encoder in one call.
        input_clip = video_tensor.view(batch * n_frames, channels, height, width)

        with torch.no_grad():
            features = feature_extractor.image_encoder(input_clip)
            # Mean-pool per-frame features over time -> one clip embedding.
            features = features.view(batch, n_frames, -1).mean(dim=1)
            logits = classifier(features)
            prob = torch.sigmoid(logits).item()
        label = "Real" if prob >= threshold else "Fake"

        return f"{label} (prob: {prob:.4f}, threshold: {threshold})"
    except Exception as e:
        # UI boundary: surface any failure (unreadable file, decode error,
        # model error) as text rather than crashing the Gradio app.
        return f"β Error: {str(e)}"
|
|
| |
| |
| |
# Wire the prediction function into a simple two-input Gradio UI and serve it.
demo = gr.Interface(
    fn=predict_video,
    inputs=[
        gr.Video(type="filepath", label="Upload Video (.mp4)"),
        gr.Slider(
            0.0,
            1.0,
            value=0.5,
            step=0.01,
            label="Threshold (Real β₯ Threshold)",
        ),
    ],
    outputs="text",
    title="π§ Deepfake Detection with ViFi-CLIP",
    description="Upload a video to classify it as Real or Fake. Threshold slider lets you adjust sensitivity.",
)
demo.launch()
|
|