| import subprocess |
| from typing import List, Dict, Any |
| from dataclasses import dataclass |
| from abc import ABC, abstractmethod |
| from PIL import Image |
| from pathlib import Path |
| import numpy as np |
| import cv2 |
| import clip |
| import torch |
| from torch import nn |
| import torch.nn.functional as F |
|
|
| from typing import Tuple |
| import os |
| import json |
| from diffusers import CogVideoXPipeline |
| from diffusers.utils import export_to_video |
| from video_generation_evaluation.toolkit.fvd import get_dataset_features, I3DFeatureExtractor |
| from numpy import cov |
| from numpy import mean |
| from scipy.linalg import sqrtm |
| from video_generation_evaluation.evaluate import task2dimension |
|
|
|
|
class BaseTask(ABC):
    """Common scaffold for one benchmark task: parse the annotation file,
    generate videos with the model, then score the results."""

    def __init__(self, task_data: str, model):
        self.task_data = task_data
        self.model = model
        # Subclasses decide how the annotation file is interpreted.
        self.data = self._parse_data(task_data)

    @abstractmethod
    def _parse_data(self, task_data: Dict[str, Any]):
        """Load the annotation file and return the per-sample records."""

    @abstractmethod
    def run_inference(self):
        """Generate videos for every record in ``self.data``."""

    @abstractmethod
    def evaluate(self) -> Dict[str, float]:
        """Score the generated videos and return metric values."""
|
|
class T2VTask(BaseTask):
    """Base class for text-to-video tasks.

    Parses a General-Bench annotation file, renders five seeded videos per
    prompt, and reads scores back from the evaluator's JSON output.
    """

    def _parse_result_file(self, output_dir: Path) -> float | None:
        """Return the score from the first "eval" JSON in *output_dir*.

        Returns None when no eval file is present.  (Bug fix: the return
        previously sat after the loop, so an empty directory raised
        NameError on `data` instead of returning None.)
        """
        for jsonfile in output_dir.iterdir():
            if "eval" in jsonfile.name:
                with open(jsonfile.as_posix(), "r") as file:
                    data = json.load(file)
                return float(data[self.taskname][0])
        return None

    def _parse_data(self, task_data):
        """Read the annotation JSON; set taskname/save_root; return records."""
        with open(task_data, "r") as file:
            annos = json.load(file)
        taskname = annos["task"].replace(" ", "")
        self.taskname = taskname
        self.save_root = os.path.join("General-Bench", "Video-Generation", taskname)
        return annos["data"]

    def run_inference(self):
        """Generate five videos per prompt (seeds 0-4) and save them as mp4."""
        # export_to_video does not create missing directories.
        os.makedirs(self.save_root, exist_ok=True)
        for d in self.data:
            prompt = d["input"]["prompt"]
            for i in range(5):
                # Fixed seed per replicate keeps runs reproducible.
                video = self.model(prompt, generator=torch.Generator(self.model.device).manual_seed(i)).frames[0]
                save_name = prompt + "-" + str(i) + ".mp4"
                save_path = os.path.join(self.save_root, save_name)
                export_to_video(video, save_path, fps=8)
|
|
class FVDEval(T2VTask):
    """Scores generated videos with Frechet Video Distance (FVD) against a
    directory of real reference videos."""

    def evaluate(self, real_video_root):
        extractor = I3DFeatureExtractor().cuda().eval()

        feats_real = get_dataset_features(real_video_root, extractor)
        feats_fake = get_dataset_features(self.save_root, extractor)

        # Gaussian statistics (mean vector, covariance matrix) per set.
        mu_real = mean(feats_real, axis=0)
        mu_fake = mean(feats_fake, axis=0)
        sigma_real = cov(feats_real, rowvar=False)
        sigma_fake = cov(feats_fake, rowvar=False)

        # Frechet distance between the two Gaussians.
        delta = mu_real - mu_fake
        covmean, _ = sqrtm(sigma_real.dot(sigma_fake), disp=False)
        if np.iscomplexobj(covmean):
            # Discard tiny imaginary components from numerical error.
            covmean = covmean.real
        fvd = delta.dot(delta) + np.trace(sigma_real + sigma_fake - 2 * covmean)
        print(f"{self.taskname} score: {fvd}")
        return fvd
|
|
class ThirdPartyEval(T2VTask):
    """Delegates scoring to the external ``evaluate.py`` benchmark script and
    parses the score it writes to disk."""

    def evaluate(self):
        generated_dir = Path(self.save_root).resolve()
        dimension = task2dimension[self.taskname]
        info_json = Path("./full_info_t2v.json").resolve()
        result_dir = Path("./evaluation_results").resolve().joinpath(self.taskname)
        result_dir.mkdir(parents=True, exist_ok=True)

        cmd = [
            "python", "-W", "ignore", "evaluate.py",
            "--full_json_dir", str(info_json),
            "--videos_path", str(generated_dir),
            "--dimension", dimension,
            "--output_path", str(result_dir),
        ]

        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as exc:
            raise RuntimeError(f"Evaluation failed: {exc}") from exc

        score = self._parse_result_file(Path(result_dir))
        print(f"{self.taskname} score: {score}")
        return score
|
|
class I2VTask(BaseTask):
    """Base class for image-to-video tasks, scored across several dimensions
    by the external ``evaluate.py`` script."""

    def _parse_result_file(self, output_dir: Path) -> float | None:
        """Sum the first metric value from every "eval" JSON in *output_dir*."""
        score = 0
        for jsonfile in output_dir.iterdir():
            if "eval" in jsonfile.name:
                with open(jsonfile.as_posix(), "r") as file:
                    data: dict = json.load(file)
                score += list(data.values())[0][0]
        return score

    def _parse_data(self, task_data):
        """Read the annotation JSON; set task attributes; return records."""
        self.dirpath = os.path.dirname(task_data)
        with open(task_data, "r") as file:
            annos = json.load(file)
        taskname = annos["task"].replace(" ", "")
        self.taskname = taskname
        self.dimensions = ("subject_consistency", "overall_consistency", "motion_smoothness", "dynamic_degree")
        self.save_root = os.path.join("General-Bench", "Video-Generation", taskname)
        # Bug fix: the records were never returned, so BaseTask.__init__
        # stored None in self.data and run_inference crashed iterating it.
        return annos["data"]

    def run_inference(self):
        """Generate five videos per (prompt, image) pair and save as mp4."""
        # export_to_video does not create missing directories.
        os.makedirs(self.save_root, exist_ok=True)
        for d in self.data:
            prompt = d["input"]["prompt"]
            image = d["input"]["image"]
            # Image paths in the annotation are relative to the annotation file.
            image = os.path.join(self.dirpath, image)
            for i in range(5):
                video = self.model(
                    prompt=prompt,
                    image=image,
                    generator=torch.Generator(self.model.device).manual_seed(i)
                ).frames[0]
                save_name = prompt + "-" + str(i) + ".mp4"
                save_path = os.path.join(self.save_root, save_name)
                export_to_video(video, save_path, fps=8)

    def evaluate(self):
        """Run the external evaluator once per dimension, then sum the scores."""
        taskname = self.taskname
        full_info = Path("./full_info_i2v.json").resolve()
        output_dir = Path("./evaluation_results").resolve()
        output_dir = output_dir.joinpath(taskname)
        output_dir.mkdir(parents=True, exist_ok=True)

        for dimension in self.dimensions:
            cmd = [
                "python", "-W", "ignore", "evaluate.py",
                "--full_json_dir", str(full_info),
                "--videos_path", str(self.save_root),
                "--dimension", dimension,
                "--output_path", str(output_dir)
            ]
            try:
                subprocess.run(cmd, check=True)
            except subprocess.CalledProcessError as exc:
                raise RuntimeError(f"Evaluation failed: {exc}") from exc

        score = self._parse_result_file(Path(output_dir))
        print(f"{self.taskname} score: {score}")
        return score
| |
# Concrete T2V tasks scored with Frechet Video Distance (see FVDEval).
class AthleticsT2V(FVDEval): pass


class HumanT2V(FVDEval): pass


class ConcertT2V(FVDEval): pass


class TerrestrialAnimalT2V(FVDEval): pass


class WaterSportsT2V(FVDEval): pass
|
|
# Concrete T2V tasks scored by the external evaluate.py script (see ThirdPartyEval).
class ActionT2V(ThirdPartyEval): pass


class ArtisticT2V(ThirdPartyEval): pass


class BackgroundConsistency(ThirdPartyEval): pass


class CameraMotionT2V(ThirdPartyEval): pass


class ClassConditionedT2V(ThirdPartyEval): pass


class ColorT2V(ThirdPartyEval): pass


class DynamicT2V(ThirdPartyEval): pass


class MaterialT2V(ThirdPartyEval): pass


class MultiClassConditionedT2V(ThirdPartyEval): pass


class SceneT2V(ThirdPartyEval): pass


class SpatialRelationT2V(ThirdPartyEval): pass


class StaticT2V(ThirdPartyEval): pass


class StyleT2V(ThirdPartyEval): pass
|
|
# Concrete I2V tasks, all scored via the multi-dimension flow in I2VTask.
class ArchitectureI2V(I2VTask): pass


class ClothI2V(I2VTask): pass


class FoodI2V(I2VTask): pass


class FurnitureI2V(I2VTask): pass


class HumanI2V(I2VTask): pass


class PetI2V(I2VTask): pass


class PlantI2V(I2VTask): pass


class SceneI2V(I2VTask): pass


class VehicleI2V(I2VTask): pass


class WeatherI2V(I2VTask): pass


class WildAnimalI2V(I2VTask): pass
|
|
|
|
if __name__ == "__main__":
    root = Path("General-Bench-Openset/video/generation")

    task_type = "T2V"
    model = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.bfloat16).to("cuda")

    # Registry mapping the whitespace-stripped annotation "task" name to its
    # task class.  Hoisted out of the loop — it is constant across iterations.
    TASK_MAPPING = {
        "AthleticsT2V": AthleticsT2V,
        "HumanT2V": HumanT2V,
        "ConcertT2V": ConcertT2V,
        "TerrestrialAnimalT2V": TerrestrialAnimalT2V,
        "WaterSportsT2V": WaterSportsT2V,
        "ActionT2V": ActionT2V,
        "ArtisticT2V": ArtisticT2V,
        "BackgroundConsistency": BackgroundConsistency,
        "CameraMotionT2V": CameraMotionT2V,
        "ClassConditionedT2V": ClassConditionedT2V,
        "ColorT2V": ColorT2V,
        "DynamicT2V": DynamicT2V,
        "MaterialT2V": MaterialT2V,
        "MultiClassConditionedT2V": MultiClassConditionedT2V,
        "SceneT2V": SceneT2V,
        "SpatialRelationT2V": SpatialRelationT2V,
        "StaticT2V": StaticT2V,
        "StyleT2V": StyleT2V,
        "ArchitectureI2V": ArchitectureI2V,
        "ClothI2V": ClothI2V,
        "FoodI2V": FoodI2V,
        "FurnitureI2V": FurnitureI2V,
        "HumanI2V": HumanI2V,
        "PetI2V": PetI2V,
        "PlantI2V": PlantI2V,
        "SceneI2V": SceneI2V,
        "VehicleI2V": VehicleI2V,
        "WeatherI2V": WeatherI2V,
        "WildAnimalI2V": WildAnimalI2V,
    }

    task_files = [
        "AthleticsT2V",
        "HumanT2V",
        "ConcertT2V",
        "TerrestrialAnimalT2V",
        "WaterSportsT2V",
        "ActionT2V",
        "ArtisticT2V",
        "BackgroundConsistency",
        "CameraMotionT2V",
        "ClassConditionedT2V",
        "ColorT2V",
        "DynamicT2V",
        "MaterialT2V",
        "MultiClassConditionedT2V",
        "SceneT2V",
        "SpatialRelationT2V",
        "StaticT2V",
        "StyleT2V",
        "ArchitectureI2V",
        "ClothI2V",
        "FoodI2V",
        "FurnitureI2V",
        "HumanI2V",
        "PetI2V",
        "PlantI2V",
        "SceneI2V",
        "VehicleI2V",
        "WeatherI2V",
        "WildAnimalI2V",
    ]

    task_files = [root.joinpath(task, "annotation.json") for task in task_files]

    for idx, file in enumerate(task_files):
        # Bug fix: the original condition was inverted ("if file.exists():
        # continue"), which skipped every annotation file that was present
        # and then crashed opening the missing ones.
        if not file.exists():
            continue

        with open(file.as_posix(), 'r', encoding='utf-8') as f:
            task_data = json.load(f)

        task_name = task_data["task"]
        print(f"Running evaluation for task {idx + 1}: {task_name}")

        clean_task_name = task_name.replace(" ", "")
        task_class = TASK_MAPPING.get(clean_task_name)
        if task_class is None:
            raise NotImplementedError
        elif task_type not in clean_task_name:
            # Only run tasks matching the selected task_type substring.
            continue
        else:
            task = task_class(file.as_posix(), model)

        task.run_inference()
        # NOTE(review): FVDEval.evaluate requires a real_video_root argument;
        # calling evaluate() without it will raise TypeError for FVD-based
        # tasks — confirm the intended reference-video directory.
        metrics = task.evaluate()
        print("Task name: ", task_name, "Task type: ", task_type, "Evaluation results:", metrics)