Spaces:

Nayefleb
/

Lance

Running on Zero

App Files Community

Nayefleb committed about 11 hours ago

Commit

bc4cd2c

verified ·

1 Parent(s): 04b2815

Update app.py

Browse files

Files changed (1) hide show

app.py +277 -288

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # =========================================================
-# ZERO GPU PATCHED + ALL TASKS ENABLED + QWEN FIX
 # Hugging Face Spaces Compatible
 # =========================================================
@@ -34,7 +35,11 @@ os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
 # LOGIN
 # =========================================================
-from huggingface_hub import login, snapshot_download
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -48,9 +53,8 @@ if HF_TOKEN:
 from safetensors.torch import load_file
 from transformers import (
-    AutoProcessor,
-    Qwen2_5_VLForConditionalGeneration,
     set_seed,
 )
 from transformers.utils import is_flash_attn_2_available
@@ -131,27 +135,17 @@ snapshot_download(
     token=HF_TOKEN,
 )
-# =========================================================
-# DOWNLOAD QWEN 2.5 VL
-# =========================================================
-QWEN_VL_REPO = "Qwen/Qwen2.5-VL-7B-Instruct"
-QWEN_VL_PATH = MODEL_CACHE_DIR / "Qwen2.5-VL-7B-Instruct"
-snapshot_download(
-    repo_id=QWEN_VL_REPO,
-    local_dir=str(QWEN_VL_PATH),
-    local_dir_use_symlinks=False,
-    token=HF_TOKEN,
-)
 DEFAULT_MODEL_PATH = str(
     MODEL_CACHE_DIR / "Lance_3B_Video"
 )
 print("DEFAULT_MODEL_PATH =", DEFAULT_MODEL_PATH)
-print("QWEN_VL_PATH =", QWEN_VL_PATH)
 # =========================================================
 # DEFAULTS
@@ -255,35 +249,7 @@ class LancePipeline:
             if not torch.cuda.is_available():
                 raise RuntimeError("CUDA unavailable")
-            print("Initializing Lance pipeline...")
-            # =====================================================
-            # QWEN VL LOAD FIX
-            # =====================================================
-            print("Loading Qwen2.5 VL Processor...")
-            self.qwen_processor = AutoProcessor.from_pretrained(
-                str(QWEN_VL_PATH),
-                trust_remote_code=True,
-                token=HF_TOKEN,
-            )
-            print("Loading Qwen2.5 VL Model...")
-            self.qwen_vl_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-                str(QWEN_VL_PATH),
-                torch_dtype=torch.bfloat16,
-                device_map="auto",
-                trust_remote_code=True,
-                token=HF_TOKEN,
-            )
-            print("Qwen2.5 VL loaded successfully")
-            # =====================================================
-            # LANCE CONFIG
-            # =====================================================
             model_args = ModelArguments(
                 model_path=DEFAULT_MODEL_PATH,
@@ -298,10 +264,10 @@ class LancePipeline:
             )
             # =====================================================
-            # FORCE CORRECT VIT PATH
             # =====================================================
-            model_args.vit_path = str(QWEN_VL_PATH)
             data_args = DataArguments()
@@ -334,6 +300,10 @@ class LancePipeline:
             set_seed(42)
             llm_config = Qwen2Config.from_json_file(
                 str(Path(model_args.model_path) / "llm_config.json")
             )
@@ -341,52 +311,65 @@ class LancePipeline:
             language_model = Qwen2ForCausalLM(llm_config)
             # =====================================================
-            # FIXED VIT CONFIG
             # =====================================================
-            print("Loading VIT config from:", model_args.vit_path)
-            from transformers import AutoConfig
-            vit_config = AutoConfig.from_pretrained(
-                model_args.vit_path,
-                trust_remote_code=True,
                 token=HF_TOKEN,
             )
             vit_config._attn_implementation = "eager"
             vit_model = Qwen2_5_VisionTransformerPretrainedModel(
                 vit_config
             )
-            vit_weights_path = Path(model_args.vit_path) / "model.safetensors"
-            if vit_weights_path.exists():
-                print("Loading VIT weights:", vit_weights_path)
-                vit_weights = load_file(
-                    str(vit_weights_path)
-                )
-                missing, unexpected = vit_model.load_state_dict(
-                    vit_weights,
-                    strict=False
-                )
-                print("Missing keys:", len(missing))
-                print("Unexpected keys:", len(unexpected))
-                clean_memory(vit_weights)
-            else:
-                print("WARNING: model.safetensors not found")
             vae_model = WanVideoVAE()
             vae_config = deepcopy(vae_model.vae_config)
             config = LanceConfig(
                 visual_gen=True,
                 visual_und=True,
@@ -410,6 +393,8 @@ class LancePipeline:
                 training_args=inference_args,
             )
             model = model.to(
                 device="cuda",
                 dtype=torch.bfloat16,
@@ -448,6 +433,10 @@ class LancePipeline:
             print("Lance initialized successfully")
     def generate(
         self,
         task,
@@ -465,231 +454,245 @@ class LancePipeline:
         cfg_text_scale,
     ):
-        task = normalize_task(task)
-        actual_seed = normalize_seed(int(seed))
-        set_seed(actual_seed)
-        save_dir = RESULTS_ROOT / str(time.time())
-        save_dir.mkdir(parents=True, exist_ok=True)
-        inference_args = deepcopy(
-            self.base_inference_args
-        )
-        inference_args.video_height = int(height)
-        inference_args.video_width = int(width)
-        inference_args.num_frames = int(num_frames)
-        inference_args.validation_num_timesteps = (
-            validation_num_timesteps
-        )
-        inference_args.validation_timestep_shift = (
-            validation_timestep_shift
-        )
-        inference_args.task = task
-        prompt_file = TMP_INPUT_DIR / f"prompt_{time.time()}.json"
-        # =====================================================
-        # PAYLOADS
-        # =====================================================
-        if task == TASK_T2V:
-            payload = {
-                "000000.mp4": prompt
-            }
-        elif task == TASK_T2I:
-            payload = {
-                "000000.png": prompt
-            }
-        elif task == TASK_IMAGE_EDIT:
-            payload = {
-                "000000": {
-                    "interleave_array": [
-                        input_image,
-                        [prompt, ""]
-                    ],
-                    "element_dtype_array": [
-                        "image",
-                        "text"
-                    ],
-                    "istarget_in_interleave": [
-                        0,
-                        1
-                    ],
                 }
-            }
-        elif task == TASK_VIDEO_EDIT:
-            payload = {
-                "000000": {
-                    "interleave_array": [
-                        input_video,
-                        [prompt, ""]
-                    ],
-                    "element_dtype_array": [
-                        "video",
-                        "text"
-                    ],
-                    "istarget_in_interleave": [
-                        0,
-                        1
-                    ],
                 }
-            }
-        elif task == TASK_X2T_IMAGE:
-            payload = {
-                "000000": {
-                    "interleave_array": [
-                        input_image,
-                        [
-                            "Describe the image",
-                            question,
-                            ""
-                        ]
-                    ],
-                    "element_dtype_array": [
-                        "image",
-                        "text"
-                    ],
-                    "istarget_in_interleave": [
-                        0,
-                        1
-                    ],
                 }
-            }
-        elif task == TASK_X2T_VIDEO:
-            payload = {
-                "000000": {
-                    "interleave_array": [
-                        input_video,
-                        [
-                            "Describe the video",
-                            question,
-                            ""
-                        ]
-                    ],
-                    "element_dtype_array": [
-                        "video",
-                        "text"
-                    ],
-                    "istarget_in_interleave": [
-                        0,
-                        1
-                    ],
                 }
-            }
-        else:
-            return (
-                None,
-                None,
-                "",
-                "Invalid task",
-                "",
             )
-        with open(prompt_file, "w") as f:
-            json.dump(payload, f)
-        dataset_config = DataConfig.from_yaml(
-            str(prompt_file)
-        )
-        val_dataset = ValidationDataset(
-            jsonl_path=str(prompt_file),
-            tokenizer=self.tokenizer,
-            data_args=self.base_data_args,
-            model_args=self.base_model_args,
-            training_args=inference_args,
-            new_token_ids=self.new_token_ids,
-            dataset_config=dataset_config,
-            local_rank=0,
-            world_size=1,
-        )
-        val_data_cpu = simple_custom_collate(
-            [val_dataset[0]]
-        )
-        validate_on_fixed_batch(
-            fsdp_model=self.model,
-            vae_model=self.vae_model,
-            tokenizer=self.tokenizer,
-            val_data_cpu=val_data_cpu,
-            training_args=inference_args,
-            model_args=self.base_model_args,
-            inference_args=inference_args,
-            new_token_ids=self.new_token_ids,
-            image_token_id=self.image_token_id,
-            device="cuda",
-            save_source_video=False,
-            save_path_gen=str(save_dir),
-            save_path_gt="",
-        )
-        clean_memory()
-        gc.collect()
-        torch.cuda.empty_cache()
-        videos = list(save_dir.glob("*.mp4"))
-        images = list(save_dir.glob("*.png"))
-        if len(videos) > 0:
-            return (
-                str(videos[0]),
-                None,
-                "",
-                "Success",
-                "",
             )
-        if len(images) > 0:
             return (
                 None,
-                str(images[0]),
                 "",
-                "Success",
                 "",
             )
-        if task in [TASK_X2T_IMAGE, TASK_X2T_VIDEO]:
             return (
                 None,
                 None,
-                "Understanding complete",
-                "Success",
                 "",
             )
-        return (
-            None,
-            None,
-            "",
-            "No output generated",
-            "",
-        )
 # =========================================================
 # GLOBAL
 # =========================================================
@@ -717,37 +720,23 @@ def run_task(
     cfg_text_scale,
 ):
-    try:
-        PIPELINE.initialize()
-        return PIPELINE.generate(
-            task=task,
-            prompt=prompt,
-            input_image=input_image,
-            input_video=input_video,
-            question=question,
-            height=height,
-            width=width,
-            num_frames=num_frames,
-            seed=seed,
-            resolution=resolution,
-            validation_num_timesteps=validation_num_timesteps,
-            validation_timestep_shift=validation_timestep_shift,
-            cfg_text_scale=cfg_text_scale,
-        )
-    except Exception as e:
-        traceback_str = traceback.format_exc()
-        return (
-            None,
-            None,
-            "",
-            f"ERROR: {str(e)}",
-            traceback_str,
-        )
 # =========================================================
 # UI

 # =========================================================
+# ZERO GPU PATCHED + ALL TASKS ENABLED
+# Qwen2.5-VL FIXED VERSION
 # Hugging Face Spaces Compatible
 # =========================================================
 # LOGIN
 # =========================================================
+from huggingface_hub import (
+    login,
+    snapshot_download,
+    hf_hub_download,
+)
 HF_TOKEN = os.getenv("HF_TOKEN")
 from safetensors.torch import load_file
 from transformers import (
     set_seed,
+    AutoConfig,
 )
 from transformers.utils import is_flash_attn_2_available
     token=HF_TOKEN,
 )
 DEFAULT_MODEL_PATH = str(
     MODEL_CACHE_DIR / "Lance_3B_Video"
 )
 print("DEFAULT_MODEL_PATH =", DEFAULT_MODEL_PATH)
+# =========================================================
+# QWEN VL
+# =========================================================
+QWEN_VL_REPO = "Qwen/Qwen2.5-VL-7B-Instruct"
 # =========================================================
 # DEFAULTS
             if not torch.cuda.is_available():
                 raise RuntimeError("CUDA unavailable")
+            print("Initializing Lance...")
             model_args = ModelArguments(
                 model_path=DEFAULT_MODEL_PATH,
             )
             # =====================================================
+            # IMPORTANT FIX
             # =====================================================
+            model_args.vit_path = QWEN_VL_REPO
             data_args = DataArguments()
             set_seed(42)
+            # =====================================================
+            # LLM
+            # =====================================================
             llm_config = Qwen2Config.from_json_file(
                 str(Path(model_args.model_path) / "llm_config.json")
             )
             language_model = Qwen2ForCausalLM(llm_config)
             # =====================================================
+            # FIXED QWEN2.5-VL LOADING
             # =====================================================
+            print("Loading Qwen2.5-VL config...")
+            full_qwen_config = AutoConfig.from_pretrained(
+                QWEN_VL_REPO,
                 token=HF_TOKEN,
+                trust_remote_code=True,
             )
+            vit_config = full_qwen_config.vision_config
             vit_config._attn_implementation = "eager"
+            print("Creating vision transformer...")
             vit_model = Qwen2_5_VisionTransformerPretrainedModel(
                 vit_config
             )
+            # =====================================================
+            # LOAD WEIGHTS
+            # =====================================================
+            print("Downloading Qwen weights...")
+            vit_weights_path = hf_hub_download(
+                repo_id=QWEN_VL_REPO,
+                filename="model.safetensors",
+                token=HF_TOKEN,
+            )
+            print("Loading VIT weights...")
+            vit_weights = load_file(vit_weights_path)
+            missing, unexpected = vit_model.load_state_dict(
+                vit_weights,
+                strict=False,
+            )
+            print("Missing keys:", len(missing))
+            print("Unexpected keys:", len(unexpected))
+            clean_memory(vit_weights)
+            # =====================================================
+            # VAE
+            # =====================================================
             vae_model = WanVideoVAE()
             vae_config = deepcopy(vae_model.vae_config)
+            # =====================================================
+            # CONFIG
+            # =====================================================
             config = LanceConfig(
                 visual_gen=True,
                 visual_und=True,
                 training_args=inference_args,
             )
+            print("Moving model to CUDA...")
             model = model.to(
                 device="cuda",
                 dtype=torch.bfloat16,
             print("Lance initialized successfully")
+    # =========================================================
+    # GENERATE
+    # =========================================================
     def generate(
         self,
         task,
         cfg_text_scale,
     ):
+        try:
+            task = normalize_task(task)
+            actual_seed = normalize_seed(int(seed))
+            set_seed(actual_seed)
+            save_dir = RESULTS_ROOT / str(time.time())
+            save_dir.mkdir(parents=True, exist_ok=True)
+            inference_args = deepcopy(
+                self.base_inference_args
+            )
+            inference_args.video_height = int(height)
+            inference_args.video_width = int(width)
+            inference_args.num_frames = int(num_frames)
+            inference_args.validation_num_timesteps = (
+                validation_num_timesteps
+            )
+            inference_args.validation_timestep_shift = (
+                validation_timestep_shift
+            )
+            inference_args.task = task
+            prompt_file = TMP_INPUT_DIR / "prompt.json"
+            # =====================================================
+            # PAYLOADS
+            # =====================================================
+            if task == TASK_T2V:
+                payload = {
+                    "000000.mp4": prompt
+                }
+            elif task == TASK_T2I:
+                payload = {
+                    "000000.png": prompt
+                }
+            elif task == TASK_IMAGE_EDIT:
+                payload = {
+                    "000000": {
+                        "interleave_array": [
+                            input_image,
+                            [prompt, ""]
+                        ],
+                        "element_dtype_array": [
+                            "image",
+                            "text"
+                        ],
+                        "istarget_in_interleave": [
+                            0,
+                            1
+                        ],
+                    }
                 }
+            elif task == TASK_VIDEO_EDIT:
+                payload = {
+                    "000000": {
+                        "interleave_array": [
+                            input_video,
+                            [prompt, ""]
+                        ],
+                        "element_dtype_array": [
+                            "video",
+                            "text"
+                        ],
+                        "istarget_in_interleave": [
+                            0,
+                            1
+                        ],
+                    }
                 }
+            elif task == TASK_X2T_IMAGE:
+                payload = {
+                    "000000": {
+                        "interleave_array": [
+                            input_image,
+                            [
+                                "Describe the image",
+                                question,
+                                ""
+                            ]
+                        ],
+                        "element_dtype_array": [
+                            "image",
+                            "text"
+                        ],
+                        "istarget_in_interleave": [
+                            0,
+                            1
+                        ],
+                    }
                 }
+            elif task == TASK_X2T_VIDEO:
+                payload = {
+                    "000000": {
+                        "interleave_array": [
+                            input_video,
+                            [
+                                "Describe the video",
+                                question,
+                                ""
+                            ]
+                        ],
+                        "element_dtype_array": [
+                            "video",
+                            "text"
+                        ],
+                        "istarget_in_interleave": [
+                            0,
+                            1
+                        ],
+                    }
                 }
+            else:
+                return (
+                    None,
+                    None,
+                    "",
+                    "Invalid task",
+                    "",
+                )
+            with open(prompt_file, "w") as f:
+                json.dump(payload, f)
+            dataset_config = DataConfig.from_yaml(
+                str(prompt_file)
             )
+            val_dataset = ValidationDataset(
+                jsonl_path=str(prompt_file),
+                tokenizer=self.tokenizer,
+                data_args=self.base_data_args,
+                model_args=self.base_model_args,
+                training_args=inference_args,
+                new_token_ids=self.new_token_ids,
+                dataset_config=dataset_config,
+                local_rank=0,
+                world_size=1,
+            )
+            val_data_cpu = simple_custom_collate(
+                [val_dataset[0]]
             )
+            validate_on_fixed_batch(
+                fsdp_model=self.model,
+                vae_model=self.vae_model,
+                tokenizer=self.tokenizer,
+                val_data_cpu=val_data_cpu,
+                training_args=inference_args,
+                model_args=self.base_model_args,
+                inference_args=inference_args,
+                new_token_ids=self.new_token_ids,
+                image_token_id=self.image_token_id,
+                device="cuda",
+                save_source_video=False,
+                save_path_gen=str(save_dir),
+                save_path_gt="",
+            )
+            clean_memory()
+            gc.collect()
+            torch.cuda.empty_cache()
+            videos = list(save_dir.glob("*.mp4"))
+            images = list(save_dir.glob("*.png"))
+            if len(videos) > 0:
+                return (
+                    str(videos[0]),
+                    None,
+                    "",
+                    "Success",
+                    "",
+                )
+            if len(images) > 0:
+                return (
+                    None,
+                    str(images[0]),
+                    "",
+                    "Success",
+                    "",
+                )
+            if task in [TASK_X2T_IMAGE, TASK_X2T_VIDEO]:
+                return (
+                    None,
+                    None,
+                    "Understanding complete",
+                    "Success",
+                    "",
+                )
             return (
                 None,
+                None,
                 "",
+                "No output generated",
                 "",
             )
+        except Exception as e:
+            traceback.print_exc()
             return (
                 None,
                 None,
                 "",
+                f"ERROR: {str(e)}",
+                traceback.format_exc(),
             )
 # =========================================================
 # GLOBAL
 # =========================================================
     cfg_text_scale,
 ):
+    PIPELINE.initialize()
+    return PIPELINE.generate(
+        task=task,
+        prompt=prompt,
+        input_image=input_image,
+        input_video=input_video,
+        question=question,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        seed=seed,
+        resolution=resolution,
+        validation_num_timesteps=validation_num_timesteps,
+        validation_timestep_shift=validation_timestep_shift,
+        cfg_text_scale=cfg_text_scale,
+    )
 # =========================================================
 # UI