wkun03 commited on
Commit
7b132bc
·
verified ·
1 Parent(s): 8aad070

Upload 11 files

Browse files
Act_ckpt/config.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ activitynetcaptions:
2
+ batch_size: 128
3
+ clip_frames:
4
+ - 16
5
+ epoch: 15
6
+ feature_dim: 500
7
+ feature_dir: /data/activitynetcaptions/c3d
8
+ moment_length_factors:
9
+ - 0.2
10
+ - 0.3
11
+ - 0.5
12
+ - 0.7
13
+ - 0.8
14
+ overlapping_factors:
15
+ - 0.0
16
+ - 0.1
17
+ - 0.2
18
+ - 0.3
19
+ - 0.4
20
+ - 0.5
21
+ - 0.6
22
+ - 0.7
23
+ - 0.8
24
+ - 0.9
25
+ pooling_func: max_pooling
26
+ sigma_factor: 0.4
27
+ stride: 8
28
+ video_feature_len: 256
29
+ dataset_name: activitynetcaptions
30
+ exp_dir: log
31
+ gpu: '0'
32
+ model:
33
+ dim: 512
34
+ dropout: 0.1
35
+ glove_path: /data/glove.840B.300d.txt
36
+ n_layers: 2
37
+ temp: 0.07
38
+ topk: 5
39
+ seed: 100
40
+ train:
41
+ clip_norm: 1.0
42
+ dev: false
43
+ init_lr: 0.0001
44
+
45
+ alpha: 4
46
+ beta: 3
47
+ gamma: 0.25
Act_ckpt/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:265bbf52694623b45f6dbd582b6e6e3debefaf19f7f357fc5e932555d5405ffc
3
+ size 68353819
Cha_ckpt/C3D/config.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ charadessta:
2
+ batch_size: 128
3
+ clip_frames:
4
+ - 8
5
+ epoch: 30
6
+ feature_dim: 4096
7
+ feature_dir: /data/charadessta/c3d
8
+ moment_length_factors:
9
+ - 0.25
10
+ - 0.3
11
+ - 0.35
12
+ overlapping_factors:
13
+ - 0.0
14
+ - 0.1
15
+ - 0.2
16
+ - 0.3
17
+ - 0.4
18
+ - 0.5
19
+ - 0.6
20
+ - 0.7
21
+ - 0.8
22
+ - 0.9
23
+ pooling_func: max_pooling
24
+ sigma_factor: 0.3
25
+ stride: 4
26
+ video_feature_len: 256
27
+ dataset_name: charadessta
28
+ exp_dir: log
29
+ gpu: '0'
30
+ model:
31
+ dim: 512
32
+ dropout: 0.1
33
+ glove_path: /data/glove.840B.300d.txt
34
+ n_layers: 2
35
+ temp: 0.07
36
+ topk: 5
37
+ seed: 0
38
+ train:
39
+ clip_norm: 1.0
40
+ dev: false
41
+ init_lr: 0.0001
42
+
43
+ alpha: 1
44
+ beta: 1
45
+ gamma: 0.25
Cha_ckpt/C3D/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:031a35b44d8405e8790c5918c31ace71220869e56972e73d92e77e4bcb7c0f8e
3
+ size 75748154
Cha_ckpt/I3D/config.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ charadessta:
2
+ batch_size: 512
3
+ clip_frames:
4
+ - 8
5
+ epoch: 30
6
+ feature_dim: 1024
7
+ feature_dir: /data/charadessta/i3d
8
+ moment_length_factors:
9
+ - 0.25
10
+ - 0.3
11
+ - 0.35
12
+ overlapping_factors:
13
+ - 0.0
14
+ - 0.1
15
+ - 0.2
16
+ - 0.3
17
+ - 0.4
18
+ - 0.5
19
+ - 0.6
20
+ - 0.7
21
+ - 0.8
22
+ - 0.9
23
+ pooling_func: max_pooling
24
+ sigma_factor: 0.3
25
+ stride: 4
26
+ video_feature_len: 128
27
+ dataset_name: charadessta
28
+ exp_dir: log
29
+ gpu: '0'
30
+ model:
31
+ dim: 512
32
+ dropout: 0.1
33
+ glove_path: /data/glove.840B.300d.txt
34
+ n_layers: 2
35
+ temp: 0.07
36
+ topk: 5
37
+ seed: 1
38
+ train:
39
+ clip_norm: 1.0
40
+ dev: false
41
+ init_lr: 0.0001
42
+
43
+ alpha: 4
44
+ beta: 3
45
+ gamma: 0.25
Cha_ckpt/I3D/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:847333cbb9fb70e1d761711c658df061267c137802a3df0ee786a17c124512a8
3
+ size 69069211
Cha_ckpt/VGG/config.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ charadessta:
2
+ batch_size: 256
3
+ clip_frames:
4
+ - 8
5
+ epoch: 30
6
+ feature_dim: 4096
7
+ feature_dir: /data/charadessta/vgg
8
+ moment_length_factors:
9
+ - 0.25
10
+ - 0.3
11
+ - 0.35
12
+ overlapping_factors:
13
+ - 0.0
14
+ - 0.1
15
+ - 0.2
16
+ - 0.3
17
+ - 0.4
18
+ - 0.5
19
+ - 0.6
20
+ - 0.7
21
+ - 0.8
22
+ - 0.9
23
+ pooling_func: max_pooling
24
+ sigma_factor: 0.3
25
+ stride: 4
26
+ video_feature_len: 256
27
+ dataset_name: charadessta
28
+ exp_dir: log
29
+ gpu: '2'
30
+ model:
31
+ dim: 512
32
+ dropout: 0.1
33
+ glove_path: /data/glove.840B.300d.txt
34
+ n_layers: 2
35
+ temp: 0.07
36
+ topk: 5
37
+ seed: 1
38
+ train:
39
+ clip_norm: 1.0
40
+ dev: false
41
+ init_lr: 0.0001
42
+
43
+ alpha: 1
44
+ beta: 1
45
+ gamma: 0.25
Cha_ckpt/VGG/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e641b17b826f46d69dd44ca3cf02669eb8093081e247407f6ec98c1fb9bd0810
3
+ size 75748154
README.md CHANGED
@@ -1,3 +1,134 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - video-moment-retrieval
5
+ - point-supervised
6
+ - vision-language
7
+ - multimodal
8
+ - representation-shift
9
+ - pytorch
10
+ ---
11
+
12
+ <a id="top"></a>
13
+ <div align="center">
14
+ <h1>🚀 DRONE: Cross-modal Representation Shift Refinement for Point-supervised Video Moment Retrieval</h1>
15
+
16
+ <p>
17
+ <b>Kun Wang</b><sup>1</sup>&nbsp;
18
+ <b>Yupeng Hu</b><sup>1✉</sup>&nbsp;
19
+ <b>Hao Liu</b><sup>1</sup>&nbsp;
20
+ <b>Jiang Shao</b><sup>1</sup>&nbsp;
21
+ <b>Liqiang Nie</b><sup>2</sup>
22
+ </p>
23
+
24
+ <p>
25
+ <sup>1</sup>School of Software, Shandong University, Jinan, China<br>
26
+ <sup>2</sup>School of Computer Science and Technology, Harbin Institute of Technology (Shenzhen), Shenzhen, China<br>
27
+ <sup>✉</sup>Corresponding author
28
+ </p>
29
+ </div>
30
+
31
+ This repository contains the official implementation, pre-trained model weights, and configuration files for **DRONE**, a point-supervised Video Moment Retrieval (VMR) framework designed to mitigate cross-modal representation shift.
32
+
33
+ 🔗 **Paper:** [Accepted by ACM TOIS 2026](https://dl.acm.org/doi/10.1145/3786606)
34
+ 🔗 **GitHub Repository:** [iLearn-Lab/DRONE](https://github.com/iLearn-Lab/DRONE)
35
+
36
+ ---
37
+
38
+ ## 📌 Model Information
39
+
40
+ ### 1. Model Name
41
+ **DRONE** (Cross-modal Representation Shift Refinement)
42
+
43
+ ### 2. Task Type & Applicable Tasks
44
+ - **Task Type:** Point-supervised Video Moment Retrieval (VMR) / Vision-Language / Multimodal Learning
45
+ - **Applicable Tasks:** Localizing temporal segments in untrimmed videos that match natural language queries, utilizing only point-level supervision to reduce annotation costs while actively addressing cross-modal representation shifts.
46
+
47
+ ### 3. Project Introduction
48
+ Point-supervised Video Moment Retrieval (VMR) aims to localize the temporal segment in a video that matches a natural language query using only single-frame annotations. **DRONE** addresses the cross-modal representation shift issue inherent in this setting, progressively improving temporal alignment and semantic consistency between video and text representations.
49
+
50
+ > 💡 **Method Highlight:** DRONE introduces **Pseudo-Frame Temporal Alignment (PTA)** and **Curriculum-Guided Semantic Refinement (CSR)**. Together, these modules systematically mitigate representation shifts, allowing the model to bridge the semantic gap between visual frames and textual queries effectively.
51
+
52
+ ### 4. Training Data Source
53
+ The model supports and is evaluated on three standard VMR datasets:
54
+ - **ActivityNet Captions**
55
+ - **Charades-STA**
56
+ - **TACoS**
57
+ *(Follows splits and feature preparation from [ViGA](https://github.com/r-cui/ViGA))*
58
+
59
+ ---
60
+
61
+ ## 🚀 Usage & Basic Inference
62
+
63
+ ### Step 1: Prepare the Environment
64
+ Clone the GitHub repository and set up the virtual environment:
65
+ ```bash
66
+ git clone https://github.com/iLearn-Lab/DRONE.git
67
+ cd DRONE
68
+ ```
69
+ ```bash
70
+ python -m venv .venv
71
+ source .venv/bin/activate # Linux / Mac
72
+ # .venv\Scripts\activate # Windows
73
+ ```
74
+ ```bash
75
+ pip install numpy scipy pyyaml tqdm
76
+ ```
77
+
78
+ ### Step 2: Download Model Weights & Data
79
+ 1. **Pre-trained Checkpoints:** Download the model checkpoints (includes `Act_ckpt/`, `Cha_ckpt/`, and `TACoS_ckpt/`).
80
+ 2. **Datasets & Features:** Follow [ViGA](https://github.com/r-cui/ViGA)'s dataset preparation guidelines for ActivityNet Captions, Charades-STA, and TACoS.
81
+ 3. **Configuration:** Before running, ensure you replace the local dataset root and feature paths in `src/config.yaml` and `src/utils/utils.py` with your actual local paths.
82
+
83
+ ### Step 3: Run Training & Evaluation
84
+
85
+ **Training from Scratch:**
86
+ Depending on the dataset you want to train on, run the following commands:
87
+
88
+ #### For ActivityNet Captions
89
+ python -m src.experiment.train --task activitynetcaptions
90
+
91
+ #### For Charades-STA
92
+ python -m src.experiment.train --task charadessta
93
+
94
+ #### For TACoS
95
+ python -m src.experiment.train --task tacos
96
+
97
+
98
+ **Evaluation (Eval):**
99
+ To evaluate a trained experiment folder (which should contain `config.yaml` and `model_best.pt`), run:
100
+
101
+ python -m src.experiment.eval --exp path/to/your/experiment_folder
102
+
103
+ ---
104
+
105
+ ## ⚠️ Limitations & Notes
106
+
107
+ **Disclaimer:** This framework and its pre-trained weights are intended for **academic research purposes only**.
108
+ - The model requires access to the original source datasets (ActivityNet Captions, Charades-STA, TACoS) for full evaluation.
109
+ - While designed to mitigate cross-modal representation shifts, performance relies on the quality of the point-level annotations and the inherent capacities of the selected visual backbones (C3D, I3D, VGG).
110
+
111
+ ---
112
+
113
+ ## 🤝 Acknowledgements & Contact
114
+
115
+ - **Acknowledgement:** This implementation and data organization are inspired by the [ViGA](https://github.com/r-cui/ViGA) open-source community. Thanks to all collaborators and contributors of this project.
116
+ - **Contact:** If you have any questions, feel free to contact me at `khylon.kun.wang@gmail.com`.
117
+
118
+ ---
119
+
120
+ ## 📝⭐️ Citation
121
+
122
+ If you find our work or this repository useful in your research, please consider citing our paper:
123
+
124
+
125
+ @article{wang2026cross,
126
+ title={Cross-Modal Representation Shift Refinement for Point-supervised Video Moment Retrieval},
127
+ author={Wang, Kun and Hu, Yupeng and Liu, Hao and Shao, Jiang and Nie, Liqiang},
128
+ journal={ACM Transactions on Information Systems},
129
+ volume={44},
130
+ number={3},
131
+ pages={1--30},
132
+ year={2026},
133
+ publisher={ACM New York, NY}
134
+ }
TACoS_ckpt/config.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: tacos
2
+ exp_dir: log
3
+ gpu: '0'
4
+ model:
5
+ dim: 512
6
+ dropout: 0.1
7
+ glove_path: /data/glove.840B.300d.txt
8
+ n_layers: 2
9
+ temp: 0.07
10
+ topk: 5
11
+ seed: 16
12
+ tacos:
13
+ batch_size: 64
14
+ clip_frames:
15
+ - 32
16
+ epoch: 15
17
+ feature_dim: 4096
18
+ feature_dir: /data/tacos/c3d
19
+ moment_length_factors:
20
+ - 0.05
21
+ - 0.06
22
+ - 0.075
23
+ - 0.085
24
+ - 0.1
25
+ - 0.125
26
+ - 0.15
27
+ - 0.175
28
+ - 0.3
29
+ - 0.4
30
+ overlapping_factors:
31
+ - 0.0
32
+ - 0.1
33
+ - 0.2
34
+ - 0.3
35
+ - 0.4
36
+ - 0.5
37
+ - 0.6
38
+ - 0.7
39
+ - 0.8
40
+ - 0.9
41
+ pooling_func: mean_pooling
42
+ sigma_factor: 1.0
43
+ stride: 16
44
+ video_feature_len: 512
45
+ train:
46
+ clip_norm: 1.0
47
+ dev: false
48
+ init_lr: 0.0001
49
+
50
+ alpha: 4
51
+ beta: 3
52
+ gamma: 0.25
TACoS_ckpt/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e96b295f39c656006a944f7c978c160762b9ddc2f1d26f43a06472f8e24969fd
3
+ size 76667803