wkun03 commited on
Commit
7b132bc
·
verified ·
1 Parent(s): 8aad070

Upload 11 files

Browse files
Act_ckpt/config.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ activitynetcaptions:
2
+ batch_size: 128
3
+ clip_frames:
4
+ - 16
5
+ epoch: 15
6
+ feature_dim: 500
7
+ feature_dir: /data/activitynetcaptions/c3d
8
+ moment_length_factors:
9
+ - 0.2
10
+ - 0.3
11
+ - 0.5
12
+ - 0.7
13
+ - 0.8
14
+ overlapping_factors:
15
+ - 0.0
16
+ - 0.1
17
+ - 0.2
18
+ - 0.3
19
+ - 0.4
20
+ - 0.5
21
+ - 0.6
22
+ - 0.7
23
+ - 0.8
24
+ - 0.9
25
+ pooling_func: max_pooling
26
+ sigma_factor: 0.4
27
+ stride: 8
28
+ video_feature_len: 256
29
+ dataset_name: activitynetcaptions
30
+ exp_dir: log
31
+ gpu: '0'
32
+ model:
33
+ dim: 512
34
+ dropout: 0.1
35
+ glove_path: /data/glove.840B.300d.txt
36
+ n_layers: 2
37
+ temp: 0.07
38
+ topk: 5
39
+ seed: 100
40
+ train:
41
+ clip_norm: 1.0
42
+ dev: false
43
+ init_lr: 0.0001
44
+
45
+ alpha: 4
46
+ beta: 3
47
+ gamma: 0.25
Act_ckpt/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:265bbf52694623b45f6dbd582b6e6e3debefaf19f7f357fc5e932555d5405ffc
3
+ size 68353819
Cha_ckpt/C3D/config.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ charadessta:
2
+ batch_size: 128
3
+ clip_frames:
4
+ - 8
5
+ epoch: 30
6
+ feature_dim: 4096
7
+ feature_dir: /data/charadessta/c3d
8
+ moment_length_factors:
9
+ - 0.25
10
+ - 0.3
11
+ - 0.35
12
+ overlapping_factors:
13
+ - 0.0
14
+ - 0.1
15
+ - 0.2
16
+ - 0.3
17
+ - 0.4
18
+ - 0.5
19
+ - 0.6
20
+ - 0.7
21
+ - 0.8
22
+ - 0.9
23
+ pooling_func: max_pooling
24
+ sigma_factor: 0.3
25
+ stride: 4
26
+ video_feature_len: 256
27
+ dataset_name: charadessta
28
+ exp_dir: log
29
+ gpu: '0'
30
+ model:
31
+ dim: 512
32
+ dropout: 0.1
33
+ glove_path: /data/glove.840B.300d.txt
34
+ n_layers: 2
35
+ temp: 0.07
36
+ topk: 5
37
+ seed: 0
38
+ train:
39
+ clip_norm: 1.0
40
+ dev: false
41
+ init_lr: 0.0001
42
+
43
+ alpha: 1
44
+ beta: 1
45
+ gamma: 0.25
Cha_ckpt/C3D/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:031a35b44d8405e8790c5918c31ace71220869e56972e73d92e77e4bcb7c0f8e
3
+ size 75748154
Cha_ckpt/I3D/config.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ charadessta:
2
+ batch_size: 512
3
+ clip_frames:
4
+ - 8
5
+ epoch: 30
6
+ feature_dim: 1024
7
+ feature_dir: /data/charadessta/i3d
8
+ moment_length_factors:
9
+ - 0.25
10
+ - 0.3
11
+ - 0.35
12
+ overlapping_factors:
13
+ - 0.0
14
+ - 0.1
15
+ - 0.2
16
+ - 0.3
17
+ - 0.4
18
+ - 0.5
19
+ - 0.6
20
+ - 0.7
21
+ - 0.8
22
+ - 0.9
23
+ pooling_func: max_pooling
24
+ sigma_factor: 0.3
25
+ stride: 4
26
+ video_feature_len: 128
27
+ dataset_name: charadessta
28
+ exp_dir: log
29
+ gpu: '0'
30
+ model:
31
+ dim: 512
32
+ dropout: 0.1
33
+ glove_path: /data/glove.840B.300d.txt
34
+ n_layers: 2
35
+ temp: 0.07
36
+ topk: 5
37
+ seed: 1
38
+ train:
39
+ clip_norm: 1.0
40
+ dev: false
41
+ init_lr: 0.0001
42
+
43
+ alpha: 4
44
+ beta: 3
45
+ gamma: 0.25
Cha_ckpt/I3D/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:847333cbb9fb70e1d761711c658df061267c137802a3df0ee786a17c124512a8
3
+ size 69069211
Cha_ckpt/VGG/config.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ charadessta:
2
+ batch_size: 256
3
+ clip_frames:
4
+ - 8
5
+ epoch: 30
6
+ feature_dim: 4096
7
+ feature_dir: /data/charadessta/vgg
8
+ moment_length_factors:
9
+ - 0.25
10
+ - 0.3
11
+ - 0.35
12
+ overlapping_factors:
13
+ - 0.0
14
+ - 0.1
15
+ - 0.2
16
+ - 0.3
17
+ - 0.4
18
+ - 0.5
19
+ - 0.6
20
+ - 0.7
21
+ - 0.8
22
+ - 0.9
23
+ pooling_func: max_pooling
24
+ sigma_factor: 0.3
25
+ stride: 4
26
+ video_feature_len: 256
27
+ dataset_name: charadessta
28
+ exp_dir: log
29
+ gpu: '2'
30
+ model:
31
+ dim: 512
32
+ dropout: 0.1
33
+ glove_path: /data/glove.840B.300d.txt
34
+ n_layers: 2
35
+ temp: 0.07
36
+ topk: 5
37
+ seed: 1
38
+ train:
39
+ clip_norm: 1.0
40
+ dev: false
41
+ init_lr: 0.0001
42
+
43
+ alpha: 1
44
+ beta: 1
45
+ gamma: 0.25
Cha_ckpt/VGG/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e641b17b826f46d69dd44ca3cf02669eb8093081e247407f6ec98c1fb9bd0810
3
+ size 75748154
README.md CHANGED
@@ -1,3 +1,134 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - video-moment-retrieval
5
+ - point-supervised
6
+ - vision-language
7
+ - multimodal
8
+ - representation-shift
9
+ - pytorch
10
+ ---
11
+
12
+ <a id="top"></a>
13
+ <div align="center">
14
+ <h1>🚀 DRONE: Cross-modal Representation Shift Refinement for Point-supervised Video Moment Retrieval</h1>
15
+
16
+ <p>
17
+ <b>Kun Wang</b><sup>1</sup>&nbsp;
18
+ <b>Yupeng Hu</b><sup>1✉</sup>&nbsp;
19
+ <b>Hao Liu</b><sup>1</sup>&nbsp;
20
+ <b>Jiang Shao</b><sup>1</sup>&nbsp;
21
+ <b>Liqiang Nie</b><sup>2</sup>
22
+ </p>
23
+
24
+ <p>
25
+ <sup>1</sup>School of Software, Shandong University, Jinan, China<br>
26
+ <sup>2</sup>School of Computer Science and Technology, Harbin Institute of Technology (Shenzhen), Shenzhen, China<br>
27
+ <sup>✉</sup>Corresponding author
28
+ </p>
29
+ </div>
30
+
31
+ This repository contains the official implementation, pre-trained model weights, and configuration files for **DRONE**, a point-supervised Video Moment Retrieval (VMR) framework designed to mitigate cross-modal representation shift.
32
+
33
+ 🔗 **Paper:** [Accepted by ACM TOIS 2026](https://dl.acm.org/doi/10.1145/3786606)
34
+ 🔗 **GitHub Repository:** [iLearn-Lab/DRONE](https://github.com/iLearn-Lab/DRONE)
35
+
36
+ ---
37
+
38
+ ## 📌 Model Information
39
+
40
+ ### 1. Model Name
41
+ **DRONE** (Cross-modal Representation Shift Refinement)
42
+
43
+ ### 2. Task Type & Applicable Tasks
44
+ - **Task Type:** Point-supervised Video Moment Retrieval (VMR) / Vision-Language / Multimodal Learning
45
+ - **Applicable Tasks:** Localizing temporal segments in untrimmed videos that match natural language queries, utilizing only point-level supervision to reduce annotation costs while actively addressing cross-modal representation shifts.
46
+
47
+ ### 3. Project Introduction
48
+ Point-supervised Video Moment Retrieval (VMR) aims to localize the temporal segment in a video that matches a natural language query using only single-frame annotations. **DRONE** addresses the cross-modal representation shift issue inherent in this setting, progressively improving temporal alignment and semantic consistency between video and text representations.
49
+
50
+ > 💡 **Method Highlight:** DRONE introduces **Pseudo-Frame Temporal Alignment (PTA)** and **Curriculum-Guided Semantic Refinement (CSR)**. Together, these modules systematically mitigate representation shifts, allowing the model to bridge the semantic gap between visual frames and textual queries effectively.
51
+
52
+ ### 4. Training Data Source
53
+ The model supports and is evaluated on three standard VMR datasets:
54
+ - **ActivityNet Captions**
55
+ - **Charades-STA**
56
+ - **TACoS**
57
+ *(Follows splits and feature preparation from [ViGA](https://github.com/r-cui/ViGA))*
58
+
59
+ ---
60
+
61
+ ## 🚀 Usage & Basic Inference
62
+
63
+ ### Step 1: Prepare the Environment
64
+ Clone the GitHub repository and set up the virtual environment:
65
+ ```bash
66
+ git clone https://github.com/iLearn-Lab/DRONE.git
67
+ cd DRONE
68
+ ```
69
+ ```bash
70
+ python -m venv .venv
71
+ source .venv/bin/activate # Linux / Mac
72
+ # .venv\Scripts\activate # Windows
73
+ ```
74
+ ```bash
75
+ pip install numpy scipy pyyaml tqdm
76
+ ```
77
+
78
+ ### Step 2: Download Model Weights & Data
79
+ 1. **Pre-trained Checkpoints:** Download the model checkpoints (includes `Act_ckpt/`, `Cha_ckpt/`, and `TACoS_ckpt/`).
80
+ 2. **Datasets & Features:** Follow [ViGA](https://github.com/r-cui/ViGA)'s dataset preparation guidelines for ActivityNet Captions, Charades-STA, and TACoS.
81
+ 3. **Configuration:** Before running, ensure you replace the local dataset root and feature paths in `src/config.yaml` and `src/utils/utils.py` with your actual local paths.
82
+
83
+ ### Step 3: Run Training & Evaluation
84
+
85
+ **Training from Scratch:**
86
+ Depending on the dataset you want to train on, run the following commands:
87
+
88
+ #### For ActivityNet Captions
89
+ python -m src.experiment.train --task activitynetcaptions
90
+
91
+ #### For Charades-STA
92
+ python -m src.experiment.train --task charadessta
93
+
94
+ #### For TACoS
95
+ python -m src.experiment.train --task tacos
96
+
97
+
98
+ **Evaluation (Eval):**
99
+ To evaluate a trained experiment folder (which should contain `config.yaml` and `model_best.pt`), run:
100
+
101
+ python -m src.experiment.eval --exp path/to/your/experiment_folder
102
+
103
+ ---
104
+
105
+ ## ⚠️ Limitations & Notes
106
+
107
+ **Disclaimer:** This framework and its pre-trained weights are intended for **academic research purposes only**.
108
+ - The model requires access to the original source datasets (ActivityNet Captions, Charades-STA, TACoS) for full evaluation.
109
+ - While designed to mitigate cross-modal representation shifts, performance relies on the quality of the point-level annotations and the inherent capacities of the selected visual backbones (C3D, I3D, VGG).
110
+
111
+ ---
112
+
113
+ ## 🤝 Acknowledgements & Contact
114
+
115
+ - **Acknowledgement:** This implementation and data organization are inspired by the [ViGA](https://github.com/r-cui/ViGA) open-source community. Thanks to all collaborators and contributors of this project.
116
+ - **Contact:** If you have any questions, feel free to contact me at `khylon.kun.wang@gmail.com`.
117
+
118
+ ---
119
+
120
+ ## 📝⭐️ Citation
121
+
122
+ If you find our work or this repository useful in your research, please consider citing our paper:
123
+
124
+
125
+ @article{wang2026cross,
126
+ title={Cross-Modal Representation Shift Refinement for Point-supervised Video Moment Retrieval},
127
+ author={Wang, Kun and Hu, Yupeng and Liu, Hao and Shao, Jiang and Nie, Liqiang},
128
+ journal={ACM Transactions on Information Systems},
129
+ volume={44},
130
+ number={3},
131
+ pages={1--30},
132
+ year={2026},
133
+ publisher={ACM New York, NY}
134
+ }
TACoS_ckpt/config.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_name: tacos
2
+ exp_dir: log
3
+ gpu: '0'
4
+ model:
5
+ dim: 512
6
+ dropout: 0.1
7
+ glove_path: /data/glove.840B.300d.txt
8
+ n_layers: 2
9
+ temp: 0.07
10
+ topk: 5
11
+ seed: 16
12
+ tacos:
13
+ batch_size: 64
14
+ clip_frames:
15
+ - 32
16
+ epoch: 15
17
+ feature_dim: 4096
18
+ feature_dir: /data/tacos/c3d
19
+ moment_length_factors:
20
+ - 0.05
21
+ - 0.06
22
+ - 0.075
23
+ - 0.085
24
+ - 0.1
25
+ - 0.125
26
+ - 0.15
27
+ - 0.175
28
+ - 0.3
29
+ - 0.4
30
+ overlapping_factors:
31
+ - 0.0
32
+ - 0.1
33
+ - 0.2
34
+ - 0.3
35
+ - 0.4
36
+ - 0.5
37
+ - 0.6
38
+ - 0.7
39
+ - 0.8
40
+ - 0.9
41
+ pooling_func: mean_pooling
42
+ sigma_factor: 1.0
43
+ stride: 16
44
+ video_feature_len: 512
45
+ train:
46
+ clip_norm: 1.0
47
+ dev: false
48
+ init_lr: 0.0001
49
+
50
+ alpha: 4
51
+ beta: 3
52
+ gamma: 0.25
TACoS_ckpt/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e96b295f39c656006a944f7c978c160762b9ddc2f1d26f43a06472f8e24969fd
3
+ size 76667803