Upload 7 files
Browse files- Cha_ckpt/i3d/config.yaml +45 -0
- Cha_ckpt/i3d/model_best.pt +3 -0
- Cha_ckpt/vgg/config.yaml +49 -0
- Cha_ckpt/vgg/model_best.pt +3 -0
- README.md +114 -0
- TACoS_ckpt/config.yaml +51 -0
- TACoS_ckpt/model_best.pt +3 -0
Cha_ckpt/i3d/config.yaml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
charadessta:
|
| 2 |
+
batch_size: 512
|
| 3 |
+
clip_frames:
|
| 4 |
+
- 8
|
| 5 |
+
epoch: 15
|
| 6 |
+
feature_dim: 1024
|
| 7 |
+
feature_dir: /data/charadessta/i3d
|
| 8 |
+
moment_length_factors:
|
| 9 |
+
- 0.25
|
| 10 |
+
- 0.3
|
| 11 |
+
- 0.35
|
| 12 |
+
overlapping_factors:
|
| 13 |
+
- 0.0
|
| 14 |
+
- 0.1
|
| 15 |
+
- 0.2
|
| 16 |
+
- 0.3
|
| 17 |
+
- 0.4
|
| 18 |
+
- 0.5
|
| 19 |
+
- 0.6
|
| 20 |
+
- 0.7
|
| 21 |
+
- 0.8
|
| 22 |
+
- 0.9
|
| 23 |
+
pooling_func: max_pooling
|
| 24 |
+
sigma_factor: 0.3
|
| 25 |
+
stride: 4
|
| 26 |
+
video_feature_len: 128
|
| 27 |
+
frac: 0.157
|
| 28 |
+
width: 20
|
| 29 |
+
alpha: 10
|
| 30 |
+
beta: 0.002
|
| 31 |
+
dataset_name: charadessta
|
| 32 |
+
exp_dir: log
|
| 33 |
+
gpu: '0'
|
| 34 |
+
model:
|
| 35 |
+
dim: 512
|
| 36 |
+
dropout: 0.1
|
| 37 |
+
glove_path: /data/glove.840B.300d.txt
|
| 38 |
+
n_layers: 2
|
| 39 |
+
temp: 0.07
|
| 40 |
+
topk: 1
|
| 41 |
+
seed: 1
|
| 42 |
+
train:
|
| 43 |
+
clip_norm: 1.0
|
| 44 |
+
dev: false
|
| 45 |
+
init_lr: 0.0001
|
Cha_ckpt/i3d/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d97e2e75be754bbd0480ea11c5a3c01830e33c7fab1ba5cdb4adc89ab11f904
|
| 3 |
+
size 56988191
|
Cha_ckpt/vgg/config.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
charadessta:
|
| 2 |
+
batch_size: 256
|
| 3 |
+
clip_frames:
|
| 4 |
+
- 8
|
| 5 |
+
epoch: 15
|
| 6 |
+
feature_dim: 4096
|
| 7 |
+
feature_dir: /data/charadessta/vgg
|
| 8 |
+
moment_length_factors:
|
| 9 |
+
- 0.1
|
| 10 |
+
- 0.15
|
| 11 |
+
- 0.2
|
| 12 |
+
- 0.25
|
| 13 |
+
- 0.3
|
| 14 |
+
- 0.35
|
| 15 |
+
- 0.4
|
| 16 |
+
overlapping_factors:
|
| 17 |
+
- 0.0
|
| 18 |
+
- 0.1
|
| 19 |
+
- 0.2
|
| 20 |
+
- 0.3
|
| 21 |
+
- 0.4
|
| 22 |
+
- 0.5
|
| 23 |
+
- 0.6
|
| 24 |
+
- 0.7
|
| 25 |
+
- 0.8
|
| 26 |
+
- 0.9
|
| 27 |
+
pooling_func: max_pooling
|
| 28 |
+
sigma_factor: 0.3
|
| 29 |
+
stride: 4
|
| 30 |
+
video_feature_len: 256
|
| 31 |
+
frac: 0.115
|
| 32 |
+
width: 30
|
| 33 |
+
alpha: 10
|
| 34 |
+
beta: 0.005
|
| 35 |
+
dataset_name: charadessta
|
| 36 |
+
exp_dir: log
|
| 37 |
+
gpu: '0'
|
| 38 |
+
model:
|
| 39 |
+
dim: 512
|
| 40 |
+
dropout: 0.1
|
| 41 |
+
glove_path: /data/glove.840B.300d.txt
|
| 42 |
+
n_layers: 2
|
| 43 |
+
temp: 0.07
|
| 44 |
+
topk: 1
|
| 45 |
+
seed: 1
|
| 46 |
+
train:
|
| 47 |
+
clip_norm: 1.0
|
| 48 |
+
dev: false
|
| 49 |
+
init_lr: 0.0001
|
Cha_ckpt/vgg/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f89802dca0a9935c627a85e35291a7efd41e5107505db9862836f420e5591b5f
|
| 3 |
+
size 63908088
|
README.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- video-moment-retrieval
|
| 5 |
+
- frame-supervised
|
| 6 |
+
- temporal-localization
|
| 7 |
+
- vision-language
|
| 8 |
+
- multimodal
|
| 9 |
+
- pytorch
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
<a id="top"></a>
|
| 13 |
+
<div align="center">
|
| 14 |
+
<h1>Gaming for Boundary: Elastic Localization for Frame-Supervised Video Moment Retrieval</h1>
|
| 15 |
+
|
| 16 |
+
<p>
|
| 17 |
+
<b>Hao Liu</b><sup>1</sup>
|
| 18 |
+
<b>Yupeng Hu</b><sup>1✉</sup>
|
| 19 |
+
<b>Kun Wang</b><sup>1</sup>
|
| 20 |
+
<b>Yinwei Wei</b><sup>1</sup>
|
| 21 |
+
<b>Liqiang Nie</b><sup>2</sup>
|
| 22 |
+
</p>
|
| 23 |
+
|
| 24 |
+
<p>
|
| 25 |
+
<sup>1</sup>School of Software, Shandong University, Jinan, China<br>
|
| 26 |
+
<sup>2</sup>School of Computer Science and Technology, Harbin Institute of Technology (Shenzhen), Shenzhen, China
|
| 27 |
+
</p>
|
| 28 |
+
</div>
|
| 29 |
+
|
| 30 |
+
This is the official PyTorch implementation of **GOAL**, a frame-supervised Video Moment Retrieval (VMR) framework for elastic boundary localization via a game-based paradigm and Dynamic Updating Technique (DUT).
|
| 31 |
+
|
| 32 |
+
🔗 **Paper:** [SIGIR 2025](https://doi.org/10.1145/3726302.3729984)
|
| 33 |
+
🔗 **GitHub Repository:** [iLearn-Lab/SIGIR25-GOAL](https://github.com/iLearn-Lab/SIGIR25-GOAL)
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Model Information
|
| 38 |
+
|
| 39 |
+
### 1. Model Name
|
| 40 |
+
**GOAL** (**G**aming f**O**r el**A**stic **L**ocalization).
|
| 41 |
+
|
| 42 |
+
### 2. Task Type & Applicable Tasks
|
| 43 |
+
- **Task Type:** Frame-Supervised Video Moment Retrieval (VMR) / Temporal Localization / Vision-Language Learning
|
| 44 |
+
- **Applicable Tasks:** Retrieving the temporal moment in a video that matches a natural language query using a single annotated frame, with a focus on ambiguous temporal boundary localization.
|
| 45 |
+
|
| 46 |
+
### 3. Project Introduction
|
| 47 |
+
Frame-supervised Video Moment Retrieval (VMR) aims to retrieve the temporal moment in a video that matches a natural language query using only a single annotated frame. While this setting reduces annotation cost, it brings severe ambiguity in temporal boundary prediction.
|
| 48 |
+
|
| 49 |
+
**GOAL** addresses this challenge through a **game-based paradigm** with three players, namely **KFP**, **AFP**, and **BP**, together with a **Dynamic Updating Technique (DUT)** that progressively refines boundary decisions through unilateral and bilateral updates for more elastic localization.
|
| 50 |
+
|
| 51 |
+
### 4. Training Data Source
|
| 52 |
+
The model is trained and evaluated on standard frame-supervised VMR benchmarks:
|
| 53 |
+
- **ActivityNet Captions**
|
| 54 |
+
- **Charades-STA**
|
| 55 |
+
- **TACoS**
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
## Usage & Basic Inference
|
| 60 |
+
|
| 61 |
+
This codebase provides training and evaluation scripts for frame-supervised VMR, as well as checkpoints for quick reproduction.
|
| 62 |
+
|
| 63 |
+
### Step 1: Prepare the Environment
|
| 64 |
+
Clone the GitHub repository and install dependencies:
|
| 65 |
+
```bash
|
| 66 |
+
git clone https://github.com/iLearn-Lab/SIGIR25-GOAL.git
|
| 67 |
+
cd SIGIR25-GOAL
|
| 68 |
+
python -m venv .venv
|
| 69 |
+
source .venv/bin/activate # Linux / Mac
|
| 70 |
+
# .venv\Scripts\activate # Windows
|
| 71 |
+
pip install numpy scipy pyyaml tqdm
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
### Step 2: Download Model Weights & Data
|
| 75 |
+
Prepare features and raw annotations following [ViGA](https://github.com/r-cui/ViGA)'s dataset preparation protocol.
|
| 76 |
+
|
| 77 |
+
Before running the code, please check and replace local dataset and feature paths in:
|
| 78 |
+
- `src/config.yaml`
|
| 79 |
+
- `src/utils/utils.py`
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
### Step 3: Run Inference
|
| 83 |
+
|
| 84 |
+
To evaluate a trained experiment folder, run:
|
| 85 |
+
```bash
|
| 86 |
+
python -m src.experiment.eval --exp path/to/your/experiment_folder
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
## Limitations & Notes
|
| 92 |
+
|
| 93 |
+
**Disclaimer:** This repository is intended for **academic research purposes only**.
|
| 94 |
+
- The model requires access to the original benchmark datasets and extracted video features for evaluation.
|
| 95 |
+
- Some configuration files currently contain local path settings and should be updated before use.
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## Citation
|
| 100 |
+
|
| 101 |
+
If you find our work useful in your research, please consider citing our paper:
|
| 102 |
+
|
| 103 |
+
```bibtex
|
| 104 |
+
@inproceedings{liu2025gaming,
|
| 105 |
+
title={Gaming for Boundary: Elastic Localization for Frame-Supervised Video Moment Retrieval},
|
| 106 |
+
author={Liu, Hao and Hu, Yupeng and Wang, Kun and Wei, Yinwei and Nie, Liqiang},
|
| 107 |
+
booktitle={Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval},
|
| 108 |
+
year={2025},
|
| 109 |
+
doi={10.1145/3726302.3729984}
|
| 110 |
+
}
|
| 111 |
+
```
|
| 112 |
+
---
|
| 113 |
+
## Contact
|
| 114 |
+
**If you have any questions, feel free to contact me at liuh90210@gmail.com**.
|
TACoS_ckpt/config.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_name: tacos
|
| 2 |
+
exp_dir: log
|
| 3 |
+
gpu: '0'
|
| 4 |
+
model:
|
| 5 |
+
dim: 512
|
| 6 |
+
dropout: 0.1
|
| 7 |
+
glove_path: /data/glove.840B.300d.txt
|
| 8 |
+
n_layers: 2
|
| 9 |
+
temp: 0.07
|
| 10 |
+
topk: 1
|
| 11 |
+
seed: 1
|
| 12 |
+
tacos:
|
| 13 |
+
batch_size: 128
|
| 14 |
+
clip_frames:
|
| 15 |
+
- 32
|
| 16 |
+
epoch: 30
|
| 17 |
+
feature_dim: 4096
|
| 18 |
+
feature_dir: /data/tacos/c3d
|
| 19 |
+
moment_length_factors:
|
| 20 |
+
- 0.05
|
| 21 |
+
- 0.1
|
| 22 |
+
- 0.15
|
| 23 |
+
- 0.2
|
| 24 |
+
- 0.25
|
| 25 |
+
- 0.3
|
| 26 |
+
- 0.35
|
| 27 |
+
- 0.4
|
| 28 |
+
overlapping_factors:
|
| 29 |
+
- 0.0
|
| 30 |
+
- 0.1
|
| 31 |
+
- 0.2
|
| 32 |
+
- 0.3
|
| 33 |
+
- 0.4
|
| 34 |
+
- 0.5
|
| 35 |
+
- 0.6
|
| 36 |
+
- 0.7
|
| 37 |
+
- 0.8
|
| 38 |
+
- 0.9
|
| 39 |
+
pooling_func: max_pooling
|
| 40 |
+
sigma_factor: 1.0
|
| 41 |
+
stride: 16
|
| 42 |
+
video_feature_len: 512
|
| 43 |
+
frac: 0.016
|
| 44 |
+
width: 30
|
| 45 |
+
alpha: 10
|
| 46 |
+
beta: 0.002
|
| 47 |
+
|
| 48 |
+
train:
|
| 49 |
+
clip_norm: 1.0
|
| 50 |
+
dev: false
|
| 51 |
+
init_lr: 0.0001
|
TACoS_ckpt/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21d9db952aa6ef13ec0ca3472a969a482b8055a641ebf12f92c3509eadcddca8
|
| 3 |
+
size 65559288
|