Upload 11 files
Browse files- Act_ckpt/config.yaml +47 -0
- Act_ckpt/model_best.pt +3 -0
- Cha_ckpt/C3D/config.yaml +45 -0
- Cha_ckpt/C3D/model_best.pt +3 -0
- Cha_ckpt/I3D/config.yaml +45 -0
- Cha_ckpt/I3D/model_best.pt +3 -0
- Cha_ckpt/VGG/config.yaml +45 -0
- Cha_ckpt/VGG/model_best.pt +3 -0
- README.md +134 -3
- TACoS_ckpt/config.yaml +52 -0
- TACoS_ckpt/model_best.pt +3 -0
Act_ckpt/config.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
activitynetcaptions:
|
| 2 |
+
batch_size: 128
|
| 3 |
+
clip_frames:
|
| 4 |
+
- 16
|
| 5 |
+
epoch: 15
|
| 6 |
+
feature_dim: 500
|
| 7 |
+
feature_dir: /data/activitynetcaptions/c3d
|
| 8 |
+
moment_length_factors:
|
| 9 |
+
- 0.2
|
| 10 |
+
- 0.3
|
| 11 |
+
- 0.5
|
| 12 |
+
- 0.7
|
| 13 |
+
- 0.8
|
| 14 |
+
overlapping_factors:
|
| 15 |
+
- 0.0
|
| 16 |
+
- 0.1
|
| 17 |
+
- 0.2
|
| 18 |
+
- 0.3
|
| 19 |
+
- 0.4
|
| 20 |
+
- 0.5
|
| 21 |
+
- 0.6
|
| 22 |
+
- 0.7
|
| 23 |
+
- 0.8
|
| 24 |
+
- 0.9
|
| 25 |
+
pooling_func: max_pooling
|
| 26 |
+
sigma_factor: 0.4
|
| 27 |
+
stride: 8
|
| 28 |
+
video_feature_len: 256
|
| 29 |
+
dataset_name: activitynetcaptions
|
| 30 |
+
exp_dir: log
|
| 31 |
+
gpu: '0'
|
| 32 |
+
model:
|
| 33 |
+
dim: 512
|
| 34 |
+
dropout: 0.1
|
| 35 |
+
glove_path: /data/glove.840B.300d.txt
|
| 36 |
+
n_layers: 2
|
| 37 |
+
temp: 0.07
|
| 38 |
+
topk: 5
|
| 39 |
+
seed: 100
|
| 40 |
+
train:
|
| 41 |
+
clip_norm: 1.0
|
| 42 |
+
dev: false
|
| 43 |
+
init_lr: 0.0001
|
| 44 |
+
|
| 45 |
+
alpha: 4
|
| 46 |
+
beta: 3
|
| 47 |
+
gamma: 0.25
|
Act_ckpt/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:265bbf52694623b45f6dbd582b6e6e3debefaf19f7f357fc5e932555d5405ffc
|
| 3 |
+
size 68353819
|
Cha_ckpt/C3D/config.yaml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
charadessta:
|
| 2 |
+
batch_size: 128
|
| 3 |
+
clip_frames:
|
| 4 |
+
- 8
|
| 5 |
+
epoch: 30
|
| 6 |
+
feature_dim: 4096
|
| 7 |
+
feature_dir: /data/charadessta/c3d
|
| 8 |
+
moment_length_factors:
|
| 9 |
+
- 0.25
|
| 10 |
+
- 0.3
|
| 11 |
+
- 0.35
|
| 12 |
+
overlapping_factors:
|
| 13 |
+
- 0.0
|
| 14 |
+
- 0.1
|
| 15 |
+
- 0.2
|
| 16 |
+
- 0.3
|
| 17 |
+
- 0.4
|
| 18 |
+
- 0.5
|
| 19 |
+
- 0.6
|
| 20 |
+
- 0.7
|
| 21 |
+
- 0.8
|
| 22 |
+
- 0.9
|
| 23 |
+
pooling_func: max_pooling
|
| 24 |
+
sigma_factor: 0.3
|
| 25 |
+
stride: 4
|
| 26 |
+
video_feature_len: 256
|
| 27 |
+
dataset_name: charadessta
|
| 28 |
+
exp_dir: log
|
| 29 |
+
gpu: '0'
|
| 30 |
+
model:
|
| 31 |
+
dim: 512
|
| 32 |
+
dropout: 0.1
|
| 33 |
+
glove_path: /data/glove.840B.300d.txt
|
| 34 |
+
n_layers: 2
|
| 35 |
+
temp: 0.07
|
| 36 |
+
topk: 5
|
| 37 |
+
seed: 0
|
| 38 |
+
train:
|
| 39 |
+
clip_norm: 1.0
|
| 40 |
+
dev: false
|
| 41 |
+
init_lr: 0.0001
|
| 42 |
+
|
| 43 |
+
alpha: 1
|
| 44 |
+
beta: 1
|
| 45 |
+
gamma: 0.25
|
Cha_ckpt/C3D/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:031a35b44d8405e8790c5918c31ace71220869e56972e73d92e77e4bcb7c0f8e
|
| 3 |
+
size 75748154
|
Cha_ckpt/I3D/config.yaml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
charadessta:
|
| 2 |
+
batch_size: 512
|
| 3 |
+
clip_frames:
|
| 4 |
+
- 8
|
| 5 |
+
epoch: 30
|
| 6 |
+
feature_dim: 1024
|
| 7 |
+
feature_dir: /data/charadessta/i3d
|
| 8 |
+
moment_length_factors:
|
| 9 |
+
- 0.25
|
| 10 |
+
- 0.3
|
| 11 |
+
- 0.35
|
| 12 |
+
overlapping_factors:
|
| 13 |
+
- 0.0
|
| 14 |
+
- 0.1
|
| 15 |
+
- 0.2
|
| 16 |
+
- 0.3
|
| 17 |
+
- 0.4
|
| 18 |
+
- 0.5
|
| 19 |
+
- 0.6
|
| 20 |
+
- 0.7
|
| 21 |
+
- 0.8
|
| 22 |
+
- 0.9
|
| 23 |
+
pooling_func: max_pooling
|
| 24 |
+
sigma_factor: 0.3
|
| 25 |
+
stride: 4
|
| 26 |
+
video_feature_len: 128
|
| 27 |
+
dataset_name: charadessta
|
| 28 |
+
exp_dir: log
|
| 29 |
+
gpu: '0'
|
| 30 |
+
model:
|
| 31 |
+
dim: 512
|
| 32 |
+
dropout: 0.1
|
| 33 |
+
glove_path: /data/glove.840B.300d.txt
|
| 34 |
+
n_layers: 2
|
| 35 |
+
temp: 0.07
|
| 36 |
+
topk: 5
|
| 37 |
+
seed: 1
|
| 38 |
+
train:
|
| 39 |
+
clip_norm: 1.0
|
| 40 |
+
dev: false
|
| 41 |
+
init_lr: 0.0001
|
| 42 |
+
|
| 43 |
+
alpha: 4
|
| 44 |
+
beta: 3
|
| 45 |
+
gamma: 0.25
|
Cha_ckpt/I3D/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:847333cbb9fb70e1d761711c658df061267c137802a3df0ee786a17c124512a8
|
| 3 |
+
size 69069211
|
Cha_ckpt/VGG/config.yaml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
charadessta:
|
| 2 |
+
batch_size: 256
|
| 3 |
+
clip_frames:
|
| 4 |
+
- 8
|
| 5 |
+
epoch: 30
|
| 6 |
+
feature_dim: 4096
|
| 7 |
+
feature_dir: /data/charadessta/vgg
|
| 8 |
+
moment_length_factors:
|
| 9 |
+
- 0.25
|
| 10 |
+
- 0.3
|
| 11 |
+
- 0.35
|
| 12 |
+
overlapping_factors:
|
| 13 |
+
- 0.0
|
| 14 |
+
- 0.1
|
| 15 |
+
- 0.2
|
| 16 |
+
- 0.3
|
| 17 |
+
- 0.4
|
| 18 |
+
- 0.5
|
| 19 |
+
- 0.6
|
| 20 |
+
- 0.7
|
| 21 |
+
- 0.8
|
| 22 |
+
- 0.9
|
| 23 |
+
pooling_func: max_pooling
|
| 24 |
+
sigma_factor: 0.3
|
| 25 |
+
stride: 4
|
| 26 |
+
video_feature_len: 256
|
| 27 |
+
dataset_name: charadessta
|
| 28 |
+
exp_dir: log
|
| 29 |
+
gpu: '2'
|
| 30 |
+
model:
|
| 31 |
+
dim: 512
|
| 32 |
+
dropout: 0.1
|
| 33 |
+
glove_path: /data/glove.840B.300d.txt
|
| 34 |
+
n_layers: 2
|
| 35 |
+
temp: 0.07
|
| 36 |
+
topk: 5
|
| 37 |
+
seed: 1
|
| 38 |
+
train:
|
| 39 |
+
clip_norm: 1.0
|
| 40 |
+
dev: false
|
| 41 |
+
init_lr: 0.0001
|
| 42 |
+
|
| 43 |
+
alpha: 1
|
| 44 |
+
beta: 1
|
| 45 |
+
gamma: 0.25
|
Cha_ckpt/VGG/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e641b17b826f46d69dd44ca3cf02669eb8093081e247407f6ec98c1fb9bd0810
|
| 3 |
+
size 75748154
|
README.md
CHANGED
|
@@ -1,3 +1,134 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- video-moment-retrieval
|
| 5 |
+
- point-supervised
|
| 6 |
+
- vision-language
|
| 7 |
+
- multimodal
|
| 8 |
+
- representation-shift
|
| 9 |
+
- pytorch
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
<a id="top"></a>
|
| 13 |
+
<div align="center">
|
| 14 |
+
<h1>🚀 DRONE: Cross-modal Representation Shift Refinement for Point-supervised Video Moment Retrieval</h1>
|
| 15 |
+
|
| 16 |
+
<p>
|
| 17 |
+
<b>Kun Wang</b><sup>1</sup>
|
| 18 |
+
<b>Yupeng Hu</b><sup>1✉</sup>
|
| 19 |
+
<b>Hao Liu</b><sup>1</sup>
|
| 20 |
+
<b>Jiang Shao</b><sup>1</sup>
|
| 21 |
+
<b>Liqiang Nie</b><sup>2</sup>
|
| 22 |
+
</p>
|
| 23 |
+
|
| 24 |
+
<p>
|
| 25 |
+
<sup>1</sup>School of Software, Shandong University, Jinan, China<br>
|
| 26 |
+
<sup>2</sup>School of Computer Science and Technology, Harbin Institute of Technology (Shenzhen), Shenzhen, China<br>
|
| 27 |
+
<sup>✉</sup>Corresponding author
|
| 28 |
+
</p>
|
| 29 |
+
</div>
|
| 30 |
+
|
| 31 |
+
These are the official implementation, pre-trained model weights, and configuration files for **DRONE**, a point-supervised Video Moment Retrieval (VMR) framework designed to mitigate cross-modal representation shift.
|
| 32 |
+
|
| 33 |
+
🔗 **Paper:** [Accepted by ACM TOIS 2026](https://dl.acm.org/doi/10.1145/3786606)
|
| 34 |
+
🔗 **GitHub Repository:** [iLearn-Lab/DRONE](https://github.com/iLearn-Lab/DRONE)
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## 📌 Model Information
|
| 39 |
+
|
| 40 |
+
### 1. Model Name
|
| 41 |
+
**DRONE** (Cross-modal Representation Shift Refinement)
|
| 42 |
+
|
| 43 |
+
### 2. Task Type & Applicable Tasks
|
| 44 |
+
- **Task Type:** Point-supervised Video Moment Retrieval (VMR) / Vision-Language / Multimodal Learning
|
| 45 |
+
- **Applicable Tasks:** Localizing temporal segments in untrimmed videos that match natural language queries, utilizing only point-level supervision to reduce annotation costs while actively addressing cross-modal representation shifts.
|
| 46 |
+
|
| 47 |
+
### 3. Project Introduction
|
| 48 |
+
Point-supervised Video Moment Retrieval (VMR) aims to localize the temporal segment in a video that matches a natural language query using only single-frame annotations. **DRONE** addresses the cross-modal representation shift issue inherent in this setting, progressively improving temporal alignment and semantic consistency between video and text representations.
|
| 49 |
+
|
| 50 |
+
> 💡 **Method Highlight:** DRONE introduces **Pseudo-Frame Temporal Alignment (PTA)** and **Curriculum-Guided Semantic Refinement (CSR)**. Together, these modules systematically mitigate representation shifts, allowing the model to bridge the semantic gap between visual frames and textual queries effectively.
|
| 51 |
+
|
| 52 |
+
### 4. Training Data Source
|
| 53 |
+
The model supports and is evaluated on three standard VMR datasets:
|
| 54 |
+
- **ActivityNet Captions**
|
| 55 |
+
- **Charades-STA**
|
| 56 |
+
- **TACoS**
|
| 57 |
+
*(Follows splits and feature preparation from [ViGA](https://github.com/r-cui/ViGA))*
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## 🚀 Usage & Basic Inference
|
| 62 |
+
|
| 63 |
+
### Step 1: Prepare the Environment
|
| 64 |
+
Clone the GitHub repository and set up the virtual environment:
|
| 65 |
+
```bash
|
| 66 |
+
git clone https://github.com/iLearn-Lab/DRONE.git
|
| 67 |
+
cd DRONE
|
| 68 |
+
```
|
| 69 |
+
```bash
|
| 70 |
+
python -m venv .venv
|
| 71 |
+
source .venv/bin/activate # Linux / Mac
|
| 72 |
+
# .venv\Scripts\activate # Windows
|
| 73 |
+
```
|
| 74 |
+
```bash
|
| 75 |
+
pip install numpy scipy pyyaml tqdm
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
### Step 2: Download Model Weights & Data
|
| 79 |
+
1. **Pre-trained Checkpoints:** Download the model checkpoints (includes `Act_ckpt/`, `Cha_ckpt/`, and `TACoS_ckpt/`).
|
| 80 |
+
2. **Datasets & Features:** Follow [ViGA](https://github.com/r-cui/ViGA)'s dataset preparation guidelines for ActivityNet Captions, Charades-STA, and TACoS.
|
| 81 |
+
3. **Configuration:** Before running, ensure you replace the local dataset root and feature paths in `src/config.yaml` and `src/utils/utils.py` with your actual local paths.
|
| 82 |
+
|
| 83 |
+
### Step 3: Run Training & Evaluation
|
| 84 |
+
|
| 85 |
+
**Training from Scratch:**
|
| 86 |
+
Depending on the dataset you want to train on, run the following commands:
|
| 87 |
+
|
| 88 |
+
#### For ActivityNet Captions
|
| 89 |
+
python -m src.experiment.train --task activitynetcaptions
|
| 90 |
+
|
| 91 |
+
#### For Charades-STA
|
| 92 |
+
python -m src.experiment.train --task charadessta
|
| 93 |
+
|
| 94 |
+
#### For TACoS
|
| 95 |
+
python -m src.experiment.train --task tacos
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
**Evaluation (Eval):**
|
| 99 |
+
To evaluate a trained experiment folder (which should contain `config.yaml` and `model_best.pt`), run:
|
| 100 |
+
|
| 101 |
+
python -m src.experiment.eval --exp path/to/your/experiment_folder
|
| 102 |
+
|
| 103 |
+
---
|
| 104 |
+
|
| 105 |
+
## ⚠️ Limitations & Notes
|
| 106 |
+
|
| 107 |
+
**Disclaimer:** This framework and its pre-trained weights are intended for **academic research purposes only**.
|
| 108 |
+
- The model requires access to the original source datasets (ActivityNet Captions, Charades-STA, TACoS) for full evaluation.
|
| 109 |
+
- While designed to mitigate cross-modal representation shifts, performance relies on the quality of the point-level annotations and the inherent capabilities of the selected visual backbones (C3D, I3D, VGG).
|
| 110 |
+
|
| 111 |
+
---
|
| 112 |
+
|
| 113 |
+
## 🤝 Acknowledgements & Contact
|
| 114 |
+
|
| 115 |
+
- **Acknowledgement:** This implementation and data organization are inspired by the [ViGA](https://github.com/r-cui/ViGA) open-source community. Thanks to all collaborators and contributors of this project.
|
| 116 |
+
- **Contact:** If you have any questions, feel free to contact me at `khylon.kun.wang@gmail.com`.
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
## 📝⭐️ Citation
|
| 121 |
+
|
| 122 |
+
If you find our work or this repository useful in your research, please consider citing our paper:
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@article{wang2026cross,
|
| 126 |
+
title={Cross-Modal Representation Shift Refinement for Point-supervised Video Moment Retrieval},
|
| 127 |
+
author={Wang, Kun and Hu, Yupeng and Liu, Hao and Shao, Jiang and Nie, Liqiang},
|
| 128 |
+
journal={ACM Transactions on Information Systems},
|
| 129 |
+
volume={44},
|
| 130 |
+
number={3},
|
| 131 |
+
pages={1--30},
|
| 132 |
+
year={2026},
|
| 133 |
+
publisher={ACM New York, NY}
|
| 134 |
+
}
|
TACoS_ckpt/config.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_name: tacos
|
| 2 |
+
exp_dir: log
|
| 3 |
+
gpu: '0'
|
| 4 |
+
model:
|
| 5 |
+
dim: 512
|
| 6 |
+
dropout: 0.1
|
| 7 |
+
glove_path: /data/glove.840B.300d.txt
|
| 8 |
+
n_layers: 2
|
| 9 |
+
temp: 0.07
|
| 10 |
+
topk: 5
|
| 11 |
+
seed: 16
|
| 12 |
+
tacos:
|
| 13 |
+
batch_size: 64
|
| 14 |
+
clip_frames:
|
| 15 |
+
- 32
|
| 16 |
+
epoch: 15
|
| 17 |
+
feature_dim: 4096
|
| 18 |
+
feature_dir: /data/tacos/c3d
|
| 19 |
+
moment_length_factors:
|
| 20 |
+
- 0.05
|
| 21 |
+
- 0.06
|
| 22 |
+
- 0.075
|
| 23 |
+
- 0.085
|
| 24 |
+
- 0.1
|
| 25 |
+
- 0.125
|
| 26 |
+
- 0.15
|
| 27 |
+
- 0.175
|
| 28 |
+
- 0.3
|
| 29 |
+
- 0.4
|
| 30 |
+
overlapping_factors:
|
| 31 |
+
- 0.0
|
| 32 |
+
- 0.1
|
| 33 |
+
- 0.2
|
| 34 |
+
- 0.3
|
| 35 |
+
- 0.4
|
| 36 |
+
- 0.5
|
| 37 |
+
- 0.6
|
| 38 |
+
- 0.7
|
| 39 |
+
- 0.8
|
| 40 |
+
- 0.9
|
| 41 |
+
pooling_func: mean_pooling
|
| 42 |
+
sigma_factor: 1.0
|
| 43 |
+
stride: 16
|
| 44 |
+
video_feature_len: 512
|
| 45 |
+
train:
|
| 46 |
+
clip_norm: 1.0
|
| 47 |
+
dev: false
|
| 48 |
+
init_lr: 0.0001
|
| 49 |
+
|
| 50 |
+
alpha: 4
|
| 51 |
+
beta: 3
|
| 52 |
+
gamma: 0.25
|
TACoS_ckpt/model_best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e96b295f39c656006a944f7c978c160762b9ddc2f1d26f43a06472f8e24969fd
|
| 3 |
+
size 76667803
|