yeliudev committed on
Commit
3ffab64
·
verified ·
1 Parent(s): b0d1738

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -14
app.py CHANGED
@@ -19,11 +19,11 @@ import pandas as pd
19
  TITLE = '🌀R2-Tuning: Efficient Image-to-Video Transfer Learning for Video Temporal Grounding'
20
 
21
  TITLE_MD = '<h1 align="center">🌀R<sup>2</sup>-Tuning: Efficient Image-to-Video Transfer Learning for Video Temporal Grounding</h1>'
22
- DESCRIPTION_MD = 'R<sup>2</sup>-Tuning is a parameter- and memory-efficient transfer learning method for video temporal grounding. Please find more details in our <a href="https://arxiv.org/abs/2404.00801" target="_blank">Tech Report</a> and <a href="https://github.com/yeliudev/R2-Tuning" target="_blank">GitHub Repo</a>.'
23
- GUIDE_MD = '### User Guide:\n1. Upload a video or click "random" to sample one.\n2. Input a text query. A good practice is to write a sentence with 5~15 words.\n3. Click "submit" and you\'ll see the moment retrieval and highlight detection results on the right.'
24
 
25
  CONFIG = 'configs/qvhighlights/r2_tuning_qvhighlights.py'
26
- WEIGHT = 'https://huggingface.co/yeliudev/R2-Tuning/resolve/main/checkpoints/r2_tuning_qvhighlights-ed516355.pth'
27
 
28
  # yapf:disable
29
  EXAMPLES = [
@@ -45,7 +45,7 @@ def load_video(video_path, cfg):
45
  decord.bridge.set_bridge('torch')
46
 
47
  vr = VideoReader(video_path)
48
- stride = vr.get_avg_fps() / cfg.data.val.fps
49
  fm_idx = [min(round(i), len(vr) - 1) for i in np.arange(0, len(vr), stride).tolist()]
50
  video = vr.get_batch(fm_idx).permute(0, 3, 1, 2).float() / 255
51
 
@@ -75,6 +75,8 @@ def init_model(config, checkpoint):
75
 
76
 
77
  def main(video, query, model, cfg):
 
 
78
  if len(query) == 0:
79
  raise gr.Error('Text query can not be empty.')
80
 
@@ -86,19 +88,16 @@ def main(video, query, model, cfg):
86
  query = clip.tokenize(query, truncate=True)
87
 
88
  device = next(model.parameters()).device
89
- data = dict(video=video.to(device), query=query.to(device), fps=[cfg.data.val.fps])
90
 
91
  with torch.inference_mode():
92
  pred = model(data)
93
 
94
- mr = pred['_out']['boundary'][:5].cpu().tolist()
95
- mr = [[convert_time(p[0]), convert_time(p[1]), round(p[2], 2)] for p in mr]
96
-
97
  hd = pred['_out']['saliency'].cpu()
98
  hd = ((hd - hd.min()) / (hd.max() - hd.min()) * 0.9 + 0.05).tolist()
99
- hd = pd.DataFrame(dict(x=range(0, len(hd) * 2, 2), y=hd))
100
 
101
- return mr, hd
102
 
103
 
104
  model, cfg = init_model(CONFIG, WEIGHT)
@@ -121,8 +120,6 @@ with gr.Blocks(title=TITLE) as demo:
121
  submit_btn = gr.Button(value='🚀 Submit')
122
 
123
  with gr.Column():
124
- mr = gr.DataFrame(
125
- headers=['Start Time', 'End Time', 'Score'], label='Moment Retrieval')
126
  hd = gr.LinePlot(
127
  x='x',
128
  y='y',
@@ -131,6 +128,6 @@ with gr.Blocks(title=TITLE) as demo:
131
  label='Highlight Detection')
132
 
133
  random_btn.click(lambda: random.sample(EXAMPLES, 1)[0], None, [video, query])
134
- submit_btn.click(fn, [video, query], [mr, hd])
135
 
136
- demo.launch()
 
19
  TITLE = '🌀R2-Tuning: Efficient Image-to-Video Transfer Learning for Video Temporal Grounding'
20
 
21
  TITLE_MD = '<h1 align="center">🌀R<sup>2</sup>-Tuning: Efficient Image-to-Video Transfer Learning for Video Temporal Grounding</h1>'
22
+ DESCRIPTION_MD = 'R<sup>2</sup>-Tuning is a parameter- and memory-efficient transfer learning method for video temporal grounding.'
23
+ GUIDE_MD = '### User Guide:\n1. Upload a video or click "random" to sample one.\n2. Input a text query. A good practice is to write a sentence with 5~15 words.\n3. Click "submit" and you\'ll see the highlight detection results on the right.'
24
 
25
  CONFIG = 'configs/qvhighlights/r2_tuning_qvhighlights.py'
26
+ WEIGHT = 'https://huggingface.co/yeliudev/R2-Tuning/resolve/main/checkpoints/r2_tuning_youtube_sur-d384d8b2.pth'
27
 
28
  # yapf:disable
29
  EXAMPLES = [
 
45
  decord.bridge.set_bridge('torch')
46
 
47
  vr = VideoReader(video_path)
48
+ stride = vr.get_avg_fps() / 1
49
  fm_idx = [min(round(i), len(vr) - 1) for i in np.arange(0, len(vr), stride).tolist()]
50
  video = vr.get_batch(fm_idx).permute(0, 3, 1, 2).float() / 255
51
 
 
75
 
76
 
77
  def main(video, query, model, cfg):
78
+ query = 'surfing'
79
+
80
  if len(query) == 0:
81
  raise gr.Error('Text query can not be empty.')
82
 
 
88
  query = clip.tokenize(query, truncate=True)
89
 
90
  device = next(model.parameters()).device
91
+ data = dict(video=video.to(device), query=query.to(device), fps=[1])
92
 
93
  with torch.inference_mode():
94
  pred = model(data)
95
 
 
 
 
96
  hd = pred['_out']['saliency'].cpu()
97
  hd = ((hd - hd.min()) / (hd.max() - hd.min()) * 0.9 + 0.05).tolist()
98
+ hd = pd.DataFrame(dict(x=range(len(hd) * 1 -1, -1, -1), y=hd))
99
 
100
+ return hd
101
 
102
 
103
  model, cfg = init_model(CONFIG, WEIGHT)
 
120
  submit_btn = gr.Button(value='🚀 Submit')
121
 
122
  with gr.Column():
 
 
123
  hd = gr.LinePlot(
124
  x='x',
125
  y='y',
 
128
  label='Highlight Detection')
129
 
130
  random_btn.click(lambda: random.sample(EXAMPLES, 1)[0], None, [video, query])
131
+ submit_btn.click(fn, [video, query], hd)
132
 
133
+ demo.launch()