Spaces:

mippia
/

MPD-demo

Sleeping

App Files Files Community

slslslrhfem committed on Sep 16, 2025

Commit

773ceaa

1 Parent(s): 629cbd9

change download mechanism

Browse files

Files changed (2) hide show

app.py +43 -72
compare.py +402 -423

app.py CHANGED Viewed

@@ -207,10 +207,10 @@ def find_song_file_by_title(song_title):
     return None
-@spaces.GPU(duration=300)  # 5분으로 설정
 def process_audio_for_matching(audio_file):
     if audio_file is None:
-        return """
         <div style='text-align: center; color: #dc2626; padding: 30px; background: #fef2f2; border-radius: 12px; border: 2px dashed #fecaca;'>
             <h3>No Audio File</h3>
             <p>Please upload an audio file to get started!</p>
@@ -220,7 +220,7 @@ def process_audio_for_matching(audio_file):
     result = inference(audio_file)
     if result.get('message') != 'success':
-        return f"""
         <div style="text-align: center; padding: 25px; background: #fefce8; border-radius: 12px; border: 1px solid #fde047; margin: 10px 0;">
             <h3 style="color: #a16207; margin-bottom: 15px;">No Matches Found</h3>
             <p style="color: #a16207; font-size: 1.1em;">{result.get('message', 'Unknown error occurred')}</p>
@@ -229,63 +229,34 @@ def process_audio_for_matching(audio_file):
     matches = result.get('matches', [])
     if not matches:
-        return """
         <div style="text-align: center; padding: 25px; background: #fefce8; border-radius: 12px; border: 1px solid #fde047; margin: 10px 0;">
             <h3 style="color: #a16207; margin-bottom: 15px;">No Matches Found</h3>
             <p style="color: #a16207; font-size: 1.1em;">No matching vocals found in the dataset.</p>
         </div>
         """
     # Generate match results HTML
     matches_html = ""
     for match in matches:
         rank = match.get('rank', 0)
-        song_title = match.get('song_title', 'Unknown Song')
         confidence = match.get('confidence', '0%')
         test_time = match.get('test_time', 0)
-        library_time = match.get('library_time', 0)
         # Ranking colors
         rank_colors = {1: '#dc2626', 2: '#ea580c', 3: '#16a34a'}
         rank_color = rank_colors.get(rank, '#6b7280')
-        # Find song file
-        song_file_path = find_song_file_by_title(song_title)
-        # Create audio player
-        audio_player = ""
-        if song_file_path and os.path.exists(song_file_path):
-            # Use absolute path for Gradio file serving
-            audio_player = f"""
-            <div style="margin: 15px 0; padding: 15px; background: #f8fafc; border-radius: 8px;">
-                <div style="text-align: center; margin-bottom: 10px;">
-                    <strong style="color: #1f2937;">Play matched vocal section</strong>
-                </div>
-                <audio controls preload="metadata" style="width: 100%;">
-                    <source src="/file={song_file_path}" type="audio/mpeg">
-                    Your browser does not support the audio element.
-                </audio>
-                <div style="text-align: center; margin-top: 8px;">
-                    <button onclick="seekToTime(this.parentElement.previousElementSibling.querySelector('audio'), {library_time})"
-                            style="background: #2563eb; color: white; border: none; padding: 5px 15px; border-radius: 6px; cursor: pointer; font-size: 0.9em;">
-                        Jump to {library_time:.1f}s
-                    </button>
-                </div>
-                <p style="font-size: 0.8em; color: #374151; text-align: center; margin: 5px 0 0 0;">
-                    Vocal match found at {library_time:.1f}s
-                </p>
-            </div>
-            """
-            file_info = f"Found: {os.path.basename(song_file_path)}"
-        else:
-            audio_player = f"""
-            <div style="margin: 10px 0; padding: 10px; background: #fefce8; border-radius: 8px; text-align: center;">
-                <p style="color: #a16207; margin: 0;">Song file not found for playback</p>
-                <p style="font-size: 0.8em; color: #a16207; margin: 5px 0 0 0;">Match at {library_time:.1f}s in "{song_title}"</p>
-            </div>
-            """
-            file_info = f"Song file not found: {song_title}"
         matches_html += f"""
         <div style="background: #ffffff; border-radius: 12px; padding: 20px; margin: 15px 0;
                     border-left: 5px solid {rank_color}; box-shadow: 0 3px 10px rgba(0,0,0,0.1);">
@@ -294,7 +265,7 @@ def process_audio_for_matching(audio_file):
                     <span style="background: {rank_color}; color: white; padding: 4px 8px; border-radius: 15px; font-size: 0.8em; margin-right: 10px;">
                         #{rank}
                     </span>
-                    {song_title}
                 </h3>
                 <span style="background: #f3f4f6; color: #111827; padding: 6px 12px; border-radius: 20px; font-weight: 600;">
                     {confidence}
@@ -309,20 +280,14 @@ def process_audio_for_matching(audio_file):
                     </div>
                     <div>
                         <strong style="color: #1f2937;">Matched At</strong>
-                        <br><span style="color: #16a34a; font-size: 1.1em;">{library_time:.1f}s</span>
                     </div>
                 </div>
             </div>
-            {audio_player}
-            <div style="font-size: 0.9em; color: #374151; text-align: center; margin-top: 10px;">
-                {file_info}
-            </div>
         </div>
         """
-    formatted_result = f"""
     <div style="background: #ffffff; border-radius: 16px; padding: 30px;
                 box-shadow: 0 4px 20px rgba(0,0,0,0.08); border: 1px solid #e5e7eb; margin: 10px 0;">
         <div style="text-align: center; margin-bottom: 25px;">
@@ -334,21 +299,17 @@ def process_audio_for_matching(audio_file):
         <div style="text-align: center; margin-top: 25px; padding: 15px; background: #f3f4f6; border-radius: 8px;">
             <p style="color: #374151; margin: 0; font-size: 0.95em;">
-                <strong>How to read results:</strong> Vocal similarity scores with timestamp locations.
-                Play the audio to hear the matched vocal sections.
             </p>
         </div>
     </div>
-    <script>
-    function seekToTime(audio, time) {{
-        audio.currentTime = time;
-        audio.play();
-    }}
-    </script>
     """
-    return formatted_result
 # CSS styles
 custom_css = """
@@ -421,7 +382,7 @@ h1 {
 }
 """
-# Gradio interface
 demo = gr.Interface(
     fn=process_audio_for_matching,
     inputs=gr.Audio(
@@ -429,10 +390,16 @@ demo = gr.Interface(
         label="Upload Your Audio File",
         elem_classes=["upload-container"]
     ),
-    outputs=gr.HTML(
-        label="Similarity Results",
-        elem_classes=["output-container"]
-    ),
     title="Music Plagiarism Detection",
     description="""
     <div style="text-align: center; font-size: 1.1em; color: #374151; margin: 25px 0; line-height: 1.6;">
@@ -443,11 +410,15 @@ demo = gr.Interface(
             Submitted to ICASSP 2026
         </p>
         <hr style="border: none; border-top: 1px solid #e5e7eb; margin: 20px 0;">
-        <p>Upload any music file to detect vocal similarities in the Covers80 dataset.</p>
-        <p>The system analyzes only vocal characteristics, ignoring instrumental parts.</p>
-        <p style="font-size: 0.95em; color: #6b7280; margin-top: 15px;">
-            Supported formats: MP3, WAV, M4A, FLAC<br>
-            Processing may take some time
         </p>
     </div>
     """,

     return None
+@spaces.GPU(duration=300)
 def process_audio_for_matching(audio_file):
     if audio_file is None:
+        return None, """
         <div style='text-align: center; color: #dc2626; padding: 30px; background: #fef2f2; border-radius: 12px; border: 2px dashed #fecaca;'>
             <h3>No Audio File</h3>
             <p>Please upload an audio file to get started!</p>
     result = inference(audio_file)
     if result.get('message') != 'success':
+        return None, f"""
         <div style="text-align: center; padding: 25px; background: #fefce8; border-radius: 12px; border: 1px solid #fde047; margin: 10px 0;">
             <h3 style="color: #a16207; margin-bottom: 15px;">No Matches Found</h3>
             <p style="color: #a16207; font-size: 1.1em;">{result.get('message', 'Unknown error occurred')}</p>
     matches = result.get('matches', [])
     if not matches:
+        return None, """
         <div style="text-align: center; padding: 25px; background: #fefce8; border-radius: 12px; border: 1px solid #fde047; margin: 10px 0;">
             <h3 style="color: #a16207; margin-bottom: 15px;">No Matches Found</h3>
             <p style="color: #a16207; font-size: 1.1em;">No matching vocals found in the dataset.</p>
         </div>
         """
+    # Get the best match for audio playback
+    best_match = matches[0]
+    song_title = best_match.get('song_title', 'Unknown Song')
+    library_time = best_match.get('library_time', 0)
+    # Find song file
+    song_file_path = find_song_file_by_title(song_title)
     # Generate match results HTML
     matches_html = ""
     for match in matches:
         rank = match.get('rank', 0)
+        song_title_display = match.get('song_title', 'Unknown Song')
         confidence = match.get('confidence', '0%')
         test_time = match.get('test_time', 0)
+        library_time_display = match.get('library_time', 0)
         # Ranking colors
         rank_colors = {1: '#dc2626', 2: '#ea580c', 3: '#16a34a'}
         rank_color = rank_colors.get(rank, '#6b7280')
         matches_html += f"""
         <div style="background: #ffffff; border-radius: 12px; padding: 20px; margin: 15px 0;
                     border-left: 5px solid {rank_color}; box-shadow: 0 3px 10px rgba(0,0,0,0.1);">
                     <span style="background: {rank_color}; color: white; padding: 4px 8px; border-radius: 15px; font-size: 0.8em; margin-right: 10px;">
                         #{rank}
                     </span>
+                    {song_title_display}
                 </h3>
                 <span style="background: #f3f4f6; color: #111827; padding: 6px 12px; border-radius: 20px; font-weight: 600;">
                     {confidence}
                     </div>
                     <div>
                         <strong style="color: #1f2937;">Matched At</strong>
+                        <br><span style="color: #16a34a; font-size: 1.1em;">{library_time_display:.1f}s</span>
                     </div>
                 </div>
             </div>
         </div>
         """
+    results_html = f"""
     <div style="background: #ffffff; border-radius: 16px; padding: 30px;
                 box-shadow: 0 4px 20px rgba(0,0,0,0.08); border: 1px solid #e5e7eb; margin: 10px 0;">
         <div style="text-align: center; margin-bottom: 25px;">
         <div style="text-align: center; margin-top: 25px; padding: 15px; background: #f3f4f6; border-radius: 8px;">
             <p style="color: #374151; margin: 0; font-size: 0.95em;">
+                <strong>Audio Player:</strong> Playing the best match starting from the matched timestamp ({library_time:.1f}s)
             </p>
         </div>
     </div>
     """
+    # Return audio file with timestamp and results
+    if song_file_path and os.path.exists(song_file_path):
+        return (song_file_path, library_time), results_html
+    else:
+        return None, results_html
 # CSS styles
 custom_css = """
 }
 """
+# Gradio interface - using original Interface with multiple outputs
 demo = gr.Interface(
     fn=process_audio_for_matching,
     inputs=gr.Audio(
         label="Upload Your Audio File",
         elem_classes=["upload-container"]
     ),
+    outputs=[
+        gr.Audio(
+            label="Best Match Audio (plays from matched timestamp)",
+            elem_classes=["output-container"]
+        ),
+        gr.HTML(
+            label="Analysis Results",
+            elem_classes=["output-container"]
+        )
+    ],
     title="Music Plagiarism Detection",
     description="""
     <div style="text-align: center; font-size: 1.1em; color: #374151; margin: 25px 0; line-height: 1.6;">
             Submitted to ICASSP 2026
         </p>
         <hr style="border: none; border-top: 1px solid #e5e7eb; margin: 20px 0;">
+        <p><strong>⚠️ Demo Version Notice:</strong><br>
+        This demo differs from the paper version and focuses exclusively on vocal segment transcription.</p>
+        <p>Upload any music file to detect vocal similarities in the Covers80 dataset.<br>
+        The system analyzes only vocal characteristics, ignoring instrumental parts.</p>
+        <p style="font-size: 0.95em; color: #dc2626; font-weight: 600; margin-top: 15px;">
+            ⏱️ Processing can take up to 2 minutes per file
+        </p>
+        <p style="font-size: 0.95em; color: #6b7280; margin-top: 10px;">
+            Supported formats: MP3, WAV, M4A, FLAC
         </p>
     </div>
     """,

compare.py CHANGED Viewed

@@ -1,444 +1,423 @@
-import spaces
-import gradio as gr
 import torch
-import librosa
-import numpy as np
-import subprocess
-import sys
 import os
 import glob
-from pathlib import Path
-from huggingface_hub import snapshot_download
-import shutil
-token = os.getenv("HF_TOKEN")
-# Install madmom from GitHub
-def install_madmom():
-    subprocess.check_call([
-        sys.executable, "-m", "pip", "install",
-        "git+https://github.com/CPJKU/madmom", "--no-cache-dir"
-    ])
-    print("madmom installed from GitHub")
-install_madmom()
-# Add current directory to Python path for ml_models
-sys.path.insert(0, '.')
-sys.path.insert(0, './ml_models')
-def download_data_from_hub():
-    print("=== DOWNLOAD FUNCTION START ===")
-    base_dir = Path(".")
-    data_repo_id = "mippia/music-data"
-    print(f"Base directory: {base_dir.absolute()}")
-    print(f"Repository: {data_repo_id}")
-    folders_to_check = ["covers80", "ml_models"]
-    downloaded_folders = {}
-    # Check LFS file
-    lfs_file = base_dir / "1005_e_4"
-    print(f"Checking LFS file: {lfs_file}")
-    if lfs_file.exists():
-        file_size = lfs_file.stat().st_size / (1024*1024)
-        print(f"LFS file found: {file_size:.1f} MB")
-        downloaded_folders["1005_e_4"] = str(lfs_file)
-    else:
-        print("LFS file not found")
-        downloaded_folders["1005_e_4"] = None
-    # Check existing folders
-    print("=== CHECKING EXISTING FOLDERS ===")
-    for folder in folders_to_check:
-        folder_path = base_dir / folder
-        print(f"Checking {folder} at {folder_path}")
-        if folder_path.exists():
-            if any(folder_path.iterdir()):
-                print(f"  {folder} exists and has content")
-            else:
-                print(f"  {folder} exists but is empty")
-        else:
-            print(f"  {folder} does not exist")
-    all_folders_exist = all((base_dir / folder).exists() and any((base_dir / folder).iterdir())
-                           for folder in folders_to_check)
-    print(f"All folders exist: {all_folders_exist}")
-    if not all_folders_exist:
-        print("=== STARTING DOWNLOAD ===")
-        # Download to a temporary directory first
-        temp_dir = base_dir / "temp_download"
-        print(f"Creating temp directory: {temp_dir}")
-        temp_dir.mkdir(exist_ok=True)
-        print("Calling snapshot_download...")
-        downloaded_path = snapshot_download(
-            repo_id=data_repo_id,
-            repo_type="dataset",
-            local_dir=str(temp_dir),
-            local_dir_use_symlinks=False,
-            token=token,
-            ignore_patterns=["*.md", "*.txt", ".gitattributes", "README.md"]
         )
-        print(f"Download completed to: {downloaded_path}")
-        # Check what was downloaded
-        print("=== CHECKING TEMP DOWNLOAD CONTENTS ===")
-        print(f"Temp directory contents:")
-        for item in temp_dir.iterdir():
-            item_type = "DIR" if item.is_dir() else "FILE"
-            print(f"  {item.name} ({item_type})")
-            if item.is_dir():
-                file_count = len([f for f in item.rglob("*") if f.is_file()])
-                print(f"    Contains {file_count} files")
-        # Move folders from temp to current directory
-        print("=== MOVING FOLDERS ===")
-        for folder_name in folders_to_check:
-            temp_folder_path = temp_dir / folder_name
-            target_folder_path = base_dir / folder_name
-            print(f"Processing {folder_name}:")
-            print(f"  Source: {temp_folder_path}")
-            print(f"  Target: {target_folder_path}")
-            print(f"  Source exists: {temp_folder_path.exists()}")
-            if temp_folder_path.exists():
-                # Remove existing target if it exists
-                if target_folder_path.exists():
-                    print(f"  Removing existing target directory")
-                    shutil.rmtree(target_folder_path)
-                # Move folder
-                print(f"  Moving folder...")
-                shutil.move(str(temp_folder_path), str(target_folder_path))
-                # Verify move
-                if target_folder_path.exists():
-                    file_count = len([f for f in target_folder_path.rglob("*") if f.is_file()])
-                    print(f"  SUCCESS: {folder_name} moved with {file_count:,} files")
-                    downloaded_folders[folder_name] = str(target_folder_path)
                 else:
-                    print(f"  ERROR: Move failed for {folder_name}")
-                    downloaded_folders[folder_name] = None
-            else:
-                print(f"  ERROR: {folder_name} not found in temp download")
-                downloaded_folders[folder_name] = None
-        # Clean up temp directory
-        print("=== CLEANING UP TEMP DIRECTORY ===")
-        if temp_dir.exists():
-            shutil.rmtree(temp_dir)
-            print("Temp directory removed")
-    else:
-        print("=== USING EXISTING FOLDERS ===")
-        for folder_name in folders_to_check:
-            folder_path = base_dir / folder_name
-            if folder_path.exists():
-                file_count = len([f for f in folder_path.rglob("*") if f.is_file()])
-                print(f"{folder_name}: {file_count:,} files")
-                downloaded_folders[folder_name] = str(folder_path)
-            else:
-                downloaded_folders[folder_name] = None
-    print("=== FINAL STATUS ===")
-    for key, value in downloaded_folders.items():
-        print(f"{key}: {value}")
-    print("=== DOWNLOAD FUNCTION END ===")
-    return downloaded_folders
-# Download data and check results
-print("Starting Music Plagiarism Detection App...")
-folders = download_data_from_hub()
-# Final verification
-print("=== FINAL VERIFICATION ===")
-current_dir = Path(".")
-print(f"Current directory contents after download:")
-for item in current_dir.iterdir():
-    item_type = "DIR" if item.is_dir() else "FILE"
-    print(f"  {item.name} ({item_type})")
-# Check ml_models specifically
-ml_models_path = Path("ml_models")
-print(f"ml_models check:")
-print(f"  Exists: {ml_models_path.exists()}")
-if ml_models_path.exists():
-    print(f"  Is directory: {ml_models_path.is_dir()}")
-    print(f"  Contents:")
-    for item in ml_models_path.iterdir():
-        print(f"    {item.name}")
-# Import inference
-print("=== IMPORTING INFERENCE ===")
-from inference import inference
-def find_song_file_by_title(song_title):
-    covers80_path = Path("covers80")
-    if not covers80_path.exists():
-        return None
-    # Try exact match patterns
-    exact_patterns = [
-        f"{song_title}.mp3",
-        f"*{song_title}.mp3",
-        f"{song_title}*.mp3"
-    ]
-    for pattern in exact_patterns:
-        matches = list(covers80_path.glob(pattern))
-        if matches:
-            return str(matches[0])
-    # Try partial matches
-    song_parts = song_title.replace('_', ' ').split()
-    for part in song_parts:
-        if len(part) > 3:
-            matches = list(covers80_path.glob(f"*{part}*.mp3"))
-            if matches:
-                return str(matches[0])
-    return None
-@spaces.GPU(duration=300)
-def process_audio_for_matching(audio_file):
-    if audio_file is None:
-        return None, """
-        <div style='text-align: center; color: #dc2626; padding: 30px; background: #fef2f2; border-radius: 12px; border: 2px dashed #fecaca;'>
-            <h3>No Audio File</h3>
-            <p>Please upload an audio file to get started!</p>
-        </div>
-        """
-    result = inference(audio_file)
-    if result.get('message') != 'success':
-        return None, f"""
-        <div style="text-align: center; padding: 25px; background: #fefce8; border-radius: 12px; border: 1px solid #fde047; margin: 10px 0;">
-            <h3 style="color: #a16207; margin-bottom: 15px;">No Matches Found</h3>
-            <p style="color: #a16207; font-size: 1.1em;">{result.get('message', 'Unknown error occurred')}</p>
-        </div>
-        """
-    matches = result.get('matches', [])
-    if not matches:
-        return None, """
-        <div style="text-align: center; padding: 25px; background: #fefce8; border-radius: 12px; border: 1px solid #fde047; margin: 10px 0;">
-            <h3 style="color: #a16207; margin-bottom: 15px;">No Matches Found</h3>
-            <p style="color: #a16207; font-size: 1.1em;">No matching vocals found in the dataset.</p>
-        </div>
-        """
-    # Get the best match for audio playback
-    best_match = matches[0]
-    song_title = best_match.get('song_title', 'Unknown Song')
-    library_time = best_match.get('library_time', 0)
-    # Find song file
-    song_file_path = find_song_file_by_title(song_title)
-    # Generate match results HTML
-    matches_html = ""
-    for match in matches:
-        rank = match.get('rank', 0)
-        song_title_display = match.get('song_title', 'Unknown Song')
-        confidence = match.get('confidence', '0%')
-        test_time = match.get('test_time', 0)
-        library_time_display = match.get('library_time', 0)
-        # Ranking colors
-        rank_colors = {1: '#dc2626', 2: '#ea580c', 3: '#16a34a'}
-        rank_color = rank_colors.get(rank, '#6b7280')
-        matches_html += f"""
-        <div style="background: #ffffff; border-radius: 12px; padding: 20px; margin: 15px 0;
-                    border-left: 5px solid {rank_color}; box-shadow: 0 3px 10px rgba(0,0,0,0.1);">
-            <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 10px;">
-                <h3 style="color: #111827; margin: 0; font-size: 1.2em;">
-                    <span style="background: {rank_color}; color: white; padding: 4px 8px; border-radius: 15px; font-size: 0.8em; margin-right: 10px;">
-                        #{rank}
-                    </span>
-                    {song_title_display}
-                </h3>
-                <span style="background: #f3f4f6; color: #111827; padding: 6px 12px; border-radius: 20px; font-weight: 600;">
-                    {confidence}
-                </span>
-            </div>
-            <div style="background: #f9fafb; border-radius: 8px; padding: 12px; margin: 10px 0;">
-                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; text-align: center;">
-                    <div>
-                        <strong style="color: #1f2937;">Your Audio</strong>
-                        <br><span style="color: #dc2626; font-size: 1.1em;">{test_time:.1f}s</span>
-                    </div>
-                    <div>
-                        <strong style="color: #1f2937;">Matched At</strong>
-                        <br><span style="color: #16a34a; font-size: 1.1em;">{library_time_display:.1f}s</span>
-                    </div>
-                </div>
-            </div>
-        </div>
-        """
-    results_html = f"""
-    <div style="background: #ffffff; border-radius: 16px; padding: 30px;
-                box-shadow: 0 4px 20px rgba(0,0,0,0.08); border: 1px solid #e5e7eb; margin: 10px 0;">
-        <div style="text-align: center; margin-bottom: 25px;">
-            <h2 style="color: #111827; margin-bottom: 10px; font-size: 1.8em;">Vocal Matching Results</h2>
-            <p style="color: #374151; font-size: 1.1em;">Found {len(matches)} similar vocals in Covers80 dataset</p>
-        </div>
-        {matches_html}
-        <div style="text-align: center; margin-top: 25px; padding: 15px; background: #f3f4f6; border-radius: 8px;">
-            <p style="color: #374151; margin: 0; font-size: 0.95em;">
-                <strong>Audio Player:</strong> Playing the best match starting from the matched timestamp ({library_time:.1f}s)
-            </p>
-        </div>
-    </div>
-    """
-    # Return audio file with timestamp and results
-    if song_file_path and os.path.exists(song_file_path):
-        return (song_file_path, library_time), results_html
-    else:
-        return None, results_html
-# CSS styles
-custom_css = """
-.gradio-container {
-    background: #f9fafb !important;
-    min-height: 100vh;
-    padding: 20px;
-}
-.main-container {
-    background: #ffffff !important;
-    border-radius: 16px !important;
-    box-shadow: 0 4px 20px rgba(0,0,0,0.08) !important;
-    margin: 0 auto !important;
-    padding: 40px !important;
-    max-width: 900px;
-    border: 1px solid #e5e7eb !important;
-}
-h1 {
-    text-align: center !important;
-    font-size: 2.5em !important;
-    font-weight: 700 !important;
-    margin-bottom: 15px !important;
-    color: #111827 !important;
-}
-.gradio-markdown p {
-    text-align: center !important;
-    font-size: 1.1em !important;
-    color: #374151 !important;
-    margin-bottom: 25px !important;
-    line-height: 1.6;
-}
-.upload-container {
-    background: #ffffff !important;
-    border-radius: 12px !important;
-    padding: 25px !important;
-    border: 2px dashed #d1d5db !important;
-    margin-bottom: 25px !important;
-    transition: all 0.3s ease !important;
-}
-.upload-container:hover {
-    border-color: #2563eb !important;
-    background: #f9fafb !important;
-}
-.output-container {
-    background: #ffffff !important;
-    border-radius: 12px !important;
-    padding: 20px !important;
-    border: 1px solid #e5e7eb !important;
-    min-height: 200px !important;
-}
-.gr-button {
-    background: #2563eb !important;
-    color: #ffffff !important;
-    border: none !important;
-    border-radius: 8px !important;
-    padding: 12px 24px !important;
-    font-weight: 500 !important;
-    font-size: 1em !important;
-    transition: all 0.2s ease !important;
-}
-.gr-button:hover {
-    background: #1d4ed8 !important;
-    transform: translateY(-1px) !important;
-    box-shadow: 0 4px 12px rgba(37, 99, 235, 0.25) !important;
-}
-@media (max-width: 768px) {
-    h1 { font-size: 2em !important; }
-    .main-container { margin: 10px !important; padding: 25px !important; }
-    .upload-container { padding: 20px !important; }
-}
-"""
-# Gradio interface - using original Interface with multiple outputs
-demo = gr.Interface(
-    fn=process_audio_for_matching,
-    inputs=gr.Audio(
-        type="filepath",
-        label="Upload Your Audio File",
-        elem_classes=["upload-container"]
-    ),
-    outputs=[
-        gr.Audio(
-            label="Best Match Audio (plays from matched timestamp)",
-            elem_classes=["output-container"]
-        ),
-        gr.HTML(
-            label="Analysis Results",
-            elem_classes=["output-container"]
-        )
-    ],
-    title="Music Plagiarism Detection",
-    description="""
-    <div style="text-align: center; font-size: 1.1em; color: #374151; margin: 25px 0; line-height: 1.6;">
-        <p><strong>Music Plagiarism Detection: Problem Formulation and a Segment-based Solution</strong></p>
-        <p style="font-size: 0.9em; color: #6b7280; margin: 10px 0;">
-            Authors: Seonghyeon Go, Yumin Kim<br>
-            MIPPIA Inc.<br>
-            Submitted to ICASSP 2026
-        </p>
-        <hr style="border: none; border-top: 1px solid #e5e7eb; margin: 20px 0;">
-        <p><strong>⚠️ Demo Version Notice:</strong><br>
-        This demo differs from the paper version and focuses exclusively on vocal segment transcription.</p>
-        <p>Upload any music file to detect vocal similarities in the Covers80 dataset.<br>
-        The system analyzes only vocal characteristics, ignoring instrumental parts.</p>
-        <p style="font-size: 0.95em; color: #dc2626; font-weight: 600; margin-top: 15px;">
-            ⏱️ Processing can take up to 2 minutes per file
-        </p>
-        <p style="font-size: 0.95em; color: #6b7280; margin-top: 10px;">
-            Supported formats: MP3, WAV, M4A, FLAC
-        </p>
-    </div>
-    """,
-    examples=[],
-    css=custom_css,
-    theme=gr.themes.Soft(
-        primary_hue="blue",
-        secondary_hue="gray",
-        neutral_hue="gray",
-        font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"]
-    ),
-    elem_classes=["main-container"],
-    allow_flagging="never"
-)
-if __name__ == "__main__":
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        show_api=False,
-        show_error=True,
-        share=False
-    )

 import torch
+import heapq
+import jsonpickle
 import os
+import pandas as pd
+import random
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+from compare_utils import remove_1, algorithmic_collate3, CompareHelper, quantize_image, infos_to_pianorolls, get_duration_in_interval, shift_image_optimized, piano_roll_to_chroma, calculate_correlation
 import glob
+from torch.utils.data import Dataset
+import unicodedata
+covers80_path = "covers80"
+youtubecover_jsons = glob.glob(os.path.join(covers80_path, "*.json"))
+def get_one_result(info_json):
+    results = []
+    device = torch.device('cpu')
+    use_new_bpm = False
+    inst = 'vocal'
+    # info_json 처리
+    test_dataset = TestDataset(info_json, use_new_bpm=use_new_bpm, inst=[inst])
+    imgs, labels, points = test_dataset[0]
+    test_images = [img for img in imgs]
+    test_labels = [label for label in labels]
+    test_points = [remove_1(point) for point in points]
+    try:
+        test_images = torch.cat(test_images).to(device)
+    except:
+        test_dataset = TestDataset(info_json, use_new_bpm=use_new_bpm, inst=['vocal'], condition=0)
+        imgs, labels, points = test_dataset[0]
+        test_images = [img for img in imgs]
+        test_labels = [label for label in labels]
+        test_points = [remove_1(point) for point in points]
+        try:
+            test_images = torch.cat(test_images).to(device)
+        except Exception as e:
+            test_dataset = TestDataset(info_json, use_new_bpm=use_new_bpm, inst=['vocal'], condition=0)
+            imgs, labels, points = test_dataset[0]
+            test_images = [img for img in imgs]
+            test_labels = [label for label in labels]
+            test_points = [remove_1(point) for point in points]
+            try:
+                test_images = torch.cat(test_images).to(device)
+            except:
+                print(e)
+                return ["there is no note for this song"], []
+    test_bpms = torch.tensor([label['bpm'] for label in labels])
+    test_bpms_expanded = test_bpms[:, None]
+    test_images_expanded = test_images[:, None, :, :].to(device)
+    # youtubecover_jsons 처리
+    additional_test_dataset = TestDataset2(youtubecover_jsons, inst=[inst], condition=0)
+    additional_test_loader = DataLoader(additional_test_dataset, batch_size=40, collate_fn=algorithmic_collate3)
+    compare_result = []
+    max_heap_size = 1000
+    for idx, (additional_library_images, additional_library_labels, additional_library_points) in tqdm(enumerate(additional_test_loader)):
+        additional_library_images = torch.cat(additional_library_images).to(device)
+        additional_library_images = additional_library_images.squeeze(1)
+        additional_library_images_expanded = additional_library_images[None, :, :, :].to(device)
+        additional_library_bpms = torch.tensor([label['bpm'] for label in additional_library_labels]).to(device)
+        additional_library_bpms_expanded = additional_library_bpms[None, :]
+        metrics = calculate_metric_optimized(
+            test_images_expanded,
+            additional_library_images_expanded,
+            test_points,
+            additional_library_points,
+            test_bpms_expanded,
+            additional_library_bpms_expanded,
+            device
         )
+        max_matching_score = torch.zeros_like(metrics)
+        for i, test_label in enumerate(test_labels):
+            for j, additional_library_label in enumerate(additional_library_labels):
+                metric = metrics[i, j].item()
+                # chord1 = test_labels[i]['chord']
+                # chord2 = additional_library_labels[j]['chord']
+                # matching_count = sum(c1 == c2 and c1 != 'Unknown' for c1, c2 in zip(chord1, chord2))
+                # matching_score = [0, 0.02, 0.05, 0.09, 0.16]
+                # max_matching_score[i, j] = matching_score[int(matching_count)]
+                final_metric = (metric)
+                if final_metric > 1:
+                    final_metric = 1
+                result_entry = CompareHelper([final_metric, test_label, additional_library_label, test_points[i], additional_library_points[j]])
+                # heap 크기 제한 로직
+                if len(compare_result) < max_heap_size:
+                    heapq.heappush(compare_result, result_entry)
                 else:
+                    # heap이 가득 찬 경우, 최소값보다 큰 경우에만 교체
+                    if result_entry.data[0] > compare_result[0].data[0]:
+                        heapq.heappop(compare_result)  # 최소값 제거
+                        heapq.heappush(compare_result, result_entry)  # 새로운 값 추가
+    sorted_compare_results = sorted(compare_result, key=lambda x: x.data[0], reverse=True)
+    return sorted_compare_results
class TestDataset(Dataset):
    """Single-song dataset that turns one info JSON into piano-roll segments.

    The dataset always reports a length of 1. Indexing it returns every
    qualifying segment of the song at once, as three parallel lists
    (images, labels, points).

    NOTE(review): ``condition`` is stored on the instance but the segment
    filter in ``__getitem__`` uses a fixed "more than 4 notes" threshold.
    Confirm whether the filter was meant to read ``self.condition``.
    """

    def __init__(self, info_path, use_all=False, use_new_bpm=False, inst=None, condition=4):
        """Remember which info file to read and which instruments to keep.

        Args:
            info_path: Path to the song's info JSON file.
            use_all: Forwarded to ``infos_to_pianorolls`` when the file is read.
            use_new_bpm: When true, read the sibling file obtained by replacing
                ".json" with "newbpm.json" in ``info_path``.
            inst: Instrument keys to keep. ``None`` (the default) means
                ``['vocal', 'melody']``.
            condition: Stored as ``self.condition``; see the class note.
        """
        if use_new_bpm:
            self.library_files = [info_path.replace(".json", "newbpm.json")]
        else:
            self.library_files = [info_path]
        self.info_path = info_path
        self.use_all = use_all
        # Build a fresh list per instance: a list literal in the signature
        # would be shared (and mutable) across every default-constructed object.
        self.inst = ['vocal', 'melody'] if inst is None else inst
        self.condition = condition

    def __len__(self):
        # Always one item, even when use_new_bpm selects the alternate file.
        return 1

    def get_chords(self, chord_info, time1, time2):
        """Return four chord names, one per quarter of ``[time1, time2]``.

        For each quarter the chord with the largest value returned by
        ``get_duration_in_interval`` wins; ``'Unknown'`` is used when no chord
        scores above zero or when ``chord_info`` is ``None``.

        Args:
            chord_info: Iterable of dicts with 'start', 'end' and 'chord'
                keys, or ``None``.
            time1: Start of the window.
            time2: End of the window.
        """
        if chord_info is None:
            return ['Unknown', 'Unknown', 'Unknown', 'Unknown']
        # Split the window into four equal sub-intervals.
        intervals = [
            (time1 + i * (time2 - time1) / 4, time1 + (i + 1) * (time2 - time1) / 4)
            for i in range(4)
        ]
        selected_chords = []
        for start_interval, end_interval in intervals:
            best_chord = None
            best_duration = 0
            for chord in chord_info:
                if chord['start'] <= end_interval and chord['end'] >= start_interval:
                    duration = get_duration_in_interval(chord, start_interval, end_interval)
                    if duration > best_duration:
                        best_duration = duration
                        best_chord = chord['chord']
            selected_chords.append(best_chord if best_chord else 'Unknown')
        return selected_chords

    def get_structure(self, segment_label, time1, time2):
        """Return the label of the segment overlapping ``[time1, time2]`` most.

        Args:
            segment_label: Iterable of dicts with 'start', 'end' and 'label'.
            time1: Start of the window.
            time2: End of the window.

        Returns:
            The winning segment's 'label', or ``None`` when nothing overlaps.
        """
        max_overlap = 0
        target_label = None
        for segment in segment_label:
            overlap = min(segment['end'], time2) - max(segment['start'], time1)
            # max_overlap starts at 0, so this also rejects non-overlapping
            # segments (overlap <= 0).
            if overlap > max_overlap:
                max_overlap = overlap
                target_label = segment['label']
        return target_label

    def __getitem__(self, idx):
        """Load the song and return ``(images, labels, points)`` for its segments.

        ``idx`` is ignored: every file in ``self.library_files`` is read.
        Only instruments listed in ``self.inst`` and segments with more than
        four note entries are kept.
        """
        images = []
        labels = []
        points = []
        for info_link in self.library_files:
            # Keep the file open only while reading it.
            with open(info_link, 'rb') as f:
                infos = jsonpickle.decode(f.read())
            test_piano, test_timing, test_point = infos_to_pianorolls(infos, self.use_all)
            # Length of one bar: first beat interval times infos['rhythm'].
            one_bar_beat = (infos['beat_times'][1] - infos['beat_times'][0]) * infos['rhythm']
            for key, rolls in test_piano.items():
                if key not in self.inst:
                    continue
                for time, image in rolls.items():
                    notes = test_point[key][time]
                    if len(notes) <= 4:
                        continue
                    image = torch.tensor(image).transpose(0, 1).unsqueeze(dim=0).float()  # 1, 128, 192(64)
                    time1 = infos['downbeat_start'] + one_bar_beat * int(test_timing[time])
                    time2 = time1 + 4 * one_bar_beat
                    chord = self.get_chords(infos['chord_info'], time1, time2)
                    title = unicodedata.normalize('NFC', infos['title'])
                    label = {
                        "title": title,
                        "bpm": infos['bpm'],
                        "newbpm": infos['new_bpm'],
                        "inst": key,
                        "time": time1,
                        "time2": time2,
                        "link": infos['link'],
                        "shift": 0,
                        "platform": infos['platform'],
                        "song_start": infos['downbeat_start'] + one_bar_beat * int(test_timing[0]),
                        "song_end": infos['beat_times'][-1],
                        "chord": chord,
                        "used_time": None,
                        "info_link": info_link
                    }
                    images.append(quantize_image(image))
                    labels.append(label)
                    points.append(notes)
        return images, labels, points
def compare_titles(title1, title2):
    """Return True when two titles match once punctuation, spaces and case are ignored.

    Each title is first normalised to Unicode NFC so that the same text in
    composed and decomposed form (for example macOS file names, Hangul, or
    accented Latin letters) reduces to the same key. Then only characters
    for which ``str.isalnum()`` is true are kept, lower-cased.
    """
    def strip_to_basics(title):
        # Without normalisation a decomposed accent (combining mark) is
        # dropped by isalnum() while the precomposed letter is kept, so
        # equal titles would compare unequal.
        canonical = unicodedata.normalize('NFC', title)
        return ''.join(c.lower() for c in canonical if c.isalnum())
    return strip_to_basics(title1) == strip_to_basics(title2)
class TestDataset2(Dataset):
    """Library dataset: one item per info JSON file in ``library_files``.

    Indexing returns every qualifying segment of that file as three parallel
    lists (images, labels, points).

    NOTE(review): ``condition`` is stored on the instance but the segment
    filter in ``__getitem__`` uses a fixed "more than 4 notes" threshold.
    Confirm whether the filter was meant to read ``self.condition``.
    """

    def __init__(self, library_files, inst=None, condition=4):
        """Store the file list and the instruments to keep.

        Args:
            library_files: List of info JSON paths; the full list is passed in.
            inst: Instrument keys to keep. ``None`` (the default) means
                ``['vocal', 'melody']``.
            condition: Stored as ``self.condition``; see the class note.
        """
        self.library_files = library_files
        self.use_all = True
        # Build a fresh list per instance: a list literal in the signature
        # would be shared (and mutable) across every default-constructed object.
        self.inst = ['vocal', 'melody'] if inst is None else inst
        self.condition = condition

    def __len__(self):
        # One item per library file.
        return len(self.library_files)

    def get_chords(self, chord_info, time1, time2):
        """Return four chord names, one per quarter of ``[time1, time2]``.

        For each quarter the chord with the largest value returned by
        ``get_duration_in_interval`` wins; ``'Unknown'`` is used when no chord
        scores above zero or when ``chord_info`` is ``None``.

        Args:
            chord_info: Iterable of dicts with 'start', 'end' and 'chord'
                keys, or ``None``.
            time1: Start of the window.
            time2: End of the window.
        """
        if chord_info is None:
            return ['Unknown', 'Unknown', 'Unknown', 'Unknown']
        # Split the window into four equal sub-intervals.
        intervals = [
            (time1 + i * (time2 - time1) / 4, time1 + (i + 1) * (time2 - time1) / 4)
            for i in range(4)
        ]
        selected_chords = []
        for start_interval, end_interval in intervals:
            best_chord = None
            best_duration = 0
            for chord in chord_info:
                if chord['start'] <= end_interval and chord['end'] >= start_interval:
                    duration = get_duration_in_interval(chord, start_interval, end_interval)
                    if duration > best_duration:
                        best_duration = duration
                        best_chord = chord['chord']
            selected_chords.append(best_chord if best_chord else 'Unknown')
        return selected_chords

    def get_structure(self, segment_label, time1, time2):
        """Return the label of the segment overlapping ``[time1, time2]`` most.

        Args:
            segment_label: Iterable of dicts with 'start', 'end' and 'label'.
            time1: Start of the window.
            time2: End of the window.

        Returns:
            The winning segment's 'label', or ``None`` when nothing overlaps.
        """
        max_overlap = 0
        target_label = None
        for segment in segment_label:
            overlap = min(segment['end'], time2) - max(segment['start'], time1)
            # max_overlap starts at 0, so this also rejects non-overlapping
            # segments (overlap <= 0).
            if overlap > max_overlap:
                max_overlap = overlap
                target_label = segment['label']
        return target_label

    def __getitem__(self, idx):
        """Load file ``idx`` and return ``(images, labels, points)`` for its segments.

        Only instruments listed in ``self.inst`` and segments with more than
        four note entries are kept. Labels always carry platform 'youtube'.
        """
        images = []
        labels = []
        points = []
        # Process exactly one file per item.
        info_link = self.library_files[idx]
        # Keep the file open only while reading it.
        with open(info_link, 'rb') as f:
            infos = jsonpickle.decode(f.read())
        test_piano, test_timing, test_point = infos_to_pianorolls(infos, True)
        # Length of one bar: first beat interval times infos['rhythm'].
        one_bar_beat = (infos['beat_times'][1] - infos['beat_times'][0]) * infos['rhythm']
        for key, rolls in test_piano.items():
            if key not in self.inst:
                continue
            for time, image in rolls.items():
                notes = test_point[key][time]
                title = unicodedata.normalize('NFC', infos['title'])
                if len(notes) <= 4:
                    continue
                image = torch.tensor(image).transpose(0, 1).unsqueeze(dim=0).float()  # 1, 128, 192(64)
                time1 = infos['downbeat_start'] + one_bar_beat * int(test_timing[time])
                time2 = time1 + 4 * one_bar_beat
                chord = self.get_chords(infos['chord_info'], time1, time2)
                label = {
                    "title": title,
                    "bpm": infos['bpm'],
                    "newbpm": infos['new_bpm'],
                    "inst": key,
                    "time": time1,
                    "time2": time2,
                    "shift": 0,
                    "platform": 'youtube',
                    "song_start": infos['downbeat_start'] + one_bar_beat * int(test_timing[0]),
                    "song_end": infos['beat_times'][-1],
                    "chord": chord,
                    "used_time": None,
                    "info_link": info_link
                }
                images.append(quantize_image(image))
                labels.append(label)
                points.append(notes)
        return images, labels, points
def _build_onset_rows(points_list, device, width=64):
    """Return a ``(len(points_list), width)`` 0/1 tensor of note onsets per segment."""
    rows = torch.zeros((len(points_list), width), device=device)
    for row, points in enumerate(points_list):
        points_tensor = torch.tensor(points).to(device)
        # Column 0 of each note row is divided by 3 and rounded to pick a
        # slot; slots past the last column are clamped onto it.
        slots = torch.round(points_tensor[:, 0] / 3.0).long()
        slots = torch.clamp(slots, max=width - 1)
        rows[row, slots] = 1
    return rows


def calculate_metric_optimized(images1, images2, points1, points2, bpms1, bpms2, device):
    """Score every test segment against every library segment.

    The score combines a pitch-class overlap term, an image overlap ratio,
    a rhythm correlation from ``calculate_correlation`` and a BPM ratio, and
    is clamped to ``[0, 1]``. The test images are evaluated under eleven
    time shifts (-5..5, pitch shift fixed at 0) and the best score over the
    shifts is returned.

    Args:
        images1: Test piano rolls; axis 0 is read as the test-segment count.
            Assumes the caller passes ``(N1, 1, H, W)`` — TODO confirm.
        images2: Library piano rolls; axis 1 is read as the library-segment
            count. Assumes the caller passes ``(1, N2, H, W)`` — TODO confirm.
        points1: Per-test-segment note lists; column 0 is used as the onset
            position.
        points2: Per-library-segment note lists, same layout.
        bpms1: Test BPMs, concatenated along axis 0 once per shift.
        bpms2: Library BPMs, trimmed along axis 1.
        device: Torch device for the intermediate tensors.

    Returns:
        Tensor with the maximum score over all shifts for each
        (test, library) pair.
    """
    images1 = piano_roll_to_chroma(images1)
    images2 = piano_roll_to_chroma(images2)

    # Trim images, points and BPMs to a common count on each side. The test
    # count lives on axis 0 and the library count on axis 1, so each tensor
    # is sliced along the axis that actually holds its segments.
    count1 = min(images1.shape[0], len(points1))
    count2 = min(images2.shape[1], len(points2))
    images1 = images1[:count1]
    images2 = images2[:, :count2]
    points1 = points1[:count1]
    points2 = points2[:count2]
    bpms1 = bpms1[:count1]
    bpms2 = bpms2[:, :count2]

    rhythm_images2 = _build_onset_rows(points2, device)

    # Stack one copy of the test images per time shift along axis 0.
    # Pitch shifting is a hook for future variation; only 0 is used.
    pitch_shift = 0
    shifted_images1_list = [
        shift_image_optimized(images1, time_shift, pitch_shift)
        for time_shift in range(-5, 6)
    ]
    shift_count = len(shifted_images1_list)
    shifted_images1_batch = torch.cat(shifted_images1_list, dim=0).to(device)
    shifted_bpms1_batch = torch.cat([bpms1] * shift_count, dim=0).to(device)

    # The onset rows do not depend on the shift, so build them once and tile
    # them to line up with the stacked image batch (row k * N1 + i).
    rhythm_images1_batch = _build_onset_rows(points1, device).repeat(shift_count, 1)

    # Tempo similarity: slower / faster BPM, softened by the 0.65 exponent.
    min_bpm_optimized = torch.min(shifted_bpms1_batch, bpms2)
    max_bpm_optimized = torch.max(shifted_bpms1_batch, bpms2)
    bpm_ratio_optimized = (min_bpm_optimized / max_bpm_optimized) ** 0.65

    max_shift = 8
    correlation = calculate_correlation(rhythm_images1_batch, rhythm_images2, max_shift, device)

    # Rows of axis 2 that are active in both / either image.
    overlap = shifted_images1_batch * images2
    unique_pitches_intersection = (overlap.sum(dim=3) > 0).float().sum(dim=2)
    unique_pitches_image2 = (images2.sum(dim=3) > 0).float().sum(dim=2)
    unique_pitches_image1 = (shifted_images1_batch.sum(dim=3) > 0).float().sum(dim=2)
    # Sigmoid centred on 9 active rows in total: pairs with few distinct
    # rows get a lower weight.
    difficulty = 1 / (1 + torch.exp(((unique_pitches_image2 + unique_pitches_image1) - 9) * -0.5))
    pitch_score = 2 * unique_pitches_intersection / (unique_pitches_image2 + unique_pitches_image1)
    final_pitch_score = pitch_score * difficulty

    # Intersection over union of the two images.
    total = (shifted_images1_batch + images2).clamp_(0, 1).sum(dim=(2, 3))
    intersection = overlap.sum(dim=(2, 3))
    ratio = intersection / total

    metrics = (0.5 + 1 * final_pitch_score) * ((ratio) * (1.05) + 0.15 * torch.maximum(correlation, ratio)) * bpm_ratio_optimized
    metrics = metrics.clamp_(0, 1)

    # Fold the shift axis back out and keep the best score per pair.
    metrics_reshaped = metrics.view(shift_count, -1, *metrics.shape[1:])
    max_metric, _ = torch.max(metrics_reshaped, dim=0)
    return max_metric