slslslrhfem committed on
Commit 5288edb · 1 Parent(s): e99e064

first_push

Files changed (10)
  1. .gitignore +0 -1
  2. README.md +20 -0
  3. compare.py +423 -0
  4. compare_utils.py +324 -0
  5. music_info.py +33 -0
  6. runtime.txt +1 -0
  7. segment_transcription.py +106 -0
  8. test.py +6 -0
  9. utils.py +99 -0
  10. wav_quantizer.py +162 -0
.gitignore CHANGED
@@ -1,5 +1,4 @@
  covers80/
  ml_models/
  __pycache__/
- *.pyc
  .env
README.md ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ title: Music Plagiarism Detection Demo
3
+ emoji: 🎵
4
+ colorFrom: red
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: gpl-3.0
11
+ ---
12
+
13
+ # Music Plagiarism Detection: Problem Formulation and A Segment-Based Solution
14
+
15
+ **ICASSP 2026 Demo**
16
+
17
+ **Authors:** Seonghyeon Go*, Yumin Kim*
18
+ **Affiliation:** MIPPIA Inc.
19
+
20
+ Upload a song and find the most similar vocal match from the covers80 dataset.
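
app.py (the `app_file` above) is not part of this commit, so the Gradio wiring is not shown. As a rough, hypothetical sketch of how the committed modules could chain together for one query, assuming a CUDA device for the transcription step and that the `covers80/*.json` library files have already been built:

```python
# Hypothetical wrapper (not part of this commit) chaining the committed modules.
from segment_transcription import segment_transcription  # writes <audio>.json, returns its path
from compare import get_one_result                        # scores the query against covers80/*.json

def find_best_match(audio_path):
    info_json = segment_transcription(audio_path)   # Demucs stems + beat tracking + vocal transcription
    results = get_one_result(info_json)              # list of CompareHelper, best match first (happy path)
    best = results[0]
    score, query_segment, match_segment = best.data[0], best.data[1], best.data[2]
    return match_segment["title"], score
```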
compare.py ADDED
@@ -0,0 +1,423 @@
1
+ import torch
2
+ import heapq
3
+ import jsonpickle
4
+ import os
5
+ import pandas as pd
6
+ import random
7
+ from tqdm import tqdm
8
+ from torch.utils.data import DataLoader
9
+ from compare_utils import remove_1, algorithmic_collate3, CompareHelper, quantize_image, infos_to_pianorolls, get_duration_in_interval, shift_image_optimized, piano_roll_to_chroma, calculate_correlation
10
+ import glob
11
+ from torch.utils.data import Dataset
12
+ import unicodedata
13
+
14
+ covers80_path = "covers80"
15
+ youtubecover_jsons = glob.glob(os.path.join(covers80_path, "*.json"))
16
+
17
+ def get_one_result(info_json):
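+ # Build 4-bar vocal segments from the query info_json, score each of them against every
+ # covers80 segment, and keep only the top-scoring pairs in a bounded min-heap.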
18
+ results = []
19
+ device = torch.device('cpu')
20
+ use_new_bpm = False
21
+ inst = 'vocal'
22
+
23
+ # Process info_json
24
+ test_dataset = TestDataset(info_json, use_new_bpm=use_new_bpm, inst=[inst])
25
+ imgs, labels, points = test_dataset[0]
26
+ test_images = [img for img in imgs]
27
+ test_labels = [label for label in labels]
28
+ test_points = [remove_1(point) for point in points]
29
+
30
+ try:
31
+ test_images = torch.cat(test_images).to(device)
32
+ except Exception:
33
+ test_dataset = TestDataset(info_json, use_new_bpm=use_new_bpm, inst=['vocal'], condition=0)
34
+ imgs, labels, points = test_dataset[0]
35
+ test_images = [img for img in imgs]
36
+ test_labels = [label for label in labels]
37
+ test_points = [remove_1(point) for point in points]
38
+ try:
39
+ test_images = torch.cat(test_images).to(device)
40
+ except Exception as e:
41
+ test_dataset = TestDataset(info_json, use_new_bpm=use_new_bpm, inst=['vocal'], condition=0)
42
+ imgs, labels, points = test_dataset[0]
43
+ test_images = [img for img in imgs]
44
+ test_labels = [label for label in labels]
45
+ test_points = [remove_1(point) for point in points]
46
+ try:
47
+ test_images = torch.cat(test_images).to(device)
48
+ except Exception:
49
+ print(e)
50
+ return ["there is no note for this song"], []
51
+
52
+ test_bpms = torch.tensor([label['bpm'] for label in labels])
53
+ test_bpms_expanded = test_bpms[:, None]
54
+ test_images_expanded = test_images[:, None, :, :].to(device)
55
+
56
+ # Process youtubecover_jsons
57
+ additional_test_dataset = TestDataset2(youtubecover_jsons, inst=[inst], condition=0)
58
+ additional_test_loader = DataLoader(additional_test_dataset, batch_size=5, collate_fn=algorithmic_collate3)
59
+
60
+ compare_result = []
61
+ max_heap_size = 1000
62
+
63
+ for idx, (additional_library_images, additional_library_labels, additional_library_points) in tqdm(enumerate(additional_test_loader)):
64
+ additional_library_images = torch.cat(additional_library_images).to(device)
65
+ additional_library_images = additional_library_images.squeeze(1)
66
+ additional_library_images_expanded = additional_library_images[None, :, :, :].to(device)
67
+ additional_library_bpms = torch.tensor([label['bpm'] for label in additional_library_labels]).to(device)
68
+ additional_library_bpms_expanded = additional_library_bpms[None, :]
69
+
70
+ metrics = calculate_metric_optimized(
71
+ test_images_expanded,
72
+ additional_library_images_expanded,
73
+ test_points,
74
+ additional_library_points,
75
+ test_bpms_expanded,
76
+ additional_library_bpms_expanded,
77
+ device
78
+ )
79
+
80
+ max_matching_score = torch.zeros_like(metrics)
81
+
82
+ for i, test_label in enumerate(test_labels):
83
+ for j, additional_library_label in enumerate(additional_library_labels):
84
+ metric = metrics[i, j].item()
85
+ # chord1 = test_labels[i]['chord']
86
+ # chord2 = additional_library_labels[j]['chord']
87
+ # matching_count = sum(c1 == c2 and c1 != 'Unknown' for c1, c2 in zip(chord1, chord2))
88
+ # matching_score = [0, 0.02, 0.05, 0.09, 0.16]
89
+ # max_matching_score[i, j] = matching_score[int(matching_count)]
90
+ # final_metric = (metric + matching_score[int(matching_count)])
91
+ final_metric = metric # the chord-matching bonus above is commented out, so use the raw metric
+ if final_metric > 1:
92
+ final_metric = 1
93
+
94
+ result_entry = CompareHelper([final_metric, test_label, additional_library_label, test_points[i], additional_library_points[j]])
95
+
96
+ # Limit the heap size
97
+ if len(compare_result) < max_heap_size:
98
+ heapq.heappush(compare_result, result_entry)
99
+ else:
100
+ # If the heap is full, replace only when the new score beats the current minimum
101
+ if result_entry.data[0] > compare_result[0].data[0]:
102
+ heapq.heappop(compare_result) # remove the minimum
103
+ heapq.heappush(compare_result, result_entry) # add the new entry
104
+
105
+ sorted_compare_results = sorted(compare_result, key=lambda x: x.data[0], reverse=True)
106
+
107
+ return sorted_compare_results
108
+
109
+
110
+
111
+
112
+ class TestDataset(Dataset):
113
+ def __init__(self, info_path, use_all=False, use_new_bpm=False, inst=['vocal','melody'],condition=4):
114
+ if use_new_bpm:
115
+ self.library_files = [info_path.replace(".json", "newbpm.json")]
116
+ else:
117
+ self.library_files = [info_path]
118
+ self.info_path = info_path
119
+ self.use_all = use_all
120
+ self.inst = inst
121
+ self.condition = condition
122
+ def __len__(self):
123
+ return 1 # len(self.library_files); it is always 1 even with use_new_bpm
124
+ def get_chords(self, chord_info, time1, time2):
125
+ if chord_info is None:
126
+ return ['Unknown', 'Unknown', 'Unknown', 'Unknown']
127
+ # Split the interval between time1 and time2 into four equal parts
128
+ intervals = [(time1 + i * (time2 - time1) / 4, time1 + (i + 1) * (time2 - time1) / 4) for i in range(4)]
129
+
130
+ selected_chords = []
131
+
132
+ for start_interval, end_interval in intervals:
133
+ best_chord = None
134
+ best_duration = 0
135
+
136
+ for chord in chord_info:
137
+ if chord['start'] <= end_interval and chord['end'] >= start_interval:
138
+ duration = get_duration_in_interval(chord, start_interval, end_interval)
139
+ if duration > best_duration:
140
+ best_duration = duration
141
+ best_chord = chord['chord']
142
+
143
+ if best_chord:
144
+ selected_chords.append(best_chord)
145
+ else:
146
+ selected_chords.append('Unknown')
147
+ return selected_chords
148
+ def get_structure(self, segment_label, time1, time2):
149
+ max_overlap = 0
150
+ target_label = None
151
+ for segment in segment_label:
152
+ # Calculate overlap between the segment and the time range
153
+ overlap = min(segment['end'], time2) - max(segment['start'], time1)
154
+
155
+ # If the overlap is negative, it means there is no overlap
156
+ if overlap > 0:
157
+ # Check if this is the maximum overlap found so far
158
+ if overlap > max_overlap:
159
+ max_overlap = overlap
160
+ target_label = segment['label']
161
+
162
+ return target_label
163
+ def __getitem__(self, idx):
164
+ images=[]
165
+ labels=[]
166
+ points=[]
167
+ info_links = self.library_files
168
+ for info_link in info_links:
169
+ with open(info_link, 'rb') as f:
170
+ infos =jsonpickle.decode(f.read())
171
+ test_piano, test_timing, test_point = infos_to_pianorolls(infos, self.use_all)
172
+ one_bar_beat = (infos['beat_times'][1] - infos['beat_times'][0]) * infos['rhythm']
173
+ for key in test_piano.keys():
174
+ if key in self.inst:
175
+ for time,image in test_piano[key].items():
176
+ second_values = [item[1] for item in test_point[key][time]]
177
+ unique_values = set(second_values)
178
+ condition = self.condition
179
+ if len(test_point[key][time]) > 4 and len(unique_values) >= 1:
180
+ image = torch.tensor(image).transpose(0, 1).unsqueeze(dim=0).float() # 1, 128, 192(64)
181
+ time1 = infos['downbeat_start'] + one_bar_beat * int(test_timing[time])
182
+ time2 = time1 + 4 * one_bar_beat
183
+ chord = self.get_chords(infos['chord_info'], time1, time2)
184
+ title = unicodedata.normalize('NFC', infos['title'])
185
+ label = {
186
+ "title": title,
187
+ "bpm": infos['bpm'],
188
+ "newbpm": infos['new_bpm'],
189
+ "inst": key,
190
+ "time": time1,
191
+ "time2": time2,
192
+ "link": infos['link'],
193
+ "shift": 0,
194
+ "platform": infos['platform'],
195
+ "song_start": infos['downbeat_start'] + one_bar_beat * int(test_timing[0]),
196
+ "song_end": infos['beat_times'][-1],
197
+ "chord": chord,
198
+ "used_time": None,
199
+ "info_link": info_link
200
+ }
201
+ images.append(quantize_image(image))
202
+ labels.append(label)
203
+ points.append(test_point[key][time])
204
+ return images, labels, points
205
+
206
+
207
+ def compare_titles(title1, title2):
208
+ """ํŠน์ˆ˜๋ฌธ์ž์™€ ๊ณต๋ฐฑ์„ ๋ชจ๋‘ ์ œ๊ฑฐํ•˜๊ณ  ์†Œ๋ฌธ์ž๋กœ ๋ณ€ํ™˜ํ•˜์—ฌ ๋น„๊ต"""
209
+ def strip_to_basics(title):
210
+ # Keep only alphanumeric characters, then lowercase
211
+ return ''.join(c.lower() for c in title if c.isalnum())
212
+
213
+ return strip_to_basics(title1) == strip_to_basics(title2)
214
+
215
+
216
+ class TestDataset2(Dataset):
217
+ def __init__(self, library_files, inst=['vocal','melody'],condition=4):
218
+ self.library_files = library_files # the whole list of files has to be passed in here
219
+ self.use_all = True
220
+ self.inst = inst
221
+ self.condition = condition
222
+
223
+
224
+ def __len__(self):
225
+ return len(self.library_files)
226
+ def get_chords(self, chord_info, time1, time2):
227
+ if chord_info is None:
228
+ return ['Unknown', 'Unknown', 'Unknown', 'Unknown']
229
+ # Split the interval between time1 and time2 into four equal parts
230
+ intervals = [(time1 + i * (time2 - time1) / 4, time1 + (i + 1) * (time2 - time1) / 4) for i in range(4)]
231
+
232
+ selected_chords = []
233
+
234
+ for start_interval, end_interval in intervals:
235
+ best_chord = None
236
+ best_duration = 0
237
+
238
+ for chord in chord_info:
239
+ if chord['start'] <= end_interval and chord['end'] >= start_interval:
240
+ duration = get_duration_in_interval(chord, start_interval, end_interval)
241
+ if duration > best_duration:
242
+ best_duration = duration
243
+ best_chord = chord['chord']
244
+
245
+ if best_chord:
246
+ selected_chords.append(best_chord)
247
+ else:
248
+ selected_chords.append('Unknown')
249
+ return selected_chords
250
+ def get_structure(self, segment_label, time1, time2):
251
+ max_overlap = 0
252
+ target_label = None
253
+ for segment in segment_label:
254
+ # Calculate overlap between the segment and the time range
255
+ overlap = min(segment['end'], time2) - max(segment['start'], time1)
256
+
257
+ # If the overlap is negative, it means there is no overlap
258
+ if overlap > 0:
259
+ # Check if this is the maximum overlap found so far
260
+ if overlap > max_overlap:
261
+ max_overlap = overlap
262
+ target_label = segment['label']
263
+
264
+ return target_label
265
+ def __getitem__(self, idx):
266
+ images=[]
267
+ labels=[]
268
+ points=[]
269
+ # Modified to process only one file at a time
270
+ info_link = self.library_files[idx] # only the file corresponding to idx
271
+ with open(info_link, 'rb') as f:
272
+ infos =jsonpickle.decode(f.read())
273
+ test_piano, test_timing, test_point = infos_to_pianorolls(infos, True)
274
+ one_bar_beat = (infos['beat_times'][1] - infos['beat_times'][0]) * infos['rhythm']
275
+ for key in test_piano.keys():
276
+ if key in self.inst:
277
+ for time,image in test_piano[key].items():
278
+ second_values = [item[1] for item in test_point[key][time]]
279
+ unique_values = set(second_values)
280
+ title = unicodedata.normalize('NFC', infos['title'])
281
+ if len(test_point[key][time]) > 4 and len(unique_values) >= 1:
282
+ image = torch.tensor(image).transpose(0, 1).unsqueeze(dim=0).float() # 1, 128, 192(64)
283
+ time1 = infos['downbeat_start'] + one_bar_beat * int(test_timing[time])
284
+ time2 = time1 + 4 * one_bar_beat
285
+ chord = self.get_chords(infos['chord_info'], time1, time2)
286
+ title = unicodedata.normalize('NFC', infos['title'])
287
+ label = {
288
+ "title": title,
289
+ "bpm": infos['bpm'],
290
+ "newbpm": infos['new_bpm'],
291
+ "inst": key,
292
+ "time": time1,
293
+ "time2": time2,
294
+ "shift": 0,
295
+ "platform": 'youtube',
296
+ "song_start": infos['downbeat_start'] + one_bar_beat * int(test_timing[0]),
297
+ "song_end": infos['beat_times'][-1],
298
+ "chord": chord,
299
+ "used_time": None,
300
+ "info_link": info_link
301
+ }
302
+ images.append(quantize_image(image))
303
+ labels.append(label)
304
+ points.append(test_point[key][time])
305
+ return images, labels, points
306
+
307
+
308
+
309
+
310
+
311
+ def calculate_metric_optimized(images1, images2, points1, points2, bpms1, bpms2, device):
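+ # Combines a chroma-overlap ratio, a shift-tolerant onset correlation, a unique-pitch
+ # overlap score weighted by a note-count difficulty term, and a BPM-ratio penalty,
+ # then takes the maximum over the time shifts applied to the query and clamps to [0, 1].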
312
+ images1 = piano_roll_to_chroma(images1)
313
+ images2 = piano_roll_to_chroma(images2)
314
+ min_length1 = min(images1.shape[0], len(points1))
315
+ min_length2 = min(images2.shape[1], len(points2))
316
+ images1 = images1[:min_length1]
317
+ images2 = images2[:min_length2]
318
+ points1 = points1[:min_length1]
319
+ points2 = points2[:min_length2]
320
+ bpms1 = bpms1[:,:min_length1]
321
+ bpms2 = bpms2[:,:min_length2]
322
+
323
+ rhythm_images2 = torch.zeros((images2.shape[1], 64)).to(device)
324
+ if rhythm_images2.shape[0] < len(points2):
325
+ rhythm_images2 = torch.zeros((len(points2), 64)).to(device)
326
+ for j, points in enumerate(points2):
327
+ if j < len(rhythm_images2):
328
+ points_tensor = torch.tensor(points).to(device)
329
+ indices = torch.round(points_tensor[:, 0] / 3.0).long()
330
+ indices = torch.clamp(indices, max=63)
331
+ rhythm_images2[j, indices] = 1
332
+
333
+ # Compute and concatenate shifted images for every shift combination
334
+ shifted_images1_list = []
335
+ shifted_bpms1_list = []
336
+ shift_count = 0
337
+ for pitch_shifts in [0]: # this [0] could be extended with pitch variations or other values
338
+ for time_shifts in [-5,-4,-3,-2,-1 ,0,1,2,3,4,5]:
339
+ shifted_images1_list.append(shift_image_optimized(images1, time_shifts, pitch_shifts))
340
+ shifted_bpms1_list.append(bpms1)
341
+ shift_count+=1
342
+ shifted_images1_batch = torch.cat(shifted_images1_list, dim=0).to(device)
343
+ shifted_bpms1_batch = torch.cat(shifted_bpms1_list, dim=0).to(device)
344
+ # Compute rhythm_images1
345
+ rhythm_images1_batch = torch.zeros((shifted_images1_batch.shape[0], 64)).to(device)
346
+ dtw_images1_batch = torch.zeros_like(rhythm_images1_batch)
347
+
348
+ for i, points in enumerate(points1):
349
+ points_tensor = torch.tensor(points).to(device)
350
+ start_times = torch.round(points_tensor[:, 0] / 3.0).long()
351
+ pitches = points_tensor[:, 1].long()
352
+
353
+ # Clamp times to 64 steps and pitches to 128
354
+ start_times = torch.clamp(start_times, max=63)
355
+ pitches = torch.clamp(pitches, max=127)
356
+
357
+ # Compute the start time of the next note
358
+ end_times = torch.cat([start_times[1:], torch.tensor([64]).to(device)])
359
+ # Fill rhythm_images1_batch (unchanged)
360
+ for k in range(len(shifted_images1_list)):
361
+ rhythm_images1_batch[i + k * len(points1), start_times] = 1
362
+
363
+ # Fill dtw_images1_batch directly
364
+ batch_index = i + k * len(points1)
365
+
366
+ # Hold the pitch value over each note interval
367
+ for j in range(len(start_times)):
368
+ dtw_images1_batch[batch_index, start_times[j]:end_times[j]] = pitches[j].float()
369
+
370
+
371
+ # Initialize dtw_images2_batch
372
+ dtw_images2_batch = torch.zeros_like(rhythm_images2).to(device)
373
+
374
+ for j, points in enumerate(points2):
375
+ if j < len(dtw_images2_batch):
376
+ points_tensor = torch.tensor(points).to(device)
377
+ start_times = torch.round(points_tensor[:, 0] / 3.0).long()
378
+ pitches = points_tensor[:, 1].long()
379
+
380
+ # Clamp times to 64 steps and pitches to 128
381
+ start_times = torch.clamp(start_times, max=63)
382
+ pitches = torch.clamp(pitches, max=127)
383
+
384
+ # Compute the start time of the next note
385
+ end_times = torch.cat([start_times[1:], torch.tensor([64]).to(device)])
386
+
387
+ # Fill dtw_images2_batch
388
+ batch_mask = torch.zeros(dtw_images2_batch.size(1)).to(device)
389
+
390
+ # Hold the pitch value over each note interval
391
+ for i in range(len(start_times)):
392
+ batch_mask[start_times[i]:end_times[i]] = pitches[i].float()
393
+
394
+ dtw_images2_batch[j] = batch_mask
395
+
396
+ min_bpm_optimized = torch.min(shifted_bpms1_batch, bpms2)
397
+ max_bpm_optimized = torch.max(shifted_bpms1_batch, bpms2)
398
+ bpm_ratio_optimized = (min_bpm_optimized / max_bpm_optimized)**0.65
399
+
400
+ max_shift = 8
401
+ correlation = calculate_correlation(rhythm_images1_batch, rhythm_images2, max_shift, device)
402
+
403
+ #dtw = dtw_with_library(dtw_images1_batch, dtw_images2_batch)#batch_sequence_similarity(dtw_images1_batch, dtw_images2_batch) # closer to 1 means higher similarity
404
+
405
+
406
+ unique_pitches_intersection = ((shifted_images1_batch * images2).sum(dim=(3)) > 0).float().sum(dim=2)
407
+ unique_pitches_image2 = (images2.sum(dim=(3)) > 0).float().sum(dim=2)
408
+ unique_pitches_image1 = (shifted_images1_batch.sum(dim=(3)) > 0).float().sum(dim=2)
409
+
410
+ difficulty = 1 / (1 + torch.exp(((unique_pitches_image2 + unique_pitches_image1) - 9) * -0.5))
411
+ pitch_score = 2 * unique_pitches_intersection / (unique_pitches_image2 + unique_pitches_image1)
412
+ final_pitch_score = pitch_score * difficulty
413
+
414
+ total = (shifted_images1_batch + images2).clamp_(0, 1).sum(dim=(2, 3))
415
+ intersection = (shifted_images1_batch * images2).sum(dim=(2, 3))
416
+ ratio = intersection / total
417
+ metrics = (0.5 + 1 * final_pitch_score) * ((ratio) * (1.05) + 0.15 * torch.maximum(correlation, ratio)) * bpm_ratio_optimized # (0.6+1*mse_values) *
418
+ metrics = metrics.clamp_(0, 1)
419
+ metrics_reshaped = metrics.view(shift_count, -1, *metrics.shape[1:])
420
+ max_metric, _ = torch.max(metrics_reshaped, dim=0)
421
+
422
+
423
+ return max_metric
compare_utils.py ADDED
@@ -0,0 +1,324 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ def remove_1(points):
5
+ filtered_points = [point for point in points if point[2] != 1]
6
+ return filtered_points
7
+
8
+
9
+ class CompareHelper:
10
+ def __init__(self, data):
11
+ self.data = data
12
+
13
+ def __lt__(self, other):
14
+ return self.data[0] < other.data[0]
15
+
16
+
17
+ def get_duration_in_interval(chord, start_interval, end_interval):
18
+ """Interval ๋‚ด์—์„œ chord์˜ ์ง€์† ์‹œ๊ฐ„์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค."""
19
+ return min(chord['end'], end_interval) - max(chord['start'], start_interval)
20
+
21
+
22
+ def shift_image_optimized(image, x_shift, y_shift): # x and y really should be swapped here.. they are time, pitch
23
+ # Shift the image in the x and y directions at the same time
24
+ _, _, height, width = image.size()
25
+
26
+ # Shift the image using torch.roll
27
+ shifted_image = torch.roll(image, shifts=(x_shift, y_shift), dims=(3, 2))
28
+
29
+ # Zero out the edge columns wrapped around by the shift
30
+ if x_shift > 0:
31
+ shifted_image[:, :, :, :x_shift] = 0
32
+ elif x_shift < 0:
33
+ shifted_image[:, :, :, x_shift:] = 0
34
+
35
+ #if y_shift > 0:
36
+ # shifted_image[:, :, :y_shift, :] = 0
37
+ #elif y_shift < 0:
38
+ # shifted_image[:, :, y_shift:, :] = 0
39
+ return shifted_image
40
+
41
+
42
+ def algorithmic_collate3(batch):
43
+ imgs, labels, points = zip(*batch)
44
+ return_images = []
45
+ return_labels = []
46
+ return_points = []
47
+
48
+ for img_list in imgs:
49
+ return_images.extend(img_list) # flatten one more level
50
+ for label in labels:
51
+ return_labels.extend(label)
52
+ for point in points:
53
+ return_points.extend(point)
54
+
55
+ return return_images, return_labels, return_points
56
+
57
+ def quantize_image(image):
58
+ """
59
+ Quantize the given image tensor.
60
+
61
+ :param image: torch.Tensor, shape [1, 128, 192], binary values
62
+ :return: torch.Tensor, shape [1, 128, 64], quantized values
63
+ """
64
+
65
+ quantized_image = torch.zeros(1, 128, 64)
66
+
67
+ # Loop through each new pixel position
68
+ for i in range(64):
69
+ # Define the original image slice indexes
70
+
71
+ # For the first slice, consider only first 2 columns
72
+ if i == 0:
73
+ start_idx = 0
74
+ end_idx = start_idx + 2
75
+ # For other slices, consider 3 columns
76
+ else:
77
+ start_idx = i * 3 - 1
78
+ end_idx = start_idx + 3
79
+
80
+ # Check if there's at least one '1' in the window
81
+ quantized_image[:, :, i] = (image[:, :, start_idx:end_idx].sum(dim=2) > 0).float()
82
+
83
+ return quantized_image
84
+
85
+ def piano_roll_to_chroma(piano_roll):
86
+ """
87
+ Convert a binary piano roll tensor to a binary chroma tensor.
88
+
89
+ Parameters:
90
+ piano_roll (torch.Tensor): The binary piano roll tensor with shape
91
+ (batch_size, num_channels, num_pitches, num_frames).
92
+
93
+ Returns:
94
+ torch.Tensor: The binary chroma tensor with shape
95
+ (batch_size, num_channels, 12, num_frames).
96
+ """
97
+ if piano_roll.shape[2] == 12:
98
+ return piano_roll
99
+
100
+ # Ensure the piano roll is binary
101
+ binary_piano_roll = (piano_roll > 0).float()
102
+
103
+ # Initialize chroma tensor
104
+ chroma = torch.zeros(
105
+ (binary_piano_roll.shape[0], binary_piano_roll.shape[1], 12, binary_piano_roll.shape[3]),
106
+ device=binary_piano_roll.device,
107
+ )
108
+
109
+ # Sum along the pitch classes modulo 12 (pitches)
110
+ for i in range(12):
111
+ chroma[:, :, i, :] = binary_piano_roll[:, :, i::12, :].max(dim=2).values
112
+
113
+ return chroma
114
+
115
+ def calculate_correlation(tensor1, tensor2, max_shift,device):
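+ # Maximum cosine similarity between every pair of onset vectors, taken over time shifts
+ # of tensor2 in the range [-max_shift, max_shift].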
116
+ #tensor1 = apply_gaussian_filter_1d_to_batch(tensor1,1.5)
117
+ # Initialize the max-correlation matrix with a very low value
118
+ max_correlation = torch.full((tensor1.size(0), tensor2.size(0)), float('-inf')).to(device)
119
+
120
+ for shift in range(-max_shift, max_shift + 1):
121
+
122
+ # Shift tensor2
123
+ shifted_tensor2 = torch.roll(tensor2, shifts=shift, dims=1)
124
+ #shifted_tensor2 = apply_gaussian_filter_1d_to_batch(torch.roll(tensor2, shifts=shift, dims=1),1.5)
125
+
126
+ # Compute cosine similarity
127
+ tensor1_norm = tensor1 / tensor1.norm(dim=1, keepdim=True)
128
+ tensor2_norm = shifted_tensor2 / tensor2.norm(dim=1, keepdim=True)
129
+
130
+
131
+ cosine_similarity = torch.mm(tensor1_norm, tensor2_norm.t())
132
+ max_correlation = torch.max(max_correlation, cosine_similarity)
133
+ """
134
+
135
+ # Should this be called an L1 cosine similarity..? Anyway, a simple note-overlap similarity
136
+ tensor1_expanded = tensor1.unsqueeze(1)
137
+ tensor2_expanded = shifted_tensor2.unsqueeze(0)
138
+ both_one = tensor1_expanded * tensor2_expanded
139
+
140
+ # Count the positions that are 1 in both vectors and the total number of 1s
141
+ both_one_sum = both_one.sum(dim=2)
142
+ total_one_sum = tensor1_expanded.sum(dim=2) + tensor2_expanded.sum(dim=2)
143
+ metric_matrix = both_one_sum / total_one_sum
144
+ max_correlation = torch.max(max_correlation, metric_matrix)
145
+ """
146
+
147
+ return max_correlation
148
+
149
+
150
+
151
+
152
+ def infos_to_pianorolls(info, use_all):
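+ # Turn the per-bar vocal note info into 4-bar piano-roll windows (192 time steps x 128
+ # pitches) plus per-window note point lists, keyed by window index.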
153
+ pianorolls={}
154
+ #chromas={} # chroma deprecated
155
+ CONLON_points={}
156
+
157
+ # melody_pianorolls={}
158
+ # bass_pianorolls={}
159
+ vocal_pianorolls={}
160
+ # boundary_pianorolls={}
161
+
162
+ #melody_chromas={}
163
+ #bass_chromas={}
164
+ #vocal_chromas={}
165
+
166
+ # melody_CONLON_points={}
167
+ # bass_CONLON_points={}
168
+ vocal_CONLON_points={}
169
+ # boundary_CONLON_points={}
170
+
171
+ start_points = infos_to_startpoint(info, use_all)
172
+
173
+ #shift_val = np.argmax(chart_fit)
174
+ shift_val = 0
175
+ for idx, i in enumerate(start_points):
176
+ # Clean up the bass a little. Heuristic
177
+ """
178
+ cleansed_bass={}
179
+ for key, bar in info.bass_info.items():
180
+ if len(bar)>0:
181
+ bar=np.array(bar)
182
+ remain_notes=[]
183
+ to_quantize = 16 # keep at most one note per 16th note
184
+ idx_quantize = 48/to_quantize
185
+ for j in range(to_quantize):
186
+ bass_idx = np.where((bar[:,4]//idx_quantize == j))
187
+ notes = bar[bass_idx]
188
+ best_note = get_best_bass(chart_info, notes)
189
+ if best_note is not None:
190
+ remain_notes.append(best_note)
191
+ cleansed_bass[key] = np.array(remain_notes)
192
+ """
193
+ # cleansed_bass = info['bass_info']
194
+ # melody = [
195
+ # info['melody_info'].get(str(i), []) if info['melody_info'] is not None else [],
196
+ # info['melody_info'].get(str(i+1), []) if info['melody_info'] is not None else [],
197
+ # info['melody_info'].get(str(i+2), []) if info['melody_info'] is not None else [],
198
+ # info['melody_info'].get(str(i+3), []) if info['melody_info'] is not None else []
199
+ # ]
200
+
201
+ # bass = [
202
+ # info['bass_info'].get(str(i), []) if info['bass_info'] is not None else [],
203
+ # info['bass_info'].get(str(i+1), []) if info['bass_info'] is not None else [],
204
+ # info['bass_info'].get(str(i+2), []) if info['bass_info'] is not None else [],
205
+ # info['bass_info'].get(str(i+3), []) if info['bass_info'] is not None else []
206
+ # ]
207
+
208
+ vocal = [
209
+ info['vocal_info'].get(str(i), []) if info['vocal_info'] is not None else [],
210
+ info['vocal_info'].get(str(i+1), []) if info['vocal_info'] is not None else [],
211
+ info['vocal_info'].get(str(i+2), []) if info['vocal_info'] is not None else [],
212
+ info['vocal_info'].get(str(i+3), []) if info['vocal_info'] is not None else []
213
+ ]
214
+
215
+ # boundary = [
216
+ # info['boundaries'].get(str(i), []) if info['boundaries'] is not None else [],
217
+ # info['boundaries'].get(str(i+1), []) if info['boundaries'] is not None else [],
218
+ # info['boundaries'].get(str(i+2), []) if info['boundaries'] is not None else [],
219
+ # info['boundaries'].get(str(i+3), []) if info['boundaries'] is not None else []
220
+ # ]
221
+ #piano = [info.piano_info.get(str(i),[]),info.piano_info.get(str(i+1),[]),info.piano_info.get(str(i+2), []),info.piano_info.get(str(i+3),[])]
222
+
223
+ # melody_pianoroll, melody_CONLON_point = bar_notes_to_pianoroll(melody, shift_val)
224
+ # bass_pianoroll, bass_CONLON_point = bar_notes_to_pianoroll(bass, shift_val)
225
+ vocal_pianoroll,vocal_CONLON_point = bar_notes_to_pianoroll(vocal, shift_val)
226
+ # boundary_pianoroll, boundary_CONLON_point = bar_notes_to_pianoroll(boundary, shift_val)
227
+ #piano_pianoroll, piano_chroma, piano_CONLON_point = bar_notes_to_pianoroll(piano, shift_val)
228
+
229
+ # melody_pianorolls[idx]=melody_pianoroll
230
+ # bass_pianorolls[idx] = bass_pianoroll
231
+ vocal_pianorolls[idx] = vocal_pianoroll
232
+ # boundary_pianorolls[idx]= boundary_pianoroll
233
+ #piano_pianorolls[idx] = piano_pianoroll
234
+
235
+ #melody_chromas[idx]=melody_chroma
236
+ #bass_chromas[idx] = bass_chroma
237
+ #vocal_chromas[idx] = vocal_chroma
238
+ #piano_chromas[idx] = piano_chroma
239
+
240
+ # melody_CONLON_points[idx] = melody_CONLON_point
241
+ # bass_CONLON_points[idx] = bass_CONLON_point
242
+ vocal_CONLON_points[idx] = vocal_CONLON_point
243
+ # boundary_CONLON_points[idx] = boundary_CONLON_point
244
+ #piano_CONLON_points[idx] = piano_CONLON_point
245
+
246
+
247
+ # pianorolls['melody'] = melody_pianorolls
248
+ # pianorolls['bass'] = bass_pianorolls
249
+ pianorolls['vocal'] = vocal_pianorolls
250
+ # pianorolls['boundary'] = boundary_pianorolls
251
+ #pianorolls['piano'] = piano_pianorolls
252
+
253
+ #chromas['melody'] = melody_chromas
254
+ #chromas['bass'] = bass_chromas
255
+ #chromas['vocal'] = vocal_chromas
256
+ #chromas['piano'] = piano_chromas
257
+
258
+ # CONLON_points['melody'] = melody_CONLON_points
259
+ # CONLON_points['bass'] = bass_CONLON_points
260
+ CONLON_points['vocal'] = vocal_CONLON_points
261
+ # CONLON_points['boundary'] = boundary_CONLON_points
262
+ #CONLON_points['piano'] = piano_CONLON_points
263
+
264
+
265
+ return pianorolls, start_points, CONLON_points # chroma deprecated
266
+
267
+
268
+
269
+ def bar_notes_to_pianoroll(bars,shift_val):
270
+ pianoroll = np.zeros((192,128)) #
271
+ conlon_points = []
272
+ for j, bar in enumerate(bars):
273
+ j_offset = j * 48 # store the repeated computation in a variable
274
+ for note in bar:
275
+ start, pitch, end = int(note[4]), int(note[2]), int(note[5])
276
+ duration = (end - start + 1)
277
+ start_idx = start + j_offset # optimized index computation
278
+ end_idx = end + j_offset + 1
279
+ conlon_points.append([start_idx, pitch, duration])
280
+ pianoroll[start_idx:end_idx, pitch] = 1 # efficient assignment using slicing
281
+ return pianoroll, conlon_points
282
+
283
+ def infos_to_startpoint(info,use_all):
284
+ downbeat_start = info['downbeat_start']
285
+
286
+
287
+ boundary = round((info['beat_times'][-1] -downbeat_start)/(4*(info['beat_times'][1]-info['beat_times'][0])))-1
288
+
289
+ song_structure_sp = [i for i in range(boundary+1)]
290
+ song_structure_sp = refine_breakpoints_custom(song_structure_sp)
291
+ if use_all:
292
+ song_structure_sp = [i for i in range(song_structure_sp[-1])]
293
+ return song_structure_sp
294
+
295
+ def refine_breakpoints_custom(breakpoints, interval=4):
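+ # Deduplicate the positive breakpoints and insert intermediate start points every
+ # `interval` bars before the first breakpoint and inside large gaps between breakpoints.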
296
+ refined = []
297
+
298
+ unique_breakpoints = []
299
+ for point in breakpoints:
300
+ if point not in unique_breakpoints and point>0: # excluding 0 makes the start a bit ambiguous; e.g. if the verse starts at 6, it is the difference between looking at 0~4 and 2~6
301
+ unique_breakpoints.append(point)
302
+
303
+ # Determine the starting point
304
+ if len(unique_breakpoints)==0:
305
+ unique_breakpoints.append(0)
306
+ starting_point = unique_breakpoints[0] % interval
307
+ if starting_point != unique_breakpoints[0]:
308
+ for point in range(starting_point, unique_breakpoints[0], interval):
309
+ if point > -1: # Ensure the point is positive
310
+ refined.append(point)
311
+
312
+ for i in range(len(unique_breakpoints)):
313
+ # Add the current breakpoint
314
+ refined.append(unique_breakpoints[i])
315
+
316
+ # Check if there is a next breakpoint
317
+ if i + 1 < len(unique_breakpoints):
318
+ next_point = unique_breakpoints[i]
319
+ while next_point + 2*interval <= unique_breakpoints[i + 1]:
320
+ next_point += interval
321
+ refined.append(next_point)
322
+ if len(refined)==0:
323
+ refined = [0]
324
+ return refined
music_info.py ADDED
@@ -0,0 +1,33 @@
1
+
2
+ class Music_info:
3
+ def __init__(self,melody_info=None, bass_info=None, drum_info=None, chord_info=None, vocal_info=None, piano_info=None, chart_scale=None,
4
+ title="default_title", bpm=None, rhythm = None, downbeat_start=None, beat_times=None, boundaries = None,
5
+ segment_label= None, link=None,platform=None, newbpm=None, key=None, structure_starting_point=None, structure_json=None, preview_music_path=None):
6
+
7
+ self.melody_info = melody_info
8
+ self.bass_info = bass_info
9
+ self.drum_info = drum_info
10
+ self.chord_info = chord_info
11
+ self.vocal_info = vocal_info
12
+ self.piano_info = piano_info # None for now
13
+ self.chart_scale = chart_scale
14
+ self.title = title
15
+ self.bpm = bpm
16
+ self.rhythm = rhythm
17
+ self.downbeat_start = downbeat_start
18
+ self.beat_times = beat_times
19
+ self.boundaries = boundaries # toplines. idk why I used w
20
+ self.segment_label = segment_label
21
+ self.link = link
22
+ self.preview_music_path = preview_music_path
23
+ self.platform = platform
24
+ self.new_bpm = newbpm
25
+ self.key = key
26
+ self.structure_starting_point = structure_starting_point
27
+ self.structure_json = structure_json # this is the genuinely hard part: lyric, chord, and song-structure info has to be stored together with index keys
28
+
29
+
30
+
31
+
32
+ def __str__(self):
33
+ return str(self.__class__) + ": " + str(self.__dict__)
runtime.txt ADDED
@@ -0,0 +1 @@
1
+ python-3.8.18
segment_transcription.py ADDED
@@ -0,0 +1,106 @@
1
+ import os
2
+ import numpy as np
3
+ import librosa
4
+ import soundfile
5
+ import demucs.separate
6
+ from wav_quantizer import wav_quantizing
7
+ from ml_models.AST.do_everything import vocal_trans
8
+ from music_info import Music_info
9
+ from ml_models.DilatedTransformer import Demixed_DilatedTransformerModel
10
+ from madmom.features.beats import DBNBeatTrackingProcessor
11
+ import shutil
12
+ from madmom.features.downbeats import DBNDownBeatTrackingProcessor
13
+ from utils import vocal_midi2note, quantize, chord_quantize, save_to_json
14
+
15
+ downbeat_model = Demixed_DilatedTransformerModel(attn_len=5, instr=5, ntoken=2,
16
+ dmodel=256, nhead=8, d_hid=1024,
17
+ nlayers=9, norm_first=True)
18
+ beat_tracker = DBNBeatTrackingProcessor(min_bpm=55.0, max_bpm=215.0, fps=44100/1024,
19
+ transition_lambda=100, observation_lambda=6,
20
+ num_tempi=None, threshold=0.2)
21
+ downbeat_tracker = DBNDownBeatTrackingProcessor(beats_per_bar=[3, 4],
22
+ min_bpm=55.0, max_bpm=215.0, fps=44100/1024,
23
+ transition_lambda=100, observation_lambda=6,
24
+ num_tempi=None, threshold=0.2)
25
+
26
+ device = 'cuda'
27
+
28
+ def segment_transcription(audio_path):
29
+ # Keep it simple: just Demucs separation, bpm quantization, vocal transcription, and chord transcription!
30
+ # ...Maybe not simple
31
+ # we use chord transcription from omnizart, which requires a Python 3.8 environment
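+ # Pipeline: htdemucs_6s splits off the piano stem, htdemucs splits the remainder into
+ # vocals/drums/bass/other, Beat-Transformer + madmom estimate beats and downbeats,
+ # the vocal stem is transcribed to notes, and everything is saved next to the audio as JSON.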
32
+
33
+ wav_path = audio_path
34
+ wav_name = os.path.splitext(os.path.basename(wav_path))[0]
35
+
36
+ demucs.separate.main(["--two-stems", "piano", "-n", "htdemucs_6s", wav_path])
37
+ piano_wav_name = "separated/htdemucs_6s/" + wav_name + "/piano.wav"
38
+ others_name = "separated/htdemucs_6s/" + wav_name + "/no_piano.wav"
39
+ to_name = "separated/htdemucs_6s/" + wav_name + "/" + wav_name + ".wav"
40
+ os.rename(others_name, to_name)
41
+
42
+ demucs.separate.main(["-n", "htdemucs", to_name])
43
+
44
+ vocal_wav_name = "separated/htdemucs/" + wav_name + "/vocals.wav"
45
+ drum_wav_name = "separated/htdemucs/" + wav_name + "/drums.wav"
46
+ other_wav_name = "separated/htdemucs/" + wav_name + "/other.wav"
47
+ bass_wav_name = "separated/htdemucs/" + wav_name + "/bass.wav"
48
+
49
+ vocal_wav_path = os.path.abspath("separated/htdemucs/" + wav_name + "/vocals.wav")
50
+ drum_wav_path = os.path.abspath("separated/htdemucs/" + wav_name + "/drums.wav")
51
+ other_wav_path = os.path.abspath("separated/htdemucs/" + wav_name + "/other.wav")
52
+ bass_wav_path = os.path.abspath("separated/htdemucs/" + wav_name + "/bass.wav")
53
+ abs_wav_path = os.path.abspath(wav_path)
54
+ abs_file_path = os.path.abspath(wav_path)
55
+
56
+ vocals = librosa.load(vocal_wav_name, sr=44100, mono=False)[0]
57
+ piano = librosa.load(piano_wav_name, sr=44100, mono=False)[0]
58
+ drums = librosa.load(drum_wav_name, sr=44100, mono=False)[0]
59
+ bass = librosa.load(bass_wav_name, sr=44100, mono=False)[0]
60
+ other = librosa.load(other_wav_name, sr=44100, mono=False)[0]
61
+
62
+ spleeter_dict = {
63
+ 'vocals': np.asarray(vocals).T,
64
+ 'piano': np.asarray(piano).T,
65
+ 'drums': np.asarray(drums).T,
66
+ 'bass': np.asarray(bass).T,
67
+ 'other': np.asarray(other).T
68
+ }
69
+
70
+ real_others = librosa.load(piano_wav_name, sr=44100, mono=False)[0] + librosa.load(other_wav_name, sr=44100, mono=False)[0]
71
+ soundfile.write(other_wav_name, real_others.T, 44100)
72
+
73
+ quantize_result = wav_quantizing(wav_path, spleeter_dict, downbeat_model, beat_tracker, downbeat_tracker, device)
74
+ vocal_notes = vocal_midi2note(vocal_trans(vocal_wav_path, device=device))
75
+ #chord_info = transcript("chord", wav_path)[1]
76
+ sav_path = wav_path[:-4] + ".json"
77
+
78
+ beat_times, downbeat_start, rhythm, bpm = quantize_result[0]
79
+ chord_time_gap = (beat_times[1] - beat_times[0]) * rhythm
80
+ vocal_infos = quantize(vocal_notes, beat_times, downbeat_start, chord_time_gap)
81
+ # chord_infos = chord_quantize(chord_info, beat_times)
82
+ wav_music_info = Music_info(
83
+ melody_info=None,
84
+ bass_info=None,
85
+ chord_info=None,
86
+ vocal_info=vocal_infos,
87
+ chart_scale=None,
88
+ title=str(wav_name),
89
+ bpm=int(bpm),
90
+ rhythm=int(rhythm),
91
+ downbeat_start=float(downbeat_start),
92
+ beat_times=beat_times,
93
+ boundaries=None,
94
+ segment_label=None,
95
+ link=None,
96
+ )
97
+
98
+ os.makedirs(os.path.dirname(sav_path), exist_ok=True)
99
+ save_to_json(wav_music_info, sav_path)
100
+ if os.path.exists("separated"):
101
+ shutil.rmtree("separated")
102
+
103
+ return sav_path
104
+
105
+
106
+
test.py ADDED
@@ -0,0 +1,6 @@
1
+ from inference import inference
2
+
3
+
4
+ if __name__ == "__main__":
5
+ result = inference("/home/ubuntu/data/coding/icassp-plagiarism-demo/KEON ＜3 - I GASLIGHT MYSELF ｜ Udio [The%20Untitled].mp3")
6
+ print(result)
utils.py ADDED
@@ -0,0 +1,99 @@
1
+ import pretty_midi
2
+ import jsonpickle
3
+ def vocal_midi2note(midi):
4
+ """
5
+ """
6
+
7
+ notes=[]
8
+ for note in midi:
9
+ pretty_note =pretty_midi.Note(velocity=100, start=note[0], end=note[1], pitch=note[2])
10
+ notes.append(pretty_note)
11
+ return notes
12
+
13
+
14
+ def quantize(notes, beat_times, downbeat_start, chord_time_gap):
15
+ """
16
+ ์–ด๋–ค Note๊ฐ€ ๋ช‡๋ฒˆ์งธ Bar์˜ ๋ช‡๋ฒˆ์งธ timing๋ถ€ํ„ฐ ๋ช‡๋ฒˆ์งธ timing๊นŒ์ง€ ๋‚˜ํƒ€๋‚˜๋Š”์ง€๋ฅผ returnํ•ด์„œ ์ค€๋‹ค.
17
+
18
+ Pianoroll์˜ Index๋ฅผ ๋„˜๊ฒจ์ค€๋‹ค? ๋ผ๊ณ  ์ƒ๊ฐํ•˜๋ฉด ์ ๋‹นํžˆ ๋งž๋‹ค.
19
+
20
+ ex) 1๋งˆ๋””๊ฐ€ 1์ดˆ์ธ ๊ณก์—์„œ ์—ฐ์ฃผ ์‹œ๊ฐ„์ด 4.25~4.75์ธ ์Œ์ด ์žˆ๊ณ , 1๋งˆ๋””๋ฅผ 48๋ถ„ ์Œํ‘œ๊นŒ์ง€ ๊ณ ๋ คํ•œ๋‹ค๋ฉด
21
+ 5๋ฒˆ์งธ ๋งˆ๋””์— 12~35๊นŒ์ง€ ์—ฐ์ฃผํ•จ.. ์ด๋ผ๋Š” ์ •๋ณด๋ฅผ ๊ฑด๋„ค์คŒ
22
+
23
+ """
24
+ first_beat = downbeat_start
25
+ one_beat_time = beat_times[1]-beat_times[0] # just one beat
26
+ quantize_48th_time = one_beat_time/12
27
+ beat_num = chord_time_gap//one_beat_time * 12 # 48 for a 4/4 song, 36 for 3/4; if this ever came out as 24 the visualization would break
28
+ max_idx=0
29
+ for note in notes:
30
+ start_idx = round((note.start-downbeat_start)/quantize_48th_time)
31
+ end_idx = round((note.end-downbeat_start)/quantize_48th_time)
32
+ if max_idx <int(start_idx // beat_num):
33
+ max_idx = int(start_idx// beat_num)
34
+
35
+ note_info={str(key) : [] for key in range(max_idx)}
36
+
37
+ for note in notes:
38
+ if note.start>downbeat_start: # some notes at the very beginning may be dropped
39
+ start_idx = round((note.start-downbeat_start)/quantize_48th_time)
40
+ end_idx = round((note.end-downbeat_start)/quantize_48th_time)
41
+ if end_idx == start_idx:
42
+ end_idx+=1
43
+
44
+ note_start = start_idx * quantize_48th_time + first_beat
45
+ note_end = end_idx * quantize_48th_time + first_beat
46
+ note_pitch = note.pitch
47
+ note_velocity = note.velocity
48
+
49
+ bar_idx = int(start_idx // beat_num)
50
+ bar_pos = start_idx % beat_num
51
+ bar_pos_end = end_idx % beat_num # because of this a note cannot run past the end of its bar *** e.g. with beat_num 48, a note spanning 35~67 first becomes 35~19 and then 35~47 via the if below
52
+ if bar_pos_end<bar_pos and int(end_idx//beat_num) > bar_idx:
53
+ bar_pos_end = (int(end_idx//beat_num) - bar_idx) * beat_num # implemented now; this will definitely raise an index error at some point
54
+
55
+ if bar_pos_end<bar_pos:
56
+ bar_pos_end = beat_num-1
57
+
58
+ note = [float(note_start), float(note_end), int(note_pitch), int(note_velocity), int(bar_pos), int(bar_pos_end)]
59
+ #note = {'start':note_start, 'end':note_end, 'pitch':note_pitch, 'velocity':note_velocity, 'start_idx':bar_pos, 'end_idx':bar_pos_end}
60
+ if str(bar_idx) not in note_info:
61
+ note_info[str(bar_idx)]=[note]
62
+ else:
63
+ note_info[str(bar_idx)].append(note)
64
+
65
+ return note_info
66
+
67
+
68
+
69
+
70
+
71
+ def chord_quantize(chord_info, beat_times):
72
+ """
73
+ Returns the quantized chord info, the first chord starting point, and the chord time
+ (which depends on whether the song is in 3 or 4; chord changes can occur several times
+ within a bar, but overall we assume one change at the very start of each bar).
+ "first chord" means the start of the first downbeat, though this probably needs fixing.
75
+ """
76
+ first_beat = beat_times[0]
77
+ one_beat_time = beat_times[1]-beat_times[0]
78
+ q_chord_info = []
79
+
80
+ for chord in chord_info:
81
+ chord_dict={}
82
+ chord_dict['chord'] = chord['chord']
83
+ chord_dict['start'] = float(round((chord['start']-first_beat)/one_beat_time) * one_beat_time + first_beat) # e.g. with beats at 0.2, 0.6, 1.0, 1.4, ... a chord timing of 1.9 snaps to 1.8
84
+ end_time = round((chord['end']-first_beat)/one_beat_time) * one_beat_time + first_beat
85
+ if end_time==chord_dict['start']:
86
+ end_time += one_beat_time
87
+ chord_dict['end'] = float(end_time)
88
+ q_chord_info.append(chord_dict)
89
+
90
+ return q_chord_info
91
+
92
+
93
+ def save_to_json(data, filename):
94
+ """๋ฐ์ดํ„ฐ๋ฅผ JSON ํŒŒ์ผ๋กœ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค."""
95
+ with open(filename, 'w', encoding='utf-8') as file:
96
+ # Convert to JSON
97
+ json_data = jsonpickle.encode(data, unpicklable=False)
98
+ # Write to file
99
+ file.write(json_data)
wav_quantizer.py ADDED
@@ -0,0 +1,162 @@
1
+ import librosa
2
+ import numpy as np
3
+ import torch
4
+ import scipy.stats as st
5
+ from librosa.core import istft, stft
6
+ from scipy.signal.windows import hann
7
+
8
+ def wav_quantizing(wav_file, ori, downbeat_model, beat_tracker, downbeat_tracker, device, bpm=None):
9
+ """
10
+
11
+ Get beat timing of given wav_file. This module assumes wav has integer bpm.
12
+
13
+ input : path of wav_file
14
+ output : list of (beat times in seconds, downbeat start time, rhythm, integer bpm) tuples.
15
+ """
16
+ y,sr = librosa.load(wav_file, sr=44100)
17
+ mel_f = librosa.filters.mel(sr=44100, n_fft=4096, n_mels=128, fmin=30, fmax=11000).T
18
+ x = np.stack([np.dot(np.abs(np.mean(_stft(ori[key]), axis=-1))**2, mel_f) for key in ori])
19
+
20
+ #Initialize Beat Transformer to estimate (down-)beat activation from demixed input
21
+ model = downbeat_model
22
+ model.eval()
23
+ PARAM_PATH = {
24
+ 4: "ml_models/Beat-Transformer/checkpoint/fold_4_trf_param.pt", # ์›๋ž˜ ๋‹ค๋ฅธ ์ˆ˜๋„ ์žˆ์—ˆ๋Š”๋ฐ, ์šฉ๋Ÿ‰ ์ตœ์ ํ™”๋ฅผ ์œ„ํ•ด ์ง€์›€.
25
+ }
26
+ x = np.transpose(x, (0, 2, 1))
27
+ x = np.stack([librosa.power_to_db(x[i], ref=np.max) for i in range(len(x))])
28
+ x = np.transpose(x, (0, 2, 1))
29
+ FOLD = 4
30
+ model.load_state_dict(torch.load(PARAM_PATH[FOLD], map_location=torch.device('cuda'))['state_dict'])
31
+ model.to(device)
32
+ model.eval()
33
+
34
+ model_input = torch.from_numpy(x).unsqueeze(0).float().to(device)
35
+ activation, _ = model(model_input)
36
+
37
+ beat_activation = torch.sigmoid(activation[0, :, 0]).detach().cpu().numpy()
38
+ downbeat_activation = torch.sigmoid(activation[0, :, 1]).detach().cpu().numpy()
39
+ dbn_beat_pred = beat_tracker(beat_activation)
40
+
41
+ combined_act = np.concatenate((np.maximum(beat_activation - downbeat_activation,
42
+ np.zeros(beat_activation.shape)
43
+ )[:, np.newaxis],
44
+ downbeat_activation[:, np.newaxis]
45
+ ), axis=-1) #(T, 2)
46
+ dbn_downbeat_pred = downbeat_tracker(combined_act)
47
+ dbn_downbeat_pred = dbn_downbeat_pred[dbn_downbeat_pred[:, 1]==1][:, 0]
48
+
49
+ beat_times_ori = dbn_beat_pred
50
+ m_res = st.linregress(np.arange(len(beat_times_ori)),beat_times_ori)
51
+ if bpm:
52
+ bpms=[]
53
+ if bpm>100:
54
+ bpms = [bpm, bpm/2]
55
+ bpm_ratios = [1,1/2]
56
+ else:
57
+ bpms = [bpm, bpm*2]
58
+ bpm_ratios = [1,2]
59
+ else:
60
+ bpm = 60/m_res.slope
61
+
62
+ # bpms=[]
63
+ # if bpm>100:
64
+ # bpms = [round(bpm), round(bpm/2)]
65
+ # bpm_ratios = [1,1/2]
66
+ # else:
67
+ # bpms = [round(bpm), round(bpm*2)]
68
+ # bpm_ratios = [1,2]
69
+ bpms = [round(bpm)]
70
+ bpm_ratios = [1]
71
+ results=[]
72
+ for i, int_bpm in enumerate(bpms):
73
+ bpm_ratio = bpm_ratios[i]
74
+ interpolated_beat_times = interpolate_beat_times(bpm_ratio, int_bpm, beat_times_ori)
75
+ if i==0:
76
+ time_shifted = beat_times_ori-interpolated_beat_times[0::bpm_ratio]
77
+ mode_timing = st.mode(np.around(time_shifted,2)) # this mechanism reuses the value computed at the integer bpm as-is
78
+ beat_times = interpolated_beat_times +mode_timing.mode
79
+
80
+ while beat_times[0]>60/int_bpm:
81
+ beat_times=beat_times - 60/int_bpm
82
+ if beat_times[0]<0:
83
+ beat_times=beat_times + 60/int_bpm
84
+
85
+ while len(y)/44100<beat_times[-1]: # if the beat_time has larger value than full song's length due to shift or something
86
+ beat_times = beat_times[:-1]
87
+ beat_times = beat_times[:-1] #
88
+
89
+ time_gap = dbn_downbeat_pred[1:]-dbn_downbeat_pred[:-1]
90
+ time_gap = np.round(time_gap/(beat_times[1]-beat_times[0]))
91
+ if len(time_gap)==0:
92
+ rhythm = 4
93
+ else:
94
+ rhythm = int(st.mode(time_gap).mode)
95
+ if rhythm % 3 ==0:
96
+ rhythm = 3
97
+ else:
98
+ rhythm = 4
99
+ downbeat_time = np.remainder(dbn_downbeat_pred, (beat_times[1]-beat_times[0])*rhythm)
100
+ start_downbeat_time = (downbeat_time - beat_times[0]) / (beat_times[1]-beat_times[0])
101
+ start_downbeat_time = st.mode(np.round(start_downbeat_time)).mode
102
+ start_downbeat_time = find_nearest(beat_times, beat_times[0] + start_downbeat_time * (beat_times[1]-beat_times[0]))
103
+
104
+ results.append((beat_times.tolist(), start_downbeat_time , rhythm, int_bpm))
105
+ return results
106
+
107
+ def interpolate_beat_times(bpm_ratio, int_bpm, beat_times):
108
+ beat_steps_8th = np.linspace(0, int(beat_times.size*bpm_ratio)-1, int(beat_times.size*bpm_ratio)) * (60 / int_bpm)
109
+ return beat_steps_8th
110
+
111
+ def find_nearest(array, value):
112
+ array = np.asarray(array)
113
+ idx = (np.abs(array - value)).argmin()
114
+ return array[idx]
115
+
116
+
117
+
118
+
119
+ def _stft(data: np.ndarray, inverse: bool = False, length = None ):
120
+ """
121
+ Single entrypoint for both stft and istft. This computes stft and
122
+ istft with librosa on stereo data. The two channels are processed
123
+ separately and are concatenated together in the result. The
124
+ expected input formats are: (n_samples, 2) for stft and (T, F, 2)
125
+ for istft.
126
+
127
+ Parameters:
128
+ data (numpy.array):
129
+ Array with either the waveform or the complex spectrogram
130
+ depending on the parameter inverse
131
+ inverse (bool):
132
+ (Optional) Should a stft or an istft be computed.
133
+ length (Optional[int]):
134
+
135
+ Returns:
136
+ numpy.ndarray:
137
+ Stereo data as numpy array for the transform. The channels
138
+ are stored in the last dimension.
139
+ """
140
+ assert not (inverse and length is None)
141
+ data = np.asfortranarray(data)
142
+ N = 4096
143
+ H = 1024
144
+ win = hann(N, sym=False)
145
+ fstft = istft if inverse else stft
146
+ win_len_arg = {"win_length": None, "length": None} if inverse else {"n_fft": N}
147
+ n_channels = data.shape[-1]
148
+ out = []
149
+ for c in range(n_channels):
150
+ d = (
151
+ np.concatenate((np.zeros((N,)), data[:, c], np.zeros((N,))))
152
+ if not inverse
153
+ else data[:, :, c].T
154
+ )
155
+ s = fstft(d, hop_length=H, window=win, center=False, **win_len_arg)
156
+ if inverse:
157
+ s = s[N : N + length]
158
+ s = np.expand_dims(s.T, 2 - inverse)
159
+ out.append(s)
160
+ if len(out) == 1:
161
+ return out[0]
162
+ return np.concatenate(out, axis=2 - inverse)