| import numpy as np
|
| import librosa
|
| import torch
|
| import matplotlib.pyplot as plt
|
|
|
| from metrics.event_based_metrics import event_metrics
|
| from src.audio_preprocessing import readLabels, object_padding, fbank_features_extraction
|
| from tslearn.clustering import TimeSeriesKMeans, KShape
|
| from tslearn.metrics import dtw
|
| from sklearn.cluster import KMeans, AffinityPropagation, AgglomerativeClustering, MeanShift, estimate_bandwidth, DBSCAN, OPTICS, Birch
|
| from sklearn.metrics import accuracy_score, f1_score
|
|
|
| import os
|
| os.environ["OMP_NUM_THREADS"] = '3'
|
|
|
def standardize_array(array):
    """Z-score normalize each column of `array`.

    Subtracts the per-column mean and divides by the per-column standard
    deviation. Columns with zero standard deviation are divided by 1
    instead, so constant columns come out as all zeros.
    """
    col_mean = np.mean(array, axis=0)
    col_std = np.std(array, axis=0)
    # Guard against division by zero on constant columns.
    safe_std = np.where(col_std == 0, 1, col_std)
    return (array - col_mean) / safe_std
|
|
|
|
|
def kmeans_clustering(audio_data, n_clusters=2):
    """Cluster `audio_data` with k-means and return the per-sample labels."""
    # fit_predict on the training data is equivalent to fit() followed by
    # predict() on the same data.
    model = KMeans(n_clusters=n_clusters, random_state=42)
    return model.fit_predict(audio_data)
|
|
|
def dtw_kmedoids_clustering(audio_data, n_clusters=2):
    """Cluster time series with DTW-metric k-means and return labels.

    NOTE(review): despite the name, this uses TimeSeriesKMeans (centroids),
    not k-medoids — confirm the intended algorithm.
    """
    model = TimeSeriesKMeans(
        n_clusters=n_clusters,
        metric="dtw",
        max_iter=10,
        random_state=42,
    )
    return model.fit_predict(audio_data)
|
|
|
def kshape_clustering(audio_data, n_clusters=2):
    """Cluster time series with the k-Shape algorithm and return labels."""
    model = KShape(n_clusters=n_clusters, max_iter=10, random_state=42)
    return model.fit_predict(audio_data)
|
|
|
def affinity_propagation_clustering(audio_data):
    """Cluster `audio_data` with Affinity Propagation; the number of
    clusters is determined by the algorithm itself."""
    model = AffinityPropagation(random_state=42)
    return model.fit_predict(audio_data)
|
|
|
def agglomerative_clustering(audio_data, n_clusters=2):
    """Average-linkage agglomerative clustering on a precomputed DTW
    distance matrix.

    The original built the full n x n matrix with n^2 DTW calls; DTW is
    symmetric and dtw(x, x) == 0, so each unordered pair is now computed
    once and mirrored, halving the dominant cost.
    """
    n = len(audio_data)
    distances = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            d = dtw(audio_data[i], audio_data[j])
            distances[i, j] = d
            distances[j, i] = d
    agg = AgglomerativeClustering(
        n_clusters=n_clusters, metric='precomputed', linkage='average'
    )
    return agg.fit_predict(distances)
|
|
|
def mean_shift_clustering(audio_data):
    """Cluster `audio_data` with Mean Shift using an estimated bandwidth.

    Samples outside any kernel are labelled -1 (cluster_all=False).
    """
    estimated_bw = estimate_bandwidth(audio_data, quantile=0.2)
    model = MeanShift(bandwidth=estimated_bw, bin_seeding=True, cluster_all=False)
    return model.fit_predict(audio_data)
|
|
|
def bisecting_kmeans_clustering(audio_data, n_clusters=2):
    """Bisecting k-means: repeatedly split the largest cluster in two with
    DTW-metric k-means until `n_clusters` clusters exist.

    Parameters
    ----------
    audio_data : array-like, shape (n_samples, ...)
        Samples to cluster.
    n_clusters : int
        Number of final clusters.

    Returns
    -------
    np.ndarray of shape (n_samples,) with integer cluster ids.

    Bug fixed: the original mapped samples back to clusters with
    `x in cluster`, which for numpy arrays means `(cluster == x).any()` —
    a broadcast element-membership test, not row membership — so labels
    could be wrong (and duplicate rows were ambiguous). Clusters are now
    tracked as index arrays, so every sample is labelled exactly once.
    """
    data = np.asarray(audio_data)
    n_samples = len(data)
    # Each cluster is represented by the indices of its members.
    clusters = [np.arange(n_samples)]

    while len(clusters) < n_clusters:
        largest = max(range(len(clusters)), key=lambda i: len(clusters[i]))
        member_idx = clusters.pop(largest)

        km = TimeSeriesKMeans(n_clusters=2, metric="dtw", max_iter=10, random_state=42)
        sub_labels = km.fit_predict(data[member_idx])

        clusters.append(member_idx[sub_labels == 0])
        clusters.append(member_idx[sub_labels == 1])

    labels = np.full(n_samples, -1, dtype=int)
    for cluster_id, member_idx in enumerate(clusters):
        labels[member_idx] = cluster_id
    return labels
|
|
|
def dbscan_clustering(audio_data, eps=0.5, min_samples=5):
    """Density-based clustering with DBSCAN using DTW as the pairwise
    metric; noise samples get label -1."""
    model = DBSCAN(eps=eps, min_samples=min_samples, metric=dtw)
    return model.fit_predict(audio_data)
|
|
|
def optics_clustering(audio_data, min_samples=5):
    """Density-based clustering with OPTICS (xi extraction) using DTW as
    the pairwise metric; noise samples get label -1."""
    model = OPTICS(min_samples=min_samples, metric=dtw, cluster_method='xi')
    return model.fit_predict(audio_data)
|
|
|
def birch_clustering(audio_data, n_clusters=None, branching_factor=50, threshold=0.5):
    """Cluster `audio_data` with BIRCH; when n_clusters is None the CF-tree
    subclusters are returned as-is."""
    model = Birch(
        n_clusters=n_clusters,
        branching_factor=branching_factor,
        threshold=threshold,
    )
    return model.fit_predict(audio_data)
|
|
|
def clustering_predicting(model, annotation_file, audio_file, max_length, clustering_method="kmeans", k=2):
    """Run `model` on one audio file, cluster its latent features, and map
    the per-frame cluster labels back to a sample-level time series.

    Parameters
    ----------
    model : callable returning (x_recon, latent, u, loss) for a float tensor.
    annotation_file : path to the ground-truth annotation file.
    audio_file : path to the audio file.
    max_length : padded length (in samples) of signal and labels.
    clustering_method : one of "kmeans", "dtw", "kshape", "affinity",
        "agglomerative", "mean_shift", "bisecting", "DBSCAN", "OPTICS",
        "Birch".
    k : number of clusters for methods that take one.

    Returns
    -------
    (signal, fs, truth_labels, label_timeseries)

    Raises
    ------
    ValueError : if `clustering_method` is not recognized (previously an
        unknown method left the labels as None and crashed later with a
        cryptic AttributeError).
    """
    signal, fs = librosa.load(audio_file)
    signal = object_padding(signal, max_length)
    truth_labels = readLabels(path=annotation_file, sample_rate=fs)
    truth_labels = object_padding(truth_labels, max_length)

    test_audio = fbank_features_extraction([audio_file], max_length)

    test_input = torch.tensor(test_audio, dtype=torch.float32)
    x_recon, test_latent, test_u, loss = model(test_input)

    # NOTE(review): 703 appears to be the fixed number of feature frames
    # produced for max_length — TODO confirm against fbank_features_extraction.
    clustering_input = standardize_array(test_u.reshape((703, -1)).detach().numpy())

    # Dispatch table instead of an if/elif chain; unknown keys fail loudly.
    dispatch = {
        "kmeans": lambda x: kmeans_clustering(x, n_clusters=k),
        "dtw": lambda x: dtw_kmedoids_clustering(x, n_clusters=k),
        "kshape": lambda x: kshape_clustering(x, n_clusters=k),
        "affinity": affinity_propagation_clustering,
        "agglomerative": lambda x: agglomerative_clustering(x, n_clusters=k),
        "mean_shift": mean_shift_clustering,
        "bisecting": lambda x: bisecting_kmeans_clustering(x, n_clusters=k),
        "DBSCAN": dbscan_clustering,
        "OPTICS": optics_clustering,
        "Birch": lambda x: birch_clustering(x, n_clusters=k),
    }
    if clustering_method not in dispatch:
        raise ValueError(f"Unknown clustering_method: {clustering_method!r}")
    clustering_label = dispatch[clustering_method](clustering_input)

    # Expand per-frame labels to per-sample labels: 25 ms window, 10 ms hop.
    # abs() maps any noise label of -1 onto a positive cluster id.
    label_timeseries = np.zeros(max_length)
    begin = 0
    end = int(0.025 * fs)
    shift_step = int(0.01 * fs)
    for i in range(clustering_label.shape[0]):
        label_timeseries[begin:end] = abs(clustering_label[i])
        begin += shift_step
        end += shift_step

    return signal, fs, np.array(truth_labels), label_timeseries
|
|
|
def signal_visualization(signal, fs, truth_labels, label_timeseries):
    """Plot the waveform with ground-truth and cluster label curves
    overlaid, both scaled to just above the signal's peak."""
    time_axis = np.arange(len(signal)) * (1 / fs)
    scale = 1.1 * np.max(signal)
    # Truncate everything to the shorter of signal / labels.
    limit = np.min([signal.shape[0], len(truth_labels)])

    plt.figure(figsize=(24, 6))
    plt.plot(time_axis[:limit], signal[:limit])
    plt.plot(time_axis[:limit], truth_labels[:limit] * scale)
    plt.plot(time_axis[:limit], label_timeseries[:limit] * scale)

    plt.title("Ground truth labels")
    plt.legend(['Signal', 'Cry', 'Clusters'])
    plt.show()
|
|
|
def cluster_visualization(signal, fs, truth_labels, label_timeseries):
    """Plot the waveform with ground-truth regions shaded above the time
    axis (orange = cry, gray = non-cry) and predicted cluster regions
    shaded below it (one tab10 color per cluster id).

    Assumes truth_labels uses 1 for cry and 0 for non-cry samples.
    NOTE(review): if there are no cry (or no non-cry) samples, the run
    boundary math below indexes empty arrays — TODO confirm inputs always
    contain both classes.
    """
    Ns = len(signal)
    Ts = 1 / fs
    t = np.arange(Ns) * Ts
    # Shaded bands extend to just above the signal peak.
    norm_coef = 1.1 * np.max(signal)
    # Truncate to the shorter of signal / labels.
    edge_ind = np.min([signal.shape[0], len(truth_labels)])

    plt.figure(figsize=(24, 6))
    line_signal, = plt.plot(t[:edge_ind], signal[:edge_ind])

    # Sample indices belonging to each ground-truth class.
    cry_indices = np.where(truth_labels == 1)[0]
    non_cry_indices = np.where(truth_labels == 0)[0]

    # Contiguous-run boundaries within cry_indices: a new run starts right
    # after each gap (diff != 1); a run ends just before the next gap.
    start_cry = np.insert(np.where(np.diff(cry_indices) != 1)[0] + 1, 0, 0)
    end_cry = np.append(np.where(np.diff(cry_indices) != 1)[0], len(cry_indices) - 1)

    # Shade each cry run above the axis; label only the first so the
    # legend gets a single 'Cry' entry.
    for start, end in zip(start_cry, end_cry):
        plt.fill_between(
            t[cry_indices[start:end+1]],
            0,
            norm_coef,
            color='orange',
            alpha=0.5,
            label='Cry' if start == start_cry[0] else None
        )
    # Proxy artists for the legend (fill_between patches don't reuse well).
    legend_handles = []
    legend_handles.append(plt.Rectangle((0, 0), 1, 1, color='orange', alpha=0.5))

    # Same run-boundary computation for the non-cry class.
    start_non_cry = np.insert(np.where(np.diff(non_cry_indices) != 1)[0] + 1, 0, 0)
    end_non_cry = np.append(np.where(np.diff(non_cry_indices) != 1)[0], len(non_cry_indices) - 1)

    for start, end in zip(start_non_cry, end_non_cry):
        plt.fill_between(
            t[non_cry_indices[start:end+1]],
            0,
            norm_coef,
            color='gray',
            alpha=0.5,
            label='Non-cry' if start == start_non_cry[0] else None
        )
    legend_handles.append(plt.Rectangle((0, 0), 1, 1, color='gray', alpha=0.5))

    # Predicted clusters: one color per distinct label value.
    unique_labels = np.unique(label_timeseries)
    cmap = plt.get_cmap('tab10')

    for i, label in enumerate(unique_labels):
        label_indices = np.where(label_timeseries == label)[0]

        # Run boundaries for this cluster's samples, as above.
        start_indices = np.insert(np.where(np.diff(label_indices) != 1)[0] + 1, 0, 0)
        end_indices = np.append(np.where(np.diff(label_indices) != 1)[0], len(label_indices) - 1)

        # Cluster bands are drawn below the axis (0 down to -norm_coef).
        for start, end in zip(start_indices, end_indices):
            plt.fill_between(
                t[label_indices[start:end+1]],
                0,
                -norm_coef,
                color=cmap(i),
                alpha=0.5,
                label=f'Cluster {label}' if start == start_indices[0] else None
            )
        legend_handles.append(plt.Rectangle((0, 0), 1, 1, color=cmap(i), alpha=0.5))

    plt.title("Audio Clustering")
    plt.legend(
        [line_signal] + legend_handles,
        ['Signal'] + ['Cry', 'Non-Cry'] + [f'Cluster {label}' for label in unique_labels]
    )
    plt.show()
|
|
|
def clustering_evaluatation(model, max_length, audio_files, annotation_files, domain_index=None, clustering_method="kmeans", k=2):
    """Evaluate clustering predictions against annotations over a file set.

    Parameters
    ----------
    model : the feature-extraction model passed to clustering_predicting.
    max_length : padded length (in samples) per file.
    audio_files, annotation_files : parallel lists of file paths.
    domain_index : iterable of indices to evaluate; defaults to all files.
    clustering_method, k : forwarded to clustering_predicting.

    Returns
    -------
    (acc_list, framef_list, eventf_list, iou_list, switch_list) — per-file
    accuracy, frame-level F1, event-level F1, IoU, and whether the cluster
    orientation was flipped for the accuracy score.

    Bug fixed: f1_score was called as f1_score(y_pred, y_true); sklearn's
    signature is f1_score(y_true, y_pred) and F1 is not symmetric in its
    arguments, so the ground truth now comes first.
    """
    acc_list, framef_list, eventf_list, iou_list = [], [], [], []
    switch_list = []
    if domain_index is None:
        domain_index = range(len(audio_files))
    for i in domain_index:
        annotation_file = annotation_files[i]
        audio_file = audio_files[i]
        clustering_switch = False
        _, _, truth_labels, label_timeseries = clustering_predicting(
            model, annotation_file, audio_file, max_length, clustering_method, k
        )
        temp_accuracy = accuracy_score(truth_labels, label_timeseries)
        # Cluster ids are arbitrary: binarize the prediction in both
        # orientations and keep the better frame-level F1.
        framef = max(
            f1_score(truth_labels, 1 - label_timeseries > 0),
            f1_score(truth_labels, label_timeseries > 0),
        )
        if temp_accuracy < 0.5:
            # Below-chance accuracy means the cluster ids are inverted
            # relative to the ground truth; report the flipped score.
            clustering_accuracy = 1 - temp_accuracy
            clustering_switch = True
        else:
            clustering_accuracy = temp_accuracy
        acc_list.append(clustering_accuracy)
        switch_list.append(clustering_switch)
        framef_list.append(framef)
        eventf, iou, _, _, _ = event_metrics(truth_labels, label_timeseries, tolerance=2000, overlap_threshold=0.75)
        eventf_list.append(eventf)
        iou_list.append(iou)

    return acc_list, framef_list, eventf_list, iou_list, switch_list