import argparse
import gc
import logging
import math
import random
import sys
import time

import matplotlib.pyplot as plt
import mat73
import numpy as np
import scipy.io as sio  # needed for sio.loadmat below
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader  # needed for the pair loader below

from model import *
from train_methods import *
from Datasets import *
from config import *
from data_loader import *
from anchors import *
from Cluster import *
|
parser = argparse.ArgumentParser(description='CAPIMAC in PyTorch')
parser.add_argument('--data', default=1, type=int,
                    help='choice of dataset: 0-HandWritten, 1-3Sources, 2-BBCsports, 3-Scene15, '
                         '4-Caltech101, 5-ORL_mtv, 6-Caltech101_7, 7-Reuters, 8-20NewsGroups, '
                         '9-100leaves, 10-BBC4, 11-MSRCv1, 12-BDGP, 13-HandWritten, 14-yale_mtv, '
                         '15-Wikipedia-test, 16-Movies, 17-Prokaryotic, 18-ALOI, 19-flower17')
parser.add_argument('-bs', '--batch-size', default=1024, type=int, help='batch size for training')
parser.add_argument('-e', '--epochs', default=200, type=int, help='number of epochs to run')
parser.add_argument('-lr', '--learn-rate', default=0.0001, type=float, help='learning rate of Adam')
parser.add_argument('-ap', '--aligned-prop', default=0.5, type=float,
                    help='originally aligned proportion in the partially view-aligned data')
parser.add_argument('--gpu', default=0, type=int, help='GPU device index to use')
parser.add_argument('-cp', '--complete-prop', default=0.5, type=float,
                    help='originally complete proportion in the partially sample-missing data')
parser.add_argument('-m', '--margin', default=5, type=int, help='initial margin')
# Caution: argparse's type=bool converts any non-empty string (including "False")
# to True, so the two boolean flags below only behave as expected at their defaults.
parser.add_argument('-s', '--start-fine', default=True, type=bool,
                    help='flag to start using the robust loss or not')
parser.add_argument('-np', '--neg-num', default=30, type=int,
                    help='ratio of negative to positive pairs')
parser.add_argument('-noise', '--noisy-training', type=bool, default=True,
                    help='train with real labels or noisy labels')
parser.add_argument('-r', '--robust', default=1, type=int, help='use the robust loss (1) or not (0)')
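# Example invocation (illustrative; the script name depends on how this module is
# saved, and load_data() below expects the .mat files under ./datasets/):
#   python <this_script>.py --data 0 -bs 1024 -e 200 -ap 0.5 -cp 0.5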
|
# Feature dimension after anchor-based re-representation; set inside load_data().
dim = 0
|
class NoiseRobustLoss(nn.Module):
    def __init__(self):
        super(NoiseRobustLoss, self).__init__()

    def forward(self, pair_dist, P, margin, use_robust_loss, args):
        """Contrastive loss over pair distances; P = 1 for positive pairs, 0 for negative pairs."""
        dist_sq = pair_dist * pair_dist
        P = P.to(torch.float32)
        N = len(P)
        if use_robust_loss == 1 and args.start_fine:
            # Noise-robust variant: negative pairs whose distance already exceeds
            # half the margin contribute nothing, which damps mislabeled pairs.
            loss = P * dist_sq + (1 - P) * (1 / margin) * torch.pow(
                torch.clamp(torch.pow(pair_dist, 0.5) * (0.5 * margin - pair_dist), min=0.0), 2)
        else:
            # Standard contrastive (hinge) loss on negative pairs.
            loss = P * dist_sq + (1 - P) * torch.pow(torch.clamp(margin - pair_dist, min=0.0), 2)
        loss = torch.sum(loss) / (2.0 * N)
        return loss
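# A minimal usage sketch of the loss on dummy tensors (the random distances and
# labels here are illustrative only, not part of the original pipeline):
#
#   crit = NoiseRobustLoss()
#   pair_dist = torch.rand(8) * 6    # distances for 8 candidate pairs
#   P = torch.randint(0, 2, (8,))    # 1 = positive pair, 0 = negative pair
#   loss = crit(pair_dist, P, margin=5, use_robust_loss=1, args=parser.parse_args([]))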
|
def load_data(align_prop, complete_prop, neg_num, is_noise, dataset):
    global dim
    NetSeed = random.randint(1, 1000)
    print('network init seed:', NetSeed)
    np.random.seed(NetSeed)
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(NetSeed)
    torch.cuda.manual_seed(NetSeed)
    args = parser.parse_args()
    all_data = []
    map_pairs = []
    label = []
    train_pairs = []
|
    # Each branch extracts two views into `data` and a label vector into `label`.
    if dataset == 'Caltech101_7':
        path = './datasets/' + dataset + '.mat'
        mat = mat73.loadmat(path)
    else:
        mat = sio.loadmat('./datasets/' + dataset + '.mat')
    if dataset == 'Scene15':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'HandWritten':
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset == '3Sources':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'ALOI':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset == 'BBCsports':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'Caltech101':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'Reuters_dim10':
        data = []
        data.append(normalize(np.vstack((mat['x_train'][0], mat['x_test'][0]))))
        data.append(normalize(np.vstack((mat['x_train'][1], mat['x_test'][1]))))
        label = np.squeeze(np.hstack((mat['y_train'], mat['y_test'])))
    elif dataset == 'ORL_mtv':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset == 'Caltech101_7':
        data = mat['data'][3:5]
        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
        data[0], data[1] = np.array(data[0]), np.array(data[1])
        label = np.squeeze(mat['labels'])
    elif dataset == 'Reuters':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == '20NewsGroups':
        data = mat['data'][0][1:3]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == '100leaves':
        mat['data'][0][0], mat['data'][0][1] = mat['data'][0][0].T, mat['data'][0][1].T
        data = mat['data'][0][0:2]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == 'BBC4':
        data = mat['data'][0][0:2]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == 'MSRCv1':
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset == 'BDGP':
        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
|
|
    elif dataset == 'yale_mtv':
        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset == 'Wikipedia-test':
        data = mat['X'][0:2][0:2]
        data = np.squeeze(data.T)
        label = np.squeeze(mat['y'])
    elif dataset == 'Movies':
        data = mat['X'][0:2][0:2]
        data = np.squeeze(data.T)
        label = np.squeeze(mat['y'])
    elif dataset == 'Prokaryotic':
        value1 = mat['X'][0][0]
        value2 = mat['X'][2][0]
        data = [value1, value2]
        label = np.squeeze(mat['y'])
    elif dataset == 'flower17':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
|
    divide_seed = random.randint(1, 1000)
    train_idx, test_idx = TT_split(len(label), 1 - align_prop, divide_seed)
    train_label, test_label = label[train_idx], label[test_idx]
    if dataset == 'Caltech101_7':
        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
    print(np.shape(data[0]))
    train_X, train_Y, test_X, test_Y = data[0][train_idx], data[1][train_idx], data[0][test_idx], data[1][test_idx]

    # Obtain latent representations of the aligned portion.
    map_pairs.append(train_X)
    map_pairs.append(train_Y)
    h0, h1, epoch_time = pretrain(map_pairs, args)
    all_label = np.concatenate((train_label, test_label))

    # Build the initial training and test data.
    if align_prop != 1:
        # Shuffle one view of the test split to simulate unaligned pairs.
        shuffle_idx = random.sample(range(len(test_Y)), len(test_Y))
        test_Y = test_Y[shuffle_idx]
        test_label_X, test_label_Y = test_label, test_label[shuffle_idx]
    elif align_prop == 1:
        all_data.append(train_X.T)
        all_data.append(train_Y.T)
        # With fully aligned data the test split is empty; keep the per-view
        # label variables defined so the masking below does not fail.
        test_label_X, test_label_Y = test_label, test_label

    # Sample-missing portion: drop test samples per view according to a missing mask.
    test_mask = get_sn(2, len(test_label), 1 - complete_prop)
    X_mask, Y_mask = test_mask[:, 0].astype(np.bool_), test_mask[:, 1].astype(np.bool_)
    test_X, test_Y = test_X[X_mask], test_Y[Y_mask]
    test_label_X, test_label_Y = test_label_X[X_mask], test_label_Y[Y_mask]
    if align_prop != 1:
        all_label_X = np.concatenate((train_label, test_label_X))
        all_label_Y = np.concatenate((train_label, test_label_Y))
        all_data.append(np.concatenate((train_X, test_X)).T)
        all_data.append(np.concatenate((train_Y, test_Y)).T)
        all_label = np.concatenate((train_label, test_label))
    elif align_prop == 1:
        all_label_X, all_label_Y = train_label, train_label
        all_label = train_label

    # Construct positive and negative training pairs.
    view0, view1, noisy_labels, real_labels, _, _ = get_pairs(train_X, train_Y, neg_num, train_label)
    count = 0
    for i in range(len(noisy_labels)):
        if noisy_labels[i] != real_labels[i]:
            count += 1
    print('noise rate of the constructed neg. pairs is', round(count / (len(noisy_labels) - len(train_X)), 2))

    if is_noise == 0:
        print("----------------------Training with real_labels----------------------")
        train_pair_labels = real_labels
    else:
        print("----------------------Training with noisy_labels----------------------")
        train_pair_labels = noisy_labels

    # Initialize the anchors, one group per cluster.
    num_unique_labels = np.unique(all_label).shape[0]
    anchors0, anchors1, len_indices = get_anchors(h0, h1, map_pairs, num_unique_labels)

    # Re-represent the data relative to the anchors.
    view0, view1 = torch.from_numpy(view0).float(), torch.from_numpy(view1).float()
    all_data[0], all_data[1] = torch.from_numpy(all_data[0]).float(), torch.from_numpy(all_data[1]).float()
    view0, view1 = find_nanchor(anchors0, view0), find_nanchor(anchors1, view1)
    all_data[0], all_data[1] = find_nanchor(anchors0, all_data[0].T), find_nanchor(anchors1, all_data[1].T)
    view0, view1 = np.array(view0), np.array(view1)
    all_data[0], all_data[1] = np.array(all_data[0]), np.array(all_data[1])
    print(np.shape(view0), 'view0')
    train_pairs.append(view0)
    train_pairs.append(view1)
    train_pair_real_labels = real_labels
    dim = view0.shape[0]
    return train_pairs, train_pair_labels, train_pair_real_labels, all_data, all_label, all_label_X, all_label_Y, dim, num_unique_labels, divide_seed
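# load_data() returns, in order: the constructed training pair views, their
# (possibly noisy) pair labels, the real pair labels, the anchor-represented
# data of both full views, the full label vector, the per-view label vectors
# after missing-sample masking, the feature dimension, the number of clusters,
# and the seed used for the train/test split.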
|
def normalize(x):
    """Min-max normalize each feature column of x to the [0, 1] range."""
    x_min = np.min(x, axis=0)
    x_max = np.max(x, axis=0)
    return (x - x_min) / (x_max - x_min)
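# Example (illustrative): each column is rescaled independently, so
#   normalize(np.array([[0., 10.], [5., 30.], [10., 20.]]))
# yields [[0., 0.], [0.5, 1.], [1., 0.5]]. Note a constant column would
# divide by zero, since its max equals its min.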
|
def loader(train_bs, align_prop, complete_prop, neg_num, is_noise, dataset):
    """
    :param train_bs: batch size for training, default is 1024
    :param align_prop: originally aligned proportion in the partially view-aligned data
    :param complete_prop: originally complete proportion in the partially sample-missing data
    :param neg_num: negative / positive pairs' ratio
    :param is_noise: training with noisy labels or not, 0 --- no, 1 --- yes
    :param dataset: name of the dataset to load
    :return: train_pair_loader with the constructed pos. and neg. pairs used for training,
             plus the originally aligned and unaligned data used for testing
    """
    train_pairs, train_pair_labels, train_pair_real_labels, all_data, all_label, all_label_X, all_label_Y, dim, num_unique_labels, divide_seed \
        = load_data(align_prop, complete_prop, neg_num, is_noise, dataset)
    train_pair_dataset = GetDataset(train_pairs, train_pair_labels, train_pair_real_labels)
    train_pair_loader = DataLoader(
        train_pair_dataset,
        batch_size=train_bs,
        shuffle=True,
        drop_last=True
    )
    return train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim, num_unique_labels, divide_seed
|
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)  # make the final logging.info call visible
    for run in range(1):  # single run; widen the range for repeated trials
        args = parser.parse_args()
        data_name = ['HandWritten', '3Sources', 'BBCsports', 'Scene15', 'Caltech101', 'ORL_mtv', 'Caltech101_7',
                     'Reuters', '20NewsGroups', '100leaves', 'BBC4', 'MSRCv1', 'BDGP', 'HandWritten', 'yale_mtv',
                     'Wikipedia-test', 'Movies', 'Prokaryotic', 'ALOI', 'flower17']
        train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim, outfeature, divide_seed = loader(
            args.batch_size, args.aligned_prop, args.complete_prop, args.neg_num, args.noisy_training,
            data_name[args.data])

        model = Anchormodel(dim, outfeature).to(args.gpu)
        criterion = NoiseRobustLoss().to(args.gpu)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate)
        CAR_list = []
        acc_list, nmi_list, ari_list, f_list, f1_list, pre_list, pre2_list, rec_list, pur_list = \
            [], [], [], [], [], [], [], [], []
        train_time = 0
        all_data[0], all_data[1] = torch.from_numpy(all_data[0]), torch.from_numpy(all_data[1])
        for epoch in range(0, args.epochs + 1):
            if epoch == 0:
                # Epoch 0 runs under no_grad, so no parameters are updated.
                with torch.no_grad():
                    epoch_time = train2(train_pair_loader, model, criterion, optimizer, epoch, args)
            else:
                epoch_time = train2(train_pair_loader, model, criterion, optimizer, epoch, args)
            train_time += epoch_time  # accumulate so the final log reports the real total

            v0, v1, pred_label, alignment_rate = tiny_infer(model, args.gpu, all_data, all_label_X, all_label_Y)
            CAR_list.append(alignment_rate)
            data = [v0, v1]

            y_pred, ret, accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity = \
                Clustering(data, pred_label)
            if epoch % 10 == 0:
                print(f'epoch {epoch}:', accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall,
                      purity)

            acc_list.append(ret['kmeans']['ACC'])
            nmi_list.append(ret['kmeans']['NMI'])
            ari_list.append(ret['kmeans']['ARI'])
            f_list.append(ret['kmeans']['F1'])
            f1_list.append(ret['kmeans']['F2'])
            pre_list.append(ret['kmeans']['PRE'])
            pre2_list.append(ret['kmeans']['PRE2'])
            rec_list.append(ret['kmeans']['REC'])
            pur_list.append(ret['kmeans']['PUR'])
        print('ACC:', max(acc_list))
        print("NMI:", max(nmi_list))
        print("ARI:", max(ari_list))
        print("F1:", max(f_list))
        print("F2:", max(f1_list))
        print("PRE:", max(pre_list))
        print("PRE2:", max(pre2_list))
        print("REC:", max(rec_list))
        print("PUR:", max(pur_list))
        logging.info('******** End, training time = {} s ********'.format(round(train_time, 2)))