# CAPIMAC / run.py
# (Upload header from the file host: uploaded by bestow136, "Upload 13 files",
#  commit 8ffcfd0 verified — kept here as a comment so the module stays importable.)
import argparse
import time
import random
from model import *
import math
import torch,gc
import torch.nn as nn
import torch.nn.functional as F
from train_methods import *
import logging
import sys
import numpy as np
import matplotlib.pyplot as plt
from Datasets import *
from config import *
from data_loader import *
import mat73
from anchors import *
from Cluster import *
def _str2bool(value):
    """Parse a command-line boolean flag value.

    `type=bool` on argparse arguments is a well-known pitfall: every
    non-empty string (including "False") is truthy, so the flag could
    never be switched off from the command line. This converter accepts
    the usual spellings of true/false.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() in ('true', '1', 'yes', 'y', 't')

parser = argparse.ArgumentParser(description='CAPIMAC in PyTorch')
parser.add_argument('--data', default=1, type=int,
                    help='choice of dataset, 0-HW,1-3Sources,2BBC,3-Scene15, 4-Caltech101,5-ORL_mtv,6-Caltech_7,7-Reuters,'
                         '8-20newsgroups,9-100leaves,10-BBC4,11-MSRCv1,12-BDGP,13-HandWritten,14-yale_mtv,15-Wikipedia-test,16-Movies,17-Prokaryotic,18-ALOI,19-flower17')
parser.add_argument('-bs', '--batch-size', default=1024, type=int, help='number of batch size')
parser.add_argument('-e', '--epochs', default=200, type=int, help='number of epochs to run')
parser.add_argument('-lr', '--learn-rate', default=0.0001, type=float, help='learning rate of adam')
parser.add_argument('-ap', '--aligned-prop', default=0.5, type=float,
                    help='originally aligned proportions in the partially view-aligned data')
parser.add_argument('--gpu', default=0, type=int, help='GPU device idx to use.')
parser.add_argument('-cp', '--complete-prop', default=0.5, type=float,
                    help='originally complete proportions in the partially sample-missing data')
parser.add_argument('-m', '--margin', default=5, type=int, help='initial margin')
# NOTE: these two previously used type=bool, which made "-s False" parse as True.
parser.add_argument('-s', '--start-fine', default=True, type=_str2bool, help='flag to start use robust loss or not')
parser.add_argument('-np', '--neg-num', default=30, type=int, help='the ratio of negative to positive pairs')
parser.add_argument('-noise', '--noisy-training', type=_str2bool, default=True,
                    help='training with real labels or noisy labels')
parser.add_argument('-r', '--robust', default=1, type=int, help='use our robust loss or not')
# Anchor-representation dimensionality; set as a side effect of load_data().
dim = 0
class NoiseRobustLoss(nn.Module):
    """Pairwise contrastive loss with an optional noise-robust negative term.

    Positive pairs (P == 1) are pulled together via squared distance;
    negative pairs (P == 0) are pushed apart with a hinge.  When
    ``use_robust_loss == 1`` and ``args.start_fine`` is set, the negative
    hinge is reshaped (scaled by sqrt-distance and 1/margin) to down-weight
    far-away negatives, which softens the impact of mislabeled pairs.
    """

    def __init__(self):
        super(NoiseRobustLoss, self).__init__()

    def forward(self, pair_dist, P, margin, use_robust_loss, args):
        """Return the mean pair loss.

        :param pair_dist: 1-D tensor of pairwise distances
        :param P: 1-D tensor of pair labels (1 = positive, 0 = negative)
        :param margin: hinge margin for negative pairs
        :param use_robust_loss: 1 to enable the robust negative term
        :param args: namespace providing the ``start_fine`` switch
        """
        P = P.to(torch.float32)
        n_pairs = len(P)
        sq_dist = pair_dist.pow(2)
        pos_term = P * sq_dist
        if use_robust_loss == 1 and args.start_fine:
            # Robust hinge: sqrt(d) * (margin/2 - d), clamped at zero,
            # squared, and scaled by 1/margin.
            hinge = torch.clamp(torch.pow(pair_dist, 0.5) * (0.5 * margin - pair_dist), min=0.0)
            neg_term = (1 - P) * (1 / margin) * hinge.pow(2)
        else:
            # Standard contrastive hinge for negatives.
            hinge = torch.clamp(margin - pair_dist, min=0.0)
            neg_term = (1 - P) * hinge.pow(2)
        total = torch.sum(pos_term + neg_term)
        return total / (2.0 * n_pairs)
def load_data(align_prop: float, complete_prop: float, neg_num: int, is_noise: int, dataset: str):
    """Load a two-view dataset, simulate partial alignment/incompleteness,
    pretrain latent representations, and build anchor-graph features.

    :param align_prop: proportion of samples treated as originally view-aligned
    :param complete_prop: proportion of unaligned samples kept complete in each view
    :param neg_num: ratio of negative to positive pairs (passed to get_pairs)
    :param is_noise: 0 -> train with real pair labels, otherwise noisy labels
    :param dataset: dataset name; selects a .mat-specific loading branch below
    :return: (train_pairs, train_pair_labels, train_pair_real_labels, all_data,
              all_label, all_label_X, all_label_Y, dim, num_unique_labels, divide_seed)
    """
    global dim
    NetSeed = random.randint(1, 1000)
    # NetSeed=72
    print(NetSeed)
    np.random.seed(NetSeed)
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(NetSeed)  # seed the CPU RNG
    torch.cuda.manual_seed(NetSeed)  # seed the current GPU's RNG
    args = parser.parse_args()
    all_data = []
    map_pairs = []
    label = []
    train_pairs = []
    # Caltech101_7 is stored as a MATLAB v7.3 file, so it needs mat73;
    # every other dataset loads with scipy.io.
    if dataset=='Caltech101_7':
        path = './datasets/' + dataset + '.mat'  # dataset file path
        mat = mat73.loadmat(path)  # load v7.3 .mat file
    else:
        mat = sio.loadmat('./datasets/' + dataset + '.mat')
    # Dataset-specific extraction: each branch picks two views and the
    # ground-truth labels out of that file's particular .mat layout.
    if dataset == 'Scene15':
        data = mat['X'][0][0:2]  # 20, 59 dimensions
        label = np.squeeze(mat['Y'])
    elif dataset == 'HandWritten':
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset == '3Sources':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'ALOI':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset == 'BBCsports':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'Caltech101':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == 'Reuters_dim10':
        data = []  # 18758 samples
        # Train/test splits are concatenated and min-max normalized per view.
        data.append(normalize(np.vstack((mat['x_train'][0], mat['x_test'][0]))))
        data.append(normalize(np.vstack((mat['x_train'][1], mat['x_test'][1]))))
        label = np.squeeze(np.hstack((mat['y_train'], mat['y_test'])))
    elif dataset == 'ORL_mtv':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    elif dataset == 'Caltech101_7':
        data = mat['data'][3:5]
        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
        data[0], data[1] = np.array(data[0]), np.array(data[1])
        label = np.squeeze(mat['labels'])
    elif dataset == 'Reuters':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    elif dataset == '20NewsGroups':
        data = mat['data'][0][1:3]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == '100leaves':
        # Stored feature-major; transpose to samples x features.
        mat['data'][0][0], mat['data'][0][1] = mat['data'][0][0].T, mat['data'][0][1].T
        data = mat['data'][0][0:2]
        label = np.squeeze(mat['truelabel'][0][0])
    elif dataset == 'BBC4':
        data = mat['data'][0][0:2]
        label = np.squeeze(mat['truelabel'][0][0])
        # print(label)
    elif dataset == 'MSRCv1':
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset == 'BDGP':
        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['gt'])
    # NOTE(review): unreachable — 'HandWritten' is already handled by the
    # identical branch near the top of this chain.
    elif dataset == 'HandWritten':
        data = mat['X'][0][1:3]
        label = np.squeeze(mat['Y'])
    elif dataset == 'yale_mtv':
        mat['X'][0][0], mat['X'][0][1] = mat['X'][0][0].T, mat['X'][0][1].T
        data = mat['X'][0][0:2]
        # print((data))
        label = np.squeeze(mat['gt'])
    elif dataset == 'Wikipedia-test':
        data = mat['X'][0:2][0:2]
        data = np.squeeze(data.T)
        # print(data)
        label = np.squeeze(mat['y'])
    elif dataset == 'Movies':
        data = mat['X'][0:2][0:2]
        data = np.squeeze(data.T)
        # print(data)
        label = np.squeeze(mat['y'])
    elif dataset == 'Prokaryotic':
        value1 = mat['X'][0][0]
        value2 = mat['X'][2][0]
        data = [value1, value2]
        # print(data)
        label = np.squeeze(mat['y'])
    elif dataset == 'flower17':
        data = mat['X'][0][0:2]
        label = np.squeeze(mat['Y'])
    # Split samples into an "aligned" training part and an "unaligned" test part.
    divide_seed = random.randint(1, 1000)
    train_idx, test_idx = TT_split(len(label), 1 - align_prop, divide_seed)
    train_label, test_label = label[train_idx], label[test_idx]
    if dataset == 'Caltech101_7':
        data[0], data[1] = np.squeeze(data[0]), np.squeeze(data[1])
    print(np.shape(data[0]))
    # View 0 / view 1 features for both splits (same indices per view).
    train_X, train_Y, test_X, test_Y = data[0][train_idx], data[1][train_idx], data[0][test_idx], data[1][test_idx]
    # Pretrain latent representations on the aligned portion.
    '''获取对齐部分的潜在表示'''
    map_pairs.append(train_X)
    map_pairs.append(train_Y)
    h0 , h1,epoch_time=pretrain(map_pairs, args)
    all_label = np.concatenate((train_label, test_label))
    # Build the initial training and test data.
    '''获取初始训练数据和测试数据'''
    if align_prop != 1:
        # Shuffle view 1 of the test part to break cross-view alignment.
        shuffle_idx = random.sample(range(len(test_Y)), len(test_Y))
        test_Y = test_Y[shuffle_idx]
        test_label_X, test_label_Y = test_label, test_label[shuffle_idx]
    elif align_prop == 1:
        all_data.append(train_X.T)
        all_data.append(train_Y.T)
    # Simulate missing samples in the unaligned part.
    # NOTE(review): when align_prop == 1, test_label_X/test_label_Y were never
    # assigned above, so the masking line below would raise a NameError.
    '''不完整部分'''
    test_mask = get_sn(2, len(test_label), 1 - complete_prop)
    X_mask, Y_mask = test_mask[:, 0].astype(np.bool_), test_mask[:, 1].astype(np.bool_)
    # test_X[~X_mask] = 0
    # test_Y[~Y_mask] = 0
    test_X, test_Y = test_X[X_mask], test_Y[Y_mask]
    test_label_X, test_label_Y=test_label_X[X_mask], test_label_Y[Y_mask]
    if align_prop != 1:
        all_label_X = np.concatenate((train_label, test_label_X))
        all_label_Y = np.concatenate((train_label, test_label_Y))
        # all_data holds features x samples (note the transposes).
        all_data.append(np.concatenate((train_X, test_X)).T)
        all_data.append(np.concatenate((train_Y, test_Y)).T)
        all_label = np.concatenate((train_label, test_label))
        # all_label_X = test_label_X
        # all_label_Y = test_label_Y
        # all_data.append(test_X.T)
        # all_data.append(test_Y.T)
        # all_label = test_label
    elif align_prop == 1:
        all_label_X, all_label_Y = train_label, train_label
        all_label = train_label
    # Construct positive/negative training pairs from the aligned samples.
    '''构建训练对'''
    view0, view1, noisy_labels, real_labels, _, _ = get_pairs(train_X, train_Y, neg_num, train_label)
    count = 0
    for i in range(len(noisy_labels)):
        if noisy_labels[i] != real_labels[i]:
            count += 1
    print('noise rate of the constructed neg. pairs is ', round(count / (len(noisy_labels) - len(train_X)), 2))
    if is_noise == 0:  # training with real_labels, v/t with real_labels
        print("----------------------Training with real_labels----------------------")
        train_pair_labels = real_labels
    else:  # training with labels, v/t with real_labels
        print("----------------------Training with noisy_labels----------------------")
        train_pair_labels = noisy_labels
    # Initialize anchors from the pretrained latent representations.
    '''初始化锚点'''
    num_unique_labels = np.unique(all_label).shape[0]
    anchors0,anchors1,len_indices=get_anchors(h0,h1,map_pairs,num_unique_labels)  # h0 is a tensor
    # Re-represent all data relative to the anchors.
    '''数据重表示'''
    view0,view1,all_data[0],all_data[1]=torch.from_numpy(view0).float(),torch.from_numpy(view1).float(),torch.from_numpy(all_data[0]).float(),torch.from_numpy(all_data[1]).float()
    view0, view1, all_data[0],all_data[1]=find_nanchor(anchors0,view0),find_nanchor(anchors1,view1),find_nanchor(anchors0,all_data[0].T),find_nanchor(anchors1,all_data[1].T)
    # Result is anchors x samples — the augmented anchor graph.
    view0, view1, all_data[0], all_data[1]=np.array(view0),np.array(view1),np.array(all_data[0]),np.array(all_data[1])
    print(np.shape(view0),'view0')
    train_pairs.append(view0)
    train_pairs.append(view1)
    train_pair_real_labels = real_labels
    # Publish the anchor-feature dimensionality for the rest of the script.
    dim=view0.shape[0]
    return train_pairs, train_pair_labels, train_pair_real_labels, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed
def normalize(x):
    """Min-max scale each column of a 2-D array into [0, 1].

    Uses NumPy broadcasting in place of explicit np.tile; the result is
    numerically identical. Columns with zero range divide by zero, as in
    the original formulation.
    """
    col_min = np.min(x, axis=0)
    col_range = np.max(x, axis=0) - col_min
    return (x - col_min) / col_range
def loader(train_bs, align_prop, complete_prop, neg_num, is_noise, dataset):
    """Build the training DataLoader and the full evaluation data.

    :param train_bs: batch size for training, default is 1024
    :param align_prop: originally aligned proportion of the partially view-aligned data
    :param complete_prop: originally complete proportion of the partially sample-missing data
    :param neg_num: negative / positive pairs' ratio
    :param is_noise: training with noisy labels or not, 0 --- not, 1 --- yes
    :param dataset: dataset name (selects the .mat loading branch in load_data)
    :return: train_pair_loader with the constructed pos./neg. pairs used for
             training, plus (all_data, all_label, all_label_X, all_label_Y,
             dim, num_unique_labels, divide_seed) forwarded from load_data
             for testing/clustering.
    """
    train_pairs, train_pair_labels, train_pair_real_labels, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed\
        = load_data(align_prop,complete_prop,neg_num,is_noise, dataset)
    train_pair_dataset = GetDataset(train_pairs, train_pair_labels, train_pair_real_labels)
    # drop_last keeps every batch full, which the pairwise loss assumes.
    train_pair_loader = DataLoader(
        train_pair_dataset,
        batch_size=train_bs,
        shuffle=True,
        drop_last=True
    )
    return train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim,num_unique_labels,divide_seed
if __name__ == '__main__':
    # Single experiment run (range(1)); bump the range to repeat trials.
    for i in range(1):
        args = parser.parse_args()
        data_name = ['HandWritten', '3Sources', 'BBCsports', 'Scene15', 'Caltech101', 'ORL_mtv', 'Caltech101_7', 'Reuters',
                     '20NewsGroups','100leaves','BBC4','MSRCv1','BDGP','HandWritten','yale_mtv','Wikipedia-test','Movies','Prokaryotic','ALOI','flower17']
        # `outfeature` receives load_data's num_unique_labels (number of classes);
        # `dim` is the anchor-feature dimensionality.
        train_pair_loader, all_data, all_label, all_label_X, all_label_Y, dim, outfeature ,divide_seed=loader(args.batch_size, args.aligned_prop,args.complete_prop,args.neg_num,args.noisy_training,data_name[args.data])
        model = Anchormodel(dim,outfeature).to(args.gpu)
        criterion = NoiseRobustLoss().to(args.gpu)
        # criterion_mse = nn.MSELoss().to(args.gpu)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate)
        CAR_list = []
        acc_list, nmi_list, ari_list,f_list,f1_list,pre_list,pre2_list,rec_list,pur_list = [], [], [],[], [], [],[], [], []
        # NOTE(review): train_time is never updated (epoch_time is returned by
        # train2 but not accumulated), so the final log always reports 0 s.
        train_time = 0
        all_data[0], all_data[1]=torch.from_numpy(all_data[0]), torch.from_numpy(all_data[1])
        # NOTE(review): this inner `i` shadows the outer trial index `i`.
        for i in range(0, args.epochs + 1):
            if i == 0:
                # Epoch 0 runs train2 under no_grad — an evaluation-style pass
                # before any parameter updates, presumably intentional.
                with torch.no_grad():
                    epoch_time = train2(train_pair_loader, model, criterion, optimizer, i, args)
            else:
                epoch_time = train2(train_pair_loader, model, criterion, optimizer, i, args)
            # test: infer aligned representations and measure alignment rate.
            v0, v1, pred_label, alignment_rate = tiny_infer(model, args.gpu, all_data, all_label_X, all_label_Y)
            CAR_list.append(alignment_rate)
            data = []
            data.append(v0)
            data.append(v1)
            y_pred, ret, accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity = Clustering(data,
                                                                                                                  pred_label)
            if i % 10 == 0:
                print(accuracy, nmi, ari, f_score, f_score2, precision, precision2, recall, purity)
            # logging.info("******** testing ********")
            # logging.info(
            #     "CAR={} kmeans: acc={} nmi={} ari={}".format(round(alignment_rate, 4), ret['kmeans']['accuracy'],
            #                                                  ret['kmeans']['NMI'], ret['kmeans']['ARI']))
            # Track per-epoch clustering metrics; best-over-epochs is reported below.
            acc_list.append(ret['kmeans']['ACC'])
            nmi_list.append(ret['kmeans']['NMI'])
            ari_list.append(ret['kmeans']['ARI'])
            f_list.append(ret['kmeans']['F1'])
            f1_list.append(ret['kmeans']['F2'])
            pre_list.append(ret['kmeans']['PRE'])
            pre2_list.append(ret['kmeans']['PRE2'])
            rec_list.append(ret['kmeans']['REC'])
            pur_list.append(ret['kmeans']['PUR'])
        # Report the best value of each metric across all epochs.
        print('ACC:', max(acc_list))
        print("NMI:", max(nmi_list))
        print("ARI:", max(ari_list))
        print("F1:", max(f_list))
        print("F2:", max(f1_list))
        print("PRE:", max(pre_list))
        print("PRE2:", max(pre2_list))
        print("REC:", max(rec_list))
        print("PUR:", max(pur_list))
        logging.info('******** End, training time = {} s ********'.format(round(train_time, 2)))