| """ |
| Goal |
| --- |
| 1. Read test results from log.txt files |
| 2. Compute mean and std across different folders (seeds) |
| 3. Compute all datasets' accuracy and h-mean |
| 4. Save the results to an Excel file |
| Usage |
| --- |
| Assume the output files are saved under output/my_experiment, |
| which contains results of different seeds, e.g., |
| |
| my_experiment/ |
| seed1/ |
| log.txt |
| seed2/ |
| log.txt |
| seed3/ |
| log.txt |
| |
| Run the following command from the root directory: |
| |
| $ python tools/parse_test_res.py output/my_experiment |
| |
| Add --ci95 to the argument if you wanna get 95% confidence |
| interval instead of standard deviation: |
| |
| $ python tools/parse_test_res.py output/my_experiment --ci95 |
| |
| If my_experiment/ has the following structure, |
| |
| my_experiment/ |
| exp-1/ |
| seed1/ |
| log.txt |
| ... |
| seed2/ |
| log.txt |
| ... |
| seed3/ |
| log.txt |
| ... |
| exp-2/ |
| ... |
| exp-3/ |
| ... |
| |
| Run |
| |
| $ python tools/parse_test_res.py output/my_experiment --multi-exp |
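
Each log.txt is scanned for an end-signal line ("=> result") and, after
it, for metric lines of the form "* <keyword>: <number>%". For example
(illustrative values), the tail of a parsed log could look like:

=> result
* accuracy: 71.23%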
| """ |
| import re |
| import numpy as np |
| import os.path as osp |
| import argparse |
| import pandas as pd |
| from collections import OrderedDict, defaultdict |
|
|
| from dassl.utils import check_isfile, listdir_nohidden |
|
|
|
|
# Datasets used for base-to-new generalization (and few-shot) evaluation
b2n_dataset = [
    "imagenet",
    "caltech101",
    "fgvc_aircraft",
    "oxford_flowers",
    "dtd",
    "eurosat",
    "food101",
    "oxford_pets",
    "stanford_cars",
    "sun397",
    "ucf101",
]

# Datasets used for cross-dataset transfer evaluation
cross_dataset = [
    "caltech101",
    "fgvc_aircraft",
    "oxford_flowers",
    "dtd",
    "eurosat",
    "food101",
    "oxford_pets",
    "stanford_cars",
    "sun397",
    "ucf101",
]

# Datasets used for domain generalization evaluation
dg_dataset = [
    "imagenet",
    "imagenetv2",
    "imagenet_sketch",
    "imagenet_a",
    "imagenet_r",
]

def compute_ci95(res):
    """Half-width of the 95% confidence interval of the mean
    (normal approximation)."""
    return 1.96 * np.std(res) / np.sqrt(len(res))
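
# Illustrative example (values are made up): three seeds scoring 61.2,
# 60.8, and 61.5 give compute_ci95([61.2, 60.8, 61.5]) ~= 0.32, i.e.,
# the mean would be reported as 61.17% +- 0.32%.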


def parse_function(*metrics, directory="", args=None, end_signal=None):
    print(f"Parsing files in {directory}")
    output_results = OrderedDict()
    # Default to 0.0 so callers can always read "accuracy", even when
    # the directory is missing or contains no results
    output_results["accuracy"] = 0.0

    try:
        subdirs = listdir_nohidden(directory, sort=True)
    except OSError:
        print(f"No such directory: {directory}")
        return output_results

    outputs = []

    for subdir in subdirs:
        fpath = osp.join(directory, subdir, "log.txt")
        assert check_isfile(fpath), f"Missing log file: {fpath}"
        good_to_go = False
        output = OrderedDict()

        with open(fpath, "r") as f:
            lines = f.readlines()

        for line in lines:
            line = line.strip()

            # Only collect metrics after the end signal has been seen
            if line == end_signal:
                good_to_go = True

            for metric in metrics:
                match = metric["regex"].search(line)
                if match and good_to_go:
                    if "file" not in output:
                        output["file"] = fpath
                    num = float(match.group(1))
                    name = metric["name"]
                    output[name] = num

        if output:
            outputs.append(output)

    if not outputs:
        print(f"Nothing found in {directory}")
        return output_results

    metrics_results = defaultdict(list)

    for output in outputs:
        msg = ""
        for key, value in output.items():
            if isinstance(value, float):
                msg += f"{key}: {value:.2f}%. "
            else:
                msg += f"{key}: {value}. "
            if key != "file":
                metrics_results[key].append(value)
        print(msg)

    print("===")
    print(f"Summary of directory: {directory}")
    for key, values in metrics_results.items():
        avg = np.mean(values)
        std = compute_ci95(values) if args.ci95 else np.std(values)
        print(f"* {key}: {avg:.2f}% +- {std:.2f}%")
        output_results[key] = avg
    print("===")

    return output_results


def main(args, end_signal):
    metric = {
        "name": args.keyword,
        "regex": re.compile(fr"\* {args.keyword}: ([\.\deE+-]+)%"),
    }
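    # With the default keyword "accuracy", the regex above matches log
    # lines such as "* accuracy: 71.23%" (illustrative value) and
    # captures the number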

    if args.type == "base2new":
        all_dataset = b2n_dataset
        final_results = defaultdict(list)   # base-split results
        final_results1 = defaultdict(list)  # new-split results
        # Turn the given directory into a template by replacing the
        # dataset name it contains with "{}", then instantiate the
        # template once per dataset
        pattern = r"\b(" + "|".join(map(re.escape, all_dataset)) + r")\b"
        path_str = re.sub(pattern, "{}", args.directory)
        all_dic = [path_str.format(dataset) for dataset in all_dataset]
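        # Example (hypothetical path layout): passing
        # output/base2new/train_base/imagenet gives the template
        # output/base2new/train_base/{}, which is formatted with each
        # dataset name in all_dataset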
        # Build the directory list for the complementary split
        all_dic1 = []
        if "train_base" in all_dic[0]:
            for p in all_dic:
                all_dic1.append(p.replace("train_base", "test_new"))
        elif "test_new" in all_dic[0]:
            for p in all_dic:
                all_dic1.append(p.replace("test_new", "train_base"))
            # Swap so that all_dic always holds the base ("train_base")
            # directories and all_dic1 the new ("test_new") directories
            all_dic, all_dic1 = all_dic1, all_dic

        for directory in all_dic:
            results = parse_function(
                metric, directory=directory, args=args, end_signal=end_signal
            )
            for key, value in results.items():
                final_results[key].append(value)

        for directory in all_dic1:
            results1 = parse_function(
                metric, directory=directory, args=args, end_signal=end_signal
            )
            for key, value in results1.items():
                final_results1[key].append(value)

        # Per-dataset base/new accuracy and their harmonic mean,
        # h = 2 * base * new / (base + new)
        output_data = []
        for i in range(len(all_dataset)):
            base = final_results["accuracy"][i]
            new = final_results1["accuracy"][i]
            try:
                h = 2 / (1 / base + 1 / new)
            except ZeroDivisionError:
                h = 0
            result = {
                "Dataset": all_dataset[i],
                "Base Accuracy": base,
                "New Accuracy": new,
                "H-Mean": h,
            }
            output_data.append(result)
            print(f"{all_dataset[i]:<20}: base: {base:>6.2f} new: {new:>6.2f} h: {h:>6.2f}")

        output_df = pd.DataFrame(output_data)
        output_file = "form_results_base2new.xlsx"
        output_df.to_excel(output_file, index=False)

        print("Average performance:")
        for key, values in final_results.items():
            avg_base = np.mean(values)
            print(f"* base {key}: {avg_base:.2f}%")

        for key, values in final_results1.items():
            avg_new = np.mean(values)
            print(f"* new {key}: {avg_new:.2f}%")

        # Harmonic mean of the average base and new accuracies
        try:
            avg_h = 2 / (1 / avg_base + 1 / avg_new)
        except ZeroDivisionError:
            avg_h = 0
        print(f"h: {avg_h:.2f}%")
    else:
        if args.type == "fewshot":
            # Few-shot evaluation uses the same datasets as base2new
            all_dataset = b2n_dataset
        elif args.type == "cross":
            all_dataset = cross_dataset
        elif args.type == "dg":
            all_dataset = dg_dataset

        final_results = defaultdict(list)
        # Same path-templating approach as in the base2new branch
        pattern = r"\b(" + "|".join(map(re.escape, all_dataset)) + r")\b"
        path_str = re.sub(pattern, "{}", args.directory)
        all_dic = [path_str.format(dataset) for dataset in all_dataset]

        for directory in all_dic:
            results = parse_function(
                metric, directory=directory, args=args, end_signal=end_signal
            )
            for key, value in results.items():
                final_results[key].append(value)

        output_data = []
        for i in range(len(all_dataset)):
            acc = final_results["accuracy"][i]
            result = {
                "Dataset": all_dataset[i],
                "Accuracy": acc,
            }
            output_data.append(result)
            print(f"{all_dataset[i]:<20}: Accuracy: {acc:>6.2f}")

        output_df = pd.DataFrame(output_data)
        output_file = f"form_results_{args.type}.xlsx"
        output_df.to_excel(output_file, index=False)

        print("Average performance:")
        for key, values in final_results.items():
            avg = np.mean(values)
            print(f"* {key}: {avg:.2f}%")


| if __name__ == "__main__": |
| parser = argparse.ArgumentParser() |
| parser.add_argument("directory", type=str, help="path to directory") |
| parser.add_argument("-type", type=str, |
| choices=['base2new', 'fewshot', 'cross', 'dg'], |
| help="task type:base2new, fewshot, cross, dg") |
| parser.add_argument( |
| "--ci95", action="store_true", help=r"compute 95\% confidence interval" |
| ) |
| parser.add_argument("--test-log", action="store_true", help="parse test-only logs") |
| parser.add_argument( |
| "--multi-exp", action="store_true", help="parse multiple experiments" |
| ) |
| parser.add_argument( |
| "--keyword", default="accuracy", type=str, help="which keyword to extract" |
| ) |
| args = parser.parse_args() |

    # Both training and test-only logs print metrics after "=> result"
    end_signal = "=> result"

    main(args, end_signal)