diff --git a/source_selection/source_selection.py b/source_selection/source_selection.py new file mode 100644 index 0000000000000000000000000000000000000000..2762ffe0980f6fcdf6bcfb29a0932603d8a4c215 --- /dev/null +++ b/source_selection/source_selection.py @@ -0,0 +1,2420 @@ +#!python + +# ------- +# IMPORTS +# ------- + +# General Libraries +import os +import sys +import time +import math +import random +import datetime +import subprocess +import pickle +import copy +import inspect +import itertools +import warnings +import shutil + +# Numpy Library +import numpy as np + +# Making numpy single threaded, so I can multithread this program myself: +os.environ["OMP_NUM_THREADS"] = "1" # export OMP_NUM_THREADS=4 +os.environ["OPENBLAS_NUM_THREADS"] = "1" # export OPENBLAS_NUM_THREADS=4 +os.environ["MKL_NUM_THREADS"] = "1" # export MKL_NUM_THREADS=6 +os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # export VECLIB_MAXIMUM_THREADS=4 +os.environ["NUMEXPR_NUM_THREADS"] = "1" # export NUMEXPR_NUM_THREADS=6 + +# Pandas Library +import pandas as pd +warnings.filterwarnings('ignore', category=FutureWarning, module='seaborn') + +# Scipy +from scipy import stats +from scipy.stats import binomtest, ttest_rel,ttest_ind, wilcoxon +from scipy.stats import gaussian_kde + + + +# Scikit-learn library +from sklearn.pipeline import make_pipeline +from sklearn.metrics import accuracy_score,confusion_matrix,root_mean_squared_error,ConfusionMatrixDisplay,RocCurveDisplay, balanced_accuracy_score,roc_auc_score, f1_score, median_absolute_error +from sklearn.model_selection import LeaveOneGroupOut, KFold, StratifiedKFold, StratifiedShuffleSplit, cross_val_predict, GroupKFold,cross_validate +from sklearn import linear_model +from sklearn import svm +from sklearn import tree +from sklearn import manifold +from sklearn import dummy +from sklearn.preprocessing import StandardScaler,KBinsDiscretizer, PolynomialFeatures,PowerTransformer, LabelBinarizer,RobustScaler, QuantileTransformer, MinMaxScaler +from sklearn import neural_network +from sklearn import discriminant_analysis +from sklearn import neighbors +from sklearn import ensemble +from sklearn.inspection import permutation_importance + + + +# Matplotlib Library +import matplotlib as mpl +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +from matplotlib import colormaps +import matplotlib.colors as mcolors + +# Seaborn +import seaborn as sns + + +mpl.rcParams['font.family'] = "serif" + +# Random +random_state = 1234 +rng = np.random.default_rng(seed=1234) + +# ----------------- +# GLOBAL PARAMETERS +# ----------------- + +# Label in the loaded dataframes. +DATA_FOR_LABEL = 'Accuracy_after_transfer_learning' # 'Benefit_transfer_learning' or Accuracy_after_transfer_learning + + +LOWER_ACCURACY_LIMIT = 0.65 # Lower limit for intra-subject performance for subject to be included. +UPPER_ACCURACY_LIMIT = 1.1 # Upper limit for intra-subject performance for subject to be included. 1 is max +NBR_USERS_FOR_VISUALIZATION = None # If None, use all. + + +# Datafolder to load data from +# Identifying name for the data. used in folder and figure names. +DATA_FOLDER = 'data/data_all_users_hands_feet' +NAME_STRING_DATA = 'feet_hands' + +# DATA_FOLDER = 'data/data_all_users_left_right' +# NAME_STRING_DATA = 'left_right' + + + + +# Type of splitter for cross validation and number of folds. +SPLITTER = 'group_k_fold' # 'leave_one_group_out', 'group_k_fold', 'random' +NBR_FOLDS = 10 + + +# Methods to be included in max of methods method. 
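One caveat about the import block above: the OMP/BLAS thread-count variables are generally only honoured if they are set before NumPy first initialises its BLAS backend, so exporting them after `import numpy as np` may silently have no effect. A minimal sketch of the safer ordering (same variables, shown only as an illustration):

```python
# Sketch: pin BLAS/OpenMP thread pools *before* NumPy is imported so the
# limits are actually picked up (assumes single-threaded math is wanted
# because parallelism is handled at the process level instead).
import os

for var in ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS", "NUMEXPR_NUM_THREADS"):
    os.environ[var] = "1"

import numpy as np  # imported only after the limits are in place
```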
+METHODS_IN_MAX_OF_MEHTODS = ['best_teacher','best_source_accuracy','my_method'] # 'random','best_teacher','best_source_accuracy','my_method' +# How many samples from the max of methods method to analyze. +NBR_OF_VALUES_FOR_EACH_METHOD_IN_MAX_OF_METHODS = [1,2,3,4,5] + +# Name for folder where data is to be stored. (for example trained estimators) +DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS = f'{NAME_STRING_DATA}_lower_accuracy_limit_{LOWER_ACCURACY_LIMIT}' + +# What to plot +PLOTTING_EVALUATION = True # Final evaluation of methods +PLOTTING_ESTIMATORS = True # Estimation results +PLOTTING_DATA = False # Plott scatterplits and boxplot of data +IMPORTANCE = False # Visualize feature impoartance for estimator + +# SCORING_FUNCTION = balanced_accuracy_score +SCORING_FUNCTION = accuracy_score +MAX_VALUES_FOR_EACH_METHOD_IN_BASELINE_COMPARISION = 21 + +INDIVIDUAL_ESTIMATORS_FOR_TARGET_SUBJECTS={} + + + +FEATURES = [ + 'Target:Class1-Target:Class2', + 'Source:Class1-Source:Class2', + 'Target:Class1-Source:Class1', + 'Target:Class1-Source:Class2', + + 'Target:Class2-Source:Class1', + 'Target:Class2-Source:Class2', + + 'Target:Dispersion:all', + 'Target:Dispersion:Class1', + 'Target:Dispersion:Class2', + + 'Source:Dispersion:all', + 'Source:Dispersion:Class1', + 'Source:Dispersion:Class2', + + + 'Source_accuracy', + # 'Target_accuracy', + + 'diff_class_1_class_2_source_target', + 'diff_dispersion_class_1_source_target', + 'diff_dispersion_class_2_source_target', + 'diff_class_1_target_to_source', + 'diff_class_2_target_to_source', + + # 'abs_diff_class_1_class_2_source_target', + # 'abs_diff_dispersion_class_1_source_target', + # 'abs_diff_dispersion_class_2_source_target', + + # 'squared_diff_class_1_class_2_source_target', + # 'squared_diff_dispersion_class_1_source_target', + # 'squared_diff_dispersion_class_2_source_target', + + # 'power_diff_class_1_class_2_source_target', + # 'power_diff_dispersion_class_1_source_target', + # 'power_diff_dispersion_class_2_source_target', + ] + + + +print(FEATURES) + + + +ALL_FEATURES = ['Target:Class1-Target:Class2', + 'Target:Center-Target:Class1', + 'Target:Center-Target:Class2', + 'Source:Class1-Source:Class2', + 'Source:Center-Source:Class1', + 'Source:Center-Source:Class2', + 'Target:Center-Source:Center', + 'Target:Center-Source:Class1', + 'Target:Center-Source:Class2', + 'Target:Class1-Source:Center', + 'Target:Class1-Source:Class1', + 'Target:Class1-Source:Class2', + 'Target:Class2-Source:Center', + 'Target:Class2-Source:Class1', + 'Target:Class2-Source:Class2', + 'Source:Dispersion:all', + 'Source:Dispersion:Class1', + 'Source:Dispersion:Class2', + 'Target:Dispersion:all', + 'Target:Dispersion:Class1', + 'Target:Dispersion:Class2', + ] + + + + + +ALL_CLFS = [] + +CLF_EVALUATION = [] + +# ----------- Regressors ----------- + + +regs_to_add = [ +('Dummy','Dummy','reg',make_pipeline(dummy.DummyRegressor(strategy='mean'))), + +# ('SVR','SVR','reg',make_pipeline(QuantileTransformer(output_distribution='uniform',random_state=random_state),svm.SVR(kernel='rbf',C=0.1))), +# ('SVR_1','SVR_1','reg',make_pipeline(RobustScaler(),svm.SVR(kernel='rbf',C=0.1))), + +('NN_1','(50,50)','reg',make_pipeline(RobustScaler(),neural_network.MLPRegressor(solver='lbfgs',activation='relu',hidden_layer_sizes=(50,50),alpha=0.0001,learning_rate='constant',random_state=random_state,max_iter=10000))), +# 
('NN_2','(100,50,25,12)','reg',make_pipeline(RobustScaler(),neural_network.MLPRegressor(solver='lbfgs',activation='relu',hidden_layer_sizes=(100,50,25,12),alpha=0.0001,learning_rate='constant',random_state=random_state,max_iter=10000))), +# ('NN_3','(50,25)','reg',make_pipeline(RobustScaler(),neural_network.MLPRegressor(solver='lbfgs',activation='relu',hidden_layer_sizes=(50,25),alpha=0.0001,learning_rate='constant',random_state=random_state,max_iter=10000))), +] + + +[ALL_CLFS.append(clf) for clf in regs_to_add] +[CLF_EVALUATION.append((name,type_clf)) for (name,_,type_clf,_) in regs_to_add] + + + +print(__doc__) + +class Logger(object): + """For saving everything printed to file as well.""" + def __init__(self, filename="console_log.txt"): + """Initialize the class.""" + self.terminal = sys.stdout + self.log = open(filename, "a") + def write(self, message): + """Write both to stdout and to file.""" + self.terminal.write(message) + self.log.write(message) + def flush(self): + """We don't do flush.""" + pass + +def print_git_version(): + """We want to know what version of the source code we're dealing with.""" + # Ideally, commit to git _every time_ before you run the code. + try: + git_output = subprocess.check_output("git log -n 1", shell=True, stderr=subprocess.STDOUT) + print("\n\n### git log -n 1\n%s\n\n" % git_output.decode("utf-8")) + git_output2 = subprocess.check_output("git status", shell=True, stderr=subprocess.STDOUT) + print("### git status\n%s\n\n" % git_output2.decode("utf-8")) + git_output3 = subprocess.check_output("git diff", shell=True, stderr=subprocess.STDOUT) + print("### git diff\n%s\n\n\n" % git_output3.decode("utf-8")) + except: + print("\n\n### WARNING: The code you're running is NOT under GIT version control. You can do better. Behave.\n\n") + +def print_source_code(): + print("\n\n###\n### Listing the python source code used, from the file '%s':\n###\n\n" % __file__) + with open(__file__, 'r') as f: + print(f.read()) + print("\n\n###\n### End of python source code from the file '%s'.\n###\n\n" % __file__) + + +def write_description(file_name='description'): + with open('{}/{}.txt'.format(run_logdir,file_name), 'a') as f: + # Write some text to the file + f.write("This is a description file.\n") + + f.write('\nNBR_FOLDS: {}\n'.format(NBR_FOLDS)) + f.write('\nDATA_FOLDER: {}\n'.format(DATA_FOLDER)) + f.write(f'\nNAME_STRING_DATA: {NAME_STRING_DATA}') + f.write(f'\nLOWER_ACCURACY_LIMIT: {LOWER_ACCURACY_LIMIT}') + f.write(f'\n\nFEATURES:\n') + for feat in FEATURES: + f.write(f'\t{feat}\n') + + f.write('\n\n===== Classifier(s): =====') + for (name,description,clf_type,clf) in ALL_CLFS: + f.write('\n\n\t* {} : {} \n'.format(name,clf_type)) + f.write('\t ' + str(clf)) + f.write('\n\t {}'.format(description)) + + # if PLOTTING_EMBEDDINGS: + # f.write('\n\n\n\n===== Visualiser(s): =====') + # for (name,vis) in ALL_VISUALISERS: + # f.write('\n\n\t* {} \n'.format(name)) + # f.write('\t ' + str(vis)) + +def save_source_code(run_logdir): + print("\n\n###\n### Saving the python source code used, from the file '%s':\n###" % __file__) + shutil.copy2(__file__, f'{run_logdir}/_script.py') + + +# ----------------------------------------------- + + + + +def load_saved_data(data_folder='data', filter_same_target_source=True, plotting=False, name='', lower_accuracy_limit=-0.1, upper_accuracy_limit=1.1, nbr_subjects_for_visualization=None): + """ + Loads and processes saved data from a specified folder, filtering and augmenting the data as necessary for analysis. 
+ + This function retrieves previously saved data (in the form of pickled DataFrames) from a specified folder, performs + filtering to remove intra-cluster (same target-source) entries, adds new computed features, and optionally plots accuracy + matrices. It is also possible to restrict the data to clusters within a given accuracy range for intra subject performance. + + Parameters + ---------- + data_folder : str, optional + The folder where the saved data is located. Default is 'data'. + + filter_same_target_source : bool, optional + Whether to filter out rows where the target and source clusters are the same (intra-cluster). Default is True. + + plotting : bool, optional + Whether to generate and save plots of accuracy matrices. Default is False. + + name : str, optional + A name for labeling the output files or plots. Default is ''. + + lower_accuracy_limit : float, optional + The lower limit of accuracy to filter clusters. Default is -0.1. Max <1 + + upper_accuracy_limit : float, optional + The upper limit of accuracy to filter clusters. Default is 1.1. Min >0 + + nbr_subjects_for_visualization : int, optional + The number of subjects to randomly select for visualization. If None, all subjects are used. Default is None. + + Returns + ------- + df : pandas.DataFrame + A DataFrame containing the filtered and augmented data, including computed features for transfer learning. + + random_selection_of_users : ndarray or None + A random selection of target clusters for visualization, or None if visualization is not performed. + + Notes + ----- + - The function loads pickled data files from the specified folder and concatenates them into a single DataFrame. + - Additional computed features, such as differences between classes and dispersions, are added to the DataFrame. + - The function filters out clusters with accuracy values outside the specified accuracy range and removes intra-cluster rows + if `filter_same_target_source` is True. + - If `plotting` is enabled, accuracy matrices are generated and saved, with the option to visualize only a subset of subjects. + - The function deletes temporary data structures to free up memory after processing. + """ + data_folder_path = './{}/'.format(data_folder) + + # Check if folder exist. + if not os.path.isdir(data_folder_path): + print('Not a folder: {}'.format(data_folder_path)) + import pdb; pdb.set_trace() + + # Load list of cluster names/subjects used. + pickle_file_path = os.path.join(data_folder_path, 'list_of_cluster_names.pickle') + with open(pickle_file_path, 'rb') as file: + list_of_cluster_names = pickle.load(file) + + # Load the number of clusters used if stored, otherwise get from list of cluster names. + try: + pickle_file_path = os.path.join(data_folder_path, 'nbr_of_clusters.pickle') + with open(pickle_file_path, 'rb') as file: + nbr_of_clusters = pickle.load(file) + except: + nbr_of_clusters = len(list_of_cluster_names) + + # Placeholder for data. + all_df_data = [] + + + + # Load stored dataframes with data. + for i, subdir in enumerate(list_of_cluster_names): + print('Loading data for {}'.format(subdir)) + subdir_path = os.path.join(data_folder_path, subdir) + + # Check if the path is indeed a directory + if not os.path.isdir(subdir_path): + print('Not a folder: {}'.format(subdir)) + continue # There might be a .DS_store. 
+ + # == df_data + pickle_file_path = os.path.join(subdir_path, 'df_data.pickle') + + # Check if the pickle file exists + if os.path.exists(pickle_file_path): + # df_data_exist = True + with open(pickle_file_path, 'rb') as file: + data_here = pickle.load(file) + all_df_data.append(data_here) + + + + # Create dataframe of all retrieved data. + df = pd.concat(all_df_data,ignore_index=True) + + # Remove all intra subject data. + if filter_same_target_source: + df = df[df['Target_cluster'] != df['Source_cluster']] + df = df.reset_index(drop=True) + + # Save the matrix of accuracies for all subjects which is to be used a lot in the code. + save_matrix_of_accuracy_for_all_sub(df) + + # Add source accuracy as feature and remove clusters with too high/low accuracy. + # Calculate the mean 'Target_accuracy' for each 'Target_cluster' + target_acc = df.groupby('Target_cluster')['Target_accuracy'].mean() + # Filter out clusters outside the specified accuracy range + valid_clusters = target_acc[(target_acc >= lower_accuracy_limit) & (target_acc <= upper_accuracy_limit)].index + # Create a mask for rows where 'Target_cluster' or 'Source_cluster' is not in valid_clusters + invalid_mask = df['Target_cluster'].isin(valid_clusters) & df['Source_cluster'].isin(valid_clusters) + # Drop rows where either 'Target_cluster' or 'Source_cluster' are not in valid_clusters + df = df[invalid_mask] + # Reset index after filtering + df.reset_index(drop=True, inplace=True) + # Map 'Source_cluster' to their corresponding accuracy + df['Source_accuracy'] = df['Source_cluster'].map(target_acc) + + + + # ADD DIFF FEATURES + df['diff_class_1_class_2_source_target'] = df['Target:Class1-Target:Class2'] - df['Source:Class1-Source:Class2'] + df['diff_class_1_target_to_source'] = df['Target:Class1-Source:Class2'] - df['Target:Class1-Source:Class1'] + df['diff_class_2_target_to_source'] = df['Target:Class2-Source:Class1'] - df['Target:Class2-Source:Class2'] + df['diff_dispersion_class_1_source_target'] = df['Target:Dispersion:Class1'] - df['Source:Dispersion:Class1'] + df['diff_dispersion_class_2_source_target'] = df['Target:Dispersion:Class2'] - df['Source:Dispersion:Class2'] + df['abs_diff_class_1_class_2_source_target'] = np.abs(df['Target:Class1-Target:Class2'] - df['Source:Class1-Source:Class2']) + df['abs_diff_dispersion_class_1_source_target'] = np.abs(df['Source:Dispersion:Class1'] - df['Target:Dispersion:Class1']) + df['abs_diff_dispersion_class_2_source_target'] = np.abs(df['Source:Dispersion:Class2'] - df['Target:Dispersion:Class2']) + df['squared_diff_class_1_class_2_source_target'] = (df['Target:Class1-Target:Class2'] - df['Source:Class1-Source:Class2'])**2 + df['squared_diff_dispersion_class_1_source_target'] = (df['Source:Dispersion:Class1'] - df['Target:Dispersion:Class1'])**2 + df['squared_diff_dispersion_class_2_source_target'] = (df['Source:Dispersion:Class2'] - df['Target:Dispersion:Class2'])**2 + df['power_diff_class_1_class_2_source_target'] = (df['Target:Class1-Target:Class2'] - df['Source:Class1-Source:Class2'])**3 + df['power_diff_dispersion_class_1_source_target'] = (df['Source:Dispersion:Class1'] - df['Target:Dispersion:Class1'])**3 + df['power_diff_dispersion_class_2_source_target'] = (df['Source:Dispersion:Class2'] - df['Target:Dispersion:Class2'])**3 + + + # Pick subjects to use for visualization. 
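For readers skimming the filtering block above: the groupby/isin/map pattern keeps a (target, source) pair only when both clusters fall inside the configured accuracy window, and then attaches the source cluster's own intra-subject accuracy as the `Source_accuracy` feature. A toy sketch of the same pattern (cluster names and accuracies invented):

```python
import pandas as pd

# Toy frame with the columns the filter operates on; values are invented.
df = pd.DataFrame({
    "Target_cluster":  ["s1", "s1", "s2", "s2", "s3", "s3"],
    "Source_cluster":  ["s2", "s3", "s1", "s3", "s1", "s2"],
    "Target_accuracy": [0.80, 0.80, 0.70, 0.70, 0.55, 0.55],
})

target_acc = df.groupby("Target_cluster")["Target_accuracy"].mean()
valid = target_acc[(target_acc >= 0.65) & (target_acc <= 1.1)].index  # drops s3

# Keep a row only when *both* ends of the pair are valid clusters.
mask = df["Target_cluster"].isin(valid) & df["Source_cluster"].isin(valid)
df = df[mask].reset_index(drop=True)

# Attach the source's intra-subject accuracy as a feature.
df["Source_accuracy"] = df["Source_cluster"].map(target_acc)
print(df)  # two rows remain: s1<-s2 and s2<-s1
```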
+ if nbr_subjects_for_visualization is None: + random_selection_of_users = None + else: + random_selection_of_users = np.unique(df['Target_cluster']) + random_selection_of_users = rng.choice(random_selection_of_users,size=nbr_subjects_for_visualization, replace=False) + + # Plot figures of accuracy matrix. + if plotting: + if random_selection_of_users is None: + figsize = (nbr_of_clusters*1.5,nbr_of_clusters*1.5) + if nbr_of_clusters > 20: + plt.rcParams.update({'font.size': 20}) + else: + figsize = (10,10) + + plot_four_accuracy_matrix_sorted(df, random_selection_of_users=random_selection_of_users,run_logdir=run_logdir,sorting='row_col_sum') + plot_one_accuracy_matrix_sorted(df, random_selection_of_users=random_selection_of_users,run_logdir=run_logdir,sorting='row_col_sum') + + # Delete data that is not to be used anymore. + del all_df_data + + return df, random_selection_of_users + + +def save_matrix_of_accuracy_for_all_sub(data): + """ + Generates and saves accuracy matrices for all target and source clusters if they do not already exist. + + This function creates a matrix of transfer learning accuracies between target and source clusters. It also computes + the average accuracy for each target cluster. These matrices are saved as pickled files in a designated directory. + If the matrices already exist, the function skips the generation process. + + Parameters + ---------- + data : pandas.DataFrame + A DataFrame containing the transfer learning data, with 'Target_cluster', 'Source_cluster', 'Target_accuracy', and + 'Accuracy_after_transfer_learning' columns. + + Returns + ------- + None + This function does not return any values but saves matrices of accuracies to disk. + + Notes + ----- + - The function checks if the matrices already exist. If they do, no further processing is done. + - If the matrices do not exist, it generates the following matrices: + 1. `matrix_of_accuracies`: A matrix where each entry represents the transfer learning accuracy from a source cluster to a target cluster. + 2. `target_accuracy`: A vector containing the intra-subject accuracy for each target cluster. + 3. `subj_order_matrix_of_accuracies`: A vector containing the order of unique target clusters. + - The matrices are saved as pickle files in the `data/estimators/{DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS}/matrices` directory. + - The function creates the directory if it does not exist. + """ + # Load stored matrices + full_file_path_1 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'matrix_of_accuracies_all_subj.pkl') + full_file_path_2 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'target_accuracy_all_subj.pkl') + full_file_path_3 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'subj_order_matrix_of_accuracies_all_subj.pkl') + if os.path.exists(full_file_path_1) and os.path.exists(full_file_path_2) and os.path.exists(full_file_path_3): + print(f'Matrices for all data exist') + # Do nothing, return. + # Generate matrices if they do not exist. + else: + print(f'No matrix_of_accuracies_all_subj found at {full_file_path_1}, creating...') + unique_clusters = np.unique(data['Target_cluster']) + # Placeholders + target_accuracy = np.zeros(len(unique_clusters)) + matrix_of_accuracies = np.zeros((len(unique_clusters),len(unique_clusters))) + # For every target user for every source user fint the transfer learning accuracy and store in the matrix. 
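The nested loop below fills `matrix_of_accuracies` one (target, source) cell at a time. An equivalent vectorized sketch using a pandas pivot table is shown here for orientation only; it assumes the same column names but is not the code this script actually runs:

```python
import numpy as np
import pandas as pd

def accuracy_matrix_sketch(data):
    """Vectorized sketch of the matrix the nested loop builds cell by cell."""
    clusters = np.unique(data["Target_cluster"])
    pivot = (data.pivot_table(index="Target_cluster", columns="Source_cluster",
                              values="Accuracy_after_transfer_learning",
                              aggfunc="mean")
                 .reindex(index=clusters, columns=clusters))
    matrix = pivot.to_numpy()
    target_accuracy = (data.groupby("Target_cluster")["Target_accuracy"]
                           .mean().reindex(clusters).to_numpy())
    np.fill_diagonal(matrix, target_accuracy)  # diagonal = intra-subject accuracy
    return matrix, target_accuracy, clusters
```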
+ for row,(target_cluster) in enumerate(unique_clusters): + for col,(source_cluster) in enumerate(unique_clusters): + if target_cluster == source_cluster: + target_accuracy[row] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster]) + matrix_of_accuracies[row,col] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster]) + continue + + data_here = data[(data['Target_cluster']==target_cluster)] # Get data where target=target and source=source. + data_here = data_here[(data_here['Source_cluster']==source_cluster)] + matrix_of_accuracies[row,col] = np.mean(data_here['Accuracy_after_transfer_learning']) + + # Create the directory if it doesn't exist + os.makedirs(f'data/estimators/{DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS}/matrices', exist_ok=True) + # Save the file + if not os.path.exists(full_file_path_1): + with open(full_file_path_1, 'wb') as f: + pickle.dump(matrix_of_accuracies, f) + print('Saving matrix_of_accuracies') + if not os.path.exists(full_file_path_2): + with open(full_file_path_2, 'wb') as f: + pickle.dump(target_accuracy, f) + print('Saving target_accuracy') + if not os.path.exists(full_file_path_3): + with open(full_file_path_3, 'wb') as f: + pickle.dump(unique_clusters, f) + print('Saving subj_order_matrix_of_accuracies') + + return + + + + +# ------------------------------------------------------------ +# -------------------- REGRESSORS ---------------------------- +# ------------------------------------------------------------ + + +def run_one_regressor(df_data, clf, save_file_name='reg', run_logdir='.', plotting=False, nbr_folds=5): + """ + Runs a regression model on the provided data using cross-validation and optionally evaluates feature importance. + + This function applies a specified regressor (classifier or pipeline) to the input data, performs cross-validation using a + defined splitting strategy, and saves the model estimators for each fold. If an estimator already exists for a fold, it + is loaded from disk. Optionally, feature importance is calculated, and the results are then visualized. + + Parameters + ---------- + df_data : pandas.DataFrame + The input data containing features, labels, target clusters, and source clusters for training and testing. + + clf : estimator object + A regressor or pipeline implementing 'fit' and 'predict' methods, used to train and test on the provided data. + + save_file_name : str, optional + The name used for saving the trained models and any generated plots. Default is 'reg'. + + run_logdir : str, optional + The directory where any plots or models will be saved. Default is the current directory ('.'). + + plotting : bool, optional + Whether to plot regression results. Default is False. + + nbr_folds : int, optional + The number of cross-validation folds. Default is 5. + + Returns + ------- + score : float + The root mean squared error (RMSE) of the model across all cross-validation folds. + + Notes + ----- + - The cross-validation splitter is determined by the global variable `SPLITTER`. Options include 'leave_one_group_out', + 'group_k_fold', or 'random'. + - For each fold, the model is trained on the source data and tested on the target data. If a trained model for the target + cluster already exists, it is loaded from disk to avoid re-training. + - Model estimators are saved in `data/estimators/{DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS}/saved_estimators/{save_file_name}`. 
+ - Feature importance is evaluated using permutation importance if `IMPORTANCE` is set, and the results are plotted and saved. + - RMSE is used as the evaluation metric for the regression model's performance. + - If `plotting` is enabled, the regression results are visualized and saved to the specified directory. + """ + + # extract data to variables. + X = df_data[FEATURES] + y = df_data[DATA_FOR_LABEL] + groups = df_data['Target_cluster'] + source_clusters = df_data['Source_cluster'] + + + # Specify splitter function for cross evaluation. + if SPLITTER == 'leave_one_group_out': + cv_splitter = LeaveOneGroupOut() + elif SPLITTER == 'group_k_fold': + cv_splitter = GroupKFold(n_splits=nbr_folds) + elif SPLITTER == 'random': + cv_splitter = KFold(n_splits=nbr_folds) + else: + raise Exception("Wrong value for SPLITTER") + + # Placeholders + df_importance = pd.DataFrame() + y_all_folds =[] + y_pred_all_folds = [] + INDIVIDUAL_ESTIMATORS_FOR_TARGET_SUBJECTS[save_file_name]={} + + # Iterate over folds. + for i, (train_index, test_index) in enumerate(cv_splitter.split(X, y, groups)): + # Extract training and test data indices + # Remove target data from source data. + if SPLITTER == 'leave_one_group_out': # remove samples where target is used as source. + target_here = groups[test_index[0]] + train_index = train_index[source_clusters[train_index]!=target_here] # Train index without target subject + elif SPLITTER == 'group_k_fold': + target_here = np.unique(groups[test_index]) + for t in target_here: + train_index = train_index[source_clusters[train_index]!=t] + test_index = test_index[source_clusters[test_index]!=t] + + # Extract training data + X_train = X.iloc[train_index] + y_train = y.iloc[train_index] + # Generate name for test data to save estimators. + names = np.unique(groups.iloc[test_index]) + names = '_'.join(names) + full_file_path_saved_estimator = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'saved_estimators',save_file_name, names + '.pkl') + # Check if the file exists + if os.path.exists(full_file_path_saved_estimator): + # Load the pipeline + with open(full_file_path_saved_estimator, 'rb') as f: + estimator = pickle.load(f) + print(f'Estimator loaded from {full_file_path_saved_estimator}') + estimator_loaded = True + else: + print(f'No estimator found at {full_file_path_saved_estimator}, training...') + estimator = clf + estimator.fit(X_train,y_train) + estimator_loaded = False + + # Extract test data + X_test = X.iloc[test_index] + y_test = y.iloc[test_index] + y_all_folds.append(y_test) + # Do prediction. + y_pred = estimator.predict(X_test) + y_pred_all_folds.append(y_pred) + + + # Save the estimator for use later. + INDIVIDUAL_ESTIMATORS_FOR_TARGET_SUBJECTS[save_file_name][names] = copy.deepcopy(estimator) + if not estimator_loaded: + # Create the directory if it doesn't exist + os.makedirs(f'data/estimators/{DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS}/saved_estimators/{save_file_name}', exist_ok=True) + + # Save the pipeline + with open(full_file_path_saved_estimator, 'wb') as f: + pickle.dump(estimator, f) + + # If the feature importance is beeing evaluated do the evaluation + if IMPORTANCE: + result = permutation_importance(estimator, X_test, y_test, n_repeats=5, random_state=42, n_jobs=2) + df_importance = pd.concat([df_importance,pd.DataFrame(result.importances.T,columns=FEATURES)],ignore_index=True) + + # If the feature importance is beeing evaluated plot the evaluation results. 
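The fold loop above guards against leakage by dropping every training (and test) row whose source cluster is one of the held-out target clusters. A compact sketch of that filter around `GroupKFold`, with illustrative argument names:

```python
import numpy as np
from sklearn.model_selection import GroupKFold

def leakage_free_splits(X, groups, sources, n_splits=5):
    """Sketch: GroupKFold splits where rows whose source cluster is a held-out
    target are also removed, mirroring the filter in the fold loop above."""
    groups = np.asarray(groups)
    sources = np.asarray(sources)
    for train_idx, test_idx in GroupKFold(n_splits=n_splits).split(X, groups=groups):
        held_out = np.unique(groups[test_idx])
        train_idx = train_idx[~np.isin(sources[train_idx], held_out)]
        test_idx = test_idx[~np.isin(sources[test_idx], held_out)]
        yield train_idx, test_idx
```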
+ if IMPORTANCE: + fig, ax = plt.subplots(figsize=(10,7),layout='constrained') + df_importance.boxplot(vert=False, whis=10,ax=ax) + ax.set_title("Permutation Importances (test set)") + ax.axvline(x=0, color="k", linestyle="--") + ax.set_xlabel("Decrease in accuracy score") + save_string = save_file_name # Filename + plt.savefig(f'{run_logdir}/importance_cla_{save_string}.png') # Save the figure in the specified directory + plt.close(fig) + + # Evaluate RMSE for estimator. + y_all_folds = np.concatenate(y_all_folds) + y_pred_all_folds = np.concatenate(y_pred_all_folds) + score = root_mean_squared_error(y_all_folds,y_pred_all_folds) + + # Plot regression results + if plotting: + plotting_regression_results(y_all_folds,y_pred_all_folds,save_file_name=f'{save_file_name}',run_logdir=run_logdir,score=score) + + + return score + + + +def plotting_regression_results(y_true, y_pred, groups=None, save_file_name='fig', run_logdir='.', score=0): + """ + Plots regression results, including observed vs. predicted values and residuals, and saves the plots. + + This function generates two subplots: one showing the actual vs. predicted values and another showing the residuals + vs. predicted values. If groups (such as cross-validation folds) are provided, points are color-coded by group. + The resulting figure is saved to the specified directory. + + Parameters + ---------- + y_true : ndarray + The true (observed) values of the target variable. + + y_pred : ndarray + The predicted values from the regression model. + + groups : ndarray, optional + An array representing group labels (e.g., cross-validation folds) to color-code the scatter plot. + If None, all points are plotted in a single color. Default is None. + + save_file_name : str, optional + The name used for saving the plot. Default is 'fig'. + + run_logdir : str, optional + The directory where the plot will be saved. Default is the current directory ('.'). + + score : float, optional + The root mean squared error (RMSE) of the model, used in the plot title. Default is 0. + + Returns + ------- + None + The function generates and saves the plot to disk but does not return any values. + + Notes + ----- + - The first subplot shows the actual (y_true) vs. predicted (y_pred) values, ideally forming a diagonal line. + - The second subplot shows residuals (y_true - y_pred) vs. predicted values to visualize model errors. + - If `groups` is provided, points are color-coded based on the group labels. + - The figure is saved in the format `reg_{save_file_name}.png` in the specified `run_logdir`. + """ + # Create figure + fig, axs = plt.subplots(figsize=(14,7),ncols=2,layout='constrained') + + # Plot the observed values vs the true values. Shold ideally be a diagonal. + plt.sca(axs[0]) + if groups is None: + plt.scatter(y_pred,y_true,color='gray', alpha=0.7) + else: # if there sould be some coloring of the plot based on the folds + uniqe_groups = np.unique(groups) + for g in uniqe_groups: + plt.scatter(y_pred[groups==g],y_true[groups==g], alpha=0.7,label=g) + plt.legend() + plt.plot(y_pred,y_pred,color='black',linestyle=':') + plt.title('Actual vs Predicted values') + plt.xlabel('Predicted value') + plt.ylabel('Observed value') + + # Plot residuals. 
+ plt.sca(axs[1]) + if groups is None: + res = y_true-y_pred + plt.scatter(y_pred,res,color='gray', alpha=0.7) + else: # if there sould be some coloring of the plot based on the folds + uniqe_groups = np.unique(groups) + for g in uniqe_groups: + res = y_true[groups==g]-y_pred[groups==g] + plt.scatter(y_pred[groups==g],res, alpha=0.7,label=g) + plt.legend() + plt.plot([np.min(y_pred),np.max(y_pred)],[0,0],color='black',linestyle=':') + plt.title('Residuals vs Predicted values') + plt.xlabel('Predicted value') + plt.ylabel('Residuals') + + plt.suptitle('Cross validated results for {}_RMSE_{:4.4f}'.format(save_file_name,score)) + + # Save fig + save_string = save_file_name # Filename + plt.savefig('{}/reg_{}.png'.format(run_logdir,save_string)) # Save the figure in the specified directory + plt.close(fig) + return + + + + + + + + + +# --------------------------------------------------------------- +# -------------------- RUN ALL ---------------------------------- +# --------------------------------------------------------------- + + + + +def run_all_classifiers(data, clfs, run_logdir='.', plotting=False, nbr_folds=5): + """ + Runs multiple regression models, evaluates their performance, and prints the results. + + This function iterates through a list of classifiers or regressors, running each regressor on the provided data + using cross-validation. It collects and prints the root mean squared error (RMSE) for each model and displays the + top-performing regressors based on RMSE. + + Parameters + ---------- + data : pandas.DataFrame + The input data used for training and testing the regressors, including features and target values. + + clfs : list of tuples + A list of classifiers/regressors, where each tuple contains: + - name (str): The name of the classifier/regressor. + - description (str): A description of the model. + - type_clf (str): The type of model ('clf' for classifiers, 'reg' for regressors). + - clf (estimator object): The classifier or regressor object to be used. + + run_logdir : str, optional + The directory where any results or plots will be saved. Default is the current directory ('.'). + + plotting : bool, optional + Whether to generate plots for the regression results. Default is False. + + nbr_folds : int, optional + The number of cross-validation folds. Default is 5. + + Returns + ------- + None + The function prints the RMSE for each regressor and lists the top 5 regressors based on performance. It does not return any values. + + Notes + ----- + - For each regressor, the function calls `run_one_regressor` to train, test, and evaluate the model. + - Only regressors (type_clf='reg') are evaluated; classifiers (type_clf='clf') are skipped in this function. + - The RMSE scores for each regressor are collected and sorted in ascending order, with the top 5 results printed. + - If `plotting` is enabled, regression result plots are generated and saved in the specified directory. + """ + # Placeholders + all_scores = [] + regressors = [] + + #Iterate over all estimators. 
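The loop below walks over `(name, description, type, estimator)` tuples like those registered in `regs_to_add` / `ALL_CLFS`. A hedged sketch of adding one more regressor in that format; the gradient-boosting choice is purely illustrative and not part of the original configuration:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn import ensemble

# One more entry in the (name, description, type, estimator) format the loop consumes.
extra_reg = ("GBR", "GradientBoostingRegressor", "reg",
             make_pipeline(RobustScaler(),
                           ensemble.GradientBoostingRegressor(random_state=1234)))

# In the script above it would be registered the same way as regs_to_add:
#   ALL_CLFS.append(extra_reg)
#   CLF_EVALUATION.append((extra_reg[0], extra_reg[2]))
```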
+ for (name,description,type_clf,clf) in clfs: + print('\n\n\n') + print(f'===== {name} =====') + print(description) + + if type_clf == 'clf': + pass + elif type_clf == 'reg': + score = run_one_regressor(data,clf,save_file_name=name,run_logdir=run_logdir,plotting=plotting,nbr_folds=nbr_folds) + text = f'RMSE {score:4.4f} for {name} - {description}' + print(text) + all_scores.append(text) + regressors.append({"name": name, "score": score}) + + # Print results for estimators + print('\n --- Results ---') + [print(item) for item in all_scores] + + sorted_regressors = sorted(regressors, key=lambda x: (x["score"]), reverse=False) + print('\nTop 5 regressors based on RMSE:') + [print(f"{item['name']:15} - score={item['score']:4.4f}") for item in sorted_regressors[:5]] + + print('\n') + return + + + + +def data_plotting(data, run_logdir='.'): + """ + Generates descriptive statistics and various plots for data visualization and saves the plots to the specified directory. + + This function first prints summary statistics of the data and then generates a series of plots for exploratory data analysis (EDA), + including boxplots and scatter plots of the features. The scatter plots are also visualized after preprocessing with `RobustScaler`. + + Parameters + ---------- + data : pandas.DataFrame + The input data containing features and target labels for visualization. + + run_logdir : str, optional + The directory where the plots will be saved. Default is the current directory ('.'). + + Returns + ------- + None + The function generates and saves multiple plots to disk but does not return any values. + + Notes + ----- + - The function prints the descriptive statistics of the data, including the output of `data.describe()` and the first few rows + of the DataFrame. + - It generates two boxplots: one for all features in the data and another for a subset of features including the target label. + - Scatter plots are generated to visualize pairwise relationships between features and the target label. Two versions of the scatter plots + are created: + 1. Using the raw data. + 2. After scaling the data with `RobustScaler`. + - The plots are saved as PNG files in the specified directory, with filenames like 'boxplot_0.png', 'boxplot_1.png', and + 'data_visualisation.png'. 
+ """ + print(data.describe()) + + for col in list(data.describe().columns): + print(data.describe()[col]) + print('\n') + print(data.head()) + + + + # Boxplot + fig, ax = plt.subplots(figsize=(10,7),layout='constrained') + data.boxplot(vert=False, whis=10,ax=ax) + ax.set_title("Values of features") + ax.axvline(x=0, color="k", linestyle="--") + ax.set_xlabel("Values") + save_string = 'boxplot_0' # Filename + plt.savefig(f'{run_logdir}/{save_string}.png') # Save the figure in the specified directory + plt.close(fig) + + + # Boxplot + fig, ax = plt.subplots(figsize=(10,7),layout='constrained') + data[FEATURES+[DATA_FOR_LABEL]].boxplot(vert=False, whis=10,ax=ax) + ax.set_title("Values of features") + ax.axvline(x=0, color="k", linestyle="--") + ax.set_xlabel("Values") + save_string = 'boxplot_1' # Filename + plt.savefig(f'{run_logdir}/{save_string}.png') # Save the figure in the specified directory + plt.close(fig) + + + # Scatter for features + data_list = FEATURES + [DATA_FOR_LABEL] + + fig, axs = plt.subplots(figsize=(len(data_list)*4,len(data_list)*4),ncols=len(data_list),nrows=len(data_list),layout='constrained') + for i in range(len(data_list)): + for j in range(len(data_list)): + plt.sca(axs[i,j]) + if j == 0: + plt.ylabel(data_list[i]) + if i == len(data_list)-1: + plt.xlabel(data_list[j]) + X1 = data[data_list[j]] + X2 = data[data_list[i]] + y_here = data[DATA_FOR_LABEL] + if i ==j: + plt.hist(X1) + else: + + plt.scatter(X1,X2,marker='o',alpha=0.3, c=y_here) + + save_string = 'data_visualisation' # Filename + plt.savefig('{}/{}.png'.format(run_logdir,save_string)) # Save the figure in the specified directory + plt.close(fig) + + # Scatter for features preprocessed with RobustScaler + fig, axs = plt.subplots(figsize=(len(data_list)*4,len(data_list)*4),ncols=len(data_list),nrows=len(data_list),layout='constrained') + X = data[data_list] + X = RobustScaler().fit_transform(X) + for i in range(len(data_list)): + for j in range(len(data_list)): + plt.sca(axs[i,j]) + if j == 0: + plt.ylabel(data_list[i]) + if i == len(data_list)-1: + plt.xlabel(data_list[j]) + X1 = X[:,j] + X2 = X[:,i] + y_here = data[DATA_FOR_LABEL] + if i ==j: + plt.hist(X1) + else: + + plt.scatter(X1,X2,marker='o',alpha=0.3, c=y_here) + + save_string = 'data_visualisation_robust_scaler' # Filename + plt.savefig('{}/{}.png'.format(run_logdir,save_string)) # Save the figure in the specified directory + plt.close(fig) + + + return + + + + + +def plot_four_accuracy_matrix_sorted(data, random_selection_of_users=None, run_logdir='.', sorting='row_col_sum'): + """ + Plots four accuracy matrices showing transfer learning accuracies for target and source clusters, sorted in different ways. + + This function generates four subplots representing the transfer learning accuracies between target and source clusters. + The matrices can be sorted in different ways to highlight various aspects of the data, such as intra-cluster accuracy or + best source-target combinations. The plots are saved to the specified directory in both PNG and PDF formats. + + Parameters + ---------- + data : pandas.DataFrame + A DataFrame containing transfer learning data, including 'Target_cluster', 'Source_cluster', and 'Accuracy_after_transfer_learning' columns. + + random_selection_of_users : list, optional + A list of randomly selected users (clusters) to focus on. If None, all clusters will be used. Default is None. + + run_logdir : str, optional + The directory where the plots will be saved. Default is the current directory ('.'). 
+ + sorting : str, optional + The sorting method to be used for the accuracy matrices. Options include: + - 'row_col_sum': Sort by the sum of rows and columns. + - 'row_col_max': Sort by the maximum values in rows and columns. + - 'popularity_vote': Sort based on the popularity of source/target clusters. Default is 'row_col_sum'. + + Returns + ------- + None + The function generates and saves the plots but does not return any values. + + Notes + ----- + - The function checks if accuracy matrices are already saved in pickle files. If not, it generates them from the provided data. + - Four subplots are generated, each showing different sorting strategies for target and source clusters. + - Intra-cluster accuracy is highlighted by removing diagonal values from the accuracy matrix for sorting and by outlining intra-subject cells in the plot. + - The function saves the plots in both PNG and PDF formats with the filename `four_accuracy_plots_sorting_{sorting}`. + """ + + if random_selection_of_users is None: + unique_clusters = np.unique(data['Target_cluster']) + else: + unique_clusters = random_selection_of_users + target_accuracy = np.zeros(len(unique_clusters)) + + figsize = (11,10) + fig, axs= plt.subplots(figsize=figsize,ncols=2,nrows=2,layout='constrained') # + + # load data + full_file_path_1 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'matrix_of_accuracies.pkl') + full_file_path_2 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'target_accuracy.pkl') + full_file_path_3 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'subj_order_matrix_of_accuracies.pkl') + if os.path.exists(full_file_path_1) and os.path.exists(full_file_path_2) and os.path.exists(full_file_path_3): + # Load the pipeline + with open(full_file_path_1, 'rb') as f: + matrix_of_accuracies = pickle.load(f) + print(f'Matrix_of_accuracies loaded from {full_file_path_1}') + + with open(full_file_path_2, 'rb') as f: + target_accuracy = pickle.load(f) + print(f'target_accuracy loaded from {full_file_path_2}') + + with open(full_file_path_3, 'rb') as f: + subj_order_matrix_of_accuracies = pickle.load(f) + print(f'subj_order_matrix_of_accuracies loaded from {full_file_path_3}') + + # Generate data if not existing. + else: + print(f'No matrix_of_accuracies found at {full_file_path_1}, creating...') + + matrix_of_accuracies = np.zeros((len(unique_clusters),len(unique_clusters))) + for row,(target_cluster) in enumerate(unique_clusters): + for col,(source_cluster) in enumerate(unique_clusters): + if target_cluster == source_cluster: + target_accuracy[row] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster]) + matrix_of_accuracies[row,col] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster]) + continue + + data_here = data[(data['Target_cluster']==target_cluster)] # Get data where target=target and source=source. 
+ data_here = data_here[(data_here['Source_cluster']==source_cluster)] + matrix_of_accuracies[row,col] = np.mean(data_here['Accuracy_after_transfer_learning']) + + # Create the directory if it doesn't exist + os.makedirs(f'data/estimators/{DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS}/matrices', exist_ok=True) + # Save the file + if not os.path.exists(full_file_path_1): + with open(full_file_path_1, 'wb') as f: + pickle.dump(matrix_of_accuracies, f) + print('Saving matrix_of_accuracies') + if not os.path.exists(full_file_path_2): + with open(full_file_path_2, 'wb') as f: + pickle.dump(target_accuracy, f) + print('Saving target_accuracy') + if not os.path.exists(full_file_path_3): + with open(full_file_path_3, 'wb') as f: + pickle.dump(unique_clusters, f) + print('Saving subj_order_matrix_of_accuracies') + + # Placeholders + matrix_to_plot = copy.deepcopy(matrix_of_accuracies) + np.fill_diagonal(matrix_of_accuracies, 0) # Remove, intra subject accuracies which are on the diagonal. + row_sums = np.sum(matrix_of_accuracies, axis=1) # Sum of rows + col_sums = np.sum(matrix_of_accuracies, axis=0) # Sum of columns + row_maxs = np.max(matrix_of_accuracies, axis=1) + col_maxs = np.max(matrix_of_accuracies, axis=0) + + + # Sort the matrix and labels acording to intra subject performance + ax = axs[0,0] + if sorting == 'row_col_sum' or sorting == 'row_col_max' or sorting == 'popularity_vote': + row_sorted_indices = np.argsort(-target_accuracy) # Indices to sort rows in descending order + col_sorted_indices = np.argsort(-target_accuracy) + # Sorting rows and columns separately + matrix_to_plot_1 = matrix_to_plot[row_sorted_indices, :] + matrix_to_plot_1 = matrix_to_plot_1[:, col_sorted_indices] + + labels_rows = np.array(unique_clusters)[row_sorted_indices] + labels_columns = np.array(unique_clusters)[col_sorted_indices] + + # Accuracy plot + vmin = 0.5 + vmax = 1 + cax_0 = ax.matshow(matrix_to_plot_1, cmap='RdBu',vmin=vmin,vmax=vmax) + + + # Loop over data dimensions and create intra-subject highlights + for i,row_label in enumerate(labels_rows): + for j,col_label in enumerate(labels_columns): + if row_label==col_label: + highlight_cell(i,j,edgecolor="black", linewidth=1,ax=ax,fill=False) + + + # Sort the matrix and labels acording to best source user. + ax = axs[0,1] + if sorting == 'row_col_sum': + col_sorted_indices = np.argsort(-col_sums) # Indices to sort cols in descending order + row_sorted_indices = np.argsort(-target_accuracy) + elif sorting == 'row_col_max': + row_sorted_indices = np.argsort(-target_accuracy) # Indices to sort rows in descending order + col_sorted_indices = np.lexsort((-col_sums, -col_maxs)) + elif sorting == 'popularity_vote': + col_sorted_indices = get_order_popularity(matrix_of_accuracies,dim='col') + row_sorted_indices = np.argsort(-target_accuracy) + + # Sorting rows and columns separately + matrix_to_plot_1 = matrix_to_plot[row_sorted_indices, :] + matrix_to_plot_1 = matrix_to_plot_1[:, col_sorted_indices] + labels_rows = np.array(unique_clusters)[row_sorted_indices] + labels_columns = np.array(unique_clusters)[col_sorted_indices] + + # Accuracy plot + vmin = 0.5 + vmax = 1 + cax_0 = ax.matshow(matrix_to_plot_1, cmap='RdBu',vmin=vmin,vmax=vmax) + + + # Loop over data dimensions and create intra-subject highlights + for i,row_label in enumerate(labels_rows): + for j,col_label in enumerate(labels_columns): + if row_label==col_label: + highlight_cell(i,j,edgecolor="black", linewidth=1,ax=ax,fill=False) + + + # Sort the matrix and labels acording to best target user. 
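The sorting branches in these plots lean on two NumPy idioms: `argsort` of a negated vector for a descending order, and `lexsort` with the primary key passed last for a max-then-sum ordering. A small self-contained sketch:

```python
import numpy as np

m = np.array([[0.0, 0.9, 0.6],
              [0.7, 0.0, 0.8],
              [0.5, 0.6, 0.0]])

row_sums = m.sum(axis=1)
order_by_sum = np.argsort(-row_sums)                    # descending by row sum
order_by_max = np.lexsort((-row_sums, -m.max(axis=1)))  # primary: row max, tie-break: row sum

reordered = m[order_by_sum, :][:, order_by_sum]         # reorder rows and columns together
```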
+ ax = axs[1,0] + if sorting == 'row_col_sum': + row_sorted_indices = np.argsort(-row_sums) # Indices to sort rows in descending order + col_sorted_indices = np.argsort(-target_accuracy) + elif sorting == 'row_col_max': + col_sorted_indices = np.argsort(-target_accuracy) + row_sorted_indices = np.lexsort((-row_sums, -row_maxs)) + elif sorting == 'popularity_vote': + col_sorted_indices = np.argsort(-target_accuracy) + row_sorted_indices = get_order_popularity(matrix_of_accuracies,dim='row') + # Sorting rows and columns separately + matrix_to_plot_1 = matrix_to_plot[row_sorted_indices, :] + matrix_to_plot_1 = matrix_to_plot_1[:, col_sorted_indices] + labels_rows = np.array(unique_clusters)[row_sorted_indices] + labels_columns = np.array(unique_clusters)[col_sorted_indices] + + # Accuracy plot + vmin = 0.5 + vmax = 1 + cax_0 = ax.matshow(matrix_to_plot_1, cmap='RdBu',vmin=vmin,vmax=vmax) + + + + # Loop over data dimensions and create intra-subject highlights + for i,row_label in enumerate(labels_rows): + for j,col_label in enumerate(labels_columns): + if row_label==col_label: + highlight_cell(i,j,edgecolor="black", linewidth=1,ax=ax,fill=False) + + + # Sort the matrix and labels acording to best target and source user. + ax = axs[1,1] + if sorting == 'row_col_sum': + row_sorted_indices = np.argsort(-row_sums) # Indices to sort rows in descending order + col_sorted_indices = np.argsort(-col_sums) + elif sorting == 'row_col_max': + row_sorted_indices = np.lexsort((-row_sums, -row_maxs)) + col_sorted_indices = np.lexsort((-col_sums, -col_maxs)) + elif sorting == 'popularity_vote': + col_sorted_indices = get_order_popularity(matrix_of_accuracies,dim='col') + row_sorted_indices = get_order_popularity(matrix_of_accuracies,dim='row') + # Sorting rows and columns separately + matrix_to_plot_1 = matrix_to_plot[row_sorted_indices, :] + matrix_to_plot_1 = matrix_to_plot_1[:, col_sorted_indices] + labels_rows = np.array(unique_clusters)[row_sorted_indices] + labels_columns = np.array(unique_clusters)[col_sorted_indices] + + # Accuracy plot + vmin = 0.5 + vmax = 1 + cax_0 = ax.matshow(matrix_to_plot_1, cmap='RdBu',vmin=vmin,vmax=vmax) + + + # Loop over data dimensions and create intra-subject highlights + for i,row_label in enumerate(labels_rows): + for j,col_label in enumerate(labels_columns): + if row_label==col_label: + highlight_cell(i,j,edgecolor="black", linewidth=1,ax=ax,fill=False) + + # Remove ticks and hide grid. + for ax in axs.flat: + ax.set_xticks([]) + ax.set_yticks([]) + ax.grid(False) + + + # Add colorbar + fig.colorbar(cax_0, ax=axs,shrink=0.75,label='Accuracy') + + plt.suptitle('Transfer learning accuracies') + + # Set subtitles and x/y-labels. 
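`highlight_cell()` is called throughout these plots to outline intra-subject cells; its definition lives elsewhere in this file. One plausible shape for such a helper, offered purely as a sketch of how a matshow cell is typically framed:

```python
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

def highlight_cell_sketch(row, col, ax=None, **rect_kwargs):
    """Draw a unit rectangle around cell (row, col) of a matshow image."""
    if ax is None:
        ax = plt.gca()
    rect = mpatches.Rectangle((col - 0.5, row - 0.5), 1, 1, **rect_kwargs)
    ax.add_patch(rect)
    return rect
```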
+ plt.sca(axs[0,0]) + plt.ylabel('Target user') + plt.title('(A) Intra subject accuracy') + + + plt.sca(axs[0,1]) + plt.title('(B) Intra subject accuracy and best source user') + + + plt.sca(axs[1,0]) + plt.xlabel('Source user') + plt.ylabel('Target user') + plt.title('(C) Best target user and intra subject accuracy') + + + plt.sca(axs[1,1]) + plt.xlabel('Source user') + plt.title('(D) Best target and best source user') + + + save_string = f'four_accuracy_plots_sorting_{sorting}' # Filename + plt.savefig(f'{run_logdir}/png_{save_string}.png') # Save the figure in the specified directory + plt.savefig(f'{run_logdir}/pdf_{save_string}.pdf') # Save the figure in the specified directory + plt.close(fig) + return + + + +def plot_one_accuracy_matrix_sorted(data, random_selection_of_users=None, run_logdir='.', sorting='row_col_sum'): + """ + Plots a single accuracy matrix showing transfer learning accuracies for target and source clusters, sorted in different ways. + + This function generates a plot representing the transfer learning accuracies between target and source clusters. + The matrix can be sorted in various ways based on specified criteria, and the plot is saved in both PNG and PDF formats. + + Parameters + ---------- + data : pandas.DataFrame + A DataFrame containing transfer learning data, including 'Target_cluster', 'Source_cluster', and 'Accuracy_after_transfer_learning' columns. + + random_selection_of_users : list, optional + A list of randomly selected users (clusters) to focus on. If None, all clusters will be used. Default is None. + + run_logdir : str, optional + The directory where the plot will be saved. Default is the current directory ('.'). + + sorting : str, optional + The sorting method to be used for the accuracy matrix. Options include: + - 'row_col_sum': Sort by the sum of rows and columns. + - 'row_col_max': Sort by the maximum values in rows and columns. + - 'popularity_vote': Sort based on the popularity of source/target clusters. Default is 'row_col_sum'. + + Returns + ------- + None + The function generates and saves the plot but does not return any values. + + Notes + ----- + - The function checks if accuracy matrices are already saved in pickle files. If not, it generates them from the provided data. + - The matrix is sorted according to the chosen sorting method, and intra-subject accuracies are highlighted. + - The plot is saved in both PNG and PDF formats with the filename `accuracy_plot_sorting_{sorting}`. 
+ """ + + if random_selection_of_users is None: + unique_clusters = np.unique(data['Target_cluster']) + else: + unique_clusters = random_selection_of_users + target_accuracy = np.zeros(len(unique_clusters)) + + figsize = (9,7) + fig, ax= plt.subplots(figsize=figsize,ncols=1,nrows=1,layout='constrained') # + + # Load data + full_file_path_1 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'matrix_of_accuracies.pkl') + full_file_path_2 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'target_accuracy.pkl') + full_file_path_3 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'subj_order_matrix_of_accuracies.pkl') + if os.path.exists(full_file_path_1) and os.path.exists(full_file_path_2) and os.path.exists(full_file_path_3): + # Load the pipeline + with open(full_file_path_1, 'rb') as f: + matrix_of_accuracies = pickle.load(f) + print(f'Matrix_of_accuracies loaded from {full_file_path_1}') + + with open(full_file_path_2, 'rb') as f: + target_accuracy = pickle.load(f) + print(f'target_accuracy loaded from {full_file_path_2}') + + with open(full_file_path_3, 'rb') as f: + subj_order_matrix_of_accuracies = pickle.load(f) + print(f'subj_order_matrix_of_accuracies loaded from {full_file_path_3}') + + # Generate data if it could not be loaded. + else: + print(f'No matrix_of_accuracies found at {full_file_path_1}, creating...') + + matrix_of_accuracies = np.zeros((len(unique_clusters),len(unique_clusters))) + for row,(target_cluster) in enumerate(unique_clusters): + for col,(source_cluster) in enumerate(unique_clusters): + if target_cluster == source_cluster: + target_accuracy[row] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster]) + matrix_of_accuracies[row,col] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster]) + continue + + data_here = data[(data['Target_cluster']==target_cluster)] # Get data where target=target and source=source. + data_here = data_here[(data_here['Source_cluster']==source_cluster)] + matrix_of_accuracies[row,col] = np.mean(data_here['Accuracy_after_transfer_learning']) + + # Create the directory if it doesn't exist + os.makedirs(f'data/estimators/{DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS}/matrices', exist_ok=True) + # Save the file + if not os.path.exists(full_file_path_1): + with open(full_file_path_1, 'wb') as f: + pickle.dump(matrix_of_accuracies, f) + print('Saving matrix_of_accuracies') + if not os.path.exists(full_file_path_2): + with open(full_file_path_2, 'wb') as f: + pickle.dump(target_accuracy, f) + print('Saving target_accuracy') + if not os.path.exists(full_file_path_3): + with open(full_file_path_3, 'wb') as f: + pickle.dump(unique_clusters, f) + print('Saving subj_order_matrix_of_accuracies') + + matrix_to_plot = copy.deepcopy(matrix_of_accuracies) + np.fill_diagonal(matrix_of_accuracies, 0) + row_sums = np.sum(matrix_of_accuracies, axis=1) # Sum of rows + col_sums = np.sum(matrix_of_accuracies, axis=0) # Sum of columns + row_maxs = np.max(matrix_of_accuracies, axis=1) + col_maxs = np.max(matrix_of_accuracies, axis=0) + + + # Sort the matrix and labels acording to best target and source user. 
+ if sorting == 'row_col_sum': + row_sorted_indices = np.argsort(-row_sums) # Indices to sort rows in descending order + col_sorted_indices = np.argsort(-col_sums) + elif sorting == 'row_col_max': + row_sorted_indices = np.lexsort((-row_sums, -row_maxs)) + col_sorted_indices = np.lexsort((-col_sums, -col_maxs)) + elif sorting == 'popularity_vote': + col_sorted_indices = get_order_popularity(matrix_of_accuracies,dim='col') + row_sorted_indices = get_order_popularity(matrix_of_accuracies,dim='row') + # Sorting rows and columns separately + matrix_to_plot_1 = matrix_to_plot[row_sorted_indices, :] + matrix_to_plot_1 = matrix_to_plot_1[:, col_sorted_indices] + labels_rows = np.array(unique_clusters)[row_sorted_indices] + labels_columns = np.array(unique_clusters)[col_sorted_indices] + + # Accuracy plot + vmin = 50 + vmax = 100 + cax_0 = ax.matshow(matrix_to_plot_1*100, cmap='RdBu',vmin=vmin,vmax=vmax) + + + # Loop over data dimensions and highlight intra-subject. + for i,row_label in enumerate(labels_rows): + for j,col_label in enumerate(labels_columns): + if row_label==col_label: + highlight_cell(i,j,edgecolor="black", linewidth=1,ax=ax,fill=False,facecolor='gray') + + + ax.set_xticks([]) + ax.set_yticks([]) + ax.grid(False) + + + fig.colorbar(cax_0, ax=ax,shrink=0.75,label='Transfer learning accuracy (%)') + + plt.sca(ax) + plt.xlabel('Source user') + plt.ylabel('Target user') + + + save_string = f'accuracy_plot_sorting_{sorting}' # Filename + plt.savefig(f'{run_logdir}/png_{save_string}.png') # Save the figure in the specified directory + plt.savefig(f'{run_logdir}/pdf_{save_string}.pdf') # Save the figure in the specified directory + plt.close(fig) + + + return + + + +def custom_split(s): + """ + Splits a string by underscores and rejoins every two consecutive parts with an underscore. + + This function splits a string by all underscores, then reassembles every two consecutive parts with an underscore, + effectively treating every second underscore as a delimiter. + + Parameters + ---------- + s : str + The input string to be split and rejoined. + + Returns + ------- + split_keys : list of str + A list of strings, where every two consecutive parts of the input string are rejoined with an underscore. + + Example + ------- + If the input string is 'a_b_c_d_e', the function returns ['a_b', 'c_d', 'e']. + """ + # Split the string by all underscores + parts = s.split('_') + # Rejoin every two parts with an underscore, effectively treating every second underscore as a delimiter + split_keys = ['_'.join(parts[i:i + 2]) for i in range(0, len(parts), 2)] + return split_keys + + + + + +# Function to find the key with 'subj_1' considering only every second underscore for splitting +def find_key_with_substring(keys, substring): + """ + Finds and returns the first key from a list that contains a specified substring after applying custom splitting. + + This function applies the `custom_split` function to each key in the list and checks if the specified substring + is present in the split result. If a match is found, the key is returned; otherwise, None is returned. + + Parameters + ---------- + keys : list of str + A list of strings (keys) to search through. + + substring : str + The substring to search for in the split keys. + + Returns + ------- + key : str or None + The first key that contains the specified substring after splitting. Returns None if no match is found. 
+ + Example + ------- + Given a list `['subj_1_something', 'subj_2_else']` and `substring='subj_1'`, the function returns 'subj_1_something'. + """ + for key in keys: + # Apply the custom split logic + split_keys = custom_split(key) + # Check if 'subj_1' is exactly in the split keys + if substring in split_keys: + return key + return None + + + + + + +def baseline_comparision_mean_performance(data, clf_name, type_clf, run_logdir='.', extra_string_for_fig_name='', nbr_of_values_for_each_method_in_max_of_methods=2): + """ + Compares the performance of different baseline methods and plots the results for transfer learning accuracy. + + This function evaluates and compares various methods for selecting source clusters for transfer learning. It generates + accuracy matrices, calculates mean performance differences, and visualizes results through a series of plots. It also computes + statistical significance between methods using the Wilcoxon signed-rank test. + + Parameters + ---------- + data : pandas.DataFrame + A DataFrame containing the transfer learning data, including target and source clusters, and accuracy values. + + clf_name : str + The name of the classifier or regressor being evaluated used as my_method. + + type_clf : str + The type of the model, either 'clf' for classifier or 'reg' for regressor. + + run_logdir : str, optional + The directory where results and plots will be saved. Default is the current directory ('.'). + + extra_string_for_fig_name : str, optional + An additional string to append to the figure filenames. Default is ''. + + nbr_of_values_for_each_method_in_max_of_methods : int, optional + The number of top values to consider for each method when calculating the max performance across methods. Default is 2. + + Returns + ------- + None + The function generates and saves several plots but does not return any values. + + Notes + ----- + - The function loads or creates matrices for transfer learning accuracy and mean class distances. + - It evaluates several baseline methods, including: + 1. Intra-subject performance. + 2. Best source accuracy. + 3. Best teacher based on 30% of the students. + 4. Random source. + 5. Closest distance between classes. + 6. Oracle (best possible accuracy). + 7. Custom method (e.g., a regressor predicting the best source for the target). + 8. Worst-case performance. + - The results are plotted in two main ways: + 1. A line plot comparing the accuracy of each method for different target clusters, ordered by intra-subject accuracy. + 2. A matrix plot showing the mean difference in performance between methods and the statistical significance. + - The statistical significance is calculated using the Wilcoxon signed-rank test. + """ + print(f'\n === {clf_name}{extra_string_for_fig_name} === \n') + # Load/create matrix of accuracies. 
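+    # The cached matrix_of_accuracies has one row per target user and one column per
+    # source user (in the order stored in subj_order_matrix_of_accuracies); the diagonal
+    # holds the intra-subject accuracy rather than a transfer learning result.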
+    unique_clusters = np.unique(data['Target_cluster'])
+    full_file_path_1 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'matrix_of_accuracies.pkl')
+    full_file_path_3 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'target_accuracy.pkl')
+    full_file_path_4 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'subj_order_matrix_of_accuracies.pkl')
+
+    # Load matrices
+    if os.path.exists(full_file_path_1) and os.path.exists(full_file_path_3) and os.path.exists(full_file_path_4):
+        # Load the stored matrices
+        print(f'Loading stored matrices for evaluation of {clf_name}')
+        with open(full_file_path_1, 'rb') as f:
+            matrix_of_accuracies = pickle.load(f)
+        print(f'matrix_of_accuracies loaded from {full_file_path_1}')
+        with open(full_file_path_3, 'rb') as f:
+            target_accuracy = pickle.load(f)
+        print(f'target_accuracy loaded from {full_file_path_3}')
+        with open(full_file_path_4, 'rb') as f:
+            unique_clusters = pickle.load(f)
+        print(f'subj_order_matrix_of_accuracies loaded from {full_file_path_4}')
+
+    # Create the matrices if they did not exist.
+    else:
+        target_accuracy = np.zeros(len(unique_clusters))
+        matrix_of_accuracies = np.zeros((len(unique_clusters),len(unique_clusters)))
+        for row,(target_cluster) in enumerate(unique_clusters):
+            for col,(source_cluster) in enumerate(unique_clusters):
+                if target_cluster == source_cluster:
+                    target_accuracy[row] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster])
+                    matrix_of_accuracies[row,col] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster])
+                    continue
+                data_here = data[(data['Target_cluster']==target_cluster)] # Get data where target=target and source=source.
+                data_here = data_here[(data_here['Source_cluster']==source_cluster)]
+                y_values = data_here['Accuracy_after_transfer_learning']
+                matrix_of_accuracies[row,col] = np.mean(y_values)
+        print('New matrices created')
+
+    # Load mean_class_distances.
+    # If not stored - for each target user, go through all source users and find the absolute class distance between classes. Store as a matrix. For source = target store np.inf.
+    full_file_path_5 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'mean_class_distances.pkl')
+    if os.path.exists(full_file_path_5):
+        # Load the matrix
+        with open(full_file_path_5, 'rb') as f:
+            mean_class_distances = pickle.load(f)
+        print(f'mean_class_distances loaded from {full_file_path_5}')
+    else:
+        print('Creating mean_class_distances')
+        mean_class_distances = np.zeros((len(unique_clusters),len(unique_clusters)))
+        for row,(target_cluster) in enumerate(unique_clusters):
+            for col,(source_cluster) in enumerate(unique_clusters):
+                if target_cluster == source_cluster:
+                    mean_class_distances[row,col] = np.inf
+                    continue
+                data_here = data[(data['Target_cluster']==target_cluster)] # Get data where target=target and source=source.
+ data_here = data_here[(data_here['Source_cluster']==source_cluster)] + class_1_dist = data_here['Target:Class1-Source:Class1'] + class_2_dist = data_here['Target:Class2-Source:Class2'] + mean_class_distances[row,col] = np.mean(np.concatenate((class_1_dist,class_2_dist))) + print('new mean_class_distances created ') + os.makedirs(f'data/estimators/{DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS}/matrices', exist_ok=True) + # Save the file + if not os.path.exists(full_file_path_5): + with open(full_file_path_5, 'wb') as f: + pickle.dump(mean_class_distances, f) + print('Saving mean_class_distances') + + + + # ---- Baseline evaluation ------ + # number of source users each method can suggest. + nbr_random_values = nbr_of_values_for_each_method_in_max_of_methods*len(METHODS_IN_MAX_OF_MEHTODS) + + # Best teacher order + col_sums = np.sum(matrix_of_accuracies, axis=0) # Sum of columns + best_teacher_sorting = np.argsort(-col_sums) + best_teacher_indices = best_teacher_sorting[0:nbr_random_values] + + # Best source accuracy order. + best_source_accuracy_sorting = np.argsort(-target_accuracy) + best_source_accuracy_indices = best_source_accuracy_sorting[0:nbr_random_values] + + nbr_simulations = 5 # number of repetitions for methods based on random chance. + nbr_subjects = len(unique_clusters) + + # Create placeholder arrays + target_intra_accuracy = np.zeros((nbr_subjects)) + random_source = np.zeros((nbr_subjects,nbr_simulations)) + max_of_methods = np.zeros((nbr_subjects)) + closest_distance = np.zeros((nbr_subjects)) + best_teacher = np.zeros((nbr_subjects)) + best_teacher_30_percentage_students = np.zeros((nbr_subjects,nbr_simulations)) + best_source_accuracy = np.zeros((nbr_subjects)) + oracle = np.zeros((nbr_subjects)) + worst = np.zeros((nbr_subjects)) + my_method = np.zeros((nbr_subjects,nbr_simulations)) + + # iterate over all target clusters to find source data for the target user with the methods. + for row,(target_cluster) in enumerate(unique_clusters): + + max_of_methods_here = {} # Placeholder for max of methods. + + # ----- Intra subject ------ + target_intra_accuracy[row] = target_accuracy[row] + + + # ------ Best source_accuracy ------ + # If the best source accuracy is also the target user. + if row in best_source_accuracy_indices: + modified_indices = best_source_accuracy_indices[best_source_accuracy_indices != row] + if len(modified_indices) <1: + modified_indices = best_source_accuracy_sorting[1] # Only happens if there is only one subject + best_source_accuracy[row] = np.max(matrix_of_accuracies[row][modified_indices]) + max_of_methods_here['best_source_accuracy'] = max(matrix_of_accuracies[row][modified_indices[0:nbr_of_values_for_each_method_in_max_of_methods]]) + else: + best_source_accuracy[row] = np.max(matrix_of_accuracies[row][best_source_accuracy_indices]) + max_of_methods_here['best_source_accuracy'] = max(matrix_of_accuracies[row][best_source_accuracy_indices[0:nbr_of_values_for_each_method_in_max_of_methods]]) + + + + # ------ Best teacher ------ + # If the best teacher is also the target user. 
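+        # Hypothetical example of the exclusion below: with best_teacher_indices = [12, 3, 7]
+        # and row = 3, the target user is dropped and the candidate teachers become [12, 7].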
+        if row in best_teacher_indices:
+            modified_indices = best_teacher_indices[best_teacher_indices != row]
+            if len(modified_indices) <1:
+                modified_indices = best_teacher_sorting[1] # Only happens if there is only one subject
+            best_teacher[row] = np.max(matrix_of_accuracies[row][modified_indices])
+        else:
+            best_teacher[row] = np.max(matrix_of_accuracies[row][best_teacher_indices])
+
+
+        # ------ Best teacher 30 percentage students ------
+        # Same as best teacher, but the best teacher is estimated from 30% of the users (approximately 30 users).
+        max_of_methods_here['best_teacher'] = []
+        # Iterate over nbr_simulations since the selection of the 30% of students is random.
+        for i in range(nbr_simulations):
+            nbr_students = int(0.3 * nbr_subjects)
+            selected_indices = rng.choice(nbr_subjects, nbr_students, replace=False)
+            selected_indices = selected_indices[selected_indices!=row] # Remove the row subject (current target user) from the evaluation of best teacher.
+            # Select the rows
+            selected_rows = matrix_of_accuracies[selected_indices]
+
+            # Order of best teacher.
+            col_sums = np.sum(selected_rows, axis=0) # Sum of columns
+            best_teacher_sorting = np.argsort(-col_sums)
+            best_teacher_indices = best_teacher_sorting[0:nbr_random_values]
+
+            if row in best_teacher_indices:
+                modified_indices = best_teacher_indices[best_teacher_indices != row]
+                if len(modified_indices) <1:
+                    modified_indices = best_teacher_sorting[1] # Only happens if there is only one subject
+                best_teacher_30_percentage_students[row,i] = np.max(matrix_of_accuracies[row][modified_indices])
+                max_of_methods_here['best_teacher'].append(max(matrix_of_accuracies[row][modified_indices[0:nbr_of_values_for_each_method_in_max_of_methods]]))
+            else:
+                best_teacher_30_percentage_students[row,i] = np.max(matrix_of_accuracies[row][best_teacher_indices])
+                max_of_methods_here['best_teacher'].append(max(matrix_of_accuracies[row][best_teacher_indices[0:nbr_of_values_for_each_method_in_max_of_methods]]))
+        max_of_methods_here['best_teacher'] = np.mean(max_of_methods_here['best_teacher']) # Max of methods is the mean of all iterations of random sampling.
+
+
+
+        # ------ Random source ------
+        max_of_methods_here['random'] = []
+        # Iterate over randomly sampled users.
+        for i in range(nbr_simulations):
+            accuracies_here = matrix_of_accuracies[row]
+            filtered_accuracies = np.delete(accuracies_here, row)
+            random_values = rng.choice(filtered_accuracies,nbr_random_values,replace=False)
+            max_random_value = np.max(random_values)
+            random_source[row,i] = max_random_value
+            max_of_methods_here['random'].append(np.max(random_values[0:nbr_of_values_for_each_method_in_max_of_methods]))
+        max_of_methods_here['random'] = np.mean(max_of_methods_here['random']) # As for best teacher, max of methods is the mean over the random repetitions.
+
+
+        # ------ Oracle ------
+        oracle[row] = np.max(matrix_of_accuracies[row])
+
+
+        # ------ Worst ------
+        worst[row] = np.min(matrix_of_accuracies[row])
+
+
+        # ------ Closest_distance ------
+        # For each subject, find the source data with the smallest distance - basically the argmin of the distance matrix for this row.
+        index = np.argsort(mean_class_distances[row])
+        index_to_use = index[0:nbr_random_values]
+        closest_distance[row] = np.max(matrix_of_accuracies[row][index_to_use])
+
+        # ------ My method ------
+        max_of_methods_here['my_method'] = []
+        if type_clf == 'clf':
+            pass
+        elif type_clf == 'reg':
+            # Get the classifier for this target user.
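+            # Estimator keys are matched with find_key_with_substring: a key such as
+            # 'subj_1_fold_0' (format assumed for illustration) is split into
+            # ['subj_1', 'fold_0'], so only an exact cluster name matches and
+            # 'subj_1' cannot accidentally match a key belonging to 'subj_12'.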
+            key_in_dict = find_key_with_substring(INDIVIDUAL_ESTIMATORS_FOR_TARGET_SUBJECTS[clf_name].keys(),target_cluster)
+            clf = INDIVIDUAL_ESTIMATORS_FOR_TARGET_SUBJECTS[clf_name][key_in_dict]
+            # Get the data for this target user.
+            data_here = data[(data['Target_cluster']==target_cluster)]
+            X = data_here[FEATURES]
+            # Predict the transfer learning performance for every source candidate.
+            pred = clf.predict(X)
+
+            # There are 5 folds of the data from the original MI classification. For each of these folds, predict the best source users and select the best out of these.
+            for i in range(5):
+                # Get the data belonging to fold i (taken with a stride of 5).
+                predictions_fold_i = pred[i::5]
+                data_fold_i = data_here.iloc[i::5]
+                # Which source user is predicted to be best?
+                indices_best_pred = np.argsort(-predictions_fold_i)
+                data_best_pred = data_fold_i.iloc[indices_best_pred[0:nbr_random_values]]
+                sources_best_pred = data_best_pred['Source_cluster']
+                # Get the index in the matrix of accuracies.
+                matches = np.isin(unique_clusters, sources_best_pred,assume_unique=True)
+                matching_indices = np.where(matches)[0]
+                # Get the transfer learning performance for the target user and source data.
+                values_best_predictions = matrix_of_accuracies[row][matching_indices]
+                # Select the best of the suggested ones.
+                my_method[row,i] = np.max(values_best_predictions)
+                max_of_methods_here['my_method'].append(max(values_best_predictions[0:nbr_of_values_for_each_method_in_max_of_methods]))
+            # For this method, the max of methods is the mean of the iterations over the folds.
+            max_of_methods_here['my_method'] = np.mean(max_of_methods_here['my_method'])
+
+        # For each row, max of methods is the max of the included methods.
+        max_of_methods[row] = max([max_of_methods_here[method] for method in METHODS_IN_MAX_OF_MEHTODS])
+
+
+    # ====== PLOTTING =====
+    # ----- First plot is the performance of each method, with subjects sorted by intra-subject performance.
+    # Sorting order
+    sorted_indices = np.argsort(target_intra_accuracy)
+    # Sort the results and take the mean for the methods which have repeated samples.
+    target_intra_accuracy = target_intra_accuracy[sorted_indices]
+    best_teacher = best_teacher[sorted_indices]
+    best_teacher_30_percentage_students = np.mean(best_teacher_30_percentage_students,axis=1)[sorted_indices]
+    best_source_accuracy = best_source_accuracy[sorted_indices]
+    random_source = np.mean(random_source,axis=1)[sorted_indices]
+    oracle = oracle[sorted_indices]
+    worst = worst[sorted_indices]
+    closest_distance = closest_distance[sorted_indices]
+    my_method = np.mean(my_method,axis=1)[sorted_indices]
+    target_subject_name = unique_clusters[sorted_indices]
+    max_of_methods = max_of_methods[sorted_indices]
+
+    # x for plotting.
+ x = np.arange(0,len(target_intra_accuracy)) + + figsize = (6,5) + figsize = (10,10) + fig, ax = plt.subplots(figsize=figsize,layout='constrained') + + plt.plot(x,target_intra_accuracy,linestyle='-',label='Intra subject',alpha=0.7) + plt.plot(x,random_source,linestyle='-',label='Random source',alpha=0.7) + plt.plot(x,best_source_accuracy,linestyle='-',label='Best source',alpha=0.7) + plt.plot(x,best_teacher_30_percentage_students,linestyle='-',label='Best teacher',alpha=0.7) + # plt.plot(x,best_source_voting,linestyle='-',label='Best source voting',alpha=0.7) + plt.plot(x,my_method,linestyle='-',label='My method',alpha=0.7) + plt.plot(x,oracle,linestyle='-',label='Oracle',alpha=0.7) + plt.plot(x,worst,linestyle='-',label='Worst',alpha=0.7) + plt.plot(x,closest_distance,linestyle='-',label='Distance',alpha=0.7) + + plt.xlabel('Subjects ordered in intra subject accuracy') + plt.ylabel('Accuracy') + plt.legend() + + save_string = f'baseline_comparision_1_{clf_name}{extra_string_for_fig_name}' # Filename + plt.savefig(f'{run_logdir}/png_{save_string}.png') # Save the figure in the specified directory + plt.savefig(f'{run_logdir}/pdf_{save_string}.pdf') # Save the figure in the specified directory + plt.close(fig) + + + # plt.rcdefaults() + + # ----- Second plot is the matrix with mean difference in performance for the methods and the statistical significance. + + # Included methods here are the ones that are plotted. + dict_of_data = { + # 'Worst':worst, + 'Intra-subject': target_intra_accuracy, + 'Random source': random_source, + 'Distance' : closest_distance, + 'Best source': best_source_accuracy, + #'Best teacher':best_teacher, + 'Best teacher':best_teacher_30_percentage_students, + 'Max of methods' : max_of_methods, + 'TransPerfPred' :my_method, + 'Oracle' : oracle, + } + + dict_of_colors = { + 'Intra-subject': 'tab:blue', + 'Random source': 'tab:orange', + 'Best teacher': 'tab:red', + 'Best source': 'tab:green', + 'TransPerfPred' : 'tab:purple', + 'Oracle' : 'tab:brown', + 'Max of methods':'tab:pink', + 'Worst': 'tab:cyan', + 'Distance': 'tab:cyan', + } + + + nbr_methods = len(list(dict_of_data.keys())) + # Placeholders. + all_mean_val = np.zeros((nbr_methods,nbr_methods)) + all_pval_wix = np.zeros((nbr_methods,nbr_methods)) + + # Find the mean difference in performance and the statisitcal significance between the methods. + for i, data_str in enumerate(dict_of_data.keys()): + for k,data_str_compare in enumerate(dict_of_data.keys()): + y_here = dict_of_data[data_str] - dict_of_data[data_str_compare] + if data_str != data_str_compare: + mean_val = np.mean(dict_of_data[data_str]-dict_of_data[data_str_compare]) + result_wix = wilcoxon(dict_of_data[data_str],dict_of_data[data_str_compare],zero_method='zsplit') + + all_mean_val[i,k] = mean_val + all_pval_wix[i,k] = result_wix.pvalue + + # Plot the results. + plot_baseline_results(all_mean_val*100,list(dict_of_data.keys()),highligt_statistics=all_pval_wix, figname=f'mean_val_{clf_name}_wix', run_logdir=run_logdir,legend=f'Mean difference in performance (% units)',extra_string_for_fig_name=extra_string_for_fig_name) + + + return + + + + +def plot_baseline_results(data, labels, highligt_statistics=None, figname='', run_logdir='.', high_val_is_good=True, legend='', extra_string_for_fig_name=''): + """ + Plots a matrix comparing the performance of different baseline methods and highlights statistically significant differences. + + This function generates a heatmap to visualize the performance differences between various baseline methods. 
It also highlights + cells with significant statistical results (if provided) and saves the plot in both PNG and PDF formats. + + Parameters + ---------- + data : ndarray of shape (n_methods, n_methods) + A matrix containing performance differences between methods for each pair of methods. + + labels : list of str + A list of method names corresponding to the rows and columns of the data matrix. + + highligt_statistics : ndarray of shape (n_methods, n_methods), optional + A matrix of p-values or significance markers. Cells with values above 0.05 are outlined to indicate no significant difference. + Default is None. + + figname : str, optional + A string to append to the filename of the saved figure. Default is ''. + + run_logdir : str, optional + The directory where the plots will be saved. Default is the current directory ('.'). + + high_val_is_good : bool, optional + If True, higher values in the matrix are considered better and are shown in green. If False, the color scale is inverted. Default is True. + + legend : str, optional + A label for the color bar in the plot to explain the values being displayed. Default is ''. + + extra_string_for_fig_name : str, optional + An additional string to append to the figure filenames. Default is ''. + + Returns + ------- + None + The function generates and saves the plot but does not return any values. + + Notes + ----- + - The function uses a diverging colormap ('RdYlGn') to display performance differences, with green indicating higher values if + `high_val_is_good=True`. + - It highlights the diagonal cells in white, as these represent self-comparisons. + - If statistical significance is provided, cells with p-values >= 0.05 are outlined with a black border. + - The plot is saved in both PNG and PDF formats with filenames `baseline_comparision_{figname}{extra_string_for_fig_name}`. + """ + # Plotting + fig, ax = plt.subplots(figsize=(11,10),layout='constrained') + vmin = np.min(data) + vmax = np.max(data) + middle = (vmax+vmin)/2 + color_limit = (vmax-middle)*0.7 + if high_val_is_good: + cax = ax.matshow(data, cmap='RdYlGn', vmin=vmin, vmax=vmax) + else: + cax = ax.matshow(data, cmap='RdYlGn_r', vmin=vmin, vmax=vmax) + + # Adding color bar + fig.colorbar(cax,label=legend,shrink=0.75) + + # Setting axis labels + ax.set_xticks(np.arange(len(labels))) + ax.set_yticks(np.arange(len(labels))) + ax.set_xticklabels(labels, rotation=90) + ax.set_yticklabels(labels) + + if highligt_statistics is None: + highligt_statistics=np.zeros(data.shape) + + + # Make diagonal white. 
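+    # The diagonal compares a method with itself, so it is masked white here; the loop
+    # that writes cell values and significance outlines below only touches off-diagonal cells (i != j).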
+ for i in range(len(labels)): + highlight_cell(i,i, ax=ax,fill=True, edgecolor=None,linewidth=0, color='white') + + # Adding values to the cells + for i in range(len(labels)): + for j in range(len(labels)): + if i != j: + value = data[i, j] + color = 'white' if value > middle+color_limit or value < middle-color_limit else 'black' + ax.text(j, i, f'{value:.2f}', va='center', ha='center', color=color,size='small') + if highligt_statistics[i,j] >=0.05: + highlight_cell(i,j, ax=ax,fill=False, edgecolor="black", linewidth=3) + + + save_string = f'baseline_comparision_{figname}{extra_string_for_fig_name}' # Filename + plt.savefig(f'{run_logdir}/png_{save_string}.png') # Save the figure in the specified directory + plt.savefig(f'{run_logdir}/pdf_{save_string}.pdf') # Save the figure in the specified directory + plt.close(fig) + return + + + + + + + +def baseline_comparision_varying_nbr_source_data(data, clf_name, type_clf, run_logdir='.', extra_string_for_fig_name='', max_values_for_each_method=5): + """ + Evaluates and compares various baseline methods by varying the number of source data points for transfer learning and plots the results. + + This function compares several methods for selecting source clusters for transfer learning, varying the number of source candidates from + 1 to the specified maximum. It computes the performance of each method relative to the oracle performance and generates plots to visualize + the comparison. + + Parameters + ---------- + data : pandas.DataFrame + A DataFrame containing the transfer learning data, including target and source clusters, and accuracy values. + + clf_name : str + The name of the classifier or regressor being evaluated. + + type_clf : str + The type of the model, either 'clf' for classifier or 'reg' for regressor. + + run_logdir : str, optional + The directory where the results and plots will be saved. Default is the current directory ('.'). + + extra_string_for_fig_name : str, optional + An additional string to append to the figure filenames. Default is ''. + + max_values_for_each_method : int, optional + The maximum number of source candidates each method can select. Default is 5. + + Returns + ------- + None + The function generates and saves a plot but does not return any values. + + Notes + ----- + - The function loads or creates matrices for transfer learning accuracy and mean class distances between target and source clusters. + - It compares several methods, including: + 1. Intra-subject performance. + 2. Best source accuracy. + 3. Best teacher (using all subjects or 30% of students). + 4. Random source. + 5. Closest distance between classes. + 6. Oracle (best possible accuracy). + 7. Custom method (e.g., a regressor predicting the best source for the target). + 8. Max of methods (selecting the best performance from multiple methods). + - The performance of each method is compared to the oracle performance (i.e., the best possible result). + - The results are plotted, showing the performance relative to the oracle as the number of source candidates increases. + - The plots are saved in both PNG and PDF formats with filenames like `baseline_comparision_varying_nbr_source_data_{clf_name}{extra_string_for_fig_name}`. + """ + print(f'\n === {clf_name}{extra_string_for_fig_name} === \n') + # Load/create matrix of accuracies. 
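+    # The same pickled matrices as in baseline_comparision_mean_performance are reused here;
+    # they are read from data/estimators/<DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS>/matrices
+    # and only recomputed from the dataframe if a file is missing.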
+    unique_clusters = np.unique(data['Target_cluster'])
+    full_file_path_1 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'matrix_of_accuracies.pkl')
+    full_file_path_3 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'target_accuracy.pkl')
+    full_file_path_4 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'subj_order_matrix_of_accuracies.pkl')
+
+
+    if os.path.exists(full_file_path_1) and os.path.exists(full_file_path_3) and os.path.exists(full_file_path_4):
+        # Load the stored matrices
+        print(f'Loading stored matrices for evaluation of {clf_name}')
+        with open(full_file_path_1, 'rb') as f:
+            matrix_of_accuracies = pickle.load(f)
+        print(f'matrix_of_accuracies loaded from {full_file_path_1}')
+        with open(full_file_path_3, 'rb') as f:
+            target_accuracy = pickle.load(f)
+        print(f'target_accuracy loaded from {full_file_path_3}')
+        with open(full_file_path_4, 'rb') as f:
+            unique_clusters = pickle.load(f)
+        print(f'subj_order_matrix_of_accuracies loaded from {full_file_path_4}')
+
+    # Create matrices if they do not exist.
+    else:
+        target_accuracy = np.zeros(len(unique_clusters))
+        matrix_of_accuracies = np.zeros((len(unique_clusters),len(unique_clusters)))
+        for row,(target_cluster) in enumerate(unique_clusters):
+            for col,(source_cluster) in enumerate(unique_clusters):
+                if target_cluster == source_cluster:
+                    target_accuracy[row] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster])
+                    matrix_of_accuracies[row,col] = np.mean(data['Target_accuracy'][data['Target_cluster']==target_cluster])
+                    continue
+                data_here = data[(data['Target_cluster']==target_cluster)] # Get data where target=target and source=source.
+                data_here = data_here[(data_here['Source_cluster']==source_cluster)]
+                y_values = data_here['Accuracy_after_transfer_learning']
+                matrix_of_accuracies[row,col] = np.mean(y_values)
+        print('New matrices created')
+
+    # Load mean_class_distances.
+    # If not stored - for each target user, go through all source users and find the absolute class distance between classes. Store as a matrix. For source = target store np.inf.
+    full_file_path_5 = os.path.join('data','estimators',DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS,'matrices', 'mean_class_distances.pkl')
+    if os.path.exists(full_file_path_5):
+        # Load the matrix
+        with open(full_file_path_5, 'rb') as f:
+            mean_class_distances = pickle.load(f)
+        print(f'mean_class_distances loaded from {full_file_path_5}')
+
+    # Create if not existing
+    else:
+        print('Creating mean_class_distances')
+        mean_class_distances = np.zeros((len(unique_clusters),len(unique_clusters)))
+        for row,(target_cluster) in enumerate(unique_clusters):
+            for col,(source_cluster) in enumerate(unique_clusters):
+                if target_cluster == source_cluster:
+                    mean_class_distances[row,col] = np.inf
+                    continue
+                data_here = data[(data['Target_cluster']==target_cluster)] # Get data where target=target and source=source.
+ data_here = data_here[(data_here['Source_cluster']==source_cluster)] + class_1_dist = data_here['Target:Class1-Source:Class1'] + class_2_dist = data_here['Target:Class2-Source:Class2'] + mean_class_distances[row,col] = np.mean(np.concatenate((class_1_dist,class_2_dist))) + print('new mean_class_distances created ') + os.makedirs(f'data/estimators/{DATAFOLDER_STORED_MATRICES_AND_ESTIMATORS}/matrices', exist_ok=True) + # Save the file + if not os.path.exists(full_file_path_5): + with open(full_file_path_5, 'wb') as f: + pickle.dump(mean_class_distances, f) + print('Saving mean_class_distances') + + + + + # PLACEHOLDERS + dict_of_data = { + 'Intra-subject': 1, + 'Random source': 1, + 'Distance' : 1, + 'Best source': 1, + #'Best teacher':best_teacher, + 'Best teacher':1, + 'Max of methods' : 1, + 'TransPerfPred' :1, + 'Oracle' : 1, + } + nbr_methods = len(list(dict_of_data.keys())) + all_mean_val = np.zeros((max_values_for_each_method,nbr_methods)) + + + # Iterate over all nbr of source data selections. + # max_values_for_each_method states how many source data each method can select at most. + for iteration in range(0,max_values_for_each_method): + + # nbr_random_values is how many souce data each method can suggest for this iteration. + nbr_random_values = iteration+1 + + # Best teacher sorting + col_sums = np.sum(matrix_of_accuracies, axis=0) # Sum of columns + best_teacher_sorting = np.argsort(-col_sums) + best_teacher_indices = best_teacher_sorting[0:nbr_random_values] + + # Best source accuracy sorting + best_source_accuracy_sorting = np.argsort(-target_accuracy) + best_source_accuracy_indices = best_source_accuracy_sorting[0:nbr_random_values] + + + # Create placeholder arrays + nbr_simulations = 5 + nbr_subjects = len(unique_clusters) + target_intra_accuracy = np.zeros((nbr_subjects)) + random_source = np.zeros((nbr_subjects,nbr_simulations)) + max_of_methods = np.zeros((nbr_subjects)) + closest_distance = np.zeros((nbr_subjects)) + best_teacher = np.zeros((nbr_subjects)) + best_teacher_30_percentage_students = np.zeros((nbr_subjects,nbr_simulations)) + best_source_accuracy = np.zeros((nbr_subjects)) + oracle = np.zeros((nbr_subjects)) + worst = np.zeros((nbr_subjects)) + my_method = np.zeros((nbr_subjects,nbr_simulations)) + + # Iterate over all target user. 
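+        # For every target user, each method proposes up to nbr_random_values source users
+        # and is credited with the best transfer accuracy among its proposals.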
+ for row,(target_cluster) in enumerate(unique_clusters): + + # Placeholder + max_of_methods_here = {} + + # ---- Intra subject ---- + target_intra_accuracy[row] = target_accuracy[row] + + + + # ---- Best source_accuracy ---- + # If target user is one of best source accuracy - remove + if row in best_source_accuracy_indices: + modified_indices = best_source_accuracy_indices[best_source_accuracy_indices != row] + if len(modified_indices) <1: + modified_indices = best_source_accuracy_sorting[1] # Only happens if there is only one subject + best_source_accuracy[row] = np.max(matrix_of_accuracies[row][modified_indices]) + max_of_methods_here['best_source_accuracy'] = np.max(matrix_of_accuracies[row][modified_indices]) + else: + best_source_accuracy[row] = np.max(matrix_of_accuracies[row][best_source_accuracy_indices]) + max_of_methods_here['best_source_accuracy'] = np.max(matrix_of_accuracies[row][best_source_accuracy_indices]) + + + + # ---- Best teacher ---- + # If target user is one of best teachers - remove + if row in best_teacher_indices: + modified_indices = best_teacher_indices[best_teacher_indices != row] + if len(modified_indices) <1: + modified_indices = best_teacher_sorting[1] # Only happens if there is only one subject + best_teacher[row] = np.max(matrix_of_accuracies[row][modified_indices]) + else: + best_teacher[row] = np.max(matrix_of_accuracies[row][best_teacher_indices]) + + + # ---- Best teacher 30 percentage students ---- + max_of_methods_here['best_teacher'] = [] + # Random selection of 30% users (approx 30 users) to use as students for evaluation of best teacher. + for i in range(nbr_simulations): + nbr_students = int(0.3 * nbr_subjects) + selected_indices = rng.choice(nbr_subjects, nbr_students, replace=False) + selected_indices = selected_indices[selected_indices!=row] # remove the row subject from evaluation of best teacher. 
+ # Select the rows + selected_rows = matrix_of_accuracies[selected_indices] + + # Best teacher sorting + col_sums = np.sum(selected_rows, axis=0) # Sum of columns + best_teacher_sorting = np.argsort(-col_sums) + best_teacher_indices = best_teacher_sorting[0:nbr_random_values] + + # If target user is one of best teachers - remove + if row in best_teacher_indices: + modified_indices = best_teacher_indices[best_teacher_indices != row] + if len(modified_indices) <1: + modified_indices = best_teacher_sorting[1] # Only happens if there is only one subject + best_teacher_30_percentage_students[row,i] = np.max(matrix_of_accuracies[row][modified_indices]) + max_of_methods_here['best_teacher'].append(np.max(matrix_of_accuracies[row][modified_indices])) + else: + best_teacher_30_percentage_students[row,i] = np.max(matrix_of_accuracies[row][best_teacher_indices]) + max_of_methods_here['best_teacher'].append(np.max(matrix_of_accuracies[row][best_teacher_indices])) + max_of_methods_here['best_teacher'] = np.mean(max_of_methods_here['best_teacher']) + + + # ---- Random source ---- + max_of_methods_here['random'] = [] + for i in range(nbr_simulations): + accuracies_here = matrix_of_accuracies[row] + filtered_accuracies = np.delete(accuracies_here, row) + random_values = rng.choice(filtered_accuracies,nbr_random_values,replace=False) + max_random_value = np.max(random_values) + random_source[row,i] = max_random_value + max_of_methods_here['random'] = np.mean(random_source[row,i]) + + # ---- Oracle ---- + oracle[row] = np.max(matrix_of_accuracies[row]) + + # ---- Worst ---- + worst[row] = np.min(matrix_of_accuracies[row]) + + + # ---- Closest_distance ---- + # for each subject, find the source data with smallest distance - basically argmin of the distance matrix for row row. + index = np.argsort(mean_class_distances[row]) + index_to_use = index[0:nbr_random_values] + closest_distance[row] = np.max(matrix_of_accuracies[row][index_to_use]) + + # ---- My method ---- + # The estimator passed to the method is used as my_method. The estimator must have been trained previously and stored to disk. + max_of_methods_here['my_method'] = [] + if clf_type == 'clf': + pass + elif clf_type == 'reg': + # Find the estimator for this target user. + key_in_dict = find_key_with_substring(INDIVIDUAL_ESTIMATORS_FOR_TARGET_SUBJECTS[clf_name].keys(),target_cluster) + clf = INDIVIDUAL_ESTIMATORS_FOR_TARGET_SUBJECTS[clf_name][key_in_dict] + + # Extract the data for this target user + data_here = data[(data['Target_cluster']==target_cluster)] + X = data_here[FEATURES] + pred = clf.predict(X) + + # The data has 5 folds from the MI classification, each of these folds serve as a sample to be averaged. + for i in range(5): + # Get data for this fold + predictions_fold_i = pred[i::5] + data_fold_i = data_here.iloc[i::5] + + # Sort source users based on predicted performance + indices_best_pred = np.argsort(-predictions_fold_i) + + # Get data for the best source users + data_best_pred = data_fold_i.iloc[indices_best_pred[0:nbr_random_values]] + sources_best_pred = data_best_pred['Source_cluster'] + + # Find the indices for the top source users in the matrix of accuracies. + matches = np.isin(unique_clusters, sources_best_pred,assume_unique=True) + matching_indices = np.where(matches)[0] + + # Get the transfer learning performance for the source users. + values_best_predictions = matrix_of_accuracies[row][matching_indices] + + # Select the best one. 
+ my_method[row,i] = np.max(values_best_predictions) + max_of_methods_here['my_method'].append(np.max(values_best_predictions)) + + # Max of methods for this method is the mean of the folds. + max_of_methods_here['my_method'] = np.mean(max_of_methods_here['my_method']) + + # For each target user, get the best performance of the included methods for max_of_methods. + max_of_methods[row] = max([max_of_methods_here[method] for method in METHODS_IN_MAX_OF_MEHTODS]) + + + + # Find mean value for the methods with multiple iterations (random or folds.) + best_teacher_30_percentage_students = np.mean(best_teacher_30_percentage_students,axis=1) + random_source = np.mean(random_source,axis=1) + my_method = np.mean(my_method,axis=1) + + + + # Methods to plot. + dict_of_data = { + 'Intra-subject': target_intra_accuracy, + 'Random source': random_source, + 'Distance' : closest_distance, + 'Best source': best_source_accuracy, + #'Best teacher':best_teacher, + 'Best teacher':best_teacher_30_percentage_students, + 'Max of methods' : max_of_methods, + 'TransPerfPred' :my_method, + 'Oracle' : oracle, + } + + + # Compute the performance of each method compared to the oracle performance. + data_str = 'Oracle' + for k,data_str_compare in enumerate(dict_of_data.keys()): + mean_val = np.mean(dict_of_data[data_str_compare]-dict_of_data[data_str]) + all_mean_val[iteration,k] = mean_val + + # Align max of methods so that it only apperas as multiples of the numbers of included methods. + # max of mehtods is in position 5 in the dict above. + # 3 is the number of methods included in max of methods. + all_mean_val_copy = copy.copy(all_mean_val) + all_mean_val[:,5] = np.zeros(all_mean_val[:,5].shape) + i=1 + while i*3-1 < max_values_for_each_method: + all_mean_val[i*3-1,5] = all_mean_val_copy[i-1,5] + i = i+1 + + # Plotting + labels = list(dict_of_data.keys()) + linestyles = ['-',':','--','-.','-',None,'-','-'] + linewidths = [3,3,3,3,3,None,7,3] + markers = ['^','','','','',None,'','v'] + colors = ['tab:gray','tab:orange','tab:red','tab:purple','tab:blue',None,'tab:green','k'] + + fig, ax = plt.subplots(figsize=(9,7),layout='constrained') + for k,data_str_compare in enumerate(dict_of_data.keys()): + if data_str_compare != 'Max of methods': + plt.plot(np.arange(1,max_values_for_each_method+1),all_mean_val[:,k]*100,color=colors[k],alpha=0.9,linewidth=linewidths[k],linestyle=linestyles[k],marker=markers[k],markersize=10, label=labels[k]) + else: + plt.plot(np.arange(3,max_values_for_each_method+1,3),all_mean_val[2:max_values_for_each_method+1:3,k]*100, color='tab:gray',alpha=0.9, label=labels[k],marker='o',markersize=8,linestyle=':') + plt.legend(loc='lower right') + plt.xlabel('Number of source candidates') + plt.ylabel('Performance compared to Oracle') + plt.grid(True) + plt.margins(0.015) + plt.xticks(np.arange(1, max_values_for_each_method + 1,2)) + + + save_string = f'baseline_comparision_varying_nbr_source_data_{clf_name}{extra_string_for_fig_name}' # Filename + plt.savefig(f'{run_logdir}/png_{save_string}.png') # Save the figure in the specified directory + plt.savefig(f'{run_logdir}/pdf_{save_string}.pdf') # Save the figure in the specified directory + plt.close(fig) + + + return + + + + + +def highlight_cell(row, col, ax=None, fill=False, **kwargs): + """ + Highlights a specific cell in a matrix plot by drawing a rectangle around it. + + This function adds a rectangular patch to highlight a specific cell in a plot (typically a heatmap or matrix plot). 
+ The cell is defined by its row and column indices, and the rectangle can optionally be filled with a color. + + Parameters + ---------- + row : int + The row index of the cell to highlight. + + col : int + The column index of the cell to highlight. + + ax : matplotlib.axes.Axes, optional + The axes object on which to draw the rectangle. If None, the current axes will be used. Default is None. + + fill : bool, optional + Whether to fill the rectangle with color. Default is False. + + **kwargs : dict, optional + Additional keyword arguments to pass to `matplotlib.patches.Rectangle` (e.g., edgecolor, facecolor, linewidth). + + Returns + ------- + rect : matplotlib.patches.Rectangle + The rectangle patch added to the plot. + + Notes + ----- + This function is typically used to highlight specific cells in heatmaps or matrix plots, where each cell represents + a data point. The rectangle can be customized using the `**kwargs` to control the appearance of the highlight. + """ + y=row + x=col + rect = plt.Rectangle((x-.5, y-.5), 1,1, fill=fill, **kwargs) + ax = ax or plt.gca() + ax.add_patch(rect) + return rect + + + +# ----------------------------------------------- +# Main +# ----------------------------------------------- +# execute only if run as a script +if __name__ == "__main__": + begin_time = datetime.datetime.now() + log_folder = "my_logs" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + # Create run directory to store models and logs in: + root_logdir = os.path.join(os.curdir, log_folder) + run_id = "%s_on_%s" % (time.strftime("run_%Y_%m_%d-%H_%M_%S"), os.uname()[1]) + run_logdir = os.path.join(root_logdir, run_id) + output_folder = run_logdir + os.mkdir(run_logdir) + sys.stderr = Logger("{}/console_err.txt".format(run_logdir)) + sys.stdout = Logger("{}/console_log.txt".format(run_logdir)) + print( + "\n### main.py was started on %s in %s with logs in %s" + % (os.uname()[1], os.getcwd(), run_logdir) + ) + print(sys.argv) + print_git_version() + # print_source_code() + save_source_code(run_logdir) + now = begin_time + now_string = str(now).replace(" ", "_").replace(":","_") + + plt.rcParams.update({'font.size': 15}) + + write_description() + + + # ------------------------------------------------------------------------------- + + # ----------------------------------------------- + # LOAD DATA + # ----------------------------------------------- + data, random_selection_of_users = load_saved_data(DATA_FOLDER,plotting=PLOTTING_EVALUATION,name=NAME_STRING_DATA,lower_accuracy_limit=LOWER_ACCURACY_LIMIT,upper_accuracy_limit=UPPER_ACCURACY_LIMIT,nbr_subjects_for_visualization=NBR_USERS_FOR_VISUALIZATION) + + # ----------------------------------------------- + # Run the code. + # ----------------------------------------------- + # Plot scatterplots and boxplot of the data features. + if PLOTTING_DATA: + data_plotting(data,run_logdir=run_logdir) + + # Train the classifiers to predict transfer learning performance. + run_all_classifiers(data,ALL_CLFS,run_logdir=run_logdir,plotting=PLOTTING_ESTIMATORS,nbr_folds=NBR_FOLDS) + + + if PLOTTING_EVALUATION: + for (clf_name,clf_type) in CLF_EVALUATION: + # Generates lineplot with methods compared over different number of selected source data. + baseline_comparision_varying_nbr_source_data(data,clf_name,clf_type,run_logdir=run_logdir,max_values_for_each_method=MAX_VALUES_FOR_EACH_METHOD_IN_BASELINE_COMPARISION,extra_string_for_fig_name=f'') + + # Generate matrix with mean performance and statistical significance. 
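+            # Example of the resulting figure-name suffix: with the three methods currently
+            # listed in METHODS_IN_MAX_OF_MEHTODS and nbr_random_val = 2, the suffix becomes
+            # '_nbr_values_evaluated_per_method_6'.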
+            for nbr_random_val in NBR_OF_VALUES_FOR_EACH_METHOD_IN_MAX_OF_METHODS:
+                baseline_comparision_mean_performance(data,clf_name,clf_type,run_logdir=run_logdir,nbr_of_values_for_each_method_in_max_of_methods=nbr_random_val,extra_string_for_fig_name=f'_nbr_values_evaluated_per_method_{nbr_random_val*len(METHODS_IN_MAX_OF_MEHTODS)}')
+
+    try:
+        print('\nVoting classifier:')
+        for (name, _) in clfs_for_voting_classifier:
+            print(name)
+    except NameError:
+        # clfs_for_voting_classifier is only defined when a voting classifier has been configured.
+        pass
+
+    plt.close('all')
+    print("\n...done!")
+    print("### Total run time: %s" % (datetime.datetime.now() - begin_time))