Module jidenn.data.data_info

Expand source code
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
import tensorflow as tf
import pandas as pd
from typing import Union, List, Dict, Optional
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, f_classif

# Module-level side effect: importing this module switches seaborn to its
# "ticks" theme, which the plotting helpers below then draw with.
sns.set_theme(style="ticks")


def tf_dataset_to_pandas(dataset: tf.data.Dataset, var_names: List[str]) -> pd.DataFrame:
    """Materialise a dataset of (data, label, weight) elements as a DataFrame.

    The whole dataset is pulled into memory through `as_numpy_iterator()`.

    Args:
        dataset: Dataset whose elements unpack into exactly three parts:
            data, label and weight.
        var_names: Column names for the data part (presumably one name per
            entry of each data row -- confirm against the caller).

    Returns:
        A frame with one column per entry of `var_names` plus the columns
        'label' and 'weight'. An empty dataset gives an empty frame that
        still has all of these columns.
    """
    samples = list(dataset.as_numpy_iterator())
    if not samples:
        # zip(*[]) yields nothing, so the three-way unpacking below would
        # raise "not enough values to unpack" on an empty dataset.
        return pd.DataFrame(columns=[*var_names, 'label', 'weight'])

    # transpose the list of triples into three parallel tuples
    data, labels, weights = zip(*samples)

    df = pd.DataFrame(data=np.array(data), columns=var_names)
    df['label'] = np.array(labels)
    df['weight'] = np.array(weights)
    return df


def explode_nested_variables(df: pd.DataFrame, exploding_column: str, max_iterations: int = 5) -> pd.DataFrame:
    """Flatten a column of nested sequences into one numeric value per row.

    `pd.to_numeric` is tried on `exploding_column`; every time it fails with
    `ValueError` or `TypeError` the column is exploded by one level (the other
    columns are repeated for each produced row) and the rows are shuffled.

    Args:
        df: Frame holding the column to flatten. If the column converts on the
            very first attempt, the converted column is assigned into this
            same object.
        exploding_column: Name of the column to flatten.
        max_iterations: Maximum number of explode passes.

    Returns:
        The frame with `exploding_column` converted to a numeric dtype when a
        conversion succeeded; otherwise the frame as it stands after the last
        explode pass, with the column left unconverted.
    """
    pending_conversion = False
    for _ in range(max_iterations):
        try:
            df[exploding_column] = pd.to_numeric(df[exploding_column])
        except (ValueError, TypeError):
            # Still nested (or not numeric): unpack one level and shuffle rows.
            df = df.explode(exploding_column, ignore_index=True)
            df = df.sample(n=len(df.index)).reset_index(drop=True)
            pending_conversion = True
        else:
            return df

    if pending_conversion:
        # The final pass ended with an explode whose result has not been
        # converted yet; without this attempt a column nested exactly
        # `max_iterations` deep would come back with object dtype.
        try:
            df[exploding_column] = pd.to_numeric(df[exploding_column])
        except (ValueError, TypeError):
            # Best effort: the column is handed back unconverted.
            pass
    return df


def plot_corrolation_matrix(corr_matrix: pd.DataFrame, save_path: str) -> None:
    """Render `corr_matrix` as an annotated heatmap and write it to `save_path`.

    Args:
        corr_matrix: Matrix of values to draw; each cell is annotated with its
            value formatted to one decimal place.
        save_path: Destination file of the figure (saved at 300 dpi).

    Note:
        Every open matplotlib figure is closed before returning.
    """
    palette = sns.diverging_palette(230, 20, as_cmap=True)

    canvas = plt.figure(figsize=(21, 18))
    # axes rectangle given as [x0, y0, width, height]
    canvas.add_axes([0.2, 0.2, 0.8, 0.8])

    # colour scale centred on zero, square cells, thin separators between them
    heatmap_options = dict(
        cmap=palette,
        center=0,
        square=True,
        linewidths=.5,
        annot=True,
        fmt='.1f',
        cbar_kws={'shrink': .8},
    )
    sns.heatmap(corr_matrix, **heatmap_options)

    plt.savefig(save_path, dpi=300)
    plt.close('all')


def generate_data_distributions(df: pd.DataFrame,
                                folder: str,
                                color_column: str = 'named_label',
                                hue_order: Optional[List[str]] = None,
                                xlabel_mapper: Optional[Dict[str, str]] = None) -> None:
    """Save a per-class density histogram for every column of `df`.

    For each column except `color_column` (and always for 'label' and
    'weight') a histogram coloured by `color_column` is written four times:
    `<folder>/jpg/<name>.jpg`, `<folder>/pdf/<name>.pdf` and, with a
    logarithmic y axis, `<folder>/jpg_log/<name>.jpg` and
    `<folder>/pdf_log/<name>.pdf`. The sub-folders are created when missing.

    Columns of object dtype are flattened with `explode_nested_variables`
    and their zero entries are dropped before plotting.

    Args:
        df: Data to plot; must contain `color_column`, 'label' and 'weight'.
        folder: Root directory for the output files.
        color_column: Column used as the histogram hue.
        hue_order: Order of the hue levels, forwarded to `sns.histplot`.
        xlabel_mapper: Optional mapping from column name to x-axis label;
            columns missing from it are labelled with their own name.

    Raises:
        ValueError: If `color_column` is not a column of `df`.
        KeyError: If 'label' or 'weight' is not a column of `df`.

    Note:
        Every open matplotlib figure is closed after each column.
    """
    var_names = list(df.columns)
    var_names.remove(color_column)
    # 'label' and 'weight' are normally already among the columns; dropping
    # the repeats keeps them from being plotted and saved a second time.
    plot_names = list(dict.fromkeys(var_names + ['label', 'weight']))

    for subfolder in ('jpg', 'pdf', 'jpg_log', 'pdf_log'):
        os.makedirs(os.path.join(folder, subfolder), exist_ok=True)

    hist_kwargs = dict(hue=color_column, stat='density', element="step", fill=True,
                       palette='Set1', common_norm=False, hue_order=hue_order)

    for var_name in plot_names:
        small_df = df[[var_name, color_column]].copy()

        if small_df[var_name].dtype == 'object':
            small_df = explode_nested_variables(small_df, var_name)
            small_df = small_df.loc[small_df[var_name] != 0]

        try:
            try:
                sns.histplot(data=small_df, x=var_name, **hist_kwargs)
            except Exception:
                # Automatic binning failed: wipe anything the failed attempt
                # may already have drawn and retry with a fixed bin count.
                plt.clf()
                sns.histplot(data=small_df, x=var_name, bins=100, **hist_kwargs)

            if xlabel_mapper is not None and var_name in xlabel_mapper:
                plt.xlabel(xlabel_mapper[var_name])
            else:
                plt.xlabel(var_name)

            plt.savefig(os.path.join(folder, 'jpg', f'{var_name}.jpg'), dpi=300, bbox_inches='tight')
            plt.savefig(os.path.join(folder, 'pdf', f'{var_name}.pdf'), bbox_inches='tight')
            # same figure again, now with a logarithmic y axis
            plt.yscale('log')
            plt.savefig(os.path.join(folder, 'jpg_log', f'{var_name}.jpg'), dpi=300, bbox_inches='tight')
            plt.savefig(os.path.join(folder, 'pdf_log', f'{var_name}.pdf'), bbox_inches='tight')
        finally:
            # release the figure even when plotting or saving failed
            plt.close('all')


def plot_feature_importance(df: pd.DataFrame, fig_path: str, score_name: str = 'score', variable_name: str = 'variable') -> None:
    """Save a horizontal bar chart of scores, highest score first.

    Args:
        df: Frame with one row per variable, holding the columns named by
            `score_name` and `variable_name`.
        fig_path: Destination file of the figure (saved at 300 dpi).
        score_name: Column with the bar lengths; also the sort key.
        variable_name: Column with the bar labels.

    Note:
        Every open matplotlib figure is closed before returning.
    """
    ranked = df.sort_values(score_name, ascending=False)
    ranked = ranked.reset_index(drop=True)

    canvas = plt.figure(figsize=(10, 15))
    # axes rectangle given as [x0, y0, width, height]
    axes = canvas.add_axes([0.33, 0.05, 0.6, 0.92])
    sns.barplot(data=ranked, x=score_name, y=variable_name, orient='h')

    # Set after plotting so these labels are the ones on the saved figure.
    axes.set_ylabel(ylabel="")
    axes.set_xlabel(xlabel="Score")

    plt.savefig(fig_path, dpi=300)
    plt.close('all')


def feature_importance(df: pd.DataFrame,
                       folder: str,
                       k: Union[int, None] = None) -> None:
    """Score every feature against 'label' and save one bar chart per score.

    Two score functions are used, `f_classif` and `mutual_info_classif`; the
    charts are written to `<folder>/feature_linear.png` and
    `<folder>/feature_mutual.png` respectively. The 'weight' column is dropped
    and takes no part in the scoring.

    Args:
        df: Frame with the feature columns plus 'label' and 'weight'.
        folder: Output directory; created when it does not exist yet.
        k: Number of features handed to `SelectKBest`; defaults to all of
            them. The charts always show the scores of every feature.
    """
    X = df.drop(['label', 'weight'], axis=1)
    y = df['label']
    if k is None:
        k = len(X.columns)

    # An empty `folder` means the current directory, which needs no creation
    # (and os.makedirs('') would raise).
    if folder:
        os.makedirs(folder, exist_ok=True)

    scorers = (('linear', f_classif), ('mutual', mutual_info_classif))
    for score_name, score_func in scorers:
        selector = SelectKBest(score_func=score_func, k=k).fit(X, y)
        feature_scores = pd.DataFrame({'score': selector.scores_, 'variable': X.columns})
        plot_feature_importance(feature_scores, os.path.join(folder, f'feature_{score_name}.png'))

Functions

def explode_nested_variables(df: pandas.core.frame.DataFrame, exploding_column: str, max_iterations: int = 5) ‑> pandas.core.frame.DataFrame
Expand source code
def explode_nested_variables(df: pd.DataFrame, exploding_column: str, max_iterations: int = 5) -> pd.DataFrame:
    """Flatten a column of nested sequences into one numeric value per row.

    `pd.to_numeric` is tried on `exploding_column`; every time it fails with
    `ValueError` or `TypeError` the column is exploded by one level (the other
    columns are repeated for each produced row) and the rows are shuffled.

    Args:
        df: Frame holding the column to flatten. If the column converts on the
            very first attempt, the converted column is assigned into this
            same object.
        exploding_column: Name of the column to flatten.
        max_iterations: Maximum number of explode passes.

    Returns:
        The frame with `exploding_column` converted to a numeric dtype when a
        conversion succeeded; otherwise the frame as it stands after the last
        explode pass, with the column left unconverted.
    """
    pending_conversion = False
    for _ in range(max_iterations):
        try:
            df[exploding_column] = pd.to_numeric(df[exploding_column])
        except (ValueError, TypeError):
            # Still nested (or not numeric): unpack one level and shuffle rows.
            df = df.explode(exploding_column, ignore_index=True)
            df = df.sample(n=len(df.index)).reset_index(drop=True)
            pending_conversion = True
        else:
            return df

    if pending_conversion:
        # The final pass ended with an explode whose result has not been
        # converted yet; without this attempt a column nested exactly
        # `max_iterations` deep would come back with object dtype.
        try:
            df[exploding_column] = pd.to_numeric(df[exploding_column])
        except (ValueError, TypeError):
            # Best effort: the column is handed back unconverted.
            pass
    return df
def feature_importance(df: pandas.core.frame.DataFrame, folder: str, k: Optional[int] = None) ‑> None
Expand source code
def feature_importance(df: pd.DataFrame,
                       folder: str,
                       k: Union[int, None] = None) -> None:
    """Score every feature against 'label' and save one bar chart per score.

    Two score functions are used, `f_classif` and `mutual_info_classif`; the
    charts are written to `<folder>/feature_linear.png` and
    `<folder>/feature_mutual.png` respectively. The 'weight' column is dropped
    and takes no part in the scoring.

    Args:
        df: Frame with the feature columns plus 'label' and 'weight'.
        folder: Output directory; created when it does not exist yet.
        k: Number of features handed to `SelectKBest`; defaults to all of
            them. The charts always show the scores of every feature.
    """
    X = df.drop(['label', 'weight'], axis=1)
    y = df['label']
    if k is None:
        k = len(X.columns)

    # An empty `folder` means the current directory, which needs no creation
    # (and os.makedirs('') would raise).
    if folder:
        os.makedirs(folder, exist_ok=True)

    scorers = (('linear', f_classif), ('mutual', mutual_info_classif))
    for score_name, score_func in scorers:
        selector = SelectKBest(score_func=score_func, k=k).fit(X, y)
        feature_scores = pd.DataFrame({'score': selector.scores_, 'variable': X.columns})
        plot_feature_importance(feature_scores, os.path.join(folder, f'feature_{score_name}.png'))
def generate_data_distributions(df: pandas.core.frame.DataFrame, folder: str, color_column: str = 'named_label', hue_order: Optional[List[str]] = None, xlabel_mapper: Optional[Dict[str, str]] = None) ‑> None
Expand source code
def generate_data_distributions(df: pd.DataFrame,
                                folder: str,
                                color_column: str = 'named_label',
                                hue_order: Optional[List[str]] = None,
                                xlabel_mapper: Optional[Dict[str, str]] = None) -> None:
    """Save a per-class density histogram for every column of `df`.

    For each column except `color_column` (and always for 'label' and
    'weight') a histogram coloured by `color_column` is written four times:
    `<folder>/jpg/<name>.jpg`, `<folder>/pdf/<name>.pdf` and, with a
    logarithmic y axis, `<folder>/jpg_log/<name>.jpg` and
    `<folder>/pdf_log/<name>.pdf`. The sub-folders are created when missing.

    Columns of object dtype are flattened with `explode_nested_variables`
    and their zero entries are dropped before plotting.

    Args:
        df: Data to plot; must contain `color_column`, 'label' and 'weight'.
        folder: Root directory for the output files.
        color_column: Column used as the histogram hue.
        hue_order: Order of the hue levels, forwarded to `sns.histplot`.
        xlabel_mapper: Optional mapping from column name to x-axis label;
            columns missing from it are labelled with their own name.

    Raises:
        ValueError: If `color_column` is not a column of `df`.
        KeyError: If 'label' or 'weight' is not a column of `df`.

    Note:
        Every open matplotlib figure is closed after each column.
    """
    var_names = list(df.columns)
    var_names.remove(color_column)
    # 'label' and 'weight' are normally already among the columns; dropping
    # the repeats keeps them from being plotted and saved a second time.
    plot_names = list(dict.fromkeys(var_names + ['label', 'weight']))

    for subfolder in ('jpg', 'pdf', 'jpg_log', 'pdf_log'):
        os.makedirs(os.path.join(folder, subfolder), exist_ok=True)

    hist_kwargs = dict(hue=color_column, stat='density', element="step", fill=True,
                       palette='Set1', common_norm=False, hue_order=hue_order)

    for var_name in plot_names:
        small_df = df[[var_name, color_column]].copy()

        if small_df[var_name].dtype == 'object':
            small_df = explode_nested_variables(small_df, var_name)
            small_df = small_df.loc[small_df[var_name] != 0]

        try:
            try:
                sns.histplot(data=small_df, x=var_name, **hist_kwargs)
            except Exception:
                # Automatic binning failed: wipe anything the failed attempt
                # may already have drawn and retry with a fixed bin count.
                plt.clf()
                sns.histplot(data=small_df, x=var_name, bins=100, **hist_kwargs)

            if xlabel_mapper is not None and var_name in xlabel_mapper:
                plt.xlabel(xlabel_mapper[var_name])
            else:
                plt.xlabel(var_name)

            plt.savefig(os.path.join(folder, 'jpg', f'{var_name}.jpg'), dpi=300, bbox_inches='tight')
            plt.savefig(os.path.join(folder, 'pdf', f'{var_name}.pdf'), bbox_inches='tight')
            # same figure again, now with a logarithmic y axis
            plt.yscale('log')
            plt.savefig(os.path.join(folder, 'jpg_log', f'{var_name}.jpg'), dpi=300, bbox_inches='tight')
            plt.savefig(os.path.join(folder, 'pdf_log', f'{var_name}.pdf'), bbox_inches='tight')
        finally:
            # release the figure even when plotting or saving failed
            plt.close('all')
def plot_corrolation_matrix(corr_matrix: pandas.core.frame.DataFrame, save_path: str) ‑> None
Expand source code
def plot_corrolation_matrix(corr_matrix: pd.DataFrame, save_path: str) -> None:
    """Render `corr_matrix` as an annotated heatmap and write it to `save_path`.

    Args:
        corr_matrix: Matrix of values to draw; each cell is annotated with its
            value formatted to one decimal place.
        save_path: Destination file of the figure (saved at 300 dpi).

    Note:
        Every open matplotlib figure is closed before returning.
    """
    palette = sns.diverging_palette(230, 20, as_cmap=True)

    canvas = plt.figure(figsize=(21, 18))
    # axes rectangle given as [x0, y0, width, height]
    canvas.add_axes([0.2, 0.2, 0.8, 0.8])

    # colour scale centred on zero, square cells, thin separators between them
    heatmap_options = dict(
        cmap=palette,
        center=0,
        square=True,
        linewidths=.5,
        annot=True,
        fmt='.1f',
        cbar_kws={'shrink': .8},
    )
    sns.heatmap(corr_matrix, **heatmap_options)

    plt.savefig(save_path, dpi=300)
    plt.close('all')
def plot_feature_importance(df: pandas.core.frame.DataFrame, fig_path: str, score_name: str = 'score', variable_name: str = 'variable') ‑> None
Expand source code
def plot_feature_importance(df: pd.DataFrame, fig_path: str, score_name: str = 'score', variable_name: str = 'variable') -> None:
    """Save a horizontal bar chart of scores, highest score first.

    Args:
        df: Frame with one row per variable, holding the columns named by
            `score_name` and `variable_name`.
        fig_path: Destination file of the figure (saved at 300 dpi).
        score_name: Column with the bar lengths; also the sort key.
        variable_name: Column with the bar labels.

    Note:
        Every open matplotlib figure is closed before returning.
    """
    ranked = df.sort_values(score_name, ascending=False)
    ranked = ranked.reset_index(drop=True)

    canvas = plt.figure(figsize=(10, 15))
    # axes rectangle given as [x0, y0, width, height]
    axes = canvas.add_axes([0.33, 0.05, 0.6, 0.92])
    sns.barplot(data=ranked, x=score_name, y=variable_name, orient='h')

    # Set after plotting so these labels are the ones on the saved figure.
    axes.set_ylabel(ylabel="")
    axes.set_xlabel(xlabel="Score")

    plt.savefig(fig_path, dpi=300)
    plt.close('all')
def tf_dataset_to_pandas(dataset: tensorflow.python.data.ops.dataset_ops.DatasetV2, var_names: List[str]) ‑> pandas.core.frame.DataFrame
Expand source code
def tf_dataset_to_pandas(dataset: tf.data.Dataset, var_names: List[str]) -> pd.DataFrame:
    """Materialise a dataset of (data, label, weight) elements as a DataFrame.

    The whole dataset is pulled into memory through `as_numpy_iterator()`.

    Args:
        dataset: Dataset whose elements unpack into exactly three parts:
            data, label and weight.
        var_names: Column names for the data part (presumably one name per
            entry of each data row -- confirm against the caller).

    Returns:
        A frame with one column per entry of `var_names` plus the columns
        'label' and 'weight'. An empty dataset gives an empty frame that
        still has all of these columns.
    """
    samples = list(dataset.as_numpy_iterator())
    if not samples:
        # zip(*[]) yields nothing, so the three-way unpacking below would
        # raise "not enough values to unpack" on an empty dataset.
        return pd.DataFrame(columns=[*var_names, 'label', 'weight'])

    # transpose the list of triples into three parallel tuples
    data, labels, weights = zip(*samples)

    df = pd.DataFrame(data=np.array(data), columns=var_names)
    df['label'] = np.array(labels)
    df['weight'] = np.array(weights)
    return df