# Source code for ETIA.data.utils

from enum import Enum
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder




# [docs]
class DataTypes(Enum):
    """Closed set of data-type tags, each mapped to a distinct integer (1-6).

    NOTE(review): the exact meaning of each tag and where it is consumed is
    not visible in this file -- confirm against the callers before relying
    on the member names alone.
    """
    # Presumably per-dataset kinds (numeric / discrete / both) -- TODO confirm.
    CONTINUOUS = 1
    DISCRETE = 2
    MIXED = 3
    # Presumably non-tabular inputs -- TODO confirm.
    GRAPH = 4
    COVARIANCE = 5
    # Presumably a wildcard matching every kind above -- TODO confirm.
    ALL = 6


# [docs]
def var_types_and_categorical_encoding(data, unique_val_thr=5):
    """
    Classify each column as continuous or categorical and ordinal-encode the
    categorical ones.

    A column is marked categorical when none of its values can be parsed as a
    number, or when it holds fewer than ``unique_val_thr`` distinct values
    (a NaN counts as one distinct value in that comparison). Every other
    column stays continuous.

    Note:
        ``data`` is modified in place: each categorical column is overwritten
        with its ordinal codes. The same object is also returned.

    Args:
        data: pandas DataFrame with possible nan values, str, int, floats and objects
        unique_val_thr: int; a column with strictly fewer distinct values
            than this threshold is treated as categorical

    Returns:
        data : the input DataFrame, with categorical columns ordinal-encoded
        data_type_info : pandas DataFrame indexed by variable name with the columns
            'var_type' : 'continuous' or 'categorical'
            'n_domain' : number of classes of a categorical variable, 0 for continuous ones
            'contains_missing_values' : dataset-wide flag, repeated on every row
            'contains_constant_vars' : dataset-wide flag, repeated on every row
        data_type : 'continuous', 'categorical' or 'mixed'
    """
    n_vars = len(data.columns)
    data_type_info = pd.DataFrame(
        data={'var_type': ['continuous'] * n_vars, 'n_domain': [0] * n_vars},
        index=data.columns,
    )

    for var in data.columns:
        column = data[var]
        # check 1: nothing survives numeric coercion -> the column is non-numeric
        non_numeric = pd.isna(pd.to_numeric(column, errors='coerce')).all()
        # check 2: too few distinct values to be treated as continuous
        if non_numeric or len(column.unique()) < unique_val_thr:
            data_type_info.loc[var, 'var_type'] = 'categorical'

    categorical_var_names = data_type_info.index[data_type_info['var_type'] == 'categorical']

    # Only touch the encoder when there is something to encode; an
    # all-continuous frame must pass through untouched.
    if len(categorical_var_names) > 0:
        ord_encoder = OrdinalEncoder()
        data[categorical_var_names] = ord_encoder.fit_transform(data[categorical_var_names])

        for var in categorical_var_names:
            # The codes are 0..k-1 and every class occurs in the data the
            # encoder was fitted on, so the number of distinct non-null codes
            # equals max(code) + 1. Counting keeps the value an int and gives
            # 0 (instead of NaN plus a RuntimeWarning) for an all-NaN column.
            data_type_info.loc[var, 'n_domain'] = int(data[var].nunique())

    # overall data type of the dataset
    var_types = data_type_info['var_type']
    if var_types.eq('continuous').all():
        data_type = 'continuous'
    elif var_types.eq('categorical').all():
        data_type = 'categorical'
    else:
        data_type = 'mixed'

    # Check if the DataFrame contains any missing values
    data_type_info['contains_missing_values'] = data.isnull().values.any()

    # Check if the DataFrame contains any constant variables
    data_type_info['contains_constant_vars'] = (data.apply(pd.Series.nunique) == 1).any()

    return data, data_type_info, data_type



# [docs]
def get_data_info(data):
    """Collect basic summary facts about a tabular dataset into a dict.

    Args:
        data: object exposing ``columns``, ``index``, ``isnull()`` and
            ``apply()`` -- presumably a pandas DataFrame; confirm against callers.

    The dict built here carries the keys 'num_features', 'num_samples',
    'contains_missing_values' and 'contains_constant_vars'.

    NOTE(review): no ``return`` statement is visible in this view of the
    file -- confirm that the function hands ``data_info`` back to the caller.
    """
    data_info = {}

    # Get the number of features (columns)
    data_info['num_features'] = len(data.columns)

    # Get the number of samples (rows)
    data_info['num_samples'] = len(data.index)

    # Check if the DataFrame contains any missing values
    data_info['contains_missing_values'] = data.isnull().values.any()

    # Check if the DataFrame contains any constant variables
    # (a column counts as constant when it has exactly one distinct non-null value)
    data_info['contains_constant_vars'] = (data.apply(pd.Series.nunique) == 1).any()