Source code for ETIA.data.utils

from enum import Enum
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder



[docs] class DataTypes(Enum): CONTINUOUS = 1 DISCRETE = 2 MIXED = 3 GRAPH = 4 COVARIANCE = 5 ALL = 6
[docs] def var_types_and_categorical_encoding(data, unique_val_thr=5): ''' Returns information about the data type (continuous or categorical) of each column in data. Args: data: pandas array with possible nan values, str, int, floats and objects unique_val_thr: int value to Returns: data_type_info : numpy array with two columns : 1st column has the names of the variables and the 2nd column has the information 'continuous' or 'catagorical' ''' d = {'var_type': ['continuous' for i in data.columns], 'n_domain': [0 for i in data.columns]} data_type_info = pd.DataFrame(data=d, index=data.columns) for var in data.columns: # check 1: check if the column has only str cur_col = pd.to_numeric(data[var], errors='coerce') if pd.isna(cur_col).all(): # input is str data_type_info.loc[var, 'var_type'] = 'categorical' # check 2: check if there are less than thr number of unique values else: # continuous if len(data[var].unique()) < unique_val_thr: data_type_info.loc[var, 'var_type'] = 'categorical' # else: # data[var] = data[var].astype('float') # apply ordinal encoding to categorical variables # categorical_var_names = data_type_info.index[data_type_info['var_type'] == 'categorical'].tolist() categorical_var_names = data_type_info.index[data_type_info['var_type'] == 'categorical'] #.tolist() ord_encoder = OrdinalEncoder() ord_encoder.fit(data[categorical_var_names]) data[categorical_var_names] = ord_encoder.transform(data[categorical_var_names]) # how many classes they have for var in categorical_var_names: unique_classes = data[var].unique() data_type_info.loc[var, 'n_domain'] = np.nanmax(unique_classes) + 1 # [0,1,...,maxC] # data type if data_type_info['var_type'].eq('continuous').all(): data_type = 'continuous' elif data_type_info['var_type'].eq('categorical').all(): data_type = 'categorical' else: data_type = 'mixed' # Check if the DataFrame contains any missing values data_type_info['contains_missing_values'] = data.isnull().values.any() # Check if the DataFrame contains any constant variables data_type_info['contains_constant_vars'] = (data.apply(pd.Series.nunique) == 1).any() return data, data_type_info, data_type
[docs] def get_data_info(data): data_info = {} # Get the number of features (columns) data_info['num_features'] = len(data.columns) # Get the number of samples (rows) data_info['num_samples'] = len(data.index) # Check if the DataFrame contains any missing values data_info['contains_missing_values'] = data.isnull().values.any() # Check if the DataFrame contains any constant variables data_info['contains_constant_vars'] = (data.apply(pd.Series.nunique) == 1).any()