Source code for ETIA.data.Dataset

from .utils import *

[docs] class Dataset: """ A class for representing datasets and providing functionalities for loading, manipulating, and processing datasets. Parameters ---------- filename : str, optional The name of the CSV file containing the dataset. Default is None. data_time_info : dict, optional Dictionary containing time-related information (lags, etc.). Default is None. time_series : bool, optional Boolean indicating if the dataset is time series data. Default is False. data : pd.DataFrame, optional A pandas DataFrame containing preloaded data (e.g., from AFS). Default is None. dataset_name : str, optional The name of the dataset. If not provided, it defaults to 'Preloaded Dataset' or the filename. Attributes ---------- dataset_name : str The name of the dataset. data_time_info : dict Information related to time and lags in the dataset. time_series : bool Boolean flag indicating if the data is a time series dataset. n_lags : int The number of time lags in the dataset. data : pd.DataFrame The loaded dataset. data_type_info : dict Information on the types of variables in the dataset. data_type : str General type of data (e.g., continuous, categorical). data_general_info : dict General information about the dataset. processed_data : dict Data after processing (currently empty). annotations : dict Annotations on the dataset (optional). Methods ------- load_file(filename) Loads a new dataset from a CSV file. load_np_dataset(dataset, column_names) Loads a new dataset from a NumPy array. load_pd_dataset(dataset) Loads a new dataset from a pandas DataFrame. convert_to_time_lag(n_lags) Converts the dataset into time-lagged data. get_dataset() Returns the dataset stored in the Dataset instance. get_data_type_info() Returns the data type information of the dataset. get_data_time_info() Returns the time-related information of the dataset. get_info() Returns all the general information of the dataset including type and time-related info. annotate_dataset(annotations) Stores annotations for the dataset. """ def __init__(self, filename=None, data_time_info=None, time_series=False, data=None, dataset_name=None): """ Initializes the Dataset object, either from a file or from a preloaded pandas DataFrame. Parameters ---------- filename : str, optional Name of the CSV file containing the dataset. Default is None. data_time_info : dict, optional Dictionary containing time-related information (lags, etc.). Default is None. time_series : bool, optional Boolean indicating if the dataset is time series data. Default is False. data : pd.DataFrame, optional A pandas DataFrame containing preloaded data (e.g., from AFS). Default is None. dataset_name : str, optional The name of the dataset. If not provided, defaults to 'Preloaded Dataset' or the filename. """ if data_time_info is None: data_time_info = {'n_lags': 0, 'time_lagged': False} self.dataset_name = filename if filename else 'Preloaded Dataset' self.data_time_info = data_time_info self.time_series = time_series self.n_lags = data_time_info['n_lags'] if data is not None: # Use the provided DataFrame self.data = data elif filename is not None: # Load from a CSV file self.data = pd.read_csv(filename, header=0) else: raise ValueError("Either a filename or a pandas DataFrame must be provided") # Process the data types _, self.data_type_info, self.data_type = var_types_and_categorical_encoding(self.data) self.data_general_info = get_data_info(self.data) if not self.data_time_info.get('time_lagged', False) and self.n_lags != 0: self.convert_to_time_lag(self.n_lags) self.processed_data = {}
[docs] def load_file(self, filename): """ Loads a new dataset from a CSV file. Parameters ---------- filename : str Name of the CSV file to load the dataset from. """ self.data = pd.read_csv(filename) _, self.data_type_info, self.data_type = var_types_and_categorical_encoding(self.data) self.data_general_info = get_data_info(self.data)
[docs] def load_np_dataset(self, dataset, column_names): """ Loads a new dataset from a NumPy array. Parameters ---------- dataset : np.ndarray The dataset as a NumPy array. column_names : list List of column names for the dataset. Raises ------ TypeError If the input is not a NumPy array. """ if not isinstance(dataset, np.ndarray): raise TypeError('load_np_dataset requires numpy array as input') self.data = pd.DataFrame(dataset, columns=column_names) _, self.data_type_info, self.data_type = var_types_and_categorical_encoding(self.data) self.data_general_info = get_data_info(self.data)
[docs] def load_pd_dataset(self, dataset): """ Loads a new dataset from a pandas DataFrame. Parameters ---------- dataset : pd.DataFrame The dataset as a pandas DataFrame. Raises ------ TypeError If the input is not a pandas DataFrame. """ if not isinstance(dataset, pd.DataFrame): raise TypeError('load_pd_dataset requires pd.DataFrame as input') self.data = dataset _, self.data_type_info, self.data_type = var_types_and_categorical_encoding(self.data) self.data_general_info = get_data_info(self.data)
[docs] def convert_to_time_lag(self, n_lags): """ Converts the dataset into time-lagged data. Parameters ---------- n_lags : int Number of time lags to add to the dataset. Returns ------- pd.DataFrame The dataset with added time lags (if applicable). """ # Placeholder for converting data # Implement the actual time-lagging logic here return self.data
[docs] def get_dataset(self): """ Returns the dataset stored in the Dataset instance. Returns ------- pd.DataFrame The loaded dataset. """ return self.data
[docs] def get_data_type_info(self): """ Returns the data type information of the dataset. Returns ------- dict A dictionary containing information about the variable types in the dataset. """ return self.data_type_info
[docs] def get_data_time_info(self): """ Returns the time-related information of the dataset. Returns ------- dict A dictionary containing time-related information such as lags and whether the dataset is time-lagged. """ return self.data_time_info
[docs] def get_info(self): """ Returns general information about the dataset, including data types and time-related information. Returns ------- dict A dictionary containing: - data_type_info: Information about variable types in the dataset. - data_time_info: Time-related information about the dataset. - data_type: General type of data (e.g., continuous, categorical). - data_general_info: General information about the dataset. - dataset_name: The name of the dataset. """ return { 'data_type_info': self.data_type_info, 'data_time_info': self.data_time_info, 'data_type': self.data_type, 'data_general_info': self.data_general_info, 'dataset_name': self.dataset_name }
[docs] def annotate_dataset(self, annotations): """ Stores annotations for the dataset. Parameters ---------- annotations : dict Dictionary of annotations related to the dataset. """ self.annotations = annotations