Source code for ETIA.AFS.feature_selector

import os
import subprocess
import logging
from typing import Dict, Optional, Any

import pandas as pd
import uuid


class FeatureSelector:
    """
    Feature selection with the MXM R package.

    The selector writes the input data to a temporary CSV file, invokes an
    R script through ``Rscript`` and reads the script's output CSV back into
    a DataFrame. Temporary files live next to this module and are removed
    when the call finishes, whether it succeeds or fails.

    Methods
    -------
    feature_selection(config, target_name, data_pd, dataset_name, train_idx_name=None, verbose=False)
        Runs the feature selection process based on the provided configuration.
    """

    # Values of ``config['fs_name']`` that ``feature_selection`` accepts.
    # Each one is also the name of the public method that runs it.
    _ALGORITHMS = ('fbed', 'ses')

    def __init__(self, r_path: str):
        """
        Initializes the FeatureSelector.

        Parameters
        ----------
        r_path : str
            Path to the Rscript executable for running R-based feature
            selection algorithms.
        """
        self.r_path = r_path
        # Directory of this module: R scripts are looked up under it and
        # temporary CSV files are created in it.
        self.path_ = os.path.dirname(__file__)

        # Setup logging
        self.logger = logging.getLogger(__name__)

    def run_r_script(
        self,
        script_path: str,
        data_file_path: str,
        target_name: str,
        config: Dict[str, Any],
        output_file: str,
        train_idx_name: Optional[str] = None,
        verbose: bool = False
    ) -> pd.DataFrame:
        """
        Runs the specified R script for feature selection.

        Parameters
        ----------
        script_path : str
            Path of the R script to execute.
        data_file_path : str
            Path of the CSV file holding the input data.
        target_name : str
            Name of the target variable, forwarded to the R script.
        config : dict
            Must contain the keys ``'ind_test_name'``, ``'alpha'`` and
            ``'k'``; their values are forwarded to the R script
            (``alpha`` and ``k`` converted with ``str``).
        output_file : str
            Path of the CSV file that is read back after the script ends
            (presumably written by the R script).
        train_idx_name : str, optional
            File name of the training indices. When given, it is joined to
            the directory of this module and appended as the last argument.
        verbose : bool, optional
            If True, ``'TRUE'`` is passed to the script and its stdout and
            stderr are logged at INFO level.

        Returns
        -------
        pd.DataFrame
            Contents of ``output_file`` as parsed by ``pandas.read_csv``.

        Raises
        ------
        KeyError
            If a required key is missing from ``config``.
        RuntimeError
            If the R script exits with a non-zero return code.
        """
        args = [
            self.r_path,
            '--vanilla',
            script_path,
            data_file_path,
            target_name,
            config['ind_test_name'],
            str(config['alpha']),
            str(config['k']),
            output_file,
            'TRUE' if verbose else 'FALSE',
        ]
        if train_idx_name:
            args.append(os.path.join(self.path_, train_idx_name))

        # A list of arguments (no shell) keeps file names and config values
        # from being interpreted by a shell.
        result = subprocess.run(args, capture_output=True, text=True)

        if verbose:
            self.logger.info("R script stdout:")
            self.logger.info("%s", result.stdout)
            self.logger.info("R script stderr:")
            self.logger.info("%s", result.stderr)

        if result.returncode != 0:
            self.logger.error(
                "R script %s failed with return code %s",
                script_path, result.returncode,
            )
            self.logger.error("R script stderr: %s", result.stderr)
            raise RuntimeError(
                f"R script {script_path} failed with return code {result.returncode}"
            )

        return pd.read_csv(output_file)

    def _run_bundled_script(
        self,
        script_name: str,
        target_name: str,
        config: Dict[str, Any],
        data_file_path: str,
        output_file: str,
        train_idx_name: Optional[str],
        verbose: bool
    ) -> pd.DataFrame:
        """Runs an R script located in the ``feature_selectors`` directory."""
        script_path = os.path.join(self.path_, 'feature_selectors', script_name)
        return self.run_r_script(
            script_path,
            data_file_path,
            target_name,
            config,
            output_file,
            train_idx_name,
            verbose
        )

    def fbed(
        self,
        target_name: str,
        config: Dict[str, Any],
        data_file_path: str,
        output_file: str,
        train_idx_name: Optional[str] = None,
        verbose: bool = False
    ) -> pd.DataFrame:
        """
        Runs the FBED feature selection algorithm.

        Executes ``feature_selectors/fbed_with_idx.R`` through
        ``run_r_script``; see that method for the parameters, the return
        value and the exceptions raised.
        """
        return self._run_bundled_script(
            'fbed_with_idx.R', target_name, config, data_file_path,
            output_file, train_idx_name, verbose
        )

    def ses(
        self,
        target_name: str,
        config: Dict[str, Any],
        data_file_path: str,
        output_file: str,
        train_idx_name: Optional[str] = None,
        verbose: bool = False
    ) -> pd.DataFrame:
        """
        Runs the SES feature selection algorithm.

        Executes ``feature_selectors/ses_with_idx.R`` through
        ``run_r_script``; see that method for the parameters, the return
        value and the exceptions raised.
        """
        return self._run_bundled_script(
            'ses_with_idx.R', target_name, config, data_file_path,
            output_file, train_idx_name, verbose
        )

    def feature_selection(
        self,
        config: Dict[str, Any],
        target_name: str,
        data_pd: pd.DataFrame,
        dataset_name: str,
        train_idx_name: Optional[str] = None,
        verbose: bool = False
    ) -> pd.DataFrame:
        """
        Runs the feature selection process based on the provided configuration.

        Parameters
        ----------
        config : dict
            Configuration of the run. ``config['fs_name']`` selects the
            algorithm (``'fbed'`` or ``'ses'``); the remaining keys are
            those required by ``run_r_script``.
        target_name : str
            Name of the target variable.
        data_pd : pd.DataFrame
            Data to run the selection on; written to a temporary CSV file
            without the index.
        dataset_name : str
            Prefix of the temporary data file name.
        train_idx_name : str, optional
            File name of the training indices, forwarded to the R script.
        verbose : bool, optional
            If True, the R script output is logged.

        Returns
        -------
        pd.DataFrame
            The DataFrame produced by the chosen algorithm.

        Raises
        ------
        ValueError
            If ``config['fs_name']`` is missing or not a supported algorithm.
        RuntimeError
            If the R script exits with a non-zero return code.
        """
        # Validate first so an unsupported algorithm never costs a dataset dump.
        fs_name = config.get('fs_name')
        if fs_name not in self._ALGORITHMS:
            raise ValueError(f"Unsupported feature selection algorithm: {fs_name}")
        run_algorithm = getattr(self, fs_name)

        # Generate unique file names so concurrent runs do not collide.
        unique_id = str(uuid.uuid4())
        data_file_path = os.path.join(self.path_, f"{dataset_name}_{unique_id}.csv")
        output_file = os.path.join(self.path_, f"selected_features_{unique_id}.csv")

        try:
            # Written inside the try block so a partially written file is
            # cleaned up as well when the dump itself fails.
            data_pd.to_csv(data_file_path, index=False)
            return run_algorithm(
                target_name, config, data_file_path, output_file,
                train_idx_name, verbose
            )
        finally:
            # Ensure the CSV files are deleted after feature selection.
            for temp_path in (data_file_path, output_file):
                try:
                    os.remove(temp_path)
                except FileNotFoundError:
                    pass