% Journal article; url is the Handle identifier of the archived copy, doi is the publisher's record.
% NOTE(review): journal name inferred from the DOI path (gigascience) -- confirm, and add volume/number if known.
@article{10481/106344,
  author       = {Garach Vélez, Ignacio and Ortuño Guzmán, Francisco Manuel and Rojas Ruiz, Ignacio and Herrera Maldonado, Luis Javier},
  title        = {Exploring the role of normalization and feature selection in microbiome disease classification pipelines},
  journal      = {GigaScience},
  year         = {2025},
  publisher    = {Oxford University Press},
  organization = {Departamento de Ingeniería de Computadores, Automática y Robótica},
  doi          = {10.1093/gigascience/giaf096},
  url          = {https://hdl.handle.net/10481/106344},
  abstract     = {Background: Disease classification using 16S rRNA microbiome data faces challenges of high dimensionality, compositionality, and sparsity, compounded by the inherent small sample sizes in many studies. Machine learning and feature selection techniques offer potential to identify robust biomarkers and improve classification performance; however, their comparative effectiveness across diverse methods and datasets has been insufficiently explored. This study evaluates multiple feature selection techniques alongside normalization strategies, focusing on their interplay with classifier performance. Results: Our analyses revealed that centered log-ratio normalization improves the performance of logistic regression and support vector machine models and facilitates feature selection, whereas random forest models yield strong results using relative abundances. Interestingly, presence–absence normalization was able to achieve similar performance compared to abundance-based transformations across classifiers. Among feature selection methods, minimum redundancy maximum relevancy (mRMR) surpassed most methods in identifying compact feature sets and demonstrated performance comparable to least absolute shrinkage and selection operator (LASSO), which obtained top results requiring lower computation times. Autoencoders needed larger latent spaces to perform well and lacked interpretability, Mutual Information suffered from redundancy, and ReliefF struggled with data sparsity. Conclusions: Overall, feature selection pipelines improved model focus and robustness via a massive reduction of the feature space. mRMR and LASSO emerged as the most effective methods across datasets.},
}