% Survey article record exported from a handle.net repository entry (see url).
% The keywords and the funder strings were each repeated as separate fields
% in the export; a field name may appear only once per entry, so they are
% merged here: keywords as a comma-separated list, organization as an
% "and"-separated list. The abstract is kept verbatim.
@misc{10481/108279,
  author       = {Núñez Nepomuceno, David and Sáez Muñoz, José Antonio and Carmona Sáez, Pedro},
  title        = {A survey of preprocessing techniques for flow cytometry data in classification tasks},
  publisher    = {IEEE},
  organization = {MCIN/AEI/10.13039/501100011033 PID2020-119032RB-I00 and University of Granada PP2024PP-07},
  year         = {2025},
  doi          = {10.1109/TCBBIO.2025.3612573},
  url          = {https://hdl.handle.net/10481/108279},
  keywords     = {Flow cytometry, Preprocessing, Data quality},
  abstract     = {Flow cytometry is an advanced technique for analyzing cellular heterogeneity in biomedical research and clinical diagnostics. Its ability to generate multiparametric data has facilitated advancements in disease classification problems, particularly addressing challenges in distinguishing cell populations and predicting disease outcomes. In these classification tasks, most works consist on either labeling individual cells based on their phenotypic markers or categorizing patient samples as healthy or diseased. However, the complexity of flow cytometry data, characterized by hurdles such as spectral overlap, wide dynamic ranges or batch effects require the usage of preprocessing strategies prior to modeling analysis. This research provides a comprehensive survey of current preprocessing techniques for flow cytometry data used in classification tasks and discusses their specific applications, focusing on four key aspects of data treatment: signal compensation and transformation, batch effect mitigation, imperfect data treatment, and feature selection and class balance. Emphasis is placed on standardizing preprocessing workflows and addressing computational and analytical difficulties posed by the size of modern flow cytometry datasets. The paper also includes a discussion on future opportunities for improving preprocessing pipelines to improve the reproducibility of flow cytometry-based classification models.
In short, this work serves as a reference for experts in the field, consolidating best practices in preprocessing and providing guidelines for the development of methodologies that optimize the classification of flow cytometry data.},
}