@misc{10481/102834,
  year         = {2025},
  month        = feb,
  url          = {https://hdl.handle.net/10481/102834},
  abstract     = {Network traffic datasets are essential for the construction of traffic models, often using machine learning (ML) techniques. Among other applications, these models can be employed to solve complex optimization problems or to identify anomalous behaviors, i.e., behaviors that deviate from the established model. However, the performance of the ML model depends, among other factors, on the quality of the data used to train it. Benchmark datasets, with a profound impact on research findings, are often assumed to be of good quality by default. In this paper, we derive four variants of a benchmark dataset in network anomaly detection (UGR’16, a flow-based real-world traffic dataset designed for anomaly detection), and show that the choice among variants has a larger impact on model performance than the ML technique used to build the model. To analyze this phenomenon, we propose a methodology to investigate the causes of these differences and to assess the quality of the data labeling. Our results underline the importance of paying more attention to data quality assessment in network anomaly detection.},
  organization = {Agencia Estatal de Investigación in Spain, MCIN/AEI/10.13039/501100011033, grant No. PID2020-113462RB-I00},
  publisher    = {MDPI},
  keywords     = {Netflow, UGR’16, anomaly detection},
  title        = {Data quality tools to enhance a network anomaly detection benchmark},
  doi          = {10.3390/data10030033},
  author       = {Camacho Páez, José and Rodríguez Gómez, Rafael A.},
}