@misc{10481/102834,
  year         = {2025},
  month        = feb,
  url          = {https://hdl.handle.net/10481/102834},
  abstract     = {Network traffic datasets are essential for the construction of traffic models, often using machine learning (ML) techniques. Among other applications, these models can be employed to solve complex optimization problems or to identify anomalous behaviors, i.e., behaviors that deviate from the established model. However, the performance of the ML model depends, among other factors, on the quality of the data used to train it. Benchmark datasets, with a profound impact on research findings, are often assumed to be of good quality by default. In this paper, we derive four variants of a benchmark dataset in network anomaly detection (UGR’16, a flow-based real-world traffic dataset designed for anomaly detection), and show that the choice among variants has a larger impact on model performance than the ML technique used to build the model. To analyze this phenomenon, we propose a methodology to investigate the causes of these differences and to assess the quality of the data labeling. Our results underline the importance of paying more attention to data quality assessment in network anomaly detection.},
  organization = {Agencia Estatal de Investigación in Spain, MCIN/AEI/10.13039/501100011033, grant No. PID2020-113462RB-I00},
  publisher    = {MDPI},
  keywords     = {Netflow, UGR’16, anomaly detection},
  title        = {Data quality tools to enhance a network anomaly detection benchmark},
  doi          = {10.3390/data10030033},
  author       = {Camacho Páez, José and Rodríguez Gómez, Rafael A.},
}