% Journal article record. The citation key is the repository handle and is
% kept unchanged so that existing citations continue to resolve.
% NOTE(review): journal, volume, number and pages (article number) are read
% from the DOI suffix "math10142538" -- confirm against the publisher record.
% Funding acknowledgements live in the non-standard "funding" field, which
% standard bibliography styles ignore; items are kept in their original order.
@article{10481/76646,
  author    = {Sáez Muñoz, José Antonio and Romero Béjar, José Luis},
  title     = {Impact of Regressand Stratification in Dataset Shift Caused by Cross-Validation},
  journal   = {Mathematics},
  volume    = {10},
  number    = {14},
  pages     = {2538},
  year      = {2022},
  month     = jul,
  publisher = {MDPI},
  doi       = {10.3390/math10142538},
  url       = {http://hdl.handle.net/10481/76646},
  keywords  = {Cross-validation, Dataset shift, Target shift, Stratification, Regression},
  funding   = {MCIU/AEI/ERDF, UE (PGC2018098860-B-I00); ERDF Operational Programme 2014-2020; Economy and Knowledge Council of the Regional Government of Andalusia, Spain; MCIN/AEI (CEX2020-001105-M); A-FQM-345-UGR18},
  abstract  = {Data that have not been modeled cannot be correctly predicted. Under this assumption, this
    research studies how k-fold cross-validation can introduce dataset shift in regression problems. This
    fact implies data distributions in the training and test sets to be different and, therefore, a deterioration
    of the model performance estimation. Even though the stratification of the output variable is widely
    used in the field of classification to reduce the impacts of dataset shift induced by cross-validation, its
    use in regression is not widespread in the literature. This paper analyzes the consequences for dataset
    shift of including different regressand stratification schemes in cross-validation with regression data.
    The results obtained show that these allow for creating more similar training and test sets, reducing
    the presence of dataset shift related to cross-validation. The bias and deviation of the performance
    estimation results obtained by regression algorithms are improved using the highest amounts of
    strata, as are the number of cross-validation repetitions necessary to obtain these better results.},
}