@misc{10481/106865,
  title        = {Generalising Stock Detection in Retail Cabinets with Minimal Data Using a DenseNet and Vision Transformer Ensemble},
  author       = {Rahi, Babak and Sagmanli, Deniz and Oppong, Felix and Pekaslan, Direnc and Triguero, Isaac},
  year         = {2025},
  month        = {7},
  doi          = {10.3390/make7030066},
  url          = {https://hdl.handle.net/10481/106865},
  publisher    = {MDPI},
  keywords     = {ensemble model, image classification, DenseNet-201},
  organization = {Innovate UK and Unilever PLC (grant number KTP13113); European Union Next Generation - Ministry for Digital Transformation and the Civil Service (TSI-100927-2023-1 Project)},
  note         = {Published as: Rahi, B.; Sagmanli, D.; Oppong, F.; Pekaslan, D.; Triguero, I. Generalising Stock Detection in Retail Cabinets with Minimal Data Using a DenseNet and Vision Transformer Ensemble. Mach. Learn. Knowl. Extr. 2025, 7, 66. https://doi.org/10.3390/make7030066},
  abstract     = {Generalising deep-learning models to perform well on unseen data domains with minimal retraining remains a significant challenge in computer vision. Even when the target task, such as quantifying the number of elements in an image, stays the same, variations in data quality, shape, or form can deviate from the training conditions, often necessitating manual intervention. As a real-world industrial problem, we aim to automate stock-level estimation in retail cabinets. As technology advances, new cabinet models with varying shapes emerge alongside new camera types, and this evolving scenario poses a substantial obstacle to deploying long-term, scalable solutions. To address the challenge of generalising to new cabinet models and cameras from a minimal number of sample images, this paper proposes a novel ensemble model that combines DenseNet-201 and Vision Transformer (ViT-B/8) architectures for stock-level classification. The novelty of our solution lies in combining a transformer with a DenseNet model to capture both the local, hierarchical details and the long-range dependencies within the images, improving generalisation accuracy with less data. Key contributions include (i) a novel DenseNet-201 + ViT-B/8 feature-level fusion, (ii) an adaptation workflow that needs only two images per class, (iii) a balanced layer-unfreezing schedule, (iv) a publicly described domain-shift benchmark, and (v) a 47-percentage-point accuracy gain over four standard few-shot baselines. Our approach leverages fine-tuning to adapt two pre-trained models to new retail cabinets (i.e., standing or horizontal) and camera types using only two images per class. Experimental results demonstrate that our method achieves 91% accuracy on new cabinets with the same camera and 89% on new cabinets with different cameras, significantly outperforming standard few-shot learning methods.},
}
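
For readers who want a concrete picture of the feature-level fusion described in the abstract, below is a minimal PyTorch sketch. It is an assumption-laden illustration, not the authors' released implementation: the use of timm's vit_base_patch8_224 checkpoint for ViT-B/8, the 1920- and 768-dimensional embedding sizes, the single linear fusion head, and the three-class stock-level setup are all inferred or assumed.

    # Hypothetical sketch of a DenseNet-201 + ViT-B/8 feature-level fusion
    # classifier, per the abstract; layer choices and head are assumptions.
    import torch
    import torch.nn as nn
    from torchvision.models import densenet201, DenseNet201_Weights
    import timm  # provides a pre-trained ViT-B/8 as "vit_base_patch8_224"

    class FusionClassifier(nn.Module):
        def __init__(self, num_classes: int):
            super().__init__()
            # CNN branch: local, hierarchical features.
            # DenseNet-201's pooled output is 1920-dimensional.
            self.cnn = densenet201(weights=DenseNet201_Weights.IMAGENET1K_V1)
            self.cnn.classifier = nn.Identity()  # expose features, drop the head
            # Transformer branch: long-range dependencies.
            # num_classes=0 makes timm return the 768-dim pooled embedding.
            self.vit = timm.create_model(
                "vit_base_patch8_224", pretrained=True, num_classes=0
            )
            # Feature-level fusion: concatenate both embeddings, then classify.
            self.head = nn.Linear(1920 + 768, num_classes)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            fused = torch.cat([self.cnn(x), self.vit(x)], dim=1)
            return self.head(fused)

    # Assumed 3 stock levels (e.g. low / medium / full) and 224x224 inputs.
    model = FusionClassifier(num_classes=3)
    logits = model(torch.randn(2, 3, 224, 224))
    print(logits.shape)  # torch.Size([2, 3])

In a few-shot adaptation like the paper's two-images-per-class workflow, one would typically freeze most of both backbones and unfreeze layers gradually; the "balanced layer-unfreezing schedule" named in the contributions is not reproduced here.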