@misc{10481/91950,
  year         = {2023},
  month        = {10},
  url          = {https://hdl.handle.net/10481/91950},
  abstract     = {Capsule networks (CapsNets) aim to parse images into a hierarchy of objects, parts, and their relations using a two-step process involving part-whole transformation and hierarchical component routing. However, this hierarchical relationship modeling is computationally expensive, which has limited the wider use of CapsNets despite their potential advantages. Current CapsNet models primarily focus on comparing their performance with capsule baselines and fall short of the proficiency of deep CNN variants on intricate tasks. To address this limitation, we present an efficient approach for learning capsules that surpasses canonical baseline models and even outperforms high-performing convolutional models. Our contribution is twofold: first, we introduce a group of subcapsules onto which an input vector is projected; second, we present the Hybrid Gromov-Wasserstein framework, which first quantifies the dissimilarity between the input and the components modeled by the subcapsules and then determines their alignment degree through optimal transport. This mechanism builds on new insights into defining the alignment between the input and subcapsules based on the similarity of their respective component distributions. The approach enhances CapsNets' capacity to learn from intricate, high-dimensional data while retaining their interpretability and hierarchical structure. Our proposed model offers two distinct advantages: (i) its lightweight nature facilitates the application of capsules to more intricate vision tasks, including object detection; (ii) it outperforms baseline approaches in these demanding tasks. Our empirical findings show that Hybrid Gromov-Wasserstein Capsules (HGWCapsules) exhibit enhanced robustness against affine transformations, scale effectively to larger datasets, and surpass CNN and CapsNet models across various vision tasks.},
  organization = {Queens University Startup under Project D8203EEC},
  publisher    = {Institute of Electrical and Electronics Engineers},
  keywords     = {Capsule Networks, Optimal Transport, Wasserstein Distances},
  title        = {Hybrid Gromov-Wasserstein Embedding for Capsule Learning},
  doi          = {10.1109/TNNLS.2023.3348657},
  author       = {Shamsolmoali, Pourya and Zareapoor, Masoumeh and Das, Swagatam and Granger, Eric and García López, Salvador},
}