#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2023 Apple Inc. All Rights Reserved.
#
import argparse
from typing import Optional, Tuple
from torch import nn
from cvnets.layers import ConvLayer2d, Dropout, GlobalPool, LinearLayer
from cvnets.layers.activation import build_activation_layer
from cvnets.models import MODEL_REGISTRY
from cvnets.models.classification.base_image_encoder import BaseImageEncoder
from cvnets.models.classification.config.mobilenetv3 import get_configuration
from cvnets.modules import InvertedResidualSE
from utils.math_utils import bound_fn, make_divisible
[docs]@MODEL_REGISTRY.register(name="mobilenetv3", type="classification")
class MobileNetV3(BaseImageEncoder):
"""
This class implements the `MobileNetv3 architecture <https://arxiv.org/abs/1905.02244>`_
"""
[docs] def __init__(self, opts, *args, **kwargs) -> None:
width_mult = getattr(
opts, "model.classification.mobilenetv3.width_multiplier", 1.0
)
num_classes = getattr(opts, "model.classification.n_classes", 1000)
classifier_dropout = getattr(
opts, "model.classification.classifier_dropout", 0.0
)
if classifier_dropout == 0.0 or classifier_dropout is None:
val = round(0.2 * width_mult, 3)
classifier_dropout = bound_fn(min_val=0.0, max_val=0.2, value=val)
image_channels = 3
input_channels = make_divisible(16 * width_mult, 8)
mv3_config = get_configuration(opts)
super().__init__(opts, *args, **kwargs)
self.conv_1 = nn.Sequential()
self.conv_1.add_module(
name="conv_3x3_bn",
module=ConvLayer2d(
opts=opts,
in_channels=image_channels,
out_channels=input_channels,
kernel_size=3,
stride=2,
use_norm=True,
use_act=False,
),
)
self.conv_1.add_module(
name="act",
module=build_activation_layer(opts, act_type="hard_swish", inplace=True),
)
self.model_conf_dict["conv1"] = {"in": image_channels, "out": input_channels}
self.layer_1, out_channels = self._make_layer(
opts=opts,
mv3_config=mv3_config["layer_1"],
width_mult=width_mult,
input_channel=input_channels,
)
self.model_conf_dict["layer1"] = {"in": input_channels, "out": out_channels}
input_channels = out_channels
self.layer_2, out_channels = self._make_layer(
opts=opts,
mv3_config=mv3_config["layer_2"],
width_mult=width_mult,
input_channel=input_channels,
)
self.model_conf_dict["layer2"] = {"in": input_channels, "out": out_channels}
input_channels = out_channels
self.layer_3, out_channels = self._make_layer(
opts=opts,
mv3_config=mv3_config["layer_3"],
width_mult=width_mult,
input_channel=input_channels,
)
self.model_conf_dict["layer3"] = {"in": input_channels, "out": out_channels}
input_channels = out_channels
self.layer_4, out_channels = self._make_layer(
opts=opts,
mv3_config=mv3_config["layer_4"],
width_mult=width_mult,
input_channel=input_channels,
dilate=self.dilate_l4,
)
self.model_conf_dict["layer4"] = {"in": input_channels, "out": out_channels}
input_channels = out_channels
self.layer_5, out_channels = self._make_layer(
opts=opts,
mv3_config=mv3_config["layer_5"],
width_mult=width_mult,
input_channel=input_channels,
dilate=self.dilate_l5,
)
self.model_conf_dict["layer5"] = {"in": input_channels, "out": out_channels}
input_channels = out_channels
self.conv_1x1_exp = nn.Sequential()
out_channels = 6 * input_channels
self.conv_1x1_exp.add_module(
name="conv_1x1",
module=ConvLayer2d(
opts=opts,
in_channels=input_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
use_act=False,
use_norm=True,
),
)
self.conv_1x1_exp.add_module(
name="act",
module=build_activation_layer(opts, act_type="hard_swish", inplace=True),
)
self.model_conf_dict["exp_before_cls"] = {
"in": input_channels,
"out": out_channels,
}
pool_type = getattr(opts, "model.layer.global_pool", "mean")
last_channels = mv3_config["last_channels"]
self.classifier = nn.Sequential()
self.classifier.add_module(
name="global_pool", module=GlobalPool(pool_type=pool_type, keep_dim=False)
)
self.classifier.add_module(
name="fc1",
module=LinearLayer(
in_features=out_channels, out_features=last_channels, bias=True
),
)
self.classifier.add_module(
name="act",
module=build_activation_layer(opts, act_type="hard_swish", inplace=True),
)
if 0.0 < classifier_dropout < 1.0:
self.classifier.add_module(
name="classifier_dropout", module=Dropout(p=classifier_dropout)
)
self.classifier.add_module(
name="classifier_fc",
module=LinearLayer(
in_features=last_channels, out_features=num_classes, bias=True
),
)
self.model_conf_dict["cls"] = {"in": 6 * input_channels, "out": num_classes}
def _make_layer(
self,
opts,
mv3_config,
width_mult: float,
input_channel: int,
dilate: Optional[bool] = False,
*args,
**kwargs
) -> Tuple[nn.Module, int]:
prev_dilation = self.dilation
mv3_block = nn.Sequential()
count = 0
for i in range(len(mv3_config)):
for kernel_size, expansion_factor, in_channels, use_se, use_hs, stride in [
mv3_config[i]
]:
block_name = "mv3_s_{}_idx_{}".format(stride, count)
output_channel = make_divisible(
in_channels * width_mult, self.round_nearest
)
if dilate and count == 0:
self.dilation *= stride
stride = 1
layer = InvertedResidualSE(
opts=opts,
in_channels=input_channel,
out_channels=output_channel,
stride=stride,
expand_ratio=expansion_factor,
dilation=prev_dilation if count == 0 else self.dilation,
act_fn_name="hard_swish" if use_hs else "relu",
use_se=use_se,
)
mv3_block.add_module(name=block_name, module=layer)
count += 1
input_channel = output_channel
return mv3_block, input_channel
[docs] @classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
group = parser.add_argument_group(title=cls.__name__)
group.add_argument(
"--model.classification.mobilenetv3.mode",
type=str,
default="large",
help="Configuration for mobilenetv3. Default: large",
choices=("small", "large"),
)
group.add_argument(
"--model.classification.mobilenetv3.width-multiplier",
type=float,
default=1.0,
help="Width multiplier for mobilenetv3. Default: 1.0",
)
return parser