Source code for coremltools.converters.mil.mil.passes.defs.optimize_normalization

#  Copyright (c) 2020, Apple Inc. All rights reserved.
#
#  Use of this source code is governed by a BSD-3-clause license that can be
#  found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause

from typing import List, Optional

import numpy as np

from coremltools import _logger as logger
from coremltools.converters.mil.mil import Block
from coremltools.converters.mil.mil import Builder as mb
from coremltools.converters.mil.mil import Operation, Program, Var
from coremltools.converters.mil.mil.passes.graph_pass import AbstractGraphPass
from coremltools.converters.mil.mil.passes.helper import (
    _check_no_output_connection,
    block_context_manager,
)
from coremltools.converters.mil.mil.passes.pass_registry import register_pass


@register_pass(namespace="common")
class fuse_layernorm_or_instancenorm(AbstractGraphPass):
    """
    A graph optimization pass on PyMIL to detect and fuse several variants of ``layer_norm`` or
    ``instance_norm``. Pattern 1 corresponds to either ``layer_norm`` or ``instance_norm``.
    Patterns 2-4 are ``instance_norm``. Pattern 5 is ``layer_norm``. You can find these patterns
    in the methods for this class in the source code. To quickly view the source code, click the
    **[source]** button at the end of the class definition.
    """

    _DEBUG = False  # set to True to plot the block before and after the transformation

    def apply(self, prog: Program):
        for f in prog.functions.values():
            block_changed = True
            while block_changed:
                if self._DEBUG:
                    import graphviz

                    graphviz.Source(
                        f.get_dot_string(
                            highlight_debug_op_types=["instance_norm"],
                        )
                    ).view(filename="/tmp/block_before_fuse_layernorm_or_instancenorm")
                logger.debug(
                    "Block before fuse_layernorm_or_instancenorm transform:\n{}".format(f)
                )
                block_changed = self._fuse_layernorm_or_instancenorm_block(f)
                if self._DEBUG:
                    graphviz.Source(
                        f.get_dot_string(
                            highlight_debug_op_types=["instance_norm"],
                        )
                    ).view(filename="/tmp/block_after_fuse_layernorm_or_instancenorm")
                logger.debug(
                    "Block after fuse_layernorm_or_instancenorm transform:\n{}".format(f)
                )

    @staticmethod
    def _check_reduce_op(reduce_op: Operation, mode: str = "reduce_mean") -> bool:
        """
        Check whether or not the ``reduction`` op satisfies the following conditions:

        - Mode is expected.
        - Does not change rank (``keep_dims`` is ``True``).
        - The ``axes`` is known at compile time.

        Parameters
        ----------
        param reduce_op : ``reduce_op`` to check on.
        param mode : ``reduce`` mode.
        """
        if reduce_op is None:
            return False
        if reduce_op.op_type != mode:
            return False
        if reduce_op.keep_dims is None or reduce_op.keep_dims.val is None:
            return False
        if reduce_op.keep_dims.val is False:
            return False
        if reduce_op.axes is None or reduce_op.axes.val is None:
            return False
        return True

    @staticmethod
    def _check_child_op_types(
        op: Operation, child_op_types: List[str], check_order: bool = True
    ) -> bool:
        """
        Returns ``True`` if the child op types match ``child_op_types``, otherwise returns ``False``.

        Parameters
        ----------
        param op : Current op.
        param child_op_types : Expected child op types.
        param check_order : Ensure children are in the given order, defaults to ``True``.
        """
        if op is None or len(op.outputs) != 1:
            return False
        child_ops = list(op.outputs[0].child_ops)
        if len(child_ops) != len(child_op_types):
            return False
        ops_types = [c.op_type for c in child_ops]
        if check_order is False:
            ops_types = sorted(ops_types)
            child_op_types = sorted(child_op_types)
        return ops_types == child_op_types

    @staticmethod
    def _try_get_child_op_type(
        op: Operation, child_op_type: str, index: int = 0
    ) -> Optional[Operation]:
        """
        Returns the child op if its type matches ``child_op_type``, otherwise returns ``None``.

        Parameters
        ----------
        param op : Current op.
        param child_op_type : Expected child op type.
        param index : Child op index.
        """
        if op is None:
            return None
        if len(op.outputs) != 1:
            return None
        child_ops = list(op.outputs[0].child_ops)
        if index >= len(child_ops):
            return None
        if child_ops[index].op_type != child_op_type:
            return None
        return child_ops[index]

    @staticmethod
    def _try_apply_transform(
        reduce_op: Operation,
        block: Block,
        gamma_var: Var,
        beta_var: Var,
        epsilon_var: Var,
        end_op: Operation,
        ops_to_remove: List[Operation],
    ) -> bool:
        """
        Insert an ``instance_norm`` / ``layer_norm`` op and delete all ops of the matched pattern.

        :param reduce_op: Start operation of the pattern.
        :param block: Enclosing block.
        :param gamma_var: Gamma variable.
        :param beta_var: Beta variable.
        :param epsilon_var: Epsilon variable.
        :param end_op: End operation of the pattern.
        :param ops_to_remove: Operations to remove.
        """
        if not _check_no_output_connection(block, ops_to_remove):
            return False

        axes = reduce_op.axes.val
        rank = len(reduce_op.x.shape)

        # check whether the pattern is instance_norm or layer_norm
        is_layernorm = False
        is_instancenorm = False
        is_require_rank4_transpose = False

        negative_axes = [a - rank if a >= 0 else a for a in axes]
        negative_axes.sort()

        gamma_rank = gamma_var.rank if gamma_var is not None else -1
        beta_rank = beta_var.rank if beta_var is not None else -1

        if gamma_rank == len(axes) and beta_rank == len(axes):
            # axes for layer_norm must be [-1] or [-1, -2] or [-1, -2, -3] and so on
            if negative_axes == list(range(-len(negative_axes), 0)):
                is_layernorm = True

        if rank == 4 and negative_axes == [-3]:
            is_layernorm = (gamma_var is None and beta_var is None) or (
                gamma_rank == 1 and beta_rank == 1
            )
            gamma_var = gamma_var.val if gamma_var else None
            beta_var = beta_var.val if beta_var else None

        if rank == 4 and (negative_axes == [-2, -1] or negative_axes == [-3, -2]):
            if (
                len(np.squeeze(gamma_var.val).shape) == 1
                and len(np.squeeze(beta_var.val).shape) == 1
            ):
                is_instancenorm = True
            if negative_axes == [-3, -2]:
                is_require_rank4_transpose = True

        if not (is_instancenorm or is_layernorm):
            return False

        # remove all the ops, and replace with a layer_norm or instance_norm op
        out_name = end_op.outputs[0].name

        if is_require_rank4_transpose:
            x = mb.transpose(
                x=reduce_op.x,
                perm=[0, 3, 1, 2],
                name=out_name + "_transpose_nhwc_nchw",
                before_op=end_op,
            )
        if is_instancenorm:
            x = mb.instance_norm(
                x=x if is_require_rank4_transpose else reduce_op.x,
                gamma=np.squeeze(gamma_var.val),
                beta=np.squeeze(beta_var.val),
                epsilon=epsilon_var,
                name=out_name + "_instancenorm" if is_require_rank4_transpose else out_name,
                before_op=end_op,
            )
        else:  # is_layernorm
            x = mb.layer_norm(
                x=x if is_require_rank4_transpose else reduce_op.x,
                axes=axes,
                gamma=gamma_var,
                beta=beta_var,
                epsilon=epsilon_var,
                name=out_name + "_layernorm" if is_require_rank4_transpose else out_name,
                before_op=end_op,
            )
        if is_require_rank4_transpose:
            x = mb.transpose(
                x=x,
                perm=[0, 2, 3, 1],
                name=out_name + "_transpose_nchw_nhwc",
                before_op=end_op,
            )

        end_op.enclosing_block.replace_uses_of_var_after_op(
            anchor_op=end_op, old_var=end_op.outputs[0], new_var=x
        )
        # Remove all the ops at once
        block.remove_ops(ops_to_remove)
        return True

    def _try_match_and_transform_pattern_1(self, reduce_op, block) -> bool:
        """
        Identify the pattern:

        ``y = gamma * (x - mean) / sqrt(variance + epsilon) + beta``

        ``y = x * [gamma * rsqrt(variance + eps)] + (beta - mean * [gamma * rsqrt(variance + eps)])``

        .. code-block::

            mean     = reduce_mean(x)
            variance = reduce_mean(square(x - mean))          # sub, square, reduce_mean
            scale    = rsqrt(variance + epsilon) * gamma      # add(epsilon), rsqrt, mul(gamma)
            y        = x * scale + (beta - mean * scale)      # mul, mul, sub(beta), add --> [...]

        This pattern corresponds to either ``layer_norm`` or ``instance_norm``.

        It is ``instance_norm`` if all of the following are true:

        - ``input`` is rank 4.
        - ``axes`` of ``reduce_mean`` is ``[-2, -1]`` or ``[-3, -2]`` (when ``[-3, -2]``, a
          channel-first to channel-last transpose would be inserted).
        - ``gamma`` and ``beta`` are rank 1, after ``squeeze``.

        It is ``layer_norm`` if all of the following are true:

        - ``axes`` is either ``[-1]``, ``[-1, -2]``, or ``[-1, -2, -3]``, and so on.
        - ``rank`` of ``gamma`` and ``beta`` is equal to the length of the ``axes``.
        """
        ops_to_remove = []
        root_var = reduce_op.x

        if root_var.shape is None:
            return False

        # check that root_var feeds into exactly 3 ops
        if len(list(root_var.child_ops)) != 3:
            return False
        if root_var.op is not None and not self._check_child_op_types(
            root_var.op, child_op_types=["reduce_mean", "sub", "mul"]
        ):
            return False

        # check 1st reduce_mean op
        if not self._check_reduce_op(reduce_op):
            return False
        ops_to_remove.append(reduce_op)

        # check 1st sub op
        if not self._check_child_op_types(reduce_op, ["sub", "mul"], check_order=False):
            return False
        child_ops_reduce_mean = list(reduce_op.outputs[0].child_ops)
        op_a = child_ops_reduce_mean[0]
        op_b = child_ops_reduce_mean[1]
        sub_op1 = op_a if op_a.op_type == "sub" else op_b
        if not (sub_op1.x == root_var and sub_op1.y == reduce_op.outputs[0]):
            return False
        ops_to_remove.append(sub_op1)

        # check square op
        square_op = self._try_get_child_op_type(sub_op1, "square")
        if square_op is None:
            return False
        ops_to_remove.append(square_op)

        # check second reduce mean
        reduce_op2 = self._try_get_child_op_type(square_op, "reduce_mean")
        if not self._check_reduce_op(reduce_op2):
            return False
        ops_to_remove.append(reduce_op2)

        # check add op (with epsilon)
        add_op1 = self._try_get_child_op_type(reduce_op2, "add")
        if add_op1 is None:
            return False
        epsilon_var = add_op1.y if add_op1.x == reduce_op2.outputs[0] else add_op1.x
        if epsilon_var.val is None or len(epsilon_var.val.shape) != 0:
            return False  # must be scalar
        ops_to_remove.append(add_op1)

        # check rsqrt
        rsqrt_op = self._try_get_child_op_type(add_op1, "rsqrt")
        if rsqrt_op is None:
            return False
        ops_to_remove.append(rsqrt_op)

        # check mul (gamma)
        mul_op1 = self._try_get_child_op_type(rsqrt_op, "mul")
        if mul_op1 is None:
            return False
        gamma_var = mul_op1.y if mul_op1.x == rsqrt_op.outputs[0] else mul_op1.x
        if gamma_var.val is None:
            return False
        ops_to_remove.append(mul_op1)

        # check 2 muls after the gamma mul
        if not self._check_child_op_types(mul_op1, ["mul", "mul"]):
            return False
        child_ops = list(mul_op1.outputs[0].child_ops)
        mul_op2 = child_ops[0]
        mul_op3 = child_ops[1]
        mul_op2_other_var = mul_op2.x if mul_op2.y == mul_op1.outputs[0] else mul_op2.y
        mul_op3_other_var = mul_op3.x if mul_op3.y == mul_op1.outputs[0] else mul_op3.y
        if not (
            (mul_op2_other_var == root_var and mul_op3_other_var == reduce_op.outputs[0])
            or (mul_op2_other_var == reduce_op.outputs[0] and mul_op3_other_var == root_var)
        ):
            return False
        if mul_op2_other_var == root_var:
            mul_root_op = mul_op2
            mul_mean_op = mul_op3
        else:
            mul_root_op = mul_op3
            mul_mean_op = mul_op2
        ops_to_remove.append(mul_mean_op)
        ops_to_remove.append(mul_root_op)

        # check sub with beta
        sub_op2 = self._try_get_child_op_type(mul_mean_op, "sub")
        if sub_op2 is None:
            return False
        if sub_op2.y != mul_mean_op.outputs[0]:
            return False
        beta_var = sub_op2.x
        if beta_var.val is None:
            return False
        ops_to_remove.append(sub_op2)

        # check last add op
        add_op2 = self._try_get_child_op_type(sub_op2, "add")
        if add_op2 is None:
            return False
        if not (add_op2.x == mul_root_op.outputs[0] or add_op2.y == mul_root_op.outputs[0]):
            return False
        ops_to_remove.append(add_op2)

        return self._try_apply_transform(
            reduce_op, block, gamma_var, beta_var, epsilon_var, add_op2, ops_to_remove
        )

    def _try_match_and_transform_pattern_2(self, reduce_op, block) -> bool:
        """
        Identify the pattern:

        ``y = (x - mean) / pow(variance + epsilon, 0.5) * gamma + beta``

        This pattern corresponds to, and should be fused as, ``instance_norm``.

        All of the following conditions must be satisfied:

        1. ``input`` is a rank 4 tensor.
        2. ``reduce`` operates on spatial dimensions ``axes=[-2, -1]``, or ``axes=[-3, -2]``
           (a channel-first to channel-last transpose would be inserted in such cases).
        3. ``gamma`` and ``beta`` are both shape ``(C,)`` after ``squeeze``, where ``C`` is the
           number of channels.

        .. code-block::

            mean = reduce_mean(x)
            std  = pow(reduce_mean(square(x - mean)) + epsilon, 0.5)   # sub, square, mean1, add_eps, pow(0.5)
            y    = (x - mean) / std * gamma + beta                     # sub1, real_div, mul_gamma, add_beta --> [...]

        Note that ``sub`` and ``sub1`` are two separate ops that both compute ``x - mean``.
        """
        ops_to_remove = []
        root_var = reduce_op.x

        if root_var.shape is None:
            return False

        # check that root_var feeds into exactly 3 ops
        if len(root_var.child_ops) != 3:
            return False
        if root_var.op is not None and not self._check_child_op_types(
            root_var.op, child_op_types=["reduce_mean", "sub", "sub"]
        ):
            return False

        # check 1st reduce_mean op
        if not self._check_reduce_op(reduce_op):
            return False
        ops_to_remove.append(reduce_op)

        # check 1st sub op
        if not self._check_child_op_types(reduce_op, ["sub", "sub"]):
            return False
        child_ops_reduce_mean = list(reduce_op.outputs[0].child_ops)
        reduce_mean_child_op_a = child_ops_reduce_mean[0]
        reduce_mean_child_op_b = child_ops_reduce_mean[1]
        # One of the sub ops goes directly into square, the other one goes into real_div
        if list(reduce_mean_child_op_a.outputs[0].child_ops)[0].op_type == "square":
            sub_op0 = reduce_mean_child_op_a
            sub_op1 = reduce_mean_child_op_b
        else:
            sub_op0 = reduce_mean_child_op_b
            sub_op1 = reduce_mean_child_op_a
        if not (sub_op0.x == root_var and sub_op0.y == reduce_op.outputs[0]):
            return False
        if not (sub_op1.x == root_var and sub_op1.y == reduce_op.outputs[0]):
            return False
        ops_to_remove.append(sub_op0)
        ops_to_remove.append(sub_op1)

        # check square op
        square_op = self._try_get_child_op_type(sub_op0, "square")
        if square_op is None:
            return False
        ops_to_remove.append(square_op)

        # check second reduce mean
        reduce_op2 = self._try_get_child_op_type(square_op, "reduce_mean")
        if not self._check_reduce_op(reduce_op2):
            return False
        ops_to_remove.append(reduce_op2)

        # check add op (with epsilon)
        add_eps_op = self._try_get_child_op_type(reduce_op2, "add")
        if add_eps_op is None:
            return False
        epsilon_var = add_eps_op.y if add_eps_op.x == reduce_op2.outputs[0] else add_eps_op.x
        if epsilon_var.val is None or len(epsilon_var.val.shape) != 0:
            return False  # must be scalar
        ops_to_remove.append(add_eps_op)

        # check pow
        pow_op = self._try_get_child_op_type(add_eps_op, "pow")
        if pow_op is None:
            return False
        if pow_op.y.val is None or not np.isclose(pow_op.y.val, 0.5):
            return False
        ops_to_remove.append(pow_op)

        # check real_div
        real_div_op = self._try_get_child_op_type(pow_op, "real_div")
        if real_div_op is None:
            return False
        if not (real_div_op.x == sub_op1.outputs[0] and real_div_op.y == pow_op.outputs[0]):
            return False
        ops_to_remove.append(real_div_op)

        # check mul with gamma
        mul_gamma_op = self._try_get_child_op_type(real_div_op, "mul")
        if mul_gamma_op is None:
            return False
        gamma_var = mul_gamma_op.y if mul_gamma_op.x == real_div_op.outputs[0] else mul_gamma_op.x
        if gamma_var.val is None:
            return False
        ops_to_remove.append(mul_gamma_op)

        # check add with beta
        add_beta_op = self._try_get_child_op_type(mul_gamma_op, "add")
        if add_beta_op is None:
            return False
        beta_var = add_beta_op.y if add_beta_op.x == mul_gamma_op.outputs[0] else add_beta_op.x
        if beta_var.val is None:
            return False
        ops_to_remove.append(add_beta_op)

        return self._try_apply_transform(
            reduce_op, block, gamma_var, beta_var, epsilon_var, add_beta_op, ops_to_remove
        )

    def _try_match_and_transform_pattern_3(self, reduce_op, block) -> bool:
        """
        Detect the ``InstanceNorm`` pattern in TensorFlow-Addons.

        This pattern corresponds to, and should be fused as, ``instance_norm``.

        All of the following conditions must be satisfied:

        1. ``input`` is a rank 4 tensor.
        2. ``reduce`` operates on spatial dimensions ``axes=[-2, -1]``, or ``axes=[-3, -2]``
           (a channel-first to channel-last transpose would be inserted in such cases).
        3. ``gamma`` and ``beta`` are absent. Default values for ``gamma`` and ``beta`` would be used.

        .. code-block::

            mean    = reduce_mean(x)
            rsqrt_v = rsqrt(reduce_mean(square(x - mean)) + epsilon)   # sub, square, mean1, add_eps, rsqrt
            y       = x * rsqrt_v + (mean * rsqrt_v) * (-1)            # mul1, mul2, mul_sub, add --> [...]
        """
        ops_to_remove = []
        root_var = reduce_op.x

        if root_var.shape is None:
            return False

        # check that root_var feeds into exactly 3 ops
        if len(root_var.child_ops) != 3:
            return False
        if root_var.op is not None and not self._check_child_op_types(
            root_var.op, ["sub", "mul", "reduce_mean"]
        ):
            return False

        # check 1st reduce_mean op
        if not self._check_reduce_op(reduce_op):
            return False
        ops_to_remove.append(reduce_op)

        # check 1st sub op
        if not self._check_child_op_types(reduce_op, ["sub", "mul"], check_order=False):
            return False
        child_ops_reduce_mean = list(reduce_op.outputs[0].child_ops)
        reduce_mean_child_op_a = child_ops_reduce_mean[0]
        reduce_mean_child_op_b = child_ops_reduce_mean[1]
        sub_op1 = (
            reduce_mean_child_op_a
            if reduce_mean_child_op_a.op_type == "sub"
            else reduce_mean_child_op_b
        )
        if not (sub_op1.x == root_var and sub_op1.y == reduce_op.outputs[0]):
            return False
        ops_to_remove.append(sub_op1)

        # check square op
        square_op = self._try_get_child_op_type(sub_op1, "square")
        if square_op is None:
            return False
        ops_to_remove.append(square_op)

        # check second reduce mean
        reduce_op2 = self._try_get_child_op_type(square_op, "reduce_mean")
        if reduce_op2 is None or not self._check_reduce_op(reduce_op2):
            return False
        ops_to_remove.append(reduce_op2)

        # check add op (with epsilon)
        add_eps_op = self._try_get_child_op_type(reduce_op2, "add")
        if add_eps_op is None:
            return False
        epsilon_var = add_eps_op.y if add_eps_op.x == reduce_op2.outputs[0] else add_eps_op.x
        if epsilon_var.val is None or len(epsilon_var.val.shape) != 0:
            return False  # must be scalar
        ops_to_remove.append(add_eps_op)

        # check rsqrt
        rsqrt_op = self._try_get_child_op_type(add_eps_op, "rsqrt")
        if rsqrt_op is None:
            return False
        ops_to_remove.append(rsqrt_op)

        # check mul 1
        mul_op1 = self._try_get_child_op_type(rsqrt_op, "mul")
        if mul_op1 is None:
            return False
        if not (
            (mul_op1.x == root_var and mul_op1.y == rsqrt_op.outputs[0])
            or (mul_op1.x == rsqrt_op.outputs[0] and mul_op1.y == root_var)
        ):
            return False
        ops_to_remove.append(mul_op1)

        # check mul 2
        mul_op2 = self._try_get_child_op_type(rsqrt_op, "mul", index=1)
        if mul_op2 is None:
            return False
        if not (
            (mul_op2.x == reduce_op.outputs[0] and mul_op2.y == rsqrt_op.outputs[0])
            or (mul_op2.x == rsqrt_op.outputs[0] and mul_op2.y == reduce_op.outputs[0])
        ):
            return False
        ops_to_remove.append(mul_op2)

        # check mul (sub)
        mul_sub_op = self._try_get_child_op_type(mul_op2, "mul")
        if mul_sub_op is None:
            return False
        if mul_sub_op.y.val is None or mul_sub_op.y.val != -1:
            return False
        ops_to_remove.append(mul_sub_op)

        # check last add op
        add_op = self._try_get_child_op_type(mul_sub_op, "add")
        if add_op is None:
            return False
        if not (
            (add_op.x == mul_op1.outputs[0] and add_op.y == mul_sub_op.outputs[0])
            or (add_op.x == mul_sub_op.outputs[0] and add_op.y == mul_op1.outputs[0])
        ):
            return False
        ops_to_remove.append(add_op)

        gamma_var = mb.const(
            val=np.ones(shape=(1, root_var.shape[1], 1, 1)),
            name="_fuse_layernorm_or_instancenorm_gamma",
        )
        beta_var = mb.const(
            val=np.zeros(shape=(1, root_var.shape[1], 1, 1)),
            name="_fuse_layernorm_or_instancenorm_beta",
        )

        return self._try_apply_transform(
            reduce_op, block, gamma_var, beta_var, epsilon_var, add_op, ops_to_remove
        )

    def _try_match_and_transform_pattern_4(self, reduce_op: Operation, block: Block) -> bool:
        """
        Identify the pattern:

        ``y = x * [gamma * rsqrt(variance + eps)] + (beta - mean * [gamma * rsqrt(variance + eps)])``

        This pattern corresponds to, and should be fused as, ``instance_norm``.

        All of the following conditions must be satisfied:

        1. ``input`` is a rank 4 tensor.
        2. ``reduce`` operates on spatial dimensions ``axes=[-2, -1]`` or ``axes=[-3, -2]``
           (a channel-first to channel-last transpose would be inserted in such cases).
        3. ``gamma`` and ``beta`` are both shape ``(C,)`` after ``squeeze``, where ``C`` is the
           number of channels.

        .. code-block::

            mean     = reduce_sum(x) * (1/N)                  # sum, mul_mean
            mean_sq  = reduce_sum(x * x) * (1/N)              # mul_square1, sum1, mul_mean1
            variance = mean_sq - mean * mean                  # mul_square, sub_variance
            scale    = rsqrt(variance + epsilon) * gamma      # add_eps, rsqrt, mul_gamma
            y        = x * scale + (beta - mean * scale)      # mul1, mul2, sub_beta, add --> [...]
        """
        ops_to_remove = []
        root_var = reduce_op.x

        if root_var.shape is None:
            return False

        # check that root_var feeds into exactly 4 ops
        if len(root_var.child_ops) != 4:
            return False
        if root_var.op is not None and not self._check_child_op_types(
            root_var.op, child_op_types=["mul", "mul", "reduce_sum", "mul"]
        ):
            return False

        # check 1st reduce_sum op
        if not self._check_reduce_op(reduce_op, mode="reduce_sum"):
            return False
        ops_to_remove.append(reduce_op)

        # check mul (mean) op
        mul_mean_op = self._try_get_child_op_type(reduce_op, "mul")
        if mul_mean_op is None:
            return False
        if mul_mean_op.y.shape != ():
            return False
        ops_to_remove.append(mul_mean_op)

        # check 1st mul (square) op
        if not self._check_child_op_types(mul_mean_op, child_op_types=["mul", "mul", "mul"]):
            return False
        # both 0 and 1 should be mul square op
        mul_square_op = self._try_get_child_op_type(mul_mean_op, "mul")
        if mul_square_op is None:
            return False
        if self._try_get_child_op_type(mul_mean_op, "mul", index=1) is None:
            return False
        ops_to_remove.append(mul_square_op)

        # Check another branch

        # check 2nd mul (square) op
        # both 0 and 1 should be mul square op 1
        mul_square_op2 = list(root_var.child_ops)[0]
        ops_to_remove.append(mul_square_op2)

        # check 2nd reduce sum
        reduce_op2 = self._try_get_child_op_type(mul_square_op2, child_op_type="reduce_sum")
        if not self._check_reduce_op(reduce_op2, "reduce_sum"):
            return False
        ops_to_remove.append(reduce_op2)

        # check mul after 2nd reduce op
        mul_mean_op2 = self._try_get_child_op_type(reduce_op2, "mul")
        if mul_mean_op2 is None:
            return False
        if mul_mean_op2.y.shape != ():
            return False
        ops_to_remove.append(mul_mean_op2)

        # check sub (variance)
        sub_variance_op = self._try_get_child_op_type(mul_mean_op2, "sub")
        if sub_variance_op is None:
            return False
        if sub_variance_op.y != mul_square_op.outputs[0]:
            return False
        ops_to_remove.append(sub_variance_op)

        # check add op (epsilon)
        add_eps_op = self._try_get_child_op_type(sub_variance_op, "add")
        if add_eps_op is None:
            return False
        epsilon_var = add_eps_op.y if add_eps_op.x == sub_variance_op.outputs[0] else add_eps_op.x
        if epsilon_var.val is None or len(epsilon_var.val.shape) != 0:
            return False  # must be scalar
        ops_to_remove.append(add_eps_op)

        # check rsqrt
        rsqrt_op = self._try_get_child_op_type(add_eps_op, "rsqrt")
        if rsqrt_op is None:
            return False
        ops_to_remove.append(rsqrt_op)

        # check mul (gamma)
        mul_gamma_op = self._try_get_child_op_type(rsqrt_op, "mul")
        if mul_gamma_op is None:
            return False
        gamma_var = mul_gamma_op.y if mul_gamma_op.x == rsqrt_op.outputs[0] else mul_gamma_op.x
        if gamma_var.val is None:
            return False
        ops_to_remove.append(mul_gamma_op)

        # check 2 muls after the gamma mul
        if not self._check_child_op_types(mul_gamma_op, ["mul", "mul"]):
            return False
        mul_gamma_child_ops = list(mul_gamma_op.outputs[0].child_ops)
        mul_op1 = mul_gamma_child_ops[0]
        mul_op2 = mul_gamma_child_ops[1]
        mul_op1_other_var = mul_op1.x if mul_op1.y == mul_gamma_op.outputs[0] else mul_op1.y
        mul_op2_other_var = mul_op2.x if mul_op2.y == mul_gamma_op.outputs[0] else mul_op2.y
        if not (
            (mul_op1_other_var == root_var and mul_op2_other_var == mul_square_op.x)
            or (mul_op1_other_var == mul_square_op.x and mul_op2_other_var == root_var)
        ):
            return False
        if mul_op1_other_var == root_var:
            mul_op1, mul_op2 = mul_op1, mul_op2
        else:
            mul_op2, mul_op1 = mul_op1, mul_op2
        ops_to_remove.append(mul_op1)
        ops_to_remove.append(mul_op2)

        # check sub with beta
        sub_beta_op = self._try_get_child_op_type(mul_op2, "sub")
        if sub_beta_op is None:
            return False
        if sub_beta_op.y != mul_op2.outputs[0]:
            return False
        beta_var = sub_beta_op.x
        if beta_var.val is None:
            return False
        ops_to_remove.append(sub_beta_op)

        # check last add op
        add_op = self._try_get_child_op_type(sub_beta_op, "add")
        if add_op is None:
            return False
        if not (
            (add_op.x == mul_op1.outputs[0] and add_op.y == sub_beta_op.outputs[0])
            or (add_op.y == mul_op1.outputs[0] and add_op.x == sub_beta_op.outputs[0])
        ):
            return False
        ops_to_remove.append(add_op)

        return self._try_apply_transform(
            reduce_op, block, gamma_var, beta_var, epsilon_var, add_op, ops_to_remove
        )

    def _try_match_and_transform_pattern_5(self, reduce_op, block) -> bool:
        """
        Detect the BC1S ``LayerNorm`` pattern as in ml-ane-transformers.

        Identify two patterns, the first:

        ``y = (x - mean(x)) * rsqrt(variance(x) + eps)``

        ``y = (x - mean(x)) * rsqrt(mean((x - mean(x))^2) + eps)``

        .. code-block::

            mean     = reduce_mean(x, axes=[1], keep_dims=True)
            centered = x - mean                                   # sub
            y        = centered * rsqrt(reduce_mean(centered * centered, axes=[1]) + epsilon)
                                                                  # mul, reduce_mean, add(epsilon), rsqrt, mul --> [...]

        If the optional elementwise weight and bias are set, the second pattern is:

        ``y = [(x - mean(x)) * rsqrt(mean((x - mean(x))^2) + eps) + beta] * gamma``

        Note that this is different from the torch and MIL definitions of ``beta`` and ``gamma``:
        here ``beta`` is applied before ``gamma``, so during fusion ``beta`` is scaled by ``gamma``.

        .. code-block::

            y = (centered * rsqrt(reduce_mean(centered * centered, axes=[1]) + epsilon) + beta) * gamma
                                                                  # ..., add(beta), mul(gamma) --> [...]

        These patterns correspond to a specific ``layer_norm``:

        - ``rank`` is 4.
        - ``axes`` is ``[1]``.
        - ``gamma`` and ``beta`` are applied as in ml-ane-transformers, in the opposite order of torch.
""" ops_to_remove = [] root_var = reduce_op.x # check that root_var feeds into at least 2 ops if len(list(root_var.child_ops)) < 2: return False # Do not enforce that the only child ops are reduce_mean and sub as in other # patterns. There are models where the root op is used after the layer norm. # check 1st reduce_mean op if not self._check_reduce_op(reduce_op): return False if len(reduce_op.axes.val) != 1 or reduce_op.axes.val != [1] or not reduce_op.keep_dims.val: return False ops_to_remove.append(reduce_op) # check 1st sub op if not self._check_child_op_types(reduce_op, ["sub"], check_order=False): return False child_ops_reduce_mean = list(reduce_op.outputs[0].child_ops) sub_op1 = child_ops_reduce_mean[0] if sub_op1 is None or not self._check_child_op_types( sub_op1, child_op_types=["mul", "mul", "mul"] ): return False if not (sub_op1.x == root_var and sub_op1.y == reduce_op.outputs[0]): return False ops_to_remove.append(sub_op1) # check mul op (equivalent to a square op) square_op = self._try_get_child_op_type(sub_op1, "mul") if square_op is None or not self._check_child_op_types( square_op, child_op_types=["reduce_mean"] ): return False if square_op.x != square_op.y: return False ops_to_remove.append(square_op) # check second reduce mean reduce_op2 = self._try_get_child_op_type(square_op, "reduce_mean") if not self._check_reduce_op(reduce_op2) or not self._check_child_op_types( reduce_op2, child_op_types=["add"] ): return False if len(reduce_op2.axes.val) != 1 or reduce_op2.axes.val != [1] or not reduce_op2.keep_dims.val: return False ops_to_remove.append(reduce_op2) # check add op (with epsilon) add_op1 = self._try_get_child_op_type(reduce_op2, "add") if add_op1 is None or not self._check_child_op_types( add_op1, child_op_types=["rsqrt"] ): return False epsilon_var = add_op1.y if add_op1.x == reduce_op2.outputs[0] else add_op1.x if epsilon_var.val is None or len(epsilon_var.val.shape) != 0: return False # must be scalar ops_to_remove.append(add_op1) # check rsqrt rsqrt_op = self._try_get_child_op_type(add_op1, "rsqrt") if rsqrt_op is None or not self._check_child_op_types( rsqrt_op, child_op_types=["mul"] ): return False ops_to_remove.append(rsqrt_op) # Last op in pattern if there is no elementwise affine. mul_op = self._try_get_child_op_type(rsqrt_op, "mul") if mul_op is None: return False if mul_op.y != sub_op1.outputs[0] and mul_op.x != sub_op1.outputs[0]: return False ops_to_remove.append(mul_op) # Default values if no gamma or beta ops. end_op = mul_op gamma_var = None beta_var = None add_beta_op = self._try_get_child_op_type(mul_op, "add") mul_gamma_op = self._try_get_child_op_type(add_beta_op, "mul") has_beta_and_gamma = add_beta_op is not None and mul_gamma_op is not None # mul_op cannot be used except as an input to add_beta_op. if has_beta_and_gamma and not self._check_child_op_types( mul_op, child_op_types=["add"] ): # It would be possible to fuse this pattern as: # layer_norm(x, gamma=None, beta=None) -> add(beta) -> mul(gamma) -> ... # |-> other mul_op child ops # For simplicity don't handle this edge case. return False # add_beta_op cannot be used except as an input to mul_gamma_op. if has_beta_and_gamma and not self._check_child_op_types( add_beta_op, child_op_types=["mul"] ): # It would be possible to fuse this pattern as: # layer_norm(x, gamma=None, beta=None) -> add(beta) -> mul(gamma) -> ... # |-> other add_beta_op child ops # For simplicity don't handle this edge case. 
            return False

        if has_beta_and_gamma:
            beta_var = add_beta_op.y if add_beta_op.x == mul_op.outputs[0] else add_beta_op.x
            gamma_var = (
                mul_gamma_op.y if mul_gamma_op.x == add_beta_op.outputs[0] else mul_gamma_op.x
            )

            gamma_var = mb.const(
                val=np.squeeze(gamma_var.val),
                name="_fuse_layernorm_gamma",
            )
            # Scale beta by gamma. Note: this un-scaling introduces a small amount
            # of precision loss.
            # https://github.com/apple/ml-ane-transformers/blob/da64000fa56cc85b0859bc17cb16a3d753b8304a/ane_transformers/huggingface/distilbert.py#L31
            beta_var = mb.const(
                val=np.squeeze(beta_var.val) * gamma_var.val, name="_fuse_layernorm_beta"
            )

            ops_to_remove.append(add_beta_op)
            ops_to_remove.append(mul_gamma_op)
            end_op = mul_gamma_op

        if add_beta_op is None and mul_gamma_op is None:
            # Gamma and beta are optional in layer_norm.
            pass
        elif add_beta_op is None or mul_gamma_op is None:
            # If only one of gamma or beta is present, they could
            # be folded into the layer_norm op. For simplicity
            # don't handle this edge case.
            return False

        return self._try_apply_transform(
            reduce_op, block, gamma_var, beta_var, epsilon_var, end_op, ops_to_remove
        )

    @block_context_manager
    def _fuse_layernorm_or_instancenorm_block(self, block: Block):
        fusion_status = False
        for i, op in enumerate(list(block.operations)):
            for b in op.blocks:
                block_changed = True
                while block_changed:
                    block_changed = self._fuse_layernorm_or_instancenorm_block(b)
            if len(op.blocks) > 0:
                continue

            # start pattern match if reduce_mean op is encountered
            if op.op_type == "reduce_mean":
                if fusion_status is False:
                    fusion_status = self._try_match_and_transform_pattern_1(op, block)
                if fusion_status is False:
                    fusion_status = self._try_match_and_transform_pattern_2(op, block)
                if fusion_status is False:
                    fusion_status = self._try_match_and_transform_pattern_3(op, block)
                if fusion_status is False:
                    fusion_status = self._try_match_and_transform_pattern_5(op, block)
                # has to break as the downstream iterator is affected.
                if fusion_status:
                    return fusion_status
            elif op.op_type == "reduce_sum":
                if fusion_status is False:
                    fusion_status = self._try_match_and_transform_pattern_4(op, block)
                # has to break as the downstream iterator is affected.
                if fusion_status:
                    return fusion_status
        return fusion_status
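

if __name__ == "__main__":
    # ------------------------------------------------------------------------------------
    # Minimal usage sketch (not part of the pass implementation above): builds a toy rank-4
    # program containing the decomposed ops of pattern 1 and runs the registered pass on it
    # through PASS_REGISTRY. The input shape and the epsilon / gamma / beta constants below
    # are illustrative assumptions, not values required by the pass. `mb` and `np` are the
    # module-level imports from the top of this file.
    # ------------------------------------------------------------------------------------
    from coremltools.converters.mil.mil.passes.pass_registry import PASS_REGISTRY

    @mb.program(input_specs=[mb.TensorSpec(shape=(1, 8, 16, 16))])
    def prog(x):
        # y = gamma * (x - mean) / sqrt(variance + eps) + beta, spelled out with primitive ops
        mean = mb.reduce_mean(x=x, axes=[2, 3], keep_dims=True)
        centered = mb.sub(x=x, y=mean)
        variance = mb.reduce_mean(x=mb.square(x=centered), axes=[2, 3], keep_dims=True)
        scale = mb.mul(
            x=mb.rsqrt(x=mb.add(x=variance, y=1e-5)),
            y=np.ones((8, 1, 1), dtype=np.float32),  # gamma
        )
        # beta - mean * scale, with beta as a rank-3 constant that squeezes to shape (C,)
        shifted = mb.sub(x=np.zeros((8, 1, 1), dtype=np.float32), y=mb.mul(x=mean, y=scale))
        return mb.add(x=mb.mul(x=x, y=scale), y=shifted)

    # If pattern 1 matches, the decomposed ops should be replaced by a single instance_norm op
    # (axes [-2, -1] on a rank-4 input with rank-1 gamma/beta after squeeze).
    PASS_REGISTRY["common::fuse_layernorm_or_instancenorm"](prog)
    print(prog)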