Source code for coremltools.models.pipeline

# Copyright (c) 2017, Apple Inc. All rights reserved.
#
# Use of this source code is governed by a BSD-3-clause license that can be
# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause

"""
Pipeline utils for this package.
"""
from .. import SPECIFICATION_VERSION as _SPECIFICATION_VERSION
from ..proto import Model_pb2 as _Model_pb2
from . import _feature_management
from . import model as _model
from ._interface_management import (set_classifier_interface_params,
                                    set_regressor_interface_params,
                                    set_training_features,
                                    set_transform_interface_params)


[docs]class Pipeline:
    """
    A pipeline model that exposes a sequence of models as a single model,
    It requires a set of inputs, a sequence of other models and a set of outputs.

    This class is the base class for :py:class:`PipelineClassifier` and
    :py:class:`PipelineRegressor`, which contain a sequence ending in a classifier
    or regressor and themselves behave like a classifier or regressor.  This class
    may be used directly for a sequence of feature transformer objects.

    """

[docs]    def __init__(self, input_features, output_features, training_features=None):
        """
        Create a pipeline of models to be executed sequentially.

        Parameters
        ----------

        input_features: [list of 2-tuples]
            Name(s) of the input features, given as a list of `('name', datatype)`
            tuples.  The datatypes entry can be any of the data types defined in the
            :py:mod:`models.datatypes` module.

        output_features: [list of features]
            Name(s) of the output features, given as a list of
            `('name',datatype)` tuples.  The datatypes entry can be any of the
            data types defined in the :py:mod:`models.datatypes` module.  All features
            must be either defined in the inputs or be produced by one of the
            contained models.

        """
        spec = _Model_pb2.Model()
        spec.specificationVersion = _SPECIFICATION_VERSION

        # Access this to declare it as a pipeline
        spec.pipeline

        spec = set_transform_interface_params(
            spec, input_features, output_features, training_features
        )

        # Save the spec as a member variable.
        self.spec = spec

    def _validate_updatable_pipeline_on_add_model(self, spec):
        if spec.isUpdatable:
            raise ValueError(
                "New sub-models cannot be added after the pipeline has been marked as updatable"
            )

[docs]    def add_model(self, spec):
        """
        Add a protobuf spec or :py:class:`models.MLModel` instance to the pipeline.

        All input features of this model must either match the input_features
        of the pipeline, or match the outputs of a previous model.

        Parameters
        ----------
        spec: [MLModel, Model_pb2]
            A protobuf spec or MLModel instance containing a model.
        """

        self._validate_updatable_pipeline_on_add_model(self.spec)

        if isinstance(spec, _model.MLModel):
            spec = spec._spec

        pipeline = self.spec.pipeline
        step_spec = pipeline.models.add()
        step_spec.CopyFrom(spec)

    def _validate_sub_models_and_make_updatable(self, pipeline, spec):

        num_models = len(pipeline.models)
        if num_models < 1:
            raise ValueError(
                "Pipeline does not seem to have any models. It should be marked as updatable only after adding all sub-models."
            )

        for model in pipeline.models[:-1]:
            if model.isUpdatable:
                raise ValueError(
                    "Only the last model can be updatable in an updatable pipeline."
                )

        last_model = pipeline.models[num_models - 1]
        if not last_model.isUpdatable:
            raise ValueError(
                "A pipeline can be made updatable only if the last model is updatable."
            )

        spec.isUpdatable = True

    def make_updatable(self):
        self._validate_sub_models_and_make_updatable(self.spec.pipeline, self.spec)

[docs]    def set_training_input(self, training_input):
        """
        Set the training inputs of the network spec.

        Parameters
        ----------
        training_input: [tuple]
            List of training input names and type of the network.
        """
        spec = self.spec
        set_training_features(spec, training_input)


[docs]class PipelineRegressor(Pipeline):
    """
    A pipeline model that exposes a sequence of models as a single model,
    It requires a set of inputs, a sequence of other models and a set of outputs.
    In this case the pipeline itself behaves as a regression model by designating
    a real valued output feature as its 'predicted feature'.
    """

[docs]    def __init__(self, input_features, output_features, training_features=None):
        """
        Create a set of pipeline models given a set of model specs.  The final
        output model must be a regression model.

        Parameters
        ----------

        input_features: [list of 2-tuples]
            Name(s) of the input features, given as a list of `('name', datatype)`
            tuples.  The datatypes entry can be any of the data types defined in the
            :py:mod:`models.datatypes` module.

        output_features: [list of features]
            Name(s) of the output features, given as a list of
            `('name',datatype)` tuples.  The datatypes entry can be any of the
            data types defined in the :py:mod:`models.datatypes` module.  All features
            must be either defined in the inputs or be produced by one of the
            contained models.

        """
        spec = _Model_pb2.Model()
        spec.specificationVersion = _SPECIFICATION_VERSION

        # Access this to declare it as a pipeline
        spec.pipelineRegressor
        spec = set_regressor_interface_params(
            spec, input_features, output_features, training_features
        )

        # Save as a member variable
        self.spec = spec

[docs]    def add_model(self, spec):
        """
        Add a protobuf spec or :py:class:`models.MLModel` instance to the pipeline.

        All input features of this model must either match the input_features
        of the pipeline, or match the outputs of a previous model.

        Parameters
        ----------
        spec: [MLModel, Model_pb2]
            A protobuf spec or MLModel instance containing a model.
        """

        super()._validate_updatable_pipeline_on_add_model(self.spec)

        if isinstance(spec, _model.MLModel):
            spec = spec._spec

        pipeline = self.spec.pipelineRegressor.pipeline
        step_spec = pipeline.models.add()
        step_spec.CopyFrom(spec)

    def make_updatable(self):
        super()._validate_sub_models_and_make_updatable(
            self.spec.pipelineRegressor.pipeline, self.spec
        )

[docs]    def set_training_input(self, training_input):
        """
        Set the training inputs of the network spec.

        Parameters
        ----------
        training_input: [tuple]
            List of training input names and type of the network.
        """
        spec = self.spec
        set_training_features(spec, training_input)


[docs]class PipelineClassifier(Pipeline):
    """
    A pipeline model that exposes a sequence of models as a single model,
    It requires a set of inputs, a sequence of other models and a set of outputs.
    In this case the pipeline itself behaves as a classification model by designating
    a discrete categorical output feature as its 'predicted feature'.
    """

[docs]    def __init__(
        self, input_features, class_labels, output_features=None, training_features=None
    ):
        """
        Create a set of pipeline models given a set of model specs.  The last
        model in this list must be a classifier model.

        Parameters
        ----------
        input_features: [list of 2-tuples]
            Name(s) of the input features, given as a list of `('name', datatype)`
            tuples.  The datatypes entry can be any of the data types defined in the
            :py:mod:`models.datatypes` module.

        class_labels: [list]
            A list of string or integer class labels to use in making predictions.
            This list must match the class labels in the model outputting the categorical
            predictedFeatureName

        output_features: [list]
            A string or a list of two strings specifying the names of the two
            output features, the first being a class label corresponding
            to the class with the highest predicted score, and the second being
            a dictionary mapping each class to its score. If `output_features`
            is a string, it specifies the predicted class label and the class
            scores is set to the default value of `"classProbability."`

        """

        output_features = _feature_management.process_or_validate_classifier_output_features(
            output_features, class_labels
        )

        spec = _Model_pb2.Model()
        spec.specificationVersion = _SPECIFICATION_VERSION
        spec = set_classifier_interface_params(
            spec,
            input_features,
            class_labels,
            "pipelineClassifier",
            output_features,
            training_features,
        )

        # Access this to declare it as a pipeline
        spec.pipelineClassifier

        # Save as a member variable
        self.spec = spec

[docs]    def add_model(self, spec):
        """
        Add a protobuf spec or :py:class:`models.MLModel` instance to the pipeline.

        All input features of this model must either match the input_features
        of the pipeline, or match the outputs of a previous model.

        Parameters
        ----------
        spec: [MLModel, Model_pb2]
            A protobuf spec or MLModel instance containing a model.
        """

        super()._validate_updatable_pipeline_on_add_model(self.spec)

        if isinstance(spec, _model.MLModel):
            spec = spec._spec
        pipeline = self.spec.pipelineClassifier.pipeline
        step_spec = pipeline.models.add()
        step_spec.CopyFrom(spec)

    def make_updatable(self):
        super(PipelineClassifier, self)._validate_sub_models_and_make_updatable(
            self.spec.pipelineClassifier.pipeline, self.spec
        )

[docs]    def set_training_input(self, training_input):
        """
        Set the training inputs of the network spec.

        Parameters
        ----------
        training_input: [tuple]
            List of training input names and type of the network.
        """
        spec = self.spec
        set_training_features(spec, training_input)