# Source code for coremltools.models.nearest_neighbors.builder

# Copyright (c) 2019, Apple Inc. All rights reserved.
#
# Use of this source code is governed by a BSD-3-clause license that can be
# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause

import numpy as _np

import coremltools

from ...proto import FeatureTypes_pb2
from .. import datatypes


class KNearestNeighborsClassifierBuilder:
    """
    Construct a CoreML KNearestNeighborsClassifier specification.

    Please see the Core ML Nearest Neighbors protobuf message for more information
    on KNearestNeighborsClassifier parameters.

    Examples
    --------
    .. sourcecode:: python

        from coremltools.models.nearest_neighbors import KNearestNeighborsClassifierBuilder
        from coremltools.models.utils import save_spec

        # Create a KNearestNeighborsClassifier model that takes 4-dimensional input data and outputs a string label.
        >>> builder = KNearestNeighborsClassifierBuilder(input_name='input',
        ...                                              output_name='output',
        ...                                              number_of_dimensions=4,
        ...                                              default_class_label='default_label')

        # save the spec by the builder
        >>> save_spec(builder.spec, 'knnclassifier.mlmodel')
    """

    _VALID_INDEX_TYPES = ["linear", "kd_tree"]
    _VALID_WEIGHTING_SCHEMES = ["uniform", "inverse_distance"]
    _VALID_DISTANCE_METRICS = ["squared_euclidean"]

    # Optional parameter keys for constructor
    _PARAMETER_KEY_NUMBER_OF_NEIGHBORS = "number_of_neighbors"
    _PARAMETER_KEY_WEIGHTING_SCHEME = "weighting_scheme"
    _PARAMETER_KEY_INDEX_TYPE = "index_type"
    _PARAMETER_KEY_LEAF_SIZE = "leaf_size"
    _PARAMETER_KEY_INPUT_TYPE = "input_type"

    # Optional parameter default values
    _PARAMETER_DEFAULT_NUMBER_OF_NEIGHBORS = 5
    _PARAMETER_DEFAULT_WEIGHTING_SCHEME = "uniform"
    _PARAMETER_DEFAULT_INDEX_TYPE = "linear"
    _PARAMETER_DEFAULT_LEAF_SIZE = 30
    _PARAMETER_DEFAULT_INPUT_TYPE = "NotSpecified"

    def __init__(
        self, input_name, output_name, number_of_dimensions, default_class_label, **kwargs
    ):
        """
        Create a KNearestNeighborsClassifierBuilder object.

        Parameters
        ----------
        input_name
            Name of the model input.

        output_name
            Name of the output. The probabilities output is named
            ``output_name + "Probs"``.

        number_of_dimensions
            Number of dimensions of the input data. Must be > 0.

        default_class_label
            The default class label to use for predictions. Must be either an
            int64 or a string. Its type also fixes the label type of the model.

        number_of_neighbors
            Number of neighbors to use for predictions. Default = 5 with allowed values
            between 1-1000.

        weighting_scheme
            Weight function used in prediction. One of ``'uniform'`` (default) or
            ``'inverse_distance'``.

        index_type
            Algorithm to compute nearest neighbors. One of ``'linear'`` (default), or
            ``'kd_tree'``.

        leaf_size
            Leaf size for the kd-tree. Ignored if index type is ``'linear'``. Default = 30.

        input_type
            Optional. When equal to ``datatypes.Double`` the input array is
            declared as DOUBLE; any other value (including the default) yields
            FLOAT32.

        Raises
        ------
        ValueError
            If ``number_of_dimensions`` is not positive, or ``number_of_neighbors``
            is outside 1-1000.
        TypeError
            If ``default_class_label`` is neither a string nor an integer, or if
            ``weighting_scheme`` / ``index_type`` is not one of the valid values.
        """
        self.spec = coremltools.proto.Model_pb2.Model()
        self.spec.specificationVersion = (
            coremltools._MINIMUM_NEAREST_NEIGHBORS_SPEC_VERSION
        )

        # the model is initially empty - assume it's updatable
        self.is_updatable = True

        if number_of_dimensions <= 0:
            raise ValueError("number_of_dimensions must be > 0")
        knn = self.spec.kNearestNeighborsClassifier
        knn.nearestNeighborsIndex.numberOfDimensions = number_of_dimensions

        input_type = kwargs.get(
            self._PARAMETER_KEY_INPUT_TYPE, self._PARAMETER_DEFAULT_INPUT_TYPE
        )
        input_feature_type = FeatureTypes_pb2.ArrayFeatureType.FLOAT32
        if input_type == datatypes.Double:
            input_feature_type = FeatureTypes_pb2.ArrayFeatureType.DOUBLE

        # Prediction input: a 1-D multi-array of length number_of_dimensions.
        input_feature = self.spec.description.input.add()
        input_feature.name = input_name
        input_feature.type.multiArrayType.dataType = input_feature_type
        input_feature.type.multiArrayType.shape.extend([number_of_dimensions])

        # Training input mirroring the prediction input.
        training_features = self.spec.description.trainingInput.add()
        training_features.name = input_name
        training_features.type.multiArrayType.dataType = input_feature_type
        training_features.type.multiArrayType.shape.extend([number_of_dimensions])

        output_label = self.spec.description.output.add()
        output_label.name = output_name
        output_label_probs = self.spec.description.output.add()
        output_label_probs.name = output_name + "Probs"

        # Second training input: the label, named like the predicted output.
        training_features = self.spec.description.trainingInput.add()
        training_features.name = output_name

        # MergeFromString(b"") marks an (empty) sub-message as present, which
        # selects the corresponding oneof member without storing any data.
        if self._is_valid_text_type(default_class_label):
            output_label.type.stringType.MergeFromString(b"")
            training_features.type.stringType.MergeFromString(b"")
            output_label_probs.type.dictionaryType.stringKeyType.MergeFromString(b"")
            knn.stringClassLabels.MergeFromString(b"")
            knn.defaultStringLabel = default_class_label
        elif self._is_valid_number_type(default_class_label):
            output_label.type.int64Type.MergeFromString(b"")
            training_features.type.int64Type.MergeFromString(b"")
            output_label_probs.type.dictionaryType.int64KeyType.MergeFromString(b"")
            knn.int64ClassLabels.MergeFromString(b"")
            knn.defaultInt64Label = default_class_label
        else:
            raise TypeError(
                "default_class_label type ({}) is invalid. Must be either string or int64".format(
                    type(default_class_label)
                )
            )

        self.spec.description.predictedFeatureName = output_label.name
        self.spec.description.predictedProbabilitiesName = output_label_probs.name

        number_of_neighbors = kwargs.get(
            self._PARAMETER_KEY_NUMBER_OF_NEIGHBORS,
            self._PARAMETER_DEFAULT_NUMBER_OF_NEIGHBORS,
        )
        self.set_number_of_neighbors_with_bounds(
            number_of_neighbors, allowed_range=(1, 1000)
        )  # Can we think of a more sensible default value?

        self.weighting_scheme = kwargs.get(
            self._PARAMETER_KEY_WEIGHTING_SCHEME,
            self._PARAMETER_DEFAULT_WEIGHTING_SCHEME,
        )

        index_type = kwargs.get(
            self._PARAMETER_KEY_INDEX_TYPE, self._PARAMETER_DEFAULT_INDEX_TYPE
        )
        leaf_size = kwargs.get(
            self._PARAMETER_KEY_LEAF_SIZE, self._PARAMETER_DEFAULT_LEAF_SIZE
        )
        self.set_index_type(index_type, leaf_size)

        # SED is currently the only supported distance metric
        knn.nearestNeighborsIndex.squaredEuclideanDistance.MergeFromString(b"")

    @property
    def author(self):
        """
        Get the author for the KNearestNeighborsClassifier model.

        Returns
        -------
        The author
        """
        return self.spec.description.metadata.author

    @author.setter
    def author(self, author):
        """
        Add an author for the KNearestNeighborsClassifier model.

        Parameters
        ----------
        author
            The author.

        Returns
        -------
        None
        """
        self.spec.description.metadata.author = author

    @property
    def license(self):
        """
        Get the license for the KNearestNeighborsClassifier model.

        Returns
        -------
        The license.
        """
        return self.spec.description.metadata.license

    # NOTE: this must be ``license.setter``. Deriving the setter from the
    # ``author`` property would bind ``license`` to author's getter.
    @license.setter
    def license(self, license):
        """
        Add a license for the KNearestNeighborsClassifier model.

        Parameters
        ----------
        license
            The license.

        Returns
        -------
        None
        """
        self.spec.description.metadata.license = license

    @property
    def description(self):
        """
        Get the description for the KNearestNeighborsClassifier model.

        Returns
        -------
        The description.
        """
        return self.spec.description.metadata.shortDescription

    @description.setter
    def description(self, description):
        """
        Add a description for the model.

        Parameters
        ----------
        description
            The description

        Returns
        -------
        None
        """
        self.spec.description.metadata.shortDescription = description

    @property
    def is_updatable(self):
        """
        Check if the KNearestNeighborsClassifier is updatable.

        Returns
        -------
        Is updatable.
        """
        return self.spec.isUpdatable

    @is_updatable.setter
    def is_updatable(self, is_updatable):
        """
        Set the KNearestNeighborsClassifier to be updatable.

        Parameters
        ----------
        is_updatable
            Boolean

        Returns
        -------
        None
        """
        self.spec.isUpdatable = is_updatable

    @property
    def weighting_scheme(self):
        """
        Get the weighting scheme for the KNearestNeighborsClassifier model.

        Returns
        -------
        The weighting scheme.
        """
        return self._weighting_scheme

    @weighting_scheme.setter
    def weighting_scheme(self, weighting_scheme):
        """
        Set the weighting scheme for the KNearestNeighborsClassifier model.

        Parameters
        ----------
        weighting_scheme
            One of [ ``'uniform'``, ``'inverse_distance'`` ]. Matching is
            case-insensitive.

        Returns
        -------
        None

        Raises
        ------
        TypeError
            If the scheme is not one of the valid values.
        """
        weighting_scheme = weighting_scheme.lower()
        if weighting_scheme not in self._VALID_WEIGHTING_SCHEMES:
            raise TypeError("Invalid weighting scheme")

        if weighting_scheme == "inverse_distance":
            self.spec.kNearestNeighborsClassifier.inverseDistanceWeighting.MergeFromString(
                b""
            )
        else:
            self.spec.kNearestNeighborsClassifier.uniformWeighting.MergeFromString(b"")

        # storing this in the object is just a convenience
        self._weighting_scheme = weighting_scheme

    @property
    def index_type(self):
        """
        Get the index type for the KNearestNeighborsClassifier model.

        Returns
        -------
        The index type.
        """
        return self._index_type

    def set_index_type(self, index_type, leaf_size=30):
        """
        Set the index type for the KNearestNeighborsClassifier model.

        Parameters
        ----------
        index_type
            One of [ ``'linear'``, ``'kd_tree'`` ]. Matching is case-insensitive.

        leaf_size
            For kd_tree indexes, the leaf size to use (default = 30).
            Ignored for linear indexes.

        Returns
        -------
        None

        Raises
        ------
        TypeError
            If ``index_type`` is not a valid value, or if ``leaf_size`` is not
            positive for a kd_tree index.
        """
        index_type = index_type.lower()
        if index_type not in self._VALID_INDEX_TYPES:
            raise TypeError("Invalid index type")

        index = self.spec.kNearestNeighborsClassifier.nearestNeighborsIndex
        if index_type == "kd_tree":
            if leaf_size <= 0:
                raise TypeError("leaf_size must be > 0")
            index.singleKdTreeIndex.leafSize = leaf_size
        else:
            index.linearIndex.MergeFromString(b"")

        # storing this in the object is just a convenience
        self._index_type = index_type

    @property
    def leaf_size(self):
        """
        Get the leaf size for the KNearestNeighborsClassifier.

        Returns
        -------
        The leaf size.
        """
        return (
            self.spec.kNearestNeighborsClassifier.nearestNeighborsIndex.singleKdTreeIndex.leafSize
        )

    @leaf_size.setter
    def leaf_size(self, leaf_size):
        """
        Set the leaf size for a KNearestNeighborsClassifier model. Only for kd-tree indexes.

        Parameters
        ----------
        leaf_size
            The leaf size.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If ``leaf_size`` is not positive.
        """
        if leaf_size <= 0:
            raise ValueError("leaf_size must be > 0")
        self.spec.kNearestNeighborsClassifier.nearestNeighborsIndex.singleKdTreeIndex.leafSize = (
            leaf_size
        )

    @property
    def number_of_dimensions(self):
        """
        Get the number of dimensions of the input data for the
        KNearestNeighborsClassifier model.

        Returns
        -------
        Number of dimensions.
        """
        return (
            self.spec.kNearestNeighborsClassifier.nearestNeighborsIndex.numberOfDimensions
        )

    @property
    def number_of_neighbors(self):
        """
        Get the number of neighbors value for the KNearestNeighborsClassifier model.

        Returns
        -------
        The number of neighbors default value.
        """
        return self.spec.kNearestNeighborsClassifier.numberOfNeighbors.defaultValue

    def set_number_of_neighbors_with_bounds(
        self, number_of_neighbors, allowed_range=None, allowed_set=None
    ):
        """
        Set the numberOfNeighbors parameter for the KNearestNeighborsClassifier model.

        All validation happens before the spec is modified, so a failed call
        leaves the model unchanged.

        Parameters
        ----------
        number_of_neighbors
            The default number of neighbors. Must be > 0 and satisfy the
            supplied constraint.

        allowed_range
            Tuple of (``min_value``, ``max_value``) defining the range of allowed values.

        allowed_set
            Set of allowed values for the number of neighbors (referred to as
            ``allowed_values`` in error messages).

        Exactly one of ``allowed_range`` and ``allowed_set`` must be given.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If ``number_of_neighbors`` is not positive or violates the
            constraint, if neither or both constraints are given, or if the
            range bounds are invalid.
        TypeError
            If ``allowed_range`` is not a 2-tuple, or ``allowed_set`` is not a
            non-empty set of positive integers.
        """
        if number_of_neighbors <= 0:
            raise ValueError("number_of_neighbors must be > 0")

        # True when both are None or both are provided.
        if (allowed_range is None) == (allowed_set is None):
            raise ValueError(
                "Exactly one of allowed_range or allowed_values must be provided"
            )

        if allowed_range is not None:
            if not isinstance(allowed_range, tuple) or len(allowed_range) != 2:
                raise TypeError(
                    "allowed_range expects a tuple of (min_value, max_value)"
                )

            (min_value, max_value) = allowed_range
            if min_value <= 0:
                raise ValueError("allowed_range minimum must be > 0")
            if max_value < min_value:
                raise ValueError("allowed_range max_value must be >= min_value")
            if number_of_neighbors < min_value or number_of_neighbors > max_value:
                raise ValueError("number_of_neighbors is not within allowed range")

            neighbors_param = self.spec.kNearestNeighborsClassifier.numberOfNeighbors
            neighbors_param.defaultValue = number_of_neighbors
            neighbors_param.range.minValue = min_value
            neighbors_param.range.maxValue = max_value
            return

        if not isinstance(allowed_set, set):
            raise TypeError("allowed_values expects 'set' type")
        if len(allowed_set) == 0:
            raise TypeError("allowed_values cannot be empty")

        for v in allowed_set:
            if not self._is_valid_number_type(v):
                raise TypeError("allowed_values must contain only integer types")
            if v <= 0:
                raise TypeError("allowed_values must only contain values > 0")

        if number_of_neighbors not in allowed_set:
            raise ValueError("number_of_neighbors is not a valid value")

        neighbors_param = self.spec.kNearestNeighborsClassifier.numberOfNeighbors
        # Drop values left over from an earlier call; otherwise repeated calls
        # would accumulate stale and duplicate entries in the spec.
        neighbors_param.ClearField("set")
        neighbors_param.defaultValue = number_of_neighbors
        for v in allowed_set:
            neighbors_param.set.values.append(v)

    def number_of_neighbors_allowed_range(self):
        """
        Get the range of allowed values for the numberOfNeighbors parameter.

        Returns
        -------
        Tuple of (``min_value``, ``max_value``) or ``None`` if the range hasn't been set.
        """
        neighbors_param = self.spec.kNearestNeighborsClassifier.numberOfNeighbors
        if neighbors_param.HasField("range"):
            return (
                neighbors_param.range.minValue,
                neighbors_param.range.maxValue,
            )
        return None

    def number_of_neighbors_allowed_set(self):
        """
        Get the set of allowed values for the numberOfNeighbors parameter.

        Returns
        -------
        Set of allowed values or ``None`` if the set of allowed values hasn't been
        populated.
        """
        neighbors_param = self.spec.kNearestNeighborsClassifier.numberOfNeighbors
        if neighbors_param.HasField("set"):
            return set(neighbors_param.set.values)
        return None

    def add_samples(self, data_points, labels):
        """
        Add some samples to the KNearestNeighborsClassifier model.

        All inputs are validated before any sample is stored, so a failed call
        does not leave a partially updated model.

        Parameters
        ----------
        data_points
            List of input data points. Every point must have exactly
            ``number_of_dimensions`` features.

        labels
            List of corresponding labels. Their type must match the label type
            of the model (string or integer).

        Returns
        -------
        None

        Raises
        ------
        TypeError
            If either argument is empty, the lengths differ, a data point has
            the wrong dimensionality, or a label has the wrong type.
        """
        # len() rather than truthiness: data_points may be a numpy array, for
        # which bool() is ambiguous.
        if len(data_points) == 0:
            raise TypeError("data_points is empty")

        if len(labels) == 0:
            raise TypeError("labels is empty")

        # Check every point, not just the first one: a single ragged row would
        # otherwise be written into the index unnoticed.
        expected_dimensions = self.number_of_dimensions
        for data_point in data_points:
            if len(data_point) != expected_dimensions:
                raise TypeError(
                    "dimensionality of data_points != expected number of dimensions"
                )

        if len(data_points) != len(labels):
            raise TypeError("len(data_points) != len(labels)")

        # Validate the types of the labels before adding any points.
        self._validate_label_types(labels)

        knn = self.spec.kNearestNeighborsClassifier
        for data_point in data_points:
            sample = knn.nearestNeighborsIndex.floatSamples.add()
            for feature in data_point:
                sample.vector.append(feature)

        if knn.HasField("int64ClassLabels"):
            label_vector = knn.int64ClassLabels.vector
        else:
            # string labels
            label_vector = knn.stringClassLabels.vector
        for label in labels:
            label_vector.append(label)

    def _validate_label_types(self, labels):
        """
        Ensure the label types matched the expected types.

        The expected type is integer when the spec has ``int64ClassLabels``
        set, and string otherwise.

        Parameters
        ----------
        labels
            The list of labels.

        Returns
        -------
        None, throws a TypeError if not expected.
        """
        if self.spec.kNearestNeighborsClassifier.HasField("int64ClassLabels"):
            check_is_valid = KNearestNeighborsClassifierBuilder._is_valid_number_type
        else:
            check_is_valid = KNearestNeighborsClassifierBuilder._is_valid_text_type
        for label in labels:
            if not check_is_valid(label):
                raise TypeError("Invalid type for label: {}".format(type(label)))

    @staticmethod
    def _is_valid_text_type(obj):
        """
        Checks if the object is a valid text type.

        Parameters
        ----------
        obj
            The object to check.

        Returns
        -------
        True if a valid text type, False otherwise.
        """
        return isinstance(obj, str)

    @staticmethod
    def _is_valid_number_type(obj):
        """
        Checks if the object is a valid number type.

        Parameters
        ----------
        obj
            The object to check.

        Returns
        -------
        True if a valid number type (Python ``int`` or numpy integer), False otherwise.
        """
        return isinstance(obj, (int, _np.integer))