Feature Engineering

Imputer

A transformer that replaces missing values with a default value, such as a statistically-derived value.

If ReplaceValue is set, then missing values of that type are replaced with the corresponding value.

For example: if replaceDoubleValue is set to NaN and a single NaN double value is provided as input, then it is replaced by imputedDoubleValue. However, if the input is an array of doubles, then any instances of NaN in the array are replaced with the corresponding value in imputedDoubleArray.

// A transformer that replaces missing values in its input with a default
// (imputed) value, such as a statistically-derived value.
message Imputer {
    // The value written in place of a missing input. At most one member can
    // be set, since these fields share a oneof.
    oneof ImputedValue {
        // Replacement for a missing value when the input is a single double.
        double imputedDoubleValue = 1;
        // Replacement for a single int64 input (by analogy with the double
        // case; confirm against the consumer).
        int64 imputedInt64Value = 2;
        // Replacement for a single string input (by analogy with the double
        // case; confirm against the consumer).
        string imputedStringValue = 3;
        // Per-position replacements for an array-of-doubles input: a missing
        // element is replaced with the corresponding value in this array.
        DoubleVector imputedDoubleArray = 4;
        // Presumably the int64-array counterpart of imputedDoubleArray.
        Int64Vector imputedInt64Array = 5;
        // Presumably replacements for dictionary inputs with string keys;
        // exact semantics are not described here. TODO confirm.
        StringToDoubleMap imputedStringDictionary = 6;
        // Presumably replacements for dictionary inputs with int64 keys;
        // exact semantics are not described here. TODO confirm.
        Int64ToDoubleMap imputedInt64Dictionary = 7;
    }

    // The value that is treated as "missing". If set, missing values of that
    // type are replaced with the corresponding imputed value (for example,
    // replaceDoubleValue = NaN causes NaN doubles to be replaced).
    oneof ReplaceValue {
        double replaceDoubleValue = 11;
        int64 replaceInt64Value = 12;
        string replaceStringValue = 13;
    }
}

Scaler

A scaling operation.

This function has the following formula:

f(x) = scaleValue \cdot (x + shiftValue)

If the scaleValue is not given, the default value 1 is used. If the shiftValue is not given, the default value 0 is used.

If scaleValue and shiftValue are each a single value and the input is an array, then the scale and shift are applied to each element of the array.

If the input is an integer, then it is converted to a double to perform the scaling operation. If the output type is an integer, then it is cast to an integer. If that cast is lossy, then an error is generated.

// A scaling operation: f(x) = scaleValue * (x + shiftValue).
//
// Integer inputs are converted to double for the computation. If the output
// type is an integer the result is cast back, and a lossy cast is an error.
message Scaler {
    // Offset added to the input before scaling. Treated as 0 when not given.
    // If a single value is given and the input is an array, it is applied to
    // each element of the array.
    repeated double shiftValue = 1;
    // Factor the shifted input is multiplied by. Treated as 1 when not given.
    // If a single value is given and the input is an array, it is applied to
    // each element of the array.
    repeated double scaleValue = 2;
}

Normalizer

A normalization preprocessor.

// A normalization preprocessor.
message Normalizer {
    // The normalization mode, i.e. which norm z of the input x is used.
    enum NormType {
        LMax = 0;  // z = max(x_i)
        L1 = 1;    // z = ||x||_1 = sum_i |x_i|
        L2 = 2;    // z = ||x||_2 = sqrt(sum_i x_i^2)
    }

    // Selected normalization mode.
    // NOTE(review): LMax carries tag 0, so it would be the value read back
    // when this field is unset under proto3 semantics; the syntax
    // declaration is not visible here, so confirm.
    NormType normType = 1;
}

Normalizer.NormType

There are three normalization modes, which have the corresponding formulas:

LMax

z = \max(x_i)

L1

z = ||x||_1 = \sum_{i=1}^{n} |x_i|

L2

z = ||x||_2 = \sqrt{\sum_{i=1}^{n} x_i^2}

// Normalization modes for Normalizer; each selects the norm z of the input x.
enum NormType {
    LMax = 0;  // z = max(x_i)
    L1 = 1;    // z = ||x||_1 = sum_i |x_i|
    L2 = 2;    // z = ||x||_2 = sqrt(sum_i x_i^2)
}

OneHotEncoder

Transforms a categorical feature into an array. The array will be all zeros except for a single entry of one.

Each categorical value will map to an index; this mapping is given by either the stringCategories parameter or the int64Categories parameter.

// Transforms a categorical feature into an array that is all zeros except
// for a single entry of one at the index assigned to the input category.
message OneHotEncoder {
    // Policy for input values that are not among the listed categories.
    enum HandleUnknown {
        ErrorOnUnknown = 0;
        IgnoreUnknown = 1;   // Output will be all zeros for unknown values.
    }

    // The category-to-index mapping. Exactly one of the two lists is used,
    // depending on whether the categories are strings or integers.
    oneof CategoryType {
        // Mapping for string-valued categories.
        StringVector stringCategories = 1;
        // Mapping for int64-valued categories.
        Int64Vector int64Categories = 2;
    }

    // Output can be a dictionary with only one entry, instead of an array.
    bool outputSparse = 10;

    // How unknown input values are treated; see HandleUnknown.
    HandleUnknown handleUnknown = 11;
}

OneHotEncoder.HandleUnknown

// Policy used by OneHotEncoder for input values that are not among the
// listed categories.
enum HandleUnknown {
    // By its name, an unknown value is reported as an error.
    ErrorOnUnknown = 0;
    IgnoreUnknown = 1;   // Output will be all zeros for unknown values.
}

CategoricalMapping

A categorical mapping.

This allows conversion from integers to strings, or from strings to integers.

// A categorical mapping that converts integers to strings, or strings to
// integers.
message CategoricalMapping {
    // The lookup table; its type determines the direction of the conversion.
    oneof MappingType {
        // Conversion from strings to integers
        StringToInt64Map stringToInt64Map = 1;

        // Conversion from integer to string
        Int64ToStringMap int64ToStringMap = 2;
    }

    // Default output for inputs that have no entry in the mapping.
    oneof ValueOnUnknown {
        // Default output when converting from an integer to a string.
        string strValue = 101;

        // Default output when converting from a string to an integer.
        int64 int64Value = 102;
    }
}

FeatureVectorizer

A FeatureVectorizer puts one or more features into a single array.

The ordering of features in the output array is determined by inputList.

inputDimensions is a zero-based index.

// Puts one or more features into a single array.
message FeatureVectorizer {
    // One input feature that contributes to the output array.
    message InputColumn {
        // Name of the input feature (inferred from the field name).
        string inputColumn = 1;
        // NOTE(review): the accompanying documentation calls this a
        // zero-based index, while the name suggests a dimension count.
        // Confirm the intended meaning against the consumer.
        uint64 inputDimensions = 2;
    }

    // Input features; their order here determines the ordering of features
    // in the output array.
    repeated InputColumn inputList = 1;
}

FeatureVectorizer.InputColumn

// One input feature of a FeatureVectorizer.
message InputColumn {
    // Name of the input feature (inferred from the field name).
    string inputColumn = 1;
    // NOTE(review): the accompanying documentation calls this a zero-based
    // index, while the name suggests a dimension count. Confirm.
    uint64 inputDimensions = 2;
}

DictVectorizer

Uses an index mapping to convert a dictionary to an array.

The output array will be equal in length to the index mapping vector parameter. All keys in the input dictionary must be present in the index mapping vector.

For each item in the input dictionary, insert its value in the output array. The position of the insertion is determined by the position of the item’s key in the index mapping. Any keys not present in the input dictionary will be zero in the output array.

For example: if the stringToIndex parameter is set to ["a", "c", "b", "z"], then an input of {"a": 4, "c": 8} will produce an output of [4, 8, 0, 0].

// Uses an index mapping to convert a dictionary to an array.
//
// The output array has the same length as the index mapping vector. Each
// input entry's value is placed at the position of its key in the mapping;
// positions whose key is absent from the input are zero. All keys in the
// input dictionary must be present in the mapping.
message DictVectorizer {
    // The index mapping: a key's position in the vector is its output index.
    oneof Map {
        // Mapping for dictionaries with string keys, e.g.
        // ["a", "c", "b", "z"] maps {"a": 4, "c": 8} to [4, 8, 0, 0].
        StringVector stringToIndex = 1;

        // Mapping for dictionaries with int64 keys.
        Int64Vector int64ToIndex = 2;
    }
}

ArrayFeatureExtractor

An array feature extractor.

Given an index, extracts the value at that index from its array input. Indexes are zero-based.

// An array feature extractor: given an index, extracts the value at that
// index from its array input.
message ArrayFeatureExtractor {
    // Zero-based index (or indexes, since the field is repeated) of the
    // element(s) to extract from the input array.
    repeated uint64 extractIndex = 1;
}

NonMaximumSuppression

// Non-maximum suppression over candidate predictions: overlapping candidates
// are pruned according to the configured suppression method and thresholds.
message NonMaximumSuppression {
    // Suppression methods:

    // Suppression driven by the top-confidence prediction.
    message PickTop {
        // Presumably restricts suppression to predictions of the same class
        // when true. TODO confirm against the format specification.
        bool perClass = 1;
    }

    // The suppression method to apply. PickTop is the only method defined.
    oneof SuppressionMethod {
        PickTop pickTop = 1;
    }

    // Class labels, given either as strings or as integers.
    oneof ClassLabels {
        StringVector stringClassLabels = 100;
        Int64Vector int64ClassLabels = 101;
    }

    // Intersection-over-union (IoU) threshold used by the suppression.
    // NOTE(review): the direction of the comparison is not documented in
    // this excerpt; confirm against the format specification.
    double iouThreshold = 110;

    // Confidence threshold for predictions.
    //
    // NOTE(review): the beginning of the original comment is missing from
    // this excerpt. The surviving text indicates that the sum of the
    // per-class confidences is read as the confidence that an object is
    // present (e.g. if the confidences for two classes are 0.2 and 0.4,
    // it means there is a 60% (0.2 + 0.4) confidence that an object is
    // present).
    double confidenceThreshold = 111;

    // Name of the input feature that carries the confidences.
    string confidenceInputFeatureName = 200;

    // Name of the input feature that carries the coordinates.
    string coordinatesInputFeatureName = 201;

    // Name of an input feature that supplies the IoU threshold (presumably
    // overriding iouThreshold at prediction time; confirm).
    string iouThresholdInputFeatureName = 202;

    // Name of an input feature that supplies the confidence threshold
    // (presumably overriding confidenceThreshold at prediction time;
    // confirm).
    string confidenceThresholdInputFeatureName = 203;

    // Name of the output feature that receives the confidences.
    string confidenceOutputFeatureName = 210;

    // Name of the output feature that receives the coordinates.
    string coordinatesOutputFeatureName = 211;
}

NonMaximumSuppression.PickTop

// A suppression method of NonMaximumSuppression (selected through its
// SuppressionMethod oneof).
message PickTop {
    // Presumably restricts suppression to predictions of the same class when
    // true. TODO confirm against the format specification.
    bool perClass = 1;
}