Source code for catboost_spark.core


import collections
import datetime
from enum import Enum

from py4j.java_gateway import JavaObject

from pyspark import keyword_only, SparkContext


from pyspark.ml.classification import JavaProbabilisticClassificationModel
from pyspark.ml.regression import JavaRegressionModel


import pyspark.ml.common
from pyspark.ml.common import inherit_doc
from pyspark.ml.param import Param, Params
from pyspark.ml.util import JavaMLReader, JavaMLWriter, JavaMLWritable, MLReadable


import pyspark.ml.wrapper
from pyspark.ml.wrapper import JavaParams, JavaEstimator, JavaWrapper
from pyspark.sql import DataFrame, SparkSession


"""
    original JavaParams._from_java has to be replaced because of hardcoded class names transformation
"""

@staticmethod
def _from_java_patched_for_catboost(java_stage):
    """
    Given a Java object, create and return a Python wrapper of it.
    Used for ML persistence.

    Meta-algorithms such as Pipeline should override this method as a classmethod.
    """
    def __get_class(clazz):
        """
        Loads Python class from its name.
        """
        parts = clazz.split('.')
        module = ".".join(parts[:-1])
        m = __import__(module)
        for comp in parts[1:]:
            m = getattr(m, comp)
        return m
    stage_name = (
        java_stage.getClass().getName()
            .replace("org.apache.spark", "pyspark")
            .replace("ai.catboost.spark", "catboost_spark")
    )
    # Generate a default new instance from the stage_name class.
    py_type = __get_class(stage_name)
    if issubclass(py_type, JavaParams):
        # Load information from java_stage to the instance.
        py_stage = py_type()
        py_stage._java_obj = java_stage
        py_stage._resetUid(java_stage.uid())
        py_stage._transfer_params_from_java()
    elif hasattr(py_type, "_from_java"):
        py_stage = py_type._from_java(java_stage)
    else:
        raise NotImplementedError("This Java stage cannot be loaded into Python currently: %r"
                                  % stage_name)
    return py_stage

JavaParams._from_java = _from_java_patched_for_catboost
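
# For example, when a saved stage with the Java class
# "ai.catboost.spark.CatBoostRegressionModel" is loaded, the patched method maps it to the
# Python class "catboost_spark.CatBoostRegressionModel", in addition to the standard
# "org.apache.spark" -> "pyspark" transformation.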


"""
    Adapt _py2java and _java2py for additional types present in CatBoost Params
"""

_standard_py2java = pyspark.ml.common._py2java
_standard_java2py = pyspark.ml.common._java2py

def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, SparkSession):
        return obj._jsparkSession
    if isinstance(obj, Enum):
        return getattr(
            getattr(
                sc._jvm.ru.yandex.catboost.spark.catboost4j_spark.core.src.native_impl, 
                obj.__class__.__name__
            ),
            'swigToEnum'
        )(obj.value)
    if isinstance(obj, datetime.timedelta):
        return sc._jvm.java.time.Duration.ofMillis(obj // datetime.timedelta(milliseconds=1))
    if isinstance(obj, JavaParams):
        return obj._to_java()
    if isinstance(obj, collections.OrderedDict):
        return sc._jvm.java.util.LinkedHashMap(obj)
    return _standard_py2java(sc, obj)

def _java2py(sc, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        enumValues = r.getClass().getEnumConstants()
        if (enumValues is not None) and (len(enumValues) > 0):
            return globals()[r.getClass().getSimpleName()](r.swigValue())
        
        clsName = r.getClass().getName()
        if clsName == 'java.time.Duration':
            return datetime.timedelta(milliseconds=r.toMillis())
        if clsName == 'ai.catboost.spark.Pool':
            return Pool(r)
        if clsName == 'java.util.LinkedHashMap':
            return collections.OrderedDict(r)
    return _standard_java2py(sc, r, encoding)

pyspark.ml.common._py2java = _py2java
pyspark.ml.common._java2py = _java2py

pyspark.ml.wrapper._py2java = _py2java
pyspark.ml.wrapper._java2py = _java2py
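
# Illustrative sketch of the extra conversions (assumes an active SparkContext `sc` and the
# enum types defined later in this module):
#
#   _py2java(sc, ENanMode.Min)                     # -> JVM native_impl.ENanMode via swigToEnum
#   _py2java(sc, datetime.timedelta(minutes=1))    # -> java.time.Duration.ofMillis(60000)
#   _py2java(sc, collections.OrderedDict(a=1.0))   # -> java.util.LinkedHashMap
#
# _java2py reverses these mappings: SWIG enum constants come back as the Python Enum classes
# defined below, java.time.Duration as datetime.timedelta, and java.util.LinkedHashMap as
# collections.OrderedDict.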


@inherit_doc
class CatBoostMLReader(JavaMLReader):
    """
    (Private) Specialization of :py:class:`JavaMLReader` for CatBoost types
    """

    @classmethod
    def _java_loader_class(cls, clazz):
        """
        Returns the full class name of the Java ML instance.
        """
        java_package = clazz.__module__.replace("catboost_spark.core", "ai.catboost.spark")
        print("CatBoostMLReader._java_loader_class. ", java_package + "." + clazz.__name__)
        return java_package + "." + clazz.__name__
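
        # For example, for the Pool class defined in this module
        # (clazz.__module__ == "catboost_spark.core"), this returns "ai.catboost.spark.Pool".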



class PoolLoadParams(JavaParams):
    """
    Parameters
    ----------
    delimiter : str, default: "\t"
        The delimiter character used to separate the data in the dataset description input file.
    hasHeader : bool
        Read the column names from the first line of the dataset description file if this parameter is set.
    """

    @keyword_only
    def __init__(self, delimiter="\t", hasHeader=None):
        super(PoolLoadParams, self).__init__()
        self._java_obj = self._new_java_obj("ai.catboost.spark.params.PoolLoadParams")
        self.delimiter = Param(self, "delimiter", "The delimiter character used to separate the data in the dataset description input file.")
        self._setDefault(delimiter="\t")
        self.hasHeader = Param(self, "hasHeader", "Read the column names from the first line of the dataset description file if this parameter is set.")

        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, delimiter="\t", hasHeader=None):
        """
        Set the (keyword only) parameters

        Parameters
        ----------
        delimiter : str, default: "\t"
            The delimiter character used to separate the data in the dataset description input file.
        hasHeader : bool
            Read the column names from the first line of the dataset description file if this parameter is set.
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        return self._set(**kwargs)

    def getDelimiter(self):
        """
        Returns
        -------
        str
            The delimiter character used to separate the data in the dataset description input file.
        """
        return self.getOrDefault(self.delimiter)

    def setDelimiter(self, value):
        """
        Parameters
        ----------
        value : str
            The delimiter character used to separate the data in the dataset description input file.
        """
        self._set(delimiter=value)
        return self

    def getHasHeader(self):
        """
        Returns
        -------
        bool
            Read the column names from the first line of the dataset description file if this parameter is set.
        """
        return self.getOrDefault(self.hasHeader)

    def setHasHeader(self, value):
        """
        Parameters
        ----------
        value : bool
            Read the column names from the first line of the dataset description file if this parameter is set.
        """
        self._set(hasHeader=value)
        return self
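
# Example usage (illustrative):
#
#   loadParams = PoolLoadParams(delimiter=",", hasHeader=True)
#   loadParams.getDelimiter()   # ','
#   loadParams.getHasHeader()   # True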
class QuantizationParams(JavaParams):
    """
    Parameters
    ----------
    borderCount : int
        The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254.
    featureBorderType : EBorderSelectionType
        The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
    ignoredFeaturesIndices : list
        Feature indices to exclude from the training
    ignoredFeaturesNames : list
        Feature names to exclude from the training
    inputBorders : str
        Load custom quantization borders and missing value modes from a file (do not generate them)
    nanMode : ENanMode
        The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
    perFloatFeatureQuantizaton : list
        The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
    threadCount : int
        Number of CPU threads in parallel operations on client
    """

    @keyword_only
    def __init__(self, borderCount=None, featureBorderType=None, ignoredFeaturesIndices=None,
                 ignoredFeaturesNames=None, inputBorders=None, nanMode=None,
                 perFloatFeatureQuantizaton=None, threadCount=None):
        super(QuantizationParams, self).__init__()
        self._java_obj = self._new_java_obj("ai.catboost.spark.params.QuantizationParams")
        self.borderCount = Param(self, "borderCount", "The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254.")
        self.featureBorderType = Param(self, "featureBorderType", "The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'")
        self.ignoredFeaturesIndices = Param(self, "ignoredFeaturesIndices", "Feature indices to exclude from the training")
        self.ignoredFeaturesNames = Param(self, "ignoredFeaturesNames", "Feature names to exclude from the training")
        self.inputBorders = Param(self, "inputBorders", "Load custom quantization borders and missing value modes from a file (do not generate them)")
        self.nanMode = Param(self, "nanMode", "The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'")
        self.perFloatFeatureQuantizaton = Param(self, "perFloatFeatureQuantizaton", "The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]")
        self.threadCount = Param(self, "threadCount", "Number of CPU threads in parallel operations on client")

        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, borderCount=None, featureBorderType=None, ignoredFeaturesIndices=None,
                  ignoredFeaturesNames=None, inputBorders=None, nanMode=None,
                  perFloatFeatureQuantizaton=None, threadCount=None):
        """
        Set the (keyword only) parameters

        Parameters
        ----------
        borderCount : int
            The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254.
        featureBorderType : EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
        ignoredFeaturesIndices : list
            Feature indices to exclude from the training
        ignoredFeaturesNames : list
            Feature names to exclude from the training
        inputBorders : str
            Load custom quantization borders and missing value modes from a file (do not generate them)
        nanMode : ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
        perFloatFeatureQuantizaton : list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        threadCount : int
            Number of CPU threads in parallel operations on client
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        return self._set(**kwargs)

    def getBorderCount(self):
        """
        Returns
        -------
        int
            The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254.
        """
        return self.getOrDefault(self.borderCount)

    def setBorderCount(self, value):
        """
        Parameters
        ----------
        value : int
            The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254.
        """
        self._set(borderCount=value)
        return self

    def getFeatureBorderType(self):
        """
        Returns
        -------
        EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
        """
        return self.getOrDefault(self.featureBorderType)

    def setFeatureBorderType(self, value):
        """
        Parameters
        ----------
        value : EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
        """
        self._set(featureBorderType=value)
        return self

    def getIgnoredFeaturesIndices(self):
        """
        Returns
        -------
        list
            Feature indices to exclude from the training
        """
        return self.getOrDefault(self.ignoredFeaturesIndices)

    def setIgnoredFeaturesIndices(self, value):
        """
        Parameters
        ----------
        value : list
            Feature indices to exclude from the training
        """
        self._set(ignoredFeaturesIndices=value)
        return self

    def getIgnoredFeaturesNames(self):
        """
        Returns
        -------
        list
            Feature names to exclude from the training
        """
        return self.getOrDefault(self.ignoredFeaturesNames)

    def setIgnoredFeaturesNames(self, value):
        """
        Parameters
        ----------
        value : list
            Feature names to exclude from the training
        """
        self._set(ignoredFeaturesNames=value)
        return self

    def getInputBorders(self):
        """
        Returns
        -------
        str
            Load custom quantization borders and missing value modes from a file (do not generate them)
        """
        return self.getOrDefault(self.inputBorders)

    def setInputBorders(self, value):
        """
        Parameters
        ----------
        value : str
            Load custom quantization borders and missing value modes from a file (do not generate them)
        """
        self._set(inputBorders=value)
        return self

    def getNanMode(self):
        """
        Returns
        -------
        ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
        """
        return self.getOrDefault(self.nanMode)

    def setNanMode(self, value):
        """
        Parameters
        ----------
        value : ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
        """
        self._set(nanMode=value)
        return self

    def getPerFloatFeatureQuantizaton(self):
        """
        Returns
        -------
        list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        """
        return self.getOrDefault(self.perFloatFeatureQuantizaton)

    def setPerFloatFeatureQuantizaton(self, value):
        """
        Parameters
        ----------
        value : list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        """
        self._set(perFloatFeatureQuantizaton=value)
        return self

    def getThreadCount(self):
        """
        Returns
        -------
        int
            Number of CPU threads in parallel operations on client
        """
        return self.getOrDefault(self.threadCount)

    def setThreadCount(self, value):
        """
        Parameters
        ----------
        value : int
            Number of CPU threads in parallel operations on client
        """
        self._set(threadCount=value)
        return self
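
# Example usage (illustrative; `rawPool` stands for any Pool with raw features):
#
#   quantizationParams = QuantizationParams(borderCount=128, nanMode=ENanMode.Min)
#   quantizedPool = rawPool.quantize(quantizationParams)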
class Pool(JavaParams):
    """
    CatBoost's abstraction of a dataset.

    Features data can be stored in raw (features column has pyspark.ml.linalg.Vector type)
    or quantized (float feature values are quantized into integer bin values, features column
    has Array[Byte] type) form.

    Raw Pool can be transformed to quantized form using the `quantize` method.
    This is useful if the dataset is used for training multiple times and the quantization
    parameters do not change. A pre-quantized Pool caches the quantized features data, so the
    feature quantization step does not have to be re-run at the start of each training.
    """

    def __init__(self, data_frame_or_java_object, pairs_data_frame=None):
        """
        Construct Pool from DataFrame, optionally specifying pairs data in an additional DataFrame.
        """
        if isinstance(data_frame_or_java_object, JavaObject):
            java_obj = data_frame_or_java_object
        else:
            java_obj = JavaWrapper._new_java_obj("ai.catboost.spark.Pool", data_frame_or_java_object, pairs_data_frame)

        super(Pool, self).__init__(java_obj)
        self.baselineCol = Param(self, "baselineCol", "baseline column name")
        self.featuresCol = Param(self, "featuresCol", "features column name")
        self._setDefault(featuresCol="features")
        self.groupIdCol = Param(self, "groupIdCol", "groupId column name")
        self.groupWeightCol = Param(self, "groupWeightCol", "groupWeight column name")
        self.labelCol = Param(self, "labelCol", "label column name")
        self._setDefault(labelCol="label")
        self.sampleIdCol = Param(self, "sampleIdCol", "sampleId column name")
        self.subgroupIdCol = Param(self, "subgroupIdCol", "subgroupId column name")
        self.timestampCol = Param(self, "timestampCol", "timestamp column name")
        self.weightCol = Param(self, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0")

    @keyword_only
    def setParams(self, baselineCol=None, featuresCol="features", groupIdCol=None,
                  groupWeightCol=None, labelCol="label", sampleIdCol=None, subgroupIdCol=None,
                  timestampCol=None, weightCol=None):
        """
        Set the (keyword only) parameters

        Parameters
        ----------
        baselineCol : str
            baseline column name
        featuresCol : str, default: "features"
            features column name
        groupIdCol : str
            groupId column name
        groupWeightCol : str
            groupWeight column name
        labelCol : str, default: "label"
            label column name
        sampleIdCol : str
            sampleId column name
        subgroupIdCol : str
            subgroupId column name
        timestampCol : str
            timestamp column name
        weightCol : str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        return self._set(**kwargs)

    def getBaselineCol(self):
        """
        Returns
        -------
        str
            baseline column name
        """
        return self.getOrDefault(self.baselineCol)

    def setBaselineCol(self, value):
        """
        Parameters
        ----------
        value : str
            baseline column name
        """
        self._set(baselineCol=value)
        return self

    def getFeaturesCol(self):
        """
        Returns
        -------
        str
            features column name
        """
        return self.getOrDefault(self.featuresCol)

    def setFeaturesCol(self, value):
        """
        Parameters
        ----------
        value : str
            features column name
        """
        self._set(featuresCol=value)
        return self

    def getGroupIdCol(self):
        """
        Returns
        -------
        str
            groupId column name
        """
        return self.getOrDefault(self.groupIdCol)

    def setGroupIdCol(self, value):
        """
        Parameters
        ----------
        value : str
            groupId column name
        """
        self._set(groupIdCol=value)
        return self

    def getGroupWeightCol(self):
        """
        Returns
        -------
        str
            groupWeight column name
        """
        return self.getOrDefault(self.groupWeightCol)

    def setGroupWeightCol(self, value):
        """
        Parameters
        ----------
        value : str
            groupWeight column name
        """
        self._set(groupWeightCol=value)
        return self

    def getLabelCol(self):
        """
        Returns
        -------
        str
            label column name
        """
        return self.getOrDefault(self.labelCol)

    def setLabelCol(self, value):
        """
        Parameters
        ----------
        value : str
            label column name
        """
        self._set(labelCol=value)
        return self

    def getSampleIdCol(self):
        """
        Returns
        -------
        str
            sampleId column name
        """
        return self.getOrDefault(self.sampleIdCol)

    def setSampleIdCol(self, value):
        """
        Parameters
        ----------
        value : str
            sampleId column name
        """
        self._set(sampleIdCol=value)
        return self

    def getSubgroupIdCol(self):
        """
        Returns
        -------
        str
            subgroupId column name
        """
        return self.getOrDefault(self.subgroupIdCol)

    def setSubgroupIdCol(self, value):
        """
        Parameters
        ----------
        value : str
            subgroupId column name
        """
        self._set(subgroupIdCol=value)
        return self

    def getTimestampCol(self):
        """
        Returns
        -------
        str
            timestamp column name
        """
        return self.getOrDefault(self.timestampCol)

    def setTimestampCol(self, value):
        """
        Parameters
        ----------
        value : str
            timestamp column name
        """
        self._set(timestampCol=value)
        return self

    def getWeightCol(self):
        """
        Returns
        -------
        str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0
        """
        return self.getOrDefault(self.weightCol)

    def setWeightCol(self, value):
        """
        Parameters
        ----------
        value : str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0
        """
        self._set(weightCol=value)
        return self

    def _call_java(self, name, *args):
        self._transfer_params_to_java()
        return JavaWrapper._call_java(self, name, *args)

    def isQuantized(self):
        """
        Returns whether the main `data` has already been quantized.
        """
        return self._call_java("isQuantized")

    def getFeatureCount(self):
        """
        Returns the number of features.
        """
        return self._call_java("getFeatureCount")

    def getFeatureNames(self):
        """
        Returns the list of feature names.
        """
        return self._call_java("getFeatureNames")

    def count(self):
        """
        Returns the number of rows in the main `data` DataFrame.
        """
        return self._call_java("count")

    def pairsCount(self):
        """
        Returns the number of rows in the `pairsData` DataFrame.
        """
        return self._call_java("pairsCount")

    def getBaselineCount(self):
        """
        Returns the dimension of the baseline data (0 if not specified).
        """
        return self._call_java("getBaselineCount")

    @property
    def data(self):
        """
        DataFrame with the main data (features, label, (optionally) weight etc.)
        """
        return self._call_java("data")

    @property
    def pairsData(self):
        """
        DataFrame with the pairs data (groupId, winnerId, loserId and optionally weight).
        Can be None.
        """
        return self._call_java("pairsData")

    def quantize(self, quantizationParams=None):
        """Create Pool with quantized features from Pool with raw features"""
        if quantizationParams is None:
            quantizationParams = QuantizationParams()
        return self._call_java("quantize", quantizationParams)

    def repartition(self, partitionCount, byGroupColumnsIfPresent):
        """
        Repartition data to the specified number of partitions.
        Useful to repartition data to create one partition per executor for training
        (where each executor gets its own CatBoost worker with a part of the training data).
        """
        return self._call_java("repartition", partitionCount, byGroupColumnsIfPresent)

    @staticmethod
    def load(sparkSession, dataPathWithScheme, columnDescription=None, poolLoadParams=None,
             pairsDataPathWithScheme=None):
        """
        Load dataset in one of CatBoost's natively supported formats:

        * dsv - https://catboost.ai/docs/concepts/input-data_values-file.html
        * libsvm - https://catboost.ai/docs/concepts/input-data_libsvm.html

        Parameters
        ----------
        sparkSession : SparkSession
        dataPathWithScheme : str
            Path with scheme to dataset in CatBoost format.
            For example, `dsv:///home/user/datasets/my_dataset/train.dsv`
            or `libsvm:///home/user/datasets/my_dataset/train.libsvm`
        columnDescription : str, optional
            Path to column description file.
            See https://catboost.ai/docs/concepts/input-data_column-descfile.html
        poolLoadParams : PoolLoadParams, optional
            Additional params specifying data format.
        pairsDataPathWithScheme : str, optional
            Path with scheme to dataset pairs in CatBoost format.
            Only "dsv-grouped" format is supported for now.
            For example, `dsv-grouped:///home/user/datasets/my_dataset/train_pairs.dsv`

        Returns
        -------
        Pool
            Pool containing loaded data
        """
        if poolLoadParams is None:
            poolLoadParams = PoolLoadParams()
        sc = sparkSession.sparkContext
        java_obj = sc._jvm.ai.catboost.spark.Pool.load(
            _py2java(sc, sparkSession),
            dataPathWithScheme,
            (sc._jvm.java.nio.file.Paths.get(columnDescription, sc._gateway.new_array(sc._jvm.String, 0))
             if columnDescription
             else None),
            _py2java(sc, poolLoadParams),
            pairsDataPathWithScheme
        )
        return Pool(java_obj)
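
# Example usage (illustrative, reusing the example paths from the docstring above; the column
# description path is hypothetical):
#
#   pool = Pool.load(
#       spark,
#       "dsv:///home/user/datasets/my_dataset/train.dsv",
#       columnDescription="/home/user/datasets/my_dataset/cd"
#   )
#   quantizedPool = pool.quantize()   # quantize with default QuantizationParams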
class EAutoClassWeightsType(Enum):
    Balanced = 0
    SqrtBalanced = 1
    No = 2


class EBootstrapType(Enum):
    Poisson = 0
    Bayesian = 1
    Bernoulli = 2
    MVS = 3
    No = 4


class EBorderSelectionType(Enum):
    Median = 0
    GreedyLogSum = 1
    UniformAndQuantiles = 2
    MinEntropy = 3
    MaxLogSum = 4
    Uniform = 5
    GreedyMinEntropy = 6


class ECalcTypeShapValues(Enum):
    Approximate = 0
    Regular = 1
    Exact = 2
    Independent = 3


class EExplainableModelOutput(Enum):
    Raw = 0
    Probability = 1
    LossFunction = 2


class EFstrType(Enum):
    PredictionValuesChange = 0
    LossFunctionChange = 1
    FeatureImportance = 2
    InternalFeatureImportance = 3
    Interaction = 4
    InternalInteraction = 5
    ShapValues = 6
    PredictionDiff = 7
    ShapInteractionValues = 8


class ELeavesEstimation(Enum):
    Gradient = 0
    Newton = 1
    Exact = 2
    Simple = 3


class ELeavesEstimationStepBacktracking(Enum):
    No = 0
    AnyImprovement = 1
    Armijo = 2


class ELoggingLevel(Enum):
    Silent = 0
    Verbose = 1
    Info = 2
    Debug = 3


class EModelShrinkMode(Enum):
    Constant = 0
    Decreasing = 1


class EModelType(Enum):
    CatboostBinary = 0
    AppleCoreML = 1
    Cpp = 2
    Python = 3
    Json = 4
    Onnx = 5
    Pmml = 6
    CPUSnapshot = 7


class ENanMode(Enum):
    Min = 0
    Max = 1
    Forbidden = 2


class EOverfittingDetectorType(Enum):
    No = 0
    Wilcoxon = 1
    IncToDec = 2
    Iter = 3


class EPreCalcShapValues(Enum):
    Auto = 0
    UsePreCalc = 1
    NoPreCalc = 2


class ESamplingFrequency(Enum):
    PerTree = 0
    PerTreeLevel = 1


class ESamplingUnit(Enum):
    Object = 0
    Group = 1


class EScoreFunction(Enum):
    SolarL2 = 0
    Cosine = 1
    NewtonL2 = 2
    NewtonCosine = 3
    LOOL2 = 4
    SatL2 = 5
    L2 = 6
[docs]@inherit_doc class CatBoostRegressor(JavaEstimator, MLReadable, JavaMLWritable): """ Class to train CatBoostRegressionModel Init Parameters --------------- allowConstLabel : bool Use it to train models with datasets that have equal label values for all objects. allowWritingFiles : bool Allow to write analytical and snapshot files during training. Enabled by default. approxOnFullHistory : bool Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate. baggingTemperature : float This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value the more aggressive the bagging is.Default value in 1.0. bestModelMinTrees : int The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default. bootstrapType : EBootstrapType Bootstrap type. Defines the method for sampling the weights of objects.The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8. borderCount : int The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254. connectTimeout : datetime.timedelta, default: datetime.timedelta(milliseconds=60000) Timeout to wait while establishing socket connections between TrainingDriver and workers.Default is 1 minute customMetric : list Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric). depth : int Depth of the tree.Default value is 6. diffusionTemperature : float The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000. earlyStoppingRounds : int Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value. evalMetric : str The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric). featureBorderType : EBorderSelectionType The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum' featureWeightsList : list Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split.This parameter is mutually exclusive with featureWeightsMap. featureWeightsMap : dict Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split.This parameter is mutually exclusive with featureWeightsList. 
featuresCol : str, default: "features" features column name firstFeatureUsePenaltiesList : list Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap. firstFeatureUsePenaltiesMap : dict Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList. foldLenMultiplier : float Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0. foldPermutationBlock : int Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller is the value, the slower is the training. Large values may result in quality degradation. Default value is 1. hasTime : bool Use the order of objects in the input data (do not perform random permutations during Choosing the tree structure stage). ignoredFeaturesIndices : list Feature indices to exclude from the training ignoredFeaturesNames : list Feature names to exclude from the training inputBorders : str Load Custom quantization borders and missing value modes from a file (do not generate them) iterations : int The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000. l2LeafReg : float Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0. labelCol : str, default: "label" label column name leafEstimationBacktracking : ELeavesEstimationStepBacktracking When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement' leafEstimationIterations : int CatBoost might calculate leaf values using several gradient or newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values. leafEstimationMethod : ELeavesEstimation The method used to calculate the values in leaves. See documentation for details. learningRate : float The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03. loggingLevel : ELoggingLevel The logging level to output to stdout. See documentation for details. Default value is 'Verbose' lossFunction : str The metric to use in training. The specified value also determines the machine learning problem to solve. 
Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric). metricPeriod : int The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1. modelShrinkMode : EModelShrinkMode Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant' modelShrinkRate : float The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details. mvsReg : float Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling and to +Inf - Bernoulli).Note: This parameter is supported only for the MVS sampling method. nanMode : ENanMode The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min' odPval : float The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details.Turned off by default. odType : EOverfittingDetectorType The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec' odWait : int The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20. oneHotMaxSize : int Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features. penaltiesCoefficient : float A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0. perFloatFeatureQuantizaton : list The quantization description for the given list of features (one or more).Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method] perObjectFeaturePenaltiesList : list Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap. perObjectFeaturePenaltiesMap : dict Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList. predictionCol : str, default: "prediction" prediction column name randomSeed : int The random seed used for training. Default value is 0. randomStrength : float The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0 rsm : float Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1. 
samplingFrequency : ESamplingFrequency Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel' samplingUnit : ESamplingUnit The sampling scheme, see documentation for details. Default value is 'Object' saveSnapshot : bool Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period. scoreFunction : EScoreFunction The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine' snapshotFile : str The name of the file to save the training progress information in. This file is used for recovering training after an interruption. snapshotInterval : datetime.timedelta The interval between saving snapshots. See documentation for details. Default value is 600 seconds. sparkPartitionCount : int The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default subsample : float Sample rate for bagging. The default value depends on the dataset size and the bootstrap type, see documentation for details. threadCount : int Number of CPU threads in parallel operations on client trainDir : str The directory for storing the files on Driver node generated during training. Default value is 'catboost_info' useBestModel : bool If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided. weightCol : str weight column name. If this is not set or empty, we treat all instance weights as 1.0 workerInitializationTimeout : datetime.timedelta, default: datetime.timedelta(milliseconds=600000) Timeout to wait until CatBoost workers on Spark executors are initalized and sent their info to master. Depends on dataset size. Default is 10 minutes workerMaxFailures : int, default: 4 Number of individual CatBoost workers failures before giving up training. Should be greater than or equal to 1. 
Default is 4 """ @keyword_only def __init__(self, allowConstLabel=None, allowWritingFiles=None, approxOnFullHistory=None, baggingTemperature=None, bestModelMinTrees=None, bootstrapType=None, borderCount=None, connectTimeout=datetime.timedelta(milliseconds=60000), customMetric=None, depth=None, diffusionTemperature=None, earlyStoppingRounds=None, evalMetric=None, featureBorderType=None, featureWeightsList=None, featureWeightsMap=None, featuresCol="features", firstFeatureUsePenaltiesList=None, firstFeatureUsePenaltiesMap=None, foldLenMultiplier=None, foldPermutationBlock=None, hasTime=None, ignoredFeaturesIndices=None, ignoredFeaturesNames=None, inputBorders=None, iterations=None, l2LeafReg=None, labelCol="label", leafEstimationBacktracking=None, leafEstimationIterations=None, leafEstimationMethod=None, learningRate=None, loggingLevel=None, lossFunction=None, metricPeriod=None, modelShrinkMode=None, modelShrinkRate=None, mvsReg=None, nanMode=None, odPval=None, odType=None, odWait=None, oneHotMaxSize=None, penaltiesCoefficient=None, perFloatFeatureQuantizaton=None, perObjectFeaturePenaltiesList=None, perObjectFeaturePenaltiesMap=None, predictionCol="prediction", randomSeed=None, randomStrength=None, rsm=None, samplingFrequency=None, samplingUnit=None, saveSnapshot=None, scoreFunction=None, snapshotFile=None, snapshotInterval=None, sparkPartitionCount=None, subsample=None, threadCount=None, trainDir=None, useBestModel=None, weightCol=None, workerInitializationTimeout=datetime.timedelta(milliseconds=600000), workerMaxFailures=4): super(CatBoostRegressor, self).__init__() self._java_obj = self._new_java_obj("ai.catboost.spark.CatBoostRegressor") self.allowConstLabel = Param(self, "allowConstLabel", "Use it to train models with datasets that have equal label values for all objects.") self.allowWritingFiles = Param(self, "allowWritingFiles", "Allow to write analytical and snapshot files during training. Enabled by default.") self.approxOnFullHistory = Param(self, "approxOnFullHistory", "Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate.") self.baggingTemperature = Param(self, "baggingTemperature", "This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value the more aggressive the bagging is.Default value in 1.0.") self.bestModelMinTrees = Param(self, "bestModelMinTrees", "The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default.") self.bootstrapType = Param(self, "bootstrapType", "Bootstrap type. Defines the method for sampling the weights of objects.The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8.") self.borderCount = Param(self, "borderCount", "The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. 
Default value is 254.") self.connectTimeout = Param(self, "connectTimeout", "Timeout to wait while establishing socket connections between TrainingDriver and workers.Default is 1 minute") self._setDefault(connectTimeout=datetime.timedelta(milliseconds=60000)) self.customMetric = Param(self, "customMetric", "Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).") self.depth = Param(self, "depth", "Depth of the tree.Default value is 6.") self.diffusionTemperature = Param(self, "diffusionTemperature", "The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.") self.earlyStoppingRounds = Param(self, "earlyStoppingRounds", "Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.") self.evalMetric = Param(self, "evalMetric", "The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).") self.featureBorderType = Param(self, "featureBorderType", "The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'") self.featureWeightsList = Param(self, "featureWeightsList", "Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split.This parameter is mutually exclusive with featureWeightsMap.") self.featureWeightsMap = Param(self, "featureWeightsMap", "Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split.This parameter is mutually exclusive with featureWeightsList.") self.featuresCol = Param(self, "featuresCol", "features column name") self._setDefault(featuresCol="features") self.firstFeatureUsePenaltiesList = Param(self, "firstFeatureUsePenaltiesList", "Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.") self.firstFeatureUsePenaltiesMap = Param(self, "firstFeatureUsePenaltiesMap", "Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.") self.foldLenMultiplier = Param(self, "foldLenMultiplier", "Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.") self.foldPermutationBlock = Param(self, "foldPermutationBlock", "Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. 
The smaller is the value, the slower is the training. Large values may result in quality degradation. Default value is 1.") self.hasTime = Param(self, "hasTime", "Use the order of objects in the input data (do not perform random permutations during Choosing the tree structure stage).") self.ignoredFeaturesIndices = Param(self, "ignoredFeaturesIndices", "Feature indices to exclude from the training") self.ignoredFeaturesNames = Param(self, "ignoredFeaturesNames", "Feature names to exclude from the training") self.inputBorders = Param(self, "inputBorders", "Load Custom quantization borders and missing value modes from a file (do not generate them)") self.iterations = Param(self, "iterations", "The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.") self.l2LeafReg = Param(self, "l2LeafReg", "Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.") self.labelCol = Param(self, "labelCol", "label column name") self._setDefault(labelCol="label") self.leafEstimationBacktracking = Param(self, "leafEstimationBacktracking", "When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'") self.leafEstimationIterations = Param(self, "leafEstimationIterations", "CatBoost might calculate leaf values using several gradient or newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.") self.leafEstimationMethod = Param(self, "leafEstimationMethod", "The method used to calculate the values in leaves. See documentation for details.") self.learningRate = Param(self, "learningRate", "The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.") self.loggingLevel = Param(self, "loggingLevel", "The logging level to output to stdout. See documentation for details. Default value is 'Verbose'") self.lossFunction = Param(self, "lossFunction", "The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).") self.metricPeriod = Param(self, "metricPeriod", "The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.") self.modelShrinkMode = Param(self, "modelShrinkMode", "Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'") self.modelShrinkRate = Param(self, "modelShrinkRate", "The constant used to calculate the coefficient for multiplying the model on each iteration. 
See documentation for details.") self.mvsReg = Param(self, "mvsReg", "Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling and to +Inf - Bernoulli).Note: This parameter is supported only for the MVS sampling method.") self.nanMode = Param(self, "nanMode", "The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'") self.odPval = Param(self, "odPval", "The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details.Turned off by default.") self.odType = Param(self, "odType", "The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'") self.odWait = Param(self, "odWait", "The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.") self.oneHotMaxSize = Param(self, "oneHotMaxSize", "Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.") self.penaltiesCoefficient = Param(self, "penaltiesCoefficient", "A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.") self.perFloatFeatureQuantizaton = Param(self, "perFloatFeatureQuantizaton", "The quantization description for the given list of features (one or more).Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]") self.perObjectFeaturePenaltiesList = Param(self, "perObjectFeaturePenaltiesList", "Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.") self.perObjectFeaturePenaltiesMap = Param(self, "perObjectFeaturePenaltiesMap", "Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.") self.predictionCol = Param(self, "predictionCol", "prediction column name") self._setDefault(predictionCol="prediction") self.randomSeed = Param(self, "randomSeed", "The random seed used for training. Default value is 0.") self.randomStrength = Param(self, "randomStrength", "The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0") self.rsm = Param(self, "rsm", "Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.") self.samplingFrequency = Param(self, "samplingFrequency", "Frequency to sample weights and objects when building trees. 
Default value is 'PerTreeLevel'") self.samplingUnit = Param(self, "samplingUnit", "The sampling scheme, see documentation for details. Default value is 'Object'") self.saveSnapshot = Param(self, "saveSnapshot", "Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.") self.scoreFunction = Param(self, "scoreFunction", "The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'") self.snapshotFile = Param(self, "snapshotFile", "The name of the file to save the training progress information in. This file is used for recovering training after an interruption.") self.snapshotInterval = Param(self, "snapshotInterval", "The interval between saving snapshots. See documentation for details. Default value is 600 seconds.") self.sparkPartitionCount = Param(self, "sparkPartitionCount", "The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default") self.subsample = Param(self, "subsample", "Sample rate for bagging. The default value depends on the dataset size and the bootstrap type, see documentation for details.") self.threadCount = Param(self, "threadCount", "Number of CPU threads in parallel operations on client") self.trainDir = Param(self, "trainDir", "The directory for storing the files on Driver node generated during training. Default value is 'catboost_info'") self.useBestModel = Param(self, "useBestModel", "If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.") self.weightCol = Param(self, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0") self.workerInitializationTimeout = Param(self, "workerInitializationTimeout", "Timeout to wait until CatBoost workers on Spark executors are initalized and sent their info to master. Depends on dataset size. Default is 10 minutes") self._setDefault(workerInitializationTimeout=datetime.timedelta(milliseconds=600000)) self.workerMaxFailures = Param(self, "workerMaxFailures", "Number of individual CatBoost workers failures before giving up training. Should be greater than or equal to 1. Default is 4") self._setDefault(workerMaxFailures=4) if hasattr(self, "_input_kwargs"): kwargs = self._input_kwargs else: kwargs = self.__init__._input_kwargs self.setParams(**kwargs)
[docs] @keyword_only def setParams(self, allowConstLabel=None, allowWritingFiles=None, approxOnFullHistory=None, baggingTemperature=None, bestModelMinTrees=None, bootstrapType=None, borderCount=None, connectTimeout=datetime.timedelta(milliseconds=60000), customMetric=None, depth=None, diffusionTemperature=None, earlyStoppingRounds=None, evalMetric=None, featureBorderType=None, featureWeightsList=None, featureWeightsMap=None, featuresCol="features", firstFeatureUsePenaltiesList=None, firstFeatureUsePenaltiesMap=None, foldLenMultiplier=None, foldPermutationBlock=None, hasTime=None, ignoredFeaturesIndices=None, ignoredFeaturesNames=None, inputBorders=None, iterations=None, l2LeafReg=None, labelCol="label", leafEstimationBacktracking=None, leafEstimationIterations=None, leafEstimationMethod=None, learningRate=None, loggingLevel=None, lossFunction=None, metricPeriod=None, modelShrinkMode=None, modelShrinkRate=None, mvsReg=None, nanMode=None, odPval=None, odType=None, odWait=None, oneHotMaxSize=None, penaltiesCoefficient=None, perFloatFeatureQuantizaton=None, perObjectFeaturePenaltiesList=None, perObjectFeaturePenaltiesMap=None, predictionCol="prediction", randomSeed=None, randomStrength=None, rsm=None, samplingFrequency=None, samplingUnit=None, saveSnapshot=None, scoreFunction=None, snapshotFile=None, snapshotInterval=None, sparkPartitionCount=None, subsample=None, threadCount=None, trainDir=None, useBestModel=None, weightCol=None, workerInitializationTimeout=datetime.timedelta(milliseconds=600000), workerMaxFailures=4): """ Set the (keyword only) parameters Parameters ---------- allowConstLabel : bool Use it to train models with datasets that have equal label values for all objects. allowWritingFiles : bool Allow to write analytical and snapshot files during training. Enabled by default. approxOnFullHistory : bool Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate. baggingTemperature : float This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value the more aggressive the bagging is.Default value in 1.0. bestModelMinTrees : int The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default. bootstrapType : EBootstrapType Bootstrap type. Defines the method for sampling the weights of objects.The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8. borderCount : int The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254. connectTimeout : datetime.timedelta, default: datetime.timedelta(milliseconds=60000) Timeout to wait while establishing socket connections between TrainingDriver and workers.Default is 1 minute customMetric : list Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric). depth : int Depth of the tree.Default value is 6. 
        diffusionTemperature : float
            The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.
        earlyStoppingRounds : int
            Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.
        evalMetric : str
            The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        featureBorderType : EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
        featureWeightsList : list
            Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsMap.
        featureWeightsMap : dict
            Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsList.
        featuresCol : str, default: "features"
            features column name
        firstFeatureUsePenaltiesList : list
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.
        firstFeatureUsePenaltiesMap : dict
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.
        foldLenMultiplier : float
            Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.
        foldPermutationBlock : int
            Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation. Default value is 1.
        hasTime : bool
            Use the order of objects in the input data (do not perform random permutations during the Choosing the tree structure stage).
        ignoredFeaturesIndices : list
            Feature indices to exclude from the training
        ignoredFeaturesNames : list
            Feature names to exclude from the training
        inputBorders : str
            Load custom quantization borders and missing value modes from a file (do not generate them)
        iterations : int
            The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.
        l2LeafReg : float
            Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.
        labelCol : str, default: "label"
            label column name
        leafEstimationBacktracking : ELeavesEstimationStepBacktracking
            When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or Newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'
        leafEstimationIterations : int
            CatBoost might calculate leaf values using several gradient or Newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.
        leafEstimationMethod : ELeavesEstimation
            The method used to calculate the values in leaves. See documentation for details.
        learningRate : float
            The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', 'leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.
        loggingLevel : ELoggingLevel
            The logging level to output to stdout. See documentation for details. Default value is 'Verbose'
        lossFunction : str
            The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        metricPeriod : int
            The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.
        modelShrinkMode : EModelShrinkMode
            Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'
        modelShrinkRate : float
            The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details.
        mvsReg : float
            Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling; +Inf implies Bernoulli). Note: this parameter is supported only for the MVS sampling method.
        nanMode : ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
        odPval : float
            The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details. Turned off by default.
        odType : EOverfittingDetectorType
            The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'
        odWait : int
            The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.
        oneHotMaxSize : int
            Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.
        penaltiesCoefficient : float
            A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.
        perFloatFeatureQuantizaton : list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        perObjectFeaturePenaltiesList : list
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.
        perObjectFeaturePenaltiesMap : dict
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.
        predictionCol : str, default: "prediction"
            prediction column name
        randomSeed : int
            The random seed used for training. Default value is 0.
        randomStrength : float
            The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0
        rsm : float
            Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.
        samplingFrequency : ESamplingFrequency
            Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel'
        samplingUnit : ESamplingUnit
            The sampling scheme, see documentation for details. Default value is 'Object'
        saveSnapshot : bool
            Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.
        scoreFunction : EScoreFunction
            The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'
        snapshotFile : str
            The name of the file to save the training progress information in. This file is used for recovering training after an interruption.
        snapshotInterval : datetime.timedelta
            The interval between saving snapshots. See documentation for details. Default value is 600 seconds.
        sparkPartitionCount : int
            The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default
        subsample : float
            Sample rate for bagging. The default value depends on the dataset size and the bootstrap type, see documentation for details.
        threadCount : int
            Number of CPU threads in parallel operations on the client
        trainDir : str
            The directory on the Driver node for storing the files generated during training. Default value is 'catboost_info'
        useBestModel : bool
            If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.
        weightCol : str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0
        workerInitializationTimeout : datetime.timedelta, default: datetime.timedelta(milliseconds=600000)
            Timeout to wait until CatBoost workers on Spark executors are initialized and have sent their info to the master. Depends on dataset size.
            Default is 10 minutes.
        workerMaxFailures : int, default: 4
            Number of individual CatBoost worker failures before giving up training. Should be greater than or equal to 1. Default is 4
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        return self._set(**kwargs)
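
    # Usage sketch (illustrative, not part of the API): assuming a DataFrame
    # `train_df` with "features" and "label" columns, the keyword-only parameters
    # above can be passed at construction time or via the setters below, e.g.:
    #
    #   import datetime
    #   regressor = CatBoostRegressor(
    #       iterations=100,
    #       learningRate=0.1,
    #       connectTimeout=datetime.timedelta(minutes=2),
    #   )
    #   model = regressor.fit(train_df)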

    def getAllowConstLabel(self):
        """
        Returns
        -------
        bool
            Use it to train models with datasets that have equal label values for all objects.
        """
        return self.getOrDefault(self.allowConstLabel)

    def setAllowConstLabel(self, value):
        """
        Parameters
        ----------
        value : bool
            Use it to train models with datasets that have equal label values for all objects.
        """
        self._set(allowConstLabel=value)
        return self

    def getAllowWritingFiles(self):
        """
        Returns
        -------
        bool
            Allow writing analytical and snapshot files during training. Enabled by default.
        """
        return self.getOrDefault(self.allowWritingFiles)

    def setAllowWritingFiles(self, value):
        """
        Parameters
        ----------
        value : bool
            Allow writing analytical and snapshot files during training. Enabled by default.
        """
        self._set(allowWritingFiles=value)
        return self

    def getApproxOnFullHistory(self):
        """
        Returns
        -------
        bool
            Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate.
        """
        return self.getOrDefault(self.approxOnFullHistory)

    def setApproxOnFullHistory(self, value):
        """
        Parameters
        ----------
        value : bool
            Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate.
        """
        self._set(approxOnFullHistory=value)
        return self

    def getBaggingTemperature(self):
        """
        Returns
        -------
        float
            This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value, the more aggressive the bagging is. Default value is 1.0.
        """
        return self.getOrDefault(self.baggingTemperature)

    def setBaggingTemperature(self, value):
        """
        Parameters
        ----------
        value : float
            This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value, the more aggressive the bagging is. Default value is 1.0.
        """
        self._set(baggingTemperature=value)
        return self

    def getBestModelMinTrees(self):
        """
        Returns
        -------
        int
            The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default.
        """
        return self.getOrDefault(self.bestModelMinTrees)

    def setBestModelMinTrees(self, value):
        """
        Parameters
        ----------
        value : int
            The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default.
        """
        self._set(bestModelMinTrees=value)
        return self

    def getBootstrapType(self):
        """
        Returns
        -------
        EBootstrapType
            Bootstrap type. Defines the method for sampling the weights of objects. The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8.
        """
        return self.getOrDefault(self.bootstrapType)

    def setBootstrapType(self, value):
        """
        Parameters
        ----------
        value : EBootstrapType
            Bootstrap type. Defines the method for sampling the weights of objects. The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8.
        """
        self._set(bootstrapType=value)
        return self

    def getBorderCount(self):
        """
        Returns
        -------
        int
            The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254.
        """
        return self.getOrDefault(self.borderCount)

    def setBorderCount(self, value):
        """
        Parameters
        ----------
        value : int
            The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusively. Default value is 254.
        """
        self._set(borderCount=value)
        return self

    def getConnectTimeout(self):
        """
        Returns
        -------
        datetime.timedelta
            Timeout to wait while establishing socket connections between TrainingDriver and workers. Default is 1 minute.
        """
        return self.getOrDefault(self.connectTimeout)

    def setConnectTimeout(self, value):
        """
        Parameters
        ----------
        value : datetime.timedelta
            Timeout to wait while establishing socket connections between TrainingDriver and workers. Default is 1 minute.
        """
        self._set(connectTimeout=value)
        return self
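
    # Note: duration-valued parameters such as connectTimeout take datetime.timedelta
    # values, e.g. (sketch): estimator.setConnectTimeout(datetime.timedelta(minutes=2))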

    def getCustomMetric(self):
        """
        Returns
        -------
        list
            Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        return self.getOrDefault(self.customMetric)

    def setCustomMetric(self, value):
        """
        Parameters
        ----------
        value : list
            Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        self._set(customMetric=value)
        return self

    def getDepth(self):
        """
        Returns
        -------
        int
            Depth of the tree. Default value is 6.
        """
        return self.getOrDefault(self.depth)

    def setDepth(self, value):
        """
        Parameters
        ----------
        value : int
            Depth of the tree. Default value is 6.
        """
        self._set(depth=value)
        return self

    def getDiffusionTemperature(self):
        """
        Returns
        -------
        float
            The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.
        """
        return self.getOrDefault(self.diffusionTemperature)

    def setDiffusionTemperature(self, value):
        """
        Parameters
        ----------
        value : float
            The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.
        """
        self._set(diffusionTemperature=value)
        return self

    def getEarlyStoppingRounds(self):
        """
        Returns
        -------
        int
            Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.
        """
        return self.getOrDefault(self.earlyStoppingRounds)

    def setEarlyStoppingRounds(self, value):
        """
        Parameters
        ----------
        value : int
            Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.
        """
        self._set(earlyStoppingRounds=value)
        return self

    def getEvalMetric(self):
        """
        Returns
        -------
        str
            The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        return self.getOrDefault(self.evalMetric)

    def setEvalMetric(self, value):
        """
        Parameters
        ----------
        value : str
            The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        self._set(evalMetric=value)
        return self

    def getFeatureBorderType(self):
        """
        Returns
        -------
        EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
        """
        return self.getOrDefault(self.featureBorderType)

    def setFeatureBorderType(self, value):
        """
        Parameters
        ----------
        value : EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
        """
        self._set(featureBorderType=value)
        return self

    def getFeatureWeightsList(self):
        """
        Returns
        -------
        list
            Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsMap.
        """
        return self.getOrDefault(self.featureWeightsList)

    def setFeatureWeightsList(self, value):
        """
        Parameters
        ----------
        value : list
            Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsMap.
        """
        self._set(featureWeightsList=value)
        return self

    def getFeatureWeightsMap(self):
        """
        Returns
        -------
        dict
            Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsList.
        """
        return self.getOrDefault(self.featureWeightsMap)

    def setFeatureWeightsMap(self, value):
        """
        Parameters
        ----------
        value : dict
            Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsList.
        """
        self._set(featureWeightsMap=value)
        return self
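
    # The list and map forms carry the same information and are mutually exclusive;
    # set only one. Sketch (feature names here are illustrative, not part of the API):
    #   estimator.setFeatureWeightsList([1.0, 0.5, 2.0])          # indexed by feature
    #   estimator.setFeatureWeightsMap({"f1": 0.5, "f2": 2.0})    # keyed by feature name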

    def getFeaturesCol(self):
        """
        Returns
        -------
        str
            features column name
        """
        return self.getOrDefault(self.featuresCol)

    def setFeaturesCol(self, value):
        """
        Parameters
        ----------
        value : str
            features column name
        """
        self._set(featuresCol=value)
        return self

    def getFirstFeatureUsePenaltiesList(self):
        """
        Returns
        -------
        list
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.
        """
        return self.getOrDefault(self.firstFeatureUsePenaltiesList)

    def setFirstFeatureUsePenaltiesList(self, value):
        """
        Parameters
        ----------
        value : list
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.
        """
        self._set(firstFeatureUsePenaltiesList=value)
        return self

    def getFirstFeatureUsePenaltiesMap(self):
        """
        Returns
        -------
        dict
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.
        """
        return self.getOrDefault(self.firstFeatureUsePenaltiesMap)

    def setFirstFeatureUsePenaltiesMap(self, value):
        """
        Parameters
        ----------
        value : dict
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.
        """
        self._set(firstFeatureUsePenaltiesMap=value)
        return self

    def getFoldLenMultiplier(self):
        """
        Returns
        -------
        float
            Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.
        """
        return self.getOrDefault(self.foldLenMultiplier)

    def setFoldLenMultiplier(self, value):
        """
        Parameters
        ----------
        value : float
            Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.
        """
        self._set(foldLenMultiplier=value)
        return self

    def getFoldPermutationBlock(self):
        """
        Returns
        -------
        int
            Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation. Default value is 1.
        """
        return self.getOrDefault(self.foldPermutationBlock)

    def setFoldPermutationBlock(self, value):
        """
        Parameters
        ----------
        value : int
            Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation. Default value is 1.
        """
        self._set(foldPermutationBlock=value)
        return self

    def getHasTime(self):
        """
        Returns
        -------
        bool
            Use the order of objects in the input data (do not perform random permutations during the Choosing the tree structure stage).
        """
        return self.getOrDefault(self.hasTime)

    def setHasTime(self, value):
        """
        Parameters
        ----------
        value : bool
            Use the order of objects in the input data (do not perform random permutations during the Choosing the tree structure stage).
        """
        self._set(hasTime=value)
        return self

    def getIgnoredFeaturesIndices(self):
        """
        Returns
        -------
        list
            Feature indices to exclude from the training
        """
        return self.getOrDefault(self.ignoredFeaturesIndices)

    def setIgnoredFeaturesIndices(self, value):
        """
        Parameters
        ----------
        value : list
            Feature indices to exclude from the training
        """
        self._set(ignoredFeaturesIndices=value)
        return self

    def getIgnoredFeaturesNames(self):
        """
        Returns
        -------
        list
            Feature names to exclude from the training
        """
        return self.getOrDefault(self.ignoredFeaturesNames)

    def setIgnoredFeaturesNames(self, value):
        """
        Parameters
        ----------
        value : list
            Feature names to exclude from the training
        """
        self._set(ignoredFeaturesNames=value)
        return self

    def getInputBorders(self):
        """
        Returns
        -------
        str
            Load custom quantization borders and missing value modes from a file (do not generate them)
        """
        return self.getOrDefault(self.inputBorders)

    def setInputBorders(self, value):
        """
        Parameters
        ----------
        value : str
            Load custom quantization borders and missing value modes from a file (do not generate them)
        """
        self._set(inputBorders=value)
        return self

    def getIterations(self):
        """
        Returns
        -------
        int
            The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.
        """
        return self.getOrDefault(self.iterations)

    def setIterations(self, value):
        """
        Parameters
        ----------
        value : int
            The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.
        """
        self._set(iterations=value)
        return self

    def getL2LeafReg(self):
        """
        Returns
        -------
        float
            Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.
        """
        return self.getOrDefault(self.l2LeafReg)

    def setL2LeafReg(self, value):
        """
        Parameters
        ----------
        value : float
            Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.
        """
        self._set(l2LeafReg=value)
        return self

    def getLabelCol(self):
        """
        Returns
        -------
        str
            label column name
        """
        return self.getOrDefault(self.labelCol)

    def setLabelCol(self, value):
        """
        Parameters
        ----------
        value : str
            label column name
        """
        self._set(labelCol=value)
        return self

    def getLeafEstimationBacktracking(self):
        """
        Returns
        -------
        ELeavesEstimationStepBacktracking
            When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or Newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'
        """
        return self.getOrDefault(self.leafEstimationBacktracking)

    def setLeafEstimationBacktracking(self, value):
        """
        Parameters
        ----------
        value : ELeavesEstimationStepBacktracking
            When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or Newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'
        """
        self._set(leafEstimationBacktracking=value)
        return self

    def getLeafEstimationIterations(self):
        """
        Returns
        -------
        int
            CatBoost might calculate leaf values using several gradient or Newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.
        """
        return self.getOrDefault(self.leafEstimationIterations)

    def setLeafEstimationIterations(self, value):
        """
        Parameters
        ----------
        value : int
            CatBoost might calculate leaf values using several gradient or Newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.
        """
        self._set(leafEstimationIterations=value)
        return self

    def getLeafEstimationMethod(self):
        """
        Returns
        -------
        ELeavesEstimation
            The method used to calculate the values in leaves. See documentation for details.
        """
        return self.getOrDefault(self.leafEstimationMethod)

    def setLeafEstimationMethod(self, value):
        """
        Parameters
        ----------
        value : ELeavesEstimation
            The method used to calculate the values in leaves. See documentation for details.
        """
        self._set(leafEstimationMethod=value)
        return self

    def getLearningRate(self):
        """
        Returns
        -------
        float
            The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', 'leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.
        """
        return self.getOrDefault(self.learningRate)

    def setLearningRate(self, value):
        """
        Parameters
        ----------
        value : float
            The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', 'leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.
        """
        self._set(learningRate=value)
        return self

    def getLoggingLevel(self):
        """
        Returns
        -------
        ELoggingLevel
            The logging level to output to stdout. See documentation for details. Default value is 'Verbose'
        """
        return self.getOrDefault(self.loggingLevel)

    def setLoggingLevel(self, value):
        """
        Parameters
        ----------
        value : ELoggingLevel
            The logging level to output to stdout. See documentation for details. Default value is 'Verbose'
        """
        self._set(loggingLevel=value)
        return self

    def getLossFunction(self):
        """
        Returns
        -------
        str
            The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        return self.getOrDefault(self.lossFunction)

    def setLossFunction(self, value):
        """
        Parameters
        ----------
        value : str
            The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        self._set(lossFunction=value)
        return self

    def getMetricPeriod(self):
        """
        Returns
        -------
        int
            The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.
        """
        return self.getOrDefault(self.metricPeriod)

    def setMetricPeriod(self, value):
        """
        Parameters
        ----------
        value : int
            The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.
        """
        self._set(metricPeriod=value)
        return self

    def getModelShrinkMode(self):
        """
        Returns
        -------
        EModelShrinkMode
            Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'
        """
        return self.getOrDefault(self.modelShrinkMode)

    def setModelShrinkMode(self, value):
        """
        Parameters
        ----------
        value : EModelShrinkMode
            Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'
        """
        self._set(modelShrinkMode=value)
        return self

    def getModelShrinkRate(self):
        """
        Returns
        -------
        float
            The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details.
        """
        return self.getOrDefault(self.modelShrinkRate)

    def setModelShrinkRate(self, value):
        """
        Parameters
        ----------
        value : float
            The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details.
        """
        self._set(modelShrinkRate=value)
        return self

    def getMvsReg(self):
        """
        Returns
        -------
        float
            Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling; +Inf implies Bernoulli). Note: this parameter is supported only for the MVS sampling method.
        """
        return self.getOrDefault(self.mvsReg)

    def setMvsReg(self, value):
        """
        Parameters
        ----------
        value : float
            Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling; +Inf implies Bernoulli). Note: this parameter is supported only for the MVS sampling method.
        """
        self._set(mvsReg=value)
        return self

    def getNanMode(self):
        """
        Returns
        -------
        ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
        """
        return self.getOrDefault(self.nanMode)

    def setNanMode(self, value):
        """
        Parameters
        ----------
        value : ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
        """
        self._set(nanMode=value)
        return self

    def getOdPval(self):
        """
        Returns
        -------
        float
            The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details. Turned off by default.
        """
        return self.getOrDefault(self.odPval)

    def setOdPval(self, value):
        """
        Parameters
        ----------
        value : float
            The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details. Turned off by default.
        """
        self._set(odPval=value)
        return self

    def getOdType(self):
        """
        Returns
        -------
        EOverfittingDetectorType
            The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'
        """
        return self.getOrDefault(self.odType)

    def setOdType(self, value):
        """
        Parameters
        ----------
        value : EOverfittingDetectorType
            The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'
        """
        self._set(odType=value)
        return self

    def getOdWait(self):
        """
        Returns
        -------
        int
            The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.
        """
        return self.getOrDefault(self.odWait)

    def setOdWait(self, value):
        """
        Parameters
        ----------
        value : int
            The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.
        """
        self._set(odWait=value)
        return self

    def getOneHotMaxSize(self):
        """
        Returns
        -------
        int
            Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.
        """
        return self.getOrDefault(self.oneHotMaxSize)

    def setOneHotMaxSize(self, value):
        """
        Parameters
        ----------
        value : int
            Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.
        """
        self._set(oneHotMaxSize=value)
        return self

    def getPenaltiesCoefficient(self):
        """
        Returns
        -------
        float
            A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.
        """
        return self.getOrDefault(self.penaltiesCoefficient)

    def setPenaltiesCoefficient(self, value):
        """
        Parameters
        ----------
        value : float
            A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.
        """
        self._set(penaltiesCoefficient=value)
        return self

    def getPerFloatFeatureQuantizaton(self):
        """
        Returns
        -------
        list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        """
        return self.getOrDefault(self.perFloatFeatureQuantizaton)

    def setPerFloatFeatureQuantizaton(self, value):
        """
        Parameters
        ----------
        value : list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        """
        self._set(perFloatFeatureQuantizaton=value)
        return self

    def getPerObjectFeaturePenaltiesList(self):
        """
        Returns
        -------
        list
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.
        """
        return self.getOrDefault(self.perObjectFeaturePenaltiesList)

    def setPerObjectFeaturePenaltiesList(self, value):
        """
        Parameters
        ----------
        value : list
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.
        """
        self._set(perObjectFeaturePenaltiesList=value)
        return self

    def getPerObjectFeaturePenaltiesMap(self):
        """
        Returns
        -------
        dict
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.
        """
        return self.getOrDefault(self.perObjectFeaturePenaltiesMap)

    def setPerObjectFeaturePenaltiesMap(self, value):
        """
        Parameters
        ----------
        value : dict
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.
        """
        self._set(perObjectFeaturePenaltiesMap=value)
        return self

    def getPredictionCol(self):
        """
        Returns
        -------
        str
            prediction column name
        """
        return self.getOrDefault(self.predictionCol)

    def setPredictionCol(self, value):
        """
        Parameters
        ----------
        value : str
            prediction column name
        """
        self._set(predictionCol=value)
        return self

    def getRandomSeed(self):
        """
        Returns
        -------
        int
            The random seed used for training. Default value is 0.
        """
        return self.getOrDefault(self.randomSeed)

    def setRandomSeed(self, value):
        """
        Parameters
        ----------
        value : int
            The random seed used for training. Default value is 0.
        """
        self._set(randomSeed=value)
        return self

    def getRandomStrength(self):
        """
        Returns
        -------
        float
            The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0
        """
        return self.getOrDefault(self.randomStrength)

    def setRandomStrength(self, value):
        """
        Parameters
        ----------
        value : float
            The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0
        """
        self._set(randomStrength=value)
        return self

    def getRsm(self):
        """
        Returns
        -------
        float
            Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.
        """
        return self.getOrDefault(self.rsm)

    def setRsm(self, value):
        """
        Parameters
        ----------
        value : float
            Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.
        """
        self._set(rsm=value)
        return self

    def getSamplingFrequency(self):
        """
        Returns
        -------
        ESamplingFrequency
            Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel'
        """
        return self.getOrDefault(self.samplingFrequency)

    def setSamplingFrequency(self, value):
        """
        Parameters
        ----------
        value : ESamplingFrequency
            Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel'
        """
        self._set(samplingFrequency=value)
        return self

    def getSamplingUnit(self):
        """
        Returns
        -------
        ESamplingUnit
            The sampling scheme, see documentation for details. Default value is 'Object'
        """
        return self.getOrDefault(self.samplingUnit)

    def setSamplingUnit(self, value):
        """
        Parameters
        ----------
        value : ESamplingUnit
            The sampling scheme, see documentation for details. Default value is 'Object'
        """
        self._set(samplingUnit=value)
        return self

    def getSaveSnapshot(self):
        """
        Returns
        -------
        bool
            Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.
        """
        return self.getOrDefault(self.saveSnapshot)

    def setSaveSnapshot(self, value):
        """
        Parameters
        ----------
        value : bool
            Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.
        """
        self._set(saveSnapshot=value)
        return self

    def getScoreFunction(self):
        """
        Returns
        -------
        EScoreFunction
            The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'
        """
        return self.getOrDefault(self.scoreFunction)

    def setScoreFunction(self, value):
        """
        Parameters
        ----------
        value : EScoreFunction
            The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'
        """
        self._set(scoreFunction=value)
        return self

    def getSnapshotFile(self):
        """
        Returns
        -------
        str
            The name of the file to save the training progress information in. This file is used for recovering training after an interruption.
        """
        return self.getOrDefault(self.snapshotFile)

    def setSnapshotFile(self, value):
        """
        Parameters
        ----------
        value : str
            The name of the file to save the training progress information in. This file is used for recovering training after an interruption.
        """
        self._set(snapshotFile=value)
        return self

    def getSnapshotInterval(self):
        """
        Returns
        -------
        datetime.timedelta
            The interval between saving snapshots. See documentation for details. Default value is 600 seconds.
        """
        return self.getOrDefault(self.snapshotInterval)

    def setSnapshotInterval(self, value):
        """
        Parameters
        ----------
        value : datetime.timedelta
            The interval between saving snapshots. See documentation for details. Default value is 600 seconds.
        """
        self._set(snapshotInterval=value)
        return self
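
    # Snapshotting sketch: the three snapshot parameters are typically set together
    # (the file name below is illustrative):
    #   estimator.setSaveSnapshot(True)
    #   estimator.setSnapshotFile("training.snapshot")
    #   estimator.setSnapshotInterval(datetime.timedelta(minutes=5))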

    def getSparkPartitionCount(self):
        """
        Returns
        -------
        int
            The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default
        """
        return self.getOrDefault(self.sparkPartitionCount)

    def setSparkPartitionCount(self, value):
        """
        Parameters
        ----------
        value : int
            The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default
        """
        self._set(sparkPartitionCount=value)
        return self

    def getSubsample(self):
        """
        Returns
        -------
        float
            Sample rate for bagging. The default value depends on the dataset size and the bootstrap type, see documentation for details.
        """
        return self.getOrDefault(self.subsample)

    def setSubsample(self, value):
        """
        Parameters
        ----------
        value : float
            Sample rate for bagging. The default value depends on the dataset size and the bootstrap type, see documentation for details.
        """
        self._set(subsample=value)
        return self

    def getThreadCount(self):
        """
        Returns
        -------
        int
            Number of CPU threads in parallel operations on the client
        """
        return self.getOrDefault(self.threadCount)

    def setThreadCount(self, value):
        """
        Parameters
        ----------
        value : int
            Number of CPU threads in parallel operations on the client
        """
        self._set(threadCount=value)
        return self

    def getTrainDir(self):
        """
        Returns
        -------
        str
            The directory on the Driver node for storing the files generated during training. Default value is 'catboost_info'
        """
        return self.getOrDefault(self.trainDir)

    def setTrainDir(self, value):
        """
        Parameters
        ----------
        value : str
            The directory on the Driver node for storing the files generated during training. Default value is 'catboost_info'
        """
        self._set(trainDir=value)
        return self

    def getUseBestModel(self):
        """
        Returns
        -------
        bool
            If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.
        """
        return self.getOrDefault(self.useBestModel)

    def setUseBestModel(self, value):
        """
        Parameters
        ----------
        value : bool
            If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.
        """
        self._set(useBestModel=value)
        return self

    def getWeightCol(self):
        """
        Returns
        -------
        str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0
        """
        return self.getOrDefault(self.weightCol)

    def setWeightCol(self, value):
        """
        Parameters
        ----------
        value : str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0
        """
        self._set(weightCol=value)
        return self

    def getWorkerInitializationTimeout(self):
        """
        Returns
        -------
        datetime.timedelta
            Timeout to wait until CatBoost workers on Spark executors are initialized and have sent their info to the master. Depends on dataset size. Default is 10 minutes
        """
        return self.getOrDefault(self.workerInitializationTimeout)

    def setWorkerInitializationTimeout(self, value):
        """
        Parameters
        ----------
        value : datetime.timedelta
            Timeout to wait until CatBoost workers on Spark executors are initialized and have sent their info to the master. Depends on dataset size. Default is 10 minutes
        """
        self._set(workerInitializationTimeout=value)
        return self

    def getWorkerMaxFailures(self):
        """
        Returns
        -------
        int
            Number of individual CatBoost worker failures before giving up training. Should be greater than or equal to 1. Default is 4
        """
        return self.getOrDefault(self.workerMaxFailures)

    def setWorkerMaxFailures(self, value):
        """
        Parameters
        ----------
        value : int
            Number of individual CatBoost worker failures before giving up training. Should be greater than or equal to 1. Default is 4
        """
        self._set(workerMaxFailures=value)
        return self

    @classmethod
    def read(cls):
        """Returns an MLReader instance for this class."""
        return CatBoostMLReader(cls)

    def _create_model(self, java_model):
        return CatBoostRegressionModel(java_model)

    def fit(self, trainDataset, evalDatasets=None):
        """
        Extended variant of the standard Estimator's fit method that accepts CatBoost's Pools
        and allows specifying additional datasets for computing evaluation metrics and
        overfitting detection, similarly to CatBoost's other APIs.

        Parameters
        ----------
        trainDataset : Pool or DataFrame
            The input training dataset.
        evalDatasets : list of Pool, optional
            The validation datasets used for the following processes:
            - overfitting detector
            - best iteration selection
            - monitoring metrics' changes

        Returns
        -------
        CatBoostRegressionModel
            trained model
        """
        if isinstance(trainDataset, DataFrame):
            if evalDatasets is not None:
                raise RuntimeError("if trainDataset has type DataFrame no evalDatasets are supported")
            return JavaEstimator.fit(self, trainDataset)
        else:
            sc = SparkContext._active_spark_context
            evalDatasetCount = 0 if (evalDatasets is None) else len(evalDatasets)

            # need to create it because the default mapping for a Python list is ArrayList, not Array
            evalDatasetsAsJavaObject = sc._gateway.new_array(sc._jvm.ai.catboost.spark.Pool, evalDatasetCount)
            for i in range(evalDatasetCount):
                evalDatasetsAsJavaObject[i] = _py2java(sc, evalDatasets[i])
            self._transfer_params_to_java()
            java_model = self._java_obj.fit(_py2java(sc, trainDataset), evalDatasetsAsJavaObject)
            return CatBoostRegressionModel(java_model)
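
# A minimal end-to-end sketch (assumes DataFrames `train_df` and `eval_df` with
# "features" and "label" columns; the Pool(data_frame) constructor is assumed to be
# available as elsewhere in the catboost_spark API):
#
#   train_pool = Pool(train_df)
#   eval_pool = Pool(eval_df)
#   regressor = CatBoostRegressor(useBestModel=True)
#   model = regressor.fit(train_pool, [eval_pool])
#   predictions = model.transform(eval_df)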

@inherit_doc
class CatBoostRegressionModel(JavaRegressionModel, MLReadable, JavaMLWritable):
    """
    Regression model trained by CatBoost. Use CatBoostRegressor to train it.
    """
    def __init__(self, java_model):
        super(CatBoostRegressionModel, self).__init__(java_model)
        self.featuresCol = Param(self, "featuresCol", "features column name")
        self._setDefault(featuresCol="features")
        self.labelCol = Param(self, "labelCol", "label column name")
        self._setDefault(labelCol="label")
        self.predictionCol = Param(self, "predictionCol", "prediction column name")
        self._setDefault(predictionCol="prediction")
        self._transfer_params_from_java()

    @keyword_only
    def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction"):
        """
        Set the (keyword only) parameters

        Parameters
        ----------
        featuresCol : str, default: "features"
            features column name
        labelCol : str, default: "label"
            label column name
        predictionCol : str, default: "prediction"
            prediction column name
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        return self._set(**kwargs)

    def getFeaturesCol(self):
        """
        Returns
        -------
        str
            features column name
        """
        return self.getOrDefault(self.featuresCol)

    def setFeaturesCol(self, value):
        """
        Parameters
        ----------
        value : str
            features column name
        """
        self._set(featuresCol=value)
        return self

    def getLabelCol(self):
        """
        Returns
        -------
        str
            label column name
        """
        return self.getOrDefault(self.labelCol)

    def setLabelCol(self, value):
        """
        Parameters
        ----------
        value : str
            label column name
        """
        self._set(labelCol=value)
        return self

    def getPredictionCol(self):
        """
        Returns
        -------
        str
            prediction column name
        """
        return self.getOrDefault(self.predictionCol)

    def setPredictionCol(self, value):
        """
        Parameters
        ----------
        value : str
            prediction column name
        """
        self._set(predictionCol=value)
        return self

    @staticmethod
    def _from_java(java_model):
        return CatBoostRegressionModel(java_model)

    @classmethod
    def read(cls):
        """Returns an MLReader instance for this class."""
        return CatBoostMLReader(cls)

    def saveNativeModel(self, fileName, format=EModelType.CatboostBinary, exportParameters=None, pool=None):
        """
        Save the model to a local file.
        See https://catboost.ai/docs/concepts/python-reference_catboostclassifier_save_model.html
        for detailed parameters description
        """
        return self._call_java("saveNativeModel", fileName, format, exportParameters, pool)

    @staticmethod
    def loadNativeModel(fileName, format=EModelType.CatboostBinary):
        """
        Load the model from a local file.
        See https://catboost.ai/docs/concepts/python-reference_catboostclassifier_load_model.html
        for detailed parameters description
        """
        sc = SparkContext._active_spark_context
        java_model = sc._jvm.ai.catboost.spark.CatBoostRegressionModel.loadNativeModel(fileName, _py2java(sc, format))
        return CatBoostRegressionModel(java_model)
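
    # Save/load round-trip sketch (the local file path is illustrative):
    #   model.saveNativeModel("/tmp/regression_model.cbm")
    #   restored = CatBoostRegressionModel.loadNativeModel("/tmp/regression_model.cbm")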

    def transformPool(self, pool):
        """
        This function is useful when the dataset has already been quantized, but it works with any Pool
        """
        return self._call_java("transformPool", pool)

    def getFeatureImportance(
        self,
        fstrType=EFstrType.FeatureImportance,
        data=None,
        calcType=ECalcTypeShapValues.Regular
    ):
        """
        Parameters
        ----------
        fstrType : EFstrType
            Supported values are FeatureImportance, PredictionValuesChange, LossFunctionChange, PredictionDiff
        data : Pool
            if fstrType is PredictionDiff it is required and must contain 2 samples
            if fstrType is PredictionValuesChange this param is required if the model was explicitly trained
            with the flag to store no leaf weights, otherwise it can be null
        calcType : ECalcTypeShapValues
            Used only for PredictionValuesChange. Possible values:

            - Regular
                Calculate regular SHAP values
            - Approximate
                Calculate approximate SHAP values
            - Exact
                Calculate exact SHAP values

        Returns
        -------
        list of float
            array of feature importances (index corresponds to the order of features in the model)
        """
        return self._call_java("getFeatureImportance", fstrType, data, calcType)
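
    # Sketch: plain feature importances for a trained model (all arguments left at
    # their defaults):
    #   importances = model.getFeatureImportance()
    #   # importances[i] corresponds to the i-th feature in the model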

    def getFeatureImportancePrettified(
        self,
        fstrType=EFstrType.FeatureImportance,
        data=None,
        calcType=ECalcTypeShapValues.Regular
    ):
        """
        Parameters
        ----------
        fstrType : EFstrType
            Supported values are FeatureImportance, PredictionValuesChange, LossFunctionChange, PredictionDiff
        data : Pool
            if fstrType is PredictionDiff it is required and must contain 2 samples
            if fstrType is PredictionValuesChange this param is required if the model was explicitly trained
            with the flag to store no leaf weights, otherwise it can be null
        calcType : ECalcTypeShapValues
            Used only for PredictionValuesChange. Possible values:

            - Regular
                Calculate regular SHAP values
            - Approximate
                Calculate approximate SHAP values
            - Exact
                Calculate exact SHAP values

        Returns
        -------
        list of FeatureImportance
            array of feature importances sorted in descending order by importance
        """
        return self._call_java("getFeatureImportancePrettified", fstrType, data, calcType)

    def getFeatureImportanceShapValues(
        self,
        data,
        preCalcMode=EPreCalcShapValues.Auto,
        calcType=ECalcTypeShapValues.Regular,
        modelOutputType=EExplainableModelOutput.Raw,
        referenceData=None,
        outputColumns=None
    ):
        """
        Parameters
        ----------
        data : Pool
            dataset to calculate SHAP values for
        preCalcMode : EPreCalcShapValues
            Possible values:

            - Auto
                Use direct SHAP Values calculation only if data size is smaller than average leaves number
                (the best of the two strategies below is chosen).
            - UsePreCalc
                Calculate SHAP Values for every leaf in preprocessing. Final complexity is
                O(NT(D+F))+O(TL^2 D^2) where N is the number of documents (objects), T - number of trees,
                D - average tree depth, F - average number of features in tree, L - average number of leaves in tree.
                This is much faster (because of a smaller constant) than direct calculation when N >> L.
            - NoPreCalc
                Use direct SHAP Values calculation with complexity O(NTLD^2). The direct algorithm
                is faster when N < L (algorithm from https://arxiv.org/abs/1802.03888)
        calcType : ECalcTypeShapValues
            Possible values:

            - Regular
                Calculate regular SHAP values
            - Approximate
                Calculate approximate SHAP values
            - Exact
                Calculate exact SHAP values
        referenceData : Pool
            reference data for Independent Tree SHAP values from https://arxiv.org/abs/1905.04610v1
            if referenceData is not null, then Independent Tree SHAP values are calculated
        outputColumns : list of str
            columns from data to add to the output DataFrame, if None - add all columns

        Returns
        -------
        DataFrame
            - for regression and binclass models:
              contains outputColumns and "shapValues" column with Vector of length (n_features + 1) with SHAP values
            - for multiclass models:
              contains outputColumns and "shapValues" column with Matrix of shape (n_classes x (n_features + 1)) with SHAP values
        """
        return self._call_java(
            "getFeatureImportanceShapValues",
            data,
            preCalcMode,
            calcType,
            modelOutputType,
            referenceData,
            outputColumns
        )
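
    # Sketch (assumes `pool` is a Pool built from the data of interest):
    #   shap_df = model.getFeatureImportanceShapValues(pool)
    #   shap_df.select("shapValues").show(5)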

    def getFeatureImportanceShapInteractionValues(
        self,
        data,
        featureIndices=None,
        featureNames=None,
        preCalcMode=EPreCalcShapValues.Auto,
        calcType=ECalcTypeShapValues.Regular,
        outputColumns=None
    ):
        """
        SHAP interaction values are calculated for all feature pairs if neither featureIndices
        nor featureNames are specified.

        Parameters
        ----------
        data : Pool
            dataset to calculate SHAP interaction values
        featureIndices : (int, int), optional
            pair of feature indices to calculate SHAP interaction values for.
        featureNames : (str, str), optional
            pair of feature names to calculate SHAP interaction values for.
        preCalcMode : EPreCalcShapValues
            Possible values:

            - Auto
                Use direct SHAP Values calculation only if data size is smaller than average leaves number
                (the best of the two strategies below is chosen).
            - UsePreCalc
                Calculate SHAP Values for every leaf in preprocessing. Final complexity is
                O(NT(D+F))+O(TL^2 D^2) where N is the number of documents (objects), T - number of trees,
                D - average tree depth, F - average number of features in tree, L - average number of leaves in tree.
                This is much faster (because of a smaller constant) than direct calculation when N >> L.
            - NoPreCalc
                Use direct SHAP Values calculation with complexity O(NTLD^2). The direct algorithm
                is faster when N < L (algorithm from https://arxiv.org/abs/1802.03888)
        calcType : ECalcTypeShapValues
            Possible values:

            - Regular
                Calculate regular SHAP values
            - Approximate
                Calculate approximate SHAP values
            - Exact
                Calculate exact SHAP values
        outputColumns : list of str
            columns from data to add to the output DataFrame, if None - add all columns

        Returns
        -------
        DataFrame
            - for regression and binclass models:
              contains outputColumns and "featureIdx1", "featureIdx2", "shapInteractionValue" columns
            - for multiclass models:
              contains outputColumns and "classIdx", "featureIdx1", "featureIdx2", "shapInteractionValue" columns
        """
        return self._call_java(
            "getFeatureImportanceShapInteractionValues",
            data,
            featureIndices,
            featureNames,
            preCalcMode,
            calcType,
            outputColumns
        )
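
    # Sketch: SHAP interaction values for a single feature pair (indices illustrative):
    #   interactions_df = model.getFeatureImportanceShapInteractionValues(
    #       pool, featureIndices=(0, 1))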
    def getFeatureImportanceInteraction(self):
        """
        Returns
        -------
        list of FeatureInteractionScore
        """
        return self._call_java("getFeatureImportanceInteraction")
@inherit_doc
class CatBoostClassifier(JavaEstimator, MLReadable, JavaMLWritable):
    """
    Class to train CatBoostClassificationModel

    Init Parameters
    ---------------
    allowConstLabel : bool
        Use it to train models with datasets that have equal label values for all objects.
    allowWritingFiles : bool
        Allows writing analytical and snapshot files during training. Enabled by default.
    approxOnFullHistory : bool
        Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate.
    autoClassWeights : EAutoClassWeightsType
        Automatically calculate class weights based either on the total weight or the total number of objects in each class. The values are used as multipliers for the object weights. Default value is 'None'
    baggingTemperature : float
        This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value, the more aggressive the bagging is. Default value is 1.0.
    bestModelMinTrees : int
        The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default.
    bootstrapType : EBootstrapType
        Bootstrap type. Defines the method for sampling the weights of objects. The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8.
    borderCount : int
        The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusive. Default value is 254.
    classNames : list
        Allows redefining the default values (consecutive integers).
    classWeightsList : list
        List of weights for each class. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsMap.
    classWeightsMap : dict
        Map from class name to weight. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsList.
    classesCount : int
        The upper limit for the numeric class label. Defines the number of classes for multiclassification. See documentation for details.
    connectTimeout : datetime.timedelta, default: datetime.timedelta(milliseconds=60000)
        Timeout to wait while establishing socket connections between TrainingDriver and workers. Default is 1 minute.
    customMetric : list
        Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
    depth : int
        Depth of the tree. Default value is 6.
    diffusionTemperature : float
        The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.
    earlyStoppingRounds : int
        Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.
    evalMetric : str
        The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
    featureBorderType : EBorderSelectionType
        The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
    featureWeightsList : list
        Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsMap.
    featureWeightsMap : dict
        Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsList.
    featuresCol : str, default: "features"
        features column name
    firstFeatureUsePenaltiesList : list
        Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.
    firstFeatureUsePenaltiesMap : dict
        Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.
    foldLenMultiplier : float
        Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.
    foldPermutationBlock : int
        Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation. Default value is 1.
    hasTime : bool
        Use the order of objects in the input data (do not perform random permutations during the Choosing the tree structure stage).
    ignoredFeaturesIndices : list
        Feature indices to exclude from the training
    ignoredFeaturesNames : list
        Feature names to exclude from the training
    inputBorders : str
        Load custom quantization borders and missing value modes from a file (do not generate them)
    iterations : int
        The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.
    l2LeafReg : float
        Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.
    labelCol : str, default: "label"
        label column name
    leafEstimationBacktracking : ELeavesEstimationStepBacktracking
        When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'
    leafEstimationIterations : int
        CatBoost might calculate leaf values using several gradient or newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.
    leafEstimationMethod : ELeavesEstimation
        The method used to calculate the values in leaves. See documentation for details.
    learningRate : float
        The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', 'leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.
    loggingLevel : ELoggingLevel
        The logging level to output to stdout. See documentation for details. Default value is 'Verbose'
    lossFunction : str
        The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
    metricPeriod : int
        The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.
    modelShrinkMode : EModelShrinkMode
        Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'
    modelShrinkRate : float
        The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details.
    mvsReg : float
        Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling and +Inf implies Bernoulli sampling). Note: This parameter is supported only for the MVS sampling method.
    nanMode : ENanMode
        The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
    odPval : float
        The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details. Turned off by default.
    odType : EOverfittingDetectorType
        The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'
    odWait : int
        The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.
    oneHotMaxSize : int
        Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.
    penaltiesCoefficient : float
        A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.
    perFloatFeatureQuantizaton : list
        The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
    perObjectFeaturePenaltiesList : list
        Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.
    perObjectFeaturePenaltiesMap : dict
        Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.
    predictionCol : str, default: "prediction"
        prediction column name
    probabilityCol : str, default: "probability"
        Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
    randomSeed : int
        The random seed used for training. Default value is 0.
    randomStrength : float
        The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0.
    rawPredictionCol : str, default: "rawPrediction"
        raw prediction (a.k.a. confidence) column name
    rsm : float
        Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.
    samplingFrequency : ESamplingFrequency
        Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel'
    samplingUnit : ESamplingUnit
        The sampling scheme, see documentation for details. Default value is 'Object'
    saveSnapshot : bool
        Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.
    scalePosWeight : float
        The weight for class 1 in binary classification. The value is used as a multiplier for the weights of objects from class 1. Default value is 1 (both classes have equal weight).
    scoreFunction : EScoreFunction
        The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'
    snapshotFile : str
        The name of the file to save the training progress information in. This file is used for recovering training after an interruption.
    snapshotInterval : datetime.timedelta
        The interval between saving snapshots. See documentation for details. Default value is 600 seconds.
    sparkPartitionCount : int
        The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default.
    subsample : float
        Sample rate for bagging. The default value depends on the dataset size and the bootstrap type, see documentation for details.
    targetBorder : float
        If set, defines the border for converting target values to 0 and 1 classes.
    threadCount : int
        Number of CPU threads in parallel operations on client
    thresholds : list
        Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.
    trainDir : str
        The directory for storing the files on Driver node generated during training. Default value is 'catboost_info'
    useBestModel : bool
        If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.
    weightCol : str
        weight column name. If this is not set or empty, we treat all instance weights as 1.0.
    workerInitializationTimeout : datetime.timedelta, default: datetime.timedelta(milliseconds=600000)
        Timeout to wait until CatBoost workers on Spark executors are initialized and have sent their info to master. Depends on dataset size. Default is 10 minutes.
    workerMaxFailures : int, default: 4
        Number of individual CatBoost worker failures before giving up training. Should be greater than or equal to 1. Default is 4.
    """

    @keyword_only
    def __init__(self, allowConstLabel=None, allowWritingFiles=None, approxOnFullHistory=None, autoClassWeights=None, baggingTemperature=None, bestModelMinTrees=None, bootstrapType=None, borderCount=None, classNames=None, classWeightsList=None, classWeightsMap=None, classesCount=None, connectTimeout=datetime.timedelta(milliseconds=60000), customMetric=None, depth=None, diffusionTemperature=None, earlyStoppingRounds=None, evalMetric=None, featureBorderType=None, featureWeightsList=None, featureWeightsMap=None, featuresCol="features", firstFeatureUsePenaltiesList=None, firstFeatureUsePenaltiesMap=None, foldLenMultiplier=None, foldPermutationBlock=None, hasTime=None, ignoredFeaturesIndices=None, ignoredFeaturesNames=None, inputBorders=None, iterations=None, l2LeafReg=None, labelCol="label", leafEstimationBacktracking=None, leafEstimationIterations=None, leafEstimationMethod=None, learningRate=None, loggingLevel=None, lossFunction=None, metricPeriod=None, modelShrinkMode=None, modelShrinkRate=None, mvsReg=None, nanMode=None, odPval=None, odType=None, odWait=None, oneHotMaxSize=None, penaltiesCoefficient=None, perFloatFeatureQuantizaton=None, perObjectFeaturePenaltiesList=None, perObjectFeaturePenaltiesMap=None, predictionCol="prediction", probabilityCol="probability", randomSeed=None, randomStrength=None, rawPredictionCol="rawPrediction", rsm=None, samplingFrequency=None, samplingUnit=None, saveSnapshot=None, scalePosWeight=None, scoreFunction=None, snapshotFile=None, snapshotInterval=None, sparkPartitionCount=None, subsample=None, targetBorder=None, threadCount=None, thresholds=None, trainDir=None, useBestModel=None, weightCol=None, workerInitializationTimeout=datetime.timedelta(milliseconds=600000), workerMaxFailures=4):
        super(CatBoostClassifier, self).__init__()
        self._java_obj = self._new_java_obj("ai.catboost.spark.CatBoostClassifier")
        self.allowConstLabel = Param(self, "allowConstLabel", "Use it to train models with datasets that have equal label values for all objects.")
        self.allowWritingFiles = Param(self, "allowWritingFiles", "Allows writing analytical and snapshot files during training. Enabled by default.")
        self.approxOnFullHistory = Param(self, "approxOnFullHistory", "Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate.")
        self.autoClassWeights = Param(self, "autoClassWeights", "Automatically calculate class weights based either on the total weight or the total number of objects in each class. The values are used as multipliers for the object weights. Default value is 'None'")
        self.baggingTemperature = Param(self, "baggingTemperature", "This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value, the more aggressive the bagging is. Default value is 1.0.")
        self.bestModelMinTrees = Param(self, "bestModelMinTrees", "The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default.")
        self.bootstrapType = Param(self, "bootstrapType", "Bootstrap type. Defines the method for sampling the weights of objects. The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8.")
        self.borderCount = Param(self, "borderCount", "The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusive. Default value is 254.")
        self.classNames = Param(self, "classNames", "Allows redefining the default values (consecutive integers).")
        self.classWeightsList = Param(self, "classWeightsList", "List of weights for each class. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsMap.")
        self.classWeightsMap = Param(self, "classWeightsMap", "Map from class name to weight. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsList.")
        self.classesCount = Param(self, "classesCount", "The upper limit for the numeric class label. Defines the number of classes for multiclassification. See documentation for details.")
        self.connectTimeout = Param(self, "connectTimeout", "Timeout to wait while establishing socket connections between TrainingDriver and workers. Default is 1 minute.")
        self._setDefault(connectTimeout=datetime.timedelta(milliseconds=60000))
        self.customMetric = Param(self, "customMetric", "Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).")
        self.depth = Param(self, "depth", "Depth of the tree. Default value is 6.")
        self.diffusionTemperature = Param(self, "diffusionTemperature", "The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.")
        self.earlyStoppingRounds = Param(self, "earlyStoppingRounds", "Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.")
        self.evalMetric = Param(self, "evalMetric", "The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).")
        self.featureBorderType = Param(self, "featureBorderType", "The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'")
        self.featureWeightsList = Param(self, "featureWeightsList", "Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsMap.")
        self.featureWeightsMap = Param(self, "featureWeightsMap", "Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsList.")
        self.featuresCol = Param(self, "featuresCol", "features column name")
        self._setDefault(featuresCol="features")
        self.firstFeatureUsePenaltiesList = Param(self, "firstFeatureUsePenaltiesList", "Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.")
        self.firstFeatureUsePenaltiesMap = Param(self, "firstFeatureUsePenaltiesMap", "Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.")
        self.foldLenMultiplier = Param(self, "foldLenMultiplier", "Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.")
        self.foldPermutationBlock = Param(self, "foldPermutationBlock", "Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation. Default value is 1.")
        self.hasTime = Param(self, "hasTime", "Use the order of objects in the input data (do not perform random permutations during the Choosing the tree structure stage).")
        self.ignoredFeaturesIndices = Param(self, "ignoredFeaturesIndices", "Feature indices to exclude from the training")
        self.ignoredFeaturesNames = Param(self, "ignoredFeaturesNames", "Feature names to exclude from the training")
        self.inputBorders = Param(self, "inputBorders", "Load custom quantization borders and missing value modes from a file (do not generate them)")
        self.iterations = Param(self, "iterations", "The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.")
        self.l2LeafReg = Param(self, "l2LeafReg", "Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.")
        self.labelCol = Param(self, "labelCol", "label column name")
        self._setDefault(labelCol="label")
        self.leafEstimationBacktracking = Param(self, "leafEstimationBacktracking", "When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'")
        self.leafEstimationIterations = Param(self, "leafEstimationIterations", "CatBoost might calculate leaf values using several gradient or newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.")
        self.leafEstimationMethod = Param(self, "leafEstimationMethod", "The method used to calculate the values in leaves. See documentation for details.")
        self.learningRate = Param(self, "learningRate", "The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', 'leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.")
        self.loggingLevel = Param(self, "loggingLevel", "The logging level to output to stdout. See documentation for details. Default value is 'Verbose'")
        self.lossFunction = Param(self, "lossFunction", "The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).")
        self.metricPeriod = Param(self, "metricPeriod", "The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.")
        self.modelShrinkMode = Param(self, "modelShrinkMode", "Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'")
        self.modelShrinkRate = Param(self, "modelShrinkRate", "The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details.")
        self.mvsReg = Param(self, "mvsReg", "Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling and +Inf implies Bernoulli sampling). Note: This parameter is supported only for the MVS sampling method.")
        self.nanMode = Param(self, "nanMode", "The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'")
        self.odPval = Param(self, "odPval", "The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details. Turned off by default.")
        self.odType = Param(self, "odType", "The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'")
        self.odWait = Param(self, "odWait", "The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.")
        self.oneHotMaxSize = Param(self, "oneHotMaxSize", "Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.")
        self.penaltiesCoefficient = Param(self, "penaltiesCoefficient", "A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.")
        self.perFloatFeatureQuantizaton = Param(self, "perFloatFeatureQuantizaton", "The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]")
        self.perObjectFeaturePenaltiesList = Param(self, "perObjectFeaturePenaltiesList", "Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.")
        self.perObjectFeaturePenaltiesMap = Param(self, "perObjectFeaturePenaltiesMap", "Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.")
        self.predictionCol = Param(self, "predictionCol", "prediction column name")
        self._setDefault(predictionCol="prediction")
        self.probabilityCol = Param(self, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.")
        self._setDefault(probabilityCol="probability")
        self.randomSeed = Param(self, "randomSeed", "The random seed used for training. Default value is 0.")
        self.randomStrength = Param(self, "randomStrength", "The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0.")
        self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name")
        self._setDefault(rawPredictionCol="rawPrediction")
        self.rsm = Param(self, "rsm", "Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.")
        self.samplingFrequency = Param(self, "samplingFrequency", "Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel'")
        self.samplingUnit = Param(self, "samplingUnit", "The sampling scheme, see documentation for details. Default value is 'Object'")
        self.saveSnapshot = Param(self, "saveSnapshot", "Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.")
        self.scalePosWeight = Param(self, "scalePosWeight", "The weight for class 1 in binary classification. The value is used as a multiplier for the weights of objects from class 1. Default value is 1 (both classes have equal weight).")
        self.scoreFunction = Param(self, "scoreFunction", "The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'")
        self.snapshotFile = Param(self, "snapshotFile", "The name of the file to save the training progress information in. This file is used for recovering training after an interruption.")
        self.snapshotInterval = Param(self, "snapshotInterval", "The interval between saving snapshots. See documentation for details. Default value is 600 seconds.")
        self.sparkPartitionCount = Param(self, "sparkPartitionCount", "The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default.")
        self.subsample = Param(self, "subsample", "Sample rate for bagging. The default value depends on the dataset size and the bootstrap type, see documentation for details.")
        self.targetBorder = Param(self, "targetBorder", "If set, defines the border for converting target values to 0 and 1 classes.")
        self.threadCount = Param(self, "threadCount", "Number of CPU threads in parallel operations on client")
        self.thresholds = Param(self, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.")
        self.trainDir = Param(self, "trainDir", "The directory for storing the files on Driver node generated during training. Default value is 'catboost_info'")
        self.useBestModel = Param(self, "useBestModel", "If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.")
        self.weightCol = Param(self, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.")
        self.workerInitializationTimeout = Param(self, "workerInitializationTimeout", "Timeout to wait until CatBoost workers on Spark executors are initialized and have sent their info to master. Depends on dataset size. Default is 10 minutes.")
        self._setDefault(workerInitializationTimeout=datetime.timedelta(milliseconds=600000))
        self.workerMaxFailures = Param(self, "workerMaxFailures", "Number of individual CatBoost worker failures before giving up training. Should be greater than or equal to 1. Default is 4.")
        self._setDefault(workerMaxFailures=4)
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)
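    # Construction sketch: a minimal classifier with a handful of common parameters.
    # `train_pool` is an assumed catboost_spark.Pool built from a DataFrame with
    # "features" and "label" columns; the parameter values are illustrative only:
    #
    #   >>> classifier = CatBoostClassifier(
    #   ...     iterations=500,
    #   ...     learningRate=0.05,
    #   ...     depth=6,
    #   ... )
    #   >>> model = classifier.fit(train_pool)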
    @keyword_only
    def setParams(self, allowConstLabel=None, allowWritingFiles=None, approxOnFullHistory=None, autoClassWeights=None, baggingTemperature=None, bestModelMinTrees=None, bootstrapType=None, borderCount=None, classNames=None, classWeightsList=None, classWeightsMap=None, classesCount=None, connectTimeout=datetime.timedelta(milliseconds=60000), customMetric=None, depth=None, diffusionTemperature=None, earlyStoppingRounds=None, evalMetric=None, featureBorderType=None, featureWeightsList=None, featureWeightsMap=None, featuresCol="features", firstFeatureUsePenaltiesList=None, firstFeatureUsePenaltiesMap=None, foldLenMultiplier=None, foldPermutationBlock=None, hasTime=None, ignoredFeaturesIndices=None, ignoredFeaturesNames=None, inputBorders=None, iterations=None, l2LeafReg=None, labelCol="label", leafEstimationBacktracking=None, leafEstimationIterations=None, leafEstimationMethod=None, learningRate=None, loggingLevel=None, lossFunction=None, metricPeriod=None, modelShrinkMode=None, modelShrinkRate=None, mvsReg=None, nanMode=None, odPval=None, odType=None, odWait=None, oneHotMaxSize=None, penaltiesCoefficient=None, perFloatFeatureQuantizaton=None, perObjectFeaturePenaltiesList=None, perObjectFeaturePenaltiesMap=None, predictionCol="prediction", probabilityCol="probability", randomSeed=None, randomStrength=None, rawPredictionCol="rawPrediction", rsm=None, samplingFrequency=None, samplingUnit=None, saveSnapshot=None, scalePosWeight=None, scoreFunction=None, snapshotFile=None, snapshotInterval=None, sparkPartitionCount=None, subsample=None, targetBorder=None, threadCount=None, thresholds=None, trainDir=None, useBestModel=None, weightCol=None, workerInitializationTimeout=datetime.timedelta(milliseconds=600000), workerMaxFailures=4):
        """
        Set the (keyword only) parameters

        Parameters
        ----------
        allowConstLabel : bool
            Use it to train models with datasets that have equal label values for all objects.
        allowWritingFiles : bool
            Allows writing analytical and snapshot files during training. Enabled by default.
        approxOnFullHistory : bool
            Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate.
        autoClassWeights : EAutoClassWeightsType
            Automatically calculate class weights based either on the total weight or the total number of objects in each class. The values are used as multipliers for the object weights. Default value is 'None'
        baggingTemperature : float
            This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value, the more aggressive the bagging is. Default value is 1.0.
        bestModelMinTrees : int
            The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default.
        bootstrapType : EBootstrapType
            Bootstrap type. Defines the method for sampling the weights of objects. The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8.
        borderCount : int
            The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusive. Default value is 254.
        classNames : list
            Allows redefining the default values (consecutive integers).
        classWeightsList : list
            List of weights for each class. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsMap.
        classWeightsMap : dict
            Map from class name to weight. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsList.
        classesCount : int
            The upper limit for the numeric class label. Defines the number of classes for multiclassification. See documentation for details.
        connectTimeout : datetime.timedelta, default: datetime.timedelta(milliseconds=60000)
            Timeout to wait while establishing socket connections between TrainingDriver and workers. Default is 1 minute.
        customMetric : list
            Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        depth : int
            Depth of the tree. Default value is 6.
        diffusionTemperature : float
            The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.
        earlyStoppingRounds : int
            Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.
        evalMetric : str
            The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        featureBorderType : EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'
        featureWeightsList : list
            Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsMap.
        featureWeightsMap : dict
            Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsList.
        featuresCol : str, default: "features"
            features column name
        firstFeatureUsePenaltiesList : list
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.
        firstFeatureUsePenaltiesMap : dict
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.
        foldLenMultiplier : float
            Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.
        foldPermutationBlock : int
            Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation. Default value is 1.
        hasTime : bool
            Use the order of objects in the input data (do not perform random permutations during the Choosing the tree structure stage).
        ignoredFeaturesIndices : list
            Feature indices to exclude from the training
        ignoredFeaturesNames : list
            Feature names to exclude from the training
        inputBorders : str
            Load custom quantization borders and missing value modes from a file (do not generate them)
        iterations : int
            The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.
        l2LeafReg : float
            Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.
        labelCol : str, default: "label"
            label column name
        leafEstimationBacktracking : ELeavesEstimationStepBacktracking
            When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'
        leafEstimationIterations : int
            CatBoost might calculate leaf values using several gradient or newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.
        leafEstimationMethod : ELeavesEstimation
            The method used to calculate the values in leaves. See documentation for details.
        learningRate : float
            The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', 'leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.
        loggingLevel : ELoggingLevel
            The logging level to output to stdout. See documentation for details. Default value is 'Verbose'
        lossFunction : str
            The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        metricPeriod : int
            The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.
        modelShrinkMode : EModelShrinkMode
            Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'
        modelShrinkRate : float
            The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details.
        mvsReg : float
            Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling and +Inf implies Bernoulli sampling). Note: This parameter is supported only for the MVS sampling method.
        nanMode : ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'
        odPval : float
            The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details. Turned off by default.
        odType : EOverfittingDetectorType
            The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'
        odWait : int
            The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.
        oneHotMaxSize : int
            Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.
        penaltiesCoefficient : float
            A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.
        perFloatFeatureQuantizaton : list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        perObjectFeaturePenaltiesList : list
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.
        perObjectFeaturePenaltiesMap : dict
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.
        predictionCol : str, default: "prediction"
            prediction column name
        probabilityCol : str, default: "probability"
            Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
        randomSeed : int
            The random seed used for training. Default value is 0.
        randomStrength : float
            The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0.
        rawPredictionCol : str, default: "rawPrediction"
            raw prediction (a.k.a. confidence) column name
        rsm : float
            Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.
        samplingFrequency : ESamplingFrequency
            Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel'
        samplingUnit : ESamplingUnit
            The sampling scheme, see documentation for details. Default value is 'Object'
        saveSnapshot : bool
            Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.
        scalePosWeight : float
            The weight for class 1 in binary classification. The value is used as a multiplier for the weights of objects from class 1. Default value is 1 (both classes have equal weight).
        scoreFunction : EScoreFunction
            The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'
        snapshotFile : str
            The name of the file to save the training progress information in. This file is used for recovering training after an interruption.
        snapshotInterval : datetime.timedelta
            The interval between saving snapshots. See documentation for details. Default value is 600 seconds.
        sparkPartitionCount : int
            The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default.
        subsample : float
            Sample rate for bagging. The default value depends on the dataset size and the bootstrap type, see documentation for details.
        targetBorder : float
            If set, defines the border for converting target values to 0 and 1 classes.
        threadCount : int
            Number of CPU threads in parallel operations on client
        thresholds : list
            Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, excepting that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.
        trainDir : str
            The directory for storing the files on Driver node generated during training. Default value is 'catboost_info'
        useBestModel : bool
            If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.
        weightCol : str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0.
        workerInitializationTimeout : datetime.timedelta, default: datetime.timedelta(milliseconds=600000)
            Timeout to wait until CatBoost workers on Spark executors are initialized and have sent their info to master. Depends on dataset size. Default is 10 minutes.
        workerMaxFailures : int, default: 4
            Number of individual CatBoost worker failures before giving up training. Should be greater than or equal to 1. Default is 4.
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        return self._set(**kwargs)
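    # setParams reconfigures an existing estimator in place and returns it (the body
    # delegates to self._set), so calls can be chained. A sketch:
    #
    #   >>> classifier = CatBoostClassifier()
    #   >>> classifier = classifier.setParams(iterations=200, lossFunction="MultiClass")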
    def getAllowConstLabel(self):
        """
        Returns
        -------
        bool
            Use it to train models with datasets that have equal label values for all objects.
        """
        return self.getOrDefault(self.allowConstLabel)

    def setAllowConstLabel(self, value):
        """
        Parameters
        ----------
        value : bool
            Use it to train models with datasets that have equal label values for all objects.
        """
        self._set(allowConstLabel=value)
        return self
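    # Every parameter below follows this get/set pattern; setters return self,
    # so configuration calls chain. A sketch (parameter values are illustrative):
    #
    #   >>> classifier = CatBoostClassifier().setAllowConstLabel(True).setAllowWritingFiles(False)
    #   >>> classifier.getAllowConstLabel()
    #   True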
    def getAllowWritingFiles(self):
        """
        Returns
        -------
        bool
            Allows writing analytical and snapshot files during training. Enabled by default.
        """
        return self.getOrDefault(self.allowWritingFiles)

    def setAllowWritingFiles(self, value):
        """
        Parameters
        ----------
        value : bool
            Allows writing analytical and snapshot files during training. Enabled by default.
        """
        self._set(allowWritingFiles=value)
        return self

    def getApproxOnFullHistory(self):
        """
        Returns
        -------
        bool
            Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate.
        """
        return self.getOrDefault(self.approxOnFullHistory)

    def setApproxOnFullHistory(self, value):
        """
        Parameters
        ----------
        value : bool
            Use all the preceding rows in the fold for calculating the approximated values. This mode is slower and in rare cases slightly more accurate.
        """
        self._set(approxOnFullHistory=value)
        return self

    def getAutoClassWeights(self):
        """
        Returns
        -------
        EAutoClassWeightsType
            Automatically calculate class weights based either on the total weight or the total number of objects in each class. The values are used as multipliers for the object weights. Default value is 'None'
        """
        return self.getOrDefault(self.autoClassWeights)

    def setAutoClassWeights(self, value):
        """
        Parameters
        ----------
        value : EAutoClassWeightsType
            Automatically calculate class weights based either on the total weight or the total number of objects in each class. The values are used as multipliers for the object weights. Default value is 'None'
        """
        self._set(autoClassWeights=value)
        return self
    def getBaggingTemperature(self):
        """
        Returns
        -------
        float
            This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value, the more aggressive the bagging is. Default value is 1.0.
        """
        return self.getOrDefault(self.baggingTemperature)

    def setBaggingTemperature(self, value):
        """
        Parameters
        ----------
        value : float
            This parameter can be used if the selected bootstrap type is Bayesian. Possible values are in the range [0, +inf). The higher the value, the more aggressive the bagging is. Default value is 1.0.
        """
        self._set(baggingTemperature=value)
        return self

    def getBestModelMinTrees(self):
        """
        Returns
        -------
        int
            The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default.
        """
        return self.getOrDefault(self.bestModelMinTrees)

    def setBestModelMinTrees(self, value):
        """
        Parameters
        ----------
        value : int
            The minimal number of trees that the best model should have. If set, the output model contains at least the given number of trees even if the best model is located within these trees. Should be used with the useBestModel parameter. No limit by default.
        """
        self._set(bestModelMinTrees=value)
        return self

    def getBootstrapType(self):
        """
        Returns
        -------
        EBootstrapType
            Bootstrap type. Defines the method for sampling the weights of objects. The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8.
        """
        return self.getOrDefault(self.bootstrapType)

    def setBootstrapType(self, value):
        """
        Parameters
        ----------
        value : EBootstrapType
            Bootstrap type. Defines the method for sampling the weights of objects. The default value depends on the selected mode and processing unit type: QueryCrossEntropy, YetiRankPairwise, PairLogitPairwise: Bernoulli with the subsample parameter set to 0.5. MultiClass and MultiClassOneVsAll: Bayesian. Other modes: MVS with the subsample parameter set to 0.8.
        """
        self._set(bootstrapType=value)
        return self
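    # Enum-valued parameters take members of the corresponding enum type. A sketch,
    # assuming EBootstrapType exposes a Bernoulli member (as the default-value
    # description above suggests):
    #
    #   >>> classifier = CatBoostClassifier().setBootstrapType(EBootstrapType.Bernoulli).setSubsample(0.66)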
    def getBorderCount(self):
        """
        Returns
        -------
        int
            The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusive. Default value is 254.
        """
        return self.getOrDefault(self.borderCount)

    def setBorderCount(self, value):
        """
        Parameters
        ----------
        value : int
            The number of splits for numerical features. Allowed values are integers from 1 to 65535 inclusive. Default value is 254.
        """
        self._set(borderCount=value)
        return self

    def getClassNames(self):
        """
        Returns
        -------
        list
            Allows redefining the default values (consecutive integers).
        """
        return self.getOrDefault(self.classNames)

    def setClassNames(self, value):
        """
        Parameters
        ----------
        value : list
            Allows redefining the default values (consecutive integers).
        """
        self._set(classNames=value)
        return self

    def getClassWeightsList(self):
        """
        Returns
        -------
        list
            List of weights for each class. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsMap.
        """
        return self.getOrDefault(self.classWeightsList)

    def setClassWeightsList(self, value):
        """
        Parameters
        ----------
        value : list
            List of weights for each class. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsMap.
        """
        self._set(classWeightsList=value)
        return self

    def getClassWeightsMap(self):
        """
        Returns
        -------
        dict
            Map from class name to weight. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsList.
        """
        return self.getOrDefault(self.classWeightsMap)

    def setClassWeightsMap(self, value):
        """
        Parameters
        ----------
        value : dict
            Map from class name to weight. The values are used as multipliers for the object weights. This parameter is mutually exclusive with classWeightsList.
        """
        self._set(classWeightsMap=value)
        return self
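    # classWeightsMap sketch. An ordered mapping is a safe choice for a reproducible
    # class order when crossing into the JVM; the class labels here are illustrative:
    #
    #   >>> import collections
    #   >>> weights = collections.OrderedDict([("negative", 1.0), ("positive", 4.0)])
    #   >>> classifier = CatBoostClassifier().setClassWeightsMap(weights)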
    def getClassesCount(self):
        """
        Returns
        -------
        int
            The upper limit for the numeric class label. Defines the number of classes for multiclassification. See documentation for details.
        """
        return self.getOrDefault(self.classesCount)

    def setClassesCount(self, value):
        """
        Parameters
        ----------
        value : int
            The upper limit for the numeric class label. Defines the number of classes for multiclassification. See documentation for details.
        """
        self._set(classesCount=value)
        return self

    def getConnectTimeout(self):
        """
        Returns
        -------
        datetime.timedelta
            Timeout to wait while establishing socket connections between TrainingDriver and workers. Default is 1 minute.
        """
        return self.getOrDefault(self.connectTimeout)

    def setConnectTimeout(self, value):
        """
        Parameters
        ----------
        value : datetime.timedelta
            Timeout to wait while establishing socket connections between TrainingDriver and workers. Default is 1 minute.
        """
        self._set(connectTimeout=value)
        return self
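    # Duration-typed parameters take datetime.timedelta values, e.g.:
    #
    #   >>> import datetime
    #   >>> classifier = CatBoostClassifier().setConnectTimeout(datetime.timedelta(minutes=2))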
    def getCustomMetric(self):
        """
        Returns
        -------
        list
            Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        return self.getOrDefault(self.customMetric)

    def setCustomMetric(self, value):
        """
        Parameters
        ----------
        value : list
            Metric values to output during training. These functions are not optimized and are displayed for informational purposes only. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        self._set(customMetric=value)
        return self

    def getDepth(self):
        """
        Returns
        -------
        int
            Depth of the tree. Default value is 6.
        """
        return self.getOrDefault(self.depth)

    def setDepth(self, value):
        """
        Parameters
        ----------
        value : int
            Depth of the tree. Default value is 6.
        """
        self._set(depth=value)
        return self

    def getDiffusionTemperature(self):
        """
        Returns
        -------
        float
            The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.
        """
        return self.getOrDefault(self.diffusionTemperature)

    def setDiffusionTemperature(self, value):
        """
        Parameters
        ----------
        value : float
            The diffusion temperature of the Stochastic Gradient Langevin Boosting mode. Only non-negative values are supported. Default value is 10000.
        """
        self._set(diffusionTemperature=value)
        return self

    def getEarlyStoppingRounds(self):
        """
        Returns
        -------
        int
            Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.
        """
        return self.getOrDefault(self.earlyStoppingRounds)

    def setEarlyStoppingRounds(self, value):
        """
        Parameters
        ----------
        value : int
            Sets the overfitting detector type to Iter and stops the training after the specified number of iterations since the iteration with the optimal metric value.
        """
        self._set(earlyStoppingRounds=value)
        return self

    def getEvalMetric(self):
        """
        Returns
        -------
        str
            The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        return self.getOrDefault(self.evalMetric)

    def setEvalMetric(self, value):
        """
        Parameters
        ----------
        value : str
            The metric used for overfitting detection (if enabled) and best model selection (if enabled). Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        self._set(evalMetric=value)
        return self

    def getFeatureBorderType(self):
        """
        Returns
        -------
        EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'.
        """
        return self.getOrDefault(self.featureBorderType)

    def setFeatureBorderType(self, value):
        """
        Parameters
        ----------
        value : EBorderSelectionType
            The quantization mode for numerical features. See documentation for details. Default value is 'GreedyLogSum'.
        """
        self._set(featureBorderType=value)
        return self

    def getFeatureWeightsList(self):
        """
        Returns
        -------
        list
            Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsMap.
        """
        return self.getOrDefault(self.featureWeightsList)

    def setFeatureWeightsList(self, value):
        """
        Parameters
        ----------
        value : list
            Per-feature multiplication weights used when choosing the best split. Array indices correspond to feature indices. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsMap.
        """
        self._set(featureWeightsList=value)
        return self

    def getFeatureWeightsMap(self):
        """
        Returns
        -------
        dict
            Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsList.
        """
        return self.getOrDefault(self.featureWeightsMap)

    def setFeatureWeightsMap(self, value):
        """
        Parameters
        ----------
        value : dict
            Per-feature multiplication weights used when choosing the best split. Map is 'feature_name' -> weight. The score of each candidate is multiplied by the weights of features from the current split. This parameter is mutually exclusive with featureWeightsList.
        """
        self._set(featureWeightsMap=value)
        return self
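    # Example (illustrative sketch): boosting or damping the influence of selected
    # features during split selection. The list and map forms are mutually
    # exclusive; the feature names below are hypothetical, and an OrderedDict may
    # be preferable to a plain dict for deterministic conversion.
    #
    #   classifier.setFeatureWeightsMap({"age": 2.0, "income": 0.5})
    #   # or, by feature index:
    #   classifier.setFeatureWeightsList([1.0, 2.0, 0.5])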
    def getFeaturesCol(self):
        """
        Returns
        -------
        str
            features column name
        """
        return self.getOrDefault(self.featuresCol)

    def setFeaturesCol(self, value):
        """
        Parameters
        ----------
        value : str
            features column name
        """
        self._set(featuresCol=value)
        return self

    def getFirstFeatureUsePenaltiesList(self):
        """
        Returns
        -------
        list
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.
        """
        return self.getOrDefault(self.firstFeatureUsePenaltiesList)

    def setFirstFeatureUsePenaltiesList(self, value):
        """
        Parameters
        ----------
        value : list
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesMap.
        """
        self._set(firstFeatureUsePenaltiesList=value)
        return self

    def getFirstFeatureUsePenaltiesMap(self):
        """
        Returns
        -------
        dict
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.
        """
        return self.getOrDefault(self.firstFeatureUsePenaltiesMap)

    def setFirstFeatureUsePenaltiesMap(self, value):
        """
        Parameters
        ----------
        value : dict
            Per-feature penalties for the first occurrence of the feature in the model. The given value is subtracted from the score if the current candidate is the first one to include the feature in the model. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with firstFeatureUsePenaltiesList.
        """
        self._set(firstFeatureUsePenaltiesMap=value)
        return self

    def getFoldLenMultiplier(self):
        """
        Returns
        -------
        float
            Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.
        """
        return self.getOrDefault(self.foldLenMultiplier)

    def setFoldLenMultiplier(self, value):
        """
        Parameters
        ----------
        value : float
            Coefficient for changing the length of folds. The value must be greater than 1. The best validation result is achieved with minimum values. Default value is 2.0.
        """
        self._set(foldLenMultiplier=value)
        return self

    def getFoldPermutationBlock(self):
        """
        Returns
        -------
        int
            Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation. Default value is 1.
        """
        return self.getOrDefault(self.foldPermutationBlock)

    def setFoldPermutationBlock(self, value):
        """
        Parameters
        ----------
        value : int
            Objects in the dataset are grouped in blocks before the random permutations. This parameter defines the size of the blocks. The smaller the value, the slower the training. Large values may result in quality degradation. Default value is 1.
        """
        self._set(foldPermutationBlock=value)
        return self

    def getHasTime(self):
        """
        Returns
        -------
        bool
            Use the order of objects in the input data (do not perform random permutations during the 'Choosing the tree structure' stage).
        """
        return self.getOrDefault(self.hasTime)

    def setHasTime(self, value):
        """
        Parameters
        ----------
        value : bool
            Use the order of objects in the input data (do not perform random permutations during the 'Choosing the tree structure' stage).
        """
        self._set(hasTime=value)
        return self

    def getIgnoredFeaturesIndices(self):
        """
        Returns
        -------
        list
            Feature indices to exclude from training.
        """
        return self.getOrDefault(self.ignoredFeaturesIndices)

    def setIgnoredFeaturesIndices(self, value):
        """
        Parameters
        ----------
        value : list
            Feature indices to exclude from training.
        """
        self._set(ignoredFeaturesIndices=value)
        return self

    def getIgnoredFeaturesNames(self):
        """
        Returns
        -------
        list
            Feature names to exclude from training.
        """
        return self.getOrDefault(self.ignoredFeaturesNames)

    def setIgnoredFeaturesNames(self, value):
        """
        Parameters
        ----------
        value : list
            Feature names to exclude from training.
        """
        self._set(ignoredFeaturesNames=value)
        return self

    def getInputBorders(self):
        """
        Returns
        -------
        str
            Load custom quantization borders and missing value modes from a file (do not generate them).
        """
        return self.getOrDefault(self.inputBorders)

    def setInputBorders(self, value):
        """
        Parameters
        ----------
        value : str
            Load custom quantization borders and missing value modes from a file (do not generate them).
        """
        self._set(inputBorders=value)
        return self

    def getIterations(self):
        """
        Returns
        -------
        int
            The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.
        """
        return self.getOrDefault(self.iterations)

    def setIterations(self, value):
        """
        Parameters
        ----------
        value : int
            The maximum number of trees that can be built when solving machine learning problems. When using other parameters that limit the number of iterations, the final number of trees may be less than the number specified in this parameter. Default value is 1000.
        """
        self._set(iterations=value)
        return self
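    # Example (illustrative sketch): a typical compact training configuration.
    # Setters return self, so they can be chained.
    #
    #   classifier = (catboost_spark.CatBoostClassifier()
    #       .setIterations(500)
    #       .setLearningRate(0.05)
    #       .setDepth(6)
    #       .setLossFunction("Logloss"))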
    def getL2LeafReg(self):
        """
        Returns
        -------
        float
            Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.
        """
        return self.getOrDefault(self.l2LeafReg)

    def setL2LeafReg(self, value):
        """
        Parameters
        ----------
        value : float
            Coefficient at the L2 regularization term of the cost function. Any positive value is allowed. Default value is 3.0.
        """
        self._set(l2LeafReg=value)
        return self

    def getLabelCol(self):
        """
        Returns
        -------
        str
            label column name
        """
        return self.getOrDefault(self.labelCol)

    def setLabelCol(self, value):
        """
        Parameters
        ----------
        value : str
            label column name
        """
        self._set(labelCol=value)
        return self

    def getLeafEstimationBacktracking(self):
        """
        Returns
        -------
        ELeavesEstimationStepBacktracking
            When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or Newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'.
        """
        return self.getOrDefault(self.leafEstimationBacktracking)

    def setLeafEstimationBacktracking(self, value):
        """
        Parameters
        ----------
        value : ELeavesEstimationStepBacktracking
            When the value of the leafEstimationIterations parameter is greater than 1, CatBoost makes several gradient or Newton steps when calculating the resulting leaf values of a tree. The behaviour differs depending on the value of this parameter. See documentation for details. Default value is 'AnyImprovement'.
        """
        self._set(leafEstimationBacktracking=value)
        return self

    def getLeafEstimationIterations(self):
        """
        Returns
        -------
        int
            CatBoost might calculate leaf values using several gradient or Newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.
        """
        return self.getOrDefault(self.leafEstimationIterations)

    def setLeafEstimationIterations(self, value):
        """
        Parameters
        ----------
        value : int
            CatBoost might calculate leaf values using several gradient or Newton steps instead of a single one. This parameter regulates how many steps are done in every tree when calculating leaf values.
        """
        self._set(leafEstimationIterations=value)
        return self

    def getLeafEstimationMethod(self):
        """
        Returns
        -------
        ELeavesEstimation
            The method used to calculate the values in leaves. See documentation for details.
        """
        return self.getOrDefault(self.leafEstimationMethod)

    def setLeafEstimationMethod(self, value):
        """
        Parameters
        ----------
        value : ELeavesEstimation
            The method used to calculate the values in leaves. See documentation for details.
        """
        self._set(leafEstimationMethod=value)
        return self

    def getLearningRate(self):
        """
        Returns
        -------
        float
            The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', 'leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.
        """
        return self.getOrDefault(self.learningRate)

    def setLearningRate(self, value):
        """
        Parameters
        ----------
        value : float
            The learning rate. Used for reducing the gradient step. The default value is defined automatically for Logloss, MultiClass & RMSE loss functions depending on the number of iterations if none of 'leaf_estimation_iterations', 'leaf_estimation_method', 'l2_leaf_reg' is set. In this case, the selected learning rate is printed to stdout and saved in the model. In other cases, the default value is 0.03.
        """
        self._set(learningRate=value)
        return self

    def getLoggingLevel(self):
        """
        Returns
        -------
        ELoggingLevel
            The logging level to output to stdout. See documentation for details. Default value is 'Verbose'.
        """
        return self.getOrDefault(self.loggingLevel)

    def setLoggingLevel(self, value):
        """
        Parameters
        ----------
        value : ELoggingLevel
            The logging level to output to stdout. See documentation for details. Default value is 'Verbose'.
        """
        self._set(loggingLevel=value)
        return self

    def getLossFunction(self):
        """
        Returns
        -------
        str
            The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        return self.getOrDefault(self.lossFunction)

    def setLossFunction(self, value):
        """
        Parameters
        ----------
        value : str
            The metric to use in training. The specified value also determines the machine learning problem to solve. Some metrics support optional parameters (see the Objectives and metrics documentation section for details on each metric).
        """
        self._set(lossFunction=value)
        return self

    def getMetricPeriod(self):
        """
        Returns
        -------
        int
            The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.
        """
        return self.getOrDefault(self.metricPeriod)

    def setMetricPeriod(self, value):
        """
        Parameters
        ----------
        value : int
            The frequency of iterations to calculate the values of objectives and metrics. The value should be a positive integer. The usage of this parameter speeds up the training. Default value is 1.
        """
        self._set(metricPeriod=value)
        return self

    def getModelShrinkMode(self):
        """
        Returns
        -------
        EModelShrinkMode
            Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'.
        """
        return self.getOrDefault(self.modelShrinkMode)

    def setModelShrinkMode(self, value):
        """
        Parameters
        ----------
        value : EModelShrinkMode
            Determines how the actual model shrinkage coefficient is calculated at each iteration. See documentation for details. Default value is 'Constant'.
        """
        self._set(modelShrinkMode=value)
        return self

    def getModelShrinkRate(self):
        """
        Returns
        -------
        float
            The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details.
        """
        return self.getOrDefault(self.modelShrinkRate)

    def setModelShrinkRate(self, value):
        """
        Parameters
        ----------
        value : float
            The constant used to calculate the coefficient for multiplying the model on each iteration. See documentation for details.
        """
        self._set(modelShrinkRate=value)
        return self

    def getMvsReg(self):
        """
        Returns
        -------
        float
            Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling; setting it to +Inf implies Bernoulli sampling). Note: This parameter is supported only for the MVS sampling method.
        """
        return self.getOrDefault(self.mvsReg)

    def setMvsReg(self, value):
        """
        Parameters
        ----------
        value : float
            Affects the weight of the denominator and can be used for balancing between the importance and Bernoulli sampling (setting it to 0 implies importance sampling; setting it to +Inf implies Bernoulli sampling). Note: This parameter is supported only for the MVS sampling method.
        """
        self._set(mvsReg=value)
        return self

    def getNanMode(self):
        """
        Returns
        -------
        ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'.
        """
        return self.getOrDefault(self.nanMode)

    def setNanMode(self, value):
        """
        Parameters
        ----------
        value : ENanMode
            The method for processing missing values in the input dataset. See documentation for details. Default value is 'Min'.
        """
        self._set(nanMode=value)
        return self

    def getOdPval(self):
        """
        Returns
        -------
        float
            The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details. Turned off by default.
        """
        return self.getOrDefault(self.odPval)

    def setOdPval(self, value):
        """
        Parameters
        ----------
        value : float
            The threshold for the IncToDec overfitting detector type. The training is stopped when the specified value is reached. Requires that a validation dataset was input. See documentation for details. Turned off by default.
        """
        self._set(odPval=value)
        return self

    def getOdType(self):
        """
        Returns
        -------
        EOverfittingDetectorType
            The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'.
        """
        return self.getOrDefault(self.odType)

    def setOdType(self, value):
        """
        Parameters
        ----------
        value : EOverfittingDetectorType
            The type of the overfitting detector to use. See documentation for details. Default value is 'IncToDec'.
        """
        self._set(odType=value)
        return self

    def getOdWait(self):
        """
        Returns
        -------
        int
            The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.
        """
        return self.getOrDefault(self.odWait)

    def setOdWait(self, value):
        """
        Parameters
        ----------
        value : int
            The number of iterations to continue the training after the iteration with the optimal metric value. See documentation for details. Default value is 20.
        """
        self._set(odWait=value)
        return self

    def getOneHotMaxSize(self):
        """
        Returns
        -------
        int
            Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.
        """
        return self.getOrDefault(self.oneHotMaxSize)

    def setOneHotMaxSize(self, value):
        """
        Parameters
        ----------
        value : int
            Use one-hot encoding for all categorical features with a number of different values less than or equal to the given parameter value. Ctrs are not calculated for such features.
        """
        self._set(oneHotMaxSize=value)
        return self

    def getPenaltiesCoefficient(self):
        """
        Returns
        -------
        float
            A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.
        """
        return self.getOrDefault(self.penaltiesCoefficient)

    def setPenaltiesCoefficient(self, value):
        """
        Parameters
        ----------
        value : float
            A single-value common coefficient to multiply all penalties. Non-negative values are supported. Default value is 1.0.
        """
        self._set(penaltiesCoefficient=value)
        return self

    def getPerFloatFeatureQuantizaton(self):
        """
        Returns
        -------
        list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        """
        return self.getOrDefault(self.perFloatFeatureQuantizaton)

    def setPerFloatFeatureQuantizaton(self, value):
        """
        Parameters
        ----------
        value : list
            The quantization description for the given list of features (one or more). Description format for a single feature: FeatureId[:border_count=BorderCount][:nan_mode=BorderType][:border_type=border_selection_method]
        """
        self._set(perFloatFeatureQuantizaton=value)
        return self
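    # Example (illustrative sketch): overriding quantization for individual
    # features using the description format documented above (the feature ids
    # and settings below are hypothetical).
    #
    #   classifier.setPerFloatFeatureQuantizaton(
    #       ["0:border_count=1024", "2:nan_mode=Forbidden"])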
    def getPerObjectFeaturePenaltiesList(self):
        """
        Returns
        -------
        list
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.
        """
        return self.getOrDefault(self.perObjectFeaturePenaltiesList)

    def setPerObjectFeaturePenaltiesList(self, value):
        """
        Parameters
        ----------
        value : list
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Array indices correspond to feature indices. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesMap.
        """
        self._set(perObjectFeaturePenaltiesList=value)
        return self

    def getPerObjectFeaturePenaltiesMap(self):
        """
        Returns
        -------
        dict
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.
        """
        return self.getOrDefault(self.perObjectFeaturePenaltiesMap)

    def setPerObjectFeaturePenaltiesMap(self, value):
        """
        Parameters
        ----------
        value : dict
            Per-object penalties for the first use of the feature for the object. The given value is multiplied by the number of objects that are divided by the current split and use the feature for the first time. Map is 'feature_name' -> penalty. See documentation for details. This parameter is mutually exclusive with perObjectFeaturePenaltiesList.
        """
        self._set(perObjectFeaturePenaltiesMap=value)
        return self

    def getPredictionCol(self):
        """
        Returns
        -------
        str
            prediction column name
        """
        return self.getOrDefault(self.predictionCol)

    def setPredictionCol(self, value):
        """
        Parameters
        ----------
        value : str
            prediction column name
        """
        self._set(predictionCol=value)
        return self

    def getProbabilityCol(self):
        """
        Returns
        -------
        str
            Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
        """
        return self.getOrDefault(self.probabilityCol)

    def setProbabilityCol(self, value):
        """
        Parameters
        ----------
        value : str
            Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
        """
        self._set(probabilityCol=value)
        return self

    def getRandomSeed(self):
        """
        Returns
        -------
        int
            The random seed used for training. Default value is 0.
        """
        return self.getOrDefault(self.randomSeed)

    def setRandomSeed(self, value):
        """
        Parameters
        ----------
        value : int
            The random seed used for training. Default value is 0.
        """
        self._set(randomSeed=value)
        return self

    def getRandomStrength(self):
        """
        Returns
        -------
        float
            The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0.
        """
        return self.getOrDefault(self.randomStrength)

    def setRandomStrength(self, value):
        """
        Parameters
        ----------
        value : float
            The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. See documentation for details. Default value is 1.0.
        """
        self._set(randomStrength=value)
        return self

    def getRawPredictionCol(self):
        """
        Returns
        -------
        str
            raw prediction (a.k.a. confidence) column name
        """
        return self.getOrDefault(self.rawPredictionCol)

    def setRawPredictionCol(self, value):
        """
        Parameters
        ----------
        value : str
            raw prediction (a.k.a. confidence) column name
        """
        self._set(rawPredictionCol=value)
        return self

    def getRsm(self):
        """
        Returns
        -------
        float
            Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.
        """
        return self.getOrDefault(self.rsm)

    def setRsm(self, value):
        """
        Parameters
        ----------
        value : float
            Random subspace method. The percentage of features to use at each split selection, when features are selected over again at random. The value must be in the range (0;1]. Default value is 1.
        """
        self._set(rsm=value)
        return self

    def getSamplingFrequency(self):
        """
        Returns
        -------
        ESamplingFrequency
            Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel'.
        """
        return self.getOrDefault(self.samplingFrequency)

    def setSamplingFrequency(self, value):
        """
        Parameters
        ----------
        value : ESamplingFrequency
            Frequency to sample weights and objects when building trees. Default value is 'PerTreeLevel'.
        """
        self._set(samplingFrequency=value)
        return self

    def getSamplingUnit(self):
        """
        Returns
        -------
        ESamplingUnit
            The sampling scheme; see documentation for details. Default value is 'Object'.
        """
        return self.getOrDefault(self.samplingUnit)

    def setSamplingUnit(self, value):
        """
        Parameters
        ----------
        value : ESamplingUnit
            The sampling scheme; see documentation for details. Default value is 'Object'.
        """
        self._set(samplingUnit=value)
        return self

    def getSaveSnapshot(self):
        """
        Returns
        -------
        bool
            Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.
        """
        return self.getOrDefault(self.saveSnapshot)

    def setSaveSnapshot(self, value):
        """
        Parameters
        ----------
        value : bool
            Enable snapshotting for restoring the training progress after an interruption. If enabled, the default period for making snapshots is 600 seconds. Use the snapshotInterval parameter to change this period.
        """
        self._set(saveSnapshot=value)
        return self

    def getScalePosWeight(self):
        """
        Returns
        -------
        float
            The weight for class 1 in binary classification. The value is used as a multiplier for the weights of objects from class 1. Default value is 1 (both classes have equal weight).
        """
        return self.getOrDefault(self.scalePosWeight)

    def setScalePosWeight(self, value):
        """
        Parameters
        ----------
        value : float
            The weight for class 1 in binary classification. The value is used as a multiplier for the weights of objects from class 1. Default value is 1 (both classes have equal weight).
        """
        self._set(scalePosWeight=value)
        return self

    def getScoreFunction(self):
        """
        Returns
        -------
        EScoreFunction
            The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'.
        """
        return self.getOrDefault(self.scoreFunction)

    def setScoreFunction(self, value):
        """
        Parameters
        ----------
        value : EScoreFunction
            The score type used to select the next split during the tree construction. See documentation for details. Default value is 'Cosine'.
        """
        self._set(scoreFunction=value)
        return self

    def getSnapshotFile(self):
        """
        Returns
        -------
        str
            The name of the file to save the training progress information in. This file is used for recovering training after an interruption.
        """
        return self.getOrDefault(self.snapshotFile)

    def setSnapshotFile(self, value):
        """
        Parameters
        ----------
        value : str
            The name of the file to save the training progress information in. This file is used for recovering training after an interruption.
        """
        self._set(snapshotFile=value)
        return self

    def getSnapshotInterval(self):
        """
        Returns
        -------
        datetime.timedelta
            The interval between saving snapshots. See documentation for details. Default value is 600 seconds.
        """
        return self.getOrDefault(self.snapshotInterval)

    def setSnapshotInterval(self, value):
        """
        Parameters
        ----------
        value : datetime.timedelta
            The interval between saving snapshots. See documentation for details. Default value is 600 seconds.
        """
        self._set(snapshotInterval=value)
        return self
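    # Example (illustrative sketch): enabling snapshotting so an interrupted
    # training run can be resumed (the file path below is hypothetical).
    #
    #   import datetime
    #   classifier.setSaveSnapshot(True)
    #   classifier.setSnapshotFile("/tmp/catboost_training.snapshot")
    #   classifier.setSnapshotInterval(datetime.timedelta(minutes=5))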
    def getSparkPartitionCount(self):
        """
        Returns
        -------
        int
            The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default.
        """
        return self.getOrDefault(self.sparkPartitionCount)

    def setSparkPartitionCount(self, value):
        """
        Parameters
        ----------
        value : int
            The number of partitions used during training. Corresponds to the number of active parallel tasks. Set to the number of active executors by default.
        """
        self._set(sparkPartitionCount=value)
        return self

    def getSubsample(self):
        """
        Returns
        -------
        float
            Sample rate for bagging. The default value depends on the dataset size and the bootstrap type; see documentation for details.
        """
        return self.getOrDefault(self.subsample)

    def setSubsample(self, value):
        """
        Parameters
        ----------
        value : float
            Sample rate for bagging. The default value depends on the dataset size and the bootstrap type; see documentation for details.
        """
        self._set(subsample=value)
        return self

    def getTargetBorder(self):
        """
        Returns
        -------
        float
            If set, defines the border for converting target values to 0 and 1 classes.
        """
        return self.getOrDefault(self.targetBorder)

    def setTargetBorder(self, value):
        """
        Parameters
        ----------
        value : float
            If set, defines the border for converting target values to 0 and 1 classes.
        """
        self._set(targetBorder=value)
        return self

    def getThreadCount(self):
        """
        Returns
        -------
        int
            Number of CPU threads in parallel operations on the client.
        """
        return self.getOrDefault(self.threadCount)

    def setThreadCount(self, value):
        """
        Parameters
        ----------
        value : int
            Number of CPU threads in parallel operations on the client.
        """
        self._set(threadCount=value)
        return self

    def getThresholds(self):
        """
        Returns
        -------
        list
            Thresholds in multi-class classification to adjust the probability of predicting each class. The array must have length equal to the number of classes, with values > 0, except that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.
        """
        return self.getOrDefault(self.thresholds)

    def setThresholds(self, value):
        """
        Parameters
        ----------
        value : list
            Thresholds in multi-class classification to adjust the probability of predicting each class. The array must have length equal to the number of classes, with values > 0, except that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.
        """
        self._set(thresholds=value)
        return self

    def getTrainDir(self):
        """
        Returns
        -------
        str
            The directory on the driver node for storing the files generated during training. Default value is 'catboost_info'.
        """
        return self.getOrDefault(self.trainDir)

    def setTrainDir(self, value):
        """
        Parameters
        ----------
        value : str
            The directory on the driver node for storing the files generated during training. Default value is 'catboost_info'.
        """
        self._set(trainDir=value)
        return self

    def getUseBestModel(self):
        """
        Returns
        -------
        bool
            If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.
        """
        return self.getOrDefault(self.useBestModel)

    def setUseBestModel(self, value):
        """
        Parameters
        ----------
        value : bool
            If this parameter is set, the number of trees that are saved in the resulting model is selected based on the optimal value of the evalMetric. This option requires a validation dataset to be provided.
        """
        self._set(useBestModel=value)
        return self

    def getWeightCol(self):
        """
        Returns
        -------
        str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0.
        """
        return self.getOrDefault(self.weightCol)

    def setWeightCol(self, value):
        """
        Parameters
        ----------
        value : str
            weight column name. If this is not set or empty, we treat all instance weights as 1.0.
        """
        self._set(weightCol=value)
        return self

    def getWorkerInitializationTimeout(self):
        """
        Returns
        -------
        datetime.timedelta
            Timeout to wait until CatBoost workers on Spark executors are initialized and send their info to the master. Depends on dataset size. Default is 10 minutes.
        """
        return self.getOrDefault(self.workerInitializationTimeout)

    def setWorkerInitializationTimeout(self, value):
        """
        Parameters
        ----------
        value : datetime.timedelta
            Timeout to wait until CatBoost workers on Spark executors are initialized and send their info to the master. Depends on dataset size. Default is 10 minutes.
        """
        self._set(workerInitializationTimeout=value)
        return self

    def getWorkerMaxFailures(self):
        """
        Returns
        -------
        int
            Number of individual CatBoost worker failures before giving up training. Should be greater than or equal to 1. Default is 4.
        """
        return self.getOrDefault(self.workerMaxFailures)

    def setWorkerMaxFailures(self, value):
        """
        Parameters
        ----------
        value : int
            Number of individual CatBoost worker failures before giving up training. Should be greater than or equal to 1. Default is 4.
        """
        self._set(workerMaxFailures=value)
        return self
    @classmethod
    def read(cls):
        """Returns an MLReader instance for this class."""
        return CatBoostMLReader(cls)

    def _create_model(self, java_model):
        return CatBoostClassificationModel(java_model)

    def fit(self, trainDataset, evalDatasets=None):
        """
        Extended variant of the standard Estimator's fit method that accepts CatBoost's Pools
        and allows specifying additional datasets for computing evaluation metrics and
        overfitting detection, similarly to CatBoost's other APIs.

        Parameters
        ----------
        trainDataset : Pool or DataFrame
            The input training dataset.
        evalDatasets : list of Pool, optional
            The validation datasets used for the following processes:
            - overfitting detector
            - best iteration selection
            - monitoring metrics' changes

        Returns
        -------
        CatBoostClassificationModel
            trained model
        """
        if isinstance(trainDataset, DataFrame):
            if evalDatasets is not None:
                raise RuntimeError("if trainDataset has type DataFrame no evalDatasets are supported")
            return JavaEstimator.fit(self, trainDataset)
        else:
            sc = SparkContext._active_spark_context
            evalDatasetCount = 0 if (evalDatasets is None) else len(evalDatasets)

            # need to create it because the default mapping for a Python list is ArrayList, not Array
            evalDatasetsAsJavaObject = sc._gateway.new_array(sc._jvm.ai.catboost.spark.Pool, evalDatasetCount)
            for i in range(evalDatasetCount):
                evalDatasetsAsJavaObject[i] = _py2java(sc, evalDatasets[i])
            self._transfer_params_to_java()
            java_model = self._java_obj.fit(_py2java(sc, trainDataset), evalDatasetsAsJavaObject)
            return CatBoostClassificationModel(java_model)
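# Example (illustrative sketch, not part of the library source): training with a
# separate validation Pool for overfitting detection and best-model selection.
# Assumes `trainPool` and `evalPool` are catboost_spark.Pool instances, and that
# Pool exposes its underlying DataFrame via a `data` attribute.
#
#   classifier = (catboost_spark.CatBoostClassifier()
#       .setUseBestModel(True)
#       .setEvalMetric("AUC"))
#   model = classifier.fit(trainPool, [evalPool])
#   predictions = model.transform(evalPool.data)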
@inherit_doc
class CatBoostClassificationModel(JavaProbabilisticClassificationModel, MLReadable, JavaMLWritable):
    """
    Classification model trained by CatBoost. Use CatBoostClassifier to train it.
    """
    def __init__(self, java_model):
        super(CatBoostClassificationModel, self).__init__(java_model)
        self.featuresCol = Param(self, "featuresCol", "features column name")
        self._setDefault(featuresCol="features")
        self.labelCol = Param(self, "labelCol", "label column name")
        self._setDefault(labelCol="label")
        self.predictionCol = Param(self, "predictionCol", "prediction column name")
        self._setDefault(predictionCol="prediction")
        self.probabilityCol = Param(self, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities")
        self._setDefault(probabilityCol="probability")
        self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name")
        self._setDefault(rawPredictionCol="rawPrediction")
        self.thresholds = Param(self, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0, except that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold")
        self._transfer_params_from_java()

    @keyword_only
    def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", thresholds=None):
        """
        Set the (keyword only) parameters.

        Parameters
        ----------
        featuresCol : str, default: "features"
            features column name
        labelCol : str, default: "label"
            label column name
        predictionCol : str, default: "prediction"
            prediction column name
        probabilityCol : str, default: "probability"
            Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
        rawPredictionCol : str, default: "rawPrediction"
            raw prediction (a.k.a. confidence) column name
        thresholds : list
            Thresholds in multi-class classification to adjust the probability of predicting each class. The array must have length equal to the number of classes, with values > 0, except that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        return self._set(**kwargs)
    def getFeaturesCol(self):
        """
        Returns
        -------
        str
            features column name
        """
        return self.getOrDefault(self.featuresCol)

    def setFeaturesCol(self, value):
        """
        Parameters
        ----------
        value : str
            features column name
        """
        self._set(featuresCol=value)
        return self

    def getLabelCol(self):
        """
        Returns
        -------
        str
            label column name
        """
        return self.getOrDefault(self.labelCol)

    def setLabelCol(self, value):
        """
        Parameters
        ----------
        value : str
            label column name
        """
        self._set(labelCol=value)
        return self

    def getPredictionCol(self):
        """
        Returns
        -------
        str
            prediction column name
        """
        return self.getOrDefault(self.predictionCol)

    def setPredictionCol(self, value):
        """
        Parameters
        ----------
        value : str
            prediction column name
        """
        self._set(predictionCol=value)
        return self

    def getProbabilityCol(self):
        """
        Returns
        -------
        str
            Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
        """
        return self.getOrDefault(self.probabilityCol)

    def setProbabilityCol(self, value):
        """
        Parameters
        ----------
        value : str
            Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
        """
        self._set(probabilityCol=value)
        return self

    def getRawPredictionCol(self):
        """
        Returns
        -------
        str
            raw prediction (a.k.a. confidence) column name
        """
        return self.getOrDefault(self.rawPredictionCol)

    def setRawPredictionCol(self, value):
        """
        Parameters
        ----------
        value : str
            raw prediction (a.k.a. confidence) column name
        """
        self._set(rawPredictionCol=value)
        return self

    def getThresholds(self):
        """
        Returns
        -------
        list
            Thresholds in multi-class classification to adjust the probability of predicting each class. The array must have length equal to the number of classes, with values > 0, except that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.
        """
        return self.getOrDefault(self.thresholds)

    def setThresholds(self, value):
        """
        Parameters
        ----------
        value : list
            Thresholds in multi-class classification to adjust the probability of predicting each class. The array must have length equal to the number of classes, with values > 0, except that at most one value may be 0. The class with the largest value p/t is predicted, where p is the original probability of that class and t is the class's threshold.
        """
        self._set(thresholds=value)
        return self
    @staticmethod
    def _from_java(java_model):
        return CatBoostClassificationModel(java_model)

    @classmethod
    def read(cls):
        """Returns an MLReader instance for this class."""
        return CatBoostMLReader(cls)

    def saveNativeModel(self, fileName, format=EModelType.CatboostBinary, exportParameters=None, pool=None):
        """
        Save the model to a local file.

        See https://catboost.ai/docs/concepts/python-reference_catboostclassifier_save_model.html
        for a detailed description of the parameters.
        """
        return self._call_java("saveNativeModel", fileName, format, exportParameters, pool)

    @staticmethod
    def loadNativeModel(fileName, format=EModelType.CatboostBinary):
        """
        Load the model from a local file.

        See https://catboost.ai/docs/concepts/python-reference_catboostclassifier_load_model.html
        for a detailed description of the parameters.
        """
        sc = SparkContext._active_spark_context
        java_model = sc._jvm.ai.catboost.spark.CatBoostClassificationModel.loadNativeModel(fileName, _py2java(sc, format))
        return CatBoostClassificationModel(java_model)
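    # Example (illustrative sketch): exporting a trained model to CatBoost's
    # native binary format and loading it back (the file path is hypothetical).
    #
    #   model.saveNativeModel("/tmp/model.cbm")
    #   restored = catboost_spark.CatBoostClassificationModel.loadNativeModel("/tmp/model.cbm")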
    def transformPool(self, pool):
        """
        Apply the model to a Pool. This is useful when the dataset has already been quantized,
        but it works with any Pool.
        """
        return self._call_java("transformPool", pool)

    def getFeatureImportance(self, fstrType=EFstrType.FeatureImportance, data=None, calcType=ECalcTypeShapValues.Regular):
        """
        Parameters
        ----------
        fstrType : EFstrType
            Supported values are FeatureImportance, PredictionValuesChange, LossFunctionChange, PredictionDiff
        data : Pool
            if fstrType is PredictionDiff it is required and must contain 2 samples;
            if fstrType is PredictionValuesChange this param is required only if the model was explicitly trained with the flag to store no leaf weights, otherwise it can be None
        calcType : ECalcTypeShapValues
            Used only for PredictionValuesChange. Possible values:

            - Regular
                Calculate regular SHAP values
            - Approximate
                Calculate approximate SHAP values
            - Exact
                Calculate exact SHAP values

        Returns
        -------
        list of float
            array of feature importances (index corresponds to the order of features in the model)
        """
        return self._call_java("getFeatureImportance", fstrType, data, calcType)

    def getFeatureImportancePrettified(self, fstrType=EFstrType.FeatureImportance, data=None, calcType=ECalcTypeShapValues.Regular):
        """
        Parameters
        ----------
        fstrType : EFstrType
            Supported values are FeatureImportance, PredictionValuesChange, LossFunctionChange, PredictionDiff
        data : Pool
            if fstrType is PredictionDiff it is required and must contain 2 samples;
            if fstrType is PredictionValuesChange this param is required only if the model was explicitly trained with the flag to store no leaf weights, otherwise it can be None
        calcType : ECalcTypeShapValues
            Used only for PredictionValuesChange. Possible values:

            - Regular
                Calculate regular SHAP values
            - Approximate
                Calculate approximate SHAP values
            - Exact
                Calculate exact SHAP values

        Returns
        -------
        list of FeatureImportance
            array of feature importances sorted in descending order by importance
        """
        return self._call_java("getFeatureImportancePrettified", fstrType, data, calcType)
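    # Example (illustrative sketch): ranking features by importance for a trained
    # model. The FeatureImportance accessor names below are assumptions based on
    # the underlying Scala API.
    #
    #   for fi in model.getFeatureImportancePrettified():
    #       print(fi.featureName(), fi.importance())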
    def getFeatureImportanceShapValues(self, data, preCalcMode=EPreCalcShapValues.Auto, calcType=ECalcTypeShapValues.Regular, modelOutputType=EExplainableModelOutput.Raw, referenceData=None, outputColumns=None):
        """
        Parameters
        ----------
        data : Pool
            dataset to calculate SHAP values for
        preCalcMode : EPreCalcShapValues
            Possible values:

            - Auto
                Use direct SHAP values calculation only if the data size is smaller than the average number of leaves (the best of the two strategies below is chosen).
            - UsePreCalc
                Calculate SHAP values for every leaf in preprocessing. Final complexity is O(NT(D+F)) + O(TL^2 D^2), where N is the number of documents (objects), T is the number of trees, D is the average tree depth, F is the average number of features in a tree, and L is the average number of leaves in a tree. This is much faster (because of a smaller constant) than direct calculation when N >> L.
            - NoPreCalc
                Use direct SHAP values calculation with complexity O(NTLD^2). The direct algorithm is faster when N < L (algorithm from https://arxiv.org/abs/1802.03888).
        calcType : ECalcTypeShapValues
            Possible values:

            - Regular
                Calculate regular SHAP values
            - Approximate
                Calculate approximate SHAP values
            - Exact
                Calculate exact SHAP values
        referenceData : Pool
            reference data for Independent Tree SHAP values from https://arxiv.org/abs/1905.04610v1; if referenceData is not None, Independent Tree SHAP values are calculated
        outputColumns : list of str
            columns from data to add to the output DataFrame; if None, add all columns

        Returns
        -------
        DataFrame
            - for regression and binclass models: contains outputColumns and a "shapValues" column with a Vector of length (n_features + 1) with SHAP values
            - for multiclass models: contains outputColumns and a "shapValues" column with a Matrix of shape (n_classes x (n_features + 1)) with SHAP values
        """
        return self._call_java(
            "getFeatureImportanceShapValues",
            data,
            preCalcMode,
            calcType,
            modelOutputType,
            referenceData,
            outputColumns
        )
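    # Example (illustrative sketch): computing per-object SHAP values and keeping
    # only a hypothetical "id" column from the input data alongside them.
    #
    #   shap = model.getFeatureImportanceShapValues(evalPool, outputColumns=["id"])
    #   shap.show(5, truncate=False)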
    def getFeatureImportanceShapInteractionValues(self, data, featureIndices=None, featureNames=None, preCalcMode=EPreCalcShapValues.Auto, calcType=ECalcTypeShapValues.Regular, outputColumns=None):
        """
        SHAP interaction values are calculated for all feature pairs if neither featureIndices
        nor featureNames are specified.

        Parameters
        ----------
        data : Pool
            dataset to calculate SHAP interaction values for
        featureIndices : (int, int), optional
            pair of feature indices to calculate SHAP interaction values for.
        featureNames : (str, str), optional
            pair of feature names to calculate SHAP interaction values for.
        preCalcMode : EPreCalcShapValues
            Possible values:

            - Auto
                Use direct SHAP values calculation only if the data size is smaller than the average number of leaves (the best of the two strategies below is chosen).
            - UsePreCalc
                Calculate SHAP values for every leaf in preprocessing. Final complexity is O(NT(D+F)) + O(TL^2 D^2), where N is the number of documents (objects), T is the number of trees, D is the average tree depth, F is the average number of features in a tree, and L is the average number of leaves in a tree. This is much faster (because of a smaller constant) than direct calculation when N >> L.
            - NoPreCalc
                Use direct SHAP values calculation with complexity O(NTLD^2). The direct algorithm is faster when N < L (algorithm from https://arxiv.org/abs/1802.03888).
        calcType : ECalcTypeShapValues
            Possible values:

            - Regular
                Calculate regular SHAP values
            - Approximate
                Calculate approximate SHAP values
            - Exact
                Calculate exact SHAP values
        outputColumns : list of str
            columns from data to add to the output DataFrame; if None, add all columns

        Returns
        -------
        DataFrame
            - for regression and binclass models: contains outputColumns and "featureIdx1", "featureIdx2", "shapInteractionValue" columns
            - for multiclass models: contains outputColumns and "classIdx", "featureIdx1", "featureIdx2", "shapInteractionValue" columns
        """
        return self._call_java(
            "getFeatureImportanceShapInteractionValues",
            data,
            featureIndices,
            featureNames,
            preCalcMode,
            calcType,
            outputColumns
        )
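    # Example (illustrative sketch): SHAP interaction values for a single pair of
    # features selected by name (the names below are hypothetical).
    #
    #   interactions = model.getFeatureImportanceShapInteractionValues(
    #       evalPool, featureNames=("age", "income"))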
    def getFeatureImportanceInteraction(self):
        """
        Returns
        -------
        list of FeatureInteractionScore
        """
        return self._call_java("getFeatureImportanceInteraction")