Usage examples

Regression

CatBoostRegressor class with array-like data
from catboost import CatBoostRegressor
# Initialize data

train_data = [[1, 4, 5, 6],
              [4, 5, 6, 7],
              [30, 40, 50, 60]]

eval_data = [[2, 4, 6, 8],
             [1, 4, 50, 60]]

train_labels = [10, 20, 30]
# Initialize CatBoostRegressor
model = CatBoostRegressor(iterations=2,
                          learning_rate=1,
                          depth=2)
# Fit model
model.fit(train_data, train_labels)
# Get predictions
preds = model.predict(eval_data)

Train on GPU

Train a classification model on GPU:
from catboost import CatBoostClassifier

train_data = [[0, 3],
              [4, 1],
              [8, 1],
              [9, 1]]
train_labels = [0, 0, 1, 1]

model = CatBoostClassifier(iterations=1000,
                           task_type="GPU")
model.fit(train_data,
          train_labels,
          verbose=False)
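
If several GPUs are available, a specific device can be selected with the devices parameter. A minimal sketch, assuming the first GPU (index 0) should be used:

model = CatBoostClassifier(iterations=1000,
                           task_type="GPU",
                           devices="0")  # train on GPU 0 only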

Binary classification

CatBoostClassifier class with array-like data
from catboost import CatBoostClassifier
# Initialize data
cat_features = [0, 1]
train_data = [["a", "b", 1, 4, 5, 6],
              ["a", "b", 4, 5, 6, 7],
              ["c", "d", 30, 40, 50, 60]]
train_labels = [1, 1, -1]
eval_data = [["a", "b", 2, 4, 6, 8],
             ["a", "d", 1, 4, 50, 60]]

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(eval_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_data, prediction_type='RawFormulaVal')
Load the dataset using Pool, train a model with CatBoostClassifier, and make a prediction
from catboost import CatBoostClassifier, Pool

train_data = Pool(data=[[1, 4, 5, 6],
                        [4, 5, 6, 7],
                        [30, 40, 50, 60]],
                  label=[1, 1, -1],
                  weight=[0.1, 0.2, 0.3])

model = CatBoostClassifier(iterations=10)

model.fit(train_data)
preds_class = model.predict(train_data)

Multiclassification

from catboost import Pool, CatBoostClassifier

train_data = [["summer", 1924, 44],
              ["summer", 1932, 37],
              ["winter", 1980, 37],
              ["summer", 2012, 204]]

eval_data = [["winter", 1996, 197],
             ["winter", 1968, 37],
             ["summer", 2002, 77],
             ["summer", 1948, 59]]

cat_features = [0]

train_label = ["France", "USA", "USA", "UK"]
eval_label = ["USA", "France", "USA", "UK"]


train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=10,
                           learning_rate=1,
                           depth=2,
                           loss_function='MultiClass')
# Fit model
model.fit(train_dataset)
# Get predicted classes
preds_class = model.predict(eval_dataset)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_dataset)
# Get predicted RawFormulaVal
preds_raw = model.predict(eval_dataset, 
                          prediction_type='RawFormulaVal')
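
The columns of preds_proba follow the order of the class labels stored in the model. A minimal sketch for mapping them, assuming the classes_ attribute (which holds the labels in model order):

# Map probability columns to class labels for the first eval object
for label, proba in zip(model.classes_, preds_proba[0]):
    print("{}: {:.3f}".format(label, proba))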

Get the best result for each metric

Return the best results for each metric calculated on the eval dataset:

from catboost import CatBoostClassifier, Pool

train_data = [[0, 3],
              [4, 1],
              [8, 1],
              [9, 1]]

train_labels = [0, 0, 1, 1]

eval_data = [[2, 1],
             [3, 1],
             [9, 0],
             [5, 3]]

eval_labels = [0, 1, 1, 0]

eval_dataset = Pool(eval_data,
                    eval_labels)

model = CatBoostClassifier(learning_rate=0.03,
                           custom_metric=['Logloss',
                                          'AUC:hints=skip_train~false'])

model.fit(train_data,
          train_labels,
          eval_set=eval_dataset,
          verbose=False)

print(model.get_best_score())
Note. This example illustrates the usage of the method with the CatBoostClassifier class. The usage with other classes is identical.
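
The returned value is a dict keyed by dataset name, with the best value per metric. A minimal sketch of inspecting it (the validation key name may vary between versions, e.g. 'validation' or 'validation_0'):

best = model.get_best_score()
# e.g. {'learn': {...}, 'validation': {'Logloss': ..., 'AUC': ...}}
print(best['validation']['AUC'])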

Get the identifier of the iteration with the best result

Return the iteration with the best value of the evaluation metric on the eval dataset:

from catboost import CatBoostClassifier, Pool

train_data = [[0, 3],
              [4, 1],
              [8, 1],
              [9, 1]]

train_labels = [0, 0, 1, 1]

eval_data = [[2, 1],
             [3, 1],
             [9, 0],
             [5, 3]]

eval_labels = [0, 1, 1, 0]

eval_dataset = Pool(eval_data,
                    eval_labels)

model = CatBoostClassifier(learning_rate=0.03,
                           eval_metric='AUC')

model.fit(train_data,
          train_labels,
          eval_set=eval_dataset,
          verbose=False)

print(model.get_best_iteration())
Note. This example illustrates the usage of the method with the CatBoostClassifier class. The usage with other classes is identical.
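
A possible follow-up, sketched under the assumption that only the trees up to the best iteration should be kept, is to truncate the model with shrink:

best_iter = model.get_best_iteration()
if best_iter is not None:
    # Keep trees [0; best_iter], i.e. up to and including the best iteration
    model.shrink(ntree_end=best_iter + 1)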

Load the dataset from list, ndarray, pandas.DataFrame, pandas.Series

Dataset with categorical features
from catboost import Pool
cat_features = [0, 1, 2]
data = [["a","b", 1, 4, 5, 6],
        ["a","b", 4, 5, 6, 7],
        ["c","d", 30, 40, 50, 60]]

label = [1, 1, -1]

dataset = Pool(data, label, cat_features)
Dataset without categorical features
from catboost import Pool
data = [[1, 4, 5, 6],
        [4, 5, 6, 7],
        [30, 40, 50, 60]]

label = [1, 1, -1]
dataset = Pool(data, label)
Dataset without labels (for prediction)
from catboost import Pool
data = [[1, 4, 5, 6],
        [4, 5, 6, 7],
        [30, 40, 50, 60]]
dataset = Pool(data)

Load the dataset from a file

Dataset without specified columns description (without categorical features)
from catboost import Pool
dataset = Pool("data.tsv")
data.tsv is a file with the following object descriptions:
1	1935	01
1	1958	08
0	1969	09

Since the columns description file is not specified, it is assumed that the first column of the file (indexed 0) defines the label value, and all other columns are the values of numerical features.

Dataset with specified columns description (with categorical features)
from catboost import Pool
dataset = Pool("data_with_cat_features.tsv", 
               column_description="data_with_cat_features.cd")
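
For reference, a columns description file is a tab-separated list of column indices and types. A hypothetical data_with_cat_features.cd for a dataset whose label is in column 0 and whose first two feature columns are categorical might look like this:

0	Label
1	Categ
2	Categ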

Get a slice of a pool

from catboost import Pool

data = [[1, 3],
        [0, 4],
        [1, 7],
        [6, 4],
        [5, 3]]

dataset = Pool(data)
print(dataset.num_row())

dataset_part = dataset.slice([0, 1, 2])
print(dataset_part.num_row())
The slice contains the first three of the five input objects.
Output:
5
3

CV

Perform cross-validation on the given dataset:

from catboost import Pool, cv

cv_data = [["France", 1924, 44],
           ["USA", 1932, 37],
           ["Switzerland", 1928, 25],
           ["Norway", 1952, 30],
           ["Japan", 1972, 35],
           ["Mexico", 1968, 112]]

labels = [1, 1, 0, 0, 0, 1]

cat_features = [0]

cv_dataset = Pool(data=cv_data,
                  label=labels,
                  cat_features=cat_features)

params = {"iterations": 100,
          "depth": 2,
          "loss_function": "Logloss",
          "verbose": False}

scores = cv(cv_dataset,
            params,
            fold_count=2,
            plot=True)

When run in a Jupyter Notebook, this example plots a chart with the cross-validation results (enabled by the plot=True argument).
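
By default, cv returns the per-iteration results as a pandas.DataFrame. A minimal sketch of inspecting it (column names such as 'test-Logloss-mean' follow the usual convention, but are worth verifying for your version):

print(scores.head())
# Best mean Logloss on the test folds across all iterations
print(scores['test-Logloss-mean'].min())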

Perform cross-validation and save ROC curve points to the roc-curve output file:

from catboost import Pool, cv

cv_data = [["France", 1924, 44],
           ["USA", 1932, 37],
           ["Switzerland", 1928, 25],
           ["Norway", 1952, 30],
           ["Japan", 1972, 35],
           ["Mexico", 1968, 112]]

labels = [1, 1, 0, 0, 0, 1]

cv_dataset = Pool(data=cv_data,
                  label=labels,
                  cat_features=[0])

params = {"iterations": 100,
          "depth": 2,
          "loss_function": "Logloss",
          "verbose": False,
          "roc_file": "roc-file"}

scores = cv(cv_dataset,
            params,
            fold_count=2)

Using object weights

The weight for each object in the input data can be set in the form of one-dimensional array-like data (its length must match the number of objects).

Weights are used to calculate the optimized loss function and metrics. By default, the weight is set to 1 for all objects.

import numpy as np
from catboost import Pool, CatBoostClassifier
# Initialize data

train_data = np.random.randint(1, 100, size=(100, 10))
train_labels = np.random.randint(2, size=(100))
train_weight = np.random.random(100)

# Initialize Pool from data
train_dataset = Pool(train_data,
                     train_labels,
                     weight=train_weight)
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=10)
# Fit model
model.fit(train_dataset)

Using best model

If the use_best_model parameter is set to True, the number of trees that are saved in the resulting model is defined as follows:
  1. Build the number of trees defined by the training parameters.
  2. Use the validation dataset to identify the iteration with the optimal value of the metric specified in eval_metric.
  3. Discard all trees after this iteration.

The eval_set parameter is obligatory for the fit method if the best model mode is on.

from catboost import Pool, CatBoostClassifier

train_data = [["France", 1924, 44],
              ["USA", 1932, 37],
              ["USA", 1980, 37]]

eval_data = [["USA", 1996, 197],
             ["France", 1968, 37],
             ["USA", 2002, 77]]

cat_features = [0]

train_label = [1, 1, 0]
eval_label = [0, 0, 1]

train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100)
# Fit model with `use_best_model=True`

model.fit(train_dataset,
          use_best_model=True,
          eval_set=eval_dataset)

print("Count of trees in model = {}".format(model.tree_count_))

Load the model from a file

The following example illustrates how to save a trained model to a file and then load it.

from catboost import CatBoostClassifier, Pool

train_data = [[1, 3],
              [0, 4],
              [1, 7]]
train_labels = [1, 0, 1]

# catboost_pool = Pool(train_data, train_labels)

model = CatBoostClassifier(learning_rate=0.03)
model.fit(train_data,
          train_labels,
          verbose=False)

model.save_model("model")

from_file = CatBoostClassifier()

from_file.load_model("model")
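
A quick sanity check: the loaded model should reproduce the original model's predictions exactly.

import numpy as np

assert np.array_equal(model.predict(train_data),
                      from_file.predict(train_data))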

Using staged_predict

The model's predictions can be output for each i-th iteration by taking into account only the trees in the range [1; i].

from catboost import Pool, CatBoostClassifier

train_data = [["France", 1924, 44],
              ["USA", 1932, 37],
              ["USA", 1980, 37]]

eval_data = [["USA", 1996, 197],
             ["France", 1968, 37],
             ["USA", 2002, 77]]

cat_features = [0]
train_label = [1, 1, 0]
eval_label = [0, 0, 1]

train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)
eval_dataset = Pool(data=eval_data,
                    label=eval_label,
                    cat_features=cat_features)

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=100)
# Fit model
model.fit(train_dataset)

# Get staged predictions
# First method: call predict() with `ntree_end` in a loop
staged_predictions_brute_force = []

for i in range(1, model.tree_count_ + 1):
    staged_predictions_brute_force.append(model.predict(eval_dataset,
                                                        ntree_end=i))

# Second method: equivalent to the first one, but faster
staged_predictions = list(model.staged_predict(eval_dataset))

Using pre-training results (baseline)

The predictions of a pre-trained model can be set as the baseline for a new model. Only raw values can be used (not probabilities or classes).

The form of the baseline depends on the machine learning problem being solved:
  • Multiclass classification — a two-dimensional array: shape = (length of data, number of classes)
  • Regression, binary classification, ranking — a one-dimensional array.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor, Pool
from catboost.datasets import msrank

# Download train and validation datasets
train_df, test_df = msrank()
# Column 0 contains label values, column 1 contains group ids.
X_train, y_train = train_df.drop([0, 1], axis=1).values, train_df[0].values
X_test, y_test = test_df.drop([0, 1], axis=1).values, test_df[0].values

# Split the train data into two parts: the first part is for the baseline
# model, the second part is for the main model
splitted_data = train_test_split(X_train, y_train, test_size=0.5)
X_train_first, X_train_second, y_train_first, y_train_second = splitted_data

catboost_model = CatBoostRegressor(iterations=200, verbose=False)

##########################################
######### SIMPLE BASELINE MODEL ##########
##########################################

# Prepare simple baselines (just mean target on first part of train pool).
baseline_value = y_train_first.mean()
train_baseline = np.array([baseline_value] * y_train_second.shape[0])
test_baseline = np.array([baseline_value] * y_test.shape[0])

# Create pools
train_pool = Pool(X_train_second, y_train_second, baseline=train_baseline)
test_pool = Pool(X_test, y_test, baseline=test_baseline)

# Train CatBoost model
catboost_model.fit(train_pool, eval_set=test_pool)

# Apply model on pool with baseline values
preds1 = catboost_model.predict(test_pool)

# Apply model on numpy.array and then add the baseline values
preds2 = test_baseline + catboost_model.predict(X_test)

# Check that preds have small diffs
assert (np.abs(preds1 - preds2) < 1e-6).all()

print(mean_squared_error(y_test, preds1))

##########################################
######### LINEAR BASELINE MODEL ##########
##########################################

# Train the baseline model (ridge regression) on the first part of the train pool.
# Note: Ridge(normalize=True) was removed in scikit-learn 1.2, so the features
# are standardized explicitly here instead.
baseline_model = make_pipeline(StandardScaler(), Ridge(alpha=3e3))
baseline_model.fit(X_train_first, y_train_first)

# Prepare baselines
train_baseline = baseline_model.predict(X_train_second)
test_baseline = baseline_model.predict(X_test)

# Create pools
train_pool = Pool(X_train_second, y_train_second, baseline=train_baseline)
test_pool = Pool(X_test, y_test, baseline=test_baseline)

# Train CatBoost model
catboost_model.fit(train_pool, eval_set=test_pool)

# Apply model on pool with baseline values
preds1 = catboost_model.predict(test_pool)

# Apply model on numpy.array and then add the baseline values
preds2 = baseline_model.predict(X_test) + catboost_model.predict(X_test)

# Check that preds have small diffs
assert (np.abs(preds1 - preds2) < 1e-6).all()

print(mean_squared_error(y_test, preds1))

Training continuation
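
Pass an already trained model to the init_model parameter of fit to continue training from its trees: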

from catboost import CatBoostRegressor

# Initialize data

train_data = [[1, 4, 5, 6],
              [4, 5, 6, 7],
              [30, 40, 50, 60]]

eval_data = [[2, 4, 6, 8],
             [1, 4, 50, 60]]

train_labels = [10, 20, 30]


# initial parameters

model1 = CatBoostRegressor(iterations=2,
                           learning_rate=0.2,
                           depth=2)

model1.fit(train_data, train_labels)

# continue training with the same parameters, result will be in updated model1

model1.fit(train_data, train_labels, init_model=model1)


# continue training with different parameters

model2 = CatBoostRegressor(iterations=4,
                           learning_rate=0.1,
                           depth=4)

# result will be in model2, model1 will be unchanged

model2.fit(train_data, train_labels, init_model=model1)

Exporting the model to Apple CoreML

The following is an example of exporting a model trained with CatBoostClassifier to Apple CoreML for further usage on iOS devices:
  1. Train the model and save it in CoreML format.

    For example, if training on the Iris dataset:
    import catboost
    import sklearn
    
    iris = sklearn.datasets.load_iris()
    cls = catboost.CatBoostClassifier(loss_function='MultiClass')
    
    cls.fit(iris.data, iris.target)
    
    # Save the model in Apple CoreML format
    cls.save_model("iris.mlmodel", format="coreml", export_parameters={'prediction_type': 'probability'})
  2. Import the resulting model to XCode.

    The following is an example of importing with Swift:
    import CoreML
    
    let model = iris()
    let sepal_l = 7.0
    let sepal_w = 3.2
    let petal_l = 4.7
    let petal_w = 1.4
    
    guard let output = try? model.prediction(input: irisInput(feature_0: sepal_l, feature_1: sepal_w, feature_2: petal_l, feature_3: petal_w)) else {
        fatalError("Unexpected runtime error.")
    }
    
    print(String(
        format: "Output probabilities: %1.5f; %1.5f; %1.5f",
        output.prediction[0].doubleValue,
        output.prediction[1].doubleValue,
        output.prediction[2].doubleValue
    ))

Object strength calculation

Calculate the strength (influence) of the objects from train_dataset on the optimized metric values for the objects from dataset:
from catboost import Pool, CatBoost

train_data = [["France", 1924, 44],
              ["USA", 1932, 37],
              ["USA", 1980, 37]]

data = [["USA", 1996, 197],
        ["France", 1968, 37],
        ["USA", 2002, 77]]


cat_features = [0]
train_label = [1, 1, 0]
label = [0, 0, 1]

train_dataset = Pool(data=train_data,
                     label=train_label,
                     cat_features=cat_features)

dataset = Pool(data=data,
               cat_features=cat_features,
               label=label)

cb = CatBoost({'iterations': 10})
cb.fit(train_dataset)
indices, scores = cb.get_object_importance(dataset,
                                           train_dataset,
                                           top_size=100)
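
The method returns the indices of the most influential objects from train_dataset together with their scores. A minimal sketch for inspecting the result (the sign convention of the scores is worth checking against the documentation):

for idx, score in zip(indices, scores):
    print("train object {}: influence score {:.4f}".format(idx, score))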

Custom metric for overfitting detector and best model selection

To set a custom metric for overfitting detector and best model selection:
  1. Create an object that implements the following interface:
    class CustomMetric(object):
        def get_final_error(self, error, weight):
            return 0.0
    
        def is_max_optimal(self):
            return True
    
        def evaluate(self, approxes, target, weight):
            # approxes - list of list-like objects (one object per approx dimension)
            # target - list-like object
            # weight - list-like object, can be None
            return 0.0, 0.0
    The following is an example of the Logloss function implementation:
    import math
    from six.moves import xrange
    from catboost import Pool, CatBoostClassifier
    
    class LoglossMetric(object):
        def get_final_error(self, error, weight):
            return error / (weight + 1e-38)
    
        def is_max_optimal(self):
            return True
    
        def evaluate(self, approxes, target, weight):
            # approxes is a list of indexed containers (containers with only
            # __len__ and __getitem__ defined), one container per approx
            # dimension. Each container contains floats.
            # target is an indexed container of floats.
            # weight is a one-dimensional indexed container of floats;
            # the weight parameter can be None.
            # Returns the pair (error, weights sum)
    
            assert len(approxes) == 1
            assert len(target) == len(approxes[0])
    
            approx = approxes[0]
    
            error_sum = 0.0
            weight_sum = 0.0
    
            for i in xrange(len(approx)):
                w = 1.0 if weight is None else weight[i]
                weight_sum += w
                error_sum += w * (target[i] * approx[i] - math.log(1 + math.exp(approx[i])))
    
            return error_sum, weight_sum
    
    cat_features = [0, 1, 2]
    
    train_data = [["a", "b", 1, 4, 5, 6],
                  ["a", "b", 4, 5, 6, 7],
                  ["c", "d", 30, 40, 50, 60]]
    
    train_labels = [1, 1, 0]
    
    eval_data = [["a", "c", 3, 4, 4, 1],
                 ["a", "d", 1, 5, 5, 5],
                 ["b", "d", 31, 25, 60, 70]]
    
    eval_labels = [0, 1, 1]
    
    eval_dataset = Pool(eval_data,
                        label=eval_labels,
                        cat_features=cat_features)
    # Initialize CatBoostClassifier with custom `eval_metric`
    model = CatBoostClassifier(iterations=5,
                               eval_metric=LoglossMetric())
    # Fit model with `use_best_model=True`
    model.fit(train_data,
              train_labels,
              cat_features,
              use_best_model=True,
              eval_set=eval_dataset)
    
    # Get predictions
    pred = model.predict(eval_data)
    
  2. Pass the created object to the eval_metric parameter:
    CatBoostClassifier(eval_metric=CustomMetric())

Custom objective function

A custom objective function can be used by specifying a python object as the value for the loss_function parameter. In this case the objective is always maximized.

Depending on the machine learning problem, the python object should have one of the following functions defined:
  • calc_ders_range(approxes, targets, weights) (for binary classification and regression)
  • calc_ders_multi(approx, target, weight) (for multiclassification; a sketch appears after the example below)
import math
from six.moves import xrange
from catboost import Pool, CatBoostClassifier


class LoglossObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        # approxes, targets, weights are indexed containers of floats
        # (containers with only __len__ and __getitem__ defined).
        # weights parameter can be None.
        # Returns list of pairs (der1, der2)
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        exponents = []
        for index in xrange(len(approxes)):
            exponents.append(math.exp(approxes[index]))

        result = []
        for index in xrange(len(targets)):
            p = exponents[index] / (1 + exponents[index])
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))

        return result

train_data = [[1, 4, 5, 6],
              [4, 5, 6, 7],
              [30, 40, 50, 60]]

train_labels = [1, 1, 0]

eval_data = [[3, 4, 4, 1],
             [1, 5, 5, 5],
             [31, 25, 60, 70]]

# Initialize CatBoostClassifier with custom `loss_function`
model = CatBoostClassifier(loss_function=LoglossObjective(),
                           eval_metric="Logloss")
# Fit model
model.fit(train_data, train_labels)
# Only prediction_type='RawFormulaVal' is allowed with a custom `loss_function`
preds_raw = model.predict(eval_data,
                          prediction_type='RawFormulaVal')

print(preds_raw)
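
For multiclassification, the object must instead define calc_ders_multi, which is called once per object and returns the per-class gradient as a list and the per-class Hessian as a matrix. The following is a minimal sketch for the softmax log-likelihood (written to be maximized, matching the convention above); it is an illustration, not the library's reference implementation:

import math

class MultiClassObjective(object):
    def calc_ders_multi(self, approx, target, weight):
        # approx - list of raw values, one per class, for a single object
        # target - the class index of this object
        # weight - the object's weight; can be None
        # Returns (der1, der2): the gradient (list) and the Hessian (matrix)
        # of the softmax log-likelihood, which is maximized
        w = 1.0 if weight is None else weight
        shift = max(approx)
        exps = [math.exp(a - shift) for a in approx]  # stabilized softmax
        total = sum(exps)
        p = [e / total for e in exps]

        der1 = [((1.0 if j == int(target) else 0.0) - p[j]) * w
                for j in range(len(approx))]
        der2 = [[(p[j] * p[k] - (p[j] if j == k else 0.0)) * w
                 for k in range(len(approx))]
                for j in range(len(approx))]
        return der1, der2

Such an object would be passed via loss_function in the same way as LoglossObjective above (for example, together with eval_metric='MultiClass').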