Usage examples
Load datasets
Load the Dataset description in delimiter-separated values format and the object descriptions from the train
and train.cd
files respectively (both stored in the current directory):
library(catboost)
pool_path = system.file("extdata",
"adult_train.1000",
package = "catboost")
column_description_path = system.file("extdata",
"adult.cd",
package = "catboost")
pool <- catboost.load_pool(pool_path,
column_description = column_description_path)
head(pool, 1)[[1]]
Load the dataset from the CatBoostR package (this dataset is a subset of the Adult Data Set distributed through the UCI Machine Learning Repository):
library(catboost)
pool_path = system.file("extdata",
"adult_train.1000",
package="catboost")
column_description_vector = rep('numeric', 15)
cat_features <- c(3, 5, 7, 8, 9, 10, 11, 15)
for (i in cat_features)
column_description_vector[i] <- 'factor'
data <- read.table(pool_path,
head = F,
sep = "\t",
colClasses = column_description_vector,
na.strings='NAN')
# Transform categorical features to numerical
for (i in cat_features)
data[,i] <- as.numeric(factor(data[,i]))
pool <- catboost.load_pool(as.matrix(data[,-target]),
label = as.matrix(data[,target]),
cat_features = cat_features - 2)
head(pool, 1)[[1]]
Load the dataset from data.frame:
library(catboost)
train_path = system.file("extdata",
"adult_train.1000",
package="catboost")
test_path = system.file("extdata",
"adult_test.1000",
package="catboost")
column_description_vector = rep('numeric', 15)
cat_features <- c(3, 5, 7, 8, 9, 10, 11, 15)
for (i in cat_features)
column_description_vector[i] <- 'factor'
train <- read.table(train_path,
head = F,
sep = "\t",
colClasses = column_description_vector,
na.strings='NAN')
test <- read.table(test_path,
head = F,
sep = "\t",
colClasses = column_description_vector,
na.strings='NAN')
target <- c(1)
train_pool <- catboost.load_pool(data=train[,-target],
label = train[,target])
test_pool <- catboost.load_pool(data=test[,-target],
label = test[,target])
head(train_pool, 1)[[1]]
head(test_pool, 1)[[1]]
Train a model
Load a dataset with numerical features, define the training parameters and start the training:
library(catboost)
dataset = matrix(c(1900,7,
1896,1,
1896,41),
nrow=3,
ncol=2,
byrow = TRUE)
label_values = c(0,1,1)
fit_params <- list(iterations = 100,
loss_function = 'Logloss')
pool = catboost.load_pool(dataset, label = label_values)
model <- catboost.train(pool, params = fit_params)
Load a dataset with numerical features, define the training parameters and start the training on GPU:
library(catboost)
dataset = matrix(c(1900,7,
1896,1,
1896,41),
nrow=3,
ncol=2,
byrow = TRUE)
label_values = c(0,1,1)
fit_params <- list(iterations = 100,
loss_function = 'Logloss',
task_type = 'GPU')
pool = catboost.load_pool(dataset, label = label_values)
model <- catboost.train(pool, params = fit_params)
Load a dataset with numerical and categorical features, define the training parameters and start the training:
library(catboost)
countries = c('RUS','USA','SUI')
years = c(1900,1896,1896)
phone_codes = c(7,1,41)
domains = c('ru','us','ch')
dataset = data.frame(countries, years, phone_codes, domains)
label_values = c(0,1,1)
fit_params <- list(iterations = 100,
loss_function = 'Logloss',
ignored_features = c(4,9),
border_count = 32,
depth = 5,
learning_rate = 0.03,
l2_leaf_reg = 3.5)
pool = catboost.load_pool(dataset, label = label_values, cat_features = c(0,3))
model <- catboost.train(pool, params = fit_params)
Load a dataset with numerical and categorical features, define the training parameters and start the training on GPU:
library(catboost)
countries = c('RUS','USA','SUI')
years = c(1900,1896,1896)
phone_codes = c(7,1,41)
domains = c('ru','us','ch')
dataset = data.frame(countries, years, phone_codes, domains)
label_values = c(0,1,1)
fit_params <- list(iterations = 100,
loss_function = 'Logloss',
ignored_features = c(4,9),
border_count = 32,
depth = 5,
learning_rate = 0.03,
l2_leaf_reg = 3.5,
task_type = 'GPU')
pool = catboost.load_pool(dataset, label = label_values, cat_features = c(0,3))
model <- catboost.train(pool, params = fit_params)
Apply the model
Apply the model to the given dataset using the RawFormulaVal output type for calculating the approximated values of the formula:
library(catboost)
prediction <- catboost.predict(model,
pool,
prediction_type = 'RawFormulaVal')
Return the first n objects of the dataset
Return the first 10 objects of the dataset:
library(catboost)
head(pool, n = 10)
Select hyperparameters
Return the identifier of the iteration with the best result of the evaluation metric or loss function on the last validation set
library(caret)
library(titanic)
library(catboost)
set.seed(12345)
data <- as.data.frame(as.matrix(titanic_train), stringsAsFactors = TRUE)
drop_columns = c("PassengerId", "Survived", "Name", "Ticket", "Cabin")
x <- data[,!(names(data) %in% drop_columns)]
y <- data[,c("Survived")]
fit_control <- trainControl(method = "cv",
number = 4,
classProbs = TRUE)
grid <- expand.grid(depth = c(4, 6, 8),
learning_rate = 0.1,
iterations = 100,
l2_leaf_reg = 1e-3,
rsm = 0.95,
border_count = 64)
report <- train(x, as.factor(make.names(y)),
method = catboost.caret,
logging_level = 'Verbose', preProc = NULL,
tuneGrid = grid, trControl = fit_control)
print(report)
importance <- varImp(report, scale = FALSE)
print(importance)
Calculate object strength
Calculate the object strength:
library(catboost)
train_dataset = matrix(c(1900,7,1,
1896,1,1),
nrow=2,
ncol=3,
byrow = TRUE)
label_values = c(0, 1)
train_pool = catboost.load_pool(train_dataset,
label_values)
input_dataset = matrix(c(1900,47,1,
1904,27,1),
nrow=2,
ncol=3,
byrow = TRUE)
input_pool = catboost.load_pool(input_dataset,
label_values)
trained_model <- catboost.train(train_pool,
params = list(iterations = 10))
object_importance <- catboost.get_object_importance(trained_model,
input_pool,
train_pool)