menu

R mlr3 random forest

Introduction to random forest using mlr3.

mlr3 action一下

需求:建立二分类结局的随机森林预测模型。
过程:
1.纳入全部特征,训练集上训练模型,调参,得到模型的超参数。
2.使用以上的超参数,建立包含全部特征的预测模型,得到各个特征的变量重要性。
3.根据变量重要性,逐个纳入变量到随机森林模型中,并计算纳入不同数量变量时的模型评价指标。
4.选择达到合适模型评价指标时,包含最少特征的模型作为最终模型。

0.数据预处理
1
2
3
4
5
6
7
8
library(tidyverse)
library(mlr3verse)
# 0. Data preprocessing ----
# Turn iris into a binary-outcome data set: drop "setosa" so two classes
# remain, then recode Species as a 0/1 factor (versicolor -> 0, virginica -> 1).
mydat <- iris %>%
  filter(Species != "setosa") %>%
  # `mutate(across(...))` replaces the superseded `mutate_at()`;
  # same call: factor(Species, levels = ..., labels = ...).
  mutate(across(Species, ~ factor(.x,
                                  levels = c("versicolor", "virginica"),
                                  labels = c(0, 1))))
knitr::kable(mydat[1:5, ], digits = 3, align = "c")
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
7.0 3.2 4.7 1.4 0
6.4 3.2 4.5 1.5 0
6.9 3.1 4.9 1.5 0
5.5 2.3 4.0 1.3 0
6.5 2.8 4.6 1.5 0
1.建立包含全部特征的模型,调参
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# 1. Set up the task and learner, then lightly tune ----
set.seed(121)

# Binary classification task; "1" (virginica) is the positive class.
mytask <- TaskClassif$new("irisStudy", mydat, target = "Species", positive = "1")
mytask$nrow

# 70/30 train/test split of the task rows.
splits <- partition(mytask, ratio = 0.7)

# Ranger random forest with impurity importance and probability predictions.
mylearner <- lrn("classif.ranger", importance = "impurity",
                 predict_type = "prob", seed = 121)

# Light tuning of max.depth on the training rows only.
# NOTE(review): "task_trian" is a typo for "task_train"; the name is kept
# because later chunks refer to it.
task_trian <- mytask$clone()$filter(rows = splits$train)
task_trian$nrow

# Random search over max.depth in [1, 10], scored by classification error
# with 10x repeated 5-fold CV, stopping after 40 evaluations.
mytune <- tune(
  tuner = tnr("random_search"),
  task = task_trian,
  learner = lrn("classif.ranger", importance = "impurity", seed = 121,
                predict_type = "prob", max.depth = to_tune(1, 10)),
  resampling = rsmp("repeated_cv", repeats = 10, folds = 5),
  measures = msr("classif.ce"),
  terminator = trm("evals", n_evals = 40)
)
mytune$archive
mytune$result_learner_param_vals
2.根据1中的超参数建立全部特征模型,得到变量重要性
1
2
3
4
5
6
7
8
9
10
# 2. Refit with the tuned hyperparameters and inspect importance ----

# Copy the tuned hyperparameter values onto the main learner.
mylearner$param_set$values <- mytune$result_learner_param_vals

# Retrain on the training rows with all four features.
mylearner$train(mytask, row_ids = splits$train)

# Impurity-based variable importance (sorted, largest first).
mylearner$importance()
## Petal.Length  Petal.Width Sepal.Length  Sepal.Width 
##   16.9508446   11.3301055    2.4145580    0.1500099

# Out-of-bag error of the refitted forest.
mylearner$oob_error()
## [1] 0.04483554
3.按重要性顺序逐步纳入变量
1
2
3
4
5
6
7
8
# 3. Stepwise inclusion of features by importance rank ----
# Build a lower-triangular TRUE matrix: row i selects the i most important
# features ($importance() is already sorted in decreasing order).
imp_names <- names(mylearner$importance())
nvar <- length(imp_names)
ooo <- matrix(FALSE, nrow = nvar, ncol = nvar)
# Include the diagonal directly instead of setting it in a second step.
ooo[lower.tri(ooo, diag = TRUE)] <- TRUE
colnames(ooo) <- imp_names
# Design points consumed by mlr3's "design_points" feature selector.
mydesign <- data.table(ooo)
knitr::kable(mydesign, digits = 3, align = "c")
Petal.Length Petal.Width Sepal.Length Sepal.Width
TRUE FALSE FALSE FALSE
TRUE TRUE FALSE FALSE
TRUE TRUE TRUE FALSE
TRUE TRUE TRUE TRUE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Evaluate each nested feature set from the design matrix with 5-fold CV,
# scoring AUC, accuracy, sensitivity, and specificity.
eval_measures <- msrs(c("classif.auc", "classif.acc",
                        "classif.sensitivity", "classif.specificity"))
myinstance <- fselect(
  fselector = fs("design_points", design = mydesign),
  task = task_trian,
  learner = mylearner,
  resampling = rsmp("cv", folds = 5),
  measures = eval_measures
)

# Collect per-design-point results from the selection archive.
out <- as.data.table(myinstance$archive) %>% 
  select(batch_nr, features, classif.auc, classif.acc, 
         classif.sensitivity, classif.specificity)
# Register a plotting font.
# NOTE(review): windowsFonts() exists only on Windows; consider
# systemfonts/showtext for a portable alternative.
windowsFonts(Cab = windowsFont(family = "Calibri"))
# Metric value vs. number of included features, one colored line per measure.
out %>% 
  pivot_longer(cols = 3:6, names_to = c("task", "eval"),
               names_sep = "\\.", values_to = "value") %>% 
ggplot(aes(x = batch_nr)) +
  geom_point(aes(y = value, color = eval), size = 1.4) +
  geom_line(aes(y = value, color = eval), linewidth = 0.6) +
  # Fixed reference line at x = 1: constants do not belong inside aes();
  # annotate() draws it once instead of one overlapping segment per row.
  annotate("segment", x = 1, y = 0.6, xend = 1, yend = 1,
           linewidth = 0.6, linetype = "dashed") +
  theme_classic(base_family = "Cab") + 
  scale_colour_manual(NULL, values = c("#4575b4","#fdae61","#1a9641","#d7191c"),
                      labels = c("AUC","ACC","sensitivity","specificity")) +
  labs(x = 'Number of features', y = 'Values') +
  scale_y_continuous(limits = c(0.6, 1), expand = c(0, 0), breaks = seq(0.6, 1, 0.1)) +
  # Put a break at every evaluated feature count; the original
  # c(3, 5, 10, 15, 20) falls mostly outside the 1..4 range of this analysis.
  scale_x_continuous(breaks = unique(out$batch_nr)) +
  theme(axis.text.x = element_text(face = "bold", size = 10), 
        axis.title.x = element_text(face = "bold", size = 12), 
        axis.text.y = element_text(face = "bold", size = 10),
        axis.title.y = element_text(face = "bold", size = 12))

mlr3-demo

# 4.建立最终模型
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Restrict the task to the selected feature(s).
# NOTE(review): result_feature_set is a character vector of the winning
# features; `[[1]]` keeps only its FIRST element (Petal.Length, per the
# importance output below). If the whole winning set is intended, the
# `[[1]]` should be dropped — confirm against step 4's goal.
mytask$select(myinstance$result_feature_set[[1]])
# Retrain on the training rows, then predict on the held-out test rows.
mylearner$train(mytask, row_ids = splits$train)
mylearner$predict(mytask, row_ids = splits$test)

## <PredictionClassif> for 30 observations:
##     row_ids truth response      prob.1       prob.0
##           3     0        1 0.604905852 0.3950941476
##           5     0        0 0.000211427 0.9997885730
##          17     0        0 0.000211427 0.9997885730
## ---                                                
##          92     1        1 0.995207034 0.0047929661
##          98     1        1 0.999405086 0.0005949141
##          99     1        1 0.999405086 0.0005949141
    
# Score the test-set predictions with the same four measures used during
# feature selection.
mylearner$predict(mytask, row_ids = splits$test)$score(msrs(c("classif.auc", "classif.acc", 
                                                              "classif.sensitivity", "classif.specificity")))
##         classif.auc         classif.acc classif.sensitivity classif.specificity 
##           0.9355556           0.8666667           0.8666667           0.8666667

# Importance of the single retained feature in the final forest.
mylearner$importance()
## Petal.Length 
##      31.5415

# Out-of-bag error of the final model.
mylearner$oob_error()
## [1] 0.0521194