Insurance Claim Prediction

Code

load("E:/Personal Website/blogs/claim prediction/telematics.RData")
dat_1 <- dat_1 |>
  mutate(across(where(is.character), as.factor))
dat_1$AMT_Claim <- scale(dat_1$AMT_Claim)

Project Goal

The main goal is how to develop a model that predicts the amount of claim(AMT Clain) using telematics.RData. For additional information regarding the data, refer to https://www.mdpi.com/2227-9091/9/4/58.

Exploratory Data Analysis

There are no missing values. We found 84 observations with negative car age. Negative car age does not make sense so we exclude these observations. Using the findCorrelation() function in R, we identify variables highly correlated(\(>= 0.8\)) with other variables. The following variables are then excluded. We also exclude the number of claims during observations(NB_claim). NB_claim is considered as a response in the original article.

Code

tele_df <- dat_1 |> dplyr::filter(Car.age >= 0) |>
  mutate(across(where(is.character), as.factor))

#percentage of missing values
naniar::pct_miss(tele_df)

[1] 0

Code

num_variables <- select(tele_df, where(is.numeric)) |> select(-AMT_Claim)
cor_matrix <- cor(num_variables, use = "complete.obs")
high_cor_vars <- findCorrelation(cor_matrix, cutoff = .8, names = TRUE)

matrix(high_cor_vars,  nrow = 4, ncol = 5, byrow = TRUE) |> kableExtra::kable()

Brake.11miles	Accel.11miles	Accel.09miles	Brake.12miles	Accel.12miles
Brake.09miles	Brake.14miles	Pct.drive.wkday	Pct.drive.wkend	Brake.08miles
Left.turn.intensity10	Left.turn.intensity09	Left.turn.intensity11	Left.turn.intensity08	Right.turn.intensity11
Right.turn.intensity10	Right.turn.intensity09	Right.turn.intensity12	Pct.drive.3hrs	Brake.11miles

Code

tele_df_clean <- select(tele_df, -any_of(c(high_cor_vars, "NB_Claim")))

My final dataset for modeling has 32 variables with 3780 observations.

Code

summarise(tele_df_clean,
          across(
            where(is.numeric),
            .fns = list(
              min = min,
              median = median,
              mean = mean,
              stdev = sd,
              q25 = ~ quantile(., 0.25),
              q75 = ~ quantile(., 0.75),
              max = max
            ),
            .names = "{.col}_{.fn}"
          )) |>
  pivot_longer(
    cols = everything(),
    names_to = c("variable", "stat"),
    names_pattern = "^(.*)_(min|median|mean|stdev|q25|q75|max)$"
  ) |>
  pivot_wider(names_from = stat, values_from = value) |>
  arrange(variable) |> #alphabetical order
  kable(digits = 3, caption = "Summary statistics") |>
  kable_styling(full_width = TRUE,
                bootstrap_options = c("striped", "hover"))

Summary statistics
variable	min	median	mean	stdev	q25	q75	max
AMT_Claim	-0.659	-0.296	-0.019	0.961	-0.517	0.076	18.606
Accel.06miles	0.000	31.000	52.888	63.514	14.000	67.000	621.000
Accel.08miles	0.000	2.000	5.445	11.809	1.000	5.000	151.000
Accel.14miles	0.000	0.000	0.329	1.841	0.000	0.000	51.000
Annual.miles.drive	683.508	9320.565	9868.095	3994.933	6213.710	12427.420	31068.550
Annual.pct.driven	0.036	0.848	0.751	0.250	0.539	0.975	1.000
Avgdays.week	0.963	6.032	5.832	0.871	5.415	6.478	7.000
Brake.06miles	2.000	78.000	102.026	84.244	43.000	133.000	621.000
Car.age	0.000	4.000	4.656	3.603	2.000	7.000	18.000
Credit.score	453.000	790.000	768.181	91.891	714.000	838.000	900.000
Duration	181.000	366.000	346.437	52.427	365.000	366.000	366.000
Insured.age	18.000	47.000	46.810	14.534	35.000	57.000	90.000
Left.turn.intensity12	0.000	1.000	1318.778	20084.448	0.000	9.000	441232.000
Pct.drive.2hrs	0.000	0.002	0.005	0.007	0.001	0.006	0.094
Pct.drive.4hrs	0.000	0.000	0.000	0.001	0.000	0.000	0.045
Pct.drive.fri	0.007	0.156	0.158	0.026	0.143	0.170	0.400
Pct.drive.mon	0.015	0.140	0.140	0.024	0.126	0.153	0.309
Pct.drive.rush.am	0.000	0.083	0.099	0.071	0.045	0.137	0.406
Pct.drive.rush.pm	0.001	0.140	0.147	0.062	0.106	0.178	0.436
Pct.drive.sat	0.003	0.135	0.137	0.036	0.116	0.155	0.516
Pct.drive.sun	0.006	0.111	0.111	0.033	0.091	0.130	0.305
Pct.drive.thr	0.009	0.155	0.156	0.026	0.142	0.169	0.498
Pct.drive.tue	0.003	0.147	0.150	0.027	0.134	0.163	0.402
Pct.drive.wed	0.000	0.148	0.149	0.025	0.133	0.162	0.355
Right.turn.intensity08	0.000	357.000	1472.644	15911.977	59.000	1068.000	380712.000
Territory	12.000	63.000	56.649	23.183	36.000	76.000	91.000
Total.miles.driven	67.224	7772.176	8741.845	5469.612	4759.608	11361.270	41019.575
Years.noclaims	0.000	22.000	23.545	14.904	10.000	35.000	74.000

Code

tele_df_clean |>
  select(where(is.character), where(is.factor)) |>
  pivot_longer(cols = everything(),
               names_to = "variable",
               values_to = "level") |>
  group_by(variable, level) |>
  summarise(count = n(), .groups = "drop_last") |>
  mutate(prop = count / sum(count)) |> ungroup() |>
  arrange(variable, desc(count)) |>
  kable(digits = 3, caption = "Frequency and Proportion of Categorical Variables") |>
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover"))

Frequency and Proportion of Categorical Variables
variable	level	count	prop
Car.use	Commute	2259	0.598
Car.use	Private	1348	0.357
Car.use	Commercial	152	0.040
Car.use	Farmer	21	0.006
Insured.sex	Male	2011	0.532
Insured.sex	Female	1769	0.468
Marital	Married	2444	0.647
Marital	Single	1336	0.353
Region	Urban	3113	0.824
Region	Rural	667	0.176

Code

ggplot(tele_df_clean, aes(x = AMT_Claim)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    fill = "skyblue",
    color = "black",
    alpha = 0.7
  )

The distribution of standardized claims is right skewed.

Code

split <- sample.split(tele_df_clean$AMT_Claim, SplitRatio = .8)
train_df <- subset(tele_df_clean,split == TRUE)
test_df <- subset(tele_df_clean, split == FALSE)
cat("Train data sample size:", nrow(train_df))

Train data sample size: 3024

Code

cat("Test data sample size:", nrow(test_df))

Test data sample size: 756

Models

Regression Tree

Code

insur_tree <- rpart(AMT_Claim ~ .,
                    data = train_df,
                    method = "anova",
                    "cp" = .008)
rpart.plot(insur_tree)

Code

insur_tree_importance <- with(insur_tree, tibble(
  variable = names(variable.importance),
  importance = as.numeric(variable.importance)
)) |> arrange(desc(importance)) |> slice_head(n = 100)

ggplot(insur_tree_importance, 
       aes(x = factor(variable, levels = rev(variable)), y = importance)) +
  geom_col(fill = "steelblue") +
  labs(x = "variable") +
  coord_flip()

Code

amt_claim_hat <- predict(insur_tree, newdata = test_df)
insur_treemse <- mean((test_df$AMT_Claim - amt_claim_hat)^2)
cat("MSE of the regression tree before pruning:",
    insur_treemse |> round(3))

MSE of the regression tree before pruning: 0.802

Code

cptable <- insur_tree$cptable
ggplot(cptable, aes(nsplit, xerror)) +
  geom_point() +
  geom_line()

Code

#prune the trees
ins_bestcp <- cptable[which.min(cptable[, "xerror"]), "CP"]
insur_pruntree <- prune(insur_tree, cp = ins_bestcp)

rpart.plot(insur_pruntree)

Code

with(insur_pruntree,
     tibble(
       variable = names(variable.importance),
       importance = as.numeric(variable.importance)
     )) |>
  ggplot(aes(x = factor(variable, levels = rev(variable)), y = importance)) +
  geom_col(fill = "steelblue") +
  labs(x = "variable") +
  coord_flip()

Code

#after prunning prediction
prune_amtclaim_hat <- predict(insur_pruntree, newdata = test_df)
prune_mse <- mean((test_df$AMT_Claim - prune_amtclaim_hat)^2)
cat("MSE after pruning:", prune_mse)

MSE after pruning: 0.8019689

Code

mse_dif <- ((insur_treemse - prune_mse) / (insur_treemse)) * 100
cat("MSE reduction:", mse_dif |> round(3))

MSE reduction: 0

Bagging

Code

#---------------------choosing best ntrees----------------
#| fig-cap: "Optimal number of trees for bagging"
#---------------------trees to grow--------------------------

ntree_values <- c(50, 100, 200, 300, 500)
# # Run cross-validation in parallel
# bagging_cverrors <- foreach(n = ntree_values,
#                             .combine = rbind,
#                             .packages = "caret") %dopar% {
#                               model <- train(
#                                 AMT_Claim ~ .,
#                                 data = train_df,
#                                 method = "rf",
#                                 trControl = trainControl(method = "cv", number = 10),
#                                 ntree = n,
#                                 importance = TRUE
#                               )
# 
# data.frame(ntree = n, RMSE = min(model$results$RMSE))
# }
# bagging_cverrors <- write_csv(bagging_cverrors, "bagging_cverrors")
# bagging_cverrors <- read_csv("bagging_cverrors")
# ggplot(bagging_cverrors, aes(ntree, RMSE))+
# geom_point()+
# geom_line()
# #
# # bagging_bestntree <- bagging_cverrors$ntree[which.min(bagging_cverrors$RMSE)]

Code

# #-----------------bagging model---------------------------------
# #---------------------------------------------------------------------
insur_bagging <- randomForest(
  AMT_Claim ~ .,
  data = train_df,
  importance = TRUE,
  ntree = 500,
  mtry = floor(ncol(train_df) - 1)
)
# varImpPlot(insur_bagging, main = "")

#variable importance plot using ggplot
importance(insur_bagging) %>% as.data.frame() %>%
  mutate(variable = rownames(.)) %>%
  rename(importance= `%IncMSE`) %>%
  arrange(desc(importance)) %>%
  ggplot(aes(x = reorder(variable, importance), y = importance))+
  geom_bar(stat = "identity", fill = "steelblue")+
  coord_flip()

Code

insurbag_hat <- predict(insur_bagging, newdata = test_df)
bagging_mse <- mean((test_df$AMT_Claim- insurbag_hat)^2)
cat("Bagging MSE:", bagging_mse)

Bagging MSE: 0.4942471

Boosting

Code

#---------------------------boosting ntree search--------------------------------------------
# boosting_cv <- foreach(n = ntree_values,
#                        .combine = rbind,
#                        .packages = "gbm") %dopar% {
#                          set.seed(123)
#                          model <- gbm(
#                            AMT_Claim ~ .,
#                            data = train_df,
#                            distribution = "gaussian",
#                            n.trees = n,
#                            shrinkage = 0.01,
#                            interaction.depth = 1,
#                            n.minobsinnode = 10,
#                            cv.folds = 10,
#                            verbose = FALSE
#                          )
#                          
#                          data.frame(n_tree = n, cv_error = min(model$cv.error))
#                        }
# 
# boosting_cv |>
# ggplot(aes(n_tree, cv_error))+
# geom_point()+
# geom_line()
# boosting_bestntree <- boosting_cv$n_tree[which.min(boosting_cv$cv_error)]
insur_boosting <- gbm(
  AMT_Claim ~ .,
  data = train_df,
  distribution = "gaussian",
  n.trees = 500,
  interaction.depth = 1
)
insur_boostingsummary <- summary(insur_boosting, plotit = FALSE)
ggplot(insur_boostingsummary, aes(x = reorder(var, rel.inf), y = rel.inf)) +
  geom_col(fill = "steelblue") +
  coord_flip()+
  labs(x = "Variable", y = "Relative Influence")

Code

#---------------------------boosting prediction--------------------------
boost_amthat <- predict(insur_boosting, newdata = test_df)
boosting_insurmse <- mean((test_df$AMT_Claim - boost_amthat)^2)
cat("Boosting MSE:", boosting_insurmse)

Boosting MSE: 0.6746562

Random Forest

Code

rf_claim <- randomForest(
  AMT_Claim ~ .,
  data = train_df,
  importance = TRUE,
  mtry = floor(sqrt(ncol(train_df) - 1)),
  ntree = 500
)
# 
# varImpPlot(rf_claim, main = "")

#variable importance with ggplot
importance(rf_claim) %>% as.data.frame() %>%
  mutate(variable = rownames(.)) %>%
  rename(importance= `%IncMSE`) %>%
  arrange(desc(importance)) %>%
  ggplot(aes(x = reorder(variable, importance), y = importance))+
  geom_bar(stat = "identity", fill = "steelblue")+
  coord_flip()

Code

#predict on test set and compute mse
rf_yhat <- predict(rf_claim, newdata = test_df)
rf_mse <- mean((test_df$AMT_Claim - rf_yhat)^2)
cat("RandomForest MSE:", rf_mse)

RandomForest MSE: 0.4387962

Model Comparison

Code

tibble(
  method = c("Pruned regression tree", "Bagging", "Boosting", "RandomForest"),
  mse = c(prune_mse, bagging_mse, boosting_insurmse, rf_mse)
) |>
  kable(digits = 3, caption = "Model comparison")

Model comparison
method	mse
Pruned regression tree	0.802
Bagging	0.494
Boosting	0.675
RandomForest	0.439