library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-10
df = read_csv("insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df
## # A tibble: 1,338 × 7
## age sex bmi children smoker region charges
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 19 female 27.9 0 yes southwest 16885.
## 2 18 male 33.8 1 no southeast 1726.
## 3 28 male 33 3 no southeast 4449.
## 4 33 male 22.7 0 no northwest 21984.
## 5 32 male 28.9 0 no northwest 3867.
## 6 31 female 25.7 0 no southeast 3757.
## 7 46 female 33.4 1 no southeast 8241.
## 8 37 female 27.7 3 no northwest 7282.
## 9 37 male 29.8 2 no northeast 6406.
## 10 60 female 25.8 0 no northwest 28923.
## # ℹ 1,328 more rows
#Data Cleaning
sum(is.na(df))
## [1] 0
df$smoker <- as.factor(df$smoker)
df$region <- as.factor(df$region)
df$sex <- as.factor(df$sex)
insurance_scaled <- df %>%
mutate(across(c(age, bmi, children), scale))
# Look for trends between betwene predistors and cost
# what we started noticng here is the smokers tend to be on the higher cost end
ggplot(df, aes(x=bmi, y=charges, color=smoker)) +
geom_point(alpha=0.6) +
labs(title="Medical Charges by BMI and Smoking Status")

ggplot(df, aes(x=age, y=charges, color=smoker)) +
geom_point(alpha=0.6) +
labs(title="Medical Charges vs Age")

#what we notice as age increase medical charges also go up
# Split data into train and test sets
set.seed(123)
train_index <- sample(1:nrow(df), 0.8 * nrow(df))
train <- df[train_index, ]
test <- df[-train_index, ]
# Simple Regression Model
model_lm <- train(charges ~ age + bmi + children + smoker + region + sex,
data = train,
method = "lm")
summary(model_lm)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11297 -2846 -1005 1542 29791
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12031.63 1132.08 -10.628 < 2e-16 ***
## age 243.43 13.65 17.834 < 2e-16 ***
## bmi 355.06 32.62 10.884 < 2e-16 ***
## children 559.84 156.28 3.582 0.000356 ***
## smokeryes 24172.29 459.97 52.552 < 2e-16 ***
## regionnorthwest -556.52 539.58 -1.031 0.302594
## regionsoutheast -856.23 541.73 -1.581 0.114281
## regionsouthwest -967.82 541.96 -1.786 0.074421 .
## sexmale -216.88 378.52 -0.573 0.566792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6145 on 1061 degrees of freedom
## Multiple R-squared: 0.7533, Adjusted R-squared: 0.7514
## F-statistic: 404.9 on 8 and 1061 DF, p-value: < 2.2e-16
# Predict and evaluate
pred_lm <- predict(model_lm, newdata = test)
postResample(pred_lm, test$charges)
## RMSE Rsquared MAE
## 5763.3845920 0.7339413 3940.1845821
summary(pred_lm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2054 5241 9350 11949 14001 40890
results_lm <- data.frame(
Actual = test$charges,
Predicted = pred_lm
)
head(results_lm)
## Actual Predicted
## 1 11090.718 14882.844
## 2 39611.758 32598.883
## 3 13228.847 15358.255
## 4 4149.736 6367.288
## 5 14451.835 11501.198
## 6 4687.797 4579.663
ggplot(results_lm, aes(x = Actual, y = Predicted)) +
geom_point(color = "darkgreen", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(
title = "Linear Regression: Predicted vs Actual Charges",
x = "Actual Charges ($)",
y = "Predicted Charges ($)"
) +
theme_minimal()

# Now Implmenting Regularization
x <- model.matrix(charges ~ ., data=insurance_scaled)[,-1]
y <- insurance_scaled$charges
set.seed(123)
train_index <- sample(1:nrow(x), 0.8*nrow(x))
x_train <- x[train_index,]
x_test <- x[-train_index,]
y_train <- y[train_index]
y_test <- y[-train_index]
ridge <- cv.glmnet(x_train, y_train, alpha=0)
lasso <- cv.glmnet(x_train, y_train, alpha=1)
pred_ridge <- predict(ridge, s=ridge$lambda.min, newx=x_test)
pred_lasso <- predict(lasso, s=lasso$lambda.min, newx=x_test)
RMSE_ridge <- sqrt(mean((y_test - pred_ridge)^2))
RMSE_lasso <- sqrt(mean((y_test - pred_lasso)^2))
r2_lasso <- cor(y_test, pred_lasso)^2
RMSE_ridge
## [1] 5752.42
RMSE_lasso
## [1] 5767.916
r2_lasso
## s=179.6503
## [1,] 0.7318516
results <- data.frame(
Actual = y_test,
Ridge = as.numeric(pred_ridge),
Lasso = as.numeric(pred_lasso)
)
# Ridge regression plot
ggplot(results, aes(x = Actual, y = Ridge)) +
geom_point(color = "steelblue", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(
title = "Ridge Regression: Predicted vs Actual Charges",
x = "Actual Charges ($)",
y = "Predicted Charges ($)"
) +
theme_minimal()

# Lasso regression plot
ggplot(results, aes(x = Actual, y = Lasso)) +
geom_point(color = "darkorange", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
labs(
title = "Lasso Regression: Predicted vs Actual Charges",
x = "Actual Charges ($)",
y = "Predicted Charges ($)"
) +
theme_minimal()

library(tidyverse)
ridge_coefs <- as.matrix(coef(ridge, s = ridge$lambda.min))
coef_df <- data.frame(
Feature = rownames(ridge_coefs),
Coefficient = ridge_coefs[,1]
)
ggplot(coef_df %>% filter(Feature != "(Intercept)"),
aes(x = reorder(Feature, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(title = "Ridge Regression Coefficients",
x = "Feature", y = "Effect on Predicted Charges") +
theme_minimal()
