insurance project

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(ggplot2)
library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(glmnet)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-10

df = read_csv("insurance.csv")

## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df

## # A tibble: 1,338 × 7
##      age sex      bmi children smoker region    charges
##    <dbl> <chr>  <dbl>    <dbl> <chr>  <chr>       <dbl>
##  1    19 female  27.9        0 yes    southwest  16885.
##  2    18 male    33.8        1 no     southeast   1726.
##  3    28 male    33          3 no     southeast   4449.
##  4    33 male    22.7        0 no     northwest  21984.
##  5    32 male    28.9        0 no     northwest   3867.
##  6    31 female  25.7        0 no     southeast   3757.
##  7    46 female  33.4        1 no     southeast   8241.
##  8    37 female  27.7        3 no     northwest   7282.
##  9    37 male    29.8        2 no     northeast   6406.
## 10    60 female  25.8        0 no     northwest  28923.
## # ℹ 1,328 more rows

#Data Cleaning 
sum(is.na(df))

## [1] 0

df$smoker <- as.factor(df$smoker)
df$region <- as.factor(df$region)
df$sex <- as.factor(df$sex)

insurance_scaled <- df %>%
  mutate(across(c(age, bmi, children), scale))

# Look for trends between betwene predistors and cost 
# what we started noticng here is the smokers tend to be on the higher cost end

ggplot(df, aes(x=bmi, y=charges, color=smoker)) +
  geom_point(alpha=0.6) +
  labs(title="Medical Charges by BMI and Smoking Status")

ggplot(df, aes(x=age, y=charges, color=smoker)) +
  geom_point(alpha=0.6) +
  labs(title="Medical Charges vs Age")

#what we notice as age increase medical charges also go up

# Split data into train and test sets
set.seed(123)
train_index <- sample(1:nrow(df), 0.8 * nrow(df))
train <- df[train_index, ]
test <- df[-train_index, ]

# Simple Regression Model 
model_lm <- train(charges ~ age + bmi + children + smoker + region + sex,
                  data = train,
                  method = "lm")
summary(model_lm)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11297  -2846  -1005   1542  29791 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -12031.63    1132.08 -10.628  < 2e-16 ***
## age                243.43      13.65  17.834  < 2e-16 ***
## bmi                355.06      32.62  10.884  < 2e-16 ***
## children           559.84     156.28   3.582 0.000356 ***
## smokeryes        24172.29     459.97  52.552  < 2e-16 ***
## regionnorthwest   -556.52     539.58  -1.031 0.302594    
## regionsoutheast   -856.23     541.73  -1.581 0.114281    
## regionsouthwest   -967.82     541.96  -1.786 0.074421 .  
## sexmale           -216.88     378.52  -0.573 0.566792    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6145 on 1061 degrees of freedom
## Multiple R-squared:  0.7533, Adjusted R-squared:  0.7514 
## F-statistic: 404.9 on 8 and 1061 DF,  p-value: < 2.2e-16

# Predict and evaluate
pred_lm <- predict(model_lm, newdata = test)
postResample(pred_lm, test$charges)

##         RMSE     Rsquared          MAE 
## 5763.3845920    0.7339413 3940.1845821

summary(pred_lm)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -2054    5241    9350   11949   14001   40890

results_lm <- data.frame(
  Actual = test$charges,
  Predicted = pred_lm
)
head(results_lm)

##      Actual Predicted
## 1 11090.718 14882.844
## 2 39611.758 32598.883
## 3 13228.847 15358.255
## 4  4149.736  6367.288
## 5 14451.835 11501.198
## 6  4687.797  4579.663

ggplot(results_lm, aes(x = Actual, y = Predicted)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Linear Regression: Predicted vs Actual Charges",
    x = "Actual Charges ($)",
    y = "Predicted Charges ($)"
  ) +
  theme_minimal()

# Now Implmenting Regularization 
x <- model.matrix(charges ~ ., data=insurance_scaled)[,-1]
y <- insurance_scaled$charges



set.seed(123)
train_index <- sample(1:nrow(x), 0.8*nrow(x))
x_train <- x[train_index,]
x_test <- x[-train_index,]
y_train <- y[train_index]
y_test <- y[-train_index]


ridge <- cv.glmnet(x_train, y_train, alpha=0)
lasso <- cv.glmnet(x_train, y_train, alpha=1)

pred_ridge <- predict(ridge, s=ridge$lambda.min, newx=x_test)
pred_lasso <- predict(lasso, s=lasso$lambda.min, newx=x_test)

RMSE_ridge <- sqrt(mean((y_test - pred_ridge)^2))
RMSE_lasso <- sqrt(mean((y_test - pred_lasso)^2))
r2_lasso <- cor(y_test, pred_lasso)^2

RMSE_ridge

## [1] 5752.42

RMSE_lasso

## [1] 5767.916

r2_lasso

##      s=179.6503
## [1,]  0.7318516

results <- data.frame(
  Actual = y_test,
  Ridge = as.numeric(pred_ridge),
  Lasso = as.numeric(pred_lasso)
)

# Ridge regression plot
ggplot(results, aes(x = Actual, y = Ridge)) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Ridge Regression: Predicted vs Actual Charges",
    x = "Actual Charges ($)",
    y = "Predicted Charges ($)"
  ) +
  theme_minimal()

# Lasso regression plot
ggplot(results, aes(x = Actual, y = Lasso)) +
  geom_point(color = "darkorange", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  labs(
    title = "Lasso Regression: Predicted vs Actual Charges",
    x = "Actual Charges ($)",
    y = "Predicted Charges ($)"
  ) +
  theme_minimal()

library(tidyverse)

ridge_coefs <- as.matrix(coef(ridge, s = ridge$lambda.min))
coef_df <- data.frame(
  Feature = rownames(ridge_coefs),
  Coefficient = ridge_coefs[,1]
)

ggplot(coef_df %>% filter(Feature != "(Intercept)"),
       aes(x = reorder(Feature, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(title = "Ridge Regression Coefficients",
       x = "Feature", y = "Effect on Predicted Charges") +
  theme_minimal()

insurance project

Rishi Rana

2025-10-21