knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
# --- Core Data Manipulation ---
library(tidyverse)
library(lubridate)
library(stringr)
library(readr)
library(dplyr)
library(tidyr)
library(janitor)
# --- Machine Learning ---
library(fastDummies)
library(smotefamily)
library(xgboost)
library(caret)
library(ranger)
library(rpart)
library(pROC)
# --- Visualization ---
library(ggplot2)
library(gridExtra)
library(scales)
library(corrplot)
library(viridis)
library(knitr)
library(kableExtra)
library(plotly)
# Raw training extracts.
# The data folder is defined once so the path only has to change in one place
# when the project moves to another machine (previously it was repeated 9x).
data_dir <- "/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience"

Demographics        <- read_csv(file.path(data_dir, "customer_demographics_train.csv"))
StatusData          <- read_csv(file.path(data_dir, "customer_status_level_train.csv"))
MRR                 <- read_csv(file.path(data_dir, "customer_monthly_recurring_revenue_train.csv"))
Revenue             <- read_csv(file.path(data_dir, "customer_revenue_history_train.csv"))
Support             <- read_csv(file.path(data_dir, "support_ticket_activity_train.csv"))
Bugs                <- read_csv(file.path(data_dir, "product_bug_reports_train.csv"))
Newsletter          <- read_csv(file.path(data_dir, "newsletter_engagement_train.csv"))
Satisfaction_scores <- read_csv(file.path(data_dir, "customer_satisfaction_scores_train.csv"))
Region_industry     <- read_csv(file.path(data_dir, "customer_region_and_industry_train.csv"))
Automatic data quality analysis for all datasets (Dimensions, Duplicates, Missing Values):
# --- 1. DEFINE FUNCTION ---
# Prints a data-quality report for one dataset: dimensions, duplicate rows,
# and a per-column table of type, missing counts/percentages and unique values.
#
# Args:
#   df           data frame to profile
#   dataset_name label used in the subheader and the table caption
#
# Returns: invisible NULL; all output is printed (intended for knitr chunks
# with results = "asis").
create_quality_report <- function(df, dataset_name) {
  # Print Subheader
  cat("\n### ", dataset_name, " Analysis\n")
  # Dimensions
  cat("- **Rows:** ", nrow(df), "\n")
  cat("- **Columns:** ", ncol(df), "\n")
  # Duplicate Check
  dup_count <- sum(duplicated(df))
  cat("- **Duplicate Rows:** ", dup_count,
      ifelse(dup_count > 0, " (WARNING!)", ""), "\n\n")
  # Create Detailed Table.
  # BUG FIX: sapply(df, class) returns a *list* when any column has more than
  # one class (e.g. POSIXct/POSIXt), which breaks data.frame(). vapply with the
  # classes collapsed into a single string is always a character vector.
  summary_df <- data.frame(
    Column = names(df),
    Type = vapply(df, function(x) paste(class(x), collapse = "/"), character(1)),
    Missing_Count = colSums(is.na(df)),
    Missing_Percent = round(colSums(is.na(df)) / nrow(df) * 100, 2),
    Unique_Values = vapply(df, function(x) length(unique(x)), integer(1))
  )
  # Print Table (in HTML format)
  print(
    kable(summary_df, caption = paste(dataset_name, "Details"), row.names = FALSE) %>%
      kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                    full_width = FALSE, position = "left") %>%
      scroll_box(height = "250px")
  )
  cat("\n---\n") # Separator line
  invisible(NULL)
}
# --- 2. EXECUTE FUNCTION (For all 9 Datasets) ---
create_quality_report(Demographics, "1. Demographics")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| CUS ID | character | 0 | 0 | 2068 |
| Customer Age (Months) | numeric | 0 | 0 | 72 |
create_quality_report(StatusData, "2. Customer Status")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| Customer ID | character | 0 | 0.00 | 2068 |
| Status | character | 0 | 0.00 | 3 |
| Customer Level | character | 1 | 0.05 | 4 |
create_quality_report(MRR, "3. Monthly Recurring Revenue (MRR)")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| Customer ID | character | 0 | 0 | 1056 |
| MRR | character | 0 | 0 | 795 |
create_quality_report(Revenue, "4. Revenue History")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| Customer ID | character | 0 | 0 | 712 |
| Total Revenue | character | 0 | 0 | 570 |
create_quality_report(Support, "5. Support Tickets")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| Customer ID | character | 0 | 0 | 1072 |
| Help Ticket Count | numeric | 0 | 0 | 94 |
| Help Ticket Lead Time (hours) | numeric | 0 | 0 | 767 |
create_quality_report(Bugs, "6. Bug Reports")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| Customer ID | character | 0 | 0 | 502 |
| Product Bug Task Count | numeric | 0 | 0 | 161 |
create_quality_report(Newsletter, "7. Newsletter Engagement")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| Customer ID | character | 0 | 0 | 199 |
| Company Newsletter Interaction Count | numeric | 0 | 0 | 21 |
create_quality_report(Satisfaction_scores, "8. Satisfaction Scores")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| Customer ID | character | 0 | 0.00 | 502 |
| Year | numeric | 0 | 0.00 | 1 |
| Quarter | numeric | 0 | 0.00 | 3 |
| Survey Date | Date | 0 | 0.00 | 3 |
| Response Date | Date | 0 | 0.00 | 144 |
| How likely are you to recommend insider to a friend or colleague | numeric | 0 | 0.00 | 11 |
| How would you rate the value you gain from our company | numeric | 0 | 0.00 | 5 |
| How frequently are you using our platform | character | 3 | 0.19 | 4 |
| Please rate the overall quality of our products | numeric | 0 | 0.00 | 6 |
| Please rate the usability of the panel | numeric | 47 | 2.99 | 6 |
| Please rate your understanding of our reporting capabilities in the panel | character | 0 | 0.00 | 5 |
create_quality_report(Region_industry, "9. Region & Industry")
| Column | Type | Missing_Count | Missing_Percent | Unique_Values |
|---|---|---|---|---|
| Customer ID | character | 0 | 0.00 | 2068 |
| Region | character | 1 | 0.05 | 24 |
| Vertical | character | 48 | 2.32 | 23 |
| Subvertical | character | 71 | 3.43 | 51 |
# Drop the exact-duplicate rows flagged by the quality reports above.
Bugs <- distinct(Bugs)
Satisfaction_scores <- distinct(Satisfaction_scores)
cat("Cleaning Complete: Duplicates removed from Bugs and Satisfaction datasets.\n")
## Cleaning Complete: Duplicates removed from Bugs and Satisfaction datasets.
As a first step toward our churn-prediction objective, we examine the distribution of customer status, which serves as the target variable.
# Bar chart of the 'Status' column in StatusData (our target variable),
# labelled with both the count and the percentage share of each class.
StatusData %>%
  count(Status, name = "num") %>%
  mutate(percent = round(100 * num / sum(num), 1)) %>%
  ggplot(aes(x = Status, y = num, fill = Status)) +
  geom_col() +
  geom_text(aes(label = paste0(num, " (%", percent, ")")), vjust = -0.5) +
  labs(title = "Customer Status Distribution (Target Variable)",
       x = "Status",
       y = "number of customer") +
  theme_minimal() +
  theme(legend.position = "none")
Before merging, we standardize the key identifier column to ‘Customer_ID’ across all datasets to ensure consistency.
# Harmonize the join key: every table ends up with a 'Customer_ID' column.
# Backticks are required because the raw column names contain spaces.
# Demographics is the odd one out ('CUS ID'); all the others use 'Customer ID'.
Demographics <- rename(Demographics, Customer_ID = `CUS ID`)

# Small helper so the same rename is not spelled out eight times.
id_fix <- function(df) rename(df, Customer_ID = `Customer ID`)

StatusData          <- id_fix(StatusData)
MRR                 <- id_fix(MRR)
Revenue             <- id_fix(Revenue)
Support             <- id_fix(Support)
Bugs                <- id_fix(Bugs)
Newsletter          <- id_fix(Newsletter)
Satisfaction_scores <- id_fix(Satisfaction_scores)
Region_industry     <- id_fix(Region_industry)
cat("Column standardization complete. All datasets now use 'Customer_ID' as the key.\n")
## Column standardization complete. All datasets now use 'Customer_ID' as the key.
We are summarizing the detailed questions in the satisfaction survey to create a single ‘Average Satisfaction Score’ for each customer.
# Collapse the survey's numeric questions into one 'Avg_Satisfaction' per
# customer: first a per-response mean across questions, then the mean of
# those response-level scores per customer.
Satisfaction_Feat <- Satisfaction_scores %>%
  # Keep the ID plus the numeric columns only
  select(Customer_ID, where(is.numeric)) %>%
  # Year/Quarter are numeric but are not satisfaction scores
  select(-Year, -Quarter) %>%
  # Per-response average over the remaining question columns
  mutate(Instance_Score = rowMeans(across(-Customer_ID), na.rm = TRUE)) %>%
  # One row per customer
  group_by(Customer_ID) %>%
  summarise(Avg_Satisfaction = mean(Instance_Score, na.rm = TRUE))
cat("Completed: There is a single satisfaction score for each customer.\n")
## Completed: There is a single satisfaction score for each customer.
head(Satisfaction_Feat)
## # A tibble: 6 × 2
## Customer_ID Avg_Satisfaction
## <chr> <dbl>
## 1 CUS-1002400 5.27
## 2 CUS-1006000 6.67
## 3 CUS-1019500 5.48
## 4 CUS-1025800 4.88
## 5 CUS-1030400 5.04
## 6 CUS-1068750 5.5
We are summarizing transaction-based tables into a single row per customer (Aggregation).
# Summarize the transaction-level tables to one row per customer.

# 1. Support Tickets: total number of support requests per customer
Support_Agg <- Support %>%
  group_by(Customer_ID) %>%
  summarise(Total_Tickets = n()) # n() counts the number of rows

# 2. Revenue History: total spend per customer.
# BUG FIX: the previous version called grep() on the whole data frame (invalid)
# and then ran parse_number() across *every* character column, which also
# destroyed the Customer_ID values (e.g. "CUS-123" -> 123) before grouping.
# Only the revenue column is parsed now.
Revenue_Agg <- Revenue %>%
  mutate(amount_clean = parse_number(as.character(`Total Revenue`))) %>%
  group_by(Customer_ID) %>%
  summarise(
    Total_Revenue = sum(amount_clean, na.rm = TRUE),
    Transaction_Count = n()
  )

# 3. Bug Reports: number of bug reports per customer
Bugs_Agg <- Bugs %>%
  group_by(Customer_ID) %>%
  summarise(Bug_Count = n())

# 4. MRR: mean monthly recurring revenue (stored as text in the raw file)
MRR_Agg <- MRR %>%
  mutate(MRR_Clean = parse_number(as.character(MRR))) %>%
  group_by(Customer_ID) %>%
  summarise(MRR = mean(MRR_Clean, na.rm = TRUE))

# 5. Newsletter: total interaction count per customer
Newsletter_Agg <- Newsletter %>%
  group_by(Customer_ID) %>%
  summarise(Newsletter_Count = sum(`Company Newsletter Interaction Count`, na.rm = TRUE))

cat("Completed: Support, Revenue, Bugs, MRR, Newsletter tables have been summarized.\n")
## Completed: Support, Revenue, Bugs, MRR, Newsletter tables have been summarized.
# Coerce every table's join key to character so the left_join() calls below
# cannot fail on a type mismatch between tables.
Demographics      <- mutate(Demographics, Customer_ID = as.character(Customer_ID))
Region_industry   <- mutate(Region_industry, Customer_ID = as.character(Customer_ID))
StatusData        <- mutate(StatusData, Customer_ID = as.character(Customer_ID))
MRR_Agg           <- mutate(MRR_Agg, Customer_ID = as.character(Customer_ID))
Newsletter_Agg    <- mutate(Newsletter_Agg, Customer_ID = as.character(Customer_ID))
Revenue_Agg       <- mutate(Revenue_Agg, Customer_ID = as.character(Customer_ID))
Support_Agg       <- mutate(Support_Agg, Customer_ID = as.character(Customer_ID))
Bugs_Agg          <- mutate(Bugs_Agg, Customer_ID = as.character(Customer_ID))
Satisfaction_Feat <- mutate(Satisfaction_Feat, Customer_ID = as.character(Customer_ID))
cat("All Customer_ID columns have been set to 'Character' format.\n")
## All Customer_ID columns have been set to 'Character' format.
We are merging the summarized tables created during Feature Engineering into the main dataset.
# Assemble the master analytic table: one row per customer, every feature
# table left-joined (in the same order as before) onto the Demographics
# backbone by Customer_ID.
Master_Data <- list(
  Region_industry, StatusData, MRR_Agg, Newsletter_Agg,
  Revenue_Agg, Support_Agg, Bugs_Agg, Satisfaction_Feat
) %>%
  reduce(left_join, by = "Customer_ID", .init = Demographics)
cat("Master Dataset Created Successfully (Aggregated).\n")
## Master Dataset Created Successfully (Aggregated).
cat("Final Dimensions: ", nrow(Master_Data), " Rows, ", ncol(Master_Data), " Columns\n")
## Final Dimensions: 2068 Rows, 14 Columns
# Display the table
kable(head(Master_Data), caption = "Final Master Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F) %>%
scroll_box(width = "100%")
| Customer_ID | Customer Age (Months) | Region | Vertical | Subvertical | Status | Customer Level | MRR | Newsletter_Count | Total_Revenue | Transaction_Count | Total_Tickets | Bug_Count | Avg_Satisfaction |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CUS-1471650 | 24 | Malaysia | Media and Publishing | Content Websites | Onboarding | Enterprise | 196 | NA | NA | NA | NA | NA | NA |
| CUS-1087300 | 25 | Australia | Transportation and Logistics | Transportation and Logistics | Churn | Semi-Enterprise | 3714 | NA | NA | NA | NA | NA | NA |
| CUS-230550 | 12 | Turkey | Retail | Fashion Retailer | Churn | Long-tail | NA | NA | NA | NA | NA | NA | NA |
| CUS-1470950 | 22 | Thailand | Retail | Marketplace | Retained | Enterprise | 14297 | 3 | NA | NA | 1 | 6 | 4.7500 |
| CUS-1526100 | 11 | Vietnam | Other | Other | Retained | Semi-Enterprise | 2250 | 2 | NA | NA | 1 | 4 | 5.4375 |
| CUS-1507900 | 12 | Vietnam | Other | Other | Churn | Long-tail | NA | 1 | NA | NA | 1 | NA | NA |
We address the missing values resulting from the merge process. Strategy: 1. Activity counts (Tickets, Bugs, Revenue) are filled with 0 (meaning no activity). 2. Satisfaction scores are imputed with the Median value to maintain distribution.
library(readr)
library(tidyr)
library(dplyr)
# Missing-value strategy after the joins:
#   - activity columns (tickets, bugs, revenue, MRR, newsletter) -> 0,
#     because a missing value means "no recorded activity";
#   - Avg_Satisfaction -> median, to keep the distribution intact.
Master_Data <- Master_Data %>%
  mutate(
    # 1. Make sure the money/count columns are numeric before imputing
    MRR = as.numeric(as.character(MRR)),
    Total_Revenue = as.numeric(as.character(Total_Revenue)),
    Newsletter_Count = as.numeric(as.character(Newsletter_Count))
  ) %>%
  mutate(
    # 2. Zero-fill all activity columns in one pass
    across(c(Total_Tickets, Bug_Count, Total_Revenue, Transaction_Count,
             MRR, Newsletter_Count),
           ~ replace_na(.x, 0)),
    # Median-fill the satisfaction score
    Avg_Satisfaction = coalesce(Avg_Satisfaction,
                                median(Avg_Satisfaction, na.rm = TRUE))
  )
# 3. Verify Imputation
cat("\n--------------------------------\n")
##
## --------------------------------
cat("Missing Values AFTER Imputation:\n")
## Missing Values AFTER Imputation:
print(colSums(is.na(Master_Data)))
## Customer_ID Customer Age (Months) Region
## 0 0 1
## Vertical Subvertical Status
## 48 71 0
## Customer Level MRR Newsletter_Count
## 1 0 0
## Total_Revenue Transaction_Count Total_Tickets
## 0 0 0
## Bug_Count Avg_Satisfaction
## 0 0
# Turn the remaining categorical gaps into an explicit "Unknown" level
# instead of leaving NA values behind.
Master_Data <- Master_Data %>%
  mutate(across(c(Region, Vertical, Subvertical, `Customer Level`),
                ~ replace_na(.x, "Unknown")))
cat("\n--------------------------------\n")
##
## --------------------------------
cat("Final check:\n")
## Final check:
print(colSums(is.na(Master_Data)))
## Customer_ID Customer Age (Months) Region
## 0 0 0
## Vertical Subvertical Status
## 0 0 0
## Customer Level MRR Newsletter_Count
## 0 0 0
## Total_Revenue Transaction_Count Total_Tickets
## 0 0 0
## Bug_Count Avg_Satisfaction
## 0 0
# 1. Define the function here first (since section 4.3 has not run yet)
# Winsorize a numeric vector at its 1st and 99th percentiles.
#
# Args:
#   x numeric vector (NAs allowed; they pass through unchanged)
# Returns: x with values below the 1st percentile raised to it and values
#   above the 99th percentile lowered to it.
#
# BUG FIX: the original `x[x > q] <- q` form fails with
# "NAs are not allowed in subscripted assignments" whenever x contains NA,
# even though quantile() was already called with na.rm = TRUE.
# pmin()/pmax() clamp the same way but simply propagate NA.
cap_outliers <- function(x) {
  bounds <- quantile(x, probs = c(0.01, 0.99), na.rm = TRUE)
  pmin(pmax(x, bounds[[1]]), bounds[[2]])
}
# 2. Drop the corrupted Revenue columns from Master_Data
Master_Data <- Master_Data %>%
  select(-Total_Revenue, -Transaction_Count)

# 3. Rebuild the revenue aggregate from the raw table with aggressive
#    cleaning: stringify every cell, pivot long, and parse each non-ID
#    value as a number regardless of its original formatting.
Revenue_Agg_Fixed <- Revenue %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(-Customer_ID, values_to = "raw_value") %>%
  group_by(Customer_ID) %>%
  summarise(
    Total_Revenue = sum(parse_number(raw_value), na.rm = TRUE),
    Transaction_Count = n()
  )

# 4./5. Re-attach to Master_Data, zero-fill customers with no purchases,
#       and winsorize the rebuilt revenue column in one pass.
Master_Data <- Master_Data %>%
  left_join(Revenue_Agg_Fixed, by = "Customer_ID") %>%
  mutate(
    Total_Revenue = replace_na(Total_Revenue, 0),
    Transaction_Count = replace_na(Transaction_Count, 0),
    Total_Revenue_Capped = cap_outliers(Total_Revenue)
  )
cat("Revenue table repaired and Outlier capping applied.\n")
## Revenue table repaired and Outlier capping applied.
cat("New Average Revenue: ", mean(Master_Data$Total_Revenue, na.rm = TRUE), "\n")
## New Average Revenue: 4603337
Understand the relationships between numeric variables.
library(corrplot)
# Pairwise correlations among the numeric features (complete cases only).
# Transaction_Count is dropped because it is redundant/near-constant here.
numeric_vars <- Master_Data %>%
  select(where(is.numeric), -Transaction_Count)

cor_matrix <- cor(numeric_vars, use = "complete.obs")

# Upper-triangle heatmap with the coefficients printed in each cell.
corrplot(cor_matrix,
         method = "color",
         type = "upper",
         tl.col = "black",
         tl.srt = 45,          # rotate the variable labels
         addCoef.col = "black",
         number.cex = 0.7,
         diag = FALSE,
         title = "Feature Correlation Matrix",
         mar = c(0, 0, 1, 0))
summary(Master_Data$Total_Revenue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 4.603e+06 2.130e+03 2.790e+09
Analyze the distribution of the ‘Avg_Satisfaction’ score.
# Distribution of the average satisfaction score: density-scaled histogram
# with a kernel-density overlay.
ggplot(Master_Data, aes(x = Avg_Satisfaction)) +
  # after_stat(density) replaces the deprecated `..density..` pronoun
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.5,
                 fill = "#2A9D8F", color = "white", alpha = 0.7) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines (ggplot2 >= 3.4)
  geom_density(color = "#E76F51", linewidth = 1) +
  labs(title = "Distribution of Average Satisfaction Scores",
       subtitle = "Histogram with Density Curve",
       x = "Average Satisfaction Score",
       y = "Density") +
  theme_minimal()
## 4.3 Outlier Detection and Treatment

Identify and cap extreme values (outliers) to prevent model skew.
# --- Step 1: Visualize Outliers (Boxplots) ---
# Focus on the most variable columns: Revenue, MRR, and Tickets.
library(gridExtra)

# Small factory so the three boxplots share one definition.
make_box <- function(col, fill, plot_title) {
  ggplot(Master_Data, aes(y = {{ col }})) +
    geom_boxplot(fill = fill) +
    labs(title = plot_title) +
    theme_minimal()
}
p1 <- make_box(Total_Revenue, "lightblue", "Total Revenue Outliers")
p2 <- make_box(MRR, "lightgreen", "MRR Outliers")
p3 <- make_box(Total_Tickets, "pink", "Total Tickets Outliers")

grid.arrange(p1, p2, p3, ncol = 3)
# --- Step 2: Outlier Treatment (Capping / Winsorization) ---
# Strategy: clamp values to the [1st, 99th] percentile range.
# Rows are kept (no deletion) so no data is lost; only extremes are limited.
#
# BUG FIX: the subscripted-assignment form `x[x > q] <- q` errors with
# "NAs are not allowed in subscripted assignments" when x contains NA,
# even with na.rm = TRUE in quantile(). pmin()/pmax() clamp identically
# for non-NA values and simply propagate NA.
cap_outliers <- function(x) {
  bounds <- quantile(x, probs = c(0.01, 0.99), na.rm = TRUE)
  pmin(pmax(x, bounds[[1]]), bounds[[2]])
}

# Apply capping to the high-variance numeric columns
Master_Data <- Master_Data %>%
  mutate(
    Total_Revenue_Capped = cap_outliers(Total_Revenue),
    MRR_Capped = cap_outliers(MRR),
    Total_Tickets_Capped = cap_outliers(Total_Tickets)
  )
cat("Outlier treatment complete. New columns with suffix '_Capped' created.\n")
## Outlier treatment complete. New columns with suffix '_Capped' created.
# Goal: derive ratio and interaction features BEFORE encoding or splitting.
# Rationale: ratios computed on real rows are safer than ratios computed on
# synthetic (SMOTE-generated) rows.
Model_Data_Enhanced <- Master_Data %>%
  mutate(
    # Tickets per month of tenure (+1 guards against division by zero)
    Ticket_per_Month = Total_Tickets_Capped / (`Customer Age (Months)` + 1),
    # Revenue generated per month of tenure
    Revenue_per_Month = Total_Revenue_Capped / (`Customer Age (Months)` + 1),
    # Interaction flag: low satisfaction combined with many bugs means high
    # risk, e.g. satisfaction 2 with 5 bugs -> (10 - 2) * 5 = 40
    Risk_Factor = (10 - Avg_Satisfaction) * Bug_Count
  )
cat("5.1 Complete: New Features (Ratios & Interactions) created on Master Data.\n")
## 5.1 Complete: New Features (Ratios & Interactions) created on Master Data.
# Goal: keep the modeling columns only and one-hot encode the categoricals.
library(fastDummies)
library(janitor)

# 1. Create the binary target and drop columns that must not enter the model
Model_Data_Selected <- Model_Data_Enhanced %>%
  mutate(Target = ifelse(Status == "Churn", 1, 0)) %>%
  select(-Customer_ID,   # identifier, not predictive
         -Status,        # replaced by Target
         -Total_Revenue, # the capped version is used instead
         -MRR,           # the capped version is used instead
         -Total_Tickets) # the capped version is used instead

# 2. One-hot encode the text columns, then standardize every column name
categorical_cols <- c("Region", "Vertical", "Subvertical", "Customer Level")
Model_Data_Encoded <- dummy_cols(
  Model_Data_Selected,
  select_columns = categorical_cols,
  remove_selected_columns = TRUE, # drop the original text columns
  remove_first_dummy = FALSE      # keep every category level
) %>%
  clean_names() # snake_case names (region_turkey, etc.)
cat("5.2 Complete: Encoding Finished.\n")
## 5.2 Complete: Encoding Finished.
cat("Dimensions:", nrow(Model_Data_Encoded), "Rows,", ncol(Model_Data_Encoded), "Columns\n")
## Dimensions: 2068 Rows, 114 Columns
# Goal: stratified 80/20 train/test split on the target.
library(caret)
set.seed(123)
train_index <- createDataPartition(Model_Data_Encoded$target, p = 0.8, list = FALSE)

Train_Set <- Model_Data_Encoded[train_index, ]
X_train_raw <- select(Train_Set, -target)
y_train_raw <- Train_Set$target

# Held-out test set -- SMOTE will never touch these rows
Test_Set <- Model_Data_Encoded[-train_index, ]
X_test <- select(Test_Set, -target)
y_test <- Test_Set$target
cat("5.3 Complete: Data Split.\n")
## 5.3 Complete: Data Split.
cat("Train Size:", length(y_train_raw), "\n")
## Train Size: 1655
cat("Test Size: ", length(y_test), "\n")
## Test Size: 413
# Goal: rebalance the classes with SMOTE -- on the TRAINING data only.
library(smotefamily)

# K = 5 neighbors is the usual default; dup_size = 0 synthesizes new minority
# rows instead of just duplicating existing ones.
smote_result <- SMOTE(X = X_train_raw, target = y_train_raw, K = 5, dup_size = 0)

# smotefamily returns the balanced frame with the label in a 'class' column.
Train_Set_Balanced <- smote_result$data %>%
  rename(target = class) %>%
  mutate(target = as.numeric(as.character(target)))

X_train_balanced <- select(Train_Set_Balanced, -target)
y_train_balanced <- Train_Set_Balanced$target
cat("5.4 Complete: SMOTE applied to Training Set.\n")
## 5.4 Complete: SMOTE applied to Training Set.
cat("Original Churn Count:", sum(y_train_raw == 1), "\n")
## Original Churn Count: 771
cat("Balanced Churn Count:", sum(y_train_balanced == 1), "\n")
## Balanced Churn Count: 771
library(ggplot2)
library(gridExtra)
# Class balance before vs. after SMOTE, side by side.
balance_plot <- function(labels, plot_title) {
  ggplot(data.frame(target = labels),
         aes(x = factor(target), fill = factor(target))) +
    geom_bar() +
    labs(title = plot_title, x = "Status") +
    theme_minimal() +
    theme(legend.position = "none")
}
p1 <- balance_plot(y_train_raw, "Original Train")
p2 <- balance_plot(y_train_balanced, "SMOTE Balanced Train")
grid.arrange(p1, p2, ncol = 2)
# Goal: define the resampling scheme (5-fold cross-validation) for caret.
# Note: the train/test split itself was already done in Step 5.3.
library(caret)

# 1. Caret classification needs a factor target whose levels are valid R
#    names, so the 0/1 codes become "Current"/"Churn".
status_levels <- c("Current", "Churn")
y_train_balanced_factor <- factor(y_train_balanced, levels = c(0, 1), labels = status_levels)
y_test_factor <- factor(y_test, levels = c(0, 1), labels = status_levels)

# 2. 5-fold CV with class probabilities so ROC/AUC can be the tuning metric.
fit_control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,               # needed for ROC curves
  summaryFunction = twoClassSummary, # report ROC / Sens / Spec
  verboseIter = FALSE
)
cat("6.1 Complete: Training setup ready with 5-Fold Cross Validation.\n")
## 6.1 Complete: Training setup ready with 5-Fold Cross Validation.
# Goal: Train a Random Forest model using the balanced dataset.
cat("Training Random Forest... (This may take a minute)\n")
## Training Random Forest... (This may take a minute)
# We use the 'ranger' package (faster implementation of Random Forest).
# caret::train drives the hyperparameter search; tuneLength = 5 asks caret to
# generate 5 candidate values per tuning parameter (mtry, splitrule) and pick
# the combination with the best cross-validated ROC.
rf_model <- train(
x = X_train_balanced, # SMOTE-balanced features from Step 5.4
y = y_train_balanced_factor, # factor target ("Current"/"Churn") from Step 6.1
method = "ranger",
metric = "ROC", # Optimize for Area Under Curve
trControl = fit_control, # 5-fold CV settings from Step 6.1
tuneLength = 5 # Try 5 different hyperparameter combinations automatically
)
print(rf_model)
## Random Forest
##
## 1655 samples
## 113 predictor
## 2 classes: 'Current', 'Churn'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1324, 1325, 1324, 1324, 1323
## Resampling results across tuning parameters:
##
## mtry splitrule ROC Sens Spec
## 2 gini 0.9791287 0.9162365 0.9377294
## 2 extratrees 0.9525956 0.9128531 0.8404692
## 29 gini 0.9850263 0.9468220 0.9558860
## 29 extratrees 0.9842069 0.9479584 0.9481022
## 57 gini 0.9837080 0.9513611 0.9494093
## 57 extratrees 0.9844812 0.9502311 0.9507331
## 85 gini 0.9836052 0.9524910 0.9454964
## 85 extratrees 0.9842036 0.9479584 0.9507331
## 113 gini 0.9836783 0.9524910 0.9416087
## 113 extratrees 0.9841905 0.9479584 0.9429409
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 29, splitrule = gini
## and min.node.size = 1.
cat("\nRandom Forest Training Complete.\n")
##
## Random Forest Training Complete.
# Goal: Train a GBM model (A robust alternative to XGBoost with high stability in R)
library(gbm)
library(caret)
cat("Training GBM... (This takes a moment but is very stable)\n")
## Training GBM... (This takes a moment but is very stable)
# We use the same balanced training data (X_train_balanced) that worked for Random Forest.
# Unlike XGBoost, GBM is less "finicky" and handles factors and data types very effectively.
# Define a basic hyperparameter grid for GBM: 3 depths x 2 tree counts = 6
# candidate models, each evaluated with the 5-fold CV from Step 6.1.
gbmGrid <- expand.grid(
interaction.depth = c(1, 3, 5), # Maximum depth of each tree
n.trees = c(100, 150), # Number of boosting iterations
shrinkage = 0.1, # Learning rate (held constant)
n.minobsinnode = 10 # Minimum number of observations in terminal nodes
)
gbm_model <- train(
x = X_train_balanced, # Using the same dataset as Random Forest
y = y_train_balanced_factor, # Our binary target variable
method = "gbm", # Method set to 'gbm'
metric = "ROC", # Select hyperparameters by cross-validated AUC
trControl = fit_control, # Settings defined in section 6.1
tuneGrid = gbmGrid,
verbose = FALSE # Turn off excessive logging for a cleaner console
)
print(gbm_model)
## Stochastic Gradient Boosting
##
## 1655 samples
## 113 predictor
## 2 classes: 'Current', 'Churn'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1323, 1324, 1324, 1325, 1324
## Resampling results across tuning parameters:
##
## interaction.depth n.trees ROC Sens Spec
## 1 100 0.9818013 0.9309900 0.9558860
## 1 150 0.9820772 0.9343862 0.9558860
## 3 100 0.9855298 0.9434515 0.9545790
## 3 150 0.9857064 0.9434515 0.9558777
## 5 100 0.9862623 0.9468477 0.9519816
## 5 150 0.9865304 0.9479712 0.9493842
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 5, shrinkage = 0.1 and n.minobsinnode = 10.
cat("\nGBM Training Complete.\n")
##
## GBM Training Complete.
# Goal: Compare Random Forest and GBM performance visually to select the best model.
# The resamples function collects Cross-Validation results from both models.
# NOTE(review): for a strictly paired comparison, both models should have been
# fit on identical CV fold indices (e.g. via trainControl(index = ...)) --
# confirm; otherwise the per-fold pairing in resamples() is only approximate.
results <- resamples(list(RF = rf_model, GBM = gbm_model))
# Print summary statistics (Mean, Median, Min, Max for ROC, Sens, Spec)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: RF, GBM
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 0.9708893 0.9831206 0.9868416 0.9850263 0.9889390 0.9953408 0
## GBM 0.9804994 0.9850319 0.9855372 0.9865304 0.9878935 0.9936899 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 0.9265537 0.9322034 0.9375000 0.9468220 0.9604520 0.9774011 0
## GBM 0.9265537 0.9435028 0.9491525 0.9479712 0.9545455 0.9661017 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 0.9480519 0.9545455 0.9545455 0.9558860 0.9545455 0.9677419 0
## GBM 0.9285714 0.9415584 0.9415584 0.9493842 0.9610390 0.9741935 0
# Visualization with Boxplots
# This chart shows which model is more stable (narrower box) and has higher
# overall cross-validated ROC scores.
bwplot(results, metric = "ROC", main = "Model Comparison: ROC Curve Distribution")
# Visualization with Dotplots
# Useful for a quick glance at the mean performance and confidence intervals.
dotplot(results, metric = "ROC", main = "Model Comparison: ROC Performance Dotplot")
cat("7 Complete: Models compared visually.\n")
## 7 Complete: Models compared visually.
Commentary: Based on the results of the 5-fold Cross-Validation, both the Random Forest and GBM models demonstrate exceptionally high performance. The Area Under the Curve (AUC) for both models is approximately 0.98. The Random Forest model was selected as the final model because it exhibits a slightly more stable distribution compared to GBM.
# Goal: Use the best model (Random Forest) to predict on the Test Set.
cat("Predicting on Test Set using Random Forest...\n")
## Predicting on Test Set using Random Forest...
# 1. Generate Predictions (Both Probabilities and Classes)
# We are using the Random Forest model as it achieved the highest ROC score during training.
# type = "prob" returns a data frame of per-class probabilities;
# type = "raw" returns the predicted class labels ("Current"/"Churn").
predictions_prob <- predict(rf_model, X_test, type = "prob")
predictions_class <- predict(rf_model, X_test, type = "raw")
# 2. Create the Confusion Matrix
# Evaluate model performance: How many did we get right? (Accuracy, Sensitivity, Specificity)
# We set 'positive = "Churn"' so that Sensitivity/PPV measure our ability to
# detect churners rather than retained customers.
conf_matrix <- confusionMatrix(predictions_class, y_test_factor, positive = "Churn")
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Current Churn
## Current 200 6
## Churn 12 195
##
## Accuracy : 0.9564
## 95% CI : (0.932, 0.974)
## No Information Rate : 0.5133
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9128
##
## Mcnemar's Test P-Value : 0.2386
##
## Sensitivity : 0.9701
## Specificity : 0.9434
## Pos Pred Value : 0.9420
## Neg Pred Value : 0.9709
## Prevalence : 0.4867
## Detection Rate : 0.4722
## Detection Prevalence : 0.5012
## Balanced Accuracy : 0.9568
##
## 'Positive' Class : Churn
##
cat("\n8. Complete: Final Test Evaluation Done.\n")
##
## 8. Complete: Final Test Evaluation Done.
Final Test Result: The model achieved an overall Accuracy of 95.64% on the Test Data, which it had never encountered before.
Error Rate: Out of 413 customers in the test set, only 18 were misclassified (6 False Positives, 12 False Negatives).
Sensitivity (Recall): 97.01%. This metric demonstrates the model’s high level of success in identifying customers with churn potential.
Kappa Score: A value of 0.91 indicates that the model’s success is not due to chance and corresponds to the “Almost Perfect Agreement” band (0.81–1.00) of the Landis–Koch scale.
## 9. Feature Importance
# Goal: Re-train the optimal model with importance enabled and visualize drivers.
cat("Re-calculating importance using the 'impurity' method...\n")
## Re-calculating importance using the 'impurity' method...
# 1. Re-train once more with variable-importance recording switched on.
#    ranger only stores importance when it is requested at fit time, so the
#    earlier CV model cannot provide it. The tuneGrid pins the best
#    hyperparameters found by CV (mtry = 29, splitrule = "gini",
#    min.node.size = 1), so no resampling is needed for this single fit.
final_rf_with_imp <- train(
x = X_train_balanced,
y = y_train_balanced_factor,
method = "ranger",
importance = 'impurity', # record impurity-based (Gini) importance
tuneGrid = data.frame(mtry = 29, splitrule = "gini", min.node.size = 1),
trControl = trainControl(method = "none") # single fit, no CV needed
)
# 2. Extract the (unscaled) importance scores from the fitted model
var_imp_data <- varImp(final_rf_with_imp, scale = FALSE)
# 3. Plot the 20 strongest churn drivers
plot(var_imp_data, top = 20, main = "Top 20 Drivers of Customer Churn")
cat("9. Complete: Feature Importance Plot Generated successfully.\n")
## 9. Complete: Feature Importance Plot Generated successfully.
Variable Analysis: When examining the most significant factors influencing customer churn, it is evident that financial metrics are the primary drivers.
MRR_Capped (Monthly Recurring Revenue): This is by far the most decisive factor.
Transaction_Count (Number of Transactions): The customer’s activity level has a direct impact on their loyalty.
Total_Revenue: The total value a customer has brought to the company is also a critical indicator.
These results highlight the necessity of closely monitoring the behaviors of high-revenue and high-transaction customers.