1. Environment Setup

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

# --- Core Data Manipulation  ---
library(tidyverse)
library(lubridate)
library(stringr)
library(readr)
library(dplyr)
library(tidyr)
library(janitor)

# --- Machine Learning  ---
library(fastDummies)
library(smotefamily)
library(xgboost)
library(caret)
library(ranger)
library(rpart)
library(pROC)

# --- Visualization  ---
library(ggplot2)
library(gridExtra)
library(scales)
library(corrplot)
library(viridis)
library(knitr)
library(kableExtra)
library(plotly)

2. Data Acquisition & Quality Assessment

2.1 Load Training Data

Demographics <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/customer_demographics_train.csv")
StatusData <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/customer_status_level_train.csv")
MRR <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/customer_monthly_recurring_revenue_train.csv")
Revenue <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/customer_revenue_history_train.csv")
Support <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/support_ticket_activity_train.csv")
Bugs <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/product_bug_reports_train.csv")
Newsletter <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/newsletter_engagement_train.csv")
Satisfaction_scores <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/customer_satisfaction_scores_train.csv")
Region_industry <- read_csv("/Users/macbook/Documents/Ceng3Kış2025/ProjectDataScience/customer_region_and_industry_train.csv")

2.2 Data Quality Report

Automatic data quality analysis for all datasets (Dimensions, Duplicates, Missing Values):

# --- 1. DEFINE FUNCTION ---
# This function takes a dataframe and generates a summary report
create_quality_report <- function(df, dataset_name) {
  
  # Print Subheader
  cat("\n### ", dataset_name, " Analysis\n")
  
  # Dimensions
  cat("- **Rows:** ", nrow(df), "\n")
  cat("- **Columns:** ", ncol(df), "\n")
  
  # Duplicate Check
  dup_count <- sum(duplicated(df))
  cat("- **Duplicate Rows:** ", dup_count, ifelse(dup_count > 0, " (WARNING!)", ""), "\n\n")
  
  # create Detailed Table
  summary_df <- data.frame(
    Column = names(df),
    Type = sapply(df, class),
    Missing_Count = colSums(is.na(df)),
    Missing_Percent = round(colSums(is.na(df)) / nrow(df) * 100, 2),
    Unique_Values = sapply(df, function(x) length(unique(x)))
  )
  
  # Print Table (in HTML format)
  print(
    kable(summary_df, caption = paste(dataset_name, "Details"), row.names = FALSE) %>%
      kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE, position = "left") %>%
      scroll_box(height = "250px")
  )
  
  cat("\n---\n") # Separator line
}

# --- 2. EXECUTE FUNCTION (For all 9 Datasets) ---

create_quality_report(Demographics, "1. Demographics")

1. Demographics Analysis

Rows: 2068
Columns: 2
Duplicate Rows: 0

Demographics Details
Column	Type	Missing_Count	Missing_Percent	Unique_Values
CUS ID	character	0	0	2068
Customer Age (Months)	numeric	0	0	72

create_quality_report(StatusData, "2. Customer Status")

2. Customer Status Analysis

Rows: 2068
Columns: 3
Duplicate Rows: 0

Customer Status Details
Column	Type	Missing_Count	Missing_Percent	Unique_Values
Customer ID	character	0	0.00	2068
Status	character	0	0.00	3
Customer Level	character	1	0.05	4

create_quality_report(MRR, "3. Monthly Recurring Revenue (MRR)")

3. Monthly Recurring Revenue (MRR) Analysis

Rows: 1056
Columns: 2
Duplicate Rows: 0

Monthly Recurring Revenue (MRR) Details
Column	Type	Missing_Count	Missing_Percent	Unique_Values
Customer ID	character	0	0	1056
MRR	character	0	0	795

create_quality_report(Revenue, "4. Revenue History")

4. Revenue History Analysis

Rows: 712
Columns: 2
Duplicate Rows: 0

Revenue History Details
Column	Type	Missing_Count	Missing_Percent	Unique_Values
Customer ID	character	0	0	712
Total Revenue	character	0	0	570

create_quality_report(Support, "5. Support Tickets")

5. Support Tickets Analysis

Rows: 1072
Columns: 3
Duplicate Rows: 0

Support Tickets Details
Column	Type	Unique_Values
Customer ID	character	1072
Help Ticket Count	numeric	94
Help Ticket Lead Time (hours)	numeric	767

create_quality_report(Bugs, "6. Bug Reports")

6. Bug Reports Analysis

Rows: 1571
Columns: 2
Duplicate Rows: 25 (WARNING!)

Bug Reports Details
Column	Type	Missing_Count	Missing_Percent	Unique_Values
Customer ID	character	0	0	502
Product Bug Task Count	numeric	0	0	161

create_quality_report(Newsletter, "7. Newsletter Engagement")

7. Newsletter Engagement Analysis

Rows: 199
Columns: 2
Duplicate Rows: 0

Newsletter Engagement Details
Column	Type	Missing_Count	Missing_Percent	Unique_Values
Customer ID	character	0	0	199
Company Newsletter Interaction Count	numeric	0	0	21

create_quality_report(Satisfaction_scores, "8. Satisfaction Scores")

8. Satisfaction Scores Analysis

Rows: 1571
Columns: 11
Duplicate Rows: 4 (WARNING!)

Satisfaction Scores Details
Column	Type	Missing_Count	Missing_Percent	Unique_Values
Customer ID	character	0	0.00	502
Year	numeric	0	0.00	1
Quarter	numeric	0	0.00	3
Survey Date	Date	0	0.00	3
Response Date	Date	0	0.00	144
How likely are you to recommend insider to a friend or colleague	numeric	0	0.00	11
How would you rate the value you gain from our company	numeric	0	0.00	5
How frequently are you using our platform	character	3	0.19	4
Please rate the overall quality of our products	numeric	0	0.00	6
Please rate the usability of the panel	numeric	47	2.99	6
Please rate your understanding of our reporting capabilities in the panel	character	0	0.00	5

create_quality_report(Region_industry, "9. Region & Industry")

9. Region & Industry Analysis

Rows: 2068
Columns: 4
Duplicate Rows: 0

Region & Industry Details
Column	Type	Missing_Count	Missing_Percent	Unique_Values
Customer ID	character	0	0.00	2068
Region	character	1	0.05	24
Vertical	character	48	2.32	23
Subvertical	character	71	3.43	51

Data cleaning: removing duplicates

Bugs <- Bugs %>% distinct()
Satisfaction_scores <- Satisfaction_scores %>% distinct()
cat("Cleaning Complete: Duplicates removed from Bugs and Satisfaction datasets.\n")

## Cleaning Complete: Duplicates removed from Bugs and Satisfaction datasets.

2.3 Initial Target Distribution

Our main objective in this project is to examine the distribution of customer status.

# We are visualizing the 'Status' column in the StatusData table.
StatusData %>%
  group_by(Status) %>%
  summarise(num = n()) %>%
  mutate(percent = round(num / sum(num) * 100, 1)) %>%
  ggplot(aes(x = Status, y = num, fill = Status)) +
  geom_col() +
  geom_text(aes(label = paste0(num , " (%", percent, ")")), vjust = -0.5) +
  labs(title = "Customer Status Distribution (Target Variable)",
       x = "Status",
       y = "number of customer") +
  theme_minimal() +
  theme(legend.position = "none")

3. Data Preprocessing Pipeline

3.1 Column Name Standardization

Before merging, we standardize the key identifier column to ‘Customer_ID’ across all datasets to ensure consistency.

# Standardize 'Demographics' (Change 'CUS ID' to 'Customer_ID')
Demographics <- Demographics %>% 
  rename(Customer_ID = `CUS ID`)

# Standardize other datasets (Change 'Customer ID' to 'Customer_ID')
# We use backticks `` because the original names contain spaces.

StatusData <- StatusData %>% rename(Customer_ID = `Customer ID`)
MRR <- MRR %>% rename(Customer_ID = `Customer ID`)
Revenue <- Revenue %>% rename(Customer_ID = `Customer ID`)
Support <- Support %>% rename(Customer_ID = `Customer ID`)
Bugs <- Bugs %>% rename(Customer_ID = `Customer ID`)
Newsletter <- Newsletter %>% rename(Customer_ID = `Customer ID`)
Satisfaction_scores <- Satisfaction_scores %>% rename(Customer_ID = `Customer ID`)
Region_industry <- Region_industry %>% rename(Customer_ID = `Customer ID`)

cat("Column standardization complete. All datasets now use 'Customer_ID' as the key.\n")

## Column standardization complete. All datasets now use 'Customer_ID' as the key.

3.2 Feature Engineering - Satisfaction Surveys

We are summarizing the detailed questions in the satisfaction survey to create a single ‘Average Satisfaction Score’ for each customer.

# Check the columns of the satisfaction table (are all columns except ID numerical scores?)
# Calculate the mean for each row and create a new column.

Satisfaction_Feat <- Satisfaction_scores %>%
  # 1. Select only numeric columns (excluding ID) 
  select(Customer_ID, where(is.numeric)) %>%
  # 2. Extract non-score numerical data such as 'Year' and 'Quarter'.
  select(-Year, -Quarter) %>%
  # 3. Calculate the average of the remaining score columns.
  mutate(Instance_Score = rowMeans(select(., -Customer_ID), na.rm = TRUE)) %>%
  # Group by customer and reduce to a single line.
  group_by(Customer_ID) %>%
  summarise(Avg_Satisfaction = mean(Instance_Score, na.rm = TRUE))

cat("Completed: There is a single satisfaction score for each customer.\n")

## Completed: There is a single satisfaction score for each customer.

head(Satisfaction_Feat)

## # A tibble: 6 × 2
##   Customer_ID Avg_Satisfaction
##   <chr>                  <dbl>
## 1 CUS-1002400             5.27
## 2 CUS-1006000             6.67
## 3 CUS-1019500             5.48
## 4 CUS-1025800             4.88
## 5 CUS-1030400             5.04
## 6 CUS-1068750             5.5

3.3 Aggregate Additional Features

We are summarizing transaction-based tables into a single row per customer (Aggregation).

# 1. Support Tickets: How many total support requests has each customer made?
Support_Agg <- Support %>%
  group_by(Customer_ID) %>%
  summarise(Total_Tickets = n()) # n() counts the number of rows

# 2. Revenue History: What is the total expenditure of the customer?
Revenue_Agg <- Revenue %>%
  mutate(amount_clean = parse_number(as.character(grep("[0-9]", ., value = TRUE)[1]))) %>%
  mutate(across(where(is.character), parse_number)) %>%
  group_by(Customer_ID) %>%
  summarise(
    Total_Revenue = sum(across(where(is.numeric), sum), na.rm = TRUE), # Sum all numeric columns
    Transaction_Count = n()
  )

# 3. Bug Reports: How many times has the customer reported a bug?
Bugs_Agg <- Bugs %>%
  group_by(Customer_ID) %>%
  summarise(Bug_Count = n())

# 4. MRR 
MRR_Agg <- MRR %>%
  mutate(MRR_Clean = parse_number(as.character(MRR))) %>%
  group_by(Customer_ID) %>%
  summarise(MRR = mean(MRR_Clean, na.rm = TRUE))

# 5. Newsletter
Newsletter_Agg <- Newsletter %>%
  group_by(Customer_ID) %>%
  summarise(Newsletter_Count = sum(`Company Newsletter Interaction Count`, na.rm = TRUE))

cat("Completed: Support, Revenue, Bugs, MRR, Newsletter tables have been summarized.\n")

## Completed: Support, Revenue, Bugs, MRR, Newsletter tables have been summarized.

Fix ID types (type mismatch repair)

Demographics$Customer_ID <- as.character(Demographics$Customer_ID)
Region_industry$Customer_ID <- as.character(Region_industry$Customer_ID)
StatusData$Customer_ID <- as.character(StatusData$Customer_ID)

MRR_Agg$Customer_ID <- as.character(MRR_Agg$Customer_ID)
Newsletter_Agg$Customer_ID <- as.character(Newsletter_Agg$Customer_ID)
Revenue_Agg$Customer_ID <- as.character(Revenue_Agg$Customer_ID)
Support_Agg$Customer_ID <- as.character(Support_Agg$Customer_ID)
Bugs_Agg$Customer_ID <- as.character(Bugs_Agg$Customer_ID)
Satisfaction_Feat$Customer_ID <- as.character(Satisfaction_Feat$Customer_ID)

cat("All Customer_ID columns have been set to 'Character' format.\n")

## All Customer_ID columns have been set to 'Character' format.

3.4 Master Dataset Integration

We are merging the summarized tables created during Feature Engineering into the main dataset.

Master_Data <- Demographics %>%
  left_join(Region_industry, by = "Customer_ID") %>%
  left_join(StatusData, by = "Customer_ID") %>%
  
  left_join(MRR_Agg, by = "Customer_ID") %>%
  left_join(Newsletter_Agg, by = "Customer_ID") %>%
  # The following are the aggregated tables:
  left_join(Revenue_Agg, by = "Customer_ID") %>%
  left_join(Support_Agg, by = "Customer_ID") %>%
  left_join(Bugs_Agg, by = "Customer_ID") %>%
  left_join(Satisfaction_Feat, by = "Customer_ID")

cat("Master Dataset Created Successfully (Aggregated).\n")

## Master Dataset Created Successfully (Aggregated).

cat("Final Dimensions: ", nrow(Master_Data), " Rows, ", ncol(Master_Data), " Columns\n")

## Final Dimensions:  2068  Rows,  14  Columns

# Display the table
kable(head(Master_Data), caption = "Final Master Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F) %>%
  scroll_box(width = "100%")

Final Master Dataset
Customer_ID	Customer Age (Months)	Region	Vertical	Subvertical	Status	Customer Level	MRR	Newsletter_Count	Total_Revenue	Transaction_Count	Total_Tickets	Bug_Count	Avg_Satisfaction
CUS-1471650	24	Malaysia	Media and Publishing	Content Websites	Onboarding	Enterprise	196	NA	NA	NA	NA	NA	NA
CUS-1087300	25	Australia	Transportation and Logistics	Transportation and Logistics	Churn	Semi-Enterprise	3714	NA	NA	NA	NA	NA	NA
CUS-230550	12	Turkey	Retail	Fashion Retailer	Churn	Long-tail	NA	NA	NA	NA	NA	NA	NA
CUS-1470950	22	Thailand	Retail	Marketplace	Retained	Enterprise	14297	3	NA	NA	1	6	4.7500
CUS-1526100	11	Vietnam	Other	Other	Retained	Semi-Enterprise	2250	2	NA	NA	1	4	5.4375
CUS-1507900	12	Vietnam	Other	Other	Churn	Long-tail	NA	1	NA	NA	1	NA	NA

3.5 Missing Value Imputation

We address the missing values resulting from the merge process. Strategy: 1. Activity counts (Tickets, Bugs, Revenue) are filled with 0 (meaning no activity). 2. Satisfaction scores are imputed with the Median value to maintain distribution.

library(readr)
library(tidyr)
library(dplyr)

Master_Data <- Master_Data %>%
  mutate(
    # 1. 
    MRR = as.numeric(as.character(MRR)),
    Total_Revenue = as.numeric(as.character(Total_Revenue)),
    Newsletter_Count = as.numeric(as.character(Newsletter_Count))
  ) %>%
  mutate(
    # 2. Imputation
    Total_Tickets = replace_na(Total_Tickets, 0),
    Bug_Count = replace_na(Bug_Count, 0),
    Total_Revenue = replace_na(Total_Revenue, 0),
    Transaction_Count = replace_na(Transaction_Count, 0),
    
    # if not in MRR, Newsletter ->  0 
    MRR = replace_na(MRR, 0),
    Newsletter_Count = replace_na(Newsletter_Count, 0),
    
    # if not in satisfaction score -> Median
    Avg_Satisfaction = ifelse(is.na(Avg_Satisfaction), 
                              median(Avg_Satisfaction, na.rm = TRUE), 
                              Avg_Satisfaction)
  )

# 3. Verify Imputation
cat("\n--------------------------------\n")

## 
## --------------------------------

cat("Missing Values AFTER Imputation:\n")

## Missing Values AFTER Imputation:

print(colSums(is.na(Master_Data)))

##           Customer_ID Customer Age (Months)                Region 
##                     0                     0                     1 
##              Vertical           Subvertical                Status 
##                    48                    71                     0 
##        Customer Level                   MRR      Newsletter_Count 
##                     1                     0                     0 
##         Total_Revenue     Transaction_Count         Total_Tickets 
##                     0                     0                     0 
##             Bug_Count      Avg_Satisfaction 
##                     0                     0

Master_Data <- Master_Data %>%
  mutate(
    Region = replace_na(Region, "Unknown"),
    Vertical = replace_na(Vertical, "Unknown"),
    Subvertical = replace_na(Subvertical, "Unknown"),
    `Customer Level` = replace_na(`Customer Level`, "Unknown")
  )

cat("\n--------------------------------\n")

## 
## --------------------------------

cat("Final check:\n")

## Final check:

print(colSums(is.na(Master_Data)))

##           Customer_ID Customer Age (Months)                Region 
##                     0                     0                     0 
##              Vertical           Subvertical                Status 
##                     0                     0                     0 
##        Customer Level                   MRR      Newsletter_Count 
##                     0                     0                     0 
##         Total_Revenue     Transaction_Count         Total_Tickets 
##                     0                     0                     0 
##             Bug_Count      Avg_Satisfaction 
##                     0                     0

Recreate and merge the Revenue table.

# 1. Define the function here first (since section 4.3 has not run yet)
cap_outliers <- function(x) {
  quantiles <- quantile(x, c(0.01, 0.99), na.rm = TRUE)
  x[x > quantiles[2]] <- quantiles[2] # Cap values at the upper bound
  x[x < quantiles[1]] <- quantiles[1] # Cap values at the lower bound
  return(x)
}

# 2. Remove the corrupted Revenue columns from Master_Data
Master_Data <- Master_Data %>% 
  select(-Total_Revenue, -Transaction_Count)

# 3. Re-extract Revenue data from raw file with aggressive cleaning
Revenue_Agg_Fixed <- Revenue %>%
  mutate(across(everything(), as.character)) %>% # Convert everything to character strings
  pivot_longer(cols = -Customer_ID, values_to = "raw_value") %>% # Pivot to long format
  mutate(clean_val = parse_number(raw_value)) %>% # Extract numeric values
  group_by(Customer_ID) %>%
  summarise(
    Total_Revenue = sum(clean_val, na.rm = TRUE),
    Transaction_Count = n()
  )

# 4. Re-integrate into Master_Data
Master_Data <- Master_Data %>%
  left_join(Revenue_Agg_Fixed, by = "Customer_ID") %>%
  mutate(
    # Set unmatched records (customers with no purchases) to 0
    Total_Revenue = replace_na(Total_Revenue, 0),
    Transaction_Count = replace_na(Transaction_Count, 0)
  )

# 5. Apply Outlier Capping for this NEW column
Master_Data <- Master_Data %>%
  mutate(Total_Revenue_Capped = cap_outliers(Total_Revenue))

cat("Revenue table repaired and Outlier capping applied.\n")

## Revenue table repaired and Outlier capping applied.

cat("New Average Revenue: ", mean(Master_Data$Total_Revenue, na.rm = TRUE), "\n")

## New Average Revenue:  4603337

4. Exploratory Data Analysis

4.1 Correlation Analysis

Understand the relationships between numeric variables.

library(corrplot)
# 1. Select only numeric columns for correlation analysis
numeric_vars <- Master_Data %>%
  select(where(is.numeric)) %>%
  # Removing 'Transaction_Count' if it has zero variance or is redundant
  select(-Transaction_Count) 

# 2. Compute the correlation matrix
cor_matrix <- cor(numeric_vars, use = "complete.obs")

# 3. Visualize the correlation matrix
corrplot(cor_matrix, 
         method = "color", 
         type = "upper", 
         tl.col = "black", 
         tl.srt = 45, # Text label rotation
         addCoef.col = "black", # Add coefficient numbers
         number.cex = 0.7, # Font size for coefficients
         diag = FALSE, # Hide diagonal
         title = "Feature Correlation Matrix",
         mar = c(0,0,1,0))

summary(Master_Data$Total_Revenue)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 4.603e+06 2.130e+03 2.790e+09

4.2 Satisfaction Metrics Distribution

Analyze the distribution of the ‘Avg_Satisfaction’ score.

ggplot(Master_Data, aes(x = Avg_Satisfaction)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.5, fill = "#2A9D8F", color = "white", alpha = 0.7) +
  geom_density(color = "#E76F51", size = 1) +
  labs(title = "Distribution of Average Satisfaction Scores",
       subtitle = "Histogram with Density Curve",
       x = "Average Satisfaction Score",
       y = "Density") +
  theme_minimal()

## 4.3 Outlier Detection and Treatment Identify and cap extreme values (outliers) to prevent model skew.

# --- Step 1: Visualize Outliers (Boxplots) ---
# We focus on highly variable columns: Revenue, MRR, and Tickets.
p1 <- ggplot(Master_Data, aes(y = Total_Revenue)) + 
  geom_boxplot(fill = "lightblue") + 
  labs(title = "Total Revenue Outliers") + theme_minimal()

p2 <- ggplot(Master_Data, aes(y = MRR)) + 
  geom_boxplot(fill = "lightgreen") + 
  labs(title = "MRR Outliers") + theme_minimal()

p3 <- ggplot(Master_Data, aes(y = Total_Tickets)) + 
  geom_boxplot(fill = "pink") + 
  labs(title = "Total Tickets Outliers") + theme_minimal()

library(gridExtra)
grid.arrange(p1, p2, p3, ncol = 3)

# --- Step 2: Outlier Treatment (Capping / Winsorization) ---
# Strategy: Cap values above the 99th percentile. 
# We do not delete rows to preserve data; we just limit the maximum value.

cap_outliers <- function(x) {
  quantiles <- quantile(x, c(0.01, 0.99), na.rm = TRUE)
  x[x > quantiles[2]] <- quantiles[2] # Cap upper bound
  x[x < quantiles[1]] <- quantiles[1] # Cap lower bound (optional)
  return(x)
}

# Apply capping to relevant numeric columns
Master_Data <- Master_Data %>%
  mutate(
    Total_Revenue_Capped = cap_outliers(Total_Revenue),
    MRR_Capped = cap_outliers(MRR),
    Total_Tickets_Capped = cap_outliers(Total_Tickets)
  )

cat("Outlier treatment complete. New columns with suffix '_Capped' created.\n")

## Outlier treatment complete. New columns with suffix '_Capped' created.

5. Feature Engineering and Model Preparation

5.1 Advanced Feature Creation

# Goal: Create new predictive features BEFORE encoding or splitting.
# Logic: It is safer to calculate ratios on real data than on synthetic (SMOTE) data.

Model_Data_Enhanced <- Master_Data %>%
  mutate(
    # 1. Ticket Intensity: Tickets per month of tenure
    # (Adding +1 to avoid division by zero)
    Ticket_per_Month = Total_Tickets_Capped / (`Customer Age (Months)` + 1),
    
    # 2. Value per Age: Revenue generated per month
    Revenue_per_Month = Total_Revenue_Capped / (`Customer Age (Months)` + 1),
    
    # 3. High Risk Flag: Interaction between Low Satisfaction and High Bugs
    # If Satisfaction is low (e.g. 2) and Bugs are high (e.g. 5) -> Risk = (10-2)*5 = 40 (High Score)
    Risk_Factor = (10 - Avg_Satisfaction) * Bug_Count
  )

cat("5.1 Complete: New Features (Ratios & Interactions) created on Master Data.\n")

## 5.1 Complete: New Features (Ratios & Interactions) created on Master Data.

5.2 Selection and One-Hot Encoding

# Goal: Select relevant columns and convert categoricals to numeric.

library(fastDummies)
library(janitor)

# 1. Prepare Target & Drop Unused Columns
Model_Data_Selected <- Model_Data_Enhanced %>%
  mutate(Target = ifelse(Status == "Churn", 1, 0)) %>%
  select(
    -Customer_ID,      # ID is not predictive
    -Status,           # Converted to Target
    -Total_Revenue,    # Using Capped version
    -MRR,              # Using Capped version
    -Total_Tickets     # Using Capped version
  )

# 2. One-Hot Encoding
categorical_cols <- c("Region", "Vertical", "Subvertical", "Customer Level")

Model_Data_Encoded <- dummy_cols(
  Model_Data_Selected,
  select_columns = categorical_cols,
  remove_selected_columns = TRUE, # Remove original text columns
  remove_first_dummy = FALSE      # Keep all categories
) %>%
  clean_names() # Standardize names (region_turkey, etc.)

cat("5.2 Complete: Encoding Finished.\n")

## 5.2 Complete: Encoding Finished.

cat("Dimensions:", nrow(Model_Data_Encoded), "Rows,", ncol(Model_Data_Encoded), "Columns\n")

## Dimensions: 2068 Rows, 114 Columns

5.3 Train-Test Split

# Goal: Split data into 80% Training and 20% Testing.

library(caret)
set.seed(123)

train_index <- createDataPartition(Model_Data_Encoded$target, p = 0.8, list = FALSE)

X_train_raw <- Model_Data_Encoded[train_index, ] %>% select(-target)
y_train_raw <- Model_Data_Encoded[train_index, ]$target

# Test Set (We will NOT touch this with SMOTE)
Test_Set    <- Model_Data_Encoded[-train_index, ]
X_test      <- Test_Set %>% select(-target)
y_test      <- Test_Set$target

cat("5.3 Complete: Data Split.\n")

## 5.3 Complete: Data Split.

cat("Train Size:", length(y_train_raw), "\n")

## Train Size: 1655

cat("Test Size: ", length(y_test), "\n")

## Test Size:  413

5.4 SMOTE (only on Train)

# Goal: Apply SMOTE only to the Training set to fix imbalance.

library(smotefamily)

# Apply SMOTE
# K=5 is standard. dup_size=0 means we don't just copy rows, we generate new ones.
smote_result <- SMOTE(X = X_train_raw, target = y_train_raw, K = 5, dup_size = 0)

# Extract Balanced Train Data
Train_Set_Balanced <- smote_result$data %>%
  rename(target = class) %>%
  mutate(target = as.numeric(as.character(target)))

# Separate Features and Target for the final Train set
X_train_balanced <- Train_Set_Balanced %>% select(-target)
y_train_balanced <- Train_Set_Balanced$target

cat("5.4 Complete: SMOTE applied to Training Set.\n")

## 5.4 Complete: SMOTE applied to Training Set.

cat("Original Churn Count:", sum(y_train_raw == 1), "\n")

## Original Churn Count: 771

cat("Balanced Churn Count:", sum(y_train_balanced == 1), "\n")

## Balanced Churn Count: 771

5.5 Final Distribution Check

library(ggplot2)
library(gridExtra)

# Plot before and after
p1 <- data.frame(target = y_train_raw) %>%
  ggplot(aes(x = factor(target), fill = factor(target))) + geom_bar() +
  labs(title = "Original Train", x = "Status") + theme_minimal() + theme(legend.position="none")

p2 <- data.frame(target = y_train_balanced) %>%
  ggplot(aes(x = factor(target), fill = factor(target))) + geom_bar() +
  labs(title = "SMOTE Balanced Train", x = "Status") + theme_minimal() + theme(legend.position="none")

grid.arrange(p1, p2, ncol = 2)

6. Model Development and Training

6.1 Training Setup (Cross-Validation)

# Goal: Define how we will validate the models (5-Fold Cross Validation).
# Note: Train-Test Split was already done in Step 5.3.

library(caret)

# 1. Convert Target to Factor (Required for Classification in Caret)
# 0 -> "Current", 1 -> "Churn"
# Caret prefers text labels for classification levels.
y_train_balanced_factor <- factor(y_train_balanced, levels = c(0, 1), labels = c("Current", "Churn"))
y_test_factor <- factor(y_test, levels = c(0, 1), labels = c("Current", "Churn"))

# 2. Define Control Parameters (5-Fold CV)
# This splits the training data into 5 parts to validate the model internally.
fit_control <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE, # Calculate probabilities (for ROC curve)
  summaryFunction = twoClassSummary, # Use ROC/AUC as metric
  verboseIter = FALSE
)

cat("6.1 Complete: Training setup ready with 5-Fold Cross Validation.\n")

## 6.1 Complete: Training setup ready with 5-Fold Cross Validation.

6.2 Model1: Random Forest

# Goal: Train a Random Forest model using the balanced dataset.

cat("Training Random Forest... (This may take a minute)\n")

## Training Random Forest... (This may take a minute)

# We use the 'ranger' package (faster implementation of Random Forest)
rf_model <- train(
  x = X_train_balanced,
  y = y_train_balanced_factor,
  method = "ranger",
  metric = "ROC", # Optimize for Area Under Curve
  trControl = fit_control,
  tuneLength = 5 # Try 5 different hyperparameter combinations automatically
)

print(rf_model)

## Random Forest 
## 
## 1655 samples
##  113 predictor
##    2 classes: 'Current', 'Churn' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1324, 1325, 1324, 1324, 1323 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   ROC        Sens       Spec     
##     2   gini        0.9791287  0.9162365  0.9377294
##     2   extratrees  0.9525956  0.9128531  0.8404692
##    29   gini        0.9850263  0.9468220  0.9558860
##    29   extratrees  0.9842069  0.9479584  0.9481022
##    57   gini        0.9837080  0.9513611  0.9494093
##    57   extratrees  0.9844812  0.9502311  0.9507331
##    85   gini        0.9836052  0.9524910  0.9454964
##    85   extratrees  0.9842036  0.9479584  0.9507331
##   113   gini        0.9836783  0.9524910  0.9416087
##   113   extratrees  0.9841905  0.9479584  0.9429409
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 29, splitrule = gini
##  and min.node.size = 1.

cat("\nRandom Forest Training Complete.\n")

## 
## Random Forest Training Complete.

6.3 Model2: GBM(Gradient Boosting Machine)

# Goal: Train a GBM model (A robust alternative to XGBoost with high stability in R)

library(gbm)
library(caret)

cat("Training GBM... (This takes a moment but is very stable)\n")

## Training GBM... (This takes a moment but is very stable)

# We use the same balanced training data (X_train_balanced) that worked for Random Forest.
# Unlike XGBoost, GBM is less "finicky" and handles factors and data types very effectively.

# Define a basic hyperparameter grid for GBM
gbmGrid <- expand.grid(
  interaction.depth = c(1, 3, 5), # Maximum depth of each tree
  n.trees = c(100, 150),         # Number of boosting iterations
  shrinkage = 0.1,                # Learning rate
  n.minobsinnode = 10             # Minimum number of observations in terminal nodes
)

gbm_model <- train(
  x = X_train_balanced,        # Using the same dataset as Random Forest
  y = y_train_balanced_factor, # Our binary target variable
  method = "gbm",              # Method set to 'gbm'
  metric = "ROC",
  trControl = fit_control,     # Settings defined in section 6.1
  tuneGrid = gbmGrid,
  verbose = FALSE              # Turn off excessive logging for a cleaner console
)

print(gbm_model)

## Stochastic Gradient Boosting 
## 
## 1655 samples
##  113 predictor
##    2 classes: 'Current', 'Churn' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1323, 1324, 1324, 1325, 1324 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  ROC        Sens       Spec     
##   1                  100      0.9818013  0.9309900  0.9558860
##   1                  150      0.9820772  0.9343862  0.9558860
##   3                  100      0.9855298  0.9434515  0.9545790
##   3                  150      0.9857064  0.9434515  0.9558777
##   5                  100      0.9862623  0.9468477  0.9519816
##   5                  150      0.9865304  0.9479712  0.9493842
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  5, shrinkage = 0.1 and n.minobsinnode = 10.

cat("\nGBM Training Complete.\n")

## 
## GBM Training Complete.

7. Model Evaluation

# Goal: Compare Random Forest and GBM performance visually to select the best model.
# The resamples function collects Cross-Validation results from both models
results <- resamples(list(RF = rf_model, GBM = gbm_model))

# Print summary statistics (Mean, Median, Min, Max for ROC, Sens, Spec)
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: RF, GBM 
## Number of resamples: 5 
## 
## ROC 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## RF  0.9708893 0.9831206 0.9868416 0.9850263 0.9889390 0.9953408    0
## GBM 0.9804994 0.9850319 0.9855372 0.9865304 0.9878935 0.9936899    0
## 
## Sens 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## RF  0.9265537 0.9322034 0.9375000 0.9468220 0.9604520 0.9774011    0
## GBM 0.9265537 0.9435028 0.9491525 0.9479712 0.9545455 0.9661017    0
## 
## Spec 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## RF  0.9480519 0.9545455 0.9545455 0.9558860 0.9545455 0.9677419    0
## GBM 0.9285714 0.9415584 0.9415584 0.9493842 0.9610390 0.9741935    0

# Visualization with Boxplots
# This chart shows which model is more stable and has higher overall scores.
bwplot(results, metric = "ROC", main = "Model Comparison: ROC Curve Distribution")

# Visualization with Dotplots
# Useful for a quick glance at the mean performance and confidence intervals.
dotplot(results, metric = "ROC", main = "Model Comparison: ROC Performance Dotplot")

cat("7 Complete: Models compared visually.\n")

## 7 Complete: Models compared visually.

Commentary: Based on the results of the 5-fold Cross-Validation, both the Random Forest and GBM models demonstrate exceptionally high performance. The Area Under the Curve (AUC) for both models is approximately 0.98. The Random Forest model was selected as the final model because it exhibits a slightly more stable distribution compared to GBM.

8. Final Predicitons on Test Set

# Goal: Use the best model (Random Forest) to predict on the Test Set.

cat("Predicting on Test Set using Random Forest...\n")

## Predicting on Test Set using Random Forest...

# 1. Generate Predictions (Both Probabilities and Classes)
# We are using the Random Forest model as it achieved the highest ROC score during training.
predictions_prob <- predict(rf_model, X_test, type = "prob")
predictions_class <- predict(rf_model, X_test, type = "raw")

# 2. Create the Confusion Matrix
# Evaluate model performance: How many did we get right? (Accuracy, Sensitivity, Specificity)
# We set 'positive = "Churn"' to ensure metrics reflect our ability to detect churners.
conf_matrix <- confusionMatrix(predictions_class, y_test_factor, positive = "Churn")

print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Current Churn
##    Current     200     6
##    Churn        12   195
##                                         
##                Accuracy : 0.9564        
##                  95% CI : (0.932, 0.974)
##     No Information Rate : 0.5133        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.9128        
##                                         
##  Mcnemar's Test P-Value : 0.2386        
##                                         
##             Sensitivity : 0.9701        
##             Specificity : 0.9434        
##          Pos Pred Value : 0.9420        
##          Neg Pred Value : 0.9709        
##              Prevalence : 0.4867        
##          Detection Rate : 0.4722        
##    Detection Prevalence : 0.5012        
##       Balanced Accuracy : 0.9568        
##                                         
##        'Positive' Class : Churn         
##

cat("\n8. Complete: Final Test Evaluation Done.\n")

## 
## 8. Complete: Final Test Evaluation Done.

Final Test Result: The model achieved an overall Accuracy of 95.64% on the Test Data, which it had never encountered before.

Error Rate: Out of 413 customers in the test set, only 18 were misclassified (6 False Positives, 12 False Negatives).
Sensitivity (Recall): 97.01%. This metric demonstrates the model’s high level of success in identifying customers with churn potential.
Kappa Score: A value of 0.91 proves that the model’s success is not due to chance and represents a “Perfect Agreement” level.

9. Feature Importance

## 9. Feature Importance
# Goal: Re-train the optimal model with importance enabled and visualize drivers.

cat("Re-calculating importance using the 'impurity' method...\n")

## Re-calculating importance using the 'impurity' method...

# 1. Re-train the model one last time with importance enabled
# We use the best 'mtry' found in your previous step (which was 29)
final_rf_with_imp <- train(
  x = X_train_balanced, 
  y = y_train_balanced_factor,
  method = "ranger",
  importance = 'impurity', # This is the missing piece!
  tuneGrid = data.frame(mtry = 29, splitrule = "gini", min.node.size = 1),
  trControl = trainControl(method = "none") # No need for CV again
)

# 2. Extract and Plot Importance
var_imp_data <- varImp(final_rf_with_imp, scale = FALSE)

# 3. Generate the Visualization
plot(var_imp_data, top = 20, main = "Top 20 Drivers of Customer Churn")

cat("9. Complete: Feature Importance Plot Generated successfully.\n")

## 9. Complete: Feature Importance Plot Generated successfully.

Variable Analysis: When examining the most significant factors influencing customer churn, it is evident that financial metrics are the primary drivers.

MRR_Capped (Monthly Recurring Revenue): This is by far the most decisive factor.
Transaction_Count (Number of Transactions): The customer’s activity level has a direct impact on their loyalty.
Total_Revenue: The total value a customer has brought to the company is also a critical indicator.

These results highlight the necessity of closely monitoring the behaviors of high-revenue and high-transaction customers.

Customer Churn Prediction Project

Alime KILINÇ

2026-01-11