Cluster analysis

Author

Alexander Lawless

Published

10-03-2025

expand for setup code chunk
knitr::opts_chunk$set(echo = FALSE, warning=FALSE, message=FALSE, fig.width =12, fig.height = 9)

library(tidyverse)
library(GGally)
library(factoextra)
library(plotly)
library(tidyLPA)
library(mclust)
library(DT)

# Look ups ----
# Maps raw measure names onto display labels (letter prefixes keep a stable
# A-K ordering in plot facets and legends).
key_measure_lookup <-
  tibble(
    variable = c(
      "person_id", "care_contacts", "referrals", "length_care",
      "average_daily_contacts", "contact_prop_aft_midpoint",
      "team_input_count", "intermittent_care_periods",
      "specialist_contact_prop", "remote_contact_prop",
      "avg_contact_duration", "acute_admissions_2223"
    ),
    variable_clean = c(
      "0. Person ID", "A. Care contacts", "B. Referrals", "C. Care length",
      "D. Average daily contacts", "E. Contact proportion after midpoint",
      "F. Team input count", "G. Intermittent care periods",
      "H. Specialist care proportion", "I. Remote care proportion",
      "J. Average contact duration", "K. Acute admission count"
    )
  )

# Load the data and saved objects ----

# Rows (person_ids) flagged as outliers, to be excluded from clustering.
remove_outliers <- read_csv("remove_outliers_rep_sample.csv")

# Clustering input: drop flagged outliers, then keep columns 3:13 -- the 11
# numeric key measures (presumably already scaled upstream, since downstream
# plots label values "Mean Zscore" -- TODO confirm against the prep script).
key_measures_rep_sub_sample <-
  read_csv("key_measures_sample_cluster.csv") |>
  anti_join(remove_outliers, by = "person_id") |>
  dplyr::select(3:13) |>
  as.data.frame()

#load("cluster_analysis.RData")

# Pre-fitted model objects saved from earlier runs (e.g. kmeans_result_*,
# lpa_model_*, db, dbscan_plot_fviz) so the document renders without refitting.
load("cluster_analysis_kmeans.RData")
load("cluster_analysis_lpa.RData")
load("cluster_analysis_dbscan.RData")
load("cluster_analysis_evaluations.RData")

# Functions ----

## kmeans 

# Wrap a fitted kmeans object's total within-cluster sum of squares in a
# one-row tibble tagged with a model identifier, ready for row-binding.
link_model_output <- function(model, model_id) {
  tibble(tot.withinss = model$tot.withinss, id = model_id)
}

# Elbow plot of total within-group sum of squares for k = 1..nc clusters.
#
# data     : numeric data frame or matrix to cluster.
# nc       : maximum number of clusters to evaluate (must be >= 1).
# seed     : RNG seed so the kmeans starts are reproducible.
# max_iter : iteration cap passed to kmeans().
#
# Returns the WSS vector invisibly (the plot is the primary output).
wssplot <- function(data, nc = 10, seed = 1234, max_iter = 100) {
  stopifnot(nc >= 1)
  wss <- numeric(nc)
  # k = 1: WSS is the total sum of squares about the column means.
  wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  set.seed(seed)
  # seq_len(nc)[-1] is empty when nc == 1; the original 2:nc would have
  # looped backwards (c(2, 1)) and indexed out of bounds.
  for (i in seq_len(nc)[-1]) {
    wss[i] <- sum(kmeans(data, centers = i, nstart = 25, iter.max = max_iter)$withinss)
  }
  plot(seq_len(nc), wss, type = 'b', xlab = 'Number of Clusters',
       ylab = 'Within Group Sum of Squares',
       main = 'Elbow Method Plot to Find Optimal Number of Clusters',
       frame.plot = TRUE, col = 'blue', lwd = 1.5)
  invisible(wss)
}

## LPA

## Evaluation
### Function to subsample data
# Draw a reproducible random subsample of rows from `data`.
# fraction: proportion of rows to keep (row count is floored).
# NOTE: seeds the global RNG (side effect kept for backward compatibility;
# sample.int() draws identically to the original sample(1:n, ...)).
subsample_data <- function(data, fraction = 0.25) {
  set.seed(123) # For reproducibility
  sample_indices <- sample.int(nrow(data), size = floor(fraction * nrow(data)))
  # drop = FALSE keeps the data-frame class even for single-column input.
  data[sample_indices, , drop = FALSE]
}

### Evaluate clustering performance on a subsample
# Evaluate clustering quality on a random subsample (default 25%) of rows.
# data     : the data the model was fitted on (rows align with `clusters`).
# clusters : cluster label for each row of `data`.
# Returns a list: silhouette_score (higher better), db_index (lower better),
# wcss (lower better).
evaluate_clustering_subsample <- function(data, clusters, fraction = 0.25) {
  # Sample row indices so data and labels stay aligned. The original code
  # subsampled the data but paired it with the FIRST n cluster labels,
  # scoring sampled rows against labels belonging to other observations.
  set.seed(123) # same seed and draw as subsample_data(), for reproducibility
  idx <- sample.int(nrow(data), size = floor(fraction * nrow(data)))
  subsample <- data[idx, , drop = FALSE]
  subsample_clusters <- as.numeric(clusters[idx])

  # Silhouette: mean width over the subsampled points.
  dist_matrix <- dist(subsample)
  sil <- silhouette(subsample_clusters, dist_matrix)
  silhouette_score <- mean(sil[, 3])

  # Davies-Bouldin index. NOTE(review): index.DB() comes from clusterSim,
  # which is not attached in the setup chunk -- presumably loaded elsewhere;
  # TODO confirm.
  db_index <- index.DB(subsample, subsample_clusters)$DB

  # WCSS: refit k-means on the subsample with k = number of observed labels.
  wcss <- sum(kmeans(subsample, centers = length(unique(subsample_clusters)))$withinss)

  list(silhouette_score = silhouette_score, db_index = db_index, wcss = wcss)
}

### Function to identify the optimum model
# Pick the best of three models by a composite of normalised fit metrics.
# Each *_eval argument is a list with silhouette_score, db_index and wcss
# (as returned by evaluate_clustering_subsample). Returns the model name.
select_optimum_model <- function(kmeans_eval, lpa_eval, dbscan_eval) {
  # Combine evaluation metrics into a data frame
  eval_metrics <- data.frame(
    model = c("K-means", "LPA", "DBSCAN"),
    silhouette_score = c(kmeans_eval$silhouette_score, lpa_eval$silhouette_score, dbscan_eval$silhouette_score),
    db_index = c(kmeans_eval$db_index, lpa_eval$db_index, dbscan_eval$db_index),
    wcss = c(kmeans_eval$wcss, lpa_eval$wcss, dbscan_eval$wcss)
  )

  # Min-max normalise each metric onto [0, 1] with 1 = best. The previous
  # ratio-to-max/min scheme inverted the ranking whenever silhouette scores
  # were negative (silhouette ranges over [-1, 1]) and divided by zero when a
  # metric hit 0. If all models tie on a metric it contributes equally.
  rescale <- function(x, higher_is_better) {
    rng <- range(x)
    if (rng[1] == rng[2]) return(rep(1, length(x)))
    if (higher_is_better) {
      (x - rng[1]) / (rng[2] - rng[1])
    } else {
      (rng[2] - x) / (rng[2] - rng[1])
    }
  }
  eval_metrics$normalized_silhouette <- rescale(eval_metrics$silhouette_score, TRUE)
  eval_metrics$normalized_db_index <- rescale(eval_metrics$db_index, FALSE)
  eval_metrics$normalized_wcss <- rescale(eval_metrics$wcss, FALSE)

  # Equal-weight combined score (adjust the weights here if needed).
  eval_metrics$combined_score <- eval_metrics$normalized_silhouette +
    eval_metrics$normalized_db_index + eval_metrics$normalized_wcss

  # Model with the highest combined score wins.
  eval_metrics[which.max(eval_metrics$combined_score), "model"]
}

Cluster analysis approaches

Step 1: Performing K-means Clustering

Create K-means Clustering models:

set.seed(123)  # For reproducibility
# Fit candidate k-means models for k = 4..12, creating kmeans_result_4 ..
# kmeans_result_12. A single seed up front makes the whole sequence of fits
# reproducible, exactly as with the original one-call-per-line version.
for (k in 4:12) {
  assign(
    paste0("kmeans_result_", k),
    kmeans(key_measures_rep_sub_sample, centers = k, nstart = 100, iter.max = 100)
  )
}

Identify the number of clusters to derive

## 1. **Determine the Optimal Number of Clusters**:
# fviz_nbclust(key_measures_scaled_head, kmeans, method = "wss")  # Elbow Method

wssplot(key_measures_rep_sub_sample)

# K-means goodness of fit
## The total within-cluster sum of squares (WSS) measures the compactness of the clusters. Lower values indicate better fit.
## The more clusters derived, the lower the sum of squares.
## Trade-off between fit and model simplicity?

# Collect total WSS from each fitted k-means model, then compare across k.
# bind_rows over a Map() replaces the long union_all() chain; row order
# (k = 4..12) and the rn-based x-axis ordering are unchanged.
kmeans_models <- mget(paste0("kmeans_result_", 4:12))
bind_rows(Map(link_model_output, kmeans_models, paste0("clusters_", 4:12))) |>
  mutate(rn = row_number()) |>
  ggplot(aes(x = reorder(id, rn), y = tot.withinss)) +
  geom_col(width = 0.01) +
  geom_point(size = 5) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(x = "n_clusters",
       y = "Total within-cluster sum of squares (WSS)",
       title = "Comparing sum of squares Goodness of Fit measure",
       subtitle = "K-means cluster models"
       )

Visualise K-means models:

## 3. **Visualize the Clusters**:
fviz_cluster_plot <-
  fviz_cluster(kmeans_result_6,
             data = key_measures_rep_sub_sample,
             geom = "point",
             #ellipse.level = 99,
             ellipse.alpha = 0.1,
             pointsize = 0.5
             ) +
  #geom_point(aes(shape = cluster), size = 1, alpha = 0.2) +
  labs(title = "K-means Clustering Results",
     x = "Variable 1",
     y = "Variable 2") +
  theme_minimal()
fviz_cluster_plot

Pairwise Scatter Plot Matrix

# Perform Principal Component Analysis (PCA)
# Project the (scaled) measures onto their principal components, then attach
# the 6-cluster k-means assignment for colouring.
pca_result <- prcomp(key_measures_rep_sub_sample, scale. = TRUE)

pca_data <-
  pca_result$x |>
  as.data.frame() |>
  mutate(cluster = as.factor(kmeans_result_6$cluster))

# Pairwise scatter plot matrix of the components, coloured by cluster.
ggpairs_plot <- ggpairs(pca_data, aes(color = cluster, alpha = 0.5))
ggpairs_plot

3D plot of top 3 principal components

# Create 3D plot
# Interactive 3D scatter of the first three principal components.
# NOTE(review): pca_data_3d is not created in this document -- presumably
# derived from pca_data (top 3 PCs) in a saved workspace. TODO confirm.
plot_ly(pca_data_3d, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cluster
        #colors = c('#1f77b4', '#ff7f0e', '#2ca02c')
        ) %>%
  add_markers(size = 1) %>%
  layout(scene = list(xaxis = list(title = 'PC1'),
                      yaxis = list(title = 'PC2'),
                      zaxis = list(title = 'PC3')),
         title = '3D Plot of Principal Components')

Plot classifications

# Per-profile mean of every measure for the 6-cluster k-means model.
# across() replaces summarise_all(funs(...)) / mutate_at(), which are
# defunct/superseded in current dplyr; row_number() replaces the hard-coded
# 1:6 so the pipeline survives a change in cluster count.
kmeans_classification_means <-
  bind_cols(key_measures_rep_sub_sample, classification = kmeans_result_6$cluster) |>
  group_by(classification) |>
  summarise(across(everything(), mean)) |>
  mutate(classification = paste0("Profile ", row_number())) |>
  mutate(across(-classification, \(x) round(x, 3))) |>
  rename(profile = classification)

# Per-profile bars with free y scales (shapes within each profile).
# facet_wrap's argument is `scales`; the original `scale =` worked only via
# partial argument matching.
kmeans_classification_means_plot_free <-
  kmeans_classification_means |>
  pivot_longer(cols = -profile) |>
  left_join(key_measure_lookup, by = c("name" = "variable")) |>
  ggplot(aes(x = variable_clean, y = value, fill = variable_clean)) +
  geom_col() +
  facet_wrap(~profile, scales = "free_y") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        legend.position = "none",
        strip.background = element_rect(fill = NA, colour = "grey")) +
  labs(x = "Variable",
       y = "Mean Zscore",
       title = "Mean Zscore by variable and cluster/profile",
       subtitle = "K-means cluster analysis - 6 clusters")


# Same per-profile chart on a shared (fixed) y scale, so profiles can be
# compared on absolute magnitude; then render both variants.
kmeans_profile_long <-
  kmeans_classification_means |>
  pivot_longer(cols = -profile) |>
  left_join(key_measure_lookup, by = c("name" = "variable"))

kmeans_classification_means_plot_fixed <-
  ggplot(kmeans_profile_long,
         aes(x = variable_clean, y = value, fill = variable_clean)) +
  geom_col() +
  facet_wrap(~profile) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5),
        strip.background = element_rect(fill = NA, colour = "grey")) +
  labs(title = "Mean Zscore by variable and cluster/profile",
       subtitle = "K-means cluster analysis - 6 clusters",
       x = "Variable",
       y = "Mean Zscore")
kmeans_classification_means_plot_fixed

kmeans_classification_means_plot_free

Step 2: Performing Model-Based Clustering

Test number of clusters in model based approach (LPA)

Function:

# Fit mclust BIC over a range of profile counts and covariance structures,
# returning a data frame of BIC values with descriptive column names.
#
# df               : data to cluster.
# n_profiles_range : candidate numbers of latent profiles (G).
# model_names      : mclust covariance-structure codes to evaluate.
explore_model_fit <- function(df,
                              n_profiles_range = 1:12,
                              model_names = c("EII","VII","EEI","VEI","EVI","VVI","EEE",
                                              "VEE","EVE","VVE","EEV","VEV","EVV","VVV"
                                              )) {
  # Descriptive label for each mclust covariance code.
  model_labels <- c(
    `EII: spherical, equal volume`                             = "EII",
    `VII: spherical, unequal volume`                           = "VII",
    `EEI: diagonal, equal volume and shape`                    = "EEI",
    `VEI: diagonal, varying volume, equal shape`               = "VEI",
    `EVI: diagonal, equal volume, varying shape`               = "EVI",
    `VVI: diagonal, varying volume and shape`                  = "VVI",
    `EEE: ellipsoidal, equal volume, shape, and orientation`   = "EEE",
    `VEE: ellipsoidal, equal shape and orientation`            = "VEE",
    `EVE: ellipsoidal, equal volume and orientation`           = "EVE",
    `VVE: ellipsoidal, equal orientation`                      = "VVE",
    `EEV: ellipsoidal, equal volume and equal shape`           = "EEV",
    `VEV: ellipsoidal, equal shape`                            = "VEV",
    `EVV: ellipsoidal, equal volume`                           = "EVV",
    `VVV: ellipsoidal, varying volume, shape, and orientation` = "VVV"
  )
  set.seed(123)
  x <- mclustBIC(df, G = n_profiles_range, modelNames = model_names)
  # rename(any_of(...)) renames only columns that exist, so the function no
  # longer errors when a subset of model_names is requested (the previous
  # hard-coded rename() required all 14 columns to be present).
  x |>
    as.data.frame.matrix() |>
    rownames_to_column("n_profiles") |>
    rename(any_of(model_labels))
}

# BIC for every covariance structure across 1..12 candidate profiles.
fit_output <- explore_model_fit(key_measures_rep_sub_sample, n_profiles_range = 1:12)

# Long format for plotting; pivot_longer() replaces the superseded gather()
# (row order differs, which the downstream ggplot does not depend on).
to_plot <-
  fit_output |>
  pivot_longer(cols = -n_profiles,
               names_to = "Covariance matrix structure",
               values_to = "val") |>
  mutate(`Covariance matrix structure` = as.factor(`Covariance matrix structure`),
         val = abs(val)) # this is to make the BIC values positive (to align with more common formula / interpretation of BIC)

Elbow plot to test n_clusters

# BIC elbow plot: one line per covariance structure across candidate cluster
# counts (n_profiles arrives as character row names, hence the as.numeric()).
to_plot |>
  tibble() |>
  mutate(n_profiles = as.numeric(n_profiles)) |>

  ggplot(aes(x = n_profiles,
             y = val,
             color = `Covariance matrix structure`,
             group = `Covariance matrix structure`,
             #label = `Covariance matrix structure`
             )) +
  geom_line() +
  geom_point() +
  #geom_label() +
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  theme_minimal() +
  labs(x = "No. clusters",
       y = "BIC (smaller value is better)",
       title = "Comparing LPA model and number of cluster selection",
       subtitle = "Model-based clustering - LPA")

Select model

# Final LPA model: 7 components with the EEV covariance structure, chosen
# from the BIC comparison above.
lpa_model_7 <- Mclust(key_measures_rep_sub_sample, G = 7, modelNames = "EEV")

selected_model <- lpa_model_7
print(summary(selected_model))
---------------------------------------------------- 
Gaussian finite mixture model fitted by EM algorithm 
---------------------------------------------------- 

Mclust EEV (ellipsoidal, equal volume and shape) model with 7 components: 

 log-likelihood     n  df      BIC      ICL
      -826230.8 95952 479 -1657957 -1687013

Clustering table:
    1     2     3     4     5     6     7 
 6025  8138 33153  4017 34403  5276  4940 

Plot model classifications

# Plot model classifications
# NOTE(review): key_measures_scaled_head is not created in this document --
# presumably loaded from a saved workspace; other sections cluster
# key_measures_rep_sub_sample. TODO confirm the intended data object.
dff <- bind_cols(key_measures_scaled_head, classification = selected_model$classification)

# Per-profile mean z-score of each measure for the selected LPA model.
# across() replaces the defunct funs() and superseded mutate_at(); the
# hard-coded 1:7 becomes row_number() so the cluster count can change.
proc_df <-
  dff |>
  mutate(across(-classification, scale)) |>
  group_by(classification) |>
  summarise(across(everything(), mean)) |>
  mutate(classification = paste0("Profile ", row_number())) |>
  mutate(across(-classification, \(x) round(x, 3))) |>
  rename(profile = classification)

# LPA profile means on a shared y scale (the commented-out scales argument
# toggles to the free-scale variant).
lpa_profiles_mean_fixed <-
  proc_df |>
  pivot_longer(cols = -profile) |>
  left_join(key_measure_lookup, by = c("name" = "variable")) |>
  ggplot(aes(x = variable_clean, y = value, fill = variable_clean)) +
  geom_col() +
  facet_wrap(~profile#, scales = "free_y"
             ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90),
        legend.position = "none",
        strip.background = element_rect(fill = NA, colour = "grey")
        ) +
  labs(x = "Variable",
       y = "Mean Zscore",
       title = "Plotting LPA profiles by variable and average Zscore",
       subtitle = "Model-based clustering - LPA")

# LPA profile means with free y scales, then render both variants.
lpa_profile_long <-
  proc_df |>
  pivot_longer(cols = -profile) |>
  left_join(key_measure_lookup, by = c("name" = "variable"))

lpa_profiles_mean_free <-
  ggplot(lpa_profile_long,
         aes(x = variable_clean, y = value, fill = variable_clean)) +
  geom_col() +
  facet_wrap(~profile, scales = "free_y") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90),
        strip.background = element_rect(fill = NA, colour = "grey")) +
  labs(title = "Plotting LPA profiles by variable and average Zscore",
       subtitle = "Model-based clustering - LPA",
       x = "Variable",
       y = "Mean Zscore")
lpa_profiles_mean_fixed

lpa_profiles_mean_free

Step 3: Performing Density-Based Clustering

Optimal value of “eps” parameter

# k-NN distance plot (k = 5): the "knee" in the curve suggests the DBSCAN
# eps value; the dashed line marks the chosen eps = 1.8.
dbscan::kNNdistplot(key_measures_rep_sub_sample, k =  5)
abline(h = 1.8, lty = 2)

Plot DBSCAN results

# Print DBSCAN
#print(db)

# Plot clusters
# NOTE(review): dbscan_plot_fviz is not built in this document -- presumably
# saved in cluster_analysis_dbscan.RData alongside `db`. TODO confirm.
dbscan_plot_fviz

Plot model classifications

# Visualise DBscan clusters
# Average variable scores per DBSCAN cluster (`db` is the fitted object
# loaded from cluster_analysis_dbscan.RData; in the dbscan package, cluster
# 0 denotes noise points).
db_average_scores <- aggregate(key_measures_rep_sub_sample, by = list(cluster = db$cluster), FUN = mean)

library(reshape2)
library(ggplot2)

# Melt to long format for ggplot2 (one row per cluster x variable)
db_melted_scores <- reshape2::melt(db_average_scores, id.vars = "cluster")

# Create a bar plot
# Fixed-scale per-cluster bars of average variable scores.
# geom_col() is the idiomatic shortcut for geom_bar(stat = "identity").
db_clusters_avg_plot_fixed <-
  db_melted_scores |>
  ggplot(aes(x = variable, y = value, fill = variable)) +
  geom_col(position = "dodge") +
  labs(title = "Average Variable Scores by Cluster", x = "Variable", y = "Average Score", fill = "Cluster") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        strip.background = element_rect(fill = NA, colour = "grey"),
        legend.position = "none"
        ) +
  facet_wrap(~factor(cluster))

# Free-scale variant of the per-cluster averages chart, then render both.
# geom_col() is the idiomatic shortcut for geom_bar(stat = "identity").
db_clusters_avg_plot_free <-
  db_melted_scores |>
  ggplot(aes(x = variable, y = value, fill = variable)) +
  geom_col(position = "dodge") +
  labs(title = "Average Variable Scores by Cluster", x = "Variable", y = "Average Score", fill = "Cluster") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        strip.background = element_rect(fill = NA, colour = "grey"),
        legend.position = "none"
        ) +
  facet_wrap(~factor(cluster), scales = "free_y")
db_clusters_avg_plot_fixed

db_clusters_avg_plot_free

Heat map

Step 4: Evaluate Clustering Models

When evaluating which cluster model is most suitable, we isolate the cluster vectors from each of the model objects:

# Isolate cluster vectors from each model
# NOTE(review): lpa_model_5, _6, _8 and _9 are not fitted in this document --
# presumably loaded from cluster_analysis_lpa.RData. TODO confirm.
kmeans_clusters_4  <- kmeans_result_4$cluster
kmeans_clusters_5  <- kmeans_result_5$cluster
kmeans_clusters_6  <- kmeans_result_6$cluster
kmeans_clusters_7  <- kmeans_result_7$cluster
kmeans_clusters_8  <- kmeans_result_8$cluster
kmeans_clusters_9  <- kmeans_result_9$cluster
kmeans_clusters_10 <- kmeans_result_10$cluster
kmeans_clusters_11 <- kmeans_result_11$cluster
kmeans_clusters_12 <- kmeans_result_12$cluster

lpa_clusters_5     <- lpa_model_5$classification
lpa_clusters_6     <- lpa_model_6$classification
lpa_clusters_7     <- lpa_model_7$classification
lpa_clusters_8     <- lpa_model_8$classification
lpa_clusters_9     <- lpa_model_9$classification

# DBSCAN labels (cluster 0 = noise in the dbscan package).
dbscan_clusters    <- db$cluster

Next, we calculate the following measures of model fit on a sample (25%) of each cluster vector:

We calculate 3 metrics:

  1. Silhouette Score
  2. Davies-Bouldin Index (DB Index)
  3. Within-Cluster Sum of Squares (WCSS)
# Function to subsample data
# NOTE(review): duplicate of subsample_data() defined in the setup section;
# kept so this rendered chunk is self-describing. Keep the two in sync.
# fraction: proportion of rows to keep (row count is floored). Seeds the
# global RNG as a side effect (kept for backward compatibility).
subsample_data <- function(data, fraction = 0.25) {
  set.seed(123) # For reproducibility
  sample_indices <- sample.int(nrow(data), size = floor(fraction * nrow(data)))
  # drop = FALSE keeps the data-frame class even for single-column input.
  data[sample_indices, , drop = FALSE]
}

# Evaluate clustering performance on a subsample
# NOTE(review): duplicate of the function defined in the setup section;
# keep the two in sync.
evaluate_clustering_subsample <- function(data, clusters, fraction = 0.25) {
  # Sample row indices so data and labels stay aligned. The original code
  # subsampled the data but paired it with the FIRST n cluster labels,
  # scoring sampled rows against labels belonging to other observations.
  set.seed(123) # same seed and draw as subsample_data(), for reproducibility
  idx <- sample.int(nrow(data), size = floor(fraction * nrow(data)))
  subsample <- data[idx, , drop = FALSE]
  subsample_clusters <- as.numeric(clusters[idx])

  # Silhouette: mean width over the subsampled points.
  dist_matrix <- dist(subsample)
  sil <- silhouette(subsample_clusters, dist_matrix)
  silhouette_score <- mean(sil[, 3])

  # Davies-Bouldin index (index.DB is from clusterSim -- not attached in the
  # setup chunk; presumably loaded elsewhere. TODO confirm).
  db_index <- index.DB(subsample, subsample_clusters)$DB

  # WCSS: refit k-means on the subsample with k = number of observed labels.
  wcss <- sum(kmeans(subsample, centers = length(unique(subsample_clusters)))$withinss)

  list(silhouette_score = silhouette_score, db_index = db_index, wcss = wcss)
}

Explanation of Metrics:

Silhouette Score:

  • Interpretation: Measures how similar an object is to its own cluster compared to other clusters. Values range from -1 to 1.
  • Higher values indicate better-defined clusters. Negative values suggest that samples might be assigned to the wrong cluster.

DB Index (Davies-Bouldin Index):

  • Interpretation: Measures the average similarity ratio of each cluster with the cluster that is most similar to it. Lower values are better.
  • Lower values indicate better clustering. Higher values suggest that clusters are not well separated.

WCSS (Within-Cluster Sum of Squares):

  • Interpretation: Measures the total variance within each cluster. Lower values are better.
  • Lower values indicate that clusters are more compact. Higher values suggest that clusters are more spread out.
# Calculate evaluation metrics for each model using a subsample
# Score every candidate model on the same 25% subsample. Each result becomes
# a one-row data frame tagged with the cluster-vector name and is assigned
# to the matching *_eval variable (kmeans_clusters_4 -> kmeans_eval_4, ...,
# dbscan_clusters -> dbscan_eval), exactly as the original per-line calls.
cluster_vector_names <- c(paste0("kmeans_clusters_", 4:12),
                          paste0("lpa_clusters_", 5:9),
                          "dbscan_clusters")
for (cl_name in cluster_vector_names) {
  assign(
    sub("clusters", "eval", cl_name),
    evaluate_clustering_subsample(key_measures_rep_sub_sample, get(cl_name)) |>
      as.data.frame() |>
      mutate(id = cl_name)
  )
}

For ease of comparison, we combine each of the eval-objects into a single table:

# Union all eval outputs to visualise and compare.
# bind_rows() replaces the long union_all() chain, and as_tibble() is the
# documented data-frame-to-tibble conversion (the original piped into
# tibble(), which is a constructor rather than a converter).
model_evaluation_scores <-
  mget(c(paste0("kmeans_eval_", 4:12),
         paste0("lpa_eval_", 5:9),
         "dbscan_eval")) |>
  bind_rows() |>
  as_tibble()

We can then visualise the evaluation metrics:

# Compare the three fit metrics across every candidate model, grouped by
# clustering family. The factor levels are built programmatically so new
# models slot in automatically; facet_wrap's argument is `scales` (the
# original `scale =` relied on partial argument matching).
model_id_levels <- c(paste0("kmeans_clusters_", 4:12),
                     paste0("lpa_clusters_", 5:9),
                     "dbscan_clusters")

model_evaluation_scores |>
  pivot_longer(cols = -id) |>
  mutate(group =
           case_when(id == "dbscan_clusters" ~ "Density-based",
                     str_detect(id, "kmeans") ~ "Distance-based",
                     str_detect(id, "lpa") ~ "Model-based")
         ) |>
  mutate(id = factor(id, levels = model_id_levels)) |>
  ggplot(aes(x = id, y = value, colour = group)) +
  geom_point(size = 3) +
  facet_wrap(~name, scales = "free_y") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5),
        axis.title.y = element_blank(),
        strip.background = element_rect(fill = NA, colour = "grey"),
        #legend.position = "none"
        ) +
  labs(x = "Cluster model",
       colour = "Cluster type:",
       title = "Model comparison",
       subtitle = "DB-index: Lower values are better
Silhouette score: Higher values better
Within-cluster Sum of Squares: Lower values indicate clusters are more compact"
       )

From the above visualisation, we can note:

  • Silhouette Score: lpa_clusters_5 has the highest score, indicating well-defined clusters.
  • DB Index: dbscan_clusters has the lowest index, indicating well-separated clusters.
  • WCSS: kmeans_clusters_12 has the lowest WCSS, indicating compact clusters.

In the absence of a model that performs well across multiple metrics, we can normalise each metric and create a composite measure. In doing so, we sum the normalised scores across the 3 measures of model fit and divide by 3 to calculate the composite score.

An alternative approach is to sum the normalised scores and apply explicit weightings; taking a simple average instead embeds the assumption that each metric is equally weighted.

# Approach 1: creation of composite score by averaging normalised metrics.
# Min-max normalise each metric onto [0, 1] with 1 = best, then average the
# three normalised values into an equal-weight composite score.
rescale_01 <- function(x, higher_is_better = TRUE) {
  if (higher_is_better) {
    (x - min(x)) / (max(x) - min(x))
  } else {
    (max(x) - x) / (max(x) - min(x))
  }
}

normalised_evaluation_scores <-
  model_evaluation_scores |>
  mutate(
    norm_silhouette = rescale_01(silhouette_score),
    norm_db_index = rescale_01(db_index, higher_is_better = FALSE),
    norm_wcss = rescale_01(wcss, higher_is_better = FALSE)
  ) |>
  mutate(composite_score = (norm_silhouette + norm_db_index + norm_wcss) / 3) |>
  dplyr::select(id, everything()) |>
  arrange(desc(composite_score))

DT::datatable(normalised_evaluation_scores)

The above composite score implies that the k-means model with 12 clusters is the most suitable model, however whether or not these clusters suit our understanding and intended use of the model is open for debate and adjustment.

K-means (12 cluster) model output:

# NOTE(review): the *_12 plot objects are not created in this document --
# presumably saved in cluster_analysis_kmeans.RData. TODO confirm.
kmeans_classification_means_plot_fixed_12

kmeans_classification_means_plot_free_12