Cluster analysis approaches

Author

Alexander Lawless

Published

14-03-2025

expand for setup code chunk
knitr::opts_chunk$set(echo = FALSE, warning=FALSE, message=FALSE, fig.width =12, fig.height = 9)

library(tidyverse)
library(GGally)
library(factoextra)
library(plotly)
library(tidyLPA)
library(mclust)
library(DT)

# Look ups ----
key_measure_lookup <-
  tribble(
    ~variable, ~variable_clean,
    "person_id"                 , "0. Person ID",
    "care_contacts"             , "A. Care contacts",
    "referrals"                 , "B. Referrals",
    "length_care"               , "C. Care length",
    "average_daily_contacts"    , "D. Average daily contacts",
    "contact_prop_aft_midpoint" , "E. Contact proportion after midpoint",
    "team_input_count"          , "F. Team input count",
    "intermittent_care_periods" , "G. Intermittent care periods",
    "specialist_contact_prop"   , "H. Specialist care proportion",
    "remote_contact_prop"       , "I. Remote care proportion",
    "avg_contact_duration"      , "J. Average contact duration",
    "acute_admissions_2223"     , "K. Acute admission count"
  )

# Load the data and saved objects ----

remove_outliers <- read_csv("remove_outliers_rep_sample.csv")

key_measures_rep_sub_sample <-
  read_csv("key_measures_sample_cluster.csv") |>
  anti_join(remove_outliers, by = "person_id") |>
  dplyr::select(3:13) |>
  as.data.frame()

#load("cluster_analysis.RData")

load("cluster_analysis_kmeans.RData")
load("cluster_analysis_lpa.RData")
load("cluster_analysis_dbscan.RData")
load("cluster_analysis_evaluations.RData")

# Perform Principal Component Analysis (PCA)
pca_result <-
  prcomp(key_measures_rep_sub_sample, scale. = TRUE)

# Create a dataframe with the principal components
pca_data <-
  as.data.frame(pca_result$x) |>
  mutate(cluster = as.factor(kmeans_result_11$cluster)) # Add cluster assignments to the dataframe

# Functions ----

create_dt <- function(x) {
    
    DT::datatable(
        x
        , extensions = "Buttons"
        , options = list(
            dom = "Blfrtip"
            , buttons = c("copy", "csv")
            , lengthMenu = list(
                c(10, 25, 50, -1)
                , c(10, 25, 50, "All")
                ),
            width = "400px",  # Specify the width
            height = "200px" # Specify the height
            )
        )
}

## kmeans 

link_model_output <- function(model, model_id) {

  model$tot.withinss |>
    tibble() |>
    rename(tot.withinss = 1) |>
    mutate(id = model_id)
}

wssplot <- function(data, nc = 10, seed = 1234, max_iter = 100) {
  wss <- numeric(nc)
  wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  set.seed(seed)
  for (i in 2:nc) {
    wss[i] <- sum(kmeans(data, centers = i, nstart = 25, iter.max = max_iter)$withinss)
  }
  plot(1:nc, wss, type = 'b', xlab = 'Number of Clusters', ylab = 'Within Group Sum of Squares',
       main = 'Elbow Method Plot to Find Optimal Number of Clusters', frame.plot = TRUE,
       col = 'blue', lwd = 1.5)
}

## LPA

## Evaluation
### Function to subsample data
subsample_data <- function(data, fraction = 0.25) {
  set.seed(123) # For reproducibility
  sample_indices <- sample(1:nrow(data), size = floor(fraction * nrow(data)))
  return(data[sample_indices, ])
}

### Evaluate clustering performance on a subsample
evaluate_clustering_subsample <- function(data, clusters, fraction = 0.25) {
  # Subsample the data
  subsample <- subsample_data(data, fraction)
  subsample_clusters <- clusters[1:nrow(subsample)]

  # Ensure clusters are numeric
  subsample_clusters <- as.numeric(subsample_clusters)

  # Calculate silhouette score
  dist_matrix <- dist(subsample)
  sil <- silhouette(subsample_clusters, dist_matrix)
  silhouette_score <- mean(sil[, 3])

  # Calculate Davies-Bouldin index
  db_index <- index.DB(subsample, subsample_clusters)$DB

  # Calculate within-cluster sum of squares (WCSS)
  wcss <- sum(kmeans(subsample, centers = length(unique(subsample_clusters)))$withinss)

  return(list(silhouette_score = silhouette_score, db_index = db_index, wcss = wcss))
}

### Function to identify the optimum model
select_optimum_model <- function(kmeans_eval, lpa_eval, dbscan_eval) {
  # Combine evaluation metrics into a data frame
  eval_metrics <- data.frame(
    model = c("K-means", "LPA", "DBSCAN"),
    silhouette_score = c(kmeans_eval$silhouette_score, lpa_eval$silhouette_score, dbscan_eval$silhouette_score),
    db_index = c(kmeans_eval$db_index, lpa_eval$db_index, dbscan_eval$db_index),
    wcss = c(kmeans_eval$wcss, lpa_eval$wcss, dbscan_eval$wcss)
  )

  # Normalize the metrics for comparison (higher silhouette score is better, lower DB index and WCSS are better)
  eval_metrics$normalized_silhouette <- eval_metrics$silhouette_score / max(eval_metrics$silhouette_score)
  eval_metrics$normalized_db_index <- min(eval_metrics$db_index) / eval_metrics$db_index
  eval_metrics$normalized_wcss <- min(eval_metrics$wcss) / eval_metrics$wcss

  # Calculate a combined score (you can adjust the weights as needed)
  eval_metrics$combined_score <- eval_metrics$normalized_silhouette + eval_metrics$normalized_db_index + eval_metrics$normalized_wcss

  # Identify the model with the highest combined score
  optimum_model <- eval_metrics[which.max(eval_metrics$combined_score), "model"]

  return(optimum_model)
}

In this re-organised cluster analysis output, we run multiple cluster models, of varying method and n_clusters, then evaluate the most suitable model.

Having selected the most suitable model, based out evaluation metrics and discussions with the wider analytical team, we then present only that approach, giving details of underlying methods.

Cluster models

set.seed(123)  # For reproducibility
kmeans_result_4 <-  kmeans(key_measures_rep_sub_sample, centers = 4, nstart = 100, iter.max = 100) 
kmeans_result_5 <-  kmeans(key_measures_rep_sub_sample, centers = 5, nstart = 100, iter.max = 100) 
kmeans_result_6 <-  kmeans(key_measures_rep_sub_sample, centers = 6, nstart = 100, iter.max = 100) 
kmeans_result_7 <-  kmeans(key_measures_rep_sub_sample, centers = 7, nstart = 100, iter.max = 100) 
kmeans_result_8 <-  kmeans(key_measures_rep_sub_sample, centers = 8, nstart = 100, iter.max = 100) 
kmeans_result_9 <-  kmeans(key_measures_rep_sub_sample, centers = 9, nstart = 100, iter.max = 100) 
kmeans_result_10 <- kmeans(key_measures_rep_sub_sample, centers = 10, nstart = 100, iter.max = 100)
kmeans_result_11 <- kmeans(key_measures_rep_sub_sample, centers = 11, nstart = 100, iter.max = 100)
kmeans_result_12 <- kmeans(key_measures_rep_sub_sample, centers = 12, nstart = 100, iter.max = 100)
lpa_model_5 <- Mclust(key_measures_rep_sub_sample, G = 5, modelNames = "EEV")
lpa_model_6 <- Mclust(key_measures_rep_sub_sample, G = 6, modelNames = "EEV")
lpa_model_7 <- Mclust(key_measures_rep_sub_sample, G = 7, modelNames = "EEV")
lpa_model_8 <- Mclust(key_measures_rep_sub_sample, G = 8, modelNames = "EEV")
lpa_model_9 <- Mclust(key_measures_rep_sub_sample, G = 9, modelNames = "EEV")
db <- fpc::dbscan(key_measures_rep_sub_sample, eps = 1.8, MinPts = 5)

Model evaluation

When evaluating which cluster model is most suitable, we isolate the cluster vectors from each of the model objects:

kmeans_clusters_4  <- kmeans_result_4$cluster
kmeans_clusters_5  <- kmeans_result_5$cluster
kmeans_clusters_6  <- kmeans_result_6$cluster
kmeans_clusters_7  <- kmeans_result_7$cluster
kmeans_clusters_8  <- kmeans_result_8$cluster
kmeans_clusters_9  <- kmeans_result_9$cluster
kmeans_clusters_10 <- kmeans_result_10$cluster
kmeans_clusters_11 <- kmeans_result_11$cluster
kmeans_clusters_12 <- kmeans_result_12$cluster
lpa_clusters_5     <- lpa_model_5$classification
lpa_clusters_6     <- lpa_model_6$classification
lpa_clusters_7     <- lpa_model_7$classification
lpa_clusters_8     <- lpa_model_8$classification
lpa_clusters_9     <- lpa_model_9$classification
dbscan_clusters    <- db$cluster

Next, we calculate the following measures of model fit on samples (25%) of each cluster vectors:

We calculate 3 metrics:

  1. Silhouette Score
  2. Davies-Bouldin Index (DB Index)
  3. Within-Cluster Sum of Squares (WCSS)
# Function to subsample data
subsample_data <- function(data, fraction = 0.25) {
  set.seed(123) # For reproducibility
  sample_indices <- sample(1:nrow(data), size = floor(fraction * nrow(data)))
  return(data[sample_indices, ])
}

# Evaluate clustering performance on a subsample
evaluate_clustering_subsample <- function(data, clusters, fraction = 0.25) {
  # Subsample the data
  subsample <- subsample_data(data, fraction)
  subsample_clusters <- clusters[1:nrow(subsample)]

  # Ensure clusters are numeric
  subsample_clusters <- as.numeric(subsample_clusters)

  # Calculate silhouette score
  dist_matrix <- dist(subsample)
  sil <- silhouette(subsample_clusters, dist_matrix)
  silhouette_score <- mean(sil[, 3])

  # Calculate Davies-Bouldin index
  db_index <- index.DB(subsample, subsample_clusters)$DB

  # Calculate within-cluster sum of squares (WCSS)
  wcss <- sum(kmeans(subsample, centers = length(unique(subsample_clusters)))$withinss)

  return(list(silhouette_score = silhouette_score, db_index = db_index, wcss = wcss))
}

Explanation of Metrics:

Silhouette Score:

  • Interpretation: Measures how similar an object is to its own cluster compared to other clusters. Values range from -1 to 1.
  • Higher values indicate better-defined clusters. Negative values suggest that samples might be assigned to the wrong cluster.

DB Index (Davies-Bouldin Index):

  • Interpretation: Measures the average similarity ratio of each cluster with the cluster that is most similar to it. Lower values are better.
  • Lower values indicate better clustering. Higher values suggest that clusters are not well separated.

WCSS (Within-Cluster Sum of Squares):

  • Interpretation: Measures the total variance within each cluster. Lower values are better.
  • Lower values indicate that clusters are more compact. Higher values suggest that clusters are more spread out.
# Calculate evaluation metrics for each model using a subsample
kmeans_eval_4  <-  evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_4 ) |> as.data.frame() |> mutate(id = "kmeans_clusters_4")
kmeans_eval_5  <-  evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_5 ) |> as.data.frame() |> mutate(id = "kmeans_clusters_5")
kmeans_eval_6  <-  evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_6 ) |> as.data.frame() |> mutate(id = "kmeans_clusters_6")
kmeans_eval_7  <-  evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_7 ) |> as.data.frame() |> mutate(id = "kmeans_clusters_7")
kmeans_eval_8  <-  evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_8 ) |> as.data.frame() |> mutate(id = "kmeans_clusters_8")
kmeans_eval_9  <-  evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_9 ) |> as.data.frame() |> mutate(id = "kmeans_clusters_9")
kmeans_eval_10 <- evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_10) |> as.data.frame() |> mutate(id = "kmeans_clusters_10")
kmeans_eval_11 <- evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_11) |> as.data.frame() |> mutate(id = "kmeans_clusters_11")
kmeans_eval_12 <- evaluate_clustering_subsample(key_measures_rep_sub_sample, kmeans_clusters_12) |> as.data.frame() |> mutate(id = "kmeans_clusters_12")
lpa_eval_5    <- evaluate_clustering_subsample(key_measures_rep_sub_sample, lpa_clusters_5) |> as.data.frame() |> mutate(id = "lpa_clusters_5")
lpa_eval_6    <- evaluate_clustering_subsample(key_measures_rep_sub_sample, lpa_clusters_6) |> as.data.frame() |> mutate(id = "lpa_clusters_6")
lpa_eval_7    <- evaluate_clustering_subsample(key_measures_rep_sub_sample, lpa_clusters_7) |> as.data.frame() |> mutate(id = "lpa_clusters_7")
lpa_eval_8    <- evaluate_clustering_subsample(key_measures_rep_sub_sample, lpa_clusters_8) |> as.data.frame() |> mutate(id = "lpa_clusters_8")
lpa_eval_9    <- evaluate_clustering_subsample(key_measures_rep_sub_sample, lpa_clusters_9) |> as.data.frame() |> mutate(id = "lpa_clusters_9")
dbscan_eval   <- evaluate_clustering_subsample(key_measures_rep_sub_sample, dbscan_clusters) |> as.data.frame() |> mutate(id = "dbscan_clusters")

For ease of comparison, we combine each of the eval-objects into a single table:

# Union all eval outputs to visualise and compare
model_evaluation_scores <-
  kmeans_eval_4 |>
  union_all(kmeans_eval_5 ) |>
  union_all(kmeans_eval_6 ) |>
  union_all(kmeans_eval_7 ) |>
  union_all(kmeans_eval_8 ) |>
  union_all(kmeans_eval_9 ) |>
  union_all(kmeans_eval_10) |>
  union_all(kmeans_eval_11) |>
  union_all(kmeans_eval_12) |>
  union_all(lpa_eval_5) |>
  union_all(lpa_eval_6) |>
  union_all(lpa_eval_7) |>
  union_all(lpa_eval_8) |>
  union_all(lpa_eval_9) |>
  union_all(dbscan_eval) |>
  tibble()

We can then visualise the evaluation metrics:

expand for setup code chunk
model_evaluation_scores |>
  pivot_longer(cols = -id) |>
  mutate(group =
           case_when(id == "dbscan_clusters" ~ "Density-based",
                     str_detect(id, "kmeans") ~ "Distance-based",
                     str_detect(id, "lpa") ~ "Model-based")
         ) |>
  mutate(
    id =
      factor(id,
             levels = c("kmeans_clusters_4",
                        "kmeans_clusters_5",
                        "kmeans_clusters_6",
                        "kmeans_clusters_7",
                        "kmeans_clusters_8",
                        "kmeans_clusters_9",
                        "kmeans_clusters_10",
                        "kmeans_clusters_11",
                        "kmeans_clusters_12",
                        "lpa_clusters_5",
                        "lpa_clusters_6",
                        "lpa_clusters_7",
                        "lpa_clusters_8",
                        "lpa_clusters_9",
                        "dbscan_clusters")
             )) |>
  mutate(alpha = 
           case_when(
             name == "silhouette_score" & id == "lpa_clusters_5" ~ "1",
             name == "wcss" & id == "kmeans_clusters_12" ~ "1",
             name == "db_index" & id == "dbscan_clusters" ~ "1",
             TRUE ~ "0"
           )) |>
  
  ggplot() +
  geom_point(aes(x = id, y = value, 
                 fill = group, colour = group, 
                 alpha = alpha, shape = alpha, size = alpha),
             show.legend = c(alpha = FALSE, size = FALSE, fill = FALSE)
             ) +
  facet_wrap(~name, scale = "free_y") +
  scale_shape_manual(values = c(21,22)) +
  scale_size_manual(values = c(2.5, 6)) +
  scale_alpha_manual(values = c(0.5,1)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
        axis.title.y = element_blank(),
        strip.background = element_rect(fill = NA, colour = "grey"),
        strip.text = element_text(face = "bold")
        ) +
  labs(x = "Cluster model",
       colour = "Cluster type:",
       title = "Model comparison",
       subtitle = "DB-index: Lower values are better
Silhouette score: Higher values better
Within-cluster Sum of Squares: Lower values indicate clusters are more compact"
  )

From the above visualisation, we can note:

  • Silhouette Score: lpa_clusters_5 has the highest score, indicating well-defined clusters.
  • DB Index: dbscan_clusters has the lowest index, indicating well-separated clusters.
  • WCSS: kmeans_clusters_12 has the lowest WCSS, indicating compact clusters.

In the absence of a well-performing model across multiple metrics, we can normalise each metric and create a composite measure. In doing so, we sum normalised scores across the 3 measures of model fit and divide by 3 to calculate the composite score.

An alternative approach to to sum the normalised score and apply a weighting but taking an average incorporates the assumption that each metric is equally weighted.

The above composite score implies that the k-means model with 12 clusters is the most suitable model, however whether or not these clusters suit our understanding and intended use of the model is open for debate and adjustment.

Also of note, kmeans model with 12 clusters shows a warning of no convergence, implying instability in the model and steering us towards the next best option - kmeans model with 11 clusters.

Detail selected model

Count by cluster

 [1] 10875  9872  4888  8375  5189 17620  7896  3501  5215  7955 14566

Cluster details

> print(kmeans_result_11)
K-means clustering with 11 clusters of sizes 10875, 9872, 4888, 8375, 5189, 17620, 7896, 3501, 5215, 7955, 14566


Within cluster sum of squares by cluster:
  
 [1] 37736.67 30548.32 26365.43 31510.54 26118.11 33721.66 38711.28 34652.79 44701.08 45893.40 47017.39
 
(between_SS / total_SS =  53.0 %)

Visualise clusters

# Perform Principal Component Analysis (PCA)
pca_result <-
  prcomp(key_measures_rep_sub_sample, scale. = TRUE)

# Create a dataframe with the principal components
pca_data <-
  as.data.frame(pca_result$x) |>
  mutate(cluster = as.factor(kmeans_result_11$cluster)) # Add cluster assignments to the dataframe

# Create pairwise scatter plot matrix
ggpairs_plot <- ggpairs(pca_data, aes(color = cluster, alpha = 0.5))

ggpairs_plot

3D plot of top 3 principle components

Interpret clusters

Cluster means

K-means (11 cluster) model output:

kmeans_classification_means_plot_fixed_11

kmeans_classification_means_plot_free_11