```{r setup}
knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE,
  fig.width = 12, fig.height = 9
)

library(tidyverse)
library(GGally)
library(factoextra)
library(plotly)
library(tidyLPA)
library(mclust)
library(DT)
library(cluster)    # silhouette()
library(clusterSim) # index.DB()

# Look ups ----
key_measure_lookup <- tribble(
  ~variable,                   ~variable_clean,
  "person_id",                 "0. Person ID",
  "care_contacts",             "A. Care contacts",
  "referrals",                 "B. Referrals",
  "length_care",               "C. Care length",
  "average_daily_contacts",    "D. Average daily contacts",
  "contact_prop_aft_midpoint", "E. Contact proportion after midpoint",
  "team_input_count",          "F. Team input count",
  "intermittent_care_periods", "G. Intermittent care periods",
  "specialist_contact_prop",   "H. Specialist care proportion",
  "remote_contact_prop",       "I. Remote care proportion",
  "avg_contact_duration",      "J. Average contact duration",
  "acute_admissions_2223",     "K. Acute admission count"
)

# Load the data and saved objects ----
remove_outliers <- read_csv("remove_outliers_rep_sample.csv")

key_measures_rep_sub_sample <- read_csv("key_measures_sample_cluster.csv") |>
  anti_join(remove_outliers, by = "person_id") |>
  dplyr::select(3:13) |>
  as.data.frame()

# load("cluster_analysis.RData")
load("cluster_analysis_kmeans.RData")
load("cluster_analysis_lpa.RData")
load("cluster_analysis_dbscan.RData")
load("cluster_analysis_evaluations.RData")

# Perform Principal Component Analysis (PCA)
pca_result <- prcomp(key_measures_rep_sub_sample, scale. = TRUE)

# Create a dataframe with the principal components
pca_data <- as.data.frame(pca_result$x) |>
  mutate(cluster = as.factor(kmeans_result_11$cluster)) # Add cluster assignments to the dataframe

# Functions ----
create_dt <- function(x) {
  DT::datatable(
    x,
    extensions = "Buttons",
    options = list(
      dom = "Blfrtip",
      buttons = c("copy", "csv"),
      lengthMenu = list(
        c(10, 25, 50, -1),
        c(10, 25, 50, "All")
      ),
      width = "400px",  # Specify the width
      height = "200px"  # Specify the height
    )
  )
}

## kmeans
link_model_output <- function(model, model_id) {
  model$tot.withinss |>
    tibble() |>
    rename(tot.withinss = 1) |>
    mutate(id = model_id)
}

wssplot <- function(data, nc = 10, seed = 1234, max_iter = 100) {
  wss <- numeric(nc)
  wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  set.seed(seed)
  for (i in 2:nc) {
    wss[i] <- sum(kmeans(data, centers = i, nstart = 25, iter.max = max_iter)$withinss)
  }
  plot(1:nc, wss, type = "b",
       xlab = "Number of Clusters",
       ylab = "Within Group Sum of Squares",
       main = "Elbow Method Plot to Find Optimal Number of Clusters",
       frame.plot = TRUE, col = "blue", lwd = 1.5)
}

## LPA

## Evaluation

### Function to subsample data
subsample_data <- function(data, fraction = 0.25) {
  set.seed(123) # For reproducibility
  sample_indices <- sample(1:nrow(data), size = floor(fraction * nrow(data)))
  return(data[sample_indices, ])
}

### Evaluate clustering performance on a subsample
evaluate_clustering_subsample <- function(data, clusters, fraction = 0.25) {
  # Subsample the data, keeping the cluster labels aligned with the sampled
  # rows (indexing clusters by 1:nrow(subsample) would mis-align labels and rows)
  set.seed(123) # For reproducibility
  sample_indices <- sample(1:nrow(data), size = floor(fraction * nrow(data)))
  subsample <- data[sample_indices, ]

  # Ensure clusters are numeric
  subsample_clusters <- as.numeric(clusters[sample_indices])

  # Calculate silhouette score
  dist_matrix <- dist(subsample)
  sil <- silhouette(subsample_clusters, dist_matrix)
  silhouette_score <- mean(sil[, 3])

  # Calculate Davies-Bouldin index
  db_index <- index.DB(subsample, subsample_clusters)$DB

  # Calculate within-cluster sum of squares (WCSS)
  wcss <- sum(kmeans(subsample, centers = length(unique(subsample_clusters)))$withinss)

  return(list(silhouette_score = silhouette_score, db_index = db_index, wcss = wcss))
}

### Function to identify the optimum model
select_optimum_model <- function(kmeans_eval, lpa_eval, dbscan_eval) {
  # Combine evaluation metrics into a data frame
  eval_metrics <- data.frame(
    model = c("K-means", "LPA", "DBSCAN"),
    silhouette_score = c(kmeans_eval$silhouette_score, lpa_eval$silhouette_score, dbscan_eval$silhouette_score),
    db_index = c(kmeans_eval$db_index, lpa_eval$db_index, dbscan_eval$db_index),
    wcss = c(kmeans_eval$wcss, lpa_eval$wcss, dbscan_eval$wcss)
  )

  # Normalise the metrics for comparison (higher silhouette score is better,
  # lower DB index and WCSS are better)
  eval_metrics$normalized_silhouette <- eval_metrics$silhouette_score / max(eval_metrics$silhouette_score)
  eval_metrics$normalized_db_index <- min(eval_metrics$db_index) / eval_metrics$db_index
  eval_metrics$normalized_wcss <- min(eval_metrics$wcss) / eval_metrics$wcss

  # Calculate a combined score (the weights can be adjusted as needed)
  eval_metrics$combined_score <- eval_metrics$normalized_silhouette +
    eval_metrics$normalized_db_index +
    eval_metrics$normalized_wcss

  # Identify the model with the highest combined score
  optimum_model <- eval_metrics[which.max(eval_metrics$combined_score), "model"]
  return(optimum_model)
}
```
In this re-organised cluster analysis output, we run multiple cluster models, varying both the clustering method and the number of clusters (n_clusters), and then evaluate which model is most suitable.
Having selected the most suitable model, based on our evaluation metrics and discussions with the wider analytical team, we then present only that approach, giving details of the underlying methods.
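The per-model evaluation objects (`kmeans_eval_4` through `dbscan_eval`) are loaded from `cluster_analysis_evaluations.RData` in the setup chunk. As a minimal sketch of how one such object could be produced with the `evaluate_clustering_subsample()` helper (the `as_tibble()`/`mutate()` shaping is our assumption, inferred from the `id` column used later in this section):

```{r}
# Hypothetical sketch: build one evaluation object from a fitted model.
# kmeans_result_11 is the saved 11-cluster k-means fit loaded from RData;
# the id value matches the naming used in the comparison plot below.
kmeans_eval_11 <- evaluate_clustering_subsample(
  data     = key_measures_rep_sub_sample,
  clusters = kmeans_result_11$cluster
) |>
  as_tibble() |>
  mutate(id = "kmeans_clusters_11")
```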
For ease of comparison, we combine the evaluation objects into a single table:
```{r}
# Union all eval outputs to visualise and compare
model_evaluation_scores <- kmeans_eval_4 |>
  union_all(kmeans_eval_5) |>
  union_all(kmeans_eval_6) |>
  union_all(kmeans_eval_7) |>
  union_all(kmeans_eval_8) |>
  union_all(kmeans_eval_9) |>
  union_all(kmeans_eval_10) |>
  union_all(kmeans_eval_11) |>
  union_all(kmeans_eval_12) |>
  union_all(lpa_eval_5) |>
  union_all(lpa_eval_6) |>
  union_all(lpa_eval_7) |>
  union_all(lpa_eval_8) |>
  union_all(lpa_eval_9) |>
  union_all(dbscan_eval) |>
  as_tibble() # as_tibble() keeps the columns flat; tibble() would pack the data frame
```
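Optionally, the combined table can be rendered interactively with the `create_dt()` helper defined in the setup chunk (whether the original report displayed it this way is an assumption on our part):

```{r}
# Interactive, exportable view of the combined evaluation table
create_dt(model_evaluation_scores)
```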
We can then visualise the evaluation metrics:
```{r}
model_evaluation_scores |>
  pivot_longer(cols = -id) |>
  mutate(
    group = case_when(
      id == "dbscan_clusters"  ~ "Density-based",
      str_detect(id, "kmeans") ~ "Distance-based",
      str_detect(id, "lpa")    ~ "Model-based"
    )
  ) |>
  mutate(
    id = factor(
      id,
      levels = c(
        "kmeans_clusters_4",
        "kmeans_clusters_5",
        "kmeans_clusters_6",
        "kmeans_clusters_7",
        "kmeans_clusters_8",
        "kmeans_clusters_9",
        "kmeans_clusters_10",
        "kmeans_clusters_11",
        "kmeans_clusters_12",
        "lpa_clusters_5",
        "lpa_clusters_6",
        "lpa_clusters_7",
        "lpa_clusters_8",
        "lpa_clusters_9",
        "dbscan_clusters"
      )
    )
  ) |>
  mutate(
    # Highlight the best-performing model on each metric
    alpha = case_when(
      name == "silhouette_score" & id == "lpa_clusters_5"     ~ "1",
      name == "wcss"             & id == "kmeans_clusters_12" ~ "1",
      name == "db_index"         & id == "dbscan_clusters"    ~ "1",
      TRUE                                                    ~ "0"
    )
  ) |>
  ggplot() +
  geom_point(
    aes(x = id, y = value, fill = group, colour = group,
        alpha = alpha, shape = alpha, size = alpha),
    show.legend = c(alpha = FALSE, size = FALSE, fill = FALSE)
  ) +
  facet_wrap(~name, scales = "free_y") +
  scale_shape_manual(values = c(21, 22)) +
  scale_size_manual(values = c(2.5, 6)) +
  scale_alpha_manual(values = c(0.5, 1)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
    axis.title.y = element_blank(),
    strip.background = element_rect(fill = NA, colour = "grey"),
    strip.text = element_text(face = "bold")
  ) +
  labs(
    x = "Cluster model",
    colour = "Cluster type:",
    title = "Model comparison",
    subtitle = paste(
      "DB-index: Lower values are better",
      "Silhouette score: Higher values are better",
      "Within-cluster Sum of Squares: Lower values indicate clusters are more compact",
      sep = "\n"
    )
  )
```
From the above visualisation, we can note:
- **Silhouette score:** `lpa_clusters_5` has the highest score, indicating well-defined clusters.
- **DB index:** `dbscan_clusters` has the lowest index, indicating well-separated clusters.
- **WCSS:** `kmeans_clusters_12` has the lowest value, indicating compact clusters.
In the absence of a model that performs well across all three metrics, we can normalise each metric and create a composite measure. To do so, we sum the normalised scores across the three measures of model fit and divide by three to calculate the composite score.
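Written out explicitly (this notation is ours, mirroring the normalisation in `select_optimum_model()` above): for model $m$ with silhouette score $s_m$, Davies-Bouldin index $d_m$ and within-cluster sum of squares $w_m$,

$$
\text{composite}_m = \frac{1}{3}\left(\frac{s_m}{\max_k s_k} + \frac{\min_k d_k}{d_m} + \frac{\min_k w_k}{w_m}\right)
$$

Each term equals 1 for the best-performing model on that metric and shrinks towards 0 as performance worsens, so the composite lies in $(0, 1]$.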
An alternative approach is to sum the normalised scores and apply explicit weightings; taking a simple average instead encodes the assumption that each metric carries equal weight.
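As a sketch of that calculation applied to the combined table (the column names follow the `model_evaluation_scores` object built above; the report's own composite chunk is not shown here):

```{r}
# Normalise each metric so that 1 represents the best-performing model,
# then average: higher silhouette is better, lower DB index and WCSS are better.
composite_scores <- model_evaluation_scores |>
  mutate(
    norm_silhouette = silhouette_score / max(silhouette_score),
    norm_db_index   = min(db_index) / db_index,
    norm_wcss       = min(wcss) / wcss,
    composite_score = (norm_silhouette + norm_db_index + norm_wcss) / 3
  ) |>
  arrange(desc(composite_score))
```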
The above composite score implies that the k-means model with 12 clusters is the most suitable; however, whether these clusters suit our understanding and intended use of the model remains open for debate and adjustment.
Also of note, the k-means model with 12 clusters raises a non-convergence warning, implying instability in the model and steering us towards the next best option: the k-means model with 11 clusters.
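As a quick diagnostic, `kmeans()` stores a convergence flag on the fitted object, which can be checked directly (we assume here that the saved fits follow the `kmeans_result_*` naming used for `kmeans_result_11` in the setup chunk; `kmeans_result_12` is hypothetical):

```{r}
# ifault is kmeans()'s problem indicator: 0 means a clean fit, while non-zero
# values flag a possible issue (e.g. failure to converge within iter.max).
kmeans_result_12$ifault # hypothetical 12-cluster fit; expected non-zero here
kmeans_result_11$ifault # 11-cluster fit used for the PCA plot in the setup chunk
```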