## 1. **Determine the Optimal Number of Clusters**:
# Elbow method on the representative sub-sample.
# NOTE(review): wssplot() is defined elsewhere; presumably it plots the
# within-cluster sum of squares against the number of clusters - confirm.
# Disabled alternative using factoextra:
# fviz_nbclust(key_measures_scaled_head, kmeans, method = "wss") # Elbow Method
wssplot(key_measures_rep_sub_sample)
# K-means goodness of fit
## The total within-cluster sum of squares (WSS) measures the compactness of
## the clusters. Lower values indicate better fit.
## The more clusters derived, the lower the sum of squares
## Trade-off between fit and model simplicity?

# Stack the fit summary of every k-means model (k = 4..12) into one table and
# draw a lollipop chart of tot.withinss per model.
# NOTE(review): link_model_output() is defined elsewhere; it is assumed to
# return a table with an `id` column (the label passed as the second argument)
# and a `tot.withinss` column - confirm against its definition.
link_model_output(kmeans_result_4, "clusters_4") |>
  union_all(link_model_output(kmeans_result_5, "clusters_5")) |>
  union_all(link_model_output(kmeans_result_6, "clusters_6")) |>
  union_all(link_model_output(kmeans_result_7, "clusters_7")) |>
  union_all(link_model_output(kmeans_result_8, "clusters_8")) |>
  union_all(link_model_output(kmeans_result_9, "clusters_9")) |>
  union_all(link_model_output(kmeans_result_10, "clusters_10")) |>
  union_all(link_model_output(kmeans_result_11, "clusters_11")) |>
  union_all(link_model_output(kmeans_result_12, "clusters_12")) |>
  # rn records the stacking order so the x axis can follow it; a plain
  # character axis would sort "clusters_10" ahead of "clusters_4".
  mutate(rn = row_number()) |>
  ggplot(aes(x = reorder(id, rn), y = tot.withinss)) +
  # A very thin column plus a large point gives the lollipop look.
  geom_col(width = 0.01) +
  geom_point(size = 5) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  labs(
    x = "n_clusters",
    y = "Total within-cluster sum of squares (WSS)",
    title = "Comparing sum of squares Goodness of Fit measure",
    subtitle = "K-means cluster models"
  )
# Perform Principal Component Analysis (PCA)
# scale. = TRUE rescales every variable to unit variance before the rotation,
# so variables measured on larger scales do not dominate the components.
pca_result <- prcomp(key_measures_rep_sub_sample, scale. = TRUE)

# Create a dataframe with the principal components and add the cluster
# assignments of the 6-cluster k-means model as a factor column.
# NOTE(review): assumes kmeans_result_6 was fitted on the same rows, in the
# same order, as key_measures_rep_sub_sample - confirm.
pca_data <- as.data.frame(pca_result$x) |>
  mutate(cluster = as.factor(kmeans_result_6$cluster))

# Create pairwise scatter plot matrix, coloured by cluster
ggpairs_plot <- ggpairs(pca_data, aes(color = cluster, alpha = 0.5))

# Print the plot matrix
ggpairs_plot
3D plot of the top 3 principal components
# Create 3D plot
# Interactive scatter of the first three principal components, one colour per
# cluster. pca_data_3d is built elsewhere and must provide the columns PC1,
# PC2, PC3 and cluster referenced by the formulas below.
plot_ly(
  pca_data_3d,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~cluster
  # Optional fixed palette (disabled):
  # colors = c('#1f77b4', '#ff7f0e', '#2ca02c')
) %>%
  add_markers(size = 1) %>%
  layout(
    scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    ),
    title = "3D Plot of Principal Components"
  )
# Visualise DBscan clusters
library(reshape2)
library(ggplot2)

# Calculate the average variable scores for each cluster
# NOTE(review): `db` is fitted elsewhere; if it comes from dbscan::dbscan(),
# cluster 0 holds the noise points rather than a real cluster - confirm.
db_average_scores <- aggregate(
  key_measures_rep_sub_sample,
  by = list(cluster = db$cluster),
  FUN = mean
)

# Melt the data for ggplot2: one row per (cluster, variable) pair
db_melted_scores <- reshape2::melt(db_average_scores, id.vars = "cluster")

# Layers shared by both bar plots; only the facet scales differ below.
# The legend is hidden because the fill colour repeats the x axis.
db_clusters_avg_plot_base <- db_melted_scores |>
  ggplot(aes(x = variable, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Average Variable Scores by Cluster",
    x = "Variable",
    y = "Average Score",
    fill = "Cluster"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    strip.background = element_rect(fill = NA, colour = "grey"),
    legend.position = "none"
  )

# Common y axis across facets: bar heights are comparable between clusters
db_clusters_avg_plot_fixed <- db_clusters_avg_plot_base +
  facet_wrap(~factor(cluster))

# Independent y axis per facet: shows each cluster's own profile
db_clusters_avg_plot_free <- db_clusters_avg_plot_base +
  facet_wrap(~factor(cluster), scales = "free_y")
Silhouette Score: lpa_clusters_5 has the highest score, indicating well-defined clusters.
Davies-Bouldin (DB) Index: dbscan_clusters has the lowest index, indicating well-separated clusters.
WCSS: kmeans_clusters_12 has the lowest WCSS, indicating compact clusters.
In the absence of a well-performing model across multiple metrics, we can normalise each metric and create a composite measure. In doing so, we sum the normalised scores across the 3 measures of model fit and divide by 3 (i.e. take their average) to calculate the composite score.
An alternative approach is to sum the normalised scores and apply a weighting to each, but taking a simple average incorporates the assumption that each metric is equally weighted.
The above composite score implies that the k-means model with 12 clusters is the most suitable model; however, whether or not these clusters suit our understanding and intended use of the model is open for debate and adjustment.