# Exploratory Clustering for Emergency Department Patients - ICIMTH 2022
#
# Part 1: load the ED patient data, preprocess, scale, run k-means on all
# numeric variables, and assess per-feature importance stability with
# FeatureImpCluster over repeated random starts.

library(ggpubr)
library(factoextra)
library(dplyr)
library(flexclust)    # kcca() used below
library(data.table)   # as.data.table() used below

# Install FeatureImpCluster only if missing, instead of unconditionally
# re-installing on every run of the script.
# https://cran.r-project.org/web/packages/FeatureImpCluster/readme/README.html
if (!requireNamespace("FeatureImpCluster", quietly = TRUE)) {
  install.packages("FeatureImpCluster")
}
library(FeatureImpCluster)

# Interactive file selection; the CSV is expected to contain numeric
# lab/demographic variables plus Dept, SEX and Admission columns.
data <- read.csv(file.choose())

# Preview file
summary(data)

# Identify working directory
getwd()

# Remove the first three columns (record identifiers, not features)
data <- subset(data, select = -c(1:3))

# Convert categorical columns to factors
data$Dept <- factor(data$Dept)
data$SEX <- factor(data$SEX)
data$Admission <- factor(data$Admission)
summary(data)

data1 <- data
summary(data1)

# Remove the factors SEX, Dept, Admission (columns 15:17) so only numeric
# variables enter the clustering
data.noclass <- data1[-c(15:17)]
summary(data.noclass)

# Scale all numeric variables (z-scores) so no single variable dominates the
# Euclidean distances used by k-means
data.noclass1 <- data.noclass %>%
  mutate(across(where(is.numeric), scale))
summary(data.noclass1)

# k-means including all the variables; fixed seed for reproducibility
set.seed(100)
ED.cl <- kmeans(data.noclass1, centers = 2)
dist(ED.cl$centers)
data1$cluster <- fitted(ED.cl, method = "classes")
# Cross-tabulate cluster membership against the Admission outcome
table(data1$Admission, data1$cluster)

# Single-run permutation feature importance for the clustering
set.seed(10)
res <- kcca(data.noclass1, k = 2)
set.seed(10)
FeatureImp_res <- FeatureImpCluster(res, as.data.table(data.noclass1))
plot(FeatureImp_res)
barplot(res)

# Repeat the clustering and feature importance calculation 20 times to
# assess stability of the importance ranking across random starts
set.seed(12)
nr_seeds <- 20
seeds_vec <- sample(1:1000, nr_seeds)

# Preallocate one row per seed, one column per feature
savedImp <- data.frame(matrix(0, nr_seeds, dim(data.noclass1)[2]))
count <- 1
for (s in seeds_vec) {
  set.seed(s)
  res <- kcca(data.noclass1, k = 2)
  set.seed(s)
  FeatureImp_res <- FeatureImpCluster(res, as.data.table(data.noclass1),
                                      sub = 1, biter = 1)
  # Store importances in a fixed (alphabetical) column order so the rows
  # of savedImp are aligned across seeds
  savedImp[count, ] <-
    FeatureImp_res$featureImp[sort(names(FeatureImp_res$featureImp))]
  count <- count + 1
}
names(savedImp) <- sort(names(FeatureImp_res$featureImp))
# The distribution of importances across seeds shows which features
# consistently drive the clustering; we can then cluster on those alone.
# Part 2: restrict to the important features, re-run k-means, and visualize
# the clusters directly and in PCA space.
#
# Restricting to fewer features matters in practice: cluster centroids with
# a lower number of features are easier to interpret, and we save time /
# money collecting and pre-processing unnecessary features.
# (NOTE: this sentence was previously un-commented raw text, which made the
# script fail to parse.)
boxplot(savedImp)

# After the previous analysis we conclude that we only need to keep the
# variables Age, CRP, HGB, NEUT., LYM., UREA
data.noclass2 <- data.noclass1[c("Age", "CRP", "HGB", "NEUT.", "LYM.", "UREA")]

# We apply k-means in the reduced dataset
set.seed(100)
ED.cl <- kmeans(data.noclass2, centers = 2)
dist(ED.cl$centers)
data1$cluster <- fitted(ED.cl, method = "classes")
table(data1$Admission, data1$cluster)
## --- We observe no improvement in the clustering

# Visualize k-means clusters on the reduced feature set that ED.cl was
# actually fitted on (the original passed the full data.noclass1, which
# mismatches the 6-variable model)
fviz_cluster(ED.cl, data = data.noclass2,
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             geom = "point",
             ellipse.type = "convex",
             ggtheme = theme_bw()
)

# Compute principal component analysis (PCA) for dimension reduction
res.pca <- prcomp(data.noclass1, scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(data1$cluster)
# Add the Admission outcome from the original data set
ind.coord$Admission <- data1$Admission
# Data inspection
head(ind.coord$Admission)

# Percentage of variance explained by dimensions
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
head(eigenvalue)

# Visualize k-means clusters in PCA space, with point shape encoding the
# Admission outcome so cluster/outcome agreement is visible at a glance
ggscatter(
  ind.coord,
  x = "Dim.1", y = "Dim.2",
  color = "cluster", palette = "npg",
  ellipse = TRUE, ellipse.type = "convex",
  shape = "Admission", size = 1.5,
  legend = "right", ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )"),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )")
) +
  stat_mean(aes(color = cluster), size = 4)