Mean SD Median Any sequences Over 0.1% Over 1% Over 10%
Staphylococcus 39.46 33.15 32.00 41 40 37 30
Alloiococcus 19.54 31.27 0.06 41 19 17 14
Turicella 18.25 24.98 4.16 40 24 21 17
Propionibacterium 15.54 21.75 4.36 41 38 36 15
Corynebacterium 2.15 5.57 0.43 40 27 15 1
Streptococcus 0.87 2.37 0.07 33 17 5 1
Enhydrobacter 0.70 2.71 0.00 25 8 4 2
Kocuria 0.41 2.51 0.00 14 3 1 1
Chryseobacterium 0.40 1.66 0.00 17 5 3 0
Neisseria 0.18 0.61 0.00 23 7 1 0
Anaerococcus 0.18 0.82 0.01 23 6 1 0
Snodgrassella 0.15 0.75 0.00 12 3 2 0
Acinetobacter 0.09 0.30 0.01 30 5 2 0
Lactobacillus 0.09 0.31 0.01 22 6 1 0
Lactococcus 0.08 0.44 0.00 21 2 1 0
Haemophilus 0.08 0.20 0.00 21 11 1 0
Rothia 0.08 0.30 0.00 17 3 1 0
Veillonella 0.07 0.19 0.00 16 5 1 0
Stenotrophomonas 0.07 0.36 0.00 12 1 1 0
Actinomyces 0.06 0.19 0.00 22 4 1 0
Finegoldia 0.05 0.16 0.00 23 4 0 0
Micrococcus 0.05 0.16 0.01 22 2 0 0
Pseudomonas 0.05 0.12 0.00 17 5 0 0
Gemella 0.05 0.14 0.00 19 5 0 0
Brevibacterium 0.04 0.20 0.00 6 2 1 0
other taxa 0.78 1.46 0.18 40 25 9 0
unclassified taxa 0.54 0.95 0.20 39 27 6 0
# Write the genus summary table to a CSV file in Outputs/
write.csv(top25_gen_summaries, file = "Outputs/genus_prevalences.csv")
for(i in 1:matrixColSize) { for(j in 1:matrixColSize) {
resultsMatrix[i,j] = JSD(as.vector(inMatrix[,i]), as.vector(inMatrix[,j])) } }
colnames -> colnames(resultsMatrix) -> rownames(resultsMatrix) as.dist(resultsMatrix) -> resultsMatrix
attr(resultsMatrix, "method") <- "dist"
return(resultsMatrix) }
# Partition observations into k clusters around medoids (PAM).
#   x: a distance matrix or "dist" object (coerced with as.dist())
#   k: number of clusters
# Returns a plain (unnamed) vector with one cluster assignment per
# observation.
pam.clustering <- function(x, k) {
  as.vector(pam(as.dist(x), k, diss = TRUE)$clustering)
}
Estimate the optimal number of clusters by computing the clustering for different numbers of clusters and comparing their Calinski-Harabasz (CH) index values (higher CH == better clustering).
# Trim data to exclude rare genera
# (keep taxa whose value exceeds 1 in more than 5 samples)
ear_gen_final_trim <- filter_taxa(
  ear_gen_final,
  function(x) sum(x > 1) > 5,
  prune = TRUE
)

# Calculate distance matrix
# Column-wise proportions (margin = 2); presumably one column per sample.
ear_gen_relabunds <- prop.table(otu_table(ear_gen_final_trim), margin = 2)
dist_clust_gen <- dist.JSD(ear_gen_relabunds)

# Calculate results with different numbers of clusters
# Preallocated with NA: the index is not computed for k == 1, so that
# entry stays NA.
nclusters <- rep(NA_real_, 20)
for (k in 2:20) {
  data.cluster_temp <- pam.clustering(dist_clust_gen, k)
  nclusters[k] <- index.G1(
    t(ear_gen_relabunds),
    data.cluster_temp,
    d = dist_clust_gen,
    centrotypes = "medoids"
  )
}

# Plot (with a line at y == 65 to make comparing the highest values easier)
clust_df <- data.frame(clusters = factor(1:20), CH = nclusters)
ggplot(clust_df, aes(x = clusters, y = CH)) +
  geom_bar(stat = "identity", width = 0.1) +
  xlab("Number of clusters") +
  ylab("CH index") +
  theme_bw() +
  geom_hline(yintercept = 65, lty = 2) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )
0 20 40 60
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Number of clusters
CH index
The CH values for the best three are very similar. Another metric for clustering success is the mean silhouette value (values range from -1 to 1, where 1 is the best and -1 is the worst). Make a table showing both the CH and the mean silhouette values for the top five:
# Keep the five cluster counts with the highest CH index
clust_df_top <- clust_df[head(order(clust_df$CH, decreasing = TRUE), 5), ]

# Mean silhouette width of the PAM solution for each of those cluster counts.
# `clusters` is a factor, so convert through its labels rather than relying
# on the internal level codes matching the cluster counts.
clust_df_top$Mean_silhouette <- vapply(
  as.integer(as.character(clust_df_top$clusters)),
  function(k) {
    sil <- silhouette(pam(dist_clust_gen, k, diss = TRUE), dist_clust_gen)
    mean(sil[, "sil_width"])
  },
  numeric(1)
)
customKable(clust_df_top)
clusters CH Mean_silhouette
5 5 75.424 0.302
4 4 74.635 0.315
6 6 73.260 0.301
14 14 70.419 0.301
15 15 70.376 0.313
4, 5 and 6 clusters are very close; continue with all three.
# Vectors with the clustering results
# (PAM cluster assignments for the 4-, 5- and 6-cluster solutions)
genclust4 <- pam(dist_clust_gen, 4, diss = TRUE)$clustering
genclust5 <- pam(dist_clust_gen, 5, diss = TRUE)$clustering
genclust6 <- pam(dist_clust_gen, 6, diss = TRUE)$clustering
Earlier, samples were grouped according to the most common genus. How does this compare to the clustering?
# Combine each sample's cluster assignment (4-, 5- and 6-cluster solutions)
# with its most common genus. All vectors are looked up by sample name, in
# the level order of top_gen_samples_melt$variable.
clust_sample_ids <- levels(top_gen_samples_melt$variable)
clust_and_tax_gen <- data.frame(
  Clust4 = factor(genclust4[clust_sample_ids]),
  Clust5 = factor(genclust5[clust_sample_ids]),
  Clust6 = factor(genclust6[clust_sample_ids]),
  Max_gen = sample_max_gen[clust_sample_ids]
)
Four clusters:
# Cross-tabulate most common genus (rows) against 4-cluster assignment (columns)
customKable(
  table(clust_and_tax_gen[["Max_gen"]], clust_and_tax_gen[["Clust4"]])
)
1 2 3 4
Staphylococcus 1 10 2 4
Propionibacterium 0 0 7 0
Alloiococcus 9 0 0 0
Turicella 2 0 0 6
Five clusters:
# Cross-tabulate most common genus (rows) against 5-cluster assignment (columns)
customKable(
  table(clust_and_tax_gen[["Max_gen"]], clust_and_tax_gen[["Clust5"]])
)
1 2 3 4 5
Staphylococcus 10 0 1 0 6
Propionibacterium 0 0 7 0 0
Alloiococcus 0 9 0 0 0
Turicella 0 1 0 6 1
Six clusters:
# Cross-tabulate most common genus (rows) against 6-cluster assignment (columns)
customKable(
  table(clust_and_tax_gen[["Max_gen"]], clust_and_tax_gen[["Clust6"]])
)
1 2 3 4 5 6
Staphylococcus 1 9 1 0 6 0
Propionibacterium 0 0 7 0 0 0
Alloiococcus 6 0 0 3 0 0
Turicella 0 0 0 2 1 5
All three solutions have one Staphylococcus-dominated cluster and one Propionibacterium-dominated cluster.
The 4- and 5-cluster solutions also have one clearly Alloiococcus-dominated cluster, which is split in the 6-cluster solution. The 5- and 6-cluster solutions have one clearly Turicella-dominated cluster, which the 4-cluster solution doesn’t have. Since the 5-cluster solution had the highest CH index value, keep that solution.
To visualize what taxa are in each cluster, plot the relative abundances of genera in the samples, re-organized according to the clustering (right ear samples only):
# Taxon labels for legend
# The first ten genus levels are wrapped in plotmath italic(); the two
# catch-all categories stay upright. parse() turns the strings into
# expressions that ggplot2 can render as legend labels.
taxl_clust <- levels(top_gen_samples_melt$Genus)[1:10]
taxl_clust <- paste0("paste(italic('", taxl_clust, "'))")
taxl_clust <- parse(
  text = c(
    taxl_clust,
    paste0("paste('", c("other genera", "unclassified genus"), "')")
  )
)

# Build a stacked bar plot of all samples and hand it to g_legend()
# (defined elsewhere; presumably returns the plot's legend as a grob).
gen_legend_right <- g_legend(
  ggplot(
    top_gen_samples_melt,
    aes(x = variable, y = value, fill = Genus)
  ) +
    geom_bar(stat = "identity", position = "stack") +
    theme_bw(base_size = 10) +
    guides(fill = guide_legend(ncol = 2)) +
    scale_fill_manual(
      values = relacols3,
      name = "Genus",
      labels = taxl_clust
    ) +
    theme(
      legend.key.size = unit(4.5, "mm"),
      legend.position = "right",
      legend.text.align = 0
    )
)
# One stacked relative-abundance bar plot per cluster of the 5-cluster
# solution, each wrapped in arrangeGrob() so it can be laid out later.
cluster_relabund_plots_gen <- lapply(
  seq_len(length(unique(genclust5))),
  function(x) {
    # Samples assigned to cluster x; which() drops any NA assignments,
    # matching what subset() did.
    cluster_samples <- rownames(clust_and_tax_gen)[
      which(clust_and_tax_gen$Clust5 == x)
    ]
    cluster_data <- top_gen_samples_melt[
      top_gen_samples_melt$variable %in% cluster_samples,
    ]
    arrangeGrob(
      ggplot(cluster_data, aes(x = variable, y = value, fill = Genus)) +
        geom_bar(stat = "identity", position = "stack") +
        xlab(NULL) +
        ylab("Relative abundance (%)") +
        scale_fill_manual(values = relacols3) +
        theme_bw(base_size = 10) +
        ggtitle(paste("Cluster", x)) +
        theme(
          axis.text.x = element_text(color = "black", angle = 45, hjust = 1),
          panel.grid = element_blank(),
          legend.position = "none"
        )
    )
  }
)

# Append the shared legend as the last panel. The index is derived from the
# list length instead of a hard-coded 6, so it stays correct if the number
# of clusters changes.
cluster_relabund_plots_gen[[length(cluster_relabund_plots_gen) + 1]] <-
  gen_legend_right
# Draw plots
# Lay out the per-cluster panels (plus the legend panel) in two columns.
ototype_layout_args <- c(cluster_relabund_plots_gen, list(ncol = 2))
ototype_plot_gen <- do.call("arrangeGrob", ototype_layout_args)
Figure 1
# Draw the assembled grid of cluster plots and legend
grid.arrange(ototype_plot_gen)
0 25 50 75 100
Sample_5Sample_9Sample_15Sample_21Sample_33Sample_45Sample_71Sample_91Sample_95Sample_97
Relative abundance (%)
Cluster 1
0 25 50 75 100
Sample_17Sample_25Sample_29Sample_43Sample_47Sample_57Sample_59Sample_63Sample_79Sample_11
Relative abundance (%)
Cluster 2
0 25 50 75 100
Sample_83Sample_13Sample_23Sample_37Sample_41Sample_53Sample_65Sample_99
Relative abundance (%)
Cluster 3
0 25 50 75 100
Sample_35Sample_51Sample_55Sample_67Sample_73Sample_75
Relative abundance (%)
Cluster 4
0 25 50 75 100
Sample_31Sample_49Sample_61Sample_69Sample_81Sample_87Sample_89
Relative abundance (%)
Cluster 5
Genus
Staphylococcus Alloiococcus Turicella
Propionibacterium Corynebacterium Streptococcus
Enhydrobacter Kocuria
Chryseobacterium Snodgrassella other genera unclassified genus
# Export
# The assembled gtable is passed directly as `plot`. Wrapping it in
# grid.arrange() would redraw the figure on the active graphics device as a
# side effect before saving.
ggsave(
  filename = "Outputs/fig1_clustering_and_genus_relabunds.pdf",
  plot = ototype_plot_gen,
  device = cairo_pdf,
  width = 180,
  height = 180,
  units = "mm"
)
This matches what was seen in the table of dominant taxa per cluster: Cluster 1 has mainly Staphylococcus, Cluster 2 is Alloiococcus-dominated, Cluster 3 is Propionibacterium-dominated, Cluster 4 is Turicella-dominated, and Cluster 5 has both Staphylococcus and Turicella.