R Code for Sjövall, A. et al.:
Microbiome of the external auditory canal
Velma T. E. Aho 03 January 2020
Contents
Setting up data and tools 2
Basic statistics for sequence data 7
Technical controls and contaminants . . . 8
Subject demographics 14 Table 1 . . . 16
Final microbiota data setup 17 Batch variation 17 Alpha diversity . . . 17
Figure S1A . . . 18
Beta diversity . . . 18
Figure S1B . . . 19
Right and left ears 20 Alpha diversity . . . 20
Beta diversity . . . 22
Figure S2 . . . 23
Handedness 24 Alpha diversity . . . 24
Beta diversity . . . 24
Trim to right ears 26 Most common bacterial taxa 26 ASVs . . . 26
Genera . . . 29
Figure S3 (ASVs + genera depending on classification) . . . 30
Taxon prevalences . . . 34
Table S1 . . . 34
Clustering by microbial community type 38 Figure 1 . . . 42
Microbiota and clinical variables 43 Alpha diversity . . . 44
Beta diversity . . . 46
Table 2 . . . 50
Figure 2 . . . 50
Clustering and clinical variables 51
Differential abundance 53 Figure S4 . . . 57
Session info 57
Setting up data and tools
Load required packages:
library("kableExtra") library("phyloseq") library("ggplot2") library("gridExtra") library("reshape2") library("decontam") library("vegan") library("tools") library("dplyr") library("cluster") library("clusterSim") library("ade4")
library("adegraphics") library("DESeq2")
Set up functions and variables for use throughout the analysis:
# Table styling helper: renders an object as a striped, booktabs-style LaTeX
# table via kable() and kable_styling().
#   df  - object to tabulate (passed to kable())
#   d   - number of digits (kable's `digits`)
#   cn  - column names (kable's `col.names`; NA is passed through unchanged)
#   pos - horizontal position (kable_styling's `position`)
customKable <- function(df, d = 3, cn = NA, pos = "center"){
  latex_table <- kable(
    df,
    digits = d,
    col.names = cn,
    format = "latex",
    booktabs = TRUE,
    linesep = "",
    # Thousands separator for large numbers such as read counts
    format.args = list(big.mark = ",")
  )
  kable_styling(latex_table, latex_options = "striped", position = pos)
}
# Blank panel for grid graphics: an undrawn rectangle with a white border,
# used as an empty placeholder cell when arranging plots.
# grid must be attached explicitly for grid.rect() and gpar().
library("grid")
blankPanel <- grid.rect(gp = gpar(col = "white"), draw = FALSE)
# Legend-grabbing function: builds the plot and returns the grob whose name
# is "guide-box".
#   a.gplot - a ggplot object
# Stops with an explicit message when the built plot contains no such grob.
g_legend <- function(a.gplot){
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  grob_names <- vapply(tmp$grobs, function(x) x$name, character(1))
  leg <- which(grob_names == "guide-box")
  if (length(leg) == 0) {
    stop("No 'guide-box' grob (legend) found in the plot", call. = FALSE)
  }
  # Take the first match so that `[[` always receives a single index
  tmp$grobs[[leg[1]]]
}
# Alpha diversity plotting function for three indices.
# Draws three boxplot panels in one row (Observed, Shannon, InvSimpson),
# each grouped by the same variable.
#   df    - data frame with columns Observed, Shannon, InvSimpson and `var`
#   var   - name (string) of the grouping column shown on the x axis
#   title - title of the first panel; the other two panels get an empty title
# Returns the value of grid.arrange().
plotAlphaDivs <- function(df, var, title = ""){
  # One boxplot panel for a single diversity index. Columns are looked up by
  # name through the .data pronoun instead of embedding df[[var]] in aes().
  alphaPanel <- function(index, panelTitle){
    ggplot(df, aes(x = .data[[var]], y = .data[[index]])) +
      geom_boxplot() +
      xlab(var) +
      ylab(index) +
      theme_bw() +
      ggtitle(panelTitle) +
      theme(panel.grid = element_blank(),
            axis.text = element_text(color = "black"))
  }
  grid.arrange(
    alphaPanel("Observed", title),
    alphaPanel("Shannon", ""),
    alphaPanel("InvSimpson", ""),
    nrow = 1)
}
# Pre-selected (hand-picked) color variables for relative abundance bar charts.
# Each vector is a palette of a different length (11, 21, 12 and 9 colors).
relacols1 <- c("#01665E", "#C51B7D", "#F1B6DA", "#35978F", "#DFC27D", "#BF812D",
               "darkslateblue", "thistle", "#C7EAE5", "gray75", "gray25")
relacols2 <- c("#01665E", "#DFC27D", "slategray3", "plum4", "thistle", "#35978F",
               "plum3", "thistle1", "lightsteelblue2", "skyblue3",
               "lightsalmon4", "lightseagreen", "darkseagreen2", "peachpuff1",
               "mediumaquamarine", "seagreen4", "darkseagreen4", "deepskyblue4",
               "mistyrose2", "lightpink2", "gray50")
relacols3 <- c("#01665E", "lightsteelblue2", "plum4", "#DFC27D", "salmon2",
               "lightsalmon4", "peachpuff1", "palegreen2", "mediumpurple",
               "pink", "gray50", "black")
relacols4 <- c("#01665E", "#DFC27D", "slategray3", "plum4", "#35978F",
               "thistle", "lightsteelblue2", "gray50", "black")
# Function for relative abundance tables and bar charts.
# Keeps the `taxaCount` taxa with the largest total counts (plus a taxon named
# "unclassified"), merges all other taxa with merge_taxa(), and converts the
# counts to per-sample percentages.
#   phyloObj   - phyloseq object with otu_table, tax_table and sample_data
#   taxaCount  - number of top taxa kept as separate groups
#   byVariable - "sample" for one bar per sample, otherwise the name of a
#                sample_data column; percentages are then averaged per level
#   cols       - fill colors for the bar chart (only used when plotting)
#   table      - if TRUE, return the abundance table (taxa as rows) instead
#                of the ggplot bar chart
relAbundChart <- function(phyloObj, taxaCount = 10,
                          byVariable = "sample", cols, table = FALSE){

  # Percentage table (samples as rows) ordered by mean abundance. Rows with NA
  # Phylum are relabeled "other taxa" (presumably the group created by
  # merge_taxa() -- confirm) and the "unclassified" row "unclassified taxa";
  # both are moved to the end.
  makeAbundTable <- function(phyloO){
    abund_table <- as.data.frame(prop.table(otu_table(phyloO), 2) * 100)
    otherpos <- which(is.na(tax_table(phyloO)[, "Phylum"]))
    ucpos <- grep("^unclassified$", rownames(abund_table))
    if (length(otherpos) == 0 && length(ucpos) == 0) {
      ordered <- abund_table[order(rowMeans(abund_table), decreasing = TRUE), ]
      return(as.data.frame(t(ordered)))
    }
    final_table <- abund_table[-c(ucpos, otherpos), ]
    final_table <- final_table[order(rowMeans(final_table), decreasing = TRUE), ]
    if (length(otherpos) != 0) {
      other_taxa <- abund_table[otherpos, ]
      rownames(other_taxa) <- "other taxa"
      final_table <- rbind(final_table, other_taxa)
    }
    if (length(ucpos) != 0) {
      ucTaxa <- abund_table[ucpos, ]
      rownames(ucTaxa) <- "unclassified taxa"
      final_table <- rbind(final_table, ucTaxa)
    }
    as.data.frame(t(final_table))
  }

  # Mean abundance of every taxon within each level of `groups`. Only the
  # abundance columns are aggregated, so no warnings need to be suppressed.
  aggregateAbunds <- function(abunds, groups){
    agg_table <- aggregate(abunds, by = list(groups), FUN = mean)
    rownames(agg_table) <- agg_table[, 1]
    agg_table[, -1, drop = FALSE]
  }

  # Long format for ggplot: one row per (taxon, sample-or-group) pair
  gg2table <- function(abunds){
    tableGG <- as.data.frame(t(abunds))
    tableGG$Type <- rownames(tableGG)
    melt(tableGG, id.vars = "Type")
  }

  byVar <- sample_data(phyloObj)[[byVariable]]

  # The helpers above expect taxa as rows
  if (!taxa_are_rows(phyloObj)) {
    phyloObj <- phyloseq(tax_table(phyloObj),
                         sample_data(phyloObj),
                         otu_table(t(otu_table(phyloObj))), taxa_are_rows = TRUE)
  }

  # The legend is titled after the last taxonomic rank in the tax_table
  legend_title <- colnames(tax_table(phyloObj))[ncol(tax_table(phyloObj))]

  # head() instead of [1:taxaCount] avoids NA names when the object has fewer
  # than taxaCount taxa
  ranked_taxa <- names(sort(taxa_sums(phyloObj) / sum(sample_sums(phyloObj)),
                            decreasing = TRUE))
  topTaxa <- c(head(ranked_taxa, taxaCount), "unclassified")
  all_taxa <- rownames(tax_table(phyloObj))
  phylo_merged <- merge_taxa(phyloObj, all_taxa[!(all_taxa %in% topTaxa)])
  phylo_abunds <- makeAbundTable(phylo_merged)

  if (byVariable != "sample") {
    phylo_abunds <- aggregateAbunds(phylo_abunds, byVar)
  }

  if (table == TRUE) {
    return(as.data.frame(t(as.matrix(phylo_abunds))))
  }

  plot_df <- gg2table(phylo_abunds)
  # Keep the table order (most abundant first) in the stacking and the legend
  plot_df$Type <- factor(plot_df$Type, levels = unique(plot_df$Type))
  y_label <- if (byVariable == "sample") {
    "Relative abundance (%)"
  } else {
    "Mean relative abundance (%)"
  }
  ggplot(plot_df, aes(variable, value, fill = Type)) +
    geom_bar(stat = "identity", position = "stack") +
    theme_bw(base_size = 10) +
    xlab(NULL) +
    ylab(y_label) +
    scale_fill_manual(values = cols, name = legend_title,
                      limits = levels(plot_df$Type))
}
# Function for merging data to different taxonomic levels in phyloseq objects.
# Sums the counts of all taxa that share the same value at `level` and returns
# a new phyloseq object with one taxon per value of that rank.
#   phylo_obj - phyloseq object with otu_table, tax_table and sample_data
#   level     - name of a tax_table column (e.g. "Genus"); must match exactly
# For ranks below the first, values ending in "_unclassified" are pooled into
# one "unclassified" taxon.
collapseTaxLevel <- function(phylo_obj, level){

  # Count matrix summed over the values of `level` (rows sorted by name)
  collapseOtuTable <- function(phylo_obj, level){
    tax <- as.data.frame(as(tax_table(phylo_obj), "matrix"))
    otu <- as.data.frame(as(otu_table(phylo_obj), "matrix"))
    otu[, level] <- tax[, level]
    otu_long <- melt(otu, id = (level))
    otu_collapsed <- acast(otu_long,
                           as.formula(paste(level, "~variable", sep = "")), sum)
    otu_collapsed[order(rownames(otu_collapsed)), , drop = FALSE]
  }

  if (!taxa_are_rows(phylo_obj)) {
    phylo_obj <- phyloseq(tax_table(phylo_obj), sample_data(phylo_obj),
                          otu_table(t(otu_table(phylo_obj))), taxa_are_rows = TRUE)
  }

  # Exact column lookup: a regex search could match several rank names
  levelNum <- match(level, colnames(tax_table(phylo_obj)))
  if (is.na(levelNum)) {
    stop("Taxonomic level '", level, "' not found in tax_table", call. = FALSE)
  }

  if (levelNum == 1) {
    otu_collapsed <- collapseOtuTable(phylo_obj, level)
    tax_collapsed <- matrix(rownames(otu_collapsed), ncol = 1,
                            dimnames = list(rownames(otu_collapsed), level))
  } else {
    tax <- as(tax_table(phylo_obj), "matrix")
    tax <- apply(tax, 2, function(x) gsub(".*_unclassified", "unclassified", x))
    phylo_obj <- phyloseq(otu_table(phylo_obj), sample_data(phylo_obj), tax_table(tax))
    otu_collapsed <- collapseOtuTable(phylo_obj, level)
    tax <- as(tax_table(phylo_obj), "matrix")
    tax_collapsed <- unique(tax[, 1:levelNum])
    # Logical indexing: with x[-grep(...), ] an empty match would drop every
    # row instead of none
    is_unclassified <- grepl("unclassified", tax_collapsed[, levelNum])
    tax_collapsed <- tax_collapsed[!is_unclassified, , drop = FALSE]
    # Single pooled row for everything unclassified at this level
    tax_collapsed <- rbind(tax_collapsed, rep("unclassified", levelNum))
    rownames(tax_collapsed) <- tax_collapsed[, level]
    tax_collapsed <- tax_collapsed[order(rownames(tax_collapsed)), , drop = FALSE]
  }

  phyloseq(otu_table(otu_collapsed, taxa_are_rows = TRUE),
           tax_table(tax_collapsed),
           sample_data(phylo_obj))
}
Import the 16S rRNA gene amplicon data (from DADA2) to make a phyloseq object:
# Read counts
ear_OTUs <- readRDS("ears_seqtab_final_v2.rds")
# Taxonomy
ear_tax <- readRDS("ears_tax_final_v2.rds")
# Both tables are renamed by position below, so they must describe the same
# number of ASVs (assumes they are also in the same order -- TODO confirm)
stopifnot(ncol(ear_OTUs) == nrow(ear_tax))
colnames(ear_OTUs) <- paste0("ASV", seq_len(ncol(ear_OTUs)))
rownames(ear_tax) <- paste0("ASV", seq_len(nrow(ear_tax)))
# Fix empty taxonomy cells to "unclassified"
ear_tax[is.na(ear_tax)] <- "unclassified"
# Make the Species label unique per ASV: "<ASV id>: <Genus> <Species>"
ear_tax[, "Species"] <- paste0(rownames(ear_tax), ": ", ear_tax[, "Genus"], " ",
                               ear_tax[, "Species"])
ear_tax[, "Species"] <- gsub("unclassified unclassified", "unclassified",
                             ear_tax[, "Species"])
# Simple metadata for preliminary data exploration
ear_meta <- read.csv("basic_meta.csv", header = TRUE)
rownames(ear_meta) <- ear_meta$Sample
# Negative control indicator: TRUE for everything that is not an ear sample
ear_meta$Neg <- ear_meta$Type != "sample"
# Make phyloseq object
earphy <- phyloseq(otu_table(ear_OTUs, taxa_are_rows = FALSE),
                   tax_table(ear_tax),
                   sample_data(ear_meta))
Basic statistics for sequence data
There are a total of 10,266,925 sequence reads and a total of 3,767 unique Amplicon Sequence Variants (ASVs) in the data. ASVs represent microbes that have an identical V3-V4 16S rRNA sequence according to the analysis run with dada2, version 1.7.7.
The number of samples in the data is 109. Out of these samples, 9 are technical controls (2 sampling blanks, 3 DNA extraction blanks and 4 PCR control blanks), while 100 are actual ear samples.
Basic statistics for the distribution of sequence reads in the samples:
customKable(as.matrix(summary(sample_sums(earphy))), d = 0, cn = "Sequence reads") Sequence reads
Min. 32
1st Qu. 71,724
Median 96,291
Mean 94,192
3rd Qu. 124,324
Max. 169,833
Histogram of the number of reads per sample:
# Histogram of the total read count per sample (all samples in earphy)
ggplot(data.frame(ReadSum = sample_sums(earphy)), aes(x = ReadSum)) +
  geom_histogram(color = "black", fill = "gray60") +
  scale_y_continuous(breaks = seq(0, 10, 2)) +
  labs(x = "Sequence reads", y = "Number of samples") +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank())
0 2 4 6 8 10
0 50000 100000 150000
Sequence reads
Number of samples
All samples with < 10000 reads are technical control samples, as can be seen from a table of 10 samples with the least reads:
customKable(sort(sample_sums(earphy))[1:10], cn = "Read count")
Read count
Sample_BIPCR0ctrl2_1 32
Sample_BIPCR0ctrl2_2 43
Sample_kitctrl2_1 67
Sample_86blank 140
Sample_kitctrl2_2 329
Sample_85blank 530
Sample_kitctrl 714
Sample_BIPCR0ctrl1 2,431
Sample_BIPCR0ctrl2 13,283
Sample_39 16,607
Technical controls and contaminants
Use the R package “decontam” to estimate the amounts of contaminants in samples, based on both presence in the negative controls, and the relationship of taxon abundance with DNA concentration in samples.
# Estimate contaminants with the "combined" method, per batch, using the
# "Conc" (DNA concentration) and "Neg" (negative control) sample variables
earphy_conts <- isContaminant(earphy, conc = "Conc", neg = "Neg",
                              method = "combined", batch = "Batch",
                              threshold = 0.1)
# Append the taxonomy columns to the per-ASV contaminant results
earphy_conts <- cbind(earphy_conts, as(tax_table(earphy), "matrix"))
Total number of suspected contaminant ASVs (of 3,767):
nrow(subset(earphy_conts, contaminant == TRUE))
## [1] 56
Details for the 20 most common ones:
kable_styling(kable(
head(subset(earphy_conts, contaminant == TRUE), 20)[,c(5, 9:10, 12:13)], digits = 4, format = "latex", booktabs = TRUE, linesep = "",
col.names = c("p-value", "Class", "Order", "Genus", "Species")), latex_options = "striped", font_size = 8)
p-value Class Order Genus Species
ASV20 0.0268 Betaproteobacteria Burkholderiales Burkholderia ASV20: Burkholderia caledonica ASV26 0.0365 Gammaproteobacteria Pseudomonadales Pseudomonas ASV26: Pseudomonas unclassified ASV30 0.0106 Gammaproteobacteria Oceanospirillales Halomonas ASV30: Halomonas phoceae ASV32 0.0233 Betaproteobacteria Burkholderiales Burkholderia ASV32: Burkholderia unclassified ASV33 0.0133 Gammaproteobacteria Alteromonadales Shewanella ASV33: Shewanella unclassified ASV54 0.0078 Betaproteobacteria Burkholderiales Ralstonia ASV54: Ralstonia unclassified ASV58 0.0034 Gammaproteobacteria Oceanospirillales Halomonas ASV58: Halomonas unclassified ASV70 0.0064 Gammaproteobacteria Xanthomonadales Stenotrophomonas ASV70: Stenotrophomonas maltophilia ASV76 0.0064 Betaproteobacteria Burkholderiales Ralstonia ASV76: Ralstonia insidiosa
ASV80 0.0031 Gammaproteobacteria Oceanospirillales Halomonas ASV80: Halomonas unclassified ASV83 0.0065 Betaproteobacteria Burkholderiales Burkholderia ASV83: Burkholderia unclassified ASV86 0.0113 Betaproteobacteria Burkholderiales Ralstonia ASV86: Ralstonia solanacearum ASV94 0.0581 Betaproteobacteria Burkholderiales Burkholderia ASV94: Burkholderia unclassified ASV99 0.0018 Betaproteobacteria Burkholderiales Burkholderia ASV99: Burkholderia unclassified ASV106 0.0037 Gammaproteobacteria Alteromonadales Shewanella ASV106: Shewanella unclassified ASV112 0.0030 Gammaproteobacteria Oceanospirillales Halomonas ASV112: Halomonas unclassified ASV113 0.0117 Gammaproteobacteria Oceanospirillales Halomonas ASV113: Halomonas unclassified ASV117 0.0195 Gammaproteobacteria Oceanospirillales Halomonas ASV117: Halomonas unclassified ASV122 0.0050 Gammaproteobacteria Oceanospirillales Halomonas ASV122: Halomonas unclassified ASV144 0.0831 Acidobacteria_Gp2 Gp2 unclassified ASV144: unclassified
Example plots for taxa labeled as contaminants:
# Make a combined sample type/batch variable for coloring
sample_data(earphy)$BatchType <- factor(
  paste(sample_data(earphy)$Batch, sample_data(earphy)$Type))
# Frequency plots for first 9 contaminants.
# theme_bw() is applied once, before theme(): adding a complete theme again at
# the end would reset legend.position to its default.
plot_frequency(earphy,
               rownames(subset(earphy_conts, contaminant == TRUE))[1:9],
               conc = "Conc") +
  theme_bw() +
  theme(legend.position = "bottom") +
  geom_point(aes(color = BatchType)) +
  scale_color_manual(values = c("dodgerblue", "orchid1", "gray40",
                                "blue", "maroon3", "black",
                                "seagreen1")) +
  xlab("DNA Concentration") +
  ylab("log(frequency)")
ASV58 ASV70 ASV76
ASV32 ASV33 ASV54
ASV20 ASV26 ASV30
3 10 30 3 10 30 3 10 30
1e−04 1e−02 1e+00
1e−04 1e−02 1e+00
1e−04 1e−02 1e+00
DNA Concentration
log(frequency)
BatchType
Batch1 extraction_blank Batch1 PCR_blank Batch1 sample
Batch2 extraction_blank Batch2 PCR_blank Batch2 sample Batch2 sampling_blank
As a comparison, plot first 9 non-contaminants:
# Frequency plots for the first 9 ASVs not flagged as contaminants.
# theme_bw() is applied once, before theme(): adding a complete theme again at
# the end would reset legend.position to its default.
plot_frequency(earphy,
               rownames(subset(earphy_conts, contaminant == FALSE))[1:9],
               conc = "Conc") +
  theme_bw() +
  theme(legend.position = "bottom") +
  geom_point(aes(color = BatchType)) +
  scale_color_manual(values = c("dodgerblue", "orchid1", "gray40",
                                "blue", "maroon3", "black",
                                "seagreen1")) +
  xlab("DNA Concentration") +
  ylab("log(frequency)")
ASV7 ASV8 ASV9
ASV4 ASV5 ASV6
ASV1 ASV2 ASV3
3 10 30 3 10 30 3 10 30
1e−04 1e−03 1e−02 1e−01 1e+00
1e−04 1e−03 1e−02 1e−01 1e+00
1e−04 1e−03 1e−02 1e−01 1e+00
DNA Concentration
log(frequency)
BatchType
Batch1 extraction_blank Batch1 PCR_blank Batch1 sample
Batch2 extraction_blank Batch2 PCR_blank Batch2 sample Batch2 sampling_blank
Make a plot summarizing read counts and contaminant status:
# Per-sample read totals split into contaminant (Cont) and non-contaminant
# (NotCont) ASVs, combined with the sample metadata
ear_cont_simpl <- data.frame(
  Cont = colSums(t(otu_table(earphy))[rownames(subset(
    earphy_conts, contaminant == TRUE)), ]),
  NotCont = colSums(t(otu_table(earphy))[rownames(subset(
    earphy_conts, contaminant == FALSE)), ]),
  sample_data(earphy))
# Relevel the sample variable numerically (ear samples first, then controls)
ear_cont_simpl$Sample <- factor(
  ear_cont_simpl$Sample,
  levels = c(paste("Sample", 1:102, sep = "_"),
             unique(as.character(subset(ear_cont_simpl, Neg == TRUE)$Sample))))
# Melt
ear_cont_simpl <- melt(ear_cont_simpl)

# Stacked bar chart of contaminant vs. non-contaminant reads per sample.
#   df              - subset of the melted ear_cont_simpl to plot
#   title           - panel title
#   base_size       - base font size passed to theme_bw()
#   legend.position - legend placement passed to theme()
contReadBars <- function(df, title, base_size = 11, legend.position = "right"){
  ggplot(df, aes(x = Sample, y = value, fill = variable)) +
    geom_bar(stat = "identity", position = "stack") +
    theme_bw(base_size = base_size) +
    ylab("Sequence reads") +
    xlab(NULL) +
    ggtitle(title) +
    scale_fill_manual(values = c("firebrick", "gray20"),
                      labels = c("yes", "no"),
                      name = "Contaminant taxon?") +
    theme(panel.grid = element_blank(),
          axis.text = element_text(color = "black"),
          axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = legend.position)
}

# Plot: one row per batch for the ear samples, then the blanks (with legend)
ear_cont_bars <- grid.arrange(
  contReadBars(subset(ear_cont_simpl, Neg == FALSE & Batch == "Batch1" &
                        variable != "Conc"),
               "Batch 1", base_size = 8, legend.position = "none"),
  contReadBars(subset(ear_cont_simpl, Neg == FALSE & Batch == "Batch2" &
                        variable != "Conc"),
               "Batch 2", base_size = 8, legend.position = "none"),
  arrangeGrob(
    contReadBars(subset(ear_cont_simpl, Neg == TRUE & variable != "Conc"),
                 "Blanks") +
      facet_grid("~Batch"),
    blankPanel, nrow = 1, widths = c(3, 1)),
  ncol = 1, heights = c(1, 1, 1.5))
0 50000 100000 150000
Sample_1Sample_2Sample_3Sample_4Sample_5Sample_6Sample_7Sample_8Sample_9Sample_10Sample_11Sample_12Sample_13Sample_14Sample_15Sample_16Sample_17Sample_18Sample_19Sample_20Sample_21Sample_22Sample_23Sample_24Sample_25Sample_26Sample_27Sample_28Sample_29Sample_30Sample_31Sample_32Sample_33Sample_34Sample_35Sample_36Sample_37Sample_38Sample_39Sample_40
Sequence reads
Batch 1
0 50000 100000 150000
Sample_41Sample_42Sample_43Sample_44Sample_45Sample_46Sample_47Sample_48Sample_49Sample_50Sample_51Sample_52Sample_53Sample_54Sample_55Sample_56Sample_57Sample_58Sample_59Sample_60Sample_61Sample_62Sample_63Sample_64Sample_65Sample_66Sample_67Sample_68Sample_69Sample_70Sample_71Sample_72Sample_73Sample_74Sample_75Sample_76Sample_77Sample_78Sample_79Sample_80Sample_81Sample_82Sample_83Sample_84Sample_87Sample_88Sample_89Sample_90Sample_91Sample_92Sample_93Sample_94Sample_95Sample_96Sample_97Sample_98Sample_99Sample_100Sample_101Sample_102
Sequence reads
Batch 2
Batch1 Batch2
Sample_kitctr l
Sample_BIPCR0ctr l1
Sample_BIPCR0ctr l2
Sample_85b lank
Sample_86b lank
Sample_kitctr l2_1
Sample_kitctr l2_2
Sample_BIPCR0ctr l2_1
Sample_BIPCR0ctr l2_2
Sample_kitctr l
Sample_BIPCR0ctr l1
Sample_BIPCR0ctr l2
Sample_85b lank
Sample_86b lank
Sample_kitctr l2_1
Sample_kitctr l2_2
Sample_BIPCR0ctr l2_1
Sample_BIPCR0ctr l2_2 0
5000 10000
Sequence reads
Contaminant taxon?
yes no
Blanks
Delete all the suspected contaminant ASVs and trim all technical control samples from the data before downstream analyses:
# Keep only the actual ear samples (drops every technical control)
earphy_trim <- subset_samples(earphy, Type == "sample")
# Keep only ASVs that were not flagged as contaminants and that are not
# classified as Chloroplasts/Cyanobacteria
earphy_trim <- subset_taxa(
  earphy_trim,
  Species %in% subset(earphy_conts, contaminant == FALSE)$Species &
    Phylum != "Cyanobacteria/Chloroplast")
After this trimming, the final sequence counts for samples were as follows (showing right and left ears separately):
# Read totals per sample after trimming, together with the sample metadata
ear_reads <- data.frame(ReadSum = sample_sums(earphy_trim),
                        sample_data(earphy_trim))
# Keep subjects in their order of appearance on the x axis
ear_reads$Subject <- factor(ear_reads$Subject,
                            levels = unique(ear_reads$Subject))
# Read counts per subject: one point per ear, joined by a line per subject
ggplot(ear_reads,
       aes(y = ReadSum, x = Subject,
           group = Subject, shape = Side, color = Side)) +
  geom_point(size = 2) +
  geom_line(color = "gray20") +
  scale_color_manual(values = c("gray60", "seagreen4")) +
  scale_shape_manual(values = c(15, 16)) +
  scale_y_continuous(breaks = seq(0, 175000, 25000)) +
  labs(x = NULL, y = "Sequence reads") +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))
0 25000 50000 75000 100000 125000 150000 175000
Subject_1Subject_3Subject_5Subject_7Subject_9Subject_11Subject_13Subject_15Subject_17Subject_19Subject_21Subject_23Subject_25Subject_27Subject_29Subject_31Subject_33Subject_35Subject_37Subject_39Subject_41Subject_43Subject_45Subject_47Subject_49Subject_51Subject_53Subject_55Subject_57Subject_59Subject_61Subject_63Subject_65Subject_67Subject_69Subject_71Subject_73Subject_75Subject_77Subject_79Subject_81Subject_83Subject_87Subject_89Subject_91Subject_93Subject_95Subject_97Subject_99Subject_101
Sequence reads
Side
left right
Check the amount of contaminant reads (before trimming) in the samples from subjects with the least reads:
ggplot(subset(ear_cont_simpl,
Subject %in% subset(ear_reads, ReadSum < 50000)$Subject &
variable != "Conc"),
aes(x = Sample, y = value, fill = variable)) + geom_bar(stat = "identity", position = "stack") + theme_bw() +
ylab("Sequence reads") + xlab(NULL) +
facet_grid(~Subject, space = "free", scales = "free") + scale_fill_manual(values = c("firebrick", "gray20"),
labels = c("yes", "no"), name = "Contaminant taxon?") +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + theme(panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(color = "gray40"), axis.text = element_text(color = "black"),
axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size = 10), strip.background = element_rect(fill = "white"), legend.position = "none")
Subject_101 Subject_15 Subject_19 Subject_27 Subject_39 Subject_45 Subject_7
Sample_101Sample_102Sample_15Sample_16Sample_19Sample_20Sample_27Sample_28Sample_39Sample_40Sample_45Sample_46 Sample_7Sample_8 0
50000 100000
Sequence reads
Based on this, trim out Subjects 19, 27, 39 and 7 (based on low read count and many contaminants) as well as Subject 101 (because of low read count in right ear sample). Keep Subject 45 (who had no contaminants and a low read count only in the left ear) and Subject 15 (whose left-ear sample had a low read count and a high proportion of contaminants, but the right ear sample seemed better and the analyses were focused on that side).
low_read_subjects <- c("Subject_101", "Subject_19", "Subject_27",
"Subject_39", "Subject_7")
# Trim
earphy_trim <- subset_samples(earphy_trim, !(Subject %in% low_read_subjects))
Subject demographics
Import the demographic/clinical data for the subjects:
# Import metadata
earclin <- read.csv("earmeta_v4.5.csv", row.names = 1)
rownames(earclin) <- earclin$Sample
# Binary factor variables are interpreted as numeric on import; fix:
# a column counts as binary when its sorted unique values are exactly 0 and 1
is_binary <- vapply(
  earclin,
  function(x) identical(suppressWarnings(as.numeric(sort(unique(x)))), c(0, 1)),
  logical(1))
binaryvars <- colnames(earclin)[is_binary]
earclin[binaryvars] <- lapply(earclin[binaryvars], factor)
# Relevel the sampling season (spring samples were collected first):
earclin$Sampling_season <- factor(earclin$Sampling_season,
                                  levels = c("spring", "autumn"))
# Additional variable for whether the sample corresponds to the dominant hand.
# Compared as character so differing factor level sets cannot cause an error;
# explicit levels keep "no"/"yes" correct even if only one value occurs.
earclin$Dominant_side <- factor(
  as.character(earclin$Side) == as.character(earclin$Handedness),
  levels = c(FALSE, TRUE), labels = c("no", "yes"))
# Variable list for downstream comparisons
varlist <- c("Sampling_season", "Sex", "Age", "Allergy", "Dominant_side",
             "Ear_cleaning_swab", "Recent_common_cold", "Staff_student")
Check how many subjects have used antibiotics during the past month and exclude them:
# Number of abx users?
nrow(subset(earclin, Abx_past_month == 1))/2
## [1] 4
# Exclude these four:
earclin <- subset(earclin, Abx_past_month != 1 | is.na(Abx_past_month))
Exclude the subjects whose samples were trimmed from sequence data based on read count and/or contaminants:
# Number of low-read subjects:
length(low_read_subjects)
## [1] 5
# Exclude these:
earclin <- subset(earclin, !(Subject %in% low_read_subjects))
Create a table of the main variables of interest for the manuscript (split by sex, which seemed like an important source of microbial differences based on earlier analyses):
# Table with only right ears (to avoid duplication of values)
earclin_right <- subset(earclin, Side == "right")
# List of variables for the table
t1vars <- c(setdiff(varlist, c("Sex", "Age", "Staff_student")),
            "Earplugs", "Handedness", "Past_ear_infections")
# Create table and reorganize to get the desired output format:
# one Sex x value count table per variable, stacked on top of each other
table1_df <- lapply(t1vars, function(x) table(earclin_right$Sex,
                                              earclin_right[[x]],
                                              useNA = "always"))
table1_df <- do.call("rbind", table1_df)
table1_df <- as.data.frame(cbind(table1_df, row.names(table1_df)))
# Drop the rows that count subjects with missing Sex
table1_df <- subset(table1_df, !is.na(V4))
table1_df$Variable <- rep(t1vars, each = 2)
rownames(table1_df) <- NULL
# Value0/Value1 are the counts for the first/second level of each variable
colnames(table1_df) <- c("Value0", "Value1", "NA", "Sex", "Varname")
table1_df[, "Varname"] <- gsub("_", " ", table1_df[, "Varname"])
table1_melt <- suppressWarnings(melt(table1_df, id.vars = c("Sex", "Varname")))
table1_cast <- dcast(table1_melt, Varname ~ Sex + variable)
# Put together the final nice-looking table
table1 <- data.frame(Variable = table1_cast$Varname,
                     Females = paste0("no: ", table1_cast$female_Value0,
                                      ", yes: ", table1_cast$female_Value1,
                                      ", not available: ", table1_cast$female_NA),
                     Males = paste0("no: ", table1_cast$male_Value0,
                                    ", yes: ", table1_cast$male_Value1,
                                    ", not available: ", table1_cast$male_NA))
table1$Females <- gsub(", not available: 0", "", table1$Females)
table1$Males <- gsub(", not available: 0", "", table1$Males)
table1$Variable <- as.character(table1$Variable)
# "mean ± sd" of age for one sex, rounded to two digits
ageSummary <- function(sexValue){
  ages <- earclin_right$Age[which(earclin_right$Sex == sexValue)]
  paste(round(mean(ages), digits = 2),
        "\u00B1",
        round(sd(ages), digits = 2))
}
table1 <- rbind(c("Age (mean \u00B1 sd)", ageSummary("female"), ageSummary("male")),
                table1)
# Fix some of the variable levels.
# "no:"/"yes:" stand for the first/second level of the variable, in the order
# used by table(). The labels are taken from the data so they follow the
# factor level order (Sampling_season is releveled to spring, autumn), and the
# trailing colon keeps "not available" from being matched.
relabelLevels <- function(tab, varname, levelNames){
  rows <- tab$Variable == varname
  tab[rows, ] <- gsub("no:", paste0(levelNames[1], ":"), tab[rows, ])
  tab[rows, ] <- gsub("yes:", paste0(levelNames[2], ":"), tab[rows, ])
  tab
}
table1 <- relabelLevels(table1, "Sampling season",
                        names(table(earclin_right$Sampling_season)))
table1 <- relabelLevels(table1, "Handedness",
                        names(table(earclin_right$Handedness)))
The basic demographics for the study subjects, split by sex, are as follows:
Table 1
customKable(table1)
Variable Females Males
Age (mean ± sd) 36.96 ± 10.23 38.21 ± 12.45
Allergy no: 11, yes: 16 no: 9, yes: 5
Dominant side no: 3, yes: 24 no: 2, yes: 11, not available: 1 Ear cleaning swab no: 12, yes: 15 no: 11, yes: 3
Earplugs no: 24, yes: 3 no: 14, yes: 0
Handedness left: 3, right: 24 left: 2, right: 11, not available: 1 Past ear infections no: 23, yes: 4 no: 10, yes: 3, not available: 1 Recent common cold no: 15, yes: 12 no: 9, yes: 4, not available: 1 Sampling season autumn: 8, spring: 19 autumn: 6, spring: 8
# Export table
write.table(table1, "Outputs/table1.txt", row.names = FALSE)
Some potentially interesting variables have a skewed distribution between males and females: for example, many more females than males report regularly cleaning their ears with swabs. All 3 earplug users are also female (but there are so few of these subjects that this is not a very useful variable).
Final microbiota data setup
Final phyloseq objects for analyses with the clinical data:
# Full data:
ear_ASV_final <- phyloseq(
otu_table(subset_samples(earphy_trim, Sample %in% earclin$Sample), taxa_are_rows = FALSE),
tax_table(earphy_trim), sample_data(earclin))
# Make corresponding genus-level phyloseq object
ear_gen_final <- collapseTaxLevel(ear_ASV_final, "Genus")
# Subsampled to the same number of reads per sample:
ear_ASV_final_R <- rarefy_even_depth(ear_ASV_final, rngseed = 2760757) Final sequence summary metrics:
paste("Total number of sequences:", sum(otu_table(ear_ASV_final)))
## [1] "Total number of sequences: 8878042"
paste("Mean of sequences per sample:",
round(mean(sample_sums(ear_ASV_final)), digits = 0))
## [1] "Mean of sequences per sample: 108269"
paste("SD of sequences per sample:",
round(sd(sample_sums(ear_ASV_final)), digits = 0))
## [1] "SD of sequences per sample: 30875"
paste("Number of ASVs:", ntaxa(ear_ASV_final))
## [1] "Number of ASVs: 3650"
Batch variation
The ear samples were collected in two different seasons (spring and autumn). They were also delivered to the sequencing laboratory and handled and run in these two batches, which could cause technical differences between them. Look for overall differences between batches with alpha and beta diversity measures:
Alpha diversity
# Create a data frame of alpha diversity and clinical/demographic data
adiv_estimates <- estimate_richness(ear_ASV_final_R,
                                    measures = c("Observed", "InvSimpson", "Shannon"))
# Align the diversity rows to earclin by sample name instead of relying on
# both tables being in the same order (assumes estimate_richness() returns
# one row per sample in sample_names() order -- TODO confirm)
sample_order <- match(rownames(earclin), sample_names(ear_ASV_final_R))
stopifnot(!anyNA(sample_order))
adiv_df <- cbind(earclin, adiv_estimates[sample_order, ])
Test if there are statistically significant differences in alpha diversity between batches with three commonly used alpha diversity indices (observed richness, Shannon and inverse Simpson indices; Wilcoxon rank sum test):
p-values:
# Wilcoxon rank sum test of each alpha diversity index between the two
# sampling seasons; p-values reported to one significant digit
suppressWarnings(vapply(
  c("Observed", "InvSimpson", "Shannon"),
  function(index){
    season_formula <- as.formula(paste(index, "~ Sampling_season"))
    signif(wilcox.test(season_formula, data = adiv_df)$p.value, digits = 1)
  },
  numeric(1)))
## Observed InvSimpson Shannon
## 0.6 0.6 0.2
Figure S1A
The difference between batches is not statistically significant. Plot the diversity index values:
figS1A <- plotAlphaDivs(adiv_df, "Sampling_season", "A")
100 200
spring autumn
Sampling_season
Observed
A
0 1 2 3
spring autumn
Sampling_season
Shannon
2.5 5.0 7.5 10.0
spring autumn
Sampling_season
InvSimpson
Samples collected in spring and handled as the first batch have a slightly higher alpha diversity, but the difference is not statistically significant.
Beta diversity
Plot the batches with Non-Metric Multidimensional Scaling (NMDS) ordination of Bray-Curtis dissimilarity:
# NMDS ordination of the rarefied data on Bray-Curtis dissimilarity
ear_ord <- ordinate(ear_ASV_final_R, "NMDS", "bray", try = 999, trace = FALSE)
# Ordination plot of the samples, colored and shaped by sampling season,
# with a 90 % ellipse per season
figS1B <- plot_ordination(ear_ASV_final_R, ear_ord, type = "samples",
                          color = "Sampling_season",
                          shape = "Sampling_season") +
  theme_bw() +
  coord_fixed() +
  geom_point(size = 2) +
  scale_color_manual(
    values = c("gray75", "black"),
    name = "Batch",
    labels = c("Batch 1 (spring)", "Batch 2 (autumn)")) +
  scale_shape_manual(
    values = c(16, 17),
    name = "Batch",
    labels = c("Batch 1 (spring)", "Batch 2 (autumn)")) +
  stat_ellipse(level = 0.9) +
  ggtitle("B") +
  theme(panel.grid = element_blank())
Figure S1B figS1B
−1.0
−0.5 0.0 0.5 1.0
−2 −1 0 1 2
NMDS1
NMDS2
Batch
Batch 1 (spring) Batch 2 (autumn)
B
Export figures together:
# Export both panels (A on top, B below) as a single PDF
ggsave(filename = "Outputs/figS1_batches.pdf",
       plot = grid.arrange(figS1A, figS1B, nrow = 2),
       device = cairo_pdf,
       width = 135, height = 105, units = "mm")
The samples cluster visually according to the two batches. Test for statistical significance in beta diversity difference between batches with adonis:
# Bray-Curtis dissimilarities between samples. vegdist() compares rows, so
# make sure the samples are the rows of the count matrix.
otu_mat <- as(otu_table(ear_ASV_final_R), "matrix")
if (taxa_are_rows(ear_ASV_final_R)) {
  otu_mat <- t(otu_mat)
}
dist_df <- vegdist(otu_mat, method = "bray")
# Permutational test for a difference between the sampling seasons (batches)
adonis(dist_df ~ Sampling_season,
       data = as(sample_data(ear_ASV_final_R), "data.frame"),
       permutations = 9999)
#### Call:
## adonis(formula = dist_df ~ Sampling_season, data = as(sample_data(ear_ASV_final_R), "data.frame"), permutations = 9999)
#### Permutation: free
## Number of permutations: 9999
#### Terms added sequentially (first to last)
#### Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Sampling_season 1 0.2112 0.21121 0.72049 0.00893 0.6345
## Residuals 80 23.4515 0.29314 0.99107
## Total 81 23.6627 1.00000
The difference between batches is not statistically significant.
Overall, looking at both alpha and beta diversity, the two batches differ somewhat even after contaminant trimming. Unfortunately, since the seasonal and technical batches are exactly the same, it is impossible to distinguish between biological and technical differences. The detected differences could be a mix of both.
Right and left ears
Two samples were collected from every subject, one from each ear. Look for overall differences between left and right ears using diversity measures, and additionally, check how much the same subject’s samples resemble each other when compared to unrelated subjects.
Alpha diversity
Test if there are overall statistically significant differences in diversity between right ears and left ears (Wilcoxon signed-rank test for paired samples):
p-values:
# Test for significances
# Paired Wilcoxon signed-rank test of right vs. left ears, one p-value per
# alpha diversity index (rounded to three digits).
# The left ear rows are matched to the right ear rows by Subject, so that the
# pairing does not depend on the row order of adiv_df.
adiv_right <- subset(adiv_df, Side == "right")
adiv_left <- subset(adiv_df, Side == "left")
adiv_left <- adiv_left[match(adiv_right$Subject, adiv_left$Subject), ]
# suppressWarnings() silences wilcox.test() warnings (presumably about ties or
# zero differences preventing exact p-values -- TODO confirm).
suppressWarnings(vapply(
  c("Observed", "InvSimpson", "Shannon"),
  function(index) {
    round(
      wilcox.test(
        x = adiv_right[[index]],
        y = adiv_left[[index]],
        paired = TRUE
      )$p.value,
      digits = 3
    )
  },
  numeric(1)
))
## Observed InvSimpson Shannon
## 0.291 0.316 0.389
Plots of the diversity index values:
# Plot
# Draw the alpha diversity indices in adiv_df grouped by ear side.
# NOTE(review): plotAlphaDivs() is defined outside this section; the rendered
# output shows one panel each for Observed, Shannon and InvSimpson.
plotAlphaDivs(adiv_df, "Side")
100 200
left right
Side
Observed
0 1 2 3
left right
Side
Shannon
2.5 5.0 7.5 10.0
left right
Side
InvSimpson
Overall, alpha diversity does not differ when contrasting right and left ears.
How similar are the alpha diversity values from the same subject? A plot similar to the above, but with lines connecting samples from the same subject:
# Paired view of alpha diversity: one panel per index, with a line joining the
# two samples (left and right ear) of each subject. The three panels differ
# only in the y variable, so they are generated from the index names.
do.call(
  grid.arrange,
  c(
    lapply(
      c("Observed", "Shannon", "InvSimpson"),
      function(index) {
        ggplot(adiv_df, aes(x = Side, y = .data[[index]])) +
          geom_point() +
          geom_line(aes(group = Subject)) +
          labs(y = index) +
          theme_bw() +
          theme(
            panel.grid = element_blank(),
            axis.text = element_text(color = "black")
          )
      }
    ),
    list(nrow = 1)
  )
)
100 200
left right
Side
Observed
0 1 2 3
left right
Side
Shannon
2.5 5.0 7.5 10.0
left right
Side
InvSimpson
Summary statistics for differences between the alpha diversities in right and left ear samples for each index:
# Per-subject difference (right ear minus left ear) for each alpha diversity
# index. Left ear rows are matched to right ear rows by Subject, so neither
# the differences nor the row names depend on the row order of adiv_df.
right_ear_adiv <- subset(adiv_df, Side == "right")
left_ear_adiv <- subset(adiv_df, Side == "left")
left_ear_adiv <- left_ear_adiv[
  match(right_ear_adiv$Subject, left_ear_adiv$Subject),
]
leftright_adiv_per_subj <- data.frame(
  Observed_diff = right_ear_adiv$Observed - left_ear_adiv$Observed,
  Shannon_diff = right_ear_adiv$Shannon - left_ear_adiv$Shannon,
  InvSimpson_diff = right_ear_adiv$InvSimpson - left_ear_adiv$InvSimpson,
  row.names = as.character(right_ear_adiv$Subject)
)
# Summary statistics (min, quartiles, mean, max) of each difference column
customKable(round(sapply(leftright_adiv_per_subj, summary), digits = 2))
Observed_diff Shannon_diff InvSimpson_diff
Min. -130.00 -2.48 -6.39
1st Qu. -21.00 -0.40 -0.88
Median -8.00 -0.04 -0.24
Mean -7.27 -0.09 -0.32
3rd Qu. 13.00 0.13 0.53
Max. 109.00 1.16 2.51
Aside from a few outlier subjects with a large difference between ears, most paired samples have similar alpha diversity.
Beta diversity
Plot left vs right on an NMDS ordination, regardless of subject:
# NMDS coordinates bound column-wise to the subject, sample and side labels
# NOTE(review): cbind() pairs rows by position, so this assumes ear_ord was
# computed on ear_ASV_final_R in the same sample order -- confirm.
leftright_bdivs <- cbind(
  as.data.frame(ear_ord$points),
  as(
    sample_data(ear_ASV_final_R)[, c("Subject", "Sample", "Side")],
    "data.frame"
  )
)
# Ordination coloured and shaped by ear side, with 90 % ellipses per side
ggplot(
  leftright_bdivs,
  aes(x = MDS1, y = MDS2, color = Side, shape = Side)
) +
  geom_point(size = 2) +
  stat_ellipse(level = 0.9) +
  scale_color_manual(values = c("gray60", "seagreen4")) +
  scale_shape_manual(values = c(15, 16)) +
  coord_fixed() +
  theme_bw() +
  theme(panel.grid = element_blank())
−1.0
−0.5 0.0 0.5 1.0
−2 −1 0 1
MDS1
MDS2
Side
left right
Test for statistical significance with adonis:
# PERMANOVA on the Bray-Curtis dissimilarities: left vs. right ears.
# `data` and `permutations` are named in full rather than passed positionally
# or by partial matching.
# NOTE(review): every subject contributes both ears, yet the permutations are
# unrestricted; consider restricting them within subject (the `strata`
# argument) -- confirm with the authors.
adonis(
  dist_df ~ Side,
  data = as(sample_data(ear_ASV_final_R), "data.frame"),
  permutations = 9999
)
#### Call:
## adonis(formula = dist_df ~ Side, data = as(sample_data(ear_ASV_final_R), "data.frame"), permutations = 9999)
#### Permutation: free
## Number of permutations: 9999
#### Terms added sequentially (first to last)
#### Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Side 1 0.0542 0.054203 0.18367 0.00229 0.9902
## Residuals 80 23.6085 0.295106 0.99771
## Total 81 23.6627 1.00000
Overall, the left and right sides are very similar.
Next, to see if the left and right samples from the same subject resemble each other, plot the samples with connecting lines indicating samples from the same subject.
# NMDS ordination in which the two samples of each subject are joined by a
# line; point shape encodes the ear side.
subject_pairs_nmds <- ggplot(
  leftright_bdivs,
  aes(x = MDS1, y = MDS2, shape = Side)
) +
  geom_point(size = 2) +
  coord_fixed() +
  geom_line(aes(group = Subject)) +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_text(color = "black")
  )
As a different way of visualizing the same thing, compare dissimilarities of sample pairs from the same subject to pairs from unrelated subjects. (Each data point in the plot is a dissimilarity value for one pair of samples.)
# Pairwise Bray-Curtis dissimilarities between all samples
ear_dist <- vegdist(
  as.data.frame(as.matrix(otu_table(ear_ASV_final_R))),
  method = "bray"
)
# One row per sample pair: combn() enumerates the label pairs in the same
# order in which a "dist" object stores its values.
ear_pairs <- data.frame(
  t(combn(attr(ear_dist, "Labels"), 2)),
  as.numeric(ear_dist)
)
colnames(ear_pairs) <- c("S1", "S2", "Dissimilarity")
# Look up the subject behind each sample of the pair
ear_pairs$S1_subject <-
  leftright_bdivs[match(ear_pairs$S1, leftright_bdivs$Sample), "Subject"]
ear_pairs$S2_subject <-
  leftright_bdivs[match(ear_pairs$S2, leftright_bdivs$Sample), "Subject"]
# Label each pair explicitly instead of renaming the levels of a logical
# factor, which would mislabel the pairs if only one of TRUE/FALSE occurred.
ear_pairs$SameSubject <- factor(
  ifelse(
    as.character(ear_pairs$S1_subject) == as.character(ear_pairs$S2_subject),
    "yes",
    "no"
  ),
  levels = c("yes", "no")
)
# Boxplot of the dissimilarities; the cross (shape 4) marks the group mean.
# NOTE(review): ggplot2 >= 3.3.0 deprecates `fun.y` in favour of `fun`; kept
# as-is because the ggplot2 version used for this report is not visible here.
ear_pairs_similarities <- ggplot(ear_pairs, aes(SameSubject, Dissimilarity)) +
  geom_boxplot(outlier.size = 1, outlier.colour = "gray60") +
  theme_bw() +
  ylab("Pairwise Bray-Curtis\ndissimilarity") +
  xlab("Same subject") +
  stat_summary(fun.y = mean, geom = "point", shape = 4, size = 4) +
  theme(
    panel.grid = element_blank(),
    axis.text = element_text(color = "black")
  )
Combine these plots into a figure:
Figure S2
# Figure S2: dissimilarity boxplot (left) and subject-pair NMDS (right),
# separated by a narrow blank spacer panel. Built without drawing, then drawn.
figS2 <- arrangeGrob(
  ear_pairs_similarities,
  blankPanel,
  subject_pairs_nmds,
  widths = c(0.8, 0.03, 1),
  nrow = 1
)
grid.arrange(figS2)
0.00 0.25 0.50 0.75 1.00
yes no
Same subject Pairwise Bray−Curtis dissimilarity
−1.0
−0.5 0.0 0.5 1.0
−1 0 1
MDS1
MDS2
Side
left right
# Write Figure S2 to PDF (180 mm x 75 mm)
ggsave(
  filename = "Outputs/figS2_leftright.pdf",
  plot = figS2,
  device = cairo_pdf,
  width = 180,
  height = 75,
  units = "mm"
)
Overall, as with alpha diversity, the two samples from the same subject tend to be similar in beta diversity.
Handedness
A comparison that perhaps makes more sense biologically is to compare the samples from the ear on the side of the dominant hand to that on the side of the non-dominant hand (as one could expect to see e.g. some hygiene-related differences between these).
Alpha diversity
p-values:
# Test for significances
# Paired Wilcoxon signed-rank test of the dominant-hand side vs. the
# non-dominant side, one p-value per alpha diversity index (three digits).
# subset() drops samples with missing Dominant_side. The non-dominant rows
# are matched to the dominant rows by Subject, so that the pairing does not
# depend on the row order of adiv_df.
adiv_dominant <- subset(adiv_df, Dominant_side == "yes")
adiv_nondominant <- subset(adiv_df, Dominant_side == "no")
adiv_nondominant <- adiv_nondominant[
  match(adiv_dominant$Subject, adiv_nondominant$Subject),
]
# suppressWarnings() silences wilcox.test() warnings (presumably about ties or
# zero differences preventing exact p-values -- TODO confirm).
suppressWarnings(vapply(
  c("Observed", "InvSimpson", "Shannon"),
  function(index) {
    round(
      wilcox.test(
        x = adiv_dominant[[index]],
        y = adiv_nondominant[[index]],
        paired = TRUE
      )$p.value,
      digits = 3
    )
  },
  numeric(1)
))
## Observed InvSimpson Shannon
## 0.769 0.656 0.705
There are no statistically significant differences between samples from the dominant and non-dominant side.
Plots of the diversity index values:
# Plot
# Draw the alpha diversity indices grouped by Dominant_side, after removing
# the samples whose handedness information is missing.
# NOTE(review): plotAlphaDivs() is defined outside this section.
plotAlphaDivs(subset(adiv_df, !is.na(Dominant_side)), "Dominant_side")
100 200
no yes
Dominant_side
Observed
0 1 2 3
no yes
Dominant_side
Shannon
2.5 5.0 7.5 10.0
no yes
Dominant_side
InvSimpson
There are no visually clear trends, either.
Beta diversity
Ordination plot contrasting the samples from the dominant vs non-dominant sides:
# NMDS coordinates bound column-wise to subject, sample and handedness labels
dominance_bdivs <- cbind(
  as.data.frame(ear_ord$points),
  as(
    sample_data(ear_ASV_final_R)[, c("Subject", "Sample", "Dominant_side")],
    "data.frame"
  )
)
# Drop samples with missing handedness
dominance_bdivs <- dominance_bdivs[!is.na(dominance_bdivs$Dominant_side), ]
# Plot
# Ordination coloured and shaped by dominant side, with 90 % ellipses
ggplot(
  dominance_bdivs,
  aes(x = MDS1, y = MDS2, color = Dominant_side, shape = Dominant_side)
) +
  geom_point(size = 2) +
  stat_ellipse(level = 0.9) +
  scale_color_manual(values = c("gray60", "darkblue")) +
  scale_shape_manual(values = c(15, 16)) +
  coord_fixed() +
  theme_bw() +
  theme(panel.grid = element_blank())
−1.0
−0.5 0.0 0.5 1.0
−2 −1 0 1
MDS1
MDS2
Dominant_side
no yes
Test for statistical significance with adonis:
# Distance matrix without samples with missing data
# (the assignment sits on its own line, so it is no longer swallowed by the
# comment above)
dominance_dist_df <- vegdist(
  as(
    otu_table(subset_samples(ear_ASV_final_R, !is.na(Dominant_side))),
    "matrix"
  ),
  method = "bray"
)
# PERMANOVA: dominant vs. non-dominant side, on the same subset of samples.
# `data` and `permutations` are named in full rather than passed positionally
# or by partial matching.
adonis(
  dominance_dist_df ~ Dominant_side,
  data = as(
    sample_data(subset_samples(ear_ASV_final_R, !is.na(Dominant_side))),
    "data.frame"
  ),
  permutations = 9999
)
#### Call:
## adonis(formula = dominance_dist_df ~ Dominant_side, data = as(sample_data(subset_samples(ear_ASV_final_R, !is.na(Dominant_side))), "data.frame"), permutations = 9999)
#### Permutation: free
## Number of permutations: 9999
#### Terms added sequentially (first to last)
#### Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## Dominant_side 1 0.0503 0.050346 0.17256 0.00221 0.9934
## Residuals 78 22.7568 0.291754 0.99779
## Total 79 22.8071 1.00000
There is no detectable difference in beta diversity between samples from the side of the dominant hand and samples from the side of the non-dominant hand.
Trim to right ears
Since left and right ears are so similar in almost all subjects, and there is also no notable difference between samples depending on whether they represent the side of the dominant hand or not, focus on right ear data for the following analyses to simplify the statistical comparisons.
Phyloseq objects with only right ear samples:
# Keep only the right ear samples, at ASV level and at genus level
ear_ASV_right <- subset_samples(ear_ASV_final, Side == "right")
ear_gen_right <- subset_samples(ear_gen_final, Side == "right")
# Subsample to the same number of reads per sample:
# (each call passes its own fixed rngseed)
ear_ASV_right_R <- rarefy_even_depth(
  physeq = ear_ASV_right,
  rngseed = 7554714
)
ear_gen_right_R <- rarefy_even_depth(
  physeq = ear_gen_right,
  rngseed = 1992623
)
Most common bacterial taxa
Focusing on the right ear samples, summarize the numbers of taxa on different taxonomic levels:
# Report the number of ASVs, classified genera and classified families in the
# right ear data. paste0() replaces paste(..., sep = "").
print(paste0(
  "ASVs: ",
  ntaxa(ear_ASV_right),
  ", genera: ",
  ntaxa(subset_taxa(ear_gen_right, Genus != "unclassified")),
  ", families: ",
  ntaxa(subset_taxa(
    collapseTaxLevel(ear_ASV_right, level = "Family"),
    Family != "unclassified"
  ))
))
## [1] "ASVs: 3650, genera: 462, families: 174"
ASVs
Overall, the 20 most common ASVs in the data are the following:
# Relative abundances of the 20 most common ASVs (byVar = "Side"; all of these
# samples are right ears)
top_asv_ra <- relAbundChart(
  ear_ASV_right,
  taxaCount = 20,
  table = TRUE,
  byVar = "Side"
)
# Label the first 20 rows with their "Species" entry from the taxonomy table
# and add the trailing "other taxa" row
top_asvs <- data.frame(
  Taxonomy = c(
    as(
      tax_table(ear_ASV_right)[rownames(top_asv_ra)[seq_len(20)], "Species"],
      "vector"
    ),
    "other taxa"
  ),
  Relative_abundance = top_asv_ra[[1]]
)
customKable(
  top_asvs,
  d = 2,
  cn = c("ASV", "Mean relative abundance (%)")
)
ASV Mean relative abundance (%)
ASV1: Staphylococcus auricularis 28.65
ASV2: Propionibacterium acnes 15.35
ASV3: Alloiococcus otitis 15.15
ASV4: Turicella otitidis 6.54
ASV5: Turicella unclassified 5.88
ASV6: Staphylococcus unclassified 4.63
ASV7: Turicella unclassified 3.61
ASV8: Turicella unclassified 1.98
ASV13: Alloiococcus unclassified 1.27
ASV11: Alloiococcus unclassified 1.20
ASV17: Corynebacterium auris 0.99
ASV12: Staphylococcus unclassified 0.95
ASV15: Staphylococcus unclassified 0.78
ASV25: Enhydrobacter aerosaccus 0.69
ASV16: Staphylococcus unclassified 0.67
ASV14: Staphylococcus unclassified 0.63
ASV10: Staphylococcus unclassified 0.58
ASV19: Alloiococcus unclassified 0.55
ASV9: Corynebacterium unclassified 0.41
ASV28: Kocuria koreensis 0.37
other taxa 9.12
Plot the abundances of the top ASVs in each sample; samples organized according to whether ASV1, ASV2, ASV3 or some other ASV was the most common taxon.
# Relative abundances of top 20 taxa
top_ra_samples <- relAbundChart(ear_ASV_right, taxaCount = 20, table = TRUE)
# Top ASV for each sample
# (columns are samples, row names are taxa; vapply() guarantees a character
# vector with exactly one entry per sample)
sample_max_asv <- vapply(
  top_ra_samples,
  function(x) rownames(top_ra_samples)[which.max(x)],
  character(1)
)
sample_max_asv[!(sample_max_asv %in% c("ASV1", "ASV2", "ASV3"))] <- "other"
sample_order <- names(sort(sample_max_asv))
# Add taxonomy info
top_ra_samples$ASV <- c(
  as(
    tax_table(ear_ASV_right)[rownames(top_ra_samples)[1:20], "Species"],
    "vector"
  ),
  "other taxa"
)
# Rearrange data
# (the id column is named explicitly instead of letting melt() guess it)
top_ra_samples_melt <- melt(top_ra_samples, id.vars = "ASV")
top_ra_samples_melt$ASV <- factor(
  top_ra_samples_melt$ASV,
  levels = top_ra_samples$ASV
)
top_ra_samples_melt$variable <- factor(
  top_ra_samples_melt$variable,
  levels = sample_order
)
top_ra_samples_melt <-
  top_ra_samples_melt[order(top_ra_samples_melt$variable), ]
# Split into two for plotting
# (first 20 samples / the rest; head() and tail() also behave when there are
# 20 samples or fewer, unlike 21:length(...))
asv_sample_levels <- levels(top_ra_samples_melt$variable)
top_ra_samples_melt1 <- subset(
  top_ra_samples_melt,
  variable %in% head(asv_sample_levels, 20)
)
top_ra_samples_melt2 <- subset(
  top_ra_samples_melt,
  variable %in% tail(asv_sample_levels, -20)
)
# Draw the legend and the plot in two rows
# The legend is taken from a separate plot that has its own legend settings.
asv_l <- g_legend(
  ggplot(top_ra_samples_melt1, aes(x = variable, y = value, fill = ASV)) +
    geom_bar(stat = "identity", position = "stack") +
    theme_bw(base_size = 9) +
    guides(fill = guide_legend(ncol = 1)) +
    scale_fill_manual(values = relacols2, name = "Amplicon Sequence Variant") +
    theme(legend.key.size = unit(4.5, "mm"))
)
# Stacked bar panel without legend; both rows of the figure share this layout.
asv_sample_panel <- function(df) {
  ggplot(df, aes(x = variable, y = value, fill = ASV)) +
    geom_bar(stat = "identity", position = "stack") +
    xlab(NULL) +
    ylab("Relative abundance (%)") +
    scale_fill_manual(values = relacols2) +
    theme_bw(base_size = 10) +
    theme(
      axis.text.x = element_text(color = "black", angle = 45, hjust = 1),
      panel.grid = element_blank(),
      legend.position = "none"
    )
}
asv_p1 <- asv_sample_panel(top_ra_samples_melt1)
asv_p2 <- asv_sample_panel(top_ra_samples_melt2)
fig_asv <- arrangeGrob(
  arrangeGrob(asv_p1, asv_p2, nrow = 2),
  asv_l,
  nrow = 1,
  widths = c(7.2, 2.8)
)
grid.arrange(fig_asv)
0 25 50 75 100
Sample_5Sample_15Sample_21Sample_31Sample_33Sample_45Sample_49Sample_59Sample_61Sample_69Sample_71Sample_81Sample_83Sample_91Sample_95Sample_97Sample_13Sample_23Sample_37Sample_41
Relative abundance (%)
0 25 50 75 100
Sample_65Sample_99Sample_11Sample_17Sample_25Sample_43Sample_47Sample_57Sample_63Sample_79Sample_9Sample_29Sample_35Sample_51Sample_53Sample_55Sample_67Sample_73Sample_75Sample_87Sample_89
Relative abundance (%)
Amplicon Sequence Variant
ASV1: Staphylococcus auricularis ASV2: Propionibacterium acnes ASV3: Alloiococcus otitis ASV4: Turicella otitidis ASV5: Turicella unclassified ASV6: Staphylococcus unclassified ASV7: Turicella unclassified ASV8: Turicella unclassified ASV13: Alloiococcus unclassified ASV11: Alloiococcus unclassified ASV17: Corynebacterium auris ASV12: Staphylococcus unclassified ASV15: Staphylococcus unclassified ASV25: Enhydrobacter aerosaccus ASV16: Staphylococcus unclassified ASV14: Staphylococcus unclassified ASV10: Staphylococcus unclassified ASV19: Alloiococcus unclassified ASV9: Corynebacterium unclassified ASV28: Kocuria koreensis other taxa
Genera
A similar table and plot, but for the top 10 genera:
# Relative abundance table for the most common genera
# NOTE(review): taxaCount = 11 while the text says "top 10"; the rendered
# table shows 10 genera plus "other taxa" and "unclassified taxa" rows.
top_gen_ra <- relAbundChart(
  ear_gen_right,
  taxaCount = 11,
  table = TRUE,
  byVar = "Side"
)
customKable(
  top_gen_ra,
  d = 2,
  cn = "Mean relative abundance (%)"
)
Mean relative abundance (%)
Staphylococcus 39.46
Alloiococcus 19.54
Turicella 18.25
Propionibacterium 15.54
Corynebacterium 2.15
Streptococcus 0.87
Enhydrobacter 0.70
Kocuria 0.41
Chryseobacterium 0.40
Snodgrassella 0.15
other taxa 1.99
unclassified taxa 0.54
# Put data together
top_gen_samples <- relAbundChart(ear_gen_right, taxaCount = 11, table = TRUE)
# Get top genus for each sample and rearrange data for plotting
# (columns are samples, row names are genera; vapply() guarantees a character
# vector with exactly one entry per sample)
sample_max_gen <- vapply(
  top_gen_samples,
  function(x) rownames(top_gen_samples)[which.max(x)],
  character(1)
)
sample_max_gen <- factor(
  sample_max_gen,
  levels = c("Staphylococcus", "Propionibacterium", "Alloiococcus", "Turicella")
)
# na.last = TRUE keeps samples whose top genus is none of the four levels
# (NA in the factor) at the end of the order; the default sort() would drop
# them, and they would then silently disappear from the plots.
sample_order_gen <- names(sort(sample_max_gen, na.last = TRUE))
top_gen_samples$Genus <- rownames(top_gen_samples)
# The id column is named explicitly instead of letting melt() guess it
top_gen_samples_melt <- melt(top_gen_samples, id.vars = "Genus")
top_gen_samples_melt$Genus <- factor(
  top_gen_samples_melt$Genus,
  levels = top_gen_samples$Genus
)
top_gen_samples_melt$variable <- factor(
  top_gen_samples_melt$variable,
  levels = sample_order_gen
)
top_gen_samples_melt <-
  top_gen_samples_melt[order(top_gen_samples_melt$variable), ]
# Split into two for plotting
# (first 20 samples / the rest; head() and tail() also behave when there are
# 20 samples or fewer, unlike 21:length(...))
gen_sample_levels <- levels(top_gen_samples_melt$variable)
top_gen_samples_melt1 <- subset(
  top_gen_samples_melt,
  variable %in% head(gen_sample_levels, 20)
)
top_gen_samples_melt2 <- subset(
  top_gen_samples_melt,
  variable %in% tail(gen_sample_levels, -20)
)
# Draw plots gen_l <-
g_legend(ggplot(top_gen_samples_melt1, aes(x = variable, y = value, fill = Genus)) + geom_bar(stat = "identity", position = "stack") +
theme_bw(base_size = 9) +