• Tidak ada hasil yang ditemukan

R Code for Sjövall, A. et al.: Microbiome of the external auditory canal

N/A
N/A
Protected

Academic year: 2024

Membagikan "R Code for Sjövall, A. et al.: Microbiome of the external auditory canal"

Copied!
60
0
0

Teks penuh

(1)

R Code for Sjövall, A. et al.:

Microbiome of the external auditory canal

Velma T. E. Aho 03 January 2020

Contents

Setting up data and tools 2

Basic statistics for sequence data 7

Technical controls and contaminants . . . 8

Subject demographics 14 Table 1 . . . 16

Final microbiota data setup 17 Batch variation 17 Alpha diversity . . . 17

Figure S1A . . . 18

Beta diversity . . . 18

Figure S1B . . . 19

Right and left ears 20 Alpha diversity . . . 20

Beta diversity . . . 22

Figure S2 . . . 23

Handedness 24 Alpha diversity . . . 24

Beta diversity . . . 24

Trim to right ears 26 Most common bacterial taxa 26 ASVs . . . 26

Genera . . . 29

Figure S3 (ASVs + genera depending on classification) . . . 30

Taxon prevalences . . . 34

Table S1 . . . 34

Clustering by microbial community type 38 Figure 1 . . . 42

Microbiota and clinical variables 43 Alpha diversity . . . 44

Beta diversity . . . 46

Table 2 . . . 50

Figure 2 . . . 50

Clustering and clinical variables 51

(2)

Differential abundance 53 Figure S4 . . . 57

Session info 57

Setting up data and tools

Load required packages:

library("kableExtra") library("phyloseq") library("ggplot2") library("gridExtra") library("reshape2") library("decontam") library("vegan") library("tools") library("dplyr") library("cluster") library("clusterSim") library("ade4")

library("adegraphics") library("DESeq2")

Set up functions and variables for use throughout the analysis:

# Styling function for table output.
#
# Renders `df` as a LaTeX booktabs table with striped rows.
#   df  - object to tabulate (passed straight to kable()).
#   d   - number of digits for numeric values.
#   cn  - column names; the default NA is handed to kable() unchanged.
#   pos - horizontal position of the table on the page.
# Numbers are printed with "," as the thousands separator.
customKable <- function(df, d = 3, cn = NA, pos = "center") {
  latex_table <- kable(
    df,
    format = "latex",
    digits = d,
    col.names = cn,
    booktabs = TRUE,
    linesep = "",
    format.args = list(big.mark = ",")
  )
  kable_styling(latex_table, latex_options = "striped", position = pos)
}

# Blank panel for grid graphics
# (grid must be attached explicitly: gpar() and grid.rect() live there)
library("grid")

# Rectangle grob with a white outline, created without drawing it; used as an
# empty filler cell when arranging several plots on one page.
blankPanel <- grid.rect(gp = gpar(col = "white"), draw = FALSE)

# Legend-grabbing function
#
# Builds the plot, converts it to a gtable and returns the grob whose name is
# "guide-box" (the legend), so that it can be placed separately in a layout.
#   a.gplot - a ggplot object.
# Stops with an explicit message if the built plot contains no such grob.
g_legend <- function(a.gplot) {
  plot_table <- ggplot_gtable(ggplot_build(a.gplot))
  grob_names <- vapply(plot_table$grobs, function(x) x$name, character(1))
  leg <- which(grob_names == "guide-box")
  if (length(leg) == 0) {
    stop("No 'guide-box' grob found: the plot has no legend to extract.",
         call. = FALSE)
  }
  # `[[` needs a single index; take the first match if there are several
  plot_table$grobs[[leg[1]]]
}

# Alpha diversity plotting function for three indices
#
# Draws three box plots side by side (one row): Observed, Shannon and
# InvSimpson, each grouped by the column named in `var`.
#   df    - data frame containing the columns Observed, Shannon, InvSimpson
#           and the column named by `var`.
#   var   - name (character string) of the grouping column for the x axis;
#           also used as the x axis label.
#   title - title shown above the first (Observed) panel only.
# Returns the value of grid.arrange(), which also draws the figure.
plotAlphaDivs <- function(df, var, title = "") {
  # One box plot for a single diversity index. Columns are looked up inside
  # `df` through the .data pronoun instead of indexing `df` from within aes().
  index_panel <- function(index, panel_title) {
    ggplot(df, aes(x = .data[[var]], y = .data[[index]])) +
      geom_boxplot() +
      xlab(var) +
      ylab(index) +
      theme_bw() +
      ggtitle(panel_title) +
      theme(panel.grid = element_blank(),
            axis.text = element_text(color = "black"))
  }

  grid.arrange(
    index_panel("Observed", title),
    index_panel("Shannon", ""),
    index_panel("InvSimpson", ""),
    nrow = 1)
}

# Pre-selected (hand-picked) color variables for relative abundance bar charts

# 11 colors
relacols1 <- c("#01665E", "#C51B7D", "#F1B6DA", "#35978F", "#DFC27D", "#BF812D",
               "darkslateblue", "thistle", "#C7EAE5", "gray75", "gray25")

# 21 colors
relacols2 <- c("#01665E", "#DFC27D", "slategray3", "plum4", "thistle", "#35978F",
               "plum3", "thistle1", "lightsteelblue2", "skyblue3",
               "lightsalmon4", "lightseagreen", "darkseagreen2", "peachpuff1",
               "mediumaquamarine", "seagreen4", "darkseagreen4", "deepskyblue4",
               "mistyrose2", "lightpink2", "gray50")

# 12 colors
relacols3 <- c("#01665E", "lightsteelblue2", "plum4", "#DFC27D", "salmon2",
               "lightsalmon4", "peachpuff1", "palegreen2", "mediumpurple",
               "pink", "gray50", "black")

# 9 colors
relacols4 <- c("#01665E", "#DFC27D", "slategray3", "plum4", "#35978F",
               "thistle", "lightsteelblue2", "gray50", "black")

# Function for relative abundance tables and bar charts
#
# Keeps the `taxaCount` taxa with the highest total read count (plus any taxon
# literally named "unclassified"), merges all remaining taxa with merge_taxa(),
# converts counts to per-sample percentages and returns either a stacked bar
# chart or the underlying table.
#
# Arguments:
#   phyloObj   - phyloseq object; transposed internally if taxa are columns.
#   taxaCount  - number of top taxa shown individually.
#   byVariable - "sample" gives one bar per sample; any other value is looked
#                up as a sample_data column and abundances are averaged within
#                each of its levels.
#   cols       - fill colors passed to scale_fill_manual().
#   table      - if TRUE, return the percentage table (taxa as rows) instead
#                of the plot.
#
# Returns a ggplot object, or a data frame when `table` is TRUE.
relAbundChart <- function(phyloObj, taxaCount = 10,
                          byVariable = "sample", cols, table = FALSE) {

  # Percentage table with samples as rows and taxa as columns. Taxa are ordered
  # by decreasing mean abundance; rows with NA Phylum ("other taxa") and the
  # row named "unclassified" ("unclassified taxa") are moved to the end.
  # NOTE(review): NA Phylum is presumably how the taxon produced by
  # merge_taxa() is recognized - confirm against phyloseq's merge behavior.
  makeAbundTable <- function(phyloO) {
    abund_table <- as.data.frame(prop.table(otu_table(phyloO), 2) * 100)
    otherpos <- which(is.na(tax_table(phyloO)[, "Phylum"]))
    ucpos <- grep("^unclassified$", rownames(abund_table))

    # Nothing to move to the end: just sort and transpose
    if (length(otherpos) == 0 && length(ucpos) == 0) {
      sorted_table <- abund_table[order(rowMeans(abund_table),
                                        decreasing = TRUE), ]
      return(as.data.frame(t(sorted_table)))
    }

    # Safe negative index: at least one of the two position vectors is
    # non-empty here because of the early return above
    final_table <- abund_table[-c(ucpos, otherpos), ]
    final_table <- final_table[order(rowMeans(final_table),
                                     decreasing = TRUE), ]
    if (length(otherpos) != 0) {
      other_taxa <- abund_table[otherpos, ]
      rownames(other_taxa) <- "other taxa"
      final_table <- rbind(final_table, other_taxa)
    }
    if (length(ucpos) != 0) {
      ucTaxa <- abund_table[ucpos, ]
      rownames(ucTaxa) <- "unclassified taxa"
      final_table <- rbind(final_table, ucTaxa)
    }
    as.data.frame(t(final_table))
  }

  # Mean abundance per level of the grouping column `variable`.
  # aggregate() also tries to average the grouping column itself (hence the
  # suppressed warning); that last column is cut off again by the index.
  aggregateAbunds <- function(table, variable) {
    agg_table <- suppressWarnings(
      aggregate(table, by = list(table[, variable]),
                FUN = "mean"))[, seq_len(ncol(table))]
    rownames(agg_table) <- agg_table[, 1]
    # drop = FALSE keeps a data frame even when a single taxon column remains
    agg_table[, 2:ncol(agg_table), drop = FALSE]
  }

  # Long format for ggplot: one row per (taxon, sample-or-group) pair
  gg2table <- function(table) {
    tableGG <- as.data.frame(t(table))
    tableGG$Type <- rownames(tableGG)
    melt(tableGG, id.vars = "Type")
  }

  byVar <- sample_data(phyloObj)[[byVariable]]

  if (!taxa_are_rows(phyloObj)) {
    # NOTE(review): taxa_are_rows = TRUE is passed to phyloseq(), not to
    # otu_table(); kept exactly as in the original call - confirm it is needed.
    phyloObj <- phyloseq(tax_table(phyloObj),
                         sample_data(phyloObj),
                         otu_table(t(otu_table(phyloObj))),
                         taxa_are_rows = TRUE)
  }

  # The legend is titled after the last (most specific) taxonomy column
  legend_title <- colnames(tax_table(phyloObj))[ncol(tax_table(phyloObj))]

  # head() copes with taxaCount larger than the number of taxa (no NA names)
  # and with taxaCount = 0 (no taxon selected)
  taxa_by_abundance <- names(sort(
    taxa_sums(phyloObj) / sum(sample_sums(phyloObj)), decreasing = TRUE))
  topTaxa <- c(head(taxa_by_abundance, taxaCount), "unclassified")

  all_taxa <- rownames(tax_table(phyloObj))
  phylo_merged <- merge_taxa(phyloObj, all_taxa[!(all_taxa %in% topTaxa)])
  phylo_abunds <- makeAbundTable(phylo_merged)

  if (byVariable != "sample") {
    phylo_abunds <- cbind(phylo_abunds, byVar)
    phylo_abunds <- aggregateAbunds(phylo_abunds, "byVar")
  }

  if (isTRUE(table)) {
    return(as.data.frame(t(as.matrix(phylo_abunds))))
  }

  plot_df <- gg2table(phylo_abunds)
  # Fix the stacking/legend order to the order produced by makeAbundTable()
  plot_df$Type <- factor(plot_df$Type, levels = unique(plot_df$Type))

  abund_bar_chart <- ggplot(plot_df, aes(variable, value, fill = Type)) +
    geom_bar(stat = "identity", position = "stack") +
    theme_bw(base_size = 10) +
    xlab(NULL) +
    ylab("Mean relative abundance (%)") +
    scale_fill_manual(values = cols, name = legend_title,
                      limits = levels(plot_df$Type))

  # Per-sample bars are not averages, so relabel the y axis
  if (byVariable == "sample") {
    abund_bar_chart <- abund_bar_chart + ylab("Relative abundance (%)")
  }
  abund_bar_chart
}

# Function for merging data to different taxonomic levels in phyloseq objects
#
# Sums the read counts of all taxa that share the same value in the taxonomy
# column `level` and returns a new phyloseq object whose taxa are the distinct
# values of that column.
#
# Arguments:
#   phylo_obj - phyloseq object with otu_table, tax_table and sample_data;
#               transposed internally if taxa are columns.
#   level     - exact name of a tax_table column (e.g. "Genus").
#
# For any level other than the first taxonomy column, every value matching
# ".*_unclassified" is first replaced by plain "unclassified", so all such
# taxa are pooled into a single "unclassified" taxon.
collapseTaxLevel <- function(phylo_obj, level) {

  # Count matrix summed per value of `level` (rows), one column per sample,
  # with rows sorted alphabetically
  collapseOtuTable <- function(phylo_obj, level) {
    tax <- as.data.frame(as(tax_table(phylo_obj), "matrix"))
    otu <- as.data.frame(as(otu_table(phylo_obj), "matrix"))
    otu[, level] <- tax[, level]
    otu_long <- melt(otu, id.vars = level)
    otu_collapsed <- acast(otu_long,
                           as.formula(paste0(level, "~variable")), sum)
    otu_collapsed[order(rownames(otu_collapsed)), ]
  }

  if (!taxa_are_rows(phylo_obj)) {
    # NOTE(review): taxa_are_rows = TRUE is passed to phyloseq(), not to
    # otu_table(); kept exactly as in the original call - confirm it is needed.
    phylo_obj <- phyloseq(tax_table(phylo_obj), sample_data(phylo_obj),
                          otu_table(t(otu_table(phylo_obj))),
                          taxa_are_rows = TRUE)
  }

  # Exact match on the column name: a regex search could hit several columns
  levelNum <- match(level, colnames(tax_table(phylo_obj)))
  if (is.na(levelNum)) {
    stop("'", level, "' is not a column of the taxonomy table.", call. = FALSE)
  }

  if (levelNum == 1) {
    otu_collapsed <- collapseOtuTable(phylo_obj, level)
    tax_collapsed <- data.frame(rownames(otu_collapsed),
                                row.names = rownames(otu_collapsed))
    colnames(tax_collapsed) <- level
  } else {
    tax <- as(tax_table(phylo_obj), "matrix")
    tax <- apply(tax, 2,
                 function(x) gsub(".*_unclassified", "unclassified", x))
    phylo_obj <- phyloseq(otu_table(phylo_obj), sample_data(phylo_obj),
                          tax_table(tax))
    otu_collapsed <- collapseOtuTable(phylo_obj, level)

    tax <- as(tax_table(phylo_obj), "matrix")
    tax_collapsed <- unique(tax[, seq_len(levelNum), drop = FALSE])
    # Logical filter instead of x[-grep(...), ]: a negative index built from
    # an empty grep() result would select zero rows and wipe the whole table
    # when no taxon is unclassified at this level.
    is_unclassified <- grepl("unclassified", tax_collapsed[, levelNum])
    tax_collapsed <- tax_collapsed[!is_unclassified, , drop = FALSE]
    # Single pooled row for everything unclassified at this level
    tax_collapsed <- rbind(tax_collapsed, rep("unclassified", levelNum))
    rownames(tax_collapsed) <- tax_collapsed[, level]
    tax_collapsed <- tax_collapsed[order(rownames(tax_collapsed)), ,
                                   drop = FALSE]
  }

  phyloseq(otu_table(otu_collapsed, taxa_are_rows = TRUE),
           tax_table(tax_collapsed),
           sample_data(phylo_obj))
}

Import the 16S rRNA gene amplicon data (from DADA2) to make a phyloseq object:

# Read counts
ear_OTUs <- readRDS("ears_seqtab_final_v2.rds")

# Taxonomy
ear_tax <- readRDS("ears_tax_final_v2.rds")

# Both tables are renamed by position below ("ASV1", "ASV2", ...), so they must
# describe the same number of ASVs; otherwise counts and taxonomy would be
# paired up wrongly without any error.
# NOTE(review): this also assumes both files list the ASVs in the same order -
# confirm against the DADA2 export.
stopifnot(ncol(ear_OTUs) == nrow(ear_tax))

colnames(ear_OTUs) <- paste0("ASV", seq_len(ncol(ear_OTUs)))
rownames(ear_tax) <- paste0("ASV", seq_len(nrow(ear_tax)))

# Fix empty taxonomy cells to "unclassified"
ear_tax[is.na(ear_tax)] <- "unclassified"

# Species label = "<ASV id>: <Genus> <Species>", which makes it unique per ASV
ear_tax[, "Species"] <- paste0(rownames(ear_tax), ": ", ear_tax[, "Genus"],
                               " ", ear_tax[, "Species"])
ear_tax[, "Species"] <- gsub("unclassified unclassified", "unclassified",
                             ear_tax[, "Species"])

# Simple metadata for preliminary data exploration
ear_meta <- read.csv("basic_meta.csv", header = TRUE)
rownames(ear_meta) <- ear_meta$Sample

# Negative control or not -variable (TRUE for every non-"sample" type)
ear_meta$Neg <- ear_meta$Type != "sample"

# Make phyloseq object
earphy <- phyloseq(otu_table(ear_OTUs, taxa_are_rows = FALSE),
                   tax_table(ear_tax),
                   sample_data(ear_meta))

(7)

Basic statistics for sequence data

There are a total of 10,266,925 sequence reads and a total of 3,767 unique Amplicon Sequence Variants (ASVs) in the data. ASVs represent microbes that have an identical V3-V4 16S rRNA sequence according to the analysis run with dada2, version 1.7.7.

The number of samples in the data is 109. Out of these samples, 9 are technical controls (2 sampling blanks, 3 DNA extraction blanks and 4 PCR control blanks), while 100 are actual ear samples.

Basic statistics for the distribution of sequence reads in the samples:

customKable(as.matrix(summary(sample_sums(earphy))), d = 0, cn = "Sequence reads") Sequence reads

Min. 32

1st Qu. 71,724

Median 96,291

Mean 94,192

3rd Qu. 124,324

Max. 169,833

Histogram of the number of reads per sample:

ggplot(data.frame(ReadSum = sample_sums(earphy)), aes(x = ReadSum)) + geom_histogram(color = "black", fill = "gray60") +

theme_bw() +

scale_y_continuous(breaks = seq(0, 10, 2)) + xlab("Sequence reads") +

ylab("Number of samples") +

theme(panel.grid.minor = element_blank(), panel.grid.major.x = element_blank())

0 2 4 6 8 10

0 50000 100000 150000

Sequence reads

Number of samples

All samples with < 10000 reads are technical control samples, as can be seen from a table of 10 samples with the least reads:

customKable(sort(sample_sums(earphy))[1:10], cn = "Read count")

(8)

Read count

Sample_BIPCR0ctrl2_1 32

Sample_BIPCR0ctrl2_2 43

Sample_kitctrl2_1 67

Sample_86blank 140

Sample_kitctrl2_2 329

Sample_85blank 530

Sample_kitctrl 714

Sample_BIPCR0ctrl1 2,431

Sample_BIPCR0ctrl2 13,283

Sample_39 16,607

Technical controls and contaminants

Use the R package “decontam” to estimate the amounts of contaminants in samples, based on both presence in the negative controls, and the relationship of taxon abundance with DNA concentration in samples.

# Estimate contaminants
# "combined" method: uses both the DNA concentration column ("Conc") and the
# negative-control flag ("Neg"), evaluated within each level of "Batch".
earphy_conts <- isContaminant(earphy, conc = "Conc", neg = "Neg",
                              method = "combined", batch = "Batch",
                              threshold = 0.1)

# Append the taxonomy, matched by ASV name rather than by row position
earphy_conts <- cbind(
  earphy_conts,
  as(tax_table(earphy), "matrix")[rownames(earphy_conts), , drop = FALSE])

Total number of suspected contaminant ASVs (of 3,767):

nrow(subset(earphy_conts, contaminant == TRUE))

## [1] 56

Details for the 20 most common ones:

kable_styling(kable(

head(subset(earphy_conts, contaminant == TRUE), 20)[,c(5, 9:10, 12:13)], digits = 4, format = "latex", booktabs = TRUE, linesep = "",

col.names = c("p-value", "Class", "Order", "Genus", "Species")), latex_options = "striped", font_size = 8)

p-value Class Order Genus Species

ASV20 0.0268 Betaproteobacteria Burkholderiales Burkholderia ASV20: Burkholderia caledonica ASV26 0.0365 Gammaproteobacteria Pseudomonadales Pseudomonas ASV26: Pseudomonas unclassified ASV30 0.0106 Gammaproteobacteria Oceanospirillales Halomonas ASV30: Halomonas phoceae ASV32 0.0233 Betaproteobacteria Burkholderiales Burkholderia ASV32: Burkholderia unclassified ASV33 0.0133 Gammaproteobacteria Alteromonadales Shewanella ASV33: Shewanella unclassified ASV54 0.0078 Betaproteobacteria Burkholderiales Ralstonia ASV54: Ralstonia unclassified ASV58 0.0034 Gammaproteobacteria Oceanospirillales Halomonas ASV58: Halomonas unclassified ASV70 0.0064 Gammaproteobacteria Xanthomonadales Stenotrophomonas ASV70: Stenotrophomonas maltophilia ASV76 0.0064 Betaproteobacteria Burkholderiales Ralstonia ASV76: Ralstonia insidiosa

ASV80 0.0031 Gammaproteobacteria Oceanospirillales Halomonas ASV80: Halomonas unclassified ASV83 0.0065 Betaproteobacteria Burkholderiales Burkholderia ASV83: Burkholderia unclassified ASV86 0.0113 Betaproteobacteria Burkholderiales Ralstonia ASV86: Ralstonia solanacearum ASV94 0.0581 Betaproteobacteria Burkholderiales Burkholderia ASV94: Burkholderia unclassified ASV99 0.0018 Betaproteobacteria Burkholderiales Burkholderia ASV99: Burkholderia unclassified ASV106 0.0037 Gammaproteobacteria Alteromonadales Shewanella ASV106: Shewanella unclassified ASV112 0.0030 Gammaproteobacteria Oceanospirillales Halomonas ASV112: Halomonas unclassified ASV113 0.0117 Gammaproteobacteria Oceanospirillales Halomonas ASV113: Halomonas unclassified ASV117 0.0195 Gammaproteobacteria Oceanospirillales Halomonas ASV117: Halomonas unclassified ASV122 0.0050 Gammaproteobacteria Oceanospirillales Halomonas ASV122: Halomonas unclassified ASV144 0.0831 Acidobacteria_Gp2 Gp2 unclassified ASV144: unclassified

(9)

Example plots for taxa labeled as contaminants:

# Make a combined sample type/batch variable for coloring
sample_data(earphy)$BatchType <- factor(
  paste(sample_data(earphy)$Batch, sample_data(earphy)$Type))

# Frequency plots for first 9 contaminants
# theme_bw() is applied once, BEFORE theme(): adding a complete theme at the
# end of the chain would replace the whole theme and silently undo
# legend.position = "bottom".
plot_frequency(earphy,
               rownames(subset(earphy_conts, contaminant == TRUE))[1:9],
               conc = "Conc") +
  theme_bw() +
  theme(legend.position = "bottom") +
  geom_point(aes(color = BatchType)) +
  scale_color_manual(values = c("dodgerblue", "orchid1", "gray40",
                                "blue", "maroon3", "black",
                                "seagreen1")) +
  xlab("DNA Concentration") +
  ylab("log(frequency)")

ASV58 ASV70 ASV76

ASV32 ASV33 ASV54

ASV20 ASV26 ASV30

3 10 30 3 10 30 3 10 30

1e−04 1e−02 1e+00

1e−04 1e−02 1e+00

1e−04 1e−02 1e+00

DNA Concentration

log(frequency)

BatchType

Batch1 extraction_blank Batch1 PCR_blank Batch1 sample

Batch2 extraction_blank Batch2 PCR_blank Batch2 sample Batch2 sampling_blank

As a comparison, plot first 9 non-contaminants:

# Frequency plots for the first 9 ASVs NOT flagged as contaminants
# theme_bw() is applied once, BEFORE theme(): adding a complete theme at the
# end of the chain would replace the whole theme and silently undo
# legend.position = "bottom".
plot_frequency(earphy,
               rownames(subset(earphy_conts, contaminant == FALSE))[1:9],
               conc = "Conc") +
  theme_bw() +
  theme(legend.position = "bottom") +
  geom_point(aes(color = BatchType)) +
  scale_color_manual(values = c("dodgerblue", "orchid1", "gray40",
                                "blue", "maroon3", "black",
                                "seagreen1")) +
  xlab("DNA Concentration") +
  ylab("log(frequency)")

ASV7 ASV8 ASV9

ASV4 ASV5 ASV6

ASV1 ASV2 ASV3

3 10 30 3 10 30 3 10 30

1e−04 1e−03 1e−02 1e−01 1e+00

1e−04 1e−03 1e−02 1e−01 1e+00

1e−04 1e−03 1e−02 1e−01 1e+00

DNA Concentration

log(frequency)

BatchType

Batch1 extraction_blank Batch1 PCR_blank Batch1 sample

Batch2 extraction_blank Batch2 PCR_blank Batch2 sample Batch2 sampling_blank

Make a plot summarizing read counts and contaminant status:

ear_cont_simpl <- data.frame(

Cont = colSums(t(otu_table(earphy))[rownames(subset( earphy_conts, contaminant == TRUE)), ]),

NotCont = colSums(t(otu_table(earphy))[rownames(subset( earphy_conts, contaminant == FALSE)), ]),

sample_data(earphy))

# Relevel the sample variable numerically ear_cont_simpl$Sample <- factor(

ear_cont_simpl$Sample,

levels = c(paste("Sample", 1:102, sep = "_"),

unique(as.character(subset(ear_cont_simpl, Neg == TRUE)$Sample))))

# Melt

ear_cont_simpl <- melt(ear_cont_simpl)

# Plot

(11)

ear_cont_bars <- grid.arrange(

ggplot(subset(ear_cont_simpl, Neg == FALSE & Batch == "Batch1" &

variable != "Conc"),

aes(x = Sample, y = value, fill = variable)) + geom_bar(stat = "identity", position = "stack") + theme_bw(base_size = 8) +

ylab("Sequence reads") + xlab(NULL) +

ggtitle("Batch 1") +

scale_fill_manual(values = c("firebrick", "gray20"), labels = c("yes", "no"),

name = "Contaminant taxon?") + theme(panel.grid = element_blank(),

axis.text = element_text(color = "black"),

axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none"),

ggplot(subset(ear_cont_simpl, Neg == FALSE & Batch == "Batch2" &

variable != "Conc"),

aes(x = Sample, y = value, fill = variable)) + geom_bar(stat = "identity", position = "stack") + theme_bw(base_size = 8) +

ylab("Sequence reads") + xlab(NULL) +

ggtitle("Batch 2") +

scale_fill_manual(values = c("firebrick", "gray20"), labels = c("yes", "no"),

name = "Contaminant taxon?") + theme(panel.grid = element_blank(),

axis.text = element_text(color = "black"),

axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none"),

arrangeGrob(

ggplot(subset(ear_cont_simpl, Neg == TRUE & variable != "Conc"), aes(x = Sample, y = value, fill = variable)) +

geom_bar(stat = "identity", position = "stack") + theme_bw() +

ylab("Sequence reads") + xlab(NULL) +

facet_grid("~Batch") + ggtitle("Blanks") +

scale_fill_manual(values = c("firebrick", "gray20"), labels = c("yes", "no"),

name = "Contaminant taxon?") + theme(panel.grid = element_blank(),

axis.text = element_text(color = "black"),

axis.text.x = element_text(angle = 45, hjust = 1)), blankPanel, nrow = 1, widths = c(3, 1)),

ncol = 1, heights = c(1, 1, 1.5))

(12)

0 50000 100000 150000

Sample_1Sample_2Sample_3Sample_4Sample_5Sample_6Sample_7Sample_8Sample_9Sample_10Sample_11Sample_12Sample_13Sample_14Sample_15Sample_16Sample_17Sample_18Sample_19Sample_20Sample_21Sample_22Sample_23Sample_24Sample_25Sample_26Sample_27Sample_28Sample_29Sample_30Sample_31Sample_32Sample_33Sample_34Sample_35Sample_36Sample_37Sample_38Sample_39Sample_40

Sequence reads

Batch 1

0 50000 100000 150000

Sample_41Sample_42Sample_43Sample_44Sample_45Sample_46Sample_47Sample_48Sample_49Sample_50Sample_51Sample_52Sample_53Sample_54Sample_55Sample_56Sample_57Sample_58Sample_59Sample_60Sample_61Sample_62Sample_63Sample_64Sample_65Sample_66Sample_67Sample_68Sample_69Sample_70Sample_71Sample_72Sample_73Sample_74Sample_75Sample_76Sample_77Sample_78Sample_79Sample_80Sample_81Sample_82Sample_83Sample_84Sample_87Sample_88Sample_89Sample_90Sample_91Sample_92Sample_93Sample_94Sample_95Sample_96Sample_97Sample_98Sample_99Sample_100Sample_101Sample_102

Sequence reads

Batch 2

Batch1 Batch2

Sample_kitctr l

Sample_BIPCR0ctr l1

Sample_BIPCR0ctr l2

Sample_85b lank

Sample_86b lank

Sample_kitctr l2_1

Sample_kitctr l2_2

Sample_BIPCR0ctr l2_1

Sample_BIPCR0ctr l2_2

Sample_kitctr l

Sample_BIPCR0ctr l1

Sample_BIPCR0ctr l2

Sample_85b lank

Sample_86b lank

Sample_kitctr l2_1

Sample_kitctr l2_2

Sample_BIPCR0ctr l2_1

Sample_BIPCR0ctr l2_2 0

5000 10000

Sequence reads

Contaminant taxon?

yes no

Blanks

Delete all the suspected contaminant ASVs and trim all technical control samples from the data before downstream analyses:

# Trim the technical control samples

earphy_trim <- subset_samples(earphy, Type == "sample")

# Remove the contaminant ASVs

earphy_trim <- subset_taxa(earphy_trim,

Species %in% subset(earphy_conts, contaminant == FALSE)$Species)

# Additionally, remove reads classified as Chloroplasts/Cyanobacteria

earphy_trim <- subset_taxa(earphy_trim, Phylum != "Cyanobacteria/Chloroplast")

After this trimming, the final sequence counts for samples were as follows (showing right and left ears separately):

ear_reads <- data.frame(ReadSum = sample_sums(earphy_trim), sample_data(earphy_trim))

(13)

# Keep subjects in order of first appearance on the x axis (factor levels
# follow unique(), not alphabetical sorting)
ear_reads$Subject <- factor(ear_reads$Subject, levels = unique(ear_reads$Subject))

# Read counts per subject (not a histogram): one point per sample, with shape
# and color set by Side, and a line joining the points of the same subject

ggplot(ear_reads, aes(y = ReadSum, x = Subject,

group = Subject, shape = Side, color = Side)) + geom_point(size = 2) +

geom_line(color = "gray20") + theme_bw() +

ylab("Sequence reads") + xlab(NULL) +

scale_color_manual(values = c("gray60", "seagreen4")) + scale_shape_manual(values = c(15, 16)) +

scale_y_continuous(breaks = seq(0, 175000, 25000)) + theme(panel.grid.minor = element_blank(),

panel.grid.major.x = element_blank(),

axis.text.x = element_text(angle = 45, hjust = 1))

0 25000 50000 75000 100000 125000 150000 175000

Subject_1Subject_3Subject_5Subject_7Subject_9Subject_11Subject_13Subject_15Subject_17Subject_19Subject_21Subject_23Subject_25Subject_27Subject_29Subject_31Subject_33Subject_35Subject_37Subject_39Subject_41Subject_43Subject_45Subject_47Subject_49Subject_51Subject_53Subject_55Subject_57Subject_59Subject_61Subject_63Subject_65Subject_67Subject_69Subject_71Subject_73Subject_75Subject_77Subject_79Subject_81Subject_83Subject_87Subject_89Subject_91Subject_93Subject_95Subject_97Subject_99Subject_101

Sequence reads

Side

left right

Check the amount of contaminant reads (before trimming) in the samples from subjects with the least reads:

ggplot(subset(ear_cont_simpl,

Subject %in% subset(ear_reads, ReadSum < 50000)$Subject &

variable != "Conc"),

aes(x = Sample, y = value, fill = variable)) + geom_bar(stat = "identity", position = "stack") + theme_bw() +

ylab("Sequence reads") + xlab(NULL) +

facet_grid(~Subject, space = "free", scales = "free") + scale_fill_manual(values = c("firebrick", "gray20"),

labels = c("yes", "no"), name = "Contaminant taxon?") +

scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) + theme(panel.grid.minor = element_blank(),

panel.grid.major.x = element_blank(),

panel.grid.major.y = element_line(color = "gray40"), axis.text = element_text(color = "black"),

axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5, size = 10), strip.background = element_rect(fill = "white"), legend.position = "none")

(14)

Subject_101 Subject_15 Subject_19 Subject_27 Subject_39 Subject_45 Subject_7

Sample_101Sample_102Sample_15Sample_16Sample_19Sample_20Sample_27Sample_28Sample_39Sample_40Sample_45Sample_46 Sample_7Sample_8 0

50000 100000

Sequence reads

Based on this, trim out Subjects 19, 27, 39 and 7 (based on low read count and many contaminants) as well as Subject 101 (because of low read count in right ear sample). Keep Subject 45 (who had no contaminants and a low read count only in the left ear) and Subject 15 (whose left-ear sample had a low read count and a high proportion of contaminants, but the right ear sample seemed better and the analyses were focused on that side).

low_read_subjects <- c("Subject_101", "Subject_19", "Subject_27",

"Subject_39", "Subject_7")

# Trim

earphy_trim <- subset_samples(earphy_trim, !(Subject %in% low_read_subjects))

Subject demographics

Import the demographic/clinical data for the subjects:

# Import metadata
# NOTE(review): row.names = 1 consumes the first CSV column, so "Sample" is
# assumed to be a later column - confirm against the file.
earclin <- read.csv("earmeta_v4.5.csv", row.names = 1)
rownames(earclin) <- earclin$Sample

# Binary factor variables are interpreted as numeric on import; fix:
# a column counts as binary when its distinct non-missing values are exactly
# 0 and 1. Only numeric/logical columns are tested, so text columns are not
# pushed through as.numeric() (which would only raise coercion warnings).
is_binary <- vapply(earclin, function(x) {
  (is.numeric(x) || is.logical(x)) &&
    identical(as.numeric(sort(unique(x))), c(0, 1))
}, logical(1))
binaryvars <- names(is_binary)[is_binary]
earclin[binaryvars] <- lapply(earclin[binaryvars], factor)

# Relevel the sampling season (spring samples were collected first):
earclin$Sampling_season <- factor(earclin$Sampling_season,
                                  levels = c("spring", "autumn"))

# Additional variable for whether the sample corresponds to the dominant hand:
# explicit levels/labels keep FALSE -> "no" and TRUE -> "yes" even if only one
# of the two values occurs in the data.
earclin$Dominant_side <- factor(earclin$Side == earclin$Handedness,
                                levels = c(FALSE, TRUE),
                                labels = c("no", "yes"))

(15)

# Variable list for downstream comparisons

varlist <- c("Sampling_season", "Sex", "Age", "Allergy", "Dominant_side",

"Ear_cleaning_swab", "Recent_common_cold", "Staff_student") Check how many subjects have used antibiotics during the past month and exclude them:

# Number of abx users?

nrow(subset(earclin, Abx_past_month == 1))/2

## [1] 4

# Exclude these four:

earclin <- subset(earclin, Abx_past_month != 1 | is.na(Abx_past_month))

Exclude the subjects whose samples were trimmed from sequence data based on read count and/or contaminants:

# Number of low-read subjects:

length(low_read_subjects)

## [1] 5

# Exclude these:

earclin <- subset(earclin, !(Subject %in% low_read_subjects))

Create a table of the main variables of interest for the manuscript (split by sex, which seemed like an important source of microbial differences based on earlier analyses):

# Table with only right ears (to avoid duplication of values) earclin_right <- subset(earclin, Side == "right")

# List of variables for the table

t1vars <- c(varlist[-c(grep("Sex", varlist), grep("Age", varlist), grep("Staff_student", varlist))],

"Earplugs", "Handedness", "Past_ear_infections")

# Create table and reorganize to get the desired output format table1_df <- lapply(t1vars, function(x) table(earclin_right$Sex,

earclin_right[[x]], useNA = "always")) table1_df <- do.call("rbind", table1_df)

table1_df <- as.data.frame(cbind(table1_df, row.names(table1_df))) table1_df <- subset(table1_df, !is.na(V4))

table1_df$Variable <- rep(t1vars, each = 2) rownames(table1_df) <- NULL

colnames(table1_df) <- c("Value0", "Value1", "NA", "Sex", "Varname") table1_df[, "Varname"] <- gsub("_", " ", table1_df[, "Varname"])

table1_melt <- suppressWarnings(melt(table1_df, id.vars = c("Sex", "Varname"))) table1_cast <- dcast(table1_melt, Varname ~ Sex + variable)

# Put together the final nice-looking table

table1 <- data.frame(Variable = table1_cast$Varname,

Females = paste("no: ", table1_cast$female_Value0,

", yes: ", table1_cast$female_Value1,

", not available: ", table1_cast$female_NA, sep = ""),

(16)

Males = paste("no: ", table1_cast$male_Value0,

", yes: ", table1_cast$male_Value1,

", not available: ", table1_cast$male_NA, sep = ""))

table1$Females <- gsub(", not available: 0", "", table1$Females) table1$Males <- gsub(", not available: 0", "", table1$Males) table1$Variable <- as.character(table1$Variable)

table1 <- rbind(c("Age (mean \u00B1 sd)",

paste(round(mean(subset(earclin_right, Sex == "female")$Age), digits = 2),

"\u00B1",

round(sd(subset(earclin_right, Sex == "female")$Age), digits = 2)), paste(round(mean(subset(earclin_right, Sex == "male")$Age), digits = 2),

"\u00B1",

round(sd(subset(earclin_right, Sex == "male")$Age), digits = 2))), table1)

# Fix some of the variable levels
#
# In the table built above, "no" is the count from the FIRST column of each
# contingency table (the first factor level) and "yes" the count from the
# second. Sampling_season has levels c("spring", "autumn"), so "no" is spring
# and "yes" is autumn; Handedness has levels "left", "right".
# Patterns include the colon so that "not available" is never touched.
table1[table1$Variable == "Sampling season",] <-
  gsub("no:", "spring:", table1[table1$Variable == "Sampling season",])
table1[table1$Variable == "Sampling season",] <-
  gsub("yes:", "autumn:", table1[table1$Variable == "Sampling season",])
table1[table1$Variable == "Handedness",] <-
  gsub("no:", "left:", table1[table1$Variable == "Handedness",])
table1[table1$Variable == "Handedness",] <-
  gsub("yes:", "right:", table1[table1$Variable == "Handedness",])

The basic demographics for the study subjects, split by sex, are as follows:

Table 1

customKable(table1)

Variable Females Males

Age (mean ± sd) 36.96 ± 10.23 38.21 ± 12.45

Allergy no: 11, yes: 16 no: 9, yes: 5

Dominant side no: 3, yes: 24 no: 2, yes: 11, not available: 1 Ear cleaning swab no: 12, yes: 15 no: 11, yes: 3

Earplugs no: 24, yes: 3 no: 14, yes: 0

Handedness left: 3, right: 24 left: 2, right: 11, not available: 1 Past ear infections no: 23, yes: 4 no: 10, yes: 3, not available: 1 Recent common cold no: 15, yes: 12 no: 9, yes: 4, not available: 1 Sampling season autumn: 8, spring: 19 autumn: 6, spring: 8

# Export table

write.table(table1, "Outputs/table1.txt", row.names = FALSE)

Some potentially interesting variables have a skewed distribution between males and females: for example, many more females than males report regularly cleaning their ears with swabs. All 3 earplug users are also female (but there are so few of these subjects that this is not a very useful variable).

(17)

Final microbiota data setup

Final phyloseq objects for analyses with the clinical data:

# Full data:

ear_ASV_final <- phyloseq(

otu_table(subset_samples(earphy_trim, Sample %in% earclin$Sample), taxa_are_rows = FALSE),

tax_table(earphy_trim), sample_data(earclin))

# Make corresponding genus-level phyloseq object

ear_gen_final <- collapseTaxLevel(ear_ASV_final, "Genus")

# Subsampled to the same number of reads per sample:

ear_ASV_final_R <- rarefy_even_depth(ear_ASV_final, rngseed = 2760757) Final sequence summary metrics:

paste("Total number of sequences:", sum(otu_table(ear_ASV_final)))

## [1] "Total number of sequences: 8878042"

paste("Mean of sequences per sample:",

round(mean(sample_sums(ear_ASV_final)), digits = 0))

## [1] "Mean of sequences per sample: 108269"

paste("SD of sequences per sample:",

round(sd(sample_sums(ear_ASV_final)), digits = 0))

## [1] "SD of sequences per sample: 30875"

paste("Number of ASVs:", ntaxa(ear_ASV_final))

## [1] "Number of ASVs: 3650"

Batch variation

The ear samples were collected in two different seasons (spring and autumn). They were also delivered to the sequencing laboratory and handled and run in these two batches, which could cause technical differences between them. Look for overall differences between batches with alpha and beta diversity measures:

Alpha diversity

# Create a data frame of alpha diversity and clinical/demographic data
# The clinical rows are looked up by sample name so that each subject's
# metadata sits next to its own diversity values, whatever the row order of
# earclin is (row names of earclin are the sample names).
adiv_df <- cbind(
  earclin[sample_names(ear_ASV_final_R), , drop = FALSE],
  estimate_richness(ear_ASV_final_R,
                    measures = c("Observed", "InvSimpson", "Shannon")))

Test if there are statistically significant differences in alpha diversity between batches with three commonly used alpha diversity indices (observed richness, Shannon and inverse Simpson indices; Wilcoxon rank sum test):

(18)

p-values:

suppressWarnings(sapply(c("Observed", "InvSimpson", "Shannon"), function(x) signif(wilcox.test(data = adiv_df,

as.formula(paste(x, "~ Sampling_season")))$p.value, digits = 1)))

## Observed InvSimpson Shannon

## 0.6 0.6 0.2

Figure S1A

The difference between batches is not statistically significant. Plot the diversity index values:

figS1A <- plotAlphaDivs(adiv_df, "Sampling_season", "A")

100 200

spring autumn

Sampling_season

Observed

A

0 1 2 3

spring autumn

Sampling_season

Shannon

2.5 5.0 7.5 10.0

spring autumn

Sampling_season

InvSimpson

Samples collected in spring and handled as the first batch have a slightly higher alpha diversity, but the difference is not statistically significant.

Beta diversity

Plot the batches with Non-Metric Multidimensional Scaling (NMDS) ordination of Bray-Curtis dissimilarity:

# Run NMDS ordination with Bray-Curtis dissimilarity

ear_ord <- ordinate(ear_ASV_final_R, "NMDS", "bray", try = 999, trace = FALSE)

# Plot

figS1B <- plot_ordination(ear_ASV_final_R, ear_ord, type = "samples",

color = "Sampling_season", shape = "Sampling_season") + theme_bw() +

coord_fixed() +

geom_point(size = 2) +

scale_color_manual(values = c("gray75", "black"), name = "Batch",

labels = c("Batch 1 (spring)", "Batch 2 (autumn)")) + scale_shape_manual(values = c(16, 17), name = "Batch",

labels = c("Batch 1 (spring)", "Batch 2 (autumn)")) + stat_ellipse(level = 0.9) +

ggtitle("B") +

theme(panel.grid = element_blank())

(19)

Figure S1B figS1B

−1.0

−0.5 0.0 0.5 1.0

−2 −1 0 1 2

NMDS1

NMDS2

Batch

Batch 1 (spring) Batch 2 (autumn)

B

Export figures together:

# Export

ggsave(grid.arrange(figS1A, figS1B, nrow = 2),

filename = "Outputs/figS1_batches.pdf", device = cairo_pdf, width = 135, height = 105, units = "mm")

The samples cluster visually according to the two batches. Test for statistical significance in beta diversity difference between batches with adonis:

dist_df <- vegdist(as(otu_table(ear_ASV_final_R), "matrix"), method = "bray") adonis(dist_df ~ Sampling_season,

data = as(sample_data(ear_ASV_final_R), "data.frame"), perm = 9999)

#### Call:

## adonis(formula = dist_df ~ Sampling_season, data = as(sample_data(ear_ASV_final_R), "data.frame"), permutations = 9999)

#### Permutation: free

## Number of permutations: 9999

#### Terms added sequentially (first to last)

#### Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)

## Sampling_season 1 0.2112 0.21121 0.72049 0.00893 0.6345

## Residuals 80 23.4515 0.29314 0.99107

## Total 81 23.6627 1.00000

The difference between batches is not statistically significant.

(20)

Overall, looking at both alpha and beta diversity, the two batches differ somewhat even after contaminant trimming. Unfortunately, since the seasonal and technical batches are exactly the same, it is impossible to distinguish between biological and technical differences. The detected differences could be a mix of both.

Right and left ears

Two samples were collected from every subject, one from each ear. Look for overall differences between left and right ears using diversity measures, and additionally, check how much the same subject’s samples resemble each other when compared to unrelated subjects.

Alpha diversity

Test if there are overall statistically significant differences in diversity between right ears and left ears (Wilcoxon signed-rank test for paired samples):

p-values:

# Test for significances
# Paired Wilcoxon signed-rank test per diversity index, right vs left ear.
# The left-ear rows are aligned to the right-ear rows by Subject, so the pairing
# no longer depends on the row order of adiv_df. vapply() guarantees one numeric
# p-value per index. suppressWarnings() is kept from the original (presumably
# for the ties / exact p-value warnings of wilcox.test -- confirm).
suppressWarnings(vapply(
  c("Observed", "InvSimpson", "Shannon"),
  function(index) {
    right_ears <- subset(adiv_df, Side == "right")
    left_ears <- subset(adiv_df, Side == "left")
    left_ears <- left_ears[match(right_ears$Subject, left_ears$Subject), ]
    round(wilcox.test(x = right_ears[[index]],
                      y = left_ears[[index]],
                      paired = TRUE)$p.value,
          digits = 3)
  },
  numeric(1)
))

## Observed InvSimpson Shannon

## 0.291 0.316 0.389

Plots of the diversity index values:

# Plot the alpha diversity indices grouped by ear side (left vs right)

plotAlphaDivs(adiv_df, "Side")

100 200

left right

Side

Observed

0 1 2 3

left right

Side

Shannon

2.5 5.0 7.5 10.0

left right

Side

InvSimpson

Overall, alpha diversity does not differ when contrasting right and left ears.

How similar are the alpha diversity values from the same subject? A plot similar to the above, but with lines connecting samples from the same subject:

# Paired view of alpha diversity: one panel per index, with a line joining the
# two samples (left and right ear) of each subject. The line layer inherits the
# data and the x/y mapping from the base plot and only adds the grouping.
grid.arrange(
  ggplot(adiv_df, aes(x = Side, y = Observed)) +
    geom_point() +
    geom_line(aes(group = Subject)) +
    theme_bw() +
    theme(panel.grid = element_blank(),
          axis.text = element_text(color = "black")),
  ggplot(adiv_df, aes(x = Side, y = Shannon)) +
    geom_point() +
    geom_line(aes(group = Subject)) +
    theme_bw() +
    theme(panel.grid = element_blank(),
          axis.text = element_text(color = "black")),
  ggplot(adiv_df, aes(x = Side, y = InvSimpson)) +
    geom_point() +
    geom_line(aes(group = Subject)) +
    theme_bw() +
    theme(panel.grid = element_blank(),
          axis.text = element_text(color = "black")),
  nrow = 1
)

100 200

left right

Side

Observed

0 1 2 3

left right

Side

Shannon

2.5 5.0 7.5 10.0

left right

Side

InvSimpson

Summary statistics for differences between the alpha diversities in right and left ear samples for each index:

# Per-subject difference (right ear minus left ear) for each alpha diversity
# index. Left-ear rows are aligned to the right-ear rows by Subject before
# subtracting, and the row names come from the same aligned rows, so neither the
# differences nor their labels depend on the row order of adiv_df.
leftright_adiv_per_subj <- local({
  right_ears <- subset(adiv_df, Side == "right")
  left_ears <- subset(adiv_df, Side == "left")
  left_ears <- left_ears[match(right_ears$Subject, left_ears$Subject), ]
  data.frame(
    Observed_diff = right_ears$Observed - left_ears$Observed,
    Shannon_diff = right_ears$Shannon - left_ears$Shannon,
    InvSimpson_diff = right_ears$InvSimpson - left_ears$InvSimpson,
    row.names = as.character(right_ears$Subject)
  )
})

# Summary statistics (min, quartiles, mean, max) of the differences, per index.
customKable(round(sapply(leftright_adiv_per_subj, summary), digits = 2))

Observed_diff Shannon_diff InvSimpson_diff

Min. -130.00 -2.48 -6.39

1st Qu. -21.00 -0.40 -0.88

Median -8.00 -0.04 -0.24

Mean -7.27 -0.09 -0.32

3rd Qu. 13.00 0.13 0.53

Max. 109.00 1.16 2.51

Aside from a few outliers who have a large difference between ears, most paired samples have a similar alpha diversity.

(22)

Beta diversity

Plot left vs right on an NMDS ordination, regardless of subject:

# NMDS coordinates of every sample next to its subject, sample ID and ear side.
# Rows are combined by position: both pieces come from ear_ASV_final_R.
leftright_bdivs <- cbind(
  as.data.frame(ear_ord$points),
  as(sample_data(ear_ASV_final_R), "data.frame")[, c("Subject", "Sample", "Side")]
)

# Ordination coloured and shaped by ear side, with 90 % ellipses per side.
ggplot(leftright_bdivs,
       aes(x = MDS1, y = MDS2, color = Side, shape = Side)) +
  geom_point(size = 2) +
  stat_ellipse(level = 0.9) +
  scale_color_manual(values = c("gray60", "seagreen4")) +
  scale_shape_manual(values = c(15, 16)) +
  coord_fixed() +
  theme_bw() +
  theme(panel.grid = element_blank())

−1.0

−0.5 0.0 0.5 1.0

−2 −1 0 1

MDS1

MDS2

Side

left right

Test for statistical significance with adonis:

# Permutation test (adonis) for a difference between left and right ears, using
# the Bray-Curtis dissimilarities in dist_df. `data` and `permutations` are
# named explicitly instead of relying on position and on partial matching of
# `perm`; the printed Call is unchanged.
adonis(dist_df ~ Side,
       data = as(sample_data(ear_ASV_final_R), "data.frame"),
       permutations = 9999)

#### Call:

## adonis(formula = dist_df ~ Side, data = as(sample_data(ear_ASV_final_R), "data.frame"), permutations = 9999)

#### Permutation: free

## Number of permutations: 9999

#### Terms added sequentially (first to last)

#### Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)

## Side 1 0.0542 0.054203 0.18367 0.00229 0.9902

## Residuals 80 23.6085 0.295106 0.99771

## Total 81 23.6627 1.00000

Overall, the left and right sides are very similar.

Next, to see if the left and right samples from the same subject resemble each other, plot the samples with connecting lines indicating samples from the same subject.

(23)

# NMDS plot in which the two samples of each subject are joined by a line.
# The line layer inherits data and x/y from the base plot and adds the grouping.
subject_pairs_nmds <- ggplot(leftright_bdivs,
                             aes(x = MDS1, y = MDS2, shape = Side)) +
  geom_point(size = 2) +
  geom_line(aes(group = Subject)) +
  coord_fixed() +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(color = "black"))

As a different way of visualizing the same thing, compare dissimilarities of sample pairs from the same subject to pairs from unrelated subjects. (Each data point in the plot is a dissimilarity value for one pair of samples.)

# Bray-Curtis dissimilarity between every pair of samples.
ear_dist <- vegdist(as.data.frame(as.matrix(otu_table(ear_ASV_final_R))),
                    method = "bray")

# One row per unordered sample pair. combn() enumerates the label pairs in the
# same order in which a "dist" object stores its values, so the columns line up.
ear_pairs <- data.frame(t(combn(attr(ear_dist, "Labels"), 2)),
                        as.numeric(ear_dist))
colnames(ear_pairs) <- c("S1", "S2", "Dissimilarity")

# Look up the subject behind each sample of the pair.
ear_pairs$S1_subject <-
  leftright_bdivs[match(ear_pairs$S1, leftright_bdivs$Sample), "Subject"]
ear_pairs$S2_subject <-
  leftright_bdivs[match(ear_pairs$S2, leftright_bdivs$Sample), "Subject"]

# Flag pairs that come from the same subject. The labels are derived from the
# comparison itself rather than by renaming factor levels by position, which
# would mislabel the data if only one of TRUE/FALSE occurred.
ear_pairs$SameSubject <- factor(
  ifelse(as.character(ear_pairs$S1_subject) == as.character(ear_pairs$S2_subject),
         "yes", "no"),
  levels = c("yes", "no")
)

# Boxplot of pairwise dissimilarities; the cross (shape 4) marks the group mean.
# NOTE(review): `fun.y` is deprecated since ggplot2 3.3.0 in favour of `fun`.
# It is kept so the code still runs with the ggplot2 version of the original
# analysis; switch to `fun = mean` once a newer ggplot2 is the minimum.
ear_pairs_similarities <- ggplot(ear_pairs, aes(SameSubject, Dissimilarity)) +
  geom_boxplot(outlier.size = 1, outlier.colour = "gray60") +
  theme_bw() +
  ylab("Pairwise Bray-Curtis\ndissimilarity") +
  xlab("Same subject") +
  stat_summary(fun.y = mean, geom = "point", shape = 4, size = 4) +
  theme(panel.grid = element_blank(),
        axis.text = element_text(color = "black"))

Combine these plots into a figure:

Figure S2

# Figure S2: dissimilarity boxplot and subject-pair NMDS side by side, with a
# narrow blank panel as a spacer between them.
figS2 <- arrangeGrob(
  grobs = list(ear_pairs_similarities, blankPanel, subject_pairs_nmds),
  nrow = 1,
  widths = c(0.8, 0.03, 1)
)

# Draw the assembled figure.
grid.arrange(figS2)

0.00 0.25 0.50 0.75 1.00

yes no

Same subject Pairwise Bray−Curtis dissimilarity

−1.0

−0.5 0.0 0.5 1.0

−1 0 1

MDS1

MDS2

Side

left right

(24)

# Export Figure S2 as PDF (dimensions in mm).
ggsave(
  filename = "Outputs/figS2_leftright.pdf",
  plot = figS2,
  device = cairo_pdf,
  width = 180,
  height = 75,
  units = "mm"
)

Overall, similar to alpha diversity, the same subject’s samples tend to be similar.

Handedness

A comparison that perhaps makes more sense biologically is to compare the samples from the ear on the side of the dominant hand to that on the side of the non-dominant hand (as one could expect to see e.g. some hygiene-related differences between these).

Alpha diversity

p-values:

# Test for significances
# Paired Wilcoxon signed-rank test per diversity index, dominant-hand side vs
# non-dominant side. Rows with missing Dominant_side are dropped by subset().
# The non-dominant rows are aligned to the dominant rows by Subject, so the
# pairing no longer depends on the row order of adiv_df. vapply() guarantees one
# numeric p-value per index. suppressWarnings() is kept from the original
# (presumably for the ties / exact p-value warnings of wilcox.test -- confirm).
suppressWarnings(vapply(
  c("Observed", "InvSimpson", "Shannon"),
  function(index) {
    dominant <- subset(adiv_df, Dominant_side == "yes")
    nondominant <- subset(adiv_df, Dominant_side == "no")
    nondominant <- nondominant[match(dominant$Subject, nondominant$Subject), ]
    round(wilcox.test(x = dominant[[index]],
                      y = nondominant[[index]],
                      paired = TRUE)$p.value,
          digits = 3)
  },
  numeric(1)
))

## Observed InvSimpson Shannon

## 0.769 0.656 0.705

There are no statistically significant differences between samples from the dominant and non-dominant side.

Plots of the diversity index values:

# Plot the alpha diversity indices by dominant-hand side, leaving out samples
# for which Dominant_side is missing

plotAlphaDivs(subset(adiv_df, !is.na(Dominant_side)), "Dominant_side")

100 200

no yes

Dominant_side

Observed

0 1 2 3

no yes

Dominant_side

Shannon

2.5 5.0 7.5 10.0

no yes

Dominant_side

InvSimpson

There are no visually clear trends, either.

Beta diversity

Ordination plot contrasting the samples from the dominant vs non-dominant sides:

# NMDS coordinates of every sample next to its subject, sample ID and the
# dominant-side flag; rows are combined by position.
dominance_bdivs <- cbind(
  as.data.frame(ear_ord$points),
  as(sample_data(ear_ASV_final_R), "data.frame")[, c("Subject", "Sample", "Dominant_side")]
)

# Drop samples with missing handedness

(25)

# Keep only samples with known handedness information.
dominance_bdivs <- subset(dominance_bdivs, !is.na(Dominant_side))

# Ordination coloured and shaped by dominant vs non-dominant side, with 90 %
# ellipses per group.
ggplot(dominance_bdivs,
       aes(x = MDS1, y = MDS2, color = Dominant_side, shape = Dominant_side)) +
  geom_point(size = 2) +
  stat_ellipse(level = 0.9) +
  scale_color_manual(values = c("gray60", "darkblue")) +
  scale_shape_manual(values = c(15, 16)) +
  coord_fixed() +
  theme_bw() +
  theme(panel.grid = element_blank())

−1.0

−0.5 0.0 0.5 1.0

−2 −1 0 1

MDS1

MDS2

Dominant_side

no yes

Test for statistical significance with adonis:

# Distance matrix without samples with missing data
# (the assignment sits on its own line; fused onto the comment line it would be
# commented out)
dominance_dist_df <- vegdist(
  as(otu_table(subset_samples(ear_ASV_final_R, !is.na(Dominant_side))),
     "matrix"),
  method = "bray"
)

# Permutation test (adonis) for a difference between dominant and non-dominant
# sides. The data expression is kept inline so the printed Call stays the same;
# `data` and `permutations` are named instead of relying on position and on
# partial matching of `perm`.
adonis(dominance_dist_df ~ Dominant_side,
       data = as(sample_data(subset_samples(ear_ASV_final_R,
                                            !is.na(Dominant_side))),
                 "data.frame"),
       permutations = 9999)

#### Call:

## adonis(formula = dominance_dist_df ~ Dominant_side, data = as(sample_data(subset_samples(ear_ASV_final_R, !is.na(Dominant_side))), "data.frame"), permutations = 9999)

#### Permutation: free

## Number of permutations: 9999

#### Terms added sequentially (first to last)

#### Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)

## Dominant_side 1 0.0503 0.050346 0.17256 0.00221 0.9934

## Residuals 78 22.7568 0.291754 0.99779

## Total 79 22.8071 1.00000

There is no detectable difference in beta diversity between samples on the side of the dominant and non-dominant hand.

(26)

Trim to right ears

Since left and right ears are so similar in almost all subjects, and there is also no notable difference between samples depending on whether they represent the side of the dominant hand or not, focus on right ear data for the following analyses to simplify the statistical comparisons.

Phyloseq objects with only right ear samples:

# Restrict the ASV-level and genus-level objects to right-ear samples.
ear_ASV_right <- subset_samples(ear_ASV_final, Side == "right")
ear_gen_right <- subset_samples(ear_gen_final, Side == "right")

# Subsample to the same number of reads per sample:
# (each object is rarefied with its own fixed seed)
ear_ASV_right_R <- rarefy_even_depth(physeq = ear_ASV_right, rngseed = 7554714)
ear_gen_right_R <- rarefy_even_depth(physeq = ear_gen_right, rngseed = 1992623)

Most common bacterial taxa

Focusing on the right ear samples, summarize the numbers of taxa on different taxonomic levels:

# One-line summary of taxon counts in the right-ear data: all ASVs, plus genera
# and families excluding the "unclassified" entries.
print(paste0(
  "ASVs: ",
  ntaxa(ear_ASV_right),
  ", genera: ",
  ntaxa(subset_taxa(ear_gen_right, Genus != "unclassified")),
  ", families: ",
  ntaxa(subset_taxa(collapseTaxLevel(ear_ASV_right, level = "Family"),
                    Family != "unclassified"))
))

## [1] "ASVs: 3650, genera: 462, families: 174"

ASVs

Overall, the 20 most common ASVs in the data are the following:

# Relative abundance table for the top 20 ASVs, summarised by Side (the data
# contain right ears only).
top_asv_ra <- relAbundChart(ear_ASV_right, taxaCount = 20, table = TRUE,
                            byVar = "Side")

# Label the first 20 rows with the "Species" entry of the taxonomy table; the
# remaining row is labelled "other taxa". The first column of top_asv_ra holds
# the abundances shown in the table.
top_asvs <- data.frame(
  Taxonomy = c(
    as(tax_table(ear_ASV_right)[rownames(top_asv_ra)[seq_len(20)], "Species"],
       "vector"),
    "other taxa"
  ),
  Relative_abundance = top_asv_ra[[1]]
)

customKable(top_asvs, d = 2, cn = c("ASV", "Mean relative abundance (%)"))

(27)

ASV Mean relative abundance (%)

ASV1: Staphylococcus auricularis 28.65

ASV2: Propionibacterium acnes 15.35

ASV3: Alloiococcus otitis 15.15

ASV4: Turicella otitidis 6.54

ASV5: Turicella unclassified 5.88

ASV6: Staphylococcus unclassified 4.63

ASV7: Turicella unclassified 3.61

ASV8: Turicella unclassified 1.98

ASV13: Alloiococcus unclassified 1.27

ASV11: Alloiococcus unclassified 1.20

ASV17: Corynebacterium auris 0.99

ASV12: Staphylococcus unclassified 0.95

ASV15: Staphylococcus unclassified 0.78

ASV25: Enhydrobacter aerosaccus 0.69

ASV16: Staphylococcus unclassified 0.67

ASV14: Staphylococcus unclassified 0.63

ASV10: Staphylococcus unclassified 0.58

ASV19: Alloiococcus unclassified 0.55

ASV9: Corynebacterium unclassified 0.41

ASV28: Kocuria koreensis 0.37

other taxa 9.12

Plot the abundances of the top ASVs in each sample; samples organized according to whether ASV1, ASV2, ASV3 or some other ASV was the most common taxon.

# Relative abundances of top 20 taxa
top_ra_samples <- relAbundChart(ear_ASV_right, taxaCount = 20, table = TRUE)

# Top ASV for each sample
# vapply() enforces exactly one row name per sample column.
sample_max_asv <- vapply(
  top_ra_samples,
  function(x) rownames(top_ra_samples)[which.max(x)],
  character(1)
)
# Samples dominated by anything other than ASV1-3 are pooled as "other";
# sorting by dominant ASV then gives the sample order for plotting.
sample_max_asv[!(sample_max_asv %in% c("ASV1", "ASV2", "ASV3"))] <- "other"
sample_order <- names(sort(sample_max_asv))

# Add taxonomy info
top_ra_samples$ASV <- c(
  as(tax_table(ear_ASV_right)[rownames(top_ra_samples)[seq_len(20)], "Species"],
     "vector"),
  "other taxa"
)

# Rearrange data
top_ra_samples_melt <- melt(top_ra_samples)
top_ra_samples_melt$ASV <- factor(top_ra_samples_melt$ASV,
                                  levels = top_ra_samples$ASV)
top_ra_samples_melt$variable <- factor(top_ra_samples_melt$variable,
                                       levels = sample_order)
top_ra_samples_melt <- top_ra_samples_melt[order(top_ra_samples_melt$variable), ]

# Split into two for plotting
# First 20 samples in one panel, the rest in the other. head()/tail() replace
# `[1:20]` and `21:length(...)`, which would index backwards (and duplicate
# samples) if there were 20 or fewer samples.
top_ra_samples_melt1 <- subset(
  top_ra_samples_melt,
  variable %in% head(levels(top_ra_samples_melt$variable), 20)
)
top_ra_samples_melt2 <- subset(
  top_ra_samples_melt,
  variable %in% tail(levels(top_ra_samples_melt$variable), -20)
)

# Draw the legend and the plot in two rows

(28)

# Legend only: built from a throw-away plot with a single-column legend and
# extracted with g_legend().
asv_l <- g_legend(
  ggplot(top_ra_samples_melt1, aes(x = variable, y = value, fill = ASV)) +
    geom_bar(stat = "identity", position = "stack") +
    theme_bw(base_size = 9) +
    guides(fill = guide_legend(ncol = 1)) +
    scale_fill_manual(values = relacols2, name = "Amplicon Sequence Variant") +
    theme(legend.key.size = unit(4.5, "mm"))
)

# Stacked bar panel of per-sample relative abundances without a legend.
# Both sample subsets use exactly this specification, so it is defined once.
plotAsvPanel <- function(ra_long) {
  ggplot(ra_long, aes(x = variable, y = value, fill = ASV)) +
    geom_bar(stat = "identity", position = "stack") +
    xlab(NULL) +
    ylab("Relative abundance (%)") +
    scale_fill_manual(values = relacols2) +
    theme_bw(base_size = 10) +
    theme(axis.text.x = element_text(color = "black", angle = 45, hjust = 1),
          panel.grid = element_blank(),
          legend.position = "none")
}

asv_p1 <- plotAsvPanel(top_ra_samples_melt1)
asv_p2 <- plotAsvPanel(top_ra_samples_melt2)

# Two bar panels stacked on the left, shared legend on the right.
fig_asv <- arrangeGrob(arrangeGrob(asv_p1, asv_p2, nrow = 2), asv_l,
                       nrow = 1, widths = c(7.2, 2.8))

grid.arrange(fig_asv)

0 25 50 75 100

Sample_5Sample_15Sample_21Sample_31Sample_33Sample_45Sample_49Sample_59Sample_61Sample_69Sample_71Sample_81Sample_83Sample_91Sample_95Sample_97Sample_13Sample_23Sample_37Sample_41

Relative abundance (%)

0 25 50 75 100

Sample_65Sample_99Sample_11Sample_17Sample_25Sample_43Sample_47Sample_57Sample_63Sample_79Sample_9Sample_29Sample_35Sample_51Sample_53Sample_55Sample_67Sample_73Sample_75Sample_87Sample_89

Relative abundance (%)

Amplicon Sequence Variant

ASV1: Staphylococcus auricularis ASV2: Propionibacterium acnes ASV3: Alloiococcus otitis ASV4: Turicella otitidis ASV5: Turicella unclassified ASV6: Staphylococcus unclassified ASV7: Turicella unclassified ASV8: Turicella unclassified ASV13: Alloiococcus unclassified ASV11: Alloiococcus unclassified ASV17: Corynebacterium auris ASV12: Staphylococcus unclassified ASV15: Staphylococcus unclassified ASV25: Enhydrobacter aerosaccus ASV16: Staphylococcus unclassified ASV14: Staphylococcus unclassified ASV10: Staphylococcus unclassified ASV19: Alloiococcus unclassified ASV9: Corynebacterium unclassified ASV28: Kocuria koreensis other taxa

(29)

Genera

A similar table and plot, but for the top 10 genera:

# Genus-level relative abundance table (taxaCount = 11), summarised by Side.
top_gen_ra <- relAbundChart(ear_gen_right, taxaCount = 11, table = TRUE,
                            byVar = "Side")

customKable(top_gen_ra, d = 2, cn = "Mean relative abundance (%)")

Mean relative abundance (%)

Staphylococcus 39.46

Alloiococcus 19.54

Turicella 18.25

Propionibacterium 15.54

Corynebacterium 2.15

Streptococcus 0.87

Enhydrobacter 0.70

Kocuria 0.41

Chryseobacterium 0.40

Snodgrassella 0.15

other taxa 1.99

unclassified taxa 0.54

# Put data together
top_gen_samples <- relAbundChart(ear_gen_right, taxaCount = 11, table = TRUE)

# Get top genus for each sample and rearrange data for plotting
# vapply() enforces exactly one row name per sample column.
sample_max_gen <- vapply(
  top_gen_samples,
  function(x) rownames(top_gen_samples)[which.max(x)],
  character(1)
)
# Order samples by dominant genus: the four main genera first, then any other
# dominant genus that occurs. With only the four fixed levels, a sample
# dominated by another genus would become NA, be removed by sort() and vanish
# from the plots.
sample_max_gen <- factor(
  sample_max_gen,
  levels = union(
    c("Staphylococcus", "Propionibacterium", "Alloiococcus", "Turicella"),
    sort(unique(sample_max_gen))
  )
)
sample_order_gen <- names(sort(sample_max_gen))

# Long format with genera and samples as ordered factors.
top_gen_samples$Genus <- rownames(top_gen_samples)
top_gen_samples_melt <- melt(top_gen_samples)
top_gen_samples_melt$Genus <- factor(top_gen_samples_melt$Genus,
                                     levels = top_gen_samples$Genus)
top_gen_samples_melt$variable <- factor(top_gen_samples_melt$variable,
                                        levels = sample_order_gen)
top_gen_samples_melt <- top_gen_samples_melt[order(top_gen_samples_melt$variable), ]

# Split into two for plotting
# First 20 samples in one panel, the rest in the other. head()/tail() replace
# `[1:20]` and `21:length(...)`, which would index backwards (and duplicate
# samples) if there were 20 or fewer samples.
top_gen_samples_melt1 <- subset(
  top_gen_samples_melt,
  variable %in% head(levels(top_gen_samples_melt$variable), 20)
)
top_gen_samples_melt2 <- subset(
  top_gen_samples_melt,
  variable %in% tail(levels(top_gen_samples_melt$variable), -20)
)

# Draw plots gen_l <-

g_legend(ggplot(top_gen_samples_melt1, aes(x = variable, y = value, fill = Genus)) + geom_bar(stat = "identity", position = "stack") +

theme_bw(base_size = 9) +

Gambar

Figure S1B figS1B
top_ra_samples <- relAbundChart (ear_ASV_right, taxaCount = 20, table = TRUE)
top_gen_ra <- relAbundChart (ear_gen_right, taxaCount = 11, table = TRUE, byVar = "Side") customKable (top_gen_ra, d = 2 , cn = "Mean relative abundance (%)")
Figure S3 (ASVs + genera depending on classification)
+4

Referensi

Dokumen terkait