-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path23.symbionts.Rmd
313 lines (242 loc) · 13.9 KB
/
23.symbionts.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
---
title: "Symbiont Profiles"
output:
github_document:
pandoc_args: --webtex
bibliography: bibliography.bib
---
```{r echo=FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(fig.width=12, fig.height=8,
echo=FALSE, warning=FALSE, message=FALSE)
library(tidyverse)
require(ComplexHeatmap)
require(ggpubr)
library(RColorBrewer)
library(ggsci)
```
## Genus level analysis using Kraken
The relative abundance of major clades (genera) of Symbiodiniaceae was profiled using [kraken](https://ccb.jhu.edu/software/kraken/) version 1.1.1 [@Wood2014-qp] to classify raw reads from each sample. To ensure that this made full use of available reads and also that it was not affected by biased database composition we restricted our analysis to taxa for which a complete reference genome was available. This included the following data;
- Clade A: *Symbiodinium microadriadicum* [from genbank](https://www.ncbi.nlm.nih.gov/assembly/GCA_001939145.1)
- Clade B: *Breviolum sp.* [from OIST](https://marinegenomics.oist.jp/symb/download/symbB.v1.0.genome.fa.gz)
- Clade C: (C1) *Cladocopium sp.* [from reefgenomics](http://symbs.reefgenomics.org/download/SymbC1.Genome.Scaffolds.fasta.gz)
- Clade D: *Durusdinium sp.* [from OIST](https://marinegenomics.oist.jp/symbd/viewer/download?project_id=102)
- Clade F: *Fugacium sp.* [from reefgenomics](http://symbs.reefgenomics.org/download/SymbF.Genome.Scaffolds.fasta.gz)
These genomes were combined with the host *A. digitifera* genome as well the standard kraken bacterial sequences to build a kraken database (see [07_build_kraken.sh](data/hpc/symbiodinium_profiles/07_build_kraken.sh)) using default values for kmer length (31) and minimiser length (15).
kraken was then used to classify all non host read pairs (paired reads not aligning to the host genome) for all samples and the raw outputs processed with `kraken-mpa-report`. This produces a report in a format similar to MetaPhlAn's output (see [09_run_kraken_genome.sh](data/hpc/symbiodinium_profiles/09_run_kraken_genome.sh)).
```{r}
if(!file.exists("data/r_essentials/23_mpa_data.rds")){
genome_mpa31_files <- list.files("data/hpc/symbiodinium_profiles/genome_kraken_mpa/",pattern = "*.mpa",full.names = TRUE)
read_mpa <- function(path){
s <- basename(path) %>% str_extract("[^\\.]+")
sample_group <- s %>% str_extract("[^\\-]+")
mpa_data <- read_tsv(path,col_names = c("taxonomy","read_count"),col_types = cols()) %>%
add_column(sample=s) %>%
mutate(sample_group= case_when(
grepl("^AI",sample) ~ "IN",
grepl("^BR",sample) ~ "IN",
grepl("^AR",sample) ~ "NO",
grepl("^RS",sample) ~ "SO",
grepl("^DR",sample) ~ "JP"
))
}
genome_mpa31_data <- do.call(rbind,lapply(genome_mpa31_files,read_mpa)) %>% add_column(kmer="g31")
mpa_data <- genome_mpa31_data
write_rds(mpa_data,"data/r_essentials/23_mpa_data.rds")
} else {
mpa_data <- read_rds("data/r_essentials/23_mpa_data.rds")
}
```
```{r}
group_order <- c("A"=1,"B"=2,"C"=3,"D"=4,"F"=5, 'Host'=8)
loc_order <- c("IN"=1,"NO"=2,"SO"=3,"JP"=4)
clade_names <- c("A"="Symbiodinium","B"="Breviolum","C"="Cladocopium","D"="Durusdinium","F"="Fugacium")
symbiodinium_data <- mpa_data %>%
filter(grepl(pattern = "f__Symbiodiniaceae",taxonomy) |
grepl( pattern = "digitifera", taxonomy)) %>%
mutate(clade = str_match(taxonomy,pattern = "clade_([ABCDF])")[,2]) %>%
mutate(clade = ifelse(is.na(clade),"Host",clade)) %>%
mutate(clade_order=group_order[clade]) %>%
mutate(sample_group_order=loc_order[sample_group]) %>%
mutate(clade_name = clade_names[clade]) %>%
filter(clade!="Host")
```
```{r, include=FALSE, eval=FALSE}
#symbiont read summary. To provide summary stats for paper only
library(googlesheets4)
gs4_deauth()
read_totals <- read_sheet("https://docs.google.com/spreadsheets/d/1RHye3l7R05nA6kAw9u28H84X7BxbNsGUsWykX5eqvdw/edit?usp=sharing",range = "A2:K77") %>% dplyr::select("Sample ID",total_yield="Total yield bases (Mb)")
symbiont_data_summary <- symbiodinium_data %>%
group_by(sample,sample_group) %>%
dplyr::summarise(reads=sum(read_count)) %>%
extract(sample,into="Sample ID",regex="(.*?_.*?_.*?)_") %>%
na.omit() %>%
left_join(read_totals) %>%
mutate(symbiont_percentage = reads*100/(total_yield*1e6/300))
```
```{r}
# Plot by absolute read counts
#
library(ggrepel)
library(ggpubr)
symb_plot_data <- symbiodinium_data %>%
ungroup() %>%
filter(sample_group!="JP") %>% # Interesting but beyond scope of the paper
group_by(sample) %>%
dplyr::mutate(sample_total = sum(read_count))
spg_plot <- symb_plot_data %>%
ggplot(aes(x=clade_name,y=read_count/1e6)) +
geom_boxplot(aes(color=clade_name), outlier.size = 0.5) +
facet_wrap(~reorder(sample_group,sample_group_order),nrow = 1) + theme_pubclean(base_size = 10) +
ylab("Read Count (Millions)") + xlab("") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
legend.position = "none", legend.title = element_blank())
```
```{r}
# Plot by read count proportions
# gp_symb_data <- symbiodinium_data %>%
# group_by(sample) %>%
# mutate(sample_total = sum(read_count)) %>%
# ungroup() %>%
# group_by(clade,sample) %>%
# mutate(proportion = sum(read_count)/sample_total)
spg_plot_props <- symb_plot_data %>%
group_by(clade,sample) %>%
dplyr::mutate(proportion = sum(read_count)/sample_total) %>%
ggplot(aes(x=clade_name,y=proportion)) +
geom_point(aes(color=clade_name), position = position_jitterdodge(jitter.height = 0.01, jitter.width = 0.8), size=0.5) +
facet_wrap(~reorder(sample_group,sample_group_order),nrow = 1) + theme_pubclean(base_size = 10) +
xlab("") + ylab("Read Proportion") + ylim(0,1) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
legend.position = "bottom", legend.title = element_blank())
```
Irrespective of whether absolute read counts or proportion of reads is used the dominant symbiont clade for all locations and all but one sample was *Cladocopium*. A single sample from Japan was dominated by *Durusdinium*.
```{r}
# Combine plots
library(cowplot)
plot_grid(spg_plot,spg_plot_props, ncol = 1, align = "hv", axis = "lr", rel_heights = c(0.4,0.6))
ggsave("figures/fig-s8.jpg",width = 6.2,height = 4.6)
```
# Diversity within the dominant genus, *Cladocopium*
While the kraken analysis clearly showed that all Western Australian samples are dominated by *Cladocopium*, this is a large and diverse genus. We therefore undertook additional analyses to search for patterns of symbiont diversity at lower taxonomic levels.
## Symbiont mitogenome haplotypes
Firstly, reads were extracted from duplicate marked bam files and mapped to the *Cladocopium* (C1) genome. The symbiont mitogenome sequence was downloaded from [reefgenomics](http://symbs.reefgenomics.org/download/).
```bash
samtools fastq -F 1024 {sample}_dpmarked.bam | bam mem -t 12 <mitogenome> - | \
samtools view -b -F 4 > {sample}_mito.bam
```
The mapping coverage in the symbiont mitochondrial genome was generally low and uneven. We thus filtered samples with less than 20X average mapping depth (excluding regions with no reads mapped). This left 41 samples for which it was possible to generate a symbiont haplotype network.
We then used bcftools to make genotype calls and then filtered these calls for quality as follows; snps were excluded if they were within 3 bp around indels, if they had less than 10 quality score, and if their coverage depth was twice bigger than the average depth or less than three.
```bash
mean_cov=$(samtools depth {sample}_mito.bam | awk '{sum += $3}END{print sum/NR}')
mean_cov=$(print "%.0f" $mean_cov)
let max_dp=mean_cov*2
bcftools mpileup -Ou -f <mitogenome> {sample}_mito.bam | \
bcftools call -c --ploidy 1 - | \
bcftools norm -Ob -f <mitogenome> - | \
bcftools filter -Oz --SnpGap 3 --IndelGap 5 \
-e " %QUAL<10 || DP > $max_dp || DP<3" - > {sample}_mito.vcf.gz
```
Next, a consensus sequence was generated for each sample by applying the relevant filtered variant file to the mitochondrial reference sequence. Uncalled loci were set to be "-".
```bash
cat <mitogenome> | bcftools consensus -a - -e 'type="indel"' {sample}_mito.vcf.gz |\
bioawk -c fastx -v samp={sample} '{printf(">%s\n%s\n", samp, $seq)}' > {sample}_consensus.fa
```
Finally, all consensus sequences were concatenated into on "alignment" file and trimAl was used to remove positions with gaps and output in NEXUS format.
```bash
cat *_consensus.fa > alignment.fasta
trimal -in alignment.fasta -nomaps -nexus -out alignment.trimal.nex
```
The alignments in Nexus-format were loaded in [popart](http://popart.otago.ac.nz/index.shtml) to generate haplotype networks using minimum spanning method.
```{r}
knitr::include_graphics("figures/Symbiont_mitohaps.jpg")
```
**Figure 2:** The haplotype network of symbiont Cladocopium C1 mitochondiral in 41 coral samples
## ITS2 assignment for symbiont reads
We firstly extracted reads that are not mapped to host genome and mapped them to the ITS2 sequences from [symportal](symportal.org) - published named sequences.
```bash
samtools view -b -f4 {sample}_dpmarked.bam | bedtools bamtofastq -i - -fq fastq/{sample}_1.fq -fq2 fastq/{sample}_2.fq
bwa mem -t 10 published_div.fasta {sample}_1.fq {sample}_2.fq |samtools view -F 4 > {sample}.sam
```
We then count the number of reads that are mapped to each ITS2 sequences and calculated the proportion by dividing the number of reads to the total number of reads.
```{r}
samples <- read_tsv("data/hpc/pcangsd/samples.txt",col_names = c("sample_id","location","mapping_rate","mean_mapping_depth","genome_cov"))
its <- read_tsv("data/hpc/symbiont/ITS2_reads_mapping/sample_it2_div.txt") %>% mutate(sample = str_replace(sample,"_merged","")) %>%
mutate(sample_id = str_replace(sample,"_L004","")) %>%
mutate(sample_id = str_replace(sample_id,"_S[0-9]+$","")) %>%
left_join(samples,by="sample_id") %>%
filter(!grepl(sample_id,pattern = "^BR_5_121"))
its_matrix <- its %>%
select(sample_id,prop,div) %>%
pivot_wider(names_from = "div",values_from = prop,values_fill = 0) %>%
column_to_rownames(var="sample_id") %>% as.matrix()
keep_cols1 <- colSums(its_matrix) > 1.5
keep_cols2 <-(its_matrix %>% apply(2,max))>0.2
keep_cols <- keep_cols1 | keep_cols2
library(ComplexHeatmap)
library(colorspace)
locations <- its %>% select(sample_id,location) %>% distinct() %>% pull(location)
names(locations) <- its %>% select(sample_id,location) %>% distinct() %>% pull(sample_id)
source("scripts/color_scheme.R")
named_colors <- myCol
names(named_colors) <- c("Inshore", "North Offshore", "South Offshore")
ra <- rowAnnotation(location = locations[rownames(its_matrix)],
col = list(location = named_colors),
annotation_legend_param = list(title = ""),
show_legend = FALSE,
show_annotation_name = FALSE)
library(circlize)
library(cowplot)
col_fun = colorRamp2(c(0.6, 0.3,0), sequential_hcl(n=7, palette = "Purples 3")[c(1,2,6)])
hm <- Heatmap(its_matrix[,keep_cols],
right_annotation = ra,
col = col_fun,
show_row_names = FALSE,
show_column_dend = FALSE,
column_names_side = "top",
heatmap_legend_param = list(title = "",title_gp = gpar(fontsize = 12)))
hm_plot <- grid.grabExpr(draw(hm, padding = unit(c(2,2,2,10),"mm")))
plot_grid(hm_plot)
#ggsave(hm_plot,filename = "its2-hm.pdf", height = 4, width = 6)
```
## Distance based on D2S statistics
To further make use of symbiont reads in our data, we applied [jackknifing pipeline](https://github.com/chanlab-genomics/jackknifing) to raw reads which extracts reads mapped to C1 genome and count kmers from sequences using jellyfish and calculated the D2 statistics based on kmers.
We decided the optimum kmer size to be 21 and used D2S statistics, eventually, we got a matrix of kmer distances for each pair of samples.
```bash
## minus host
bwa mem -t 12 {ad.reference} {sample}_1.fastq {sample}_2.fastq |\ samtools view -f12 -F256 |\
bamToFastq -i - -fq {sample}_nonhost_1.fq -fq2 {sample}_nonhost_2.fq
## mapping to C1 + symportal ITS2 from Cladocopium strains
bwa mem -t 12 {SymC1.all.fasta} {sample}_nonhost_1.fq {sample}_nonhost_2.fq | \
samtools view -F 4 -b |samtools sort > {sample}_symC.bam
gatk MarkDuplicates -I {sample}_symC.bam -O {sample}_symC_sorted.bam -M {sample}.metrics.txt --REMOVE_DUPLICATES true
samtools fasta {sample}_symC_sorted.bam -o {sample}_symC.fasta
## jackknifing
jellyfish count -m 21 -t 12 -s 1G -o {sample}.21.jf {sample}_symC.fasta
jellyfish dump -ct {sample}.21.jf |sort -k1,1 | python2 jackknifing/jf_scripts/Kmers_2_NumbericRepresentation.py -o {sample}.21.nkc.gz
python2 jackknifing/jf_scripts/Composition_of_InputSeqs.py --fasta {sample}_symC.fasta --freq {sample}.CharFreq
## calculated d2s for each pair
python2 jackknifing/calc_d2s/Calculate_D2S.py \
--kmerset1 {sample1}.21.nkc.gz --kmerset1_freq {sample1}.CharFreq \
--kmerset2 {sample2}.21.nkc.gz --kmerset2_freq {sample2}.CharFreq \
--D2S_out {sample1}-{sample2}.txt
## generate matrix...
```
Based on this matrix, we made MDS plot below.
```{r}
t_wa <- read_tsv("data/hpc/symbiont/d2s/wa_matrix.txt", col_names = F) %>% column_to_rownames("X1") %>% as.matrix()
colnames(t_wa) <- rownames(t_wa)
mds_wa <- t_wa %>% cmdscale() %>% as.data.frame %>% rownames_to_column("id")%>%
mutate(location=str_split( string = id,pattern = "_") %>%
map_chr(first), pop=case_when(location %in% c("AI","BR")~"inshore", location%in% c("RS1","RS2","RS3")~"southoffshore", location=="AR"~"northoffshore"))
ggscatter(mds_wa, x="V1", y="V2",
#color="groups",
repel=TRUE,
size=1,
color="pop",
#color="location",
ellipse=TRUE,
#palette = pal_startrek("uniform")(7),
palette = pal_startrek("uniform")(7),
ellipse.type="convex",xlab = "", ylab="",
ggtheme = theme_bw(base_size = 12,base_line_size = NA),legend="none")
```