23.symbionts.Rmd

---
title: "Symbiont Profiles"
output:
  github_document:
    pandoc_args: --webtex
bibliography: bibliography.bib
---

```{r echo=FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(fig.width=12, fig.height=8,
                      echo=FALSE, warning=FALSE, message=FALSE)
library(tidyverse)
require(ComplexHeatmap)
require(ggpubr)
library(RColorBrewer)
library(ggsci)
```
## Genus level analysis using Kraken
The relative abundance of major clades (genera) of Symbiodiniaceae was profiled using [kraken](https://ccb.jhu.edu/software/kraken/) version 1.1.1 [@Wood2014-qp] to classify raw reads from each sample.  To ensure that this made full use of available reads and also that it was not affected by biased database composition we restricted our analysis to taxa for which a complete reference genome was available. This included the following data;

- Clade A: *Symbiodinium microadriadicum* [from genbank](https://www.ncbi.nlm.nih.gov/assembly/GCA_001939145.1)
- Clade B: *Breviolum sp.* [from OIST](https://marinegenomics.oist.jp/symb/download/symbB.v1.0.genome.fa.gz)
- Clade C: (C1) *Cladocopium sp.* [from reefgenomics](http://symbs.reefgenomics.org/download/SymbC1.Genome.Scaffolds.fasta.gz)
- Clade D: *Durusdinium sp.* [from OIST](https://marinegenomics.oist.jp/symbd/viewer/download?project_id=102)
- Clade F: *Fugacium sp.* [from reefgenomics](http://symbs.reefgenomics.org/download/SymbF.Genome.Scaffolds.fasta.gz)

These genomes were combined with the host *A. digitifera* genome as well the standard kraken bacterial sequences to build a kraken database (see [07_build_kraken.sh](data/hpc/symbiodinium_profiles/07_build_kraken.sh)) using default values for kmer length (31) and minimiser length (15).

kraken was then used to classify all non host read pairs (paired reads not aligning to the host genome) for all samples and the raw outputs processed with `kraken-mpa-report`.  This produces a report in a format similar to MetaPhlAn's output (see [09_run_kraken_genome.sh](data/hpc/symbiodinium_profiles/09_run_kraken_genome.sh)). 

```{r}
if(!file.exists("data/r_essentials/23_mpa_data.rds")){
  genome_mpa31_files <- list.files("data/hpc/symbiodinium_profiles/genome_kraken_mpa/",pattern = "*.mpa",full.names = TRUE)

  read_mpa <- function(path){
    s <- basename(path) %>% str_extract("[^\\.]+")
    sample_group <- s %>% str_extract("[^\\-]+")
    mpa_data <- read_tsv(path,col_names = c("taxonomy","read_count"),col_types = cols()) %>% 
      add_column(sample=s) %>% 
      mutate(sample_group= case_when(
        grepl("^AI",sample) ~ "IN",
        grepl("^BR",sample) ~ "IN",
        grepl("^AR",sample) ~ "NO",
        grepl("^RS",sample) ~ "SO",
        grepl("^DR",sample) ~ "JP"
            ))
  }

  genome_mpa31_data <- do.call(rbind,lapply(genome_mpa31_files,read_mpa)) %>% add_column(kmer="g31")

  mpa_data <- genome_mpa31_data 
  write_rds(mpa_data,"data/r_essentials/23_mpa_data.rds")
} else {
  mpa_data <- read_rds("data/r_essentials/23_mpa_data.rds")
}
```


```{r}
group_order <- c("A"=1,"B"=2,"C"=3,"D"=4,"F"=5, 'Host'=8)
loc_order <- c("IN"=1,"NO"=2,"SO"=3,"JP"=4)

clade_names <- c("A"="Symbiodinium","B"="Breviolum","C"="Cladocopium","D"="Durusdinium","F"="Fugacium")

symbiodinium_data <- mpa_data %>% 
  filter(grepl(pattern = "f__Symbiodiniaceae",taxonomy) | 
           grepl( pattern = "digitifera", taxonomy)) %>% 
  mutate(clade = str_match(taxonomy,pattern = "clade_([ABCDF])")[,2]) %>% 
  mutate(clade = ifelse(is.na(clade),"Host",clade)) %>% 
  mutate(clade_order=group_order[clade]) %>% 
  mutate(sample_group_order=loc_order[sample_group]) %>%   
  mutate(clade_name = clade_names[clade]) %>% 
  filter(clade!="Host") 
```

```{r, include=FALSE, eval=FALSE}
#symbiont read summary. To provide summary stats for paper only

library(googlesheets4)
gs4_deauth()
read_totals <- read_sheet("https://docs.google.com/spreadsheets/d/1RHye3l7R05nA6kAw9u28H84X7BxbNsGUsWykX5eqvdw/edit?usp=sharing",range = "A2:K77") %>% dplyr::select("Sample ID",total_yield="Total yield bases (Mb)")

symbiont_data_summary <- symbiodinium_data %>% 
  group_by(sample,sample_group) %>% 
  dplyr::summarise(reads=sum(read_count)) %>% 
  extract(sample,into="Sample ID",regex="(.*?_.*?_.*?)_") %>% 
  na.omit() %>% 
  left_join(read_totals) %>% 
  mutate(symbiont_percentage = reads*100/(total_yield*1e6/300)) 
```


```{r}
# Plot by absolute read counts
# 

library(ggrepel)
library(ggpubr)

symb_plot_data <- symbiodinium_data %>% 
  ungroup() %>% 
  filter(sample_group!="JP") %>% # Interesting but beyond scope of the paper
  group_by(sample) %>% 
  dplyr::mutate(sample_total = sum(read_count))
  
spg_plot <- symb_plot_data %>% 
  ggplot(aes(x=clade_name,y=read_count/1e6)) + 
  geom_boxplot(aes(color=clade_name), outlier.size = 0.5) + 
  facet_wrap(~reorder(sample_group,sample_group_order),nrow = 1)  + theme_pubclean(base_size = 10) +
  ylab("Read Count (Millions)") + xlab("") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
        legend.position = "none", legend.title = element_blank())
```


```{r}
# Plot by read count proportions
# gp_symb_data <- symbiodinium_data %>% 
#   group_by(sample) %>% 
#   mutate(sample_total = sum(read_count)) %>% 
#   ungroup() %>% 
#   group_by(clade,sample) %>% 
#   mutate(proportion = sum(read_count)/sample_total)

spg_plot_props <- symb_plot_data %>% 
  group_by(clade,sample) %>% 
  dplyr::mutate(proportion = sum(read_count)/sample_total) %>% 
  ggplot(aes(x=clade_name,y=proportion)) + 
  geom_point(aes(color=clade_name), position = position_jitterdodge(jitter.height = 0.01, jitter.width = 0.8), size=0.5) +
  facet_wrap(~reorder(sample_group,sample_group_order),nrow = 1) + theme_pubclean(base_size = 10) +
    xlab("") + ylab("Read Proportion") + ylim(0,1) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
        legend.position = "bottom", legend.title = element_blank())
```

Irrespective of whether absolute read counts or proportion of reads is used the dominant symbiont clade for all locations and all but one sample was *Cladocopium*. A single sample from Japan was dominated by *Durusdinium*.

```{r}
# Combine plots 
library(cowplot)
plot_grid(spg_plot,spg_plot_props, ncol = 1, align = "hv", axis = "lr", rel_heights = c(0.4,0.6))
ggsave("figures/fig-s8.jpg",width = 6.2,height = 4.6)
```

# Diversity within the dominant genus, *Cladocopium*

While the kraken analysis clearly showed that all Western Australian samples are dominated by *Cladocopium*, this is a large and diverse genus.  We therefore undertook additional analyses to search for patterns of symbiont diversity at lower taxonomic levels. 


## Symbiont mitogenome haplotypes

Firstly, reads were extracted from duplicate marked bam files and mapped to the *Cladocopium* (C1) genome. The symbiont mitogenome sequence was downloaded from [reefgenomics](http://symbs.reefgenomics.org/download/).

```bash
samtools fastq -F 1024 {sample}_dpmarked.bam | bam mem -t 12 <mitogenome> - | \
samtools view -b -F 4 > {sample}_mito.bam
```

The mapping coverage in the symbiont mitochondrial genome was generally low and uneven. We thus filtered samples with less than 20X average mapping depth (excluding regions with no reads mapped). This left 41 samples for which it was possible to generate a symbiont haplotype network.

We then used bcftools to make genotype calls and then filtered these calls for quality as follows; snps were excluded if they were within 3 bp around indels, if they had less than 10 quality score, and if their coverage depth was twice bigger than the average depth or less than three.

```bash
mean_cov=$(samtools depth {sample}_mito.bam | awk '{sum += $3}END{print sum/NR}')
mean_cov=$(print "%.0f" $mean_cov)
let max_dp=mean_cov*2

bcftools mpileup -Ou -f <mitogenome> {sample}_mito.bam | \
bcftools call -c --ploidy 1 - | \
bcftools norm -Ob -f <mitogenome> - | \
bcftools filter -Oz --SnpGap 3 --IndelGap 5 \
-e " %QUAL<10 || DP > $max_dp || DP<3" - > {sample}_mito.vcf.gz
```

Next, a consensus sequence was generated for each sample by applying the relevant filtered variant file to the mitochondrial reference sequence. Uncalled loci were set to be "-".

```bash
cat <mitogenome> | bcftools consensus -a - -e 'type="indel"' {sample}_mito.vcf.gz |\
bioawk -c fastx -v samp={sample} '{printf(">%s\n%s\n", samp, $seq)}' > {sample}_consensus.fa
```

Finally, all consensus sequences were concatenated into on "alignment" file and trimAl was used to remove positions with gaps and output in NEXUS format.

```bash
cat *_consensus.fa > alignment.fasta
trimal -in alignment.fasta -nomaps -nexus -out alignment.trimal.nex
```

The alignments in Nexus-format were loaded in [popart](http://popart.otago.ac.nz/index.shtml) to generate haplotype networks using minimum spanning method.

```{r}
knitr::include_graphics("figures/Symbiont_mitohaps.jpg")
```

**Figure 2:** The haplotype network of symbiont Cladocopium C1 mitochondiral in 41 coral samples

## ITS2 assignment for symbiont reads
We firstly extracted reads that are not mapped to host genome and mapped them to the ITS2 sequences from [symportal](symportal.org) - published named sequences.

```bash
samtools view -b -f4 {sample}_dpmarked.bam | bedtools bamtofastq -i - -fq fastq/{sample}_1.fq -fq2 fastq/{sample}_2.fq

bwa mem -t 10 published_div.fasta {sample}_1.fq {sample}_2.fq |samtools view -F 4 > {sample}.sam
```
We then count the number of reads that are mapped to each ITS2 sequences and calculated the proportion by dividing the number of reads to the total number of reads.

```{r}
samples <- read_tsv("data/hpc/pcangsd/samples.txt",col_names = c("sample_id","location","mapping_rate","mean_mapping_depth","genome_cov"))
its <- read_tsv("data/hpc/symbiont/ITS2_reads_mapping/sample_it2_div.txt") %>% mutate(sample = str_replace(sample,"_merged","")) %>% 
    mutate(sample_id = str_replace(sample,"_L004","")) %>% 
    mutate(sample_id = str_replace(sample_id,"_S[0-9]+$","")) %>% 
  left_join(samples,by="sample_id") %>% 
  filter(!grepl(sample_id,pattern = "^BR_5_121"))

its_matrix <- its %>% 
  select(sample_id,prop,div) %>% 
  pivot_wider(names_from = "div",values_from = prop,values_fill = 0) %>% 
  column_to_rownames(var="sample_id") %>% as.matrix()

keep_cols1 <- colSums(its_matrix) > 1.5

keep_cols2 <-(its_matrix %>% apply(2,max))>0.2

keep_cols <- keep_cols1 | keep_cols2

library(ComplexHeatmap)
library(colorspace)

locations <- its %>% select(sample_id,location) %>% distinct() %>% pull(location)
names(locations) <- its %>% select(sample_id,location) %>% distinct() %>% pull(sample_id)
source("scripts/color_scheme.R")
named_colors <- myCol
names(named_colors) <- c("Inshore", "North Offshore", "South Offshore")

ra <- rowAnnotation(location = locations[rownames(its_matrix)],
                    col = list(location = named_colors),
                    annotation_legend_param = list(title = ""),
                    show_legend = FALSE,
                    show_annotation_name = FALSE)

library(circlize)
library(cowplot)

col_fun = colorRamp2(c(0.6, 0.3,0), sequential_hcl(n=7, palette = "Purples 3")[c(1,2,6)])

hm <- Heatmap(its_matrix[,keep_cols],
              right_annotation = ra,
              col = col_fun,
              show_row_names = FALSE, 
              show_column_dend = FALSE,
              column_names_side = "top",
              heatmap_legend_param = list(title = "",title_gp = gpar(fontsize = 12)))


hm_plot <- grid.grabExpr(draw(hm, padding = unit(c(2,2,2,10),"mm")))
plot_grid(hm_plot)
#ggsave(hm_plot,filename = "its2-hm.pdf", height = 4, width = 6)
```


## Distance based on D2S statistics
To further make use of symbiont reads in our data, we applied [jackknifing pipeline](https://github.com/chanlab-genomics/jackknifing) to raw reads which extracts reads mapped to C1 genome and count kmers from sequences using jellyfish and calculated the D2 statistics based on kmers.
We decided the optimum kmer size to be 21 and used D2S statistics, eventually, we got a matrix of kmer distances for each pair of samples.

```bash
## minus host
bwa mem -t 12 {ad.reference} {sample}_1.fastq {sample}_2.fastq |\ samtools view -f12 -F256 |\
bamToFastq -i - -fq {sample}_nonhost_1.fq -fq2 {sample}_nonhost_2.fq

## mapping to C1 + symportal ITS2 from Cladocopium strains
bwa mem -t 12 {SymC1.all.fasta} {sample}_nonhost_1.fq {sample}_nonhost_2.fq | \
samtools view -F 4 -b |samtools sort > {sample}_symC.bam
gatk MarkDuplicates -I {sample}_symC.bam -O {sample}_symC_sorted.bam -M {sample}.metrics.txt --REMOVE_DUPLICATES true
samtools fasta {sample}_symC_sorted.bam -o {sample}_symC.fasta

## jackknifing
jellyfish count -m 21 -t 12 -s 1G -o {sample}.21.jf {sample}_symC.fasta
jellyfish dump -ct {sample}.21.jf |sort -k1,1 | python2 jackknifing/jf_scripts/Kmers_2_NumbericRepresentation.py -o {sample}.21.nkc.gz
python2 jackknifing/jf_scripts/Composition_of_InputSeqs.py --fasta {sample}_symC.fasta --freq {sample}.CharFreq

## calculated d2s for each pair
python2 jackknifing/calc_d2s/Calculate_D2S.py \
--kmerset1 {sample1}.21.nkc.gz --kmerset1_freq {sample1}.CharFreq \
--kmerset2 {sample2}.21.nkc.gz --kmerset2_freq {sample2}.CharFreq \
--D2S_out {sample1}-{sample2}.txt

## generate matrix...
```

Based on this matrix, we made MDS plot below.

```{r}
t_wa <- read_tsv("data/hpc/symbiont/d2s/wa_matrix.txt", col_names = F) %>% column_to_rownames("X1") %>% as.matrix() 
colnames(t_wa) <- rownames(t_wa)

mds_wa <- t_wa %>% cmdscale() %>%  as.data.frame %>% rownames_to_column("id")%>% 
  mutate(location=str_split( string = id,pattern = "_") %>% 
           map_chr(first), pop=case_when(location %in% c("AI","BR")~"inshore", location%in% c("RS1","RS2","RS3")~"southoffshore", location=="AR"~"northoffshore"))

ggscatter(mds_wa, x="V1", y="V2", 
          #color="groups",
          repel=TRUE, 
          size=1,
          color="pop",
          #color="location",
          ellipse=TRUE,
          #palette = pal_startrek("uniform")(7),
          palette = pal_startrek("uniform")(7),
          ellipse.type="convex",xlab = "", ylab="",
          ggtheme = theme_bw(base_size = 12,base_line_size = NA),legend="none")
```