forked from almlab/BIO-ML
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2.genome_assembly_commandLines.txt
102 lines (70 loc) · 5.03 KB
/
2.genome_assembly_commandLines.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#############################################
############## Genome Assembly ##############
#############################################
### Quality filtering
cutadapt -a CTGTCTCTTAT -A CTGTCTCTTAT -o genome.R1.cut.fastq -p genome.R2.cut.fastq genome.R1.fastq genome.R2.fastq
java -jar trimmomatic-0.36.jar PE -phred33 genome.R1.cut.fastq genome.R2.cut.fastq genome.R1.trim.fastq genome.R1.trim.solo.fastq genome.R2.trim.fastq genome.R2.trim.solo.fastq LEADING:3 TRAILING:3 SLIDINGWINDOW:5:20 MINLEN:50
### Genome assembly
#Spades (contig assembly):
/scratch/users/mpoyet/bin/SPAdes-3.9.1-Linux/bin/spades.py -1 genome.R1.trim.fastq -2 genome.R2.trim.fastq -t 16 --careful -o ResultsSpades
#SSPACE (Scaffolding of contigs):
SSPACE_Standard_v3.0.pl -l library_sspace.txt -s contigs.fasta -b SSPACE" >> spades_${i}.sbatch
(library_sspace.txt file is a text file that contains the average insert size and the standard deviation for the corresponding flowcell).
#GapFiller (Fill gaps between contigs):
perl GapFiller.pl -l library_sspace.txt -s SSPACE/genome.final.scaffolds.fasta -b GAPFILLER
#We remove small contigs
cp GAPFILLER/GAPFILLER.gapfilled.final.fa genome_withsmallContigs.scaffolds.fasta
./removeSmallContigs genome_withsmallContigs.scaffolds.fasta 1000 genome_final.scaffolds.fasta
#Coverage calculation with BBmap:
bbmap.sh in1=genome.R1.trim.fastq in2=genome.R2.trim.fastq ref=genome_final.scaffolds.fasta covstats=covstats_bbmap_genome.txt &> summary_results_bbmap_genome.txt
### Genome annotation and statistics
#Genome annotation with Prokka:
prokka --outdir Prokka.1 --prefix genome_prokka genome_final.scaffolds.fasta
#Genome assembly statistics with CheckM:
checkm lineage_wf --tab_table -f results_checkM_genome.txt -t 8 -x fna Prokka.1/ Checkm/
#Genome contamination is extracted from file results_checkM_genome.txt. If contamination is high (>10), the genome is cleaned, using the distribution of coverage across contigs, to contigs with extreme coverage value (usually are small contigs, representing plasmids that can be of foreign origin. Plasmids are often in multiple copies, and can more easily contaminate assemblies). We use a R script for this:
mv genome_final.scaffolds.fasta genome_contam.scaffolds.fasta
library(strucchange)
a=read.table("covstats_bbmap_genome.txt",sep='\t')
cov=sort(a$V2)
m=breakpoints(log(cov)~seq(1,length(cov)))
write(cov[m$breakpoints[length(m$breakpoints)]],file="coverage_contig_threshold.txt")
./remove_lowCoverage_contigs covstats_bbmap_genome.txt genome_contam.scaffolds.fasta coverage_contig_threshold.txt genome_final.scaffolds.fasta
#Then the genome gets re-annotated and re-processed by CheckM.
#For all genomes that pass quality filtering (contamination < 10, completeness > 90%), we collect more data on the assembly using CheckM:
checkm tree_qa -f results_checkM_tree_qa_genome.txt -o 2 Checkm/
### Taxonomy calling
#We used an approach similar to the open-reference method employed for clustering 16S sequences. We first cluster all our genomes based on genomic distances. Then we call a taxonomy using a reference collection of NCBI genomes. We use the MASH program to calculate genomic distances.
#We list all genomes:
find . -type f -name *_final.scaffolds.fasta > list_of_all_genomes_beforeClustering.txt
#First we compute the sketch file:
mash sketch -p 16 -o all_genomes_BIOML_library -l list_of_all_genomes_beforeClustering.txt
#Then we compute all distances:
/scratch/users/mgroussi/bin/Mash/mash dist -p 16 all_genomes_BIOML_library.msh all_genomes_BIOML_library.msh > results_allpairwiseD_rawMash_BIOML.txt
#We reformat the table to get genome pairs and distances in tab format: results_allpairwiseD_tabMash_BIOML.txt
#We use R to cluster genomes into groups of genomes with genomic distances below 0.05:
library(micropan)
mat=read.table("results_allpairwiseD_tabMash_BIOML.txt",sep='\t',h=T)
mat$Sequence.A=as.vector(mat$Sequence.A)
mat$Sequence.B=as.vector(mat$Sequence.B)
bC=bClust(mat,linkage="complete",threshold=0.05)
bCdataframe=data.frame(names(bC),bC)
colnames(bCdataframe)=c("genomes","cluster")
write.table(bCdataframe,file="speciesClusters_BIOML_genomes.txt",row.names=F)
save(bC,file="bClust_variable_completeLinkage_005threshold.Rdata")
#we download all assembly genomes from the NCBI:
cat assembly_summary_ncbi_withoutContamGenomes.txt | while read i; do
ftp=$(echo "$i" | cut -f1)
pattern=$(echo "${ftp##*/}")
name=$(echo "$i" | cut -f2)
name2=${name// /_}
strain=$(echo "$i" | cut -f3)
strain2=${strain// /_}
wget -O "$name2"_"$strain2"_genomic.fna.gz "$ftp"/${pattern}_genomic.fna.gz
gunzip "$name2"_"$strain2"_genomic.fna.gz
done
#We create the mash sketch file from all NCBI genomes:
ls *_genomic.fna > list_of_all_NCBI_assemblies.txt
mash sketch -p 16 -o all_genomes_NCBI -l list_of_all_NCBI_assemblies.txt
#Using a representative genome from each cluster, we then calculate the distance between this genome and all NCBI genomes. The output must be sorted by distance values to get the taxonomies of closest genomes:
mash dist all_genomes_NCBI.msh genome_final.scaffolds.fasta -v 0.0000000001