-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapscale_remove_negatives.py
executable file
·232 lines (190 loc) · 7.6 KB
/
apscale_remove_negatives.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env python3
"""
A submodule for the apscale wrapper to filter reads in negative controls from
samples with microDecon once apscale is done.
By Chris Hempel (christopher.hempel@kaust.edu.sa) on 1 Nov 2023
"""
import datetime
import os
import pandas as pd
import argparse
import warnings
import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages
from rpy2.robjects import pandas2ri
# Suppress all Python warnings so they are not printed to the console
warnings.filterwarnings("ignore")
# Function to print a message prefixed with the current time (HH:MM:SS)
def time_print(text):
    """Print *text* prefixed with the current wall-clock time."""
    timestamp = datetime.datetime.now().strftime("%H:%M:%S")
    print(f"{timestamp}: {text}")
# Custom argparse type: a comma-separated list of strings
def list_of_strings(arg):
    """Split a comma-separated argument string into a list of strings."""
    items = arg.split(",")
    return items
# Export cleaned sequences as a FASTA file
def write_fasta(df, filename):
    """Write every row of *df* as a FASTA record (">ID" header, "Seq" body)."""
    with open(filename, "w") as handle:
        for _, record in df.iterrows():
            handle.write(f'>{record["ID"]}\n{record["Seq"]}\n')
# Remove negative-control reads from one OTU/ESV table
def remove_negs_from_df(df, unit, negative_controls):
    """Decontaminate an OTU/ESV read table using its negative controls.

    Parameters:
        df: Table containing an "ID" column, a "Seq" column, one read-count
            column per sample (including the negative controls), plus
            "total_reads", "lowest_rank", "lowest_taxon" and the seven
            taxonomy rank columns.
        unit: ID prefix used in the "ID" column ("OTU" or "ESV"); used to
            restore numeric row order after merging.
        negative_controls: Column names of the negative-control samples.

    Returns:
        The table with negative-control columns removed, sample counts
        corrected via microDecon (or by direct subtraction when only one
        OTU/ESV carries blank reads), zero-read rows dropped, and
        "total_reads" recomputed over the remaining true samples.
    """
    # Identify negative controls that do and don't contain 0 reads
    # (microDecon gives an error if used with negative controls with 0 reads)
    negative_controls_keep = [neg for neg in negative_controls if df[neg].sum() > 0]
    negative_controls_drop = [
        neg for neg in negative_controls if neg not in negative_controls_keep
    ]
    # If all negative controls are empty there is nothing to subtract:
    # return the df with the negative-control columns dropped
    if not negative_controls_keep:
        return df.drop(columns=negative_controls_drop)
    # Identify true samples: every column that is neither a negative control
    # nor one of the fixed metadata/taxonomy columns
    ranks = ["domain", "phylum", "class", "order", "family", "genus", "species"]
    notSampleColumns = [
        "ID",
        "Seq",
        "total_reads",
        "lowest_rank",
        "lowest_taxon",
    ] + ranks
    true_samples = list(df.columns.difference(negative_controls + notSampleColumns))
    samples = negative_controls_keep + true_samples
    # Generate sample df in the column layout microDecon expects
    # (ID first, then blanks, then true samples)
    df_samples = df[["ID"] + samples]
    # Separate the non-count columns and drop negative controls with 0 reads
    df_other = df.drop(columns=samples + negative_controls_drop)
    # If only 1 OTU/ESV in the blank(s) contains reads, microDecon will fail.
    # In that case, we instead just subtract the summed reads of that one
    # OTU/ESV from the samples.
    non_zero_rows = df_samples[negative_controls_keep][
        (df_samples[negative_controls_keep] != 0).any(axis=1)
    ]
    if len(non_zero_rows) == 1:
        # Per-row sum over the blanks; all rows except the single
        # non-zero one contribute 0
        non_zero_rows_sum = df_samples[negative_controls_keep].sum(axis=1)
        # Subtract the blank reads row-wise from every true sample
        df_samples_decon = df_samples[true_samples].sub(non_zero_rows_sum, axis=0)
        # Clamp negative counts to 0
        # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1
        # (DataFrame.map is the replacement) -- confirm target pandas version
        df_samples_decon = df_samples_decon.applymap(lambda x: max(0, x))
        # Re-attach the ID column
        df_samples_decon = pd.DataFrame(
            {"ID": df["ID"], **df_samples_decon.to_dict("list")}
        )
    # Otherwise we use microDecon
    else:
        # Run the R decon() function on the sample table
        decon_results = microDecon(
            df_samples,
            numb_blanks=len(negative_controls_keep),
            numb_ind=robjects.IntVector([len(true_samples)]),
            taxa=False,
            thresh=1,  # Turns off to drop OTUs that show up in only 70% of samples
            prop_thresh=0,  # Turns off to drop OTUs with a total abundance of < 0.005%
        )
        # Convert the R "decon.table" result back into a pandas DataFrame
        df_samples_decon = robjects.conversion.rpy2py(decon_results.rx2("decon.table"))
        # Delete the blank column that microDecon keeps at position 1
        # (directly after "ID")
        df_samples_decon = df_samples_decon.drop(df_samples_decon.columns[1], axis=1)
    # Merge corrected counts with the metadata/taxonomy columns
    df_decon = df_samples_decon.merge(df_other, on="ID")
    # Restore numeric row order from the "<unit>_<n>" IDs
    # (the merge/decon steps may have shuffled rows)
    df_decon["NumericValues"] = df_decon["ID"].str.replace(f"{unit}_", "").astype(int)
    df_decon = df_decon.sort_values(by="NumericValues")
    df_decon = df_decon.drop(columns=["NumericValues"])
    # Drop OTUs/ESVs whose read counts are now all 0
    df_decon = df_decon[df_decon.drop(columns=notSampleColumns).sum(axis=1) != 0]
    # Recompute "total_reads" now that negative-control reads were removed
    df_decon["total_reads"] = df_decon[true_samples].sum(axis=1)
    return df_decon
# Command-line interface
cli_parser = argparse.ArgumentParser(
    description="""A submodule for the apscale wrapper to filter reads in negative controls from
samples with microDecon once apscale is done.""",
)
cli_parser.add_argument(
    "--negative_controls",
    type=list_of_strings,
    required=True,
    metavar="control1,control2,control3",
    help="List the names of all negative controls (without _R1/2.fq.gz), separated by commas without spaces.",
)
cli_parser.add_argument(
    "--project_dir",
    required=True,
    help="Directory containing apscale results.",
)
args = cli_parser.parse_args()

# The apscale project name is the last path component of the project directory
project_name = os.path.basename(args.project_dir)

# Announce the start of the pipeline
time_print(
    "Removing negative control reads from samples with the R package microDecon..."
)
# Define path name variables.
# Fix: anchor all paths at the user-supplied --project_dir. The previous code
# joined with project_name (the *basename* of the directory), which resolved
# relative to the current working directory and broke whenever the script was
# not started from the parent of the project directory.
path_to_otu_clustering = os.path.join(
    args.project_dir, "9_lulu_filtering", "otu_clustering"
)
path_to_denoising = os.path.join(args.project_dir, "9_lulu_filtering", "denoising")
# Paths to the post-LULU ESV/OTU tables with filtered taxonomy
otu_postlulu_file = os.path.join(
    path_to_otu_clustering,
    f"{project_name}-OTU_table-with_filtered_taxonomy.csv",
)
esv_postlulu_file = os.path.join(
    path_to_denoising,
    f"{project_name}-ESV_table-with_filtered_taxonomy.csv",
)
# Load both tables
esv_postlulu_df = pd.read_csv(esv_postlulu_file)
otu_postlulu_df = pd.read_csv(otu_postlulu_file)
# Test if microDecon is installed; install it from GitHub on first use
if not rpackages.isinstalled("microDecon"):
    time_print(
        "The required R package microDecon is not installed. Installing now. This can take a while..."
    )
    # Import R utils to install other packages
    utils = rpackages.importr("utils")
    # Select the first CRAN mirror so install_packages() needs no interaction
    utils.chooseCRANmirror(ind=1)
    # microDecon is only on GitHub, so devtools is needed for the install
    if not rpackages.isinstalled("devtools"):
        time_print(
            "The R package devtools is required to install microDecon. Installing devtools. This can take a while..."
        )
        utils.install_packages("devtools")
    devtools = rpackages.importr("devtools")
    devtools.install_github("donaldtmcknight/microDecon")
    time_print(
        "Installation of microDecon finished. Removing negative control reads from samples..."
    )
# Load microDecon into the embedded R session
robjects.r('library("microDecon")')
# Bind R's decon() function as a Python callable
# (used by remove_negs_from_df)
microDecon = robjects.r["decon"]
# Activate automatic pandas <-> R data.frame conversion
pandas2ri.activate()
# Decontaminate both the OTU and the ESV table
otu_df_filtered = remove_negs_from_df(otu_postlulu_df, "OTU", args.negative_controls)
esv_df_filtered = remove_negs_from_df(esv_postlulu_df, "ESV", args.negative_controls)

# Export the filtered tables next to the original CSVs
otu_csv_out = otu_postlulu_file.replace(".csv", "-without_NegControls.csv")
esv_csv_out = esv_postlulu_file.replace(".csv", "-without_NegControls.csv")
otu_df_filtered.to_csv(otu_csv_out, index=False)
esv_df_filtered.to_csv(esv_csv_out, index=False)

# Output paths for the FASTA files
fasta_filename_otus = os.path.join(
    path_to_otu_clustering, f"{project_name}-OTU_sequences-without_NegControls.fasta"
)
fasta_filename_esvs = os.path.join(
    path_to_denoising, f"{project_name}-ESV_sequences-without_NegControls.fasta"
)

# Write the filtered sequences as FASTA
write_fasta(otu_df_filtered, fasta_filename_otus)
write_fasta(esv_df_filtered, fasta_filename_esvs)

time_print("Negative control reads removed.")