Commit f388fab
Merge pull request #71 from kcleal/dysgu_dev
v1.6.1
kcleal authored Sep 22, 2023
2 parents 9cce9db + 462e116
Showing 14 changed files with 587 additions and 452 deletions.
216 changes: 123 additions & 93 deletions README.rst

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dysgu/call_component.pyx
@@ -395,7 +395,7 @@ cdef make_generic_insertion_item(aln, int insert_size, int insert_std):
     aln_span = aln.reference_end - aln.pos
     v_item.size_inferred = 1
     if insert_std > 0:
-        rand_insert_pos = insert_size - aln_span + int(normal(0, insert_std))
+        rand_insert_pos = abs(insert_size - aln_span + int(normal(0, insert_std)))
    else:  # single read mode
        v_item.svtype = "BND"
        clip_s = max(clip_sizes(aln))
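Background on this one-line fix (a sketch, not from the commit): the Gaussian jitter means insert_size - aln_span + noise can dip below zero whenever the alignment span exceeds the library insert size, so abs() keeps the inferred insertion offset non-negative. A minimal standalone illustration, with numpy standing in for dysgu's cdef normal:

    import numpy as np

    rng = np.random.default_rng(0)

    def rand_insert_pos_sketch(insert_size, aln_span, insert_std):
        # Without abs() this can go negative when aln_span > insert_size
        # or when the normal draw is strongly negative.
        return abs(insert_size - aln_span + int(rng.normal(0, insert_std)))

    print(rand_insert_pos_sketch(350, 400, 50))  # always >= 0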
93 changes: 45 additions & 48 deletions dysgu/cluster.pyx
@@ -26,6 +26,9 @@ import time

 ctypedef EventResult EventResult_t

+np.random.seed(0)
+random.seed(0)
+

 def filter_potential(input_events, tree, regions_only):
     potential = []
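A note on the new module-level seeding (illustration only): NumPy and the stdlib random module keep independent generator states, so deterministic output requires seeding both:

    import random
    import numpy as np

    np.random.seed(0)  # seeds NumPy's global RandomState
    random.seed(0)     # seeds the stdlib Mersenne Twister, a separate state

    print(np.random.randint(0, 100), random.randint(0, 100))  # same pair on every run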
@@ -198,13 +201,13 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
         ci2 = ei.contig2
         ci_alt = ei.variant_seq
         if isinstance(ci_alt, str):
-            if len(ci_alt) > 0 and ci_alt[0] in "<.":
+            if len(ci_alt) == 1 or (len(ci_alt) > 0 and ci_alt[0] in "<."):
                 ci_alt = ""
         cj = ej.contig
         cj2 = ej.contig2
         cj_alt = ej.variant_seq
         if isinstance(cj_alt, str):
-            if len(cj_alt) > 0 and cj_alt[0] in "<.":
+            if len(cj_alt) == 1 or (len(cj_alt) > 0 and cj_alt[0] in "<."):
                 cj_alt = ""

         any_contigs_to_check = any((ci, ci2, ci_alt)) and any((cj, cj2, cj_alt))
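The widened condition now also blanks single-character alts, presumably because they, like symbolic ("<DEL>") or missing (".") alts, carry no contig sequence worth comparing. A standalone sketch of the predicate (helper name hypothetical):

    def blank_unusable_alt(alt):
        # Single-base, symbolic ("<DEL>") and missing (".") alts have no
        # usable contig sequence, so treat them as empty.
        if isinstance(alt, str):
            if len(alt) == 1 or (len(alt) > 0 and alt[0] in "<."):
                return ""
        return alt

    assert blank_unusable_alt("<DEL>") == ""
    assert blank_unusable_alt(".") == ""
    assert blank_unusable_alt("A") == ""         # newly blanked by this commit
    assert blank_unusable_alt("ACGT") == "ACGT"  # real sequence kept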
@@ -675,33 +678,33 @@ def process_job(msg_queue, args):
     completed_file.close()


-def postcall_job(preliminaries, aux_data):
-    mode, ref_path, no_gt, insert_stdev, paired_end, drop_gaps = aux_data
-    ref_genome = pysam.FastaFile(ref_path)
-    preliminaries = re_map.drop_svs_near_reference_gaps(preliminaries, paired_end, ref_genome, drop_gaps)
-    preliminaries = post_call.ref_repetitiveness(preliminaries, ref_genome)
-    preliminaries = post_call.strand_binom_t(preliminaries)
-    preliminaries = assembler.contig_info(preliminaries)  # GC info, repetitiveness
-    preliminaries = find_repeat_expansions(preliminaries, insert_stdev)
-    preliminaries = post_call.compressability(preliminaries)
-    preliminaries = post_call.get_gt_metric2(preliminaries, mode, no_gt)
-    preliminaries = post_call.get_ref_base(preliminaries, ref_genome)
-    return preliminaries
+# def postcall_job(preliminaries, aux_data):
+#     mode, ref_path, no_gt, insert_stdev, paired_end, drop_gaps = aux_data
+#     ref_genome = pysam.FastaFile(ref_path)
+#     preliminaries = re_map.drop_svs_near_reference_gaps(preliminaries, paired_end, ref_genome, drop_gaps)
+#     preliminaries = post_call.ref_repetitiveness(preliminaries, ref_genome)
+#     preliminaries = post_call.strand_binom_t(preliminaries)
+#     preliminaries = assembler.contig_info(preliminaries)  # GC info, repetitiveness
+#     preliminaries = find_repeat_expansions(preliminaries, insert_stdev)
+#     preliminaries = post_call.compressability(preliminaries)
+#     preliminaries = post_call.get_gt_metric2(preliminaries, mode, no_gt)
+#     preliminaries = post_call.get_ref_base(preliminaries, ref_genome)
+#     return preliminaries


 # https://rvprasad.medium.com/data-and-chunk-sizes-matter-when-using-multiprocessing-pool-map-in-python-5023c96875ef
-aux_data = None
-
-def initializer(init_data):
-    global aux_data
-    aux_data = init_data
+# aux_data = None
+#
+# def initializer(init_data):
+#     global aux_data
+#     aux_data = init_data


-def with_initializer_worker_wrapper(varying_data):
-    return postcall_job(varying_data, aux_data)
+# def with_initializer_worker_wrapper(varying_data):
+#     return postcall_job(varying_data, aux_data)


-def pipe1(args, infile, kind, regions, ibam, ref_genome, bam_iter=None):
+def pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name, bam_iter=None):
     procs = args['procs']
     low_mem = args['low_mem']
     tdir = args["working_directory"]
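For readers unfamiliar with the now-disabled pattern above: a Pool initializer copies shared read-only data into each worker once, so map() only has to pickle the varying chunks (the linked medium.com post covers why chunk size matters). A self-contained sketch, independent of dysgu:

    import multiprocessing

    _aux = None  # per-worker global, set once by the initializer

    def _init(aux):
        global _aux
        _aux = aux

    def _work(item):
        return item * _aux["scale"]  # shared config read without re-pickling per task

    if __name__ == "__main__":
        with multiprocessing.Pool(2, _init, ({"scale": 10},)) as pool:
            print(pool.map(_work, [1, 2, 3]))  # [10, 20, 30]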
@@ -774,21 +777,14 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, bam_iter=None):
     max_single_size = min(max(args["max_cov"] * 50, 10000), 100000)  # limited between 5000 - 50,000 reads
     event_id = 0
     block_edge_events = []
-    # min_support = int(args["min_support"])
-    # args["min_support"] = min_support
-    # logging.info("Minimum support {}".format(min_support))
-    # if args["pl"] == "pe":  # reads with internal SVs can be detected at lower support
-    #     lower_bound_support = min_support - 1 if min_support - 1 > 1 else 1
-    # else:
-    #     lower_bound_support = min_support

     clip_length = args["clip_length"]
     merge_dist = args["merge_dist"]
     min_size = args["min_size"]
     length_extend = args["length_extend"]
     divergence = args["divergence"]
     read_buffer = genome_scanner.read_buffer
-    sites_info = sites_utils.vcf_reader(args["sites"], infile, args["reference"], paired_end, parse_probs=args["parse_probs"],
-                                        default_prob=args["sites_prob"], pass_only=args["sites_pass_only"] == "True")
+    sites_info = sites_utils.vcf_reader(args["sites"], infile, args["parse_probs"], sample_name, args["ignore_sample_sites"] == "True", args["sites_prob"], args["sites_pass_only"] == "True")

     cdef Py_SimpleGraph G
     G, node_to_name, bad_clip_counter, sites_adder, n_aligned_bases = graph.construct_graph(genome_scanner,
@@ -1084,23 +1080,24 @@ def pipe1(args, infile, kind, regions, ibam, ref_genome, bam_iter=None):

     preliminaries = post_call.get_badclip_metric(preliminaries, bad_clip_counter, infile, regions)

-    if False:
-        aux_data = args["mode"], args["reference"], args["no_gt"], insert_stdev, paired_end, args["drop_gaps"] == "True"
-        pool = multiprocessing.Pool(procs, initializer, (aux_data,))
-        n = int(len(preliminaries) / procs)
-        preliminaries = [preliminaries[i:i + n] for i in range(0, len(preliminaries), n)]
-        preliminaries = pool.map(with_initializer_worker_wrapper, preliminaries)
-        preliminaries = [item for m in preliminaries for item in m]
-
-    else:
-        preliminaries = re_map.drop_svs_near_reference_gaps(preliminaries, paired_end, ref_genome, args["drop_gaps"] == "True")
-        preliminaries = post_call.ref_repetitiveness(preliminaries, ref_genome)
-        preliminaries = post_call.strand_binom_t(preliminaries)
-        preliminaries = assembler.contig_info(preliminaries)  # GC info, repetitiveness
-        preliminaries = find_repeat_expansions(preliminaries, insert_stdev)
-        preliminaries = post_call.compressability(preliminaries)
-        preliminaries = post_call.get_gt_metric2(preliminaries, args["mode"], args["no_gt"])
-        preliminaries = post_call.get_ref_base(preliminaries, ref_genome)
+    # if False:
+    #     aux_data = args["mode"], args["reference"], args["no_gt"], insert_stdev, paired_end, args["drop_gaps"] == "True"
+    #     pool = multiprocessing.Pool(procs, initializer, (aux_data,))
+    #     n = int(len(preliminaries) / procs)
+    #     preliminaries = [preliminaries[i:i + n] for i in range(0, len(preliminaries), n)]
+    #     preliminaries = pool.map(with_initializer_worker_wrapper, preliminaries)
+    #     preliminaries = [item for m in preliminaries for item in m]
+    #
+    # else:
+    preliminaries = re_map.drop_svs_near_reference_gaps(preliminaries, paired_end, ref_genome, args["drop_gaps"] == "True")
+    preliminaries = post_call.ref_repetitiveness(preliminaries, ref_genome)
+    preliminaries = post_call.strand_binom_t(preliminaries)
+    preliminaries = assembler.contig_info(preliminaries)  # GC info, repetitiveness
+    preliminaries = find_repeat_expansions(preliminaries, insert_stdev)
+    preliminaries = post_call.compressability(preliminaries)
+    preliminaries = post_call.get_gt_metric2(preliminaries, args["mode"], args["no_gt"])
+    preliminaries = post_call.get_ref_base(preliminaries, ref_genome, args["symbolic_sv_size"])

     preliminaries = sample_level_density(preliminaries, regions)
     preliminaries = coverage_analyser.normalize_coverage_values(preliminaries)
@@ -1188,7 +1185,7 @@ def cluster_reads(args):
     #####################
     # Run dysgu here #
     #####################
-    events, site_adder = pipe1(args, infile, kind, regions, ibam, ref_genome)
+    events, site_adder = pipe1(args, infile, kind, regions, ibam, ref_genome, sample_name)
     if not events:
         logging.critical("No events found")
         return
2 changes: 1 addition & 1 deletion dysgu/coverage.pyx
@@ -159,7 +159,7 @@ cdef class GenomeScanner:
         if self.bam_iter is not None:
             f_iter = self.bam_iter
         else:
-            f_iter = self.input_bam
+            f_iter = self.input_bam.fetch(until_eof=True)
         for aln in f_iter:
             # if aln.flag & 1284 or aln.mapq < mq_thresh or aln.cigartuples is None:
             #     continue
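Context for the fetch change (a sketch; the file name is hypothetical): fetch(until_eof=True) streams records in file order without needing an index and continues through the unmapped reads stored at the end of the file, whereas a plain fetch() on an indexed file returns mapped reads only:

    import pysam

    bam = pysam.AlignmentFile("sample.bam", "rb")
    n = 0
    for aln in bam.fetch(until_eof=True):
        if aln.cigartuples is None:  # unmapped records carry no CIGAR
            continue
        n += 1
    print(n, "records with alignments")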
5 changes: 4 additions & 1 deletion dysgu/extra_metrics.pyx
@@ -14,7 +14,7 @@ import pandas as pd
 pd.options.mode.chained_assignment = None
 from sys import byteorder
 import os
-
+import resource
 ctypedef EventResult EventResult_t


@@ -27,6 +27,9 @@ class BadClipCounter:
         if not self.low_mem:
             self.clip_pos_arr = [array.array("L", []) for i in range(n_references)]  # 32 bit unsigned int
         else:
+            soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+            if soft + (n_references * 2) > soft:
+                resource.setrlimit(resource.RLIMIT_NOFILE, (min(hard, soft + (n_references * 2)), hard))
             self.clip_pos_arr = [open(f"{self.temp_dir}/{i}.badclip.bin", "wb") for i in range(n_references)]

     def tidy(self):
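Context for the rlimit addition (a generic sketch, not dysgu code): low-memory mode opens one temporary file per reference sequence, which on references with thousands of contigs can exceed the default open-file limit, so the soft RLIMIT_NOFILE is raised toward the hard cap first. The same idea as a standalone helper:

    import resource  # POSIX-only

    def ensure_nofile_limit(n_needed):
        # Raise the soft limit so at least n_needed descriptors fit; the hard
        # limit is a ceiling an unprivileged process cannot exceed.
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        if n_needed > soft:
            resource.setrlimit(resource.RLIMIT_NOFILE, (min(hard, n_needed), hard))

    ensure_nofile_limit(2 * 3000 + 256)  # e.g. two files per contig, plus headroom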
8 changes: 5 additions & 3 deletions dysgu/filter_normals.py
@@ -255,18 +255,20 @@ def get_right_clip(r):


 def get_contig_clipped_bases(cont, extend_by_ratio=2):
-    if not cont:
+    if not cont or len(cont) < 20:
         return None, None
     left = ""
     right = ""
     if cont[0].islower():
         i = 1
-        while cont[i].islower():
+        m = len(cont)
+        while i < m and cont[i].islower():
             i += 1
         left = cont[:(extend_by_ratio*i)-1]
     if cont[-1].islower():
         i = -1
-        while cont[i].islower():
+        m = -len(cont)
+        while i > m and cont[i].islower():
             i -= 1
         i = max(0, len(cont) + (extend_by_ratio*i))
         right = cont[i:]
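Why the bounds checks matter (standalone illustration): soft-clipped bases are lowercase in these contigs, and a fully lowercase contig previously walked the index off either end of the string, raising IndexError. The guarded left-hand scan:

    def leading_lowercase_run(cont):
        # Count leading lowercase (soft-clipped) bases without running
        # past the end when the whole contig is lowercase.
        i = 0
        m = len(cont)
        while i < m and cont[i].islower():
            i += 1
        return i

    assert leading_lowercase_run("acgTTTT") == 3
    assert leading_lowercase_run("acgt") == 4  # previously an IndexError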
7 changes: 6 additions & 1 deletion dysgu/graph.pyx
@@ -762,7 +762,7 @@ cdef void add_to_graph(Py_SimpleGraph G, AlignedSegment r, PairedEndScoper_t pe_
         if bnd_site_node != bnd_site_node2 and bnd_site_node2 >= 0 and not G.hasEdge(node_name, bnd_site_node2):
             G.addEdge(node_name, bnd_site_node2, 0)
     # Debug:
-    # if r.qname == "6211787c-d032-4c6a-8f8b-020f616f2055":
+    # if r.qname == "M03762:232:000000000-L65J4:1:2111:17729:15161":
     #     echo("---", r.qname, read_enum, node_name, (event_pos, pos2), length_from_cigar, list(other_nodes))
     # look = {'a4d38568-fd80-4785-8fa5-84ed132b445c', '2313a985-385c-4c84-b02c-dddfc627940b', '0031840a-bd2d-475d-9a04-528f71c7b512'}
     # if r.qname in look:

@@ -1449,6 +1449,11 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, Py_SimpleGrap
     else:
         return
     # Debug:
+    # if 157835 in n2n:
+    #     echo("parts", partitions)
+    #     echo("s_between", support_between)
+    #     echo("s_within", support_within)
+
     # echo("parts", partitions)
     # echo("s_between", support_between)
     # echo("s_within", support_within)
30 changes: 18 additions & 12 deletions dysgu/io_funcs.pyx
@@ -6,14 +6,14 @@ cimport numpy as np
 import logging
 from map_set_utils import merge_intervals, echo
 from collections import defaultdict
-import pkg_resources
+from importlib.metadata import version
 import sortedcontainers
 import pandas as pd
 import os
 import sys
 import gzip
 from dysgu.map_set_utils import Py_BasicIntervalTree
-
+import random

 from libc.stdlib cimport malloc

@@ -28,7 +28,7 @@ cdef char *basemap = [ '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0
                        '\0', '\0', '\0', '\0', 'a', 'a' ]

 np.random.seed(0)
-
+random.seed(0)

 cpdef str reverse_complement(str seq, int seq_len):
     """https://bioinformatics.stackexchange.com/questions/3583/\
@@ -144,11 +144,16 @@ cpdef list col_names(small_output):
     ]


-def make_main_record(r, version, index, format_f, df_rows, add_kind, small_output):
+def make_main_record(r, dysgu_version, index, format_f, df_rows, add_kind, small_output):
     rep, repsc, lenprec = 0, 0, 1
     mean_prob, max_prob = None, None
+    debug = False
+    # if abs(r['posA'] - 39084726) < 10:
+    #     echo('found', dict(r))
+    #     echo(len(format_f) > 1)
+    #     echo([(int(v["su"]), v["posA"], v["event_id"], k) for k, v in df_rows.items()])
+    #     debug = True
     if len(format_f) > 1:
-
         best = sorted([(int(v["su"]), k) for k, v in df_rows.items()], reverse=True)[0][1]
         probs = [v["prob"] for k, v in df_rows.items()]
         mean_prob = np.mean(probs)
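The version parameter rename reads as cosmetic, but it likely avoids shadowing the newly imported version() callable. A minimal illustration of the hazard:

    from importlib.metadata import version

    def broken(version):
        # The parameter shadows the imported function, so this call raises
        # TypeError: 'str' object is not callable.
        return version("dysgu")

    def fixed(dysgu_version):
        return version("dysgu")  # the module-level callable is visible again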
@@ -264,7 +269,7 @@ def make_main_record(r, version, index, format_f, df_rows, add_kind, small_outpu
     fmt_keys = "GT:GQ:NMP:NMS:NMB:MAPQP:MAPQS:NP:MAS:SU:WR:PE:SR:SC:BND:SQC:SCW:SQR:BE:COV:MCOV:LNK:NEIGH:NEIGH10:RB:PS:MS:SBT:NG:NSA:NXA:NMU:NDC:RMS:RED:BCC:FCC:STL:RAS:FAS:ICN:OCN:CMP:RR:JIT:PROB"

     if "variant_seq" in r and isinstance(r["variant_seq"], str):
-        if r['svtype'] == "INS":
+        if r['svtype'] == "INS" or r.variant_seq:
             alt_field = r.variant_seq.upper()
         else:
             alt_field = f"<{r['svtype']}>"
@@ -281,7 +286,7 @@ def make_main_record(r, version, index, format_f, df_rows, add_kind, small_outpu
                    alt_field,
                    ".", "." if "filter" not in r else r['filter'],
                    # INFO line
-                   ";".join([f"SVMETHOD=DYSGUv{version}",
+                   ";".join([f"SVMETHOD=DYSGUv{dysgu_version}",
                              f"SVTYPE={r['svtype']}",
                              f"END={r['posB']}" if r['chrA'] == r['chrB'] else f"END={r['posA'] + 1}",
                              f"CHR2={r['chrB']}" + chr2_pos,
@@ -297,7 +302,8 @@ def make_main_record(r, version, index, format_f, df_rows, add_kind, small_outpu
     # FORMAT line(s)
     for item in format_f.values():
         rec.append(":".join(map(str, item)))
-
+    # if debug:
+    #     echo("returned ", rec)
     return rec

@@ -341,7 +347,7 @@ def gen_format_fields(r, df, names, n_fields, small_output):
             row = cols[name]
             format_fields[name] = get_fmt(row, small_output)
         else:
-            format_fields[name] = [0] * n_fields
+            format_fields[name] = ['0/0'] + [0] * (n_fields - 1)
     return format_fields, cols


@@ -451,8 +457,8 @@ def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=N
                                      contig_names=contig_names) + "\t" + "\t".join(names) + "\n")

     if show_names:
-        logging.info("Input samples: {}".format(str(list(names))))
-    version = pkg_resources.require("dysgu")[0].version
+        logging.info("Samples: {}".format(str(list(names))))
+    dysgu_version = version("dysgu")
     seen_idx = set([])
     cnames = ['raw_reads_10kb', 'NMpri', 'NMsupp', 'MAPQpri', 'MAPQsupp', "NMbase", "n_gaps"]
     for col in cnames:
@@ -489,7 +495,7 @@ def to_vcf(df, args, names, outfile, show_names=True, contig_names="", header=N
         if "partners" in r and r["partners"] is not None and r["partners"] != ".":
             seen_idx |= set(r["partners"])

-        r_main = make_main_record(r, version, count, format_f, df_rows, add_kind, small_output_f)
+        r_main = make_main_record(r, dysgu_version, count, format_f, df_rows, add_kind, small_output_f)
         recs.append(r_main)
         count += 1
Diffs for the remaining 6 changed files are not rendered.
