Skip to content

Commit

Permalink
find fast changes in depth
Browse files Browse the repository at this point in the history
  • Loading branch information
brentp committed Sep 12, 2018
1 parent adf368b commit 4aa6a02
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 13 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ do not have lower than average coverage** compared to regions with similar gc-co
`duphold` takes a **bam/cram**, a **VCF/BCF** of SV calls, and a **fasta** reference and it updates the FORMAT field for a
single sample with:

+ **DHZ**: z-score for the variant depth *relative to the rest of the chromosome* the variant was found on.
+ **DHFC**: fold-change for the variant depth *relative to the rest of the chromosome* the variant was found on
+ **DHBZ**: z-score for the variant depth *relative to bins in the genome with similar GC-content*.
+ **DHBFC**: fold-change for the variant depth *relative to bins in the genome with similar GC-content*.
+ **DHD**: rapid change in depth at one of the break-points (1 for higher (DUP). 0 for no or conflicting changes. -1 for drop (DUP), 2 or -2 for both break points)

It also adds **GCF** to the INFO field indicating the fraction of G or C bases in the variant.

Expand Down
4 changes: 2 additions & 2 deletions duphold.nimble
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# Package

version = "0.0.1"
version = "0.0.2"
author = "Brent Pedersen"
description = "find depth support for DUP/DEL/CNV calls that use PE/SR"
license = "MIT"


# Dependencies

requires "hts >= 0.2.0", "docopt", "genoiser"
requires "docopt#0abba63", "genoiser >= 0.2.1"
srcDir = "src"

bin = @["duphold"]
Expand Down
99 changes: 89 additions & 10 deletions src/duphold.nim
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ type Stats = ref object
n: int
S: float64
m: float64
t: int # total used for DHD

proc update(s:var Stats, d:int, include_zero:bool) {.inline.} =
## streaming mean, sd
Expand All @@ -23,6 +24,20 @@ proc update(s:var Stats, d:int, include_zero:bool) {.inline.} =
s.m += (df - s.m) / s.n.float64
s.S += (df - s.m) * (df - mprev)

proc addm(s:var Stats, d:int) {.inline.} =
# update only the t and n.
# not that this method is never used on a struct where the update method is also used.
s.n += 1
s.t += d

proc dropm(s:var Stats, d:int) {.inline.} =
# drp the t and the n.
s.n -= 1
s.t -= d

proc mean(s:var Stats):float64 {.inline.} =
return s.t.float64 / s.n.float64

proc idepthfun*(aln:Record, posns:var seq[mrange]) =
## depthfun is an example of a `fun` that can be sent to `genoiser`.
## it sets reports the depth at each position
Expand Down Expand Up @@ -66,6 +81,65 @@ proc get_or_empty(variant:Variant, field:string, input:var seq[float32]) =
for i, f in input:
input[i] = cast[float32](bcf_float_missing)

proc check_rapid_depth_change(start:int, stop:int, values: var seq[int32], w:int=6): int32 =
## if start and end indicate the bounds of a deletion, we can often expect to see a rapid change in
## depth at or near the break-point.
var
# could use CI for this, but larger CI == less confident anyway.
dist = min(80, max(20, 0.05 * (stop - start).float64).int)
# if we see too many changes then we can't trust the result.
changes = 0
d: float64
last_change = 0

for bi, bp in @[start, stop]:
var
cs = bp - dist
left = Stats()
right = Stats()

for i in (cs-w)..<(cs):
left.addm(values[i])
for i in cs..<(cs + w):
right.addm(values[i])

for k in cs..(bp + dist + w):
if k - last_change > w:
var
lm = left.mean
rm = right.mean
if lm < 8 and rm < 8: continue
if bi == 0:
d = rm / lm
else:
d = lm / rm
#if (k > start - 10 and k < start + 10) or (k > stop - 10 and k < stop + 10):
# echo k, " ",d

# in addition to normal change (1.25), we have special-case here for when we're very close to either break-point.
# the changes <= bi makes sure we dont mess up an existing, better change that was
# already found.
if d > 1.25 or (changes <= bi and d > 1.15 and (((k - start).abs < 2) or (k - stop).abs < 2)):
changes += 1
#echo "DUP position:", k, " fc:", d, " changes:", changes, " left:", left.mean, " right:", right.mean
last_change = k
result += 1
if d < 0.7 or (changes <= bi and d < 0.8 and (((k - start).abs < 2) or (k - stop).abs < 2)):
changes += 1
last_change = k
#echo "DEL position:", k, " fc:", d, " changes:", changes, " left:", left.mean, " right:", right.mean
result -= 1
# now update left and right
left.dropm(values[k-w])
left.addm(values[k])

right.dropm(values[k])
right.addm(values[k + w])

#echo "FINISHED:", "changes:", changes, "result:", result
if changes > 2:
result = 0

proc add_stats(variant:Variant, values:var seq[int32], sample_i: int, stats:Stats, gc_stats:var seq[Stats], fai:Fai) =
var
s = variant.start
Expand All @@ -78,7 +152,7 @@ proc add_stats(variant:Variant, values:var seq[int32], sample_i: int, stats:Stat
#echo gci, " ", gc, " ", gc_stat.n, " ", gc_stat.S

var local_stats = Stats()
for i in s..<e:
for i in (s+1)..e:
local_stats.update(values[i], true)

var tmp = @[gc]
Expand All @@ -87,11 +161,11 @@ proc add_stats(variant:Variant, values:var seq[int32], sample_i: int, stats:Stat

var floats = newSeq[float32](variant.vcf.n_samples)

get_or_empty(variant, "DHZ", floats)
var z = (local_stats.m - stats.m) / sqrt(stats.S/stats.n.float64)
floats[sample_i] = z.float32
if variant.format.set("DHZ", floats) != Status.OK:
quit "error setting DHZ in VCF"
#get_or_empty(variant, "DHZ", floats)
#var z = (local_stats.m - stats.m) / sqrt(stats.S/stats.n.float64)
#floats[sample_i] = z.float32
#if variant.format.set("DHZ", floats) != Status.OK:
# quit "error setting DHZ in VCF"

get_or_empty(variant, "DHFC", floats)
var fc = local_stats.m / stats.m
Expand All @@ -111,6 +185,11 @@ proc add_stats(variant:Variant, values:var seq[int32], sample_i: int, stats:Stat
if variant.format.set("DHBFC", floats) != Status.OK:
quit "error setting DHBFC in VCF"

var dhd = @[check_rapid_depth_change(s, e, values)]
if variant.format.set("DHD", dhd) != Status.OK:
quit "error setting DHD in VCF"


iterator duphold*(bam:Bam, vcf:VCF, fai:Fai, sample_i:int, step:int=STEP): Variant =
var depths : Fun#(values:new_seq[int32](), f:idepthfun)
var
Expand Down Expand Up @@ -206,18 +285,18 @@ Options:
if not open(ovcf, $args["--output"], mode="w"):
quit "unable to open output vcf"


if vcf.header.add_info("GCF", "1", "Float", "GC-content fraction for the variant region betwee 0 and 1.") != Status.OK:
quit "unable to add to header"

if vcf.header.add_format("DHZ", "1", "Float", "duphold z-score for depth") != Status.OK:
quit "unable to add to header"
#if vcf.header.add_format("DHZ", "1", "Float", "duphold z-score for depth") != Status.OK:
# quit "unable to add to header"
if vcf.header.add_format("DHFC", "1", "Float", "duphold depth fold-change") != Status.OK:
quit "unable to add to header"
if vcf.header.add_format("DHBZ", "1", "Float", "duphold z-score for depth compared to bins with matching GC") != Status.OK:
quit "unable to add to header"
if vcf.header.add_format("DHBFC", "1", "Float", "duphold depth fold-change compared to bins with matching GC") != Status.OK:
quit "unable to add to header"
if vcf.header.add_format("DHD", "1", "Integer", "duphold rapid change in depth at one of the break-points (1 for higher. 0 for no or conflicting changes. -1 for drop, 2 for both break points)") != Status.OK:
quit "unable to add to header"

ovcf.header = vcf.header

Expand Down

0 comments on commit 4aa6a02

Please sign in to comment.