/*
Barracuda - A Short Sequence Aligner for NVIDIA Graphics Cards
Module: barracuda.cu - CUDA alignment and samse kernels
Copyright (C) 2012, University of Cambridge Metabolic Research Labs.
Contributors: Petr Klus, Dag Lyberg, Simon Lam and Brian Lam
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 3
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
This program is based on a modified version of BWA
File Creation date: 2012.6.8
*/
/* (0.7.0) beta: $Revision: 1.12 $
27 Feb 2015 WBL Remove all changes to cuda_dfs_match
25 Feb 2015 WBL Apply r1.55-62 scache_global_bwt
21 Feb 2015 WBL Apply r1.25 mycache only NOT sequence_global sequence_stride
redo r1.47 __align__ mycache
11 Feb 2015 WBL taken from barracuda.cu r1.82
Revert to r1.2 and provide old bwt_cuda_match_exact instead
*/
__device__ inline unsigned int numbits(unsigned int i, unsigned char c)
// Given a 32-bit word i holding a sequence encoded with 2 bits per symbol,
// count the number of occurrences of the 2-bit symbol c in i
{
i = ((c&2)?i:~i)>>1&((c&1)?i:~i)&0x55555555;
i = (i&0x33333333)+(i>>2&0x33333333);
return((i+(i>>4)&0x0F0F0F0F)*0x01010101)>>24;
}
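// Illustrative worked example (A=0, C=1, G=2, T=3 encoding assumed): the word
// 0x000000E4u packs the 2-bit symbols 3,2,1,0 (T,G,C,A) in its low byte and
// twelve more 0 (A) pairs above it. The first line keeps a 1 in the low bit of
// every pair equal to c and the remaining lines are a standard bit-parallel
// popcount, so numbits(0x000000E4u, 2) == 1 and numbits(0x000000E4u, 0) == 13.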
__device__ void bwt_cuda_device_calculate_width (uint32_t * global_bwt, unsigned char* sequence, unsigned int * widths, unsigned char * bids, unsigned short offset, unsigned short length)
//Calculate bids and widths for the worst-case bound; fills widths[offset..length] and bids[offset..length]
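//The loop below is the standard BWT backward-search interval update: for each base c,
//k = L2[c] + Occ(c, k-1) + 1 and l = L2[c] + Occ(c, l), and bid is incremented each
//time the interval [k,l] empties, giving a lower bound on the number of differences
//needed up to that position.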
{
#if DEBUG_LEVEL > 8
printf ("in cal width\n");
#endif
unsigned short bid;
//suffix array interval k(lower bound) and l(upper bound)
bwtint_t k, l;
unsigned int i;
int last = -1;
#ifdef scache_global_bwt
D_mycache;
#else
//may need non-default __align__ to allow efficient access from __ldg()
#ifdef mycache4
__align__(16) uint32_t mycache[size_mycache];
#else
#ifdef mycache2
__align__(8) uint32_t mycache[size_mycache];
#else
uint32_t mycache[size_mycache];
#endif
#endif
#endif /*scache_global_bwt*/
// do calculation and update w and bid
bid = 0;
k = 0;
l = bwt_cuda.seq_len;
//printf("seq_len: %llu\n", bwt_cuda.seq_len);
// printf("from GPU\n");
// printf("k&l calculations\n");
for (i = offset; i < length; ++i) {
unsigned char c = sequence[i];
//printf("\n\nbase %u,%u\n",i,c);
//printf("k: %llu l: %llu\n",k,l);
if (c < 4) {
//printf("Calculating k\n");
//unsigned long long startK = k;
//unsigned long long tmpK = ((k==0)?0:bwt_cuda_occ(global_bwt, k - 1, c)) + 1;
k = bwt_cuda.L2[c] + ((k==0)?0:bwt_cuda_occ(global_bwt, k - 1, c, 0, &last,l_mycache0)) + 1;
//printf("Calculating l\n");
//unsigned long long startL = l;
//unsigned long long tmpL = bwt_cuda_occ(global_bwt, l, c);
l = bwt_cuda.L2[c] + bwt_cuda_occ(global_bwt, l, c, 1, &last,l_mycache0);
//printf("%i occ: %llu\n", c, bwt_cuda.L2[c]);
//if(offset==0) printf("i:%d,c:%d,bwt:%u,\n", i, c, bwt_cuda.L2[c]);
// printf("k:%u,",k);
// printf("l:%u\n",l);
}
if (k > l || c > 3) {
k = 0;
l = bwt_cuda.seq_len;
++bid;
}
widths[i] = l - k + 1;
bids[i] = bid;
}
widths[length] = k + 1;
bids[length] = bid;
//printf("\n\n here comes width and bids\n");
//for(i = 0; i < length; ++i){
// printf("%u,%d,%u,%u\n",i,sequence[i],widths[i],bids[i]);
// }
return;
}
__device__ int bwa_cuda_cal_maxdiff(int l, double err, double thres)
{
double elambda = exp(-l * err);
double sum, y = 1.0;
int k, x = 1;
for (k = 1, sum = elambda; k < 1000; ++k) {
y *= l * err;
x *= k;
sum += elambda * y / x;
if (1.0 - sum < thres) return k;
}
return 2;
}
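// The sum above is the Poisson(lambda = l * err) cumulative probability
// sum_{x<=k} e^-lambda * lambda^x / x!, so the function returns the smallest k
// whose tail probability 1 - sum falls below thres. For example, approximately:
// a 36 bp read with err = 0.02 and thres = 0.04 gives lambda = 0.72 and returns 2.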
__device__ void gap_stack_shadow_cuda(int x, bwtint_t max, int last_diff_pos, unsigned int * width, unsigned char * bid)
{
int i, j;
for (i = j = 0; i < last_diff_pos; ++i)
{
if (width[i] > x)
{
width[i] -= x;
}
else if (width[i] == x)
{
bid[i] = 1;
width[i] = max - (++j);
} // else should not happen
}
}
__device__ unsigned int int_log2_cuda(uint32_t v)
//integer log2 (floor)
{
unsigned int c = 0;
if (v & 0xffff0000u) { v >>= 16; c |= 16; }
if (v & 0xff00) { v >>= 8; c |= 8; }
if (v & 0xf0) { v >>= 4; c |= 4; }
if (v & 0xc) { v >>= 2; c |= 2; }
if (v & 0x2) c |= 1;
return c;
}
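// e.g. int_log2_cuda(12) == 3 and int_log2_cuda(1u << 20) == 20, i.e. floor(log2(v))
// for v > 0 (returns 0 for v == 0).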
//////////////////////////////////////////////////////////////////////////////////////////
//DFS MATCH
//////////////////////////////////////////////////////////////////////////////////////////
__device__ void cuda_dfs_initialize(ulong2 *stack, uchar4 *stack_mm, uint2 *position_data, char4 *pushes/*, int * scores*/)
//initialize memory store for dfs_match
{
int i;
ulong2 temp1 = {0,0};
uchar4 temp2 = {0,0,0,0};
char4 temp3 = {0,0,0,0};
uint2 temp4 = {0,0};
// zero out the whole stack
for (i = 0; i < MAX_ALN_LENGTH; i++)
{
stack[i] = temp1;
stack_mm[i] = temp2;
pushes[i] = temp3;
position_data[i] = temp4;
}
return;
}
__device__ void cuda_dfs_push(ulong2 *stack, uchar4 *stack_mm, uint2* position_data, char4 *pushes, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int state, int is_diff, int current_stage)
//create a new entry in memory store
{
//printf(",%i/%i", i, current_stage);
stack[current_stage].x = k;
stack[current_stage].y = l;
//stack[current_stage].z = i;
//if (is_diff)stack[current_stage].w = i;
position_data[current_stage].x = i;
if(is_diff) position_data[current_stage].y = i;
//printf("length pushed: %u\n", i);
stack_mm[current_stage].x = n_mm;
stack_mm[current_stage].y = n_gapo;
stack_mm[current_stage].z = n_gape;
stack_mm[current_stage].w = state;
char4 temp = {0,0,0,0};
pushes[current_stage] = temp;
return;
}
__device__ int cuda_dfs_match(uint32_t * global_bwt, int len, const unsigned char *str, unsigned int *widths, unsigned char *bids, const gap_opt_t *opt, alignment_meta_t *aln, barracuda_aln1_t* alns, init_info_t* init, int best_score, int full_len, char seeding)
//This function tries to find the alignment of the sequence and returns SA coordinates, no. of mismatches, gap openings and extensions
//It uses a depth-first search approach rather than breadth-first as the memory available in CUDA is far less than in CPU mode
//The search is rooted at the last char [len] of the sequence and proceeds towards the first, starting with the whole BWT as the reference interval,
//progressively narrowing the k (lower) & l (upper) SA boundaries until it reaches the first char [i = 0]; if k <= l then a match is found.
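//Rather than recursing, the search keeps an explicit stack indexed by current_stage:
//entries_info holds the (k,l) interval, position_data the position and last-difference
//position, entries_scores the mismatch/gap counts and state, and done_push_types records
//which daughter nodes (exact match / mismatch / deletion / insertion) have already been
//pushed, so a stage can be resumed when the search backtracks to it.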
{
#if DEBUG_LEVEL > 6
printf ("in dfs match\n");
#endif
//Initialisations
int start_pos = init->start_pos;
int remain = full_len - len - start_pos;
aln->pos = init->start_pos + len;
int best_diff = start_pos==0 ? opt->max_diff + 1 : init->best_diff;
int best_cnt = init->best_cnt;
const bwt_t * bwt = &bwt_cuda; // rbwt for sequence 0 and bwt for sequence 1;
int current_stage = 0;
ulong2 entries_info[MAX_ALN_LENGTH];
uint2 position_data[MAX_ALN_LENGTH];
uchar4 entries_scores[MAX_ALN_LENGTH];
char4 done_push_types[MAX_ALN_LENGTH];
int n_aln = 0;
int loop_count = 0;
const int max_count = options_cuda.max_entries;
/*if(init->has_exact){
best_score = 0;
}
else */if(start_pos) {
best_score = init->score;
}
int max_diff_base, allow_gap, max_no_partial_hits;
if(seeding){
max_diff_base = opt->max_seed_diff;
max_no_partial_hits = MAX_NO_SEEDING_PARTIALS;
}
else {
max_diff_base = opt->max_diff;
max_no_partial_hits = MAX_NO_REGULAR_PARTIALS;
}
if(options_cuda.fast){
allow_gap = seeding ? 0 : options_cuda.max_gapo;
}
else {
allow_gap = opt->max_gapo;
}
//There are other places where these could be hoisted (i.e. not recalculated on every loop iteration) but they (empirically) slow things down. No idea why! Arran
uchar2 mode = {options_cuda.mode & BWA_MODE_NONSTOP, options_cuda.mode & BWA_MODE_GAPE};
#define USE_MODE_NONSTOP mode.x
#define USE_MODE_GAPE mode.y
//printf("worst_score:%u, query sequence length: %i\n", best_score,len);
//Initialise memory stores first in, last out
//Zero-ing the values is unnecessary because the push overwrites everything
cuda_dfs_initialize(entries_info, entries_scores, position_data, done_push_types/*, scores*/); //initialize initial entry, current stage set at 0 and done push type = 0
//push first entry, the first char of the query sequence into memory stores for evaluation
cuda_dfs_push(entries_info, entries_scores, position_data, done_push_types, len, init->lim_k, start_pos>0 ? init->lim_l : bwt->seq_len, init->cur_n_mm, init->cur_n_gapo, init->cur_n_gape, 0, 0, current_stage); //push initial entry to start
#if ARRAN_DEBUG_LEVEL > 1
char bases[4] = {'A', 'C', 'G', 'T'};
int start_score = init->cur_n_mm * options_cuda.s_mm + init->cur_n_gapo * options_cuda.s_gapo + init->cur_n_gape * options_cuda.s_gape;
printf("\n---STARTING LOOP start_pos: %i k: %lu l: %lu start_score: %i best_score: %i", start_pos, init->lim_k, entries_info[0].y, start_score, best_score);
#endif
ulong4 cuda_cnt_k = {0,0,0,0};
while(current_stage >= 0)
{
int i,j, m;
int hit_found, allow_diff, allow_M;
bwtint_t k, l;
char e_n_mm, e_n_gapo, e_n_gape, e_state;
loop_count ++;
//define break from loop conditions
if ((remain==0 && n_aln == options_cuda.max_aln) || n_aln==max_no_partial_hits) {
#if DEBUG_LEVEL > 7 || ARRAN_DEBUG_LEVEL > 1
printf("\n\n*****breaking on n_aln == max_aln\n");
#endif
break;
}
if (remain==0 && best_cnt > options_cuda.max_top2) {
#if DEBUG_LEVEL > 7 || ARRAN_DEBUG_LEVEL > 1
printf("\n\n*****breaking on best_cnt>... %i>%i\n", best_cnt, options_cuda.max_top2);
#endif
break;
}
if (loop_count > max_count) {
#if DEBUG_LEVEL > 7 || ARRAN_DEBUG_LEVEL > 1
printf("\n\n*****loop_count > max_count\n");
#endif
break;
}
//put extracted entry into local variables
k = entries_info[current_stage].x; // SA interval
l = entries_info[current_stage].y; // SA interval
//i = entries_info[current_stage].z; // length
i = position_data[current_stage].x;
//printf(",%i/%i", i, current_stage);
e_n_mm = entries_scores[current_stage].x; // no of mismatches
e_n_gapo = entries_scores[current_stage].y; // no of gap openings
e_n_gape = entries_scores[current_stage].z; // no of gap extensions
e_state = entries_scores[current_stage].w; // state (M/I/D)
//calculate score
//Storing this in position_data[current_stage].z has been tested by including score+relevant_penalty in
//calls to cuda_dfs_push but it is slower than calculating each time
int score = e_n_mm * options_cuda.s_mm + e_n_gapo * options_cuda.s_gapo + e_n_gape * options_cuda.s_gape;
int worst_tolerated_score = (USE_MODE_NONSTOP)? 1000: best_score + options_cuda.s_mm;
if(score > worst_tolerated_score) {
#if DEBUG_LEVEL > 7 || ARRAN_DEBUG_LEVEL > 1
printf("\n\n*****breaking on score(%i) > worst_tolerated_score(%i)\n", score, worst_tolerated_score);
#endif
break;
}
int max_diff = max_diff_base;
//calculate the allowance for differences
m = max_diff - e_n_mm - e_n_gapo;
#if DEBUG_LEVEL > 8
printf("dfs stage %u:",current_stage);
printf("k:%lu, l: %lu, i: %u, diff remaining :%u, best_score: %u, n_mm:%u\n", k, l,i,m, best_score, e_n_mm);
#endif
if (USE_MODE_GAPE) m -= e_n_gape;
// check if the entry is outside the boundary or is over the max diff allowed
if (m < 0 || (i > 0 && m < bids[i-1+remain]))
{
#if DEBUG_LEVEL > 8 || ARRAN_DEBUG_LEVEL > 1
printf("breaking: %d, m:%d\n", bids[i-1+remain],m);
#endif
current_stage --;
continue;
}
// check whether a hit (full sequence when it reaches the last char, i.e. i = 0) is found, if it is, record the alignment information
hit_found = 0;
if (!i)
{
hit_found = 1;
}else if (!m) // alternatively, if no more differences are allowed, just do exact match
{
if (e_state == STATE_M ||(options_cuda.mode&BWA_MODE_GAPE) || e_n_gape == opt->max_gape)
{
if (bwt_cuda_match_exact(global_bwt, i, str, &k, &l))
{
#if ARRAN_DEBUG_LEVEL > 1
printf("\n[aln_core][exact_match] n_aln: %i pos: %i k: %lu l: %lu mm: %i gapo: %i gape: %i state: %i score: %i seq: %i", n_aln, start_pos + len - i, k, l, e_n_mm, e_n_gapo, e_n_gape, e_state, score, init->sequence_id);
#endif
hit_found = 1;
}else
{
#if ARRAN_DEBUG_LEVEL > 1
printf("\n[aln_core][no_exact] n_aln: %i pos: %i k: %lu l: %lu mm: %i gapo: %i gape: %i state: %i score: %i seq: %i", n_aln, start_pos + len - i, k, l, e_n_mm, e_n_gapo, e_n_gape, e_state, score, init->sequence_id);
#endif
current_stage --;
continue; // if there is no hit, then go backwards to parent stage
}
}
}
if (hit_found)
{
// action for found hits
//int do_add = 1;
if (score < best_score)
{
best_score = score;
best_diff = e_n_mm + e_n_gapo + (options_cuda.mode & BWA_MODE_GAPE) * e_n_gape;
best_cnt = 0; //reset best cnt if new score is better
if (!(options_cuda.mode & BWA_MODE_NONSTOP))
max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour
}
if (score == best_score) best_cnt += l - k + 1;
if (e_n_gapo)
{ // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat
// if this alignment was already found, do not add to alignment record array unless the new score is better.
for (j = 0; j < n_aln; ++j)
if (alns[j].k == k && alns[j].l == l) break;
if (j < n_aln)
{
if (score < alns[j].score)
{
alns[j].score = score;
alns[j].n_mm = e_n_mm;
alns[j].n_gapo = e_n_gapo;
alns[j].n_gape = e_n_gape;
alns[j].best_cnt = best_cnt;
}
//do_add = 0;
hit_found = 0;
}
}
if (hit_found)
{ // append result to the alignment record array
gap_stack_shadow_cuda(l - k + 1, bwt->seq_len, position_data[current_stage].y, widths, bids);
// record the number of mismatches, gap openings and gap extensions
alns[n_aln].n_mm = entries_scores[current_stage].x;
alns[n_aln].n_gapo = entries_scores[current_stage].y;
alns[n_aln].n_gape = entries_scores[current_stage].z;
// the suffix array interval
alns[n_aln].k = k;
alns[n_aln].l = l;
alns[n_aln].score = score;
alns[n_aln].best_cnt = best_cnt;
#if ARRAN_DEBUG_LEVEL > 1
printf("\n[aln_core][hit_found] n_aln: %i pos: %i k: %lu l: %lu mm: %i gapo: %i gape: %i state: %i score: %i seq: %i", n_aln, start_pos + len - i, k, l, e_n_mm, e_n_gapo, e_n_gape, e_state, score, init->sequence_id);
#endif
++n_aln;
}
current_stage --;
continue;
}
// proceed and evaluate the next base on sequence
--i;
// retrieve occurrence values and determine all the eligible daughter nodes; done only once on the first visit to a stage and skipped when revisiting it
bwtint_t ks[MAX_ALN_LENGTH][4], ls[MAX_ALN_LENGTH][4];
char eligible_cs[MAX_ALN_LENGTH][5], no_of_eligible_cs=0;
if(!done_push_types[current_stage].x)
{
if(k) {
cuda_cnt_k = bwt_cuda_occ4(global_bwt, k-1);
}
ulong4 cuda_cnt_l = bwt_cuda_occ4(global_bwt, l);
ks[current_stage][0] = bwt->L2[0] + cuda_cnt_k.x + 1;
ls[current_stage][0] = bwt->L2[0] + cuda_cnt_l.x;
ks[current_stage][1] = bwt->L2[1] + cuda_cnt_k.y + 1;
ls[current_stage][1] = bwt->L2[1] + cuda_cnt_l.y;
ks[current_stage][2] = bwt->L2[2] + cuda_cnt_k.z + 1;
ls[current_stage][2] = bwt->L2[2] + cuda_cnt_l.z;
ks[current_stage][3] = bwt->L2[3] + cuda_cnt_k.w + 1;
ls[current_stage][3] = bwt->L2[3] + cuda_cnt_l.w;
if (ks[current_stage][0] <= ls[current_stage][0])
{
eligible_cs[current_stage][no_of_eligible_cs++] = 0;
}
if (ks[current_stage][1] <= ls[current_stage][1])
{
eligible_cs[current_stage][no_of_eligible_cs++] = 1;
}
if (ks[current_stage][2] <= ls[current_stage][2])
{
eligible_cs[current_stage][no_of_eligible_cs++] = 2;
}
if (ks[current_stage][3] <= ls[current_stage][3])
{
eligible_cs[current_stage][no_of_eligible_cs++] = 3;
}
eligible_cs[current_stage][4] = no_of_eligible_cs;
#if ARRAN_DEBUG_LEVEL > 1
if(start_pos + len - i ARRAN_DEBUG_POSITION){
printf("\nScore / Worst: %i/%i", score, worst_tolerated_score);
printf("\n%i %c i: %i k: %lu l: %lu OK: %i\n", start_pos + len - i, bases[str[i]], i, k, l, no_of_eligible_cs);
for(int q=0; q<len; q++){
printf("%c", bases[str[q]]);
}
for(int q=0; q<4; q++){
printf("\n%c l: %lu k: %lu %s", bases[q], ks[current_stage][q], ls[current_stage][q], ks[current_stage][q]<=ls[current_stage][q] ? "OK" : "");
}
}
#endif
}else
{
no_of_eligible_cs = eligible_cs[current_stage][4];
}
// test whether difference is allowed
allow_diff = 1;
allow_M = 1;
if (i)
{
if (bids[i-1+remain] > m -1)
{
allow_diff = 0;
//printf("\n...%i %i %i", real_pos, bids[i-1+remain], m);
}else if (bids[i-1+remain] == m-1 && bids[i+remain] == m-1 && widths[i-1+remain] == widths[i+remain])
{
allow_M = 0;
}
}
//done_push_types stores, for each stage, whether each prospective daughter node has been evaluated or not
//done_push_types[current_stage].x exact match, =0 not done, =1 done
//done_push_types[current_stage].y mismatches, =0 not done, =no. of eligible cs with k<=l when done
//done_push_types[current_stage].z deletions, =0 not done, =no. of eligible cs with k<=l when done
//done_push_types[current_stage].w insertions, =0 not done, =1 done
//.z and .w are shared between gap openings and extensions as they are mutually exclusive
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// exact match
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//try exact match first
if (!done_push_types[current_stage].x)
{
#if DEBUG_LEVEL >8
printf("trying exact\n");
#endif
int c = str[i];
done_push_types[current_stage].x = 1;
if (c < 4)
{
#if DEBUG_LEVEL > 8
printf("c:%i, i:%i\n",c,i);
printf("k:%u\n",ks[current_stage][c]);
printf("l:%u\n",ls[current_stage][c]);
#endif
if (ks[current_stage][c] <= ls[current_stage][c])
{
#if DEBUG_LEVEL > 8
printf("ex match found\n");
#endif
#if ARRAN_DEBUG_LEVEL > 1 && ARRAN_PRINT_DFS > 0 && ARRAN_PRINT_EXACT_MATCHES > 0
printf("\n--- E %i %c stg: %i", start_pos + len - i, bases[c], current_stage);
#endif
cuda_dfs_push(entries_info, entries_scores, position_data, done_push_types, i, ks[current_stage][c], ls[current_stage][c], e_n_mm, e_n_gapo, e_n_gape, STATE_M, 0, current_stage+1);
current_stage++;
continue;
}
}
}else if (score == worst_tolerated_score)
{
allow_diff = 0;
}
if (allow_diff)
{
#if DEBUG_LEVEL > 8
printf("trying inexact...\n");
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// mismatch
////////////////////////////////////////////////////////////////////////////////////////////////////////////
if (done_push_types[current_stage].y < no_of_eligible_cs) //check if done before
{
int c = eligible_cs[current_stage][(done_push_types[current_stage].y)];
done_push_types[current_stage].y++;
if(c==str[i]){
c = eligible_cs[current_stage][(done_push_types[current_stage].y)];
done_push_types[current_stage].y++;
}
if (allow_M) // daughter node - mismatch
{
if (score + options_cuda.s_mm <= worst_tolerated_score) //skip if prospective entry is beyond worst tolerated
{
if (done_push_types[current_stage].y <= no_of_eligible_cs)
{
#if ARRAN_DEBUG_LEVEL > 1 && ARRAN_PRINT_DFS > 0 && ARRAN_PRINT_MISMATCHES > 0
printf("\n### M %i %c stg: %i", start_pos + len - i, bases[c], current_stage);
#endif
cuda_dfs_push(entries_info, entries_scores, position_data, done_push_types, i, ks[current_stage][c], ls[current_stage][c], e_n_mm + 1, e_n_gapo, e_n_gape, STATE_M, 1, current_stage+1);
current_stage++;
continue;
}
}
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Indels (Insertions/Deletions)
////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define USE_OLD_GAP_CODE 0
#if USE_OLD_GAP_CODE==0
unsigned int tmp = (options_cuda.mode & BWA_MODE_LOGGAP)? (int_log2_cuda(e_n_gape + e_n_gapo)>>1) + 1 : e_n_gapo + e_n_gape;
signed char * done_type;
uint2 push_data;
uchar4 gap_meta, gap_meta2;
#define GAP_TYPE gap_meta.x
#define GAP_PENALTY gap_meta.y
#define GAP_COUNT gap_meta.z
#define GAP_LIMIT gap_meta.w
#define GAP_INC gap_meta2.x
#define TYPE_LIMIT gap_meta2.y
#define DELTA_I gap_meta2.z
#define DEL_MAX gap_meta2.w
#define PUSH_K push_data.x
#define PUSH_L push_data.y
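//The aliases above let a single push path handle all four gap cases (open vs. extend,
//insertion vs. deletion): the two if/else blocks below just select the appropriate
//penalty, count, limit, done-flag and (k,l) interval before the shared bounds checks
//and the final cuda_dfs_push.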
if(e_state){ //extending a gap
GAP_TYPE = e_state;
GAP_PENALTY = options_cuda.s_gape;
GAP_COUNT = e_n_gape;
GAP_LIMIT = opt->max_gape;
GAP_INC = 0;
}
else { //opening a gap
GAP_TYPE = !done_push_types[current_stage].w ? STATE_I : STATE_D;
GAP_PENALTY = options_cuda.s_gapo;
GAP_COUNT = e_n_gapo;
GAP_LIMIT = allow_gap;
GAP_INC = 1;
}
if(GAP_TYPE & STATE_I){
done_type = &(done_push_types[current_stage].w);
TYPE_LIMIT = 1;
PUSH_K = k;
PUSH_L = l;
DELTA_I = 0;
DEL_MAX = 1;
}
else {
done_type = &(done_push_types[current_stage].z);
TYPE_LIMIT = no_of_eligible_cs;
PUSH_K = ks[current_stage][(eligible_cs[current_stage][(done_push_types[current_stage].z)])];
PUSH_L = ls[current_stage][(eligible_cs[current_stage][(done_push_types[current_stage].z)])];
DELTA_I = 1;
// max_del_occ is only checked when extending a deletion; !e_state means we are opening a gap
DEL_MAX = !e_state || e_n_gape + e_n_gapo < max_diff || l - k < options_cuda.max_del_occ;
}
(*done_type)++;
// the order of these if-statements has been optimised through trial & error / bubble sort
if(i + remain >= options_cuda.indel_end_skip + tmp && start_pos + len - i >= options_cuda.indel_end_skip + tmp){
if(score+GAP_PENALTY <= worst_tolerated_score){
if(GAP_COUNT < GAP_LIMIT){
if(*done_type <= TYPE_LIMIT){
if(DEL_MAX){
#if ARRAN_DEBUG_LEVEL > 1 && ARRAN_PRINT_DFS > 0 && ARRAN_PRINT_GAPS > 0
printf("\n@@@ G%s %s %i stg: %i", e_state ? "E" : "O", GAP_TYPE & STATE_I ? "I" : "D", start_pos + len - i, current_stage);
#endif
current_stage++;
cuda_dfs_push(entries_info, entries_scores, position_data, done_push_types, i + DELTA_I, PUSH_K, PUSH_L, e_n_mm, e_n_gapo + GAP_INC, e_n_gape + 1 - GAP_INC, GAP_TYPE, 1, current_stage);
continue;
}
}
}
}
}
#else
unsigned int occ;
//int allow_gap = 0;
if(allow_gap){
if (!e_state) // daughter node - opening a gap insertion or deletion
{
if (score + options_cuda.s_gapo <=worst_tolerated_score) //skip if prospective entry is beyond worst tolerated
{
if (e_n_gapo < opt->max_gapo)
{
if (!done_push_types[current_stage].w)
{ //insertions
done_push_types[current_stage].w = 1;
unsigned int tmp = (options_cuda.mode & BWA_MODE_LOGGAP)? (int_log2_cuda(e_n_gape + e_n_gapo))>>1 + 1 : e_n_gapo + e_n_gape;
if (i >= options_cuda.indel_end_skip + tmp && len - i >= options_cuda.indel_end_skip + tmp)
{
current_stage++;
cuda_dfs_push(entries_info, entries_scores, position_data, done_push_types, i, k, l, e_n_mm, e_n_gapo + 1, e_n_gape, STATE_I, 1, current_stage);
continue;
}
}
else if (done_push_types[current_stage].z < no_of_eligible_cs) //check if done before
{ //deletions
unsigned int tmp = (options_cuda.mode & BWA_MODE_LOGGAP)? (int_log2_cuda(e_n_gape + e_n_gapo))>>1 + 1 : e_n_gapo + e_n_gape;
if(i + remain >= options_cuda.indel_end_skip + tmp && real_pos >= options_cuda.indel_end_skip + tmp)
{
int c = eligible_cs[current_stage][(done_push_types[current_stage].z)];
done_push_types[current_stage].z++;
cuda_dfs_push(entries_info, entries_scores, position_data, done_push_types, i + 1, ks[current_stage][c], ls[current_stage][c], e_n_mm, e_n_gapo + 1, e_n_gape, STATE_D, 1, current_stage+1);
current_stage++; //advance stage number by 1
continue;
}
else
{
done_push_types[current_stage].z++;
}
}
}
}
}else if (e_state == STATE_I) //daughter node - extend an insertion entry
{
if(!done_push_types[current_stage].w) //check if done before
{
done_push_types[current_stage].w = 1;
if (e_n_gape < opt->max_gape) //skip if no of gap ext is beyond limit
{
if (score + options_cuda.s_gape <=worst_tolerated_score) //skip if prospective entry is beyond worst tolerated
{
unsigned int tmp = (options_cuda.mode & BWA_MODE_LOGGAP)? (int_log2_cuda(e_n_gape + e_n_gapo))>>1 + 1 : e_n_gapo + e_n_gape;
if (i >= options_cuda.indel_end_skip + tmp && len - i >= options_cuda.indel_end_skip + tmp)
{
current_stage++; //advance stage number by 1
cuda_dfs_push(entries_info, entries_scores, position_data, done_push_types, i, k, l, e_n_mm, e_n_gapo, e_n_gape + 1, STATE_I, 1, current_stage);
continue; //skip the rest and proceed to next stage
}
}
}
}
}else if (e_state == STATE_D) //daughter node - extend a deletion entry
{
occ = l - k + 1;
if (done_push_types[current_stage].z < no_of_eligible_cs) //check if done before
{
if (e_n_gape < opt->max_gape) //skip if no of gap ext is beyond limit
{
if (score + options_cuda.s_gape <=worst_tolerated_score) //skip if prospective entry is beyond worst tolerated
{
if (e_n_gape + e_n_gapo < max_diff || occ < options_cuda.max_del_occ)
{
unsigned int tmp = (options_cuda.mode & BWA_MODE_LOGGAP)? (int_log2_cuda(e_n_gape + e_n_gapo))>>1 + 1 : e_n_gapo + e_n_gape;
if (i >= options_cuda.indel_end_skip + tmp && len - i >= options_cuda.indel_end_skip + tmp)
{
int c = eligible_cs[current_stage][(done_push_types[current_stage].z)];
done_push_types[current_stage].z++;
cuda_dfs_push(entries_info, entries_scores, position_data, done_push_types, i + 1, ks[current_stage][c], ls[current_stage][c], e_n_mm, e_n_gapo, e_n_gape + 1, STATE_D, 1, current_stage+1);
current_stage++; //advance stage number
continue;
}
}
}
}
else
{
done_push_types[current_stage].z++;
}
}
} //end else if (e_state == STATE_D)
} //end allow_gap
//end USE_OLD_GAP_CODE==1
#endif
}//end if (allow_diff)
current_stage--;
} //end while loop
aln->no_of_alignments = n_aln;
aln->best_cnt = best_cnt;
#if ARRAN_DEBUG_LEVEL > 1
printf("\n[aln_core][end_of_split_pass] n_aln: %i", n_aln);
#endif
// printf("no of alignments %u\n",n_aln);
return best_score;
}
__global__ void cuda_inexact_match_caller(uint32_t * global_bwt, int no_of_sequences, alignment_meta_t* global_alignment_meta, barracuda_aln1_t* global_alns, init_info_t* global_init, widths_bids_t *global_w_b, int best_score, char split_engage, char clump)
//CUDA kernel for inexact matching on both strands
//uses the widths and bids precomputed by cuda_prepare_widths to bound the search space
//and then calls cuda_dfs_match to search for alignments using a DFS approach
{
// Global thread index (historically named blockId); each thread processes one sequence
unsigned int blockId = blockIdx.x * blockDim.x + threadIdx.x;
//Core function
// work on valid sequence only
if ( blockId < no_of_sequences ) {
unsigned char local_complemented_sequence[SEQUENCE_HOLDER_LENGTH];
alignment_meta_t local_alignment_meta;
barracuda_aln1_t local_alns[MAX_NO_PARTIAL_HITS];
widths_bids_t local_w_b;
init_info_t local_init;
memset(&local_alignment_meta, 0, sizeof(alignment_meta_t));
memset(&local_alns, 0, MAX_NO_PARTIAL_HITS*sizeof(barracuda_aln1_t));
local_init = global_init[blockId];
local_alignment_meta.sequence_id = local_init.sequence_id;
local_w_b = global_w_b[local_init.sequence_id];
//initialize local options for each query sequence
gap_opt_t local_options = options_cuda;
//get sequences from texture memory
const uint2 sequence_info = tex1Dfetch(sequences_index_array, local_init.sequence_id);
const unsigned int sequence_offset = sequence_info.x;
const unsigned short sequence_length = sequence_info.y;
unsigned int last_read = ~0;
unsigned int last_read_data = 0;
unsigned int pass_length;
char seeding = local_init.start_pos<local_options.seed_len;
if(split_engage){
if(seeding){
pass_length = min(MAX_SEEDING_PASS_LENGTH, local_options.seed_len - local_init.start_pos);
}
else {
pass_length = min(MAX_REGULAR_PASS_LENGTH, sequence_length - local_init.start_pos);
}
}
else {
pass_length = sequence_length;
}
if(clump){
pass_length = min(pass_length, SUFFIX_CLUMP_WIDTH);
}
//the reads are bit-packed, so we have to cycle through from the start of the sequence, updating last_read and last_read_data, before reading the window we actually align
int cycle_i = sequence_length - local_init.start_pos - pass_length;
for(int i=0; i<cycle_i; i++){
read_char(sequence_offset + i, &last_read, &last_read_data);
}
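//With the 2-bit base encoding (A=0, C=1, G=2, T=3) the complement of c is 3 - c, and
//values > 3 (N) are mapped to 4; the loop below builds the complemented read window
//that cuda_dfs_match searches with (presumably matching how the reference BWT was built).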
for (int i = 0; i < pass_length; i++){
unsigned char c = read_char(sequence_offset + cycle_i + i, &last_read, &last_read_data );
//local_sequence[i] = c;
local_complemented_sequence[i] = c > 3 ? 4 : 3 - c;
}
local_alignment_meta.finished = (local_init.start_pos+pass_length == sequence_length);
//initialize local options
if (options_cuda.fnr > 0.0) local_options.max_diff = bwa_cuda_cal_maxdiff(sequence_length, BWA_AVG_ERR, options_cuda.fnr);
//make sure that when max_diff < max_seed_diff, max_diff overrides the seed diff setting
if (local_options.max_seed_diff > local_options.max_diff) local_options.max_seed_diff = local_options.max_diff;
if (local_options.max_diff < options_cuda.max_gapo) local_options.max_gapo = local_options.max_diff;
//Align with the 2-way BWT reference
local_alignment_meta.best_score = local_init.score = cuda_dfs_match(global_bwt, pass_length, local_complemented_sequence, local_w_b.widths, local_w_b.bids, &local_options, &local_alignment_meta, local_alns, &local_init, best_score, sequence_length, seeding);
// copy alignment info to global memory
global_alignment_meta[blockId] = local_alignment_meta;
int max_no_partial_hits = seeding ? MAX_NO_SEEDING_PARTIALS : MAX_NO_REGULAR_PARTIALS;
memcpy(global_alns + blockId*max_no_partial_hits, local_alns, max_no_partial_hits*sizeof(barracuda_aln1_t));
}
return;
}
__global__ void cuda_prepare_widths(uint32_t * global_bwt, int no_of_sequences, widths_bids_t * global_w_b, char * global_N_too_high)
{
unsigned int blockId = blockIdx.x * blockDim.x + threadIdx.x;
if ( blockId < no_of_sequences ) {
widths_bids_t local_w_b;
unsigned char local_sequence[MAX_SEQUENCE_LENGTH];
gap_opt_t local_options = options_cuda;
const uint2 sequence_info = tex1Dfetch(sequences_index_array, blockId);
const unsigned int sequence_offset = sequence_info.x;
const unsigned short sequence_length = sequence_info.y;
unsigned int last_read = ~0;
unsigned int last_read_data;
int N = 0;
if (options_cuda.fnr > 0.0) local_options.max_diff = bwa_cuda_cal_maxdiff(sequence_length, BWA_AVG_ERR, options_cuda.fnr);
for(int i=0; i<sequence_length; ++i)
{
unsigned char c = read_char(sequence_offset + i, &last_read, &last_read_data );
local_sequence[i] = c;
if(c>3) N++;
if(N>local_options.max_diff) break;
}
if(N<=local_options.max_diff){
global_N_too_high[blockId] = 0;
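//bwt_cuda_device_calculate_width resets k, l and bid at its starting offset, so splitting
//the calculation at sequence_length - seed_len - 1 gives the seed region its own
//worst-case difference bounds, independent of the earlier bases.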
bwt_cuda_device_calculate_width(global_bwt, local_sequence, local_w_b.widths, local_w_b.bids, 0, sequence_length - local_options.seed_len - 1);
bwt_cuda_device_calculate_width(global_bwt, local_sequence, local_w_b.widths, local_w_b.bids, sequence_length - local_options.seed_len - 1, sequence_length);
}
else {
global_N_too_high[blockId] = 1;
}
//for(int i = 0; i < sequence_length; ++i){
//printf("%u %d %u %u\n",i,local_sequence[i],local_w_b.bids[i],local_w_b.widths[i]);
//}
global_w_b[blockId] = local_w_b;
}
return;
}
#if USE_PETR_SPLIT_KERNEL > 0
//WBL 26 Feb 2016 not in use....
/*
//TODO: remove reverse alignment
__global__ void cuda_split_inexact_match_caller(int no_of_sequences, unsigned short max_sequence_length, alignment_meta_t* global_alignment_store, unsigned char cuda_opt)
//CUDA kernel for inexact match on a specified strand
// modified for split kernels
//calls bwt_cuda_device_calculate_width to determine the boundaries of the search space
//and then calls dfs_match to search for alignment using dfs approach
{
// Block ID for CUDA threads, as there is only 1 thread per block possible for now
unsigned int blockId = blockIdx.x * blockDim.x + threadIdx.x;
//Local store for sequence widths bids and alignments
unsigned int local_sequence_widths[MAX_SEQUENCE_LENGTH];
unsigned char local_sequence_bids[MAX_SEQUENCE_LENGTH];
unsigned char local_sequence[MAX_SEQUENCE_LENGTH];
unsigned char local_rc_sequence[MAX_SEQUENCE_LENGTH];
alignment_meta_t local_alignment_store;
//fetch the alignment store from memory
local_alignment_store = global_alignment_store[blockId];
int max_aln = options_cuda.max_aln;
//initialize local options for each query sequence
gap_opt_t local_options = options_cuda;
const int pass_length = (options_cuda.seed_len > PASS_LENGTH)? options_cuda.seed_len : PASS_LENGTH;
const int split_engage = pass_length + 6;
//Core function
// work on valid sequence only
if ( blockId < no_of_sequences )
{
#if DEBUG_LEVEL > 5
printf("start..\n");
#endif
//get sequences from texture memory
//const uint2 sequence_info = tex1Dfetch(sequences_index_array, blockId);
// sequences no longer align with the block ids
const uint2 sequence_info = tex1Dfetch(sequences_index_array, local_alignment_store.sequence_id);
const unsigned int sequence_offset = sequence_info.x;
unsigned int last_read = ~0;
unsigned int last_read_data;
//calculate new length - are we dealing with the last bit?
int start_pos = local_alignment_store.start_pos;
unsigned short process_length;
// decide if this is the last part to process
if (!start_pos && sequence_info.y >= split_engage) {
// first round and splitting is going to happen, forcing if necessary
process_length = min(sequence_info.y, pass_length);
} else {
// subsequent rounds or splitting is not happening
if (sequence_info.y - start_pos < pass_length * 2) {
// mark this pass as last
local_alignment_store.finished = 1;
if (sequence_info.y - start_pos > pass_length) {
// "natural" splitting finish
process_length = min(sequence_info.y, sequence_info.y%pass_length + pass_length);
} else {
// last pass of "forced" splitting
process_length = sequence_info.y - start_pos;
}
} else {
process_length = min(sequence_info.y, pass_length);
}
}
#if DEBUG_LEVEL > 7
printf("process length: %d, start_pos: %d, sequence_length: %d\n", process_length, start_pos, sequence_info.y);
#endif
//const unsigned short sequence_length = (!start_pos) ? process_length : sequence_info.y;