-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDELFI2_divergence.R
2884 lines (2440 loc) · 122 KB
/
DELFI2_divergence.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
library(stringr)
library(devtools)
library (LaplacesDemon)
library(FNN)
library("RColorBrewer")
library(ROCR)
library("ggplot2")
library("mixtools")
library(gplots)
library(spatstat)
library("transport")
library(readxl)
library(caret)
library(seewave)
library("glmnet")
library("ggpubr")
setwd ('G:\\DELFI_data/Derived/fragment_length_in_bins')
setwd ('~/genomedk/DELFI_data/Derived/fragment_length_in_bins')
require("reticulate")
py_install("pandas")
py_install("https://github.com/Hogfeldt/ctDNAtool")
source_python("~/genomedk/matovanalysis/DELFI_analysis/python/pickle_reader.py")
#pickle_data <- read_pickle_file("~/genomedk/DELFI2/Workspaces/per_and_elias/delfi2_length_5Mbp/DL000978HLQ0_100AM.pickle")
##excel_sheets(path = "~/genomedk/DELFI2/RawData/201217_Delfi2_fastq_and_sample_manifest_updated_batchinfo.xlsx")
#del2 <- read_excel("~/genomedk/DELFI2/RawData/201217_Delfi2_fastq_and_sample_manifest_updated_batchinfo.xlsx", sheet = 1)
#end2 <- read_excel("~/genomedk/DELFI2/RawData/201217_Delfi2_fastq_and_sample_manifest_updated_batchinfo.xlsx", sheet = 2)# 1069 samples: 169 colon, 100 rectum, 800 control.
del2 <- read_excel("~/genomedk/DELFI2/Workspaces/matov/201217_Delfi2_fastq_and_sample_manifest_updated_batchinfo2.xlsx", sheet = 1)
end2 <- read_excel("~/genomedk/DELFI2/Workspaces/matov/201217_Delfi2_fastq_and_sample_manifest_updated_batchinfo2.xlsx", sheet = 2)# 1069 samples: 169 colon, 100 rectum, 800 control.
col_list <- which(end2$diagnostic_group=="Colon cancer") # 169 samples
col1 <- which(end2$diagnostic_group=="Colon cancer" & end2$crc_stage=="I") # 19
col2 <- which(end2$diagnostic_group=="Colon cancer" & end2$crc_stage=="II") # 65
col3 <- which(end2$diagnostic_group=="Colon cancer" & end2$crc_stage=="III") # 42
col4 <- which(end2$diagnostic_group=="Colon cancer" & end2$crc_stage=="IV") # 43
col0 <- which(end2$diagnostic_group=="Adenoma colon" & end2$adenoma_risk=="HIGH") # 46 col Adenoma HIGH
colA <- which(end2$diagnostic_group=="Adenoma colon" & end2$adenoma_risk=="LOW") # 53 col Adenoma LOW
ctl1 <- which(end2$diagnostic_group=="No comorbidity-no finding") # 284
ctl2 <- which(end2$diagnostic_group=="Comorbidity-no finding") # 191
ctl3 <- which(end2$diagnostic_group=="Other finding") # 325
rec_list <- which(end2$diagnostic_group=="Rectal cancer") # 100 samples
rec1 <- which(end2$diagnostic_group=="Rectal cancer" & end2$crc_stage=="I") # 29
rec2 <- which(end2$diagnostic_group=="Rectal cancer" & end2$crc_stage=="II") # 25
rec3 <- which(end2$diagnostic_group=="Rectal cancer" & end2$crc_stage=="III") # 31
rec4 <- which(end2$diagnostic_group=="Rectal cancer" & end2$crc_stage=="IV") # 15
rec0 <- which(end2$diagnostic_group=="Adenoma rectum" & end2$adenoma_risk=="HIGH") # 21 rec Adenoma HIGH
recA <- which(end2$diagnostic_group=="Adenoma rectum" & end2$adenoma_risk=="LOW") # 14 rec Adenoma LOW
e2 <- data.frame(end2) # 1803 27 - stage and disease info comes from this file (sheet 2)
d2 <- data.frame(del2) # 765 10 - DELFI names (sheet 1)
m2 <- merge(e2,d2,by="SampleID") # 681 36 - only 681 are members of both groups
col_list2 <- which(m2$diagnostic_group=="Colon cancer") # 79
col12 <- which(m2$diagnostic_group=="Colon cancer" & m2$crc_stage=="I") # 8
col22 <- which(m2$diagnostic_group=="Colon cancer" & m2$crc_stage=="II") # 30
col32 <- which(m2$diagnostic_group=="Colon cancer" & m2$crc_stage=="III") # 18
col42 <- which(m2$diagnostic_group=="Colon cancer" & m2$crc_stage=="IV") # 23
rec12 <- which(m2$diagnostic_group=="Rectal cancer" & m2$crc_stage=="I") # 10
rec22 <- which(m2$diagnostic_group=="Rectal cancer" & m2$crc_stage=="II") # 11
rec32 <- which(m2$diagnostic_group=="Rectal cancer" & m2$crc_stage=="III") # 18
rec42 <- which(m2$diagnostic_group=="Rectal cancer" & m2$crc_stage=="IV") # 11
# 345 already analyzed, 7 colon replicates, 60 ctl3 later, 66 adenomas, tot. 478 for analysis (471 unique)
# plus 210 (of 681 in the list), all unique?, are "validation", i.e. m2$diagnostic_group is NA.
col02 <- which(m2$diagnostic_group=="Adenoma colon" & m2$adenoma_risk=="HIGH") # 20 col Adenoma HIGH
colA2 <- which(m2$diagnostic_group=="Adenoma colon" & m2$adenoma_risk=="LOW") # 28 col Adenoma LOW
rec02 <- which(m2$diagnostic_group=="Adenoma rectum" & m2$adenoma_risk=="HIGH") # 11 rec Adenoma HIGH
recA2 <- which(m2$diagnostic_group=="Adenoma rectum" & m2$adenoma_risk=="LOW") # 7 rec Adenoma LOW
sum(m2$Replicate) # 29
length(m2$Replicate)# 681
# these seem to be 681 Delfi IDs.
val210 <- m2$diagnostic_group=="NA"
val210[is.na(val210)] <- 1 # list of 210 validation samples
val <- which(val210==1)
auxVAL <- unlist(sapply(m2$DELFI.ID[val] , function(x) grep(x, x = pileupsD2 )))
listVAL <- unique(auxVAL)
# 210 individual KLD to the 43 CTL Delfi1. pick top 189 bp . maybe go to 12 bins.
val189D2 <- valD2[,,189] #210 (samples) 574 (bins)
write.csv(t(val189D2[,1:555]),'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_val210_555bins_fr189.csv')
# 555 x 210
dim(hnm)# 700 23310
hnm189 <- hnm[189,]#extract from 700 only 189 bp
h189x210 = array(0, dim=c(23310,210))
for (i in 1:210 ) {
h189x210[,i]<-hnm189 #duplicate the same vector 210 times to prepare the divergence.
}
write.csv(h189x210,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi1_healthy43_555bins_fr189.csv')
# 23310(42*555) x 210
k189_val_d1ctl43 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2val210_D1ctl43_fr189.csv')
k189_vald1ctl43 <- k189_val_d1ctl43[2:211,2]
plot(k189_vald1ctl43 )
k189_val210_sorted<-sort(k189_vald1ctl43, index.return=TRUE,decreasing = T )
plot(k189_val210_sorted$x)
pileupsD2[listVAL[k189_val210_sorted$ix[1:21]]] # the colon cancer portion of the validation cohort
# $ix 26 184 78 200 188 149 148 116 103 115 74 157 9 85 109 134 47 112 182 16 205
# plot vector of 210 divergence numbers in bits (its always for 189 bp only)
# are there peaks and dips?
# library no longer exists
classifier( method = c("randomForest", "svm", "nnet" ), featureMat, positiveSamples, negativeSamples, tunecontrol = tune.control(sampling = "cross", cross = 5), ...)
##for random forest, and using five-fold cross validation for obtaining optimal parameters
cl <- classifier( method = "randomForest",
featureMat = hcTop20,
positiveSamples = cv21, # col79x12 for frl195bp
negativeSamples = hv21,# ctl1_74x12 for frl195bp
tunecontrol = tune.control(sampling = "cross", cross = 5),
ntree = 100 ) #build 100 trees for the forest
# Error in family$linkfun(mustart) : Argument mu must be a nonempty numeric vector
simple_logistic_model = glm(data = data.frame(as.factor(df)), #dim(df) 153 12
family = binomial())
summary(simple_logistic_model)
# x 195 for COL, y 195 for CTL
glmnet(t(cv21), t(hv21), family = "binomial", alpha = 1, lambda = NULL)
#ROC; find where overall success rate numbers are
condition <- rbind(array(1, dim=c(79,12)), array(0, dim=c(74,12)))
pred <- prediction(hcTop20, condition, label.ordering = c(0, 1))
#pearson 0.7627064
cor(kc[1:499], kd2_col79, method = c("pearson", "kendall", "spearman"))
cor.test(kc[1:499], kd2_col79, method=c("pearson", "kendall", "spearman"))
plot(kc[1:499], ylim=range(c(0,2.7)), col="green", type="l")
par(new = TRUE)
plot(kd2_col79, ylim=range(c(0,2.7)), col="red", type="l")
legend(400,2.6,legend=c("DELFI1", "DELFI2"), col=c("green","red"),lty=1:1, cex=1.0)
plot(k_d2_col1, ylim=range(c(0,1.9)), col="green", type="l")
par(new = TRUE)
plot(k_d2_col2, ylim=range(c(0,1.9)), col="blue", type="l")
par(new = TRUE)
plot(k_d2_col3, ylim=range(c(0,1.9)), col="orange", type="l")
par(new = TRUE)
plot(k_d2_col4, ylim=range(c(0,1.9)), col="red", type="l")
legend(400,1.5,legend=c("CC SI", "CC SII", "CC SIII", "CC SIV"),col=c("green","blue","orange","red"),lty=1:1, cex=1.0)
plot(kd2_col79, ylim=range(c(0,.8)), col="green", type="l")
par(new = TRUE)
plot(kd2_col79_ctl2, ylim=range(c(0,.8)), col="blue", type="l")
par(new = TRUE)
plot(kd2_col79_ctl3, ylim=range(c(0,.8)), col="red", type="l")
legend(220,.8,legend=c("CC CTL1", "CC CTL2", "CC CTL3"),col=c("green","blue","red"),lty=1:1, cex=1.0)
m2$DELFI.ID[col_list2] # 79
#[1] "DL001860CRP0" "DL001940CRP0" "DL002182CRP0" "DL002181CRP0" "DL002023CRP0" "DL001364CRP0_1" "DL001364CRP0" "DL001503CRP0" "DL001288CRP0"
#[10] "DL001509CRP0" "DL001422CRP0" "DL001207CRP0" "DL001800CRP0" "DL001093CRP0" "DL001785CRP0" "DL001923CRP0" "DL002241CRP0" "DL001902CRP0"
#[19] "DL001845CRP0" "DL002194CRP0_1" "DL001578CRP0" "DL001613CRP0" "DL001614CRP0" "DL001348CRP0" "DL001096CRP0_1" "DL001096CRP0" "DL001475CRP0"
#[28] "DL001857CRP0" "DL001835CRP0" "DL001561CRP0" "DL001142CRP0" "DL001582CRP0_1" "DL001582CRP0" "DL001448CRP0" "DL002259CRP0" "DL001581CRP0"
#[37] "DL001888CRP0" "DL002085CRP0" "DL002000CRP0" "DL002072CRP0" "DL002267CRP0" "DL001496CRP0" "DL001390CRP0" "DL001758CRP0_1" "DL001563CRP0"
#[46] "DL002286CRP0" "DL002286CRP0_1" "DL001849CRP0" "DL002204CRP0" "DL001550CRP0" "DL001162CRP0" "DL002227CRP0" "DL001244CRP0" "DL002280CRP0"
#[55] "DL001236CRP0" "DL002104CRP0" "DL002046CRP0" "DL002223CRP0" "DL002123CRP0" "DL001914CRP0" "DL001918CRP0" "DL001855CRP0" "DL002026CRP0"
#[64] "DL001270CRP0" "DL001316CRP0" "DL001562CRP0" "DL001178CRP0" "DL001572CRP0" "DL001843CRP0" "DL001843CRP0_1" "DL001964CRP0" "DL001964CRP0_1"
#[73] "DL001763CRP0" "DL001650CRP0" "DL001965CRP0" "DL002190CRP0_1" "DL002190CRP0" "DL001793CRP0" "DL001480CRP0"
pileupsD2 <- list.files("~/genomedk/DELFI2/Workspaces/per_and_elias/delfi2_length_5Mbp", recursive = T, full.names = T, pattern = "tsv")
pileupsD2_1M <- list.files("~/genomedk/DELFI2/Workspaces/per_and_elias/delfi2_length_1Mbp", recursive = T, full.names = T, pattern = "tsv")
d2_test <- read.table(pileupsD2[204], header = TRUE)
d2_t2 <- read.table(pileupsD2[304], header = TRUE)
samplesREC <- sapply(m2$DELFI.ID[rec_list2] , function(x) grep(x, x = pileupsD2 ))
samplesCOL <- sapply(m2$DELFI.ID[col_list2] , function(x) grep(x, x = pileupsD2 ))
listCOL1 <- sapply(m2$DELFI.ID[col12] , function(x) grep(x, x = pileupsD2 ))
listCOL2 <- sapply(m2$DELFI.ID[col22] , function(x) grep(x, x = pileupsD2 ))
listCOL3 <- unlist(sapply(m2$DELFI.ID[col32] , function(x) grep(x, x = pileupsD2 )))
listCOL4 <- unlist(sapply(m2$DELFI.ID[col42] , function(x) grep(x, x = pileupsD2 )))
listCOL0 <- unlist(sapply(m2$DELFI.ID[col02] , function(x) grep(x, x = pileupsD2 )))
listCOLA <- unlist(sapply(m2$DELFI.ID[colA2] , function(x) grep(x, x = pileupsD2 )))
listREC0 <- unlist(sapply(m2$DELFI.ID[rec02] , function(x) grep(x, x = pileupsD2 )))
listRECA <- unlist(sapply(m2$DELFI.ID[recA2] , function(x) grep(x, x = pileupsD2 )))
#######################################colon adenoma low##########################
col0D2 = array(0, dim=c(20,574,499))
j=1
for (i in 1:20 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listCOL0[i]], header = TRUE) # sample per sample, file per file.
col0D2[j,,] <- unlist(auxFR[,2:500])
j=j+1
}
colAD2 = array(0, dim=c(28,574,499))
j=1
for (i in 1:28 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listCOLA[i]], header = TRUE) # sample per sample, file per file.
colAD2[j,,] <- unlist(auxFR[,2:500])
j=j+1
}
rec0D2 = array(0, dim=c(11,574,499))
j=1
for (i in 1:11 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listREC0[i]], header = TRUE) # sample per sample, file per file.
rec0D2[j,,] <- unlist(auxFR[,2:500])
j=j+1
}
recAD2 = array(0, dim=c(7,574,499))
j=1
for (i in 1:7 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listRECA[i]], header = TRUE) # sample per sample, file per file.
recAD2[j,,] <- unlist(auxFR[,2:500])
j=j+1
}
col0D22 = array(0, dim=c(20*574,499))
colAD22 = array(0, dim=c(28*574,499))
rec0D22 = array(0, dim=c(11*574,499))
recAD22 = array(0, dim=c(7*574,499))
for (i in 1:499) {
#i=1
auxCOL0 <- col0D2[,,i]
col0D22[,i] <- auxCOL0
auxCOLA<- colAD2[,,i]
colAD22[,i] <- auxCOLA
#auxREC0 <- rec0D2[,,i]
#rec0D22[,i] <- auxREC0
#auxRECA <- recAD2[,,i]
#recAD22[,i] <- auxRECA
}
write.csv(col0D22,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col_adeH20.csv')
write.csv(colAD22,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col_adeL28.csv')
write.csv(rec0D22,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_rec_adeH11.csv')
write.csv(recAD22,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_rec_adeL7.csv')
kd2_col0 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col_adeH20_ctl1.csv')
k_d2_col0 <- kd2_col0[2:500,2]
plot(k_d2_col0)
kd2_colA <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col_adeL28_ctl1.csv')
k_d2_colA <- kd2_colA[2:500,2]
plot(k_d2_colA)
kd2_rec0 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2rec_adeH11_ctl1.csv')
k_d2_rec0 <- kd2_rec0[2:500,2]
plot(k_d2_rec0)
kd2_recA <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2rec_adeL7_ctl1.csv')
k_d2_recA <- kd2_recA[2:500,2]
plot(k_d2_recA)
# combined figure for the four adenomas
plot(k_d2_col0, ylim=range(c(0,.36)) ,col="red", type="l")
par(new = TRUE)
plot(k_d2_colA, ylim=range(c(0,.36)), col="green", type="l")
legend(300,.3,legend=c("CC AH", "CC AL"),col=c("red","green"),lty=1:1, cex=1.0)
plot(k_d2_rec0, ylim=range(c(0,.7)), col="orange", type="l")
par(new = TRUE)
plot(k_d2_recA, ylim=range(c(0,.7)), col="blue", type="l")
legend(300,.6,legend=c( "RC AH", "RC AL"),col=c("orange","blue"),lty=1:1, cex=1.0)
auxCOL <- unlist(samplesCOL)
# auxCRC
#DL001860CRP0 DL001940CRP0 DL002182CRP0 DL002181CRP0 DL002023CRP0 DL001364CRP0_1 DL001364CRP01 DL001364CRP02 DL001503CRP0
#377 419 511 510 452 188 188 189 250
#DL001288CRP0 DL001509CRP0 DL001422CRP0 DL001207CRP0 DL001800CRP0 DL001093CRP0 DL001785CRP0 DL001923CRP0 DL002241CRP0
#159 253 219 133 353 86 342 412 538
#DL001902CRP0 DL001845CRP0 DL002194CRP0_1 DL001578CRP0 DL001613CRP0 DL001614CRP0 DL001348CRP0 DL001096CRP0_1 DL001096CRP01
#398 370 519 277 292 293 184 87 87
#DL001096CRP02 DL001475CRP0 DL001857CRP0 DL001835CRP0 DL001561CRP0 DL001142CRP0 DL001582CRP0_1 DL001582CRP01 DL001582CRP02
#88 237 376 364 269 105 279 279 280
#DL001448CRP0 DL002259CRP0 DL001581CRP0 DL001888CRP0 DL002085CRP0 DL002000CRP0 DL002072CRP0 DL002267CRP0 DL001496CRP0
#226 542 278 391 471 442 467 544 248
#DL001390CRP0 DL001758CRP0_1 DL001563CRP0 DL002286CRP01 DL002286CRP02 DL002286CRP0_1 DL001849CRP0 DL002204CRP0 DL001550CRP0
#203 330 271 552 553 552 373 524 264
#DL001162CRP0 DL002227CRP0 DL001244CRP0 DL002280CRP0 DL001236CRP0 DL002104CRP0 DL002046CRP0 DL002223CRP0 DL002123CRP0
#110 531 143 549 139 480 460 529 490
#DL001914CRP0 DL001918CRP0 DL001855CRP0 DL002026CRP0 DL001270CRP0 DL001316CRP0 DL001562CRP0 DL001178CRP0 DL001572CRP0
#405 407 375 454 151 170 270 116 274
#DL001843CRP01 DL001843CRP02 DL001843CRP0_1 DL001964CRP01 DL001964CRP02 DL001964CRP0_1 DL001763CRP0 DL001650CRP0 DL001965CRP0
#368 369 368 429 430 429 333 305 431
#DL002190CRP0_1 DL002190CRP01 DL002190CRP02 DL001793CRP0 DL001480CRP0
#516 516 517 348 239
listCOL <- unique(auxCOL)
#[1] 377 419 511 510 452 188 189 250 159 253 219 133 353 86 342 412 538 398 370 519 277 292 293 184 87 88 237 376 364 269 105 279 280 226 542 278 391 471 442 467 544 248 203 330
#[45] 271 552 553 373 524 264 110 531 143 549 139 480 460 529 490 405 407 375 454 151 170 270 116 274 368 369 429 430 333 305 431 516 517 348 239
length(unique(auxVAL)) # 210 (4 were replicated, even if its the same sample)
#################validation#######################
valD2 = array(0, dim=c(210,574,499))
j=1
for (i in 1:210 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listVAL[i]], header = TRUE) # sample per sample, file per file.
valD2[j,,] <- unlist(auxFR[,2:500])
j=j+1
}
length(unique(auxCOL)) # 79 (samples with indexes 188, 87, 279, 552, 368, 429, 516 were replicated, even if its the same sample)
#################colon all stages#######################
colD2 = array(0, dim=c(79,574,499))
#ctl1D2_1M = array(0, dim=c(74,2873,499)) # 5x574 genomic bins, i.e. 1Mb rather than 5Mb
j=1
for (i in 1:74 ) {#79
print(i)
#i=3
auxFR <- read.table(pileupsD2[listCOL[i]], header = TRUE) # sample per sample, file per file.
#auxFR <- read.table(pileupsD2_1M[listCTL1[i]], header = TRUE) # sample per sample, file per file.
colD2[j,,] <- unlist(auxFR[,2:500])
#ctl1D2_1M[j,,] <- unlist(auxFR[,2:500])
# save 79 individual profiles as xls files
#sa_name <- paste0('~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col_individual', i,'.csv')
print(length(unlist(auxFR[,2:500])))
#auxSAVE <-matrix(unlist(auxFR[,2:500]),nrow=574,ncol=499)
#write.csv(unlist(auxSAVE),sa_name)
j=j+1
}
#######################################colon stage 1##########################
k_col1_i<-matrix(,nrow=8,ncol=700)
col1D2 = array(0, dim=c(8,574,499))
j=1
for (i in 1:8 ) {
print(i)
#i=3
auxFR <- read.table(pileupsD2[listCOL1[i]], header = TRUE) # sample per sample, file per file.
col1D2[j,,] <- unlist(auxFR[,2:500])
#sa_name <- paste0('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col1_individual', i,'.csv')
#aux<- read.csv(sa_name)
#k_col1_i[i,] <- aux[2:500,2]
j=j+1
}
#######################################colon stage 2##########################
col2D2 = array(0, dim=c(30,574,499))
j=1
for (i in 1:30 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listCOL2[i]], header = TRUE) # sample per sample, file per file.
col2D2[j,,] <- unlist(auxFR[,2:500])
j=j+1
}
#######################################colon stage 3##########################
col3D2 = array(0, dim=c(18,574,499))
j=1
for (i in 1:18 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listCOL3[i]], header = TRUE) # sample per sample, file per file.
col3D2[j,,] <- unlist(auxFR[,2:500])
j=j+1
}
#######################################colon stage 4##########################
col4D2 = array(0, dim=c(23,574,499))
j=1
for (i in 1:23 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listCOL4[i]], header = TRUE) # sample per sample, file per file.
col4D2[j,,] <- unlist(auxFR[,2:500])
j=j+1
}
##########################CONTROL##########################################################
ctl1_list <- which(m2$diagnostic_group=="No comorbidity-no finding") # 74
samplesCTL1 <- sapply(m2$DELFI.ID[ctl1_list] , function(x) grep(x, x = pileupsD2 ))
auxCTL1 <- unlist(samplesCTL1)
#DL001791HLP0 DL001790HLP0 DL001437HLP0 DL001636HLP0 DL001481HLP0 DL001168HLP0 DL001171HLP0 DL001181HLP0 DL001215HLP0 DL001223HLP0
#347 346 220 298 240 113 114 118 135 137
#DL002195HLP0 DL002016HLP0 DL001609HLP0 DL001287HLP0 DL001440HLP0 DL001530HLP0 DL001092HLP0 DL001394HLP0 DL001839HLP0 DL002172HLP0
#520 451 290 158 221 257 85 206 367 507
#DL001775HLP0 DL001908HLP0 DL002199HLP0 DL002180HLP0 DL001605HLP0 DL001391HLP0 DL001884HLP0 DL001870HLP0 DL001872HLP0 DL001780HLP0
#338 403 521 509 289 204 389 381 383 339
#DL001956HLP0 DL001644HLP0 DL002088HLP0 DL001646HLP0 DL001798HLP0 DL001795HLP0 DL002031HLP0 DL001783HLP0 DL002074HLP0 DL001331HLP0
#426 303 472 304 352 349 455 340 468 174
#DL001451HLP0 DL001345HLP0 DL001382HLP0 DL001979HLP0 DL001553HLP0 DL001826HLP0 DL001868HLP0 DL001707HLP0 DL002129HLP0 DL002153HLP0
#228 182 199 437 265 363 380 318 493 501
#DL001148HLP0 DL002215HLP0 DL002053HLP0 DL001312HLP0 DL002050HLP0 DL001243HLP0 DL002206HLP0 DL001310HLP0 DL001272HLP0 DL002290HLP0
#106 527 463 169 461 142 525 168 153 555
#DL002008HLP0 DL002099HLP0 DL001408HLP0 DL001767HLP0 DL001279HLP0 DL002097HLP0 DL002092HLP0 DL001269HLP0 DL001457HLP0 DL001542HLP0
#446 478 212 336 155 476 474 150 233 262
#DL001961HLP0 DL001118HLP0 DL001117HLP0 DL001253HLP0
#428 98 97 144
listCTL1 <- unique(auxCTL1)
length(unique(auxCTL1)) # 74
ctl1D2 = array(0, dim=c(74,574,499))
#ctl1D2_1M = array(0, dim=c(74,2873,499)) # 5x574 genomic bins, i.e. 1Mb rather than 5Mb
j=1
for (i in 1:74 ) {
print(i)
#i=2
auxFR <- read.table(pileupsD2[listCTL1[i]], header = TRUE) # sample per sample, file per file.
ctl1D2[j,,] <- unlist(auxFR[,2:500])
# save 74 individual profiles as xls files
#sa_name <- paste0('~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_individual', i,'.csv')
#auxSAVE <-matrix(unlist(auxFR[,2:500]),nrow=574,ncol=499)
#write.csv(unlist(auxSAVE),sa_name)
j=j+1
}
ctl1D22 = array(0, dim=c(74*574,499))
col1D22 = array(0, dim=c(8*574,499))
col2D22 = array(0, dim=c(30*574,499))
col3D22 = array(0, dim=c(18*574,499))
col4D22 = array(0, dim=c(23*574,499))
colD22 = array(0, dim=c(79*574,499))
ctl1D22_1M = array(0, dim=c(74*2873,499))
colD22_1M = array(0, dim=c(79*2873,499))
for (i in 1:499) {
#i=1
auxCTL <- ctl1D2[,,i]
ctl1D22[,i] <- auxCTL
auxCOL <- colD2[,,i]
colD22[,i] <- auxCOL
#auxCOL1 <- col1D2[,,i]
#col1D22[,i] <- auxCOL1
#auxCOL2<- col2D2[,,i]
#col2D22[,i] <- auxCOL2
#auxCOL3 <- col3D2[,,i]
#col3D22[,i] <- auxCOL3
auxCOL4 <- col4D2[,,i]
col4D22[,i] <- auxCOL4
}
write.csv(colD22_1M,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col_all79_1M.csv')
write.csv(ctl1D22_1M,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_74_1M.csv')
k_d2_col79_1M <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2_COL79_CTL1_1M.csv')
kd2_col79_1M <- k_d2_col79_1M[2:575,2]
plot(kd2_col79_1M)
k_d2_crc129 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2_CRC129_ctl1.csv')
kd2_crc129 <- k_d2_crc129[2:575,2]
plot(kd2_crc129)
ctlD22 <- rbind(ctl1D22,ctl2D22)
ctlD22 <- rbind(ctlD22,ctl3D22)
write.csv(ctlD22,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl_all276.csv')
k_d2_crc129_ctl276 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2_CRC129_ctl276.csv')
kd2_crc129_ctl276 <- k_d2_crc129_ctl276[2:575,2]
plot(kd2_crc129_ctl276)
k_d2_col79_ctl276 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2_COL79_ctl276.csv')
kd2_col79_ctl276 <- k_d2_col79_ctl276[2:575,2]
plot(kd2_col79_ctl276)
write.csv(colD22,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col_all79.csv')
dim(colD2)# 79 574 499
dim(ctl1D2)# 74 574 499
c195<-colD2[,,195]
h195<-ctl1D2[,,195]
hh195<-ctl11D2[,,195]
dim(c195) # 79 574
dim(h195)# 74 574
dim(hh195)# 73 574
write.csv(c195,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col_fr195.csv')
write.csv(h195,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_fr195.csv')
write.csv(hh195,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl73_fr195.csv')
kk195_d2_col <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2_COLfr195_ctl73.csv')
kk195d2_col <- kk195_d2_col[2:575,2]
plot(kk195d2_col)
indx<-which(kk195d2_col>-5)#.847)# 141 153 158 170 218 347 374 405 408 458 528 558
cv22 <-matrix(,nrow=79,ncol=length(indx))
cv22<-c195[,indx]
hv22 <-matrix(,nrow=73,ncol=length(indx))
hv22<-hh195[,indx]
hcTop20 <- rbind(cv22 , hv22)
df<-scale(hcTop20)
df[is.nan(df)] <- 0
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
hm <- heatmap(df, scale = "none", col = col)
k195_d2_col <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2_COLfr195_ctl1.csv')
k195d2_col <- k195_d2_col[2:575,2]
plot(k195d2_col)
ind195<-which(k195d2_col>-5)# all
ind195<-which(k195d2_col>1.12)# 33 48 90 145 156 191 213 356 429 483 529
ind195<-which(k195d2_col>1.72)# 356
ind195<-which(k195d2_col>1.58)# 356 529
indx<-ind195
cv21 <-matrix(,nrow=79,ncol=length(indx))
cv21<-c195[,indx]
hv21 <-matrix(,nrow=74,ncol=length(indx))
hv21<-h195[,indx]
hcTop20 <- rbind(cv21 , hv21)
df<-scale(hcTop20)
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
hm <- heatmap(df, scale = "none", col = col)
c195<-colD2[,,195]
h195<-ctl1D2[,,195]
cv2N <- matrix(colD22[,195],nrow=79,ncol=574)
hv2N <- matrix(ctl1D22[,195],nrow=74,ncol=574)
hcTop20 <- rbind(cv2N , hv2N)#(c195,h195)#(cv2N , hv2N)
df<-scale(hcTop20)
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
hm <- heatmap(df, scale = "none", col = col)
c365<-colD2[,,365]
h365<-ctl1D2[,,365]
write.csv(c365,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col_fr365.csv')
write.csv(h365,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_fr365.csv')
k365_d2_col <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2_COLfr365_ctl1.csv')
k365d2_col <- k365_d2_col[2:575,2]
plot(k365d2_col)
indx<-which(k365d2_col>-5)#.847)# 141 153 158 170 218 347 374 405 408 458 528 558
cv22 <-matrix(,nrow=79,ncol=length(indx))
cv22<-c365[,indx]
hv22 <-matrix(,nrow=74,ncol=length(indx))
hv22<-h365[,indx]
hcTop20 <- rbind(cv22 , hv22)
df<-scale(hcTop20)
df[is.nan(df)] <- 0
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
hm <- heatmap(df, scale = "none", col = col)
cv2<-t(rbind(t(cv21),t(cv22)))
hv2<-t(rbind(t(hv21),t(hv22)))
hcTop20 <- rbind(cv2 , hv2)
df<-scale(hcTop20)
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
hm <- heatmap(df, scale = "none", col = col)
hnm499<-hnm[1:499,]
cnm499<-cnm[1:499,]
write.csv(t(cnm499),'~/genomedk/matovanalysis/DELFI_analysis/python/delfi1_crc27_frl499.csv')
write.csv(t(hnm499),'~/genomedk/matovanalysis/DELFI_analysis/python/delfi1_ctl43_frl499.csv')
k1_d2_col79 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col79_D1ctl43.csv')
k1d2_col79 <- k1_d2_col79[2:500,2]
plot(k1d2_col79)
k1_d2_col79hg38 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col79_D1ctl43HG38.csv')
k1d2_col79hg38 <- k1_d2_col79hg38[2:500,2]
plot(k1d2_col79hg38)
k2_d2_col79hg38 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col79_D1ctl86HG38.csv')
k2d2_col79hg38 <- k2_d2_col79hg38[2:500,2]
plot(k2d2_col79hg38)
hM189 <- matrix(hnm499[189,], ncol = 555, byrow = 42) #
write.csv(hM189,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi1_ctl43_frl499_189.csv')
cM189 <- matrix(colD22[,189], ncol = 574, byrow = 79) #
write.csv(cM189[,1:555],'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col79_bin555_189.csv')
col_bins = array(0, dim=c(79*499,574))
ctl1_bins = array(0, dim=c(74*499,574))
#col_bins <- matrix(, ncol = 574, byrow = 79) #
dim(colD2) # 79 574 499
for (i in 1:574){
auxCOL <- colD2[,i,]
col_bins[,i] <- auxCOL
auxCTL1 <- ctl1D2[,i,]
ctl1_bins[,i] <- auxCTL1
}
#ctl1_bins <- matrix(ctl1D22, ncol = 574, byrow = 74) #
write.csv(ctl1_bins,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1Bins.csv')
write.csv(col_bins,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_colBins.csv')
k_d2_colBins <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col_Bins.csv')
kd2_colBins<-k_d2_colBins[2:701,2]
plot(kd2_colBins)
dim(colD22) # 45346 499
dim(ctl1D22) # 42476 499
colD2B10<- array(0, dim = c(79*574*10,50))
ctl1D2B10<- array(0, dim = c(74*574*10,50))
umicB10<-array(0,dim=c(67*595*10,50))
for (i in 0:49){
#i=2
k=10*i+1
if (i != 49) {
colD2B10[,i] <- colD22[,k:(k+9)]
ctl1D2B10[,i] <- ctl1D22[,k:(k+9)]
} else {
colD2B10[,i] <- colD22[,(k-1):(k+8)]# repeat one FRl from the previous segment
ctl1D2B10[,i] <- ctl1D22[,(k-1):(k+8)]# repeat one FRl from the previous segment
}
}
write.csv(colD2B10,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col_79_binned10frl.csv')
write.csv(ctl1D2B10,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_74_binned10frl.csv')
k1_d2_col79_189 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col79_D1ctl43_fr189.csv')
k1d2_col79_189 <- k1_d2_col79_189[2:556,2]
plot(k1d2_col79_189)
indx<-which(k1d2_col79_189>-5)#1.24) #1.3
cv22 <-matrix(,nrow=79,ncol=length(indx))
cv22<-cM189[,indx]
hv22 <-matrix(,nrow=42,ncol=length(indx))
hv22<-hM189[,indx]
hcTop20 <- rbind(cv22 , hv22)
df<-scale(hcTop20)
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
hm <- heatmap(df, scale = "none", col = col)
hgA <- hist(colD22[,195], breaks = 50 , plot = FALSE) # Save first histogram data
hgB <- hist(ctl1D22[,195],breaks = 50, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,470), ylim = c(0,3500)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,470), ylim = c(0,3500)) # Add 2nd histogram using different color
hgA <- hist(colD22[,365], breaks = 30 , plot = FALSE) # Save first histogram data
hgB <- hist(ctl1D22[,365],breaks = 30, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,70), ylim = c(0,5000)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,70), ylim = c(0,5000)) # Add 2nd histogram using different color
hgA <- hist(colD22[,137], breaks = 150 , plot = FALSE) # Save first histogram data
hgB <- hist(ctl1D22[,137],breaks = 50, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,520), ylim = c(0,4200)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,520), ylim = c(0,4200)) # Add 2nd histogram using different color
hgA <- hist(col4D22[,137], breaks = 150 , plot = FALSE) # Save first histogram data
hgB <- hist(ctl1D22[,137],breaks = 50, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,520), ylim = c(0,4200)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,520), ylim = c(0,4200)) # Add 2nd histogram using different color
hgA <- hist(colD22[,277], breaks = 100 , plot = FALSE) # Save first histogram data
hgB <- hist(ctl1D22[,277],breaks = 30, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,40), ylim = c(0,6000)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,40), ylim = c(0,6000)) # Add 2nd histogram using different color
hgA <- hist(col4D22[,277], breaks = 100 , plot = FALSE) # Save first histogram data
hgB <- hist(ctl1D22[,277],breaks = 30, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,40), ylim = c(0,6000)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,40), ylim = c(0,6000)) # Add 2nd histogram using different color
plot(kc4[1:499], ylim=range(c(0,2.4)), col="red", type="l")
par(new = TRUE)
plot(kd2_col4, ylim=range(c(0,2.4)), col="green", type="l")
par(new = TRUE)
plot(k2d2_col4, ylim=range(c(0,2.4)), col="blue", type="l")
legend(240,2.48,legend=c("D1 8 CRC Stage IV 43 CTL", "D2 23 CC Stage IV 74 CTL No comorbidity", "D2 23 CC Stage IV 71 CTL Comorbidity"),col=c("red","green","blue"),lty=1:1, cex=1.0)
#pearson
cor(kc4[1:499], kd2_col4, method = c("pearson", "kendall", "spearman"))
cor(kc4[1:499], k2d2_col4, method=c("pearson", "kendall", "spearman"))
cor(kd2_col4, k2d2_col4, method=c("pearson", "kendall", "spearman"))
max_k2d2_colStage <- c(max(k2d2_col1),max(k2d2_col2),max(k2d2_col3),max(k2d2_col4))
d2_maxSens_spec90 <- c(.62,.52,.52,.97)
cor(max_k2d2_colStage, d2_maxSens_spec90, method=c("pearson", "kendall", "spearman"))
d2_maxSens_spec95 <- c(.50,.42,.43,.96)
cor(max_kd2_colStage, d2_maxSens_spec95, method=c("pearson", "kendall", "spearman"))
cor(max_kd2_colStage, d2_maxSens_spec90, method=c("pearson", "kendall", "spearman"))
max_kd2_colStage <- c(max(kd2_col1),max(kd2_col2),max(kd2_col3),max(kd2_col4))
k_d2_col_i1 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col_individual1.csv')
kd2_col_i[1,] <- k_d2_col_i1[2:500,2]
plot(kd2_col_i[1,])
k_d2_col_i2 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col_individual2.csv')
kd2_col_i[2,] <- k_d2_col_i2[2:500,2]
plot(kd2_col_i[2,])
k_d2_col_i3 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col_individual3.csv')
kd2_col_i[3,]<- k_d2_col_i3[2:500,2]
plot(kd2_col_i[3,])
k_d2_col_i4 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col_individual4.csv')
kd2_col_i[4,] <- k_d2_col_i4[2:500,2]
plot(kd2_col_i[4,])
kd2_col_i<-matrix(,nrow=79,ncol=499)
for(i in 1:79) {
sa_name <- paste0('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col_individual', i,'.csv')
aux <- read.csv(sa_name)
kd2_col_i[i,] <- aux[2:500,2]
print(which.max(aux[2:500,2]))
#plot(kd2_col_i[i,], ylim=range(c(0,2.4)), col="red", type="l"))
#par(new = TRUE)
}
for(i in 1:79) {
#i=4
#plot(kd2_col_i[i,], ylim=range(c(0,5)),col= rgb(1,0,0,i/10)) # looks good
plot(kd2_col_i[i,], ylim=range(c(0,5)),col= col2rgb(i, alpha = FALSE)) # looks good
par(new = TRUE)
}
col_list2 <- which(m2$diagnostic_group=="Colon cancer") # 79
colS1<-kd2_col_i[m2$crc_stage[col_list2]=="I",]
colS2<-kd2_col_i[m2$crc_stage[col_list2]=="II",]
colS3<-kd2_col_i[m2$crc_stage[col_list2]=="III",]
colS4<-kd2_col_i[m2$crc_stage[col_list2]=="IV",]
mS1=NULL
mS2=NULL
mS3=NULL
mS4=NULL
maS1=NULL
maS2=NULL
maS3=NULL
maS4=NULL
for(i in 1:8) {
plot(colS1[i,], ylim=range(c(0,5.2)), col= col2rgb(i+50, alpha = FALSE), type="l")
mS1[i]<-which.max(colS1[i,])
maS1[i]<-max(colS1[i,])
par(new = TRUE)
}
#plot(mS1, maS1, pch = 24, cex=2, col="blue", bg="red", lwd=2)# plot the dot of the max
#par(new = TRUE)
for(i in 1:30) {
plot(colS2[i,], ylim=range(c(0,5.2)), col= col2rgb(2*i+50), type="l")
mS2[i]<-which.max(colS2[i,])
maS2[i]<-max(colS2[i,])
par(new = TRUE)
}
for(i in 1:18) {
plot(colS3[i,], ylim=range(c(0,5.2)), col= col2rgb(2*i+50), type="l")
mS3[i]<-which.max(colS3[i,])
maS3[i]<-max(colS3[i,])
par(new = TRUE)
}
for(i in 1:23) {
plot(colS4[i,], ylim=range(c(0,5.2)), col= col2rgb(2*i+50), type="l")
mS4[i]<-which.max(colS4[i,])
maS4[i]<-max(colS4[i,])
par(new = TRUE)
}
dim(colS1)# 8 499
dim(colS2)#30 499
dim(colS3)# 18 499
dim(k_crc4_i)# 23 499
#dim(k_ctl1_i)# 43 700
colS <- rbind(colS1,colS2)
colS <- rbind(colS,colS3)
colS <- rbind(colS,colS4)
c198<-colS[,70]
c364<-colS[,253]
h198<-k_ctl1_i[,70]
h364<-k_ctl1_i[,253]
plot(c198,c364,xlim=c(0,1.3),ylim=c(0,2),col="red")
par(new = TRUE)
plot(h198, h364,xlim=c(0,1.3),ylim=c(0,2),col="green")
frl1=207; frl2=366; k=5.4
# boundary for capacity regions for Stage I, II, III, IV.
plot(colS1[,frl1],colS1[,frl2],xlim=c(0,k),ylim=c(0,k),col="blue",pch=1)
par(new = TRUE)
plot(colS2[,frl1],colS2[,frl2],xlim=c(0,k),ylim=c(0,k),col="orange",pch=2)
par(new = TRUE)
plot(colS3[,frl1],colS3[,frl2],xlim=c(0,k),ylim=c(0,k),col="red",pch=3)
par(new = TRUE)
plot(colS4[,frl1],colS4[,frl2],xlim=c(0,k),ylim=c(0,k),col="brown",pch=4)
par(new = TRUE)
plot(kd2_ctl1_i[,frl1],kd2_ctl1_i[,frl2],xlim=c(0,k),ylim=c(0,k),col="green",pch=5)
legend(3.5,2,legend=c("KLD for Col SI samples", "KLD for Col SII samples", "KLD for Col SIII samples","KLD for Col SIV samples","KLD for Ctl1 74 samples"),col=c("blue","orange","red","brown","green"),lty=1:1, cex=1.0)
plot(abs(colSums(colS)-colSums(kd2_ctl1_i)))
which.max(abs(colSums(colS,na.rm = T)-colSums(kd2_ctl1_i,na.rm = T)))
which(abs(colSums(colS)-colSums(kd2_ctl1_i))>50)
hgA <- hist(mS1, breaks = 50 , plot = FALSE) # Save first histogram data
hgB <- hist(mS2,breaks = 50, plot = FALSE) # Save 2nd histogram data
hgC <- hist(mS3, breaks = 50 , plot = FALSE) # Save first histogram data
hgD <- hist(mS4,breaks = 50, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(100,400), ylim = c(0,8)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(100,400), ylim = c(0,8)) # Add 2nd histogram using different color
plot(hgC, col = rgb(1,0,1,1/10),add = TRUE,xlim = c(100,400), ylim = c(0,8)) # Plot 1st histogram using a transparent color
plot(hgD, col = rgb(0,1,1,1/10), add = TRUE,xlim = c(100,400), ylim = c(0,8)) # Add 2nd histogram using different color
plot(mS1, ylim=range(c(100,400)),col="green")
par(new = TRUE)
plot(mS2, ylim=range(c(100,400)), col="blue")
par(new = TRUE)
plot(mS3, ylim=range(c(100,400)), col="orange")
par(new = TRUE)
plot(mS4, ylim=range(c(100,400)), col="red")
legend(240,2.48,legend=c("D1 8 CRC Stage IV 43 CTL", "D2 23 CC Stage IV 74 CTL No comorbidity", "D2 23 CC Stage IV 71 CTL Comorbidity"),col=c("red","green","blue"),lty=1:1, cex=1.0)
k_d2_ctl1_i1 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2ctl1_individual1.csv')
kd2_ctl1_i[1,] <- k_d2_ctl1_i1[2:500,2]
plot(kd2_ctl1_i[1,])
k_d2_ctl1_i2 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2ctl1_individual2.csv')
kd2_ctl1_i[2,] <- k_d2_ctl1_i2[2:500,2]
plot(kd2_ctl1_i[2,])
# brew a palette with 74 values and pass it to "col" with an index i.
kd2_ctl1_i<-matrix(,nrow=74,ncol=499)
mC1=NULL
maC1=NULL
for(i in 1:74) {# break down the plot of 74 into 4 plots of 18
#i=1
sa_name <- paste0('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2ctl1_individual', i,'.csv')
aux <- read.csv(sa_name)
kd2_ctl1_i[i,] <- aux[2:500,2]
t1<-floor(i/9)
k <- (i-t1*9)
plot(kd2_ctl1_i[i,], ylim=range(c(0,3.8)), col= palette()[k], type="l") # there is something wrong with the colors
print(palette()[k])#print(col2rgb(2*i+50))
mC1[i]<-which.max(kd2_ctl1_i[i,] )
maC1[i]<-max(kd2_ctl1_i[i,] )
par(new = TRUE)
}
print(median(maC1))
print(median(mC1))
dim(k_ctl1_i) # 43 700 # 9 is NA
totKLD_D1_CTL1 <- rowSums(k_ctl1_i[,1:499], dims = 1)# take only 1:499
dim(kd2_ctl1_i) #74 499
totKLD_D2_CTL1 <- rowSums(kd2_ctl1_i, dims = 1)# is it much higher?
hgA <- hist(totKLD_D1_CTL1, breaks = 10, plot = FALSE) # Save first histogram data
hgB <- hist(totKLD_D2_CTL1, breaks = 30, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,1000), ylim = c(0,24)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,1000), ylim = c(0,24)) # Add 2nd histogram using different color
ctl1D2[35,,]
totFRs_D2_CTL1 <- rowSums(ctl1D2)
totFRs_D2_CTL1[35] # 38186686 outlier
mean(totFRs_D2_CTL1)+3*sd(totFRs_D2_CTL1) # 33584769
plot(totFRs_D1_CTL1)
plot(totFRs_D2_CTL1)
shapiro.test(totFRs_D1_CTL1)
shapiro.test(totFRs_D2_CTL1)
totFRs_D2_CTL11<-vector()
totFRs_D2_CTL11[1:34]<-totFRs_D2_CTL1[1:34]
totFRs_D2_CTL11[35:73]<-totFRs_D2_CTL1[36:74]
shapiro.test(totFRs_D2_CTL11)
totKLD_D1_CTL11 <- rowSums(k_ctl1_i, dims = 1)# take only 1:499
plot(totFRs_D1_CTL11)
shapiro.test(totFRs_D1_CTL11)
c_195<-colD2[,,195]
h1_195<-ctl1D2[,,195]
c2M195 <- matrix(c_195, ncol = 574, byrow = 79) #
h2M195 <- matrix(h1_195, ncol = 574, byrow = 74) #
mean(h2M195[,539])/mean(c2M195[,539])# GPX4 1.362526
mean(h2M195[,356])/mean(c2M195[,356]) # VDAC2 1.314116
hgA <- hist(c2M195[,356], breaks = 30 , plot = FALSE) # Save first histogram data
hgB <- hist(h2M195[,356],breaks = 20, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,400), ylim = c(0,10)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,400), ylim = c(0,10)) # Add 2nd histogram using different color
c_205<-colD2[,,205]
h1_205<-ctl1D2[,,205]
c2M205 <- matrix(c_205, ncol = 574, byrow = 79) #
h2M205 <- matrix(h1_205, ncol = 574, byrow = 74) #
mean(h2M205[,539])/mean(c2M205[,539])# GPX4 1.394561
hgA <- hist(c2M205[,363], breaks = 30 , plot = FALSE) # Save first histogram data
hgB <- hist(h2M205[,363],breaks = 20, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,270), ylim = c(0,13)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,270), ylim = c(0,13)) # Add 2nd histogram using different color
# Stage I 205 bp, Stage II 137 bp, Stage III 364 bp, Stage IV 364 bp
c1_205<-col1D2[,,205]
h1_205<-ctl1D2[,,205]
write.csv(c1_205,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col1_fr205.csv')
write.csv(h1_205,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_fr205.csv')
#######################
col277<-colD2[,,277]
h1_277<-ctl1D2[,,277]
cM277 <- matrix(col277, ncol = 574, byrow = 79) #
hM277 <- matrix(h1_277, ncol = 574, byrow = 74) #
hgA <- hist(cM364[,406], breaks = 20 , plot = FALSE) # Save first histogram data
hgB <- hist(hM364[,406],breaks = 40, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,60), ylim = c(0,5)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,60), ylim = c(0,5)) # Add 2nd histogram using different color
##
hgA <- hist(cM277[,426], breaks = 50 , plot = FALSE) # Save first histogram data
hgB <- hist(hM277[,426],breaks = 20, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,25), ylim = c(0,15)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,25), ylim = c(0,15)) # Add 2nd histogram using different color
#######################
col365<-colD2[,,365]
h1_365<-ctl1D2[,,365]
cM365 <- matrix(col365, ncol = 574, byrow = 79) #
hM365 <- matrix(h1_365, ncol = 574, byrow = 74) #
##
hgA <- hist(cM365[,405], breaks = 50 , plot = FALSE) # Save first histogram data
hgB <- hist(hM365[,405],breaks = 50, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,60), ylim = c(0,10)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,60), ylim = c(0,10)) # Add 2nd histogram using different color
c4_137<-col4D2[,,137]
write.csv(c4_137,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col4_fr137.csv')
k137_d2_col4 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col4_fr137_ctl1.csv')
k137d2_col4 <- k137_d2_col4[2:575,2]
plot(k137d2_col4)
h1_277<-ctl1D2[,,277]
write.csv(h1_277,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_fr277.csv')
c4_277<-col4D2[,,277]
cM277 <- matrix(c4_277, ncol = 574, byrow = 23) #
hM277 <- matrix(h1_277, ncol = 574, byrow = 74) #
hgA <- hist(cM277[,426], breaks = 50 , plot = FALSE) # Save first histogram data
hgB <- hist(hM277[,426],breaks = 5, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,88), ylim = c(0,22)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,88), ylim = c(0,22)) # Add 2nd histogram using different color
mean(cM277[,426])/mean(hM277[,426]) #
write.csv(c4_277,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col4_fr277.csv')
k277_d2_col4 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col4_fr277_ctl1.csv')
k277d2_col4 <- k277_d2_col4[2:575,2]
plot(k277d2_col4)
#######################
c2_137<-col2D2[,,137]
h1_137<-ctl1D2[,,137]
cM137 <- matrix(c4_137, ncol = 574, byrow = 23) #
hM137 <- matrix(h1_137, ncol = 574, byrow = 74) #
hgA <- hist(cM137[,555], breaks = 50 , plot = FALSE) # Save first histogram data
hgB <- hist(hM137[,555],breaks = 25, plot = FALSE) # Save 2nd histogram data
plot(hgA, col = rgb(1,0,0,1/10),xlim = c(0,400), ylim = c(0,10)) # Plot 1st histogram using a transparent color
plot(hgB, col = rgb(0,1,0,1/10), add = TRUE,xlim = c(0,400), ylim = c(0,10)) # Add 2nd histogram using different color
mean(cM137[,555])/mean(hM137[,555]) #1.785717
write.csv(c2_137,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col2_fr137.csv')
write.csv(h1_137,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_fr137.csv')
k137_d2_col2 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col2_fr137_ctl1.csv')
k137d2_col2 <- k137_d2_col2[2:575,2]
plot(k137d2_col2)
indx<-which(k137d2_col2>0.33)#12
cv <-matrix(,nrow=30,ncol=length(indx))
cv<-c2_137[,indx]
hv <-matrix(,nrow=74,ncol=length(indx))
hv<-h1_137[,indx]
hcTop20 <- rbind(cv , hv)
df<-scale(hcTop20)
df[is.nan(df)] <- 0
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
hm <- heatmap(df, scale = "none", col = col)
c3_364<-col3D2[,,364]
h1_364<-ctl1D2[,,364]
write.csv(c3_364,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col3_fr364.csv')
write.csv(h1_364,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_ctl1_fr364.csv')
k364_d2_col3 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col3_fr364_ctl1.csv')
k364d2_col3 <- k364_d2_col3[2:575,2]
plot(k364d2_col3)
indx<-which(k364d2_col3>0.08)#12
cv <-matrix(,nrow=18,ncol=length(indx))
cv<-c3_364[,indx]
hv <-matrix(,nrow=74,ncol=length(indx))
hv<-h1_364[,indx]
hcTop20 <- rbind(cv , hv)
df<-scale(hcTop20)
df[is.nan(df)] <- 0
col <- colorRampPalette(brewer.pal(11, "RdYlBu"))(256)
hm <- heatmap(df, scale = "none", col = col)
c4_364<-col3D2[,,364]
write.csv(c4_364,'~/genomedk/matovanalysis/DELFI_analysis/python/delfi2_col4_fr364.csv')
k364_d2_col4 <- read.csv('~/genomedk/matovanalysis/DELFI_analysis/python/KLdivergenceD2col4_fr364_ctl1.csv')
k364d2_col4 <- k364_d2_col4[2:575,2]
plot(k364d2_col4)
indx<-which(k364d2_col4>0.08)#12
cv <-matrix(,nrow=23,ncol=length(indx))
cv<-c4_364[,indx]
hv <-matrix(,nrow=74,ncol=length(indx))
hv<-h1_364[,indx]
hcTop20 <- rbind(cv , hv)