forked from GreenleafLab/MPAL-Single-Cell-2019
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscRNA_scATAC_Integration_04_Link_TFs_To_Targets.R
130 lines (98 loc) · 4.19 KB
/
scRNA_scATAC_Integration_04_Link_TFs_To_Targets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#Clustering and scATAC-seq UMAP for Hematopoiesis data
#06/02/19
#Cite Granja*, Klemm*, Mcginnis* et al.
#A single cell framework for multi-omic analysis of disease identifies
#malignant regulatory signatures in mixed phenotype acute leukemia (2019)
#Created by Jeffrey Granja
library(cicero)
library(data.table)
library(Matrix)
library(GenomicRanges)
library(magrittr)
library(SummarizedExperiment)
library(optparse)
library(yaml)
library(Rcpp)
set.seed(1)
####################################################
#Functions
####################################################
featureName <- function(gr){
paste(seqnames(gr),start(gr),end(gr),sep="_")
}
####################################################
#Input Data
####################################################
diff_scATAC_file <- "Save-scATAC-PromoterNorm-Differentials.rds"
diff_scRNA_file <- "output/scATAC_scRNA/Differential/Differential_Summary.rds"
p2gLinks_file <- "../../../Integration/heme_mpal/Integration/Save_MPAL_P2G_Links-190511.rds"
motif_matches_file <- "../../../scATAC/post/heme_mpal/output/Motif_ArchRMatches.rds"
#Read Inputs
diffObj <- readRDS(diff_scRNA_file)
p2gLinks <- readRDS(p2gLinks_file)$linksSig
matches <- readRDS(motif_matches_file)
diffATAC <- readRDS(diff_scATAC_file)
rownames(matches) <- paste(seqnames(matches),start(matches),end(matches), sep = "_")
rownames(diffATAC) <- paste(seqnames(diffATAC),start(diffATAC),end(diffATAC), sep = "_")
#Make P2G Mats
names(p2gLinks) <- paste0("l",seq_along(p2gLinks))
dAp2g <- diffATAC[featureName(p2gLinks),colnames(diffObj$diffRNA)]
dRp2g <- diffObj$diffRNA[mcols(p2gLinks)$gene_name,]
rownames(dAp2g) <- names(p2gLinks)
rownames(dRp2g) <- names(p2gLinks)
#Identify Significant Peaks and Genes per MPAL Subpopulation
sigMatP2G <- assays(dAp2g)[[1]] >= 0.5 & assays(dAp2g)[[2]] <= 0.05 & assays(dRp2g)[[1]] >= 0.5 & assays(dRp2g)[[2]] <= 0.01
#Which have at least 1 MPAL subpopulation with a linked (within p2glinks) diff peak and diff gene
sigP2G <- p2gLinks[which(rowSums(sigMatP2G) > 0)]
#List of TFs to identify Targerts
tfs <- c("RUNX1_1182")
#Identify Targets
t2gDF <- lapply(seq_along(tfs), function(x){
message(x)
#Subset Linked Peaks that contain TF Motif
peaksx <- names(which(assay(matches[,tfs[x]])[,1]))
#Subset sigMat by linked peaks with TF Motif
sigMatP2Gx <- sigMatP2G[rownames(sigMatP2G) %in% peaksx,]
#Subset links by linked peaks with TF Motif
linksx <- sigP2G[featureName(sigP2G) %in% peaksx]
#Figure out which samples are true for each link
sigMatP2Gx <- sigMatP2G[names(linksx),]
#Compute sum of MPAL subpopulations that are diff peak and diff peak for every peak connected to the gene
sigDFListx <- split(data.frame(sigMatP2Gx), mcols(linksx)$gene_name) %>% lapply(., colSums)
#Convert to Data Frame
sigDFx <- data.frame(Reduce("rbind",sigDFListx))
rownames(sigDFx) <- names(sigDFListx)
#Set to maximum of 1 for each MPAL subpop for all diff peak gene combo
sigDFx[sigDFx > 1] <- 1
#Compute number of positive MPAL subpopulations
nSubpop <- rowSums(sigDFx)
#Summed R2 linkage for each linked positive diff peak gene combo (Max of Cor Healthy Cor Cancer then Squared for each Gene)
maxCor <- pmax(mcols(linksx)$CorrelationHealthy,mcols(linksx)$CorrelationCancer,na.rm=TRUE)
linkageScore <- split(maxCor^2, mcols(linksx)$gene_name) %>%
lapply(., sum) %>%
unlist(use.names=TRUE)
#Return Summary Data Frame
data.frame(TF = tfs[x], Gene = names(nSubpop), N = nSubpop, P = nSubpop / ncol(sigMatP2Gx), linkageScore = linkageScore[names(nSubpop)])
}) %>% Reduce("rbind",.)
#Time to Plot
#Plot TF idx
idx <- "RUNX1_1182"
#Subset TF idx
df <- t2gDF[t2gDF[,1]==idx,]
#Filter Genes with 2 or Less Subpop Diff
df <- df[df$N > 2,]
#Compute Zscore for N Subpop
df$ZN <- scale(df$N)
#Compute Zscore for Linkage Score
df$ZLS <- scale(df$linkageScore)
#Compute Zscore sum for N Subpop and Linkage Score
df$SummedZ <- df$ZN + df$ZLS
#GGPlot2
pdf("results/RUNX1-Targets.pdf")
ggplot(df, aes(N, linkageScore)) + geom_point() + theme_bw() +
ggrepel::geom_label_repel(
data = df[head(order(df$SummedZ,decreasing=TRUE),25),], size = 5,
aes(x=P,y=linkageScore,color=NULL,label=Gene)
)
dev.off()
saveRDS(df, "results/Save-RUNX1-Targets.rds")