diff --git a/.gitignore b/.gitignore index cdd00e67..0c964879 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,6 @@ inst/tmp* # ignore vignette artefacts FraseR.pdf cache -vignettes/*.pdf vignettes/*.tex vignettes/*.txt vignettes/*.log diff --git a/DESCRIPTION b/DESCRIPTION index aa4debec..4e24061a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,14 +1,14 @@ Package: FRASER Type: Package Title: Find RAre Splicing Events in RNA-Seq Data -Version: 1.15.1 +Version: 1.99.3 Date: 2024-02-11 Authors@R: c( person("Christian", "Mertes", role=c("aut", "cre"), email="mertes@in.tum.de", comment=c(ORCID="0000-0002-1091-205X")), person("Ines", "Scheller", role=c("aut"), email="scheller@in.tum.de", comment=c(ORCID="0000-0003-4533-7857")), - person("Karoline", "Lutz", role=c("ctb"), email="lutzk@in.tum.de"), + person("Karoline", "Lutz", role=c("aut"), email="lutzk@in.tum.de"), person("Vicente", "Yepez", role=c("aut"), email="yepez@in.tum.de", comment=c(ORCID="0000-0001-7916-3643")), person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de", @@ -87,9 +87,13 @@ Suggests: covr, TxDb.Hsapiens.UCSC.hg19.knownGene, org.Hs.eg.db, + rtracklayer, + SGSeq, + ggbio, + biovizBase LinkingTo: - Rcpp, - RcppArmadillo + RcppArmadillo, + Rcpp Collate: variables.R getNSetterFuns.R @@ -119,3 +123,4 @@ Collate: fitCorrectionMethods.R plotMethods.R zzz.R + resultAnnotations.R diff --git a/NAMESPACE b/NAMESPACE index 35e2d66a..2c289ef5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export("condition<-") export("currentType<-") export("dontWriteHDF5<-") export("featureExclusionMask<-") +export("fitMetrics<-") export("name<-") export("nonSplicedReads<-") export("pairedEnd<-") @@ -22,12 +23,16 @@ export(K) export(N) export(aberrant) export(addCountsToFraserDataSet) +export(annotateIntronReferenceOverlap) +export(annotatePotentialImpact) export(annotateRanges) export(annotateRangesWithTxDb) +export(availableFDRsubsets) export(bamFile) export(bestQ) export(calculatePSIValues) export(calculatePadjValues) +export(calculatePadjValuesOnSubset) export(calculatePvalues) export(calculateZscore) export(condition) @@ -44,6 +49,8 @@ export(filterExpression) export(filterExpressionAndVariability) export(filterVariability) export(fit) +export(fitMetrics) +export(flagBlacklistRegions) export(getNonSplitReadCountsForAllSamples) export(getSplitReadCountsForAllSamples) export(hyperParams) @@ -60,19 +67,22 @@ export(pVals) export(padjVals) export(pairedEnd) export(plotAberrantPerSample) +export(plotBamCoverage) +export(plotBamCoverageFromResultTable) export(plotCountCorHeatmap) export(plotEncDimSearch) export(plotExpectedVsObservedPsi) export(plotExpression) export(plotFilterExpression) export(plotFilterVariability) +export(plotManhattan) export(plotQQ) +export(plotSpliceMetricRank) export(plotVolcano) export(predictedMeans) export(pseudocount) export(psiTypes) export(results) -export(resultsByGenes) export(rho) export(samples) export(saveFraserDataSet) @@ -98,6 +108,7 @@ exportMethods(assays) exportMethods(bamFile) exportMethods(condition) exportMethods(filterExpression) +exportMethods(filterVariability) exportMethods(length) exportMethods(name) exportMethods(nonSplicedReads) @@ -105,6 +116,7 @@ exportMethods(pairedEnd) exportMethods(plotAberrantPerSample) exportMethods(plotCountCorHeatmap) exportMethods(plotEncDimSearch) +exportMethods(plotManhattan) exportMethods(plotQQ) exportMethods(plotVolcano) exportMethods(results) @@ -158,27 +170,40 @@ importFrom(GenomeInfoDb,seqlengths) importFrom(GenomeInfoDb,seqlevels) 
importFrom(GenomeInfoDb,seqlevelsStyle) importFrom(GenomeInfoDb,seqnames) +importFrom(GenomeInfoDb,sortSeqlevels) importFrom(GenomeInfoDb,standardChromosomes) importFrom(GenomicAlignments,junctions) importFrom(GenomicAlignments,readGAlignmentPairs) importFrom(GenomicAlignments,readGAlignments) importFrom(GenomicAlignments,summarizeJunctions) +importFrom(GenomicFeatures,exons) +importFrom(GenomicFeatures,fiveUTRsByTranscript) importFrom(GenomicFeatures,genes) importFrom(GenomicFeatures,intronsByTranscript) importFrom(GenomicFeatures,makeTxDbFromGFF) +importFrom(GenomicFeatures,seqlevels0) +importFrom(GenomicFeatures,threeUTRsByTranscript) +importFrom(GenomicRanges,"end<-") +importFrom(GenomicRanges,"seqinfo<-") +importFrom(GenomicRanges,"start<-") importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,GRangesList) +importFrom(GenomicRanges,end) importFrom(GenomicRanges,findOverlaps) importFrom(GenomicRanges,granges) importFrom(GenomicRanges,invertStrand) importFrom(GenomicRanges,makeGRangesFromDataFrame) +importFrom(GenomicRanges,start) importFrom(HDF5Array,HDF5Array) importFrom(HDF5Array,loadHDF5SummarizedExperiment) importFrom(HDF5Array,path) importFrom(HDF5Array,saveHDF5SummarizedExperiment) importFrom(HDF5Array,writeHDF5Array) +importFrom(IRanges,"%over%") importFrom(IRanges,IRanges) +importFrom(IRanges,distance) importFrom(IRanges,from) +importFrom(IRanges,nearest) importFrom(IRanges,ranges) importFrom(IRanges,subsetByOverlaps) importFrom(IRanges,to) @@ -204,15 +229,18 @@ importFrom(Rsamtools,scanBamHeader) importFrom(Rsubread,featureCounts) importFrom(S4Vectors,"mcols<-") importFrom(S4Vectors,"metadata<-") +importFrom(S4Vectors,"values<-") importFrom(S4Vectors,DataFrame) importFrom(S4Vectors,Rle) importFrom(S4Vectors,SimpleList) +importFrom(S4Vectors,elementMetadata) importFrom(S4Vectors,end) importFrom(S4Vectors,mcols) importFrom(S4Vectors,metadata) importFrom(S4Vectors,queryHits) importFrom(S4Vectors,start) importFrom(S4Vectors,subjectHits) +importFrom(S4Vectors,values) importFrom(SummarizedExperiment,"assay<-") importFrom(SummarizedExperiment,"assays<-") importFrom(SummarizedExperiment,"colData<-") @@ -236,6 +264,7 @@ importFrom(VGAM,rbetabinom) importFrom(VGAM,vglm) importFrom(biomaRt,getBM) importFrom(biomaRt,useEnsembl) +importFrom(cowplot,background_grid) importFrom(cowplot,theme_cowplot) importFrom(extraDistr,dbbinom) importFrom(extraDistr,pbbinom) @@ -245,6 +274,8 @@ importFrom(ggplot2,aes) importFrom(ggplot2,annotate) importFrom(ggplot2,annotation_logticks) importFrom(ggplot2,element_blank) +importFrom(ggplot2,facet_grid) +importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_abline) importFrom(ggplot2,geom_histogram) importFrom(ggplot2,geom_hline) @@ -253,10 +284,14 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) importFrom(ggplot2,geom_segment) importFrom(ggplot2,geom_smooth) +importFrom(ggplot2,geom_text) importFrom(ggplot2,geom_vline) importFrom(ggplot2,ggplot) importFrom(ggplot2,ggtitle) +importFrom(ggplot2,guide_legend) +importFrom(ggplot2,guides) importFrom(ggplot2,labs) +importFrom(ggplot2,quo_name) importFrom(ggplot2,scale_color_brewer) importFrom(ggplot2,scale_color_discrete) importFrom(ggplot2,scale_color_gradientn) @@ -271,6 +306,7 @@ importFrom(ggplot2,theme_bw) importFrom(ggplot2,xlab) importFrom(ggplot2,xlim) importFrom(ggplot2,ylab) +importFrom(ggplot2,ylim) importFrom(ggrepel,geom_text_repel) importFrom(grDevices,colorRampPalette) importFrom(matrixStats,colAnys) @@ -324,9 +360,11 @@ importFrom(stats,rnbinom) importFrom(stats,rnorm) 
 importFrom(stats,runif)
 importFrom(stats,sd)
+importFrom(tibble,"%>%")
 importFrom(tibble,as_tibble)
 importFrom(tools,file_path_as_absolute)
 importFrom(utils,capture.output)
 importFrom(utils,packageVersion)
+importFrom(utils,tail)
 importMethodsFrom(OUTRIDER,results)
 useDynLib(FRASER)
diff --git a/NEWS b/NEWS
index 15ca2c35..29bb134b 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,32 @@
+CHANGES IN VERSION 1.99.3
+-------------------------
+    o Bugfix for contig names containing certain characters
+
+CHANGES IN VERSION 1.99.2
+-------------------------
+    o Update of the plot functions to support coloring aberrant status based
+      on p values computed on subsets of genes
+
+-------------------------
+    o Major update to FRASER2:
+      o Introduction of new & more robust splice metric Intron Jaccard Index
+      o Only Intron Jaccard Index metric used by default
+      o Improved gene level pvalue calculation and internal storage
+      o Introduction of option to limit FDR correction to user-defined
+        subsets of genes per sample (e.g. OMIM genes with rare variant)
+      o Updated internal pseudocount parameter and default delta Jaccard
+        cutoff
+      o Junction filtering adapted to usage of Intron Jaccard Index metric
+        o Require min expression of N >= 10 in 25% of the samples
+      o Results table:
+        o Functionality to flag outliers in blacklist regions of the genome
+        o Functionality to annotate the predicted type of aberrant splicing
+          (e.g. exon skipping, intron retention etc.)
+      o Several updates in the plotting functions
+        o introduction of manhattan plot functionality
+        o possibility to create sashimi plots to visualize read coverage in
+          the bam files for outliers
+
 CHANGES IN VERSION 1.8.1
 -------------------------
     o Bugfix in merging splicing counts (#41)
diff --git a/R/AllGenerics-definitions.R b/R/AllGenerics-definitions.R
index ced4142a..0015b143 100644
--- a/R/AllGenerics-definitions.R
+++ b/R/AllGenerics-definitions.R
@@ -146,3 +146,14 @@ setGeneric("nonSplicedReads",
 #' @export
 setGeneric("nonSplicedReads<-", signature = "object",
         function(object, value) standardGeneric("nonSplicedReads<-"))
+
+#' @rdname plotFunctions
+#' @export
+setGeneric("plotManhattan", function(object, ...)
+    standardGeneric("plotManhattan"))
+
+#' @rdname filtering
+#' @export
+setGeneric("filterVariability", function(object, ...)
+    standardGeneric("filterVariability"))
+
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 5c6258d6..e00cd10b 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -12,8 +12,8 @@ asFDS <- function(x){
 #'
 #' @title Getter/Setter methods for the FraserDataSet
 #'
-#' @description The following methods are getter and setter methods to extract or set
-#' certain values of a FraserDataSet object.
+#' @description The following methods are getter and setter methods to extract
+#' or set certain values of a FraserDataSet object.
#' #' \code{samples} sets or gets the sample IDs; \code{condition} ; #' \code{} @@ -66,6 +66,9 @@ NULL #' @rdname fds-methods #' @export setMethod("samples", "FraserDataSet", function(object) { + if(!is.null(colnames(object))){ + return(colnames(object)) + } return(as.character(colData(object)[,"sampleID"])) }) @@ -74,6 +77,7 @@ setMethod("samples", "FraserDataSet", function(object) { setReplaceMethod("samples", "FraserDataSet", function(object, value) { colData(object)[,"sampleID"] <- as.character(value) rownames(colData(object)) <- colData(object)[,"sampleID"] + colnames(object) <- as.character(value) validObject(object) return(object) }) @@ -518,12 +522,14 @@ setReplaceMethod("rowRanges", "FraserDataSet", FRASER.rowRanges.replace) #' @examples #' fds <- createTestFraserDataSet() #' -#' counts(fds, type="psi5", side="ofInterest") -#' counts(fds, type="psi5", side="other") +#' counts(fds, side="ofInterest") +#' counts(fds, type="jaccard", side="other") +#' head(K(fds)) +#' head(K(fds, type="psi5")) #' head(K(fds, type="psi3")) #' head(N(fds, type="theta")) #' -setMethod("counts", "FraserDataSet", function(object, type=NULL, +setMethod("counts", "FraserDataSet", function(object, type=currentType(object), side=c("ofInterest", "otherSide")){ side <- match.arg(side) if(side=="ofInterest"){ @@ -539,8 +545,9 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL, # extract psi value from type type <- whichPSIType(type) if(length(type) == 0 | length(type) > 1){ - stop(paste0("Please provide a correct psi type: psi5, psi3, or ", - "theta. Not the given one: '", type, "'.")) + stop(paste0("Please provide a correct psi type: psi5, psi3, ", + "theta or jaccard. Not the given one: '", + type, "'.")) } aname <- paste0("rawOtherCounts_", type) if(!aname %in% assayNames(object)){ @@ -554,7 +561,8 @@ setMethod("counts", "FraserDataSet", function(object, type=NULL, #' setter for count data #' #' @rdname counts -setReplaceMethod("counts", "FraserDataSet", function(object, type=NULL, +setReplaceMethod("counts", "FraserDataSet", function(object, + type=currentType(object), side=c("ofInterest", "otherSide"), ..., value){ side <- match.arg(side) @@ -581,150 +589,290 @@ setAs("DataFrame", "matrix", function(from){ as.matrix(as(from, "data.table")) }) #' -#' retrieve a single sample result object -#' @noRd -resultsSingleSample <- function(sampleID, gr, pvals, padjs, zscores, psivals, - rawCts, rawTotalCts, deltaPsiVals, muPsi, psiType, fdrCut, - zscoreCut, dPsiCut, rowMeansK, rowMeansN, minCount, - additionalColumns){ - - zscore <- zscores[,sampleID] - dpsi <- deltaPsiVals[,sampleID] - pval <- pvals[,sampleID] - padj <- padjs[,sampleID] - - goodCut <- !logical(length(zscore)) - if(!is.na(zscoreCut)){ - goodCut <- goodCut & na2default(abs(zscore) >= zscoreCut, TRUE) - } - if(!is.na(dPsiCut)){ - goodCut <- goodCut & na2default(abs(dpsi) >= dPsiCut, TRUE) - } - if(!is.na(fdrCut)){ - goodCut <- goodCut & na2false(padj <= fdrCut) +#' Mapping of chromosome names +#' +#' @param fds FraserDataSet +#' @param style The style of the chromosome names. +#' @param ... Further parameters. For mapSeqLevels: further parameters +#' passed to GenomeInfoDb::mapSeqlevels(). 
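For orientation, a minimal usage sketch of the mapSeqlevels() helper documented above (illustrative only, not part of the patch; it assumes the small test dataset shipped with the package):

    library(FRASER)
    # build the bundled example dataset, then convert its contig names to
    # UCSC style ("1" -> "chr1"); contigs that cannot be mapped are dropped,
    # as described in the parameter documentation above
    fds <- createTestFraserDataSet()
    fds <- mapSeqlevels(fds, style="UCSC")
    seqlevels(fds)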
+#' +#' @rdname fds-methods +#' @export +mapSeqlevels <- function(fds, style="UCSC", ...){ + + mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...)) + # fix missing names() when fds has only a single chromosome + if(is.null(names(mappings))){ + names(mappings) <- seqlevels(fds) } - if(!is.na(minCount)){ - goodCut <- goodCut & rawTotalCts[,sampleID] >= minCount + + if(length(mappings) != length(seqlevels(fds))){ + message(date(), ": Drop non standard chromosomes for compatibility.") + fds <- keepStandardChromosomes(fds) + nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds)) + validObject(fds) } + fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)] + + seqlevels(fds) <- as.vector(mappings) + seqlevels(nonSplicedReads(fds)) <- as.vector(mappings) + + return(fds) +} +#' +#' retrieve a single sample result object +#' @noRd +resultsSingleSample <- function(sampleID, gr, pvals, padjs, + psivals, rawCts, rawTotalCts, rawNonsplitCts, + rawNsProportion, nsProportion_99quantile, + deltaPsiVals, psiType, rowMeansK, rowMeansN, + aberrant, aggregate, rho, + pvalsGene=NULL, padjsGene=NULL, + aberrantGene, additionalColumns, + geneColumn="hgnc_symbol"){ + mcols(gr)$idx <- seq_along(gr) + # if gene level results, find the most aberrant junction per gene first + if(isTRUE(aggregate)){ + goodGenes <- rownames(aberrantGene)[aberrantGene[,sampleID] & + !is.na(aberrantGene[,sampleID])] + geneJunctions <- findJunctionsForAberrantGenes(gr=gr, + aberrantGenes=goodGenes, + pvals=pvals[,sampleID], + dpsi=deltaPsiVals[,sampleID], + geneColumn=geneColumn, + aberrantJunctions=aberrant[,sampleID]) + goodCut <- rep(FALSE, nrow(pvals)) + goodCut[geneJunctions] <- TRUE + } else{ + goodCut <- aberrant[,sampleID] + } + ans <- granges(gr[goodCut]) - + if(!any(goodCut)){ return(ans) } - - if(!"hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(gr)$hgnc_symbol <- NA_character_ + mcols(ans)$idx <- mcols(gr)$idx[goodCut] + + if(!geneColumn %in% colnames(mcols(gr))){ + mcols(gr)[,geneColumn] <- NA_character_ } - + # extract data mcols(ans)$sampleID <- Rle(sampleID) if("hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])$hgnc_symbol) - } - if("other_hgnc_symbol" %in% colnames(mcols(gr))){ - mcols(ans)$addHgncSymbols <- Rle(mcols(gr[goodCut])$other_hgnc_symbol) + mcols(ans)$hgncSymbol <- Rle(mcols(gr[goodCut])[,geneColumn]) } + mcols(ans)$type <- Rle(psiType) - mcols(ans)$pValue <- signif(pval[goodCut], 5) - mcols(ans)$padjust <- signif(padj[goodCut], 5) - mcols(ans)$zScore <- Rle(round(zscore[goodCut], 2)) + mcols(ans)$pValue <- signif(pvals[goodCut,sampleID], 5) + mcols(ans)$padjust <- signif(padjs[goodCut,sampleID], 5) mcols(ans)$psiValue <- Rle(round(psivals[goodCut,sampleID], 2)) - mcols(ans)$deltaPsi <- Rle(round(dpsi[goodCut], 2)) - mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2)) - mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2)) + mcols(ans)$deltaPsi <- round(deltaPsiVals[goodCut,sampleID], 2) mcols(ans)$counts <- Rle(rawCts[goodCut, sampleID]) mcols(ans)$totalCounts <- Rle(rawTotalCts[goodCut, sampleID]) + mcols(ans)$meanCounts <- Rle(round(rowMeansK[goodCut], 2)) + mcols(ans)$meanTotalCounts <- Rle(round(rowMeansN[goodCut], 2)) + + if(psiType == "jaccard"){ + mcols(ans)$nonsplitCounts <- + Rle(round(rawNonsplitCts[goodCut, sampleID], 2)) + mcols(ans)$nonsplitProportion <- + Rle(round(rawNsProportion[goodCut, sampleID], 2)) + mcols(ans)$nonsplitProportion_99quantile <- + Rle(round(nsProportion_99quantile[goodCut], 2)) + 
} if(!is.null(additionalColumns)){ for(column in additionalColumns){ mcols(ans)[,column] <- Rle(mcols(gr[goodCut])[,column]) } } - - return(ans[order(mcols(ans)$pValue)]) + + if(isTRUE(aggregate)){ + # report junction more than once if it is significant for several genes + nrGenesPerJunction <- table(geneJunctions) + ans <- rep(ans, nrGenesPerJunction[as.character(mcols(ans)$idx)]) + mcols(ans)$hgncSymbol <- + as.data.table(ans)[, names(geneJunctions)[geneJunctions == idx], + by = eval(colnames(mcols(ans)))][,V1] + + # add gene level pvalue + mcols(ans)$pValueGene <- + signif(pvalsGene[mcols(ans)$hgncSymbol,sampleID], 5) + mcols(ans)$padjustGene <- + signif(padjsGene[mcols(ans)$hgncSymbol,sampleID], 5) + mcols(ans)$hgncSymbol <- Rle(mcols(ans)$hgncSymbol) + } + + # remove helper column + mcols(ans)$idx <- NULL + + + return(ans[order(mcols(ans)$pValue, -abs(mcols(ans)$deltaPsi))]) } -FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, - dPsiCutoff, psiType, BPPARAM=bpparam(), maxCols=20, - minCount, additionalColumns=NULL){ - - # check input - checkNaAndRange(fdrCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(dPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(zscoreCutoff, min=0, max=100, scalar=TRUE, na.ok=TRUE) - checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) - +FRASER.results <- function(object, sampleIDs, fdrCutoff, + dPsiCutoff, minCount, rhoCutoff, psiType, + maxCols=20, aggregate=FALSE, collapse=FALSE, + geneColumn="hgnc_symbol", BPPARAM=bpparam(), + subsetName=NULL, all=all, additionalColumns=NULL){ + stopifnot(is(object, "FraserDataSet")) stopifnot(all(sampleIDs %in% samples(object))) - + + if("annotatedJunction" %in% colnames(mcols(object, type="j")) && + !("annotatedJunction" %in% additionalColumns)){ + additionalColumns <- c(additionalColumns, "annotatedJunction") + } + + # only extract results for requested psiTypes if pvals exist for them + stopifnot(all(psiType %in% psiTypes)) + if(is.na(rhoCutoff)){ + rhoCutoff <- 1 + } + pvalsAvailable <- checkPadjAvailableForFilters(object, type=psiType, + filters=list(rho=rhoCutoff), + aggregate=aggregate, + subsetName=subsetName) + psiType <- psiType[pvalsAvailable] + if(all(isFALSE(pvalsAvailable))){ + stop("For the splice metric(s), pvalues are not yet computed. 
\n", + "Please compute them first by running the ", + "calculatePadjValues function.") + } + resultsls <- bplapply(psiType, BPPARAM=BPPARAM, function(type){ - message(date(), ": Collecting results for: ", type) + message(date(), ": Collecting results for: ", type, + ifelse(is.null(subsetName), " (transcriptome-wide)", + paste0(" (", subsetName, ")"))) currentType(object) <- type gr <- rowRanges(object, type=type) - + # first get row means rowMeansK <- rowMeans(K(object, type=type)) rowMeansN <- rowMeans(N(object, type=type)) - + + # get proportion of nonsplitCounts among all counts (N) for each intron + if(type == "jaccard"){ + rawNonsplitCts <- as.matrix(assay(object, "rawCountsJnonsplit")) + rawNsProportion <- rawNonsplitCts / as.matrix(N(object)) + nsProportion_99quantile <- + rowQuantiles(rawNsProportion, probs=0.99) + } else{ + rawNonsplitCts <- NULL + rawNsProportion <- NULL + nsProportion_99quantile <- NULL + } + # then iterate by chunk chunkCols <- getMaxChunks2Read(fds=object, assayName=type, max=maxCols) sampleChunks <- getSamplesByChunk(fds=object, sampleIDs=sampleIDs, - chunkSize=chunkCols) - + chunkSize=chunkCols) + ans <- lapply(seq_along(sampleChunks), function(idx){ message(date(), ": Process chunk: ", idx, " for: ", type) sc <- sampleChunks[[idx]] tmp_x <- object[,sc] - + # extract values rawCts <- as.matrix(K(tmp_x)) rawTotalCts <- as.matrix(N(tmp_x)) - pvals <- as.matrix(pVals(tmp_x)) - padjs <- as.matrix(padjVals(tmp_x)) - zscores <- as.matrix(zScores(tmp_x)) + pvals <- as.matrix(pVals(tmp_x, + filters=list(rho=rhoCutoff))) + padjs <- as.matrix(padjVals(tmp_x, + subsetName=subsetName, + filters=list(rho=rhoCutoff))) psivals <- as.matrix(assay(tmp_x, type)) muPsi <- as.matrix(predictedMeans(tmp_x)) psivals_pc <- (rawCts + pseudocount()) / - (rawTotalCts + 2*pseudocount()) - deltaPsiVals <- psivals_pc - muPsi - + (rawTotalCts + 2*pseudocount()) + deltaPsiVals <- deltaPsiValue(tmp_x, type) + rho <- rho(tmp_x, type) + aberrant <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=ifelse(isTRUE(aggregate), + NA, fdrCutoff), + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=FALSE, + all=all, + geneColumn=geneColumn, + subsetName=subsetName) + if(isTRUE(aggregate)){ + pvalsGene <- as.matrix(pVals(tmp_x, level="gene", + filters=list(rho=rhoCutoff))) + padjsGene <- as.matrix(padjVals(tmp_x, level="gene", + subsetName=subsetName, + filters=list(rho=rhoCutoff))) + aberrantGene <- aberrant.FRASER(tmp_x, type=type, + padjCutoff=fdrCutoff, + deltaPsiCutoff=dPsiCutoff, + minCount=minCount, + rhoCutoff=rhoCutoff, + aggregate=TRUE, + all=all, + geneColumn=geneColumn, + subsetName=subsetName) + } else{ + pvalsGene <- NULL + padjsGene <- NULL + aberrantGene <- NULL + } + if(length(sc) == 1){ colnames(pvals) <- sc colnames(padjs) <- sc - colnames(zscores) <- sc colnames(deltaPsiVals) <- sc } # create result table sampleRes <- lapply(sc, - resultsSingleSample, gr=gr, pvals=pvals, padjs=padjs, - zscores=zscores, psiType=type, psivals=psivals, - deltaPsiVals=deltaPsiVals, muPsi=muPsi, rawCts=rawCts, - rawTotalCts=rawTotalCts, fdrCut=fdrCutoff, - zscoreCut=zscoreCutoff, dPsiCut=dPsiCutoff, - rowMeansK=rowMeansK, rowMeansN=rowMeansN, - minCount=minCount, additionalColumns=additionalColumns) - + resultsSingleSample, gr=gr, pvals=pvals, + padjs=padjs, psiType=type, + psivals=psivals, deltaPsiVals=deltaPsiVals, + rawCts=rawCts, rawTotalCts=rawTotalCts, + rawNonsplitCts=rawNonsplitCts[,sc,drop=FALSE], + rawNsProportion=rawNsProportion[,sc,drop=FALSE], + 
nsProportion_99quantile=nsProportion_99quantile, + rowMeansK=rowMeansK, rowMeansN=rowMeansN, + aberrant=aberrant, aggregate=aggregate, + rho=rho, geneColumn=geneColumn, + pvalsGene=pvalsGene, padjsGene=padjsGene, + aberrantGene=aberrantGene, + additionalColumns=additionalColumns) + # return combined result return(unlist(GRangesList(sampleRes))) }) - + unlist(GRangesList(ans)) }) - + # merge results ans <- unlist(GRangesList(resultsls)) - + # sort it if existing if(length(ans) > 0){ ans <- ans[order(ans$pValue)] + if(is.null(subsetName)){ + mcols(ans)[["FDR_set"]] <- "transcriptome-wide" + } else{ + mcols(ans)[["FDR_set"]] <- subsetName + } } - + + # collapse into one row per gene if requested + if(isTRUE(aggregate) && isTRUE(collapse)){ + ans <- collapseResTablePerGene(ans) + } + # return only the results return(ans) } + #' #' Extracting results and aberrant splicing events #' @@ -736,31 +884,45 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' @param sampleIDs A vector of sample IDs for which results should be #' retrieved #' @param padjCutoff The FDR cutoff to be applied or NA if not requested. -#' @param zScoreCutoff The z-score cutoff to be applied or NA if not requested. #' @param deltaPsiCutoff The cutoff on delta psi or NA if not requested. #' @param minCount The minimum count value of the total coverage of an intron #' to be considered as significant. #' result +#' @param rhoCutoff The cutoff value on the fitted rho value +#' (overdispersion parameter of the betabinomial) above which +#' junctions are filtered #' @param psiType The psi types for which the results should be retrieved. #' @param additionalColumns Character vector containing the names of additional #' columns from mcols(fds) that should appear in the result table #' (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns #' are included. #' @param BPPARAM The BiocParallel parameter. -#' @param res Result as created with \code{results()} -#' @param geneColumn The name of the column in \code{mcols(res)} that contains -#' the gene symbols. -#' @param method The p.adjust method that is being used to adjust p values per -#' sample. #' @param type Splicing type (psi5, psi3 or theta) #' @param by By default \code{none} which means no grouping. But if #' \code{sample} or \code{feature} is specified the sum by #' sample or feature is returned -#' @param aggregate If TRUE the returned object is based on the grouped -#' features -#' @param ... Further arguments can be passed to the method. If "zscores", -#' "padjVals" or "dPsi" is given, the values of those arguments -#' are used to define the aberrant events. +#' @param aggregate If TRUE the returned object is aggregated to the feature +#' level (i.e. gene level). +#' @param collapse Only takes effect if \code{aggregate=TRUE}. +#' If TRUE, collapses results across the different psi +#' types to return only one row per feature (gene) and sample. +#' @param geneColumn The column name of the column that has the gene annotation +#' that will be used for gene-level pvalue computation. +#' @param all By default FALSE, only significant introns (or genes) are listed +#' in the results. If TRUE, results are assembled for all +#' samples and introns/genes regardless of significance. 
+#' @param returnTranscriptomewideResults If FDR corrected pvalues for subsets +#' of genes of interest have been calculated, this parameter +#' indicates whether additionally the transcriptome-wide results +#' should be returned as well (default), or whether only results +#' for those subsets should be retrieved. +#' @param subsetName The name of a subset of genes of interest for which FDR +#' corrected pvalues were previously computed. Those FDR values +#' on the subset will then be used to determine aberrant status. +#' Default is NULL (using transcriptome-wide FDR corrected pvalues). +#' @param ... Further arguments can be passed to the method. If "n", +#' "padjVals", "dPsi" or "rhoVals" are given, the values of those +#' arguments are used to define the aberrant events. #' #' @return For \code{results}: GRanges object containing significant results. #' For \code{aberrant}: Either a of logical values of size @@ -773,186 +935,216 @@ FRASER.results <- function(object, sampleIDs, fdrCutoff, zscoreCutoff, #' # get data, fit and compute p-values and z-scores #' fds <- createTestFraserDataSet() #' -#' # extract results: for this example dataset, z score cutoff of 2 is used to -#' # get at least one result and show the output -#' res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +#' # extract results: for this example dataset, no cutoffs are used to +#' # show the output of the results function +#' res <- results(fds, all=TRUE) #' res #' #' # aggregate the results by genes (gene symbols need to be annotated first #' # using annotateRanges() function) -#' resultsByGenes(res) +#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE) +#' +#' # aggregate the results by genes and collapse over all psi types to obtain +#' # only one row per gene in the results table +#' results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, +#' collapse=TRUE) #' #' # get aberrant events per sample: on the example data, nothing is aberrant #' # based on the adjusted p-value -#' aberrant(fds, type="psi5", by="sample") +#' aberrant(fds, type="jaccard", by="sample") #' #' # get aberrant events per gene (first annotate gene symbols) #' fds <- annotateRangesWithTxDb(fds) -#' aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA, -#' aggregate=TRUE) +#' aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE) #' #' # find aberrant junctions/splice sites -#' aberrant(fds, type="psi5") -#' @export -setMethod("results", "FraserDataSet", function(object, - sampleIDs=samples(object), padjCutoff=0.05, - zScoreCutoff=NA, deltaPsiCutoff=0.3, - minCount=5, psiType=c("psi3", "psi5", "theta"), - additionalColumns=NULL, BPPARAM=bpparam(), ...){ - FRASER.results(object=object, sampleIDs=sampleIDs, fdrCutoff=padjCutoff, - zscoreCutoff=zScoreCutoff, dPsiCutoff=deltaPsiCutoff, - minCount=minCount, psiType=match.arg(psiType, several.ok=TRUE), - additionalColumns=additionalColumns, BPPARAM=BPPARAM) -}) - -#' @rdname results -#' @export -resultsByGenes <- function(res, geneColumn="hgncSymbol", method="BY"){ - # sort by pvalue - res <- res[order(res$pValue)] - - # extract subset - if(is(res, "GRanges")){ - ans <- as.data.table(mcols(res)[,c(geneColumn, "pValue", "sampleID")]) - colnames(ans) <- c("features", "pval", "sampleID") - } else { - ans <- featureNames <- res[,.( - features=get(geneColumn), pval=pValue, sampleID=sampleID)] - } - - # remove NAs - naIdx <- ans[,is.na(features)] - ansNoNA <- ans[!is.na(features)] - - # compute pvalues by gene - 
ansNoNA[,pByFeature:=min(p.adjust(pval, method="holm")), - by="sampleID,features"] - - # subset to lowest pvalue by gene - dupIdx <- duplicated(ansNoNA[,.(features,sampleID)]) - ansGenes <- ansNoNA[!dupIdx] - - # compute FDR - ansGenes[,fdrByFeature:=p.adjust(pByFeature, method=method), - by="sampleID"] - - # get final result table - finalAns <- res[!naIdx][!dupIdx] - finalAns$pValueGene <- ansGenes$pByFeature - finalAns$padjustGene <- ansGenes$fdrByFeature - finalAns -} - -#' -#' Mapping of chromosome names -#' -#' @param fds FraserDataSet -#' @param style The style of the chromosome names. -#' @param ... Further parameters. For mapSeqLevels: further parameters -#' passed to GenomeInfoDb::mapSeqlevels(). +#' aberrant(fds, type="jaccard") +#' +#' # retrieve results limiting FDR correction to only a subset of genes +#' # first, we need to create a list of genes per sample that will be tested +#' geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1")) +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("exampleSubset"=geneList)) +#' results(fds, all=TRUE, returnTranscriptomewideResults=FALSE) #' -#' @rdname fds-methods #' @export -mapSeqlevels <- function(fds, style="UCSC", ...){ - - mappings <- na.omit(GenomeInfoDb::mapSeqlevels(seqlevels(fds), style, ...)) - # fix missing names() when fds has only a single chromosome - if(is.null(names(mappings))){ - names(mappings) <- seqlevels(fds) +setMethod("results", "FraserDataSet", function(object, + sampleIDs=samples(object), padjCutoff=0.1, + deltaPsiCutoff=0.1, + rhoCutoff=NA, aggregate=FALSE, collapse=FALSE, + minCount=5, psiType=psiTypes, + geneColumn="hgnc_symbol", all=FALSE, + returnTranscriptomewideResults=TRUE, + additionalColumns=NULL, BPPARAM=bpparam()){ + psiType <- match.arg(psiType, several.ok=TRUE) + FDRsets <- availableFDRsubsets(object) + + if(isFALSE(returnTranscriptomewideResults) && is.null(FDRsets)){ + warning("Retrieving transcriptome-wide results as no other ", + "FDR subsets are available in the fds object.") + returnTranscriptomewideResults <- TRUE + } + if(isTRUE(returnTranscriptomewideResults)){ + res <- FRASER.results(object=object, sampleIDs=sampleIDs, + fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, + rhoCutoff=rhoCutoff, minCount=minCount, + psiType=psiType, all=all, + aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, + subsetName=NULL, additionalColumns=additionalColumns, + BPPARAM=BPPARAM) } - - if(length(mappings) != length(seqlevels(fds))){ - message(date(), ": Drop non standard chromosomes for compatibility.") - fds <- keepStandardChromosomes(fds) - nonSplicedReads(fds) <- keepStandardChromosomes(nonSplicedReads(fds)) - validObject(fds) + + # add results for FDR_subsets if requested + if(!is.null(FDRsets)){ + resls_subsets <- lapply(FDRsets, function(setName){ + res_sub <- FRASER.results(object=object, sampleIDs=sampleIDs, + fdrCutoff=padjCutoff, dPsiCutoff=deltaPsiCutoff, + rhoCutoff=rhoCutoff, minCount=minCount, + psiType=psiType, all=all, + aggregate=aggregate, collapse=collapse, geneColumn=geneColumn, + subsetName=setName, additionalColumns=additionalColumns, + BPPARAM=BPPARAM) + }) + + if(isTRUE(returnTranscriptomewideResults)){ + res <- unlist(GRangesList(unlist(list(res, resls_subsets)))) + } else{ + res <- unlist(GRangesList(unlist(resls_subsets))) + } + + # sort it if existing + if(length(res) > 0){ + res <- res[order(res$pValue)] + if(isTRUE(aggregate)){ + res <- res[!is.na(res$pValueGene)] + } + } } - fds <- fds[as.vector(seqnames(fds)) %in% names(mappings)] - - 
seqlevels(fds) <- as.vector(mappings) - seqlevels(nonSplicedReads(fds)) <- as.vector(mappings) - - return(fds) -} - - -aberrant.FRASER <- function(object, type=currentType(object), padjCutoff=0.05, - deltaPsiCutoff=0.3, zScoreCutoff=NA, minCount=5, - by=c("none", "sample", "feature"), aggregate=FALSE, ...){ + return(res) +}) - checkNaAndRange(zScoreCutoff, min=0, max=Inf, na.ok=TRUE) - checkNaAndRange(padjCutoff, min=0, max=1, na.ok=TRUE) - checkNaAndRange(deltaPsiCutoff, min=0, max=1, na.ok=TRUE) +aberrant.FRASER <- function(object, type=fitMetrics(object), + padjCutoff=0.1, deltaPsiCutoff=0.1, + minCount=5, rhoCutoff=NA, + by=c("none", "sample", "feature"), + aggregate=FALSE, geneColumn="hgnc_symbol", + subsetName=NULL, all=FALSE, ...){ + + checkNaAndRange(padjCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(deltaPsiCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(rhoCutoff, min=0, max=1, scalar=TRUE, na.ok=TRUE) + checkNaAndRange(minCount, min=0, max=Inf, scalar=TRUE, na.ok=TRUE) by <- match.arg(by) - + type <- match.arg(type) + + if(is.na(rhoCutoff)){ + rhoCutoff <- 1 + } + dots <- list(...) if("n" %in% names(dots)){ n <- dots[['n']] } else { n <- N(object, type=type) } - if("zscores" %in% names(dots)){ - zscores <- dots[['zscores']] - } else { - zscores <- zScores(object, type=type) - } if("padjVals" %in% names(dots)){ padj <- dots[['padjVals']] } else { - padj <- padjVals(object, type=type) + # check if padj values are available for the given filters + pvalsAvailable <- checkPadjAvailableForFilters(object, type=type, + filters=list(rho=rhoCutoff), + aggregate=aggregate, + subsetName=subsetName) + if(isFALSE(pvalsAvailable)){ + stop("For the given filters, pvalues are not yet computed. \n", + "Please compute them first by running the ", + "calculatePadjValues function with the requested filters.") + } + padj <- padjVals(object, type=type, level="site", subsetName=subsetName, + filters=list(rho=rhoCutoff)) } if("dPsi" %in% names(dots)){ dpsi <- dots[['dPsi']] } else { dpsi <- deltaPsiValue(object, type=type) } - - - # create cutoff matrix - goodCutoff <- matrix(TRUE, nrow=nrow(zscores), ncol=ncol(zscores), - dimnames=dimnames(zscores)) - if("hgnc_symbol" %in% colnames(mcols(object, type=type)) & - nrow(mcols(object, type=type)) == nrow(goodCutoff)){ - rownames(goodCutoff) <- mcols(object, type=type)[,"hgnc_symbol"] - } else if(isTRUE(aggregate)){ - stop("Please provide hgnc symbols to compute gene p values!") + if("rhoVals" %in% names(dots)){ + rho <- dots[['rhoVals']] + } else { + rho <- matrix(rho(object, type=type), + nrow=nrow(dpsi), ncol=ncol(dpsi)) + } + if(isTRUE(aggregate)){ + if("padjGeneVals" %in% names(dots)){ + padj_gene <- dots[['padjGeneVals']] + } else{ + padj_gene <- padjVals(object, type=type, level="gene", + subsetName=subsetName, + filters=list(rho=rhoCutoff)) + } + } - # check each cutoff if in use (not NA) - if(!is.na(minCount)){ - goodCutoff <- goodCutoff & as.matrix(n >= minCount) - } - if(!is.na(zScoreCutoff)){ - goodCutoff <- goodCutoff & as.matrix(abs(zscores) >= zScoreCutoff) - } - if(!is.na(deltaPsiCutoff)){ - goodCutoff <- goodCutoff & as.matrix(abs(dpsi) >= deltaPsiCutoff) + if(is.na(padjCutoff)){ + padjCutoff <- 1 } - if(!is.na(padjCutoff)){ - goodCutoff <- goodCutoff & as.matrix(padj <= padjCutoff) - } - goodCutoff[is.na(goodCutoff)] <- FALSE - - # check if we should go for aggregation - # TODO to speed it up we only use any hit within a feature - # but should do a holm's + BY correction per gene and genome wide 
if(isTRUE(aggregate)){ - goodCutoff <- as.matrix(data.table(goodCutoff, keep.rownames=TRUE)[, - as.data.table(t(colAnys(as.matrix(.SD)))), by=rn][,-1]) - rownames(goodCutoff) <- unique(mcols(object, type=type)[,"hgnc_symbol"]) - colnames(goodCutoff) <- colnames(zscores) + padjCutoffGene <- padjCutoff + padjCutoff <- 1 } - # return results - if(by == "feature"){ - return(rowSums(goodCutoff)) + if(isTRUE(all)){ + aberrantEvents <- matrix(TRUE, nrow=nrow(object), ncol=ncol(object)) + colnames(aberrantEvents) <- colnames(object) + } else{ + aberrantEvents <- as.matrix(padj) <= padjCutoff + + # check each cutoff if in use (not NA) + if(!is.na(minCount)){ + aberrantEvents <- aberrantEvents & as.matrix(n >= minCount) + } + if(!is.na(deltaPsiCutoff)){ + aberrantEvents <- aberrantEvents & + as.matrix(abs(dpsi) >= deltaPsiCutoff) + } + if(!is.na(rhoCutoff)){ + aberrantEvents <- aberrantEvents & as.matrix(rho <= rhoCutoff) + } + aberrantEvents[is.na(aberrantEvents)] <- FALSE } - if(by == "sample"){ - return(colSums(goodCutoff)) + + if(isTRUE(aggregate)){ + if(is.null(rownames(padj_gene))){ + stop("Missing rownames for gene-level padj values.") + } + # reduce aberrant matrix to one row per gene + # (TRUE if any junction is aberrant for each sample) + ab_dt <- data.table(geneID=getGeneIDs(object, type=type, unique=FALSE, + geneColumn=geneColumn), + aberrantEvents) + ab_dt[, dt_idx:=seq_len(.N)] + dt_tmp <- ab_dt[!is.na(geneID), splitGenes(geneID), by="dt_idx"] + ab_dt <- ab_dt[dt_tmp$dt_idx] + ab_dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] + ab_dt <- ab_dt[,lapply(.SD, any), by="geneID"] + aberrantEvents <- as.matrix(ab_dt[,-1]) + rownames(aberrantEvents) <- ab_dt[,geneID] + + if(isFALSE(all)){ + aberrantEvents <- aberrantEvents & as.matrix( + padj_gene[rownames(aberrantEvents),colnames(aberrantEvents)] + ) <= padjCutoffGene + } } - return(goodCutoff) + + return(switch(match.arg(by), + none = aberrantEvents, + sample = colSums(aberrantEvents, na.rm=TRUE), + feature = rowSums(aberrantEvents, na.rm=TRUE) + )) } #' @rdname results #' @export setMethod("aberrant", "FraserDataSet", aberrant.FRASER) + diff --git a/R/FRASER-package.R b/R/FRASER-package.R index 87e5ad96..9cb5b30b 100644 --- a/R/FRASER-package.R +++ b/R/FRASER-package.R @@ -22,15 +22,18 @@ #' ### GRange/Experiment/bamFile packages #' @importFrom BiocGenerics updateObject counts counts<- strand strand<- which -#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes +#' @importFrom GenomicFeatures makeTxDbFromGFF intronsByTranscript genes exons +#' fiveUTRsByTranscript threeUTRsByTranscript seqlevels0 #' @importFrom GenomicAlignments junctions readGAlignments summarizeJunctions #' readGAlignmentPairs #' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames #' colData colData<- rowData rowRanges rowRanges<- SummarizedExperiment #' rbind Assays #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList -#' makeGRangesFromDataFrame invertStrand -#' @importFrom IRanges subsetByOverlaps from to IRanges ranges +#' makeGRangesFromDataFrame invertStrand start end start<- end<- +#' seqinfo<- +#' @importFrom IRanges subsetByOverlaps from to IRanges ranges nearest distance +#' %over% #' @importFrom Rsamtools ScanBamParam scanBamHeader bamMapqFilter #' bamWhich bamWhich<- BamFile idxstatsBam #' @importFrom Rsubread featureCounts @@ -41,16 +44,14 @@ #' @importFrom biomaRt useEnsembl getBM #' @importFrom AnnotationDbi select #' -#' ### Plotting #' #' @importFrom plotly plot_ly subplot layout add_trace ggplotly #' 
@importFrom pheatmap pheatmap #' @importFrom RColorBrewer brewer.pal -#' @importFrom cowplot theme_cowplot +#' @importFrom cowplot theme_cowplot background_grid #' @importFrom ggrepel geom_text_repel #' -#' ### Data handling #' #' @importFrom HDF5Array writeHDF5Array path HDF5Array @@ -74,18 +75,20 @@ #' @importFrom R.utils renameFile withTimeout #' @importFrom tools file_path_as_absolute #' @importFrom methods as callNextMethod is new show slot slot<- validObject -#' @importFrom utils capture.output packageVersion +#' @importFrom utils capture.output packageVersion tail #' #' #' ### To be added into the functions above #' #' @importFrom S4Vectors DataFrame metadata Rle SimpleList mcols mcols<- -#' start end metadata metadata<- subjectHits queryHits +#' start end metadata metadata<- subjectHits queryHits elementMetadata +#' values values<- #' @importFrom grDevices colorRampPalette #' @importFrom GenomeInfoDb keepStandardChromosomes seqlevels<- seqlevels #' seqlengths seqlengths<- seqlevelsStyle<- seqlevelsStyle seqnames -#' seqinfo standardChromosomes dropSeqlevels keepSeqlevels +#' seqinfo standardChromosomes dropSeqlevels keepSeqlevels +#' sortSeqlevels #' @importFrom DelayedArray rowMaxs rowMeans path<- cbind plogis qlogis #' DelayedArray #' @importFrom DelayedMatrixStats colSds rowMedians rowSds colMeans2 rowMeans2 @@ -100,9 +103,10 @@ #' scale_y_log10 scale_color_gradientn labs theme_bw theme #' scale_color_brewer scale_color_discrete scale_linetype_manual #' annotate geom_histogram scale_fill_manual xlim scale_colour_manual -#' element_blank annotation_logticks +#' element_blank annotation_logticks ylim quo_name facet_grid +#' facet_wrap geom_text guides guide_legend #' -#' @importFrom tibble as_tibble +#' @importFrom tibble as_tibble %>% #' #' @useDynLib FRASER #' @@ -128,5 +132,12 @@ globalVariables(c(".", "J", ".N", ".asDataFrame", "End", "first_feature", "model", "mu", "n", ",nsubset", "o3", "o5", "obsPsi", "os", "pa", "padj", "passed", "pByFeature", "pointNr", "predPsi", "psi3", "psi5", "psiType", "psiValue", "seqlength", "seqlevel", "Step", "traceNr", - "uniqueID", "V1", "value", "zscore", "maxDTheta"), + "uniqueID", "V1", "value", "zscore", "maxDTheta", "par", "genes_donor", + "genes_acceptor", "gene_pval", "gene_padj", "dt_idx", + "blacklist", "potentialImpact", "causesFrameshift", "annotatedJunction", + "distNearestGene", "UTR_overlap", "meanCount", "medianCount", + "potentialImpact2", "nonsplitProportion", "nonsplitCounts", + "nonsplitProportion_99quantile", "startID", "endID", "j_idx", "jidx", + "start_idx", "end_idx", "pval_gene", "FDR_subset_gene", "gene_id", + "pvalue"), package="FRASER") diff --git a/R/Fraser-pipeline.R b/R/Fraser-pipeline.R index c3b9b967..4258970c 100644 --- a/R/Fraser-pipeline.R +++ b/R/Fraser-pipeline.R @@ -31,7 +31,7 @@ #' splicing types. #' @param implementation The method that should be used to correct for #' confounders. -#' @param type The type of PSI (psi5, psi3 or theta for theta/splicing +#' @param type The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing #' efficiency) #' @param iterations The maximal number of iterations. When the autoencoder has #' not yet converged after these number of iterations, the fit stops anyway. @@ -61,16 +61,32 @@ #' # The functions run inside the FRASER function can also be directly #' # run themselves. 
#' # To directly run the fit function: -#' fds <- fit(fds, implementation="PCA", q=2, type="psi5") +#' fds <- fit(fds, implementation="PCA", q=2, type="jaccard") #' #' # To directly run the nomial and adjusted p value and z score #' # calculation, the following functions can be used: -#' fds <- calculatePvalues(fds, type="psi5") -#' head(pVals(fds, type="psi5")) -#' fds <- calculatePadjValues(fds, type="psi5", method="BY") -#' head(padjVals(fds, type="psi5")) -#' fds <- calculateZscore(fds, type="psi5") -#' head(zScores(fds, type="psi5")) +#' fds <- calculatePvalues(fds, type="jaccard") +#' head(pVals(fds, type="jaccard")) +#' fds <- calculatePadjValues(fds, type="jaccard", method="BY") +#' head(padjVals(fds, type="jaccard")) +#' fds <- calculateZscore(fds, type="jaccard") +#' head(zScores(fds, type="jaccard")) +#' +#' # example of restricting FDR correction to subsets of genes of interest +#' genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("exampleSubset"=genesOfInterest)) +#' padjVals(fds, type="jaccard", subsetName="exampleSubset") +#' padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset") +#' fds <- calculatePadjValues(fds, type="jaccard", +#' subsets=list("anotherExampleSubset"=c("TIMMDC1"))) +#' padjVals(fds, type="jaccard", subsetName="anotherExampleSubset") +#' +#' # only adding FDR corrected pvalues on a subset without calculating +#' # transcriptome-wide FDR again: +#' fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest, +#' subsetName="setOfInterest", type="jaccard") +#' padjVals(fds, type="jaccard", subsetName="setOfInterest") #' #' @seealso \code{\link[FRASER]{fit}} #' @@ -85,9 +101,11 @@ NULL #' the beta-binomial fit, the computation of Z scores and p values as well as #' the computation of delta-PSI values. 
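Ahead of the updated signature below, a hedged sketch of how the revised FRASER() wrapper is meant to be called after this change; the gene names and subset label are taken from the roxygen examples above and are purely illustrative:

    library(FRASER)
    fds <- createTestFraserDataSet()
    # fit only the new default metric (Intron Jaccard Index) and, in the same
    # call, restrict FDR correction to a per-sample list of genes of interest
    genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1"))
    fds <- FRASER(fds, q=c(jaccard=2), type="jaccard", iterations=2,
                  subsets=list("exampleSubset"=genesOfInterest))
    padjVals(fds, type="jaccard", subsetName="exampleSubset")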
#' @export -FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", - "AE-weighted", "AE", "BB"), - iterations=15, BPPARAM=bpparam(), correction, ...){ +FRASER <- function(fds, q, type=fitMetrics(fds), + implementation=c("PCA", "PCA-BB-Decoder", "AE-weighted", + "AE", "BB"), + iterations=15, BPPARAM=bpparam(), correction, + subsets=NULL, ...){ # Check input implementation <- match.arg(implementation) if (!missing("correction")){ @@ -103,7 +121,7 @@ FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", } # fit each splicing type separately - for(i in psiTypes){ + for(i in type){ # get type specific q if(missing(q)){ @@ -126,10 +144,10 @@ FRASER <- function(fds, q, implementation=c("PCA", "PCA-BB-Decoder", fds <- calculatePvalues(fds, type=i) message(date(), ": Adjust p values for: '", i, "'.") - fds <- calculatePadjValues(fds, type=i) + fds <- calculatePadjValues(fds, type=i, subsets=subsets) - message(date(), ": Compute Z scores for: '", i, "'.") - fds <- calculateZscore(fds, type=i) + # message(date(), ": Compute Z scores for: '", i, "'.") + # fds <- calculateZscore(fds, type=i) } # return final analysis diff --git a/R/FraserDataSet-class.R b/R/FraserDataSet-class.R index 630361d4..62eee8e8 100644 --- a/R/FraserDataSet-class.R +++ b/R/FraserDataSet-class.R @@ -218,13 +218,13 @@ showFraserDataSet <- function(object) { cat("\n") cat("-------------------- BAM parameters --------------------\n") - if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){ - cat(paste0("Default used with: ", - "bamMapqFilter=", bamMapqFilter(scanBamParam(object)) - )) - } else { + # if(identical(scanBamParam(FraserDataSet()), scanBamParam(object))){ + # cat(paste0("Default used with: ", + # "bamMapqFilter=", bamMapqFilter(scanBamParam(object)) + # )) + # } else { show(scanBamParam(object)) - } + # } cat("\n\n") } diff --git a/R/RcppExports.R b/R/RcppExports.R index 659f3b4c..42ec8fbc 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -41,6 +41,10 @@ truncNLL_rho <- function(rho, yi, ki, ni) { .Call('_FRASER_truncNLL_rho', PACKAGE = 'FRASER', rho, yi, ki, ni) } +truncNLL_rho_penalized <- function(logit_rho, yi, ki, ni, lambda) { + .Call('_FRASER_truncNLL_rho_penalized', PACKAGE = 'FRASER', logit_rho, yi, ki, ni, lambda) +} + fullNLL <- function(y, rho, k, n, D, lambda, byRows = FALSE) { .Call('_FRASER_fullNLL', PACKAGE = 'FRASER', y, rho, k, n, D, lambda, byRows) } diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R index 62f6c377..8796cd0b 100644 --- a/R/annotationOfRanges.R +++ b/R/annotationOfRanges.R @@ -17,6 +17,10 @@ #' \code{TxDb.Hsapiens.UCSC.hg19.knownGene}. #' @param orgDb An \code{orgDb} object or a data table to map the feature names. #' If this is NULL, then \code{org.Hs.eg.db} is used as the default. +#' @param filter A named list specifying the filters which should be applied to +#' subset to e.g. only protein-coding genes for annotation. +#' \code{names(filter)} needs to be column names in the given +#' orgDb object (default: no filtering). 
#' @param keytype The keytype or column name of gene IDs in the \code{TxDb} #' object (see #' \code{\link[AnnotationDbi:AnnotationDb-class]{keytypes}} @@ -32,13 +36,13 @@ #' # either using biomart with GRCh38 #' try({ #' fds <- annotateRanges(fds, GRCh=38) -#' rowRanges(fds, type="psi5")[,c("hgnc_symbol")] +#' rowRanges(fds, type="j")[,c("hgnc_symbol")] #' }) #' #' # either using biomart with GRCh37 #' try({ #' fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37) -#' rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")] +#' rowRanges(fds, type="j")[,c("hgnc_symbol_37")] #' }) #' #' # or with a provided TxDb object @@ -47,7 +51,7 @@ #' require(org.Hs.eg.db) #' orgDb <- org.Hs.eg.db #' fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) -#' rowRanges(fds, type="psi5")[,"hgnc_symbol"] +#' rowRanges(fds, type="j")[,"hgnc_symbol"] #' #' @rdname annotateRanges #' @export @@ -59,9 +63,6 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, if(length(fds) == 0) return(fds) # useEnsembl only understands GRCh=37 or GRCh=NULL (uses 38 then) - if(is.null(GRCh)){ - GRCh <- 38 - } if(GRCh == 38){ GRCh <- NULL } @@ -89,17 +90,16 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, annotation <- getFeatureAsGRange(ensembl, feature, featureName, biotype, useUSCS) - # annotate split reads - for(i in c("psi3", "theta")){ - gr <- rowRanges(fds, type=i) - if(any(strand(gr) == "*")){ - strand(annotation) <- "*" - } - annos <- getAnnotationFeature(data=gr, featureName, annotation) - mcols(fds, type=i)[[featureName]] <- annos[["feature"]] - mcols(fds, type=i)[[paste0("other_", featureName)]] <- - annos[["other_features"]] + # annotate splice sites first + gr <- rowRanges(fds, type="theta") + if(any(strand(gr) == "*")){ + strand(annotation) <- "*" } + annos <- getAnnotationFeature(data=gr, featureName, annotation) + mcols(fds, type="theta")[[featureName]] <- annos + + # annotate junctions with genes at donor and acceptor sites + fds <- annotateFeatureFromSpliceSite(fds, featureName) return(fds) } @@ -108,7 +108,7 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature, #' @export annotateRangesWithTxDb <- function(fds, feature="SYMBOL", featureName="hgnc_symbol", keytype="ENTREZID", - txdb=NULL, orgDb=NULL){ + txdb=NULL, orgDb=NULL, filter=list()){ gene_id <- NULL # check input @@ -132,41 +132,49 @@ annotateRangesWithTxDb <- function(fds, feature="SYMBOL", } } - for(i in c("psi3", "theta")){ - # get GRanges object with the split reads which should be annotated - gr <- rowRanges(fds, type=i) - - # get the annotation to compare to - anno <- genes(txdb) - if(is.data.table(orgDb)){ - tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, - by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[, - .(gene_id, feature=get(feature))] - setnames(tmp, "feature", feature) - } else { - tmp <- as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], - columns=feature, keytype=keytype)) - } - - # add the new feature to the annotation - tmp[, uniqueID := .GRP, by=keytype] - anno <- anno[tmp[,uniqueID]] - mcols(anno)[[featureName]] <- tmp[,get(feature)] + # get GRanges object with the splice sites which should be annotated + gr <- rowRanges(fds, type="theta") + + # get the annotation to compare to + anno <- genes(txdb) + if(is.data.table(orgDb)){ + tmp <- merge(x=as.data.table(anno)[,.(gene_id)], y=orgDb, + by.y=keytype, by.x="gene_id", all.x=TRUE, sort=FALSE)[, + c("gene_id", feature, names(filter)), with=FALSE] + } else { + tmp <- 
as.data.table(select(orgDb, keys=mcols(anno)[,"gene_id"], + columns=c(feature, names(filter)), keytype=keytype)) + } - # clean up of NA and "" ids - anno <- anno[!is.na(mcols(anno)[,featureName]),] - anno <- anno[mcols(anno)[,featureName] != "",] - if(any(strand(gr) == "*")){ - strand(anno) <- "*" + # filter genes as specified by user (e.g. only protein_coding) + tmp[, include:=TRUE] + if(!is.null(filter) & length(filter) > 0 & !is.null(names(filter))){ + for(n in names(filter)){ + stopifnot(n %in% colnames(tmp)) + tmp[!(get(n) %in% filter[[n]]), include:=FALSE] } + } + + # add the new feature to the annotation + tmp[, uniqueID := .GRP, by=keytype] + tmp <- tmp[include == TRUE,] + anno <- anno[tmp[,uniqueID]] + mcols(anno)[[featureName]] <- tmp[,get(feature)] - # retrieve the feature of interest for the split reads - annos <- getAnnotationFeature(data=gr, featureName, anno) - mcols(fds, type=i)[[featureName]] <- annos[["feature"]] - mcols(fds, type=i)[[paste0("other_", featureName)]] <- - annos[["other_features"]] + # clean up of NA and "" ids + anno <- anno[!is.na(mcols(anno)[,featureName]),] + anno <- anno[mcols(anno)[,featureName] != "",] + if(any(strand(gr) == "*")){ + strand(anno) <- "*" } + # retrieve the feature of interest for the splice sites + annos <- getAnnotationFeature(data=gr, featureName, anno) + mcols(fds, type="theta")[[featureName]] <- annos + + # transfer annoated features for splice sites to junctions + fds <- annotateFeatureFromSpliceSite(fds, featureName) + return(fds) } @@ -228,14 +236,12 @@ getAnnotationFeature <- function(data, feature, annotation){ } # extract only the feature and group them with a ";" - featureDT <- featureDT[, - list(first_feature=unique(feature)[1], - other_features=paste(unique(feature)[-1], collapse = ";")), - by="from" - ] + featureDT <- featureDT[,feature:=paste(unique(feature), collapse = ";"), + by="from"] + featureDT <- featureDT[!duplicated(featureDT),] + featureDT[feature == "NA", feature:=NA] - return(list(feature=featureDT[order(from),first_feature], - other_features=featureDT[order(from),other_features])) + return(featureDT[order(from),feature]) } @@ -314,4 +320,27 @@ findAnnotatedJunction <- function(fds, annotation, annotateNames=TRUE, fds } - +#' annotate junctions with genes at donor and acceptor sites +#' @noRd +annotateFeatureFromSpliceSite <- function(fds, featureName){ + ssdt <- data.table(spliceSiteID=mcols(fds, type="theta")$spliceSiteID, + genes=mcols(fds, type="theta")[[featureName]] + ) + junction_dt <- data.table(startID=mcols(fds, type="psi3")$startID, + endID=mcols(fds, type="psi3")$endID + ) + junction_dt <- merge(junction_dt, ssdt, all.x=TRUE, + by.x="startID", by.y="spliceSiteID", sort=FALSE) + setnames(junction_dt, "genes", "genes_donor") + junction_dt <- merge(junction_dt, ssdt, all.x=TRUE, + by.x="endID", by.y="spliceSiteID", sort=FALSE) + setnames(junction_dt, "genes", "genes_acceptor") + + junction_dt[,genes:=paste(uniqueIgnoreNA( + c(splitGenes(genes_donor), splitGenes(genes_acceptor))), + collapse=";"), + by="startID,endID"] + junction_dt[genes == "NA", genes:=NA] + mcols(fds, type="j")[[featureName]] <- junction_dt[,genes] + return(fds) +} diff --git a/R/autoencoder.R b/R/autoencoder.R index e0e066a3..d9d0e7a2 100644 --- a/R/autoencoder.R +++ b/R/autoencoder.R @@ -2,8 +2,9 @@ #' Main autoencoder fit function #' #' @noRd -fitAutoencoder <- function(fds, q, type="psi3", noiseAlpha=1, minDeltaPsi=0.1, - rhoRange=c(1e-5, 1-1e-5), lambda=0, convergence=1e-5, +fitAutoencoder <- function(fds, q, 
type=currentType(fds), noiseAlpha=1, + minDeltaPsi=0.1, + rhoRange=c(-30, 30), lambda=0, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), verbose=FALSE, nrDecoderBatches=5, weighted=FALSE, nSubset=15000, multiRho=FALSE, diff --git a/R/calculatePSIValue.R b/R/calculatePSIValue.R index e605ba67..f9f3f991 100644 --- a/R/calculatePSIValue.R +++ b/R/calculatePSIValue.R @@ -14,14 +14,14 @@ #' #' @inheritParams countRNA #' @param types A vector with the psi types which should be calculated. Default -#' is all of psi5, psi3 and theta. +#' is all of jaccard, psi5, psi3 and theta. #' @param overwriteCts FALSE or TRUE (the default) the total counts (aka N) will #' be recalculated based on the existing junction counts (aka K) #' @return FraserDataSet #' @export #' @examples #' fds <- createTestFraserDataSet() -#' fds <- calculatePSIValues(fds, types="psi5") +#' fds <- calculatePSIValues(fds, types="jaccard") #' #' ### usually one would run this function for all psi types by using: #' # fds <- calculatePSIValues(fds) @@ -36,6 +36,10 @@ calculatePSIValues <- function(fds, types=psiTypes, overwriteCts=FALSE, overwriteCts=overwriteCts, BPPARAM=BPPARAM) } + # calculate intron jaccard index + fds <- calculateIntronNonsplitSum(fds, overwriteCts=overwriteCts) + fds <- calculateJaccardIntronIndex(fds, overwriteCts=overwriteCts) + # calculate the delta psi value for(psiType in types){ assayName <- paste0("delta_", psiType) @@ -183,7 +187,7 @@ calculateSitePSIValue <- function(fds, overwriteCts, BPPARAM){ # check input stopifnot(is(fds, "FraserDataSet")) - message(date(), ": Calculate the PSI site values ...") + message(date(), ": Calculate the theta values ...") psiName <- "theta" psiROCName <- "rawOtherCounts_theta" @@ -319,3 +323,101 @@ getOtherCountsCacheFolder <- function(fds){ # return it return(cachedir) } + +#' +#' calculates the jaccard intron value for the given junctions +#' +#' @noRd +calculateJaccardIntronIndex <- function(fds, overwriteCts){ + stopifnot(is(fds, "FraserDataSet")) + + message(date(), ": Calculate the Jaccard Intron values ...") + + # check if we have computed N_psi3, N_psi5 and K_nonsplit already + if(!all(c(paste0("rawOtherCounts_psi", c(5, 3)), "rawCountsJnonsplit") %in% + assayNames(fds))){ + stop("Please calculate N_psi3, N_psi5 and K_nonsplit first before ", + "calling this function.") + } + + # calculate intron jaccard value + jaccard_denom <- N(fds, "psi3") + N(fds, "psi5") + + assay(fds, "rawCountsJnonsplit") - K(fds, type="j") + jaccardValues <- K(fds, type="j") / jaccard_denom + otherCounts_jaccard <- jaccard_denom - K(fds, type="j") + + # assign it to our object + assay(fds, type="j", "jaccard", withDimnames=FALSE) <- jaccardValues + + if(isTRUE(overwriteCts) || + !("rawOtherCounts_jaccard" %in% assayNames(fds))){ + assay(fds, type="j", "rawOtherCounts_jaccard", + withDimnames=FALSE) <- otherCounts_jaccard + } + + return(fds) +} + +#' Calculates the sum of nonsplit reads overlapping either the donor or +#' acceptor splice site and stores it as a new assay (one value for each +#' junction and sample). 
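As a back-of-the-envelope illustration of the metric computed in calculateJaccardIntronIndex() above, a toy calculation with invented counts (not part of the patch):

    # one junction in one sample; all numbers are made up for illustration
    k_j        <- 20   # split reads supporting this intron (K)
    n_psi5     <- 35   # split reads sharing its donor site (N_psi5)
    n_psi3     <- 30   # split reads sharing its acceptor site (N_psi3)
    k_nonsplit <- 5    # nonsplit reads covering either splice site
    # denominator = union of competing reads: N_psi5 + N_psi3 + K_nonsplit - K
    jaccard <- k_j / (n_psi5 + n_psi3 + k_nonsplit - k_j)
    jaccard  # 20 / 50 = 0.4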
+#' +#' @noRd +calculateIntronNonsplitSum <- function(fds, overwriteCts){ + stopifnot(is(fds, "FraserDataSet")) + + message(date(), ": Calculate the total nonsplit counts for each intron ", + "...") + + + # get splice site nonsplit counts + nsr_ss <- K(fds, "theta") + + # retrieve junction and splice site annotation + junction_dt <- as.data.table(rowRanges(fds, type="j"))[, + .(seqnames, start, end, + strand, startID, endID)] + junction_dt[, j_idx:=seq_len(.N)] + ss_map <- data.table(spliceSiteID=rowRanges(fds, type="ss")$spliceSiteID, + nsr_idx=seq_len(nrow(nonSplicedReads(fds)))) + + junction_dt <- merge(junction_dt, ss_map, + by.x="startID", by.y="spliceSiteID", + all.x=TRUE) + setnames(junction_dt, "nsr_idx", "start_idx") + junction_dt <- merge(junction_dt, ss_map, + by.x="endID", by.y="spliceSiteID", + all.x=TRUE) + setnames(junction_dt, "nsr_idx", "end_idx") + + # for each junction, find the two rows in K_theta corresponding to its + # donor and acceptor splice site + donor_sites <- junction_dt[!is.na(start_idx),] + acc_sites <- junction_dt[!is.na(end_idx),] + + # set nsr counts to 0 for junctions for which no mapping by spliceSiteID + # could be found + nsr_donor <- matrix(0, nrow=nrow(fds), ncol=ncol(fds)) + nsr_acc <- matrix(0, nrow=nrow(fds), ncol=ncol(fds)) + + nsr_donor[donor_sites[,j_idx],] <- + as.matrix(nsr_ss[donor_sites[,start_idx],]) + nsr_acc[acc_sites[,j_idx],] <- + as.matrix(nsr_ss[acc_sites[,end_idx],]) + + # sum them + nsr_j <- nsr_donor + nsr_acc + + if(nrow(nsr_j) != nrow(fds)){ + warning("Unequal number of junctions in fds and junctions with ", + "computed nonsplit count sum!") + } + + # assign it to our object + if(isTRUE(overwriteCts) || + !("rawCountsJnonsplit" %in% assayNames(fds))){ + assay(fds, type="j", "rawCountsJnonsplit", withDimnames=FALSE) <- nsr_j + } + + return(fds) +} diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R index f68c4bdc..3294096e 100644 --- a/R/countRNAseqData.R +++ b/R/countRNAseqData.R @@ -361,7 +361,7 @@ getNonSplitReadCountsForAllSamples <- function(fds, splitCountRanges, " splice junctions are found.") # extract donor and acceptor sites - spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds) + spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges) message(date(), ": In total ", length(spliceSiteCoords), " splice sites (acceptor/donor) will be counted ...") @@ -495,7 +495,7 @@ countSplitReads <- function(sampleID, fds, NcpuPerSample=1, genome=NULL, chromosomes <- extractChromosomes(bamfile) if(isFALSE(keepNonStandardChromosomes)){ - chr_gr <- GRanges(seqnames=paste0(chromosomes, ":1-2")) + chr_gr <- GRanges(seqnames=chromosomes, ranges=IRanges(1, 2)) chromosomes <- standardChromosomes(chr_gr) } @@ -569,6 +569,10 @@ countSplitReadsPerChromosome <- function(chromosome, bamFile, bamFile, param=param, strandMode=strandMode) } + # remove read pairs with NA seqnames + # (occurs if reads of a pair align to different chromosomes) + galignment <- galignment[!is.na(seqnames(galignment))] + # remove the strand information if unstranded data if(isFALSE(as.logical(strandMode))){ strand(galignment) <- "*" @@ -852,7 +856,7 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds, } # extract donor and acceptor sites - spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges, fds) + spliceSiteCoords <- extractSpliceSiteCoordinates(splitCountRanges) } @@ -895,7 +899,8 @@ countNonSplicedReads <- function(sampleID, splitCountRanges, fds, # extract the counts with Rsubread tmp_ssc <- 
checkSeqLevelStyle(spliceSiteCoords, fds, sampleID, TRUE) - anno <- GRanges2SAF(tmp_ssc, minAnchor=minAnchor) + # use minAnchor+1 here to allow for small variants in the anchor region + anno <- GRanges2SAF(tmp_ssc, minAnchor=(minAnchor+1)) rsubreadCounts <- featureCounts(files=bamFile, annot.ext=anno, minOverlap=minAnchor*2, allowMultiOverlap=TRUE, @@ -974,17 +979,12 @@ readJunctionMap <- function(junctionMap){ #' extracts the splice site coordinates from a junctions GRange object ( #' @noRd -extractSpliceSiteCoordinates <- function(junctions, fds){ +extractSpliceSiteCoordinates <- function(junctions){ - if(strandSpecific(fds) >= 1L){ - spliceSiteCoords <- unlist(GRangesList( - extractSpliceSiteCoordsPerStrand(junctions, "+"), - extractSpliceSiteCoordsPerStrand(junctions, "-") - )) - } else { - strand(junctions) <- "*" - spliceSiteCoords <- extractSpliceSiteCoordsPerStrand(junctions, "*") - } + spliceSiteCoords <- unlist(GRangesList( + lapply(unique(strand(junctions)), extractSpliceSiteCoordsPerStrand, + junctions=junctions) + )) return(unique(sort(spliceSiteCoords))) } @@ -1049,15 +1049,21 @@ annotateSpliceSite <- function(gr){ dt <- GRanges2SAF(gr) # extract donor/acceptor annotation - startSideDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"] - endSideDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"] + startSiteDT <- dt[,.(End=Start, type="start"),by="Chr,Start,Strand"] + endSiteDT <- dt[,.(Start=End, type="end" ),by="Chr,End,Strand"] + startSiteDT[,Start:=Start-1] + endSiteDT[,End:=End+1] # annotate and enumerate donor/acceptor - annotadedDT <- rbind(startSideDT, endSideDT) - annotadedDT[,id:=seq_len(nrow(annotadedDT))] + annotatedDT <- rbind(startSiteDT, endSiteDT) + annotatedDT[,id:=.GRP, by="Chr,Start,End,Strand"] + + # set back start / end positions for merging with junction ranges + annotatedDT[type == "start", Start:=End] + annotatedDT[type == "end", End:=Start] # convert back to granges - annogr <- makeGRangesFromDataFrame(annotadedDT, keep.extra.columns=TRUE) + annogr <- makeGRangesFromDataFrame(annotatedDT, keep.extra.columns=TRUE) ids <- lapply(c("start", "end"), function(type){ # reduce annogr to only the specific type to prevent overlap diff --git a/R/example_functions.R b/R/example_functions.R index 620b5ffc..c4b992be 100644 --- a/R/example_functions.R +++ b/R/example_functions.R @@ -61,7 +61,7 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){ if(all(file.exists(hdf5Files))){ if(isFALSE(rerun)){ fds <- loadFraserDataSet(workingDir, name="Data_Analysis") - if(all(paste0(c("zScores", "padjBetaBinomial", "predictedMeans"), + if(all(paste0(c("padjBetaBinomial", "predictedMeans"), "_", rep(psiTypes, 3)) %in% assayNames(fds))){ message(date(), ": Use existing cache data.") return(fds) @@ -80,12 +80,12 @@ createTestFraserDataSet <- function(workingDir="FRASER_output", rerun=FALSE){ fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=5, minDeltaPsi=0, quantileMinExpression=0) - # run FRASER pipeline - fds <- FRASER(fds, q=c(psi5=2, psi3=2, theta=2), iterations=2) - # annotate it suppressMessages({ fds <- annotateRangesWithTxDb(fds) }) + # run FRASER pipeline + fds <- FRASER(fds, q=c(jaccard=2, psi5=2, psi3=2, theta=2), iterations=2) + # save data for later fds <- saveFraserDataSet(fds) diff --git a/R/filterExpression.R b/R/filterExpression.R index dde3b29a..6c7eab8c 100644 --- a/R/filterExpression.R +++ b/R/filterExpression.R @@ -17,6 +17,9 @@ #' passed all filters is returned. 
If FALSE, no subsetting is done and the #' information of whether an intron passed the filters is only stored in the #' mcols. +#' @param filterOnJaccard If TRUE, the Intron Jaccard Metric is used to define +#' expressed introns during filtering. Otherwise, the psi5, psi3 and theta +#' metrics are used (default: TRUE). #' @param delayed If FALSE, count matrices will be loaded into memory, #' otherwise the function works on the delayedMatrix representations. The #' default value depends on the number of samples in the fds-object. @@ -27,8 +30,8 @@ #' @examples #' fds <- createTestFraserDataSet() #' fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) -#' mcols(fds, type="psi5")[, c( -#' "maxCount", "passedExpression", "maxDPsi3", "passedVariability")] +#' mcols(fds, type="jaccard")[, c( +#' "maxCount", "passedExpression", "maxDJaccard", "passedVariability")] #' #' plotFilterExpression(fds) #' plotFilterVariability(fds) @@ -42,21 +45,25 @@ NULL #' read support and introns that are not variable across samples. #' @export filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, - quantile=0.95, quantileMinExpression=10, minDeltaPsi=0.05, + quantile=0.75, quantileMinExpression=10, minDeltaPsi=0.0, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + # filter introns with low read support and corresponding splice sites object <- filterExpression(object, minExpressionInOneSample=minExpressionInOneSample, quantile=quantile, quantileMinExpression=quantileMinExpression, filter=filter, delayed=delayed, + filterOnJaccard=filterOnJaccard, BPPARAM=BPPARAM) # filter introns that are not variable across samples object <- filterVariability(object, minDeltaPsi=minDeltaPsi, filter=filter, - delayed=delayed, BPPARAM=BPPARAM) + delayed=delayed, filterOnJaccard=filterOnJaccard, + BPPARAM=BPPARAM) # return fds message(date(), ": Filtering done!") @@ -64,11 +71,307 @@ filterExpressionAndVariability <- function(object, minExpressionInOneSample=20, } -filterExpression.FRASER <- function(object, minExpressionInOneSample=20, - quantile=0.95, quantileMinExpression=10, filter=TRUE, +#' @noRd +filterExpression.FRASER2 <- function(object, minExpressionInOneSample=20, + quantile=0.75, quantileMinExpression=10, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + if(isTRUE(filterOnJaccard)){ + return(filterExpression_jaccard(object, + minExpressionInOneSample=minExpressionInOneSample, + quantile=quantile, + quantileMinExpression=quantileMinExpression, + filter=filter, delayed=delayed, + BPPARAM=BPPARAM)) + } else{ + return(filterExpression.FRASER(object, + minExpressionInOneSample=minExpressionInOneSample, + quantile=quantile, + quantileMinExpression=quantileMinExpression, + filter=filter, delayed=delayed, + BPPARAM=BPPARAM)) + } +} + +#' @describeIn filtering This function filters out introns and corresponding +#' splice sites that have low read support in all samples. +#' @export +setMethod("filterExpression", signature="FraserDataSet", + filterExpression.FRASER2) + +#' This function filters out introns and corresponding +#' splice sites which are expressed at very low levels across samples.
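The jaccard-based filter functions that follow keep an intron if at least one sample shows minExpressionInOneSample split reads, if the chosen quantile (default 0.75) of its total counts N reaches quantileMinExpression, and if its Intron Jaccard Index deviates from the per-intron mean by at least minDeltaPsi in some sample. A standalone sketch of these cutoffs on toy matrices, using matrixStats directly instead of the package accessors:

library(matrixStats)

# toy counts: 3 introns x 4 samples (hypothetical values)
K <- rbind(c(25, 0, 3, 1), c(40, 35, 50, 38), c(2, 1, 0, 3))    # split reads
N <- rbind(c(30, 12, 9, 15), c(60, 50, 80, 55), c(4, 3, 2, 6))  # total reads

# expression filter: maximal K per intron and 75% quantile of N per intron
maxCount         <- rowMaxs(K)
quantileValueN   <- rowQuantiles(N, probs=0.75, drop=FALSE)[, 1]
passedExpression <- maxCount >= 20 & quantileValueN >= 10   # third intron fails

# variability filter: largest deviation of the jaccard value from its row mean
jaccard           <- K / N
maxDJaccard       <- rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), na.rm=TRUE)
passedVariability <- maxDJaccard >= 0   # default minDeltaPsi of 0 keeps all introns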
+#' @noRd +filterExpression_jaccard <- function(object, minExpressionInOneSample=20, + quantile=0.75, quantileMinExpression=10, filter=TRUE, delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), BPPARAM=bpparam()){ + + stopifnot(is(object, "FraserDataSet")) + + message(date(), ": Filtering out introns with low read support ...") + + # extract counts + cts <- K(object, type="j") + ctsN <- N(object, type="jaccard") + + if(isFALSE(delayed)){ + cts <- as.matrix(cts) + ctsN <- as.matrix(ctsN) + } + + # cutoff functions + f1 <- function(cts, ...){ + rowMaxs(cts) } + f2 <- function(cts, ctsN, quantile, ...){ + rowQuantiles(ctsN, probs=quantile, drop=FALSE)[,1] } + + funs <- c(maxCount=f1, quantileValueN=f2) + + # run it in parallel + cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM, + cts=cts, ctsN=ctsN, quantile=quantile) + + # add annotation to object + for(n in names(cutoffs)){ + mcols(object, type="j")[n] <- cutoffs[[n]] + } + + mcols(object, type="j")[['passedExpression']] <- + cutoffs$maxCount >= minExpressionInOneSample & + cutoffs$quantileValueN >= quantileMinExpression + if("passedVariability" %in% colnames(mcols(object, type="j"))){ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] & + mcols(object, type="j")[['passedVariability']] + } else{ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] + } + + # filter if requested + if(isTRUE(filter)){ + object <- applyExpressionFilters_jaccard(object, + minExpressionInOneSample, + quantileMinExpression) + } + + validObject(object) + return(object) +} +#' @noRd +filterVariability.FRASER2 <- function(object, minDeltaPsi=0, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard=TRUE, BPPARAM=bpparam()){ + if(isTRUE(filterOnJaccard)){ + object <- filterVariability_jaccard(object, minDeltaPsi=minDeltaPsi, + filter=filter, delayed=delayed, BPPARAM=BPPARAM) + } else{ + object <- filterVariability.FRASER(object, minDeltaPsi=minDeltaPsi, + filter=filter, delayed=delayed, BPPARAM=BPPARAM) + } +} + +#' @describeIn filtering This function filters out introns and corresponding +#' splice sites that have low read support in all samples. +#' @export +setMethod("filterVariability", signature="FraserDataSet", + filterVariability.FRASER2) + + +#' This function filters out introns and corresponding +#' splice sites which do not show variablity across samples. +#' @noRd +filterVariability_jaccard <- function(object, minDeltaPsi=0, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ + + message(date(), ": Filtering out non-variable introns ...") + + # extract counts + cts <- K(object, type="j") + ctsN <- N(object, type="jaccard") + + if(isFALSE(delayed)){ + cts <- as.matrix(cts) + ctsN <- as.matrix(ctsN) + } + + # cutoff functions + f1 <- function(cts, ctsN, ...) { + jaccard <- cts/ctsN + rowMaxs(abs(jaccard - rowMeans2(jaccard, na.rm=TRUE)), + na.rm=TRUE) } + + funs <- c(maxDJaccard=f1) + + # run it in parallel + cutoffs <- bplapply(funs, function(f, ...) 
f(...), BPPARAM=BPPARAM, + cts=cts, ctsN=ctsN) + + # add annotation to object + for(n in names(cutoffs)){ + mcols(object, type="j")[n] <- cutoffs[[n]] + } + + # add annotation of theta on splice sites of introns to mcols + intron_dt <- as.data.table(rowRanges(object, type="j")) + + # check which introns pass the filter + mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE, + cutoffs$maxDJaccard, + 0) >= minDeltaPsi + if("passedExpression" %in% colnames(mcols(object, type="j"))){ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedExpression']] & + mcols(object, type="j")[['passedVariability']] + } else{ + mcols(object, type="j")[['passed']] <- + mcols(object, type="j")[['passedVariability']] + } + + # filter if requested + if(isTRUE(filter)){ + object <- applyVariabilityFilters_jaccard(object, minDeltaPsi) + } + + validObject(object) + return(object) +} + +#' Applies previously calculated filters for expression filters +#' @noRd +applyExpressionFilters_jaccard <- function(fds, minExpressionInOneSample, + quantileMinExpression){ + + maxCount <- mcols(fds, type="j")[['maxCount']] + quantileValueN <- mcols(fds, type="j")[['quantileValueN']] + + # report rare junctions that passed minExpression filter but not + # quantileFilter as SE obj + junctionsToReport <- maxCount >= minExpressionInOneSample & + !(quantileValueN >= quantileMinExpression) + outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) + + if(any(junctionsToReport)){ + # get SE object of junctions to report + rareJunctions <- asSE(fds[junctionsToReport, by="j"]) + for(aname in assayNames(rareJunctions)){ + if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3", "jaccard", + "rawOtherCounts_intron_jaccard"))){ + assay(rareJunctions, aname) <- NULL + } + } + rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, + dir=file.path(tempdir(), "tmp_rJ"), + replace=TRUE) + + # check if folder already exists from previous filtering + rareJctsDir <- file.path(outputDir, "rareJunctions") + if(dir.exists(rareJctsDir)){ + warning("Filtering has already been applied previously. Introns ", + "that were already filtered out but should be kept now ", + "cannot be restored.") + rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir) + toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample & + !(mcols(rJ_stored)$quantileValueN >= quantileMinExpression) + + rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions) + + for(aname in assayNames(rJ_tmp)){ + assay(rJ_tmp, aname) <- + rbind(as.matrix(assay(rareJunctions, aname)), + as.matrix(assay(rJ_stored[toReport,], aname)) ) + } + rareJunctions <- rJ_tmp + rm(rJ_tmp) + } + + rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, + dir=rareJctsDir, replace=TRUE) + } + + # apply filter + numFilt <- sum(mcols(fds, type="j")[['passedExpression']]) + message(paste0("Keeping ", numFilt, " junctions out of ", length(fds), + ". 
This is ", signif(numFilt/length(fds)*100, 3), + "% of the junctions")) + fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"] + + return(fds) +} + + +#' Applies previously calculated variablilty filters +#' @noRd +applyVariabilityFilters_jaccard <- function(fds, minDeltaPsi){ + + # + passedVariability <- mcols(fds, type="j")[['passedVariability']] + + # store information of non-variable junctions + filtered <- !passedVariability + + outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) + if(any(filtered)){ + # get SE object of junctions to report + nonVariableJunctions <- asSE(fds[filtered, by="j"]) + for(aname in assayNames(nonVariableJunctions)){ + if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3", "jaccard", + "rawOtherCounts_intron_jaccard"))){ + assay(nonVariableJunctions, aname) <- NULL + } + } + nonVariableJunctions <- saveHDF5SummarizedExperiment(replace=TRUE, + nonVariableJunctions, + dir=file.path(tempdir(), "tmp_nvJ")) + + # check if folder already exists from previous filtering + nonVarJctsDir <- file.path(outputDir, "nonVariableJunctions") + if(dir.exists(nonVarJctsDir)){ + warning("Filtering has already been applied previously. Introns ", + "that were already filtered out but should be kept now ", + "cannot be restored.") + nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) + toReport <- mcols(nV_stored)$maxDJaccard < minDeltaPsi + + nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,]) + for(aname in assayNames(nVJunctions)){ + assay(nVJunctions, aname) <- + rbind(as.matrix(assay(nonVariableJunctions, aname)), + as.matrix(assay(nV_stored[toReport,], aname)) ) + } + nonVariableJunctions <- nVJunctions + rm(nVJunctions) + } + + nonVariableJunctions <- saveHDF5SummarizedExperiment(dir=nonVarJctsDir, + x=nonVariableJunctions, replace=TRUE) + + } + + # apply filtering + numFilt <- sum(passedVariability) + message(paste0("Keeping ", numFilt, " junctions out of ", length(fds), + ". This is ", signif(numFilt/length(fds)*100, 3), + "% of the junctions")) + fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"] + return(fds) +} + + +#' Old FRASER1 filtering functions +#' @noRd +filterExpression.FRASER <- function(object, minExpressionInOneSample=20, + quantile=0.95, quantileMinExpression=10, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ + stopifnot(is(object, "FraserDataSet")) message(date(), ": Filtering out introns with low read support ...") @@ -83,30 +386,30 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20, ctsN5 <- as.matrix(ctsN5) ctsN3 <- as.matrix(ctsN3) } - + # cutoff functions f1 <- function(cts, ...){ - rowMaxs(cts) } + rowMaxs(cts) } f2 <- function(cts, ctsN5, quantile, ...){ - rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] } + rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] } f3 <- function(cts, ctsN3, quantile, ...) { - rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] } - + rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] } + funs <- c(maxCount=f1, quantileValue5=f2, quantileValue3=f3) - + # run it in parallel cutoffs <- bplapply(funs, function(f, ...) 
f(...), BPPARAM=BPPARAM, - cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile) - + cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile) + # add annotation to object for(n in names(cutoffs)){ mcols(object, type="j")[n] <- cutoffs[[n]] } mcols(object, type="j")[['passedExpression']] <- - cutoffs$maxCount >= minExpressionInOneSample & - (cutoffs$quantileValue5 >= quantileMinExpression & - cutoffs$quantileValue3 >= quantileMinExpression) + cutoffs$maxCount >= minExpressionInOneSample & + (cutoffs$quantileValue5 >= quantileMinExpression & + cutoffs$quantileValue3 >= quantileMinExpression) if("passedVariability" %in% colnames(mcols(object, type="j"))){ mcols(object, type="j")[['passed']] <- mcols(object, type="j")[['passedExpression']] & @@ -119,25 +422,18 @@ filterExpression.FRASER <- function(object, minExpressionInOneSample=20, # filter if requested if(isTRUE(filter)){ object <- applyExpressionFilters(object, minExpressionInOneSample, - quantileMinExpression) + quantileMinExpression) } - + validObject(object) return(object) } -#' @describeIn filtering This function filters out introns and corresponding -#' splice sites that have low read support in all samples. -#' @export -setMethod("filterExpression", signature="FraserDataSet", - filterExpression.FRASER) - -#' @describeIn filtering This function filters out introns and corresponding -#' splice sites which do not show variablity across samples. -#' @export -filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, - delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), - BPPARAM=bpparam()){ +#' Old FRASER1 filtering functions +#' @noRd +filterVariability.FRASER <- function(object, minDeltaPsi=0.05, filter=TRUE, + delayed=ifelse(ncol(object) <= 300, FALSE, TRUE), + BPPARAM=bpparam()){ message(date(), ": Filtering out non-variable introns ...") @@ -167,7 +463,7 @@ filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, theta <- ctsSE/ctsNSE dTheta <- rowMaxs(abs(theta - rowMeans2(theta, na.rm=TRUE)), na.rm=TRUE) } - + funs <- c(maxDPsi3=f1, maxDPsi5=f2, maxDTheta=f3) @@ -194,14 +490,14 @@ filterVariability <- function(object, minDeltaPsi=0.05, filter=TRUE, mcols(object, type="j")["maxDThetaAcceptor"] <- merge(intron_dt, ss_dt, by.x="endID", by.y="spliceSiteID", all.x=TRUE, sort=FALSE)[,maxDTheta] - + # check which introns pass the filter mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE, - cutoffs$maxDPsi3, - cutoffs$maxDPsi5, - mcols(object, type="j")$maxDThetaDonor, - mcols(object, type="j")$maxDThetaAcceptor, - 0) >= minDeltaPsi + cutoffs$maxDPsi3, + cutoffs$maxDPsi5, + mcols(object, type="j")$maxDThetaDonor, + mcols(object, type="j")$maxDThetaAcceptor, + 0) >= minDeltaPsi if("passedExpression" %in% colnames(mcols(object, type="j"))){ mcols(object, type="j")[['passed']] <- mcols(object, type="j")[['passedExpression']] & @@ -232,8 +528,8 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, # report rare junctions that passed minExpression filter but not # quantileFilter as SE obj junctionsToReport <- maxCount >= minExpressionInOneSample & - !(quantileValue5 >= quantileMinExpression & - quantileValue3 >= quantileMinExpression) + !(quantileValue5 >= quantileMinExpression & + quantileValue3 >= quantileMinExpression) outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds)) if(any(junctionsToReport)){ @@ -247,8 +543,8 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, } } rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions, - dir=file.path(tempdir(), 
"tmp_rJ"), - replace=TRUE) + dir=file.path(tempdir(), "tmp_rJ"), + replace=TRUE) # check if folder already exists from previous filtering rareJctsDir <- file.path(outputDir, "rareJunctions") @@ -284,7 +580,6 @@ applyExpressionFilters <- function(fds, minExpressionInOneSample, fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"] return(fds) - } #' Applies previously calculated variablilty filters @@ -308,8 +603,8 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ nonVariableJunctions <- asSE(fds[filtered, by="j"]) for(aname in assayNames(nonVariableJunctions)){ if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5", - "rawOtherCounts_psi3", "psi5", "psi3", - "delta_psi5", "delta_psi3"))){ + "rawOtherCounts_psi3", "psi5", "psi3", + "delta_psi5", "delta_psi3"))){ assay(nonVariableJunctions, aname) <- NULL } } @@ -325,15 +620,15 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ "cannot be restored.") nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir) toReport <- mcols(nV_stored)$maxDPsi5 < minDeltaPsi & - mcols(nV_stored)$maxDPsi3 < minDeltaPsi & - mcols(nV_stored)$maxDThetaDonor < minDeltaPsi & - mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi + mcols(nV_stored)$maxDPsi3 < minDeltaPsi & + mcols(nV_stored)$maxDThetaDonor < minDeltaPsi & + mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,]) for(aname in assayNames(nVJunctions)){ assay(nVJunctions, aname) <- - rbind(as.matrix(assay(nonVariableJunctions, aname)), - as.matrix(assay(nV_stored[toReport,], aname)) ) + rbind(as.matrix(assay(nonVariableJunctions, aname)), + as.matrix(assay(nV_stored[toReport,], aname)) ) } nonVariableJunctions <- nVJunctions rm(nVJunctions) @@ -351,5 +646,5 @@ applyVariabilityFilters <- function(fds, minDeltaPsi){ "% of the junctions")) fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"] return(fds) - + } diff --git a/R/find_encoding_dimensions.R b/R/find_encoding_dimensions.R index b01a4d1a..ad4873ec 100644 --- a/R/find_encoding_dimensions.R +++ b/R/find_encoding_dimensions.R @@ -27,6 +27,8 @@ predict_outliers <- function(fds, type, implementation, BPPARAM){ fds <- calculatePvalues(fds, type=type, implementation=implementation, BPPARAM=BPPARAM) + fds <- calculatePadjValues(fds, type=type, geneLevel=FALSE, + BPPARAM=BPPARAM) return(fds) } @@ -50,7 +52,7 @@ eval_prot <- function(fds, type){ }, FUN.VALUE=logical(length(unique(index))) ) ) + 0 if(any(is.na(scores))){ - warning(sum(is.na(scores)), " P-values where NAs.") + # warning(sum(is.na(scores)), " P-values where NAs.") scores[is.na(scores)] <- min(scores, na.rm=TRUE)-1 } pr <- pr.curve(scores, weights.class0=labels) @@ -111,17 +113,18 @@ findEncodingDim <- function(i, fds, type, params, implementation, #' @examples #' # generate data #' fds <- makeSimulatedFraserDataSet(m=15, j=20) +#' fds <- calculatePSIValues(fds) #' #' # run hyperparameter optimization -#' fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5)) +#' fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5)) #' #' # get estimated optimal dimension of the latent space -#' bestQ(fds, type="psi5") -#' hyperParams(fds, type="psi5") +#' bestQ(fds, type="jaccard") +#' hyperParams(fds, type="jaccard") #' #' @export -optimHyperParams <- function(fds, type, implementation="PCA", - q_param=seq(2, min(40, ncol(fds)), by=3), +optimHyperParams <- function(fds, type=psiTypes, implementation="PCA", + q_param=getEncDimRange(fds), noise_param=0, minDeltaPsi=0.1, iterations=5, setSubset=50000, 
injectFreq=1e-2, BPPARAM=bpparam(), internalThreads=1, plot=TRUE, @@ -227,3 +230,19 @@ optimHyperParams <- function(fds, type, implementation="PCA", return(fds) } +#' Get default range of latent space dimensions to test during hyper param opt +#' @noRd +getEncDimRange <- function(fds, mp=3){ + # Get range for latent space dimension + a <- 2 + b <- min(ncol(fds), nrow(fds)) / mp # N/mp + + maxSteps <- 12 + if(mp < 6){ + maxSteps <- 15 + } + + Nsteps <- min(maxSteps, b) + pars_q <- round(exp(seq(log(a),log(b),length.out = Nsteps))) %>% unique + return(pars_q) +} diff --git a/R/fitCorrectionMethods.R b/R/fitCorrectionMethods.R index f9b67599..1b726403 100644 --- a/R/fitCorrectionMethods.R +++ b/R/fitCorrectionMethods.R @@ -40,7 +40,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), - q, type="psi3", rhoRange=c(1e-8, 1-1e-8), + q, type=psiTypes, rhoRange=c(-30, 30), weighted=FALSE, noiseAlpha=1, convergence=1e-5, iterations=15, initialize=TRUE, control=list(), BPPARAM=bpparam(), nSubset=15000, @@ -51,6 +51,7 @@ fit.FraserDataSet <- function(object, implementation=c("PCA", "PCA-BB-Decoder", paste(names(list(...)), collapse=", ")) } method <- match.arg(implementation) + type <- match.arg(type) verbose <- verbose(object) > 0 diff --git a/R/getNSetterFuns.R b/R/getNSetterFuns.R index 94e8ca5e..bee6932a 100644 --- a/R/getNSetterFuns.R +++ b/R/getNSetterFuns.R @@ -10,11 +10,14 @@ #' @param level Indicates if the retrieved p values should be adjusted on the #' donor/acceptor site-level (default) or if unadjusted junction-level #' p values should be returned. +#' @param filters A named list giving the filters that were applied for masking +#' during p value correction. Used for storing and retrieving the +#' correct set of requested p values. #' @param value The new value to be assigned. #' @param all Logical value indicating whether \code{hyperParams(fds)} should #' return the results of all evaluated parameter combinations or only #' for the optimal parameter combination. -#' @param ... Internally used parameteres. +#' @param ... Internally used parameters. #' @return A (delayed) matrix or vector dependent on the type of data retrieved. 
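getEncDimRange() above replaces the former default grid q_param=seq(2, min(40, ncol(fds)), by=3) with a log-spaced grid from 2 up to min(ncol, nrow)/mp (mp defaults to 3), using at most 15 steps. A standalone sketch of the same computation for a hypothetical dataset with 60 samples:

# log-spaced candidate latent space dimensions, 60 samples, mp = 3
a <- 2
b <- 60 / 3                                 # upper bound N/mp = 20
nsteps <- min(15, b)                        # 15, since mp < 6
q_grid <- unique(round(exp(seq(log(a), log(b), length.out=nsteps))))
q_grid                                      # roughly 2 3 4 5 6 7 9 10 12 14 17 20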
#' #' @name getter_setter_functions @@ -29,7 +32,7 @@ #' dontWriteHDF5 <- TRUE #' #' # get/set the splice metric for which results should be retrieved -#' currentType(fds) <- "psi5" +#' currentType(fds) <- "jaccard" #' currentType(fds) #' #' # get fitted parameters @@ -40,6 +43,9 @@ #' # get statistics #' pVals(fds) #' padjVals(fds) +#' +#' # zscore not calculated by default +#' fds <- calculateZscore(fds, type="jaccard") #' zScores(fds) #' #' # set and get pseudocount @@ -47,9 +53,9 @@ #' pseudocount() #' #' # retrieve or set a mask to exclude certain junctions in the fitting step -#' featureExclusionMask(fds, type="theta") <- sample( -#' c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE) -#' featureExclusionMask(fds, type="theta") +#' featureExclusionMask(fds, type="jaccard") <- sample( +#' c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE) +#' featureExclusionMask(fds, type="jaccard") #' #' # controlling the verbosity level of the output of some algorithms #' verbose(fds) <- 2 @@ -198,7 +204,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){ } -`setAssayMatrix<-` <- function(fds, name, type, ..., value){ +`setAssayMatrix<-` <- function(fds, name, type=currentType(fds), ..., value){ if(!is.matrix(value)){ value <- matrix(value, ncol=ncol(fds), nrow=nrow(mcols(fds, type=type))) } @@ -217,7 +223,7 @@ predictY <- function(fds, type=currentType(fds), noiseAlpha=NULL){ fds } -getAssayMatrix <- function(fds, name, type, byGroup=FALSE){ +getAssayMatrix <- function(fds, name, type=currentType(fds), byGroup=FALSE){ if(missing(name)){ name <- type } else { @@ -248,8 +254,8 @@ zScores <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ #' @describeIn getter_setter_functions This returns the calculated p-values. #' @export pVals <- function(fds, type=currentType(fds), level="site", - dist="BetaBinomial", ...){ - level <- match.arg(level, choices=c("site", "junction")) + filters=list(), dist="BetaBinomial", ...){ + level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("pvalues", dist) if(level == "junction"){ @@ -260,33 +266,148 @@ pVals <- function(fds, type=currentType(fds), level="site", warning("Did not find junction-level p values. ", "Using site-level p values instead.") } + } else{ + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + # add information on used filters + if(is.null(names(filters))){ + filters <- list(rho=1) + } + for(n in sort(names(filters))){ + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } + } + if(level == "gene"){ + if(!paste(aname, type, sep="_") %in% names(metadata(fds))){ + stop("Did not find gene-level p values. ", + "Please compute them first.") + } + return(metadata(fds)[[paste(aname, type, sep="_")]]) + } } + getAssayMatrix(fds, aname, type=type, ...) } `pVals<-` <- function(fds, type=currentType(fds), level="site", + filters=list(), dist="BetaBinomial", ..., value){ - level <- match.arg(level, choices=c("site", "junction")) + level <- match.arg(level, choices=c("site", "junction", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) aname <- paste0("pvalues", dist) if(level == "junction"){ aname <- paste0(aname, "_junction") + setAssayMatrix(fds, name=aname, type=type, ...) 
<- value + return(fds) + } else if(level == "gene"){ + aname <- paste0(aname, "_gene") + } + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) + } + + if(level == "gene"){ + if(is.null(rownames(value))){ + stop("Missing rownames when storing gene-level pvalues.") + } + metadata(fds)[[paste(aname, type, sep="_")]] <- value + } else{ + setAssayMatrix(fds, name=aname, type=type, ...) <- value } - setAssayMatrix(fds, name=aname, type=type, ...) <- value return(fds) } #' @describeIn getter_setter_functions This returns the adjusted p-values. #' @export -padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), ...){ +padjVals <- function(fds, type=currentType(fds), dist=c("BetaBinomial"), + level="site", subsetName=NULL, filters=list(), ...){ + level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) - return(getAssayMatrix(fds, paste0("padj", dist), type=type, ...)) + aname <- paste0("padj", dist) + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + if(!is.null(subsetName)){ + aname <- paste0(aname, "_", subsetName) + } + # add information on used filters + if(is.null(names(filters))){ + filters <- list(rho=1) + } + for(n in sort(names(filters))){ + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } + } + if(level == "gene"){ + if(!paste(aname, type, sep="_") %in% names(metadata(fds))){ + stop("Did not find gene-level padj values. ", + "Please compute them first.") + } + return(metadata(fds)[[paste(aname, type, sep="_")]]) + } + return(getAssayMatrix(fds, aname, type=type, ...)) } -`padjVals<-` <- function(fds, type=currentType(fds), - dist="BetaBinomial", ..., value){ +`padjVals<-` <- function(fds, type=currentType(fds), level="site", + dist="BetaBinomial", subsetName=NULL, filters=list(), ..., + value){ + level <- match.arg(level, choices=c("site", "gene")) dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) - setAssayMatrix(fds, name=paste0("padj", dist), type=type, ...) <- value + aname <- paste0("padj", dist) + aname <- ifelse(level == "gene", paste0(aname, "_gene"), aname) + if(!is.null(subsetName)){ + aname <- paste0(aname, "_", subsetName) + } + # add information on used filters + for(n in sort(names(filters))){ + aname <- paste0(aname, "_", n, filters[[n]]) + } + if(level == "gene"){ + if(is.null(rownames(value))){ + stop("Missing rownames when storing gene-level pvalues.") + } + metadata(fds)[[paste(aname, type, sep="_")]] <- value + } else{ + setAssayMatrix(fds, name=aname, type=type, ...) <- value + } + return(fds) +} + +#' @describeIn getter_setter_functions This returns the names of FDR subsets +#' for which adjusted p values have been calculated. 
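With these changes, p values and adjusted p values are stored under names that encode the distribution, the level (site or gene), any FDR subset, and the masking filters, e.g. padjBetaBinomial_gene_mySubset_rho1_jaccard for gene-level values on a subset (the exact suffix depends on the stored filters). Retrieval uses the same arguments; a short usage sketch, assuming an analysed FraserDataSet fds and a previously computed subset named "mySubset" (hypothetical name):

# subsets for which FDR-corrected p values were computed
availableFDRsubsets(fds)

# transcriptome-wide, intron-level adjusted p values for the jaccard metric
padjVals(fds, type="jaccard", level="site")

# gene-level adjusted p values restricted to the stored subset
padjVals(fds, type="jaccard", level="gene", subsetName="mySubset")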
+#' @export +availableFDRsubsets <- function(fds){ + ans <- metadata(fds)[["FDRsubsets"]] + return(ans) +} + +`availableFDRsubsets<-` <- function(fds, value){ + metadata(fds)[["FDRsubsets"]] <- value + return(fds) +} + +`addToAvailableFDRsubsets<-` <- function(fds, value){ + if(!isScalarCharacter(value)){ + stop("The assigned value needs to be a scalar character.") + } + ans <- metadata(fds)[["FDRsubsets"]] + if(is.null(ans)){ + metadata(fds)[["FDRsubsets"]] <- value + } else{ + metadata(fds)[["FDRsubsets"]] <- unique(c(ans, value)) + } return(fds) } @@ -311,10 +432,14 @@ deltaPsiValue <- function(fds, type=currentType(fds)){ #' @describeIn getter_setter_functions Returns the psi type that is used -#' within several methods in the FRASER package. +#' within several methods in the FRASER package (defaults to jaccard). #' @export currentType <- function(fds){ - return(metadata(fds)[['currentType']]) + curType <- metadata(fds)[['currentType']] + if(is.null(curType)){ + curType <- "jaccard" + } + return(curType) } #' @describeIn getter_setter_functions Sets the psi type that is to be used @@ -326,6 +451,27 @@ currentType <- function(fds){ return(fds) } +#' @describeIn getter_setter_functions Returns the splice metrics that will be +#' fitted (defaults to jaccard, used within several methods in the +#' FRASER package). +#' @export +fitMetrics <- function(fds){ + metrics <- metadata(fds)[['fit_metrics']] + if(is.null(metrics)){ + metrics <- "jaccard" + } + return(metrics) +} + +#' @describeIn getter_setter_functions Sets the splice metrics that will be +#' fitted (used within several methods in the FRASER package). +#' @export +`fitMetrics<-` <- function(fds, value){ + stopifnot(is.character(whichPSIType(value))) + metadata(fds)[['fit_metrics']] <- whichPSIType(value) + return(fds) +} + #' @describeIn getter_setter_functions Sets and returns the pseudo count used #' within the FRASER fitting procedure. #' @export @@ -342,7 +488,7 @@ pseudocount <- function(value=NULL){ # set pseudo count if provided stopifnot(isScalarNumeric(value)) stopifnot(value >= 0) - value <- as.integer(value) + value <- as.numeric(value) options('FRASER.pseudoCount'=value) devNULL <- .setPseudoCount(value) stopifnot(value == devNULL) @@ -433,7 +579,7 @@ dontWriteHDF5 <- function(fds){ return(fds) } -getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){ +getTrueOutliers <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ ans <- getAssayMatrix(fds, "trueOutliers", type) if(isTRUE(byGroup)){ ans <- getAbsMaxByGroup(fds, type, ans, ...) @@ -443,7 +589,7 @@ getTrueOutliers <- function(fds, type, byGroup=FALSE, ...){ pmin(pmax(ans, -1), 1) } -getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ +getTrueDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ ans <- getAssayMatrix(fds, "trueDeltaPSI", type) if(isTRUE(byGroup)){ ans <- getAbsMaxByGroup(fds, type, ans, ...) 
@@ -451,7 +597,8 @@ getTrueDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ ans } -getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){ +getAbsMaxByGroup <- function(fds, type=currentType(fds), mat, index=NULL, + BPPARAM=bpparam()){ if(is.null(index)){ index <- getSiteIndex(fds, type) } @@ -470,13 +617,13 @@ getAbsMaxByGroup <- function(fds, type, mat, index=NULL, BPPARAM=bpparam()){ return(values) } -getByGroup <- function(fds, type, value){ +getByGroup <- function(fds, type=currentType(fds), value){ index <- getSiteIndex(fds, type) idx <- !duplicated(index) return(value[idx,]) } -getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ +getDeltaPsi <- function(fds, type=currentType(fds), byGroup=FALSE, ...){ mu <- predictedMeans(fds, type) dataPsi <- (K(fds, type) + pseudocount())/(N(fds, type) + 2*pseudocount()) deltaPSI <- dataPsi - mu @@ -488,13 +635,14 @@ getDeltaPsi <- function(fds, type, byGroup=FALSE, ...){ # calculate FRASER weights -calcFraserWeights <- function(fds, psiType){ +calcFraserWeights <- function(fds, psiType=currentType(fds)){ k <- as.matrix(K(fds, psiType)) n <- as.matrix(N(fds, psiType)) mu <- t(predictMu(fds, psiType)) rho <- rho(fds, psiType) - dataPsi <- plogis(t( - x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL))) + # dataPsi <- plogis(t( + # x(fds, type=psiType, all=TRUE, center=FALSE, noiseAlpha=NULL))) + dataPsi <- k / n # pearson residuals for BB # on counts of success k @@ -503,18 +651,23 @@ calcFraserWeights <- function(fds, psiType){ # (1+((n+2*pseudocount())-1)*rho)) # on probability of success mu r <- (dataPsi - mu) / sqrt( - mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) / - (n+2*pseudocount())) + # mu * (1-mu) * (1+((n+2*pseudocount())-1)*rho) / + # (n+2*pseudocount())) + mu * (1-mu) * (1+(n-1)*rho) / n + ) # weights according to Huber function (as in edgeR) c <- 1.345; # constant, as suggested in edgeR paper w <- ifelse(abs(r) > c, c/abs(r) , 1) + # set weights to 0 if NA (i.e. 
N=0) + w[is.na(w)] <- 0 + return(w) } # get FRASER weights -weights <- function(fds, type){ +weights <- function(fds, type=currentType(fds)){ return(getAssayMatrix(fds, "weights", type)) } @@ -524,7 +677,7 @@ weights <- function(fds, type){ return(fds) } -getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){ +getIndexFromResultTable <- function(fds, resultTable){ type <- as.character(resultTable$type) target <- makeGRangesFromDataFrame(resultTable) if(type == "theta"){ @@ -541,8 +694,9 @@ getIndexFromResultTable <- function(fds, resultTable, padj.method="holm"){ ov } -getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, - idx=NULL, aggregate=FALSE, pvalLevel="site", Ncpus=3, ...){ +getPlottingDT <- function(fds, axis=c("row", "col"), type=currentType(fds), + result=NULL, idx=NULL, aggregate=FALSE, pvalLevel="site", + Ncpus=3, geneColumn="hgnc_symbol", subsetName=NULL, ...){ if(!is.null(result)){ type <- as.character(result$type) idx <- getIndexFromResultTable(fds, result) @@ -564,8 +718,8 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, spliceID <- getSiteIndex(fds, type=type)[idxrow] feature_names <- rownames(mcols(fds, type=type))[idxrow] - if("hgnc_symbol" %in% colnames(mcols(fds, type=type))){ - feature_names <- mcols(fds, type=type)[idxrow,"hgnc_symbol"] + if(geneColumn %in% colnames(mcols(fds, type=type))){ + feature_names <- mcols(fds, type=type)[idxrow, geneColumn] } if(is.null(feature_names)){ feature_names <- as.character(seq_row(mcols(fds, type=type)))[idxrow] @@ -586,43 +740,78 @@ getPlottingDT <- function(fds, axis=c("row", "col"), type=NULL, result=NULL, n = c(n), pval = c(pVals(fds, type=type, level=pvalLevel)[idxrow, idxcol]), - padj = c(padjVals(fds, type=type)[idxrow, idxcol]), - zscore = c(zScores(fds, type=type)[idxrow, idxcol]), - obsPsi = c((k + pseudocount())/(n + 2*pseudocount())), - predPsi = c(predictedMeans(fds, type)[idxrow, idxcol])) + padj = c(padjVals(fds, type=type, + subsetName=subsetName)[idxrow, idxcol]), + obsPsi = c(k/n), + predPsi = c(predictedMeans(fds, type)[idxrow, idxcol]), + rho = rep(rho(fds, type=type)[idxrow], + ifelse(isTRUE(idxcol), ncol(fds), sum(idxcol))) + ) dt[, deltaPsi:=obsPsi - predPsi] - # add aberrant information to it - aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)], - dPsi=dt[,.(deltaPsi)], zscores=dt[,.(zscore)], n=dt[,.(n)]) - dt[,aberrant:=aberrantVec] - - # if requested return gene p values (correct for multiple testing again) + # if requested return gene p values if(isTRUE(aggregate)){ + # get gene-level aberrant status + aberrantGeneLevel <- aberrant(fds[, idxcol], ..., aggregate=TRUE, + subsetName=subsetName) + aberrantGeneLevel <- melt( + data.table(featureID=rownames(aberrantGeneLevel), + aberrantGeneLevel), + value.name="aberrant", id.vars="featureID", + variable.name="sampleID") + + # split featureID into several rows if more than one dt <- dt[!is.na(featureID)] - - # correct by gene and take the smallest p value - dt <- rbindlist(bplapply(unique(dt[,sampleID]), - BPPARAM=getBPParam(Ncpus, length(unique(dt[,sampleID]))), - FUN=function(x){ - dttmp <- dt[sampleID == x] - dttmp[, pval:=p.adjust(pval, method="holm"), - by="sampleID,featureID,type"] - dttmp <- dttmp[order(sampleID, featureID, type, -aberrant, - pval, -abs(deltaPsi))][ - !duplicated(data.table(sampleID, featureID, type))] - dttmp <- dttmp[, padj:=p.adjust(pval, method="BY"), - by="sampleID,type"] - dttmp - })) + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, 
splitGenes(featureID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(featureID=dt_tmp$V1, dt_idx=NULL)] + + # get gene-level pvalue matrices + pvalsGene <- lapply(c("pval", "padj"), function(x){ + if(x == "pval"){ + pvalsGene <- pVals(fds, type=type, + level="gene")[,idxcol,drop=FALSE] + } else { + pvalsGene <- padjVals(fds, type=type, subsetName=subsetName, + level="gene")[,idxcol,drop=FALSE] + } + pvalsGene <- data.table(featureID=rownames(pvalsGene), pvalsGene) + pvalsGene <- melt(pvalsGene, value.name=paste0("gene_", x), + id.vars="featureID", variable.name="sampleID") + return(pvalsGene) + }) + pvalsGene <- merge(pvalsGene[[1]], pvalsGene[[2]], + by=c("featureID", "sampleID")) + + # merge with gene level aberrant status + pvalsGene <- merge(pvalsGene, aberrantGeneLevel, + by=c("featureID", "sampleID")) + + # merge with gene pval matrix + dt <- merge(dt, pvalsGene, by=c("featureID", "sampleID")) + dt[,`:=`(pval=gene_pval, padj=gene_padj, + gene_pval=NULL, gene_padj=NULL)] + + # sort + dt <- dt[order(sampleID, featureID, type, -aberrant, + padj, -abs(deltaPsi))][ + !duplicated(data.table(sampleID, featureID, type))] + } else{ + # add aberrant information to it + aberrantVec <- aberrant(fds, ..., padjVals=dt[,.(padj)], + dPsi=dt[,.(deltaPsi)], n=dt[,.(n)], + rhoVals=dt[,.(rho)], aggregate=FALSE, + subsetName=subsetName) + dt[,aberrant:=aberrantVec] } - + # return object dt } -#' @describeIn getter_setter_functions Dependend on the level of verbosity +#' @describeIn getter_setter_functions Dependent on the level of verbosity #' the algorithm reports more or less to the user. 0 means being quiet #' and 10 means everything. #' @export diff --git a/R/helper-functions.R b/R/helper-functions.R index 15b91fde..03d8de3b 100644 --- a/R/helper-functions.R +++ b/R/helper-functions.R @@ -71,7 +71,7 @@ checkReadType <- function(fds, type){ # check if type is null or missing if(missing(type) | is.null(type)){ - if(verbose(fds) > 0){ + if(verbose(fds) > 3){ warning("Read type was not specified!", "We will assume the default: 'j'") } @@ -79,7 +79,7 @@ checkReadType <- function(fds, type){ } type <- unique(type) stopifnot(isScalarCharacter(type)) - correctTypes <- c(psi3="j", psi5="j", theta="ss") + correctTypes <- c(psi3="j", psi5="j", theta="ss", jaccard="j") # check if it is already the correct type if(type %in% correctTypes) return(type) @@ -109,7 +109,7 @@ checkReadType <- function(fds, type){ #' #' @noRd whichPSIType <- function(type){ - unlist(regmatches(type, gregexpr("psi(3|5)|theta", type, perl=TRUE))) + unlist(regmatches(type, gregexpr("psi(3|5)|theta|jaccard", type, perl=TRUE))) } #' @@ -122,7 +122,8 @@ whichReadType <- function(fds, name){ # check writing if(name == "ss" | endsWith(name, "theta")) return("ss") - if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3")) + if(name == "j" | endsWith(name, "psi5") | endsWith(name, "psi3") | + endsWith(name, "jaccard")) return("j") # check assay names @@ -355,18 +356,18 @@ assayExists <- function(fds, assayName){ return(aexists) } -getAssayAsVector <- function(fds, prefix, psiType, sampleID){ +getAssayAsVector <- function(fds, prefix, psiType=currentType(fds), sampleID){ as.vector(assay(fds, paste0(prefix, psiType))[,sampleID]) } -variableJunctions <- function(fds, type, minDeltaPsi=0.1){ +variableJunctions <- function(fds, type=currentType(fds), minDeltaPsi=0.1){ psi <- K(fds, type=type)/N(fds, type=type) j2keep <- rowMaxs(abs(psi - rowMeans(psi, na.rm=TRUE)), na.rm=TRUE) j2keep >= minDeltaPsi } -subsetKMostVariableJunctions <- 
function(fds, type, n){ +subsetKMostVariableJunctions <- function(fds, type=currentType(fds), n){ curX <- x(fds, type=type, all=TRUE, center=FALSE, noiseAlpha=NULL) xsd <- colSds(curX) nMostVarJuncs <- which(xsd >= sort(xsd, TRUE)[min(length(xsd), n*2)]) @@ -375,7 +376,8 @@ subsetKMostVariableJunctions <- function(fds, type, n){ ans } -getSubsetVector <- function(fds, type, minDeltaPsi=0.1, nSubset=15000){ +getSubsetVector <- function(fds, type=currentType(fds), minDeltaPsi=0.1, + nSubset=15000){ # get any variable intron ans <- variableJunctions(fds, type, minDeltaPsi=minDeltaPsi) @@ -549,6 +551,130 @@ getStrandString <- function(fds){ return(strand) } + +#' +#' Check if adjusted pvalues have been computed for a given set of filters. +#' @noRd +checkPadjAvailableForFilters <- function(fds, type=currentType(fds), + filters=list(), dist="BetaBinomial", aggregate=FALSE, + subsetName=NULL){ + dist <- match.arg(dist, choices=c("BetaBinomial", "Binomial", "Normal")) + aname <- paste0("padj", dist) + aname <- ifelse(isTRUE(aggregate), paste0(aname, "_gene"), aname) + aname <- ifelse(!is.null(subsetName), paste0(aname, "_", subsetName), aname) + + # add information on used filters + for(n in sort(names(filters))){ + aname_new <- paste0(aname, "_", n, filters[[n]]) + if(n == "rho" && filters[[n]] == 1){ + if(any(grepl(aname_new, assayNames(fds))) || + any(grepl(aname_new, names(metadata(fds))))){ + aname <- aname_new + } + }else{ + aname <- aname_new + } + } + aname <- paste(aname, type, sep="_") + if(isTRUE(aggregate)){ + pvalsAvailable <- aname %in% names(metadata(fds)) + } else{ + pvalsAvailable <- aname %in% assayNames(fds) + } + return(pvalsAvailable) +} + +#' +#' Find most aberrant junction for each aberrant gene +#' +#' @param gr GRanges object with information about junctions. +#' @param aberrantGenes Significant genes for which the corresponding junction +#' should be extracted. +#' @param pvals Vector of pvalues (for one sample). +#' @param dpsi Vector of delta psi values (for one sample). +#' @param aberrantJunctions Vector indicating which junctions are considered +#' aberrant. +#' @param geneColumn Name of the column in mcols(fds) that has gene annotation. 
+#' @noRd +findJunctionsForAberrantGenes <- function(gr, aberrantGenes, pvals, dpsi, + aberrantJunctions, geneColumn="hgnc_symbol"){ + dt <- data.table(idx=mcols(gr)$idx, + geneID=mcols(gr)[,geneColumn], + pval=pvals, + dpsi=abs(dpsi), + aberrant=aberrantJunctions) + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] + dt <- dt[geneID %in% aberrantGenes,] + dt <- dt[!is.na(aberrant) & aberrant == TRUE,] + + # sort per gene by lowest pvalue / highest deltaPsi and return index + dt <- dt[order(geneID, -aberrant, pval, -dpsi)] + dt <- dt[!duplicated(dt, by="geneID"),] + + # remove gene-level significant result if no junction in that gene passed + # the filters + dt <- dt[!is.na(pval),] + + junctionsToReport <- dt[,idx] + names(junctionsToReport) <- dt[,geneID] + junctionsToReport <- sort(junctionsToReport) + return(junctionsToReport) +} + +collapseResTablePerGene <- function(res, geneColumn="hgncSymbol"){ + if(length(res) == 0){ + return(res) + } + if(!is.data.table(res)){ + res <- as.data.table(res) + } + + if(any(!c("pValue", "pValueGene", geneColumn) %in% colnames(res))){ + stop("For collapsing per gene, the results table needs to contain ", + "the columns pValue, pValueGene and ", geneColumn, ".") + } + + res <- res[order(res$pValueGene, res$pValue)] + naIdx <- is.na(res[, get(geneColumn)]) + ansNoNA <- res[!is.na(res[, get(geneColumn)]),] + + # get final result table + dupIdx <- duplicated(data.table(as.vector(ansNoNA[, get(geneColumn)]), + as.vector(ansNoNA$sampleID))) + ans <- res[!naIdx,][!dupIdx,] + return(ans) +} + +#' ignores NA in unique if other values than NA are present +#' @noRd +uniqueIgnoreNA <- function(x){ + uniq <- unique(x) + if(length(uniq) > 1) uniq <- uniq[!is.na(uniq)] + return(uniq) +} + +#' split string of gene names into vector +#' @noRd +splitGenes <- function(x, sep=";"){ + return(unlist(strsplit(as.character(x), sep, fixed=TRUE))) +} + +#' cap string of gene names to show max 3 gene names +#' @noRd +limitGeneNamesList <- function(gene_names, maxLength=3){ + gene_names <- as.character(gene_names) + numFeatures <- unlist(lapply(gene_names, function(x) length(splitGenes(x)))) + gene_names[numFeatures > maxLength] <- + unlist(lapply(gene_names[numFeatures > maxLength], function(x){ + paste(c(splitGenes(x)[seq_len(maxLength)], "..."), + collapse=";") + } )) + return(gene_names) +} + checkForAndCreateDir <- function(object, dir){ verbose <- 0 if(is(object, "FraserDataSet")){ @@ -569,4 +695,3 @@ checkForAndCreateDir <- function(object, dir){ } return(TRUE) } - diff --git a/R/makeSimulatedDataset.R b/R/makeSimulatedDataset.R index 6e29adf8..7a1fb6f9 100644 --- a/R/makeSimulatedDataset.R +++ b/R/makeSimulatedDataset.R @@ -435,9 +435,10 @@ makeSimulatedFraserDataSet_Multinomial <- function(m=200, j=1000, q=10, #' @examples #' # A generic dataset #' fds <- makeSimulatedFraserDataSet() +#' fds <- calculatePSIValues(fds) #' fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) #' @export -injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), +injectOutliers <- function(fds, type=psiTypes, freq=1E-3, minDpsi=0.2, minCoverage=2, deltaDistr="uniformDistr", verbose=FALSE, method=c('samplePSI', 'meanPSI', 'simulatedPSI'), @@ -472,6 +473,9 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), setAssayMatrix(fds, type="psi3", "originalOtherCounts", withDimnames=FALSE) <- counts(fds, type="psi3", side="other") + setAssayMatrix(fds, type="jaccard", 
"originalOtherCounts", + withDimnames=FALSE) <- + counts(fds, type="jaccard", side="other") } # get infos from the fds @@ -500,7 +504,9 @@ injectOutliers <- function(fds, type=c("psi5", "psi3", "theta"), dt[,groupSize:=.N, by=groupID] # Get groups where outlier can be injected - available_groups <- dt[groupSize > ifelse(type == "theta", 0, 1), unique(groupID)] + available_groups <- dt[groupSize > ifelse(type == "theta" | + type == "jaccard", 0, 1), + unique(groupID)] # e.g. for psi3/5: no donor/acceptor # groups with at least 2 junctions (e.g in simulationBB) diff --git a/R/mergeExternalData.R b/R/mergeExternalData.R index c57d4af3..bbbd673a 100644 --- a/R/mergeExternalData.R +++ b/R/mergeExternalData.R @@ -69,6 +69,7 @@ mergeExternalData <- function(fds, countFiles, sampleIDs, annotation=NULL){ extCts <- lapply(reqNames, function(id){ gr <- makeGRangesFromDataFrame(fread(countFiles[id]), keep.extra.columns=TRUE) + seqlevelsStyle(gr) <- seqlevelsStyle(fds) #force fds style onto external counts if(any(!sampleIDs %in% colnames(mcols(gr)))){ stop("Can not find provided sampleID in count data. Missing IDs: ", paste(collapse=", ", @@ -99,8 +100,8 @@ mergeExternalData <- function(fds, countFiles, sampleIDs, annotation=NULL){ # merge psi5/psi3 data # extractExtData <- function(fds, countFun, type, ov, extData, extName){ - ctsOri <- as.matrix(countFun(fds, type=type)[from(ov),]) - ctsExt <- as.matrix(mcols(extData[[extName]])[to(ov),]) + ctsOri <- as.matrix(countFun(fds, type=type)[from(ov),,drop=FALSE]) + ctsExt <- as.matrix(mcols(extData[[extName]])[to(ov),,drop=FALSE]) ans <- cbind(ctsOri, ctsExt) mode(ans) <- "integer" ans diff --git a/R/plotMethods.R b/R/plotMethods.R index 6d824392..5abd6d3a 100644 --- a/R/plotMethods.R +++ b/R/plotMethods.R @@ -15,6 +15,10 @@ #' \item plotFilterExpression() #' \item plotFilterVariability() #' \item plotEncDimSearch() +#' \item plotBamCoverage() +#' \item plotBamCoverageFromResultTable() +#' \item plotManhattan() +#' \item plotSpliceMetricRank() #' } #' #' For a detailed description of each plot function please see the details. @@ -25,10 +29,10 @@ #' @param type The psi type: either psi5, psi3 or theta (for SE). #' @param sampleID A sample ID which should be plotted. Can also be a vector. #' Integers are treated as indices. -#' @param idx,site A junction site ID or gene ID or one of both, which +#' @param idx A junction site ID or gene ID or one of both, which #' should be plotted. Can also be a vector. Integers are treated #' as indices. -#' @param padjCutoff,zScoreCutoff,deltaPsiCutoff Significance, Z-score or delta +#' @param padjCutoff,deltaPsiCutoff Significance or delta #' psi cutoff to mark outliers #' @param global Flag to plot a global Q-Q plot, default FALSE #' @param normalized If TRUE, the normalized psi values are used, the default, @@ -42,6 +46,10 @@ #' samples. Labelling can be turned off by setting #' \code{label=NULL}. The user can also provide a custom #' list of gene symbols or sampleIDs. +#' @param subsetName The name of a subset of genes of interest for which FDR +#' corrected pvalues were previously computed. Those FDR values +#' on the subset will then be used to determine aberrant status. +#' Default is NULL (using transcriptome-wide FDR corrected pvalues). #' @param BPPARAM BiocParallel parameter to use. #' @param Ncpus Number of cores to use. #' @param plotType The type of plot that should be shown as character string. 
@@ -52,11 +60,61 @@ #' sample-sample correlation heatmap or \code{"junctionSample"} #' for a junction-sample correlation heatmap. #' @param onlyVariableIntrons Logical value indicating whether to show only -#' introns that also pass the variability filter. Defaults to -#' FALSE. +#' introns that also pass the variability filter. Defaults to +#' FALSE. #' @param onlyExpressedIntrons Logical value indicating whether to show only -#' introns that also pass the expression filter. Defaults to -#' FALSE. +#' introns that also pass the expression filter. Defaults to +#' FALSE. +#' @param gr A GRanges object indicating the genomic range that should be shown +#' in \code{plotBamCoverage}. +#' @param control_samples The sampleIDs of the samples used as controls in +#' \code{plotBamCoverage}. +#' @param min_junction_count The minimal junction count across samples required +#' for a junction to appear in the splicegraph and coverage tracks +#' of \code{plotBamCoverage}. +#' @param txdb A TxDb object giving the gene/transcript annotation to use. +#' @param orgDb An OrgDb object giving the mapping of gene ids and symbols. +#' @param show_full_gene Should the full genomic range of the gene be shown in +#' \code{plotBamCoverageFromResultTable} (default: FALSE)? +#' If FALSE, only a certain region (see parameters left_extension +#' and right_extension) around the outlier junction is shown. +#' @param left_extension Indicates how far the plotted range around the outlier +#' junction should be extended to the left in +#' \code{plotBamCoverageFromResultTable}. +#' @param right_extension Indicates how far the plotted range around the +#' outlier junction should be extended to the right in +#' \code{plotBamCoverageFromResultTable}. +#' @param res_gene_col The column name in the given results table that +#' contains the gene annotation. +#' @param res_geneid_type The type of gene annotation in the results table in +#' \code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This +#' information is needed for mapping between the results table and +#' the provided annotation in the txdb object. +#' @param txdb_geneid_type The type of gene_id present in \code{genes(txdb)} +#' (e.g. ENTREZID). This information is needed for +#' mapping between the results table and the provided annotation +#' in the txdb object. +#' @param highlight_range A \code{GenomicRanges} or \code{GenomicRangesList} +#' object of ranges to be highlighted in the splicegraph of +#' \code{plotBamCoverage}. +#' @param highlight_range_color The color of highlighted ranges in +#' the splicegraph of \code{plotBamCoverage}. +#' @param toscale In \code{plotBamCoverage}, indicates which part of the +#' plotted region should be drawn to scale. Possible values are +#' 'exon' (exonic regions are drawn to scale), +#' 'gene' (both exonic and intronic regions are drawn to scale) or +#' 'none' (exonic and intronic regions have constant length) +#' (see SGSeq package). +#' @param splicegraph_labels Indicates the format of exon/splice junction +#' labels in the splicegraph of \code{plotBamCoverage}. +#' Possible values are 'genomic_range' (gives the start position +#' of the first exon and the end position of the last exon that +#' are shown), 'id' (format E1,... J1,...), 'name' (format +#' type:chromosome:start-end:strand for each feature), +#' 'none' for no labels (see SGSeq package). +#' @param splicegraph_position The position of the splicegraph relative to the +#' coverage tracks in \code{plotBamCoverage}. Possible values +#' are 'top' (default) and 'bottom'.
#' #### Graphical parameters #' @param main Title for the plot, if missing a default title will be used. @@ -89,6 +147,31 @@ #' @param bins Set the number of bins to be used in the histogram. #' @param legend.position Set legend position (x and y coordinate), defaults to #' the top right corner. +#' @param color_annotated The color for exons and junctions present in +#' the given annotation (in the splicegraph of +#' \code{plotBamCoverage}). +#' @param color_novel The color for novel exons and junctions not present in +#' the given annotation (in the splicegraph of +#' \code{plotBamCoverage}). +#' @param color_sample_interest The color in \code{plotBamCoverage} for the +#' sample of interest. +#' @param color_control_samples The color in \code{plotBamCoverage} for the +#' samples used as controls. +#' @param curvature_splicegraph The curvature of the junction arcs in the +#' splicegraph in \code{plotBamCoverage}. Decrease this value +#' for flatter arcs and increase it for steeper arcs. +#' @param curvature_coverage The curvature of the junction arcs in the +#' coverage tracks of \code{plotBamCoverage}. Decrease this +#' value for flatter arcs and increase it for steeper arcs. +#' @param mar The margin of the plot area for \code{plotBamCoverage} +#' (b,l,t,r). +#' @param cex For controlling the size of text and numbers in +#' \code{plotBamCoverage}. +#' @param chr Vector of chromosome names to show in \code{plotManhattan}. The +#' default is to show all chromosomes. +#' @param value Indicates which assay is shown in the manhattan plot. Defaults +#' to 'pvalue'. Other options are 'deltaPsi' and 'zScore'. +#' @param chrColor Interchanging colors by chromosome for \code{plotManhattan}. #' #### Additional ... parameter #' @param ... Additional parameters passed to plot() or plot_ly() if not stated @@ -116,6 +199,9 @@ #' #' \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi #' against the predicted psi for a given site. +#' +#' \code{plotSpliceMetricRank}: This function plots for a given intron the +#' observed values of the selected splice metrix against the sample rank. #' #' \code{plotCountCorHeatmap}: The correlation heatmap of the count data either #' of the full data set (i.e. sample-sample correlations) or of the top x most @@ -137,6 +223,17 @@ #' It plots the encoding dimension against the achieved loss (area under the #' precision-recall curve). From this plot the optimum should be choosen for #' the \code{q} in fitting process. +#' +#' \code{plotManhattan}: A Manhattan plot showing the junction pvalues by +#' genomic position. Useful to identify if outliers cluster by genomic position. +#' +#' \code{plotBamCoverage}: A sashimi plot showing the read coverage from +#' the underlying bam files for a given genomic range and sampleIDs. +#' +#' \code{plotBamCoverageFromResultTable}: A sashimi plot showing the read +#' coverage from the underlying bam files for a row in the results table. Can +#' either show the full range of the gene with the outlier junction or only a +#' certain region around the outlier. #' #' @return If base R graphics are used nothing is returned else the plotly or #' the gplot object is returned. 
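To illustrate the Manhattan view and its value/chr arguments described above (a sketch assuming a fitted object with computed p values; the sample ID and chromosome name are placeholders and the ggbio package must be installed):

if(require(ggbio)){
    # junction-level p values across the genome for one sample
    plotManhattan(fds, sampleID="sample10", type="jaccard", value="pvalue")
    # delta of the splice metric, restricted to a single chromosome
    plotManhattan(fds, sampleID="sample10", type="jaccard",
        value="deltaPsi", chr="chr1")
}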
@@ -145,50 +242,118 @@ #' @rdname plotFunctions #' @aliases plotFunctions plotAberrantPerSample plotVolcano plotQQ #' plotExpression plotCountCorHeatmap plotFilterExpression -#' plotExpectedVsObservedPsi plotEncDimSearch +#' plotExpectedVsObservedPsi plotEncDimSearch plotManhattan +#' plotBamCoverage plotBamCoverageFromResultTable #' @examples +#' \dontshow{set.seed(42)} #' # create full FRASER object #' fds <- makeSimulatedFraserDataSet(m=40, j=200) #' fds <- calculatePSIValues(fds) #' fds <- filterExpressionAndVariability(fds, filter=FALSE) -#' # this step should be done for all splicing metrics and more dimensions -#' fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25)) -#' fds <- FRASER(fds) +#' # this step should be done for more dimensions in practice +#' fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25)) +#' +#' # assign gene names to show functionality on test dataset +#' # use fds <- annotateRanges(fds) on real data +#' mcols(fds, type="j")$hgnc_symbol <- +#' paste0("gene", sample(1:25, nrow(fds), replace=TRUE)) +#' +#' # fit and calculate pvalues +#' genesOfInterest <- rep(list(paste0("gene", sample(1:25, 10))), 4) +#' names(genesOfInterest) <- c("sample1", "sample6", "sample15", "sample23") +#' fds <- FRASER(fds, subsets=list("testSet"=genesOfInterest)) #' #' # QC plotting #' plotFilterExpression(fds) #' plotFilterVariability(fds) -#' plotCountCorHeatmap(fds, "theta") -#' plotCountCorHeatmap(fds, "theta", normalized=TRUE) -#' plotEncDimSearch(fds, type="psi5") +#' plotCountCorHeatmap(fds, "jaccard") +#' plotCountCorHeatmap(fds, "jaccard", normalized=TRUE) +#' plotEncDimSearch(fds, type="jaccard") #' #' # extract results #' plotAberrantPerSample(fds, aggregate=FALSE) -#' plotVolcano(fds, "sample1", "psi5") +#' plotAberrantPerSample(fds, aggregate=TRUE, subsetName="testSet") +#' plotVolcano(fds, "sample2", "jaccard", label="aberrant") +#' plotVolcano(fds, "sample1", "jaccard", aggregate=TRUE, subsetName="testSet") #' #' # dive into gene/sample level results -#' res <- results(fds) +#' res <- as.data.table(results(fds)) #' res #' plotExpression(fds, result=res[1]) #' plotQQ(fds, result=res[1]) -#' plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) +#' plotExpectedVsObservedPsi(fds, res=res[1]) +#' plotSpliceMetricRank(fds, res=res[1]) +#' +#' # other ways to call these plotting functions +#' plotExpression(fds, idx=10, sampleID="sample1", type="jaccard") +#' plotExpression(fds, result=res[FDR_set == "testSet",][1], +#' subsetName="testSet") +#' plotQQ(fds, idx=10, sampleID="sample1", type="jaccard") +#' plotQQ(fds, result=res[FDR_set == "testSet",][1], subsetName="testSet") +#' plotExpectedVsObservedPsi(fds, idx=10, sampleID="sample1", type="jaccard") +#' plotExpectedVsObservedPsi(fds, result=res[FDR_set == "testSet",][1], +#' subsetName="testSet") +#' plotSpliceMetricRank(fds, idx=10, sampleID="sample1", type="jaccard") +#' plotSpliceMetricRank(fds, result=res[FDR_set == "testSet",][1], +#' subsetName="testSet") +#' +#' # create manhattan plot of pvalues by genomic position +#' if(require(ggbio)){ +#' plotManhattan(fds, type="jaccard", sampleID="sample10") +#' } +#' +#' # plot splice graph and coverage from bam files in a given region +#' if(require(SGSeq)){ +#' fds <- createTestFraserSettings() +#' gr <- GRanges(seqnames="chr19", +#' IRanges(start=7587496, end=7598895), +#' strand="+") +#' plotBamCoverage(fds, gr=gr, sampleID="sample3", +#' control_samples="sample2", min_junction_count=5, +#' curvature_splicegraph=1, curvature_coverage=1, +#' mar=c(1, 7, 
0.1, 3)) +#' +#' # plot coverage from bam file for a row in the result table +#' fds <- createTestFraserDataSet() +#' require(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' require(org.Hs.eg.db) +#' orgDb <- org.Hs.eg.db +#' +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +#' res_dt <- as.data.table(res) +#' res_dt <- res_dt[sampleID == "sample2",] +#' +#' # plot full range of gene containing outlier junction +#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, +#' txdb=txdb, orgDb=orgDb, control_samples="sample3") +#' +#' # plot only certain range around outlier junction +#' plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, +#' control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, +#' curvature_coverage=0.5, right_extension=5000, left_extension=5000, +#' splicegraph_labels="id") +#' } #' -#' NULL plotVolcano.FRASER <- function(object, sampleID, - type=c("psi3", "psi5", "theta"), basePlot=TRUE, + type=fitMetrics(object), basePlot=TRUE, aggregate=FALSE, main=NULL, label=NULL, - deltaPsiCutoff=0.3, padjCutoff=0.1, ...){ + deltaPsiCutoff=0.1, padjCutoff=0.1, subsetName=NULL, ...){ type <- match.arg(type) dt <- getPlottingDT(object, axis="col", type=type, idx=sampleID, aggregate=aggregate, deltaPsiCutoff=deltaPsiCutoff, - padjCutoff=padjCutoff, ...) + padjCutoff=padjCutoff, subsetName=subsetName, ...) + dt[is.na(padj), aberrant:=NA] + dt[aberrant == TRUE, aberrantLabel:="aberrant"] + dt[aberrant == FALSE, aberrantLabel:="not aberrant"] + dt[is.na(aberrant), aberrantLabel:="not in tested group"] - g <- ggplot(dt, aes(x=deltaPsi, y=-log10(pval), color=aberrant, + g <- ggplot(dt, aes(x=deltaPsi, y=-log10(pval), color=aberrantLabel, label=featureID, text=paste0( "SampleID: ", sampleID, "
", "featureID: ", featureID, "
", @@ -197,32 +362,17 @@ plotVolcano.FRASER <- function(object, sampleID, "p value: ", signif(pval, 5), "
", "delta Psi: ", round(deltaPsi, 2), "
", "Type: ", type))) + - geom_point(aes(alpha=ifelse(aberrant == TRUE, 1, 0.8))) + + geom_point(aes(alpha=ifelse(aberrantLabel == "aberrant", 1, 0.8))) + xlab(as.expression( bquote(paste(Delta, .(ggplotLabelPsi(type)[[1]]) )) )) + ylab(expression(paste(-log[10], "(P value)"))) + + scale_color_manual(values=c("not aberrant"="black", + "aberrant"="firebrick", + "not in tested group"="lightsteelblue")) + theme_bw() + - theme(legend.position="none") + - scale_color_manual(values=c("gray40", "firebrick")) - - if(!is.na(deltaPsiCutoff)){ - g <- g + - geom_vline(xintercept=c(-deltaPsiCutoff, deltaPsiCutoff), - color="firebrick", linetype=2) - } - - if(!is.na(padjCutoff)){ - if(dt[,any(padj <= padjCutoff)]){ - padj_line <- min(dt[padj <= padjCutoff, -log10(pval)]) - } - if(!"padj_line" %in% ls() || padj_line > 10 || is.na(padj_line)){ - padj_line <- 6 - } - g <- g + - geom_hline(yintercept=padj_line, color="firebrick", linetype=4) - } - + theme(legend.position="bottom") + + guides(alpha="none", color=guide_legend(title="")) if(isFALSE(basePlot)){ g <- g + xlab(paste("delta", @@ -258,9 +408,17 @@ plotVolcano.FRASER <- function(object, sampleID, main <- as.expression(bquote(paste( bold("Volcano plot: "), .(sampleID), ", ", .(ggplotLabelPsi(type)[[1]])))) - } + } } - g <- g + ggtitle(main) + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across ", + ifelse(isTRUE(aggregate), "genes", "introns"), + " in the ", subsetName, + " group (N=", dt[!is.na(padj), .N], ")") + } + g <- g + ggtitle(main, subtitle=subtitle) plotBasePlot(g, basePlot) } @@ -278,9 +436,9 @@ setMethod("plotVolcano", signature="FraserDataSet", plotVolcano.FRASER) plotAberrantPerSample.FRASER <- function(object, main, - type=c("psi3", "psi5", "theta"), - padjCutoff=0.1, zScoreCutoff=NA, deltaPsiCutoff=0.3, - aggregate=TRUE, BPPARAM=bpparam(), ...){ + type=fitMetrics(object), + padjCutoff=0.1, deltaPsiCutoff=0.1, + aggregate=TRUE, subsetName=NULL, BPPARAM=bpparam(), ...){ type <- match.arg(type, several.ok=TRUE) @@ -290,14 +448,27 @@ plotAberrantPerSample.FRASER <- function(object, main, main <- paste(main, "by gene") } } + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } # extract outliers outliers <- bplapply(type, aberrant, object=object, by="sample", - padjCutoff=padjCutoff, zScoreCutoff=zScoreCutoff, + padjCutoff=padjCutoff, deltaPsiCutoff=deltaPsiCutoff, aggregate=aggregate, ..., - BPPARAM=BPPARAM) + subsetName=subsetName, BPPARAM=BPPARAM) dt2p <- rbindlist(lapply(seq_along(outliers), function(idx){ vals <- outliers[[idx]] + padj_assay <- padjVals(object, type=type[idx], subsetName=subsetName, + level=ifelse(isTRUE(aggregate), "gene", "site")) + testedSamples <- names( + which(colSums(is.na(padj_assay)) != nrow(padj_assay))) + if(length(testedSamples) == 0){ + stop("No non-NA padj values found for the tested group.") + } + vals <- vals[testedSamples] data.table(type=type[idx], value=sort(vals), median=median(vals), rank=seq_along(vals)) })) @@ -307,8 +478,8 @@ plotAberrantPerSample.FRASER <- function(object, main, geom_line() + geom_hline(aes(yintercept=median, color=type, lty="Median")) + theme_bw() + - theme_cowplot() + - ggtitle(main) + + theme_cowplot() + background_grid(major="xy", minor="xy") + + ggtitle(main, subtitle=subtitle) + xlab("Sample rank") + ylab("Number of outliers") + scale_color_brewer(palette="Dark2", name=element_blank(), @@ -316,7 +487,8 @@ plotAberrantPerSample.FRASER <- function(object, main, 
scale_linetype_manual(name="", values=2, labels="Median") if(!all(dt2p[,value] == 0)){ - g <- g + scale_y_log10() + g <- g + scale_y_log10(limits=c(1, max(unlist(outliers)))) + + annotation_logticks(sides="l") } g @@ -341,17 +513,21 @@ setMethod("plotAberrantPerSample", signature="FraserDataSet", #' #' @rdname plotFunctions #' @export -plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), - site=NULL, result=NULL, colGroup=NULL, - basePlot=TRUE, main=NULL, label="aberrant", ...){ +plotExpression <- function(fds, type=fitMetrics(fds), + idx=NULL, result=NULL, colGroup=NULL, + basePlot=TRUE, main=NULL, label="aberrant", + subsetName=NULL, ...){ if(!is.null(result)){ type <- as.character(result$type) - site <- getIndexFromResultTable(fds, result) + idx <- getIndexFromResultTable(fds, result) } else { type <- match.arg(type) } - dt <- getPlottingDT(fds, axis="row", type=type, idx=site, ...) + dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, + subsetName=subsetName, ...) + dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] + if(!is.null(colGroup)){ if(all(colGroup %in% samples(fds))){ colGroup <- samples(fds) %in% colGroup @@ -360,18 +536,28 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), } dt[,aberrant:=factor(aberrant, levels=c("TRUE", "FALSE"))] + gr <- granges(rowRanges(fds,type=type)[idx,]) + genomic_pos_label <- paste0(seqnames(gr), ":", start(gr), "-", end(gr), + ":", strand(gr)) if(is.null(main)){ if(isTRUE(basePlot)){ main <- as.expression(bquote(bold(paste( .(ggplotLabelPsi(type)[[1]]), " expression plot: ", - bolditalic(.(as.character(dt[,unique(featureID)]))), - " (site ", .(as.character(dt[,unique(idx)])), ")")))) + .(genomic_pos_label), + " (", bolditalic(.(as.character(dt[,unique(featureID)]))), + "; row index: ", .(as.character(dt[,unique(idx)])), ")")))) } else{ main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]], - " expression plot: ", dt[,unique(featureID)], - " (site ", dt[,unique(idx)], ")") + " expression plot: ", genomic_pos_label, + " (", dt[,unique(featureID)], + "; row index: ", dt[,unique(idx)], ")") } } + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } g <- ggplot(dt, aes(x=n + 2, y=k + 1, color=aberrant, label=sampleID, text=paste0( @@ -389,7 +575,7 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), theme(legend.position="none", title=) + xlab("Total junction coverage + 2 (N)") + ylab("Junction count + 1 (K)") + - ggtitle(main) + + ggtitle(main, subtitle=subtitle) + annotation_logticks(sides='bl') if(isTRUE(basePlot) && !is.null(label)){ @@ -422,6 +608,116 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), plotBasePlot(g, basePlot) } +#' +#' Junction splice metric plot +#' +#' Plots the observed values of the splice metric across samples for a +#' junction of interest. +#' +#' @rdname plotFunctions +#' @export +plotSpliceMetricRank <- function(fds, type=fitMetrics(fds), + idx=NULL, result=NULL, colGroup=NULL, + basePlot=TRUE, main=NULL, label="aberrant", + subsetName=NULL, ...){ + if(!is.null(result)){ + type <- as.character(result$type) + idx <- getIndexFromResultTable(fds, result) + } else { + type <- match.arg(type) + } + + dt <- getPlottingDT(fds, axis="row", type=type, idx=idx, + subsetName=subsetName, ...) 
+ dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] + + # rank on observed value of splice metric of interest + dt[, rank := rank(obsPsi, ties.method="random", na.last=FALSE)] + + if(!is.null(colGroup)){ + if(all(colGroup %in% samples(fds))){ + colGroup <- samples(fds) %in% colGroup + } + dt[colGroup,aberrant:=TRUE] + } + dt[,aberrant:=factor(aberrant, levels=c("TRUE", "FALSE"))] + + gr <- granges(rowRanges(fds,type=type)[idx,]) + genomic_pos_label <- paste0(seqnames(gr), ":", start(gr), "-", end(gr), + ":", strand(gr)) + + if(is.null(main)){ + if(isTRUE(basePlot)){ + main <- as.expression(bquote(bold(paste( + .(genomic_pos_label), + " (", bolditalic(.(as.character(dt[,unique(featureID)]))), + "; row index: ", .(as.character(dt[,unique(idx)])), ")")))) + } else{ + main <- paste0(genomic_pos_label, + " (", dt[,unique(featureID)], + "; row index: ", dt[,unique(idx)], ")") + } + } + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } + + if(isTRUE(basePlot)){ + ylab <- bquote("Observed " ~ .(ggplotLabelPsi(type)[[1]])) + } else{ + ylab <- paste("Observed", ggplotLabelPsi(type, asCharacter=TRUE)[[1]]) + } + + g <- ggplot(dt, aes(x=rank, y=obsPsi, color=aberrant, label=sampleID, + text=paste0( + "Sample: ", sampleID, "
", + "Counts (K): ", k, "
", + "Total counts (N): ", n, "
", + "p value: ", signif(pval, 5), "
", + "padjust: ", signif(padj, 5), "
", + "Observed Psi: ", round(obsPsi, 2), "
", + "Predicted mu: ", round(predPsi, 2), "
"))) + + geom_point(alpha=ifelse(as.character(dt$aberrant) == "TRUE", 1, 0.7)) + + theme_bw() + + theme(legend.position="none", title=) + + xlab("Sample rank") + + ylab(ylab) + + ggtitle(main, subtitle=subtitle) + + ylim(0,1) + + + if(isTRUE(basePlot) && !is.null(label)){ + if(isScalarCharacter(label) && label == "aberrant"){ + if(nrow(dt[aberrant == TRUE,]) > 0){ + g <- g + geom_text_repel(data=dt[aberrant == TRUE,], + aes(col=aberrant), + fontface='bold', hjust=-.2, vjust=-.2) + } + } + else{ + if(nrow(dt[sampleID %in% label]) > 0){ + g <- g + geom_text_repel(data=subset(dt, sampleID %in% label), + aes(col=aberrant), + fontface='bold', hjust=-.2, vjust=-.2) + } + if(any(!(label %in% dt[,sampleID]))){ + warning("Did not find sample(s) ", + paste(label[!(label %in% dt[,sampleID])], + collapse=", "), " to label.") + } + } + } + + if(is.null(colGroup)){ + g <- g + scale_colour_manual( + values=c("FALSE"="gray70", "TRUE"="firebrick")) + } + + plotBasePlot(g, basePlot) +} + #' #' Expected over Overserved plot @@ -431,30 +727,38 @@ plotExpression <- function(fds, type=c("psi5", "psi3", "theta"), #' #' @rdname plotFunctions #' @export -plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"), +plotExpectedVsObservedPsi <- function(fds, type=fitMetrics(fds), idx=NULL, result=NULL, colGroup=NULL, main=NULL, - basePlot=TRUE, label="aberrant", ...){ + basePlot=TRUE, label="aberrant", subsetName=NULL, ...){ type <- match.arg(type) # get plotting data dt <- getPlottingDT(fds, axis="row", type=type, result=result, - idx=idx, ...) + idx=idx, subsetName=subsetName, ...) type <- as.character(unique(dt$type)) idx <- unique(dt$idx) + dt[,featureID:=limitGeneNamesList(featureID, maxLength=3)] + gr <- granges(rowRanges(fds,type=type)[idx,]) + genomic_pos_label <- paste0(seqnames(gr), ":", start(gr), "-", end(gr), + ":", strand(gr)) if(is.null(main)){ if(isTRUE(basePlot)){ main <- as.expression(bquote(bold(paste( - .(ggplotLabelPsi(type)[[1]]), - " observed expression vs prediction plot: ", - bolditalic(.(as.character(dt[,unique(featureID)]))), - " (site ", .(as.character(idx)), ")")))) + .(genomic_pos_label), + " (", bolditalic(.(as.character(dt[,unique(featureID)]))), + "; row index: ", .(as.character(dt[,unique(idx)])), ")")))) } else{ - main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]], - " observed expression vs prediction plot: ", - dt[,unique(featureID)], " (site ", idx, ")") + main <- paste0(genomic_pos_label, + " (", dt[,unique(featureID)], + "; row index: ", dt[,unique(idx)], ")") } } + if(is.null(subsetName)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } if(!is.null(colGroup)){ if(is.logical(colGroup)){ @@ -484,11 +788,12 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"), geom_point(alpha=ifelse(dt$aberrant, 1, 0.5), color=c("gray70", "firebrick")[dt$aberrant + 1]) + geom_abline(intercept = 0, slope=1) + + xlim(c(0,1)) + ylim(c(0,1)) + theme_bw() + theme(legend.position="none") + xlab(xlab) + ylab(ylab) + - ggtitle(main) + ggtitle(main, subtitle=subtitle) if(isTRUE(basePlot) && !is.null(label)){ if(isScalarCharacter(label) && label == "aberrant"){ @@ -524,7 +829,7 @@ plotExpectedVsObservedPsi <- function(fds, type=c("psi5", "psi3", "theta"), plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, aggregate=FALSE, global=FALSE, main=NULL, conf.alpha=0.05, samplingPrecision=3, basePlot=TRUE, label="aberrant", - Ncpus=min(3, getDTthreads()), ...){ + Ncpus=min(3, getDTthreads()), 
subsetName=NULL, ...){ # check parameters if(is.null(aggregate)){ @@ -537,10 +842,11 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, if(isTRUE(global)){ if(is.null(type)){ - type <- psiTypes + type <- fitMetrics(object) } dt <- rbindlist(bplapply(type, getPlottingDT, fds=object, axis="col", - idx=TRUE, aggregate=aggregate, Ncpus=Ncpus, ...)) + idx=TRUE, aggregate=aggregate, subsetName=subsetName, + Ncpus=Ncpus, ...)) # remove duplicated entries donor/acceptor sites if not aggregated # by a feature if(isFALSE(aggregate)){ @@ -552,7 +858,8 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, dots[["pvalLevel"]] <- "junction" } dots <- append(list(fds=object, axis="row", type=type, idx=idx, - result=result, aggregate=aggregate), + result=result, aggregate=aggregate, + subsetName=subsetName), dots) dt <- do.call(getPlottingDT, args=dots) } @@ -562,19 +869,30 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, } else { type <- as.character(dt[,unique(type)]) featureID <- as.character(dt[,unique(featureID)]) + featureID <- limitGeneNamesList(featureID, maxLength=3) + idx <- dt[, unique(idx)] + gr <- granges(rowRanges(object,type=type)[idx,]) + genomic_pos_label <- paste0(seqnames(gr), ":", + start(gr), "-", end(gr), ":", strand(gr)) if(isTRUE(basePlot)){ main <- as.expression(bquote(bold(paste( - .(ggplotLabelPsi(type)[[1]]), - " Q-Q plot: ", bolditalic(.(featureID)), - " (site ", .(as.character(dt[,unique(idx)])), ")")))) + .(ggplotLabelPsi(type)[[1]]), " Q-Q plot: ", + .(genomic_pos_label), + " (", bolditalic(.(featureID)), + "; row index: ", .(as.character(idx)), ")")))) } else{ - main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]], - " Q-Q plot: ", featureID, - " (site ", dt[,unique(idx)], ")") + main <- paste0(ggplotLabelPsi(type, asCharacter=TRUE)[[1]], + " Q-Q plot: ", genomic_pos_label, + " (", featureID, + "; row index: ", idx, ")") } } } - + if(is.null(subsetName) || isTRUE(global)){ + subtitle <- NULL + } else{ + subtitle <- paste0("FDR across genes in the ", subsetName, " group") + } # points dt2p <- dt[order(type, pval)] @@ -606,7 +924,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, "
SampleID: ", sampleID, "
K: ", k, "
N: ", n))) + geom_point() + theme_bw() + - ggtitle(main) + ggtitle(main, subtitle=subtitle) if(isTRUE(basePlot)){ g <- g + @@ -692,7 +1010,7 @@ plotQQ.FRASER <- function(object, type=NULL, idx=NULL, result=NULL, setMethod("plotQQ", signature="FraserDataSet", plotQQ.FRASER) -plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"), +plotEncDimSearch.FRASER <- function(object, type=psiTypes, plotType=c("auc", "loss")){ type <- match.arg(type) plotType <- match.arg(plotType) @@ -717,6 +1035,8 @@ plotEncDimSearch.FRASER <- function(object, type=c("psi3", "psi5", "theta"), geom_smooth(method="loess", formula=y~x) + geom_vline(data=data[isOptimalQ == TRUE,], mapping=aes(xintercept=q, col=nsubset, linetype=noise)) + + geom_text(data=data[isOptimalQ == TRUE,], + aes(y=0.0, q+1, label=q)) + ggtitle(as.expression(bquote(bold(paste( "Q estimation for ", .(ggplotLabelPsi(type)[[1]])))))) + xlab("Estimated q") + @@ -795,8 +1115,8 @@ plotFilterExpression <- function(fds, bins=200, legend.position=c(0.8, 0.8), scale_y_log10() + scale_fill_manual(values=colors, name="Passed", labels=c("True", "False")) + - xlab("Mean Junction Expression") + - ylab("Count") + + xlab("Mean Intron Expression") + + ylab("Introns") + ggtitle("Expression filtering") + theme_bw() + theme(legend.position=legend.position) @@ -813,17 +1133,19 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8), onlyExpressedIntrons=FALSE){ # check that expression filter has been calculated - if(!("passedVariability" %in% colnames(mcols(fds, type="j")))){ + mcolNames <- colnames(mcols(fds, type="j")) + if(!("passedVariability" %in% mcolNames)){ stop("Please calculate the expression filter values first with the ", "filterVariability function.") } # get plotting data + delta_cols <- mcolNames[grepl("maxD", mcolNames)] + if(any(delta_cols == "maxDJaccard")){ + delta_cols <- "maxDJaccard" + } dt <- data.table( - value=pmax(mcols(fds, type="j")[['maxDPsi3']], - mcols(fds, type="j")[['maxDPsi5']], - mcols(fds, type="j")[['maxDThetaDonor']], - mcols(fds, type="j")[['maxDThetaAcceptor']]), + value=apply(mcols(fds, type="j")[, delta_cols, drop=FALSE], 1, max), passed=mcols(fds, type="j")[['passedVariability']]) if(isTRUE(onlyExpressedIntrons)){ dt[,passed:=mcols(fds, type="j")[['passed']]] @@ -835,24 +1157,32 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8), if(dir.exists(nonVarDir)){ nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarDir) + mcolNames_stored <- colnames(mcols(nV_stored)) + delta_cols_stored <- mcolNames_stored[grepl("maxD", mcolNames_stored)] + if(any(delta_cols_stored == "maxDJaccard")){ + delta_cols_stored <- "maxDJaccard" + } nonVar_dt <- data.table( - value=pmax(mcols(nV_stored)[['maxDPsi3']], - mcols(nV_stored)[['maxDPsi5']], - mcols(nV_stored)[['maxDThetaDonor']], - mcols(nV_stored)[['maxDThetaAcceptor']]), + value=apply(mcols(nV_stored)[,delta_cols_stored, drop=FALSE], 1, max), passed=FALSE) dt <- rbind(dt, nonVar_dt) } dt[,passed:=factor(passed, levels=c(TRUE, FALSE))] colors <- brewer.pal(3, "Dark2")[seq_len(2)] + if(any(delta_cols == "maxDJaccard")){ + xlab <- bquote("Maximal" ~ Delta ~ "to mean J per intron") + } else{ + xlab <- bquote("Maximal" ~ Delta ~ "to mean" ~Psi[5] ~ "," ~ Psi[3] ~ + "or" ~ theta ~ "per intron") + } ggplot(dt, aes(value, fill=passed)) + geom_histogram(bins=bins) + scale_y_log10() + scale_fill_manual(values=colors, name="Passed", labels=c("True", "False")) + - xlab(bquote("Maximal Junction" ~ Delta*Psi[5] ~ "or" ~ Delta*Psi[3])) 
+ - ylab("Count") + + xlab(xlab) + + ylab("Introns") + ggtitle("Variability filtering") + theme_bw() + theme(legend.position=legend.position) @@ -860,8 +1190,8 @@ plotFilterVariability <- function(fds, bins=200, legend.position=c(0.8, 0.8), plotCountCorHeatmap.FRASER <- function(object, - type=c("psi5", "psi3", "theta"), logit=FALSE, topN=50000, - topJ=5000, minMedian=1, minCount=10, + type=psiTypes, logit=FALSE, + topN=50000, topJ=5000, minMedian=1, minCount=10, main=NULL, normalized=FALSE, show_rownames=FALSE, show_colnames=FALSE, minDeltaPsi=0.1, annotation_col=NA, annotation_row=NA, border_color=NA, nClust=5, @@ -903,7 +1233,7 @@ plotCountCorHeatmap.FRASER <- function(object, object <- object[,ids2plot] } - xmat <- (skmat + 1)/(snmat + 2) + xmat <- (skmat + 1*pseudocount())/(snmat + 2*pseudocount()) if(isTRUE(logit)){ xmat <- qlogisWithCap(xmat) } @@ -1056,6 +1386,355 @@ plotCountCorHeatmap.FRASER <- function(object, setMethod("plotCountCorHeatmap", signature="FraserDataSet", plotCountCorHeatmap.FRASER) +#' +#' Plot coverage from bam files for given genomic range and sample ids +#' +#' @rdname plotFunctions +#' @export +plotBamCoverage <- function(fds, gr, sampleID, + control_samples=sample( + samples(fds[, which(samples(fds) != sampleID)]), + min(3, ncol(fds)-length(sampleID))), + txdb=NULL, min_junction_count=20, + highlight_range=NULL, highlight_range_color="firebrick", + color_annotated="gray", color_novel="goldenrod3", + color_sample_interest="firebrick", color_control_samples="dodgerblue4", + toscale=c("exon", "gene", "none"), mar=c(2, 10, 0.1, 5), + curvature_splicegraph=1, curvature_coverage=1, cex=1, + splicegraph_labels=c("genomic_range", "id", "name", "none"), + splicegraph_position=c("top", "bottom"), ...){ + + if(missing(fds)){ + stop("Missing input: fds (FraserDataSet object)") + } else{ + stopifnot(is(fds, "FraserDataSet")) + } + if(missing(gr)){ + stop("Missing input gr (genomic range to plot).") + } else{ + stopifnot(is(gr, "GenomicRanges")) + stopifnot(length(gr) > 0) + } + if(missing(sampleID)){ + stop("Missing input: sample_of_interest") + } + toscale <- match.arg(toscale) + splicegraph_labels <- match.arg(splicegraph_labels) + splicegraph_position <- match.arg(splicegraph_position) + + # extract bam info for sample ids to plot + all_sids <- c(sampleID, control_samples) + si_out <- getSGSeqSI(fds, all_sids) + sgseq_si <- si_out[[1]] + fds <- si_out[[2]] + + # collapse input ranges if several + if(any(strand(gr) == "*")){ + # seems to throw an error with * strand so guessing strand instead + if(all(strand(gr) == "*")){ + guessStrand <- "+" + } else{ + guessStrand <- strand(gr[strand(gr) != "*"])[1] + } + strand(gr)[strand(gr) == "*"] <- guessStrand + warning("Input genomic ranges contained unstranded ranges.\n", + "This function needs strand information, guessing strand to ", + "be ", guessStrand, ".") + } + if(!all(strand(gr) == strand(gr[1,]))){ + warning("Input genomic ranges contained ranges on different strands,\n", + "only showing coverage for the ", strand(gr[1,]), " strand.") + strand(gr) <- rep(strand(gr[1,]), length(gr)) + } + gr <- range(gr) + gr <- keepSeqlevels(gr, unique(as.character(seqnames(gr)))) + + # convert highlight_range to GRangesList if not + if(!is.null(highlight_range) && !is(highlight_range, "GRangesList")){ + stopifnot(is(highlight_range, "GRanges")) + highlight_range <- GRangesList(highlight_range) + } + + # extract splice graph + sgfc_pred <- SGSeq::analyzeFeatures(sgseq_si, which = gr, + min_junction_count=min_junction_count, psi=0, + 
...) + + # overlap detected junctions with annotation + if(!is.null(txdb)){ + # subset to chr of interest + seqlevels(txdb) <- unique(as.character(seqnames(gr))) + + # extract transcript features with SGSeq package + txf <- SGSeq::convertToTxFeatures(txdb) + txf <- txf[txf %over% gr] + + # restore seqlevels of txdb object + seqlevels(txdb) <- seqlevels0(txdb) + + # annotate splice junctions with annotation features + sgfc_pred <- SGSeq::annotate(sgfc_pred, txf) + } else{ + # when no annotation is given, show everything in the same color + color_novel <- color_annotated + } + + # get genomic positions for first and last exon in given range + if(splicegraph_labels == "genomic_range"){ + # tell plotSpliceGraph function to use custom labels + splicegraph_labels <- "label" + # create custom labels (only for first and last exon for readability) + mcols(sgfc_pred)$label <- "" + exons <- which(SGSeq::type(sgfc_pred) == "E" & + rowRanges(sgfc_pred) %over% gr) + exons <- unique(c(exons[1], tail(exons, n=1))) + if(length(exons) == 1){ + mcols(sgfc_pred)$label[exons] <- + paste(seqnames(sgfc_pred), + paste(start(sgfc_pred), end(sgfc_pred), sep="-"), + strand(sgfc_pred), sep=":")[exons] + } + if(length(exons) == 2){ + mcols(sgfc_pred)$label[exons[1]] <- + paste(seqnames(sgfc_pred), + start(sgfc_pred), + strand(sgfc_pred), sep=":")[exons[1]] + mcols(sgfc_pred)$label[exons[2]] <- + paste(seqnames(sgfc_pred), + end(sgfc_pred), + strand(sgfc_pred), sep=":")[exons[2]] + } + } + + # plot splice graph and coverage of junctions from bam + nr_sa2p <- length(all_sids) + par(mfrow = c(nr_sa2p+1, 1), mar=mar, cex=cex) + if(splicegraph_position == "top"){ + SGSeq::plotSpliceGraph(rowRanges(sgfc_pred), + which=gr, + toscale=toscale, + color=color_annotated, + color_novel=color_novel, + ypos=c(0.25, 0.1), + ranges=highlight_range, + ranges_color=highlight_range_color, + ranges_ypos=c(0.01, 0.02), + curvature=curvature_splicegraph, + label=splicegraph_labels) + } + for (j in seq_along(sampleID)) { + SGSeq::plotCoverage( + sgfc_pred[, which(colnames(sgfc_pred) == sampleID[j])], + which = gr, + toscale = toscale, + label=sampleID[j], + color=color_sample_interest, + curvature=curvature_coverage) + } + for (j in seq_along(control_samples)) { + SGSeq::plotCoverage( + sgfc_pred[, which(colnames(sgfc_pred) == control_samples[j])], + which = gr, + toscale = toscale, + label=control_samples[j], + color=color_control_samples, + curvature=curvature_coverage) + } + if(splicegraph_position == "bottom"){ + SGSeq::plotSpliceGraph(rowRanges(sgfc_pred), + which=gr, + toscale=toscale, + color_novel=color_novel, + ypos=c(0.25, 0.1), + ranges=highlight_range, + ranges_color=highlight_range_color, + ranges_ypos=c(0.01, 0.02), + curvature=curvature_splicegraph, + label=splicegraph_labels) + } + + return(invisible(fds)) +} + +#' +#' Plot coverage from bam files for given row of results table +#' +#' @rdname plotFunctions +#' @export +plotBamCoverageFromResultTable <- function(fds, result, show_full_gene=FALSE, + txdb=NULL, orgDb=NULL, res_gene_col="hgncSymbol", + res_geneid_type="SYMBOL", txdb_geneid_type="ENTREZID", + left_extension=1000, right_extension=1000, ...){ + stopifnot(is(fds, "FraserDataSet")) + + if(is(result, "GenomicRanges")){ + result <- as.data.table(result) + } + + stopifnot(is.data.table(result)) + stopifnot(result[,.N] == 1) + + sid <- result[,sampleID] + jidx <- getIndexFromResultTable(fds, result) + outlier_range <- rowRanges(fds, type=result[,type])[jidx,] + + # showing either full range of the gene in which the 
outlier occured + if(show_full_gene == TRUE){ + if(missing(txdb)){ + stop("Missing input: txdb (for extracting gene range)") + } + if(missing(orgDb)){ + stop("Missing input: orgDb (for mapping of IDs to txdb)") + } + result_gene <- result[,get(res_gene_col)] + result_gene <- strsplit(result_gene, ";", fixed=TRUE)[[1]] + if(is.data.table(orgDb)){ + tmp <- merge(x=as.data.table(genes(txdb))[,.(gene_id)], y=orgDb, + by.y=txdb_geneid_type, by.x="gene_id", all.x=TRUE, + sort=FALSE)[,.(gene_id, feature=get(res_geneid_type))] + setnames(tmp, "feature", res_geneid_type) + txdb_geneid <- tmp[get(res_geneid_type) %in% result_gene, gene_id] + } else { + tmp <- as.data.table( + select(orgDb, + keys=result_gene, + columns=txdb_geneid_type, + keytype=res_geneid_type) + ) + txdb_geneid <- tmp[, get(txdb_geneid_type)] + } + gr <- genes(txdb, filter=list("gene_id"=txdb_geneid)) + if(length(gr) == 0){ + stop("Could not extract genomic coordinates for input gene.") + } + } else{ + # or just showing a certain region around the outlier junction + gr <- outlier_range + start(gr) <- start(gr) - left_extension + end(gr) <- end(gr) + right_extension + } + + # if several genes overlap, only show those on same strand as outlier + if(as.character(strand(outlier_range)) != "*" & + length(gr[strand(gr) == strand(outlier_range),]) > 0){ + gr <- gr[strand(gr) == strand(outlier_range),] + } + + # create the coverage plot for the given outlier + fds <- plotBamCoverage(fds, + gr=gr, + sampleID=sid, + txdb=txdb, + highlight_range=outlier_range, + ...) + return(invisible(fds)) +} + +plotManhattan.FRASER <- function(object, sampleID, value="pvalue", + type=fitMetrics(object), chr=NULL, + main=paste0("sample: ", sampleID), + chrColor=c("black", "darkgrey"), + subsetName=NULL, ...){ + # check necessary packages + if (!requireNamespace('ggbio')){ + stop("For this function, the ggbio package is required.") + } + if (!requireNamespace('biovizBase')){ + stop("For this function, the biovizBase package is required.") + } + + # check arguments + stopifnot(is(object, "FraserDataSet")) + stopifnot(sampleID %in% samples(object)) + type <- match.arg(type) + additional_args <- list(...) 
+ padjCutoff <- 0.1 + if("padjCutoff" %in% names(additional_args)){ + padjCutoff <- additional_args$padjCutoff + } + deltaPsiCutoff <- ifelse(type == "jaccard", 0.1, 0.3) + if("deltaPsiCutoff" %in% names(additional_args)){ + deltaPsiCutoff <- additional_args$deltaPsiCutoff + } + + # extract neccessary informations + gr_sample <- rowRanges(object, type=type) + seqlevelsStyle(gr_sample) <- seqlevelsStyle(object) + mcols(gr_sample)[,"pvalue"] <- -log10( + pVals(object, type=type, level="junction")[,sampleID]) + mcols(gr_sample)[,"padjust"] <- -log10(padjVals(object, type=type, + level="site", subsetName=subsetName)[,sampleID]) + mcols(gr_sample)[,"delta"] <- deltaPsiValue(object, type=type)[,sampleID] + + # Add values to granges + if(value %in% c('pvalue', 'pValue', 'pv')){ + gr_sample$value <- mcols(gr_sample)[, "pvalue"] + ylabel <- expression(paste(-log[10], "(P-value)")) + } + if(value %in% c('zscore', 'zScore')){ + gr_sample$value <- zScores(object, type=type)[, sampleID] + ylabel <- value + } + if(value %in% c('delta', 'deltaPsi', 'deltaJaccard')){ + gr_sample$value <- mcols(gr_sample)[, "delta"] + ylabel <- bquote(Delta ~ .(ggplotLabelPsi(type)[[1]])) + } + + # only one point per donor/acceptor site (relevant only for psi5 and psi3) + index <- getSiteIndex(object, type=type) + nonDup <- !duplicated(index) + gr_sample <- gr_sample[nonDup,] + + # Sort granges for plot + gr_sample <- sortSeqlevels(gr_sample) + gr_sample <- sort(gr_sample) + + # subset to chromosomes in chrSubset if requested + if(!is.null(chr)){ + # check input + if(!all(chr %in% unique(seqnames(gr_sample)))){ + stop("Not all chromosomes selected for subsetting are present ", + "in the GRanges object.") + } + + # subset + gr_sample <- gr_sample[as.character(seqnames(gr_sample)) %in% chr] + + # add chr to plot title if only one chr given + if(length(chr) == 1){ + main <- paste0(main, "; ", + paste(chr, collapse=", ", sep="")) + } + } + + # find outlier indices + if(!type %in% c("psi3", "psi5")){ + outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff) & + abs(gr_sample$delta) >= deltaPsiCutoff) + } else{ + outlier_idx <- which(gr_sample$padjust >= -log10(padjCutoff)) + } + message("highlighting ", length(gr_sample[outlier_idx,]), " outliers ...") + + # plot manhattan plot + plotGrandLinear.adapted(gr_sample, aes(y=value), + color=chrColor, + highlight.gr=gr_sample[outlier_idx,], + highlight.overlap="equal", + use.genome.coords=is.null(chr)) + + labs(title=main, x="", y=ylabel) + +} + +#' +#' Plot manhattan plot of junction pvalues +#' +#' @rdname plotFunctions +#' @export +setMethod("plotManhattan", signature="FraserDataSet", + plotManhattan.FRASER) + + #' #' helper function to get the annotation as data frame from the col data object #' @@ -1099,6 +1778,7 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ if(isFALSE(asCharacter)){ vapply(type, FUN=function(x) switch (x, + jaccard = c(bquote(Intron~Jaccard~Index)), psi5 = c(bquote(psi[5])), psi3 = c(bquote(psi[3])), theta = c(bquote(theta))), @@ -1106,9 +1786,180 @@ ggplotLabelPsi <- function(type, asCharacter=FALSE){ } else{ vapply(type, FUN=function(x) switch (x, + jaccard = "Intron-Jaccard-Index", psi5 = "psi[5]", psi3 = "psi[3]", theta = "theta"), FUN.VALUE=character(1)) } } + +#' +#' Extract info from bam files needed for SGSeq functions to work +#' +#' @noRd +getSGSeqSI <- function(fds, sample_ids){ + + # check if bam info is already stored in fds for given samples + if("SGSeq_sampleinfo" %in% names(metadata(fds))){ + si <- 
metadata(fds)[["SGSeq_sampleinfo"]] + si <- si[si$sample_name %in% sample_ids,] + if(nrow(si) != length(sample_ids)){ + # add bam info for missing sample_ids + missing_ids <- sample_ids[!sample_ids %in% si$sample_name] + message("Extracting SGSeq sample info from BAM files for samples ", + paste(missing_ids, collapse=", "), " ...") + df_missing <- data.frame( + sample_name=samples(fds)[samples(fds) %in% missing_ids], + file_bam=bamFile(fds)[samples(fds) %in% missing_ids]) + si_new <- SGSeq::getBamInfo(df_missing, yieldSize=1e6) + si_new$lib_size <- 50e6 # dummy value to speed up this part + si <- rbind(si, si_new) + metadata(fds)[["SGSeq_sampleinfo"]] <- + rbind(metadata(fds)[["SGSeq_sampleinfo"]], si_new) + } + return(list(si, fds)) + } else{ + message("Extracting SGSeq sample info from BAM files for samples ", + paste(sample_ids, collapse=", "), " ...") + df <- data.frame( + sample_name=samples(fds)[samples(fds) %in% sample_ids], + file_bam=bamFile(fds)[samples(fds) %in% sample_ids]) + si <- SGSeq::getBamInfo(df, yieldSize=1e6) + si$lib_size <- 50e6 # dummy value to speed up this part + metadata(fds)[["SGSeq_sampleinfo"]] <- si + return(list(si, fds)) + } +} + +#' +#' Adapted function from ggbio package to create manhattan plot. +#' Adapted to allow highlighting only ranges that exactly match. Uses functions +#' from package biovizBase. +#' +#' @noRd +plotGrandLinear.adapted <- function (obj, ..., facets, space.skip = 0.01, + geom = NULL, cutoff = NULL, cutoff.color = "red", cutoff.size = 1, + legend = FALSE, xlim, ylim, xlab, ylab, main, highlight.gr = NULL, + highlight.name = NULL, highlight.col = "red", highlight.label = TRUE, + highlight.label.size = 5, highlight.label.offset = 0.05, + highlight.label.col = "black", + highlight.overlap = c("any", "start", "end", "within", "equal"), + spaceline = FALSE, use.genome.coords=TRUE){ + if (is.null(geom)) + geom <- "point" + args <- list(...) 
+ args.aes <- biovizBase::parseArgsForAes(args) + args.non <- biovizBase::parseArgsForNonAes(args) + two.color <- c("#0080FF", "#4CC4FF") + .is.seq <- FALSE + if (!"colour" %in% names(args.aes)) { + if (!any(c("color", "colour") %in% names(args.non))) { + .color <- two.color + args.aes$color <- as.name("seqnames") + .is.seq <- TRUE + } + else { + if (length(args.non$color) > 1) { + .color <- args.non$color + args.aes$color <- as.name("seqnames") + .is.seq <- TRUE + args.non <- args.non[!names(args.non) %in% c("colour", + "color")] + } + } + } + else { + if (quo_name(args.aes$colour) == "seqnames") + args.aes$colour <- as.name("seqnames") + } + if (!"y" %in% names(args.aes)) + stop("need to provide y") + if(isTRUE(use.genome.coords)){ + args.non$coord <- "genome" + } + args.non$space.skip <- space.skip + args.non$geom <- geom + args.non$object <- obj + aes.res <- do.call(aes, args.aes) + p <- do.call(ggbio::autoplot, c(list(aes.res), args.non)) + if (!legend) + p <- p + theme(legend.position = "none") + if (!missing(ylab)) + p <- p + ylab(ylab) + if (!is.null(cutoff)) + p <- p + geom_hline(yintercept = cutoff, color = cutoff.color, + size = cutoff.size) + chrs <- names(seqlengths(obj)) + if (.is.seq) { + N <- length(chrs) + cols <- rep(.color, round(N/length(.color)) + 1)[1:N] + names(cols) <- chrs + p <- p + scale_color_manual(values = cols) + } + if (!missing(facets)) { + args$facets <- facets + args.facets <- biovizBase::subsetArgsByFormals(args, facet_grid, + facet_wrap) + facet <- ggbio:::.buildFacetsFromArgs(obj, args.facets) + p <- p + facet + } + p <- p + theme(panel.grid.minor = element_blank()) + if (!is.null(highlight.gr)) { + highlight.overlap <- match.arg(highlight.overlap) + idx <- findOverlaps(obj, highlight.gr, type=highlight.overlap) + .h.pos <- lapply(split(queryHits(idx), subjectHits(idx)), + function(id) { + gr <- GRanges(as.character(seqnames(p@data))[id][1], + IRanges(start = min(start(p@data[id])), end = max(end(p@data[id])))) + val <- max(as.numeric(values(p@data[id])[, quo_name(args.aes$y)])) + val <- val * (1 + highlight.label.offset) + values(gr)$val <- val + gr + }) + .h.pos <- suppressWarnings(do.call("c", unname(.h.pos))) + if (length(.h.pos)) { + if (is.null(highlight.name)) { + highlight.name <- names(highlight.gr) + } + else { + highlight.name <- values(highlight.gr)[, highlight.name] + } + p <- p + geom_point(data = biovizBase::mold(p@data[queryHits(idx)]), + do.call(aes, list(x = substitute(midpoint), y = args.aes$y)), + color = highlight.col) + if (!is.null(highlight.name)) { + seqlevels(.h.pos, pruning.mode = "coarse") <- seqlevels(obj) + suppressWarnings(seqinfo(.h.pos) <- seqinfo(obj)) + .trans <- biovizBase::transformToGenome(.h.pos, space.skip = space.skip) + values(.trans)$mean <- (start(.trans) + end(.trans))/2 + values(.trans)$names <- highlight.name + p <- p + geom_text(data = biovizBase::mold(.trans), + size = highlight.label.size, + vjust = 0, color = highlight.label.col, do.call(aes, + list(x = substitute(mean), y = as.name("val"), + label = as.name("names")))) + } + } + } + if (spaceline) { + vline.df <- p@ggplot$data + vline.df <- do.call(rbind, by(vline.df, vline.df$seqnames, + function(dd) { + data.frame(start = min(dd$start), end = max(dd$end)) + })) + gap <- (vline.df$start[-1] + vline.df$end[-nrow(vline.df)])/2 + p <- p + geom_vline(xintercept = gap, alpha = 0.5, color = "gray70") + + theme(panel.grid = element_blank()) + } + if (!missing(main)) + p <- p + labs(title = main) + if (!missing(xlim)) + p <- p + xlim(xlim) + if 
(!missing(ylim)) + p <- p + ylim(ylim) + if (missing(xlab)) + xlab <- "" + p <- p + ggplot2::xlab(xlab) + p +} diff --git a/R/pvalsNzscore.R b/R/pvalsNzscore.R index deac8b59..be256009 100644 --- a/R/pvalsNzscore.R +++ b/R/pvalsNzscore.R @@ -33,7 +33,7 @@ calculateZscore <- function(fds, type=currentType(fds), logit=TRUE){ #' @describeIn FRASER This function calculates two-sided p-values based on #' the beta-binomial distribution (or binomial or normal if desired). The -#' returned p values are already adjusted with Holm's method per donor or +#' returned p values are not yet adjusted with Holm's method per donor or #' acceptor site, respectively. #' #' @param distributions The distribution based on which the p-values are @@ -109,11 +109,6 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5) pVals(fds, dist="BetaBinomial", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="BetaBinomial", level="site", - withDimnames=FALSE) <- fwer_pvals } if("binomial" %in% distributions){ @@ -125,11 +120,6 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval + dval, 0.5) pVals(fds, dist="Binomial", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="Binomial", level="site", - withDimnames=FALSE) <- fwer_pvals } if("normal" %in% distributions){ @@ -142,22 +132,57 @@ calculatePvalues <- function(fds, type=currentType(fds), pvals <- 2 * pmin(pval, 1 - pval, 0.5) pVals(fds, dist="Normal", level="junction", withDimnames=FALSE) <- pvals - fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, - pvals=pvals, index, BPPARAM=BPPARAM) - fwer_pvals <- do.call(cbind, fwer_pval) - pVals(fds, dist="Normal", level="site", - withDimnames=FALSE) <- fwer_pvals } fds } -adjust_FWER_PValues <- function(i, pvals=pvals, index=index){ - dt <- data.table(p=pvals[,i], idx=index) - dt2 <- dt[,.(pa=min(p.adjust(p, method="holm"), na.rm=TRUE)),by=idx] +adjust_FWER_PValues <- function(i, pvals, index, rho, rhoCutoff, + method="holm"){ + dt <- data.table(p=pvals[,i], idx=index, rho=rho) + dt[rho > rhoCutoff, p:=NA] + suppressWarnings(dt2 <- dt[,.(pa=min(p.adjust(p, method=method), + na.rm=TRUE)),by=idx]) + dt2[is.infinite(pa), pa:=NA] setkey(dt2, "idx")[J(index)][,pa] } +adjust_FWER_PValues_per_idx <- function(i, pvals, index, rho, rhoCutoff, + method="holm"){ + pvals[rho > rhoCutoff,] <- NA + dttmp <- data.table(idx=index, rho=rho, + apply(pvals, 2, as.numeric))[idx == i,] + suppressWarnings( + pa <- apply(as.matrix(dttmp[,-c("idx", "rho")]), 2, + function(x) min(p.adjust(x, method=method), + na.rm = TRUE) ) + ) + pa[is.infinite(pa)] <- NA + return(pa) +} + +getFWERpvals_bySample <- function(pvals, index, rho, method="holm", + rhoCutoff, BPPARAM=bpparam()){ + fwer_pval <- bplapply(seq_col(pvals), adjust_FWER_PValues, + pvals=pvals, index, BPPARAM=BPPARAM, + method=method, rho=rho, rhoCutoff=rhoCutoff) + fwer_pvals <- do.call(cbind, fwer_pval) + return(fwer_pvals) +} + +getFWERpvals_byIdx <- function(pvals, index, rho, method="holm", + rhoCutoff, BPPARAM=bpparam()){ + unique_idx <- unique(index) + fwer_pval <- bplapply(unique_idx, adjust_FWER_PValues_per_idx, + pvals=pvals, index, BPPARAM=BPPARAM, + method=method, rho=rho, 
rhoCutoff=rhoCutoff) + fwer_pvals <- do.call(rbind, fwer_pval) + fwer_pvals <- as.matrix( + setkey(data.table(idx=unique_idx, fwer_pvals), + "idx")[J(index)][,-c("idx")]) + return(fwer_pvals) +} + singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){ ki <- k[idx,] @@ -171,7 +196,7 @@ singlePvalueBetaBinomial <- function(idx, k, n, mu, rho){ pvals <- pmin(1, pbbinom(ki, ni, alphai, betai)) if(any(is.na(pvals))){ - message(date(), " : ", idx) + message(date(), ": obtained NA pvalues for junction ", idx) } return (pvals) @@ -188,37 +213,159 @@ singlePvalueBinomial <- function(idx, k, n, mu){ } #' @describeIn FRASER This function adjusts the previously calculated -#' p-values per sample for multiple testing. +#' p-values per sample for multiple testing. First, the previoulsy calculated +#' junction-level p values are adjusted with Holm's method per donor or +#' acceptor site, respectively. Then, if gene symbols have been annotated to +#' junctions (and not otherwise requested), gene-level p values are computed. #' -#' @param method The p.adjust method that should be used. +#' @param method The p.adjust method that should be used for genome-wide +#' multiple testing correction. +#' @param rhoCutoff The cutoff value on the fitted rho value +#' (overdispersion parameter of the betabinomial) above which junctions are +#' masked with NA during p value adjustment (default: NA, no masking). +#' @param geneLevel Logical value indiciating whether gene-level p values +#' should be calculated. Defaults to TRUE. +#' @param geneColumn The column name of the column that has the gene annotation +#' that will be used for gene-level pvalue computation. +#' @param subsets A named list of named lists specifying any number of gene +#' subsets (can differ per sample). For each subset, FDR correction +#' will be limited to genes in the subset, and the FDR corrected +#' pvalues stored as an assay in the fds object in addition to the +#' transcriptome-wide FDR corrected pvalues. See the examples for +#' how to use this argument. 
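As an illustration of the subsets argument of calculatePadjValues (a sketch; the gene symbols are placeholders that must match the annotation stored in the fds, and the inner list names must be sampleIDs of the object):

genes_per_sample <- list(
    sample1=c("gene1", "gene5"),
    sample2=c("gene3"))
fds <- calculatePadjValues(fds, type="jaccard", method="BY",
    subsets=list(candidateGenes=genes_per_sample))
# the subset FDR is stored alongside the transcriptome-wide FDR and is
# labelled with its subset name in the results table
res <- as.data.table(results(fds))
res[FDR_set == "candidateGenes"]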
#' #' @export -calculatePadjValues <- function(fds, type=currentType(fds), method="BY"){ +calculatePadjValues <- function(fds, type=currentType(fds), method="BY", + rhoCutoff=NA, geneLevel=TRUE, + geneColumn="hgnc_symbol", subsets=NULL, + BPPARAM=bpparam()){ currentType(fds) <- type index <- getSiteIndex(fds, type=type) idx <- !duplicated(index) for(i in c("BetaBinomial", "Binomial", "Normal")){ # only do it if it exists - if(!paste0("pvalues", i, "_", type) %in% assayNames(fds)){ + if(!paste0("pvalues", i, "_junction_", type) %in% assayNames(fds)){ next } - pvals <- pVals(fds, dist=i) - padj <- apply(pvals[idx,], 2, p.adjust, method=method) + pvals <- pVals(fds, dist=i, level="junction") + rho <- rho(fds, type=type) + + # splice site-level pval correction + message(date(), ": adjusting junction-level pvalues ...") + fwer_pvals <- getFWERpvals_bySample(pvals, index, rho, method="holm", + rhoCutoff=ifelse(is.na(rhoCutoff), 1, rhoCutoff), + BPPARAM=BPPARAM) + if(!is.na(rhoCutoff)){ + filters <- list(rho=rhoCutoff) + } else{ + filters <- list() + } + pVals(fds, dist=i, level="site", filters=filters, + withDimnames=FALSE) <- fwer_pvals + + # junction-level FDR correction + message(date(), ": genome-wide FDR for junction-level pvalues ...") + padj <- apply(fwer_pvals[idx,], 2, p.adjust, method=method) padjDT <- data.table(cbind(i=unique(index), padj), key="i")[J(index)] padjDT[,i:=NULL] - padjVals(fds, dist=i, withDimnames=FALSE) <- as.matrix(padjDT) + padjVals(fds, dist=i, level="site", filters=filters, + withDimnames=FALSE) <- as.matrix(padjDT) + + # gene-level pval correction and FDR + if(isTRUE(geneLevel) && + geneColumn %in% colnames(mcols(fds, type=type))){ + message(date(), ": calculating gene-level pvalues ...") + gene_pvals <- getPvalsPerGene(fds=fds, type=type, pvals=fwer_pvals, + method="holm", FDRmethod=method, + geneColumn=geneColumn, + BPPARAM=BPPARAM) + pVals(fds, dist=i, level="gene", filters=filters, + withDimnames=FALSE) <- gene_pvals[["pvals"]] + padjVals(fds, dist=i, level="gene", filters=filters, + withDimnames=FALSE) <- gene_pvals[["padj"]] + } else if(isTRUE(geneLevel)){ + warning("Gene-level pvalues could not be calculated as column ", + geneColumn, "\nwas not found for the given fds object. 
", + "Please annotate gene symbols \nfirst using the ", + "annotateRanges function.") + } + + # calculate FDR for each provided subset and assign to fds + if(!is.null(subsets)){ + stopifnot(is.list(subsets)) + stopifnot(!is.null(names(subsets))) + for(setName in names(subsets)){ + geneListSubset <- subsets[[setName]] + fds <- calculatePadjValuesOnSubset(fds=fds, + genesToTest=geneListSubset, + subsetName=setName, + type=type, method=method, + geneColumn=geneColumn, + BPPARAM=BPPARAM) + } + } } return(fds) } -getSiteIndex <- function(fds, type){ +getPvalsPerGene <- function(fds, type=currentType(fds), + pvals=pVals(fds, type=type, level="site"), + sampleID=NULL, method="holm", FDRmethod="BY", + geneColumn="hgnc_symbol", BPPARAM=bpparam()){ + # extract data and take only the first index of per site + message(date(), ": starting gene-level pval computation for type ", type) + samples <- samples(fds) + if(is.null(colnames(pvals))){ + colnames(pvals) <- samples + } + dt <- data.table( + idx=getSiteIndex(fds, type=type), + geneID=getGeneIDs(fds, type=type, unique=FALSE, + geneColumn=geneColumn), + as.data.table(pvals)) + dt <- dt[!is.na(geneID)] + geneIDs <- getGeneIDs(fds, type=type, unique=TRUE, + geneColumn=geneColumn) + + # separate geneIDs into rows + dt[, dt_idx:=seq_len(.N)] + dt_tmp <- dt[, splitGenes(geneID), by="dt_idx"] + dt <- dt[dt_tmp$dt_idx,] + dt[,`:=`(geneID=dt_tmp$V1, dt_idx=NULL)] + setkey(dt, geneID) + + # extract samples + if(!is.null(sampleID)){ + samples <- sampleID + } + + # aggregate pvalues to gene level per sample + message(date(), ": gene-level pval computation per gene (n=", + length(geneIDs), ")") + pvalsPerGene <- genePvalsByGeneID(dt, samples=samples, geneIDs=geneIDs, + method=method, BPPARAM=BPPARAM) + + # compute FDR + message(date(), ": genome-wide FDR for gene-level pvals for type ", type) + padjPerGene <- apply(pvalsPerGene, 2, p.adjust, method=FDRmethod) + + message(date(), ": finished gene-level pval computation for type ", type) + return(list(pvals=pvalsPerGene, padj=padjPerGene)) + +} + +getSiteIndex <- function(fds, type=currentType(fds)){ if(type == "theta"){ return(mcols(fds, type=type)[['spliceSiteID']]) } + if(type == "jaccard"){ + return(seq_len(nrow(fds))) + } + startId <- mcols(fds, type=type)[,"startID"] endId <- mcols(fds, type=type)[,"endID"] strand <- strand(rowRanges(fds, type=type)) @@ -233,48 +380,178 @@ getSiteIndex <- function(fds, type){ ans[selectionMat] } -getGeneIDs <- function(fds, type, unique=TRUE){ - geneIDs <- mcols(fds, type=type)$hgnc_symbol +getGeneIDs <- function(fds, type=currentType(fds), unique=TRUE, + geneColumn="hgnc_symbol"){ + if(!geneColumn %in% colnames(mcols(fds, type=type))){ + stop("Did not find column '", geneColumn, "' in mcols(fds, type='", + type, "'). Please assign introns\nto genes first using the ", + "annotateRanges(fds, ...) or annotateRangesWithTxDb(fds, ...) 
", + "function.") + } + + geneIDs <- mcols(fds, type=type)[[geneColumn]] if(isTRUE(unique)){ - geneIDs <- unique(geneIDs) + geneIDs <- unique(unlist(lapply(geneIDs, FUN=function(g){ + unlist(strsplit(g, ";"))}) )) geneIDs <- geneIDs[!is.na(geneIDs)] } geneIDs } -getPvalsPerGene <- function(fds, type, pvals=pVals(fds, type=type), - sampleID=NULL, method="holm", BPPARAM=bpparam()){ - # extract data and take only the first index of per site - dt <- data.table( - idx=getSiteIndex(fds, type=type), - geneID=getGeneIDs(fds, type=type, unique=FALSE), - as.data.table(pvals)) - dt <- dt[!duplicated(idx) & !is.na(geneID)] - setkey(dt, geneID) +genePvalsByGeneID <- function(dt, samples, geneIDs, method, BPPARAM){ + pvalsPerGene <- bplapply(geneIDs, BPPARAM=BPPARAM, + FUN=function(g) { + dttmp <- dt[geneID == g][!duplicated(idx)] + suppressWarnings( + pval_g <- apply(as.matrix(dttmp[,-c("idx", "geneID")]), 2, + function(x) min(p.adjust(x, method=method), na.rm = TRUE) ) + ) + pval_g[is.infinite(pval_g)] <- NA + pval_g + }) + pvalsPerGene <- do.call(rbind, pvalsPerGene) + rownames(pvalsPerGene) <- geneIDs + return(pvalsPerGene) +} + +#' @describeIn FRASER This function does FDR correction only for all junctions +#' in a certain subset of genes which can differ per sample. Requires gene +#' symbols to have been annotated to junctions. As with the full FDR +#' correction across all junctions, first the previously calculated +#' junction-level p values are adjusted with Holm's method per donor or +#' acceptor site, respectively. Then, gene-level p values are computed. +#' +#' @param genesToTest A named list with the subset of genes to test per sample. +#' The names must correspond to the sampleIDs in the given fds object. +#' @param subsetName The name under which the resulting FDR corrected pvalues +#' will be stored in metadata(fds). 
+#' +#' @export +calculatePadjValuesOnSubset <- function(fds, genesToTest, subsetName, + type=currentType(fds), method='BY', + geneColumn="hgnc_symbol", BPPARAM=bpparam()){ - samples <- samples(fds) - if(!is.null(sampleID)){ - samples <- sampleID + # check input + stopifnot(!is.null(genesToTest)) + stopifnot(is.list(genesToTest) || is.vector(genesToTest)) + + # replicate subset genes for each sample if given as vector + if(!is.list(genesToTest)){ + genesToTest <- rep(list(genesToTest), ncol(fds)) + names(genesToTest) <- colnames(fds) } - pvalsPerGene <- matrix(unlist(bplapply(samples, BPPARAM=BPPARAM, - function(i){ - dttmp <- dt[,min(p.adjust(get(i), method=method)),by=geneID] - setkey(dttmp, geneID) - dttmp[J(getGeneIDs(fds, type=type)), V1] - })), ncol=length(samples)) + # check that names are present and correspond to samples in ods + stopifnot(!is.null(names(genesToTest))) + if(!all(names(genesToTest) %in% colnames(fds))){ + stop("names(genesToTest) need to be sampleIDs in the given fds object.") + } - colnames(pvalsPerGene) <- samples - rownames(pvalsPerGene) <- getGeneIDs(fds, type=type) + # get genes from fds object + fds_genes <- getGeneIDs(fds, unique=TRUE, type=type, geneColumn=geneColumn) + ngenes <- length(fds_genes) - return(pvalsPerGene) - + # site index (for psi3/5) + site_idx <- getSiteIndex(fds, type=type) + + # compute FDR on the given subsets of genes + message(date(), ": starting FDR calculation on subset of genes...") + + # compute FDR on the given subsets of genes + fdrSubset <- bplapply(colnames(fds), FUN=function(sampleId){ + + # get genes to test for this sample + genesToTestSample <- genesToTest[[sampleId]] + padj <- rep(NA, nrow(mcols(fds, type=type))) + padj_gene <- rep(NA, ngenes) + + # if no genes present in the subset for this sample, return NAs + if(is.null(genesToTestSample)){ + return(list(padj=padj, padj_gene=padj_gene)) + } + + # get idx of junctions corresponding to genes to test + if(is.character(genesToTestSample)){ + rowIdx <- sort(which(fds_genes %in% genesToTestSample)) + rowIdx <- unlist(lapply(genesToTestSample, function(gene){ + idx <- which(grepl(paste0("(^|;)", gene, "(;|$)"), + mcols(fds, type=type)[, geneColumn])) + names(idx) <- rep(gene, length(idx)) + if(length(idx) == 0 && verbose(fds) > 0){ + warning("No introns found in fds object for gene: ", gene, + " and sample: ", sampleId, ". 
Skipping this gene.") + } + return(idx) + })) + rowIdx <- sort(rowIdx[!duplicated(rowIdx)]) + } else{ + stop("Genes in the list to test must be a character vector ", + "of geneIDs.") + } + + # check that rowIdx is not empty vector + if(length(rowIdx) == 0){ + warning("No genes from the given subset found in the fds ", + "object for sample: ", sampleId) + return(list(padj=padj, padj_gene=padj_gene)) + } + + + + # retrieve pvalues of introns to test + p <- as.matrix(pVals(fds, type=type)) + if(ncol(p) == 1){ + colnames(p) <- colnames(fds) + } + p <- p[rowIdx, sampleId] + + # FDR correction on subset + non_dup_site_idx <- !duplicated(site_idx[rowIdx]) + padjSub <- p.adjust(p[non_dup_site_idx], method=method) + + # set intron FDR on subset (filled with NA for all other genes) + padj[rowIdx] <- padjSub + + # gene level pvals + dt <- data.table(sampleID=sampleId, type=type, pval=p, + gene=names(rowIdx), jidx=rowIdx, site_idx=site_idx[rowIdx]) + dt <- merge(dt, + data.table(site_idx=site_idx[rowIdx][non_dup_site_idx], + FDR_subset=padjSub), + by="site_idx") + dt[!duplicated(dt$site_idx), + pval_gene:=min(p.adjust(pval, method="holm")), by="gene"] + dt[, pval_gene := .SD[!is.na(pval_gene), unique(pval_gene)], by="gene"] + + # gene level FDR + dt2 <- dt[, unique(pval_gene), by="gene"] + dt2[, FDR_subset_gene := p.adjust(V1, method=method)] + dt2[, gene_rowIdx := sapply(gene, function(g) which(fds_genes == g))] + + # set intron FDR on subset (filled with NA for all other genes) + padj_gene[dt2[,gene_rowIdx]] <- dt2[, FDR_subset_gene] + + # return new FDR + return(list(padj=padj, padj_gene=padj_gene)) + + }, BPPARAM=BPPARAM) + padjSub <- vapply(fdrSubset, '[[', + double(nrow(mcols(fds, type=type))), 'padj') + padjSub_gene <- vapply(fdrSubset, '[[', double(ngenes), 'padj_gene') + + colnames(padjSub) <- colnames(fds) + rownames(padjSub_gene) <- fds_genes + colnames(padjSub_gene) <- colnames(fds) + + # add FDR subset info to ods object and return + padjVals(fds, type=type, level="site", subsetName=subsetName, + withDimnames=FALSE) <- padjSub + padjVals(fds, type=type, level="gene", subsetName=subsetName, + withDimnames=FALSE) <- padjSub_gene + addToAvailableFDRsubsets(fds) <- subsetName + + message(date(), ": finished FDR calculation on subset of genes.") + validObject(fds) + return(fds) } -getPadjPerGene <- function(pvals, method="BY"){ - - padjPerGene <- apply(pvals, 2, p.adjust, method=method) - - return(padjPerGene) - -} diff --git a/R/resultAnnotations.R b/R/resultAnnotations.R new file mode 100644 index 00000000..7ab9b61d --- /dev/null +++ b/R/resultAnnotations.R @@ -0,0 +1,974 @@ +#' +#' @title Additional result annotations +#' +#' @description These functions work on the result table and add additional +#' annotations to the reported introns: the type of potential impact on +#' splicing (e.g. exon skipping, exon truncation, ...), potential occurence +#' of frameshift, overlap with UTR regions as well as a flag for introns +#' that are located in blacklist regions of the genome. +#' +#' \code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the +#' fds for each intron based on the overlap of the intron's location with +#' the reference annotation. Has to be run before the result table is +#' created so that the new column can be included in it (see examples). +#' +#' \code{\link{annotatePotentialImpact}} annotates each intron in the results +#' table with the type of potential impact on splicing and potential +#' occurence of frameshift (likely, unlikely, inconclusive). 
Can also +#' calculate overlap with annotated UTR regions. Potential impact can be: +#' annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, +#' exonTruncation, exonElongation, exonTruncation&Elongation, +#' exonSkipping, splicingBeyondGene, +#' multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, +#' complex (everything else). +#' Splice sites (theta metric) annotations indicate how the splice site is +#' located with respect to the reference annotation. The annotated types +#' are: annotatedSpliceSite, exonicRegion, intronicRegion. +#' +#' \code{\link{flagBlacklistRegions}} flags introns in the results table on +#' whether or not they are located in a blacklist region of the genome. By +#' default, the blacklist regions as reported in +#' \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from +#' \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} +#' are used. +#' +#' @param fds A FraserDataSet +#' @param txdb A txdb object providing the reference annotation. +#' @param result A result table as generated by FRASER, including the column +#' \code{annotatedJunction} as generated by the function +#' \code{annotateIntronReferenceOverlap}. +#' @param addPotentialImpact Logical, indicating if the type of the potential +#' impact should be added to the results table. Defaults to \code{TRUE}. +#' @param addUTRoverlap Logical, indicating if the overlap with UTR regions +#' should checked and added to the results table. Defaults to \code{TRUE}. +#' @param minoverlap Integer value defining the number of base pairs around the +#' splice site that need to overlap with UTR or blacklist region, +#' respectivly, to be considered matching. Defaults to 5 bp. +#' @param blacklist_regions A BED file that contains the blacklist regions. +#' If \code{NULL} (default), the BED files that are packaged with FRASER +#' are used (see Details for more information). +#' @param assemblyVersion Indicates the genome assembly version of the intron +#' coordinates. Only used if blacklist_regions is NULL. For other versions, +#' please provide the BED file containing the blacklist regions directly. +#' @param BPPARAM For controlling parallelization behavior. Defaults to +#' \code{bpparam()}. +#' @return An annotated FraserDataSet or results table, respectively +#' +#' @name potentialImpactAnnotations +#' @rdname potentialImpactAnnotations +#' +#' @examples +#' # get data, fit and compute p-values and z-scores +#' fds <- createTestFraserDataSet() +#' +#' # load reference annotation +#' library(TxDb.Hsapiens.UCSC.hg19.knownGene) +#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene +#' +#' # add basic annotations for overlap with the reference annotation +#' # run this function before creating the results table +#' fds <- annotateIntronReferenceOverlap(fds, txdb) +#' +#' # extract results: for this small example dataset, no cutoffs used +#' # to get some results +#' res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) +#' +#' # annotate the type of potential impact on splicing and UTR overlap +#' res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds) +#' +#' # annotate overlap with blacklist regions +#' res <- flagBlacklistRegions(result=res, assemblyVersion="hg19") +#' +#' # show results table containing additional annotations +#' res +#' +NULL + +#' @describeIn potentialImpactAnnotations This method calculates basic annotations +#' based on overlap with the reference annotation (start, end, none, both) +#' for the full fds. 
The overlap type is added as a new column +#' \code{annotatedJunction} in \code{mcols(fds)}. +#' @export +annotateIntronReferenceOverlap <- function(fds, txdb, BPPARAM=bpparam()){ + message("loading introns ...") + #seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] + introns <- unique(unlist(intronsByTranscript(txdb))) + # reduce the introns to only the actually expressed introns + fds_known <- fds[unique(to(findOverlaps(introns, + rowRanges(fds, type = "j"), type = "equal"))),] + anno_introns <- as.data.table(rowRanges(fds_known, + type="j"))[,.(seqnames, start, end, strand)] + + # calculate extra columns with mean/median intron expression count + # add the new columns + sampleCounts <- as.matrix(K(fds_known, type = "j")) + anno_introns[, meanCount := rowMeans(sampleCounts)] + anno_introns[, medianCount := rowMedians(sampleCounts)] + # order by medianCount (highest first) + setorderv(anno_introns, "medianCount", order=-1) + anno_introns_ranges <- makeGRangesFromDataFrame(anno_introns, + keep.extra.columns = TRUE) + + # get all fds junctions + fds_junctions <- rowRanges(fds, type = "j") + + # Do the annotation just for the intron with highest median expression + message("start calculating basic annotations ...") + overlaps <- findOverlaps(fds_junctions, anno_introns_ranges, select="first") + annotations <- bplapply(seq_len(length(fds_junctions)), + function(i, overlaps, fds_junctions, anno_introns_ranges){ + # only select first intron as already ordered by medianCount beforehand + overlap <- overlaps[i] + if(is.na(overlap)) return("none") #no overlap with any intron + + hit_equal <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="equal")) + if(length(hit_equal) > 0) return("both") + + hit_start <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="start")) + if(length(hit_start) > 0) return("start") + hit_end <- from(findOverlaps(fds_junctions[i], + anno_introns_ranges[overlap], + type="end")) + if(length(hit_end) > 0) return("end") + + # overlaps but no start/end match + return("none") + }, overlaps=overlaps, fds_junctions=fds_junctions, + anno_introns_ranges=anno_introns_ranges, BPPARAM=BPPARAM) + annotations <- unlist(annotations) + + rowRanges(fds)$annotatedJunction <- annotations + mcols(fds, type="ss")$annotatedJunction <- "not computed" + message("basic annotations done") + return(fds) +} + +#' @describeIn potentialImpactAnnotations This method annotates the splice event +#' type to junctions in the given results table. 
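To make the classification done in `annotateIntronReferenceOverlap()` above concrete, here is a small self-contained sketch on toy ranges. It ignores the expression-based ordering of the reference introns used in the actual implementation and simply classifies a query junction against its first overlapping reference intron (all objects below are illustrative):

```r
library(GenomicRanges)

ref_introns <- GRanges("chr1", IRanges(start=c(100, 300), end=c(200, 400)))
junction    <- GRanges("chr1", IRanges(start=100, end=250))

classifyOverlap <- function(jx, introns){
    hit <- findOverlaps(jx, introns, select="first")
    if(is.na(hit)) return("none")                  # no overlapping intron
    if(length(findOverlaps(jx, introns[hit], type="equal")) > 0) return("both")
    if(length(findOverlaps(jx, introns[hit], type="start")) > 0) return("start")
    if(length(findOverlaps(jx, introns[hit], type="end"))   > 0) return("end")
    "none"                                         # overlap, but no shared boundary
}

classifyOverlap(junction, ref_introns)   # same start coordinate -> "start"
```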
+#' @export +annotatePotentialImpact <- function(result, txdb, fds, addPotentialImpact=TRUE, + addUTRoverlap=TRUE, minoverlap=5, + BPPARAM=bpparam()){ + + # convert to data.table if not already + if(!is.data.table(result)){ + annoResult <- as.data.table(result) + } else{ + annoResult <- result + } + + # Create basic annotation of overlap with reference + if(!("annotatedJunction" %in% colnames(annoResult))){ + stop("Column 'annotatedJunction' not found in the results table!\n", + "Please run 'fds <- annotateIntronReferenceOverlap(fds, txdb)' ", + "first and then extract \nthe results table using the ", + "'results(fds, ...)' function before calling this function.") + } + + # Calculate splice types and frameshift + if(isTRUE(addPotentialImpact)){ + annoResult <- addPotentialImpactLabels(annoResult, fds, txdb) + annoResult[potentialImpact == "singleExonSkipping", + potentialImpact := "exonSkipping"] + } + + # Add UTR labels + if(isTRUE(addUTRoverlap)){ + annoResult <- addUTRLabels(annoResult, txdb) + } + + if(is(result, "GenomicRanges")){ + annoResult <- makeGRangesFromDataFrame(annoResult, + keep.extra.columns=TRUE) + } + + return(annoResult) +} + +#' @describeIn potentialImpactAnnotations This method flags all introns and +#' splice sites in the given results table for which at least one splice +#' site (donor or acceptor) is located in a blacklist region. Blacklist +#' regions of the genome are determined from the provided BED file. +#' @export +flagBlacklistRegions <- function(result, blacklist_regions=NULL, + assemblyVersion=c('hg19', 'hg38'), + minoverlap=5){ + + # convert to data.table if not already + if(!is.data.table(result)){ + annoResult <- as.data.table(result) + } else{ + annoResult <- result + } + + assemblyVersion <- match.arg(assemblyVersion) + if(is.null(blacklist_regions)){ + blacklist_regions <- + system.file("extdata", "blacklist_regions", + paste0(assemblyVersion, "-blacklist.v2.bed.gz"), + package = "FRASER") + } + if(!file.exists(blacklist_regions)){ + stop("BED file with blacklist regions does not exist: ", + blacklist_regions) + } + message("Importing blacklist regions ...") + blacklist_gr <- rtracklayer::import(blacklist_regions, format = "BED") + annoResult <- addBlacklistLabels(annoResult, blacklist_gr) + + if(is(result, "GenomicRanges")){ + annoResult <- makeGRangesFromDataFrame(annoResult, + keep.extra.columns=TRUE) + } + + return(annoResult) +} + +############# helper functions ############################## + +#' blacklist annotation for aberrant splicing events +#' @noRd +addBlacklistLabels <- function(junctions_dt, blacklist_gr, minoverlap=5){ + # add the blacklist information + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt) + + # get gr with start/end positions of each intron + gr_start_ss <- junctions_gr + end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1 + start(gr_start_ss) <- start(gr_start_ss) - minoverlap + gr_end_ss <- junctions_gr + start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1 + end(gr_end_ss) <- end(gr_end_ss) + minoverlap + + # set to the same seqlevelsstyle + seqlevelsStyle(blacklist_gr) <- seqlevelsStyle(junctions_gr) + + ## create overlap with blacklist and annotate extra column + message("finding blacklist overlap ...") + black_hits_start_ss <- unique(from(findOverlaps(gr_start_ss, blacklist_gr))) + black_hits_end_ss <- unique(from(findOverlaps(gr_end_ss, blacklist_gr))) + junctions_dt[, blacklist := FALSE] + + junctions_dt[black_hits_start_ss, 
blacklist := TRUE] + junctions_dt[black_hits_end_ss, blacklist := TRUE] + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + + message("blacklist labels done") + return(junctions_dt) +} + +#' adds UTR overlap annotation to results table +#' @noRd +addUTRLabels <- function(junctions_dt, txdb, minoverlap=5){ + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + # get gr with start/end positions of each intron + gr_start_ss <- junctions_gr + end(gr_start_ss) <- start(gr_start_ss) + minoverlap - 1 + start(gr_start_ss) <- start(gr_start_ss) - minoverlap + gr_end_ss <- junctions_gr + start(gr_end_ss) <- end(gr_end_ss) - minoverlap + 1 + end(gr_end_ss) <- end(gr_end_ss) + minoverlap + + ### UTR labels based on txdb file + ### add 5' 3' UTR labels + message("finding UTR overlap ...") + threes_start <- unique(from(findOverlaps(gr_start_ss, + threeUTRsByTranscript(txdb, use.names = TRUE)))) + threes_end <- unique(from(findOverlaps(gr_end_ss, + threeUTRsByTranscript(txdb, use.names = TRUE)))) + fives_start <- unique(from(findOverlaps(gr_start_ss, + fiveUTRsByTranscript(txdb, use.names = TRUE)))) + fives_end <- unique(from(findOverlaps(gr_end_ss, + fiveUTRsByTranscript(txdb, use.names = TRUE)))) + junctions_dt[, UTR_overlap := "no"] + junctions_dt[threes_start, UTR_overlap := "3'-UTR"] + junctions_dt[threes_end, UTR_overlap := "3'-UTR"] + junctions_dt[fives_start, UTR_overlap := "5'-UTR"] + junctions_dt[fives_end, UTR_overlap := "5'-UTR"] + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + message("UTR labels done") + return(junctions_dt) +} + + + +#' adds type of splicing to each intron in the results table +#' @noRd +addPotentialImpactLabels <- function(junctions_dt, fds, txdb){ + message("preparing ...") + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = TRUE) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + introns_tmp <- unique(unlist(intronsByTranscript(txdb))) + exons <- exons(txdb) + + # seqlevelsStyle(fds) <- seqlevelsStyle(txdb)[1] + fds_known <- fds[unique(to(findOverlaps(introns_tmp, + rowRanges(fds, type = "j"), + type = "equal"))),] + grIntrons <- rowRanges(fds_known, type="j") + introns <- as.data.table(grIntrons) + introns <- introns[,.(seqnames, start, end, strand)] + + sampleCounts <- K(fds_known, type = "j") + introns[, "meanCount" := rowMeans(sampleCounts)] + introns[, "medianCount" := rowMedians(as.matrix(sampleCounts))] + intron_ranges <- makeGRangesFromDataFrame(introns, + keep.extra.columns = TRUE) + + # prepare the results column + junctions_dt[, potentialImpact := "complex"] + junctions_dt[, causesFrameshift := "inconclusive"] + junctions_dt[annotatedJunction == "both" & deltaPsi >= 0, + potentialImpact := "annotatedIntron_increasedUsage"] + junctions_dt[annotatedJunction == "both" & deltaPsi < 0, + potentialImpact := "annotatedIntron_reducedUsage"] + junctions_dt[annotatedJunction == "both", causesFrameshift := "unlikely"] + + if(all(c("nonsplitProportion", "nonsplitProportion_99quantile") %in% + colnames(junctions_dt))){ + junctions_dt[potentialImpact == "annotatedIntron_reducedUsage" & + type == "jaccard" & + nonsplitProportion >= nonsplitProportion_99quantile + 0.05 & + nonsplitCounts >= 10, + 
potentialImpact := "(partial)intronRetention"] + + # TODO check frameshift for intron retention + junctions_dt[potentialImpact == "(partial)intronRetention", + causesFrameshift := "inconclusive"] + } + + starts <- which(junctions_dt[psi_positions]$annotatedJunction=="start") + ends <- which(junctions_dt[psi_positions]$annotatedJunction=="end") + nones <- which(junctions_dt[psi_positions]$annotatedJunction=="none") + + message("calculating splice types ...") + # start junctions + start_results <- sapply(starts, function(i){ + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, + type = "start")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + return(compareEnds(junctions_gr, i, overlap[maxExpr], FALSE, + intron_ranges, exons)) + }) + junctions_dt[psi_positions[starts], + causesFrameshift:=start_results[2,]] + junctions_dt[psi_positions[starts], + potentialImpact := start_results[1,]] + + # end junctions + end_results <- sapply(ends, function(i){ + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges, + type = "end")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + return(compareStarts(junctions_gr, i, overlap[maxExpr], FALSE, + intron_ranges, exons)) + + }) + junctions_dt[psi_positions[ends], causesFrameshift:=end_results[2,]] + junctions_dt[psi_positions[ends], potentialImpact := end_results[1,]] + + # none junctions pt1 + none_results <- sapply(nones, function(i){ + # find most freq intron + # check start and end + + # find the most freq intron that overlaps again + overlap <- to(findOverlaps(junctions_gr[i], intron_ranges)) + if(length(overlap) == 0) return(c("noOverlap", "inconclusive")) + expre <- sapply(overlap, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + # returns type of exon splicing, frameshift TRUE/FALSE, amount of shift + st = compareStarts(junctions_gr, i, overlap[maxExpr], TRUE, + intron_ranges, exons) + en = compareEnds(junctions_gr, i, overlap[maxExpr], TRUE, + intron_ranges, exons) + + # merge, start and end results + # merge exon elongation/truncation + # if both likely/unlikely fine + # if one is likely -> return likely + # if one is notYet -> return notYet + if((st[1] == "singleExonSkipping" & !(en[1] %in% + c("singleExonSkipping", "exonSkipping"))) || + (en[1] == "singleExonSkipping" & !(st[1] %in% + c("singleExonSkipping", "exonSkipping")))){ + ## only one is single exonSkipping, the other is trunc/elong + if((as.integer(st[3])+as.integer(en[3])) %% 3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + return(c("singleExonSkipping", frs)) + } + + if(st[1] %in% c("exonSkipping", "singleExonSkipping") || en[1] %in% + c("exonSkipping", "singleExonSkipping")){ + return(c("exonSkipping", "inconclusive")) + } + + if((as.integer(st[3])+as.integer(en[3]))%%3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + if( st[1] != en[1]){ + combined = "exonTruncation&Elongation" + }else{combined = st[1]} + return(c(combined,frs)) + + }) + junctions_dt[psi_positions[nones], causesFrameshift:=none_results[2,]] + junctions_dt[psi_positions[nones], potentialImpact := none_results[1,]] + + noLaps <-which(junctions_dt[psi_positions]$potentialImpact=="noOverlap") + refseq.genes<- genes(txdb) + + # none junctions pt2 + noLaps_results <- sapply(noLaps, function(i){ + overlap 
<- to(findOverlaps(junctions_gr[i], exons)) + # no overlap with an intron or an exon + if(length(overlap) == 0){ + return(checkIntergenic(junctions_gr, i, refseq.genes)) + } + + # for the exons, check if splice site is contained in the exon + for(j in overlap){ + exon_start = start(exons[j]) + exon_end = end(exons[j]) + if(exon_start <= start(junctions_gr[i]) & + exon_end >= end(junctions_gr[i])){ + if((end(junctions_gr[i]) - + start(junctions_gr[i]) + 1) %% 3 != 0){ + frs = "likely" + }else{ frs = "unlikely"} + return(c("exonTruncation", frs)) + } + } + + return(c("complex","inconclusive")) + }) + junctions_dt[psi_positions[noLaps], + causesFrameshift:=noLaps_results[2,]] + junctions_dt[psi_positions[noLaps], + potentialImpact := noLaps_results[1,]] + + # theta annotations + thetas <- which(junctions_dt$type == "theta") + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[thetas,], + keep.extra.columns = TRUE) + + # specify default type for theta results as NA + junctions_dt[thetas, potentialImpact := NA] + junctions_dt[thetas, causesFrameshift := NA] + + # label all as intronic first if they have any intron overlap + intronic <- unique(from(findOverlaps(junctions_gr, introns_tmp))) + junctions_dt[thetas[intronic], potentialImpact := "intronicRegion"] + + # for exonic, check if theta is fully contained in an exon + # if one end is in an intron and the other in an exon it is a splice site + exonic <- unique(from(findOverlaps(junctions_gr, exons))) + within <- findOverlaps(junctions_gr, exons, type = "within") + all <- findOverlaps(junctions_gr, exons) + exonic_results <- sapply(exonic, function(i){ + w <- unique(to(within)[which(from(within) == i)]) + a <- unique(to(all)[which(from(all) == i)]) + if(length(a) == length(w)) return("exonicRegion") + return("annotatedSpliceSite") + }) + junctions_dt[thetas[exonic], potentialImpact := exonic_results] + + # check cases that don't overlap with an exon/intron + nones <- which(is.na(junctions_dt[thetas,]$potentialImpact)) + none_results <- sapply(nones, function(i){ + if(length(findOverlaps(junctions_gr[i], refseq.genes)) > 0) return(NA) + #return("intergenic") + if(start(refseq.genes[nearest(junctions_gr[i], + refseq.genes)]) > start(junctions_gr[i])){ + ifelse(strand(junctions_gr[i]) == "+", + return("upstreamOfNearestGene"), + return("downstreamOfNearestGene")) + }else{ + ifelse(strand(junctions_gr[i]) == "+", + return("downstreamOfNearestGene"), + return("upstreamOfNearestGene")) + } + }) + junctions_dt[thetas[nones], potentialImpact := none_results] + + # add distance to closest neighbour gene for intergenic results + # (both psi and theta) + message("adding distances to nearest gene ...") + up <- which(junctions_dt$potentialImpact == "upstreamOfNearestGene") + down <- which(junctions_dt$potentialImpact == "downstreamOfNearestGene") + + # create full grange object containing psi and theta + junctions_gr <- makeGRangesFromDataFrame(junctions_dt, + keep.extra.columns = TRUE) + + # Calculate distances + if(length(up) > 0){ + distanceNearestGene_up <- sapply(up, function(i){ + min(distance(junctions_gr[i], refseq.genes), na.rm = TRUE)}) + if(length(distanceNearestGene_up > 0)){ + junctions_dt[psi_positions[up], + distNearestGene := distanceNearestGene_up] + } else{ + junctions_dt[psi_positions[up], distNearestGene := NA] + message("No distances found for upstream") + } + }else{message("No upstream targets")} + + if(length(down) > 0){ + distanceNearestGene_down <- sapply(down, function(i){ + min(distance(junctions_gr[i], refseq.genes), na.rm = 
TRUE)}) + if(length(distanceNearestGene_down > 0)){ + junctions_dt[psi_positions[down], + distNearestGene := distanceNearestGene_down] + }else{ + junctions_dt[psi_positions[down], distNearestGene := NA] + message("No distances found for downstream") + } + }else{message("No downstream targets")} + + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + message("done calculating splice types") + + # Add the subtypes for exonSkipping and inconclusive + junctions_dt <- checkExonSkipping(junctions_dt, txdb) + junctions_dt <- checkInconclusive(junctions_dt, txdb) + + return(junctions_dt) +} + +#' +#' @noRd +compareStarts <- function(junctions_gr, i, max_lap, shift_needed, + intron_ranges, exons){ + intron_start = start(intron_ranges[max_lap]) + ss_start = start(junctions_gr[i]) + + # found the most freq intron with same end again + # check if intron starts before splice site -> exon elongation -> FRS + if(intron_start < ss_start){ + if(((ss_start - intron_start) %% 3) != 0){ + frs = "likely" + }else{ frs = "unlikely"} + + ifelse(shift_needed, + return(c("exonElongation", frs, + (ss_start - intron_start))), + return(c("exonElongation", frs))) + } + + # check if splice site ends in following exon -> exon truncation -> FRS + if(intron_start > ss_start){ + + # create dummy exon find all exons starting from that intron end + dummy_exon <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(intron_start-2, end = intron_start -1), + strand = toString(strand(intron_ranges[max_lap])) + ) + exonChoices <- to(findOverlaps(dummy_exon, exons, type = "end")) + for(j in exonChoices){ + exon_start = start(exons[j]) + if(exon_start < ss_start){ + if((end(exons[j]) - ss_start + 1)%%3 != 0){ + frs = "likely" + }else{frs = "unlikely"} + ifelse(shift_needed, + return(c("exonTruncation", frs, + (-1)*(end(exons[j]) - ss_start + 1))), + return(c("exonTruncation", frs))) + } + } + + # check for single exon skipping + if(length(exonChoices) == 1){ + + # check if there is no other exon within the first intron: + # splice site end until exon end + dummyFirstItr <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(end(exons[exonChoices[1]]) + 1, + end(junctions_gr[i])), + strand = toString(strand(intron_ranges[max_lap])) + ) + + if(length(findOverlaps(exons, dummyFirstItr, + type = "within")) > 0){ + # another exon is contained within the most freq used intron + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + + secondItr <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + strand = toString(intron_ranges[max_lap]@strand@values), + ranges = IRanges(ss_start, start(exons[exonChoices[1]]) - 1) + # end of exon + 1, end of aberrant junction + ) + secItrChoices <- to(findOverlaps(secondItr, intron_ranges, + type = "end")) + # only look at most used one + expre <- sapply(secItrChoices, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + if(length(secItrChoices) == 0){ + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + if(ss_start >= start(intron_ranges[secItrChoices[maxExpr]])){ + # check if there is no other exon in that range + if(length(findOverlaps(exons, + intron_ranges[secItrChoices[maxExpr]], + type = "within")) == 0){ + # clear exon skipping, only exon is skipped + # calculate frameshift, skipped exon plus possible 
exon + # elongation + + shift = (-1)*(end(exons[exonChoices[1]]) - + start(exons[exonChoices[1]]) + 1) + + ss_start - start(intron_ranges[secItrChoices[maxExpr]]) + + frs = ifelse(shift %% 3 == 0,"unlikely","likely") + ifelse(shift_needed, + return(c("singleExonSkipping", "inconclusive", + shift)), + return(c("singleExonSkipping", frs))) + } + } + } # single exon skipping end + + } + + # splice site longer than one intron + exon -> not defined for now + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) +} + +#' +#' @noRd +compareEnds <- function(junctions_gr, i, max_lap, shift_needed, + intron_ranges, exons){ + intron_end = end(intron_ranges[max_lap]) + ss_end = end(junctions_gr[i]) + + # found the most freq intron with same start again + # check if intron ends after splice site -> exon elongation -> FRS -> done + if(intron_end > ss_end){ + if(((intron_end - ss_end) %% 3) != 0){ + frs = "likely" + }else{ frs = "unlikely"} + + ifelse(shift_needed, + return(c("exonElongation", frs, (intron_end - ss_end))), + return(c("exonElongation", frs))) + } + + # check if splice site ends in following exon -> exon truncation -> FRS + if(intron_end < ss_end){ + + # create dummy exon find all exons starting from that intron end + dummy_exon <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + ranges = IRanges(intron_end + 1, end = intron_end + 2), + strand = toString(intron_ranges[max_lap]@strand@values) + ) + exonChoices <- to(findOverlaps(dummy_exon, exons, type = "start")) + for(j in exonChoices){ + exon_end = end(exons[j]) + if(exon_end > ss_end){ + if((ss_end - start(exons[j]) + 1)%%3 != 0){ + frs = "likely" + }else{frs = "unlikely"} + ifelse(shift_needed, + return(c("exonTruncation",frs, + (-1)*(ss_end - start(exons[j]) + 1))), + return(c("exonTruncation",frs))) + } + } + + # check for single exon skipping + if(length(exonChoices) == 1){ + + # check if there is no other exon within the first intron: + # splice site end until exon end + dummyFirstItr <- GRanges( + seqnames = toString(seqnames(intron_ranges[max_lap])), + ranges = IRanges(start(junctions_gr[i]), + start(exons[exonChoices[1]]) - 1), + strand = toString(strand(intron_ranges[max_lap])) + ) + + if(length(findOverlaps(exons, dummyFirstItr, + type = "within")) > 0){ + # another exon is contained within the most freq used intron + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + + secondItr <- GRanges( + seqnames = toString(intron_ranges[max_lap]@seqnames@values), + strand = toString(intron_ranges[max_lap]@strand@values), + ranges = IRanges(end(exons[exonChoices[1]]) + 1, ss_end) + # end of exon + 1, end of aberrant junction + ) + secItrChoices <- to(findOverlaps(secondItr, intron_ranges, + type = "start")) + # only look at most used one + expre <- sapply(secItrChoices, function(j){ + elementMetadata(intron_ranges[j])$medianCount + }) + maxExpr <- which.max(expre) + + if(length(secItrChoices) == 0){ + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) + } + + if(ss_end <= end(intron_ranges[secItrChoices[maxExpr]])){ + # check if there is no other exon in that range + if(length(findOverlaps(exons, + intron_ranges[secItrChoices[maxExpr]], + type = "within")) == 0){ + # clear exon skipping, only exon is skipped + # calculate frameshift, skipped exon plus possible exon + # elongation at end + shift = 
(-1)*(end(exons[exonChoices[1]]) - + start(exons[exonChoices[1]]) + 1) + + end(intron_ranges[secItrChoices[maxExpr]]) - ss_end + frs = ifelse(shift%%3 == 0,"unlikely","likely") + ifelse(shift_needed, + return(c("singleExonSkipping", "inconclusive", + shift)), + return(c("singleExonSkipping", frs))) + } + } + } # single exon skipping end + + + } + + # splice site longer than one intron + exon -> not defined for now + ifelse(shift_needed, + return(c("exonSkipping", "inconclusive", 0)), + return(c("exonSkipping", "inconclusive"))) +} + +#' +#' @noRd +checkIntergenic <- function(junctions_gr, i, refseq.genes){ + # check if start > 1000 + # start - 1000, end + 1000 + start = start(junctions_gr[i]) + # ifelse(start > 1000, start = start - 1000, start = 1) + # if(start > 1000){ + # start = start - 1000 + # }else{start = 1} + + end = end(junctions_gr[i]) #+ 1000 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_junction <- GRanges( + seqnames = seqnames(junctions_gr[i]), + ranges = IRanges(start, end), + strand = strand(junctions_gr[i]) + ) + + # overlap with introns and exon + # IGNORE STRANDS? -> decided its not necessary + + # check if distance to nearest is > 1000 -> intergenic + # otherwise up/downstream + dist = min(distance(test_junction, refseq.genes), na.rm = TRUE) + if(dist > 0){ + # find nearest and compare starts + if(start(refseq.genes[nearest(junctions_gr[i], + refseq.genes)]) > start){ + ifelse(strand(junctions_gr[i]) == "+", + return(c("upstreamOfNearestGene", "unlikely")), + return(c("downstreamOfNearestGene", "unlikely"))) + }else{ + ifelse(strand(junctions_gr[i]) == "+", + return(c("downstreamOfNearestGene", "unlikely")), + return(c("upstreamOfNearestGene", "unlikely"))) + } + } + return(c("complex", "inconclusive")) +} + +#' +#' @noRd +checkExonSkipping <- function(junctions_dt, txdb){ + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = TRUE) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + refseq.genes<- genes(txdb) + + exonSkip <- which(junctions_dt[psi_positions]$potentialImpact %in% + c("exonSkipping", "singleExonSkipping")) + + message("start checking exonSkipping") + newSkip_results <- sapply(exonSkip, function(i){ + start = start(junctions_gr[i]) + end = end(junctions_gr[i]) + + # reduce the junction width so adjacent genes have a distance of 1 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_start <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(start, start + 1) + ) + + test_end <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(end - 1, end) + ) + + # check for which genes distance to start is 0 + start_genes <- which(distance(test_start, refseq.genes) == 0) + # start is not in a gene + if(length(start_genes) == 0) return("splicingBeyondGene") + + # start is in a gene -> is end in same gene + for(to in start_genes){ + # end is in same gene + if(distance(test_end, refseq.genes[to]) == 0){ + return("exonSkipping") + } + } + + end_genes <- which(distance(test_end, refseq.genes) == 0) + # end is not in a gene + if(length(end_genes) == 0) return("splicingBeyondGene") + # end is in a different gene + return("multigenicSplicing") + }) + + # checking exonSkipping done + if(length(exonSkip) > 0){ + junctions_dt[psi_positions[exonSkip], + 
potentialImpact2 := newSkip_results] + junctions_dt[potentialImpact2 == "splicingBeyondGene", + potentialImpact := "splicingBeyondGene"] + junctions_dt[potentialImpact2 == "splicingBeyondGene", + causesFrameshift := "inconclusive"] + junctions_dt[potentialImpact2 == "multigenicSplicing", + potentialImpact := "multigenicSplicing"] + junctions_dt[potentialImpact2 == "multigenicSplicing", + causesFrameshift := "inconclusive"] + junctions_dt[, potentialImpact2 := NULL] + } + + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + return(junctions_dt) +} + +#' +#' @noRd +checkInconclusive <- function(junctions_dt, txdb){ + psi_positions <- which(junctions_dt$type != "theta") + colnames(junctions_dt)[which(names(junctions_dt) == "STRAND")] <- "strand2" + junctions_gr <- makeGRangesFromDataFrame(junctions_dt[psi_positions], + keep.extra.columns = TRUE) + seqlevelsStyle(txdb) <- seqlevelsStyle(junctions_gr) + + refseq.genes<- genes(txdb) + + inconclusive <- which(junctions_dt[psi_positions + ]$potentialImpact == "complex") + + inconclusive_results <- sapply(inconclusive, function(i){ + start = start(junctions_gr[i]) + end = end(junctions_gr[i]) + + # reduce the junction width so adjacent genes have a distance of 1 + if(start + 2 < end){ + start = start + 1 + end = end - 1 + } + + test_start <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(start, start + 1) + ) + + test_end <- GRanges( + seqnames = seqnames(junctions_gr[i]), + strand = strand(junctions_gr[i]), + ranges = IRanges(end - 1, end) + ) + + # check for which genes distance to start is 0 + start_genes <- which(distance(test_start, refseq.genes) == 0) + # start is not in a gene + if(length(start_genes) == 0) return("splicingBeyondGene") + + # start is in a gene -> is end in same gene + for(to in start_genes){ + # end is in same gene + if(distance(test_end, refseq.genes[to]) == 0){ + return("complex") + } + } + + end_genes <- which(distance(test_end, refseq.genes) == 0) + # end is not in a gene + if(length(end_genes) == 0) return("splicingBeyondGene") + # end is in a different gene + return("multigenicSplicing") + }) + + colnames(junctions_dt)[which(names(junctions_dt) == "strand2")] <- "STRAND" + + if(length(inconclusive) > 0){ + junctions_dt[psi_positions[inconclusive], + potentialImpact := inconclusive_results] + } + + return(junctions_dt) +} diff --git a/R/updateRho.R b/R/updateRho.R index f5a0d429..628b21fd 100644 --- a/R/updateRho.R +++ b/R/updateRho.R @@ -8,11 +8,15 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ n <- N(fds) y <- predictY(fds, noiseAlpha=currentNoiseAlpha(fds)) - fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho, - k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM) + # fitparameters <- bplapply(seq_len(nrow(k)), estRho, nll=truncNLL_rho, + # k=k, n=n, y=y, rhoRange=rhoRange, BPPARAM=BPPARAM) + fitparameters <- bplapply(seq_len(nrow(k)), estRho, + nll=fullNLLRho_penalized, + k=k, n=n, y=y, rhoRange=rhoRange, lambda=1e-4, + BPPARAM=BPPARAM) - rho(fds) <- vapply(fitparameters, "[[", - double(1), "minimum") + rho(fds) <- plogis(vapply(fitparameters, "[[", + double(1), "minimum")) if(isTRUE(verbose)){ stxt <- capture.output(summary(rho(fds))) @@ -23,16 +27,27 @@ updateRho <- function(fds, type, rhoRange, BPPARAM, verbose){ return(fds) } -estRho <- function(idx, k, n, y, rhoRange, nll, control=list()){ +estRho <- function(idx, k, n, y, rhoRange, nll, control=list(), lambda=1e-4){ ki <- k[idx,] ni <- n[idx,] 
yi <- y[idx,] - est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, + # est <- optimize(f=nll, interval=rhoRange, yi=yi, ki=ki, ni=ni, + # maximum=FALSE, tol=0.0000001) + # est + est <- optimize(f=nll, interval=rhoRange, + mui=plogis(yi), ki=ki, ni=ni, lambda=lambda, maximum=FALSE, tol=0.0000001) est } +fullNLLRho_penalized <- function(logit_rho, ki, ni, mui, lambda=1e-4){ + rho <- plogis(logit_rho) + nll <- -mean(dbetabinom(ki, ni, mui, rho, log=TRUE)) + nll <- nll + lambda * (logit_rho^2) + return(nll) +} + negLogLikelihoodRho <- function(rho, ki, ni, mui){ #-mean(dbetabinom(ki + 0.5, ni + 1, mu, rho, log=TRUE)) @@ -63,6 +78,20 @@ trunc_negLogLikelihoodRho <- function(rho, ki, ni, mui){ mean(alpha + beta - alphaK - betaNK ) } +trunc_negLogLikelihoodRho_penalized <- function(logit_rho, ki, ni, mui, lambda){ + #-mean(dbetabinom(ki, ni, mui, rho, log=TRUE)) + + rho <- plogis(logit_rho) + r <- (1-rho)/rho + alpha <- lgamma(mui*r) + alphaK <- lgamma(mui*r + ki) + beta <- lgamma((mui-1)*(-r)) + betaNK <- lgamma((mui-1)*(-r) + (ni - ki)) + + #mean negative log likelihood with pseudocounts + mean(alpha + beta - alphaK - betaNK ) + lambda * (logit_rho*logit_rho) +} + methodOfMomentsRho <- function(k, n, rhoRange=c(1e-5, 1 - 1e-5)){ # taken from wiki: diff --git a/R/variables.R b/R/variables.R index ccba65f5..d097201c 100644 --- a/R/variables.R +++ b/R/variables.R @@ -1,11 +1,11 @@ #' -#' Available psi types +#' Available splice metrics #' #' @examples -#' # to show available psi types: +#' # to show all available splice metrics: #' psiTypes #' +#' @rdname psiTypes #' @export -psiTypes <- c("psi5", "psi3", "theta") -names(psiTypes) <- psiTypes - +psiTypes <- c("jaccard", "psi5", "psi3", "theta") +names(psiTypes) <- c("Intron Jaccard Index", "psi5", "psi3", "theta") diff --git a/R/zzz.R b/R/zzz.R index 8f087a08..7f0b67d5 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -8,7 +8,7 @@ op.fraser <- list( `FRASER-hdf5-chunk-nrow` = 30000, `FRASER-hdf5-chunk-ncol` = 20, - `FRASER.pseudoCount` = 1, + `FRASER.pseudoCount` = 0.1, `FRASER.minSamplesForDelayed` = 1000, `FRASER.maxSamplesNoHDF5` = 20, `FRASER.maxJunctionsNoHDF5` = 1000) diff --git a/README.md b/README.md index 8e9443aa..5b65533f 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Coverage status](https://codecov.io/gh/c-mertes/FRASER/branch/master/graph/badge.svg)](https://codecov.io/github/c-mertes/FRASER/branch/master) [![License](https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000)](https://github.com/c-mertes/FRASER/blob/master/LICENSE) -FRASER is a tool to detect aberrant splicing events in RNA-seq data. The method is described and published in [Nature Communications](https://doi.org/doi:10.1038/s41467-020-20573-7) and available through [Bioconductor](https://doi.org/doi:10.18129/B9.bioc.FRASER). It is also part of the [Detection of RNA Outlier Pipeline (DROP)](https://github.com/gagneurlab/drop). The DROP pipeline is described and published in [Nature Protocols](https://doi.org/doi:10.1038/s41596-020-00462-5). +FRASER is a tool to detect aberrant splicing events in RNA-seq data. The method is described and published in [Nature Communications](https://doi.org/doi:10.1038/s41467-020-20573-7) and available through [Bioconductor](https://doi.org/doi:10.18129/B9.bioc.FRASER). It is also part of the [Detection of RNA Outlier Pipeline (DROP)](https://github.com/gagneurlab/drop). DROP is described and published in [Nature Protocols](https://doi.org/doi:10.1038/s41596-020-00462-5). 
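A minimal quick-start sketch using the small test dataset that ships with the package (for real data, build a `FraserDataSet` from your own sample annotation and counts; the filter settings and `q` below are illustrative only):

```r
library(FRASER)

fds <- createTestFraserDataSet()                          # packaged toy dataset
fds <- calculatePSIValues(fds)                            # compute splice metrics
fds <- filterExpressionAndVariability(fds, filter=FALSE)  # keep all introns on toy data
fds <- FRASER(fds, q=2, implementation="PCA")             # fit and compute p values
res <- results(fds)                                       # extract splicing outliers
res
```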
The FRASER framework and workflow aims to assist the diagnostics in the field of rare diseases where RNA-seq is performed to identify aberrant splicing defects. For a short tutorial on how to use FRASER on a dataset please use the [vignette](http://bioconductor.org/packages/release/bioc/vignettes/FRASER/inst/doc/FRASER.pdf) or our Colab tutorial at: [http://tinyurl.com/RNA-ASHG-colab](http://tinyurl.com/RNA-ASHG-colab). The Colab is based on a workshop that we presented at ASHG 2019/2020. @@ -13,6 +13,21 @@ Please cite our method paper if you use it in a publication: > Mertes, C., Scheller, I.F., Yépez, V.A. *et al.* Detection of aberrant splicing events in RNA-seq data using FRASER. *Nat Commun* **12**, 529 (2021). https://doi.org/10.1038/s41467-020-20573-7 +## What's new + +FRASER 2.0, an improved version of FRASER, is now available and used by default (version 1.99.0 and above). +FRASER 2.0 uses the Intron Jaccard Index as its splice metric instead of FRASER's +previous three metrics along with some other parameter optimizations of pseudocount, +filtering settings and default delta cutoff. + +To change the splice metric, set `fitMetrics(fds)` to one or more of the metrics +specified in `FRASER::psiTypes`. For FRASER 2.0 and the Intron Jaccard Index, the +new default delta cutoff is 0.1 instead of the previous value of 0.3. When using +the 3 previous metrics, the delta cutoff should be set manually to 0.3 +during results extraction, e.g. `results(fds, deltaPsiCutoff=0.3,...)`. + +The preprint describing these changes in more detail is available in [medRxiv](https://www.medrxiv.org/content/10.1101/2023.03.31.23287997v1). + ## Installation `FRASER` is an R/Bioconductor software package requiring a running diff --git a/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz new file mode 100644 index 00000000..5d87eb13 Binary files /dev/null and b/inst/extdata/blacklist_regions/hg19-blacklist.v2.bed.gz differ diff --git a/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz b/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz new file mode 100644 index 00000000..a4ec8581 Binary files /dev/null and b/inst/extdata/blacklist_regions/hg38-blacklist.v2.bed.gz differ diff --git a/man/FRASER.Rd b/man/FRASER.Rd index 8ae94611..0eeb4ed6 100644 --- a/man/FRASER.Rd +++ b/man/FRASER.Rd @@ -5,15 +5,18 @@ \alias{calculateZscore} \alias{calculatePvalues} \alias{calculatePadjValues} +\alias{calculatePadjValuesOnSubset} \title{FRASER: Find RAre Splicing Events in RNA-seq data} \usage{ FRASER( fds, q, + type = fitMetrics(fds), implementation = c("PCA", "PCA-BB-Decoder", "AE-weighted", "AE", "BB"), iterations = 15, BPPARAM = bpparam(), correction, + subsets = NULL, ... ) @@ -28,7 +31,26 @@ calculatePvalues( capN = 5 * 1e+05 ) -calculatePadjValues(fds, type = currentType(fds), method = "BY") +calculatePadjValues( + fds, + type = currentType(fds), + method = "BY", + rhoCutoff = NA, + geneLevel = TRUE, + geneColumn = "hgnc_symbol", + subsets = NULL, + BPPARAM = bpparam() +) + +calculatePadjValuesOnSubset( + fds, + genesToTest, + subsetName, + type = currentType(fds), + method = "BY", + geneColumn = "hgnc_symbol", + BPPARAM = bpparam() +) } \arguments{ \item{fds}{A \code{\link{FraserDataSet}} object} @@ -38,6 +60,9 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown. 
If a named vector is provided it is used for the different splicing types.} +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing +efficiency)} + \item{implementation}{The method that should be used to correct for confounders.} @@ -48,10 +73,14 @@ not yet converged after these number of iterations, the fit stops anyway.} \item{correction}{Deprecated. The name changed to implementation.} -\item{...}{Additional parameters passed on to the internal fit function} +\item{subsets}{A named list of named lists specifying any number of gene +subsets (can differ per sample). For each subset, FDR correction +will be limited to genes in the subset, and the FDR corrected +pvalues stored as an assay in the fds object in addition to the +transcriptome-wide FDR corrected pvalues. See the examples for +how to use this argument.} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing -efficiency)} +\item{...}{Additional parameters passed on to the internal fit function} \item{logit}{Indicates if z scores are computed on the logit scale (default) or in the natural (psi) scale.} @@ -62,7 +91,24 @@ calculated. Possible are beta-binomial, binomial and normal.} \item{capN}{Counts are capped at this value to speed up the p-value calculation} -\item{method}{The p.adjust method that should be used.} +\item{method}{The p.adjust method that should be used for genome-wide +multiple testing correction.} + +\item{rhoCutoff}{The cutoff value on the fitted rho value +(overdispersion parameter of the betabinomial) above which junctions are +masked with NA during p value adjustment (default: NA, no masking).} + +\item{geneLevel}{Logical value indiciating whether gene-level p values +should be calculated. Defaults to TRUE.} + +\item{geneColumn}{The column name of the column that has the gene annotation +that will be used for gene-level pvalue computation.} + +\item{genesToTest}{A named list with the subset of genes to test per sample. +The names must correspond to the sampleIDs in the given fds object.} + +\item{subsetName}{The name under which the resulting FDR corrected pvalues +will be stored in metadata(fds).} } \value{ FraserDataSet @@ -104,11 +150,21 @@ psi. \item \code{calculatePvalues()}: This function calculates two-sided p-values based on the beta-binomial distribution (or binomial or normal if desired). The -returned p values are already adjusted with Holm's method per donor or +returned p values are not yet adjusted with Holm's method per donor or acceptor site, respectively. \item \code{calculatePadjValues()}: This function adjusts the previously calculated -p-values per sample for multiple testing. +p-values per sample for multiple testing. First, the previoulsy calculated +junction-level p values are adjusted with Holm's method per donor or +acceptor site, respectively. Then, if gene symbols have been annotated to +junctions (and not otherwise requested), gene-level p values are computed. + +\item \code{calculatePadjValuesOnSubset()}: This function does FDR correction only for all junctions +in a certain subset of genes which can differ per sample. Requires gene +symbols to have been annotated to junctions. As with the full FDR +correction across all junctions, first the previously calculated +junction-level p values are adjusted with Holm's method per donor or +acceptor site, respectively. Then, gene-level p values are computed. 
}} \examples{ @@ -132,16 +188,32 @@ fds <- FRASER(fds, q=2, implementation="PCA") # The functions run inside the FRASER function can also be directly # run themselves. # To directly run the fit function: -fds <- fit(fds, implementation="PCA", q=2, type="psi5") +fds <- fit(fds, implementation="PCA", q=2, type="jaccard") # To directly run the nomial and adjusted p value and z score # calculation, the following functions can be used: -fds <- calculatePvalues(fds, type="psi5") -head(pVals(fds, type="psi5")) -fds <- calculatePadjValues(fds, type="psi5", method="BY") -head(padjVals(fds, type="psi5")) -fds <- calculateZscore(fds, type="psi5") -head(zScores(fds, type="psi5")) +fds <- calculatePvalues(fds, type="jaccard") +head(pVals(fds, type="jaccard")) +fds <- calculatePadjValues(fds, type="jaccard", method="BY") +head(padjVals(fds, type="jaccard")) +fds <- calculateZscore(fds, type="jaccard") +head(zScores(fds, type="jaccard")) + +# example of restricting FDR correction to subsets of genes of interest +genesOfInterest <- list("sample1"=c("TIMMDC1"), "sample2"=c("MCOLN1")) +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("exampleSubset"=genesOfInterest)) +padjVals(fds, type="jaccard", subsetName="exampleSubset") +padjVals(fds, type="jaccard", level="gene", subsetName="exampleSubset") +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("anotherExampleSubset"=c("TIMMDC1"))) +padjVals(fds, type="jaccard", subsetName="anotherExampleSubset") + +# only adding FDR corrected pvalues on a subset without calculating +# transcriptome-wide FDR again: +fds <- calculatePadjValuesOnSubset(fds, genesToTest=genesOfInterest, + subsetName="setOfInterest", type="jaccard") +padjVals(fds, type="jaccard", subsetName="setOfInterest") } \seealso{ diff --git a/man/annotateRanges.Rd b/man/annotateRanges.Rd index c82a3211..e74728b2 100644 --- a/man/annotateRanges.Rd +++ b/man/annotateRanges.Rd @@ -20,7 +20,8 @@ annotateRangesWithTxDb( featureName = "hgnc_symbol", keytype = "ENTREZID", txdb = NULL, - orgDb = NULL + orgDb = NULL, + filter = list() ) } \arguments{ @@ -52,6 +53,11 @@ one is used, currently this is \item{orgDb}{An \code{orgDb} object or a data table to map the feature names. If this is NULL, then \code{org.Hs.eg.db} is used as the default.} + +\item{filter}{A named list specifying the filters which should be applied to +subset to e.g. only protein-coding genes for annotation. 
+\code{names(filter)} needs to be column names in the given +orgDb object (default: no filtering).} } \value{ FraserDataSet @@ -67,13 +73,13 @@ fds <- createTestFraserDataSet() # either using biomart with GRCh38 try({ fds <- annotateRanges(fds, GRCh=38) - rowRanges(fds, type="psi5")[,c("hgnc_symbol")] + rowRanges(fds, type="j")[,c("hgnc_symbol")] }) # either using biomart with GRCh37 try({ fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37) - rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")] + rowRanges(fds, type="j")[,c("hgnc_symbol_37")] }) # or with a provided TxDb object @@ -82,6 +88,6 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene require(org.Hs.eg.db) orgDb <- org.Hs.eg.db fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) -rowRanges(fds, type="psi5")[,"hgnc_symbol"] +rowRanges(fds, type="j")[,"hgnc_symbol"] } diff --git a/man/calculatePSIValues.Rd b/man/calculatePSIValues.Rd index 0128221d..9e0c14fa 100644 --- a/man/calculatePSIValues.Rd +++ b/man/calculatePSIValues.Rd @@ -15,7 +15,7 @@ calculatePSIValues( \item{fds}{A \code{\link{FraserDataSet}} object} \item{types}{A vector with the psi types which should be calculated. Default -is all of psi5, psi3 and theta.} +is all of jaccard, psi5, psi3 and theta.} \item{overwriteCts}{FALSE or TRUE (the default) the total counts (aka N) will be recalculated based on the existing junction counts (aka K)} @@ -31,7 +31,7 @@ based on the FraserDataSet object } \examples{ fds <- createTestFraserDataSet() - fds <- calculatePSIValues(fds, types="psi5") + fds <- calculatePSIValues(fds, types="jaccard") ### usually one would run this function for all psi types by using: # fds <- calculatePSIValues(fds) diff --git a/man/counts.Rd b/man/counts.Rd index 1ddac4d5..79beb47f 100644 --- a/man/counts.Rd +++ b/man/counts.Rd @@ -11,9 +11,14 @@ K(fds, type = currentType(fds)) N(fds, type = currentType(fds)) -\S4method{counts}{FraserDataSet}(object, type = NULL, side = c("ofInterest", "otherSide")) - -\S4method{counts}{FraserDataSet,ANY}(object, type = NULL, side = c("ofInterest", "otherSide"), ...) <- value +\S4method{counts}{FraserDataSet}(object, type = currentType(object), side = c("ofInterest", "otherSide")) + +\S4method{counts}{FraserDataSet,ANY}( + object, + type = currentType(object), + side = c("ofInterest", "otherSide"), + ... +) <- value } \arguments{ \item{fds, object}{FraserDataSet} @@ -39,8 +44,10 @@ setter for count data \examples{ fds <- createTestFraserDataSet() - counts(fds, type="psi5", side="ofInterest") - counts(fds, type="psi5", side="other") + counts(fds, side="ofInterest") + counts(fds, type="jaccard", side="other") + head(K(fds)) + head(K(fds, type="psi5")) head(K(fds, type="psi3")) head(N(fds, type="theta")) diff --git a/man/fds-methods.Rd b/man/fds-methods.Rd index 32ebf323..d6caf3fd 100644 --- a/man/fds-methods.Rd +++ b/man/fds-methods.Rd @@ -141,8 +141,8 @@ passed to GenomeInfoDb::mapSeqlevels().} Getter method return the respective current value. } \description{ -The following methods are getter and setter methods to extract or set -certain values of a FraserDataSet object. +The following methods are getter and setter methods to extract +or set certain values of a FraserDataSet object. 
\code{samples} sets or gets the sample IDs; \code{condition} ; \code{} diff --git a/man/filtering.Rd b/man/filtering.Rd index 361dde7e..226df199 100644 --- a/man/filtering.Rd +++ b/man/filtering.Rd @@ -1,44 +1,52 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filterExpression.R -\name{filtering} +% Please edit documentation in R/AllGenerics-definitions.R, R/filterExpression.R +\name{filterVariability} +\alias{filterVariability} \alias{filtering} \alias{filterExpressionAndVariability} \alias{filterExpression,FraserDataSet-method} -\alias{filterVariability} +\alias{filterVariability,FraserDataSet-method} \title{Filtering FraserDataSets} \usage{ +filterVariability(object, ...) + filterExpressionAndVariability( object, minExpressionInOneSample = 20, - quantile = 0.95, + quantile = 0.75, quantileMinExpression = 10, - minDeltaPsi = 0.05, + minDeltaPsi = 0, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) \S4method{filterExpression}{FraserDataSet}( object, minExpressionInOneSample = 20, - quantile = 0.95, + quantile = 0.75, quantileMinExpression = 10, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) -filterVariability( +\S4method{filterVariability}{FraserDataSet}( object, - minDeltaPsi = 0.05, + minDeltaPsi = 0, filter = TRUE, delayed = ifelse(ncol(object) <= 300, FALSE, TRUE), + filterOnJaccard = TRUE, BPPARAM = bpparam() ) } \arguments{ \item{object}{A \code{\link{FraserDataSet}} object} +\item{...}{Further parameters passed on to Rsubread::featureCounts.} + \item{minExpressionInOneSample}{The minimal read count in at least one sample that is required for an intron to pass the filter.} @@ -60,6 +68,10 @@ mcols.} otherwise the function works on the delayedMatrix representations. The default value depends on the number of samples in the fds-object.} +\item{filterOnJaccard}{If TRUE, the Intron Jaccard Metric is used to define +express introns during fitlering. Otherwise, the psi5, psi3 and theta +metrics are used (default: TRUE).} + \item{BPPARAM}{the BiocParallel parameters for the parallelization} } \value{ @@ -78,15 +90,15 @@ read support and introns that are not variable across samples. \item \code{filterExpression(FraserDataSet)}: This function filters out introns and corresponding splice sites that have low read support in all samples. -\item \code{filterVariability()}: This function filters out introns and corresponding -splice sites which do not show variablity across samples. +\item \code{filterVariability(FraserDataSet)}: This function filters out introns and corresponding +splice sites that have low read support in all samples. 
}} \examples{ fds <- createTestFraserDataSet() fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE) -mcols(fds, type="psi5")[, c( - "maxCount", "passedExpression", "maxDPsi3", "passedVariability")] +mcols(fds, type="jaccard")[, c( + "maxCount", "passedExpression", "maxDJaccard", "passedVariability")] plotFilterExpression(fds) plotFilterVariability(fds) diff --git a/man/fit.Rd b/man/fit.Rd index eb68296c..dcbee2dc 100644 --- a/man/fit.Rd +++ b/man/fit.Rd @@ -10,8 +10,8 @@ implementation = c("PCA", "PCA-BB-Decoder", "AE", "AE-weighted", "PCA-BB-full", "fullAE", "PCA-regression", "PCA-reg-full", "PCA-BB-Decoder-no-weights", "BB"), q, - type = "psi3", - rhoRange = c(1e-08, 1 - 1e-08), + type = psiTypes, + rhoRange = c(-30, 30), weighted = FALSE, noiseAlpha = 1, convergence = 1e-05, @@ -35,7 +35,7 @@ Should be fitted using \code{\link{optimHyperParams}} if unknown. If a named vector is provided it is used for the different splicing types.} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing efficiency)} \item{rhoRange}{Defines the range of values that rho parameter from the diff --git a/man/getter_setter_functions.Rd b/man/getter_setter_functions.Rd index 4dad7b0d..48c729f2 100644 --- a/man/getter_setter_functions.Rd +++ b/man/getter_setter_functions.Rd @@ -15,10 +15,13 @@ \alias{zScores} \alias{pVals} \alias{padjVals} +\alias{availableFDRsubsets} \alias{predictedMeans} \alias{deltaPsiValue} \alias{currentType} \alias{currentType<-} +\alias{fitMetrics} +\alias{fitMetrics<-} \alias{pseudocount} \alias{hyperParams} \alias{dontWriteHDF5} @@ -35,9 +38,26 @@ rho(fds, type = currentType(fds)) zScores(fds, type = currentType(fds), byGroup = FALSE, ...) -pVals(fds, type = currentType(fds), level = "site", dist = "BetaBinomial", ...) - -padjVals(fds, type = currentType(fds), dist = c("BetaBinomial"), ...) +pVals( + fds, + type = currentType(fds), + level = "site", + filters = list(), + dist = "BetaBinomial", + ... +) + +padjVals( + fds, + type = currentType(fds), + dist = c("BetaBinomial"), + level = "site", + subsetName = NULL, + filters = list(), + ... +) + +availableFDRsubsets(fds) predictedMeans(fds, type = currentType(fds)) @@ -47,6 +67,10 @@ currentType(fds) currentType(fds) <- value +fitMetrics(fds) + +fitMetrics(fds) <- value + pseudocount(value = NULL) hyperParams(fds, type = currentType(fds), all = FALSE) @@ -70,12 +94,16 @@ verbose(fds) <- value \item{byGroup}{If TRUE, aggregation by donor/acceptor site will be done.} -\item{...}{Internally used parameteres.} +\item{...}{Internally used parameters.} \item{level}{Indicates if the retrieved p values should be adjusted on the donor/acceptor site-level (default) or if unadjusted junction-level p values should be returned.} +\item{filters}{A named list giving the filters that were applied for masking +during p value correction. Used for storing and retrieving the +correct set of requested p values.} + \item{dist}{Distribution for which the p-values should be extracted.} \item{all}{Logical value indicating whether \code{hyperParams(fds)} should @@ -110,6 +138,9 @@ beta-binomial distribution \item \code{padjVals()}: This returns the adjusted p-values. +\item \code{availableFDRsubsets()}: This returns the names of FDR subsets +for which adjusted p values have been calculated. + \item \code{predictedMeans()}: This returns the fitted mu (i.e. psi) values. @@ -117,11 +148,18 @@ values. observed and the fitted psi values. 
\item \code{currentType()}: Returns the psi type that is used -within several methods in the FRASER package. +within several methods in the FRASER package (defaults to jaccard). \item \code{currentType(fds) <- value}: Sets the psi type that is to be used within several methods in the FRASER package. +\item \code{fitMetrics()}: Returns the splice metrics that will be +fitted (defaults to jaccard, used within several methods in the +FRASER package). + +\item \code{fitMetrics(fds) <- value}: Sets the splice metrics that will be +fitted (used within several methods in the FRASER package). + \item \code{pseudocount()}: Sets and returns the pseudo count used within the FRASER fitting procedure. @@ -140,7 +178,7 @@ assays should be stored as hdf5 files. \item \code{dontWriteHDF5(fds) <- value}: Sets whether the assays should be stored as hdf5 files. -\item \code{verbose()}: Dependend on the level of verbosity +\item \code{verbose()}: Dependent on the level of verbosity the algorithm reports more or less to the user. 0 means being quiet and 10 means everything. @@ -156,7 +194,7 @@ dontWriteHDF5(fds) dontWriteHDF5 <- TRUE # get/set the splice metric for which results should be retrieved -currentType(fds) <- "psi5" +currentType(fds) <- "jaccard" currentType(fds) # get fitted parameters @@ -167,6 +205,9 @@ rho(fds) # get statistics pVals(fds) padjVals(fds) + +# zscore not calculated by default +fds <- calculateZscore(fds, type="jaccard") zScores(fds) # set and get pseudocount @@ -174,9 +215,9 @@ pseudocount(4L) pseudocount() # retrieve or set a mask to exclude certain junctions in the fitting step -featureExclusionMask(fds, type="theta") <- sample( - c(FALSE, TRUE), nrow(mcols(fds, type="theta")), replace=TRUE) -featureExclusionMask(fds, type="theta") +featureExclusionMask(fds, type="jaccard") <- sample( + c(FALSE, TRUE), nrow(mcols(fds, type="jaccard")), replace=TRUE) +featureExclusionMask(fds, type="jaccard") # controlling the verbosity level of the output of some algorithms verbose(fds) <- 2 diff --git a/man/injectOutliers.Rd b/man/injectOutliers.Rd index 1f85338e..5132c6e4 100644 --- a/man/injectOutliers.Rd +++ b/man/injectOutliers.Rd @@ -6,7 +6,7 @@ \usage{ injectOutliers( fds, - type = c("psi5", "psi3", "theta"), + type = psiTypes, freq = 0.001, minDpsi = 0.2, minCoverage = 2, @@ -50,5 +50,6 @@ Inject artificial outliers in an existing fds \examples{ # A generic dataset fds <- makeSimulatedFraserDataSet() +fds <- calculatePSIValues(fds) fds <- injectOutliers(fds, minDpsi=0.2, freq=1E-3) } diff --git a/man/optimHyperParams.Rd b/man/optimHyperParams.Rd index f3d10622..e1333b4a 100644 --- a/man/optimHyperParams.Rd +++ b/man/optimHyperParams.Rd @@ -6,9 +6,9 @@ \usage{ optimHyperParams( fds, - type, + type = psiTypes, implementation = "PCA", - q_param = seq(2, min(40, ncol(fds)), by = 3), + q_param = getEncDimRange(fds), noise_param = 0, minDeltaPsi = 0.1, iterations = 5, @@ -24,7 +24,7 @@ optimHyperParams( \arguments{ \item{fds}{A \code{\link{FraserDataSet}} object} -\item{type}{The type of PSI (psi5, psi3 or theta for theta/splicing +\item{type}{The type of PSI (jaccard, psi5, psi3 or theta for theta/splicing efficiency)} \item{implementation}{The method that should be used to correct for @@ -71,13 +71,14 @@ ratios while maximizing the precision-recall curve. 
\examples{ # generate data fds <- makeSimulatedFraserDataSet(m=15, j=20) + fds <- calculatePSIValues(fds) # run hyperparameter optimization - fds <- optimHyperParams(fds, type="psi5", q_param=c(2, 5)) + fds <- optimHyperParams(fds, type="jaccard", q_param=c(2, 5)) # get estimated optimal dimension of the latent space - bestQ(fds, type="psi5") - hyperParams(fds, type="psi5") + bestQ(fds, type="jaccard") + hyperParams(fds, type="jaccard") } \seealso{ diff --git a/man/plotFunctions.Rd b/man/plotFunctions.Rd index cbeddee6..b27bf750 100644 --- a/man/plotFunctions.Rd +++ b/man/plotFunctions.Rd @@ -1,6 +1,7 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plotMethods.R -\name{plotFunctions} +% Please edit documentation in R/AllGenerics-definitions.R, R/plotMethods.R +\name{plotManhattan} +\alias{plotManhattan} \alias{plotFunctions} \alias{plotAberrantPerSample} \alias{plotVolcano} @@ -10,60 +11,82 @@ \alias{plotFilterExpression} \alias{plotExpectedVsObservedPsi} \alias{plotEncDimSearch} +\alias{plotBamCoverage} +\alias{plotBamCoverageFromResultTable} \alias{plotVolcano,FraserDataSet-method} \alias{plotAberrantPerSample,FraserDataSet-method} +\alias{plotSpliceMetricRank} \alias{plotQQ,FraserDataSet-method} \alias{plotEncDimSearch,FraserDataSet-method} \alias{plotFilterVariability} \alias{plotCountCorHeatmap,FraserDataSet-method} +\alias{plotManhattan,FraserDataSet-method} \title{Visualization functions for FRASER} \usage{ +plotManhattan(object, ...) + \S4method{plotVolcano}{FraserDataSet}( object, sampleID, - type = c("psi3", "psi5", "theta"), + type = fitMetrics(object), basePlot = TRUE, aggregate = FALSE, main = NULL, label = NULL, - deltaPsiCutoff = 0.3, + deltaPsiCutoff = 0.1, padjCutoff = 0.1, + subsetName = NULL, ... ) \S4method{plotAberrantPerSample}{FraserDataSet}( object, main, - type = c("psi3", "psi5", "theta"), + type = fitMetrics(object), padjCutoff = 0.1, - zScoreCutoff = NA, - deltaPsiCutoff = 0.3, + deltaPsiCutoff = 0.1, aggregate = TRUE, + subsetName = NULL, BPPARAM = bpparam(), ... ) plotExpression( fds, - type = c("psi5", "psi3", "theta"), - site = NULL, + type = fitMetrics(fds), + idx = NULL, result = NULL, colGroup = NULL, basePlot = TRUE, main = NULL, label = "aberrant", + subsetName = NULL, + ... +) + +plotSpliceMetricRank( + fds, + type = fitMetrics(fds), + idx = NULL, + result = NULL, + colGroup = NULL, + basePlot = TRUE, + main = NULL, + label = "aberrant", + subsetName = NULL, ... ) plotExpectedVsObservedPsi( fds, - type = c("psi5", "psi3", "theta"), + type = fitMetrics(fds), idx = NULL, result = NULL, colGroup = NULL, main = NULL, basePlot = TRUE, label = "aberrant", + subsetName = NULL, ... ) @@ -80,14 +103,11 @@ plotExpectedVsObservedPsi( basePlot = TRUE, label = "aberrant", Ncpus = min(3, getDTthreads()), + subsetName = NULL, ... ) -\S4method{plotEncDimSearch}{FraserDataSet}( - object, - type = c("psi3", "psi5", "theta"), - plotType = c("auc", "loss") -) +\S4method{plotEncDimSearch}{FraserDataSet}(object, type = psiTypes, plotType = c("auc", "loss")) plotFilterExpression( fds, @@ -105,7 +125,7 @@ plotFilterVariability( \S4method{plotCountCorHeatmap}{FraserDataSet}( object, - type = c("psi5", "psi3", "theta"), + type = psiTypes, logit = FALSE, topN = 50000, topJ = 5000, @@ -126,10 +146,63 @@ plotFilterVariability( plotCov = TRUE, ... 
) + +plotBamCoverage( + fds, + gr, + sampleID, + control_samples = sample(samples(fds[, which(samples(fds) != sampleID)]), min(3, + ncol(fds) - length(sampleID))), + txdb = NULL, + min_junction_count = 20, + highlight_range = NULL, + highlight_range_color = "firebrick", + color_annotated = "gray", + color_novel = "goldenrod3", + color_sample_interest = "firebrick", + color_control_samples = "dodgerblue4", + toscale = c("exon", "gene", "none"), + mar = c(2, 10, 0.1, 5), + curvature_splicegraph = 1, + curvature_coverage = 1, + cex = 1, + splicegraph_labels = c("genomic_range", "id", "name", "none"), + splicegraph_position = c("top", "bottom"), + ... +) + +plotBamCoverageFromResultTable( + fds, + result, + show_full_gene = FALSE, + txdb = NULL, + orgDb = NULL, + res_gene_col = "hgncSymbol", + res_geneid_type = "SYMBOL", + txdb_geneid_type = "ENTREZID", + left_extension = 1000, + right_extension = 1000, + ... +) + +\S4method{plotManhattan}{FraserDataSet}( + object, + sampleID, + value = "pvalue", + type = fitMetrics(object), + chr = NULL, + main = paste0("sample: ", sampleID), + chrColor = c("black", "darkgrey"), + subsetName = NULL, + ... +) } \arguments{ \item{object, fds}{An \code{\link{FraserDataSet}} object.} +\item{...}{Additional parameters passed to plot() or plot_ly() if not stated +otherwise in the details for each plot function} + \item{sampleID}{A sample ID which should be plotted. Can also be a vector. Integers are treated as indices.} @@ -150,22 +223,24 @@ samples. Labelling can be turned off by setting \code{label=NULL}. The user can also provide a custom list of gene symbols or sampleIDs.} -\item{padjCutoff, zScoreCutoff, deltaPsiCutoff}{Significance, Z-score or delta +\item{padjCutoff, deltaPsiCutoff}{Significance or delta psi cutoff to mark outliers} -\item{...}{Additional parameters passed to plot() or plot_ly() if not stated -otherwise in the details for each plot function} +\item{subsetName}{The name of a subset of genes of interest for which FDR +corrected pvalues were previously computed. Those FDR values +on the subset will then be used to determine aberrant status. +Default is NULL (using transcriptome-wide FDR corrected pvalues).} \item{BPPARAM}{BiocParallel parameter to use.} +\item{idx}{A junction site ID or gene ID or one of both, which +should be plotted. Can also be a vector. Integers are treated +as indices.} + \item{result}{The result table to be used by the method.} \item{colGroup}{Group of samples that should be colored.} -\item{idx, site}{A junction site ID or gene ID or one of both, which -should be plotted. Can also be a vector. 
Integers are treated -as indices.} - \item{global}{Flag to plot a global Q-Q plot, default FALSE} \item{conf.alpha}{If set, a confidence interval is plotted, defaults to 0.05} @@ -228,6 +303,108 @@ annotation of the heatmap.} \item{plotMeanPsi, plotCov}{If \code{TRUE}, then the heatmap is annotated with the mean psi values or the junction coverage.} + +\item{gr}{A GRanges object indicating the genomic range that should be shown +in \code{plotBamCoverage}.} + +\item{control_samples}{The sampleIDs of the samples used as control in +\code{plotBamCoverage}.} + +\item{txdb}{A TxDb object giving the gene/transcript annotation to use.} + +\item{min_junction_count}{The minimal junction count across samples required +for a junction to appear in the splicegraph and coverage tracks +of \code{plotBamCoverage}.} + +\item{highlight_range}{A \code{GenomicRanges} or \code{GenomicRangesList} +object of ranges to be highlighted in the splicegraph of +\code{plotBamCoverage}.} + +\item{highlight_range_color}{The color of highlighted ranges in +the splicegraph of \code{plotBamCoverage}.} + +\item{color_annotated}{The color for exons and junctions present in +the given annotation (in the splicegraph of +\code{plotBamCoverage}).} + +\item{color_novel}{The color for novel exons and junctions not present in +the given annotation (in the splicegraph of +\code{plotBamCoverage}).} + +\item{color_sample_interest}{The color in \code{plotBamCoverage} for the +sample of interest.} + +\item{color_control_samples}{The color in \code{plotBamCoverage} for the +samples used as controls.} + +\item{toscale}{In \code{plotBamCoverage}, indicates which part of the +plotted region should be drawn to scale. Possible values are +'exon' (exonic regions are drawn to scale), +'gene' (both exonic and intronic regions are drawn to scale) or +'none' (exonic and intronic regions have constant length) +(see SGSeq package).} + +\item{mar}{The margin of the plot area for \code{plotBamCoverage} +(b,l,t,r).} + +\item{curvature_splicegraph}{The curvature of the junction arcs in the +splicegraph in \code{plotBamCoverage}. Decrease this value +for flatter arcs and increase it for steeper arcs.} + +\item{curvature_coverage}{The curvature of the junction arcs in the +coverage tracks of \code{plotBamCoverage}. Decrease this +value for flatter arcs and increase it for steeper arcs.} + +\item{cex}{For controlling the size of text and numbers in +\code{plotBamCoverage}.} + +\item{splicegraph_labels}{Indicated the format of exon/splice junction +labels in the splicegraph of \code{plotBamCoverage}. +Possible values are 'genomic_range' (gives the start position +of the first exon and the end position of the last exon that +are shown), 'id' (format E1,... J1,...), 'name' (format +type:chromosome:start-end:strand for each feature), +'none' for no labels (see SGSeq package).} + +\item{splicegraph_position}{The position of the splicegraph relative to the +coverage tracks in \code{plotBamCoverage}. Possible values +are 'top' (default) and 'bottom'.} + +\item{show_full_gene}{Should the full genomic range of the gene be shown in +\code{plotBamCoverageFromResultTable} (default: FALSE)? 
+If FALSE, only a certain region (see parameters left_extension +and right_extension) around the outlier junction is shown.} + +\item{orgDb}{An OrgDb object giving the mapping of gene ids and symbols.} + +\item{res_gene_col}{The column name in the given results table that +contains the gene annotation.} + +\item{res_geneid_type}{The type of gene annotation in the results table in +\code{res_gene_col} (e.g. SYMBOL or ENTREZID etc.). This +information is needed for mapping between the results table and +the provided annotation in the txdb object.} + +\item{txdb_geneid_type}{The type of gene_id present in \code{genes(txdb)} +(e.g. ENTREZID). This information is needed for +mapping between the results table and the provided annotation +in the txdb object.} + +\item{left_extension}{Indicating how far the plotted range around the outlier +junction should be extended to the left in +\code{plotBamCoverageFromResultTable}.} + +\item{right_extension}{Indicating how far the plotted range around the +outlier junction should be extended to the right in +\code{plotBamCoverageFromResultTable}.} + +\item{value}{Indicates which assay is shown in the manhattan plot. Defaults +to 'pvalue'. Other options are 'deltaPsi' and 'zScore'.} + +\item{chr}{Vector of chromosome names to show in \code{plotManhattan}. The +default is to show all chromosomes.} + +\item{chrColor}{Interchanging colors by chromosome for \code{plotManhattan}.} } \value{ If base R graphics are used nothing is returned else the plotly or @@ -246,6 +423,9 @@ Plot the number of aberrant events per samples Plots the observed split reads of the junction of interest over all reads coming from the given donor/acceptor.
+Plots the observed values of the splice metric across samples for a +junction of interest. + Plots the expected psi value over the observed psi value of the given junction. @@ -269,6 +449,10 @@ This is the list of all plotting function provided by FRASER: \item plotFilterExpression() \item plotFilterVariability() \item plotEncDimSearch() + \item plotBamCoverage() + \item plotBamCoverageFromResultTable() + \item plotManhattan() + \item plotSpliceMetricRank() } For a detailed description of each plot function please see the details. @@ -297,6 +481,9 @@ log10 space. \code{plotExpectedVsObservedPsi}: A scatter plot of the observed psi against the predicted psi for a given site.
+\code{plotSpliceMetricRank}: This function plots for a given intron the +observed values of the selected splice metric against the sample rank. + \code{plotCountCorHeatmap}: The correlation heatmap of the count data either of the full data set (i.e. sample-sample correlations) or of the top x most variable junctions (i.e. junction-sample correlations). By default the values @@ -317,33 +504,107 @@ introns and for the filtered (i.e. non-variable) introns. It plots the encoding dimension against the achieved loss (area under the precision-recall curve). From this plot the optimum should be choosen for the \code{q} in fitting process.
+ +\code{plotManhattan}: A Manhattan plot showing the junction pvalues by +genomic position. Useful to identify if outliers cluster by genomic position. + +\code{plotBamCoverage}: A sashimi plot showing the read coverage from +the underlying bam files for a given genomic range and sampleIDs. + +\code{plotBamCoverageFromResultTable}: A sashimi plot showing the read +coverage from the underlying bam files for a row in the results table.
Can +either show the full range of the gene with the outlier junction or only a +certain region around the outlier. } \examples{ +\dontshow{set.seed(42)} # create full FRASER object fds <- makeSimulatedFraserDataSet(m=40, j=200) fds <- calculatePSIValues(fds) fds <- filterExpressionAndVariability(fds, filter=FALSE) -# this step should be done for all splicing metrics and more dimensions -fds <- optimHyperParams(fds, "psi5", q_param=c(2,5,10,25)) -fds <- FRASER(fds) +# this step should be done for more dimensions in practice +fds <- optimHyperParams(fds, "jaccard", q_param=c(2,5,10,25)) + +# assign gene names to show functionality on test dataset +# use fds <- annotateRanges(fds) on real data +mcols(fds, type="j")$hgnc_symbol <- + paste0("gene", sample(1:25, nrow(fds), replace=TRUE)) + +# fit and calculate pvalues +genesOfInterest <- rep(list(paste0("gene", sample(1:25, 10))), 4) +names(genesOfInterest) <- c("sample1", "sample6", "sample15", "sample23") +fds <- FRASER(fds, subsets=list("testSet"=genesOfInterest)) # QC plotting plotFilterExpression(fds) plotFilterVariability(fds) -plotCountCorHeatmap(fds, "theta") -plotCountCorHeatmap(fds, "theta", normalized=TRUE) -plotEncDimSearch(fds, type="psi5") +plotCountCorHeatmap(fds, "jaccard") +plotCountCorHeatmap(fds, "jaccard", normalized=TRUE) +plotEncDimSearch(fds, type="jaccard") # extract results plotAberrantPerSample(fds, aggregate=FALSE) -plotVolcano(fds, "sample1", "psi5") +plotAberrantPerSample(fds, aggregate=TRUE, subsetName="testSet") +plotVolcano(fds, "sample2", "jaccard", label="aberrant") +plotVolcano(fds, "sample1", "jaccard", aggregate=TRUE, subsetName="testSet") # dive into gene/sample level results -res <- results(fds) +res <- as.data.table(results(fds)) res plotExpression(fds, result=res[1]) plotQQ(fds, result=res[1]) -plotExpectedVsObservedPsi(fds, type="psi5", res=res[1]) +plotExpectedVsObservedPsi(fds, res=res[1]) +plotSpliceMetricRank(fds, res=res[1]) + +# other ways to call these plotting functions +plotExpression(fds, idx=10, sampleID="sample1", type="jaccard") +plotExpression(fds, result=res[FDR_set == "testSet",][1], + subsetName="testSet") +plotQQ(fds, idx=10, sampleID="sample1", type="jaccard") +plotQQ(fds, result=res[FDR_set == "testSet",][1], subsetName="testSet") +plotExpectedVsObservedPsi(fds, idx=10, sampleID="sample1", type="jaccard") +plotExpectedVsObservedPsi(fds, result=res[FDR_set == "testSet",][1], + subsetName="testSet") +plotSpliceMetricRank(fds, idx=10, sampleID="sample1", type="jaccard") +plotSpliceMetricRank(fds, result=res[FDR_set == "testSet",][1], + subsetName="testSet") + +# create manhattan plot of pvalues by genomic position +if(require(ggbio)){ + plotManhattan(fds, type="jaccard", sampleID="sample10") +} +# plot splice graph and coverage from bam files in a given region +if(require(SGSeq)){ + fds <- createTestFraserSettings() + gr <- GRanges(seqnames="chr19", + IRanges(start=7587496, end=7598895), + strand="+") + plotBamCoverage(fds, gr=gr, sampleID="sample3", + control_samples="sample2", min_junction_count=5, + curvature_splicegraph=1, curvature_coverage=1, + mar=c(1, 7, 0.1, 3)) + + # plot coverage from bam file for a row in the result table + fds <- createTestFraserDataSet() + require(TxDb.Hsapiens.UCSC.hg19.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene + require(org.Hs.eg.db) + orgDb <- org.Hs.eg.db + + res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) + res_dt <- as.data.table(res) + res_dt <- res_dt[sampleID == "sample2",] + + # plot full range of gene containing outlier 
junction + plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE, + txdb=txdb, orgDb=orgDb, control_samples="sample3") + + # plot only certain range around outlier junction + plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE, + control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb, + curvature_coverage=0.5, right_extension=5000, left_extension=5000, + splicegraph_labels="id") +} } diff --git a/man/potentialImpactAnnotations.Rd b/man/potentialImpactAnnotations.Rd new file mode 100644 index 00000000..a06e304a --- /dev/null +++ b/man/potentialImpactAnnotations.Rd @@ -0,0 +1,135 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/resultAnnotations.R +\name{potentialImpactAnnotations} +\alias{potentialImpactAnnotations} +\alias{annotateIntronReferenceOverlap} +\alias{annotatePotentialImpact} +\alias{flagBlacklistRegions} +\title{Additional result annotations} +\usage{ +annotateIntronReferenceOverlap(fds, txdb, BPPARAM = bpparam()) + +annotatePotentialImpact( + result, + txdb, + fds, + addPotentialImpact = TRUE, + addUTRoverlap = TRUE, + minoverlap = 5, + BPPARAM = bpparam() ) + +flagBlacklistRegions( + result, + blacklist_regions = NULL, + assemblyVersion = c("hg19", "hg38"), + minoverlap = 5 ) +} +\arguments{ +\item{fds}{A FraserDataSet} + +\item{txdb}{A txdb object providing the reference annotation.} + +\item{BPPARAM}{For controlling parallelization behavior. Defaults to +\code{bpparam()}.} + +\item{result}{A result table as generated by FRASER, including the column +\code{annotatedJunction} as generated by the function +\code{annotateIntronReferenceOverlap}.} + +\item{addPotentialImpact}{Logical, indicating if the type of the potential +impact should be added to the results table. Defaults to \code{TRUE}.} + +\item{addUTRoverlap}{Logical, indicating if the overlap with UTR regions +should be checked and added to the results table. Defaults to \code{TRUE}.} + +\item{minoverlap}{Integer value defining the number of base pairs around the +splice site that need to overlap with UTR or blacklist region, +respectively, to be considered matching. Defaults to 5 bp.} + +\item{blacklist_regions}{A BED file that contains the blacklist regions. +If \code{NULL} (default), the BED files that are packaged with FRASER +are used (see Details for more information).} + +\item{assemblyVersion}{Indicates the genome assembly version of the intron +coordinates. Only used if blacklist_regions is NULL. For other versions, +please provide the BED file containing the blacklist regions directly.} +} +\value{ +An annotated FraserDataSet or results table, respectively +} +\description{ +These functions work on the result table and add additional + annotations to the reported introns: the type of potential impact on + splicing (e.g. exon skipping, exon truncation, ...), potential occurrence + of frameshift, overlap with UTR regions as well as a flag for introns + that are located in blacklist regions of the genome. + +\code{\link{annotateIntronReferenceOverlap}} adds basic annotations to the + fds for each intron based on the overlap of the intron's location with + the reference annotation. Has to be run before the result table is + created so that the new column can be included in it (see examples). + +\code{\link{annotatePotentialImpact}} annotates each intron in the results + table with the type of potential impact on splicing and potential + occurrence of frameshift (likely, unlikely, inconclusive).
Can also + calculate overlap with annotated UTR regions. Potential impact can be: + annotatedIntron_increasedUsage, annotatedIntron_reducedUsage, + exonTruncation, exonElongation, exonTruncation&Elongation, + exonSkipping, splicingBeyondGene, + multigenicSplicing, downstreamOfNearestGene, upstreamOfNearestGene, + complex (everything else). + Splice sites (theta metric) annotations indicate how the splice site is + located with respect to the reference annotation. The annotated types + are: annotatedSpliceSite, exonicRegion, intronicRegion. + +\code{\link{flagBlacklistRegions}} flags introns in the results table on + whether or not they are located in a blacklist region of the genome. By + default, the blacklist regions as reported in + \cite{Amemiya, Kundaje & Boyle (2019)} and downloaded from + \href{https://www.encodeproject.org/annotations/ENCSR636HFF/}{here} + are used. +} +\section{Functions}{ +\itemize{ +\item \code{annotateIntronReferenceOverlap()}: This method calculates basic annotations +based on overlap with the reference annotation (start, end, none, both) +for the full fds. The overlap type is added as a new column +\code{annotatedJunction} in \code{mcols(fds)}. + +\item \code{annotatePotentialImpact()}: This method annotates the splice event +type to junctions in the given results table. + +\item \code{flagBlacklistRegions()}: This method flags all introns and +splice sites in the given results table for which at least one splice +site (donor or acceptor) is located in a blacklist region. Blacklist +regions of the genome are determined from the provided BED file. + +}} +\examples{ + # get data, fit and compute p-values and z-scores + fds <- createTestFraserDataSet() + + # load reference annotation + library(TxDb.Hsapiens.UCSC.hg19.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene + + # add basic annotations for overlap with the reference annotation + # run this function before creating the results table + fds <- annotateIntronReferenceOverlap(fds, txdb) + + # extract results: for this small example dataset, no cutoffs used + # to get some results + res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA) + + # annotate the type of potential impact on splicing and UTR overlap + res <- annotatePotentialImpact(result=res, txdb=txdb, fds=fds) + + # annotate overlap with blacklist regions + res <- flagBlacklistRegions(result=res, assemblyVersion="hg19") + + # show results table containing additional annotations + res + +} diff --git a/man/psiTypes.Rd b/man/psiTypes.Rd index f07be433..9a4b9ca2 100644 --- a/man/psiTypes.Rd +++ b/man/psiTypes.Rd @@ -3,18 +3,18 @@ \docType{data} \name{psiTypes} \alias{psiTypes} -\title{Available psi types} +\title{Available splice metrics} \format{ -An object of class \code{character} of length 3. +An object of class \code{character} of length 4. 
} \usage{ psiTypes } \description{ -Available psi types +Available splice metrics } \examples{ - # to show available psi types: + # to show all available splice metrics: psiTypes } diff --git a/man/results.Rd b/man/results.Rd index 57682616..0cd1fa2d 100644 --- a/man/results.Rd +++ b/man/results.Rd @@ -2,34 +2,38 @@ % Please edit documentation in R/AllGenerics.R \name{results,FraserDataSet-method} \alias{results,FraserDataSet-method} -\alias{resultsByGenes} \alias{aberrant,FraserDataSet-method} \title{Extracting results and aberrant splicing events} \usage{ \S4method{results}{FraserDataSet}( object, sampleIDs = samples(object), - padjCutoff = 0.05, - zScoreCutoff = NA, - deltaPsiCutoff = 0.3, + padjCutoff = 0.1, + deltaPsiCutoff = 0.1, + rhoCutoff = NA, + aggregate = FALSE, + collapse = FALSE, minCount = 5, - psiType = c("psi3", "psi5", "theta"), + psiType = psiTypes, + geneColumn = "hgnc_symbol", + all = FALSE, + returnTranscriptomewideResults = TRUE, additionalColumns = NULL, - BPPARAM = bpparam(), - ... + BPPARAM = bpparam() ) -resultsByGenes(res, geneColumn = "hgncSymbol", method = "BY") - \S4method{aberrant}{FraserDataSet}( object, - type = currentType(object), - padjCutoff = 0.05, - deltaPsiCutoff = 0.3, - zScoreCutoff = NA, + type = fitMetrics(object), + padjCutoff = 0.1, + deltaPsiCutoff = 0.1, minCount = 5, + rhoCutoff = NA, by = c("none", "sample", "feature"), aggregate = FALSE, + geneColumn = "hgnc_symbol", + subsetName = NULL, + all = FALSE, ... ) } @@ -41,16 +45,38 @@ retrieved} \item{padjCutoff}{The FDR cutoff to be applied or NA if not requested.} -\item{zScoreCutoff}{The z-score cutoff to be applied or NA if not requested.} - \item{deltaPsiCutoff}{The cutoff on delta psi or NA if not requested.} +\item{rhoCutoff}{The cutoff value on the fitted rho value +(overdispersion parameter of the betabinomial) above which +junctions are filtered} + +\item{aggregate}{If TRUE the returned object is aggregated to the feature +level (i.e. gene level).} + +\item{collapse}{Only takes effect if \code{aggregate=TRUE}. +If TRUE, collapses results across the different psi +types to return only one row per feature (gene) and sample.} + \item{minCount}{The minimum count value of the total coverage of an intron to be considered as significant. result} \item{psiType}{The psi types for which the results should be retrieved.} +\item{geneColumn}{The column name of the column that has the gene annotation +that will be used for gene-level pvalue computation.} + +\item{all}{By default FALSE, only significant introns (or genes) are listed +in the results. If TRUE, results are assembled for all +samples and introns/genes regardless of significance.} + +\item{returnTranscriptomewideResults}{If FDR corrected pvalues for subsets +of genes of interest have been calculated, this parameter +indicates whether additionally the transcriptome-wide results +should be returned as well (default), or whether only results +for those subsets should be retrieved.} + \item{additionalColumns}{Character vector containing the names of additional columns from mcols(fds) that should appear in the result table (e.g. ensembl_gene_id). Default is \code{NULL}, so no additional columns @@ -58,26 +84,20 @@ are included.} \item{BPPARAM}{The BiocParallel parameter.} -\item{...}{Further arguments can be passed to the method. 
If "zscores", -"padjVals" or "dPsi" is given, the values of those arguments -are used to define the aberrant events.} - -\item{res}{Result as created with \code{results()}} - -\item{geneColumn}{The name of the column in \code{mcols(res)} that contains -the gene symbols.} - -\item{method}{The p.adjust method that is being used to adjust p values per -sample.} - \item{type}{Splicing type (psi5, psi3 or theta)} \item{by}{By default \code{none} which means no grouping. But if \code{sample} or \code{feature} is specified the sum by sample or feature is returned} -\item{aggregate}{If TRUE the returned object is based on the grouped -features} +\item{subsetName}{The name of a subset of genes of interest for which FDR +corrected pvalues were previously computed. Those FDR values +on the subset will then be used to determine aberrant status. +Default is NULL (using transcriptome-wide FDR corrected pvalues).} + +\item{...}{Further arguments can be passed to the method. If "n", +"padjVals", "dPsi" or "rhoVals" are given, the values of those +arguments are used to define the aberrant events.} } \value{ For \code{results}: GRanges object containing significant results. @@ -95,24 +115,36 @@ aberrant splicing events based on the given cutoffs. # get data, fit and compute p-values and z-scores fds <- createTestFraserDataSet() -# extract results: for this example dataset, z score cutoff of 2 is used to -# get at least one result and show the output -res <- results(fds, padjCutoff=NA, zScoreCutoff=3, deltaPsiCutoff=0.05) +# extract results: for this example dataset, no cutoffs are used to +# show the output of the results function +res <- results(fds, all=TRUE) res # aggregate the results by genes (gene symbols need to be annotated first # using annotateRanges() function) -resultsByGenes(res) +results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE) + +# aggregate the results by genes and collapse over all psi types to obtain +# only one row per gene in the results table +results(fds, padjCutoff=NA, deltaPsiCutoff=0.1, aggregate=TRUE, + collapse=TRUE) # get aberrant events per sample: on the example data, nothing is aberrant # based on the adjusted p-value -aberrant(fds, type="psi5", by="sample") +aberrant(fds, type="jaccard", by="sample") # get aberrant events per gene (first annotate gene symbols) fds <- annotateRangesWithTxDb(fds) -aberrant(fds, type="psi5", by="feature", zScoreCutoff=2, padjCutoff=NA, - aggregate=TRUE) +aberrant(fds, type="jaccard", by="feature", padjCutoff=NA, aggregate=TRUE) # find aberrant junctions/splice sites -aberrant(fds, type="psi5") +aberrant(fds, type="jaccard") + +# retrieve results limiting FDR correction to only a subset of genes +# first, we need to create a list of genes per sample that will be tested +geneList <- list('sample1'=c("TIMMDC1"), 'sample2'=c("MCOLN1")) +fds <- calculatePadjValues(fds, type="jaccard", + subsets=list("exampleSubset"=geneList)) +results(fds, all=TRUE, returnTranscriptomewideResults=FALSE) + } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 461280c0..b03ef489 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -152,6 +152,21 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// truncNLL_rho_penalized +double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda); +RcppExport SEXP _FRASER_truncNLL_rho_penalized(SEXP logit_rhoSEXP, SEXP yiSEXP, SEXP kiSEXP, SEXP niSEXP, SEXP lambdaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + 
Rcpp::traits::input_parameter< double >::type logit_rho(logit_rhoSEXP); + Rcpp::traits::input_parameter< arma::vec >::type yi(yiSEXP); + Rcpp::traits::input_parameter< arma::vec >::type ki(kiSEXP); + Rcpp::traits::input_parameter< arma::vec >::type ni(niSEXP); + Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); + rcpp_result_gen = Rcpp::wrap(truncNLL_rho_penalized(logit_rho, yi, ki, ni, lambda)); + return rcpp_result_gen; +END_RCPP +} // fullNLL arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows); RcppExport SEXP _FRASER_fullNLL(SEXP ySEXP, SEXP rhoSEXP, SEXP kSEXP, SEXP nSEXP, SEXP DSEXP, SEXP lambdaSEXP, SEXP byRowsSEXP) { @@ -215,6 +230,7 @@ static const R_CallMethodDef CallEntries[] = { {"_FRASER_truncNLL_e", (DL_FUNC) &_FRASER_truncNLL_e, 7}, {"_FRASER_truncGrad_e", (DL_FUNC) &_FRASER_truncGrad_e, 7}, {"_FRASER_truncNLL_rho", (DL_FUNC) &_FRASER_truncNLL_rho, 4}, + {"_FRASER_truncNLL_rho_penalized", (DL_FUNC) &_FRASER_truncNLL_rho_penalized, 5}, {"_FRASER_fullNLL", (DL_FUNC) &_FRASER_fullNLL, 7}, {"_FRASER_truncWeightedNLL_db", (DL_FUNC) &_FRASER_truncWeightedNLL_db, 7}, {"_FRASER_truncWeightedGrad_db", (DL_FUNC) &_FRASER_truncWeightedGrad_db, 7}, diff --git a/src/loss_n_gradient_functions.cpp b/src/loss_n_gradient_functions.cpp index 07eadcca..0078a270 100644 --- a/src/loss_n_gradient_functions.cpp +++ b/src/loss_n_gradient_functions.cpp @@ -6,7 +6,7 @@ using namespace Rcpp; const double MAX_EXP_VALUE = 700; -double PSEUDO_COUNT = 1; +double PSEUDO_COUNT = 0; // [[Rcpp::export(.setPseudoCount)]] double setPseudoCount(double pseudoCount){ @@ -142,6 +142,12 @@ double truncNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, double infPosB = arma::find_nonfinite(beta); // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob); nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem; @@ -266,6 +272,12 @@ double truncNLL_e(arma::vec par, arma::mat x, arma::mat D, arma::vec b, beta.elem( infPosB ) = abs.elem( infPosB ); // beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = abs.elem( infPosA2 ); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = abs.elem( infPosB2 ); + nll = arma::accu(alpha + beta - alphaK - betaNK)/k.n_elem; return arma::as_scalar(nll); @@ -361,11 +373,58 @@ double truncNLL_rho(double rho, arma::vec yi, arma::vec ki, arma::vec ni){ // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob); + + nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem; return arma::as_scalar(nll); } +// [[Rcpp::export()]] +double truncNLL_rho_penalized(double logit_rho, arma::vec yi, arma::vec ki, arma::vec ni, double lambda){ + arma::vec mui, u, alpha, alphaK, beta, betaNK, alphaBeta, nll; + double rho, rhoa, rhob; + + rho = exp(logit_rho)/(1 + exp(logit_rho)); + rhoa = (1 - rho)/rho; + rhob = 
(rho - 1)/rho; + mui = predictMuCpp(yi); + u = (mui-1) * rhob; + + alpha = arma::lgamma(mui * rhoa); + alphaK = arma::lgamma(mui * rhoa + ki + PSEUDO_COUNT); + beta = arma::lgamma(u); + betaNK = arma::lgamma(u + ni - ki + PSEUDO_COUNT); + alphaBeta = arma::lgamma(rhoa + ni + (2*PSEUDO_COUNT)) - lgamma(rhoa); + + // arma::vec abs; + arma::uvec infPosA, infPosB; + // abs = arma::abs(yi); + infPosA = arma::find_nonfinite(alpha); + // alpha.elem( infPosA ) = abs.elem( infPosA ); + alpha.elem( infPosA ) = estLgammaAlpha(yi, infPosA, rhoa); + infPosB = arma::find_nonfinite(beta); + // beta.elem( infPosB ) = abs.elem( infPosB ); + beta.elem( infPosB ) = estLgammaBeta(yi, infPosB, rhob); + + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(yi, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(yi, infPosB2, rhob); + + + nll = arma::accu(alpha + beta - alphaK - betaNK + alphaBeta)/ki.n_elem; + nll = nll + lambda * (logit_rho*logit_rho); + + return arma::as_scalar(nll); +} + // [[Rcpp::export()]] arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::mat D, double lambda, bool byRows=false){ arma::mat rhoa, rhob; @@ -395,6 +454,11 @@ arma::vec fullNLL(arma::mat y, arma::mat rho, arma::mat k, arma::mat n, arma::ma infPosB = arma::find_nonfinite(beta); beta.elem( infPosB ) = abs.elem( infPosB ); // beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = abs.elem( infPosA2 ); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = abs.elem( infPosB2 ); if(byRows){ nll = rowMeans(alpha + beta - alphaK - betaNK + nonTruncTerms); @@ -471,6 +535,12 @@ double truncWeightedNLL_db(arma::vec par, arma::mat H, arma::vec k, arma::vec n, // beta.elem( infPosB ) = abs.elem( infPosB ); beta.elem( infPosB ) = estLgammaBeta(y, infPosB, rhob); + arma::uvec infPosA2, infPosB2; + infPosA2 = arma::find_nonfinite(alphaK); + alphaK.elem( infPosA2 ) = estLgammaAlpha(y, infPosA2, rhoa); + infPosB2 = arma::find_nonfinite(betaNK); + betaNK.elem( infPosB2 ) = estLgammaBeta(y, infPosB2, rhob); + nll = arma::accu((alpha + beta - alphaK - betaNK)%w)/k.n_elem; nll = nll + (lambda/k.n_elem) * arma::accu(d % d); diff --git a/tests/testthat/test_counting.R b/tests/testthat/test_counting.R index 4fcd0d61..1bdbfc37 100644 --- a/tests/testthat/test_counting.R +++ b/tests/testthat/test_counting.R @@ -51,7 +51,7 @@ test_that("test minAnchor", { "sample3", features, fds, minAnchor=25, recount=TRUE)) }) expect_equivalent(c(7, 8, 0, 0, 7), ctnNS5[,1]) - expect_equivalent(c(5, 8, 0, 0, 6), ctnNS25[,1]) + expect_equivalent(c(5, 8, 0, 0, 7), ctnNS25[,1]) }) test_that("Test psi values", { diff --git a/tests/testthat/test_fraser_pipeline.R b/tests/testthat/test_fraser_pipeline.R index 069a217d..45c9b2ff 100644 --- a/tests/testthat/test_fraser_pipeline.R +++ b/tests/testthat/test_fraser_pipeline.R @@ -4,8 +4,8 @@ test_that("FRASER function", { fds <- createTestFraserDataSet() expect_is(fds, "FraserDataSet") anames <- c(psiTypes, paste0(c("delta", "predictedMeans", - "pvaluesBetaBinomial", "padjBetaBinomial", "zScores"), "_", - rep(psiTypes, 5))) + "pvaluesBetaBinomial", "padjBetaBinomial"), "_", + rep(fitMetrics(fds), 5))) expect_equal(anames %in% assayNames(fds), !logical(length(anames))) }) diff --git a/tests/testthat/test_hyperParams.R b/tests/testthat/test_hyperParams.R index 
26331409..fd8a961b 100644 --- a/tests/testthat/test_hyperParams.R +++ b/tests/testthat/test_hyperParams.R @@ -2,6 +2,7 @@ context("Test hyper param optimization") test_that("Test hyper param testing", { fds <- makeSimulatedFraserDataSet(m=15, j=20, dist="BB") + fds <- calculatePSIValues(fds) # test BB no hyper params and accessors fds <- optimHyperParams(fds, type="psi3", implementation="BB") diff --git a/tests/testthat/test_plotJunctionDist.R b/tests/testthat/test_plotJunctionDist.R index 85a4b9d3..26e5a1e1 100644 --- a/tests/testthat/test_plotJunctionDist.R +++ b/tests/testthat/test_plotJunctionDist.R @@ -3,15 +3,15 @@ context("Test distribution plots for given results/junction") test_that("Main junction distribution plot", { # get results fds <- getFraser() - res <- results(fds, padjCutoff=1, zScoreCutoff=NA, deltaPsiCutoff=NA) + res <- results(fds, all=TRUE) # plot distributions expect_silent(plotExpression(fds, result=res[1])) - expect_silent(plotVolcano(fds, "sample1", "psi5")) + expect_silent(plotVolcano(fds, "sample1", "jaccard")) expect_silent(plotExpectedVsObservedPsi(fds, result=res[2])) - expect_is(plotCountCorHeatmap(fds, "psi5", norm=FALSE), "pheatmap") - expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE), "pheatmap") - expect_is(plotCountCorHeatmap(fds, "psi5", norm=TRUE, topN=10), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=FALSE), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE), "pheatmap") + expect_is(plotCountCorHeatmap(fds, "jaccard", norm=TRUE, topN=10), "pheatmap") }) diff --git a/tests/testthat/test_plotSampleResults.R b/tests/testthat/test_plotSampleResults.R index e8b5fde4..e259d13e 100644 --- a/tests/testthat/test_plotSampleResults.R +++ b/tests/testthat/test_plotSampleResults.R @@ -1,5 +1,27 @@ context("Test generation of results") +test_that("Results function", { + set.seed(42) + # get subset to speed up test + fds <- getFraser() + + # intron-level results + res <- results(fds, aggregate=FALSE, all=TRUE) + expect_equal(length(res), prod(dim(fds))) + res_signif <- results(fds, aggregate=FALSE, all=FALSE, + padjCutoff=NA, deltaPsiCutoff=0.2) + expect_equal(length(res_signif), 1) + + # gene-level results + res_gene <- results(fds, aggregate=TRUE, all=TRUE) + expect_equal(length(res_gene), + prod(dim(pVals(fds, level="gene", type="jaccard")))) + res_gene_signif <- results(fds, aggregate=TRUE, all=FALSE, + padjCutoff=NA, deltaPsiCutoff=0.2) + expect_equal(length(res_gene_signif), 1) + +}) + test_that("Main plotting function", { # get subset to speed up test # fds <- getFraser() diff --git a/tests/testthat/test_stats.R b/tests/testthat/test_stats.R index 8e76cc19..b1eedb8c 100644 --- a/tests/testthat/test_stats.R +++ b/tests/testthat/test_stats.R @@ -17,16 +17,94 @@ test_that("PSI value calculation", { expect_true(all(N(fds, "psi5")[ is.na(psiVal)] == 0)) }) -test_that("Zscore calculation", { - fds <- getFraser(clean = TRUE) +# test_that("Zscore calculation", { +# fds <- getFraser(clean = TRUE) +# +# # prepare zScore input for logit scale +# psiVal <- (K(fds, "jaccard") + pseudocount())/(N(fds, "jaccard") + 2*pseudocount()) +# mu <- predictedMeans(fds, "jaccard") +# residual <- qlogis(psiVal) - qlogis(mu) +# +# # compute zscore +# zscores <- (residual - rowMeans(residual)) / rowSds(residual) +# +# expect_equal(zscores, zScores(fds, "jaccard")) +# }) + +test_that("Gene p value calculation with NAs", { + fds <- getFraser() + fds <- fds[15:24,] + mcols(fds, type="j")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), + 
times=c(3, 4, 3)) + mcols(fds, type="ss")$hgnc_symbol <- rep(c("geneA", "geneB", "geneC"), + times=c(4, 6, 4)) + + # simulate junction with bad rho fit to create partly NAs + rho <- rho(fds, type="jaccard") + rho[c(1, 4:7)] <- 0.5 + rho(fds, type="jaccard") <- rho + + # calc p values + fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1) + + # check dimension of junction-, site- and gene-level pval matrices + expect_equal(nrow(pVals(fds, type="jaccard", level="junction")), nrow(fds)) + expect_equal(nrow(pVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))), nrow(fds)) + expect_equal(nrow(pVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))), 3) + + # check jaccard pvals are partly NAs + expect_true(all(is.na(pVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))[4:7,]))) + expect_true(all(is.na(pVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))["geneB",]))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))[4:7,]))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))["geneB",]))) - # prepare zScore input for logit scale - psiVal <- (K(fds, "psi5") + pseudocount())/(N(fds, "psi5") + 2*pseudocount()) - mu <- predictedMeans(fds, "psi5") - residual <- qlogis(psiVal) - qlogis(mu) + # simulate junction with bad rho fit to create partly NAs + rho <- rho(fds, type="jaccard") + rho <- rep(0.5, length(rho)) + rho(fds, type="jaccard") <- rho + fds <- calculatePadjValues(fds, type="jaccard", rhoCutoff=0.1) + + # check jaccard pvals are all NAs + expect_true(all(is.na(pVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))))) + expect_true(all(is.na(pVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="site", + filters=list(rho=0.1))))) + expect_true(all(is.na(padjVals(fds, type="jaccard", level="gene", + filters=list(rho=0.1))))) +}) + +test_that("FDR on subset of genes", { + fds <- getFraser() + mcols(fds, type="j")$hgnc_symbol <- + rep(c("geneA", "geneB", "geneC", "geneD", "geneE"), + times=c(3, 7, 5, 4, 8)) - # compute zscore - zscores <- (residual - rowMeans(residual)) / rowSds(residual) + # define gene subset per sample + genes_per_sample <- list( + "sample1" = c("geneE", "geneC", "geneA"), + "sample2" = c("geneB"), + "sample3" = c("geneA", "geneB", "geneC", "geneD") + ) - expect_equal(zscores, zScores(fds, "psi5")) + subsetName <- "subset_test" + fds <- calculatePadjValuesOnSubset(fds, genesToTest=genes_per_sample, + subsetName=subsetName, type="jaccard") + subset_padj <- padjVals(fds, type="jaccard", subsetName=subsetName) + expect_true(is(subset_padj, "matrix")) + expect_true(nrow(subset_padj) == 27) + expect_true(ncol(subset_padj) == 3) + subset_padj_gene <- padjVals(fds, type="jaccard", level="gene", + subsetName=subsetName) + expect_true(is(subset_padj_gene, "matrix")) + expect_true(nrow(subset_padj_gene) == 5) + expect_true(ncol(subset_padj_gene) == 3) + }) diff --git a/vignettes/FRASER.Rnw b/vignettes/FRASER.Rnw index 82e9b56c..4c51170c 100644 --- a/vignettes/FRASER.Rnw +++ b/vignettes/FRASER.Rnw @@ -1,4 +1,4 @@ -%\VignetteIndexEntry{FRASER: Find RAre Splicing Evens in RNA-seq Data} +%\VignetteIndexEntry{FRASER: Find RAre Splicing Events in RNA-seq Data} %\VignettePackage{FRASER} %\VignetteEngine{knitr::knitr} %\VignetteEncoding{UTF-8} @@ -36,11 +36,11 @@ opts_chunk$set( \newcommand{\fraser}{\Biocpkg{FRASER}} \newcommand{\fds}{\Rclass{FraserDataSet}} 
-\title{FRASER: Find RAre Splicing Events in RNA-seq} +\title{FRASER: Find RAre Splicing Events in RNA-seq Data} \author{ Christian Mertes$^{1}$, Ines Scheller$^{1}$, Julien Gagneur$^{1}$ \\ - \small{$^{1}$ Technische Universit\"at M\"unchen, Department of + \small{$^{1}$ Technical University of Munich, Department of Informatics, Garching, Germany} } @@ -77,7 +77,14 @@ diseases. \begin{center} \begin{tabular}{ | l | } \hline
-If you use \fraser{} in published research, please cite: \\ +If you use \fraser{} version >= 1.99.0 in published research, please cite: \\ +\\ +Scheller I, Lutz K, Mertes C, \emph{et al.} +\textbf{Improved detection of aberrant splicing with FRASER 2.0} \\ +\textbf{using the Intron Jaccard Index}, medRxiv, 2023, \\ +\emph{\url{https://doi.org/10.1101/2023.03.31.23287997}} \\ +\hline +For previous versions of \fraser{}, please cite: \\ \\ Mertes C, Scheller I, Yepez V, \emph{et al.} \textbf{Detection of aberrant splicing events} \\ @@ -175,13 +182,39 @@ intron-exon boundary of acceptor A. While we calculate $\theta$ for the 5' and between $\theta_5$ and $\theta_3$ and hence call it jointly $\theta$ in the following.
+From \fraser{} 2.0 on, only a single metric - the Intron Jaccard Index (Figure +\ref{IntronJaccardIndex_sketch}) - is used by default. The Intron Jaccard +Index is more robust and allows focusing more on functionally relevant +aberrant splicing events. It allows detecting all types of aberrant splicing +previously detected using the three metrics ($\psi_5$, $\psi_3$, $\theta$) +within a single metric. + +\incfig{IntronJaccardIndex_sketch}{1\textwidth}{Overview of the Intron +Jaccard Index, the splice metric used in \fraser{}2.}{ +The Intron Jaccard Index considers both split and nonsplit reads within a +single metric and allows detecting all different types of aberrant splicing +previously captured with either of the metrics $\psi_5$, $\psi_3$, $\theta$. +} + +The Intron Jaccard Index considers both split and nonsplit reads and is +defined as the Jaccard index of the set of donor reads (reads sharing a donor +site with the intron of interest and nonsplit reads at that donor site) and +acceptor reads (reads sharing an acceptor site with the intron of interest and +nonsplit reads at that acceptor site): + +\begin{equation} + J(D,A) = \frac{n(D,A)}{\sum_{A'} n(D,A') + \sum_{D'} n(D',A) + n(D) + n(A) - n(D,A)} + \label{eq:jaccard} +\end{equation} + + \section{Quick guide to \fraser{}}
-Here we quickly show how to do an analysis with \fraser{}, starting from a -sample annotation table and the corresponding bam files. First, we create an -\fds{} from the sample annotation and count the relevant reads in the bam files. +Here we show how to do an analysis with \fraser{}, starting from a +sample annotation table and raw data (RNA-seq BAM files). First, we create a +\fds{} object from the sample annotation and count the relevant reads in the BAM files. Then, we compute the $\psi/\theta$ values and -filter out introns that are just noise. Secondly, we run the full +filter out introns that are lowly expressed. Secondly, we run the full pipeline using the command \Rfunction{FRASER}. In the last step, we extract the results table from the \fds{} using the \Rfunction{results} function. Additionally, the user can create several analysis plots directly from the @@ -192,7 +225,7 @@ fitted \fds{} object.
These plotting functions are described in section # load FRASER library library(FRASER) -# count data +# count raw data fds <- createTestFraserSettings() fds <- countRNAData(fds) fds @@ -200,15 +233,11 @@ fds # compute stats fds <- calculatePSIValues(fds) -# filtering junction with low expression +# filter junctions with low expression fds <- filterExpressionAndVariability(fds, minExpressionInOneSample=20, minDeltaPsi=0.0, filter=TRUE) -# fit the splicing model for each metric -# with a specific latentsapce dimension -fds <- FRASER(fds, q=c(psi5=2, psi3=3, theta=3)) - -# we provide two ways to anntoate introns with the corresponding gene symbols: +# we provide two ways to annotate introns with the corresponding gene symbols: # the first way uses TxDb-objects provided by the user as shown here library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(org.Hs.eg.db) @@ -216,17 +245,21 @@ txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene orgDb <- org.Hs.eg.db fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb) -# alternatively, we also provide a way to use biomart for the annotation: +# fit the splicing model for each metric +# with a specific latentspace dimension +fds <- FRASER(fds, q=c(jaccard=2)) + +# Alternatively, we also provide a way to use BioMart for the annotation: # fds <- annotateRanges(fds) -# get results: we recommend to use an FDR cutoff 0.05, but due to the small -# dataset size we extract all events and their associated values -# eg: res <- results(fds, zScoreCutoff=NA, padjCutoff=0.05, deltaPsiCutoff=0.3) -res <- results(fds, zScoreCutoff=NA, padjCutoff=NA, deltaPsiCutoff=NA) +# get results: we recommend to use an FDR cutoff of 0.05, but due to the small +# dataset size, we extract all events and their associated values +# eg: res <- results(fds, padjCutoff=0.05, deltaPsiCutoff=0.1) +res <- results(fds, all=TRUE) res -# result visualization -plotVolcano(fds, sampleID="sample1", type="psi5", aggregate=TRUE) +# result visualization, aggregate=TRUE means that results are aggregated at the gene level +plotVolcano(fds, sampleID="sample1", type="jaccard", aggregate=TRUE) @ @@ -235,22 +268,21 @@ plotVolcano(fds, sampleID="sample1", type="psi5", aggregate=TRUE) The analysis workflow of \fraser{} for detecting rare aberrant splicing events in RNA-seq data can be divided into the following steps: \begin{enumerate} - \item Data import or Counting reads \ref{sec:dataPreparation} + \item Data import or counting reads \ref{sec:dataPreparation} \item Data preprocessing and QC \ref{sec:DataPreprocessing} \item Correcting for confounders \ref{sec:correction} - \item Calculate P-values \ref{sec:P-value-calculation} - \item Calculate Z-scores \ref{sec:Z-score-calculation} - \item Visualize the results \ref{sec:result-vis} + \item Calculating P-values \ref{sec:P-value-calculation} + \item Visualizing the results \ref{sec:result-vis} \end{enumerate} -Step 3-5 are wrapped up in one function \Rfunction{FRASER}, but each step can -be called individually and parametrizied. Either way, data preprocessing should +Steps 3 and 4 are wrapped up in one function \Rfunction{FRASER}, but each step can +be called individually and parametrized. Either way, data preprocessing should be done before starting the analysis, so that samples failing quality measurements or introns stemming from background noise are discarded. Detailed explanations of each step are given in the following subsections. 
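As a rough orientation, steps 3 and 4 can also be written out as individual calls instead of the single \Rfunction{FRASER} call; a minimal sketch, not part of the original quick guide, assuming a counted and filtered fds object (the latent space dimension q=2 is only a placeholder):

# sketch only: run the fitting and p-value steps separately instead of FRASER()
# (q=2 is a placeholder; estimate it with optimHyperParams() on real data)
fds <- fit(fds, type="jaccard", q=2, implementation="PCA")
fds <- calculatePvalues(fds, type="jaccard")
fds <- calculatePadjValues(fds, type="jaccard")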
-For this tutorial we will use the a small example dataset that is contained
+For this tutorial, we will use a small example dataset that is contained
in the package.
\subsection{Data preparation}
@@ -259,7 +291,7 @@ in the package.
\subsubsection{Creating a \fds{} and Counting reads}
\label{sec:CountingReads}
-To start a RNA-seq data analysis with \fraser{} some preparation steps are
+To start an RNA-seq data analysis with \fraser{}, some preparation steps are
needed. The first step is the creation of a \fds{} which derives from a
RangedSummarizedExperiment object. To create the \fds, sample annotation and
two count matrices are needed: one containing counts for the splice junctions,
@@ -270,10 +302,10 @@ splice junctions.
You can first create the \fds{} with only the sample annotation and
subsequently count the reads as described in \ref{sec:CountingReads}. For
this, we need a table with basic informations which then can be transformed into a
-\Rclass{FraserSettings} object. The minimum of information per sample is an
-unique sample name, the path to the aligned bam file.
-Additionally groups can be specified for the P-value calculations later.
-If a \textbf{NA} is assigned no P-values will be calculated. An example sample
+\Rclass{FraserSettings} object. The minimum information required per sample is a
+unique sample name and the path to the BAM file.
+Additionally, groups can be specified for the P-value calculations.
+If \textbf{NA} is assigned, no P-values will be calculated. An example sample
table is given within the package:

<>=
@@ -282,7 +314,7 @@ sampleTable <- fread(system.file(
head(sampleTable)
@

-To create a settings object for \fraser{} the constructor
+To create a settings object for \fraser{}, the constructor
\Rfunction{FraserSettings} should be called with at least a sampleData table.
For an example have a look into the \Rfunction{createTestFraserSettings}.
In addition to the sampleData you can specify further parameters.
@@ -300,7 +332,7 @@ options from the sample annotation above:
<>=
# convert it to a bamFile list
bamFiles <- system.file(sampleTable[,bamFile], package="FRASER", mustWork=TRUE)
-sampleTable[,bamFile:=bamFiles]
+sampleTable[, bamFile := bamFiles]
# create FRASER object
settings <- FraserDataSet(colData=sampleTable, workingDir="FRASER_output")
@@ -317,15 +349,15 @@ settings <- createTestFraserSettings()
settings
@

-Counting of the reads are straight forward and is done through the
+Counting the reads is straightforward and is done through the
\Rfunction{countRNAData} function. The only required parameter is the
-FraserSettings object. First all split reads are extracted from each individual
-sample and cached if enabled. Then a dataset wide junction map is created
+FraserSettings object. First, all split reads are extracted from each individual
+sample and cached if enabled. Then a dataset-wide junction map is created
(all visible junctions over all samples). After that for each sample the
-non-spliced reads at each given donor and acceptor site is counted. The
+non-spliced reads at each given donor and acceptor site are counted. The
resulting \Rclass{FraserDataSet} object contains two
-\Rclass{SummarizedExperiment} objects for each the junctions and the splice
-sites.
+\Rclass{SummarizedExperiment} objects, one for the junctions and one for the
+splice sites.
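+To illustrate how these split and non-spliced counts enter the Intron Jaccard
+Index defined in Equation \ref{eq:jaccard}, the following toy calculation uses
+made-up numbers for a single intron (purely illustrative, not taken from the
+example dataset):
+
+<<jaccardToyCalculation, eval=FALSE>>=
+# made-up counts for one intron connecting donor site D and acceptor site A
+n_DA    <- 20   # split reads supporting the intron (D,A) itself
+n_D_any <- 25   # all split reads sharing donor site D (includes n_DA)
+n_any_A <- 30   # all split reads sharing acceptor site A (includes n_DA)
+n_D     <- 5    # non-spliced reads at donor site D
+n_A     <- 3    # non-spliced reads at acceptor site A
+
+# Intron Jaccard Index as defined in the introduction
+jaccard <- n_DA / (n_D_any + n_any_A + n_D + n_A - n_DA)
+jaccard  # 20 / 43, roughly 0.47
+@
+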
<>=
# example of how to use parallelization: use 10 cores or the maximal number of
@@ -349,7 +381,7 @@ If the count matrices already exist, you can use these matrices directly
together with the sample annotation from above to create the \fds:

<>=
-# example sample annoation for precalculated count matrices
+# example sample annotation for precalculated count matrices
sampleTable <- fread(system.file("extdata", "sampleTable_countTable.tsv",
    package="FRASER", mustWork=TRUE))
head(sampleTable)
@@ -379,28 +411,28 @@ slides\footnote{\url{http://tinyurl.com/RNA-ASHG-presentation}}.
At the time of writing this vignette, we recommend that the RNA-seq data
should be aligned with a splice-aware aligner like STAR\cite{Dobin2013} or
GEM\cite{MarcoSola2012}.
-To gain better results, at least 20 samples should be sequenced and they should
-be processed with the same protocol and origin from the same tissue.
+To obtain better results, at least 50 samples should be sequenced and they should
+be processed with the same protocol and originate from the same tissue.

\subsubsection{Filtering}
\label{sec:filtering}

-Before we can filter the data, we have to compute the main splicing metric:
-the $\psi$-value (Percent Spliced In).
+Before filtering the data, we have to compute the main splicing metrics:
+the $\psi$-value (Percent Spliced In) and the Intron Jaccard Index.

-<>=
+<>=
fds <- calculatePSIValues(fds)
fds
@

-Now we can have some cut-offs to filter down the number of junctions we want to
+Now we can filter down the number of junctions we want to
test later on.
-Currently, we keep only junctions which support the following:
+Currently, we suggest keeping only junctions which support the following:
\begin{itemize}
-    \item At least one sample has 20 reads
-    \item 5\% of the samples have at least 1 read
+    \item At least one sample has 20 (or more) reads
+    \item 25\% (or more) of the samples have at least 10 reads
\end{itemize}

Furthemore one could filter for:
@@ -411,7 +443,7 @@ Furthemore one could filter for:
<>=
-fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.0, filter=FALSE)
+fds <- filterExpressionAndVariability(fds, minDeltaPsi=0, filter=FALSE)
plotFilterExpression(fds, bins=100)
@
@@ -432,14 +464,14 @@ Since $\psi$ values are ratios within a sample, one might think that there
should not be as much correlation structure as observed in gene expression
data within the splicing data.

-This is not true as we do see strong sample co-variation across different
-tissues and cohorts. Let's have a look into our data to see if we do have
+However, we do see strong sample co-variation across different
+tissues and cohorts. Let's have a look into our demo data to see whether it has
correlation structure or not. To have a better estimate, we use the logit
transformed $\psi$ values to compute the correlation.
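+As a conceptual sketch of this transformation (illustrative only, not the
+exact code used internally by \fraser{}), a splicing ratio built from $k$
+junction reads out of $n$ total reads can be logit transformed with a
+pseudocount to avoid infinite values at ratios of 0 or 1:
+
+<<logitSketch, eval=FALSE>>=
+# illustrative logit transformation of a splicing ratio with a pseudocount
+logit_ratio <- function(k, n, pseudo=1){
+    p <- (k + pseudo) / (n + 2 * pseudo)
+    log(p / (1 - p))
+}
+logit_ratio(k=18, n=20)   # mostly spliced-in, positive logit value
+logit_ratio(k=2,  n=20)   # mostly spliced-out, negative logit value
+@
+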
<>=
# Heatmap of the sample correlation
-plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE)
+plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE)
@

It is also possible to visualize the correlation structure of the logit
@@ -447,13 +479,13 @@ transformed $\psi$ values of the $topJ$ most variable introns for all samples:

<>=
# Heatmap of the intron/sample expression
-plotCountCorHeatmap(fds, type="psi5", logit=TRUE, normalized=FALSE,
+plotCountCorHeatmap(fds, type="jaccard", logit=TRUE, normalized=FALSE,
    plotType="junctionSample", topJ=100, minDeltaPsi = 0.01)
@

\subsection{Detection of aberrant splicing events}

-After preprocessing the raw data and visualizing it, we can start our analysis.
+After preprocessing the raw data and visualizing it, we can start with our analysis.
Let's start with the first step in the aberrant splicing detection: the model
fitting.
@@ -465,32 +497,32 @@ latent space with a dimension $q=10$ .
Using the correct dimension is crucial to have the best performance (see
\ref{sec:encDim}). Alternatively, one can also use a PCA to correct the data.
The wrapper function \Rfunction{FRASER} both fits the model and calculates the
-p-values and z-scores for all $\psi$ types. For more details see section
+p-values for all $\psi$ types. For more details see section
\ref{sec:details}.

<>=
-# This is computational heavy on real size datasets and can take awhile
-fds <- FRASER(fds, q=c(psi5=3, psi3=5, theta=2))
+# This is computationally heavy on real datasets and can take some hours
+fds <- FRASER(fds, q=c(jaccard=3))
@

To check whether the correction worked, we can have a look at the correlation
heatmap using the normalized $\psi$ values from the fit.

<>=
-plotCountCorHeatmap(fds, type="psi5", normalized=TRUE, logit=TRUE)
+plotCountCorHeatmap(fds, type="jaccard", normalized=TRUE, logit=TRUE)
@

\subsubsection{Calling splicing outliers}

-Before we extract the results, we should add the human readable HGNC symbols.
+Before we extract the results, we should add HGNC symbols to the junctions.
\fraser{} comes already with an annotation function. The function uses
\Biocpkg{biomaRt} in the background to overlap the genomic ranges with the
known HGNC symbols. To have more flexibilty on the annotation, one can also
provide a custom `txdb` object to annotate the HGNC symbols.
Here we assume a beta binomial distribution and call outliers based on the
-significance level. The user can choose between a p value cutoff, a Z score
-cutoff or a cutoff on the $\Delta\psi$ values between the observed and expected
+significance level. The user can choose between a p-value cutoff,
+a cutoff on the $\Delta\psi$ values between the observed and expected
$\psi$ values or both.
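+For example, such cutoffs can be passed directly to the \Rfunction{results}
+call (a sketch using the cutoffs suggested in the quick guide above; adjust
+them to your own dataset):
+
+<<resultsWithCutoffsSketch, eval=FALSE>>=
+# keep only events with FDR <= 0.05 and |delta psi| >= 0.1
+res <- results(fds, padjCutoff=0.05, deltaPsiCutoff=0.1)
+@
+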
<>=
@@ -508,7 +540,7 @@ fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
res <- results(fds)
@

-\subsubsection{Interpreting the result table}
+\subsubsection{Interpreting the results table}

The function \Rfunction{results} retrieves significant events based on the
specified cutoffs as a \Rclass{GRanges} object which contains the genomic
@@ -517,45 +549,68 @@ the following additional information:
\begin{itemize}
    \item sampleID: the sampleID in which this aberrant event occurred
    \item hgncSymbol: the gene symbol of the gene that contains the splice
-    junction or site if available
+    junction or site, if available
    \item type: the metric for which the aberrant event was detected (either
-    psi5 for $\psi_5$, psi3 for $\psi_3$ or theta for $\theta$)
-    \item pValue, padjust, zScore: the p-value, adjusted p-value and z-score of
-    this event
-    \item psiValue: the value of $\psi_5$, $\psi_3$ or $\theta$ metric
-    (depending on the type column) of this junction or splice site for the
-    sample in which it is detected as aberrant
+    jaccard for the Intron Jaccard Index, psi5 for $\psi_5$, psi3 for $\psi_3$ or
+    theta for $\theta$)
+    \item pValue, padjust: the p-value and adjusted p-value (FDR) of
+    this event (at the intron or splice site level, depending on the metric)
+    \item pValueGene, padjustGene: only present in the gene-level results table,
+    gives the p-value and FDR-adjusted p-value at the gene level
+    \item psiValue: the value of the splice metric (see 'type' column for the
+    name of the metric) of this junction or splice site for the sample in which
+    it is detected as aberrant
    \item deltaPsi: the $\Delta\psi$-value of the event in this sample, which is
    the difference between the actual observed $\psi$ and the expected $\psi$
+    \item counts, totalCounts: the count (k) and total count (n) of the splice
+    junction or site for the sample where it is detected as aberrant
    \item meanCounts: the mean count (k) of reads mapping to this splice
    junction or site over all samples
    \item meanTotalCounts: the mean total count (n) of reads mapping to the
    same donor or acceptor site as this junction or site over all samples
-    \item counts, totalCounts: the count (k) and total count (n) of the splice
-    junction or site for the sample where it is detected as aberrant
+    \item nonsplitCounts, nonsplitProportion: only present for the Intron
+    Jaccard Index. States the sum of nonsplit counts overlapping either the
+    donor or acceptor site of the outlier intron for the sample where it is
+    detected as aberrant, and their proportion out of the total counts (N).
+    A high nonsplitProportion indicates possible (partial) intron retention.
+    \item FDR\_set: the set of genes on which FDR correction is applied. If not
+    otherwise specified, FDR correction is transcriptome-wide.
\end{itemize}

Please refer to section \ref{sec:Introduction} for more information about the
-metrics $\psi_5$, $\psi_3$ and $\theta$ and their definition. In general, an
+Intron Jaccard Index metric (or the previous metrics $\psi_5$, $\psi_3$ and
+$\theta$) and their definitions. In general, an
aberrant $\psi_5$ value might indicate aberrant acceptor site usage of the
junction where the event is detected; an aberrant $\psi_3$ value might indicate
aberrant donor site usage of the junction where the event is detected; and an
aberrant $\theta$ value might indicate partial or full intron retention, or
-exon truncation or elongation. We recommend using a genome browser to
-investigate interesting detected events in more detail.
As the Intron Jaccard Index combines the
+three metrics, an aberrant Intron Jaccard value can indicate any
+of the above-described cases. We recommend inspecting the outliers using IGV.
+\fraser{}2 also provides the function \Rfunction{plotBamCoverageFromResultTable}
+to create a sashimi plot for an outlier in the results table directly in R (if
+paths to BAM files are available in the \fds{} object).

<>=
-# to show result visualization functions for this tuturial, zScore cutoff used
-res <- results(fds, zScoreCutoff=2, padjCutoff=NA, deltaPsiCutoff=0.1)
+# for visualization purposes for this tutorial, no cutoffs were used
+res <- results(fds, all=TRUE)
res
+
+# for the gene-level p-values, gene symbols need to be added to the fds object
+# before calling the calculatePadjValues function (part of FRASER() function)
+# as FRASER() was called above before annotating genes, we rerun this step here
+fds <- calculatePadjValues(fds, type="jaccard", geneLevel=TRUE)
+# generate gene-level results table (if gene symbols have been annotated)
+res_gene <- results(fds, aggregate=TRUE, all=TRUE)
+res_gene
@

\subsection{Finding splicing candidates in patients}

-Let's hava a look at sample 10 and check if we got some splicing
+Let's have a look at sample 10 and check if we got some splicing
candidates for this sample.

<>=
-plotVolcano(fds, type="psi5", "sample10")
+plotVolcano(fds, type="jaccard", "sample10")
@

Which are the splicing events in detail?
@@ -568,13 +623,14 @@ sampleRes
To have a closer look at the junction level, use the following functions:

<>=
-plotExpression(fds, type="psi5", result=sampleRes[1])
-plotExpectedVsObservedPsi(fds, result=sampleRes[1])
+plotExpression(fds, type="jaccard", result=sampleRes[9]) # plots the 9th row
+plotSpliceMetricRank(fds, type="jaccard", result=sampleRes[9])
+plotExpectedVsObservedPsi(fds, result=sampleRes[9])
@

\subsection{Saving and loading a \fds{}}

-A \fds{} object can be easily saved and reloaded at any time as follows:
+A \fds{} object can be easily saved and reloaded as follows:

<>=
# saving a fds
@@ -595,7 +651,7 @@ fds <- loadFraserDataSet(file=file.path(workingDir(fds),
The function \Rfunction{FRASER} is a convenient wrapper function that takes
care of correcting for confounders, fitting the beta binomial distribution and
-calculating p-values and z-scores for all $\psi$ types. To have more control
+calculating p-values for all $\psi$ types. To have more control
over the individual steps, the different functions can also be called
separately. The following sections give a short explanation of these steps.
@@ -617,17 +673,17 @@ confounders in the data.
Currently the following methods are implemented:

<>=
# Using an alternative way to correct splicing ratios
-# here: only 2 iteration to speed the calculation up
-# for the vignette, the default is 15 iterations
-fds <- fit(fds, q=3, type="psi5", implementation="PCA-BB-Decoder",
+# here: only 2 iterations to speed the calculation up for the vignette
+# the default is 15 iterations
+fds <- fit(fds, q=3, type="jaccard", implementation="PCA-BB-Decoder",
    iterations=2)
@

\subsubsection{Finding the dimension of the latent space}
\label{sec:encDim}

-For the previous call, the dimension $q$ of the latent space has been fixed to
-$q=10$. Since working with the correct $q$ is very important, the \fraser{}
+For the previous call, the dimension $q$ of the latent space has been fixed.
+Since working with the correct $q$ is very important, the \fraser{}
package also provides the function \Rfunction{optimHyperParams} that can be
used to estimate the dimension $q$ of the latent space of the data. It works by
artificially injecting outliers into the data and then comparing the AUC of
@@ -638,17 +694,17 @@ for a subset of the dataset:
<>=
set.seed(42)
# hyperparameter opimization
-fds <- optimHyperParams(fds, type="psi5", plot=FALSE)
+fds <- optimHyperParams(fds, type="jaccard", plot=FALSE)

# retrieve the estimated optimal dimension of the latent space
-bestQ(fds, type="psi5")
+bestQ(fds, type="jaccard")
@

The results from this hyper parameter optimization can be visualized with the
function \Rfunction{plotEncDimSearch}.

<>=
-plotEncDimSearch(fds, type="psi5")
+plotEncDimSearch(fds, type="jaccard")
@

\subsection{P-value calculation}
@@ -669,8 +725,8 @@ computed as the product of the fitted correction values from the autoencoder
and the fitted mean adjustements.

<>=
-fds <- calculatePvalues(fds, type="psi5")
-head(pVals(fds, type="psi5"))
+fds <- calculatePvalues(fds, type="jaccard")
+head(pVals(fds, type="jaccard"))
@

Afterwards, adjusted p-values can be calculated. Multiple testing correction is
@@ -680,45 +736,35 @@ methods supported by \Rfunction{p.adjust} can be used via the \Robject{method}
argument.

<>=
-fds <- calculatePadjValues(fds, type="psi5", method="BY")
-head(padjVals(fds,type="psi5"))
+fds <- calculatePadjValues(fds, type="jaccard", method="BY")
+head(padjVals(fds, type="jaccard"))
@

-\subsection{Z-score calculation}
-\label{sec:Z-score-calculation}
-
-To calculate z-scores on the logit transformed $\Delta\psi$ values and to store
-them in the \fds{} object, the function \Rfunction{calculateZScores} can be
-called. The Z-scores can be used for visualization, filtering, and ranking of
-samples. The Z-scores are calculated as follows:
-
-\begin{equation}
-    z_{ij} = \frac{\delta_{ij} - \bar{\delta_j}}{sd(\delta_j)}
-\end{equation}
-\begin{equation*}
-    \delta_{ij} = logit{(\frac{k_{ij} + 1}{n_{ij} + 2})} - logit{(\mu_{ij})},
-\end{equation*}
-
-where $\delta_{ij}$ is the difference on the logit scale between the measured
-counts and the counts after correction for confounders and $\bar{\delta_j}$ is
-the mean of intron $j$.
+With FRASER 2.0 we introduce the option to limit FDR correction to a subset of
+genes based on prior knowledge, e.g. genes that contain a rare variant per
+sample.
To use this option, provide a list of genes per sample during FDR
+computation:

-<>=
-fds <- calculateZscore(fds, type="psi5")
-head(zScores(fds, type="psi5"))
+<>=
+genesOfInterest <- list("sample1"=c("XAB2", "PNPLA6", "STXBP2", "ARHGEF18"),
+    "sample2"=c("ARHGEF18", "TRAPPC5"))
+fds <- calculatePadjValues(fds, type="jaccard",
+    subsets=list("exampleSubset"=genesOfInterest))
+head(padjVals(fds, type="jaccard", subsetName="exampleSubset"))
@

\subsection{Result visualization}
\label{sec:result-vis}

-In addition to the plotting methods \Rfunction{plotVolcano},
+Besides the plotting methods \Rfunction{plotVolcano},
\Rfunction{plotExpression}, \Rfunction{plotExpectedVsObservedPsi},
+\Rfunction{plotSpliceMetricRank},
\Rfunction{plotFilterExpression} and \Rfunction{plotEncDimSearch} used above,
-the \fraser{} package provides two additional functions to visualize the
+the \fraser{} package provides additional functions to visualize the
results:

\Rfunction{plotAberrantPerSample} displays the number of aberrant events per
-sample based on the given cutoff values and \Rfunction{plotQQ} gives a
+sample across the whole cohort based on the given cutoff values and \Rfunction{plotQQ} gives a
quantile-quantile plot either for a single junction/splice site or globally.

<>=
@@ -731,6 +777,54 @@ plotQQ(fds, result=res[1])
plotQQ(fds, aggregate=TRUE, global=TRUE)
@

+The \Rfunction{plotManhattan} function can be used to visualize the p-values
+along with the genomic coordinates of the introns:
+<>=
+plotManhattan(fds, sampleID="sample10")
+plotManhattan(fds, sampleID="sample10", chr="chr19")
+@
+
+Finally, when one has access to the BAM files from which the split and unsplit
+counts of FRASER were created, the \Rfunction{plotBamCoverage} and
+\Rfunction{plotBamCoverageFromResultTable} functions use the \Rpackage{SGSeq}
+package to visualize the read coverage in the BAM file for a certain intron
+from the results table or within a given genomic region as a sashimi plot:
+<>=
+### plot coverage from bam file for a certain genomic region
+fds <- createTestFraserSettings()
+vizRange <- GRanges(seqnames="chr19",
+    IRanges(start=7587496, end=7598895),
+    strand="+")
+plotBamCoverage(fds, gr=vizRange, sampleID="sample3",
+    control_samples="sample2", min_junction_count=5,
+    curvature_splicegraph=1, curvature_coverage=1,
+    mar=c(1, 7, 0.1, 3))
+
+### plot coverage from bam file for a row in the result table
+fds <- createTestFraserDataSet()
+
+# load gene annotation
+require(TxDb.Hsapiens.UCSC.hg19.knownGene)
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
+require(org.Hs.eg.db)
+orgDb <- org.Hs.eg.db
+
+# get results table
+res <- results(fds, padjCutoff=NA, deltaPsiCutoff=NA)
+res_dt <- as.data.table(res)
+res_dt <- res_dt[sampleID == "sample2",]
+
+# plot full range of gene highlighting the outlier intron
+plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=TRUE,
+    txdb=txdb, orgDb=orgDb, control_samples="sample3")
+
+# plot only certain range around the outlier intron
+plotBamCoverageFromResultTable(fds, result=res_dt[1,], show_full_gene=FALSE,
+    control_samples="sample3", curvature_splicegraph=0.5, txdb=txdb,
+    curvature_coverage=0.5, right_extension=5000, left_extension=5000,
+    splicegraph_labels="id")
+@
+
\bibliography{bibliography}

\section{Session Info}
diff --git a/vignettes/IntronJaccardIndex_sketch.png b/vignettes/IntronJaccardIndex_sketch.png
new file mode 100644
index 00000000..79cca6a8
Binary files /dev/null and b/vignettes/IntronJaccardIndex_sketch.png differ