Skip to content

Commit 73abb98

Browse files
author
Daniel Cameron
committed
improved findBreakpointOverlaps() performance when both query and subject are large GRanges
1 parent 7cdd5b9 commit 73abb98

3 files changed

Lines changed: 46 additions & 51 deletions

File tree

DESCRIPTION

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
Package: StructuralVariantAnnotation
22
Type: Package
33
Title: VariantionAnnoration for Structural Variants
4-
Version: 0.7.1
5-
Date: 2018-08-23
4+
Version: 0.7.2
5+
Date: 2018-01-25
66
Author: Daniel Cameron
77
Maintainer: Daniel Cameron <daniel.l.cameron@gmail.com>
88
Description: StructuralVariantAnnotation contains useful helper
@@ -20,7 +20,8 @@ Depends:
2020
stats,
2121
R (>= 3.2.2),
2222
S4Vectors (>= 0.10.0),
23-
VariantAnnotation (>= 1.16.3)
23+
VariantAnnotation (>= 1.16.3),
24+
dplyr
2425
Imports:
2526
assertthat,
2627
Biostrings (>= 2.40.0),

R/BreakpointGRanges.R

Lines changed: 30 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -31,32 +31,15 @@ partner <- function(gr) {
3131
#'
3232
#'@export
3333
findBreakpointOverlaps <- function(query, subject, maxgap=-1L, minoverlap=0L, ignore.strand=FALSE, sizemargin=0.25, restrictMarginToSizeMultiple=0.5) {
34-
hitdf <- as.data.frame(findOverlaps(query, subject, maxgap=maxgap, minoverlap=minoverlap, type="any", select="all", ignore.strand=ignore.strand), row.names=NULL)
35-
# instead of running findOverlaps(partner(query), partner(subject), ...
36-
# we can reduce our runtime cost by just performing partner index lookups
37-
# partner lookups
38-
subjectPartnerIndexLookup <- seq_along(names(subject))
39-
names(subjectPartnerIndexLookup) <- names(subject)
40-
queryPartnerIndexLookup <- seq_along(names(query))
41-
names(queryPartnerIndexLookup) <- names(query)
42-
phitdf <- data.frame(
43-
queryHits=queryPartnerIndexLookup[query$partner[hitdf$queryHits]],
44-
subjectHits=subjectPartnerIndexLookup[subject$partner[hitdf$subjectHits]])
45-
hits <- rbind(hitdf, phitdf, make.row.names=FALSE)
34+
hits <- dplyr::bind_rows(
35+
as.data.frame(findOverlaps(query, subject, maxgap=maxgap, minoverlap=minoverlap, type="any", select="all", ignore.strand=ignore.strand), row.names=NULL),
36+
as.data.frame(findOverlaps(query, subject, maxgap=maxgap, minoverlap=minoverlap, type="any", select="all", ignore.strand=ignore.strand), row.names=NULL))
4637
# we now want to do:
4738
# hits <- hits[duplicated(hits),] # both breakends match
4839
# but for large hit sets (such as focal false positive loci) we run out of memory (>32GB)
49-
# instead, we sort then check that we match the previous record
50-
hits <- hits[base::order(hits$queryHits, hits$subjectHits), ]
51-
lg <- function(x) {
52-
if (length(x) == 0) {
53-
return(x)
54-
} else {
55-
return(c(-1, x[1:(length(x)-1)])) # -1 to ensure FALSE match instead of NA match
56-
}
57-
}
58-
isDup <- hits$queryHits == lg(hits$queryHits) & hits$subjectHits == lg(hits$subjectHits)
59-
hits <- hits[isDup,]
40+
# instead, we sort then check that we match the next record
41+
hits = hits %>% dplyr::arrange(queryHits, subjectHits) %>%
42+
dplyr::filter(!is.na(dplyr::lead(.$queryHits)) & !is.na(dplyr::lead(.$subjectHits)) & dplyr::lead(.$queryHits) == .$queryHits & dplyr::lead(.$subjectHits) == .$subjectHits)
6043
if (!is.null(sizemargin) && !is.na(sizemargin)) {
6144
# take into account confidence intervals when calculating event size
6245
callwidth <- .distance(query, partner(query))
@@ -343,29 +326,28 @@ calculateBlastHomology <- function(gr, ref, db, anchorLength=150) {
343326
#' @param gr breakpoint GRanges object. Can contain both breakpoint and single breakend SV records
344327
#'
345328
#'@export
346-
breakpointGRangesToVCF <- function(gr) {
347-
if (is.null(gr$insSeq)) {
348-
gr$insSeq = rep("", length(gr))
349-
}
350-
nominalgr = GRanges(seqnames=seqnames(gr), ranges=IRanges(start=(end(gr) + start(gr)) / 2, width=1))
351-
if (is.null(gr$REF)) {
352-
gr$REF = rep("N", length(gr))
353-
}
354-
gr$ALT[is.na(gr$ALT)] = ""
355-
if (is.null(gr$ALT)) {
356-
gr$ALT = rep("", length(gr))
357-
}
358-
gr$ALT[is.na(gr$ALT)] = ""
359-
gr$ALT[gr$ALT == ""] = .toVcfBreakendNotationAlt(gr)[gr$ALT == ""]
360-
ciposstart = start(gr) - start(nominalgr)
361-
ciposend = end(gr) - end(nominalgr)
362-
vcf = VCF(rowRanges=nominalgr, collapsed=FALSE)
363-
fixeddf = data.frame(
364-
ALT=gr$ALT,
365-
REF=gr$REF,
366-
QUAL=gr$QUAL,
367-
FILTER=gr$FILTER)
368-
369-
VCF(rowRanges = GRanges(), colData = DataFrame(), exptData = list(header = VCFHeader()), fixed = DataFrame(), info = DataFrame(), geno = SimpleList(), ..., collapsed=FALSE, verbose = FALSE
370-
}
329+
# breakpointGRangesToVCF <- function(gr) {
330+
# if (is.null(gr$insSeq)) {
331+
# gr$insSeq = rep("", length(gr))
332+
# }
333+
# nominalgr = GRanges(seqnames=seqnames(gr), ranges=IRanges(start=(end(gr) + start(gr)) / 2, width=1))
334+
# if (is.null(gr$REF)) {
335+
# gr$REF = rep("N", length(gr))
336+
# }
337+
# gr$ALT[is.na(gr$ALT)] = ""
338+
# if (is.null(gr$ALT)) {
339+
# gr$ALT = rep("", length(gr))
340+
# }
341+
# gr$ALT[is.na(gr$ALT)] = ""
342+
# gr$ALT[gr$ALT == ""] = .toVcfBreakendNotationAlt(gr)[gr$ALT == ""]
343+
# ciposstart = start(gr) - start(nominalgr)
344+
# ciposend = end(gr) - end(nominalgr)
345+
# vcf = VCF(rowRanges=nominalgr, collapsed=FALSE)
346+
# fixeddf = data.frame(
347+
# ALT=gr$ALT,
348+
# REF=gr$REF,
349+
# QUAL=gr$QUAL,
350+
# FILTER=gr$FILTER)
351+
# VCF(rowRanges = GRanges(), colData = DataFrame(), exptData = list(header = VCFHeader()), fixed = DataFrame(), info = DataFrame(), geno = SimpleList(), ..., collapsed=FALSE, verbose = FALSE)
352+
# }
371353

tests/testthat/test-BreakpointGRanges.R

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,18 @@ test_that("calculateBlastHomology", {
211211
#bh <- calculateBlastHomology(gr, hg19, "~/blastdb/16SMicrobial")
212212

213213
})
214+
test_that("performance_test_partner", {
215+
n = 10000
216+
gr = GRanges(
217+
seqnames="1",
218+
ranges=IRanges(start=1:(2*n), width=1),
219+
partner=c(paste0(1:n, "o"), paste0(1:n, "h")))
220+
names(gr)=c(paste0(1:n, "h"), paste0(1:n, "o"))
221+
tictoc::tic(paste0("Start", n))
222+
pgr = partner(gr)
223+
tictoc::toc()
224+
})
225+
214226

215227

216228

0 commit comments

Comments
 (0)