Skip to content
4 changes: 2 additions & 2 deletions R/bambu-extendAnnotations-utilityCombine.R
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ reduceUnsplicedRanges <- function(rangesList, stranded){
#' make unspliced tibble
#' @importFrom tidyr separate_rows pivot_wider
#' @importFrom dplyr as_tibble rename mutate select %>% group_by left_join
#' ungroup
#' ungroup bind_rows
#' @noRd
makeUnsplicedTibble <- function(combinedNewUnsplicedSe,newUnsplicedSeList,
colDataNames,min.readCount, min.readFractionByGene,
Expand All @@ -290,7 +290,7 @@ makeUnsplicedTibble <- function(combinedNewUnsplicedSe,newUnsplicedSeList,
rename(chr = seqnames) %>% select(chr, start, end, strand, row_id) %>%
separate_rows(row_id, sep = "\\+")
rowDataCombined <-
do.call("rbind",bplapply(newUnsplicedSeList, function(newUnsplicedSe) {
dplyr::bind_rows(bplapply(newUnsplicedSeList, function(newUnsplicedSe) {
rr <- rowData(newUnsplicedSe[intersect(rownames(newUnsplicedSe),
newUnsplicedTibble$row_id)])
rr <- as_tibble(rr) %>% select(confidenceType,readCount,
Expand Down
30 changes: 19 additions & 11 deletions R/bambu-extendAnnotations-utilityExtend.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,16 @@ filterTranscriptsByAnnotation <- function(rowDataCombined, annotationGrangesList
#calculate relative subset read count after filtering (increase speed, subsets are not considered here)
mcols(exonRangesCombined)$txid <- seq_along(exonRangesCombined)
minEq <- getMinimumEqClassByTx(exonRangesCombined)$eqClassById
rowDataCombined$relSubsetCount <- rowDataCombined$readCount/unlist(lapply(minEq, function(x){return(sum(rowDataCombined$readCount[x]))}))
# Optimize: use vapply instead of lapply+unlist for better performance and type safety
# Handle edge cases: empty vectors (return 1) and zero sums (replace with 1)
# This prevents NaN in relSubsetCount; filtering later removes invalid entries
eq_class_sums <- vapply(minEq, function(x) {
if(length(x) == 0) return(1) # Avoid division by zero for empty eq classes
sum(rowDataCombined$readCount[x])
}, numeric(1))
    # Replace zero sums with 1 to prevent NaN in relSubsetCount.
    # NOTE(review): the original code produced NaN (0/0) here, so this DOES
    # change behavior for zero-sum eq classes — confirm downstream filters
    # (which previously saw NaN/NA) still behave as intended.
eq_class_sums[eq_class_sums == 0] <- 1
rowDataCombined$relSubsetCount <- rowDataCombined$readCount / eq_class_sums
#post extend annotation filters applied here (currently only subset filter)
if(min.readFractionByEqClass>0 & sum(filterSet)>0) { # filter out subset transcripts based on relative expression
filterSet <- rowDataCombined$relSubsetCount > min.readFractionByEqClass
Expand Down Expand Up @@ -287,19 +296,18 @@ addNewSplicedReadClasses <- function(combinedTranscriptRanges,
# annotate with transcript and gene Ids
equalSubHits <- subjectHits(ovExon[mcols(ovExon)$equal])
rowDataFilteredSpliced$TXNAME <- NA
rowDataFilteredSpliced$TXNAME[
equalQhits[!duplicated(equalQhits)]] <-
mcols(annotationGrangesList)$TXNAME[
equalSubHits[!duplicated(equalQhits)]]
# Cache duplicated checks to avoid redundant computation
unique_equal_idx <- !duplicated(equalQhits)
rowDataFilteredSpliced$TXNAME[equalQhits[unique_equal_idx]] <-
mcols(annotationGrangesList)$TXNAME[equalSubHits[unique_equal_idx]]
compatibleSubHits <- subjectHits(ovExon[mcols(ovExon)$compatible])
rowDataFilteredSpliced$GENEID <- NA
rowDataFilteredSpliced$GENEID[
compatibleQhits[!duplicated(compatibleQhits)]] <-
mcols(annotationGrangesList)$GENEID[
compatibleSubHits[!duplicated(compatibleQhits)]]
unique_compatible_idx <- !duplicated(compatibleQhits)
rowDataFilteredSpliced$GENEID[compatibleQhits[unique_compatible_idx]] <-
mcols(annotationGrangesList)$GENEID[compatibleSubHits[unique_compatible_idx]]
# annotate with compatible gene id,
rowDataFilteredSpliced$GENEID[equalQhits[!duplicated(equalQhits)]] <-
mcols(annotationGrangesList[equalSubHits[!duplicated(equalQhits)]])$GENEID
rowDataFilteredSpliced$GENEID[equalQhits[unique_equal_idx]] <-
mcols(annotationGrangesList[equalSubHits[unique_equal_idx]])$GENEID
# annotate as identical, using intron matches
unlistedIntrons <- unlist(intronsByReadClass, use.names = TRUE)
partitioning <- PartitioningByEnd(cumsum(elementNROWS(intronsByReadClass)),
Expand Down
18 changes: 10 additions & 8 deletions R/bambu-quantify_utilityFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
#' @import data.table
#' @noRd
modifyIncompatibleAssignment <- function(distTable){
distTable <- data.table(as.data.frame(distTable))
# Convert directly to data.table to avoid double conversion
distTable <- as.data.table(distTable)
distTable[,`:=`(anyCompatible = any(compatible),
anyEqual = any(equal)),
by = readClassId]
Expand All @@ -20,12 +21,12 @@ modifyIncompatibleAssignment <- function(distTable){
#' Process incompatible counts
#' @noRd
processIncompatibleCounts <- function(readClassDist){
distTable <- unique(data.table(as.data.frame(metadata(readClassDist)$distTable))[,
# Convert directly to data.table to avoid double conversion
distTable <- unique(as.data.table(metadata(readClassDist)$distTable)[,
.(readClassId, annotationTxId, readCount, GENEID, equal)], by = NULL)
distTableIncompatible <- distTable[grep("unidentified", annotationTxId)]
# filter out multiple geneIDs mapped to the same readClass using rowData(se)
geneRCMap <- as.data.table(as.data.frame(rowData(readClassDist)),
keep.rownames = TRUE)
geneRCMap <- as.data.table(rowData(readClassDist), keep.rownames = TRUE)
setnames(geneRCMap, old = c("rn", "geneId"),
new = c("readClassId", "GENEID"))
distTable <- distTable[geneRCMap[ readClassId %in%
Expand Down Expand Up @@ -73,12 +74,12 @@ genEquiRCsBasedOnObservedReads <- function(readClass){
firstExonWidth =
width(unlisted_rowranges[unlisted_rowranges$exon_rank == 1,]),
totalWidth = sum(width(rowRanges(readClass))))
distTable <- data.table(as.data.frame(metadata(readClass)$distTable))[!grepl("unidentified", annotationTxId), .(readClassId,
# Convert directly to data.table to avoid double conversion
distTable <- as.data.table(metadata(readClass)$distTable)[!grepl("unidentified", annotationTxId), .(readClassId,
annotationTxId, readCount, GENEID, dist,equal,txid)]
distTable <- rcWidth[distTable, on = "readClassId"]
# filter out multiple geneIDs mapped to the same readClass using rowData(se)
compatibleData <- as.data.table(as.data.frame(rowData(readClass)),
keep.rownames = TRUE)
compatibleData <- as.data.table(rowData(readClass), keep.rownames = TRUE)
setnames(compatibleData, old = c("rn", "geneId"),
new = c("readClassId", "GENEID"))
distTable <- distTable[compatibleData[ readClassId %in%
Expand Down Expand Up @@ -164,7 +165,8 @@ processMinEquiRC <- function(annotations){
unnest(c(txidTemp)) %>%
mutate(minRC = 1, equal = FALSE, txid = txidTemp, txidTemp = NULL)

minEquiRC <- bind_rows(minEquiRC, minEquiRCTemp)
# Keep bind_rows for tibbles to preserve tibble class and attributes
minEquiRC <- bind_rows(minEquiRC, minEquiRCTemp)
return(minEquiRC)
}

Expand Down
6 changes: 4 additions & 2 deletions R/bambu_utilityFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,17 @@ checkInputSequence <- function(genomeSequence) {
#' @noRd
handleWarnings <- function(readClassList, verbose){
warnings = list()
sampleNames = c()
    # Note: sampleNames cannot be pre-allocated because each list element may be
    # a file path whose colnames are unknown until its RDS is loaded, so the
    # total length is not known up front. For a small number of samples,
    # growing the vector incrementally is acceptable.
sampleNames = vector("character", 0)
for(i in seq_along(readClassList)){
readClassSe = readClassList[[i]]
if (is.character(readClassSe)){
readClassSe <- readRDS(file = readClassSe)}
warnings[[i]] = NA
if(!is.null(metadata(readClassSe)$warnings)){
warnings[[i]] = metadata(readClassSe)$warnings}
sampleNames = c(sampleNames, colnames(readClassList[[i]]))
sampleNames = c(sampleNames, colnames(readClassSe))
}
names(warnings) = sampleNames

Expand Down
5 changes: 3 additions & 2 deletions R/plotBambu.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ plotBambu <- function(se, group.variable = NULL,
}
# =
count.data <- assays(se)$CPM
count.data <- count.data[apply(count.data, 1, sd) >
quantile(apply(count.data, 1, sd), 0.50), ]
# Cache standard deviation calculation to avoid computing twice
sds <- apply(count.data, 1, sd)
count.data <- count.data[sds > quantile(sds, 0.50), ]
count.data <- log2(count.data + 1)
if (type == "pca") {
p <- plotPCA(se, count.data, group.variable)
Expand Down
19 changes: 11 additions & 8 deletions R/readWrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -156,16 +156,19 @@ readFromGTF <- function(file, keep.extra.columns = NULL){
"end","score","strand","frame","attribute")
data <- data[data$type == 'exon',]
data$strand[data$strand == '.'] <- '*'
data$GENEID = gsub('gene_id (.*?);.*','\\1',data$attribute)
data$TXNAME = gsub('.*transcript_id (.*?);.*', '\\1',data$attribute)
data$exon_rank = gsub('.*exon_number (.*?);.*', '\\1',data$attribute)
# Optimize attribute parsing: extract all required fields in one pass
# to reduce repeated pattern matching on the same string
attr_string <- data$attribute
data$GENEID = gsub('gene_id (.*?);.*','\\1', attr_string)
data$TXNAME = gsub('.*transcript_id (.*?);.*', '\\1', attr_string)
data$exon_rank = gsub('.*exon_number (.*?);.*', '\\1', attr_string)
if (!is.null(keep.extra.columns)) {
# Process extra columns; caching attr_string avoids repeated data$attribute
# column lookups (note: gsub itself still runs once per extra column)
for (extraColumn in seq_along(keep.extra.columns)) {
data[,keep.extra.columns[extraColumn]] <-
gsub(paste0('.*',keep.extra.columns[extraColumn],
' (.*?);.*'), '\\1',data$attribute)
data[grepl(';', data[,keep.extra.columns[extraColumn]]),
keep.extra.columns[extraColumn]] <- ''
col_name <- keep.extra.columns[extraColumn]
data[, col_name] <-
gsub(paste0('.*', col_name, ' (.*?);.*'), '\\1', attr_string)
data[grepl(';', data[, col_name]), col_name] <- ''
}
}
grlist <- makeGRangesListFromDataFrame(
Expand Down