sterrettJD · sterrettJD · Apr 24, 2025 · Apr 9, 2025 · Apr 9, 2025 · Apr 9, 2025
diff --git a/.github/workflows/check-bioc.yaml b/.github/workflows/check-bioc.yaml
@@ -175,7 +175,11 @@ jobs:
 
           ## Pass #1 at installing dependencies
           message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
-          remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
+          if (Sys.info()["sysname"] == "Windows") {
+            remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE, force = TRUE)
+          } else {
+            remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
+          }
         continue-on-error: true
         shell: Rscript {0}
 

diff --git a/R/mtxDE.R b/R/mtxDE.R
diff --git a/benchmarking/benchmark_mtxDE_with_paired_mgx.qmd b/benchmarking/benchmark_mtxDE_with_paired_mgx.qmd
@@ -0,0 +1,117 @@
+---
+title: "Benchmark mtxDE with paired metagenomics data"
+author: "Kat Terwelp"
+format: html
+editor: visual
+---
+
+## Benchmark mtxDE with paired metagenomics data
+
+```{r setup, include=FALSE}
+# Load required libraries
+library(dplyr)
+library(stringr)
+library(ggplot2)
+library(caret)
+```
+
+**Step One: Download Data**
+
+```{r}
+# if synth_mgx_mtx data is not already downloaded, download it
+if (!dir.exists("synth_mgx_mtx")){
+utils::download.file(url = "http://199.94.60.28/MTX_2021/synth_mgx_mtx.tar.gz", 
+              destfile = "synth_mgx_mtx.tar.gz")
+utils::untar(tarfile = "synth_mgx_mtx.tar.gz", exdir = "synth_mgx_mtx")
+}
+```
+
+**Step Two: Import metadata, feature table, and dna table**
+
+```{r}
+# Read in null df for RNA features
+null.df <- data.table::fread("synth_mgx_mtx/group-null-enc.mtx_abunds_groups.tsv",
+                             data.table=F)
+null.mgx.df <- data.table::fread("synth_mgx_mtx/group-null-enc.mgx_abunds_groups.tsv",
+                                 data.table=F)
+
+# Create metadata
+metadata <- data.frame(phenotype=t(null.df[1,]),
+                       rna_depth=t(null.df[2,]),
+                       dna_depth=t(null.mgx.df[2,]))
+metadata <- metadata[2:nrow(metadata),]
+colnames(metadata) <- c("phenotype", "rna_depth", "dna_depth")
+metadata$SampleID <- rownames(metadata)
+metadata$phenotype <- as.numeric(metadata$phenotype)
+
+# Filter the df into a feature table where the samples are rows
+null.feature.table <- null.df[3:nrow(null.df),]
+null.feature.table <- as.data.frame(t(null.feature.table))
+colnames(null.feature.table) <- null.feature.table["#",]
+null.feature.table <- null.feature.table[2:nrow(null.feature.table),]
+null.feature.table <- dplyr::mutate_all(null.feature.table, as.numeric)
+
+
+# Filter the dna df into a dna table where the samples are rows
+null.dna.table <- null.mgx.df[3:nrow(null.mgx.df),]
+null.dna.table <- as.data.frame(t(null.dna.table))
+colnames(null.dna.table) <- null.dna.table["#",]
+null.dna.table <- null.dna.table[2:nrow(null.dna.table),]
+null.dna.table<- dplyr::mutate_all(null.dna.table, as.numeric)
+```
+
+```{r}
+# make sure there's no NA values
+null.feature.table[is.na(null.feature.table)] <- 0
+null.dna.table[is.na(null.dna.table)] <- 0
+# transform into relative abundance
+null.feature.table.rel <- null.feature.table/rowSums(null.feature.table)
+null.dna.table.rel <- null.dna.table/rowSums(null.dna.table)
+
+
+```
+
+```{r}
+# run the null model if not already available
+if (!file.exists("null_res.RDS")){
+null.res <- run_mtxDE(formula="phenotype", 
+                      feature.table=null.feature.table.rel,
+                      metadata=metadata, sampleID="SampleID", reg.method = "zibr",
+                      ncores = 4,
+                      dna.table = null.dna.table.rel
+                      )
+
+saveRDS(null.res, file = "null_res.RDS")
+} else {
+  null.res <- readRDS("null_res.RDS")
+}
+```
+
+```{r}
+hist(null.res$q)
+
+null.res <- null.res %>%
+  dplyr::mutate(mgx = dplyr::if_else(stringr::str_detect(term, "_mgx$"), TRUE, FALSE),
+                significant = if_else(q < 0.05, TRUE, FALSE))
+
+null.res %>% 
+  ggplot2::ggplot(aes(x = mgx)) +
+  ggplot2::geom_bar(ggplot2::aes(fill = significant))
+
+null.res.filt <- null.res %>% 
+  dplyr::filter(term != "intercept") %>% 
+  dplyr::filter(parameter == "beta") %>% 
+  dplyr::mutate(expected_significance = mgx)
+
+null.res.filt %>% 
+  ggplot2::ggplot(aes(x = mgx)) +
+  ggplot2::geom_bar(ggplot2::aes(fill = significant))
+
+# confusion matrix
+predicted <- factor(null.res.filt$significant)
+expected <- factor(null.res.filt$expected_significance)
+
+caret::confusionMatrix(data=predicted, reference = expected)
+
+
+```
diff --git a/benchmarking/paired_mtxDE_sims.Rmd b/benchmarking/paired_mtxDE_sims.Rmd
@@ -0,0 +1,117 @@
+---
+title: "paired_mtxDE_sims"
+author: "John Sterrett"
+date: "2025-04-17"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(ggplot2)
+```
+
+# Simulate normal data
+```{r}
+set.seed(42)
+
+simulate_norm_run_lm <- function(n=1000, var_explain_by_x=1){
+    x <- rnorm(n=n)
+    y_noise <-  (1-var_explain_by_x) * rnorm(n=n)
+    y <- (var_explain_by_x * x) + y_noise
+
+    df <- data.frame(x=x,y=y)
+
+    return(broom::tidy(lm(y ~ x, df)))
+}
+
+# set parameters
+N <- 1E5
+vars_explained <- seq(-2,2,0.1)
+results <- data.frame(matrix(nrow=0, ncol=6))
+colnames(results) <- c("term", "estimate", "std.error", 
+                       "statistic", "p.value", "true_effect")
+
+# try simulating with different variance explained
+for (var in vars_explained){
+    new_res <- simulate_norm_run_lm(n=N, var_explain_by_x=var)
+    new_res$true_effect <- var
+    results <- rbind(results, new_res)
+}
+
+results %>%
+    filter(term=="x") %>%
+    ggplot(mapping=aes(x=true_effect, y=estimate)) +
+    geom_point()
+
+```
+
+
+```{r}
+x <- rnbinom(n=1000, size=5, mu=100)
+hist(x)
+zero_indexes <- rbinom(n=1000, size=1, prob=0.2)
+x[which(zero_indexes==1)] <- 0
+hist(x)
+
+simulate_zero_inflated_rnbinom <- function(n=1000, size=5, mu=100, zero_prob=0.2, var_explain_by_x=1){
+    x <- rnbinom(n=n, size=size, mu=mu)
+
+    y_base <- (1-var_explain_by_x) * rnbinom(n=n, size=size, mu=mu)
+    # add effect of x
+    y <- (var_explain_by_x * x) + y_base
+    # zero inflate our data
+    y[which(rbinom(n, size=1, prob=zero_prob)==1)] <- 0
+
+    df <- data.frame(x=x,y=y)
+    return(df)
+}
+
+
+simulate_zi_rnbinom_run_lm <- function(n=1000, size=5, mu=100, 
+                                       zero_prob=0.2,
+                                       var_explain_by_x=1){
+    df <- simulate_zero_inflated_rnbinom(n, size, mu, zero_prob, var_explain_by_x)
+    return(broom::tidy(lm(y ~ x, df)))
+}
+
+df <- simulate_zero_inflated_rnbinom(var_explain_by_x=0.4)
+ggplot(df, mapping=aes(x=x, y=y)) +
+    geom_point() +
+    geom_smooth(method="lm")
+
+
+
+# set parameters
+N <- 200
+vars_explained <- seq(-2,2,0.1)
+results <- data.frame(matrix(nrow=0, ncol=6))
+colnames(results) <- c("term", "estimate", "std.error", 
+                       "statistic", "p.value", "true_effect")
+
+# try simulating with different variance explained
+for (var in vars_explained){
+    new_res <- simulate_rnbinom_run_lm(n=N, 
+                                       size=1,
+                                       var_explain_by_x=var)
+    new_res$true_effect <- var
+    results <- rbind(results, new_res)
+}
+
+results %>%
+    filter(term=="x") %>%
+    ggplot(mapping=aes(x=true_effect, y=estimate)) +
+    geom_point()
+
+```
+
+# MGX-MTX
+```{r}
+# simulate MGX ~ neg binomial with zero inflation
+
+# simulate MTX ~ MGX * noise (multiply to keep zeros zero)
+
+# simulate MTX ~ MGX * noise * treatment_effect
+
+
+```
diff --git a/man/check_data_mtxDE.Rd b/man/check_data_mtxDE.Rd
diff --git a/man/dot-add_dna_to_formula.Rd b/man/dot-add_dna_to_formula.Rd
diff --git a/man/dot-prepare_data_mtxDE.Rd b/man/dot-prepare_data_mtxDE.Rd