diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index bb5edd7..cf5b734 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -120,34 +120,67 @@ get_hierarchical_clusters <- function(cormat, min_cluster_corr = 0.8) { #' @noRd get_modularity <- function(Weight, B) { if (dim(Weight)[1] == 1) return(0) - + if (nrow(B) != nrow(Weight)) { + stop("B must have one row per variable in Weight") + } + if (any(rowSums(B) == 0)) { + stop("Each row of B must belong to one cluster") + } + index <- max.col(B, ties.method = "first") + get_modularity_partition(get_modularity_components(Weight), index, ncol(B)) +} + +get_modularity_components <- function(Weight) { W_pos <- Weight * (Weight > 0) W_neg <- Weight * (Weight < 0) - N <- dim(Weight)[1] K_pos <- colSums(W_pos) K_neg <- colSums(W_neg) m_pos <- sum(K_pos) m_neg <- sum(K_neg) - m <- m_pos + m_neg - - if (m == 0) return(0) - - # cate <- B %*% t(B) - cate <- tcrossprod(B) - - if (m_pos == 0 & m_neg == 0) return(0) - - if (m_pos == 0) { + list( + W_pos = W_pos, + W_neg = W_neg, + K_pos = K_pos, + K_neg = K_neg, + m_pos = m_pos, + m_neg = m_neg, + m = m_pos + m_neg + ) +} + +get_modularity_partition <- function(mod_components, index, n_cluster) { + if (nrow(mod_components$W_pos) == 1) return(0) + if (mod_components$m == 0) return(0) + + index_factor <- factor(index, levels = seq_len(n_cluster)) + block_sum <- function(W) { + row_sums <- rowsum(W, index_factor, reorder = TRUE) + vapply(seq_len(n_cluster), function(i) sum(row_sums[i, index == i]), numeric(1)) + } + group_sum <- function(x) { + as.numeric(rowsum(matrix(x, ncol = 1), index_factor, reorder = TRUE)) + } + + A_pos <- block_sum(mod_components$W_pos) + A_neg <- block_sum(mod_components$W_neg) + D_pos <- group_sum(mod_components$K_pos) + D_neg <- group_sum(mod_components$K_neg) + + if (mod_components$m_pos == 0 & mod_components$m_neg == 0) return(0) + + if (mod_components$m_pos == 0) { Q_positive <- 0 - Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg - } else if (m_neg == 0) { - Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos + Q_negative <- sum(A_neg - D_neg^2 / mod_components$m_neg) / mod_components$m_neg + } else if (mod_components$m_neg == 0) { + Q_positive <- sum(A_pos - D_pos^2 / mod_components$m_pos) / mod_components$m_pos Q_negative <- 0 } else { - Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos - Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg + Q_positive <- sum(A_pos - D_pos^2 / mod_components$m_pos) / mod_components$m_pos + Q_negative <- sum(A_neg - D_neg^2 / mod_components$m_neg) / mod_components$m_neg } - Q <- m_pos / m * Q_positive - m_neg / m * Q_negative + + Q <- mod_components$m_pos / mod_components$m * Q_positive - + mod_components$m_neg / mod_components$m * Q_negative return(Q) } @@ -165,15 +198,16 @@ get_n_cluster <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) { IND <- 1 Q <- 1 } else { - Q <- c() + Q <- numeric(m) if (ncol(Sigma) < 10) { m <- ncol(Sigma) + Q <- numeric(m) } + mod_components <- get_modularity_components(Sigma) for (i in 1:m) { index <- cutree(hc, i) - B <- sapply(1:i, function(t) as.numeric(index == t)) - Q[i] <- get_modularity(Sigma, B) + Q[i] <- get_modularity_partition(mod_components, index, i) } IND <- which(Q == max(Q)) diff --git a/tests/testthat/helper-plot-device.R b/tests/testthat/helper-plot-device.R new file mode 100644 index 0000000..71e1e67 --- /dev/null +++ b/tests/testthat/helper-plot-device.R @@ -0,0 +1,12 @@ +.colocboost_plot_device_env <- new.env(parent = emptyenv()) +.colocboost_plot_device_env$file <- tempfile("colocboost-test-plots-", fileext = ".pdf") +grDevices::pdf(.colocboost_plot_device_env$file) + +reg.finalizer(.colocboost_plot_device_env, function(e) { + while (grDevices::dev.cur() > 1) { + grDevices::dev.off() + } + unlink(e$file) + unlink("Rplots.pdf") + unlink(file.path("tests", "testthat", "Rplots.pdf")) +}, onexit = TRUE) diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R index 1b23c0f..74c11c6 100644 --- a/tests/testthat/test_inference.R +++ b/tests/testthat/test_inference.R @@ -237,6 +237,76 @@ test_that("get_hierarchical_clusters functions correctly", { expect_equal(result_all_high$n_cluster, 1) }) +test_that("block-sum modularity path matches reference modularity", { + set.seed(20260525) + + get_modularity_reference <- function(Weight, B) { + if (dim(Weight)[1] == 1) return(0) + W_pos <- Weight * (Weight > 0) + W_neg <- Weight * (Weight < 0) + K_pos <- colSums(W_pos) + K_neg <- colSums(W_neg) + m_pos <- sum(K_pos) + m_neg <- sum(K_neg) + m <- m_pos + m_neg + if (m == 0) return(0) + cate <- tcrossprod(B) + if (m_pos == 0 & m_neg == 0) return(0) + if (m_pos == 0) { + Q_positive <- 0 + Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg + } else if (m_neg == 0) { + Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos + Q_negative <- 0 + } else { + Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos + Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg + } + m_pos / m * Q_positive - m_neg / m * Q_negative + } + + get_n_cluster_reference <- function(hc, Sigma, m = ncol(Sigma), min_cluster_corr = 0.8) { + if (min(Sigma) > min_cluster_corr) { + return(list(n_cluster = 1, Qmodularity = 1)) + } + if (ncol(Sigma) < 10) { + m <- ncol(Sigma) + } + Q <- numeric(m) + for (i in seq_len(m)) { + index <- cutree(hc, i) + B <- sapply(seq_len(i), function(t) as.numeric(index == t)) + Q[i] <- get_modularity_reference(Sigma, B) + } + IND <- which(Q == max(Q)) + if (length(IND) > 1) IND <- IND[1] + list(n_cluster = IND, Qmodularity = Q) + } + + for (p in c(8, 20, 50)) { + Sigma <- matrix(rnorm(p * p), p, p) + Sigma <- (Sigma + t(Sigma)) / 2 + diag(Sigma) <- 1 + hc <- hclust(as.dist(1 - Sigma)) + + current <- get_n_cluster(hc, Sigma, m = p, min_cluster_corr = 0.999) + reference <- get_n_cluster_reference(hc, Sigma, m = p, min_cluster_corr = 0.999) + + expect_equal(current$n_cluster, reference$n_cluster) + expect_equal(current$Qmodularity, reference$Qmodularity, tolerance = 1e-10) + + for (k in c(1, ceiling(p / 3), p)) { + index <- cutree(hc, k) + B <- sapply(seq_len(k), function(t) as.numeric(index == t)) + direct <- get_modularity_reference(Sigma, B) + components <- get_modularity_components(Sigma) + block_sum <- get_modularity_partition(components, index, k) + expect_equal(block_sum, direct, tolerance = 1e-10) + expect_equal(get_modularity(Sigma, B), direct, tolerance = 1e-10) + } + } +}) + # Test get_ambiguous_colocalization function test_that("get_ambiguous_colocalization identifies ambiguous colocalizations correctly", { @@ -1176,4 +1246,4 @@ test_that("get_robust_ucos with different cutoffs produces expected ordering", { # The number of remaining ucos should be monotonically non-increasing expect_true(all(diff(n_ucos_remaining) <= 0)) -}) \ No newline at end of file +}) diff --git a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd index 00aade1..9b26e9a 100644 --- a/vignettes/ColocBoost_Wrapper_Pipeline.Rmd +++ b/vignettes/ColocBoost_Wrapper_Pipeline.Rmd @@ -23,7 +23,7 @@ This vignette demonstrates how to use the bioinformatics pipeline for ColocBoost Acknowledgment: Thanks to Kate (Kathryn) Lawrence (GitHub:@kal26) for her contributions to this vignette. -# 1. Loading Data using `colocboost_analysis_pipeline` function +# 1. Loading Data using `colocboost_pipeline` function This function harmonizes the input data and prepares it for colocalization analysis. @@ -32,7 +32,7 @@ In this section, we introduce how to load the regional data required for the Col This function loads mixed datasets for a specific region, including individual-level data (genotype, phenotype, covariate data), summary statistics (sumstats, LD), or a combination of both. It runs `load_regional_univariate_data` for each individual-level dataset and `load_rss_data` for each summary statistics dataset. The output is a list with `individual_data` and `sumstat_data` components, where `individual_data` is a list of individual-level data and `sumstat_data` is a list of summary statistics data. -This list is then passed to the `colocboost_analysis_pipeline` function for the colocalization analysis. +This list is then passed to the `colocboost_pipeline` function for the colocalization analysis. Below are the input parameters for this function for loading individual-level data: @@ -199,9 +199,9 @@ The LD metadata file is a tab-separated file with the following columns: ``` -# 2. Perform ColocBoost using `colocboost_analysis_pipeline` function +# 2. Perform ColocBoost using `colocboost_pipeline` function -In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_analysis_pipeline` function. +In this section, we load region data for a combination of individual-level and summary statistics data, then perform the colocalization analysis using the `colocboost_pipeline` function. The colocalization analysis can be run in any one of three modes, or in a combination of these modes (names assume that individual-level data is xQTL data and summary statistics data is GWAS data): - **`xQTL-only mode`**: Only perform colocalization analysis on the individual-level data. Summary statistics data is not used. @@ -226,7 +226,7 @@ Example: for sQTL, `list(type_pattern = ".*clu_(\\d+_[+-?]).*", valid_pattern = Outputs: -- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_analysis_pipeline` function. If the mode is not run, the corresponding element will be `NULL`. +- **`colocboost_results`**: List of colocboost objects (with `xqtl_coloc`, `joint_gwas`, `separate_gwas`); Output of the `colocboost_pipeline` function. If the mode is not run, the corresponding element will be `NULL`. ```{r, colocboost-analysis, eval = FALSE} #### Please check the example code below #### @@ -263,7 +263,7 @@ pip_cutoff_to_skip_sumstat = rep(0, length(sumstat_path_list)) qc_method = "rss_qc" # run colocboost analysis -colocboost_results <- colocboost_analysis_pipeline( +colocboost_results <- colocboost_pipeline( region_data_combined, maf_cutoff = maf_cutoff, pip_cutoff_to_skip_ind = pip_cutoff_to_skip_ind,