From 1e88b61b2acc0718868bfdcea70e4a5739196c8f Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Tue, 12 May 2026 08:12:38 -0400 Subject: [PATCH 1/4] Update colocboost_plot.R --- R/colocboost_plot.R | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index 1f0f7c1..ebe1644 100644 --- a/R/colocboost_plot.R +++ b/R/colocboost_plot.R @@ -711,7 +711,10 @@ plot_initial <- function(cb_plot_input, y = "log10p", # - set data and x-lab and y-lab if (y == "log10p") { plot_data <- lapply(cb_plot_input$Zscores, function(z) { - -log10(2 * pnorm(-abs(z))) + y_val <- -log10(2 * pnorm(-abs(z))) + max_finite <- max(y_val[is.finite(y_val)], na.rm = TRUE) + y_val[!is.finite(y_val)] <- max_finite + return(y_val) }) ylab <- "-log10(p)" } else if (y == "z_original") { @@ -765,7 +768,11 @@ plot_initial <- function(cb_plot_input, y = "log10p", ymin <- rep(args$ylim[1], length(args$y)) } else { ymax <- NULL - ymin <- rep(0, length(args$y)) + if (y %in% c("z_original", "coef")) { + ymin <- sapply(plot_data, function(p) min(p[is.finite(p)], na.rm = TRUE) * 1.05) + } else { + ymin <- rep(0, length(args$y)) + } # Check if ylim_each is FALSE but no ylim is provided if (!ylim_each) { @@ -784,9 +791,7 @@ plot_initial <- function(cb_plot_input, y = "log10p", } return(ymax) }) - if (y == "coef") { - ymin <- sapply(plot_data, function(p) min(p) * 1.05) - } + } args$ymax <- ymax args$ymin <- ymin From 974279e3d30673e73493d703f7b7721b7a47a023 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Wed, 20 May 2026 11:24:06 -0400 Subject: [PATCH 2/4] trim Xref before pre-calculating Trim X_ref supersets before any expensive reference processing. --- R/colocboost.R | 26 ++++++++++++++++++++++++++ tests/testthat/test_Xref.R | 21 +++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/R/colocboost.R b/R/colocboost.R index 01693dc..542103c 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -734,6 +734,32 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, } else { if (is.data.frame(X_ref)) X_ref <- as.matrix(X_ref) if (is.matrix(X_ref)) X_ref <- list(X_ref) + + # Trim X_ref supersets before any expensive reference processing. + # ColocBoost later matches each sumstat by variant name, so columns + # absent from every mapped sumstat cannot affect the fit. + get_needed_sumstat_variants <- function(ref_idx) { + if (length(X_ref) == 1) { + ss_idx <- seq_along(sumstat) + } else if (length(X_ref) == length(sumstat)) { + ss_idx <- ref_idx + } else if (!is.null(dict_sumstatLD) && !is.null(dim(dict_sumstatLD))) { + ss_idx <- unique(dict_sumstatLD[dict_sumstatLD[, 2] == ref_idx, 1]) + } else { + return(NULL) + } + unique(unlist(lapply(sumstat[ss_idx], function(ss) ss$variant), use.names = FALSE)) + } + for (idx in seq_along(X_ref)) { + xref_variants <- colnames(X_ref[[idx]]) + if (is.null(xref_variants)) next + needed_variants <- get_needed_sumstat_variants(idx) + if (is.null(needed_variants) || length(needed_variants) == 0) next + keep_variants <- intersect(xref_variants, needed_variants) + if (length(keep_variants) > 0 && length(keep_variants) < ncol(X_ref[[idx]])) { + X_ref[[idx]] <- X_ref[[idx]][, keep_variants, drop = FALSE] + } + } # When N_ref >= P, precompute LD (avoids repeated O(N_ref*P) in boosting loop) # When N_ref < P, keep X_ref for on-the-fly computation (avoids P*P memory) diff --git a/tests/testthat/test_Xref.R b/tests/testthat/test_Xref.R index 5d30abc..b5cb8a6 100644 --- a/tests/testthat/test_Xref.R +++ b/tests/testthat/test_Xref.R @@ -113,6 +113,27 @@ test_that("X_ref with N_ref >= P precomputes LD and produces valid results", { expect_equal(length(result_xref$data_info$variables), 30) }) +test_that("X_ref superset is trimmed before LD precomputation", { + test_data <- generate_xref_test_data(n_ref = 80, p = 20) + extra_ref <- matrix(rnorm(nrow(test_data$X_ref) * 35), nrow(test_data$X_ref), 35) + colnames(extra_ref) <- paste0("EXTRA", seq_len(ncol(extra_ref))) + X_ref_superset <- cbind(test_data$X_ref, extra_ref) + + validated <- suppressMessages( + colocboost_validate_input_data( + sumstat = test_data$sumstat, + X_ref = X_ref_superset + ) + ) + + expected_variants <- unique(unlist(lapply(test_data$sumstat, function(ss) ss$variant))) + expect_equal(validated$ref_label, "LD") + expect_null(validated$X_ref) + expect_equal(ncol(validated$LD[[1]]), length(expected_variants)) + expect_setequal(colnames(validated$LD[[1]]), expected_variants) + expect_false(any(grepl("^EXTRA", colnames(validated$LD[[1]])))) +}) + # ============================================================================ # Test 3: X_ref with N_ref < P keeps X_ref for on-the-fly computation From 821e5b98ad2391cf09e3f1f9ec94a31f613dcdd4 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Wed, 20 May 2026 14:12:09 -0400 Subject: [PATCH 3/4] Update colocboost_inference.R simple profiling computational improvement --- R/colocboost_inference.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index a837d11..bb5edd7 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -474,7 +474,7 @@ get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NU X_sub2 <- scale(X[, pos2, drop = FALSE], center = T, scale = F) value <- abs(get_matrix_mult(X_sub1, X_sub2)) } else { - if (identical(ref_label, "No_ref") || sum(Xcorr) == 1) { + if (identical(ref_label, "No_ref") || (length(Xcorr) == 1 && Xcorr == 1)) { value <- 0 } else { if (length(miss_idx) != 0) { From b7dd40628f97c61b748fec053e7507195b4eea41 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Wed, 20 May 2026 14:48:56 -0400 Subject: [PATCH 4/4] improve of computational --- R/colocboost_init.R | 19 ++++++++++++++----- R/colocboost_update.R | 24 ++++++++++++++++++------ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 077a217..403e114 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -163,11 +163,16 @@ colocboost_init_model <- function(cb_data, "ld_jk" = c(), "jk" = c(), "scaling_factor" = if (!is.null(cb_data$data[[i]]$N)) (cb_data$data[[i]]$N - 1) else 1, - "beta_scaling" = if (!is.null(cb_data$data[[i]]$N)) 1 else 100 + "beta_scaling" = if (!is.null(cb_data$data[[i]]$N)) 1 else 100, + "XtX_beta_cache" = NULL ) data_each <- cb_data$data[[i]] X_dict <- cb_data$dict[i] + if (!is.null(cb_data$data[[X_dict]]$XtX)) { + tmp$XtX_beta_cache <- rep(0, P - length(data_each$variable_miss)) + } + # - calculate change of loglikelihood for data tmp$change_loglike <- estimate_change_profile( X = cb_data$data[[X_dict]]$X, Y = data_each[["Y"]], N = data_each$N, @@ -184,6 +189,7 @@ colocboost_init_model <- function(cb_data, XtX = cb_data$data[[X_dict]]$XtX, beta_k = tmp$beta, miss_idx = data_each$variable_miss, + XtX_beta_cache = tmp$XtX_beta_cache, ref_label = cb_data$data[[X_dict]]$ref_label ) # - initial z-score between X and residual based on correlation @@ -240,7 +246,8 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01, jk_equiv_corr = 0.8, jk_equiv_loglik = 1, func_compare = "min_max", - coloc_thresh = 0.1) { + coloc_thresh = 0.1, + ld_mismatch = "none") { ################# initialization ####################################### # - sample size N <- sapply(cb_data$data, function(dt) dt$N) @@ -312,7 +319,8 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01, "func_multi_test" = func_multi_test, "multi_test_thresh" = multi_test_thresh, "multi_test_max" = multi_test_max, - "model_used" = "original" + "model_used" = "original", + "ld_mismatch" = ld_mismatch ) class(cb_model_para) <- "colocboost" @@ -391,7 +399,7 @@ get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, } if (identical(ref_label, "No_ref")) { var_r <- YtY - 2 * sum(beta_k * XtY) + sum(beta_k^2) - } else if (!is.null(XtX_beta_cache)) { + } else if (!is.null(XtX_beta_cache) && length(XtX_beta_cache) == length(beta_k)) { var_r <- YtY - 2 * sum(beta_k * XtY) + sum(XtX_beta_cache * beta_k) } else { XtX_beta_val <- compute_XtX_product(XtX, beta_k, ref_label) @@ -591,7 +599,8 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, "XtY" = NULL, "YtY" = NULL, "N" = N[[i]], - "variable_miss" = NULL + "variable_miss" = NULL, + "R_finite_B" = NULL ) # Get current status diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 86993d0..4037214 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -110,15 +110,27 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { ref_label_i <- cb_data$data[[X_dict]]$ref_label cb_model[[i]]$res <- rep(0, cb_model_para$P) if (length(cb_data$data[[i]]$variable_miss) != 0) { - beta <- cb_model[[i]]$beta[-cb_data$data[[i]]$variable_miss] / beta_scaling - xty <- cb_data$data[[i]]$XtY[-cb_data$data[[i]]$variable_miss] - XtX_beta <- compute_XtX_product(xtx, beta, ref_label_i) - cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * XtX_beta - + obs_idx <- setdiff(seq_len(cb_model_para$P), cb_data$data[[i]]$variable_miss) + beta <- cb_model[[i]]$beta[obs_idx] / beta_scaling + xty <- cb_data$data[[i]]$XtY[obs_idx] + delta_beta <- step1 * beta_grad[obs_idx] / beta_scaling + XtX_beta_cache <- cb_model[[i]]$XtX_beta_cache + if (!is.null(XtX_beta_cache) && length(XtX_beta_cache) == length(beta)) { + XtX_beta <- XtX_beta_cache + compute_XtX_product(xtx, delta_beta, ref_label_i) + } else { + XtX_beta <- compute_XtX_product(xtx, beta, ref_label_i) + } + cb_model[[i]]$res[obs_idx] <- xty - scaling_factor * XtX_beta } else { beta <- cb_model[[i]]$beta / beta_scaling xty <- cb_data$data[[i]]$XtY - XtX_beta <- compute_XtX_product(xtx, beta, ref_label_i) + delta_beta <- step1 * beta_grad / beta_scaling + XtX_beta_cache <- cb_model[[i]]$XtX_beta_cache + if (!is.null(XtX_beta_cache) && length(XtX_beta_cache) == length(beta)) { + XtX_beta <- XtX_beta_cache + compute_XtX_product(xtx, delta_beta, ref_label_i) + } else { + XtX_beta <- compute_XtX_product(xtx, beta, ref_label_i) + } cb_model[[i]]$res <- xty - scaling_factor * XtX_beta } # - cache XtX %*% beta for reuse in get_correlation (avoids redundant O(P^2) computation)