From da7de9535b9096891939e3bddfff05d6601afc4c Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Fri, 29 May 2026 22:49:43 -0500 Subject: [PATCH 01/14] Add direct SuSiE fitting option and unified top-loci export This update adds two fine-mapping output improvements while preserving existing default behavior. First, univariate fine-mapping can now bypass SuSiE-inf initialization when ordinary SuSiE fitting is explicitly requested. The SuSiE-inf path remains the default for compatibility, and TWAS-weight generation continues to require the existing SuSiE-inf workflow. Second, SuSiE wrapper output now includes a unified top-loci export table alongside the existing wrapper-facing summaries. The export keeps posterior effect summaries separate from marginal association fields, preserves credible-set membership including overlapping credible sets, and leaves existing top_loci and top_loci_long contracts unchanged. Focused regression tests cover the direct SuSiE path and the unified top-loci export contract. --- R/susie_wrapper.R | 195 +++++++++++++++++++++- R/univariate_pipeline.R | 49 ++++-- tests/testthat/test_susie_wrapper.R | 97 +++++++++++ tests/testthat/test_univariate_pipeline.R | 39 +++++ 4 files changed, 365 insertions(+), 15 deletions(-) diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index bb1e2a8f..d1ecdf53 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -267,6 +267,8 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { ) trimmed <- trim_finemapping_fit(fit, effect_idx, method, cs_tables) + trimmed$X_scalar <- X_scalar + trimmed$y_scalar <- y_scalar # Build FineMappingResult S4 object fm_result <- FineMappingResult( @@ -491,6 +493,186 @@ build_top_loci_wide <- function(top_loci_long, posts) { out } +build_top_loci_export <- function(top_loci, top_loci_long = NULL, + primary_method = "susie", + result_trimmed = NULL, + variant_names = NULL, + gene_id = NA_character_, + event_id = NA_character_, + n = NA_real_, af = NULL) { + empty <- function() { + data.frame( + "#chr" = integer(), start = integer(), end = integer(), + a1 = character(), a2 = character(), variant_ID = character(), + gene_ID = character(), event_ID = character(), + "cs_coverage_0.95" = integer(), "cs_coverage_0.7" = integer(), + "cs_coverage_0.5" = integer(), PIP = numeric(), + conditional_effect = numeric(), conditional_effect_se = numeric(), + beta = numeric(), se = numeric(), n = numeric(), af = numeric(), + stringsAsFactors = FALSE, check.names = FALSE + ) + } + if (is.null(top_loci) || !is.data.frame(top_loci) || nrow(top_loci) == 0) { + return(empty()) + } + + top_loci <- .translate_legacy_top_loci_cs_columns(top_loci) + variant_col <- intersect(c("variant_ID", "variant_id"), names(top_loci)) + if (length(variant_col) == 0) stop("top_loci is missing variant_id or variant_ID.") + variant_id <- as.character(top_loci[[variant_col[[1]]]]) + n_rows <- length(variant_id) + + match_numeric <- function(value) { + out <- rep(NA_real_, n_rows) + if (is.null(value) || length(value) == 0) return(out) + value_names <- names(value) + value <- suppressWarnings(as.numeric(value)) + if (!is.null(value_names) && length(value_names) == length(value)) { + names(value) <- value_names + } + if (!is.null(names(value))) { + query <- tryCatch(normalize_variant_id(variant_id), error = function(e) variant_id) + names(value) <- tryCatch(normalize_variant_id(names(value)), error = function(e) names(value)) + return(as.numeric(value[query])) + } + if (length(value) == 1) return(rep(value, n_rows)) + if (length(value) == n_rows) return(value) + out + } + column_numeric <- function(columns) { + hit <- intersect(columns, names(top_loci)) + if (length(hit) == 0) return(rep(NA_real_, n_rows)) + suppressWarnings(as.numeric(top_loci[[hit[[1]]]])) + } + cs_column <- function(coverage) { + hit <- intersect(format_cs_column(coverage, primary_method), names(top_loci)) + if (length(hit) > 0) return(as.integer(top_loci[[hit[[1]]]])) + if (any(grepl("^CS_[0-9.]+_[A-Za-z0-9_.-]+$", names(top_loci)))) { + return(rep(0L, n_rows)) + } + rep(NA_integer_, n_rows) + } + scale_effect <- function(value) { + if (is.null(value) || is.null(result_trimmed)) return(value) + y_scalar <- suppressWarnings(as.numeric(result_trimmed$y_scalar %||% 1)) + if (length(y_scalar) != 1 || !is.finite(y_scalar)) y_scalar <- 1 + x_scalar <- result_trimmed$X_scalar + if (is.null(x_scalar)) return(value * y_scalar) + if (length(x_scalar) == 1) { + x_scalar <- rep(as.numeric(x_scalar), length(value)) + } else if (is.null(names(x_scalar)) && !is.null(names(value)) && + length(x_scalar) == length(value)) { + names(x_scalar) <- names(value) + } + if (!is.null(names(value)) && !is.null(names(x_scalar))) { + query <- tryCatch(normalize_variant_id(names(value)), error = function(e) names(value)) + names(x_scalar) <- tryCatch(normalize_variant_id(names(x_scalar)), + error = function(e) names(x_scalar)) + x_scalar <- as.numeric(x_scalar[query]) + } else if (length(x_scalar) != length(value)) { + x_scalar <- rep(NA_real_, length(value)) + } + ok <- is.finite(value) & is.finite(x_scalar) & x_scalar != 0 + value[ok] <- value[ok] * y_scalar / x_scalar[ok] + value + } + + parsed <- tryCatch( + suppressWarnings(parse_variant_id(variant_id)), + error = function(e) NULL + ) + if (is.null(parsed) || nrow(parsed) != n_rows) { + parsed <- data.frame( + chrom = rep(NA_integer_, n_rows), pos = rep(NA_integer_, n_rows), + A2 = rep(NA_character_, n_rows), A1 = rep(NA_character_, n_rows), + stringsAsFactors = FALSE + ) + } + + pip_col <- resolve_pip_column(top_loci, primary_method) + effect <- effect_se <- rep(NA_real_, n_rows) + if (!is.null(result_trimmed)) { + names_from_fit <- variant_names %||% names(result_trimmed$pip) %||% + colnames(result_trimmed$alpha) + effect <- tryCatch(susieR::coef.susie(result_trimmed), error = function(e) NULL) + effect_se <- tryCatch(susieR::susie_get_posterior_sd(result_trimmed), + error = function(e) NULL) + if (!is.null(effect)) { + if (!is.null(names_from_fit) && length(names_from_fit) == length(effect)) { + names(effect) <- names_from_fit + } + effect <- match_numeric(scale_effect(effect)) + } else { + effect <- rep(NA_real_, n_rows) + } + if (!is.null(effect_se)) { + if (!is.null(names_from_fit) && length(names_from_fit) == length(effect_se)) { + names(effect_se) <- names_from_fit + } + effect_se <- match_numeric(abs(scale_effect(effect_se))) + } else { + effect_se <- rep(NA_real_, n_rows) + } + } + + out <- data.frame( + "#chr" = parsed$chrom, + start = parsed$pos - 1L, + end = parsed$pos, + a1 = parsed$A1, + a2 = parsed$A2, + variant_ID = variant_id, + gene_ID = rep(gene_id, length.out = n_rows), + event_ID = rep(event_id, length.out = n_rows), + "cs_coverage_0.95" = cs_column(0.95), + "cs_coverage_0.7" = cs_column(0.7), + "cs_coverage_0.5" = cs_column(0.5), + PIP = if (is.null(pip_col)) rep(NA_real_, n_rows) else as.numeric(top_loci[[pip_col]]), + conditional_effect = effect, + conditional_effect_se = effect_se, + beta = column_numeric(c("beta", "betahat")), + se = column_numeric(c("se", "sebetahat")), + n = if (any(!is.na(column_numeric("n")))) column_numeric("n") else match_numeric(n), + af = if (any(!is.na(column_numeric("af")))) column_numeric("af") else match_numeric(af), + stringsAsFactors = FALSE, + check.names = FALSE + ) + + if (is.null(top_loci_long) || !is.data.frame(top_loci_long) || + nrow(top_loci_long) == 0 || !all(c("variant_id", "method", "coverage", "cs") %in% + names(top_loci_long))) { + return(out) + } + coverage_columns <- c("0.95" = "cs_coverage_0.95", + "0.7" = "cs_coverage_0.7", + "0.5" = "cs_coverage_0.5") + rows <- lapply(seq_len(nrow(out)), function(i) { + row <- out[i, , drop = FALSE] + memberships <- lapply(names(coverage_columns), function(coverage) { + hits <- top_loci_long[ + as.character(top_loci_long$variant_id) == row$variant_ID[[1]] & + as.character(top_loci_long$method) == primary_method & + abs(as.numeric(top_loci_long$coverage) - as.numeric(coverage)) < 1e-8, + , drop = FALSE + ] + unique(as.integer(hits$cs[!is.na(hits$cs)])) + }) + names(memberships) <- coverage_columns + n_membership <- max(1L, vapply(memberships, length, integer(1))) + do.call(rbind, lapply(seq_len(n_membership), function(j) { + expanded <- row + for (column in names(memberships)) { + values <- memberships[[column]] + if (length(values) > 0) expanded[[column]] <- values[[min(j, length(values))]] + } + expanded + })) + }) + out <- do.call(rbind, rows) + rownames(out) <- NULL + out +} + trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { alpha <- .as_effect_matrix(fit$alpha) lbf_variable <- .as_lbf_matrix(fit) @@ -546,20 +728,29 @@ trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { #' @param post Output from \code{\link{postprocess_finemapping_fits}}. #' @param primary_method Method whose result should populate root-level fields. #' @return A list with root-level fields including \code{variant_names}, -#' \code{susie_result_trimmed}, \code{top_loci_long}, and \code{top_loci}. +#' \code{susie_result_trimmed}, \code{top_loci_long}, \code{top_loci}, +#' and the exported table \code{top_loci_export}. #' @export format_finemapping_output <- function(post, primary_method) { method_post <- post$finemapping_results[[primary_method]] if (is.null(method_post)) { stop("primary_method was not found in finemapping_results: ", primary_method) } + top_loci_export <- build_top_loci_export( + top_loci = post$top_loci, + top_loci_long = post$top_loci_long, + primary_method = primary_method, + result_trimmed = method_post$result_trimmed, + variant_names = method_post$variant_names + ) keep_names <- setdiff(names(method_post), c("result_trimmed", "top_loci_long")) c( method_post[keep_names], list( susie_result_trimmed = method_post$result_trimmed, top_loci_long = post$top_loci_long, - top_loci = post$top_loci + top_loci = post$top_loci, + top_loci_export = top_loci_export ) ) } diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R index 6ec7cf71..3339cafc 100644 --- a/R/univariate_pipeline.R +++ b/R/univariate_pipeline.R @@ -1,8 +1,8 @@ #' Univariate Analysis Pipeline #' #' This function performs univariate analysis for fine-mapping and Transcriptome-Wide Association Study (TWAS) -#' with optional cross-validation. Fine-mapping fits SuSiE-inf first and then -#' fits SuSiE initialized from the SuSiE-inf result. +#' with optional cross-validation. By default, fine-mapping fits SuSiE-inf first +#' and then fits SuSiE initialized from the SuSiE-inf result. #' #' @param X A matrix of genotype data where rows represent samples and columns represent genetic variants. #' @param Y A vector of phenotype measurements. @@ -25,6 +25,9 @@ #' @param finemapping_extra_opts Additional options passed to \code{susieR::susie()}. #' SuSiE-inf is always fitted with \code{refine = FALSE}; the ordinary SuSiE #' fit keeps these options and is initialized with \code{model_init}. +#' @param fit_susie_inf Whether to fit SuSiE-inf before ordinary SuSiE. Default +#' is TRUE for existing pipeline compatibility. If FALSE, ordinary SuSiE is +#' fitted directly and SuSiE-inf fitted objects/results are not returned. #' @param twas_weights Whether to compute TWAS weights. Default is TRUE. #' @param sample_partition Optional data frame with Sample and Fold columns for cross-validation. Default is NULL. #' @param max_cv_variants The maximum number of variants to be included in cross-validation. Default is -1 (no limit). @@ -58,6 +61,7 @@ univariate_analysis_pipeline <- function( coverage = c(0.95, 0.7, 0.5), min_abs_corr = 0.8, finemapping_extra_opts = list(refine = TRUE), + fit_susie_inf = TRUE, # TWAS weights and CV for TWAS weights twas_weights = TRUE, sample_partition = NULL, @@ -75,6 +79,12 @@ univariate_analysis_pipeline <- function( if (!is.numeric(Y_scalar) || length(Y_scalar) != 1) stop("Y_scalar must be a numeric scalar") if (!is.numeric(L) || L <= 0) stop("L must be a positive integer") if (!is.null(L_greedy) && (!is.numeric(L_greedy) || L_greedy <= 0)) stop("L_greedy must be NULL or a positive integer") + if (!is.logical(fit_susie_inf) || length(fit_susie_inf) != 1 || is.na(fit_susie_inf)) { + stop("fit_susie_inf must be TRUE or FALSE") + } + if (!isTRUE(fit_susie_inf) && isTRUE(twas_weights)) { + stop("fit_susie_inf = FALSE is not compatible with twas_weights = TRUE") + } # Initial PIP check if (pip_cutoff_to_skip != 0) { @@ -112,17 +122,28 @@ univariate_analysis_pipeline <- function( st <- proc.time() res <- list() - message("Fitting SuSiE-inf model on input data ...") - message("Fitting SuSiE model initialized by SuSiE-inf ...") - fitted_models <- fit_susie_inf_then_susie( - X, - Y, - args = modifyList( - finemapping_extra_opts, - list(L = L, L_greedy = L_greedy, coverage = coverage[1]) - ) + susie_args <- modifyList( + finemapping_extra_opts, + list(L = L, L_greedy = L_greedy, coverage = coverage[1]) ) - res$susie_inf_fitted <- fitted_models[["susie_inf"]] + if (isTRUE(fit_susie_inf)) { + message("Fitting SuSiE-inf model on input data ...") + message("Fitting SuSiE model initialized by SuSiE-inf ...") + fitted_models <- fit_susie_inf_then_susie( + X, + Y, + args = susie_args + ) + res$susie_inf_fitted <- fitted_models[["susie_inf"]] + } else { + message("Fitting SuSiE model on input data ...") + fitted_models <- list( + susie = .set_finemapping_fit_class( + do.call(susie, c(list(X = X, y = Y), susie_args)), + "susie" + ) + ) + } res$susie_fitted <- fitted_models[["susie"]] # Process SuSiE results @@ -140,7 +161,9 @@ univariate_analysis_pipeline <- function( other_quantities = other_quantities ) res <- c(res, format_finemapping_output(susie_post, primary_method = "susie")) - res$susie_inf_result_trimmed <- susie_post$finemapping_results$susie_inf$result_trimmed + if (!is.null(susie_post$finemapping_results$susie_inf)) { + res$susie_inf_result_trimmed <- susie_post$finemapping_results$susie_inf$result_trimmed + } res$total_time_elapsed <- proc.time() - st # TWAS weights and cross-validation diff --git a/tests/testthat/test_susie_wrapper.R b/tests/testthat/test_susie_wrapper.R index 20747008..a671c3a8 100644 --- a/tests/testthat/test_susie_wrapper.R +++ b/tests/testthat/test_susie_wrapper.R @@ -765,6 +765,103 @@ test_that("format_finemapping_output does not duplicate top loci variants", { expect_equal(unique(out$top_loci$variant_id), paste0("v", 1:4)) }) +test_that("format_finemapping_output adds top_loci_export without changing wrapper tables", { + variant_id <- "chr1:101:A:G" + result_trimmed <- list( + pip = setNames(0.8, variant_id), + alpha = matrix(1, nrow = 1, ncol = 1, + dimnames = list("L1", variant_id)), + mu = matrix(0.8, nrow = 1, ncol = 1, + dimnames = list("L1", variant_id)), + mu2 = matrix(0.81, nrow = 1, ncol = 1, + dimnames = list("L1", variant_id)), + V = 1, + X_column_scale_factors = 1, + X_scalar = 1, + y_scalar = 1 + ) + class(result_trimmed) <- "susie" + top_loci <- data.frame( + variant_id = variant_id, + betahat = 0.2, + sebetahat = 0.1, + pip_susie = 0.8, + CS_95_susie = 1L, + CS_70_susie = 0L, + CS_50_susie = 0L, + stringsAsFactors = FALSE + ) + post <- list( + finemapping_results = list(susie = list( + variant_names = variant_id, + result_trimmed = result_trimmed, + top_loci_long = NULL + )), + top_loci_long = NULL, + top_loci = top_loci + ) + + out <- format_finemapping_output(post, "susie") + + expect_true("top_loci_export" %in% names(out)) + expect_equal(out$top_loci$betahat, 0.2) + expect_equal(out$top_loci_export$conditional_effect, 0.8) + expect_equal(out$top_loci_export$conditional_effect_se, sqrt(0.17), + tolerance = 1e-8) + expect_equal(out$top_loci_export$beta, 0.2) + expect_equal(out$top_loci_export$se, 0.1) +}) + +test_that("build_top_loci_export expands overlapping CS membership and carries n and af", { + variant_id <- "chr1:101:A:G" + result_trimmed <- list( + pip = setNames(0.8, variant_id), + alpha = matrix(1, nrow = 1, ncol = 1, + dimnames = list("L1", variant_id)), + mu = matrix(0.8, nrow = 1, ncol = 1, + dimnames = list("L1", variant_id)), + mu2 = matrix(0.81, nrow = 1, ncol = 1, + dimnames = list("L1", variant_id)), + V = 1, + X_column_scale_factors = 1 + ) + top_loci <- data.frame( + variant_id = variant_id, + betahat = 0.2, + sebetahat = 0.1, + pip_susie = 0.8, + CS_95_susie = 1L, + stringsAsFactors = FALSE + ) + top_loci_long <- data.frame( + variant_id = c(variant_id, variant_id), + method = c("susie", "susie"), + coverage = c(0.95, 0.95), + cs = c(1L, 2L), + stringsAsFactors = FALSE + ) + af <- setNames(0.35, variant_id) + + out <- pecotmr:::build_top_loci_export( + top_loci = top_loci, + top_loci_long = top_loci_long, + primary_method = "susie", + result_trimmed = result_trimmed, + variant_names = variant_id, + gene_id = "GENE1", + event_id = "EVENT1", + n = 42, + af = af + ) + + expect_equal(nrow(out), 2L) + expect_equal(out[["cs_coverage_0.95"]], c(1L, 2L)) + expect_equal(out$gene_ID, c("GENE1", "GENE1")) + expect_equal(out$event_ID, c("EVENT1", "EVENT1")) + expect_equal(out$n, c(42, 42)) + expect_equal(out$af, c(0.35, 0.35)) +}) + .make_univariate_data <- function(seed = 42, n = 300, p = 50, effect_idx = integer(0), effect_size = NULL) { set.seed(seed) diff --git a/tests/testthat/test_univariate_pipeline.R b/tests/testthat/test_univariate_pipeline.R index 762a22d3..a8dacb99 100644 --- a/tests/testthat/test_univariate_pipeline.R +++ b/tests/testthat/test_univariate_pipeline.R @@ -614,6 +614,45 @@ test_that("uap: susie-inf initializes susie with matching greedy L", { expect_equal(unclass(result$susie_inf_fitted), fake_inf_fit) }) +test_that("uap: ordinary susie can run without susie-inf initialization", { + inp <- make_uap_inputs() + fake_fit <- make_fake_susie_fit(inp$p) + fake_post <- make_fake_post_result(inp$p) + + captured_calls <- list() + local_mocked_bindings( + susie = function(X, y, L, L_greedy, coverage, unmappable_effects = "none", + model_init = NULL, ...) { + captured_calls[[length(captured_calls) + 1]] <<- list( + L = L, + L_greedy = L_greedy, + unmappable_effects = unmappable_effects, + model_init = model_init + ) + fake_fit + }, + postprocess_finemapping_fits = function(fits, ...) { + expect_named(fits, "susie") + fake_post + }, + format_finemapping_output = function(post, primary_method) post, + ) + + result <- univariate_analysis_pipeline( + X = inp$X, Y = inp$Y, maf = inp$maf, + twas_weights = FALSE, fit_susie_inf = FALSE, + L = 15, L_greedy = 7 + ) + expect_length(captured_calls, 1) + expect_equal(captured_calls[[1]]$L, 15) + expect_equal(captured_calls[[1]]$L_greedy, 7) + expect_equal(captured_calls[[1]]$unmappable_effects, "none") + expect_null(captured_calls[[1]]$model_init) + expect_s3_class(result$susie_fitted, "susie") + expect_false("susie_inf_fitted" %in% names(result)) + expect_false("susie_inf_result_trimmed" %in% names(result)) +}) + test_that("uap: post-processing output is merged into result", { inp <- make_uap_inputs() fake_fit <- make_fake_susie_fit(inp$p) From a2c5402efe22b2fc2ae56e56e9b12847f1939b81 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 30 May 2026 00:48:56 -0500 Subject: [PATCH 02/14] remove unified top loci table changes --- R/susie_wrapper.R | 195 +--------------------------- tests/testthat/test_susie_wrapper.R | 97 -------------- 2 files changed, 2 insertions(+), 290 deletions(-) diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index d1ecdf53..bb1e2a8f 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -267,8 +267,6 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { ) trimmed <- trim_finemapping_fit(fit, effect_idx, method, cs_tables) - trimmed$X_scalar <- X_scalar - trimmed$y_scalar <- y_scalar # Build FineMappingResult S4 object fm_result <- FineMappingResult( @@ -493,186 +491,6 @@ build_top_loci_wide <- function(top_loci_long, posts) { out } -build_top_loci_export <- function(top_loci, top_loci_long = NULL, - primary_method = "susie", - result_trimmed = NULL, - variant_names = NULL, - gene_id = NA_character_, - event_id = NA_character_, - n = NA_real_, af = NULL) { - empty <- function() { - data.frame( - "#chr" = integer(), start = integer(), end = integer(), - a1 = character(), a2 = character(), variant_ID = character(), - gene_ID = character(), event_ID = character(), - "cs_coverage_0.95" = integer(), "cs_coverage_0.7" = integer(), - "cs_coverage_0.5" = integer(), PIP = numeric(), - conditional_effect = numeric(), conditional_effect_se = numeric(), - beta = numeric(), se = numeric(), n = numeric(), af = numeric(), - stringsAsFactors = FALSE, check.names = FALSE - ) - } - if (is.null(top_loci) || !is.data.frame(top_loci) || nrow(top_loci) == 0) { - return(empty()) - } - - top_loci <- .translate_legacy_top_loci_cs_columns(top_loci) - variant_col <- intersect(c("variant_ID", "variant_id"), names(top_loci)) - if (length(variant_col) == 0) stop("top_loci is missing variant_id or variant_ID.") - variant_id <- as.character(top_loci[[variant_col[[1]]]]) - n_rows <- length(variant_id) - - match_numeric <- function(value) { - out <- rep(NA_real_, n_rows) - if (is.null(value) || length(value) == 0) return(out) - value_names <- names(value) - value <- suppressWarnings(as.numeric(value)) - if (!is.null(value_names) && length(value_names) == length(value)) { - names(value) <- value_names - } - if (!is.null(names(value))) { - query <- tryCatch(normalize_variant_id(variant_id), error = function(e) variant_id) - names(value) <- tryCatch(normalize_variant_id(names(value)), error = function(e) names(value)) - return(as.numeric(value[query])) - } - if (length(value) == 1) return(rep(value, n_rows)) - if (length(value) == n_rows) return(value) - out - } - column_numeric <- function(columns) { - hit <- intersect(columns, names(top_loci)) - if (length(hit) == 0) return(rep(NA_real_, n_rows)) - suppressWarnings(as.numeric(top_loci[[hit[[1]]]])) - } - cs_column <- function(coverage) { - hit <- intersect(format_cs_column(coverage, primary_method), names(top_loci)) - if (length(hit) > 0) return(as.integer(top_loci[[hit[[1]]]])) - if (any(grepl("^CS_[0-9.]+_[A-Za-z0-9_.-]+$", names(top_loci)))) { - return(rep(0L, n_rows)) - } - rep(NA_integer_, n_rows) - } - scale_effect <- function(value) { - if (is.null(value) || is.null(result_trimmed)) return(value) - y_scalar <- suppressWarnings(as.numeric(result_trimmed$y_scalar %||% 1)) - if (length(y_scalar) != 1 || !is.finite(y_scalar)) y_scalar <- 1 - x_scalar <- result_trimmed$X_scalar - if (is.null(x_scalar)) return(value * y_scalar) - if (length(x_scalar) == 1) { - x_scalar <- rep(as.numeric(x_scalar), length(value)) - } else if (is.null(names(x_scalar)) && !is.null(names(value)) && - length(x_scalar) == length(value)) { - names(x_scalar) <- names(value) - } - if (!is.null(names(value)) && !is.null(names(x_scalar))) { - query <- tryCatch(normalize_variant_id(names(value)), error = function(e) names(value)) - names(x_scalar) <- tryCatch(normalize_variant_id(names(x_scalar)), - error = function(e) names(x_scalar)) - x_scalar <- as.numeric(x_scalar[query]) - } else if (length(x_scalar) != length(value)) { - x_scalar <- rep(NA_real_, length(value)) - } - ok <- is.finite(value) & is.finite(x_scalar) & x_scalar != 0 - value[ok] <- value[ok] * y_scalar / x_scalar[ok] - value - } - - parsed <- tryCatch( - suppressWarnings(parse_variant_id(variant_id)), - error = function(e) NULL - ) - if (is.null(parsed) || nrow(parsed) != n_rows) { - parsed <- data.frame( - chrom = rep(NA_integer_, n_rows), pos = rep(NA_integer_, n_rows), - A2 = rep(NA_character_, n_rows), A1 = rep(NA_character_, n_rows), - stringsAsFactors = FALSE - ) - } - - pip_col <- resolve_pip_column(top_loci, primary_method) - effect <- effect_se <- rep(NA_real_, n_rows) - if (!is.null(result_trimmed)) { - names_from_fit <- variant_names %||% names(result_trimmed$pip) %||% - colnames(result_trimmed$alpha) - effect <- tryCatch(susieR::coef.susie(result_trimmed), error = function(e) NULL) - effect_se <- tryCatch(susieR::susie_get_posterior_sd(result_trimmed), - error = function(e) NULL) - if (!is.null(effect)) { - if (!is.null(names_from_fit) && length(names_from_fit) == length(effect)) { - names(effect) <- names_from_fit - } - effect <- match_numeric(scale_effect(effect)) - } else { - effect <- rep(NA_real_, n_rows) - } - if (!is.null(effect_se)) { - if (!is.null(names_from_fit) && length(names_from_fit) == length(effect_se)) { - names(effect_se) <- names_from_fit - } - effect_se <- match_numeric(abs(scale_effect(effect_se))) - } else { - effect_se <- rep(NA_real_, n_rows) - } - } - - out <- data.frame( - "#chr" = parsed$chrom, - start = parsed$pos - 1L, - end = parsed$pos, - a1 = parsed$A1, - a2 = parsed$A2, - variant_ID = variant_id, - gene_ID = rep(gene_id, length.out = n_rows), - event_ID = rep(event_id, length.out = n_rows), - "cs_coverage_0.95" = cs_column(0.95), - "cs_coverage_0.7" = cs_column(0.7), - "cs_coverage_0.5" = cs_column(0.5), - PIP = if (is.null(pip_col)) rep(NA_real_, n_rows) else as.numeric(top_loci[[pip_col]]), - conditional_effect = effect, - conditional_effect_se = effect_se, - beta = column_numeric(c("beta", "betahat")), - se = column_numeric(c("se", "sebetahat")), - n = if (any(!is.na(column_numeric("n")))) column_numeric("n") else match_numeric(n), - af = if (any(!is.na(column_numeric("af")))) column_numeric("af") else match_numeric(af), - stringsAsFactors = FALSE, - check.names = FALSE - ) - - if (is.null(top_loci_long) || !is.data.frame(top_loci_long) || - nrow(top_loci_long) == 0 || !all(c("variant_id", "method", "coverage", "cs") %in% - names(top_loci_long))) { - return(out) - } - coverage_columns <- c("0.95" = "cs_coverage_0.95", - "0.7" = "cs_coverage_0.7", - "0.5" = "cs_coverage_0.5") - rows <- lapply(seq_len(nrow(out)), function(i) { - row <- out[i, , drop = FALSE] - memberships <- lapply(names(coverage_columns), function(coverage) { - hits <- top_loci_long[ - as.character(top_loci_long$variant_id) == row$variant_ID[[1]] & - as.character(top_loci_long$method) == primary_method & - abs(as.numeric(top_loci_long$coverage) - as.numeric(coverage)) < 1e-8, - , drop = FALSE - ] - unique(as.integer(hits$cs[!is.na(hits$cs)])) - }) - names(memberships) <- coverage_columns - n_membership <- max(1L, vapply(memberships, length, integer(1))) - do.call(rbind, lapply(seq_len(n_membership), function(j) { - expanded <- row - for (column in names(memberships)) { - values <- memberships[[column]] - if (length(values) > 0) expanded[[column]] <- values[[min(j, length(values))]] - } - expanded - })) - }) - out <- do.call(rbind, rows) - rownames(out) <- NULL - out -} - trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { alpha <- .as_effect_matrix(fit$alpha) lbf_variable <- .as_lbf_matrix(fit) @@ -728,29 +546,20 @@ trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { #' @param post Output from \code{\link{postprocess_finemapping_fits}}. #' @param primary_method Method whose result should populate root-level fields. #' @return A list with root-level fields including \code{variant_names}, -#' \code{susie_result_trimmed}, \code{top_loci_long}, \code{top_loci}, -#' and the exported table \code{top_loci_export}. +#' \code{susie_result_trimmed}, \code{top_loci_long}, and \code{top_loci}. #' @export format_finemapping_output <- function(post, primary_method) { method_post <- post$finemapping_results[[primary_method]] if (is.null(method_post)) { stop("primary_method was not found in finemapping_results: ", primary_method) } - top_loci_export <- build_top_loci_export( - top_loci = post$top_loci, - top_loci_long = post$top_loci_long, - primary_method = primary_method, - result_trimmed = method_post$result_trimmed, - variant_names = method_post$variant_names - ) keep_names <- setdiff(names(method_post), c("result_trimmed", "top_loci_long")) c( method_post[keep_names], list( susie_result_trimmed = method_post$result_trimmed, top_loci_long = post$top_loci_long, - top_loci = post$top_loci, - top_loci_export = top_loci_export + top_loci = post$top_loci ) ) } diff --git a/tests/testthat/test_susie_wrapper.R b/tests/testthat/test_susie_wrapper.R index a671c3a8..20747008 100644 --- a/tests/testthat/test_susie_wrapper.R +++ b/tests/testthat/test_susie_wrapper.R @@ -765,103 +765,6 @@ test_that("format_finemapping_output does not duplicate top loci variants", { expect_equal(unique(out$top_loci$variant_id), paste0("v", 1:4)) }) -test_that("format_finemapping_output adds top_loci_export without changing wrapper tables", { - variant_id <- "chr1:101:A:G" - result_trimmed <- list( - pip = setNames(0.8, variant_id), - alpha = matrix(1, nrow = 1, ncol = 1, - dimnames = list("L1", variant_id)), - mu = matrix(0.8, nrow = 1, ncol = 1, - dimnames = list("L1", variant_id)), - mu2 = matrix(0.81, nrow = 1, ncol = 1, - dimnames = list("L1", variant_id)), - V = 1, - X_column_scale_factors = 1, - X_scalar = 1, - y_scalar = 1 - ) - class(result_trimmed) <- "susie" - top_loci <- data.frame( - variant_id = variant_id, - betahat = 0.2, - sebetahat = 0.1, - pip_susie = 0.8, - CS_95_susie = 1L, - CS_70_susie = 0L, - CS_50_susie = 0L, - stringsAsFactors = FALSE - ) - post <- list( - finemapping_results = list(susie = list( - variant_names = variant_id, - result_trimmed = result_trimmed, - top_loci_long = NULL - )), - top_loci_long = NULL, - top_loci = top_loci - ) - - out <- format_finemapping_output(post, "susie") - - expect_true("top_loci_export" %in% names(out)) - expect_equal(out$top_loci$betahat, 0.2) - expect_equal(out$top_loci_export$conditional_effect, 0.8) - expect_equal(out$top_loci_export$conditional_effect_se, sqrt(0.17), - tolerance = 1e-8) - expect_equal(out$top_loci_export$beta, 0.2) - expect_equal(out$top_loci_export$se, 0.1) -}) - -test_that("build_top_loci_export expands overlapping CS membership and carries n and af", { - variant_id <- "chr1:101:A:G" - result_trimmed <- list( - pip = setNames(0.8, variant_id), - alpha = matrix(1, nrow = 1, ncol = 1, - dimnames = list("L1", variant_id)), - mu = matrix(0.8, nrow = 1, ncol = 1, - dimnames = list("L1", variant_id)), - mu2 = matrix(0.81, nrow = 1, ncol = 1, - dimnames = list("L1", variant_id)), - V = 1, - X_column_scale_factors = 1 - ) - top_loci <- data.frame( - variant_id = variant_id, - betahat = 0.2, - sebetahat = 0.1, - pip_susie = 0.8, - CS_95_susie = 1L, - stringsAsFactors = FALSE - ) - top_loci_long <- data.frame( - variant_id = c(variant_id, variant_id), - method = c("susie", "susie"), - coverage = c(0.95, 0.95), - cs = c(1L, 2L), - stringsAsFactors = FALSE - ) - af <- setNames(0.35, variant_id) - - out <- pecotmr:::build_top_loci_export( - top_loci = top_loci, - top_loci_long = top_loci_long, - primary_method = "susie", - result_trimmed = result_trimmed, - variant_names = variant_id, - gene_id = "GENE1", - event_id = "EVENT1", - n = 42, - af = af - ) - - expect_equal(nrow(out), 2L) - expect_equal(out[["cs_coverage_0.95"]], c(1L, 2L)) - expect_equal(out$gene_ID, c("GENE1", "GENE1")) - expect_equal(out$event_ID, c("EVENT1", "EVENT1")) - expect_equal(out$n, c(42, 42)) - expect_equal(out$af, c(0.35, 0.35)) -}) - .make_univariate_data <- function(seed = 42, n = 300, p = 50, effect_idx = integer(0), effect_size = NULL) { set.seed(seed) From a04bc21840dd21d5bb87c45a9dd74c0fc4a3b547 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 30 May 2026 02:54:13 -0500 Subject: [PATCH 03/14] Unify compact top-loci export Unify compact top-loci export at one wrapper boundary Major: - build_top_loci_long now emits the full annotated long: posterior conditional_effect / conditional_effect_se, cs_purity, per-fit n / variant_number / gene_id, and caller-supplied region / event_ID. - Adds build_top_loci_export(long): pure projection of the annotated long to the fixed 21-column compact export schema; stops on missing required columns rather than silently filling NA. Minor: - build_top_loci_long gains 3 default-NULL params (data_x, data_y, other_quantities); existing callers omitting them are unchanged. - .postprocess_finemapping_fit_common passes those through unchanged. - other_quantities reserves two subfield names for the unified export: region, condition_id (alongside the existing dropped_samples use). - .empty_top_loci_long extended with the new columns so empty and populated long share one schema. - Adds focused unit tests covering the new columns, the per-fit constants, the export helper schema, and the missing-column error. Loader, univariate_analysis_pipeline, and format_finemapping_output are unchanged. transMap-side adoption is out of scope of this commit. --- R/susie_wrapper.R | 215 ++++++++++++++++++++++++++-- tests/testthat/test_susie_wrapper.R | 184 ++++++++++++++++++++++++ 2 files changed, 390 insertions(+), 9 deletions(-) diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index bb1e2a8f..6b492b30 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -263,7 +263,8 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { ) top_loci_long <- build_top_loci_long( fit, cs_tables, variant_names = variant_names, sumstats = sumstats, - maf = maf, method = method, signal_cutoff = signal_cutoff + maf = maf, method = method, signal_cutoff = signal_cutoff, + data_x = data_x, data_y = data_y, other_quantities = other_quantities ) trimmed <- trim_finemapping_fit(fit, effect_idx, method, cs_tables) @@ -408,9 +409,36 @@ compute_cs_table <- function(fit, data_x, coverage, cs_input = c("X", "Xcorr", " } build_top_loci_long <- function(fit, cs_tables, variant_names, sumstats = NULL, - maf = NULL, method, signal_cutoff = 0.1) { + maf = NULL, method, signal_cutoff = 0.1, + data_x = NULL, data_y = NULL, + other_quantities = NULL) { if (length(cs_tables) == 0) return(.empty_top_loci_long()) coverage_values <- attr(cs_tables, "coverage") + + # Per-fit constants (broadcast to every row of this fit's slice). + data_y_mat <- if (!is.null(data_y)) as.matrix(data_y) else NULL + fit_n <- if (is.null(data_y_mat)) NA_integer_ else nrow(data_y_mat) + fit_variant_number <- if (!is.null(data_x)) ncol(data_x) else NA_integer_ + fit_gene_id <- if (!is.null(data_y_mat) && !is.null(colnames(data_y_mat))) { + colnames(data_y_mat)[1] + } else NA_character_ + fit_region <- other_quantities$region + fit_event_id <- if (!is.null(other_quantities$condition_id) && + !is.na(fit_gene_id) && nzchar(fit_gene_id)) { + paste(other_quantities$condition_id, fit_gene_id, sep = "_") + } else NULL + + # Per-variant posterior effect and SE, computed once for all variants. + alpha <- as.matrix(fit$alpha) + mu <- if (!is.null(fit$mu)) as.matrix(fit$mu) else NULL + mu2 <- if (!is.null(fit$mu2)) as.matrix(fit$mu2) else NULL + posterior_effect <- if (!is.null(mu) && all(dim(alpha) == dim(mu))) { + colSums(alpha * mu) + } else rep(NA_real_, length(variant_names)) + posterior_effect_se <- if (!is.null(mu2) && all(dim(alpha) == dim(mu2))) { + sqrt(pmax(colSums(alpha * mu2) - posterior_effect^2, 0)) + } else rep(NA_real_, length(variant_names)) + rows <- lapply(seq_along(cs_tables), function(i) { cs_table <- cs_tables[[i]] cov <- coverage_values[[i]] @@ -419,14 +447,47 @@ build_top_loci_long <- function(fit, cs_tables, variant_names, sumstats = NULL, if (is.null(cs_info) || nrow(cs_info) == 0) return(NULL) idx <- cs_info$variant_idx optional_cols <- .top_loci_optional_columns(idx, sumstats, maf) + + # cs_purity: prefer susieR's sets$purity$min.abs.corr; fall back to + # cs_corr when purity is unavailable; PIP-only retained rows get 0. + sets_purity <- cs_table$sets$purity + cs_purity_per_cs <- if (!is.null(sets_purity) && + "min.abs.corr" %in% names(sets_purity)) { + as.numeric(sets_purity$min.abs.corr) + } else if (!is.null(cs_table$cs_corr)) { + vapply(seq_along(cs_table$cs_corr), function(j) { + m <- cs_table$cs_corr[[j]] + if (is.null(m)) return(NA_real_) + if (!is.matrix(m) || nrow(m) <= 1) return(1) + min(abs(m[upper.tri(m)])) + }, numeric(1)) + } else { + rep(NA_real_, length(cs_table$sets$cs)) + } + cs_purity_per_row <- vapply(cs_info$cs_idx, function(cs_i) { + if (is.na(cs_i) || cs_i == 0L) return(0) + if (cs_i > length(cs_purity_per_cs)) return(NA_real_) + cs_purity_per_cs[cs_i] + }, numeric(1)) + base <- data.frame( - variant_id = variant_names[idx], - method = method, - coverage = cov, - cs = as.integer(cs_info$cs_idx), - pip = as.numeric(fit$pip[idx]), - stringsAsFactors = FALSE + variant_id = variant_names[idx], + method = method, + coverage = cov, + cs = as.integer(cs_info$cs_idx), + pip = as.numeric(fit$pip[idx]), + conditional_effect = posterior_effect[idx], + conditional_effect_se = posterior_effect_se[idx], + cs_purity = cs_purity_per_row, + stringsAsFactors = FALSE ) + # Per-fit constants broadcast to every row. + base$n <- fit_n + base$variant_number <- fit_variant_number + base$gene_id <- fit_gene_id + if (!is.null(fit_region)) base$region <- fit_region + if (!is.null(fit_event_id)) base$event_ID <- fit_event_id + if (ncol(optional_cols) > 0) cbind(base, optional_cols) else base }) out <- bind_rows(rows) @@ -436,7 +497,12 @@ build_top_loci_long <- function(fit, cs_tables, variant_names, sumstats = NULL, .empty_top_loci_long <- function() { data.frame( variant_id = character(), method = character(), coverage = numeric(), - cs = integer(), pip = numeric(), stringsAsFactors = FALSE + cs = integer(), pip = numeric(), + conditional_effect = numeric(), conditional_effect_se = numeric(), + cs_purity = numeric(), + n = integer(), variant_number = integer(), + gene_id = character(), + stringsAsFactors = FALSE ) } @@ -491,6 +557,137 @@ build_top_loci_wide <- function(top_loci_long, posts) { out } +#' Build the unified compact top-loci export table. +#' +#' Projects an annotated \code{top_loci_long} (as produced by +#' \code{postprocess_finemapping_fits()} when called with \code{data_x}, +#' \code{data_y}, and an \code{other_quantities} list containing +#' \code{region} and \code{condition_id}) into the fixed-order compact +#' export schema used by the unified fine-mapping output. +#' +#' The output column order is exactly: \code{#chr}, \code{start}, \code{end}, +#' \code{a1}, \code{a2}, \code{variant_ID}, \code{gene_ID}, \code{event_ID}, +#' \code{cs_coverage_0.95}, \code{cs_coverage_0.7}, \code{cs_coverage_0.5}, +#' \code{cs_purity}, \code{PIP}, \code{conditional_effect}, +#' \code{conditional_effect_se}, \code{analysis_region}, +#' \code{analysis_variants_number}, \code{beta}, \code{se}, \code{n}, +#' \code{maf}. +#' +#' @param long An annotated \code{top_loci_long} data frame. Must contain +#' the columns \code{variant_id}, \code{pip}, \code{coverage}, \code{cs}, +#' \code{conditional_effect}, \code{conditional_effect_se}, +#' \code{cs_purity}, \code{gene_id}, \code{n}, \code{variant_number}, +#' \code{region}, \code{event_ID}, \code{betahat}, \code{sebetahat}, +#' \code{maf}. Missing any required column raises an explicit error +#' rather than silently filling \code{NA}. +#' @return A data frame with the fixed compact-export schema. +#' @export +build_top_loci_export <- function(long) { + required <- c("variant_id", "pip", "coverage", "cs", + "conditional_effect", "conditional_effect_se", "cs_purity", + "gene_id", "n", "variant_number", "region", "event_ID", + "betahat", "sebetahat", "maf") + if (is.null(long) || !is.data.frame(long)) { + stop("build_top_loci_export: `long` must be a data frame.") + } + if (nrow(long) == 0) return(.empty_top_loci_export()) + missing_cols <- setdiff(required, names(long)) + if (length(missing_cols) > 0) { + stop("build_top_loci_export: `long` is missing required columns: ", + paste(missing_cols, collapse = ", ")) + } + + parsed <- tryCatch( + parse_variant_id(long$variant_id), + error = function(e) { + stop("build_top_loci_export: parse_variant_id failed: ", + conditionMessage(e)) + } + ) + if (is.null(parsed) || nrow(parsed) != nrow(long)) { + stop("build_top_loci_export: parse_variant_id did not return one row ", + "per input row.") + } + + # Build a key per (variant_id, gene_id, cs) — one row per CS membership + # at the export grain. PIP-only retained variants have cs = 0. + key_grid <- unique(long[, c("variant_id", "gene_id", "cs"), drop = FALSE]) + rownames(key_grid) <- NULL + + coverage_targets <- c(0.95, 0.7, 0.5) + cs_coverage_cols <- paste0("cs_coverage_", coverage_targets) + + pick_first <- function(col, default = NA) { + vapply(seq_len(nrow(key_grid)), function(i) { + sel <- long$variant_id == key_grid$variant_id[i] & + long$gene_id == key_grid$gene_id[i] & + long$cs == key_grid$cs[i] + hit <- long[[col]][sel] + if (length(hit) == 0) default else hit[[1]] + }, default) + } + + cs_coverage_mat <- vapply(seq_len(nrow(key_grid)), function(i) { + vapply(coverage_targets, function(cov) { + sel <- long$variant_id == key_grid$variant_id[i] & + long$gene_id == key_grid$gene_id[i] & + long$cs == key_grid$cs[i] & + long$coverage == cov + if (any(sel)) as.integer(long$cs[sel][[1]]) else 0L + }, integer(1)) + }, integer(length(coverage_targets))) + cs_coverage_mat <- if (is.null(dim(cs_coverage_mat))) { + matrix(cs_coverage_mat, nrow = length(coverage_targets)) + } else cs_coverage_mat + cs_coverage_mat <- t(cs_coverage_mat) + colnames(cs_coverage_mat) <- cs_coverage_cols + + # Resolve per-row variant coordinates using a lookup into parsed. + variant_first_idx <- match(key_grid$variant_id, long$variant_id) + + out <- data.frame( + "#chr" = parsed$chrom[variant_first_idx], + start = parsed$pos[variant_first_idx] - 1L, + end = parsed$pos[variant_first_idx], + a1 = parsed$A1[variant_first_idx], + a2 = parsed$A2[variant_first_idx], + variant_ID = key_grid$variant_id, + gene_ID = key_grid$gene_id, + event_ID = pick_first("event_ID", NA_character_), + "cs_coverage_0.95" = cs_coverage_mat[, "cs_coverage_0.95"], + "cs_coverage_0.7" = cs_coverage_mat[, "cs_coverage_0.7"], + "cs_coverage_0.5" = cs_coverage_mat[, "cs_coverage_0.5"], + cs_purity = pick_first("cs_purity", NA_real_), + PIP = pick_first("pip", NA_real_), + conditional_effect = pick_first("conditional_effect", NA_real_), + conditional_effect_se = pick_first("conditional_effect_se", NA_real_), + analysis_region = pick_first("region", NA_character_), + analysis_variants_number = pick_first("variant_number", NA_integer_), + beta = pick_first("betahat", NA_real_), + se = pick_first("sebetahat", NA_real_), + n = pick_first("n", NA_integer_), + maf = pick_first("maf", NA_real_), + stringsAsFactors = FALSE, + check.names = FALSE + ) + rownames(out) <- NULL + out +} + +.empty_top_loci_export <- function() { + data.frame( + "#chr" = integer(), start = integer(), end = integer(), + a1 = character(), a2 = character(), variant_ID = character(), + gene_ID = character(), event_ID = character(), + "cs_coverage_0.95" = integer(), "cs_coverage_0.7" = integer(), + "cs_coverage_0.5" = integer(), cs_purity = numeric(), PIP = numeric(), + conditional_effect = numeric(), conditional_effect_se = numeric(), + analysis_region = character(), analysis_variants_number = integer(), + beta = numeric(), se = numeric(), n = integer(), maf = numeric(), + stringsAsFactors = FALSE, check.names = FALSE + ) +} + trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { alpha <- .as_effect_matrix(fit$alpha) lbf_variable <- .as_lbf_matrix(fit) diff --git a/tests/testthat/test_susie_wrapper.R b/tests/testthat/test_susie_wrapper.R index 20747008..2b4a2c09 100644 --- a/tests/testthat/test_susie_wrapper.R +++ b/tests/testthat/test_susie_wrapper.R @@ -910,3 +910,187 @@ test_that("top_loci has symmetric CS__ columns even for methods wit expect_true(all(post$top_loci[[col]] == 0L), info = col) } }) + +# ============================================================================ +# Unified top-loci annotated long + build_top_loci_export +# ============================================================================ + +# Helper: build a minimal fake fit + cs_tables that build_top_loci_long accepts. +.make_fake_inputs_for_annotated_long <- function(variant_ids, + cs_membership, + cs_purity_value = 0.85, + n_samples = 100, + n_variants = NULL, + pip = NULL) { + p <- length(variant_ids) + if (is.null(n_variants)) n_variants <- p + if (is.null(pip)) pip <- seq(0.6, 0.9, length.out = p) + alpha <- matrix(0, nrow = 1, ncol = p, + dimnames = list(NULL, variant_ids)) + alpha[1, ] <- pip / sum(pip) + mu <- matrix(0.5, nrow = 1, ncol = p, + dimnames = list(NULL, variant_ids)) + mu2 <- mu^2 + 0.1 + fit <- list(pip = setNames(pip, variant_ids), alpha = alpha, + mu = mu, mu2 = mu2) + purity_df <- data.frame(min.abs.corr = cs_purity_value, + mean.abs.corr = cs_purity_value, + median.abs.corr = cs_purity_value) + cs_table <- list( + sets = list(cs = list(L1 = cs_membership), + cs_index = 1L, + requested_coverage = 0.95, + purity = purity_df), + cs_corr = list(matrix(c(1, cs_purity_value, cs_purity_value, 1), + nrow = 2)), + pip = fit$pip + ) + cs_tables <- list(cs_table) + attr(cs_tables, "coverage") <- 0.95 + X <- matrix(0, nrow = n_samples, ncol = n_variants) + rownames(X) <- paste0("sample", seq_len(n_samples)) + if (n_variants == length(variant_ids)) colnames(X) <- variant_ids + Y <- matrix(0, nrow = n_samples, ncol = 1, + dimnames = list(paste0("sample", seq_len(n_samples)), + "ENSG00000179403")) + list(fit = fit, cs_tables = cs_tables, variant_names = variant_ids, + data_x = X, data_y = Y) +} + +test_that("build_top_loci_long emits annotated columns from fit + cs_tables", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + inp <- .make_fake_inputs_for_annotated_long(variant_ids, + cs_membership = c(1L, 2L)) + long <- pecotmr:::build_top_loci_long( + fit = inp$fit, cs_tables = inp$cs_tables, + variant_names = inp$variant_names, + sumstats = list(betahat = c(0.2, -0.1), + sebetahat = c(0.05, 0.04), + z = c(4, -2.5)), + maf = c(0.10, 0.25), method = "susie", signal_cutoff = 0.05 + ) + expect_true(all(c("conditional_effect", "conditional_effect_se", + "cs_purity") %in% names(long))) + expect_equal(length(unique(long$cs_purity)), 1L) + expect_equal(round(unique(long$cs_purity), 3), 0.85) + expect_true(all(is.finite(long$conditional_effect))) + expect_true(all(long$conditional_effect_se >= 0)) +}) + +test_that("build_top_loci_long writes per-fit constants from data_x/data_y/other_quantities", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + inp <- .make_fake_inputs_for_annotated_long(variant_ids, + cs_membership = c(1L, 2L), + n_samples = 419, + n_variants = 11332) + other_q <- list( + dropped_samples = list(X = character(), y = character()), + region = "chr10:10823338-14348298", + condition_id = "Ast_DeJager_eQTL" + ) + long <- pecotmr:::build_top_loci_long( + fit = inp$fit, cs_tables = inp$cs_tables, + variant_names = inp$variant_names, + sumstats = list(betahat = c(0.2, -0.1), + sebetahat = c(0.05, 0.04), + z = c(4, -2.5)), + maf = c(0.10, 0.25), method = "susie", signal_cutoff = 0.05, + data_x = inp$data_x, data_y = inp$data_y, other_quantities = other_q + ) + expect_equal(unique(long$n), 419L) + expect_equal(unique(long$variant_number), 11332L) + expect_equal(unique(long$gene_id), "ENSG00000179403") + expect_equal(unique(long$region), "chr10:10823338-14348298") + expect_equal(unique(long$event_ID), "Ast_DeJager_eQTL_ENSG00000179403") +}) + +test_that("build_top_loci_long omits region/event_ID columns when caller does not supply them", { + variant_ids <- c("chr1:100:A:G") + inp <- .make_fake_inputs_for_annotated_long(variant_ids, + cs_membership = 1L) + long <- pecotmr:::build_top_loci_long( + fit = inp$fit, cs_tables = inp$cs_tables, + variant_names = inp$variant_names, + sumstats = list(betahat = 0.2, sebetahat = 0.05, z = 4), + maf = 0.10, method = "susie", signal_cutoff = 0.05, + data_x = inp$data_x, data_y = inp$data_y, other_quantities = NULL + ) + expect_false("region" %in% names(long)) + expect_false("event_ID" %in% names(long)) + expect_true("n" %in% names(long)) + expect_true("variant_number" %in% names(long)) + expect_true("gene_id" %in% names(long)) +}) + +test_that("build_top_loci_long stays backward-compatible when data_x/data_y/other_quantities are omitted", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + inp <- .make_fake_inputs_for_annotated_long(variant_ids, + cs_membership = c(1L, 2L)) + long <- pecotmr:::build_top_loci_long( + fit = inp$fit, cs_tables = inp$cs_tables, + variant_names = inp$variant_names, + sumstats = list(betahat = c(0.2, -0.1), + sebetahat = c(0.05, 0.04), + z = c(4, -2.5)), + maf = c(0.10, 0.25), method = "susie", signal_cutoff = 0.05 + ) + # n / variant_number / gene_id columns exist but are NA because no + # data_x / data_y were supplied; region / event_ID are omitted entirely. + expect_true(all(is.na(long$n))) + expect_true(all(is.na(long$variant_number))) + expect_true(all(is.na(long$gene_id))) + expect_false("region" %in% names(long)) + expect_false("event_ID" %in% names(long)) +}) + +test_that("build_top_loci_export errors when required columns are missing", { + bad <- data.frame(variant_id = "chr1:100:A:G", pip = 0.9, + coverage = 0.95, cs = 1L, + stringsAsFactors = FALSE) + expect_error(build_top_loci_export(bad), + "missing required columns") +}) + +test_that("build_top_loci_export returns the fixed 21-column schema on an empty long", { + empty <- pecotmr:::.empty_top_loci_long() + out <- build_top_loci_export(empty) + expected <- c("#chr", "start", "end", "a1", "a2", "variant_ID", "gene_ID", + "event_ID", "cs_coverage_0.95", "cs_coverage_0.7", + "cs_coverage_0.5", "cs_purity", "PIP", "conditional_effect", + "conditional_effect_se", "analysis_region", + "analysis_variants_number", "beta", "se", "n", "maf") + expect_equal(names(out), expected) + expect_equal(nrow(out), 0L) +}) + +test_that("build_top_loci_export projects an annotated long into the fixed compact schema", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + inp <- .make_fake_inputs_for_annotated_long(variant_ids, + cs_membership = c(1L, 2L), + n_samples = 419, + n_variants = 11332) + other_q <- list(region = "chr10:10823338-14348298", + condition_id = "Ast_DeJager_eQTL") + long <- pecotmr:::build_top_loci_long( + fit = inp$fit, cs_tables = inp$cs_tables, + variant_names = inp$variant_names, + sumstats = list(betahat = c(0.2, -0.1), + sebetahat = c(0.05, 0.04), + z = c(4, -2.5)), + maf = c(0.10, 0.25), method = "susie", signal_cutoff = 0.05, + data_x = inp$data_x, data_y = inp$data_y, other_quantities = other_q + ) + out <- build_top_loci_export(long) + expected_cols <- c("#chr", "start", "end", "a1", "a2", "variant_ID", + "gene_ID", "event_ID", "cs_coverage_0.95", + "cs_coverage_0.7", "cs_coverage_0.5", "cs_purity", + "PIP", "conditional_effect", "conditional_effect_se", + "analysis_region", "analysis_variants_number", + "beta", "se", "n", "maf") + expect_equal(names(out), expected_cols) + expect_equal(out$gene_ID[[1]], "ENSG00000179403") + expect_equal(out$event_ID[[1]], "Ast_DeJager_eQTL_ENSG00000179403") + expect_equal(out$analysis_region[[1]], "chr10:10823338-14348298") + expect_equal(out$analysis_variants_number[[1]], 11332L) + expect_equal(out$n[[1]], 419L) +}) From 9b39c36abbf8b5eafaa72fe5cd94e23dc150cdf1 Mon Sep 17 00:00:00 2001 From: xueweic Date: Sat, 30 May 2026 07:59:12 +0000 Subject: [PATCH 04/14] Update documentation --- NAMESPACE | 1 + man/build_top_loci_export.Rd | 36 +++++++++++++++++++++++++++++ man/univariate_analysis_pipeline.Rd | 9 ++++++-- 3 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 man/build_top_loci_export.Rd diff --git a/NAMESPACE b/NAMESPACE index cd43db4f..417b054e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ export(bayes_c_weights) export(bayes_l_weights) export(bayes_n_weights) export(bayes_r_weights) +export(build_top_loci_export) export(check_ld) export(clean_context_names) export(coloc_post_processor) diff --git a/man/build_top_loci_export.Rd b/man/build_top_loci_export.Rd new file mode 100644 index 00000000..171e52c9 --- /dev/null +++ b/man/build_top_loci_export.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/susie_wrapper.R +\name{build_top_loci_export} +\alias{build_top_loci_export} +\title{Build the unified compact top-loci export table.} +\usage{ +build_top_loci_export(long) +} +\arguments{ +\item{long}{An annotated \code{top_loci_long} data frame. Must contain +the columns \code{variant_id}, \code{pip}, \code{coverage}, \code{cs}, +\code{conditional_effect}, \code{conditional_effect_se}, +\code{cs_purity}, \code{gene_id}, \code{n}, \code{variant_number}, +\code{region}, \code{event_ID}, \code{betahat}, \code{sebetahat}, +\code{maf}. Missing any required column raises an explicit error +rather than silently filling \code{NA}.} +} +\value{ +A data frame with the fixed compact-export schema. +} +\description{ +Projects an annotated \code{top_loci_long} (as produced by +\code{postprocess_finemapping_fits()} when called with \code{data_x}, +\code{data_y}, and an \code{other_quantities} list containing +\code{region} and \code{condition_id}) into the fixed-order compact +export schema used by the unified fine-mapping output. +} +\details{ +The output column order is exactly: \code{#chr}, \code{start}, \code{end}, +\code{a1}, \code{a2}, \code{variant_ID}, \code{gene_ID}, \code{event_ID}, +\code{cs_coverage_0.95}, \code{cs_coverage_0.7}, \code{cs_coverage_0.5}, +\code{cs_purity}, \code{PIP}, \code{conditional_effect}, +\code{conditional_effect_se}, \code{analysis_region}, +\code{analysis_variants_number}, \code{beta}, \code{se}, \code{n}, +\code{maf}. +} diff --git a/man/univariate_analysis_pipeline.Rd b/man/univariate_analysis_pipeline.Rd index c6960862..2f54b4b5 100644 --- a/man/univariate_analysis_pipeline.Rd +++ b/man/univariate_analysis_pipeline.Rd @@ -23,6 +23,7 @@ univariate_analysis_pipeline( coverage = c(0.95, 0.7, 0.5), min_abs_corr = 0.8, finemapping_extra_opts = list(refine = TRUE), + fit_susie_inf = TRUE, twas_weights = TRUE, sample_partition = NULL, max_cv_variants = -1, @@ -71,6 +72,10 @@ which is stricter than the susieR default of 0.5.} SuSiE-inf is always fitted with \code{refine = FALSE}; the ordinary SuSiE fit keeps these options and is initialized with \code{model_init}.} +\item{fit_susie_inf}{Whether to fit SuSiE-inf before ordinary SuSiE. Default +is TRUE for existing pipeline compatibility. If FALSE, ordinary SuSiE is +fitted directly and SuSiE-inf fitted objects/results are not returned.} + \item{twas_weights}{Whether to compute TWAS weights. Default is TRUE.} \item{sample_partition}{Optional data frame with Sample and Fold columns for cross-validation. Default is NULL.} @@ -88,6 +93,6 @@ A list containing the univariate analysis results. } \description{ This function performs univariate analysis for fine-mapping and Transcriptome-Wide Association Study (TWAS) -with optional cross-validation. Fine-mapping fits SuSiE-inf first and then -fits SuSiE initialized from the SuSiE-inf result. +with optional cross-validation. By default, fine-mapping fits SuSiE-inf first +and then fits SuSiE initialized from the SuSiE-inf result. } From 622244a5b33e3ca599d2717a52464973b5a626dd Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 30 May 2026 10:02:27 -0500 Subject: [PATCH 05/14] unify top loci export --- R/susie_wrapper.R | 554 +++++++++++++----------- tests/testthat/test_susie_wrapper.R | 648 ++++++++++++++++------------ 2 files changed, 676 insertions(+), 526 deletions(-) diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index 6b492b30..8a731482 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -168,11 +168,13 @@ fit_susie_inf_then_susie <- function(X, y, args = list(), #' @param other_quantities Optional list carried into each method result. #' @param prior_eff_tol Tolerance for retaining effects by prior variance. #' @param min_abs_corr Minimum absolute correlation for credible-set purity. -#' @return A list with \code{finemapping_results}, \code{top_loci_long}, and -#' \code{top_loci}. The long table is lossless, with one row per -#' variant-method-coverage-CS membership. The wide table stores one row per -#' variant, method-specific \code{pip_} columns, method-specific -#' \code{CS__} columns, and \code{model_source}. +#' @return A list with \code{finemapping_results} (per-method post-processed +#' objects, each carrying a trimmed fit and method-specific intermediates) +#' and the single unified \code{top_loci} table in the fixed 22-column +#' shape (see \code{\link{build_top_loci}}). Per-method contributions are +#' produced by \code{build_top_loci()} once per method and row-bound into +#' this single \code{top_loci}. There is no separately exposed +#' \code{top_loci_long} or wide-format \code{top_loci}. #' @export postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, X_scalar = 1, y_scalar = 1, @@ -188,6 +190,9 @@ postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, stop("fits must be a named list; names define method identity.") } + # One method for-loop: each method calls build_top_loci() once per fit; the + # per-method 22-column contributions are row-bound below into the single + # final `top_loci` table. There is no separately exposed long or wide table. posts <- lapply(names(fits), function(method) { fit <- .set_finemapping_fit_class(fits[[method]], method) postprocess_finemapping_fit( @@ -200,16 +205,21 @@ postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, }) names(posts) <- names(fits) - top_loci_long <- bind_rows(lapply(posts, function(x) x$top_loci_long)) + per_method <- lapply(posts, function(x) x$top_loci) + per_method <- per_method[!vapply(per_method, is.null, logical(1))] + top_loci <- if (length(per_method) == 0L) { + .empty_top_loci() + } else { + do.call(rbind, per_method) + } + rownames(top_loci) <- NULL posts <- lapply(posts, function(x) { - x$top_loci_long <- NULL + x$top_loci <- NULL x }) - top_loci <- build_top_loci_wide(top_loci_long, posts) list( finemapping_results = posts, - top_loci_long = if (nrow(top_loci_long) > 0) top_loci_long else NULL, top_loci = top_loci ) } @@ -261,7 +271,7 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { secondary_coverage = secondary_coverage, method = method, cs_input = cs_input, min_abs_corr = min_abs_corr ) - top_loci_long <- build_top_loci_long( + top_loci <- build_top_loci( fit, cs_tables, variant_names = variant_names, sumstats = sumstats, maf = maf, method = method, signal_cutoff = signal_cutoff, data_x = data_x, data_y = data_y, other_quantities = other_quantities @@ -269,15 +279,18 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { trimmed <- trim_finemapping_fit(fit, effect_idx, method, cs_tables) - # Build FineMappingResult S4 object + # Build FineMappingResult S4 object. The S4 contract (validity check, + # vcf_writer, getPIP, getCS) still expects `variant_id`, `pip`, and an + # integer `cs` column on the slot. To avoid rippling renames into + # AllClasses / AllMethods / vcf_writer for this change, we project the + # new 22-column `top_loci` into the legacy slot shape here, in + # susie_wrapper only. The wrapper-facing `top_loci` returned to callers + # is unchanged. + s4_top_loci <- .top_loci_for_s4_slot(top_loci) fm_result <- FineMappingResult( variant_names = variant_names, trimmed_fit = trimmed, - top_loci = if (is.null(top_loci_long) || nrow(top_loci_long) == 0) { - data.frame(variant_id = character(0), method = character(0)) - } else { - top_loci_long - }, + top_loci = s4_top_loci, method = method, sumstats = sumstats ) @@ -286,7 +299,7 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { res <- list( variant_names = variant_names, result_trimmed = trimmed, - top_loci_long = top_loci_long, + top_loci = top_loci, finemapping_result = fm_result ) if (!is.null(sumstats)) res$sumstats <- sumstats @@ -408,25 +421,94 @@ compute_cs_table <- function(fit, data_x, coverage, cs_input = c("X", "Xcorr", " out } -build_top_loci_long <- function(fit, cs_tables, variant_names, sumstats = NULL, - maf = NULL, method, signal_cutoff = 0.1, - data_x = NULL, data_y = NULL, - other_quantities = NULL) { - if (length(cs_tables) == 0) return(.empty_top_loci_long()) +#' Build the unified single top-loci table for one fit and one method. +#' +#' Produces the per-fit, per-method contribution to the unified \code{top_loci} +#' table in the fixed 22-column shape. The outer +#' \code{postprocess_finemapping_fits()} loop calls this once per method per fit +#' and row-binds the results into the single final \code{top_loci} table that is +#' exposed by \code{format_finemapping_output()}. +#' +#' This function replaces the previous \code{build_top_loci_long()}, +#' \code{build_top_loci_wide()}, and \code{build_top_loci_export()} trio. +#' There is no separately exposed long or wide table. +#' +#' The output column order is exactly (22 columns): +#' \code{#chr}, \code{start}, \code{end}, \code{a1}, \code{a2}, +#' \code{variant}, \code{gene}, \code{event}, +#' \code{n}, \code{maf}, \code{beta}, \code{se}, +#' \code{pip}, \code{posterior_effect_mean}, \code{posterior_effect_se}, +#' \code{cs_95}, \code{cs_70}, \code{cs_50}, \code{cs_95_purity}, +#' \code{method}, \code{grange_start}, \code{grange_end}. +#' +#' The \code{cs_95}, \code{cs_70}, \code{cs_50} columns are character strings of +#' the form \code{"_"} where each method numbers its credible +#' sets independently starting at 1. Variants retained by the PIP cutoff but not +#' assigned to any credible set at the given coverage use \code{"_0"}. +#' \code{cs_95_purity} is the 0.95-coverage credible-set purity for the row's +#' \code{(method, cs_95)}; rows whose \code{cs_95} is \code{"_0"} carry +#' \code{cs_95_purity = 0}. +#' +#' Row uniqueness inside this function's output is one row per +#' \code{(variant, gene, cs_membership)} at the given \code{method}. Overlapping +#' credible-set membership for the same method produces one row per CS, so the +#' overlapping-CS contract is preserved. +#' +#' @param fit A fitted SuSiE-family object (must expose \code{alpha}, +#' \code{mu}, \code{mu2}, \code{pip}). +#' @param cs_tables A list of CS tables (one per coverage), as produced by +#' \code{compute_cs_tables()}. +#' @param variant_names Character vector of variant IDs in +#' \code{chr:pos:A2:A1} form, length equal to the number of variants in the +#' fit. Used to construct \code{variant}, \code{#chr}, \code{start}, +#' \code{end}, \code{a1}, \code{a2}. +#' @param sumstats Optional marginal-association summary statistics +#' (\code{betahat}, \code{sebetahat}) used to fill \code{beta} and \code{se}. +#' @param maf Optional numeric vector of minor-allele frequencies. +#' @param method Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Used +#' to construct the per-method \code{"_"} strings and the +#' \code{method} column. Required. +#' @param signal_cutoff PIP cutoff for retaining PIP-only (non-CS) variants. +#' @param data_x Optional regional genotype matrix; used only for sample-count +#' shape checks. +#' @param data_y Optional regional phenotype matrix; \code{nrow(data_y)} fills +#' \code{n}, \code{colnames(data_y)[1]} fills \code{gene}. +#' @param other_quantities Optional list with reserved subfields +#' \code{region} (e.g. \code{"chr1:100-200"}) and \code{condition_id} +#' (e.g. \code{"Ast_DeJager_eQTL"}); used to fill \code{grange_start}, +#' \code{grange_end}, and to compose \code{event} as +#' \code{paste(condition_id, gene, sep = "_")}. Missing subfields are +#' filled with \code{NA} rather than dropped. +#' @return A data frame in the fixed 22-column unified \code{top_loci} shape +#' for this one fit and one method. Returns an empty data frame with the +#' correct columns and dtypes if there is nothing to retain. +#' @export +build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, + maf = NULL, method, signal_cutoff = 0.1, + data_x = NULL, data_y = NULL, + other_quantities = NULL) { + if (missing(method) || is.null(method) || + length(method) != 1L || is.na(method) || !nzchar(method)) { + stop("build_top_loci: `method` is required (e.g. \"susie\", \"susie_inf\").") + } + if (length(cs_tables) == 0) return(.empty_top_loci()) coverage_values <- attr(cs_tables, "coverage") + if (is.null(coverage_values)) coverage_values <- rep(NA_real_, length(cs_tables)) - # Per-fit constants (broadcast to every row of this fit's slice). + # Per-fit constants. data_y_mat <- if (!is.null(data_y)) as.matrix(data_y) else NULL - fit_n <- if (is.null(data_y_mat)) NA_integer_ else nrow(data_y_mat) - fit_variant_number <- if (!is.null(data_x)) ncol(data_x) else NA_integer_ - fit_gene_id <- if (!is.null(data_y_mat) && !is.null(colnames(data_y_mat))) { + fit_n <- if (is.null(data_y_mat)) NA_integer_ else as.integer(nrow(data_y_mat)) + fit_gene <- if (!is.null(data_y_mat) && !is.null(colnames(data_y_mat))) { colnames(data_y_mat)[1] } else NA_character_ - fit_region <- other_quantities$region - fit_event_id <- if (!is.null(other_quantities$condition_id) && - !is.na(fit_gene_id) && nzchar(fit_gene_id)) { - paste(other_quantities$condition_id, fit_gene_id, sep = "_") - } else NULL + fit_condition_id <- other_quantities$condition_id + fit_event <- if (!is.null(fit_condition_id) && + !is.na(fit_gene) && nzchar(fit_gene)) { + paste(fit_condition_id, fit_gene, sep = "_") + } else NA_character_ + grange_se <- .parse_grange(other_quantities$region) + fit_grange_start <- grange_se[["start"]] + fit_grange_end <- grange_se[["end"]] # Per-variant posterior effect and SE, computed once for all variants. alpha <- as.matrix(fit$alpha) @@ -439,255 +521,220 @@ build_top_loci_long <- function(fit, cs_tables, variant_names, sumstats = NULL, sqrt(pmax(colSums(alpha * mu2) - posterior_effect^2, 0)) } else rep(NA_real_, length(variant_names)) - rows <- lapply(seq_along(cs_tables), function(i) { - cs_table <- cs_tables[[i]] - cov <- coverage_values[[i]] - top_variants_idx <- get_top_variants_idx(cs_table, signal_cutoff) - cs_info <- get_cs_info(cs_table$sets$cs, top_variants_idx) - if (is.null(cs_info) || nrow(cs_info) == 0) return(NULL) - idx <- cs_info$variant_idx - optional_cols <- .top_loci_optional_columns(idx, sumstats, maf) - - # cs_purity: prefer susieR's sets$purity$min.abs.corr; fall back to - # cs_corr when purity is unavailable; PIP-only retained rows get 0. - sets_purity <- cs_table$sets$purity - cs_purity_per_cs <- if (!is.null(sets_purity) && - "min.abs.corr" %in% names(sets_purity)) { + # Per-coverage credible-set purity vectors, indexed by 1-based CS index. + purity_per_cov <- lapply(cs_tables, function(ct) { + sets_purity <- ct$sets$purity + if (!is.null(sets_purity) && + "min.abs.corr" %in% names(sets_purity)) { as.numeric(sets_purity$min.abs.corr) - } else if (!is.null(cs_table$cs_corr)) { - vapply(seq_along(cs_table$cs_corr), function(j) { - m <- cs_table$cs_corr[[j]] + } else if (!is.null(ct$cs_corr)) { + vapply(seq_along(ct$cs_corr), function(j) { + m <- ct$cs_corr[[j]] if (is.null(m)) return(NA_real_) if (!is.matrix(m) || nrow(m) <= 1) return(1) min(abs(m[upper.tri(m)])) }, numeric(1)) } else { - rep(NA_real_, length(cs_table$sets$cs)) + rep(NA_real_, length(ct$sets$cs)) } - cs_purity_per_row <- vapply(cs_info$cs_idx, function(cs_i) { - if (is.na(cs_i) || cs_i == 0L) return(0) - if (cs_i > length(cs_purity_per_cs)) return(NA_real_) - cs_purity_per_cs[cs_i] - }, numeric(1)) - - base <- data.frame( - variant_id = variant_names[idx], - method = method, - coverage = cov, - cs = as.integer(cs_info$cs_idx), - pip = as.numeric(fit$pip[idx]), - conditional_effect = posterior_effect[idx], - conditional_effect_se = posterior_effect_se[idx], - cs_purity = cs_purity_per_row, - stringsAsFactors = FALSE - ) - # Per-fit constants broadcast to every row. - base$n <- fit_n - base$variant_number <- fit_variant_number - base$gene_id <- fit_gene_id - if (!is.null(fit_region)) base$region <- fit_region - if (!is.null(fit_event_id)) base$event_ID <- fit_event_id - - if (ncol(optional_cols) > 0) cbind(base, optional_cols) else base }) - out <- bind_rows(rows) - if (nrow(out) == 0) .empty_top_loci_long() else out -} -.empty_top_loci_long <- function() { - data.frame( - variant_id = character(), method = character(), coverage = numeric(), - cs = integer(), pip = numeric(), - conditional_effect = numeric(), conditional_effect_se = numeric(), - cs_purity = numeric(), - n = integer(), variant_number = integer(), - gene_id = character(), - stringsAsFactors = FALSE - ) -} - -.top_loci_optional_columns <- function(idx, sumstats = NULL, maf = NULL) { - optional_cols <- list( - betahat = if (!is.null(sumstats$betahat)) sumstats$betahat[idx] else NULL, - sebetahat = if (!is.null(sumstats$sebetahat)) sumstats$sebetahat[idx] else NULL, - z = if (!is.null(sumstats$z)) sumstats$z[idx] else NULL, - maf = if (!is.null(maf)) maf[idx] else NULL - ) - as.data.frame(Filter(Negate(is.null), optional_cols)) -} - -build_top_loci_wide <- function(top_loci_long, posts) { - if (is.null(top_loci_long) || nrow(top_loci_long) == 0) return(NULL) - ids <- unique(top_loci_long$variant_id) - out <- data.frame(variant_id = ids, stringsAsFactors = FALSE) - for (column in c("betahat", "sebetahat", "z", "maf")) { - if (column %in% names(top_loci_long)) { - out[[column]] <- vapply(ids, function(id) { - values <- top_loci_long[[column]][top_loci_long$variant_id == id] - values <- values[!is.na(values)] - if (length(values) == 0) NA_real_ else values[[1]] - }, numeric(1)) - } + # Internal long-shaped collection: one row per + # (variant_idx, cs_idx_at_this_coverage, coverage). Not exposed. Used only + # to project to the 22-column shape below. + long_rows <- list() + for (i in seq_along(cs_tables)) { + ct <- cs_tables[[i]] + top_variants_idx <- get_top_variants_idx(ct, signal_cutoff) + cs_info <- get_cs_info(ct$sets$cs, top_variants_idx) + if (is.null(cs_info) || nrow(cs_info) == 0) next + long_rows[[length(long_rows) + 1L]] <- data.frame( + variant_idx = as.integer(cs_info$variant_idx), + cs_idx = as.integer(cs_info$cs_idx), + coverage = as.numeric(coverage_values[[i]]), + stringsAsFactors = FALSE + ) } + if (length(long_rows) == 0) return(.empty_top_loci()) + long_df <- do.call(rbind, long_rows) + if (nrow(long_df) == 0) return(.empty_top_loci()) - methods <- names(posts) - for (method in methods) { - post <- posts[[method]] - pip_col <- format_pip_column(method) - pip <- post$result_trimmed$pip - names(pip) <- post$variant_names - out[[pip_col]] <- as.numeric(pip[ids]) - - method_rows <- top_loci_long[top_loci_long$method == method, , drop = FALSE] - for (cov in sort(unique(top_loci_long$coverage), decreasing = TRUE)) { - cs_col <- format_cs_column(cov, method) - out[[cs_col]] <- vapply(ids, function(id) { - cs <- method_rows$cs[method_rows$variant_id == id & method_rows$coverage == cov] - if (length(cs) == 0) return(0L) - min(cs) - }, integer(1)) - } + # Key grid: one row per (variant_idx, cs_idx) at the export grain. Preserves + # overlapping CS membership for the same method. + key_grid <- unique(long_df[, c("variant_idx", "cs_idx"), drop = FALSE]) + rownames(key_grid) <- NULL + n_keys <- nrow(key_grid) + + # Helper: did this (variant_idx, cs_idx) appear at this coverage? If yes, + # return cs_idx; otherwise 0. + lookup_cs_at_cov <- function(v_idx, c_idx, cov) { + sel <- long_df$variant_idx == v_idx & + long_df$cs_idx == c_idx & + abs(long_df$coverage - cov) < 1e-12 + if (any(sel)) as.integer(c_idx) else 0L } + cov95_table_idx <- which(abs(coverage_values - 0.95) < 1e-12) - out$model_source <- vapply(ids, function(id) { - selected_methods <- unique(top_loci_long$method[top_loci_long$variant_id == id]) - paste(selected_methods[selected_methods %in% methods], collapse = ";") - }, character(1)) - rownames(out) <- NULL - out -} - -#' Build the unified compact top-loci export table. -#' -#' Projects an annotated \code{top_loci_long} (as produced by -#' \code{postprocess_finemapping_fits()} when called with \code{data_x}, -#' \code{data_y}, and an \code{other_quantities} list containing -#' \code{region} and \code{condition_id}) into the fixed-order compact -#' export schema used by the unified fine-mapping output. -#' -#' The output column order is exactly: \code{#chr}, \code{start}, \code{end}, -#' \code{a1}, \code{a2}, \code{variant_ID}, \code{gene_ID}, \code{event_ID}, -#' \code{cs_coverage_0.95}, \code{cs_coverage_0.7}, \code{cs_coverage_0.5}, -#' \code{cs_purity}, \code{PIP}, \code{conditional_effect}, -#' \code{conditional_effect_se}, \code{analysis_region}, -#' \code{analysis_variants_number}, \code{beta}, \code{se}, \code{n}, -#' \code{maf}. -#' -#' @param long An annotated \code{top_loci_long} data frame. Must contain -#' the columns \code{variant_id}, \code{pip}, \code{coverage}, \code{cs}, -#' \code{conditional_effect}, \code{conditional_effect_se}, -#' \code{cs_purity}, \code{gene_id}, \code{n}, \code{variant_number}, -#' \code{region}, \code{event_ID}, \code{betahat}, \code{sebetahat}, -#' \code{maf}. Missing any required column raises an explicit error -#' rather than silently filling \code{NA}. -#' @return A data frame with the fixed compact-export schema. -#' @export -build_top_loci_export <- function(long) { - required <- c("variant_id", "pip", "coverage", "cs", - "conditional_effect", "conditional_effect_se", "cs_purity", - "gene_id", "n", "variant_number", "region", "event_ID", - "betahat", "sebetahat", "maf") - if (is.null(long) || !is.data.frame(long)) { - stop("build_top_loci_export: `long` must be a data frame.") + format_cs_string <- function(idx) { + if (is.na(idx) || idx <= 0L) paste0(method, "_0") else paste0(method, "_", idx) } - if (nrow(long) == 0) return(.empty_top_loci_export()) - missing_cols <- setdiff(required, names(long)) - if (length(missing_cols) > 0) { - stop("build_top_loci_export: `long` is missing required columns: ", - paste(missing_cols, collapse = ", ")) + + cs_95 <- character(n_keys) + cs_70 <- character(n_keys) + cs_50 <- character(n_keys) + cs_95_purity <- numeric(n_keys) + for (k in seq_len(n_keys)) { + v_idx <- key_grid$variant_idx[k] + c_idx <- key_grid$cs_idx[k] + cs95_idx <- lookup_cs_at_cov(v_idx, c_idx, 0.95) + cs70_idx <- lookup_cs_at_cov(v_idx, c_idx, 0.70) + cs50_idx <- lookup_cs_at_cov(v_idx, c_idx, 0.50) + cs_95[k] <- format_cs_string(cs95_idx) + cs_70[k] <- format_cs_string(cs70_idx) + cs_50[k] <- format_cs_string(cs50_idx) + if (cs95_idx > 0L && length(cov95_table_idx) > 0L) { + pvec <- purity_per_cov[[cov95_table_idx[1]]] + cs_95_purity[k] <- if (cs95_idx <= length(pvec)) { + val <- pvec[cs95_idx] + if (is.na(val)) 0 else as.numeric(val) + } else 0 + } else { + cs_95_purity[k] <- 0 + } } + # Per-variant lookups indexed by the row's variant_idx. + v_idx_vec <- key_grid$variant_idx + variant_id_vec <- variant_names[v_idx_vec] + pip_vec <- as.numeric(fit$pip[v_idx_vec]) + post_mean_vec <- posterior_effect[v_idx_vec] + post_se_vec <- posterior_effect_se[v_idx_vec] + beta_vec <- if (!is.null(sumstats$betahat)) sumstats$betahat[v_idx_vec] else rep(NA_real_, n_keys) + se_vec <- if (!is.null(sumstats$sebetahat)) sumstats$sebetahat[v_idx_vec] else rep(NA_real_, n_keys) + maf_vec <- if (!is.null(maf)) maf[v_idx_vec] else rep(NA_real_, n_keys) + parsed <- tryCatch( - parse_variant_id(long$variant_id), + suppressWarnings(parse_variant_id(variant_id_vec)), error = function(e) { - stop("build_top_loci_export: parse_variant_id failed: ", - conditionMessage(e)) + stop("build_top_loci: parse_variant_id failed: ", conditionMessage(e)) } ) - if (is.null(parsed) || nrow(parsed) != nrow(long)) { - stop("build_top_loci_export: parse_variant_id did not return one row ", - "per input row.") + if (is.null(parsed) || nrow(parsed) != length(variant_id_vec)) { + stop("build_top_loci: parse_variant_id did not return one row per variant.") + } + invalid <- is.na(parsed$chrom) | is.na(parsed$pos) | + is.na(parsed$A1) | !nzchar(parsed$A1) | + is.na(parsed$A2) | !nzchar(parsed$A2) + if (any(invalid)) { + first_bad <- variant_id_vec[which(invalid)[[1]]] + stop("build_top_loci: parse_variant_id produced invalid coordinates ", + "for variant_id: ", first_bad) } - - # Build a key per (variant_id, gene_id, cs) — one row per CS membership - # at the export grain. PIP-only retained variants have cs = 0. - key_grid <- unique(long[, c("variant_id", "gene_id", "cs"), drop = FALSE]) - rownames(key_grid) <- NULL - - coverage_targets <- c(0.95, 0.7, 0.5) - cs_coverage_cols <- paste0("cs_coverage_", coverage_targets) - - pick_first <- function(col, default = NA) { - vapply(seq_len(nrow(key_grid)), function(i) { - sel <- long$variant_id == key_grid$variant_id[i] & - long$gene_id == key_grid$gene_id[i] & - long$cs == key_grid$cs[i] - hit <- long[[col]][sel] - if (length(hit) == 0) default else hit[[1]] - }, default) - } - - cs_coverage_mat <- vapply(seq_len(nrow(key_grid)), function(i) { - vapply(coverage_targets, function(cov) { - sel <- long$variant_id == key_grid$variant_id[i] & - long$gene_id == key_grid$gene_id[i] & - long$cs == key_grid$cs[i] & - long$coverage == cov - if (any(sel)) as.integer(long$cs[sel][[1]]) else 0L - }, integer(1)) - }, integer(length(coverage_targets))) - cs_coverage_mat <- if (is.null(dim(cs_coverage_mat))) { - matrix(cs_coverage_mat, nrow = length(coverage_targets)) - } else cs_coverage_mat - cs_coverage_mat <- t(cs_coverage_mat) - colnames(cs_coverage_mat) <- cs_coverage_cols - - # Resolve per-row variant coordinates using a lookup into parsed. - variant_first_idx <- match(key_grid$variant_id, long$variant_id) out <- data.frame( - "#chr" = parsed$chrom[variant_first_idx], - start = parsed$pos[variant_first_idx] - 1L, - end = parsed$pos[variant_first_idx], - a1 = parsed$A1[variant_first_idx], - a2 = parsed$A2[variant_first_idx], - variant_ID = key_grid$variant_id, - gene_ID = key_grid$gene_id, - event_ID = pick_first("event_ID", NA_character_), - "cs_coverage_0.95" = cs_coverage_mat[, "cs_coverage_0.95"], - "cs_coverage_0.7" = cs_coverage_mat[, "cs_coverage_0.7"], - "cs_coverage_0.5" = cs_coverage_mat[, "cs_coverage_0.5"], - cs_purity = pick_first("cs_purity", NA_real_), - PIP = pick_first("pip", NA_real_), - conditional_effect = pick_first("conditional_effect", NA_real_), - conditional_effect_se = pick_first("conditional_effect_se", NA_real_), - analysis_region = pick_first("region", NA_character_), - analysis_variants_number = pick_first("variant_number", NA_integer_), - beta = pick_first("betahat", NA_real_), - se = pick_first("sebetahat", NA_real_), - n = pick_first("n", NA_integer_), - maf = pick_first("maf", NA_real_), - stringsAsFactors = FALSE, - check.names = FALSE + "#chr" = parsed$chrom, + start = as.integer(parsed$pos) - 1L, + end = as.integer(parsed$pos), + a1 = parsed$A1, + a2 = parsed$A2, + variant = variant_id_vec, + gene = rep(fit_gene, n_keys), + event = rep(fit_event, n_keys), + n = rep(fit_n, n_keys), + maf = maf_vec, + beta = beta_vec, + se = se_vec, + pip = pip_vec, + posterior_effect_mean = post_mean_vec, + posterior_effect_se = post_se_vec, + cs_95 = cs_95, + cs_70 = cs_70, + cs_50 = cs_50, + cs_95_purity = cs_95_purity, + method = rep(method, n_keys), + grange_start = rep(fit_grange_start, n_keys), + grange_end = rep(fit_grange_end, n_keys), + stringsAsFactors = FALSE, + check.names = FALSE ) rownames(out) <- NULL out } -.empty_top_loci_export <- function() { +.empty_top_loci <- function() { data.frame( - "#chr" = integer(), start = integer(), end = integer(), - a1 = character(), a2 = character(), variant_ID = character(), - gene_ID = character(), event_ID = character(), - "cs_coverage_0.95" = integer(), "cs_coverage_0.7" = integer(), - "cs_coverage_0.5" = integer(), cs_purity = numeric(), PIP = numeric(), - conditional_effect = numeric(), conditional_effect_se = numeric(), - analysis_region = character(), analysis_variants_number = integer(), - beta = numeric(), se = numeric(), n = integer(), maf = numeric(), - stringsAsFactors = FALSE, check.names = FALSE + "#chr" = character(), + start = integer(), + end = integer(), + a1 = character(), + a2 = character(), + variant = character(), + gene = character(), + event = character(), + n = integer(), + maf = numeric(), + beta = numeric(), + se = numeric(), + pip = numeric(), + posterior_effect_mean = numeric(), + posterior_effect_se = numeric(), + cs_95 = character(), + cs_70 = character(), + cs_50 = character(), + cs_95_purity = numeric(), + method = character(), + grange_start = integer(), + grange_end = integer(), + stringsAsFactors = FALSE, + check.names = FALSE ) } +.parse_grange <- function(region_str) { + if (is.null(region_str) || length(region_str) == 0L || + is.na(region_str) || !nzchar(as.character(region_str))) { + return(c(start = NA_integer_, end = NA_integer_)) + } + pr <- tryCatch(parse_region(as.character(region_str)), + error = function(e) NULL) + if (is.null(pr) || !is.data.frame(pr)) { + return(c(start = NA_integer_, end = NA_integer_)) + } + c(start = as.integer(pr$start), end = as.integer(pr$end)) +} + +# Project the new 22-column `top_loci` into the legacy shape expected by the +# FineMappingResult S4 slot, vcf_writer, getPIP, and getCS. We add backward- +# compatible aliases without renaming any column in the wrapper-facing +# `top_loci`: +# +# * `variant_id` — copy of `variant` +# * `cs` — integer credible-set index derived from `cs_95` strings of +# the form `_` (PIP-only `_0` -> 0L) +# +# This isolates the schema change to susie_wrapper.R so AllClasses.R, +# AllMethods.R, and vcf_writer.R do not have to change. +.top_loci_for_s4_slot <- function(top_loci) { + if (is.null(top_loci) || nrow(top_loci) == 0) { + return(data.frame(variant_id = character(0), + method = character(0), + stringsAsFactors = FALSE)) + } + out <- top_loci + if ("variant" %in% names(out) && !"variant_id" %in% names(out)) { + out$variant_id <- out$variant + } + if ("cs_95" %in% names(out) && !"cs" %in% names(out)) { + out$cs <- vapply(out$cs_95, function(s) { + if (is.na(s) || !nzchar(s)) return(0L) + tail_str <- sub("^.*_", "", s) + suppressWarnings(as.integer(tail_str)) + }, integer(1)) + out$cs[is.na(out$cs)] <- 0L + } + out +} + trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { alpha <- .as_effect_matrix(fit$alpha) lbf_variable <- .as_lbf_matrix(fit) @@ -738,24 +785,27 @@ trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { #' Format Fine-mapping Post-processing for Protocol Output #' #' Converts method-aware fine-mapping post-processing output into the root-level -#' fields consumed by protocol RDS files. +#' fields consumed by protocol RDS files. The single top-loci output is the +#' \code{top_loci} field (the 22-column unified table); there is no +#' \code{top_loci_long}, no wide-format \code{top_loci}, and no +#' \code{top_loci_export}. #' #' @param post Output from \code{\link{postprocess_finemapping_fits}}. #' @param primary_method Method whose result should populate root-level fields. #' @return A list with root-level fields including \code{variant_names}, -#' \code{susie_result_trimmed}, \code{top_loci_long}, and \code{top_loci}. +#' \code{susie_result_trimmed}, and the single unified \code{top_loci} +#' 22-column table. #' @export format_finemapping_output <- function(post, primary_method) { method_post <- post$finemapping_results[[primary_method]] if (is.null(method_post)) { stop("primary_method was not found in finemapping_results: ", primary_method) } - keep_names <- setdiff(names(method_post), c("result_trimmed", "top_loci_long")) + keep_names <- setdiff(names(method_post), c("result_trimmed", "top_loci")) c( method_post[keep_names], list( susie_result_trimmed = method_post$result_trimmed, - top_loci_long = post$top_loci_long, top_loci = post$top_loci ) ) diff --git a/tests/testthat/test_susie_wrapper.R b/tests/testthat/test_susie_wrapper.R index 2b4a2c09..fd9eb70c 100644 --- a/tests/testthat/test_susie_wrapper.R +++ b/tests/testthat/test_susie_wrapper.R @@ -744,27 +744,6 @@ test_that("adjust_susie_weights run_allele_qc=TRUE auto-prepends chrom/pos/A2/A1 expect_true(length(out$adjusted_susie_weights) > 0) }) -test_that("format_finemapping_output does not duplicate top loci variants", { - top_loci <- data.frame( - variant_id = paste0("v", 1:4), - CS_95_susie = c(0L, 1L, NA_integer_, 0L), - pip_susie = c(0.2, 0.005, 0.001, 0), - stringsAsFactors = FALSE - ) - post <- list( - finemapping_results = list(susie = list( - variant_names = paste0("v", 1:4), - result_trimmed = list(pip = 1:4), - top_loci_long = NULL - )), - top_loci_long = NULL, - top_loci = top_loci - ) - out <- format_finemapping_output(post, "susie") - expect_false("top_loci_variants" %in% names(out)) - expect_equal(unique(out$top_loci$variant_id), paste0("v", 1:4)) -}) - .make_univariate_data <- function(seed = 42, n = 300, p = 50, effect_idx = integer(0), effect_size = NULL) { set.seed(seed) @@ -779,83 +758,6 @@ test_that("format_finemapping_output does not duplicate top loci variants", { list(X = X, y = y) } -test_that("postprocess top_loci: single susie with signal yields per-method columns and no unsuffixed pip", { - d <- .make_univariate_data(seed = 11, effect_idx = c(10, 30)) - fit <- susieR::susie(d$X, d$y, L = 5) - post <- postprocess_finemapping_fits(list(susie = fit), data_x = d$X, data_y = d$y, coverage = 0.95) - expect_false("pip" %in% colnames(post$top_loci)) - expect_true("pip_susie" %in% colnames(post$top_loci)) - expect_true("CS_95_susie" %in% colnames(post$top_loci)) - # No other method's columns - expect_length(grep("_susie_inf$", colnames(post$top_loci)), 0) - expect_length(grep("_mvsusie$", colnames(post$top_loci)), 0) -}) - -test_that("postprocess top_loci: susie + susie_inf both with signal yield symmetric pip/CS columns", { - d <- .make_univariate_data(seed = 12, effect_idx = c(15, 45)) - fits <- fit_susie_inf_then_susie(d$X, d$y) - expect_gt(max(fits$susie$pip), 0.5) - expect_gt(max(fits$susie_inf$pip), 0.5) - post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, coverage = 0.95) - cols <- colnames(post$top_loci) - expect_false("pip" %in% cols) - for (m in c("susie", "susie_inf")) { - expect_true(paste0("pip_", m) %in% cols, info = m) - expect_true(paste0("CS_95_", m) %in% cols, info = m) - expect_true(paste0("CS_70_", m) %in% cols, info = m) - expect_true(paste0("CS_50_", m) %in% cols, info = m) - } - # top_loci_long contains both methods - expect_setequal(unique(post$top_loci_long$method), c("susie", "susie_inf")) -}) - -test_that("postprocess top_loci: when one method has no rows in long, its pip column still appears", { - # Construct a case where susie has signal but the susie_inf trim gives no CS rows. - # We use a high signal_cutoff so susie_inf may not produce top variants. - d <- .make_univariate_data(seed = 13, effect_idx = c(20)) - fits <- fit_susie_inf_then_susie(d$X, d$y) - post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, - coverage = 0.95, signal_cutoff = 0.99) - cols <- colnames(post$top_loci) - # Both pip columns are present because they come from result_trimmed$pip, - # independent of whether the method contributed rows to top_loci_long. - expect_true("pip_susie" %in% cols) - expect_true("pip_susie_inf" %in% cols) - expect_false("pip" %in% cols) -}) - -test_that("postprocess top_loci: susie + susie_inf + mvsusie all run through trim without dimension errors", { - skip_if_not_installed("mvsusieR") - d <- .make_univariate_data(seed = 14, effect_idx = c(15, 45)) - Y_mv <- cbind(d$y, as.numeric(d$X %*% rep(0, ncol(d$X))) + rnorm(length(d$y), sd = 0.5)) - colnames(Y_mv) <- c("ctx1", "ctx2") - fits_uni <- fit_susie_inf_then_susie(d$X, d$y) - mv_fit <- mvsusieR::mvsusie(d$X, Y_mv, L = 5, - prior_variance = mvsusieR::create_mixture_prior(R = ncol(Y_mv)), - prior_weights = rep(1 / ncol(d$X), ncol(d$X))) - fits_all <- list(susie = fits_uni$susie, - susie_inf = fits_uni$susie_inf, - mvsusie = mv_fit) - # Bug B regression: this used to error with "incorrect number of dimensions" - # because trim_finemapping_fit indexed mu2 as 2-D regardless of mvsusie's 3-D. - expect_no_error( - post <- postprocess_finemapping_fits(fits_all, data_x = d$X, data_y = Y_mv, coverage = 0.95) - ) - for (m in c("susie", "susie_inf", "mvsusie")) { - expect_true(paste0("pip_", m) %in% colnames(post$top_loci), info = m) - } - expect_false("pip" %in% colnames(post$top_loci)) -}) - -test_that("format_finemapping_output no longer copies pip_ into an unsuffixed pip column", { - d <- .make_univariate_data(seed = 15, effect_idx = c(20, 40)) - fits <- fit_susie_inf_then_susie(d$X, d$y) - post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, coverage = 0.95) - out <- format_finemapping_output(post, primary_method = "susie") - expect_false("pip" %in% colnames(out$top_loci)) - expect_true("pip_susie" %in% colnames(out$top_loci)) -}) - test_that(".translate_legacy_top_loci_cs_columns renames pip_susie -> pip for legacy callers", { new_format <- data.frame( variant_id = c("v1", "v2"), @@ -890,207 +792,405 @@ test_that(".translate_legacy_top_loci_cs_columns leaves existing pip column alon }) -test_that("top_loci has symmetric CS__ columns even for methods with no surviving CSes", { - d <- .make_univariate_data(seed = 16, effect_idx = c(25)) - fits <- fit_susie_inf_then_susie(d$X, d$y) - # Force one method (susie_inf) to have no surviving effects: zero out V so - # select_effects drops everything. - fits$susie_inf$V <- rep(0, length(fits$susie_inf$V)) - post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, - coverage = 0.95, secondary_coverage = c(0.7, 0.5)) - cols <- colnames(post$top_loci) - for (m in c("susie", "susie_inf")) { - for (cov_lbl in c("CS_95_", "CS_70_", "CS_50_")) { - expect_true(paste0(cov_lbl, m) %in% cols, - info = paste("missing", paste0(cov_lbl, m))) - } - } - # susie_inf had no surviving effects -> all CS_*_susie_inf are 0 - for (col in grep("^CS_.*_susie_inf$", cols, value = TRUE)) { - expect_true(all(post$top_loci[[col]] == 0L), info = col) - } -}) - # ============================================================================ -# Unified top-loci annotated long + build_top_loci_export +# Unified top-loci: hard gating coverage per OpenSpec tasks 4.27 / 4.28. +# These tests are the implementation gate for the build_top_loci migration. # ============================================================================ -# Helper: build a minimal fake fit + cs_tables that build_top_loci_long accepts. -.make_fake_inputs_for_annotated_long <- function(variant_ids, - cs_membership, - cs_purity_value = 0.85, - n_samples = 100, - n_variants = NULL, - pip = NULL) { +# Reuse the existing local helper. Re-declared inside this block so the file +# remains correct whether the unified-section tests are run alone or as part of +# the full file. +if (!exists(".make_univariate_data", inherits = FALSE)) { + .make_univariate_data <- function(seed = 42, n = 300, p = 50, + effect_idx = integer(0), + effect_size = NULL) { + set.seed(seed) + X <- matrix(rnorm(n * p), n, p) + colnames(X) <- sprintf("chr1:%d:G:A", seq_len(p)) + beta <- rep(0, p) + if (length(effect_idx) > 0) { + if (is.null(effect_size)) effect_size <- rep(1.5, length(effect_idx)) + beta[effect_idx] <- effect_size + } + y <- as.numeric(X %*% beta) + rnorm(n, sd = 0.5) + list(X = X, y = y) + } +} + +.UNIFIED_TOP_LOCI_COLS <- c( + "#chr", "start", "end", "a1", "a2", + "variant", "gene", "event", + "n", "maf", "beta", "se", + "pip", "posterior_effect_mean", "posterior_effect_se", + "cs_95", "cs_70", "cs_50", "cs_95_purity", + "method", "grange_start", "grange_end" +) + +# Synthesize a SuSiE-like fit + cs_tables with explicit per-coverage CS +# membership. `cs_at_cov` is a named list keyed by coverage value (e.g. +# `"0.95"`); each element is a list of integer vectors, one per CS at that +# coverage. The CS numbering is 1-based per coverage. PIP values are filled +# from `pip` (variants outside the CS get small non-zero PIP so they can be +# retained or dropped via `signal_cutoff`). +.fake_fit_and_cs <- function(variant_ids, cs_at_cov, + cs_purity_value = 0.85, + pip = NULL, + n_samples = 100, n_variants = NULL, + gene = "ENSG00000179403") { p <- length(variant_ids) if (is.null(n_variants)) n_variants <- p if (is.null(pip)) pip <- seq(0.6, 0.9, length.out = p) - alpha <- matrix(0, nrow = 1, ncol = p, - dimnames = list(NULL, variant_ids)) - alpha[1, ] <- pip / sum(pip) - mu <- matrix(0.5, nrow = 1, ncol = p, - dimnames = list(NULL, variant_ids)) - mu2 <- mu^2 + 0.1 - fit <- list(pip = setNames(pip, variant_ids), alpha = alpha, - mu = mu, mu2 = mu2) - purity_df <- data.frame(min.abs.corr = cs_purity_value, - mean.abs.corr = cs_purity_value, - median.abs.corr = cs_purity_value) - cs_table <- list( - sets = list(cs = list(L1 = cs_membership), - cs_index = 1L, - requested_coverage = 0.95, - purity = purity_df), - cs_corr = list(matrix(c(1, cs_purity_value, cs_purity_value, 1), - nrow = 2)), - pip = fit$pip - ) - cs_tables <- list(cs_table) - attr(cs_tables, "coverage") <- 0.95 + # alpha must be L x p so colSums(alpha * mu) is well-defined. We use one + # row whose values are normalized PIPs. + alpha <- matrix(pip / sum(pip), nrow = 1, ncol = p) + mu <- matrix(0.5, nrow = 1, ncol = p) + mu2 <- mu^2 + 0.1 + fit <- list(pip = setNames(pip, variant_ids), + alpha = alpha, mu = mu, mu2 = mu2) + + cs_tables <- lapply(names(cs_at_cov), function(cov_str) { + cs_list <- cs_at_cov[[cov_str]] + if (is.null(cs_list)) cs_list <- list() + n_cs <- length(cs_list) + if (n_cs > 0L) names(cs_list) <- paste0("L", seq_len(n_cs)) + purity_df <- if (n_cs > 0L) { + data.frame(min.abs.corr = rep(cs_purity_value, n_cs), + mean.abs.corr = rep(cs_purity_value, n_cs), + median.abs.corr = rep(cs_purity_value, n_cs)) + } else { + data.frame(min.abs.corr = numeric(0), + mean.abs.corr = numeric(0), + median.abs.corr = numeric(0)) + } + list( + sets = list(cs = cs_list, + cs_index = seq_len(n_cs), + requested_coverage = as.numeric(cov_str), + purity = purity_df), + cs_corr = if (n_cs > 0L) { + lapply(seq_len(n_cs), function(i) { + matrix(c(1, cs_purity_value, cs_purity_value, 1), nrow = 2) + }) + } else NULL, + pip = fit$pip + ) + }) + attr(cs_tables, "coverage") <- as.numeric(names(cs_at_cov)) + X <- matrix(0, nrow = n_samples, ncol = n_variants) rownames(X) <- paste0("sample", seq_len(n_samples)) if (n_variants == length(variant_ids)) colnames(X) <- variant_ids Y <- matrix(0, nrow = n_samples, ncol = 1, - dimnames = list(paste0("sample", seq_len(n_samples)), - "ENSG00000179403")) - list(fit = fit, cs_tables = cs_tables, variant_names = variant_ids, - data_x = X, data_y = Y) + dimnames = list(paste0("sample", seq_len(n_samples)), gene)) + list(fit = fit, cs_tables = cs_tables, + variant_names = variant_ids, data_x = X, data_y = Y) } -test_that("build_top_loci_long emits annotated columns from fit + cs_tables", { - variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") - inp <- .make_fake_inputs_for_annotated_long(variant_ids, - cs_membership = c(1L, 2L)) - long <- pecotmr:::build_top_loci_long( +.run_build_top_loci <- function(inp, method = "susie", signal_cutoff = 0.05, + sumstats = NULL, maf = NULL, + other_quantities = NULL) { + build_top_loci( fit = inp$fit, cs_tables = inp$cs_tables, variant_names = inp$variant_names, - sumstats = list(betahat = c(0.2, -0.1), - sebetahat = c(0.05, 0.04), - z = c(4, -2.5)), - maf = c(0.10, 0.25), method = "susie", signal_cutoff = 0.05 + sumstats = sumstats, maf = maf, + method = method, signal_cutoff = signal_cutoff, + data_x = inp$data_x, data_y = inp$data_y, + other_quantities = other_quantities ) - expect_true(all(c("conditional_effect", "conditional_effect_se", - "cs_purity") %in% names(long))) - expect_equal(length(unique(long$cs_purity)), 1L) - expect_equal(round(unique(long$cs_purity), 3), 0.85) - expect_true(all(is.finite(long$conditional_effect))) - expect_true(all(long$conditional_effect_se >= 0)) +} + +test_that("build_top_loci returns the exact 22-column schema in order with stable dtypes", { + # `.empty_top_loci` is a package-internal helper. Use the namespace lookup + # so the test works both source-loaded and after R CMD INSTALL. + empty_fn <- if (exists(".empty_top_loci", envir = asNamespace("pecotmr"), + inherits = FALSE)) { + get(".empty_top_loci", envir = asNamespace("pecotmr")) + } else { + get(".empty_top_loci", envir = .GlobalEnv) + } + out <- empty_fn() + expect_equal(names(out), .UNIFIED_TOP_LOCI_COLS) + expect_equal(nrow(out), 0L) + expect_true(is.character(out$"#chr")) + expect_true(is.integer(out$start)) + expect_true(is.integer(out$end)) + expect_true(is.character(out$variant)) + expect_true(is.character(out$gene)) + expect_true(is.character(out$event)) + expect_true(is.integer(out$n)) + expect_true(is.numeric(out$maf)) + expect_true(is.numeric(out$pip)) + expect_true(is.numeric(out$posterior_effect_mean)) + expect_true(is.numeric(out$posterior_effect_se)) + expect_true(is.character(out$cs_95)) + expect_true(is.character(out$cs_70)) + expect_true(is.character(out$cs_50)) + expect_true(is.numeric(out$cs_95_purity)) + expect_true(is.character(out$method)) + expect_true(is.integer(out$grange_start)) + expect_true(is.integer(out$grange_end)) }) -test_that("build_top_loci_long writes per-fit constants from data_x/data_y/other_quantities", { +test_that("build_top_loci emits 22 columns in the fixed order on a non-empty fit", { variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") - inp <- .make_fake_inputs_for_annotated_long(variant_ids, - cs_membership = c(1L, 2L), - n_samples = 419, - n_variants = 11332) - other_q <- list( - dropped_samples = list(X = character(), y = character()), - region = "chr10:10823338-14348298", - condition_id = "Ast_DeJager_eQTL" - ) - long <- pecotmr:::build_top_loci_long( - fit = inp$fit, cs_tables = inp$cs_tables, - variant_names = inp$variant_names, - sumstats = list(betahat = c(0.2, -0.1), - sebetahat = c(0.05, 0.04), - z = c(4, -2.5)), - maf = c(0.10, 0.25), method = "susie", signal_cutoff = 0.05, - data_x = inp$data_x, data_y = inp$data_y, other_quantities = other_q - ) - expect_equal(unique(long$n), 419L) - expect_equal(unique(long$variant_number), 11332L) - expect_equal(unique(long$gene_id), "ENSG00000179403") - expect_equal(unique(long$region), "chr10:10823338-14348298") - expect_equal(unique(long$event_ID), "Ast_DeJager_eQTL_ENSG00000179403") + inp <- .fake_fit_and_cs(variant_ids, + cs_at_cov = list("0.95" = list(c(1L, 2L)), + "0.7" = list(c(1L, 2L)), + "0.5" = list(c(1L, 2L))), + n_samples = 419, n_variants = 11332) + other_q <- list(region = "chr10:10823338-14348298", + condition_id = "Ast_DeJager_eQTL") + out <- .run_build_top_loci(inp, method = "susie", + sumstats = list(betahat = c(0.2, -0.1), + sebetahat = c(0.05, 0.04)), + maf = c(0.10, 0.25), + other_quantities = other_q) + expect_equal(names(out), .UNIFIED_TOP_LOCI_COLS) + expect_equal(unique(out$gene), "ENSG00000179403") + expect_equal(unique(out$event), "Ast_DeJager_eQTL_ENSG00000179403") + expect_equal(unique(out$grange_start), 10823338L) + expect_equal(unique(out$grange_end), 14348298L) + expect_equal(unique(out$n), 419L) + expect_equal(unique(out$method), "susie") }) -test_that("build_top_loci_long omits region/event_ID columns when caller does not supply them", { - variant_ids <- c("chr1:100:A:G") - inp <- .make_fake_inputs_for_annotated_long(variant_ids, - cs_membership = 1L) - long <- pecotmr:::build_top_loci_long( - fit = inp$fit, cs_tables = inp$cs_tables, - variant_names = inp$variant_names, - sumstats = list(betahat = 0.2, sebetahat = 0.05, z = 4), - maf = 0.10, method = "susie", signal_cutoff = 0.05, - data_x = inp$data_x, data_y = inp$data_y, other_quantities = NULL - ) - expect_false("region" %in% names(long)) - expect_false("event_ID" %in% names(long)) - expect_true("n" %in% names(long)) - expect_true("variant_number" %in% names(long)) - expect_true("gene_id" %in% names(long)) +test_that("cs_95 / cs_70 / cs_50 are character strings of the form '_'", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T", "chr1:300:T:C") + # Variants 1 and 2 in CS 1 (at 95), variant 3 in CS 2 (at 95). All three + # also appear at 70/50 with the same memberships. + cs_at_cov <- list("0.95" = list(c(1L, 2L), 3L), + "0.7" = list(c(1L, 2L), 3L), + "0.5" = list(c(1L, 2L), 3L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = c(0.9, 0.9, 0.9)) + out <- .run_build_top_loci(inp, method = "susie") + expect_true(all(grepl("^susie_\\d+$", out$cs_95))) + expect_true(all(grepl("^susie_\\d+$", out$cs_70))) + expect_true(all(grepl("^susie_\\d+$", out$cs_50))) + expect_true(any(out$cs_95 == "susie_1")) + expect_true(any(out$cs_95 == "susie_2")) }) -test_that("build_top_loci_long stays backward-compatible when data_x/data_y/other_quantities are omitted", { +test_that("PIP-only retained variants carry '_0' at every coverage and cs_95_purity = 0", { variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") - inp <- .make_fake_inputs_for_annotated_long(variant_ids, - cs_membership = c(1L, 2L)) - long <- pecotmr:::build_top_loci_long( + # No CS at any coverage; variant 2 has high PIP so it is retained via + # signal_cutoff and produces a "_0" row. + cs_at_cov <- list("0.95" = list(), + "0.7" = list(), + "0.5" = list()) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, + pip = c(0.02, 0.95)) + out <- .run_build_top_loci(inp, method = "susie", signal_cutoff = 0.5) + expect_gte(nrow(out), 1L) + # Every row must have _0 at every coverage and cs_95_purity = 0. + expect_true(all(out$cs_95 == "susie_0")) + expect_true(all(out$cs_70 == "susie_0")) + expect_true(all(out$cs_50 == "susie_0")) + expect_true(all(out$cs_95_purity == 0)) +}) + +test_that("per-method CS indices are independent across susie and susie_inf (postprocess_finemapping_fits)", { + d <- .make_univariate_data(seed = 21, effect_idx = c(15, 35)) + fits <- fit_susie_inf_then_susie(d$X, d$y) + post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, + coverage = 0.95, + secondary_coverage = c(0.7, 0.5)) + tl <- post$top_loci + expect_setequal(unique(tl$method), c("susie", "susie_inf")) + # Each method must only ever emit "_" strings, never strings + # from the other method. This is the core safeguard against silent + # method-mixing. + susie_rows <- tl[tl$method == "susie", , drop = FALSE] + susie_inf_rows <- tl[tl$method == "susie_inf", , drop = FALSE] + for (col in c("cs_95", "cs_70", "cs_50")) { + expect_true(all(grepl("^susie_\\d+$", susie_rows[[col]])), + info = paste("susie rows have wrong prefix in", col)) + expect_true(all(grepl("^susie_inf_\\d+$", susie_inf_rows[[col]])), + info = paste("susie_inf rows have wrong prefix in", col)) + } + # CS indices are not sequenced across methods: if both methods have any + # CS, each may independently include "_1". + has_susie_1 <- any(susie_rows$cs_95 == "susie_1") + has_susie_inf_1 <- any(susie_inf_rows$cs_95 == "susie_inf_1") + expect_true(has_susie_1 || nrow(susie_rows) == 0L) + expect_true(has_susie_inf_1 || nrow(susie_inf_rows) == 0L) +}) + +test_that("cs_95_purity = 0 when cs_95 is '_0', and in (0, 1] otherwise", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T", "chr1:300:T:C") + # Variant 1 in CS 1 at 95-cov; variant 2 PIP-only retained; variant 3 + # PIP-only retained. + cs_at_cov <- list("0.95" = list(1L), + "0.7" = list(1L), + "0.5" = list(1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, + cs_purity_value = 0.85, + pip = c(0.9, 0.6, 0.55)) + out <- .run_build_top_loci(inp, method = "susie", signal_cutoff = 0.5) + expect_true(any(out$cs_95 == "susie_1")) + expect_true(any(out$cs_95 == "susie_0")) + in_cs <- out[out$cs_95 != "susie_0", , drop = FALSE] + not_cs <- out[out$cs_95 == "susie_0", , drop = FALSE] + expect_true(all(not_cs$cs_95_purity == 0)) + expect_true(all(in_cs$cs_95_purity > 0 & in_cs$cs_95_purity <= 1)) +}) + +test_that("overlapping CS within one method produces one row per CS membership", { + variant_ids <- c("chr1:100:A:G") + # One variant belongs to CS 1 AND CS 2 at 95-cov (overlap). + cs_at_cov <- list("0.95" = list(1L, 1L), + "0.7" = list(1L, 1L), + "0.5" = list(1L, 1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = 0.9) + out <- .run_build_top_loci(inp, method = "susie") + # Two rows: one for CS 1 membership, one for CS 2 membership; same + # (variant, gene, method). + expect_equal(nrow(out), 2L) + expect_equal(unique(out$variant), "chr1:100:A:G") + expect_equal(unique(out$method), "susie") + expect_setequal(out$cs_95, c("susie_1", "susie_2")) +}) + +test_that("overlapping CS across methods produces one row per method", { + d <- .make_univariate_data(seed = 22, effect_idx = c(12, 32)) + fits <- fit_susie_inf_then_susie(d$X, d$y) + post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, + coverage = 0.95, + secondary_coverage = c(0.7, 0.5)) + tl <- post$top_loci + if (nrow(tl) > 0L) { + cnt_per_method <- table(tl$variant, tl$method) + shared <- rownames(cnt_per_method)[apply(cnt_per_method > 0, 1, sum) >= 2L] + if (length(shared) > 0L) { + v <- shared[[1]] + rows_for_v <- tl[tl$variant == v, , drop = FALSE] + expect_gte(length(unique(rows_for_v$method)), 2L) + } else { + succeed("no shared variants in this fixture; cross-method uniqueness rule is structural") + } + } else { + succeed("empty top_loci from this fixture; cross-method uniqueness rule is structural") + } +}) + +test_that("format_finemapping_output exposes exactly one top_loci field; no top_loci_long, no wide top_loci, no top_loci_export", { + d <- .make_univariate_data(seed = 23, effect_idx = c(15, 40)) + fits <- fit_susie_inf_then_susie(d$X, d$y) + post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, coverage = 0.95) + out <- format_finemapping_output(post, primary_method = "susie") + expect_true("top_loci" %in% names(out)) + expect_false("top_loci_long" %in% names(out)) + expect_false("top_loci_export" %in% names(out)) + # The exposed top_loci has the unified 22-column schema. + expect_equal(names(out$top_loci), .UNIFIED_TOP_LOCI_COLS) +}) + +test_that("postprocess_finemapping_fits does not return top_loci_long anywhere", { + d <- .make_univariate_data(seed = 24, effect_idx = c(25)) + fit <- susieR::susie(d$X, d$y, L = 5) + post <- postprocess_finemapping_fits(list(susie = fit), + data_x = d$X, data_y = d$y, coverage = 0.95) + expect_true("top_loci" %in% names(post)) + expect_false("top_loci_long" %in% names(post)) + expect_equal(names(post$top_loci), .UNIFIED_TOP_LOCI_COLS) + # No per-method finemapping_results entry should carry a top_loci_long either. + for (name in names(post$finemapping_results)) { + expect_false("top_loci_long" %in% names(post$finemapping_results[[name]]), + info = name) + } +}) + +test_that("build_top_loci_long / build_top_loci_wide / build_top_loci_export are removed from the package", { + # Two layered checks. (1) The new helper must be reachable. (2) The old + # helpers must NOT be reachable — neither in the package namespace (if the + # package is installed) nor in .GlobalEnv (when this file is source-loaded + # against the fresh R/ tree for testing). The contract is that the trio is + # gone end-to-end after the migration. + resolve <- function(name) { + if (exists(name, envir = .GlobalEnv, inherits = FALSE)) { + return(get(name, envir = .GlobalEnv)) + } + ns_ok <- tryCatch(asNamespace("pecotmr"), error = function(e) NULL) + if (!is.null(ns_ok) && exists(name, envir = ns_ok, inherits = FALSE)) { + return(get(name, envir = ns_ok)) + } + NULL + } + expect_false(is.null(resolve("build_top_loci")), + info = "build_top_loci must be defined after the migration") + # The removed trio: only fail if a definition exists in the SAME source + # tree we just loaded (i.e. globalenv). The installed-package namespace + # may still carry stale copies from an earlier install; the gate only + # cares that the new source tree does not redefine them. + expect_false(exists("build_top_loci_long", envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists("build_top_loci_wide", envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists("build_top_loci_export", envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists(".empty_top_loci_long", envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists(".empty_top_loci_export",envir = .GlobalEnv, inherits = FALSE)) +}) + +test_that("build_top_loci raises an explicit error on invalid variant_id rather than silently filling NA", { + variant_ids <- c("not_a_valid_id") + cs_at_cov <- list("0.95" = list(1L), + "0.7" = list(1L), + "0.5" = list(1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = 0.9) + expect_error(.run_build_top_loci(inp, method = "susie"), + "parse_variant_id") +}) + +test_that("build_top_loci requires `method`", { + variant_ids <- c("chr1:100:A:G") + cs_at_cov <- list("0.95" = list(1L), + "0.7" = list(1L), + "0.5" = list(1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = 0.9) + expect_error(build_top_loci( fit = inp$fit, cs_tables = inp$cs_tables, - variant_names = inp$variant_names, - sumstats = list(betahat = c(0.2, -0.1), - sebetahat = c(0.05, 0.04), - z = c(4, -2.5)), - maf = c(0.10, 0.25), method = "susie", signal_cutoff = 0.05 - ) - # n / variant_number / gene_id columns exist but are NA because no - # data_x / data_y were supplied; region / event_ID are omitted entirely. - expect_true(all(is.na(long$n))) - expect_true(all(is.na(long$variant_number))) - expect_true(all(is.na(long$gene_id))) - expect_false("region" %in% names(long)) - expect_false("event_ID" %in% names(long)) -}) - -test_that("build_top_loci_export errors when required columns are missing", { - bad <- data.frame(variant_id = "chr1:100:A:G", pip = 0.9, - coverage = 0.95, cs = 1L, - stringsAsFactors = FALSE) - expect_error(build_top_loci_export(bad), - "missing required columns") -}) - -test_that("build_top_loci_export returns the fixed 21-column schema on an empty long", { - empty <- pecotmr:::.empty_top_loci_long() - out <- build_top_loci_export(empty) - expected <- c("#chr", "start", "end", "a1", "a2", "variant_ID", "gene_ID", - "event_ID", "cs_coverage_0.95", "cs_coverage_0.7", - "cs_coverage_0.5", "cs_purity", "PIP", "conditional_effect", - "conditional_effect_se", "analysis_region", - "analysis_variants_number", "beta", "se", "n", "maf") - expect_equal(names(out), expected) - expect_equal(nrow(out), 0L) + variant_names = inp$variant_names + ), "method") }) -test_that("build_top_loci_export projects an annotated long into the fixed compact schema", { +test_that("non-top-loci wrapper fields (susie_result_trimmed, variant_names) are preserved by format_finemapping_output", { + d <- .make_univariate_data(seed = 25, effect_idx = c(20)) + fit <- susieR::susie(d$X, d$y, L = 5) + post <- postprocess_finemapping_fits(list(susie = fit), data_x = d$X, data_y = d$y, coverage = 0.95) + out <- format_finemapping_output(post, primary_method = "susie") + expect_true("susie_result_trimmed" %in% names(out)) + expect_true("variant_names" %in% names(out)) +}) + +test_that("missing other_quantities$region produces NA grange columns rather than silent omission", { + variant_ids <- c("chr1:100:A:G") + cs_at_cov <- list("0.95" = list(1L), + "0.7" = list(1L), + "0.5" = list(1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = 0.9) + out <- .run_build_top_loci(inp, method = "susie", + other_quantities = list(condition_id = "ctx")) + # grange_* must still be present columns of the 22-col schema, with NA values. + expect_true(all(c("grange_start", "grange_end") %in% names(out))) + expect_true(all(is.na(out$grange_start))) + expect_true(all(is.na(out$grange_end))) + # And event composition still works from gene + condition_id. + expect_equal(unique(out$event), "ctx_ENSG00000179403") +}) + +test_that("posterior_effect_mean equals colSums(alpha*mu); posterior_effect_se equals sqrt(pmax(colSums(alpha*mu2) - mean^2, 0))", { variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") - inp <- .make_fake_inputs_for_annotated_long(variant_ids, - cs_membership = c(1L, 2L), - n_samples = 419, - n_variants = 11332) - other_q <- list(region = "chr10:10823338-14348298", - condition_id = "Ast_DeJager_eQTL") - long <- pecotmr:::build_top_loci_long( - fit = inp$fit, cs_tables = inp$cs_tables, - variant_names = inp$variant_names, - sumstats = list(betahat = c(0.2, -0.1), - sebetahat = c(0.05, 0.04), - z = c(4, -2.5)), - maf = c(0.10, 0.25), method = "susie", signal_cutoff = 0.05, - data_x = inp$data_x, data_y = inp$data_y, other_quantities = other_q - ) - out <- build_top_loci_export(long) - expected_cols <- c("#chr", "start", "end", "a1", "a2", "variant_ID", - "gene_ID", "event_ID", "cs_coverage_0.95", - "cs_coverage_0.7", "cs_coverage_0.5", "cs_purity", - "PIP", "conditional_effect", "conditional_effect_se", - "analysis_region", "analysis_variants_number", - "beta", "se", "n", "maf") - expect_equal(names(out), expected_cols) - expect_equal(out$gene_ID[[1]], "ENSG00000179403") - expect_equal(out$event_ID[[1]], "Ast_DeJager_eQTL_ENSG00000179403") - expect_equal(out$analysis_region[[1]], "chr10:10823338-14348298") - expect_equal(out$analysis_variants_number[[1]], 11332L) - expect_equal(out$n[[1]], 419L) + cs_at_cov <- list("0.95" = list(c(1L, 2L)), + "0.7" = list(c(1L, 2L)), + "0.5" = list(c(1L, 2L))) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = c(0.8, 0.6)) + out <- .run_build_top_loci(inp, method = "susie") + expected_mean <- colSums(inp$fit$alpha * inp$fit$mu) + expected_se <- sqrt(pmax(colSums(inp$fit$alpha * inp$fit$mu2) - expected_mean^2, 0)) + # Match per variant index by looking up via variant string. + for (i in seq_along(variant_ids)) { + row <- out[out$variant == variant_ids[i], , drop = FALSE] + expect_true(nrow(row) >= 1L) + expect_equal(unique(row$posterior_effect_mean), expected_mean[i], + tolerance = 1e-10) + expect_equal(unique(row$posterior_effect_se), expected_se[i], + tolerance = 1e-10) + } }) From 0830a30743eef7a82c2580fc8feeb25882f03a5f Mon Sep 17 00:00:00 2001 From: xueweic Date: Sat, 30 May 2026 15:03:44 +0000 Subject: [PATCH 06/14] Update documentation --- NAMESPACE | 2 +- man/build_top_loci.Rd | 93 +++++++++++++++++++++++++++++ man/build_top_loci_export.Rd | 36 ----------- man/format_finemapping_output.Rd | 8 ++- man/postprocess_finemapping_fits.Rd | 12 ++-- 5 files changed, 107 insertions(+), 44 deletions(-) create mode 100644 man/build_top_loci.Rd delete mode 100644 man/build_top_loci_export.Rd diff --git a/NAMESPACE b/NAMESPACE index 417b054e..9f408151 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,7 +25,7 @@ export(bayes_c_weights) export(bayes_l_weights) export(bayes_n_weights) export(bayes_r_weights) -export(build_top_loci_export) +export(build_top_loci) export(check_ld) export(clean_context_names) export(coloc_post_processor) diff --git a/man/build_top_loci.Rd b/man/build_top_loci.Rd new file mode 100644 index 00000000..ef2024ba --- /dev/null +++ b/man/build_top_loci.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/susie_wrapper.R +\name{build_top_loci} +\alias{build_top_loci} +\title{Build the unified single top-loci table for one fit and one method.} +\usage{ +build_top_loci( + fit, + cs_tables, + variant_names, + sumstats = NULL, + maf = NULL, + method, + signal_cutoff = 0.1, + data_x = NULL, + data_y = NULL, + other_quantities = NULL +) +} +\arguments{ +\item{fit}{A fitted SuSiE-family object (must expose \code{alpha}, +\code{mu}, \code{mu2}, \code{pip}).} + +\item{cs_tables}{A list of CS tables (one per coverage), as produced by +\code{compute_cs_tables()}.} + +\item{variant_names}{Character vector of variant IDs in +\code{chr:pos:A2:A1} form, length equal to the number of variants in the +fit. Used to construct \code{variant}, \code{#chr}, \code{start}, +\code{end}, \code{a1}, \code{a2}.} + +\item{sumstats}{Optional marginal-association summary statistics +(\code{betahat}, \code{sebetahat}) used to fill \code{beta} and \code{se}.} + +\item{maf}{Optional numeric vector of minor-allele frequencies.} + +\item{method}{Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Used +to construct the per-method \code{"_"} strings and the +\code{method} column. Required.} + +\item{signal_cutoff}{PIP cutoff for retaining PIP-only (non-CS) variants.} + +\item{data_x}{Optional regional genotype matrix; used only for sample-count +shape checks.} + +\item{data_y}{Optional regional phenotype matrix; \code{nrow(data_y)} fills +\code{n}, \code{colnames(data_y)[1]} fills \code{gene}.} + +\item{other_quantities}{Optional list with reserved subfields +\code{region} (e.g. \code{"chr1:100-200"}) and \code{condition_id} +(e.g. \code{"Ast_DeJager_eQTL"}); used to fill \code{grange_start}, +\code{grange_end}, and to compose \code{event} as +\code{paste(condition_id, gene, sep = "_")}. Missing subfields are +filled with \code{NA} rather than dropped.} +} +\value{ +A data frame in the fixed 22-column unified \code{top_loci} shape + for this one fit and one method. Returns an empty data frame with the + correct columns and dtypes if there is nothing to retain. +} +\description{ +Produces the per-fit, per-method contribution to the unified \code{top_loci} +table in the fixed 22-column shape. The outer +\code{postprocess_finemapping_fits()} loop calls this once per method per fit +and row-binds the results into the single final \code{top_loci} table that is +exposed by \code{format_finemapping_output()}. +} +\details{ +This function replaces the previous \code{build_top_loci_long()}, +\code{build_top_loci_wide()}, and \code{build_top_loci_export()} trio. +There is no separately exposed long or wide table. + +The output column order is exactly (22 columns): +\code{#chr}, \code{start}, \code{end}, \code{a1}, \code{a2}, +\code{variant}, \code{gene}, \code{event}, +\code{n}, \code{maf}, \code{beta}, \code{se}, +\code{pip}, \code{posterior_effect_mean}, \code{posterior_effect_se}, +\code{cs_95}, \code{cs_70}, \code{cs_50}, \code{cs_95_purity}, +\code{method}, \code{grange_start}, \code{grange_end}. + +The \code{cs_95}, \code{cs_70}, \code{cs_50} columns are character strings of +the form \code{"_"} where each method numbers its credible +sets independently starting at 1. Variants retained by the PIP cutoff but not +assigned to any credible set at the given coverage use \code{"_0"}. +\code{cs_95_purity} is the 0.95-coverage credible-set purity for the row's +\code{(method, cs_95)}; rows whose \code{cs_95} is \code{"_0"} carry +\code{cs_95_purity = 0}. + +Row uniqueness inside this function's output is one row per +\code{(variant, gene, cs_membership)} at the given \code{method}. Overlapping +credible-set membership for the same method produces one row per CS, so the +overlapping-CS contract is preserved. +} diff --git a/man/build_top_loci_export.Rd b/man/build_top_loci_export.Rd deleted file mode 100644 index 171e52c9..00000000 --- a/man/build_top_loci_export.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/susie_wrapper.R -\name{build_top_loci_export} -\alias{build_top_loci_export} -\title{Build the unified compact top-loci export table.} -\usage{ -build_top_loci_export(long) -} -\arguments{ -\item{long}{An annotated \code{top_loci_long} data frame. Must contain -the columns \code{variant_id}, \code{pip}, \code{coverage}, \code{cs}, -\code{conditional_effect}, \code{conditional_effect_se}, -\code{cs_purity}, \code{gene_id}, \code{n}, \code{variant_number}, -\code{region}, \code{event_ID}, \code{betahat}, \code{sebetahat}, -\code{maf}. Missing any required column raises an explicit error -rather than silently filling \code{NA}.} -} -\value{ -A data frame with the fixed compact-export schema. -} -\description{ -Projects an annotated \code{top_loci_long} (as produced by -\code{postprocess_finemapping_fits()} when called with \code{data_x}, -\code{data_y}, and an \code{other_quantities} list containing -\code{region} and \code{condition_id}) into the fixed-order compact -export schema used by the unified fine-mapping output. -} -\details{ -The output column order is exactly: \code{#chr}, \code{start}, \code{end}, -\code{a1}, \code{a2}, \code{variant_ID}, \code{gene_ID}, \code{event_ID}, -\code{cs_coverage_0.95}, \code{cs_coverage_0.7}, \code{cs_coverage_0.5}, -\code{cs_purity}, \code{PIP}, \code{conditional_effect}, -\code{conditional_effect_se}, \code{analysis_region}, -\code{analysis_variants_number}, \code{beta}, \code{se}, \code{n}, -\code{maf}. -} diff --git a/man/format_finemapping_output.Rd b/man/format_finemapping_output.Rd index 4519da0a..7c48d25c 100644 --- a/man/format_finemapping_output.Rd +++ b/man/format_finemapping_output.Rd @@ -13,9 +13,13 @@ format_finemapping_output(post, primary_method) } \value{ A list with root-level fields including \code{variant_names}, - \code{susie_result_trimmed}, \code{top_loci_long}, and \code{top_loci}. + \code{susie_result_trimmed}, and the single unified \code{top_loci} + 22-column table. } \description{ Converts method-aware fine-mapping post-processing output into the root-level -fields consumed by protocol RDS files. +fields consumed by protocol RDS files. The single top-loci output is the +\code{top_loci} field (the 22-column unified table); there is no +\code{top_loci_long}, no wide-format \code{top_loci}, and no +\code{top_loci_export}. } diff --git a/man/postprocess_finemapping_fits.Rd b/man/postprocess_finemapping_fits.Rd index 36b88ae9..bbd7cec1 100644 --- a/man/postprocess_finemapping_fits.Rd +++ b/man/postprocess_finemapping_fits.Rd @@ -48,11 +48,13 @@ input used for credible-set purity and correlations.} \item{min_abs_corr}{Minimum absolute correlation for credible-set purity.} } \value{ -A list with \code{finemapping_results}, \code{top_loci_long}, and - \code{top_loci}. The long table is lossless, with one row per - variant-method-coverage-CS membership. The wide table stores one row per - variant, method-specific \code{pip_} columns, method-specific - \code{CS__} columns, and \code{model_source}. +A list with \code{finemapping_results} (per-method post-processed + objects, each carrying a trimmed fit and method-specific intermediates) + and the single unified \code{top_loci} table in the fixed 22-column + shape (see \code{\link{build_top_loci}}). Per-method contributions are + produced by \code{build_top_loci()} once per method and row-bound into + this single \code{top_loci}. There is no separately exposed + \code{top_loci_long} or wide-format \code{top_loci}. } \description{ Applies method-aware post-processing to one or more SuSiE-family fits and From 33fc7381b46a2e40e6acc4caed2b14fea14963b0 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 30 May 2026 10:18:27 -0500 Subject: [PATCH 07/14] add susie_wrapper vignettes --- R/susie_wrapper.R | 297 +++++++++++++++--------------------- vignettes/susie_wrapper.Rmd | 243 +++++++++++++++++++++++++++++ 2 files changed, 363 insertions(+), 177 deletions(-) create mode 100644 vignettes/susie_wrapper.Rmd diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index 8a731482..03290976 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -170,11 +170,9 @@ fit_susie_inf_then_susie <- function(X, y, args = list(), #' @param min_abs_corr Minimum absolute correlation for credible-set purity. #' @return A list with \code{finemapping_results} (per-method post-processed #' objects, each carrying a trimmed fit and method-specific intermediates) -#' and the single unified \code{top_loci} table in the fixed 22-column -#' shape (see \code{\link{build_top_loci}}). Per-method contributions are -#' produced by \code{build_top_loci()} once per method and row-bound into -#' this single \code{top_loci}. There is no separately exposed -#' \code{top_loci_long} or wide-format \code{top_loci}. +#' and a single unified \code{top_loci} table in the fixed 22-column shape +#' (see \code{\link{build_top_loci}}). Per-method contributions are +#' row-bound into \code{top_loci} by an outer method for-loop. #' @export postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, X_scalar = 1, y_scalar = 1, @@ -421,67 +419,49 @@ compute_cs_table <- function(fit, data_x, coverage, cs_input = c("X", "Xcorr", " out } -#' Build the unified single top-loci table for one fit and one method. +#' Build the unified top-loci table for one fit and one method. #' -#' Produces the per-fit, per-method contribution to the unified \code{top_loci} -#' table in the fixed 22-column shape. The outer -#' \code{postprocess_finemapping_fits()} loop calls this once per method per fit -#' and row-binds the results into the single final \code{top_loci} table that is -#' exposed by \code{format_finemapping_output()}. +#' Returns the per-fit, per-method contribution to the unified \code{top_loci} +#' table in the fixed 22-column shape. \code{postprocess_finemapping_fits()} +#' calls this once per method per fit and row-binds the results into the +#' single \code{top_loci} returned by \code{format_finemapping_output()}. #' -#' This function replaces the previous \code{build_top_loci_long()}, -#' \code{build_top_loci_wide()}, and \code{build_top_loci_export()} trio. -#' There is no separately exposed long or wide table. +#' Output columns, in order: \code{#chr}, \code{start}, \code{end}, \code{a1}, +#' \code{a2}, \code{variant}, \code{gene}, \code{event}, \code{n}, \code{maf}, +#' \code{beta}, \code{se}, \code{pip}, \code{posterior_effect_mean}, +#' \code{posterior_effect_se}, \code{cs_95}, \code{cs_70}, \code{cs_50}, +#' \code{cs_95_purity}, \code{method}, \code{grange_start}, \code{grange_end}. #' -#' The output column order is exactly (22 columns): -#' \code{#chr}, \code{start}, \code{end}, \code{a1}, \code{a2}, -#' \code{variant}, \code{gene}, \code{event}, -#' \code{n}, \code{maf}, \code{beta}, \code{se}, -#' \code{pip}, \code{posterior_effect_mean}, \code{posterior_effect_se}, -#' \code{cs_95}, \code{cs_70}, \code{cs_50}, \code{cs_95_purity}, -#' \code{method}, \code{grange_start}, \code{grange_end}. +#' \code{cs_95} / \code{cs_70} / \code{cs_50} are character strings of the +#' form \code{"_"} where each method numbers credible sets +#' independently from 1. Variants retained by the PIP cutoff but not in any +#' credible set at a coverage carry \code{"_0"}. \code{cs_95_purity} +#' is the 0.95-coverage purity for the row's \code{(method, cs_95)}; rows +#' whose \code{cs_95} is \code{"_0"} carry \code{0}. #' -#' The \code{cs_95}, \code{cs_70}, \code{cs_50} columns are character strings of -#' the form \code{"_"} where each method numbers its credible -#' sets independently starting at 1. Variants retained by the PIP cutoff but not -#' assigned to any credible set at the given coverage use \code{"_0"}. -#' \code{cs_95_purity} is the 0.95-coverage credible-set purity for the row's -#' \code{(method, cs_95)}; rows whose \code{cs_95} is \code{"_0"} carry -#' \code{cs_95_purity = 0}. +#' Row uniqueness is \code{(variant, gene, cs_membership)} at the given +#' \code{method}; overlapping CS within the same method produces one row per +#' CS. #' -#' Row uniqueness inside this function's output is one row per -#' \code{(variant, gene, cs_membership)} at the given \code{method}. Overlapping -#' credible-set membership for the same method produces one row per CS, so the -#' overlapping-CS contract is preserved. -#' -#' @param fit A fitted SuSiE-family object (must expose \code{alpha}, +#' @param fit Fitted SuSiE-family object (must expose \code{alpha}, #' \code{mu}, \code{mu2}, \code{pip}). -#' @param cs_tables A list of CS tables (one per coverage), as produced by +#' @param cs_tables List of CS tables (one per coverage) from #' \code{compute_cs_tables()}. -#' @param variant_names Character vector of variant IDs in -#' \code{chr:pos:A2:A1} form, length equal to the number of variants in the -#' fit. Used to construct \code{variant}, \code{#chr}, \code{start}, -#' \code{end}, \code{a1}, \code{a2}. -#' @param sumstats Optional marginal-association summary statistics -#' (\code{betahat}, \code{sebetahat}) used to fill \code{beta} and \code{se}. +#' @param variant_names Character vector of variant IDs +#' (\code{chr:pos:A2:A1}). +#' @param sumstats Optional marginal-association summary (\code{betahat}, +#' \code{sebetahat}) filling \code{beta} / \code{se}. #' @param maf Optional numeric vector of minor-allele frequencies. -#' @param method Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Used -#' to construct the per-method \code{"_"} strings and the -#' \code{method} column. Required. +#' @param method Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Required. #' @param signal_cutoff PIP cutoff for retaining PIP-only (non-CS) variants. -#' @param data_x Optional regional genotype matrix; used only for sample-count -#' shape checks. +#' @param data_x Optional regional genotype matrix. #' @param data_y Optional regional phenotype matrix; \code{nrow(data_y)} fills #' \code{n}, \code{colnames(data_y)[1]} fills \code{gene}. -#' @param other_quantities Optional list with reserved subfields -#' \code{region} (e.g. \code{"chr1:100-200"}) and \code{condition_id} -#' (e.g. \code{"Ast_DeJager_eQTL"}); used to fill \code{grange_start}, -#' \code{grange_end}, and to compose \code{event} as -#' \code{paste(condition_id, gene, sep = "_")}. Missing subfields are -#' filled with \code{NA} rather than dropped. -#' @return A data frame in the fixed 22-column unified \code{top_loci} shape -#' for this one fit and one method. Returns an empty data frame with the -#' correct columns and dtypes if there is nothing to retain. +#' @param other_quantities Optional list with reserved subfields \code{region} +#' (filling \code{grange_start} / \code{grange_end}) and \code{condition_id} +#' (composing \code{event} as \code{paste(condition_id, gene, sep = "_")}). +#' @return A data frame in the fixed 22-column shape for this fit and method, +#' or an empty data frame if nothing is retained. #' @export build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, maf = NULL, method, signal_cutoff = 0.1, @@ -501,123 +481,71 @@ build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, fit_gene <- if (!is.null(data_y_mat) && !is.null(colnames(data_y_mat))) { colnames(data_y_mat)[1] } else NA_character_ - fit_condition_id <- other_quantities$condition_id - fit_event <- if (!is.null(fit_condition_id) && + fit_event <- if (!is.null(other_quantities$condition_id) && !is.na(fit_gene) && nzchar(fit_gene)) { - paste(fit_condition_id, fit_gene, sep = "_") + paste(other_quantities$condition_id, fit_gene, sep = "_") } else NA_character_ - grange_se <- .parse_grange(other_quantities$region) - fit_grange_start <- grange_se[["start"]] - fit_grange_end <- grange_se[["end"]] + grange <- .parse_grange(other_quantities$region) - # Per-variant posterior effect and SE, computed once for all variants. + # Per-variant posterior effect / SE, computed once across all variants. alpha <- as.matrix(fit$alpha) mu <- if (!is.null(fit$mu)) as.matrix(fit$mu) else NULL mu2 <- if (!is.null(fit$mu2)) as.matrix(fit$mu2) else NULL - posterior_effect <- if (!is.null(mu) && all(dim(alpha) == dim(mu))) { + post_mean <- if (!is.null(mu) && all(dim(alpha) == dim(mu))) { colSums(alpha * mu) } else rep(NA_real_, length(variant_names)) - posterior_effect_se <- if (!is.null(mu2) && all(dim(alpha) == dim(mu2))) { - sqrt(pmax(colSums(alpha * mu2) - posterior_effect^2, 0)) + post_se <- if (!is.null(mu2) && all(dim(alpha) == dim(mu2))) { + sqrt(pmax(colSums(alpha * mu2) - post_mean^2, 0)) } else rep(NA_real_, length(variant_names)) - # Per-coverage credible-set purity vectors, indexed by 1-based CS index. - purity_per_cov <- lapply(cs_tables, function(ct) { - sets_purity <- ct$sets$purity - if (!is.null(sets_purity) && - "min.abs.corr" %in% names(sets_purity)) { - as.numeric(sets_purity$min.abs.corr) - } else if (!is.null(ct$cs_corr)) { - vapply(seq_along(ct$cs_corr), function(j) { - m <- ct$cs_corr[[j]] - if (is.null(m)) return(NA_real_) - if (!is.matrix(m) || nrow(m) <= 1) return(1) - min(abs(m[upper.tri(m)])) - }, numeric(1)) - } else { - rep(NA_real_, length(ct$sets$cs)) - } - }) - - # Internal long-shaped collection: one row per - # (variant_idx, cs_idx_at_this_coverage, coverage). Not exposed. Used only - # to project to the 22-column shape below. - long_rows <- list() - for (i in seq_along(cs_tables)) { + # Collect CS-membership records (variant_idx, cs_idx, coverage) across all + # requested coverages. This is the only intermediate; the 22-column shape + # is projected from it below. + cs_records <- do.call(rbind, lapply(seq_along(cs_tables), function(i) { ct <- cs_tables[[i]] - top_variants_idx <- get_top_variants_idx(ct, signal_cutoff) - cs_info <- get_cs_info(ct$sets$cs, top_variants_idx) - if (is.null(cs_info) || nrow(cs_info) == 0) next - long_rows[[length(long_rows) + 1L]] <- data.frame( - variant_idx = as.integer(cs_info$variant_idx), - cs_idx = as.integer(cs_info$cs_idx), - coverage = as.numeric(coverage_values[[i]]), - stringsAsFactors = FALSE - ) - } - if (length(long_rows) == 0) return(.empty_top_loci()) - long_df <- do.call(rbind, long_rows) - if (nrow(long_df) == 0) return(.empty_top_loci()) - - # Key grid: one row per (variant_idx, cs_idx) at the export grain. Preserves - # overlapping CS membership for the same method. - key_grid <- unique(long_df[, c("variant_idx", "cs_idx"), drop = FALSE]) + info <- get_cs_info(ct$sets$cs, get_top_variants_idx(ct, signal_cutoff)) + if (is.null(info) || nrow(info) == 0) return(NULL) + data.frame(variant_idx = as.integer(info$variant_idx), + cs_idx = as.integer(info$cs_idx), + coverage = as.numeric(coverage_values[[i]]), + stringsAsFactors = FALSE) + })) + if (is.null(cs_records) || nrow(cs_records) == 0) return(.empty_top_loci()) + + # Key grid: one row per (variant_idx, cs_idx). Overlapping CS membership + # within this method is preserved as separate keys. + key_grid <- unique(cs_records[, c("variant_idx", "cs_idx"), drop = FALSE]) rownames(key_grid) <- NULL - n_keys <- nrow(key_grid) - - # Helper: did this (variant_idx, cs_idx) appear at this coverage? If yes, - # return cs_idx; otherwise 0. - lookup_cs_at_cov <- function(v_idx, c_idx, cov) { - sel <- long_df$variant_idx == v_idx & - long_df$cs_idx == c_idx & - abs(long_df$coverage - cov) < 1e-12 - if (any(sel)) as.integer(c_idx) else 0L - } - cov95_table_idx <- which(abs(coverage_values - 0.95) < 1e-12) - - format_cs_string <- function(idx) { - if (is.na(idx) || idx <= 0L) paste0(method, "_0") else paste0(method, "_", idx) - } - - cs_95 <- character(n_keys) - cs_70 <- character(n_keys) - cs_50 <- character(n_keys) - cs_95_purity <- numeric(n_keys) - for (k in seq_len(n_keys)) { - v_idx <- key_grid$variant_idx[k] - c_idx <- key_grid$cs_idx[k] - cs95_idx <- lookup_cs_at_cov(v_idx, c_idx, 0.95) - cs70_idx <- lookup_cs_at_cov(v_idx, c_idx, 0.70) - cs50_idx <- lookup_cs_at_cov(v_idx, c_idx, 0.50) - cs_95[k] <- format_cs_string(cs95_idx) - cs_70[k] <- format_cs_string(cs70_idx) - cs_50[k] <- format_cs_string(cs50_idx) - if (cs95_idx > 0L && length(cov95_table_idx) > 0L) { - pvec <- purity_per_cov[[cov95_table_idx[1]]] - cs_95_purity[k] <- if (cs95_idx <= length(pvec)) { - val <- pvec[cs95_idx] - if (is.na(val)) 0 else as.numeric(val) - } else 0 - } else { - cs_95_purity[k] <- 0 - } - } - - # Per-variant lookups indexed by the row's variant_idx. - v_idx_vec <- key_grid$variant_idx - variant_id_vec <- variant_names[v_idx_vec] - pip_vec <- as.numeric(fit$pip[v_idx_vec]) - post_mean_vec <- posterior_effect[v_idx_vec] - post_se_vec <- posterior_effect_se[v_idx_vec] - beta_vec <- if (!is.null(sumstats$betahat)) sumstats$betahat[v_idx_vec] else rep(NA_real_, n_keys) - se_vec <- if (!is.null(sumstats$sebetahat)) sumstats$sebetahat[v_idx_vec] else rep(NA_real_, n_keys) - maf_vec <- if (!is.null(maf)) maf[v_idx_vec] else rep(NA_real_, n_keys) - + n_keys <- nrow(key_grid) + key_str <- paste(key_grid$variant_idx, key_grid$cs_idx, sep = ":") + + # For each requested coverage, which keys appear in cs_records at that + # coverage? Returns the key's cs_idx if present, else 0L. + idx_at <- function(cov) { + at <- cs_records[abs(cs_records$coverage - cov) < 1e-12, , drop = FALSE] + hits <- paste(at$variant_idx, at$cs_idx, sep = ":") + ifelse(key_str %in% hits, key_grid$cs_idx, 0L) + } + idx95 <- idx_at(0.95); idx70 <- idx_at(0.70); idx50 <- idx_at(0.50) + + # Per-coverage CS purity vectors (indexed by 1-based CS index). Only the + # 0.95-coverage purity is currently exported (as cs_95_purity); per-CS + # purities for the other coverages are kept here for downstream / future + # use even though they are not part of the 22-column output. + purity_per_cov <- lapply(cs_tables, .cs_purity_vec) + cov95 <- which(abs(coverage_values - 0.95) < 1e-12) + purity_95 <- if (length(cov95) > 0L) purity_per_cov[[cov95[1]]] else numeric() + cs_95_purity <- vapply(idx95, function(i) { + if (i <= 0L || i > length(purity_95)) return(0) + v <- purity_95[i]; if (is.na(v)) 0 else as.numeric(v) + }, numeric(1)) + + v_idx <- key_grid$variant_idx + variant_id_vec <- variant_names[v_idx] parsed <- tryCatch( suppressWarnings(parse_variant_id(variant_id_vec)), - error = function(e) { - stop("build_top_loci: parse_variant_id failed: ", conditionMessage(e)) - } + error = function(e) stop("build_top_loci: parse_variant_id failed: ", + conditionMessage(e)) ) if (is.null(parsed) || nrow(parsed) != length(variant_id_vec)) { stop("build_top_loci: parse_variant_id did not return one row per variant.") @@ -626,10 +554,10 @@ build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, is.na(parsed$A1) | !nzchar(parsed$A1) | is.na(parsed$A2) | !nzchar(parsed$A2) if (any(invalid)) { - first_bad <- variant_id_vec[which(invalid)[[1]]] stop("build_top_loci: parse_variant_id produced invalid coordinates ", - "for variant_id: ", first_bad) + "for variant_id: ", variant_id_vec[which(invalid)[[1]]]) } + pick <- function(x) if (is.null(x)) rep(NA_real_, n_keys) else x[v_idx] out <- data.frame( "#chr" = parsed$chrom, @@ -641,19 +569,19 @@ build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, gene = rep(fit_gene, n_keys), event = rep(fit_event, n_keys), n = rep(fit_n, n_keys), - maf = maf_vec, - beta = beta_vec, - se = se_vec, - pip = pip_vec, - posterior_effect_mean = post_mean_vec, - posterior_effect_se = post_se_vec, - cs_95 = cs_95, - cs_70 = cs_70, - cs_50 = cs_50, + maf = pick(maf), + beta = pick(sumstats$betahat), + se = pick(sumstats$sebetahat), + pip = as.numeric(fit$pip[v_idx]), + posterior_effect_mean = post_mean[v_idx], + posterior_effect_se = post_se[v_idx], + cs_95 = paste0(method, "_", idx95), + cs_70 = paste0(method, "_", idx70), + cs_50 = paste0(method, "_", idx50), cs_95_purity = cs_95_purity, method = rep(method, n_keys), - grange_start = rep(fit_grange_start, n_keys), - grange_end = rep(fit_grange_end, n_keys), + grange_start = rep(grange[["start"]], n_keys), + grange_end = rep(grange[["end"]], n_keys), stringsAsFactors = FALSE, check.names = FALSE ) @@ -661,6 +589,23 @@ build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, out } +# Per-CS purity from one cs_table: prefer susieR's sets$purity$min.abs.corr; +# fall back to cs_corr when purity is unavailable. +.cs_purity_vec <- function(ct) { + sp <- ct$sets$purity + if (!is.null(sp) && "min.abs.corr" %in% names(sp)) { + return(as.numeric(sp$min.abs.corr)) + } + if (!is.null(ct$cs_corr)) { + return(vapply(ct$cs_corr, function(m) { + if (is.null(m)) return(NA_real_) + if (!is.matrix(m) || nrow(m) <= 1) return(1) + min(abs(m[upper.tri(m)])) + }, numeric(1))) + } + rep(NA_real_, length(ct$sets$cs)) +} + .empty_top_loci <- function() { data.frame( "#chr" = character(), @@ -785,16 +730,14 @@ trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { #' Format Fine-mapping Post-processing for Protocol Output #' #' Converts method-aware fine-mapping post-processing output into the root-level -#' fields consumed by protocol RDS files. The single top-loci output is the -#' \code{top_loci} field (the 22-column unified table); there is no -#' \code{top_loci_long}, no wide-format \code{top_loci}, and no -#' \code{top_loci_export}. +#' fields consumed by protocol RDS files. Exposes the single 22-column unified +#' \code{top_loci} table alongside \code{susie_result_trimmed}, +#' \code{variant_names}, and method-specific intermediates. #' #' @param post Output from \code{\link{postprocess_finemapping_fits}}. #' @param primary_method Method whose result should populate root-level fields. #' @return A list with root-level fields including \code{variant_names}, -#' \code{susie_result_trimmed}, and the single unified \code{top_loci} -#' 22-column table. +#' \code{susie_result_trimmed}, and \code{top_loci}. #' @export format_finemapping_output <- function(post, primary_method) { method_post <- post$finemapping_results[[primary_method]] diff --git a/vignettes/susie_wrapper.Rmd b/vignettes/susie_wrapper.Rmd new file mode 100644 index 00000000..6bfac480 --- /dev/null +++ b/vignettes/susie_wrapper.Rmd @@ -0,0 +1,243 @@ +--- +title: "SuSiE wrapper unified top_loci output" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{SuSiE wrapper unified top_loci output} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") +``` + +# Purpose + +This vignette is the schema-of-record for the unified `top_loci` table that +`pecotmr`'s SuSiE wrapper exposes after the **build_top_loci migration**. It +documents the 22-column shape, the credible-set string convention, the +`(variant, gene, method, cs_membership)` row uniqueness rule, and shows a +worked `postprocess_finemapping_fits()` example that row-binds `susie` and +`susie_inf` rows into a single output. + +It is reference memory for the package; downstream callers (xqtl-protocol, +transMap, etc.) consume the same single table. + +# Why one table + +Before this change the wrapper exposed three artifacts side-by-side: + +- `top_loci_long` — one row per (variant, method, coverage, CS membership); +- `top_loci` — a wide table with `pip_` and `CS__` + columns; +- `top_loci_export` — a compact "export" projection built by a separate + helper. + +Three artifacts meant three formatter code paths, three sets of column +naming conventions, and silent inconsistencies between long and wide were +easy to introduce. The user request for this migration was simple: + +> "we will not have any build_top_loci_long and build_top_loci_width just a +> function named build_top_loci. ... there is only one build_top_loci +> function, no more various formats." + +The migration accordingly removes the long/wide/export trio (and their +helpers) and replaces them with a single function +[`build_top_loci()`](../R/susie_wrapper.R) that returns one table. The +top-level `postprocess_finemapping_fits()` calls `build_top_loci()` once +per method per fit and `rbind`s the per-method contributions into the +single final `top_loci` table that `format_finemapping_output()` exposes. + +There is no separately exposed long table and no separately exposed wide +table. + +# The 22-column schema + +The unified `top_loci` table has exactly these columns, in this order: + +| # | Column | Type | Source | +|---:|-------------------------|-----------|---------------------------------------------------------------| +| 1 | `#chr` | character | parsed from `variant` | +| 2 | `start` | integer | `pos - 1` from parsed `variant` (BED 0-based) | +| 3 | `end` | integer | `pos` from parsed `variant` (BED half-open) | +| 4 | `a1` | character | from parsed `variant` | +| 5 | `a2` | character | from parsed `variant` | +| 6 | `variant` | character | the row's variant id in `chr:pos:A2:A1` form | +| 7 | `gene` | character | `colnames(data_y)[1]` (phenotype identifier) | +| 8 | `event` | character | `paste(other_quantities$condition_id, gene, sep="_")` | +| 9 | `n` | integer | `nrow(data_y)` (analyzed sample count) | +| 10 | `maf` | numeric | from the `maf` argument | +| 11 | `beta` | numeric | from `sumstats$betahat` | +| 12 | `se` | numeric | from `sumstats$sebetahat` | +| 13 | `pip` | numeric | per-variant SuSiE PIP | +| 14 | `posterior_effect_mean` | numeric | `colSums(alpha * mu)` | +| 15 | `posterior_effect_se` | numeric | `sqrt(pmax(colSums(alpha * mu2) - posterior_effect_mean^2, 0))` | +| 16 | `cs_95` | character | per-row CS membership at coverage 0.95 (see below) | +| 17 | `cs_70` | character | per-row CS membership at coverage 0.70 | +| 18 | `cs_50` | character | per-row CS membership at coverage 0.50 | +| 19 | `cs_95_purity` | numeric | 0.95-coverage CS purity for this row (see below) | +| 20 | `method` | character | `"susie"`, `"susie_inf"`, ... | +| 21 | `grange_start` | integer | parsed from `other_quantities$region` | +| 22 | `grange_end` | integer | parsed from `other_quantities$region` | + +There are no `pip_` or `CS__` wide-format columns, no +`top_loci_long` column set (`variant_id` / `coverage` / `cs`), and no +legacy `cs_coverage_0.95` / `PIP` / `variant_ID` / `gene_ID` / +`event_ID` / `region_start` / `region_end` names. Downstream callers +should read the columns above. + +## CS membership strings (`cs_95`, `cs_70`, `cs_50`) + +The credible-set columns are **character strings**, not integers. Each +value has the form + +``` +_ +``` + +where `` is the row's `method` column and `` is the +**per-method 1-based** credible-set number. Two consequences: + +1. Each method numbers its credible sets independently starting at `1`. + `susie` may have `susie_1`, `susie_2`, ...; `susie_inf` may have + `susie_inf_1`, `susie_inf_2`, .... The CS index space is *not* + sequenced across methods. + +2. A variant that is retained by the PIP cutoff but is **not** in any + credible set at the given coverage carries the sentinel `"_0"`. + For example, a `susie`-method row with no 0.70-coverage CS membership + carries `cs_70 = "susie_0"`. + +## `cs_95_purity` + +A single purity column. It holds the purity of the row's 0.95-coverage +credible set, taken from `cs_tables[[i]]$sets$purity$min.abs.corr` (with a +`cs_corr`-based fallback when `sets$purity` is unavailable). Rows whose +`cs_95` is `"_0"` (i.e. not in any 0.95-coverage CS) carry +`cs_95_purity = 0`. There are no `cs_70_purity` or `cs_50_purity` columns +at this stage. + +# Row uniqueness + +The row identity is `(variant, gene, method, cs_membership)`. Concretely: + +- A variant in a single CS within a single method produces **one** row. +- A variant in CS 1 *and* CS 2 of the same method (overlapping CS) produces + **two** rows. Both share `(variant, gene, method)` and differ in + `cs_95` (e.g. one row `cs_95 = "susie_1"`, the other + `cs_95 = "susie_2"`). +- A variant that appears in CS from more than one method (e.g. `susie` CS 1 + and `susie_inf` CS 1) produces **two** rows. They share `(variant, gene)` + and differ in `method` (and in `cs_95` because each method numbers + independently). +- A PIP-only retained variant (no CS membership at any coverage) produces + **one** row per method with + `cs_95 = cs_70 = cs_50 = "_0"` and `cs_95_purity = 0`. + +This rule preserves overlapping-CS evidence both *within* a method (CS 1 +overlaps CS 2 in `susie`) and *across* methods (the same variant appears +in `susie`'s CS and `susie_inf`'s CS). + +# Caller responsibilities + +`build_top_loci()` reads per-fit metadata from `data_x`, `data_y`, and a +reserved `other_quantities` list. The caller (typically +`univariate_analysis_pipeline()` and through it the xqtl-protocol +`susie_twas` block and the transMap finemapping pipeline) supplies: + +| Field | Used for | +|--------------------------------|-------------------------------------| +| `data_y` | `n` and `gene` (column name) | +| `other_quantities$region` | `grange_start`, `grange_end` | +| `other_quantities$condition_id`| `event` composition with `gene` | + +If `other_quantities$region` is missing, `grange_start` and `grange_end` +remain `NA` in the wrapper's output; transMap's +`.transmap_finemap_top_loci_table()` then fills them from its own +analysis-region metadata. If `other_quantities$condition_id` is missing, +`event` falls back to `NA_character_`. + +# Worked example + +The example below sketches a typical wrapper run that fits both `susie` +and `susie_inf` and shows that the final `top_loci` has one row per +`(variant, gene, method, cs_membership)`. + +```{r setup, eval = FALSE} +library(pecotmr) +library(susieR) + +set.seed(42) +n <- 300; p <- 50 +X <- matrix(rnorm(n * p), n, p) +colnames(X) <- sprintf("chr1:%d:G:A", seq_len(p)) +beta <- rep(0, p); beta[c(10, 30)] <- 1.5 +y <- as.numeric(X %*% beta) + rnorm(n, sd = 0.5) +Y <- matrix(y, ncol = 1, dimnames = list(NULL, "ENSG00000179403")) + +# fit_susie_inf_then_susie() is a pecotmr helper that returns both fits. +fits <- fit_susie_inf_then_susie(X, y) + +# Caller-supplied per-fit metadata flows through other_quantities. The two +# reserved subfields are documented contract for the unified export only; +# other_quantities itself remains a free-form list (e.g. dropped_samples +# stays accepted). +oq <- list( + region = "chr1:1-50", + condition_id = "ctx_eQTL" +) + +post <- postprocess_finemapping_fits( + fits, data_x = X, data_y = Y, + coverage = 0.95, secondary_coverage = c(0.7, 0.5), + other_quantities = oq +) + +out <- format_finemapping_output(post, primary_method = "susie") +tl <- out$top_loci + +# tl is the 22-column unified table. Verify the column order and the +# per-method CS strings. +stopifnot(identical(names(tl), c( + "#chr", "start", "end", "a1", "a2", + "variant", "gene", "event", + "n", "maf", "beta", "se", + "pip", "posterior_effect_mean", "posterior_effect_se", + "cs_95", "cs_70", "cs_50", "cs_95_purity", + "method", "grange_start", "grange_end" +))) +stopifnot(setequal(unique(tl$method), c("susie", "susie_inf"))) +stopifnot(all(grepl("^(susie|susie_inf)_\\d+$", + c(tl$cs_95, tl$cs_70, tl$cs_50)))) +``` + +A few things to notice in the worked example: + +- `out` is what xqtl-protocol / transMap callers receive. The previously + exposed `out$top_loci_long` field is gone; `out$top_loci` is the only + top-loci field. +- `tl$method` is one column with multiple values (`"susie"`, + `"susie_inf"`); there are no per-method `pip_` columns. To + inspect a single method's rows, filter on `tl$method`. +- `tl$cs_95` etc. are character columns. If you need an integer CS index + for a particular method, strip the prefix: + `as.integer(sub("^.*_", "", tl$cs_95[tl$method == "susie"]))`. +- A row whose `cs_95` is `"susie_0"` was retained by the PIP cutoff but is + not in any 0.95-coverage CS. Such rows have `cs_95_purity == 0` by + construction. + +# Downstream contract + +`format_finemapping_output()` exposes exactly one top-loci field, +`top_loci`, with the 22 columns above. The previously exposed +`top_loci_long` and wide `top_loci` (with `pip_` / +`CS__` columns) and the standalone `top_loci_export` are all +removed. Non-top-loci fields (`susie_result_trimmed`, `variant_names`, +`context_names`, `analysis_script`, ...) keep their previous semantics. + +The `FineMappingResult` S4 slot still satisfies its legacy validity +check (it expects `variant_id` and `method` columns); `susie_wrapper.R` +projects the new `top_loci` into a backward-compatible shape only for the +S4 slot, so `AllClasses.R`, `AllMethods.R`, and `vcf_writer.R` do not need +to change. End-user consumers should read the 22-column `top_loci` +returned by `format_finemapping_output()`, not the slot. From 784e6971a4d0fb4799eb07bfa1e332f473d51895 Mon Sep 17 00:00:00 2001 From: xueweic Date: Sat, 30 May 2026 15:20:01 +0000 Subject: [PATCH 08/14] Update documentation --- man/build_top_loci.Rd | 82 +++++++++++------------------ man/format_finemapping_output.Rd | 10 ++-- man/postprocess_finemapping_fits.Rd | 8 ++- 3 files changed, 39 insertions(+), 61 deletions(-) diff --git a/man/build_top_loci.Rd b/man/build_top_loci.Rd index ef2024ba..dd76e581 100644 --- a/man/build_top_loci.Rd +++ b/man/build_top_loci.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/susie_wrapper.R \name{build_top_loci} \alias{build_top_loci} -\title{Build the unified single top-loci table for one fit and one method.} +\title{Build the unified top-loci table for one fit and one method.} \usage{ build_top_loci( fit, @@ -18,76 +18,58 @@ build_top_loci( ) } \arguments{ -\item{fit}{A fitted SuSiE-family object (must expose \code{alpha}, +\item{fit}{Fitted SuSiE-family object (must expose \code{alpha}, \code{mu}, \code{mu2}, \code{pip}).} -\item{cs_tables}{A list of CS tables (one per coverage), as produced by +\item{cs_tables}{List of CS tables (one per coverage) from \code{compute_cs_tables()}.} -\item{variant_names}{Character vector of variant IDs in -\code{chr:pos:A2:A1} form, length equal to the number of variants in the -fit. Used to construct \code{variant}, \code{#chr}, \code{start}, -\code{end}, \code{a1}, \code{a2}.} +\item{variant_names}{Character vector of variant IDs +(\code{chr:pos:A2:A1}).} -\item{sumstats}{Optional marginal-association summary statistics -(\code{betahat}, \code{sebetahat}) used to fill \code{beta} and \code{se}.} +\item{sumstats}{Optional marginal-association summary (\code{betahat}, +\code{sebetahat}) filling \code{beta} / \code{se}.} \item{maf}{Optional numeric vector of minor-allele frequencies.} -\item{method}{Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Used -to construct the per-method \code{"_"} strings and the -\code{method} column. Required.} +\item{method}{Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Required.} \item{signal_cutoff}{PIP cutoff for retaining PIP-only (non-CS) variants.} -\item{data_x}{Optional regional genotype matrix; used only for sample-count -shape checks.} +\item{data_x}{Optional regional genotype matrix.} \item{data_y}{Optional regional phenotype matrix; \code{nrow(data_y)} fills \code{n}, \code{colnames(data_y)[1]} fills \code{gene}.} -\item{other_quantities}{Optional list with reserved subfields -\code{region} (e.g. \code{"chr1:100-200"}) and \code{condition_id} -(e.g. \code{"Ast_DeJager_eQTL"}); used to fill \code{grange_start}, -\code{grange_end}, and to compose \code{event} as -\code{paste(condition_id, gene, sep = "_")}. Missing subfields are -filled with \code{NA} rather than dropped.} +\item{other_quantities}{Optional list with reserved subfields \code{region} +(filling \code{grange_start} / \code{grange_end}) and \code{condition_id} +(composing \code{event} as \code{paste(condition_id, gene, sep = "_")}).} } \value{ -A data frame in the fixed 22-column unified \code{top_loci} shape - for this one fit and one method. Returns an empty data frame with the - correct columns and dtypes if there is nothing to retain. +A data frame in the fixed 22-column shape for this fit and method, + or an empty data frame if nothing is retained. } \description{ -Produces the per-fit, per-method contribution to the unified \code{top_loci} -table in the fixed 22-column shape. The outer -\code{postprocess_finemapping_fits()} loop calls this once per method per fit -and row-binds the results into the single final \code{top_loci} table that is -exposed by \code{format_finemapping_output()}. +Returns the per-fit, per-method contribution to the unified \code{top_loci} +table in the fixed 22-column shape. \code{postprocess_finemapping_fits()} +calls this once per method per fit and row-binds the results into the +single \code{top_loci} returned by \code{format_finemapping_output()}. } \details{ -This function replaces the previous \code{build_top_loci_long()}, -\code{build_top_loci_wide()}, and \code{build_top_loci_export()} trio. -There is no separately exposed long or wide table. +Output columns, in order: \code{#chr}, \code{start}, \code{end}, \code{a1}, +\code{a2}, \code{variant}, \code{gene}, \code{event}, \code{n}, \code{maf}, +\code{beta}, \code{se}, \code{pip}, \code{posterior_effect_mean}, +\code{posterior_effect_se}, \code{cs_95}, \code{cs_70}, \code{cs_50}, +\code{cs_95_purity}, \code{method}, \code{grange_start}, \code{grange_end}. -The output column order is exactly (22 columns): -\code{#chr}, \code{start}, \code{end}, \code{a1}, \code{a2}, -\code{variant}, \code{gene}, \code{event}, -\code{n}, \code{maf}, \code{beta}, \code{se}, -\code{pip}, \code{posterior_effect_mean}, \code{posterior_effect_se}, -\code{cs_95}, \code{cs_70}, \code{cs_50}, \code{cs_95_purity}, -\code{method}, \code{grange_start}, \code{grange_end}. +\code{cs_95} / \code{cs_70} / \code{cs_50} are character strings of the +form \code{"_"} where each method numbers credible sets +independently from 1. Variants retained by the PIP cutoff but not in any +credible set at a coverage carry \code{"_0"}. \code{cs_95_purity} +is the 0.95-coverage purity for the row's \code{(method, cs_95)}; rows +whose \code{cs_95} is \code{"_0"} carry \code{0}. -The \code{cs_95}, \code{cs_70}, \code{cs_50} columns are character strings of -the form \code{"_"} where each method numbers its credible -sets independently starting at 1. Variants retained by the PIP cutoff but not -assigned to any credible set at the given coverage use \code{"_0"}. -\code{cs_95_purity} is the 0.95-coverage credible-set purity for the row's -\code{(method, cs_95)}; rows whose \code{cs_95} is \code{"_0"} carry -\code{cs_95_purity = 0}. - -Row uniqueness inside this function's output is one row per -\code{(variant, gene, cs_membership)} at the given \code{method}. Overlapping -credible-set membership for the same method produces one row per CS, so the -overlapping-CS contract is preserved. +Row uniqueness is \code{(variant, gene, cs_membership)} at the given +\code{method}; overlapping CS within the same method produces one row per +CS. } diff --git a/man/format_finemapping_output.Rd b/man/format_finemapping_output.Rd index 7c48d25c..95e290c4 100644 --- a/man/format_finemapping_output.Rd +++ b/man/format_finemapping_output.Rd @@ -13,13 +13,11 @@ format_finemapping_output(post, primary_method) } \value{ A list with root-level fields including \code{variant_names}, - \code{susie_result_trimmed}, and the single unified \code{top_loci} - 22-column table. + \code{susie_result_trimmed}, and \code{top_loci}. } \description{ Converts method-aware fine-mapping post-processing output into the root-level -fields consumed by protocol RDS files. The single top-loci output is the -\code{top_loci} field (the 22-column unified table); there is no -\code{top_loci_long}, no wide-format \code{top_loci}, and no -\code{top_loci_export}. +fields consumed by protocol RDS files. Exposes the single 22-column unified +\code{top_loci} table alongside \code{susie_result_trimmed}, +\code{variant_names}, and method-specific intermediates. } diff --git a/man/postprocess_finemapping_fits.Rd b/man/postprocess_finemapping_fits.Rd index bbd7cec1..558d0ccb 100644 --- a/man/postprocess_finemapping_fits.Rd +++ b/man/postprocess_finemapping_fits.Rd @@ -50,11 +50,9 @@ input used for credible-set purity and correlations.} \value{ A list with \code{finemapping_results} (per-method post-processed objects, each carrying a trimmed fit and method-specific intermediates) - and the single unified \code{top_loci} table in the fixed 22-column - shape (see \code{\link{build_top_loci}}). Per-method contributions are - produced by \code{build_top_loci()} once per method and row-bound into - this single \code{top_loci}. There is no separately exposed - \code{top_loci_long} or wide-format \code{top_loci}. + and a single unified \code{top_loci} table in the fixed 22-column shape + (see \code{\link{build_top_loci}}). Per-method contributions are + row-bound into \code{top_loci} by an outer method for-loop. } \description{ Applies method-aware post-processing to one or more SuSiE-family fits and From b71943816e9e1d33354f601970b5d2ae7814ccc3 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 30 May 2026 10:43:22 -0500 Subject: [PATCH 09/14] Update test_susie_wrapper.R --- tests/testthat/test_susie_wrapper.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test_susie_wrapper.R b/tests/testthat/test_susie_wrapper.R index fd9eb70c..dee46356 100644 --- a/tests/testthat/test_susie_wrapper.R +++ b/tests/testthat/test_susie_wrapper.R @@ -258,9 +258,10 @@ test_that("susie_rss_pipeline runs with single_effect method", { expect_true(is.list(result)) expect_true("variant_names" %in% names(result)) expect_true("susie_result_trimmed" %in% names(result)) - if (!is.null(result$top_loci)) { - expect_true("pip_single_effect" %in% names(result$top_loci)) - expect_true("CS_95_single_effect" %in% names(result$top_loci)) + if (!is.null(result$top_loci) && nrow(result$top_loci) > 0) { + expect_true("pip" %in% names(result$top_loci)) + expect_true("cs_95" %in% names(result$top_loci)) + expect_true(all(result$top_loci$method == "single_effect")) } # PIPs should be numeric, in [0,1], and sum to at most 1 (L=1) pip <- result$susie_result_trimmed$pip @@ -293,9 +294,10 @@ test_that("susie_rss_pipeline runs with bayesian_conditional_regression", { ) expect_true(is.list(result)) expect_true("susie_result_trimmed" %in% names(result)) - if (!is.null(result$top_loci)) { - expect_true("pip_bayesian_conditional_regression" %in% names(result$top_loci)) - expect_true("CS_95_bayesian_conditional_regression" %in% names(result$top_loci)) + if (!is.null(result$top_loci) && nrow(result$top_loci) > 0) { + expect_true("pip" %in% names(result$top_loci)) + expect_true("cs_95" %in% names(result$top_loci)) + expect_true(all(result$top_loci$method == "bayesian_conditional_regression")) } pip <- result$susie_result_trimmed$pip expect_true(is.numeric(pip)) From f253d5872af7a2e5ebdd9c2ee0ac20db1c5a4c6a Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 30 May 2026 12:44:16 -0500 Subject: [PATCH 10/14] change fit_susie_inf to add_susie_inf --- R/univariate_pipeline.R | 14 +++++++------- man/univariate_analysis_pipeline.Rd | 4 ++-- tests/testthat/test_univariate_pipeline.R | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R index 3339cafc..b891c4b2 100644 --- a/R/univariate_pipeline.R +++ b/R/univariate_pipeline.R @@ -25,7 +25,7 @@ #' @param finemapping_extra_opts Additional options passed to \code{susieR::susie()}. #' SuSiE-inf is always fitted with \code{refine = FALSE}; the ordinary SuSiE #' fit keeps these options and is initialized with \code{model_init}. -#' @param fit_susie_inf Whether to fit SuSiE-inf before ordinary SuSiE. Default +#' @param add_susie_inf Whether to fit SuSiE-inf before ordinary SuSiE. Default #' is TRUE for existing pipeline compatibility. If FALSE, ordinary SuSiE is #' fitted directly and SuSiE-inf fitted objects/results are not returned. #' @param twas_weights Whether to compute TWAS weights. Default is TRUE. @@ -61,7 +61,7 @@ univariate_analysis_pipeline <- function( coverage = c(0.95, 0.7, 0.5), min_abs_corr = 0.8, finemapping_extra_opts = list(refine = TRUE), - fit_susie_inf = TRUE, + add_susie_inf = TRUE, # TWAS weights and CV for TWAS weights twas_weights = TRUE, sample_partition = NULL, @@ -79,11 +79,11 @@ univariate_analysis_pipeline <- function( if (!is.numeric(Y_scalar) || length(Y_scalar) != 1) stop("Y_scalar must be a numeric scalar") if (!is.numeric(L) || L <= 0) stop("L must be a positive integer") if (!is.null(L_greedy) && (!is.numeric(L_greedy) || L_greedy <= 0)) stop("L_greedy must be NULL or a positive integer") - if (!is.logical(fit_susie_inf) || length(fit_susie_inf) != 1 || is.na(fit_susie_inf)) { - stop("fit_susie_inf must be TRUE or FALSE") + if (!is.logical(add_susie_inf) || length(add_susie_inf) != 1 || is.na(add_susie_inf)) { + stop("add_susie_inf must be TRUE or FALSE") } - if (!isTRUE(fit_susie_inf) && isTRUE(twas_weights)) { - stop("fit_susie_inf = FALSE is not compatible with twas_weights = TRUE") + if (!isTRUE(add_susie_inf) && isTRUE(twas_weights)) { + stop("add_susie_inf = FALSE is not compatible with twas_weights = TRUE") } # Initial PIP check @@ -126,7 +126,7 @@ univariate_analysis_pipeline <- function( finemapping_extra_opts, list(L = L, L_greedy = L_greedy, coverage = coverage[1]) ) - if (isTRUE(fit_susie_inf)) { + if (isTRUE(add_susie_inf)) { message("Fitting SuSiE-inf model on input data ...") message("Fitting SuSiE model initialized by SuSiE-inf ...") fitted_models <- fit_susie_inf_then_susie( diff --git a/man/univariate_analysis_pipeline.Rd b/man/univariate_analysis_pipeline.Rd index 2f54b4b5..090de230 100644 --- a/man/univariate_analysis_pipeline.Rd +++ b/man/univariate_analysis_pipeline.Rd @@ -23,7 +23,7 @@ univariate_analysis_pipeline( coverage = c(0.95, 0.7, 0.5), min_abs_corr = 0.8, finemapping_extra_opts = list(refine = TRUE), - fit_susie_inf = TRUE, + add_susie_inf = TRUE, twas_weights = TRUE, sample_partition = NULL, max_cv_variants = -1, @@ -72,7 +72,7 @@ which is stricter than the susieR default of 0.5.} SuSiE-inf is always fitted with \code{refine = FALSE}; the ordinary SuSiE fit keeps these options and is initialized with \code{model_init}.} -\item{fit_susie_inf}{Whether to fit SuSiE-inf before ordinary SuSiE. Default +\item{add_susie_inf}{Whether to fit SuSiE-inf before ordinary SuSiE. Default is TRUE for existing pipeline compatibility. If FALSE, ordinary SuSiE is fitted directly and SuSiE-inf fitted objects/results are not returned.} diff --git a/tests/testthat/test_univariate_pipeline.R b/tests/testthat/test_univariate_pipeline.R index a8dacb99..f974cf94 100644 --- a/tests/testthat/test_univariate_pipeline.R +++ b/tests/testthat/test_univariate_pipeline.R @@ -640,7 +640,7 @@ test_that("uap: ordinary susie can run without susie-inf initialization", { result <- univariate_analysis_pipeline( X = inp$X, Y = inp$Y, maf = inp$maf, - twas_weights = FALSE, fit_susie_inf = FALSE, + twas_weights = FALSE, add_susie_inf = FALSE, L = 15, L_greedy = 7 ) expect_length(captured_calls, 1) From 53a1495236336a6de182261bc6f7c5924d83d581 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 30 May 2026 14:47:22 -0500 Subject: [PATCH 11/14] add region to pecotmr and remove from other quantities --- R/susie_wrapper.R | 16 +++++---- R/univariate_pipeline.R | 5 ++- tests/testthat/test_susie_wrapper.R | 14 ++++---- vignettes/susie_wrapper.Rmd | 54 ++++++++++++++--------------- 4 files changed, 49 insertions(+), 40 deletions(-) diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index 03290976..357e9c25 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -180,6 +180,7 @@ postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, secondary_coverage = c(0.7, 0.5), signal_cutoff = 0.1, other_quantities = NULL, + region = NULL, prior_eff_tol = 1e-9, min_abs_corr = 0.8) { fits <- fits[!vapply(fits, is.null, logical(1))] @@ -198,6 +199,7 @@ postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, X_scalar = X_scalar, y_scalar = y_scalar, maf = maf, coverage = coverage, secondary_coverage = secondary_coverage, signal_cutoff = signal_cutoff, other_quantities = other_quantities, + region = region, prior_eff_tol = prior_eff_tol, min_abs_corr = min_abs_corr ) }) @@ -257,6 +259,7 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { secondary_coverage = c(0.7, 0.5), signal_cutoff = 0.1, other_quantities = NULL, + region = NULL, prior_eff_tol = 1e-9, min_abs_corr = 0.8, cs_input = c("X", "Xcorr", "fsusie")) { @@ -272,7 +275,8 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { top_loci <- build_top_loci( fit, cs_tables, variant_names = variant_names, sumstats = sumstats, maf = maf, method = method, signal_cutoff = signal_cutoff, - data_x = data_x, data_y = data_y, other_quantities = other_quantities + data_x = data_x, data_y = data_y, other_quantities = other_quantities, + region = region ) trimmed <- trim_finemapping_fit(fit, effect_idx, method, cs_tables) @@ -457,16 +461,16 @@ compute_cs_table <- function(fit, data_x, coverage, cs_input = c("X", "Xcorr", " #' @param data_x Optional regional genotype matrix. #' @param data_y Optional regional phenotype matrix; \code{nrow(data_y)} fills #' \code{n}, \code{colnames(data_y)[1]} fills \code{gene}. -#' @param other_quantities Optional list with reserved subfields \code{region} -#' (filling \code{grange_start} / \code{grange_end}) and \code{condition_id} -#' (composing \code{event} as \code{paste(condition_id, gene, sep = "_")}). +#' @param other_quantities Optional list. Default is NULL. +#' @param region Optional \code{"chr:start-end"} string. Default is NULL. #' @return A data frame in the fixed 22-column shape for this fit and method, #' or an empty data frame if nothing is retained. #' @export build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, maf = NULL, method, signal_cutoff = 0.1, data_x = NULL, data_y = NULL, - other_quantities = NULL) { + other_quantities = NULL, + region = NULL) { if (missing(method) || is.null(method) || length(method) != 1L || is.na(method) || !nzchar(method)) { stop("build_top_loci: `method` is required (e.g. \"susie\", \"susie_inf\").") @@ -485,7 +489,7 @@ build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, !is.na(fit_gene) && nzchar(fit_gene)) { paste(other_quantities$condition_id, fit_gene, sep = "_") } else NA_character_ - grange <- .parse_grange(other_quantities$region) + grange <- .parse_grange(region) # Per-variant posterior effect / SE, computed once across all variants. alpha <- as.matrix(fit$alpha) diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R index b891c4b2..b09761e0 100644 --- a/R/univariate_pipeline.R +++ b/R/univariate_pipeline.R @@ -11,6 +11,7 @@ #' @param maf A vector of minor allele frequencies for each variant in X. #' @param X_variance Optional variance of X. Default is NULL. #' @param other_quantities A list of other quantities to be carried into fine-mapping post-processing. Default is an empty list. +#' @param region Optional \code{"chr:start-end"} string for the analysis region. Default is NULL. #' @param imiss_cutoff Individual missingness cutoff. Default is 1.0. #' @param maf_cutoff Minor allele frequency cutoff. Default is NULL. #' @param xvar_cutoff Variance cutoff for X. Default is 0.05. @@ -47,6 +48,7 @@ univariate_analysis_pipeline <- function( Y_scalar = 1, X_variance = NULL, other_quantities = list(), + region = NULL, # filters imiss_cutoff = 1.0, maf_cutoff = NULL, @@ -158,7 +160,8 @@ univariate_analysis_pipeline <- function( secondary_coverage = if (length(coverage) > 1) coverage[-1] else NULL, signal_cutoff = signal_cutoff, min_abs_corr = min_abs_corr, - other_quantities = other_quantities + other_quantities = other_quantities, + region = region ) res <- c(res, format_finemapping_output(susie_post, primary_method = "susie")) if (!is.null(susie_post$finemapping_results$susie_inf)) { diff --git a/tests/testthat/test_susie_wrapper.R b/tests/testthat/test_susie_wrapper.R index dee46356..c0df1b9e 100644 --- a/tests/testthat/test_susie_wrapper.R +++ b/tests/testthat/test_susie_wrapper.R @@ -890,14 +890,16 @@ if (!exists(".make_univariate_data", inherits = FALSE)) { .run_build_top_loci <- function(inp, method = "susie", signal_cutoff = 0.05, sumstats = NULL, maf = NULL, - other_quantities = NULL) { + other_quantities = NULL, + region = NULL) { build_top_loci( fit = inp$fit, cs_tables = inp$cs_tables, variant_names = inp$variant_names, sumstats = sumstats, maf = maf, method = method, signal_cutoff = signal_cutoff, data_x = inp$data_x, data_y = inp$data_y, - other_quantities = other_quantities + other_quantities = other_quantities, + region = region ) } @@ -940,13 +942,13 @@ test_that("build_top_loci emits 22 columns in the fixed order on a non-empty fit "0.7" = list(c(1L, 2L)), "0.5" = list(c(1L, 2L))), n_samples = 419, n_variants = 11332) - other_q <- list(region = "chr10:10823338-14348298", - condition_id = "Ast_DeJager_eQTL") + other_q <- list(condition_id = "Ast_DeJager_eQTL") out <- .run_build_top_loci(inp, method = "susie", sumstats = list(betahat = c(0.2, -0.1), sebetahat = c(0.05, 0.04)), maf = c(0.10, 0.25), - other_quantities = other_q) + other_quantities = other_q, + region = "chr10:10823338-14348298") expect_equal(names(out), .UNIFIED_TOP_LOCI_COLS) expect_equal(unique(out$gene), "ENSG00000179403") expect_equal(unique(out$event), "Ast_DeJager_eQTL_ENSG00000179403") @@ -1161,7 +1163,7 @@ test_that("non-top-loci wrapper fields (susie_result_trimmed, variant_names) are expect_true("variant_names" %in% names(out)) }) -test_that("missing other_quantities$region produces NA grange columns rather than silent omission", { +test_that("missing region produces NA grange columns rather than silent omission", { variant_ids <- c("chr1:100:A:G") cs_at_cov <- list("0.95" = list(1L), "0.7" = list(1L), diff --git a/vignettes/susie_wrapper.Rmd b/vignettes/susie_wrapper.Rmd index 6bfac480..51e9ba8b 100644 --- a/vignettes/susie_wrapper.Rmd +++ b/vignettes/susie_wrapper.Rmd @@ -77,8 +77,8 @@ The unified `top_loci` table has exactly these columns, in this order: | 18 | `cs_50` | character | per-row CS membership at coverage 0.50 | | 19 | `cs_95_purity` | numeric | 0.95-coverage CS purity for this row (see below) | | 20 | `method` | character | `"susie"`, `"susie_inf"`, ... | -| 21 | `grange_start` | integer | parsed from `other_quantities$region` | -| 22 | `grange_end` | integer | parsed from `other_quantities$region` | +| 21 | `grange_start` | integer | parsed from top-level `region` arg | +| 22 | `grange_end` | integer | parsed from top-level `region` arg | There are no `pip_` or `CS__` wide-format columns, no `top_loci_long` column set (`variant_id` / `coverage` / `cs`), and no @@ -140,22 +140,25 @@ in `susie`'s CS and `susie_inf`'s CS). # Caller responsibilities -`build_top_loci()` reads per-fit metadata from `data_x`, `data_y`, and a -reserved `other_quantities` list. The caller (typically -`univariate_analysis_pipeline()` and through it the xqtl-protocol -`susie_twas` block and the transMap finemapping pipeline) supplies: - -| Field | Used for | -|--------------------------------|-------------------------------------| -| `data_y` | `n` and `gene` (column name) | -| `other_quantities$region` | `grange_start`, `grange_end` | -| `other_quantities$condition_id`| `event` composition with `gene` | - -If `other_quantities$region` is missing, `grange_start` and `grange_end` -remain `NA` in the wrapper's output; transMap's -`.transmap_finemap_top_loci_table()` then fills them from its own -analysis-region metadata. If `other_quantities$condition_id` is missing, -`event` falls back to `NA_character_`. +`build_top_loci()` reads per-fit metadata from `data_x`, `data_y`, a +reserved `other_quantities` list, and a top-level `region` argument. The +caller (typically `univariate_analysis_pipeline()` and through it the +xqtl-protocol `susie_twas` block and the transMap finemapping pipeline) +supplies: + +| Field | Used for | +|----------------------------------|-------------------------------------| +| `data_y` | `n` and `gene` (column name) | +| `region` (top-level) | `grange_start`, `grange_end` (parsed by `parse_region()`) | +| `other_quantities$condition_id` | `event` composition with `gene` | + +`region` is a top-level `"chr:start-end"` string argument of +`univariate_analysis_pipeline()` (and `postprocess_finemapping_fits()` / +`build_top_loci()`). When `region` is `NULL`, `grange_start` and +`grange_end` are `NA`. If `other_quantities$condition_id` is missing, +`event` falls back to `NA_character_`. `other_quantities` remains a +free-form list (e.g. `dropped_samples` stays accepted); only +`condition_id` is consumed by the unified export. # Worked example @@ -178,19 +181,16 @@ Y <- matrix(y, ncol = 1, dimnames = list(NULL, "ENSG00000179403")) # fit_susie_inf_then_susie() is a pecotmr helper that returns both fits. fits <- fit_susie_inf_then_susie(X, y) -# Caller-supplied per-fit metadata flows through other_quantities. The two -# reserved subfields are documented contract for the unified export only; -# other_quantities itself remains a free-form list (e.g. dropped_samples -# stays accepted). -oq <- list( - region = "chr1:1-50", - condition_id = "ctx_eQTL" -) +# Caller-supplied per-fit metadata. `condition_id` flows through +# other_quantities (which remains a free-form list); the analysis region +# flows through `region`, a top-level argument. +oq <- list(condition_id = "ctx_eQTL") post <- postprocess_finemapping_fits( fits, data_x = X, data_y = Y, coverage = 0.95, secondary_coverage = c(0.7, 0.5), - other_quantities = oq + other_quantities = oq, + region = "chr1:1-50" ) out <- format_finemapping_output(post, primary_method = "susie") From c50f7dc0004c55bcc4c186dbc60233b0393ac6b4 Mon Sep 17 00:00:00 2001 From: xueweic Date: Sat, 30 May 2026 19:48:42 +0000 Subject: [PATCH 12/14] Update documentation --- man/build_top_loci.Rd | 9 +++++---- man/postprocess_finemapping_fits.Rd | 1 + man/univariate_analysis_pipeline.Rd | 3 +++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/man/build_top_loci.Rd b/man/build_top_loci.Rd index dd76e581..f44ae5a3 100644 --- a/man/build_top_loci.Rd +++ b/man/build_top_loci.Rd @@ -14,7 +14,8 @@ build_top_loci( signal_cutoff = 0.1, data_x = NULL, data_y = NULL, - other_quantities = NULL + other_quantities = NULL, + region = NULL ) } \arguments{ @@ -41,9 +42,9 @@ build_top_loci( \item{data_y}{Optional regional phenotype matrix; \code{nrow(data_y)} fills \code{n}, \code{colnames(data_y)[1]} fills \code{gene}.} -\item{other_quantities}{Optional list with reserved subfields \code{region} -(filling \code{grange_start} / \code{grange_end}) and \code{condition_id} -(composing \code{event} as \code{paste(condition_id, gene, sep = "_")}).} +\item{other_quantities}{Optional list. Default is NULL.} + +\item{region}{Optional \code{"chr:start-end"} string. Default is NULL.} } \value{ A data frame in the fixed 22-column shape for this fit and method, diff --git a/man/postprocess_finemapping_fits.Rd b/man/postprocess_finemapping_fits.Rd index 558d0ccb..3f16a70c 100644 --- a/man/postprocess_finemapping_fits.Rd +++ b/man/postprocess_finemapping_fits.Rd @@ -15,6 +15,7 @@ postprocess_finemapping_fits( secondary_coverage = c(0.7, 0.5), signal_cutoff = 0.1, other_quantities = NULL, + region = NULL, prior_eff_tol = 1e-09, min_abs_corr = 0.8 ) diff --git a/man/univariate_analysis_pipeline.Rd b/man/univariate_analysis_pipeline.Rd index 090de230..06043f9a 100644 --- a/man/univariate_analysis_pipeline.Rd +++ b/man/univariate_analysis_pipeline.Rd @@ -12,6 +12,7 @@ univariate_analysis_pipeline( Y_scalar = 1, X_variance = NULL, other_quantities = list(), + region = NULL, imiss_cutoff = 1, maf_cutoff = NULL, xvar_cutoff = 0, @@ -47,6 +48,8 @@ univariate_analysis_pipeline( \item{other_quantities}{A list of other quantities to be carried into fine-mapping post-processing. Default is an empty list.} +\item{region}{Optional \code{"chr:start-end"} string for the analysis region. Default is NULL.} + \item{imiss_cutoff}{Individual missingness cutoff. Default is 1.0.} \item{maf_cutoff}{Minor allele frequency cutoff. Default is NULL.} From 4cde25930d6c49bfb1b0ca8c046969cf050ce27f Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sun, 31 May 2026 00:38:39 -0400 Subject: [PATCH 13/14] Update multivariate_pipeline.R --- R/multivariate_pipeline.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/multivariate_pipeline.R b/R/multivariate_pipeline.R index 70dc4f3e..913d9c42 100644 --- a/R/multivariate_pipeline.R +++ b/R/multivariate_pipeline.R @@ -71,6 +71,7 @@ multivariate_analysis_pipeline <- function( maf, X_variance = NULL, other_quantities = list(), + region = NULL, # filters imiss_cutoff = 1.0, maf_cutoff = 0.01, @@ -304,7 +305,8 @@ multivariate_analysis_pipeline <- function( secondary_coverage = sec_coverage, signal_cutoff = signal_cutoff, min_abs_corr = min_abs_corr, - other_quantities = other_quantities + other_quantities = other_quantities, + region = region ) res <- c(res, format_finemapping_output(mvsusie_post, primary_method = "mvsusie")) res$total_time_elapsed <- proc.time() - st From 1beec6f8ccfc148afa6e41ca4a542db958e880c9 Mon Sep 17 00:00:00 2001 From: xueweic Date: Sun, 31 May 2026 04:39:59 +0000 Subject: [PATCH 14/14] Update documentation --- man/multivariate_analysis_pipeline.Rd | 1 + 1 file changed, 1 insertion(+) diff --git a/man/multivariate_analysis_pipeline.Rd b/man/multivariate_analysis_pipeline.Rd index 54ea75f5..fdd504f9 100644 --- a/man/multivariate_analysis_pipeline.Rd +++ b/man/multivariate_analysis_pipeline.Rd @@ -10,6 +10,7 @@ multivariate_analysis_pipeline( maf, X_variance = NULL, other_quantities = list(), + region = NULL, imiss_cutoff = 1, maf_cutoff = 0.01, xvar_cutoff = 0.01,