diff --git a/NAMESPACE b/NAMESPACE index cd43db4f..9f408151 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ export(bayes_c_weights) export(bayes_l_weights) export(bayes_n_weights) export(bayes_r_weights) +export(build_top_loci) export(check_ld) export(clean_context_names) export(coloc_post_processor) diff --git a/R/multivariate_pipeline.R b/R/multivariate_pipeline.R index 70dc4f3e..913d9c42 100644 --- a/R/multivariate_pipeline.R +++ b/R/multivariate_pipeline.R @@ -71,6 +71,7 @@ multivariate_analysis_pipeline <- function( maf, X_variance = NULL, other_quantities = list(), + region = NULL, # filters imiss_cutoff = 1.0, maf_cutoff = 0.01, @@ -304,7 +305,8 @@ multivariate_analysis_pipeline <- function( secondary_coverage = sec_coverage, signal_cutoff = signal_cutoff, min_abs_corr = min_abs_corr, - other_quantities = other_quantities + other_quantities = other_quantities, + region = region ) res <- c(res, format_finemapping_output(mvsusie_post, primary_method = "mvsusie")) res$total_time_elapsed <- proc.time() - st diff --git a/R/susie_wrapper.R b/R/susie_wrapper.R index bb1e2a8f..357e9c25 100644 --- a/R/susie_wrapper.R +++ b/R/susie_wrapper.R @@ -168,11 +168,11 @@ fit_susie_inf_then_susie <- function(X, y, args = list(), #' @param other_quantities Optional list carried into each method result. #' @param prior_eff_tol Tolerance for retaining effects by prior variance. #' @param min_abs_corr Minimum absolute correlation for credible-set purity. -#' @return A list with \code{finemapping_results}, \code{top_loci_long}, and -#' \code{top_loci}. The long table is lossless, with one row per -#' variant-method-coverage-CS membership. The wide table stores one row per -#' variant, method-specific \code{pip_} columns, method-specific -#' \code{CS__} columns, and \code{model_source}. +#' @return A list with \code{finemapping_results} (per-method post-processed +#' objects, each carrying a trimmed fit and method-specific intermediates) +#' and a single unified \code{top_loci} table in the fixed 22-column shape +#' (see \code{\link{build_top_loci}}). Per-method contributions are +#' row-bound into \code{top_loci} by an outer method for-loop. #' @export postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, X_scalar = 1, y_scalar = 1, @@ -180,6 +180,7 @@ postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, secondary_coverage = c(0.7, 0.5), signal_cutoff = 0.1, other_quantities = NULL, + region = NULL, prior_eff_tol = 1e-9, min_abs_corr = 0.8) { fits <- fits[!vapply(fits, is.null, logical(1))] @@ -188,6 +189,9 @@ postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, stop("fits must be a named list; names define method identity.") } + # One method for-loop: each method calls build_top_loci() once per fit; the + # per-method 22-column contributions are row-bound below into the single + # final `top_loci` table. There is no separately exposed long or wide table. posts <- lapply(names(fits), function(method) { fit <- .set_finemapping_fit_class(fits[[method]], method) postprocess_finemapping_fit( @@ -195,21 +199,27 @@ postprocess_finemapping_fits <- function(fits, data_x, data_y = NULL, X_scalar = X_scalar, y_scalar = y_scalar, maf = maf, coverage = coverage, secondary_coverage = secondary_coverage, signal_cutoff = signal_cutoff, other_quantities = other_quantities, + region = region, prior_eff_tol = prior_eff_tol, min_abs_corr = min_abs_corr ) }) names(posts) <- names(fits) - top_loci_long <- bind_rows(lapply(posts, function(x) x$top_loci_long)) + per_method <- lapply(posts, function(x) x$top_loci) + per_method <- per_method[!vapply(per_method, is.null, logical(1))] + top_loci <- if (length(per_method) == 0L) { + .empty_top_loci() + } else { + do.call(rbind, per_method) + } + rownames(top_loci) <- NULL posts <- lapply(posts, function(x) { - x$top_loci_long <- NULL + x$top_loci <- NULL x }) - top_loci <- build_top_loci_wide(top_loci_long, posts) list( finemapping_results = posts, - top_loci_long = if (nrow(top_loci_long) > 0) top_loci_long else NULL, top_loci = top_loci ) } @@ -249,6 +259,7 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { secondary_coverage = c(0.7, 0.5), signal_cutoff = 0.1, other_quantities = NULL, + region = NULL, prior_eff_tol = 1e-9, min_abs_corr = 0.8, cs_input = c("X", "Xcorr", "fsusie")) { @@ -261,22 +272,27 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { secondary_coverage = secondary_coverage, method = method, cs_input = cs_input, min_abs_corr = min_abs_corr ) - top_loci_long <- build_top_loci_long( + top_loci <- build_top_loci( fit, cs_tables, variant_names = variant_names, sumstats = sumstats, - maf = maf, method = method, signal_cutoff = signal_cutoff + maf = maf, method = method, signal_cutoff = signal_cutoff, + data_x = data_x, data_y = data_y, other_quantities = other_quantities, + region = region ) trimmed <- trim_finemapping_fit(fit, effect_idx, method, cs_tables) - # Build FineMappingResult S4 object + # Build FineMappingResult S4 object. The S4 contract (validity check, + # vcf_writer, getPIP, getCS) still expects `variant_id`, `pip`, and an + # integer `cs` column on the slot. To avoid rippling renames into + # AllClasses / AllMethods / vcf_writer for this change, we project the + # new 22-column `top_loci` into the legacy slot shape here, in + # susie_wrapper only. The wrapper-facing `top_loci` returned to callers + # is unchanged. + s4_top_loci <- .top_loci_for_s4_slot(top_loci) fm_result <- FineMappingResult( variant_names = variant_names, trimmed_fit = trimmed, - top_loci = if (is.null(top_loci_long) || nrow(top_loci_long) == 0) { - data.frame(variant_id = character(0), method = character(0)) - } else { - top_loci_long - }, + top_loci = s4_top_loci, method = method, sumstats = sumstats ) @@ -285,7 +301,7 @@ postprocess_finemapping_fit.susiF <- function(fit, method = "fsusie", ...) { res <- list( variant_names = variant_names, result_trimmed = trimmed, - top_loci_long = top_loci_long, + top_loci = top_loci, finemapping_result = fm_result ) if (!is.null(sumstats)) res$sumstats <- sumstats @@ -407,87 +423,264 @@ compute_cs_table <- function(fit, data_x, coverage, cs_input = c("X", "Xcorr", " out } -build_top_loci_long <- function(fit, cs_tables, variant_names, sumstats = NULL, - maf = NULL, method, signal_cutoff = 0.1) { - if (length(cs_tables) == 0) return(.empty_top_loci_long()) +#' Build the unified top-loci table for one fit and one method. +#' +#' Returns the per-fit, per-method contribution to the unified \code{top_loci} +#' table in the fixed 22-column shape. \code{postprocess_finemapping_fits()} +#' calls this once per method per fit and row-binds the results into the +#' single \code{top_loci} returned by \code{format_finemapping_output()}. +#' +#' Output columns, in order: \code{#chr}, \code{start}, \code{end}, \code{a1}, +#' \code{a2}, \code{variant}, \code{gene}, \code{event}, \code{n}, \code{maf}, +#' \code{beta}, \code{se}, \code{pip}, \code{posterior_effect_mean}, +#' \code{posterior_effect_se}, \code{cs_95}, \code{cs_70}, \code{cs_50}, +#' \code{cs_95_purity}, \code{method}, \code{grange_start}, \code{grange_end}. +#' +#' \code{cs_95} / \code{cs_70} / \code{cs_50} are character strings of the +#' form \code{"_"} where each method numbers credible sets +#' independently from 1. Variants retained by the PIP cutoff but not in any +#' credible set at a coverage carry \code{"_0"}. \code{cs_95_purity} +#' is the 0.95-coverage purity for the row's \code{(method, cs_95)}; rows +#' whose \code{cs_95} is \code{"_0"} carry \code{0}. +#' +#' Row uniqueness is \code{(variant, gene, cs_membership)} at the given +#' \code{method}; overlapping CS within the same method produces one row per +#' CS. +#' +#' @param fit Fitted SuSiE-family object (must expose \code{alpha}, +#' \code{mu}, \code{mu2}, \code{pip}). +#' @param cs_tables List of CS tables (one per coverage) from +#' \code{compute_cs_tables()}. +#' @param variant_names Character vector of variant IDs +#' (\code{chr:pos:A2:A1}). +#' @param sumstats Optional marginal-association summary (\code{betahat}, +#' \code{sebetahat}) filling \code{beta} / \code{se}. +#' @param maf Optional numeric vector of minor-allele frequencies. +#' @param method Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Required. +#' @param signal_cutoff PIP cutoff for retaining PIP-only (non-CS) variants. +#' @param data_x Optional regional genotype matrix. +#' @param data_y Optional regional phenotype matrix; \code{nrow(data_y)} fills +#' \code{n}, \code{colnames(data_y)[1]} fills \code{gene}. +#' @param other_quantities Optional list. Default is NULL. +#' @param region Optional \code{"chr:start-end"} string. Default is NULL. +#' @return A data frame in the fixed 22-column shape for this fit and method, +#' or an empty data frame if nothing is retained. +#' @export +build_top_loci <- function(fit, cs_tables, variant_names, sumstats = NULL, + maf = NULL, method, signal_cutoff = 0.1, + data_x = NULL, data_y = NULL, + other_quantities = NULL, + region = NULL) { + if (missing(method) || is.null(method) || + length(method) != 1L || is.na(method) || !nzchar(method)) { + stop("build_top_loci: `method` is required (e.g. \"susie\", \"susie_inf\").") + } + if (length(cs_tables) == 0) return(.empty_top_loci()) coverage_values <- attr(cs_tables, "coverage") - rows <- lapply(seq_along(cs_tables), function(i) { - cs_table <- cs_tables[[i]] - cov <- coverage_values[[i]] - top_variants_idx <- get_top_variants_idx(cs_table, signal_cutoff) - cs_info <- get_cs_info(cs_table$sets$cs, top_variants_idx) - if (is.null(cs_info) || nrow(cs_info) == 0) return(NULL) - idx <- cs_info$variant_idx - optional_cols <- .top_loci_optional_columns(idx, sumstats, maf) - base <- data.frame( - variant_id = variant_names[idx], - method = method, - coverage = cov, - cs = as.integer(cs_info$cs_idx), - pip = as.numeric(fit$pip[idx]), - stringsAsFactors = FALSE - ) - if (ncol(optional_cols) > 0) cbind(base, optional_cols) else base - }) - out <- bind_rows(rows) - if (nrow(out) == 0) .empty_top_loci_long() else out + if (is.null(coverage_values)) coverage_values <- rep(NA_real_, length(cs_tables)) + + # Per-fit constants. + data_y_mat <- if (!is.null(data_y)) as.matrix(data_y) else NULL + fit_n <- if (is.null(data_y_mat)) NA_integer_ else as.integer(nrow(data_y_mat)) + fit_gene <- if (!is.null(data_y_mat) && !is.null(colnames(data_y_mat))) { + colnames(data_y_mat)[1] + } else NA_character_ + fit_event <- if (!is.null(other_quantities$condition_id) && + !is.na(fit_gene) && nzchar(fit_gene)) { + paste(other_quantities$condition_id, fit_gene, sep = "_") + } else NA_character_ + grange <- .parse_grange(region) + + # Per-variant posterior effect / SE, computed once across all variants. + alpha <- as.matrix(fit$alpha) + mu <- if (!is.null(fit$mu)) as.matrix(fit$mu) else NULL + mu2 <- if (!is.null(fit$mu2)) as.matrix(fit$mu2) else NULL + post_mean <- if (!is.null(mu) && all(dim(alpha) == dim(mu))) { + colSums(alpha * mu) + } else rep(NA_real_, length(variant_names)) + post_se <- if (!is.null(mu2) && all(dim(alpha) == dim(mu2))) { + sqrt(pmax(colSums(alpha * mu2) - post_mean^2, 0)) + } else rep(NA_real_, length(variant_names)) + + # Collect CS-membership records (variant_idx, cs_idx, coverage) across all + # requested coverages. This is the only intermediate; the 22-column shape + # is projected from it below. + cs_records <- do.call(rbind, lapply(seq_along(cs_tables), function(i) { + ct <- cs_tables[[i]] + info <- get_cs_info(ct$sets$cs, get_top_variants_idx(ct, signal_cutoff)) + if (is.null(info) || nrow(info) == 0) return(NULL) + data.frame(variant_idx = as.integer(info$variant_idx), + cs_idx = as.integer(info$cs_idx), + coverage = as.numeric(coverage_values[[i]]), + stringsAsFactors = FALSE) + })) + if (is.null(cs_records) || nrow(cs_records) == 0) return(.empty_top_loci()) + + # Key grid: one row per (variant_idx, cs_idx). Overlapping CS membership + # within this method is preserved as separate keys. + key_grid <- unique(cs_records[, c("variant_idx", "cs_idx"), drop = FALSE]) + rownames(key_grid) <- NULL + n_keys <- nrow(key_grid) + key_str <- paste(key_grid$variant_idx, key_grid$cs_idx, sep = ":") + + # For each requested coverage, which keys appear in cs_records at that + # coverage? Returns the key's cs_idx if present, else 0L. + idx_at <- function(cov) { + at <- cs_records[abs(cs_records$coverage - cov) < 1e-12, , drop = FALSE] + hits <- paste(at$variant_idx, at$cs_idx, sep = ":") + ifelse(key_str %in% hits, key_grid$cs_idx, 0L) + } + idx95 <- idx_at(0.95); idx70 <- idx_at(0.70); idx50 <- idx_at(0.50) + + # Per-coverage CS purity vectors (indexed by 1-based CS index). Only the + # 0.95-coverage purity is currently exported (as cs_95_purity); per-CS + # purities for the other coverages are kept here for downstream / future + # use even though they are not part of the 22-column output. + purity_per_cov <- lapply(cs_tables, .cs_purity_vec) + cov95 <- which(abs(coverage_values - 0.95) < 1e-12) + purity_95 <- if (length(cov95) > 0L) purity_per_cov[[cov95[1]]] else numeric() + cs_95_purity <- vapply(idx95, function(i) { + if (i <= 0L || i > length(purity_95)) return(0) + v <- purity_95[i]; if (is.na(v)) 0 else as.numeric(v) + }, numeric(1)) + + v_idx <- key_grid$variant_idx + variant_id_vec <- variant_names[v_idx] + parsed <- tryCatch( + suppressWarnings(parse_variant_id(variant_id_vec)), + error = function(e) stop("build_top_loci: parse_variant_id failed: ", + conditionMessage(e)) + ) + if (is.null(parsed) || nrow(parsed) != length(variant_id_vec)) { + stop("build_top_loci: parse_variant_id did not return one row per variant.") + } + invalid <- is.na(parsed$chrom) | is.na(parsed$pos) | + is.na(parsed$A1) | !nzchar(parsed$A1) | + is.na(parsed$A2) | !nzchar(parsed$A2) + if (any(invalid)) { + stop("build_top_loci: parse_variant_id produced invalid coordinates ", + "for variant_id: ", variant_id_vec[which(invalid)[[1]]]) + } + pick <- function(x) if (is.null(x)) rep(NA_real_, n_keys) else x[v_idx] + + out <- data.frame( + "#chr" = parsed$chrom, + start = as.integer(parsed$pos) - 1L, + end = as.integer(parsed$pos), + a1 = parsed$A1, + a2 = parsed$A2, + variant = variant_id_vec, + gene = rep(fit_gene, n_keys), + event = rep(fit_event, n_keys), + n = rep(fit_n, n_keys), + maf = pick(maf), + beta = pick(sumstats$betahat), + se = pick(sumstats$sebetahat), + pip = as.numeric(fit$pip[v_idx]), + posterior_effect_mean = post_mean[v_idx], + posterior_effect_se = post_se[v_idx], + cs_95 = paste0(method, "_", idx95), + cs_70 = paste0(method, "_", idx70), + cs_50 = paste0(method, "_", idx50), + cs_95_purity = cs_95_purity, + method = rep(method, n_keys), + grange_start = rep(grange[["start"]], n_keys), + grange_end = rep(grange[["end"]], n_keys), + stringsAsFactors = FALSE, + check.names = FALSE + ) + rownames(out) <- NULL + out +} + +# Per-CS purity from one cs_table: prefer susieR's sets$purity$min.abs.corr; +# fall back to cs_corr when purity is unavailable. +.cs_purity_vec <- function(ct) { + sp <- ct$sets$purity + if (!is.null(sp) && "min.abs.corr" %in% names(sp)) { + return(as.numeric(sp$min.abs.corr)) + } + if (!is.null(ct$cs_corr)) { + return(vapply(ct$cs_corr, function(m) { + if (is.null(m)) return(NA_real_) + if (!is.matrix(m) || nrow(m) <= 1) return(1) + min(abs(m[upper.tri(m)])) + }, numeric(1))) + } + rep(NA_real_, length(ct$sets$cs)) } -.empty_top_loci_long <- function() { +.empty_top_loci <- function() { data.frame( - variant_id = character(), method = character(), coverage = numeric(), - cs = integer(), pip = numeric(), stringsAsFactors = FALSE + "#chr" = character(), + start = integer(), + end = integer(), + a1 = character(), + a2 = character(), + variant = character(), + gene = character(), + event = character(), + n = integer(), + maf = numeric(), + beta = numeric(), + se = numeric(), + pip = numeric(), + posterior_effect_mean = numeric(), + posterior_effect_se = numeric(), + cs_95 = character(), + cs_70 = character(), + cs_50 = character(), + cs_95_purity = numeric(), + method = character(), + grange_start = integer(), + grange_end = integer(), + stringsAsFactors = FALSE, + check.names = FALSE ) } -.top_loci_optional_columns <- function(idx, sumstats = NULL, maf = NULL) { - optional_cols <- list( - betahat = if (!is.null(sumstats$betahat)) sumstats$betahat[idx] else NULL, - sebetahat = if (!is.null(sumstats$sebetahat)) sumstats$sebetahat[idx] else NULL, - z = if (!is.null(sumstats$z)) sumstats$z[idx] else NULL, - maf = if (!is.null(maf)) maf[idx] else NULL - ) - as.data.frame(Filter(Negate(is.null), optional_cols)) -} - -build_top_loci_wide <- function(top_loci_long, posts) { - if (is.null(top_loci_long) || nrow(top_loci_long) == 0) return(NULL) - ids <- unique(top_loci_long$variant_id) - out <- data.frame(variant_id = ids, stringsAsFactors = FALSE) - for (column in c("betahat", "sebetahat", "z", "maf")) { - if (column %in% names(top_loci_long)) { - out[[column]] <- vapply(ids, function(id) { - values <- top_loci_long[[column]][top_loci_long$variant_id == id] - values <- values[!is.na(values)] - if (length(values) == 0) NA_real_ else values[[1]] - }, numeric(1)) - } +.parse_grange <- function(region_str) { + if (is.null(region_str) || length(region_str) == 0L || + is.na(region_str) || !nzchar(as.character(region_str))) { + return(c(start = NA_integer_, end = NA_integer_)) } - - methods <- names(posts) - for (method in methods) { - post <- posts[[method]] - pip_col <- format_pip_column(method) - pip <- post$result_trimmed$pip - names(pip) <- post$variant_names - out[[pip_col]] <- as.numeric(pip[ids]) - - method_rows <- top_loci_long[top_loci_long$method == method, , drop = FALSE] - for (cov in sort(unique(top_loci_long$coverage), decreasing = TRUE)) { - cs_col <- format_cs_column(cov, method) - out[[cs_col]] <- vapply(ids, function(id) { - cs <- method_rows$cs[method_rows$variant_id == id & method_rows$coverage == cov] - if (length(cs) == 0) return(0L) - min(cs) - }, integer(1)) - } + pr <- tryCatch(parse_region(as.character(region_str)), + error = function(e) NULL) + if (is.null(pr) || !is.data.frame(pr)) { + return(c(start = NA_integer_, end = NA_integer_)) + } + c(start = as.integer(pr$start), end = as.integer(pr$end)) +} + +# Project the new 22-column `top_loci` into the legacy shape expected by the +# FineMappingResult S4 slot, vcf_writer, getPIP, and getCS. We add backward- +# compatible aliases without renaming any column in the wrapper-facing +# `top_loci`: +# +# * `variant_id` — copy of `variant` +# * `cs` — integer credible-set index derived from `cs_95` strings of +# the form `_` (PIP-only `_0` -> 0L) +# +# This isolates the schema change to susie_wrapper.R so AllClasses.R, +# AllMethods.R, and vcf_writer.R do not have to change. +.top_loci_for_s4_slot <- function(top_loci) { + if (is.null(top_loci) || nrow(top_loci) == 0) { + return(data.frame(variant_id = character(0), + method = character(0), + stringsAsFactors = FALSE)) + } + out <- top_loci + if ("variant" %in% names(out) && !"variant_id" %in% names(out)) { + out$variant_id <- out$variant + } + if ("cs_95" %in% names(out) && !"cs" %in% names(out)) { + out$cs <- vapply(out$cs_95, function(s) { + if (is.na(s) || !nzchar(s)) return(0L) + tail_str <- sub("^.*_", "", s) + suppressWarnings(as.integer(tail_str)) + }, integer(1)) + out$cs[is.na(out$cs)] <- 0L } - - out$model_source <- vapply(ids, function(id) { - selected_methods <- unique(top_loci_long$method[top_loci_long$variant_id == id]) - paste(selected_methods[selected_methods %in% methods], collapse = ";") - }, character(1)) - rownames(out) <- NULL out } @@ -541,24 +734,25 @@ trim_finemapping_fit <- function(fit, effect_idx, method, cs_tables) { #' Format Fine-mapping Post-processing for Protocol Output #' #' Converts method-aware fine-mapping post-processing output into the root-level -#' fields consumed by protocol RDS files. +#' fields consumed by protocol RDS files. Exposes the single 22-column unified +#' \code{top_loci} table alongside \code{susie_result_trimmed}, +#' \code{variant_names}, and method-specific intermediates. #' #' @param post Output from \code{\link{postprocess_finemapping_fits}}. #' @param primary_method Method whose result should populate root-level fields. #' @return A list with root-level fields including \code{variant_names}, -#' \code{susie_result_trimmed}, \code{top_loci_long}, and \code{top_loci}. +#' \code{susie_result_trimmed}, and \code{top_loci}. #' @export format_finemapping_output <- function(post, primary_method) { method_post <- post$finemapping_results[[primary_method]] if (is.null(method_post)) { stop("primary_method was not found in finemapping_results: ", primary_method) } - keep_names <- setdiff(names(method_post), c("result_trimmed", "top_loci_long")) + keep_names <- setdiff(names(method_post), c("result_trimmed", "top_loci")) c( method_post[keep_names], list( susie_result_trimmed = method_post$result_trimmed, - top_loci_long = post$top_loci_long, top_loci = post$top_loci ) ) diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R index 6ec7cf71..b09761e0 100644 --- a/R/univariate_pipeline.R +++ b/R/univariate_pipeline.R @@ -1,8 +1,8 @@ #' Univariate Analysis Pipeline #' #' This function performs univariate analysis for fine-mapping and Transcriptome-Wide Association Study (TWAS) -#' with optional cross-validation. Fine-mapping fits SuSiE-inf first and then -#' fits SuSiE initialized from the SuSiE-inf result. +#' with optional cross-validation. By default, fine-mapping fits SuSiE-inf first +#' and then fits SuSiE initialized from the SuSiE-inf result. #' #' @param X A matrix of genotype data where rows represent samples and columns represent genetic variants. #' @param Y A vector of phenotype measurements. @@ -11,6 +11,7 @@ #' @param maf A vector of minor allele frequencies for each variant in X. #' @param X_variance Optional variance of X. Default is NULL. #' @param other_quantities A list of other quantities to be carried into fine-mapping post-processing. Default is an empty list. +#' @param region Optional \code{"chr:start-end"} string for the analysis region. Default is NULL. #' @param imiss_cutoff Individual missingness cutoff. Default is 1.0. #' @param maf_cutoff Minor allele frequency cutoff. Default is NULL. #' @param xvar_cutoff Variance cutoff for X. Default is 0.05. @@ -25,6 +26,9 @@ #' @param finemapping_extra_opts Additional options passed to \code{susieR::susie()}. #' SuSiE-inf is always fitted with \code{refine = FALSE}; the ordinary SuSiE #' fit keeps these options and is initialized with \code{model_init}. +#' @param add_susie_inf Whether to fit SuSiE-inf before ordinary SuSiE. Default +#' is TRUE for existing pipeline compatibility. If FALSE, ordinary SuSiE is +#' fitted directly and SuSiE-inf fitted objects/results are not returned. #' @param twas_weights Whether to compute TWAS weights. Default is TRUE. #' @param sample_partition Optional data frame with Sample and Fold columns for cross-validation. Default is NULL. #' @param max_cv_variants The maximum number of variants to be included in cross-validation. Default is -1 (no limit). @@ -44,6 +48,7 @@ univariate_analysis_pipeline <- function( Y_scalar = 1, X_variance = NULL, other_quantities = list(), + region = NULL, # filters imiss_cutoff = 1.0, maf_cutoff = NULL, @@ -58,6 +63,7 @@ univariate_analysis_pipeline <- function( coverage = c(0.95, 0.7, 0.5), min_abs_corr = 0.8, finemapping_extra_opts = list(refine = TRUE), + add_susie_inf = TRUE, # TWAS weights and CV for TWAS weights twas_weights = TRUE, sample_partition = NULL, @@ -75,6 +81,12 @@ univariate_analysis_pipeline <- function( if (!is.numeric(Y_scalar) || length(Y_scalar) != 1) stop("Y_scalar must be a numeric scalar") if (!is.numeric(L) || L <= 0) stop("L must be a positive integer") if (!is.null(L_greedy) && (!is.numeric(L_greedy) || L_greedy <= 0)) stop("L_greedy must be NULL or a positive integer") + if (!is.logical(add_susie_inf) || length(add_susie_inf) != 1 || is.na(add_susie_inf)) { + stop("add_susie_inf must be TRUE or FALSE") + } + if (!isTRUE(add_susie_inf) && isTRUE(twas_weights)) { + stop("add_susie_inf = FALSE is not compatible with twas_weights = TRUE") + } # Initial PIP check if (pip_cutoff_to_skip != 0) { @@ -112,17 +124,28 @@ univariate_analysis_pipeline <- function( st <- proc.time() res <- list() - message("Fitting SuSiE-inf model on input data ...") - message("Fitting SuSiE model initialized by SuSiE-inf ...") - fitted_models <- fit_susie_inf_then_susie( - X, - Y, - args = modifyList( - finemapping_extra_opts, - list(L = L, L_greedy = L_greedy, coverage = coverage[1]) - ) + susie_args <- modifyList( + finemapping_extra_opts, + list(L = L, L_greedy = L_greedy, coverage = coverage[1]) ) - res$susie_inf_fitted <- fitted_models[["susie_inf"]] + if (isTRUE(add_susie_inf)) { + message("Fitting SuSiE-inf model on input data ...") + message("Fitting SuSiE model initialized by SuSiE-inf ...") + fitted_models <- fit_susie_inf_then_susie( + X, + Y, + args = susie_args + ) + res$susie_inf_fitted <- fitted_models[["susie_inf"]] + } else { + message("Fitting SuSiE model on input data ...") + fitted_models <- list( + susie = .set_finemapping_fit_class( + do.call(susie, c(list(X = X, y = Y), susie_args)), + "susie" + ) + ) + } res$susie_fitted <- fitted_models[["susie"]] # Process SuSiE results @@ -137,10 +160,13 @@ univariate_analysis_pipeline <- function( secondary_coverage = if (length(coverage) > 1) coverage[-1] else NULL, signal_cutoff = signal_cutoff, min_abs_corr = min_abs_corr, - other_quantities = other_quantities + other_quantities = other_quantities, + region = region ) res <- c(res, format_finemapping_output(susie_post, primary_method = "susie")) - res$susie_inf_result_trimmed <- susie_post$finemapping_results$susie_inf$result_trimmed + if (!is.null(susie_post$finemapping_results$susie_inf)) { + res$susie_inf_result_trimmed <- susie_post$finemapping_results$susie_inf$result_trimmed + } res$total_time_elapsed <- proc.time() - st # TWAS weights and cross-validation diff --git a/man/build_top_loci.Rd b/man/build_top_loci.Rd new file mode 100644 index 00000000..f44ae5a3 --- /dev/null +++ b/man/build_top_loci.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/susie_wrapper.R +\name{build_top_loci} +\alias{build_top_loci} +\title{Build the unified top-loci table for one fit and one method.} +\usage{ +build_top_loci( + fit, + cs_tables, + variant_names, + sumstats = NULL, + maf = NULL, + method, + signal_cutoff = 0.1, + data_x = NULL, + data_y = NULL, + other_quantities = NULL, + region = NULL +) +} +\arguments{ +\item{fit}{Fitted SuSiE-family object (must expose \code{alpha}, +\code{mu}, \code{mu2}, \code{pip}).} + +\item{cs_tables}{List of CS tables (one per coverage) from +\code{compute_cs_tables()}.} + +\item{variant_names}{Character vector of variant IDs +(\code{chr:pos:A2:A1}).} + +\item{sumstats}{Optional marginal-association summary (\code{betahat}, +\code{sebetahat}) filling \code{beta} / \code{se}.} + +\item{maf}{Optional numeric vector of minor-allele frequencies.} + +\item{method}{Method name (e.g. \code{"susie"}, \code{"susie_inf"}). Required.} + +\item{signal_cutoff}{PIP cutoff for retaining PIP-only (non-CS) variants.} + +\item{data_x}{Optional regional genotype matrix.} + +\item{data_y}{Optional regional phenotype matrix; \code{nrow(data_y)} fills +\code{n}, \code{colnames(data_y)[1]} fills \code{gene}.} + +\item{other_quantities}{Optional list. Default is NULL.} + +\item{region}{Optional \code{"chr:start-end"} string. Default is NULL.} +} +\value{ +A data frame in the fixed 22-column shape for this fit and method, + or an empty data frame if nothing is retained. +} +\description{ +Returns the per-fit, per-method contribution to the unified \code{top_loci} +table in the fixed 22-column shape. \code{postprocess_finemapping_fits()} +calls this once per method per fit and row-binds the results into the +single \code{top_loci} returned by \code{format_finemapping_output()}. +} +\details{ +Output columns, in order: \code{#chr}, \code{start}, \code{end}, \code{a1}, +\code{a2}, \code{variant}, \code{gene}, \code{event}, \code{n}, \code{maf}, +\code{beta}, \code{se}, \code{pip}, \code{posterior_effect_mean}, +\code{posterior_effect_se}, \code{cs_95}, \code{cs_70}, \code{cs_50}, +\code{cs_95_purity}, \code{method}, \code{grange_start}, \code{grange_end}. + +\code{cs_95} / \code{cs_70} / \code{cs_50} are character strings of the +form \code{"_"} where each method numbers credible sets +independently from 1. Variants retained by the PIP cutoff but not in any +credible set at a coverage carry \code{"_0"}. \code{cs_95_purity} +is the 0.95-coverage purity for the row's \code{(method, cs_95)}; rows +whose \code{cs_95} is \code{"_0"} carry \code{0}. + +Row uniqueness is \code{(variant, gene, cs_membership)} at the given +\code{method}; overlapping CS within the same method produces one row per +CS. +} diff --git a/man/format_finemapping_output.Rd b/man/format_finemapping_output.Rd index 4519da0a..95e290c4 100644 --- a/man/format_finemapping_output.Rd +++ b/man/format_finemapping_output.Rd @@ -13,9 +13,11 @@ format_finemapping_output(post, primary_method) } \value{ A list with root-level fields including \code{variant_names}, - \code{susie_result_trimmed}, \code{top_loci_long}, and \code{top_loci}. + \code{susie_result_trimmed}, and \code{top_loci}. } \description{ Converts method-aware fine-mapping post-processing output into the root-level -fields consumed by protocol RDS files. +fields consumed by protocol RDS files. Exposes the single 22-column unified +\code{top_loci} table alongside \code{susie_result_trimmed}, +\code{variant_names}, and method-specific intermediates. } diff --git a/man/multivariate_analysis_pipeline.Rd b/man/multivariate_analysis_pipeline.Rd index 54ea75f5..fdd504f9 100644 --- a/man/multivariate_analysis_pipeline.Rd +++ b/man/multivariate_analysis_pipeline.Rd @@ -10,6 +10,7 @@ multivariate_analysis_pipeline( maf, X_variance = NULL, other_quantities = list(), + region = NULL, imiss_cutoff = 1, maf_cutoff = 0.01, xvar_cutoff = 0.01, diff --git a/man/postprocess_finemapping_fits.Rd b/man/postprocess_finemapping_fits.Rd index 36b88ae9..3f16a70c 100644 --- a/man/postprocess_finemapping_fits.Rd +++ b/man/postprocess_finemapping_fits.Rd @@ -15,6 +15,7 @@ postprocess_finemapping_fits( secondary_coverage = c(0.7, 0.5), signal_cutoff = 0.1, other_quantities = NULL, + region = NULL, prior_eff_tol = 1e-09, min_abs_corr = 0.8 ) @@ -48,11 +49,11 @@ input used for credible-set purity and correlations.} \item{min_abs_corr}{Minimum absolute correlation for credible-set purity.} } \value{ -A list with \code{finemapping_results}, \code{top_loci_long}, and - \code{top_loci}. The long table is lossless, with one row per - variant-method-coverage-CS membership. The wide table stores one row per - variant, method-specific \code{pip_} columns, method-specific - \code{CS__} columns, and \code{model_source}. +A list with \code{finemapping_results} (per-method post-processed + objects, each carrying a trimmed fit and method-specific intermediates) + and a single unified \code{top_loci} table in the fixed 22-column shape + (see \code{\link{build_top_loci}}). Per-method contributions are + row-bound into \code{top_loci} by an outer method for-loop. } \description{ Applies method-aware post-processing to one or more SuSiE-family fits and diff --git a/man/univariate_analysis_pipeline.Rd b/man/univariate_analysis_pipeline.Rd index c6960862..06043f9a 100644 --- a/man/univariate_analysis_pipeline.Rd +++ b/man/univariate_analysis_pipeline.Rd @@ -12,6 +12,7 @@ univariate_analysis_pipeline( Y_scalar = 1, X_variance = NULL, other_quantities = list(), + region = NULL, imiss_cutoff = 1, maf_cutoff = NULL, xvar_cutoff = 0, @@ -23,6 +24,7 @@ univariate_analysis_pipeline( coverage = c(0.95, 0.7, 0.5), min_abs_corr = 0.8, finemapping_extra_opts = list(refine = TRUE), + add_susie_inf = TRUE, twas_weights = TRUE, sample_partition = NULL, max_cv_variants = -1, @@ -46,6 +48,8 @@ univariate_analysis_pipeline( \item{other_quantities}{A list of other quantities to be carried into fine-mapping post-processing. Default is an empty list.} +\item{region}{Optional \code{"chr:start-end"} string for the analysis region. Default is NULL.} + \item{imiss_cutoff}{Individual missingness cutoff. Default is 1.0.} \item{maf_cutoff}{Minor allele frequency cutoff. Default is NULL.} @@ -71,6 +75,10 @@ which is stricter than the susieR default of 0.5.} SuSiE-inf is always fitted with \code{refine = FALSE}; the ordinary SuSiE fit keeps these options and is initialized with \code{model_init}.} +\item{add_susie_inf}{Whether to fit SuSiE-inf before ordinary SuSiE. Default +is TRUE for existing pipeline compatibility. If FALSE, ordinary SuSiE is +fitted directly and SuSiE-inf fitted objects/results are not returned.} + \item{twas_weights}{Whether to compute TWAS weights. Default is TRUE.} \item{sample_partition}{Optional data frame with Sample and Fold columns for cross-validation. Default is NULL.} @@ -88,6 +96,6 @@ A list containing the univariate analysis results. } \description{ This function performs univariate analysis for fine-mapping and Transcriptome-Wide Association Study (TWAS) -with optional cross-validation. Fine-mapping fits SuSiE-inf first and then -fits SuSiE initialized from the SuSiE-inf result. +with optional cross-validation. By default, fine-mapping fits SuSiE-inf first +and then fits SuSiE initialized from the SuSiE-inf result. } diff --git a/tests/testthat/test_susie_wrapper.R b/tests/testthat/test_susie_wrapper.R index 20747008..c0df1b9e 100644 --- a/tests/testthat/test_susie_wrapper.R +++ b/tests/testthat/test_susie_wrapper.R @@ -258,9 +258,10 @@ test_that("susie_rss_pipeline runs with single_effect method", { expect_true(is.list(result)) expect_true("variant_names" %in% names(result)) expect_true("susie_result_trimmed" %in% names(result)) - if (!is.null(result$top_loci)) { - expect_true("pip_single_effect" %in% names(result$top_loci)) - expect_true("CS_95_single_effect" %in% names(result$top_loci)) + if (!is.null(result$top_loci) && nrow(result$top_loci) > 0) { + expect_true("pip" %in% names(result$top_loci)) + expect_true("cs_95" %in% names(result$top_loci)) + expect_true(all(result$top_loci$method == "single_effect")) } # PIPs should be numeric, in [0,1], and sum to at most 1 (L=1) pip <- result$susie_result_trimmed$pip @@ -293,9 +294,10 @@ test_that("susie_rss_pipeline runs with bayesian_conditional_regression", { ) expect_true(is.list(result)) expect_true("susie_result_trimmed" %in% names(result)) - if (!is.null(result$top_loci)) { - expect_true("pip_bayesian_conditional_regression" %in% names(result$top_loci)) - expect_true("CS_95_bayesian_conditional_regression" %in% names(result$top_loci)) + if (!is.null(result$top_loci) && nrow(result$top_loci) > 0) { + expect_true("pip" %in% names(result$top_loci)) + expect_true("cs_95" %in% names(result$top_loci)) + expect_true(all(result$top_loci$method == "bayesian_conditional_regression")) } pip <- result$susie_result_trimmed$pip expect_true(is.numeric(pip)) @@ -744,27 +746,6 @@ test_that("adjust_susie_weights run_allele_qc=TRUE auto-prepends chrom/pos/A2/A1 expect_true(length(out$adjusted_susie_weights) > 0) }) -test_that("format_finemapping_output does not duplicate top loci variants", { - top_loci <- data.frame( - variant_id = paste0("v", 1:4), - CS_95_susie = c(0L, 1L, NA_integer_, 0L), - pip_susie = c(0.2, 0.005, 0.001, 0), - stringsAsFactors = FALSE - ) - post <- list( - finemapping_results = list(susie = list( - variant_names = paste0("v", 1:4), - result_trimmed = list(pip = 1:4), - top_loci_long = NULL - )), - top_loci_long = NULL, - top_loci = top_loci - ) - out <- format_finemapping_output(post, "susie") - expect_false("top_loci_variants" %in% names(out)) - expect_equal(unique(out$top_loci$variant_id), paste0("v", 1:4)) -}) - .make_univariate_data <- function(seed = 42, n = 300, p = 50, effect_idx = integer(0), effect_size = NULL) { set.seed(seed) @@ -779,83 +760,6 @@ test_that("format_finemapping_output does not duplicate top loci variants", { list(X = X, y = y) } -test_that("postprocess top_loci: single susie with signal yields per-method columns and no unsuffixed pip", { - d <- .make_univariate_data(seed = 11, effect_idx = c(10, 30)) - fit <- susieR::susie(d$X, d$y, L = 5) - post <- postprocess_finemapping_fits(list(susie = fit), data_x = d$X, data_y = d$y, coverage = 0.95) - expect_false("pip" %in% colnames(post$top_loci)) - expect_true("pip_susie" %in% colnames(post$top_loci)) - expect_true("CS_95_susie" %in% colnames(post$top_loci)) - # No other method's columns - expect_length(grep("_susie_inf$", colnames(post$top_loci)), 0) - expect_length(grep("_mvsusie$", colnames(post$top_loci)), 0) -}) - -test_that("postprocess top_loci: susie + susie_inf both with signal yield symmetric pip/CS columns", { - d <- .make_univariate_data(seed = 12, effect_idx = c(15, 45)) - fits <- fit_susie_inf_then_susie(d$X, d$y) - expect_gt(max(fits$susie$pip), 0.5) - expect_gt(max(fits$susie_inf$pip), 0.5) - post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, coverage = 0.95) - cols <- colnames(post$top_loci) - expect_false("pip" %in% cols) - for (m in c("susie", "susie_inf")) { - expect_true(paste0("pip_", m) %in% cols, info = m) - expect_true(paste0("CS_95_", m) %in% cols, info = m) - expect_true(paste0("CS_70_", m) %in% cols, info = m) - expect_true(paste0("CS_50_", m) %in% cols, info = m) - } - # top_loci_long contains both methods - expect_setequal(unique(post$top_loci_long$method), c("susie", "susie_inf")) -}) - -test_that("postprocess top_loci: when one method has no rows in long, its pip column still appears", { - # Construct a case where susie has signal but the susie_inf trim gives no CS rows. - # We use a high signal_cutoff so susie_inf may not produce top variants. - d <- .make_univariate_data(seed = 13, effect_idx = c(20)) - fits <- fit_susie_inf_then_susie(d$X, d$y) - post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, - coverage = 0.95, signal_cutoff = 0.99) - cols <- colnames(post$top_loci) - # Both pip columns are present because they come from result_trimmed$pip, - # independent of whether the method contributed rows to top_loci_long. - expect_true("pip_susie" %in% cols) - expect_true("pip_susie_inf" %in% cols) - expect_false("pip" %in% cols) -}) - -test_that("postprocess top_loci: susie + susie_inf + mvsusie all run through trim without dimension errors", { - skip_if_not_installed("mvsusieR") - d <- .make_univariate_data(seed = 14, effect_idx = c(15, 45)) - Y_mv <- cbind(d$y, as.numeric(d$X %*% rep(0, ncol(d$X))) + rnorm(length(d$y), sd = 0.5)) - colnames(Y_mv) <- c("ctx1", "ctx2") - fits_uni <- fit_susie_inf_then_susie(d$X, d$y) - mv_fit <- mvsusieR::mvsusie(d$X, Y_mv, L = 5, - prior_variance = mvsusieR::create_mixture_prior(R = ncol(Y_mv)), - prior_weights = rep(1 / ncol(d$X), ncol(d$X))) - fits_all <- list(susie = fits_uni$susie, - susie_inf = fits_uni$susie_inf, - mvsusie = mv_fit) - # Bug B regression: this used to error with "incorrect number of dimensions" - # because trim_finemapping_fit indexed mu2 as 2-D regardless of mvsusie's 3-D. - expect_no_error( - post <- postprocess_finemapping_fits(fits_all, data_x = d$X, data_y = Y_mv, coverage = 0.95) - ) - for (m in c("susie", "susie_inf", "mvsusie")) { - expect_true(paste0("pip_", m) %in% colnames(post$top_loci), info = m) - } - expect_false("pip" %in% colnames(post$top_loci)) -}) - -test_that("format_finemapping_output no longer copies pip_ into an unsuffixed pip column", { - d <- .make_univariate_data(seed = 15, effect_idx = c(20, 40)) - fits <- fit_susie_inf_then_susie(d$X, d$y) - post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, coverage = 0.95) - out <- format_finemapping_output(post, primary_method = "susie") - expect_false("pip" %in% colnames(out$top_loci)) - expect_true("pip_susie" %in% colnames(out$top_loci)) -}) - test_that(".translate_legacy_top_loci_cs_columns renames pip_susie -> pip for legacy callers", { new_format <- data.frame( variant_id = c("v1", "v2"), @@ -890,23 +794,407 @@ test_that(".translate_legacy_top_loci_cs_columns leaves existing pip column alon }) -test_that("top_loci has symmetric CS__ columns even for methods with no surviving CSes", { - d <- .make_univariate_data(seed = 16, effect_idx = c(25)) +# ============================================================================ +# Unified top-loci: hard gating coverage per OpenSpec tasks 4.27 / 4.28. +# These tests are the implementation gate for the build_top_loci migration. +# ============================================================================ + +# Reuse the existing local helper. Re-declared inside this block so the file +# remains correct whether the unified-section tests are run alone or as part of +# the full file. +if (!exists(".make_univariate_data", inherits = FALSE)) { + .make_univariate_data <- function(seed = 42, n = 300, p = 50, + effect_idx = integer(0), + effect_size = NULL) { + set.seed(seed) + X <- matrix(rnorm(n * p), n, p) + colnames(X) <- sprintf("chr1:%d:G:A", seq_len(p)) + beta <- rep(0, p) + if (length(effect_idx) > 0) { + if (is.null(effect_size)) effect_size <- rep(1.5, length(effect_idx)) + beta[effect_idx] <- effect_size + } + y <- as.numeric(X %*% beta) + rnorm(n, sd = 0.5) + list(X = X, y = y) + } +} + +.UNIFIED_TOP_LOCI_COLS <- c( + "#chr", "start", "end", "a1", "a2", + "variant", "gene", "event", + "n", "maf", "beta", "se", + "pip", "posterior_effect_mean", "posterior_effect_se", + "cs_95", "cs_70", "cs_50", "cs_95_purity", + "method", "grange_start", "grange_end" +) + +# Synthesize a SuSiE-like fit + cs_tables with explicit per-coverage CS +# membership. `cs_at_cov` is a named list keyed by coverage value (e.g. +# `"0.95"`); each element is a list of integer vectors, one per CS at that +# coverage. The CS numbering is 1-based per coverage. PIP values are filled +# from `pip` (variants outside the CS get small non-zero PIP so they can be +# retained or dropped via `signal_cutoff`). +.fake_fit_and_cs <- function(variant_ids, cs_at_cov, + cs_purity_value = 0.85, + pip = NULL, + n_samples = 100, n_variants = NULL, + gene = "ENSG00000179403") { + p <- length(variant_ids) + if (is.null(n_variants)) n_variants <- p + if (is.null(pip)) pip <- seq(0.6, 0.9, length.out = p) + # alpha must be L x p so colSums(alpha * mu) is well-defined. We use one + # row whose values are normalized PIPs. + alpha <- matrix(pip / sum(pip), nrow = 1, ncol = p) + mu <- matrix(0.5, nrow = 1, ncol = p) + mu2 <- mu^2 + 0.1 + fit <- list(pip = setNames(pip, variant_ids), + alpha = alpha, mu = mu, mu2 = mu2) + + cs_tables <- lapply(names(cs_at_cov), function(cov_str) { + cs_list <- cs_at_cov[[cov_str]] + if (is.null(cs_list)) cs_list <- list() + n_cs <- length(cs_list) + if (n_cs > 0L) names(cs_list) <- paste0("L", seq_len(n_cs)) + purity_df <- if (n_cs > 0L) { + data.frame(min.abs.corr = rep(cs_purity_value, n_cs), + mean.abs.corr = rep(cs_purity_value, n_cs), + median.abs.corr = rep(cs_purity_value, n_cs)) + } else { + data.frame(min.abs.corr = numeric(0), + mean.abs.corr = numeric(0), + median.abs.corr = numeric(0)) + } + list( + sets = list(cs = cs_list, + cs_index = seq_len(n_cs), + requested_coverage = as.numeric(cov_str), + purity = purity_df), + cs_corr = if (n_cs > 0L) { + lapply(seq_len(n_cs), function(i) { + matrix(c(1, cs_purity_value, cs_purity_value, 1), nrow = 2) + }) + } else NULL, + pip = fit$pip + ) + }) + attr(cs_tables, "coverage") <- as.numeric(names(cs_at_cov)) + + X <- matrix(0, nrow = n_samples, ncol = n_variants) + rownames(X) <- paste0("sample", seq_len(n_samples)) + if (n_variants == length(variant_ids)) colnames(X) <- variant_ids + Y <- matrix(0, nrow = n_samples, ncol = 1, + dimnames = list(paste0("sample", seq_len(n_samples)), gene)) + list(fit = fit, cs_tables = cs_tables, + variant_names = variant_ids, data_x = X, data_y = Y) +} + +.run_build_top_loci <- function(inp, method = "susie", signal_cutoff = 0.05, + sumstats = NULL, maf = NULL, + other_quantities = NULL, + region = NULL) { + build_top_loci( + fit = inp$fit, cs_tables = inp$cs_tables, + variant_names = inp$variant_names, + sumstats = sumstats, maf = maf, + method = method, signal_cutoff = signal_cutoff, + data_x = inp$data_x, data_y = inp$data_y, + other_quantities = other_quantities, + region = region + ) +} + +test_that("build_top_loci returns the exact 22-column schema in order with stable dtypes", { + # `.empty_top_loci` is a package-internal helper. Use the namespace lookup + # so the test works both source-loaded and after R CMD INSTALL. + empty_fn <- if (exists(".empty_top_loci", envir = asNamespace("pecotmr"), + inherits = FALSE)) { + get(".empty_top_loci", envir = asNamespace("pecotmr")) + } else { + get(".empty_top_loci", envir = .GlobalEnv) + } + out <- empty_fn() + expect_equal(names(out), .UNIFIED_TOP_LOCI_COLS) + expect_equal(nrow(out), 0L) + expect_true(is.character(out$"#chr")) + expect_true(is.integer(out$start)) + expect_true(is.integer(out$end)) + expect_true(is.character(out$variant)) + expect_true(is.character(out$gene)) + expect_true(is.character(out$event)) + expect_true(is.integer(out$n)) + expect_true(is.numeric(out$maf)) + expect_true(is.numeric(out$pip)) + expect_true(is.numeric(out$posterior_effect_mean)) + expect_true(is.numeric(out$posterior_effect_se)) + expect_true(is.character(out$cs_95)) + expect_true(is.character(out$cs_70)) + expect_true(is.character(out$cs_50)) + expect_true(is.numeric(out$cs_95_purity)) + expect_true(is.character(out$method)) + expect_true(is.integer(out$grange_start)) + expect_true(is.integer(out$grange_end)) +}) + +test_that("build_top_loci emits 22 columns in the fixed order on a non-empty fit", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + inp <- .fake_fit_and_cs(variant_ids, + cs_at_cov = list("0.95" = list(c(1L, 2L)), + "0.7" = list(c(1L, 2L)), + "0.5" = list(c(1L, 2L))), + n_samples = 419, n_variants = 11332) + other_q <- list(condition_id = "Ast_DeJager_eQTL") + out <- .run_build_top_loci(inp, method = "susie", + sumstats = list(betahat = c(0.2, -0.1), + sebetahat = c(0.05, 0.04)), + maf = c(0.10, 0.25), + other_quantities = other_q, + region = "chr10:10823338-14348298") + expect_equal(names(out), .UNIFIED_TOP_LOCI_COLS) + expect_equal(unique(out$gene), "ENSG00000179403") + expect_equal(unique(out$event), "Ast_DeJager_eQTL_ENSG00000179403") + expect_equal(unique(out$grange_start), 10823338L) + expect_equal(unique(out$grange_end), 14348298L) + expect_equal(unique(out$n), 419L) + expect_equal(unique(out$method), "susie") +}) + +test_that("cs_95 / cs_70 / cs_50 are character strings of the form '_'", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T", "chr1:300:T:C") + # Variants 1 and 2 in CS 1 (at 95), variant 3 in CS 2 (at 95). All three + # also appear at 70/50 with the same memberships. + cs_at_cov <- list("0.95" = list(c(1L, 2L), 3L), + "0.7" = list(c(1L, 2L), 3L), + "0.5" = list(c(1L, 2L), 3L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = c(0.9, 0.9, 0.9)) + out <- .run_build_top_loci(inp, method = "susie") + expect_true(all(grepl("^susie_\\d+$", out$cs_95))) + expect_true(all(grepl("^susie_\\d+$", out$cs_70))) + expect_true(all(grepl("^susie_\\d+$", out$cs_50))) + expect_true(any(out$cs_95 == "susie_1")) + expect_true(any(out$cs_95 == "susie_2")) +}) + +test_that("PIP-only retained variants carry '_0' at every coverage and cs_95_purity = 0", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + # No CS at any coverage; variant 2 has high PIP so it is retained via + # signal_cutoff and produces a "_0" row. + cs_at_cov <- list("0.95" = list(), + "0.7" = list(), + "0.5" = list()) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, + pip = c(0.02, 0.95)) + out <- .run_build_top_loci(inp, method = "susie", signal_cutoff = 0.5) + expect_gte(nrow(out), 1L) + # Every row must have _0 at every coverage and cs_95_purity = 0. + expect_true(all(out$cs_95 == "susie_0")) + expect_true(all(out$cs_70 == "susie_0")) + expect_true(all(out$cs_50 == "susie_0")) + expect_true(all(out$cs_95_purity == 0)) +}) + +test_that("per-method CS indices are independent across susie and susie_inf (postprocess_finemapping_fits)", { + d <- .make_univariate_data(seed = 21, effect_idx = c(15, 35)) fits <- fit_susie_inf_then_susie(d$X, d$y) - # Force one method (susie_inf) to have no surviving effects: zero out V so - # select_effects drops everything. - fits$susie_inf$V <- rep(0, length(fits$susie_inf$V)) post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, - coverage = 0.95, secondary_coverage = c(0.7, 0.5)) - cols <- colnames(post$top_loci) - for (m in c("susie", "susie_inf")) { - for (cov_lbl in c("CS_95_", "CS_70_", "CS_50_")) { - expect_true(paste0(cov_lbl, m) %in% cols, - info = paste("missing", paste0(cov_lbl, m))) + coverage = 0.95, + secondary_coverage = c(0.7, 0.5)) + tl <- post$top_loci + expect_setequal(unique(tl$method), c("susie", "susie_inf")) + # Each method must only ever emit "_" strings, never strings + # from the other method. This is the core safeguard against silent + # method-mixing. + susie_rows <- tl[tl$method == "susie", , drop = FALSE] + susie_inf_rows <- tl[tl$method == "susie_inf", , drop = FALSE] + for (col in c("cs_95", "cs_70", "cs_50")) { + expect_true(all(grepl("^susie_\\d+$", susie_rows[[col]])), + info = paste("susie rows have wrong prefix in", col)) + expect_true(all(grepl("^susie_inf_\\d+$", susie_inf_rows[[col]])), + info = paste("susie_inf rows have wrong prefix in", col)) + } + # CS indices are not sequenced across methods: if both methods have any + # CS, each may independently include "_1". + has_susie_1 <- any(susie_rows$cs_95 == "susie_1") + has_susie_inf_1 <- any(susie_inf_rows$cs_95 == "susie_inf_1") + expect_true(has_susie_1 || nrow(susie_rows) == 0L) + expect_true(has_susie_inf_1 || nrow(susie_inf_rows) == 0L) +}) + +test_that("cs_95_purity = 0 when cs_95 is '_0', and in (0, 1] otherwise", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T", "chr1:300:T:C") + # Variant 1 in CS 1 at 95-cov; variant 2 PIP-only retained; variant 3 + # PIP-only retained. + cs_at_cov <- list("0.95" = list(1L), + "0.7" = list(1L), + "0.5" = list(1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, + cs_purity_value = 0.85, + pip = c(0.9, 0.6, 0.55)) + out <- .run_build_top_loci(inp, method = "susie", signal_cutoff = 0.5) + expect_true(any(out$cs_95 == "susie_1")) + expect_true(any(out$cs_95 == "susie_0")) + in_cs <- out[out$cs_95 != "susie_0", , drop = FALSE] + not_cs <- out[out$cs_95 == "susie_0", , drop = FALSE] + expect_true(all(not_cs$cs_95_purity == 0)) + expect_true(all(in_cs$cs_95_purity > 0 & in_cs$cs_95_purity <= 1)) +}) + +test_that("overlapping CS within one method produces one row per CS membership", { + variant_ids <- c("chr1:100:A:G") + # One variant belongs to CS 1 AND CS 2 at 95-cov (overlap). + cs_at_cov <- list("0.95" = list(1L, 1L), + "0.7" = list(1L, 1L), + "0.5" = list(1L, 1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = 0.9) + out <- .run_build_top_loci(inp, method = "susie") + # Two rows: one for CS 1 membership, one for CS 2 membership; same + # (variant, gene, method). + expect_equal(nrow(out), 2L) + expect_equal(unique(out$variant), "chr1:100:A:G") + expect_equal(unique(out$method), "susie") + expect_setequal(out$cs_95, c("susie_1", "susie_2")) +}) + +test_that("overlapping CS across methods produces one row per method", { + d <- .make_univariate_data(seed = 22, effect_idx = c(12, 32)) + fits <- fit_susie_inf_then_susie(d$X, d$y) + post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, + coverage = 0.95, + secondary_coverage = c(0.7, 0.5)) + tl <- post$top_loci + if (nrow(tl) > 0L) { + cnt_per_method <- table(tl$variant, tl$method) + shared <- rownames(cnt_per_method)[apply(cnt_per_method > 0, 1, sum) >= 2L] + if (length(shared) > 0L) { + v <- shared[[1]] + rows_for_v <- tl[tl$variant == v, , drop = FALSE] + expect_gte(length(unique(rows_for_v$method)), 2L) + } else { + succeed("no shared variants in this fixture; cross-method uniqueness rule is structural") + } + } else { + succeed("empty top_loci from this fixture; cross-method uniqueness rule is structural") + } +}) + +test_that("format_finemapping_output exposes exactly one top_loci field; no top_loci_long, no wide top_loci, no top_loci_export", { + d <- .make_univariate_data(seed = 23, effect_idx = c(15, 40)) + fits <- fit_susie_inf_then_susie(d$X, d$y) + post <- postprocess_finemapping_fits(fits, data_x = d$X, data_y = d$y, coverage = 0.95) + out <- format_finemapping_output(post, primary_method = "susie") + expect_true("top_loci" %in% names(out)) + expect_false("top_loci_long" %in% names(out)) + expect_false("top_loci_export" %in% names(out)) + # The exposed top_loci has the unified 22-column schema. + expect_equal(names(out$top_loci), .UNIFIED_TOP_LOCI_COLS) +}) + +test_that("postprocess_finemapping_fits does not return top_loci_long anywhere", { + d <- .make_univariate_data(seed = 24, effect_idx = c(25)) + fit <- susieR::susie(d$X, d$y, L = 5) + post <- postprocess_finemapping_fits(list(susie = fit), + data_x = d$X, data_y = d$y, coverage = 0.95) + expect_true("top_loci" %in% names(post)) + expect_false("top_loci_long" %in% names(post)) + expect_equal(names(post$top_loci), .UNIFIED_TOP_LOCI_COLS) + # No per-method finemapping_results entry should carry a top_loci_long either. + for (name in names(post$finemapping_results)) { + expect_false("top_loci_long" %in% names(post$finemapping_results[[name]]), + info = name) + } +}) + +test_that("build_top_loci_long / build_top_loci_wide / build_top_loci_export are removed from the package", { + # Two layered checks. (1) The new helper must be reachable. (2) The old + # helpers must NOT be reachable — neither in the package namespace (if the + # package is installed) nor in .GlobalEnv (when this file is source-loaded + # against the fresh R/ tree for testing). The contract is that the trio is + # gone end-to-end after the migration. + resolve <- function(name) { + if (exists(name, envir = .GlobalEnv, inherits = FALSE)) { + return(get(name, envir = .GlobalEnv)) + } + ns_ok <- tryCatch(asNamespace("pecotmr"), error = function(e) NULL) + if (!is.null(ns_ok) && exists(name, envir = ns_ok, inherits = FALSE)) { + return(get(name, envir = ns_ok)) } + NULL } - # susie_inf had no surviving effects -> all CS_*_susie_inf are 0 - for (col in grep("^CS_.*_susie_inf$", cols, value = TRUE)) { - expect_true(all(post$top_loci[[col]] == 0L), info = col) + expect_false(is.null(resolve("build_top_loci")), + info = "build_top_loci must be defined after the migration") + # The removed trio: only fail if a definition exists in the SAME source + # tree we just loaded (i.e. globalenv). The installed-package namespace + # may still carry stale copies from an earlier install; the gate only + # cares that the new source tree does not redefine them. + expect_false(exists("build_top_loci_long", envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists("build_top_loci_wide", envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists("build_top_loci_export", envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists(".empty_top_loci_long", envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists(".empty_top_loci_export",envir = .GlobalEnv, inherits = FALSE)) +}) + +test_that("build_top_loci raises an explicit error on invalid variant_id rather than silently filling NA", { + variant_ids <- c("not_a_valid_id") + cs_at_cov <- list("0.95" = list(1L), + "0.7" = list(1L), + "0.5" = list(1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = 0.9) + expect_error(.run_build_top_loci(inp, method = "susie"), + "parse_variant_id") +}) + +test_that("build_top_loci requires `method`", { + variant_ids <- c("chr1:100:A:G") + cs_at_cov <- list("0.95" = list(1L), + "0.7" = list(1L), + "0.5" = list(1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = 0.9) + expect_error(build_top_loci( + fit = inp$fit, cs_tables = inp$cs_tables, + variant_names = inp$variant_names + ), "method") +}) + +test_that("non-top-loci wrapper fields (susie_result_trimmed, variant_names) are preserved by format_finemapping_output", { + d <- .make_univariate_data(seed = 25, effect_idx = c(20)) + fit <- susieR::susie(d$X, d$y, L = 5) + post <- postprocess_finemapping_fits(list(susie = fit), data_x = d$X, data_y = d$y, coverage = 0.95) + out <- format_finemapping_output(post, primary_method = "susie") + expect_true("susie_result_trimmed" %in% names(out)) + expect_true("variant_names" %in% names(out)) +}) + +test_that("missing region produces NA grange columns rather than silent omission", { + variant_ids <- c("chr1:100:A:G") + cs_at_cov <- list("0.95" = list(1L), + "0.7" = list(1L), + "0.5" = list(1L)) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = 0.9) + out <- .run_build_top_loci(inp, method = "susie", + other_quantities = list(condition_id = "ctx")) + # grange_* must still be present columns of the 22-col schema, with NA values. + expect_true(all(c("grange_start", "grange_end") %in% names(out))) + expect_true(all(is.na(out$grange_start))) + expect_true(all(is.na(out$grange_end))) + # And event composition still works from gene + condition_id. + expect_equal(unique(out$event), "ctx_ENSG00000179403") +}) + +test_that("posterior_effect_mean equals colSums(alpha*mu); posterior_effect_se equals sqrt(pmax(colSums(alpha*mu2) - mean^2, 0))", { + variant_ids <- c("chr1:100:A:G", "chr1:200:C:T") + cs_at_cov <- list("0.95" = list(c(1L, 2L)), + "0.7" = list(c(1L, 2L)), + "0.5" = list(c(1L, 2L))) + inp <- .fake_fit_and_cs(variant_ids, cs_at_cov, pip = c(0.8, 0.6)) + out <- .run_build_top_loci(inp, method = "susie") + expected_mean <- colSums(inp$fit$alpha * inp$fit$mu) + expected_se <- sqrt(pmax(colSums(inp$fit$alpha * inp$fit$mu2) - expected_mean^2, 0)) + # Match per variant index by looking up via variant string. + for (i in seq_along(variant_ids)) { + row <- out[out$variant == variant_ids[i], , drop = FALSE] + expect_true(nrow(row) >= 1L) + expect_equal(unique(row$posterior_effect_mean), expected_mean[i], + tolerance = 1e-10) + expect_equal(unique(row$posterior_effect_se), expected_se[i], + tolerance = 1e-10) } }) diff --git a/tests/testthat/test_univariate_pipeline.R b/tests/testthat/test_univariate_pipeline.R index 762a22d3..f974cf94 100644 --- a/tests/testthat/test_univariate_pipeline.R +++ b/tests/testthat/test_univariate_pipeline.R @@ -614,6 +614,45 @@ test_that("uap: susie-inf initializes susie with matching greedy L", { expect_equal(unclass(result$susie_inf_fitted), fake_inf_fit) }) +test_that("uap: ordinary susie can run without susie-inf initialization", { + inp <- make_uap_inputs() + fake_fit <- make_fake_susie_fit(inp$p) + fake_post <- make_fake_post_result(inp$p) + + captured_calls <- list() + local_mocked_bindings( + susie = function(X, y, L, L_greedy, coverage, unmappable_effects = "none", + model_init = NULL, ...) { + captured_calls[[length(captured_calls) + 1]] <<- list( + L = L, + L_greedy = L_greedy, + unmappable_effects = unmappable_effects, + model_init = model_init + ) + fake_fit + }, + postprocess_finemapping_fits = function(fits, ...) { + expect_named(fits, "susie") + fake_post + }, + format_finemapping_output = function(post, primary_method) post, + ) + + result <- univariate_analysis_pipeline( + X = inp$X, Y = inp$Y, maf = inp$maf, + twas_weights = FALSE, add_susie_inf = FALSE, + L = 15, L_greedy = 7 + ) + expect_length(captured_calls, 1) + expect_equal(captured_calls[[1]]$L, 15) + expect_equal(captured_calls[[1]]$L_greedy, 7) + expect_equal(captured_calls[[1]]$unmappable_effects, "none") + expect_null(captured_calls[[1]]$model_init) + expect_s3_class(result$susie_fitted, "susie") + expect_false("susie_inf_fitted" %in% names(result)) + expect_false("susie_inf_result_trimmed" %in% names(result)) +}) + test_that("uap: post-processing output is merged into result", { inp <- make_uap_inputs() fake_fit <- make_fake_susie_fit(inp$p) diff --git a/vignettes/susie_wrapper.Rmd b/vignettes/susie_wrapper.Rmd new file mode 100644 index 00000000..51e9ba8b --- /dev/null +++ b/vignettes/susie_wrapper.Rmd @@ -0,0 +1,243 @@ +--- +title: "SuSiE wrapper unified top_loci output" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{SuSiE wrapper unified top_loci output} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") +``` + +# Purpose + +This vignette is the schema-of-record for the unified `top_loci` table that +`pecotmr`'s SuSiE wrapper exposes after the **build_top_loci migration**. It +documents the 22-column shape, the credible-set string convention, the +`(variant, gene, method, cs_membership)` row uniqueness rule, and shows a +worked `postprocess_finemapping_fits()` example that row-binds `susie` and +`susie_inf` rows into a single output. + +It is reference memory for the package; downstream callers (xqtl-protocol, +transMap, etc.) consume the same single table. + +# Why one table + +Before this change the wrapper exposed three artifacts side-by-side: + +- `top_loci_long` — one row per (variant, method, coverage, CS membership); +- `top_loci` — a wide table with `pip_` and `CS__` + columns; +- `top_loci_export` — a compact "export" projection built by a separate + helper. + +Three artifacts meant three formatter code paths, three sets of column +naming conventions, and silent inconsistencies between long and wide were +easy to introduce. The user request for this migration was simple: + +> "we will not have any build_top_loci_long and build_top_loci_width just a +> function named build_top_loci. ... there is only one build_top_loci +> function, no more various formats." + +The migration accordingly removes the long/wide/export trio (and their +helpers) and replaces them with a single function +[`build_top_loci()`](../R/susie_wrapper.R) that returns one table. The +top-level `postprocess_finemapping_fits()` calls `build_top_loci()` once +per method per fit and `rbind`s the per-method contributions into the +single final `top_loci` table that `format_finemapping_output()` exposes. + +There is no separately exposed long table and no separately exposed wide +table. + +# The 22-column schema + +The unified `top_loci` table has exactly these columns, in this order: + +| # | Column | Type | Source | +|---:|-------------------------|-----------|---------------------------------------------------------------| +| 1 | `#chr` | character | parsed from `variant` | +| 2 | `start` | integer | `pos - 1` from parsed `variant` (BED 0-based) | +| 3 | `end` | integer | `pos` from parsed `variant` (BED half-open) | +| 4 | `a1` | character | from parsed `variant` | +| 5 | `a2` | character | from parsed `variant` | +| 6 | `variant` | character | the row's variant id in `chr:pos:A2:A1` form | +| 7 | `gene` | character | `colnames(data_y)[1]` (phenotype identifier) | +| 8 | `event` | character | `paste(other_quantities$condition_id, gene, sep="_")` | +| 9 | `n` | integer | `nrow(data_y)` (analyzed sample count) | +| 10 | `maf` | numeric | from the `maf` argument | +| 11 | `beta` | numeric | from `sumstats$betahat` | +| 12 | `se` | numeric | from `sumstats$sebetahat` | +| 13 | `pip` | numeric | per-variant SuSiE PIP | +| 14 | `posterior_effect_mean` | numeric | `colSums(alpha * mu)` | +| 15 | `posterior_effect_se` | numeric | `sqrt(pmax(colSums(alpha * mu2) - posterior_effect_mean^2, 0))` | +| 16 | `cs_95` | character | per-row CS membership at coverage 0.95 (see below) | +| 17 | `cs_70` | character | per-row CS membership at coverage 0.70 | +| 18 | `cs_50` | character | per-row CS membership at coverage 0.50 | +| 19 | `cs_95_purity` | numeric | 0.95-coverage CS purity for this row (see below) | +| 20 | `method` | character | `"susie"`, `"susie_inf"`, ... | +| 21 | `grange_start` | integer | parsed from top-level `region` arg | +| 22 | `grange_end` | integer | parsed from top-level `region` arg | + +There are no `pip_` or `CS__` wide-format columns, no +`top_loci_long` column set (`variant_id` / `coverage` / `cs`), and no +legacy `cs_coverage_0.95` / `PIP` / `variant_ID` / `gene_ID` / +`event_ID` / `region_start` / `region_end` names. Downstream callers +should read the columns above. + +## CS membership strings (`cs_95`, `cs_70`, `cs_50`) + +The credible-set columns are **character strings**, not integers. Each +value has the form + +``` +_ +``` + +where `` is the row's `method` column and `` is the +**per-method 1-based** credible-set number. Two consequences: + +1. Each method numbers its credible sets independently starting at `1`. + `susie` may have `susie_1`, `susie_2`, ...; `susie_inf` may have + `susie_inf_1`, `susie_inf_2`, .... The CS index space is *not* + sequenced across methods. + +2. A variant that is retained by the PIP cutoff but is **not** in any + credible set at the given coverage carries the sentinel `"_0"`. + For example, a `susie`-method row with no 0.70-coverage CS membership + carries `cs_70 = "susie_0"`. + +## `cs_95_purity` + +A single purity column. It holds the purity of the row's 0.95-coverage +credible set, taken from `cs_tables[[i]]$sets$purity$min.abs.corr` (with a +`cs_corr`-based fallback when `sets$purity` is unavailable). Rows whose +`cs_95` is `"_0"` (i.e. not in any 0.95-coverage CS) carry +`cs_95_purity = 0`. There are no `cs_70_purity` or `cs_50_purity` columns +at this stage. + +# Row uniqueness + +The row identity is `(variant, gene, method, cs_membership)`. Concretely: + +- A variant in a single CS within a single method produces **one** row. +- A variant in CS 1 *and* CS 2 of the same method (overlapping CS) produces + **two** rows. Both share `(variant, gene, method)` and differ in + `cs_95` (e.g. one row `cs_95 = "susie_1"`, the other + `cs_95 = "susie_2"`). +- A variant that appears in CS from more than one method (e.g. `susie` CS 1 + and `susie_inf` CS 1) produces **two** rows. They share `(variant, gene)` + and differ in `method` (and in `cs_95` because each method numbers + independently). +- A PIP-only retained variant (no CS membership at any coverage) produces + **one** row per method with + `cs_95 = cs_70 = cs_50 = "_0"` and `cs_95_purity = 0`. + +This rule preserves overlapping-CS evidence both *within* a method (CS 1 +overlaps CS 2 in `susie`) and *across* methods (the same variant appears +in `susie`'s CS and `susie_inf`'s CS). + +# Caller responsibilities + +`build_top_loci()` reads per-fit metadata from `data_x`, `data_y`, a +reserved `other_quantities` list, and a top-level `region` argument. The +caller (typically `univariate_analysis_pipeline()` and through it the +xqtl-protocol `susie_twas` block and the transMap finemapping pipeline) +supplies: + +| Field | Used for | +|----------------------------------|-------------------------------------| +| `data_y` | `n` and `gene` (column name) | +| `region` (top-level) | `grange_start`, `grange_end` (parsed by `parse_region()`) | +| `other_quantities$condition_id` | `event` composition with `gene` | + +`region` is a top-level `"chr:start-end"` string argument of +`univariate_analysis_pipeline()` (and `postprocess_finemapping_fits()` / +`build_top_loci()`). When `region` is `NULL`, `grange_start` and +`grange_end` are `NA`. If `other_quantities$condition_id` is missing, +`event` falls back to `NA_character_`. `other_quantities` remains a +free-form list (e.g. `dropped_samples` stays accepted); only +`condition_id` is consumed by the unified export. + +# Worked example + +The example below sketches a typical wrapper run that fits both `susie` +and `susie_inf` and shows that the final `top_loci` has one row per +`(variant, gene, method, cs_membership)`. + +```{r setup, eval = FALSE} +library(pecotmr) +library(susieR) + +set.seed(42) +n <- 300; p <- 50 +X <- matrix(rnorm(n * p), n, p) +colnames(X) <- sprintf("chr1:%d:G:A", seq_len(p)) +beta <- rep(0, p); beta[c(10, 30)] <- 1.5 +y <- as.numeric(X %*% beta) + rnorm(n, sd = 0.5) +Y <- matrix(y, ncol = 1, dimnames = list(NULL, "ENSG00000179403")) + +# fit_susie_inf_then_susie() is a pecotmr helper that returns both fits. +fits <- fit_susie_inf_then_susie(X, y) + +# Caller-supplied per-fit metadata. `condition_id` flows through +# other_quantities (which remains a free-form list); the analysis region +# flows through `region`, a top-level argument. +oq <- list(condition_id = "ctx_eQTL") + +post <- postprocess_finemapping_fits( + fits, data_x = X, data_y = Y, + coverage = 0.95, secondary_coverage = c(0.7, 0.5), + other_quantities = oq, + region = "chr1:1-50" +) + +out <- format_finemapping_output(post, primary_method = "susie") +tl <- out$top_loci + +# tl is the 22-column unified table. Verify the column order and the +# per-method CS strings. +stopifnot(identical(names(tl), c( + "#chr", "start", "end", "a1", "a2", + "variant", "gene", "event", + "n", "maf", "beta", "se", + "pip", "posterior_effect_mean", "posterior_effect_se", + "cs_95", "cs_70", "cs_50", "cs_95_purity", + "method", "grange_start", "grange_end" +))) +stopifnot(setequal(unique(tl$method), c("susie", "susie_inf"))) +stopifnot(all(grepl("^(susie|susie_inf)_\\d+$", + c(tl$cs_95, tl$cs_70, tl$cs_50)))) +``` + +A few things to notice in the worked example: + +- `out` is what xqtl-protocol / transMap callers receive. The previously + exposed `out$top_loci_long` field is gone; `out$top_loci` is the only + top-loci field. +- `tl$method` is one column with multiple values (`"susie"`, + `"susie_inf"`); there are no per-method `pip_` columns. To + inspect a single method's rows, filter on `tl$method`. +- `tl$cs_95` etc. are character columns. If you need an integer CS index + for a particular method, strip the prefix: + `as.integer(sub("^.*_", "", tl$cs_95[tl$method == "susie"]))`. +- A row whose `cs_95` is `"susie_0"` was retained by the PIP cutoff but is + not in any 0.95-coverage CS. Such rows have `cs_95_purity == 0` by + construction. + +# Downstream contract + +`format_finemapping_output()` exposes exactly one top-loci field, +`top_loci`, with the 22 columns above. The previously exposed +`top_loci_long` and wide `top_loci` (with `pip_` / +`CS__` columns) and the standalone `top_loci_export` are all +removed. Non-top-loci fields (`susie_result_trimmed`, `variant_names`, +`context_names`, `analysis_script`, ...) keep their previous semantics. + +The `FineMappingResult` S4 slot still satisfies its legacy validity +check (it expects `variant_id` and `method` columns); `susie_wrapper.R` +projects the new `top_loci` into a backward-compatible shape only for the +S4 slot, so `AllClasses.R`, `AllMethods.R`, and `vcf_writer.R` do not need +to change. End-user consumers should read the 22-column `top_loci` +returned by `format_finemapping_output()`, not the slot.