From 848c862dbe54d7602a56ef8c38ee3319cde2066b Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Mon, 18 May 2026 20:17:44 -0400 Subject: [PATCH 1/6] Add MZMinetoMSstatsFormat converter Brings metabolomics into the MSstats family by adding an MZMine converter that mirrors the structure of DIANNtoMSstatsFormat. Phase 1 of a two-phase task; Phase 2 (MSstatsShiny BIO=Metabolomics) will be a separate PR. --- .Rbuildignore | 2 + .gitignore | 2 + DESCRIPTION | 4 +- NAMESPACE | 1 + R/MSstatsConvert_core_functions.R | 13 +- R/clean_MZMine.R | 88 +++++++++++++ R/converters_MZMinetoMSstatsFormat.R | 91 ++++++++++++++ inst/tinytest/raw_data/MZMine/annotation.csv | 5 + .../raw_data/MZMine/mzmine_annotations.csv | 6 + .../tinytest/raw_data/MZMine/mzmine_input.csv | 7 ++ .../test_converters_MZMinetoMSstatsFormat.R | 117 ++++++++++++++++++ man/MSstatsClean.Rd | 14 +++ man/MSstatsInputFiles.Rd | 3 + man/MZMinetoMSstatsFormat.Rd | 79 ++++++++++++ man/dot-cleanRawMZMine.Rd | 29 +++++ vignettes/msstats_data_format.Rmd | 50 ++++++++ 16 files changed, 509 insertions(+), 2 deletions(-) create mode 100644 R/clean_MZMine.R create mode 100644 R/converters_MZMinetoMSstatsFormat.R create mode 100644 inst/tinytest/raw_data/MZMine/annotation.csv create mode 100644 inst/tinytest/raw_data/MZMine/mzmine_annotations.csv create mode 100644 inst/tinytest/raw_data/MZMine/mzmine_input.csv create mode 100644 inst/tinytest/test_converters_MZMinetoMSstatsFormat.R create mode 100644 man/MZMinetoMSstatsFormat.Rd create mode 100644 man/dot-cleanRawMZMine.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 613e0b649..37ad74244 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,5 @@ ^pkgdown$ ^\.positai$ ^\.claude$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 6d24a0fdc..c796408d5 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ inst/doc .lintr .vscode .positai +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index af973548e..fd7c2e4f4 100644 --- a/DESCRIPTION +++ 
b/DESCRIPTION @@ -35,6 +35,7 @@ Suggests: rmarkdown LinkingTo: Rcpp Collate: + 'clean_MZMine.R' 'clean_ProteinProspector.R' 'clean_Metamorpheus.R' 'clean_DIANN.R' @@ -53,6 +54,7 @@ Collate: 'converters_DIANNtoMSstatsFormat.R' 'converters_DIAUmpiretoMSstatsFormat.R' 'converters_FragPipetoMSstatsFormat.R' + 'converters_MZMinetoMSstatsFormat.R' 'converters_MaxQtoMSstatsFormat.R' 'converters_MaxQtoMSstatsTMTFormat.R' 'converters_MetamorpheusToMSstatsFormat.R' @@ -81,4 +83,4 @@ Collate: 'utils_fractions.R' 'utils_logging.R' 'utils_shared_peptides.R' -VignetteBuilder: knitr +VignetteBuilder: knitr \ No newline at end of file diff --git a/NAMESPACE b/NAMESPACE index cc2cfa210..e51cffd66 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ export(MSstatsLogsSettings) export(MSstatsMakeAnnotation) export(MSstatsPreprocess) export(MSstatsSaveSessionInfo) +export(MZMinetoMSstatsFormat) export(MaxQtoMSstatsFormat) export(MaxQtoMSstatsTMTFormat) export(MetamorpheusToMSstatsFormat) diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index 3d9ee7dc3..7e4c77ae3 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -71,6 +71,10 @@ setClass("MSstatsMetamorpheusFiles", contains = "MSstatsInputFiles") #' @rdname MSstatsInputFiles #' @keywords internal setClass("MSstatsProteinProspectorFiles", contains = "MSstatsInputFiles") +#' MSstatsMZMineFiles: class for MZMine files. +#' @rdname MSstatsInputFiles +#' @keywords internal +setClass("MSstatsMZMineFiles", contains = "MSstatsInputFiles") #' Get one of files contained in an instance of `MSstatsInputFiles` class. 
@@ -291,8 +295,15 @@ setMethod("MSstatsClean", signature = "MSstatsMetamorpheusFiles", #' @rdname MSstatsClean #' @inheritParams .cleanRawProteinProspector #' @return data.table -setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles", +setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles", .cleanRawProteinProspector) +#' Clean MZMine files +#' @include clean_MZMine.R +#' @rdname MSstatsClean +#' @inheritParams .cleanRawMZMine +#' @return data.table +setMethod("MSstatsClean", signature = "MSstatsMZMineFiles", + .cleanRawMZMine) #' Preprocess outputs from MS signal processing tools for analysis with MSstats diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R new file mode 100644 index 000000000..fea645100 --- /dev/null +++ b/R/clean_MZMine.R @@ -0,0 +1,88 @@ +#' Clean raw MZMine files +#' +#' Operates on the column names produced by MZMine after MSstatsConvert's +#' internal column-name standardization (spaces collapsed and dots removed): +#' "row ID" becomes `rowID`, "row m/z" becomes `rowmz`, "row retention time" +#' becomes `rowretentiontime`, and each "<sample> Peak area" becomes +#' `<sample>Peakarea`. +#' +#' @param msstats_object an object of class `MSstatsMZMineFiles`. +#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library +#' annotations with columns `id`, `compound_name`, `score`. When supplied, +#' the highest-scoring `compound_name` per feature is used as `ProteinName`. +#' Features without a matching annotation row fall back to an mz_rt string +#' `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature +#' uses the mz_rt fallback. 
+#' @return data.table +#' @keywords internal +.cleanRawMZMine <- function(msstats_object, mzmine_annotations = NULL) { + ProteinName = PeptideSequence = Intensity = Run = NULL + PrecursorCharge = FragmentIon = ProductCharge = IsotopeLabelType = NULL + sample_col = id = score = compound_name = NULL + + mz_input <- getInputFile(msstats_object, "input") + mz_input <- data.table::as.data.table(mz_input) + + peak_area_suffix <- "Peakarea" + peak_area_cols <- grep(paste0(peak_area_suffix, "$"), + colnames(mz_input), value = TRUE) + if (length(peak_area_cols) == 0) { + stop("No 'Peak area' columns found in the input. Expected per-sample ", + "columns named ' Peak area' (e.g. 'sampleA.mzML Peak area').") + } + id_col <- "rowID" + mz_col <- "rowmz" + rt_col <- "rowretentiontime" + required_meta <- c(id_col, mz_col, rt_col) + missing_meta <- setdiff(required_meta, colnames(mz_input)) + if (length(missing_meta) > 0) { + stop("Missing required MZMine metadata column(s) (expected 'row ID', ", + "'row m/z', 'row retention time'). 
After standardization, ", + "looked for: ", paste(missing_meta, collapse = ", "), ".") + } + + mz_rt_fallback <- paste0(round(mz_input[[mz_col]], 4), "_", + round(mz_input[[rt_col]], 2)) + + if (!is.null(mzmine_annotations)) { + ann <- data.table::as.data.table(mzmine_annotations) + required_ann <- c("id", "compound_name", "score") + missing_ann <- setdiff(required_ann, colnames(ann)) + if (length(missing_ann) > 0) { + stop("mzmine_annotations is missing required column(s): ", + paste(missing_ann, collapse = ", "), ".") + } + data.table::setorder(ann, id, -score) + ann_top <- unique(ann, by = "id") + matched <- ann_top[match(mz_input[[id_col]], ann_top[["id"]]), + compound_name] + compound <- ifelse(is.na(matched), mz_rt_fallback, matched) + } else { + compound <- mz_rt_fallback + } + + mz_input[, ProteinName := compound] + mz_input[, PeptideSequence := as.character(get(id_col))] + + long <- data.table::melt( + mz_input, + id.vars = c("ProteinName", "PeptideSequence"), + measure.vars = peak_area_cols, + variable.name = "sample_col", + value.name = "Intensity", + variable.factor = FALSE) + + long[, PrecursorCharge := NA_integer_] + long[, FragmentIon := NA_character_] + long[, ProductCharge := NA_integer_] + long[, IsotopeLabelType := "Light"] + long[, Run := sub(paste0(peak_area_suffix, "$"), "", sample_col)] + + final_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", + "FragmentIon", "ProductCharge", "IsotopeLabelType", + "Run", "Intensity") + long <- long[, final_cols, with = FALSE] + + .logSuccess("MZMine", "clean") + long +} diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R new file mode 100644 index 000000000..5ec42bfb9 --- /dev/null +++ b/R/converters_MZMinetoMSstatsFormat.R @@ -0,0 +1,91 @@ +#' Import MZMine files +#' +#' @inheritParams .sharedParametersAmongConverters +#' @param input MZMine feature-quantification table (wide format; one row per +#' feature). 
Must include the metadata columns `row ID`, `row m/z`, +#' `row retention time`, and per-sample peak-area columns named +#' `"<sample> Peak area"` (e.g. `"sampleA.mzML Peak area"`). +#' @param annotation `data.frame` with columns `Run`, `Condition`, +#' `BioReplicate`. `Run` values must match the sample column names with the +#' trailing `" Peak area"` stripped. +#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library +#' annotations with columns `id`, `compound_name`, `score`. When supplied, +#' the highest-scoring `compound_name` per feature is used as `ProteinName`; +#' features without a matching annotation row fall back to an mz_rt string +#' `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature +#' uses the mz_rt fallback. +#' @param removeProtein_with1Feature `TRUE` will remove proteins (compounds) +#' represented by a single feature. Default `FALSE`. +#' @param summaryforMultipleRows `max` (default) or `sum` — used when multiple +#' rows map to the same feature/run combination. +#' +#' @return data.table in the MSstats required format. +#' +#' @export +#' +#' @examples +#' input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv", +#' package = "MSstatsConvert") +#' annot_path = system.file("tinytest/raw_data/MZMine/annotation.csv", +#' package = "MSstatsConvert") +#' lib_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv", +#' package = "MSstatsConvert") +#' input = data.table::fread(input_path) +#' annot = data.table::fread(annot_path) +#' lib = data.table::fread(lib_path) +#' output = MZMinetoMSstatsFormat(input, annotation = annot, +#' mzmine_annotations = lib, +#' use_log_file = FALSE) +#' head(output) +MZMinetoMSstatsFormat = function( + input, + annotation = NULL, + mzmine_annotations = NULL, + removeProtein_with1Feature = FALSE, + summaryforMultipleRows = max, + use_log_file = TRUE, + append = FALSE, + verbose = TRUE, + log_file_path = NULL, + ...) 
{ + MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, + log_file_path) + + input = MSstatsConvert::MSstatsImport(list(input = input), + "MSstats", "MZMine") + input = MSstatsConvert::MSstatsClean( + input, mzmine_annotations = mzmine_annotations) + annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation) + + feature_columns = c("PeptideSequence", "PrecursorCharge", + "FragmentIon", "ProductCharge") + fill_isotope_label_type = if ("IsotopeLabelType" %in% colnames(input)) + list() else list("IsotopeLabelType" = "Light") + + input = MSstatsConvert::MSstatsPreprocess( + input, + annotation, + feature_columns, + remove_shared_peptides = FALSE, + remove_single_feature_proteins = removeProtein_with1Feature, + exact_filtering = NULL, + pattern_filtering = NULL, + aggregate_isotopic = FALSE, + feature_cleaning = list( + remove_features_with_few_measurements = FALSE, + summarize_multiple_psms = summaryforMultipleRows), + columns_to_fill = c(list(Fraction = 1), fill_isotope_label_type)) + input[, Intensity := ifelse(Intensity == 0, NA, Intensity)] + + input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns, + fill_incomplete = TRUE, + handle_fractions = FALSE, + remove_few = FALSE) + + msg_final = paste("** Finished preprocessing. 
The dataset is ready", + "to be processed by the dataProcess function.") + getOption("MSstatsLog")("INFO", msg_final) + getOption("MSstatsMsg")("INFO", msg_final) + getOption("MSstatsLog")("INFO", "\n") + input +} diff --git a/inst/tinytest/raw_data/MZMine/annotation.csv b/inst/tinytest/raw_data/MZMine/annotation.csv new file mode 100644 index 000000000..f28f4a863 --- /dev/null +++ b/inst/tinytest/raw_data/MZMine/annotation.csv @@ -0,0 +1,5 @@ +Run,Condition,BioReplicate +sampleA.mzML,Control,1 +sampleB.mzML,Control,2 +sampleC.mzML,Treatment,3 +sampleD.mzML,Treatment,4 diff --git a/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv b/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv new file mode 100644 index 000000000..a2c38a57d --- /dev/null +++ b/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv @@ -0,0 +1,6 @@ +id,compound_name,score,adduct +1,Caffeine,0.95,[M+H]+ +2,GlucoseLow,0.72,[M+H]+ +2,GlucoseHigh,0.91,[M-H]- +3,Lactate,0.88,[M+H]+ +6,Caffeine,0.80,[M+Na]+ diff --git a/inst/tinytest/raw_data/MZMine/mzmine_input.csv b/inst/tinytest/raw_data/MZMine/mzmine_input.csv new file mode 100644 index 000000000..b887ed1a1 --- /dev/null +++ b/inst/tinytest/raw_data/MZMine/mzmine_input.csv @@ -0,0 +1,7 @@ +row ID,row m/z,row retention time,sampleA.mzML Peak area,sampleB.mzML Peak area,sampleC.mzML Peak area,sampleD.mzML Peak area +1,123.0560,1.23,1000,1100,1200,1300 +2,245.1290,3.45,5000,4800,5200,4900 +3,367.2010,5.67,800,0,750,820 +4,489.3340,7.89,2000,2100,1900,2050 +5,555.4470,9.10,100,0,0,0 +6,123.0560,1.45,600,650,700,680 diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R new file mode 100644 index 000000000..5a6aeaeb2 --- /dev/null +++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R @@ -0,0 +1,117 @@ +# Test MZMinetoMSstatsFormat --------------------------- +input_file_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv", + package = "MSstatsConvert") 
+annotation_file_path = system.file("tinytest/raw_data/MZMine/annotation.csv", + package = "MSstatsConvert") +mzmine_ann_file_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv", + package = "MSstatsConvert") +input = data.table::fread(input_file_path) +annot = data.table::fread(annotation_file_path) +mzmine_ann = data.table::fread(mzmine_ann_file_path) + +# With mzmine_annotations supplied ------------------------------------------- +output = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = mzmine_ann, + use_log_file = FALSE) +output_dt = data.table::as.data.table(output) + +# Basic structure: 6 features x 4 runs = 24 rows, 11 standard columns +expect_equal(ncol(output), 11) +expect_equal(nrow(output), 24) +expect_true("Run" %in% colnames(output)) +expect_true("ProteinName" %in% colnames(output)) +expect_true("PeptideSequence" %in% colnames(output)) +expect_true("PrecursorCharge" %in% colnames(output)) +expect_true("Intensity" %in% colnames(output)) +expect_true("FragmentIon" %in% colnames(output)) +expect_true("ProductCharge" %in% colnames(output)) +expect_true("IsotopeLabelType" %in% colnames(output)) +expect_true("Condition" %in% colnames(output)) +expect_true("BioReplicate" %in% colnames(output)) +expect_true("Fraction" %in% colnames(output)) + +# Metabolomics has no isotope labeling, so every row is "Light" +expect_true(all(output_dt$IsotopeLabelType == "Light")) + +# Charge / fragment columns are not applicable for metabolomics +expect_true(all(is.na(output_dt$PrecursorCharge))) +expect_true(all(is.na(output_dt$FragmentIon))) +expect_true(all(is.na(output_dt$ProductCharge))) + +# Fraction filled to 1 +expect_true(all(output_dt$Fraction == 1)) + +# Annotation join: feature 2 has two annotation rows; the highest-scoring one wins +feature2_proteins = unique(output_dt[PeptideSequence == "2", ProteinName]) +expect_equal(as.character(feature2_proteins), "GlucoseHigh") + +# Clean annotation cases +feature1_proteins = 
unique(output_dt[PeptideSequence == "1", ProteinName]) +expect_equal(as.character(feature1_proteins), "Caffeine") +feature3_proteins = unique(output_dt[PeptideSequence == "3", ProteinName]) +expect_equal(as.character(feature3_proteins), "Lactate") +feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName]) +expect_equal(as.character(feature6_proteins), "Caffeine") + +# Features without annotation rows fall back to the mz_rt string +feature4_proteins = unique(output_dt[PeptideSequence == "4", ProteinName]) +expect_equal(as.character(feature4_proteins), "489.334_7.89") +feature5_proteins = unique(output_dt[PeptideSequence == "5", ProteinName]) +expect_equal(as.character(feature5_proteins), "555.447_9.1") + +# Zero-intensity input cells are converted to NA in output +# Feature 3 sampleB = 0 -> NA +feature3_sampleB_int = output_dt[PeptideSequence == "3" & Run == "sampleBmzML", + Intensity] +expect_true(is.na(feature3_sampleB_int)) +# Feature 5 sampleB/C/D all = 0 -> NA +feature5_zero_ints = output_dt[PeptideSequence == "5" & + Run %in% c("sampleBmzML", "sampleCmzML", "sampleDmzML"), + Intensity] +expect_true(all(is.na(feature5_zero_ints))) + +# Annotation merges correctly: sampleA is Control rep 1 +sampleA_cond = unique(output_dt[Run == "sampleAmzML", Condition]) +expect_equal(as.character(sampleA_cond), "Control") +sampleA_rep = unique(output_dt[Run == "sampleAmzML", BioReplicate]) +expect_equal(as.character(sampleA_rep), "1") +sampleC_cond = unique(output_dt[Run == "sampleCmzML", Condition]) +expect_equal(as.character(sampleC_cond), "Treatment") + +# Intensity values trace back to input +feature1_sampleA_int = output_dt[PeptideSequence == "1" & Run == "sampleAmzML", + Intensity] +expect_equal(as.numeric(feature1_sampleA_int), 1000) +feature2_sampleC_int = output_dt[PeptideSequence == "2" & Run == "sampleCmzML", + Intensity] +expect_equal(as.numeric(feature2_sampleC_int), 5200) + +# Without mzmine_annotations 
------------------------------------------------- +output_nolib = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = NULL, + use_log_file = FALSE) +output_nolib_dt = data.table::as.data.table(output_nolib) + +# Every ProteinName is the mz_rt fallback string +expect_equal(ncol(output_nolib), 11) +expect_equal(nrow(output_nolib), 24) +expected_mz_rt = c("123.056_1.23", "245.129_3.45", "367.201_5.67", + "489.334_7.89", "555.447_9.1", "123.056_1.45") +expect_true(all(as.character(output_nolib_dt$ProteinName) %in% expected_mz_rt)) +# Compound names from the library must not leak in +expect_false(any(as.character(output_nolib_dt$ProteinName) %in% + c("Caffeine", "GlucoseHigh", "GlucoseLow", "Lactate"))) + +# removeProtein_with1Feature filters non-Caffeine proteins ------------------- +# Caffeine has 2 features (PeptideSequence "1" and "6"); all others have 1. +output_filtered = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = mzmine_ann, + removeProtein_with1Feature = TRUE, + use_log_file = FALSE) +output_filtered_dt = data.table::as.data.table(output_filtered) + +expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine") +# 2 features x 4 runs = 8 rows +expect_equal(nrow(output_filtered), 8) +expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))), + c("1", "6")) diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index d5559610b..0809a567b 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -15,6 +15,7 @@ \alias{MSstatsClean,MSstatsDIANNFiles-method} \alias{MSstatsClean,MSstatsMetamorpheusFiles-method} \alias{MSstatsClean,MSstatsProteinProspectorFiles-method} +\alias{MSstatsClean,MSstatsMZMineFiles-method} \title{Clean files generated by a signal processing tools.} \usage{ MSstatsClean(msstats_object, ...) @@ -80,6 +81,8 @@ MSstatsClean(msstats_object, ...) 
\S4method{MSstatsClean}{MSstatsMetamorpheusFiles}(msstats_object, MBR = TRUE, qvalue_cutoff = 0.05) \S4method{MSstatsClean}{MSstatsProteinProspectorFiles}(msstats_object) + +\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations = NULL) } \arguments{ \item{msstats_object}{object that inherits from \code{MSstatsInputFiles} class.} @@ -197,6 +200,13 @@ The SILAC suffix is then stripped from \code{PeptideSequence}. When \code{NULL} (default), protein-turnover mode is disabled and all peptides receive \code{IsotopeLabelType = "Light"}.} + +\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied, +the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}. +Features without a matching annotation row fall back to an mz_rt string +\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature +uses the mz_rt fallback.} } \value{ data.table @@ -223,6 +233,8 @@ data.table data.table +data.table + data.table } \description{ @@ -253,6 +265,8 @@ Clean DIA-NN files Clean Metamorpheus files Clean Protein Prospector files + +Clean MZMine files } \examples{ evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", diff --git a/man/MSstatsInputFiles.Rd b/man/MSstatsInputFiles.Rd index e09fff8de..eb4f86463 100644 --- a/man/MSstatsInputFiles.Rd +++ b/man/MSstatsInputFiles.Rd @@ -17,6 +17,7 @@ \alias{MSstatsFragPipeFiles-class} \alias{MSstatsMetamorpheusFiles-class} \alias{MSstatsProteinProspectorFiles-class} +\alias{MSstatsMZMineFiles-class} \title{Class to model files that describe a single MS dataset.} \description{ Class to model files that describe a single MS dataset. @@ -48,6 +49,8 @@ MSstatsFragPipeFiles: class for FragPipe files. MSstatsMetamorpheusFiles: class for Metamorpheus files. MSstatsProteinProspectorFiles: class for ProteinProspector files. 
+ +MSstatsMZMineFiles: class for MZMine files. } \section{Slots}{ diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd new file mode 100644 index 000000000..3f6aa4de1 --- /dev/null +++ b/man/MZMinetoMSstatsFormat.Rd @@ -0,0 +1,79 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/converters_MZMinetoMSstatsFormat.R +\name{MZMinetoMSstatsFormat} +\alias{MZMinetoMSstatsFormat} +\title{Import MZMine files} +\usage{ +MZMinetoMSstatsFormat( + input, + annotation = NULL, + mzmine_annotations = NULL, + removeProtein_with1Feature = FALSE, + summaryforMultipleRows = max, + use_log_file = TRUE, + append = FALSE, + verbose = TRUE, + log_file_path = NULL, + ... +) +} +\arguments{ +\item{input}{MZMine feature-quantification table (wide format; one row per +feature). Must include the metadata columns \verb{row ID}, \verb{row m/z}, +\verb{row retention time}, and per-sample peak-area columns named +\code{"<sample> Peak area"} (e.g. \code{"sampleA.mzML Peak area"}).} + +\item{annotation}{\code{data.frame} with columns \code{Run}, \code{Condition}, +\code{BioReplicate}. \code{Run} values must match the sample column names with the +trailing \code{" Peak area"} stripped.} + +\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied, +the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}; +features without a matching annotation row fall back to an mz_rt string +\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature +uses the mz_rt fallback.} + +\item{removeProtein_with1Feature}{\code{TRUE} will remove proteins (compounds) +represented by a single feature. Default \code{FALSE}.} + +\item{summaryforMultipleRows}{\code{max} (default) or \code{sum} — used when multiple +rows map to the same feature/run combination.} + +\item{use_log_file}{logical. 
If TRUE, information about data processing +will be saved to a file.} + +\item{append}{logical. If TRUE, information about data processing will be added +to an existing log file.} + +\item{verbose}{logical. If TRUE, information about data processing will be printed +to the console.} + +\item{log_file_path}{character. Path to a file to which information about +data processing will be saved. +If not provided, such a file will be created automatically. +If \code{append = TRUE}, has to be a valid path to a file.} + +\item{...}{additional parameters to \code{data.table::fread}.} +} +\value{ +data.table in the MSstats required format. +} +\description{ +Import MZMine files +} +\examples{ +input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv", + package = "MSstatsConvert") +annot_path = system.file("tinytest/raw_data/MZMine/annotation.csv", + package = "MSstatsConvert") +lib_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv", + package = "MSstatsConvert") +input = data.table::fread(input_path) +annot = data.table::fread(annot_path) +lib = data.table::fread(lib_path) +output = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = lib, + use_log_file = FALSE) +head(output) +} diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd new file mode 100644 index 000000000..82794933a --- /dev/null +++ b/man/dot-cleanRawMZMine.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_MZMine.R +\name{.cleanRawMZMine} +\alias{.cleanRawMZMine} +\title{Clean raw MZMine files} +\usage{ +.cleanRawMZMine(msstats_object, mzmine_annotations = NULL) +} +\arguments{ +\item{msstats_object}{an object of class \code{MSstatsMZMineFiles}.} + +\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. 
When supplied, +the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}. +Features without a matching annotation row fall back to an mz_rt string +\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature +uses the mz_rt fallback.} +} +\value{ +data.table +} +\description{ +Operates on the column names produced by MZMine after MSstatsConvert's +internal column-name standardization (spaces collapsed and dots removed): +"row ID" becomes \code{rowID}, "row m/z" becomes \code{rowmz}, "row retention time" +becomes \code{rowretentiontime}, and each "\if{html}{\out{<sample>}} Peak area" becomes +\verb{<sample>Peakarea}. +} +\keyword{internal} diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd index dbf77916a..78e92ff5b 100644 --- a/vignettes/msstats_data_format.Rmd +++ b/vignettes/msstats_data_format.Rmd @@ -329,3 +329,53 @@ Such a `data.frame` will be recognized by statistical processing functions from `MSstats` and `MSstatsTMT` packages as a valid input, which will allow them to skip checks and transformation necessary to fit data into this format. +# Metabolomics with MZMine + +`MZMinetoMSstatsFormat` brings untargeted metabolomics into the MSstats family. +It takes the wide-format feature-quantification table exported by MZMine (one row +per feature, one `<sample> Peak area` column per sample) together with a standard +MSstats annotation and produces an MSstats-ready long-format `data.table`. + +Optionally, an MZMine spectral-library annotation table (with `id`, `compound_name`, +`score` columns) can be supplied; the highest-scoring `compound_name` per feature +is used as `ProteinName`. Features with no library match fall back to an mz_rt +string of the form `paste0(round(mz, 4), "_", round(rt, 2))`. When no library +annotation is supplied, every feature uses the mz_rt fallback. 
+ +```{r mzmine} +mzmine_input = data.table::fread(system.file( + "tinytest/raw_data/MZMine/mzmine_input.csv", + package = "MSstatsConvert" +)) +mzmine_annotation = data.table::fread(system.file( + "tinytest/raw_data/MZMine/annotation.csv", + package = "MSstatsConvert" +)) +mzmine_library = data.table::fread(system.file( + "tinytest/raw_data/MZMine/mzmine_annotations.csv", + package = "MSstatsConvert" +)) + +# With a spectral-library annotation: ProteinName is the matched compound name +mzmine_with_lib = MZMinetoMSstatsFormat( + mzmine_input, + annotation = mzmine_annotation, + mzmine_annotations = mzmine_library, + use_log_file = FALSE +) +head(mzmine_with_lib) + +# Without a library: ProteinName is an mz_rt fallback string +mzmine_no_lib = MZMinetoMSstatsFormat( + mzmine_input, + annotation = mzmine_annotation, + mzmine_annotations = NULL, + use_log_file = FALSE +) +head(mzmine_no_lib) +``` + +Since metabolomics features do not carry peptide-level identifiers, `PeptideSequence` +holds the MZMine `row ID` (as a string), and `PrecursorCharge`, `FragmentIon`, and +`ProductCharge` are all `NA`. `IsotopeLabelType` is set to `"Light"` for every row. 
+ From 778899f6b43cca1dc53cbf23eab61e4a0922685a Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Tue, 19 May 2026 23:33:31 -0400 Subject: [PATCH 2/6] Address CodeRabbit review feedback --- R/clean_MZMine.R | 4 ++++ R/converters_MZMinetoMSstatsFormat.R | 9 ++++++--- inst/tinytest/test_converters_MZMinetoMSstatsFormat.R | 5 ++++- man/MZMinetoMSstatsFormat.Rd | 7 +++++-- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index fea645100..ce2713c9e 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -52,6 +52,10 @@ stop("mzmine_annotations is missing required column(s): ", paste(missing_ann, collapse = ", "), ".") } + ann[, score := suppressWarnings(as.numeric(score))] + if (anyNA(ann$score)) { + stop("mzmine_annotations$score must be numeric (or coercible to numeric).") + } data.table::setorder(ann, id, -score) ann_top <- unique(ann, by = "id") matched <- ann_top[match(mz_input[[id_col]], ann_top[["id"]]), diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R index 5ec42bfb9..3c340b8e6 100644 --- a/R/converters_MZMinetoMSstatsFormat.R +++ b/R/converters_MZMinetoMSstatsFormat.R @@ -6,8 +6,11 @@ #' `row retention time`, and per-sample peak-area columns named #' `"<sample> Peak area"` (e.g. `"sampleA.mzML Peak area"`). #' @param annotation `data.frame` with columns `Run`, `Condition`, -#' `BioReplicate`. `Run` values must match the sample column names with the -#' trailing `" Peak area"` stripped. +#' `BioReplicate`. `Run` values must match MSstatsConvert-standardized sample +#' names (after column-name normalization removes spaces and dots) with the +#' trailing `"Peakarea"` suffix removed. For example, a quant-file column +#' `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization, +#' so the corresponding `Run` value must be `sampleAmzML`. 
#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library #' annotations with columns `id`, `compound_name`, `score`. When supplied, #' the highest-scoring `compound_name` per feature is used as `ProteinName`; @@ -52,7 +55,7 @@ MZMinetoMSstatsFormat = function( log_file_path) input = MSstatsConvert::MSstatsImport(list(input = input), - "MSstats", "MZMine") + "MSstats", "MZMine", ...) input = MSstatsConvert::MSstatsClean( input, mzmine_annotations = mzmine_annotations) annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation) diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R index 5a6aeaeb2..6d0fff53a 100644 --- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R +++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R @@ -97,7 +97,10 @@ expect_equal(ncol(output_nolib), 11) expect_equal(nrow(output_nolib), 24) expected_mz_rt = c("123.056_1.23", "245.129_3.45", "367.201_5.67", "489.334_7.89", "555.447_9.1", "123.056_1.45") -expect_true(all(as.character(output_nolib_dt$ProteinName) %in% expected_mz_rt)) +expect_equal( + sort(unique(as.character(output_nolib_dt$ProteinName))), + sort(expected_mz_rt) +) # Compound names from the library must not leak in expect_false(any(as.character(output_nolib_dt$ProteinName) %in% c("Caffeine", "GlucoseHigh", "GlucoseLow", "Lactate"))) diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd index 3f6aa4de1..5f753215b 100644 --- a/man/MZMinetoMSstatsFormat.Rd +++ b/man/MZMinetoMSstatsFormat.Rd @@ -24,8 +24,11 @@ feature). Must include the metadata columns \verb{row ID}, \verb{row m/z}, \code{"<sample> Peak area"} (e.g. \code{"sampleA.mzML Peak area"}).} \item{annotation}{\code{data.frame} with columns \code{Run}, \code{Condition}, -\code{BioReplicate}. \code{Run} values must match the sample column names with the -trailing \code{" Peak area"} stripped.} +\code{BioReplicate}. 
\code{Run} values must match MSstatsConvert-standardized sample +names (after column-name normalization removes spaces and dots) with the +trailing \code{"Peakarea"} suffix removed. For example, a quant-file column +\code{"sampleA.mzML Peak area"} becomes \code{"sampleAmzML"} after standardization, +so the corresponding \code{Run} value must be \code{sampleAmzML}.} \item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied, From 2957f6d46968cbeac1b62c661ec9a7805955f8f4 Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Tue, 19 May 2026 23:41:07 -0400 Subject: [PATCH 3/6] Coerce score via character to handle factor inputs --- R/clean_MZMine.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index ce2713c9e..9b1b59691 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -52,7 +52,7 @@ stop("mzmine_annotations is missing required column(s): ", paste(missing_ann, collapse = ", "), ".") } - ann[, score := suppressWarnings(as.numeric(score))] + ann[, score := suppressWarnings(as.numeric(as.character(score)))] if (anyNA(ann$score)) { stop("mzmine_annotations$score must be numeric (or coercible to numeric).") } From 53fed81dd8f9afc58e81afa70c6df19541c7a000 Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Wed, 27 May 2026 10:30:49 -0400 Subject: [PATCH 4/6] =?UTF-8?q?Address=20Tony=20review=20feedback=20for=20?= =?UTF-8?q?workstream=20A=20-=20Let=20MSstatsPreprocess=20fill=20IsotopeLa?= =?UTF-8?q?belType=20-=20Hardcode=20IsotopeLabelType=20in=20converter=20-?= =?UTF-8?q?=20Remove=20redundant=20inherited=20@params=20-=20Simplify=20sc?= =?UTF-8?q?ore=20coercion=20-=20Improve=20non-numeric=20score=20error=20-?= =?UTF-8?q?=20Rename=20`ann`=20=E2=86=92=20feature=5Fto=5Fcompound=20-=20R?= =?UTF-8?q?ename=20melt=20variable=20to=20Run=20-=20Refactor=20compound-na?= 
=?UTF-8?q?me=20assignment=20with=20explicit=20data.table=20join?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/clean_MZMine.R | 44 ++++++++++++++++------------ R/converters_MZMinetoMSstatsFormat.R | 11 ++----- man/MZMinetoMSstatsFormat.Rd | 6 ++-- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index 9b1b59691..c714a3c72 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -17,8 +17,8 @@ #' @keywords internal .cleanRawMZMine <- function(msstats_object, mzmine_annotations = NULL) { ProteinName = PeptideSequence = Intensity = Run = NULL - PrecursorCharge = FragmentIon = ProductCharge = IsotopeLabelType = NULL - sample_col = id = score = compound_name = NULL + PrecursorCharge = FragmentIon = ProductCharge = NULL + id = score = compound_name = i.compound_name = NULL mz_input <- getInputFile(msstats_object, "input") mz_input <- data.table::as.data.table(mz_input) @@ -43,47 +43,53 @@ mz_rt_fallback <- paste0(round(mz_input[[mz_col]], 4), "_", round(mz_input[[rt_col]], 2)) + mz_input[, ProteinName := mz_rt_fallback] if (!is.null(mzmine_annotations)) { - ann <- data.table::as.data.table(mzmine_annotations) + feature_to_compound <- data.table::as.data.table(mzmine_annotations) required_ann <- c("id", "compound_name", "score") - missing_ann <- setdiff(required_ann, colnames(ann)) + missing_ann <- setdiff(required_ann, colnames(feature_to_compound)) if (length(missing_ann) > 0) { stop("mzmine_annotations is missing required column(s): ", paste(missing_ann, collapse = ", "), ".") } - ann[, score := suppressWarnings(as.numeric(as.character(score)))] - if (anyNA(ann$score)) { - stop("mzmine_annotations$score must be numeric (or coercible to numeric).") + feature_to_compound[, score := suppressWarnings(as.numeric(score))] + if (anyNA(feature_to_compound$score)) { + stop("The 'score' column in the mzmine annotations file must contain numeric values.") } - 
data.table::setorder(ann, id, -score) - ann_top <- unique(ann, by = "id") - matched <- ann_top[match(mz_input[[id_col]], ann_top[["id"]]), - compound_name] - compound <- ifelse(is.na(matched), mz_rt_fallback, matched) - } else { - compound <- mz_rt_fallback + # Sort by id ascending and score descending so the highest-scoring + # annotation per id is the first row in each group. + data.table::setorder(feature_to_compound, id, -score) + # Collapse to one row per id (the highest-scoring). data.table's + # unique() with a 'by' arg keeps the first row per group, which after + # the sort above is the highest-scoring annotation. + feature_to_compound <- unique(feature_to_compound, by = "id") + # Join: unmatched mz_input rows keep the mz_rt_fallback ProteinName + # set above. + mz_input[ + feature_to_compound, + ProteinName := i.compound_name, + on = setNames("id", id_col) + ] } - mz_input[, ProteinName := compound] mz_input[, PeptideSequence := as.character(get(id_col))] long <- data.table::melt( mz_input, id.vars = c("ProteinName", "PeptideSequence"), measure.vars = peak_area_cols, - variable.name = "sample_col", + variable.name = "Run", value.name = "Intensity", variable.factor = FALSE) long[, PrecursorCharge := NA_integer_] long[, FragmentIon := NA_character_] long[, ProductCharge := NA_integer_] - long[, IsotopeLabelType := "Light"] - long[, Run := sub(paste0(peak_area_suffix, "$"), "", sample_col)] + long[, Run := sub(paste0(peak_area_suffix, "$"), "", Run)] final_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge", - "FragmentIon", "ProductCharge", "IsotopeLabelType", + "FragmentIon", "ProductCharge", "Run", "Intensity") long <- long[, final_cols, with = FALSE] diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R index 3c340b8e6..ae7a517ad 100644 --- a/R/converters_MZMinetoMSstatsFormat.R +++ b/R/converters_MZMinetoMSstatsFormat.R @@ -17,10 +17,6 @@ #' features without a matching annotation row fall back to an mz_rt 
string #' `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature #' uses the mz_rt fallback. -#' @param removeProtein_with1Feature `TRUE` will remove proteins (compounds) -#' represented by a single feature. Default `FALSE`. -#' @param summaryforMultipleRows `max` (default) or `sum` — used when multiple -#' rows map to the same feature/run combination. #' #' @return data.table in the MSstats required format. #' @@ -60,10 +56,7 @@ MZMinetoMSstatsFormat = function( input, mzmine_annotations = mzmine_annotations) annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation) - feature_columns = c("PeptideSequence", "PrecursorCharge", - "FragmentIon", "ProductCharge") - fill_isotope_label_type = if ("IsotopeLabelType" %in% colnames(input)) - list() else list("IsotopeLabelType" = "Light") + feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge") input = MSstatsConvert::MSstatsPreprocess( input, @@ -77,7 +70,7 @@ MZMinetoMSstatsFormat = function( feature_cleaning = list( remove_features_with_few_measurements = FALSE, summarize_multiple_psms = summaryforMultipleRows), - columns_to_fill = c(list(Fraction = 1), fill_isotope_label_type)) + columns_to_fill = list(Fraction = 1, IsotopeLabelType = "Light")) input[, Intensity := ifelse(Intensity == 0, NA, Intensity)] input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns, diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd index 5f753215b..3ce9cc9b2 100644 --- a/man/MZMinetoMSstatsFormat.Rd +++ b/man/MZMinetoMSstatsFormat.Rd @@ -37,11 +37,9 @@ features without a matching annotation row fall back to an mz_rt string \code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature uses the mz_rt fallback.} -\item{removeProtein_with1Feature}{\code{TRUE} will remove proteins (compounds) -represented by a single feature. 
Default \code{FALSE}.} +\item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.} -\item{summaryforMultipleRows}{\code{max} (default) or \code{sum} — used when multiple -rows map to the same feature/run combination.} +\item{summaryforMultipleRows}{max or sum - when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. Default is max for label-free converters and sum for TMT converters.} \item{use_log_file}{logical. If TRUE, information about data processing will be saved to a file.} From e4d8966a824a4dbbaf7a13c35fb2f04c9caa5b2f Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Wed, 27 May 2026 15:08:19 -0400 Subject: [PATCH 5/6] Require mzmine_annotations; inner-join filter - Error on NULL/missing mzmine_annotations - Drop quant-only unmatched features (no mz_rt fallback) - Log retained feature IDs after join - Update tests, vignette, and roxygen for filtering + MSI Level 2 scope - Remove unused .cleanRawMZMine metadata requirements removeProtein_with1Feature default unchanged (FALSE). 
--- DESCRIPTION | 2 +- R/clean_MZMine.R | 89 +++++++++---------- R/converters_MZMinetoMSstatsFormat.R | 26 ++++-- .../test_converters_MZMinetoMSstatsFormat.R | 55 +++++------- man/MSstatsClean.Rd | 16 ++-- man/MZMinetoMSstatsFormat.Rd | 21 +++-- man/dot-cleanRawMZMine.Rd | 19 ++-- vignettes/msstats_data_format.Rmd | 32 ++++--- 8 files changed, 136 insertions(+), 124 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fd7c2e4f4..073329f4c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -83,4 +83,4 @@ Collate: 'utils_fractions.R' 'utils_logging.R' 'utils_shared_peptides.R' -VignetteBuilder: knitr \ No newline at end of file +VignetteBuilder: knitr diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index c714a3c72..ab9d559ec 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -2,20 +2,21 @@ #' #' Operates on the column names produced by MZMine after MSstatsConvert's #' internal column-name standardization (spaces collapsed and dots removed): -#' "row ID" becomes `rowID`, "row m/z" becomes `rowmz`, "row retention time" -#' becomes `rowretentiontime`, and each " Peak area" becomes +#' "row ID" becomes `rowID`, and each " Peak area" becomes #' `Peakarea`. #' #' @param msstats_object an object of class `MSstatsMZMineFiles`. -#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library -#' annotations with columns `id`, `compound_name`, `score`. When supplied, -#' the highest-scoring `compound_name` per feature is used as `ProteinName`. -#' Features without a matching annotation row fall back to an mz_rt string -#' `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature -#' uses the mz_rt fallback. +#' @param mzmine_annotations `data.frame` of MZMine spectral-library +#' annotations with columns `id`, `compound_name`, `score`. Required; +#' passing `NULL` raises an error. 
The highest-scoring `compound_name` +#' per feature is used as `ProteinName`, and features in the quant +#' table with no matching annotation row are dropped from the output. +#' These are MSI Level 2 annotations (putative identification via +#' MS/MS spectral matching). See the public `MZMinetoMSstatsFormat` +#' docstring for the full scope discussion. #' @return data.table #' @keywords internal -.cleanRawMZMine <- function(msstats_object, mzmine_annotations = NULL) { +.cleanRawMZMine <- function(msstats_object, mzmine_annotations) { ProteinName = PeptideSequence = Intensity = Run = NULL PrecursorCharge = FragmentIon = ProductCharge = NULL id = score = compound_name = i.compound_name = NULL @@ -31,47 +32,45 @@ "columns named ' Peak area' (e.g. 'sampleA.mzML Peak area').") } id_col <- "rowID" - mz_col <- "rowmz" - rt_col <- "rowretentiontime" - required_meta <- c(id_col, mz_col, rt_col) + required_meta <- id_col missing_meta <- setdiff(required_meta, colnames(mz_input)) if (length(missing_meta) > 0) { - stop("Missing required MZMine metadata column(s) (expected 'row ID', ", - "'row m/z', 'row retention time'). After standardization, ", - "looked for: ", paste(missing_meta, collapse = ", "), ".") + stop("Missing required MZMine metadata column (expected 'row ID'). 
", + "After standardization, looked for: ", + paste(missing_meta, collapse = ", "), ".") } - mz_rt_fallback <- paste0(round(mz_input[[mz_col]], 4), "_", - round(mz_input[[rt_col]], 2)) - mz_input[, ProteinName := mz_rt_fallback] - - if (!is.null(mzmine_annotations)) { - feature_to_compound <- data.table::as.data.table(mzmine_annotations) - required_ann <- c("id", "compound_name", "score") - missing_ann <- setdiff(required_ann, colnames(feature_to_compound)) - if (length(missing_ann) > 0) { - stop("mzmine_annotations is missing required column(s): ", - paste(missing_ann, collapse = ", "), ".") - } - feature_to_compound[, score := suppressWarnings(as.numeric(score))] - if (anyNA(feature_to_compound$score)) { - stop("The 'score' column in the mzmine annotations file must contain numeric values.") - } - # Sort by id ascending and score descending so the highest-scoring - # annotation per id is the first row in each group. - data.table::setorder(feature_to_compound, id, -score) - # Collapse to one row per id (the highest-scoring). data.table's - # unique() with a 'by' arg keeps the first row per group, which after - # the sort above is the highest-scoring annotation. - feature_to_compound <- unique(feature_to_compound, by = "id") - # Join: unmatched mz_input rows keep the mz_rt_fallback ProteinName - # set above. - mz_input[ - feature_to_compound, - ProteinName := i.compound_name, - on = setNames("id", id_col) - ] + if (is.null(mzmine_annotations)) { + stop("mzmine_annotations is required. 
Pass a data.frame with ", + "columns 'id', 'compound_name', 'score'.") + } + feature_to_compound <- data.table::as.data.table(mzmine_annotations) + required_ann <- c("id", "compound_name", "score") + missing_ann <- setdiff(required_ann, colnames(feature_to_compound)) + if (length(missing_ann) > 0) { + stop("mzmine_annotations is missing required column(s): ", + paste(missing_ann, collapse = ", "), ".") } + feature_to_compound[, score := suppressWarnings(as.numeric(score))] + if (anyNA(feature_to_compound$score)) { + stop("The 'score' column in the mzmine annotations file must contain numeric values.") + } + data.table::setorder(feature_to_compound, id, -score) + feature_to_compound <- unique(feature_to_compound, by = "id") + # Inner-join filter: drop quant rows with no matching annotation. + mz_input[ + feature_to_compound, + ProteinName := i.compound_name, + on = setNames("id", id_col) + ] + mz_input <- mz_input[!is.na(ProteinName)] + + retained_ids <- feature_to_compound$id + retained_msg <- paste0("** MZMine: retained ", length(retained_ids), + " feature(s) after annotation join: ", + paste(retained_ids, collapse = ", ")) + getOption("MSstatsLog")("INFO", retained_msg) + getOption("MSstatsMsg")("INFO", retained_msg) mz_input[, PeptideSequence := as.character(get(id_col))] diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R index ae7a517ad..4e3df158b 100644 --- a/R/converters_MZMinetoMSstatsFormat.R +++ b/R/converters_MZMinetoMSstatsFormat.R @@ -11,12 +11,19 @@ #' trailing `"Peakarea"` suffix removed. For example, a quant-file column #' `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization, #' so the corresponding `Run` value must be `sampleAmzML`. -#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library -#' annotations with columns `id`, `compound_name`, `score`. 
When supplied, -#' the highest-scoring `compound_name` per feature is used as `ProteinName`; -#' features without a matching annotation row fall back to an mz_rt string -#' `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature -#' uses the mz_rt fallback. +#' @param mzmine_annotations `data.frame` of MZMine spectral-library +#' annotations with columns `id`, `compound_name`, `score`. Required: +#' the highest-scoring `compound_name` per feature is used as +#' `ProteinName`, and features in the quant table with no matching +#' annotation row are dropped from the output. +#' +#' These are MSI Level 2 annotations (putative identification via +#' MS/MS spectral matching against a reference library). Higher- +#' confidence Level 1 identifications require pure reference standards +#' and are out of scope here. Lower-confidence annotations such as +#' Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via +#' CANOPUS) are not currently supported -- features without a Level 2 +#' annotation row are filtered out. #' #' @return data.table in the MSstats required format. #' @@ -39,7 +46,7 @@ MZMinetoMSstatsFormat = function( input, annotation = NULL, - mzmine_annotations = NULL, + mzmine_annotations, removeProtein_with1Feature = FALSE, summaryforMultipleRows = max, use_log_file = TRUE, @@ -50,6 +57,11 @@ MZMinetoMSstatsFormat = function( MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose, log_file_path) + if (missing(mzmine_annotations) || is.null(mzmine_annotations)) { + stop("mzmine_annotations is required. Pass a data.frame with ", + "columns 'id', 'compound_name', 'score'.") + } + input = MSstatsConvert::MSstatsImport(list(input = input), "MSstats", "MZMine", ...) 
input = MSstatsConvert::MSstatsClean( diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R index 6d0fff53a..dcfccf904 100644 --- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R +++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R @@ -15,9 +15,10 @@ output = MZMinetoMSstatsFormat(input, annotation = annot, use_log_file = FALSE) output_dt = data.table::as.data.table(output) -# Basic structure: 6 features x 4 runs = 24 rows, 11 standard columns +# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns +# Features 4 and 5 have no annotation row and are dropped by the inner join. expect_equal(ncol(output), 11) -expect_equal(nrow(output), 24) +expect_equal(nrow(output), 16) expect_true("Run" %in% colnames(output)) expect_true("ProteinName" %in% colnames(output)) expect_true("PeptideSequence" %in% colnames(output)) @@ -53,22 +54,17 @@ expect_equal(as.character(feature3_proteins), "Lactate") feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName]) expect_equal(as.character(feature6_proteins), "Caffeine") -# Features without annotation rows fall back to the mz_rt string -feature4_proteins = unique(output_dt[PeptideSequence == "4", ProteinName]) -expect_equal(as.character(feature4_proteins), "489.334_7.89") -feature5_proteins = unique(output_dt[PeptideSequence == "5", ProteinName]) -expect_equal(as.character(feature5_proteins), "555.447_9.1") +# Features absent from the annotations file are filtered out (no mz_rt fallback) +expect_false("4" %in% as.character(output_dt$PeptideSequence)) +expect_false("5" %in% as.character(output_dt$PeptideSequence)) +expect_false(any(as.character(output_dt$ProteinName) %in% + c("489.334_7.89", "555.447_9.1"))) # Zero-intensity input cells are converted to NA in output -# Feature 3 sampleB = 0 -> NA +# Feature 3 sampleB = 0 -> NA (feature 3 is annotated as Lactate) feature3_sampleB_int = output_dt[PeptideSequence == "3" 
& Run == "sampleBmzML", Intensity] expect_true(is.na(feature3_sampleB_int)) -# Feature 5 sampleB/C/D all = 0 -> NA -feature5_zero_ints = output_dt[PeptideSequence == "5" & - Run %in% c("sampleBmzML", "sampleCmzML", "sampleDmzML"), - Intensity] -expect_true(all(is.na(feature5_zero_ints))) # Annotation merges correctly: sampleA is Control rep 1 sampleA_cond = unique(output_dt[Run == "sampleAmzML", Condition]) @@ -86,27 +82,24 @@ feature2_sampleC_int = output_dt[PeptideSequence == "2" & Run == "sampleCmzML", Intensity] expect_equal(as.numeric(feature2_sampleC_int), 5200) -# Without mzmine_annotations ------------------------------------------------- -output_nolib = MZMinetoMSstatsFormat(input, annotation = annot, - mzmine_annotations = NULL, - use_log_file = FALSE) -output_nolib_dt = data.table::as.data.table(output_nolib) - -# Every ProteinName is the mz_rt fallback string -expect_equal(ncol(output_nolib), 11) -expect_equal(nrow(output_nolib), 24) -expected_mz_rt = c("123.056_1.23", "245.129_3.45", "367.201_5.67", - "489.334_7.89", "555.447_9.1", "123.056_1.45") -expect_equal( - sort(unique(as.character(output_nolib_dt$ProteinName))), - sort(expected_mz_rt) +# mzmine_annotations is mandatory -------------------------------------------- +# Passing NULL must raise an error (no silent mz_rt fallback) +expect_error( + MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = NULL, + use_log_file = FALSE), + "mzmine_annotations is required" +) +# Omitting the argument entirely must also raise an error +expect_error( + MZMinetoMSstatsFormat(input, annotation = annot, + use_log_file = FALSE), + "mzmine_annotations is required" ) -# Compound names from the library must not leak in -expect_false(any(as.character(output_nolib_dt$ProteinName) %in% - c("Caffeine", "GlucoseHigh", "GlucoseLow", "Lactate"))) # removeProtein_with1Feature filters non-Caffeine proteins ------------------- -# Caffeine has 2 features (PeptideSequence "1" and "6"); all others have 1. 
+# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6); +# Lactate and Glucose each have 1. output_filtered = MZMinetoMSstatsFormat(input, annotation = annot, mzmine_annotations = mzmine_ann, removeProtein_with1Feature = TRUE, diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index 0809a567b..8863b4ea7 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -82,7 +82,7 @@ MSstatsClean(msstats_object, ...) \S4method{MSstatsClean}{MSstatsProteinProspectorFiles}(msstats_object) -\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations = NULL) +\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations) } \arguments{ \item{msstats_object}{object that inherits from \code{MSstatsInputFiles} class.} @@ -201,12 +201,14 @@ The SILAC suffix is then stripped from \code{PeptideSequence}. When \code{NULL} (default), protein-turnover mode is disabled and all peptides receive \code{IsotopeLabelType = "Light"}.} -\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library -annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied, -the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}. -Features without a matching annotation row fall back to an mz_rt string -\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature -uses the mz_rt fallback.} +\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; +passing \code{NULL} raises an error. The highest-scoring \code{compound_name} +per feature is used as \code{ProteinName}, and features in the quant +table with no matching annotation row are dropped from the output. +These are MSI Level 2 annotations (putative identification via +MS/MS spectral matching). 
See the public \code{MZMinetoMSstatsFormat} +docstring for the full scope discussion.} } \value{ data.table diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd index 3ce9cc9b2..b3fdcfe11 100644 --- a/man/MZMinetoMSstatsFormat.Rd +++ b/man/MZMinetoMSstatsFormat.Rd @@ -7,7 +7,7 @@ MZMinetoMSstatsFormat( input, annotation = NULL, - mzmine_annotations = NULL, + mzmine_annotations, removeProtein_with1Feature = FALSE, summaryforMultipleRows = max, use_log_file = TRUE, @@ -30,12 +30,19 @@ trailing \code{"Peakarea"} suffix removed. For example, a quant-file column \code{"sampleA.mzML Peak area"} becomes \code{"sampleAmzML"} after standardization, so the corresponding \code{Run} value must be \code{sampleAmzML}.} -\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library -annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied, -the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}; -features without a matching annotation row fall back to an mz_rt string -\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature -uses the mz_rt fallback.} +\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. Required: +the highest-scoring \code{compound_name} per feature is used as +\code{ProteinName}, and features in the quant table with no matching +annotation row are dropped from the output. + +These are MSI Level 2 annotations (putative identification via +MS/MS spectral matching against a reference library). Higher- +confidence Level 1 identifications require pure reference standards +and are out of scope here. 
Lower-confidence annotations such as +Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via +CANOPUS) are not currently supported -- features without a Level 2 +annotation row are filtered out.} \item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.} diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd index 82794933a..8c93db083 100644 --- a/man/dot-cleanRawMZMine.Rd +++ b/man/dot-cleanRawMZMine.Rd @@ -4,17 +4,19 @@ \alias{.cleanRawMZMine} \title{Clean raw MZMine files} \usage{ -.cleanRawMZMine(msstats_object, mzmine_annotations = NULL) +.cleanRawMZMine(msstats_object, mzmine_annotations) } \arguments{ \item{msstats_object}{an object of class \code{MSstatsMZMineFiles}.} -\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library -annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied, -the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}. -Features without a matching annotation row fall back to an mz_rt string -\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature -uses the mz_rt fallback.} +\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; +passing \code{NULL} raises an error. The highest-scoring \code{compound_name} +per feature is used as \code{ProteinName}, and features in the quant +table with no matching annotation row are dropped from the output. +These are MSI Level 2 annotations (putative identification via +MS/MS spectral matching). 
See the public \code{MZMinetoMSstatsFormat} +docstring for the full scope discussion.} } \value{ data.table @@ -22,8 +24,7 @@ data.table \description{ Operates on the column names produced by MZMine after MSstatsConvert's internal column-name standardization (spaces collapsed and dots removed): -"row ID" becomes \code{rowID}, "row m/z" becomes \code{rowmz}, "row retention time" -becomes \code{rowretentiontime}, and each "\if{html}{\out{}} Peak area" becomes +"row ID" becomes \code{rowID}, and each "\if{html}{\out{}} Peak area" becomes \verb{Peakarea}. } \keyword{internal} diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd index 78e92ff5b..ea8f0b729 100644 --- a/vignettes/msstats_data_format.Rmd +++ b/vignettes/msstats_data_format.Rmd @@ -336,11 +336,18 @@ It takes the wide-format feature-quantification table exported by MZMine (one ro per feature, one ` Peak area` column per sample) together with a standard MSstats annotation and produces an MSstats-ready long-format `data.table`. -Optionally, an MZMine spectral-library annotation table (with `id`, `compound_name`, -`score` columns) can be supplied; the highest-scoring `compound_name` per feature -is used as `ProteinName`. Features with no library match fall back to an mz_rt -string of the form `paste0(round(mz, 4), "_", round(rt, 2))`. When no library -annotation is supplied, every feature uses the mz_rt fallback. +An MZMine spectral-library annotation table with `id`, `compound_name`, and +`score` columns is **required**. The highest-scoring `compound_name` per feature +is used as `ProteinName`. Features in the quant table with no matching annotation +row are dropped from the output — there is no synthesized mz_rt fallback, +because placeholder identifiers inflate the hypothesis count for downstream +`groupComparison` without biological signal. 
+ +These are [MSI Level 2 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/) +(putative identification via MS/MS spectral matching against a reference library). +Lower-confidence annotation sources — SIRIUS / MS2Query (Level 3) and CANOPUS +(Level 4) — are out of scope for this iteration; features without a Level 2 +annotation row are filtered out. ```{r mzmine} mzmine_input = data.table::fread(system.file( @@ -356,23 +363,14 @@ mzmine_library = data.table::fread(system.file( package = "MSstatsConvert" )) -# With a spectral-library annotation: ProteinName is the matched compound name -mzmine_with_lib = MZMinetoMSstatsFormat( +# ProteinName comes from the matched compound_name; unannotated features are dropped +mzmine_converted = MZMinetoMSstatsFormat( mzmine_input, annotation = mzmine_annotation, mzmine_annotations = mzmine_library, use_log_file = FALSE ) -head(mzmine_with_lib) - -# Without a library: ProteinName is an mz_rt fallback string -mzmine_no_lib = MZMinetoMSstatsFormat( - mzmine_input, - annotation = mzmine_annotation, - mzmine_annotations = NULL, - use_log_file = FALSE -) -head(mzmine_no_lib) +head(mzmine_converted) ``` Since metabolomics features do not carry peptide-level identifiers, `PeptideSequence` From 91feff31e7b00657eb5cdfd8f8569631448f72b5 Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Wed, 27 May 2026 15:55:46 -0400 Subject: [PATCH 6/6] Replace <- with = assignment operator in cleanRawMZMine function --- R/clean_MZMine.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index ab9d559ec..cee0cf050 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -21,8 +21,8 @@ PrecursorCharge = FragmentIon = ProductCharge = NULL id = score = compound_name = i.compound_name = NULL - mz_input <- getInputFile(msstats_object, "input") - mz_input <- data.table::as.data.table(mz_input) + mz_input = getInputFile(msstats_object, "input") + mz_input = data.table::as.data.table(mz_input) 
peak_area_suffix <- "Peakarea" peak_area_cols <- grep(paste0(peak_area_suffix, "$"),