diff --git a/.Rbuildignore b/.Rbuildignore index 613e0b649..37ad74244 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,5 @@ ^pkgdown$ ^\.positai$ ^\.claude$ +^doc$ +^Meta$ diff --git a/.gitignore b/.gitignore index 6d24a0fdc..c796408d5 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ inst/doc .lintr .vscode .positai +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index af973548e..073329f4c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,6 +35,7 @@ Suggests: rmarkdown LinkingTo: Rcpp Collate: + 'clean_MZMine.R' 'clean_ProteinProspector.R' 'clean_Metamorpheus.R' 'clean_DIANN.R' @@ -53,6 +54,7 @@ Collate: 'converters_DIANNtoMSstatsFormat.R' 'converters_DIAUmpiretoMSstatsFormat.R' 'converters_FragPipetoMSstatsFormat.R' + 'converters_MZMinetoMSstatsFormat.R' 'converters_MaxQtoMSstatsFormat.R' 'converters_MaxQtoMSstatsTMTFormat.R' 'converters_MetamorpheusToMSstatsFormat.R' diff --git a/NAMESPACE b/NAMESPACE index cc2cfa210..e51cffd66 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,7 @@ export(MSstatsLogsSettings) export(MSstatsMakeAnnotation) export(MSstatsPreprocess) export(MSstatsSaveSessionInfo) +export(MZMinetoMSstatsFormat) export(MaxQtoMSstatsFormat) export(MaxQtoMSstatsTMTFormat) export(MetamorpheusToMSstatsFormat) diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R index 3d9ee7dc3..7e4c77ae3 100644 --- a/R/MSstatsConvert_core_functions.R +++ b/R/MSstatsConvert_core_functions.R @@ -71,6 +71,10 @@ setClass("MSstatsMetamorpheusFiles", contains = "MSstatsInputFiles") #' @rdname MSstatsInputFiles #' @keywords internal setClass("MSstatsProteinProspectorFiles", contains = "MSstatsInputFiles") +#' MSstatsMZMineFiles: class for MZMine files. +#' @rdname MSstatsInputFiles +#' @keywords internal +setClass("MSstatsMZMineFiles", contains = "MSstatsInputFiles") #' Get one of files contained in an instance of `MSstatsInputFiles` class. 
@@ -291,8 +295,15 @@ setMethod("MSstatsClean", signature = "MSstatsMetamorpheusFiles", #' @rdname MSstatsClean #' @inheritParams .cleanRawProteinProspector #' @return data.table -setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles", +setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles", .cleanRawProteinProspector) +#' Clean MZMine files +#' @include clean_MZMine.R +#' @rdname MSstatsClean +#' @inheritParams .cleanRawMZMine +#' @return data.table +setMethod("MSstatsClean", signature = "MSstatsMZMineFiles", + .cleanRawMZMine) #' Preprocess outputs from MS signal processing tools for analysis with MSstats
diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
new file mode 100644
index 000000000..cee0cf050
--- /dev/null
+++ b/R/clean_MZMine.R
@@ -0,0 +1,123 @@
+#' Clean raw MZMine files
+#'
+#' Operates on the column names produced by MZMine after MSstatsConvert's
+#' internal column-name standardization (spaces collapsed and dots removed):
+#' "row ID" becomes `rowID`, and each "<sample> Peak area" becomes
+#' `<sample>Peakarea`.
+#'
+#' Raises an error when the quant table has no peak-area or `rowID` column,
+#' when `mzmine_annotations` is `NULL`, lacks a required column or has a
+#' non-numeric `score`, or when no feature matches an annotation row.
+#'
+#' @param msstats_object an object of class `MSstatsMZMineFiles`.
+#' @param mzmine_annotations `data.frame` of MZMine spectral-library
+#' annotations with columns `id`, `compound_name`, `score`. Required;
+#' passing `NULL` raises an error. The highest-scoring `compound_name`
+#' per feature is used as `ProteinName`, and features in the quant
+#' table with no matching annotation row are dropped from the output.
+#' These are MSI Level 2 annotations (putative identification via
+#' MS/MS spectral matching). See the public `MZMinetoMSstatsFormat`
+#' docstring for the full scope discussion.
+#' @return data.table
+#' @keywords internal
+.cleanRawMZMine <- function(msstats_object, mzmine_annotations) {
+    # NULL bindings keep R CMD check quiet about data.table column names
+    # that are used unquoted inside the data.table calls below.
+    ProteinName = PeptideSequence = Intensity = Run = NULL
+    PrecursorCharge = FragmentIon = ProductCharge = NULL
+    id = score = NULL
+
+    mz_input = getInputFile(msstats_object, "input")
+    mz_input = data.table::as.data.table(mz_input)
+
+    # Per-sample intensity columns are recognized by their standardized
+    # "Peakarea" suffix; the text before the suffix becomes the run name.
+    peak_area_suffix <- "Peakarea"
+    peak_area_pattern <- paste0(peak_area_suffix, "$")
+    peak_area_cols <- grep(peak_area_pattern, colnames(mz_input),
+                           value = TRUE)
+    if (length(peak_area_cols) == 0) {
+        stop("No 'Peak area' columns found in the input. Expected per-sample ",
+             "columns named '<sample> Peak area' ",
+             "(e.g. 'sampleA.mzML Peak area').")
+    }
+    id_col <- "rowID"
+    required_meta <- id_col
+    missing_meta <- setdiff(required_meta, colnames(mz_input))
+    if (length(missing_meta) > 0) {
+        stop("Missing required MZMine metadata column (expected 'row ID'). ",
+             "After standardization, looked for: ",
+             paste(missing_meta, collapse = ", "), ".")
+    }
+
+    if (is.null(mzmine_annotations)) {
+        stop("mzmine_annotations is required. Pass a data.frame with ",
+             "columns 'id', 'compound_name', 'score'.")
+    }
+    feature_to_compound <- data.table::as.data.table(mzmine_annotations)
+    required_ann <- c("id", "compound_name", "score")
+    missing_ann <- setdiff(required_ann, colnames(feature_to_compound))
+    if (length(missing_ann) > 0) {
+        stop("mzmine_annotations is missing required column(s): ",
+             paste(missing_ann, collapse = ", "), ".")
+    }
+    # Coercion warnings are suppressed on purpose: values that fail to
+    # convert become NA and are reported by the explicit check below.
+    feature_to_compound[, score := suppressWarnings(as.numeric(score))]
+    if (anyNA(feature_to_compound$score)) {
+        stop("The 'score' column in the mzmine annotations file must ",
+             "contain numeric values.")
+    }
+    # Sort best score first within each id, then keep that first row.
+    data.table::setorder(feature_to_compound, id, -score)
+    feature_to_compound <- unique(feature_to_compound, by = "id")
+
+    # Inner-join filter: drop quant rows with no matching annotation.
+    # match() yields one annotation row index (or NA) per quant row, so
+    # ProteinName is filled for every row; it also coerces both id vectors
+    # to a common type and needs no unqualified stats::setNames() call.
+    n_quant_features <- nrow(mz_input)
+    annotation_row <- match(mz_input[[id_col]], feature_to_compound$id)
+    matched_compound <- feature_to_compound$compound_name[annotation_row]
+    mz_input[, ProteinName := matched_compound]
+    mz_input <- mz_input[!is.na(ProteinName)]
+    if (nrow(mz_input) == 0) {
+        stop("None of the ", n_quant_features, " feature(s) in the input ",
+             "has a matching 'id' in mzmine_annotations.")
+    }
+
+    # Log the ids that survived the filter. They are read from the filtered
+    # quant table because the annotation table may list ids that do not
+    # occur in the input.
+    retained_ids <- mz_input[[id_col]]
+    retained_msg <- paste0("** MZMine: retained ", length(retained_ids),
+                           " feature(s) after annotation join: ",
+                           paste(retained_ids, collapse = ", "))
+    getOption("MSstatsLog")("INFO", retained_msg)
+    getOption("MSstatsMsg")("INFO", retained_msg)
+
+    mz_input[, PeptideSequence := as.character(get(id_col))]
+
+    # Wide to long: one row per (feature, run) pair.
+    long <- data.table::melt(
+        mz_input,
+        id.vars = c("ProteinName", "PeptideSequence"),
+        measure.vars = peak_area_cols,
+        variable.name = "Run",
+        value.name = "Intensity",
+        variable.factor = FALSE)
+
+    # Charge and fragment columns are not applicable to these features.
+    long[, PrecursorCharge := NA_integer_]
+    long[, FragmentIon := NA_character_]
+    long[, ProductCharge := NA_integer_]
+    long[, Run := sub(peak_area_pattern, "", Run)]
+
+    final_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
+                    "FragmentIon", "ProductCharge",
+                    "Run", "Intensity")
+    long <- long[, final_cols, with = FALSE]
+
+    .logSuccess("MZMine", "clean")
+    long
+}
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
new file mode 100644
index 000000000..4e3df158b
--- /dev/null
+++ b/R/converters_MZMinetoMSstatsFormat.R
@@ -0,0 +1,99 @@
+#' Import MZMine files
+#'
+#' @inheritParams .sharedParametersAmongConverters
+#' @param input MZMine feature-quantification table (wide format; one row per
+#' feature). Must include the metadata columns `row ID`, `row m/z`,
+#' `row retention time`, and per-sample peak-area columns named
+#' `"<sample> Peak area"` (e.g. `"sampleA.mzML Peak area"`).
+#' @param annotation `data.frame` with columns `Run`, `Condition`,
+#' `BioReplicate`. `Run` values must match MSstatsConvert-standardized sample
+#' names (after column-name normalization removes spaces and dots) with the
+#' trailing `"Peakarea"` suffix removed. 
For example, a quant-file column
+#' `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization,
+#' so the corresponding `Run` value must be `sampleAmzML`.
+#' @param mzmine_annotations `data.frame` of MZMine spectral-library
+#' annotations with columns `id`, `compound_name`, `score`. Required:
+#' the highest-scoring `compound_name` per feature is used as
+#' `ProteinName`, and features in the quant table with no matching
+#' annotation row are dropped from the output.
+#'
+#' These are MSI Level 2 annotations (putative identification via
+#' MS/MS spectral matching against a reference library). Higher-
+#' confidence Level 1 identifications require pure reference standards
+#' and are out of scope here. Lower-confidence annotations such as
+#' Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via
+#' CANOPUS) are not currently supported -- features without a Level 2
+#' annotation row are filtered out.
+#'
+#' @return data.table in the MSstats required format.
+#'
+#' @export
+#'
+#' @examples
+#' input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
+#'                          package = "MSstatsConvert")
+#' annot_path = system.file("tinytest/raw_data/MZMine/annotation.csv",
+#'                          package = "MSstatsConvert")
+#' lib_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv",
+#'                        package = "MSstatsConvert")
+#' input = data.table::fread(input_path)
+#' annot = data.table::fread(annot_path)
+#' lib = data.table::fread(lib_path)
+#' output = MZMinetoMSstatsFormat(input, annotation = annot,
+#'                                mzmine_annotations = lib,
+#'                                use_log_file = FALSE)
+#' head(output)
+MZMinetoMSstatsFormat = function(
+        input, annotation = NULL, mzmine_annotations,
+        removeProtein_with1Feature = FALSE, summaryforMultipleRows = max,
+        use_log_file = TRUE, append = FALSE, verbose = TRUE,
+        log_file_path = NULL, ...) {
+    # Logging is configured first so that later steps can report through it.
+    MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
+                                        log_file_path)
+
+    # Fail before any import work when the annotation table is absent.
+    annotations_absent = missing(mzmine_annotations) ||
+        is.null(mzmine_annotations)
+    if (annotations_absent) {
+        stop("mzmine_annotations is required. Pass a data.frame with ",
+             "columns 'id', 'compound_name', 'score'.")
+    }
+
+    # Import -> clean (wide to long, compound names) -> run annotation.
+    imported = MSstatsConvert::MSstatsImport(list(input = input),
+                                             "MSstats", "MZMine", ...)
+    cleaned = MSstatsConvert::MSstatsClean(
+        imported, mzmine_annotations = mzmine_annotations)
+    run_annotation = MSstatsConvert::MSstatsMakeAnnotation(cleaned, annotation)
+
+    feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge")
+
+    processed = MSstatsConvert::MSstatsPreprocess(
+        cleaned,
+        run_annotation,
+        feature_columns,
+        remove_shared_peptides = FALSE,
+        remove_single_feature_proteins = removeProtein_with1Feature,
+        exact_filtering = NULL,
+        pattern_filtering = NULL,
+        aggregate_isotopic = FALSE,
+        feature_cleaning = list(
+            remove_features_with_few_measurements = FALSE,
+            summarize_multiple_psms = summaryforMultipleRows),
+        columns_to_fill = list(Fraction = 1, IsotopeLabelType = "Light"))
+    # A zero peak area is recorded as a missing value.
+    processed[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
+
+    balanced = MSstatsConvert::MSstatsBalancedDesign(
+        processed, feature_columns, fill_incomplete = TRUE,
+        handle_fractions = FALSE, remove_few = FALSE)
+
+    msg_final = paste("** Finished preprocessing. The dataset is ready",
+                      "to be processed by the dataProcess function.")
+    for (log_option in c("MSstatsLog", "MSstatsMsg")) {
+        getOption(log_option)("INFO", msg_final)
+    }
+    getOption("MSstatsLog")("INFO", "\n")
+    balanced
+}
diff --git a/inst/tinytest/raw_data/MZMine/annotation.csv b/inst/tinytest/raw_data/MZMine/annotation.csv
new file mode 100644
index 000000000..f28f4a863
--- /dev/null
+++ b/inst/tinytest/raw_data/MZMine/annotation.csv
@@ -0,0 +1,5 @@
+Run,Condition,BioReplicate
+sampleA.mzML,Control,1
+sampleB.mzML,Control,2
+sampleC.mzML,Treatment,3
+sampleD.mzML,Treatment,4
diff --git a/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv b/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv
new file mode 100644
index 000000000..a2c38a57d
--- /dev/null
+++ b/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv
@@ -0,0 +1,6 @@
+id,compound_name,score,adduct
+1,Caffeine,0.95,[M+H]+
+2,GlucoseLow,0.72,[M+H]+
+2,GlucoseHigh,0.91,[M-H]-
+3,Lactate,0.88,[M+H]+
+6,Caffeine,0.80,[M+Na]+
diff --git a/inst/tinytest/raw_data/MZMine/mzmine_input.csv b/inst/tinytest/raw_data/MZMine/mzmine_input.csv
new file mode 100644
index 000000000..b887ed1a1
--- /dev/null
+++ b/inst/tinytest/raw_data/MZMine/mzmine_input.csv
@@ -0,0 +1,7 @@
+row ID,row m/z,row retention time,sampleA.mzML Peak area,sampleB.mzML Peak area,sampleC.mzML Peak area,sampleD.mzML Peak area
+1,123.0560,1.23,1000,1100,1200,1300
+2,245.1290,3.45,5000,4800,5200,4900
+3,367.2010,5.67,800,0,750,820
+4,489.3340,7.89,2000,2100,1900,2050
+5,555.4470,9.10,100,0,0,0
+6,123.0560,1.45,600,650,700,680
diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
new file mode 100644
index 000000000..dcfccf904
--- /dev/null
+++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
@@ -0,0 +1,113 @@
+# Test MZMinetoMSstatsFormat ---------------------------
+input_file_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
+                              package = "MSstatsConvert")
+annotation_file_path = system.file("tinytest/raw_data/MZMine/annotation.csv", + package = "MSstatsConvert") +mzmine_ann_file_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv", + package = "MSstatsConvert") +input = data.table::fread(input_file_path) +annot = data.table::fread(annotation_file_path) +mzmine_ann = data.table::fread(mzmine_ann_file_path) + +# With mzmine_annotations supplied ------------------------------------------- +output = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = mzmine_ann, + use_log_file = FALSE) +output_dt = data.table::as.data.table(output) + +# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns +# Features 4 and 5 have no annotation row and are dropped by the inner join. +expect_equal(ncol(output), 11) +expect_equal(nrow(output), 16) +expect_true("Run" %in% colnames(output)) +expect_true("ProteinName" %in% colnames(output)) +expect_true("PeptideSequence" %in% colnames(output)) +expect_true("PrecursorCharge" %in% colnames(output)) +expect_true("Intensity" %in% colnames(output)) +expect_true("FragmentIon" %in% colnames(output)) +expect_true("ProductCharge" %in% colnames(output)) +expect_true("IsotopeLabelType" %in% colnames(output)) +expect_true("Condition" %in% colnames(output)) +expect_true("BioReplicate" %in% colnames(output)) +expect_true("Fraction" %in% colnames(output)) + +# Metabolomics has no isotope labeling, so every row is "Light" +expect_true(all(output_dt$IsotopeLabelType == "Light")) + +# Charge / fragment columns are not applicable for metabolomics +expect_true(all(is.na(output_dt$PrecursorCharge))) +expect_true(all(is.na(output_dt$FragmentIon))) +expect_true(all(is.na(output_dt$ProductCharge))) + +# Fraction is filled with the constant 1 by the converter +expect_true(all(output_dt$Fraction == 1)) + +# Annotation join: feature 2 has two annotation rows; the higher score wins (GlucoseHigh 0.91 over GlucoseLow 0.72) +feature2_proteins = unique(output_dt[PeptideSequence == "2", ProteinName]) 
+expect_equal(as.character(feature2_proteins), "GlucoseHigh") + +# Features with exactly one annotation row take that row's compound_name +feature1_proteins = unique(output_dt[PeptideSequence == "1", ProteinName]) +expect_equal(as.character(feature1_proteins), "Caffeine") +feature3_proteins = unique(output_dt[PeptideSequence == "3", ProteinName]) +expect_equal(as.character(feature3_proteins), "Lactate") +feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName]) +expect_equal(as.character(feature6_proteins), "Caffeine") + +# Features absent from the annotations file are filtered out; no m/z_RT placeholder name is synthesized for them +expect_false("4" %in% as.character(output_dt$PeptideSequence)) +expect_false("5" %in% as.character(output_dt$PeptideSequence)) +expect_false(any(as.character(output_dt$ProteinName) %in% + c("489.334_7.89", "555.447_9.1"))) + +# Zero-intensity input cells are converted to NA in output +# Feature 3 sampleB = 0 -> NA (feature 3 is annotated as Lactate) +feature3_sampleB_int = output_dt[PeptideSequence == "3" & Run == "sampleBmzML", + Intensity] +expect_true(is.na(feature3_sampleB_int)) + +# Annotation merges correctly: sampleA is Control rep 1 +sampleA_cond = unique(output_dt[Run == "sampleAmzML", Condition]) +expect_equal(as.character(sampleA_cond), "Control") +sampleA_rep = unique(output_dt[Run == "sampleAmzML", BioReplicate]) +expect_equal(as.character(sampleA_rep), "1") +sampleC_cond = unique(output_dt[Run == "sampleCmzML", Condition]) +expect_equal(as.character(sampleC_cond), "Treatment") + +# Intensity values trace back to input +feature1_sampleA_int = output_dt[PeptideSequence == "1" & Run == "sampleAmzML", + Intensity] +expect_equal(as.numeric(feature1_sampleA_int), 1000) +feature2_sampleC_int = output_dt[PeptideSequence == "2" & Run == "sampleCmzML", + Intensity] +expect_equal(as.numeric(feature2_sampleC_int), 5200) + +# mzmine_annotations is mandatory -------------------------------------------- +# Passing NULL must raise an error (no silent mz_rt fallback) +expect_error( + 
MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = NULL, + use_log_file = FALSE), + "mzmine_annotations is required" +) +# Omitting the argument entirely must also raise an error +expect_error( + MZMinetoMSstatsFormat(input, annotation = annot, + use_log_file = FALSE), + "mzmine_annotations is required" +) + +# removeProtein_with1Feature = TRUE drops proteins that have a single feature - +# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6); +# Lactate and GlucoseHigh each have 1. +output_filtered = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = mzmine_ann, + removeProtein_with1Feature = TRUE, + use_log_file = FALSE) +output_filtered_dt = data.table::as.data.table(output_filtered) + +expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine") +# 2 features x 4 runs = 8 rows +expect_equal(nrow(output_filtered), 8) +expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))), + c("1", "6")) diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index d5559610b..8863b4ea7 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -15,6 +15,7 @@ \alias{MSstatsClean,MSstatsDIANNFiles-method} \alias{MSstatsClean,MSstatsMetamorpheusFiles-method} \alias{MSstatsClean,MSstatsProteinProspectorFiles-method} +\alias{MSstatsClean,MSstatsMZMineFiles-method} \title{Clean files generated by a signal processing tools.} \usage{ MSstatsClean(msstats_object, ...) @@ -80,6 +81,8 @@ MSstatsClean(msstats_object, ...) \S4method{MSstatsClean}{MSstatsMetamorpheusFiles}(msstats_object, MBR = TRUE, qvalue_cutoff = 0.05) \S4method{MSstatsClean}{MSstatsProteinProspectorFiles}(msstats_object) + +\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations) } \arguments{ \item{msstats_object}{object that inherits from \code{MSstatsInputFiles} class.} @@ -197,6 +200,15 @@ The SILAC suffix is then stripped from \code{PeptideSequence}. 
When \code{NULL} (default), protein-turnover mode is disabled and all peptides receive \code{IsotopeLabelType = "Light"}.} + +\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; +passing \code{NULL} raises an error. The highest-scoring \code{compound_name} +per feature is used as \code{ProteinName}, and features in the quant +table with no matching annotation row are dropped from the output. +These are MSI Level 2 annotations (putative identification via +MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat} +docstring for the full scope discussion.} } \value{ data.table @@ -223,6 +235,8 @@ data.table data.table +data.table + data.table } \description{ @@ -253,6 +267,8 @@ Clean DIA-NN files Clean Metamorpheus files Clean Protein Prospector files + +Clean MZMine files } \examples{ evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", diff --git a/man/MSstatsInputFiles.Rd b/man/MSstatsInputFiles.Rd index e09fff8de..eb4f86463 100644 --- a/man/MSstatsInputFiles.Rd +++ b/man/MSstatsInputFiles.Rd @@ -17,6 +17,7 @@ \alias{MSstatsFragPipeFiles-class} \alias{MSstatsMetamorpheusFiles-class} \alias{MSstatsProteinProspectorFiles-class} +\alias{MSstatsMZMineFiles-class} \title{Class to model files that describe a single MS dataset.} \description{ Class to model files that describe a single MS dataset. @@ -48,6 +49,8 @@ MSstatsFragPipeFiles: class for FragPipe files. MSstatsMetamorpheusFiles: class for Metamorpheus files. MSstatsProteinProspectorFiles: class for ProteinProspector files. + +MSstatsMZMineFiles: class for MZMine files. 
} \section{Slots}{ diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd new file mode 100644 index 000000000..b3fdcfe11 --- /dev/null +++ b/man/MZMinetoMSstatsFormat.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/converters_MZMinetoMSstatsFormat.R +\name{MZMinetoMSstatsFormat} +\alias{MZMinetoMSstatsFormat} +\title{Import MZMine files} +\usage{ +MZMinetoMSstatsFormat( + input, + annotation = NULL, + mzmine_annotations, + removeProtein_with1Feature = FALSE, + summaryforMultipleRows = max, + use_log_file = TRUE, + append = FALSE, + verbose = TRUE, + log_file_path = NULL, + ... +) +} +\arguments{ +\item{input}{MZMine feature-quantification table (wide format; one row per +feature). Must include the metadata columns \verb{row ID}, \verb{row m/z}, +\verb{row retention time}, and per-sample peak-area columns named +\code{" Peak area"} (e.g. \code{"sampleA.mzML Peak area"}).} + +\item{annotation}{\code{data.frame} with columns \code{Run}, \code{Condition}, +\code{BioReplicate}. \code{Run} values must match MSstatsConvert-standardized sample +names (after column-name normalization removes spaces and dots) with the +trailing \code{"Peakarea"} suffix removed. For example, a quant-file column +\code{"sampleA.mzML Peak area"} becomes \code{"sampleAmzML"} after standardization, +so the corresponding \code{Run} value must be \code{sampleAmzML}.} + +\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. Required: +the highest-scoring \code{compound_name} per feature is used as +\code{ProteinName}, and features in the quant table with no matching +annotation row are dropped from the output. + +These are MSI Level 2 annotations (putative identification via +MS/MS spectral matching against a reference library). Higher- +confidence Level 1 identifications require pure reference standards +and are out of scope here. 
Lower-confidence annotations such as +Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via +CANOPUS) are not currently supported -- features without a Level 2 +annotation row are filtered out.} + +\item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.} + +\item{summaryforMultipleRows}{max or sum - when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. Default is max for label-free converters and sum for TMT converters.} + +\item{use_log_file}{logical. If TRUE, information about data processing +will be saved to a file.} + +\item{append}{logical. If TRUE, information about data processing will be added +to an existing log file.} + +\item{verbose}{logical. If TRUE, information about data processing will be printed +to the console.} + +\item{log_file_path}{character. Path to a file to which information about +data processing will be saved. +If not provided, such a file will be created automatically. +If \code{append = TRUE}, has to be a valid path to a file.} + +\item{...}{additional parameters to \code{data.table::fread}.} +} +\value{ +data.table in the MSstats required format. 
+} +\description{ +Import MZMine files +} +\examples{ +input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv", + package = "MSstatsConvert") +annot_path = system.file("tinytest/raw_data/MZMine/annotation.csv", + package = "MSstatsConvert") +lib_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv", + package = "MSstatsConvert") +input = data.table::fread(input_path) +annot = data.table::fread(annot_path) +lib = data.table::fread(lib_path) +output = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = lib, + use_log_file = FALSE) +head(output) +} diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd new file mode 100644 index 000000000..8c93db083 --- /dev/null +++ b/man/dot-cleanRawMZMine.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_MZMine.R +\name{.cleanRawMZMine} +\alias{.cleanRawMZMine} +\title{Clean raw MZMine files} +\usage{ +.cleanRawMZMine(msstats_object, mzmine_annotations) +} +\arguments{ +\item{msstats_object}{an object of class \code{MSstatsMZMineFiles}.} + +\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library +annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; +passing \code{NULL} raises an error. The highest-scoring \code{compound_name} +per feature is used as \code{ProteinName}, and features in the quant +table with no matching annotation row are dropped from the output. +These are MSI Level 2 annotations (putative identification via +MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat} +docstring for the full scope discussion.} +} +\value{ +data.table +} +\description{ +Operates on the column names produced by MZMine after MSstatsConvert's +internal column-name standardization (spaces collapsed and dots removed): +"row ID" becomes \code{rowID}, and each "\if{html}{\out{}} Peak area" becomes +\verb{Peakarea}. 
+} +\keyword{internal} diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd index dbf77916a..ea8f0b729 100644 --- a/vignettes/msstats_data_format.Rmd +++ b/vignettes/msstats_data_format.Rmd @@ -329,3 +329,51 @@ Such a `data.frame` will be recognized by statistical processing functions from `MSstats` and `MSstatsTMT` packages as a valid input, which will allow them to skip checks and transformation necessary to fit data into this format. +# Metabolomics with MZMine + +`MZMinetoMSstatsFormat` brings untargeted metabolomics into the MSstats family. +It takes the wide-format feature-quantification table exported by MZMine (one row +per feature, one ` Peak area` column per sample) together with a standard +MSstats annotation and produces an MSstats-ready long-format `data.table`. + +An MZMine spectral-library annotation table with `id`, `compound_name`, and +`score` columns is **required**. The highest-scoring `compound_name` per feature +is used as `ProteinName`. Features in the quant table with no matching annotation +row are dropped from the output — there is no synthesized mz_rt fallback, +because placeholder identifiers inflate the hypothesis count for downstream +`groupComparison` without biological signal. + +These are [MSI Level 2 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/) +(putative identification via MS/MS spectral matching against a reference library). +Lower-confidence annotation sources — SIRIUS / MS2Query (Level 3) and CANOPUS +(Level 4) — are out of scope for this iteration; features without a Level 2 +annotation row are filtered out. 
+ +```{r mzmine} +mzmine_input = data.table::fread(system.file( + "tinytest/raw_data/MZMine/mzmine_input.csv", + package = "MSstatsConvert" +)) +mzmine_annotation = data.table::fread(system.file( + "tinytest/raw_data/MZMine/annotation.csv", + package = "MSstatsConvert" +)) +mzmine_library = data.table::fread(system.file( + "tinytest/raw_data/MZMine/mzmine_annotations.csv", + package = "MSstatsConvert" +)) + +# ProteinName comes from the matched compound_name; unannotated features are dropped +mzmine_converted = MZMinetoMSstatsFormat( + mzmine_input, + annotation = mzmine_annotation, + mzmine_annotations = mzmine_library, + use_log_file = FALSE +) +head(mzmine_converted) +``` + +Since metabolomics features do not carry peptide-level identifiers, `PeptideSequence` +holds the MZMine `row ID` (as a string), and `PrecursorCharge`, `FragmentIon`, and +`ProductCharge` are all `NA`. `IsotopeLabelType` is set to `"Light"` for every row. +