Vitek-Lab · tonywu1999 · May 27, 2026 · May 19, 2026 · May 20, 2026 · May 20, 2026
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -8,3 +8,5 @@
 ^pkgdown$
 ^\.positai$
 ^\.claude$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,5 @@ inst/doc
 .lintr
 .vscode
 .positai
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -35,6 +35,7 @@ Suggests:
     rmarkdown
 LinkingTo: Rcpp
 Collate: 
+    'clean_MZMine.R'
     'clean_ProteinProspector.R'
     'clean_Metamorpheus.R'
     'clean_DIANN.R'
@@ -53,6 +54,7 @@ Collate:
     'converters_DIANNtoMSstatsFormat.R'
     'converters_DIAUmpiretoMSstatsFormat.R'
     'converters_FragPipetoMSstatsFormat.R'
+    'converters_MZMinetoMSstatsFormat.R'
     'converters_MaxQtoMSstatsFormat.R'
     'converters_MaxQtoMSstatsTMTFormat.R'
     'converters_MetamorpheusToMSstatsFormat.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,7 @@ export(MSstatsLogsSettings)
 export(MSstatsMakeAnnotation)
 export(MSstatsPreprocess)
 export(MSstatsSaveSessionInfo)
+export(MZMinetoMSstatsFormat)
 export(MaxQtoMSstatsFormat)
 export(MaxQtoMSstatsTMTFormat)
 export(MetamorpheusToMSstatsFormat)

diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
@@ -71,6 +71,10 @@ setClass("MSstatsMetamorpheusFiles", contains = "MSstatsInputFiles")
 #' @rdname MSstatsInputFiles
 #' @keywords internal
 setClass("MSstatsProteinProspectorFiles", contains = "MSstatsInputFiles")
+#' MSstatsMZMineFiles: class for MZMine files.
+#' @rdname MSstatsInputFiles
+#' @keywords internal
+setClass("MSstatsMZMineFiles", contains = "MSstatsInputFiles")
 
 
 #' Get one of files contained in an instance of `MSstatsInputFiles` class.
@@ -291,8 +295,15 @@ setMethod("MSstatsClean", signature = "MSstatsMetamorpheusFiles",
 #' @rdname MSstatsClean
 #' @inheritParams .cleanRawProteinProspector
 #' @return data.table
-setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles", 
+setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles",
           .cleanRawProteinProspector)
+#' Clean MZMine files
+#' @include clean_MZMine.R
+#' @rdname MSstatsClean
+#' @inheritParams .cleanRawMZMine
+#' @return data.table
+setMethod("MSstatsClean", signature = "MSstatsMZMineFiles",
+          .cleanRawMZMine)
 
 
 #' Preprocess outputs from MS signal processing tools for analysis with MSstats

diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
@@ -0,0 +1,97 @@
+#' Clean raw MZMine files
+#'
+#' Operates on the column names produced by MZMine after MSstatsConvert's
+#' internal column-name standardization (spaces collapsed and dots removed):
+#' "row ID" becomes `rowID`, and each "<sample> Peak area" becomes
+#' `<standardized-sample>Peakarea`.
+#'
+#' @param msstats_object an object of class `MSstatsMZMineFiles`.
+#' @param mzmine_annotations `data.frame` of MZMine spectral-library
+#'   annotations with columns `id`, `compound_name`, `score`. Required;
+#'   passing `NULL` raises an error. The highest-scoring `compound_name`
+#'   per feature is used as `ProteinName`, and features in the quant
+#'   table with no matching annotation row are dropped from the output.
+#'   These are MSI Level 2 annotations (putative identification via
+#'   MS/MS spectral matching). See the public `MZMinetoMSstatsFormat`
+#'   docstring for the full scope discussion.
+#' @return data.table
+#' @keywords internal
+.cleanRawMZMine <- function(msstats_object, mzmine_annotations) {
+    ProteinName = PeptideSequence = Intensity = Run = NULL
+    PrecursorCharge = FragmentIon = ProductCharge = NULL
+    id = score = compound_name = i.compound_name = NULL
+
+    mz_input = data.table::as.data.table(getInputFile(msstats_object, "input"))
+
+    peak_area_suffix <- "Peakarea"
+    peak_area_cols <- grep(paste0(peak_area_suffix, "$"),
+                           colnames(mz_input), value = TRUE)
+    if (length(peak_area_cols) == 0) {
+        stop("No 'Peak area' columns found in the input. Expected per-sample ",
+             "columns named '<run> Peak area' (e.g. 'sampleA.mzML Peak area').")
+    }
+    id_col <- "rowID"
+    missing_meta <- setdiff(id_col, colnames(mz_input))
+    if (length(missing_meta) > 0) {
+        stop("Missing required MZMine metadata column (expected 'row ID'). ",
+             "After standardization, looked for: ",
+             paste(missing_meta, collapse = ", "), ".")
+    }
+
+    if (is.null(mzmine_annotations)) {
+        stop("mzmine_annotations is required. Pass a data.frame with ",
+             "columns 'id', 'compound_name', 'score'.")
+    }
+    feature_to_compound <- data.table::as.data.table(mzmine_annotations)
+    required_ann <- c("id", "compound_name", "score")
+    missing_ann <- setdiff(required_ann, colnames(feature_to_compound))
+    if (length(missing_ann) > 0) {
+        stop("mzmine_annotations is missing required column(s): ",
+             paste(missing_ann, collapse = ", "), ".")
+    }
+    feature_to_compound[, score := suppressWarnings(as.numeric(score))]
+    if (anyNA(feature_to_compound$score)) {
+        stop("The 'score' column in the mzmine annotations file must contain numeric values.")
+    }
+    # Sort by id, best score first, so unique() keeps the top hit per feature.
+    data.table::setorder(feature_to_compound, id, -score)
+    feature_to_compound <- unique(feature_to_compound, by = "id")
+    # Inner-join filter: drop quant rows with no matching annotation.
+    mz_input[
+        feature_to_compound,
+        ProteinName := i.compound_name,
+        on = setNames("id", id_col)
+    ]
+    mz_input <- mz_input[!is.na(ProteinName)]
+
+    # Report the quant rows that survived the join, not every annotated id.
+    retained_ids <- mz_input[[id_col]]
+    retained_msg <- paste0("** MZMine: retained ", length(retained_ids),
+                           " feature(s) after annotation join: ",
+                           paste(retained_ids, collapse = ", "))
+    getOption("MSstatsLog")("INFO", retained_msg)
+    getOption("MSstatsMsg")("INFO", retained_msg)
+
+    mz_input[, PeptideSequence := as.character(get(id_col))]
+
+    long <- data.table::melt(
+        mz_input,
+        id.vars = c("ProteinName", "PeptideSequence"),
+        measure.vars = peak_area_cols,
+        variable.name = "Run",
+        value.name = "Intensity",
+        variable.factor = FALSE)
+
+    long[, PrecursorCharge := NA_integer_]
+    long[, FragmentIon := NA_character_]
+    long[, ProductCharge := NA_integer_]
+    long[, Run := sub(paste0(peak_area_suffix, "$"), "", Run)]
+
+    final_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
+                    "FragmentIon", "ProductCharge",
+                    "Run", "Intensity")
+    long <- long[, final_cols, with = FALSE]
+
+    .logSuccess("MZMine", "clean")
+    long
+}
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
@@ -0,0 +1,99 @@
+#' Import MZMine files
+#'
+#' @inheritParams .sharedParametersAmongConverters
+#' @param input MZMine feature-quantification table (wide format; one row per
+#'   feature). Must include the `row ID` column and per-sample peak-area
+#'   columns named `"<run> Peak area"` (e.g. `"sampleA.mzML Peak area"`). Other
+#'   columns such as `row m/z` are optional and unused by the cleaning step.
+#' @param annotation `data.frame` with columns `Run`, `Condition`,
+#'   `BioReplicate`. `Run` values must match MSstatsConvert-standardized sample
+#'   names (after column-name normalization removes spaces and dots) with the
+#'   trailing `"Peakarea"` suffix removed. For example, a quant-file column
+#'   `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization,
+#'   so the corresponding `Run` value must be `sampleAmzML`.
+#' @param mzmine_annotations `data.frame` of MZMine spectral-library
+#'   annotations with columns `id`, `compound_name`, `score`. Required:
+#'   the highest-scoring `compound_name` per feature is used as
+#'   `ProteinName`, and features in the quant table with no matching
+#'   annotation row are dropped from the output.
+#'
+#'   These are MSI Level 2 annotations (putative identification via
+#'   MS/MS spectral matching against a reference library). Higher-
+#'   confidence Level 1 identifications require pure reference standards
+#'   and are out of scope here. Lower-confidence annotations such as
+#'   Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via
+#'   CANOPUS) are not currently supported -- features without a Level 2
+#'   annotation row are filtered out.
+#'
+#' @return data.table in the MSstats required format.
+#'
+#' @export
+#'
+#' @examples
+#' input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
+#'                          package = "MSstatsConvert")
+#' annot_path = system.file("tinytest/raw_data/MZMine/annotation.csv",
+#'                          package = "MSstatsConvert")
+#' lib_path   = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv",
+#'                          package = "MSstatsConvert")
+#' input = data.table::fread(input_path)
+#' annot = data.table::fread(annot_path)
+#' lib   = data.table::fread(lib_path)
+#' output = MZMinetoMSstatsFormat(input, annotation = annot,
+#'                                mzmine_annotations = lib,
+#'                                use_log_file = FALSE)
+#' head(output)
+MZMinetoMSstatsFormat = function(
+    input,
+    annotation = NULL,
+    mzmine_annotations,
+    removeProtein_with1Feature = FALSE,
+    summaryforMultipleRows = max,
+    use_log_file = TRUE,
+    append = FALSE,
+    verbose = TRUE,
+    log_file_path = NULL,
+    ...) {
+    MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
+                                        log_file_path)
+
+    if (missing(mzmine_annotations) || is.null(mzmine_annotations)) {
+        stop("mzmine_annotations is required. Pass a data.frame with ",
+             "columns 'id', 'compound_name', 'score'.")
+    }
+
+    input = MSstatsConvert::MSstatsImport(list(input = input),
+                                          "MSstats", "MZMine", ...)
+    input = MSstatsConvert::MSstatsClean(
+        input, mzmine_annotations = mzmine_annotations)
+    annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
+
+    feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge")
+
+    input = MSstatsConvert::MSstatsPreprocess(
+        input,
+        annotation,
+        feature_columns,
+        remove_shared_peptides = FALSE,
+        remove_single_feature_proteins = removeProtein_with1Feature,
+        exact_filtering = NULL,
+        pattern_filtering = NULL,
+        aggregate_isotopic = FALSE,
+        feature_cleaning = list(
+            remove_features_with_few_measurements = FALSE,
+            summarize_multiple_psms = summaryforMultipleRows),
+        columns_to_fill = list(Fraction = 1, IsotopeLabelType = "Light"))
+    input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
+
+    input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns,
+                                                  fill_incomplete = TRUE,
+                                                  handle_fractions = FALSE,
+                                                  remove_few = FALSE)
+
+    msg_final = paste("** Finished preprocessing. The dataset is ready",
+                      "to be processed by the dataProcess function.")
+    getOption("MSstatsLog")("INFO", msg_final)
+    getOption("MSstatsMsg")("INFO", msg_final)
+    getOption("MSstatsLog")("INFO", "\n")
+    input
+}
diff --git a/inst/tinytest/raw_data/MZMine/annotation.csv b/inst/tinytest/raw_data/MZMine/annotation.csv
@@ -0,0 +1,5 @@
+Run,Condition,BioReplicate
+sampleA.mzML,Control,1
+sampleB.mzML,Control,2
+sampleC.mzML,Treatment,3
+sampleD.mzML,Treatment,4
diff --git a/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv b/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv
@@ -0,0 +1,6 @@
+id,compound_name,score,adduct
+1,Caffeine,0.95,[M+H]+
+2,GlucoseLow,0.72,[M+H]+
+2,GlucoseHigh,0.91,[M-H]-
+3,Lactate,0.88,[M+H]+
+6,Caffeine,0.80,[M+Na]+
diff --git a/inst/tinytest/raw_data/MZMine/mzmine_input.csv b/inst/tinytest/raw_data/MZMine/mzmine_input.csv
@@ -0,0 +1,7 @@
+row ID,row m/z,row retention time,sampleA.mzML Peak area,sampleB.mzML Peak area,sampleC.mzML Peak area,sampleD.mzML Peak area
+1,123.0560,1.23,1000,1100,1200,1300
+2,245.1290,3.45,5000,4800,5200,4900
+3,367.2010,5.67,800,0,750,820
+4,489.3340,7.89,2000,2100,1900,2050
+5,555.4470,9.10,100,0,0,0
+6,123.0560,1.45,600,650,700,680
diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
@@ -0,0 +1,113 @@
+# Test MZMinetoMSstatsFormat ---------------------------
+input_file_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
+                              package = "MSstatsConvert")
+annotation_file_path = system.file("tinytest/raw_data/MZMine/annotation.csv",
+                                   package = "MSstatsConvert")
+mzmine_ann_file_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv",
+                                   package = "MSstatsConvert")
+input = data.table::fread(input_file_path)
+annot = data.table::fread(annotation_file_path)
+mzmine_ann = data.table::fread(mzmine_ann_file_path)
+
+# With mzmine_annotations supplied -------------------------------------------
+output = MZMinetoMSstatsFormat(input, annotation = annot,
+                               mzmine_annotations = mzmine_ann,
+                               use_log_file = FALSE)
+output_dt = data.table::as.data.table(output)
+
+# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns
+# Features 4 and 5 have no annotation row and are dropped by the inner join.
+expect_equal(ncol(output), 11)
+expect_equal(nrow(output), 16)
+expect_true("Run" %in% colnames(output))
+expect_true("ProteinName" %in% colnames(output))
+expect_true("PeptideSequence" %in% colnames(output))
+expect_true("PrecursorCharge" %in% colnames(output))
+expect_true("Intensity" %in% colnames(output))
+expect_true("FragmentIon" %in% colnames(output))
+expect_true("ProductCharge" %in% colnames(output))
+expect_true("IsotopeLabelType" %in% colnames(output))
+expect_true("Condition" %in% colnames(output))
+expect_true("BioReplicate" %in% colnames(output))
+expect_true("Fraction" %in% colnames(output))
+
+# Metabolomics has no isotope labeling, so every row is "Light"
+expect_true(all(output_dt$IsotopeLabelType == "Light"))
+
+# Charge / fragment columns are not applicable for metabolomics
+expect_true(all(is.na(output_dt$PrecursorCharge)))
+expect_true(all(is.na(output_dt$FragmentIon)))
+expect_true(all(is.na(output_dt$ProductCharge)))
+
+# Fraction filled to 1
+expect_true(all(output_dt$Fraction == 1))
+
+# Annotation join: feature 2 has two annotation rows; the highest-scoring one wins
+feature2_proteins = unique(output_dt[PeptideSequence == "2", ProteinName])
+expect_equal(as.character(feature2_proteins), "GlucoseHigh")
+
+# Clean annotation cases
+feature1_proteins = unique(output_dt[PeptideSequence == "1", ProteinName])
+expect_equal(as.character(feature1_proteins), "Caffeine")
+feature3_proteins = unique(output_dt[PeptideSequence == "3", ProteinName])
+expect_equal(as.character(feature3_proteins), "Lactate")
+feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName])
+expect_equal(as.character(feature6_proteins), "Caffeine")
+
+# Features absent from the annotations file are filtered out (no mz_rt fallback)
+expect_false("4" %in% as.character(output_dt$PeptideSequence))
+expect_false("5" %in% as.character(output_dt$PeptideSequence))
+expect_false(any(as.character(output_dt$ProteinName) %in%
+                 c("489.334_7.89", "555.447_9.1")))
+
+# Zero-intensity input cells are converted to NA in output
+# Feature 3 sampleB = 0  ->  NA  (feature 3 is annotated as Lactate)
+feature3_sampleB_int = output_dt[PeptideSequence == "3" & Run == "sampleBmzML",
+                                  Intensity]
+expect_true(is.na(feature3_sampleB_int))
+
+# Annotation merges correctly: sampleA is Control rep 1
+sampleA_cond = unique(output_dt[Run == "sampleAmzML", Condition])
+expect_equal(as.character(sampleA_cond), "Control")
+sampleA_rep = unique(output_dt[Run == "sampleAmzML", BioReplicate])
+expect_equal(as.character(sampleA_rep), "1")
+sampleC_cond = unique(output_dt[Run == "sampleCmzML", Condition])
+expect_equal(as.character(sampleC_cond), "Treatment")
+
+# Intensity values trace back to input
+feature1_sampleA_int = output_dt[PeptideSequence == "1" & Run == "sampleAmzML",
+                                  Intensity]
+expect_equal(as.numeric(feature1_sampleA_int), 1000)
+feature2_sampleC_int = output_dt[PeptideSequence == "2" & Run == "sampleCmzML",
+                                  Intensity]
+expect_equal(as.numeric(feature2_sampleC_int), 5200)
+
+# mzmine_annotations is mandatory --------------------------------------------
+# Passing NULL must raise an error (no silent mz_rt fallback)
+expect_error(
+    MZMinetoMSstatsFormat(input, annotation = annot,
+                          mzmine_annotations = NULL,
+                          use_log_file = FALSE),
+    "mzmine_annotations is required"
+)
+# Omitting the argument entirely must also raise an error
+expect_error(
+    MZMinetoMSstatsFormat(input, annotation = annot,
+                          use_log_file = FALSE),
+    "mzmine_annotations is required"
+)
+
+# removeProtein_with1Feature filters non-Caffeine proteins -------------------
+# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6);
+# Lactate and GlucoseHigh (feature 2's top-scoring name) each have 1.
+output_filtered = MZMinetoMSstatsFormat(input, annotation = annot,
+                                        mzmine_annotations = mzmine_ann,
+                                        removeProtein_with1Feature = TRUE,
+                                        use_log_file = FALSE)
+output_filtered_dt = data.table::as.data.table(output_filtered)
+
+expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine")
+# 2 features x 4 runs = 8 rows
+expect_equal(nrow(output_filtered), 8)
+expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))),
+             c("1", "6"))
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,3 +12,5 @@ inst/doc @@
     .lintr
     .vscode
     .positai
+    /doc/
+    /Meta/