From 848c862dbe54d7602a56ef8c38ee3319cde2066b Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Mon, 18 May 2026 20:17:44 -0400
Subject: [PATCH 1/6] Add MZMinetoMSstatsFormat converter

Bring metabolomics into the MSstats family by adding an MZMine converter,
MZMinetoMSstatsFormat, whose structure mirrors DIANNtoMSstatsFormat.

This is Phase 1 of a two-phase task; Phase 2 (MSstatsShiny
BIO=Metabolomics) will follow in a separate PR.
---
 .Rbuildignore                                 |   2 +
 .gitignore                                    |   2 +
 DESCRIPTION                                   |   4 +-
 NAMESPACE                                     |   1 +
 R/MSstatsConvert_core_functions.R             |  13 +-
 R/clean_MZMine.R                              |  88 +++++++++++++
 R/converters_MZMinetoMSstatsFormat.R          |  91 ++++++++++++++
 inst/tinytest/raw_data/MZMine/annotation.csv  |   5 +
 .../raw_data/MZMine/mzmine_annotations.csv    |   6 +
 .../tinytest/raw_data/MZMine/mzmine_input.csv |   7 ++
 .../test_converters_MZMinetoMSstatsFormat.R   | 117 ++++++++++++++++++
 man/MSstatsClean.Rd                           |  14 +++
 man/MSstatsInputFiles.Rd                      |   3 +
 man/MZMinetoMSstatsFormat.Rd                  |  79 ++++++++++++
 man/dot-cleanRawMZMine.Rd                     |  29 +++++
 vignettes/msstats_data_format.Rmd             |  50 ++++++++
 16 files changed, 509 insertions(+), 2 deletions(-)
 create mode 100644 R/clean_MZMine.R
 create mode 100644 R/converters_MZMinetoMSstatsFormat.R
 create mode 100644 inst/tinytest/raw_data/MZMine/annotation.csv
 create mode 100644 inst/tinytest/raw_data/MZMine/mzmine_annotations.csv
 create mode 100644 inst/tinytest/raw_data/MZMine/mzmine_input.csv
 create mode 100644 inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
 create mode 100644 man/MZMinetoMSstatsFormat.Rd
 create mode 100644 man/dot-cleanRawMZMine.Rd

diff --git a/.Rbuildignore b/.Rbuildignore
index 613e0b649..37ad74244 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -8,3 +8,5 @@
 ^pkgdown$
 ^\.positai$
 ^\.claude$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
index 6d24a0fdc..c796408d5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,5 @@ inst/doc
 .lintr
 .vscode
 .positai
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
index af973548e..fd7c2e4f4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -35,6 +35,7 @@ Suggests:
     rmarkdown
 LinkingTo: Rcpp
 Collate: 
+    'clean_MZMine.R'
     'clean_ProteinProspector.R'
     'clean_Metamorpheus.R'
     'clean_DIANN.R'
@@ -53,6 +54,7 @@ Collate:
     'converters_DIANNtoMSstatsFormat.R'
     'converters_DIAUmpiretoMSstatsFormat.R'
     'converters_FragPipetoMSstatsFormat.R'
+    'converters_MZMinetoMSstatsFormat.R'
     'converters_MaxQtoMSstatsFormat.R'
     'converters_MaxQtoMSstatsTMTFormat.R'
     'converters_MetamorpheusToMSstatsFormat.R'
@@ -81,4 +83,4 @@ Collate:
     'utils_fractions.R'
     'utils_logging.R'
     'utils_shared_peptides.R'
-VignetteBuilder: knitr
+VignetteBuilder: knitr
\ No newline at end of file
diff --git a/NAMESPACE b/NAMESPACE
index cc2cfa210..e51cffd66 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -14,6 +14,7 @@ export(MSstatsLogsSettings)
 export(MSstatsMakeAnnotation)
 export(MSstatsPreprocess)
 export(MSstatsSaveSessionInfo)
+export(MZMinetoMSstatsFormat)
 export(MaxQtoMSstatsFormat)
 export(MaxQtoMSstatsTMTFormat)
 export(MetamorpheusToMSstatsFormat)
diff --git a/R/MSstatsConvert_core_functions.R b/R/MSstatsConvert_core_functions.R
index 3d9ee7dc3..7e4c77ae3 100644
--- a/R/MSstatsConvert_core_functions.R
+++ b/R/MSstatsConvert_core_functions.R
@@ -71,6 +71,10 @@ setClass("MSstatsMetamorpheusFiles", contains = "MSstatsInputFiles")
 #' @rdname MSstatsInputFiles
 #' @keywords internal
 setClass("MSstatsProteinProspectorFiles", contains = "MSstatsInputFiles")
+#' MSstatsMZMineFiles: class for MZMine files.
+#' @rdname MSstatsInputFiles
+#' @keywords internal
+setClass("MSstatsMZMineFiles", contains = "MSstatsInputFiles")
 
 
 #' Get one of files contained in an instance of `MSstatsInputFiles` class.
@@ -291,8 +295,15 @@ setMethod("MSstatsClean", signature = "MSstatsMetamorpheusFiles",
 #' @rdname MSstatsClean
 #' @inheritParams .cleanRawProteinProspector
 #' @return data.table
-setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles", 
+setMethod("MSstatsClean", signature = "MSstatsProteinProspectorFiles",
           .cleanRawProteinProspector)
+#' Clean MZMine files
+#' @include clean_MZMine.R
+#' @rdname MSstatsClean
+#' @inheritParams .cleanRawMZMine
+#' @return data.table
+setMethod("MSstatsClean", signature = "MSstatsMZMineFiles",
+          .cleanRawMZMine)
 
 
 #' Preprocess outputs from MS signal processing tools for analysis with MSstats
diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
new file mode 100644
index 000000000..fea645100
--- /dev/null
+++ b/R/clean_MZMine.R
@@ -0,0 +1,88 @@
+#' Clean raw MZMine files
+#'
+#' Operates on the column names produced by MZMine after MSstatsConvert's
+#' internal column-name standardization (spaces collapsed and dots removed):
+#' "row ID" becomes `rowID`, "row m/z" becomes `rowmz`, "row retention time"
+#' becomes `rowretentiontime`, and each "<sample> Peak area" becomes
+#' `<standardized-sample>Peakarea`.
+#'
+#' @param msstats_object an object of class `MSstatsMZMineFiles`.
+#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library
+#'   annotations with columns `id`, `compound_name`, `score`. When supplied,
+#'   the highest-scoring `compound_name` per feature is used as `ProteinName`.
+#'   Features without a matching annotation row fall back to an mz_rt string
+#'   `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature
+#'   uses the mz_rt fallback.
+#' @return data.table
+#' @keywords internal
+.cleanRawMZMine <- function(msstats_object, mzmine_annotations = NULL) {
+    ProteinName = PeptideSequence = Intensity = Run = NULL
+    PrecursorCharge = FragmentIon = ProductCharge = IsotopeLabelType = NULL
+    sample_col = id = score = compound_name = NULL
+
+    mz_input <- getInputFile(msstats_object, "input")
+    mz_input <- data.table::as.data.table(mz_input)
+
+    peak_area_suffix <- "Peakarea"
+    peak_area_cols <- grep(paste0(peak_area_suffix, "$"),
+                           colnames(mz_input), value = TRUE)
+    if (length(peak_area_cols) == 0) {
+        stop("No 'Peak area' columns found in the input. Expected per-sample ",
+             "columns named '<run> Peak area' (e.g. 'sampleA.mzML Peak area').")
+    }
+    id_col <- "rowID"
+    mz_col <- "rowmz"
+    rt_col <- "rowretentiontime"
+    required_meta <- c(id_col, mz_col, rt_col)
+    missing_meta <- setdiff(required_meta, colnames(mz_input))
+    if (length(missing_meta) > 0) {
+        stop("Missing required MZMine metadata column(s) (expected 'row ID', ",
+             "'row m/z', 'row retention time'). After standardization, ",
+             "looked for: ", paste(missing_meta, collapse = ", "), ".")
+    }
+
+    mz_rt_fallback <- paste0(round(mz_input[[mz_col]], 4), "_",
+                             round(mz_input[[rt_col]], 2))
+
+    if (!is.null(mzmine_annotations)) {
+        ann <- data.table::as.data.table(mzmine_annotations)
+        required_ann <- c("id", "compound_name", "score")
+        missing_ann <- setdiff(required_ann, colnames(ann))
+        if (length(missing_ann) > 0) {
+            stop("mzmine_annotations is missing required column(s): ",
+                 paste(missing_ann, collapse = ", "), ".")
+        }
+        data.table::setorder(ann, id, -score)
+        ann_top <- unique(ann, by = "id")
+        matched <- ann_top[match(mz_input[[id_col]], ann_top[["id"]]),
+                           compound_name]
+        compound <- ifelse(is.na(matched), mz_rt_fallback, matched)
+    } else {
+        compound <- mz_rt_fallback
+    }
+
+    mz_input[, ProteinName := compound]
+    mz_input[, PeptideSequence := as.character(get(id_col))]
+
+    long <- data.table::melt(
+        mz_input,
+        id.vars = c("ProteinName", "PeptideSequence"),
+        measure.vars = peak_area_cols,
+        variable.name = "sample_col",
+        value.name = "Intensity",
+        variable.factor = FALSE)
+
+    long[, PrecursorCharge := NA_integer_]
+    long[, FragmentIon := NA_character_]
+    long[, ProductCharge := NA_integer_]
+    long[, IsotopeLabelType := "Light"]
+    long[, Run := sub(paste0(peak_area_suffix, "$"), "", sample_col)]
+
+    final_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
+                    "FragmentIon", "ProductCharge", "IsotopeLabelType",
+                    "Run", "Intensity")
+    long <- long[, final_cols, with = FALSE]
+
+    .logSuccess("MZMine", "clean")
+    long
+}
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
new file mode 100644
index 000000000..5ec42bfb9
--- /dev/null
+++ b/R/converters_MZMinetoMSstatsFormat.R
@@ -0,0 +1,91 @@
+#' Import MZMine files
+#'
+#' @inheritParams .sharedParametersAmongConverters
+#' @param input MZMine feature-quantification table (wide format; one row per
+#'   feature). Must include the metadata columns `row ID`, `row m/z`,
+#'   `row retention time`, and per-sample peak-area columns named
+#'   `"<run> Peak area"` (e.g. `"sampleA.mzML Peak area"`).
+#' @param annotation `data.frame` with columns `Run`, `Condition`,
+#'   `BioReplicate`. `Run` values must match the sample column names with the
+#'   trailing `" Peak area"` stripped.
+#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library
+#'   annotations with columns `id`, `compound_name`, `score`. When supplied,
+#'   the highest-scoring `compound_name` per feature is used as `ProteinName`;
+#'   features without a matching annotation row fall back to an mz_rt string
+#'   `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature
+#'   uses the mz_rt fallback.
+#' @param removeProtein_with1Feature `TRUE` will remove proteins (compounds)
+#'   represented by a single feature. Default `FALSE`.
+#' @param summaryforMultipleRows `max` (default) or `sum` — used when multiple
+#'   rows map to the same feature/run combination.
+#'
+#' @return data.table in the MSstats required format.
+#'
+#' @export
+#'
+#' @examples
+#' input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
+#'                          package = "MSstatsConvert")
+#' annot_path = system.file("tinytest/raw_data/MZMine/annotation.csv",
+#'                          package = "MSstatsConvert")
+#' lib_path   = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv",
+#'                          package = "MSstatsConvert")
+#' input = data.table::fread(input_path)
+#' annot = data.table::fread(annot_path)
+#' lib   = data.table::fread(lib_path)
+#' output = MZMinetoMSstatsFormat(input, annotation = annot,
+#'                                mzmine_annotations = lib,
+#'                                use_log_file = FALSE)
+#' head(output)
+MZMinetoMSstatsFormat = function(
+    input,
+    annotation = NULL,
+    mzmine_annotations = NULL,
+    removeProtein_with1Feature = FALSE,
+    summaryforMultipleRows = max,
+    use_log_file = TRUE,
+    append = FALSE,
+    verbose = TRUE,
+    log_file_path = NULL,
+    ...) {
+    MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
+                                        log_file_path)
+
+    input = MSstatsConvert::MSstatsImport(list(input = input),
+                                          "MSstats", "MZMine")
+    input = MSstatsConvert::MSstatsClean(
+        input, mzmine_annotations = mzmine_annotations)
+    annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
+
+    feature_columns = c("PeptideSequence", "PrecursorCharge",
+                        "FragmentIon", "ProductCharge")
+    fill_isotope_label_type = if ("IsotopeLabelType" %in% colnames(input))
+        list() else list("IsotopeLabelType" = "Light")
+
+    input = MSstatsConvert::MSstatsPreprocess(
+        input,
+        annotation,
+        feature_columns,
+        remove_shared_peptides = FALSE,
+        remove_single_feature_proteins = removeProtein_with1Feature,
+        exact_filtering = NULL,
+        pattern_filtering = NULL,
+        aggregate_isotopic = FALSE,
+        feature_cleaning = list(
+            remove_features_with_few_measurements = FALSE,
+            summarize_multiple_psms = summaryforMultipleRows),
+        columns_to_fill = c(list(Fraction = 1), fill_isotope_label_type))
+    input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
+
+    input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns,
+                                                  fill_incomplete = TRUE,
+                                                  handle_fractions = FALSE,
+                                                  remove_few = FALSE)
+
+    msg_final = paste("** Finished preprocessing. The dataset is ready",
+                      "to be processed by the dataProcess function.")
+    getOption("MSstatsLog")("INFO", msg_final)
+    getOption("MSstatsMsg")("INFO", msg_final)
+    getOption("MSstatsLog")("INFO", "\n")
+    input
+}
diff --git a/inst/tinytest/raw_data/MZMine/annotation.csv b/inst/tinytest/raw_data/MZMine/annotation.csv
new file mode 100644
index 000000000..f28f4a863
--- /dev/null
+++ b/inst/tinytest/raw_data/MZMine/annotation.csv
@@ -0,0 +1,5 @@
+Run,Condition,BioReplicate
+sampleA.mzML,Control,1
+sampleB.mzML,Control,2
+sampleC.mzML,Treatment,3
+sampleD.mzML,Treatment,4
diff --git a/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv b/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv
new file mode 100644
index 000000000..a2c38a57d
--- /dev/null
+++ b/inst/tinytest/raw_data/MZMine/mzmine_annotations.csv
@@ -0,0 +1,6 @@
+id,compound_name,score,adduct
+1,Caffeine,0.95,[M+H]+
+2,GlucoseLow,0.72,[M+H]+
+2,GlucoseHigh,0.91,[M-H]-
+3,Lactate,0.88,[M+H]+
+6,Caffeine,0.80,[M+Na]+
diff --git a/inst/tinytest/raw_data/MZMine/mzmine_input.csv b/inst/tinytest/raw_data/MZMine/mzmine_input.csv
new file mode 100644
index 000000000..b887ed1a1
--- /dev/null
+++ b/inst/tinytest/raw_data/MZMine/mzmine_input.csv
@@ -0,0 +1,7 @@
+row ID,row m/z,row retention time,sampleA.mzML Peak area,sampleB.mzML Peak area,sampleC.mzML Peak area,sampleD.mzML Peak area
+1,123.0560,1.23,1000,1100,1200,1300
+2,245.1290,3.45,5000,4800,5200,4900
+3,367.2010,5.67,800,0,750,820
+4,489.3340,7.89,2000,2100,1900,2050
+5,555.4470,9.10,100,0,0,0
+6,123.0560,1.45,600,650,700,680
diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
new file mode 100644
index 000000000..5a6aeaeb2
--- /dev/null
+++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
@@ -0,0 +1,117 @@
+# Test MZMinetoMSstatsFormat ---------------------------
+input_file_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
+                              package = "MSstatsConvert")
+annotation_file_path = system.file("tinytest/raw_data/MZMine/annotation.csv",
+                                   package = "MSstatsConvert")
+mzmine_ann_file_path = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv",
+                                   package = "MSstatsConvert")
+input = data.table::fread(input_file_path)
+annot = data.table::fread(annotation_file_path)
+mzmine_ann = data.table::fread(mzmine_ann_file_path)
+
+# With mzmine_annotations supplied -------------------------------------------
+output = MZMinetoMSstatsFormat(input, annotation = annot,
+                               mzmine_annotations = mzmine_ann,
+                               use_log_file = FALSE)
+output_dt = data.table::as.data.table(output)
+
+# Basic structure: 6 features x 4 runs = 24 rows, 11 standard columns
+expect_equal(ncol(output), 11)
+expect_equal(nrow(output), 24)
+expect_true("Run" %in% colnames(output))
+expect_true("ProteinName" %in% colnames(output))
+expect_true("PeptideSequence" %in% colnames(output))
+expect_true("PrecursorCharge" %in% colnames(output))
+expect_true("Intensity" %in% colnames(output))
+expect_true("FragmentIon" %in% colnames(output))
+expect_true("ProductCharge" %in% colnames(output))
+expect_true("IsotopeLabelType" %in% colnames(output))
+expect_true("Condition" %in% colnames(output))
+expect_true("BioReplicate" %in% colnames(output))
+expect_true("Fraction" %in% colnames(output))
+
+# Metabolomics has no isotope labeling, so every row is "Light"
+expect_true(all(output_dt$IsotopeLabelType == "Light"))
+
+# Charge / fragment columns are not applicable for metabolomics
+expect_true(all(is.na(output_dt$PrecursorCharge)))
+expect_true(all(is.na(output_dt$FragmentIon)))
+expect_true(all(is.na(output_dt$ProductCharge)))
+
+# Fraction filled to 1
+expect_true(all(output_dt$Fraction == 1))
+
+# Annotation join: feature 2 has two annotation rows; the highest-scoring one wins
+feature2_proteins = unique(output_dt[PeptideSequence == "2", ProteinName])
+expect_equal(as.character(feature2_proteins), "GlucoseHigh")
+
+# Clean annotation cases
+feature1_proteins = unique(output_dt[PeptideSequence == "1", ProteinName])
+expect_equal(as.character(feature1_proteins), "Caffeine")
+feature3_proteins = unique(output_dt[PeptideSequence == "3", ProteinName])
+expect_equal(as.character(feature3_proteins), "Lactate")
+feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName])
+expect_equal(as.character(feature6_proteins), "Caffeine")
+
+# Features without annotation rows fall back to the mz_rt string
+feature4_proteins = unique(output_dt[PeptideSequence == "4", ProteinName])
+expect_equal(as.character(feature4_proteins), "489.334_7.89")
+feature5_proteins = unique(output_dt[PeptideSequence == "5", ProteinName])
+expect_equal(as.character(feature5_proteins), "555.447_9.1")
+
+# Zero-intensity input cells are converted to NA in output
+# Feature 3 sampleB = 0  ->  NA
+feature3_sampleB_int = output_dt[PeptideSequence == "3" & Run == "sampleBmzML",
+                                  Intensity]
+expect_true(is.na(feature3_sampleB_int))
+# Feature 5 sampleB/C/D all = 0  ->  NA
+feature5_zero_ints = output_dt[PeptideSequence == "5" &
+                                   Run %in% c("sampleBmzML", "sampleCmzML", "sampleDmzML"),
+                               Intensity]
+expect_true(all(is.na(feature5_zero_ints)))
+
+# Annotation merges correctly: sampleA is Control rep 1
+sampleA_cond = unique(output_dt[Run == "sampleAmzML", Condition])
+expect_equal(as.character(sampleA_cond), "Control")
+sampleA_rep = unique(output_dt[Run == "sampleAmzML", BioReplicate])
+expect_equal(as.character(sampleA_rep), "1")
+sampleC_cond = unique(output_dt[Run == "sampleCmzML", Condition])
+expect_equal(as.character(sampleC_cond), "Treatment")
+
+# Intensity values trace back to input
+feature1_sampleA_int = output_dt[PeptideSequence == "1" & Run == "sampleAmzML",
+                                  Intensity]
+expect_equal(as.numeric(feature1_sampleA_int), 1000)
+feature2_sampleC_int = output_dt[PeptideSequence == "2" & Run == "sampleCmzML",
+                                  Intensity]
+expect_equal(as.numeric(feature2_sampleC_int), 5200)
+
+# Without mzmine_annotations -------------------------------------------------
+output_nolib = MZMinetoMSstatsFormat(input, annotation = annot,
+                                     mzmine_annotations = NULL,
+                                     use_log_file = FALSE)
+output_nolib_dt = data.table::as.data.table(output_nolib)
+
+# Every ProteinName is the mz_rt fallback string
+expect_equal(ncol(output_nolib), 11)
+expect_equal(nrow(output_nolib), 24)
+expected_mz_rt = c("123.056_1.23", "245.129_3.45", "367.201_5.67",
+                   "489.334_7.89", "555.447_9.1", "123.056_1.45")
+expect_true(all(as.character(output_nolib_dt$ProteinName) %in% expected_mz_rt))
+# Compound names from the library must not leak in
+expect_false(any(as.character(output_nolib_dt$ProteinName) %in%
+                 c("Caffeine", "GlucoseHigh", "GlucoseLow", "Lactate")))
+
+# removeProtein_with1Feature filters non-Caffeine proteins -------------------
+# Caffeine has 2 features (PeptideSequence "1" and "6"); all others have 1.
+output_filtered = MZMinetoMSstatsFormat(input, annotation = annot,
+                                        mzmine_annotations = mzmine_ann,
+                                        removeProtein_with1Feature = TRUE,
+                                        use_log_file = FALSE)
+output_filtered_dt = data.table::as.data.table(output_filtered)
+
+expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine")
+# 2 features x 4 runs = 8 rows
+expect_equal(nrow(output_filtered), 8)
+expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))),
+             c("1", "6"))
diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd
index d5559610b..0809a567b 100644
--- a/man/MSstatsClean.Rd
+++ b/man/MSstatsClean.Rd
@@ -15,6 +15,7 @@
 \alias{MSstatsClean,MSstatsDIANNFiles-method}
 \alias{MSstatsClean,MSstatsMetamorpheusFiles-method}
 \alias{MSstatsClean,MSstatsProteinProspectorFiles-method}
+\alias{MSstatsClean,MSstatsMZMineFiles-method}
 \title{Clean files generated by a signal processing tools.}
 \usage{
 MSstatsClean(msstats_object, ...)
@@ -80,6 +81,8 @@ MSstatsClean(msstats_object, ...)
 \S4method{MSstatsClean}{MSstatsMetamorpheusFiles}(msstats_object, MBR = TRUE, qvalue_cutoff = 0.05)
 
 \S4method{MSstatsClean}{MSstatsProteinProspectorFiles}(msstats_object)
+
+\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations = NULL)
 }
 \arguments{
 \item{msstats_object}{object that inherits from \code{MSstatsInputFiles} class.}
@@ -197,6 +200,13 @@ The SILAC suffix is then stripped from \code{PeptideSequence}.
 
 When \code{NULL} (default), protein-turnover mode is disabled and all
 peptides receive \code{IsotopeLabelType = "Light"}.}
+
+\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library
+annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied,
+the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}.
+Features without a matching annotation row fall back to an mz_rt string
+\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature
+uses the mz_rt fallback.}
 }
 \value{
 data.table
@@ -223,6 +233,8 @@ data.table
 
 data.table
 
+data.table
+
 data.table
 }
 \description{
@@ -253,6 +265,8 @@ Clean DIA-NN files
 Clean Metamorpheus files
 
 Clean Protein Prospector files
+
+Clean MZMine files
 }
 \examples{
 evidence_path = system.file("tinytest/raw_data/MaxQuant/mq_ev.csv", 
diff --git a/man/MSstatsInputFiles.Rd b/man/MSstatsInputFiles.Rd
index e09fff8de..eb4f86463 100644
--- a/man/MSstatsInputFiles.Rd
+++ b/man/MSstatsInputFiles.Rd
@@ -17,6 +17,7 @@
 \alias{MSstatsFragPipeFiles-class}
 \alias{MSstatsMetamorpheusFiles-class}
 \alias{MSstatsProteinProspectorFiles-class}
+\alias{MSstatsMZMineFiles-class}
 \title{Class to model files that describe a single MS dataset.}
 \description{
 Class to model files that describe a single MS dataset.
@@ -48,6 +49,8 @@ MSstatsFragPipeFiles: class for FragPipe files.
 MSstatsMetamorpheusFiles: class for Metamorpheus files.
 
 MSstatsProteinProspectorFiles: class for ProteinProspector files.
+
+MSstatsMZMineFiles: class for MZMine files.
 }
 \section{Slots}{
 
diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd
new file mode 100644
index 000000000..3f6aa4de1
--- /dev/null
+++ b/man/MZMinetoMSstatsFormat.Rd
@@ -0,0 +1,79 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/converters_MZMinetoMSstatsFormat.R
+\name{MZMinetoMSstatsFormat}
+\alias{MZMinetoMSstatsFormat}
+\title{Import MZMine files}
+\usage{
+MZMinetoMSstatsFormat(
+  input,
+  annotation = NULL,
+  mzmine_annotations = NULL,
+  removeProtein_with1Feature = FALSE,
+  summaryforMultipleRows = max,
+  use_log_file = TRUE,
+  append = FALSE,
+  verbose = TRUE,
+  log_file_path = NULL,
+  ...
+)
+}
+\arguments{
+\item{input}{MZMine feature-quantification table (wide format; one row per
+feature). Must include the metadata columns \verb{row ID}, \verb{row m/z},
+\verb{row retention time}, and per-sample peak-area columns named
+\code{"<run> Peak area"} (e.g. \code{"sampleA.mzML Peak area"}).}
+
+\item{annotation}{\code{data.frame} with columns \code{Run}, \code{Condition},
+\code{BioReplicate}. \code{Run} values must match the sample column names with the
+trailing \code{" Peak area"} stripped.}
+
+\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library
+annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied,
+the highest-scoring \code{compound_name} per feature is used as \code{ProteinName};
+features without a matching annotation row fall back to an mz_rt string
+\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature
+uses the mz_rt fallback.}
+
+\item{removeProtein_with1Feature}{\code{TRUE} will remove proteins (compounds)
+represented by a single feature. Default \code{FALSE}.}
+
+\item{summaryforMultipleRows}{\code{max} (default) or \code{sum} — used when multiple
+rows map to the same feature/run combination.}
+
+\item{use_log_file}{logical. If TRUE, information about data processing
+will be saved to a file.}
+
+\item{append}{logical. If TRUE, information about data processing will be added
+to an existing log file.}
+
+\item{verbose}{logical. If TRUE, information about data processing will be printed
+to the console.}
+
+\item{log_file_path}{character. Path to a file to which information about
+data processing will be saved.
+If not provided, such a file will be created automatically.
+If \code{append = TRUE}, has to be a valid path to a file.}
+
+\item{...}{additional parameters to \code{data.table::fread}.}
+}
+\value{
+data.table in the MSstats required format.
+}
+\description{
+Import MZMine files
+}
+\examples{
+input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
+                         package = "MSstatsConvert")
+annot_path = system.file("tinytest/raw_data/MZMine/annotation.csv",
+                         package = "MSstatsConvert")
+lib_path   = system.file("tinytest/raw_data/MZMine/mzmine_annotations.csv",
+                         package = "MSstatsConvert")
+input = data.table::fread(input_path)
+annot = data.table::fread(annot_path)
+lib   = data.table::fread(lib_path)
+output = MZMinetoMSstatsFormat(input, annotation = annot,
+                               mzmine_annotations = lib,
+                               use_log_file = FALSE)
+head(output)
+}
diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd
new file mode 100644
index 000000000..82794933a
--- /dev/null
+++ b/man/dot-cleanRawMZMine.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/clean_MZMine.R
+\name{.cleanRawMZMine}
+\alias{.cleanRawMZMine}
+\title{Clean raw MZMine files}
+\usage{
+.cleanRawMZMine(msstats_object, mzmine_annotations = NULL)
+}
+\arguments{
+\item{msstats_object}{an object of class \code{MSstatsMZMineFiles}.}
+
+\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library
+annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied,
+the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}.
+Features without a matching annotation row fall back to an mz_rt string
+\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature
+uses the mz_rt fallback.}
+}
+\value{
+data.table
+}
+\description{
+Operates on the column names produced by MZMine after MSstatsConvert's
+internal column-name standardization (spaces collapsed and dots removed):
+"row ID" becomes \code{rowID}, "row m/z" becomes \code{rowmz}, "row retention time"
+becomes \code{rowretentiontime}, and each "\if{html}{\out{<sample>}} Peak area" becomes
+\verb{<standardized-sample>Peakarea}.
+}
+\keyword{internal}
diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd
index dbf77916a..78e92ff5b 100644
--- a/vignettes/msstats_data_format.Rmd
+++ b/vignettes/msstats_data_format.Rmd
@@ -329,3 +329,53 @@ Such a `data.frame` will be recognized by statistical processing functions from
 `MSstats` and `MSstatsTMT` packages as a valid input, which will allow them to skip
 checks and transformation necessary to fit data into this format.
 
+# Metabolomics with MZMine
+
+`MZMinetoMSstatsFormat` brings untargeted metabolomics into the MSstats family.
+It takes the wide-format feature-quantification table exported by MZMine (one row
+per feature, one `<sample> Peak area` column per sample) together with a standard
+MSstats annotation and produces an MSstats-ready long-format `data.table`.
+
+Optionally, an MZMine spectral-library annotation table (with `id`, `compound_name`,
+`score` columns) can be supplied; the highest-scoring `compound_name` per feature
+is used as `ProteinName`. Features with no library match fall back to an mz_rt
+string of the form `paste0(round(mz, 4), "_", round(rt, 2))`. When no library
+annotation is supplied, every feature uses the mz_rt fallback.
+
+```{r mzmine}
+mzmine_input = data.table::fread(system.file(
+  "tinytest/raw_data/MZMine/mzmine_input.csv",
+  package = "MSstatsConvert"
+))
+mzmine_annotation = data.table::fread(system.file(
+  "tinytest/raw_data/MZMine/annotation.csv",
+  package = "MSstatsConvert"
+))
+mzmine_library = data.table::fread(system.file(
+  "tinytest/raw_data/MZMine/mzmine_annotations.csv",
+  package = "MSstatsConvert"
+))
+
+# With a spectral-library annotation: ProteinName is the matched compound name
+mzmine_with_lib = MZMinetoMSstatsFormat(
+  mzmine_input,
+  annotation = mzmine_annotation,
+  mzmine_annotations = mzmine_library,
+  use_log_file = FALSE
+)
+head(mzmine_with_lib)
+
+# Without a library: ProteinName is an mz_rt fallback string
+mzmine_no_lib = MZMinetoMSstatsFormat(
+  mzmine_input,
+  annotation = mzmine_annotation,
+  mzmine_annotations = NULL,
+  use_log_file = FALSE
+)
+head(mzmine_no_lib)
+```
+
+Since metabolomics features do not carry peptide-level identifiers, `PeptideSequence`
+holds the MZMine `row ID` (as a string), and `PrecursorCharge`, `FragmentIon`, and
+`ProductCharge` are all `NA`. `IsotopeLabelType` is set to `"Light"` for every row.
+

From 778899f6b43cca1dc53cbf23eab61e4a0922685a Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Tue, 19 May 2026 23:33:31 -0400
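Subject: [PATCH 2/6] Address CodeRabbit review feedback

- Coerce mzmine_annotations$score to numeric in .cleanRawMZMine and stop
  with an explicit error when it contains values that are missing or
  cannot be coerced.
- Forward `...` from MZMinetoMSstatsFormat to MSstatsImport.
- Document that annotation `Run` values must match the standardized
  sample names (e.g. "sampleA.mzML Peak area" -> "sampleAmzML").
- Update the tinytest file and man/MZMinetoMSstatsFormat.Rd accordingly.
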
---
 R/clean_MZMine.R                                      | 4 ++++
 R/converters_MZMinetoMSstatsFormat.R                  | 9 ++++++---
 inst/tinytest/test_converters_MZMinetoMSstatsFormat.R | 5 ++++-
 man/MZMinetoMSstatsFormat.Rd                          | 7 +++++--
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
index fea645100..ce2713c9e 100644
--- a/R/clean_MZMine.R
+++ b/R/clean_MZMine.R
@@ -52,6 +52,10 @@
             stop("mzmine_annotations is missing required column(s): ",
                  paste(missing_ann, collapse = ", "), ".")
         }
+        ann[, score := suppressWarnings(as.numeric(score))]
+        if (anyNA(ann$score)) {
+            stop("mzmine_annotations$score must be numeric (or coercible to numeric).")
+        }
         data.table::setorder(ann, id, -score)
         ann_top <- unique(ann, by = "id")
         matched <- ann_top[match(mz_input[[id_col]], ann_top[["id"]]),
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
index 5ec42bfb9..3c340b8e6 100644
--- a/R/converters_MZMinetoMSstatsFormat.R
+++ b/R/converters_MZMinetoMSstatsFormat.R
@@ -6,8 +6,11 @@
 #'   `row retention time`, and per-sample peak-area columns named
 #'   `"<run> Peak area"` (e.g. `"sampleA.mzML Peak area"`).
 #' @param annotation `data.frame` with columns `Run`, `Condition`,
-#'   `BioReplicate`. `Run` values must match the sample column names with the
-#'   trailing `" Peak area"` stripped.
+#'   `BioReplicate`. `Run` values must match MSstatsConvert-standardized sample
+#'   names (after column-name normalization removes spaces and dots) with the
+#'   trailing `"Peakarea"` suffix removed. For example, a quant-file column
+#'   `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization,
+#'   so the corresponding `Run` value must be `sampleAmzML`.
 #' @param mzmine_annotations optional `data.frame` of MZMine spectral-library
 #'   annotations with columns `id`, `compound_name`, `score`. When supplied,
 #'   the highest-scoring `compound_name` per feature is used as `ProteinName`;
@@ -52,7 +55,7 @@ MZMinetoMSstatsFormat = function(
                                         log_file_path)
 
     input = MSstatsConvert::MSstatsImport(list(input = input),
-                                          "MSstats", "MZMine")
+                                          "MSstats", "MZMine", ...)
     input = MSstatsConvert::MSstatsClean(
         input, mzmine_annotations = mzmine_annotations)
     annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
index 5a6aeaeb2..6d0fff53a 100644
--- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
+++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
@@ -97,7 +97,10 @@ expect_equal(ncol(output_nolib), 11)
 expect_equal(nrow(output_nolib), 24)
 expected_mz_rt = c("123.056_1.23", "245.129_3.45", "367.201_5.67",
                    "489.334_7.89", "555.447_9.1", "123.056_1.45")
-expect_true(all(as.character(output_nolib_dt$ProteinName) %in% expected_mz_rt))
+expect_equal(
+    sort(unique(as.character(output_nolib_dt$ProteinName))),
+    sort(expected_mz_rt)
+)
 # Compound names from the library must not leak in
 expect_false(any(as.character(output_nolib_dt$ProteinName) %in%
                  c("Caffeine", "GlucoseHigh", "GlucoseLow", "Lactate")))
diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd
index 3f6aa4de1..5f753215b 100644
--- a/man/MZMinetoMSstatsFormat.Rd
+++ b/man/MZMinetoMSstatsFormat.Rd
@@ -24,8 +24,11 @@ feature). Must include the metadata columns \verb{row ID}, \verb{row m/z},
 \code{"<run> Peak area"} (e.g. \code{"sampleA.mzML Peak area"}).}
 
 \item{annotation}{\code{data.frame} with columns \code{Run}, \code{Condition},
-\code{BioReplicate}. \code{Run} values must match the sample column names with the
-trailing \code{" Peak area"} stripped.}
+\code{BioReplicate}. \code{Run} values must match MSstatsConvert-standardized sample
+names (after column-name normalization removes spaces and dots) with the
+trailing \code{"Peakarea"} suffix removed. For example, a quant-file column
+\code{"sampleA.mzML Peak area"} becomes \code{"sampleAmzML"} after standardization,
+so the corresponding \code{Run} value must be \code{sampleAmzML}.}
 
 \item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library
 annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied,

From 2957f6d46968cbeac1b62c661ec9a7805955f8f4 Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Tue, 19 May 2026 23:41:07 -0400
Subject: [PATCH 3/6] Coerce score via character to handle factor inputs

---
 R/clean_MZMine.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
index ce2713c9e..9b1b59691 100644
--- a/R/clean_MZMine.R
+++ b/R/clean_MZMine.R
@@ -52,7 +52,7 @@
             stop("mzmine_annotations is missing required column(s): ",
                  paste(missing_ann, collapse = ", "), ".")
         }
-        ann[, score := suppressWarnings(as.numeric(score))]
+        ann[, score := suppressWarnings(as.numeric(as.character(score)))]
         if (anyNA(ann$score)) {
             stop("mzmine_annotations$score must be numeric (or coercible to numeric).")
         }

From 53fed81dd8f9afc58e81afa70c6df19541c7a000 Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Wed, 27 May 2026 10:30:49 -0400
Subject: [PATCH 4/6] =?UTF-8?q?Address=20Tony=20review=20feedback=20for=20?=
 =?UTF-8?q?workstream=20A=20-=20Let=20MSstatsPreprocess=20fill=20IsotopeLa?=
 =?UTF-8?q?belType=20-=20Hardcode=20IsotopeLabelType=20in=20converter=20-?=
 =?UTF-8?q?=20Remove=20redundant=20inherited=20@params=20-=20Simplify=20sc?=
 =?UTF-8?q?ore=20coercion=20-=20Improve=20non-numeric=20score=20error=20-?=
 =?UTF-8?q?=20Rename=20`ann`=20=E2=86=92=20feature=5Fto=5Fcompound=20-=20R?=
 =?UTF-8?q?ename=20melt=20variable=20to=20Run=20-=20Refactor=20compound-na?=
 =?UTF-8?q?me=20assignment=20with=20explicit=20data.table=20join?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 R/clean_MZMine.R                     | 44 ++++++++++++++++------------
 R/converters_MZMinetoMSstatsFormat.R | 11 ++-----
 man/MZMinetoMSstatsFormat.Rd         |  6 ++--
 3 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
index 9b1b59691..c714a3c72 100644
--- a/R/clean_MZMine.R
+++ b/R/clean_MZMine.R
@@ -17,8 +17,8 @@
 #' @keywords internal
 .cleanRawMZMine <- function(msstats_object, mzmine_annotations = NULL) {
     ProteinName = PeptideSequence = Intensity = Run = NULL
-    PrecursorCharge = FragmentIon = ProductCharge = IsotopeLabelType = NULL
-    sample_col = id = score = compound_name = NULL
+    PrecursorCharge = FragmentIon = ProductCharge = NULL
+    id = score = compound_name = i.compound_name = NULL
 
     mz_input <- getInputFile(msstats_object, "input")
     mz_input <- data.table::as.data.table(mz_input)
@@ -43,47 +43,53 @@
 
     mz_rt_fallback <- paste0(round(mz_input[[mz_col]], 4), "_",
                              round(mz_input[[rt_col]], 2))
+    mz_input[, ProteinName := mz_rt_fallback]
 
     if (!is.null(mzmine_annotations)) {
-        ann <- data.table::as.data.table(mzmine_annotations)
+        feature_to_compound <- data.table::as.data.table(mzmine_annotations)
         required_ann <- c("id", "compound_name", "score")
-        missing_ann <- setdiff(required_ann, colnames(ann))
+        missing_ann <- setdiff(required_ann, colnames(feature_to_compound))
         if (length(missing_ann) > 0) {
             stop("mzmine_annotations is missing required column(s): ",
                  paste(missing_ann, collapse = ", "), ".")
         }
-        ann[, score := suppressWarnings(as.numeric(as.character(score)))]
-        if (anyNA(ann$score)) {
-            stop("mzmine_annotations$score must be numeric (or coercible to numeric).")
+        feature_to_compound[, score := suppressWarnings(as.numeric(score))]
+        if (anyNA(feature_to_compound$score)) {
+            stop("The 'score' column in the mzmine annotations file must contain numeric values.")
         }
-        data.table::setorder(ann, id, -score)
-        ann_top <- unique(ann, by = "id")
-        matched <- ann_top[match(mz_input[[id_col]], ann_top[["id"]]),
-                           compound_name]
-        compound <- ifelse(is.na(matched), mz_rt_fallback, matched)
-    } else {
-        compound <- mz_rt_fallback
+        # Sort by id ascending and score descending so the highest-scoring
+        # annotation per id is the first row in each group.
+        data.table::setorder(feature_to_compound, id, -score)
+        # Collapse to one row per id (the highest-scoring). data.table's
+        # unique() with a 'by' arg keeps the first row per group, which after
+        # the sort above is the highest-scoring annotation.
+        feature_to_compound <- unique(feature_to_compound, by = "id")
+        # Join: unmatched mz_input rows keep the mz_rt_fallback ProteinName
+        # set above.
+        mz_input[
+            feature_to_compound,
+            ProteinName := i.compound_name,
+            on = setNames("id", id_col)
+        ]
     }
 
-    mz_input[, ProteinName := compound]
     mz_input[, PeptideSequence := as.character(get(id_col))]
 
     long <- data.table::melt(
         mz_input,
         id.vars = c("ProteinName", "PeptideSequence"),
         measure.vars = peak_area_cols,
-        variable.name = "sample_col",
+        variable.name = "Run",
         value.name = "Intensity",
         variable.factor = FALSE)
 
     long[, PrecursorCharge := NA_integer_]
     long[, FragmentIon := NA_character_]
     long[, ProductCharge := NA_integer_]
-    long[, IsotopeLabelType := "Light"]
-    long[, Run := sub(paste0(peak_area_suffix, "$"), "", sample_col)]
+    long[, Run := sub(paste0(peak_area_suffix, "$"), "", Run)]
 
     final_cols <- c("ProteinName", "PeptideSequence", "PrecursorCharge",
-                    "FragmentIon", "ProductCharge", "IsotopeLabelType",
+                    "FragmentIon", "ProductCharge",
                     "Run", "Intensity")
     long <- long[, final_cols, with = FALSE]
 
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
index 3c340b8e6..ae7a517ad 100644
--- a/R/converters_MZMinetoMSstatsFormat.R
+++ b/R/converters_MZMinetoMSstatsFormat.R
@@ -17,10 +17,6 @@
 #'   features without a matching annotation row fall back to an mz_rt string
 #'   `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature
 #'   uses the mz_rt fallback.
-#' @param removeProtein_with1Feature `TRUE` will remove proteins (compounds)
-#'   represented by a single feature. Default `FALSE`.
-#' @param summaryforMultipleRows `max` (default) or `sum` — used when multiple
-#'   rows map to the same feature/run combination.
 #'
 #' @return data.table in the MSstats required format.
 #'
@@ -60,10 +56,7 @@ MZMinetoMSstatsFormat = function(
         input, mzmine_annotations = mzmine_annotations)
     annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
 
-    feature_columns = c("PeptideSequence", "PrecursorCharge",
-                        "FragmentIon", "ProductCharge")
-    fill_isotope_label_type = if ("IsotopeLabelType" %in% colnames(input))
-        list() else list("IsotopeLabelType" = "Light")
+    feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge")
 
     input = MSstatsConvert::MSstatsPreprocess(
         input,
@@ -77,7 +70,7 @@ MZMinetoMSstatsFormat = function(
         feature_cleaning = list(
             remove_features_with_few_measurements = FALSE,
             summarize_multiple_psms = summaryforMultipleRows),
-        columns_to_fill = c(list(Fraction = 1), fill_isotope_label_type))
+        columns_to_fill = list(Fraction = 1, IsotopeLabelType = "Light"))
     input[, Intensity := ifelse(Intensity == 0, NA, Intensity)]
 
     input = MSstatsConvert::MSstatsBalancedDesign(input, feature_columns,
diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd
index 5f753215b..3ce9cc9b2 100644
--- a/man/MZMinetoMSstatsFormat.Rd
+++ b/man/MZMinetoMSstatsFormat.Rd
@@ -37,11 +37,9 @@ features without a matching annotation row fall back to an mz_rt string
 \code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature
 uses the mz_rt fallback.}
 
-\item{removeProtein_with1Feature}{\code{TRUE} will remove proteins (compounds)
-represented by a single feature. Default \code{FALSE}.}
+\item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.}
 
-\item{summaryforMultipleRows}{\code{max} (default) or \code{sum} — used when multiple
-rows map to the same feature/run combination.}
+\item{summaryforMultipleRows}{max or sum - when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. Default is max for label-free converters and sum for TMT converters.}
 
 \item{use_log_file}{logical. If TRUE, information about data processing
 will be saved to a file.}

From e4d8966a824a4dbbaf7a13c35fb2f04c9caa5b2f Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Wed, 27 May 2026 15:08:19 -0400
Subject: [PATCH 5/6] Require mzmine_annotations; inner-join filter - Error on
 NULL/missing mzmine_annotations - Drop quant-only unmatched features (no
 mz_rt fallback) - Log retained feature IDs after join - Update tests,
 vignette, and roxygen for filtering + MSI Level 2 scope - Remove unused
 .cleanRawMZMine metadata requirements removeProtein_with1Feature default
 unchanged (FALSE).

---
 DESCRIPTION                                   |  2 +-
 R/clean_MZMine.R                              | 89 +++++++++----------
 R/converters_MZMinetoMSstatsFormat.R          | 26 ++++--
 .../test_converters_MZMinetoMSstatsFormat.R   | 55 +++++-------
 man/MSstatsClean.Rd                           | 16 ++--
 man/MZMinetoMSstatsFormat.Rd                  | 21 +++--
 man/dot-cleanRawMZMine.Rd                     | 19 ++--
 vignettes/msstats_data_format.Rmd             | 32 ++++---
 8 files changed, 136 insertions(+), 124 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fd7c2e4f4..073329f4c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -83,4 +83,4 @@ Collate:
     'utils_fractions.R'
     'utils_logging.R'
     'utils_shared_peptides.R'
-VignetteBuilder: knitr
\ No newline at end of file
+VignetteBuilder: knitr
diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
index c714a3c72..ab9d559ec 100644
--- a/R/clean_MZMine.R
+++ b/R/clean_MZMine.R
@@ -2,20 +2,21 @@
 #'
 #' Operates on the column names produced by MZMine after MSstatsConvert's
 #' internal column-name standardization (spaces collapsed and dots removed):
-#' "row ID" becomes `rowID`, "row m/z" becomes `rowmz`, "row retention time"
-#' becomes `rowretentiontime`, and each "<sample> Peak area" becomes
+#' "row ID" becomes `rowID`, and each "<sample> Peak area" becomes
 #' `<standardized-sample>Peakarea`.
 #'
 #' @param msstats_object an object of class `MSstatsMZMineFiles`.
-#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library
-#'   annotations with columns `id`, `compound_name`, `score`. When supplied,
-#'   the highest-scoring `compound_name` per feature is used as `ProteinName`.
-#'   Features without a matching annotation row fall back to an mz_rt string
-#'   `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature
-#'   uses the mz_rt fallback.
+#' @param mzmine_annotations `data.frame` of MZMine spectral-library
+#'   annotations with columns `id`, `compound_name`, `score`. Required;
+#'   passing `NULL` raises an error. The highest-scoring `compound_name`
+#'   per feature is used as `ProteinName`, and features in the quant
+#'   table with no matching annotation row are dropped from the output.
+#'   These are MSI Level 2 annotations (putative identification via
+#'   MS/MS spectral matching). See the public `MZMinetoMSstatsFormat`
+#'   docstring for the full scope discussion.
 #' @return data.table
 #' @keywords internal
-.cleanRawMZMine <- function(msstats_object, mzmine_annotations = NULL) {
+.cleanRawMZMine <- function(msstats_object, mzmine_annotations) {
     ProteinName = PeptideSequence = Intensity = Run = NULL
     PrecursorCharge = FragmentIon = ProductCharge = NULL
     id = score = compound_name = i.compound_name = NULL
@@ -31,47 +32,45 @@
              "columns named '<run> Peak area' (e.g. 'sampleA.mzML Peak area').")
     }
     id_col <- "rowID"
-    mz_col <- "rowmz"
-    rt_col <- "rowretentiontime"
-    required_meta <- c(id_col, mz_col, rt_col)
+    required_meta <- id_col
     missing_meta <- setdiff(required_meta, colnames(mz_input))
     if (length(missing_meta) > 0) {
-        stop("Missing required MZMine metadata column(s) (expected 'row ID', ",
-             "'row m/z', 'row retention time'). After standardization, ",
-             "looked for: ", paste(missing_meta, collapse = ", "), ".")
+        stop("Missing required MZMine metadata column (expected 'row ID'). ",
+             "After standardization, looked for: ",
+             paste(missing_meta, collapse = ", "), ".")
     }
 
-    mz_rt_fallback <- paste0(round(mz_input[[mz_col]], 4), "_",
-                             round(mz_input[[rt_col]], 2))
-    mz_input[, ProteinName := mz_rt_fallback]
-
-    if (!is.null(mzmine_annotations)) {
-        feature_to_compound <- data.table::as.data.table(mzmine_annotations)
-        required_ann <- c("id", "compound_name", "score")
-        missing_ann <- setdiff(required_ann, colnames(feature_to_compound))
-        if (length(missing_ann) > 0) {
-            stop("mzmine_annotations is missing required column(s): ",
-                 paste(missing_ann, collapse = ", "), ".")
-        }
-        feature_to_compound[, score := suppressWarnings(as.numeric(score))]
-        if (anyNA(feature_to_compound$score)) {
-            stop("The 'score' column in the mzmine annotations file must contain numeric values.")
-        }
-        # Sort by id ascending and score descending so the highest-scoring
-        # annotation per id is the first row in each group.
-        data.table::setorder(feature_to_compound, id, -score)
-        # Collapse to one row per id (the highest-scoring). data.table's
-        # unique() with a 'by' arg keeps the first row per group, which after
-        # the sort above is the highest-scoring annotation.
-        feature_to_compound <- unique(feature_to_compound, by = "id")
-        # Join: unmatched mz_input rows keep the mz_rt_fallback ProteinName
-        # set above.
-        mz_input[
-            feature_to_compound,
-            ProteinName := i.compound_name,
-            on = setNames("id", id_col)
-        ]
+    if (is.null(mzmine_annotations)) {
+        stop("mzmine_annotations is required. Pass a data.frame with ",
+             "columns 'id', 'compound_name', 'score'.")
+    }
+    feature_to_compound <- data.table::as.data.table(mzmine_annotations)
+    required_ann <- c("id", "compound_name", "score")
+    missing_ann <- setdiff(required_ann, colnames(feature_to_compound))
+    if (length(missing_ann) > 0) {
+        stop("mzmine_annotations is missing required column(s): ",
+             paste(missing_ann, collapse = ", "), ".")
     }
+    feature_to_compound[, score := suppressWarnings(as.numeric(score))]
+    if (anyNA(feature_to_compound$score)) {
+        stop("The 'score' column in the mzmine annotations file must contain numeric values.")
+    }
+    data.table::setorder(feature_to_compound, id, -score)
+    feature_to_compound <- unique(feature_to_compound, by = "id")
+    # Inner-join filter: drop quant rows with no matching annotation.
+    mz_input[
+        feature_to_compound,
+        ProteinName := i.compound_name,
+        on = setNames("id", id_col)
+    ]
+    mz_input <- mz_input[!is.na(ProteinName)]
+
+    retained_ids <- unique(mz_input[[id_col]])  # ids that survived the join/filter, not every annotated id
+    retained_msg <- paste0("** MZMine: retained ", length(retained_ids),
+                           " feature(s) after annotation join: ",
+                           paste(retained_ids, collapse = ", "))
+    getOption("MSstatsLog")("INFO", retained_msg)  # written to the log file
+    getOption("MSstatsMsg")("INFO", retained_msg)  # echoed to the console
 
     mz_input[, PeptideSequence := as.character(get(id_col))]
 
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
index ae7a517ad..4e3df158b 100644
--- a/R/converters_MZMinetoMSstatsFormat.R
+++ b/R/converters_MZMinetoMSstatsFormat.R
@@ -11,12 +11,19 @@
 #'   trailing `"Peakarea"` suffix removed. For example, a quant-file column
 #'   `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization,
 #'   so the corresponding `Run` value must be `sampleAmzML`.
-#' @param mzmine_annotations optional `data.frame` of MZMine spectral-library
-#'   annotations with columns `id`, `compound_name`, `score`. When supplied,
-#'   the highest-scoring `compound_name` per feature is used as `ProteinName`;
-#'   features without a matching annotation row fall back to an mz_rt string
-#'   `paste0(round(mz, 4), "_", round(rt, 2))`. When `NULL`, every feature
-#'   uses the mz_rt fallback.
+#' @param mzmine_annotations `data.frame` of MZMine spectral-library
+#'   annotations with columns `id`, `compound_name`, `score`. Required:
+#'   the highest-scoring `compound_name` per feature is used as
+#'   `ProteinName`, and features in the quant table with no matching
+#'   annotation row are dropped from the output.
+#'
+#'   These are MSI Level 2 annotations (putative identification via
+#'   MS/MS spectral matching against a reference library).
+#'   Higher-confidence Level 1 identifications require pure reference
+#'   standards and are out of scope here. Lower-confidence annotations
+#'   such as Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula
+#'   via CANOPUS) are not currently supported -- features without a
+#'   Level 2 annotation row are filtered out.
 #'
 #' @return data.table in the MSstats required format.
 #'
@@ -39,7 +46,7 @@
 MZMinetoMSstatsFormat = function(
     input,
     annotation = NULL,
-    mzmine_annotations = NULL,
+    mzmine_annotations,
     removeProtein_with1Feature = FALSE,
     summaryforMultipleRows = max,
     use_log_file = TRUE,
@@ -50,6 +57,11 @@ MZMinetoMSstatsFormat = function(
     MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
                                         log_file_path)
 
+    if (missing(mzmine_annotations) || is.null(mzmine_annotations)) {
+        stop("mzmine_annotations is required. Pass a data.frame with ",
+             "columns 'id', 'compound_name', 'score'.")
+    }
+
     input = MSstatsConvert::MSstatsImport(list(input = input),
                                           "MSstats", "MZMine", ...)
     input = MSstatsConvert::MSstatsClean(
diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
index 6d0fff53a..dcfccf904 100644
--- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
+++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
@@ -15,9 +15,10 @@ output = MZMinetoMSstatsFormat(input, annotation = annot,
                                use_log_file = FALSE)
 output_dt = data.table::as.data.table(output)
 
-# Basic structure: 6 features x 4 runs = 24 rows, 11 standard columns
+# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns
+# Features 4 and 5 have no annotation row and are dropped by the inner join.
 expect_equal(ncol(output), 11)
-expect_equal(nrow(output), 24)
+expect_equal(nrow(output), 16)
 expect_true("Run" %in% colnames(output))
 expect_true("ProteinName" %in% colnames(output))
 expect_true("PeptideSequence" %in% colnames(output))
@@ -53,22 +54,17 @@ expect_equal(as.character(feature3_proteins), "Lactate")
 feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName])
 expect_equal(as.character(feature6_proteins), "Caffeine")
 
-# Features without annotation rows fall back to the mz_rt string
-feature4_proteins = unique(output_dt[PeptideSequence == "4", ProteinName])
-expect_equal(as.character(feature4_proteins), "489.334_7.89")
-feature5_proteins = unique(output_dt[PeptideSequence == "5", ProteinName])
-expect_equal(as.character(feature5_proteins), "555.447_9.1")
+# Features absent from the annotations file are filtered out (no mz_rt fallback)
+expect_false("4" %in% as.character(output_dt$PeptideSequence))
+expect_false("5" %in% as.character(output_dt$PeptideSequence))
+expect_false(any(as.character(output_dt$ProteinName) %in%
+                 c("489.334_7.89", "555.447_9.1")))
 
 # Zero-intensity input cells are converted to NA in output
-# Feature 3 sampleB = 0  ->  NA
+# Feature 3 sampleB = 0  ->  NA  (feature 3 is annotated as Lactate)
 feature3_sampleB_int = output_dt[PeptideSequence == "3" & Run == "sampleBmzML",
                                   Intensity]
 expect_true(is.na(feature3_sampleB_int))
-# Feature 5 sampleB/C/D all = 0  ->  NA
-feature5_zero_ints = output_dt[PeptideSequence == "5" &
-                                   Run %in% c("sampleBmzML", "sampleCmzML", "sampleDmzML"),
-                               Intensity]
-expect_true(all(is.na(feature5_zero_ints)))
 
 # Annotation merges correctly: sampleA is Control rep 1
 sampleA_cond = unique(output_dt[Run == "sampleAmzML", Condition])
@@ -86,27 +82,24 @@ feature2_sampleC_int = output_dt[PeptideSequence == "2" & Run == "sampleCmzML",
                                   Intensity]
 expect_equal(as.numeric(feature2_sampleC_int), 5200)
 
-# Without mzmine_annotations -------------------------------------------------
-output_nolib = MZMinetoMSstatsFormat(input, annotation = annot,
-                                     mzmine_annotations = NULL,
-                                     use_log_file = FALSE)
-output_nolib_dt = data.table::as.data.table(output_nolib)
-
-# Every ProteinName is the mz_rt fallback string
-expect_equal(ncol(output_nolib), 11)
-expect_equal(nrow(output_nolib), 24)
-expected_mz_rt = c("123.056_1.23", "245.129_3.45", "367.201_5.67",
-                   "489.334_7.89", "555.447_9.1", "123.056_1.45")
-expect_equal(
-    sort(unique(as.character(output_nolib_dt$ProteinName))),
-    sort(expected_mz_rt)
+# mzmine_annotations is mandatory --------------------------------------------
+# Passing NULL must raise an error (no silent mz_rt fallback)
+expect_error(
+    MZMinetoMSstatsFormat(input, annotation = annot,
+                          mzmine_annotations = NULL,
+                          use_log_file = FALSE),
+    "mzmine_annotations is required"
+)
+# Omitting the argument entirely must also raise an error
+expect_error(
+    MZMinetoMSstatsFormat(input, annotation = annot,
+                          use_log_file = FALSE),
+    "mzmine_annotations is required"
 )
-# Compound names from the library must not leak in
-expect_false(any(as.character(output_nolib_dt$ProteinName) %in%
-                 c("Caffeine", "GlucoseHigh", "GlucoseLow", "Lactate")))
 
 # removeProtein_with1Feature filters non-Caffeine proteins -------------------
-# Caffeine has 2 features (PeptideSequence "1" and "6"); all others have 1.
+# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6);
+# Lactate and Glucose each have 1.
 output_filtered = MZMinetoMSstatsFormat(input, annotation = annot,
                                         mzmine_annotations = mzmine_ann,
                                         removeProtein_with1Feature = TRUE,
diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd
index 0809a567b..8863b4ea7 100644
--- a/man/MSstatsClean.Rd
+++ b/man/MSstatsClean.Rd
@@ -82,7 +82,7 @@ MSstatsClean(msstats_object, ...)
 
 \S4method{MSstatsClean}{MSstatsProteinProspectorFiles}(msstats_object)
 
-\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations = NULL)
+\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations)
 }
 \arguments{
 \item{msstats_object}{object that inherits from \code{MSstatsInputFiles} class.}
@@ -201,12 +201,14 @@ The SILAC suffix is then stripped from \code{PeptideSequence}.
 When \code{NULL} (default), protein-turnover mode is disabled and all
 peptides receive \code{IsotopeLabelType = "Light"}.}
 
-\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library
-annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied,
-the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}.
-Features without a matching annotation row fall back to an mz_rt string
-\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature
-uses the mz_rt fallback.}
+\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
+annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
+passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
+per feature is used as \code{ProteinName}, and features in the quant
+table with no matching annotation row are dropped from the output.
+These are MSI Level 2 annotations (putative identification via
+MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat}
+docstring for the full scope discussion.}
 }
 \value{
 data.table
diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd
index 3ce9cc9b2..b3fdcfe11 100644
--- a/man/MZMinetoMSstatsFormat.Rd
+++ b/man/MZMinetoMSstatsFormat.Rd
@@ -7,7 +7,7 @@
 MZMinetoMSstatsFormat(
   input,
   annotation = NULL,
-  mzmine_annotations = NULL,
+  mzmine_annotations,
   removeProtein_with1Feature = FALSE,
   summaryforMultipleRows = max,
   use_log_file = TRUE,
@@ -30,12 +30,19 @@ trailing \code{"Peakarea"} suffix removed. For example, a quant-file column
 \code{"sampleA.mzML Peak area"} becomes \code{"sampleAmzML"} after standardization,
 so the corresponding \code{Run} value must be \code{sampleAmzML}.}
 
-\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library
-annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied,
-the highest-scoring \code{compound_name} per feature is used as \code{ProteinName};
-features without a matching annotation row fall back to an mz_rt string
-\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature
-uses the mz_rt fallback.}
+\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
+annotations with columns \code{id}, \code{compound_name}, \code{score}. Required:
+the highest-scoring \code{compound_name} per feature is used as
+\code{ProteinName}, and features in the quant table with no matching
+annotation row are dropped from the output.
+
+These are MSI Level 2 annotations (putative identification via
+MS/MS spectral matching against a reference library).
+Higher-confidence Level 1 identifications require pure reference
+standards and are out of scope here. Lower-confidence annotations
+such as Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula
+via CANOPUS) are not currently supported -- features without a
+Level 2 annotation row are filtered out.}
 
 \item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.}
 
diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd
index 82794933a..8c93db083 100644
--- a/man/dot-cleanRawMZMine.Rd
+++ b/man/dot-cleanRawMZMine.Rd
@@ -4,17 +4,19 @@
 \alias{.cleanRawMZMine}
 \title{Clean raw MZMine files}
 \usage{
-.cleanRawMZMine(msstats_object, mzmine_annotations = NULL)
+.cleanRawMZMine(msstats_object, mzmine_annotations)
 }
 \arguments{
 \item{msstats_object}{an object of class \code{MSstatsMZMineFiles}.}
 
-\item{mzmine_annotations}{optional \code{data.frame} of MZMine spectral-library
-annotations with columns \code{id}, \code{compound_name}, \code{score}. When supplied,
-the highest-scoring \code{compound_name} per feature is used as \code{ProteinName}.
-Features without a matching annotation row fall back to an mz_rt string
-\code{paste0(round(mz, 4), "_", round(rt, 2))}. When \code{NULL}, every feature
-uses the mz_rt fallback.}
+\item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
+annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
+passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
+per feature is used as \code{ProteinName}, and features in the quant
+table with no matching annotation row are dropped from the output.
+These are MSI Level 2 annotations (putative identification via
+MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat}
+docstring for the full scope discussion.}
 }
 \value{
 data.table
@@ -22,8 +24,7 @@ data.table
 \description{
 Operates on the column names produced by MZMine after MSstatsConvert's
 internal column-name standardization (spaces collapsed and dots removed):
-"row ID" becomes \code{rowID}, "row m/z" becomes \code{rowmz}, "row retention time"
-becomes \code{rowretentiontime}, and each "\if{html}{\out{<sample>}} Peak area" becomes
+"row ID" becomes \code{rowID}, and each "\if{html}{\out{<sample>}} Peak area" becomes
 \verb{<standardized-sample>Peakarea}.
 }
 \keyword{internal}
diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd
index 78e92ff5b..ea8f0b729 100644
--- a/vignettes/msstats_data_format.Rmd
+++ b/vignettes/msstats_data_format.Rmd
@@ -336,11 +336,18 @@ It takes the wide-format feature-quantification table exported by MZMine (one ro
 per feature, one `<sample> Peak area` column per sample) together with a standard
 MSstats annotation and produces an MSstats-ready long-format `data.table`.
 
-Optionally, an MZMine spectral-library annotation table (with `id`, `compound_name`,
-`score` columns) can be supplied; the highest-scoring `compound_name` per feature
-is used as `ProteinName`. Features with no library match fall back to an mz_rt
-string of the form `paste0(round(mz, 4), "_", round(rt, 2))`. When no library
-annotation is supplied, every feature uses the mz_rt fallback.
+An MZMine spectral-library annotation table with `id`, `compound_name`, and
+`score` columns is **required**. The highest-scoring `compound_name` per feature
+is used as `ProteinName`. Features in the quant table with no matching annotation
+row are dropped from the output — there is no synthesized mz_rt fallback,
+because placeholder identifiers inflate the hypothesis count for downstream
+`groupComparison` without biological signal.
+
+These are [MSI Level 2 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/)
+(putative identification via MS/MS spectral matching against a reference library).
+Lower-confidence annotation sources — SIRIUS / MS2Query (Level 3) and CANOPUS
+(Level 4) — are out of scope for this iteration; features without a Level 2
+annotation row are filtered out.
 
 ```{r mzmine}
 mzmine_input = data.table::fread(system.file(
@@ -356,23 +363,14 @@ mzmine_library = data.table::fread(system.file(
   package = "MSstatsConvert"
 ))
 
-# With a spectral-library annotation: ProteinName is the matched compound name
-mzmine_with_lib = MZMinetoMSstatsFormat(
+# ProteinName comes from the matched compound_name; unannotated features are dropped
+mzmine_converted = MZMinetoMSstatsFormat(
   mzmine_input,
   annotation = mzmine_annotation,
   mzmine_annotations = mzmine_library,
   use_log_file = FALSE
 )
-head(mzmine_with_lib)
-
-# Without a library: ProteinName is an mz_rt fallback string
-mzmine_no_lib = MZMinetoMSstatsFormat(
-  mzmine_input,
-  annotation = mzmine_annotation,
-  mzmine_annotations = NULL,
-  use_log_file = FALSE
-)
-head(mzmine_no_lib)
+head(mzmine_converted)
 ```
 
 Since metabolomics features do not carry peptide-level identifiers, `PeptideSequence`

From 91feff31e7b00657eb5cdfd8f8569631448f72b5 Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Wed, 27 May 2026 15:55:46 -0400
Subject: [PATCH 6/6] Replace <- with = assignment operator in cleanRawMZMine
 function

---
 R/clean_MZMine.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
index ab9d559ec..cee0cf050 100644
--- a/R/clean_MZMine.R
+++ b/R/clean_MZMine.R
@@ -21,8 +21,8 @@
     PrecursorCharge = FragmentIon = ProductCharge = NULL
     id = score = compound_name = i.compound_name = NULL
 
-    mz_input <- getInputFile(msstats_object, "input")
-    mz_input <- data.table::as.data.table(mz_input)
+    mz_input = getInputFile(msstats_object, "input")
+    mz_input = data.table::as.data.table(mz_input)
 
     peak_area_suffix <- "Peakarea"
     peak_area_cols <- grep(paste0(peak_area_suffix, "$"),