@@ -18,20 +18,24 @@ The workflow follows the structure of a standard epidemiological analysis:
1818
1919First, we load the necessary R packages. We then load the ` dat.full.with.mortality.RDS ` file (from [ here] ( https://ehsanx.github.io/Reproducible-NHANES-Analysis/ ) ), which contains the merged NHANES and mortality data from 1999-2018.
2020
21- ``` {r setup, message=FALSE, warning=FALSE, cache=TRUE }
21+ ``` {r setup, message=FALSE, warning=FALSE}
2222# Load all necessary packages for the analysis
2323library(dplyr)
2424library(car)
25- library(survey)
2625library(survival)
27- library(mice)
26+ library(mice) # For imputation
27+ library(survey) # For survey analysis (svydesign, svyglm, etc.)
28+ library(mitools) # FOR imputationList() and other MI tools
2829library(Publish)
2930library(DataExplorer)
3031library(knitr)
3132library(kableExtra)
3233# devtools::install_github("ehsanx/svyTable1", build_vignettes = TRUE, dependencies = TRUE)
33- library(svyTable1)
34+ library(svyTable1) # for svypooled
35+ ```
36+
3437
38+ ``` {r setup2, message=FALSE, warning=FALSE, cache=TRUE}
3539# Set survey option for compatibility
3640options(survey.want.obsolete = TRUE)
3741
@@ -114,7 +118,7 @@ dat.analytic$nelson_aalen <- nelsonaalen(
114118 time = stime.since.birth,
115119 status = status_all
116120)
117- summary(dat.analytic$stime.since.birth )
121+ summary(dat.analytic$nelson_aalen )
118122summary(dat.analytic$stime.since.birth)
119123table(dat.analytic$status_all)
120124hist(dat.analytic$nelson_aalen)
@@ -130,7 +134,7 @@ The `predictorMatrix` tells `mice` what to do. Here’s the logic for our setup:
130134Since ` exposure.cat ` is the only ** predictor** with missing data, it's the only variable we will actively impute in this tutorial.
131135
132136* ** What about the missing outcome data?**
133- * Your data shows that ` stime.since.birth ` and ` status_all ` also have missing values (134 observations each) .
137+ * Data shows that ` stime.since.birth ` and ` status_all ` also have missing values.
134138 * It is standard practice not to impute the outcome variables in a survival analysis.
135139
136140* ** What variables will help the imputation (i.e., act as predictors)?**
@@ -177,33 +181,66 @@ With our `m=2` complete datasets, we follow the "analyze then pool" procedure:
1771812 . ** Pool** : Combine the 2 sets of results into a single, final estimate using ` pool() ` .
178182
179183``` {r analyze-pool}
180- # --- 5. Survival Analysis on Imputed Data (Corrected) ---
181-
182- # 1. Create an empty list to store the results of each analysis
183- fit_list <- list()
184-
185- # 2. Loop through each of the 'm' imputed datasets
186- for (i in 1:imputed_data$m) {
187-
188- # Get the i-th completed dataset
189- completed_data <- mice::complete(imputed_data, i)
190-
191- # Create a survey design object *specifically for this dataset*
192- design_i <- svydesign(ids = ~psu,
193- strata = ~strata,
194- weights = ~survey.weight.new,
195- nest = TRUE,
196- data = completed_data)
197-
198- # Fit the survey-weighted Cox model using this design
199- fit_list[[i]] <- svycoxph(Surv(stime.since.birth, status_all) ~ exposure.cat + sex + race + year.cat,
200- design = design_i)
201- }
202-
203- # 3. Pool the results from the list of model fits
204- pooled_results <- pool(fit_list)
205-
206- # 4. Display the final, pooled results
184+ # --- 5. Survival Analysis on Imputed Data ---
185+
186+ # --- Step 5.1: Re-integrate Ineligible Subjects for Correct Survey Variance ---
187+
188+ # First, extract the 'm' imputed datasets into a single long-format data frame.
189+ # Add a flag to identify this group as our analytic/eligible sample.
190+ imputed_analytic_data <- complete(imputed_data, "long", include = FALSE)
191+ imputed_analytic_data$eligible <- 1
192+
193+ # Next, identify the subjects from the original full dataset who were NOT in our analytic sample.
194+ # The analytic sample was defined as age >= 20 & age < 80.
195+ dat_ineligible <- subset(dat.full.with.mortality, !(age >= 20 & age < 80))
196+
197+ # Replicate this ineligible dataset 'm' times, once for each imputation.
198+ ineligible_list <- lapply(1:imputed_data$m, function(i) {
199+ df <- dat_ineligible
200+ df$.imp <- i # Add the imputation number
201+ return(df)
202+ })
203+ ineligible_stacked <- do.call(rbind, ineligible_list)
204+
205+ # Now, align the columns. Add columns that exist in the imputed data (like 'nelson_aalen')
206+ # to the ineligible data, filling them with NA.
207+ cols_to_add <- setdiff(names(imputed_analytic_data), names(ineligible_stacked))
208+ ineligible_stacked[, cols_to_add] <- NA
209+
210+ # Set the eligibility flag for this group to 0.
211+ ineligible_stacked$eligible <- 0
212+
213+ # CRITICAL: Ensure the column order is identical before row-binding.
214+ ineligible_final <- ineligible_stacked[, names(imputed_analytic_data)]
215+
216+ # Finally, combine the imputed analytic data with the prepared ineligible data.
217+ imputed_full_data <- rbind(imputed_analytic_data, ineligible_final)
218+
219+
220+ # --- Step 5.2: Create Survey Design and Run Pooled Analysis ---
221+
222+ # Create the complex survey design object using an `imputationList`.
223+ # This tells the survey package how to handle the 'm' imputed datasets.
224+ # The design is specified on the *full* data to capture the total population structure.
225+ design_full <- svydesign(ids = ~psu,
226+ strata = ~strata,
227+ weights = ~survey.weight.new,
228+ nest = TRUE,
229+ data = imputationList(split(imputed_full_data, imputed_full_data$.imp)))
230+
231+ # Subset the design object to include only the eligible participants for the analysis.
232+ # This ensures variance is calculated correctly based on the full sample design.
233+ design_analytic <- subset(design_full, eligible == 1)
234+
235+ # Fit the Cox model across all 'm' imputed datasets using the `with()` function.
236+ # This is more efficient than a for-loop.
237+ fit_pooled <- with(design_analytic,
238+ svycoxph(Surv(stime.since.birth, status_all) ~ exposure.cat + sex + race + year.cat))
239+
240+ # Pool the results from the list of model fits using Rubin's Rules.
241+ pooled_results <- pool(fit_pooled)
242+
243+ # Display the final, pooled results.
207244print("--- Final Adjusted Cox Model Results (from Pooled Imputed Data) ---")
208245summary(pooled_results, conf.int = TRUE, exponentiate = TRUE)
209246```
@@ -234,4 +271,4 @@ svypooled(
234271
235272This tutorial demonstrated how to replace a complete-case analysis with a multiple imputation workflow for a survey-weighted survival analysis. By correctly preparing the data, configuring ` mice ` with survival-specific information, and pooling the final results, we can generate valid estimates that properly account for missing data.
236273
237- ## References
274+ ## References
0 commit comments