Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
## BGmisc 1.8.0
* Optimized gedcom reader, com2links for speed and memory usage, with a focus on large pedigrees
* Fixed bug in gedcom reader that resulted in document records being added to the final person in the pedigree
* Added more unit tests for gedcom reader and data parser
* Optimized sliceFamilies to be more abstract, and no longer require mtdna
* Created `.require_openmx()` to make it easier to use OpenMx functions without making OpenMx a dependency
* Smarter string ID handling for ped2id
Expand Down
8 changes: 5 additions & 3 deletions R/documentData.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,18 +105,20 @@ NULL

#' Royal pedigree data from 1992
#'
#' A dataset created by Denis Reid from the Royal Families of Europe.
#' A dataset created by Denis Reid from the Royal Families of Europe. The data was originally published in 1992 and is available on the internet. This version has been updated to combine duplicate entries and to include additional information on birth and death dates, as well as titles. This dataset is intended for educational and illustrative use in software demonstrations involving pedigree diagrams, inheritance structures, and kinship modeling. This dataset is not intended to represent any real individuals or families beyond the original source data, and it is provided solely for educational purposes.
#'
#' The variables are as follows:
#' \itemize{
#' \item \code{id}: Person identification variable
#' \item \code{personID}: Person identification variable
#' \item \code{momID}: ID of the mother
#' \item \code{dadID}: ID of the father
#' \item \code{famID}: ID of the extended family
#' \item \code{twinID}: ID of the twin, if applicable
#' \item \code{name}: Name of the person
#' \item \code{sex}: Biological sex
#' \item \code{birth_date}: Date of birth
#' \item \code{death_date}: Date of death
#' \item \code{attribute_title}: Title of the person
#' \item \code{title}: Title of the person
#'
#' }
#'
Expand Down
64 changes: 53 additions & 11 deletions R/readGedcom.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,13 @@
#' @param combine_cols Logical. If `TRUE`, combine redundant name columns, such
#' as `name_given` with `name_given_pieces` and `name_surn` with
#' `name_surn_pieces`, when their values do not conflict.
#' @param parse_dates Logical. If `TRUE`, attempt to parse date columns (e.g., `birth_date`, `death_date`) into Date objects, after removing common GEDCOM date qualifiers like "ABT", "BEF", and "AFT".
#' @param skinny Logical. If `TRUE`, return a slimmer data frame by dropping
#' `FAMC`, `FAMS`, and columns that are entirely `NA` during post-processing.
#' @param update_rate Numeric. Intended rate at which progress messages should
#' be printed. Currently unused.
#' @param post_process Logical. If `TRUE`, apply post-processing steps controlled
#' by `add_parents`, `combine_cols`, `remove_empty_cols`, and `skinny`.
#' by `add_parents`, `combine_cols`, `remove_empty_cols`, `skinny`, and `parse_dates`.
#' @param ... Additional arguments. Currently unused.
#' @return A data frame containing information about individuals, with the following potential columns:
#' \describe{
Expand Down Expand Up @@ -110,6 +111,7 @@
remove_empty_cols = TRUE,
combine_cols = TRUE,
skinny = FALSE,
parse_dates = FALSE,
update_rate = 1000,
post_process = TRUE,
...) {
Expand Down Expand Up @@ -179,6 +181,7 @@
df_temp = df_temp,
remove_empty_cols = remove_empty_cols,
combine_cols = combine_cols,
parse_dates = parse_dates,
add_parents = add_parents,
skinny = skinny,
verbose = verbose
Expand Down Expand Up @@ -502,9 +505,15 @@
#' @param vars The current list of variables to update.
#' @return A list with updated `vars` and a `matched` flag.
#' @keywords internal
processTag <- function(tag, field_name, pattern_rows, line, vars,
extractor = NULL, mode = "replace") {
count_name <- paste0("num_", tolower(tag), "_rows")
processTag <- function(tag,
field_name,
pattern_rows,
line,
vars,
extractor = NULL,
mode = "replace") {
count_name <- paste0("num_",# normalize leading underscores

Check notice on line 515 in R/readGedcom.R

View check run for this annotation

codefactor.io / CodeFactor

R/readGedcom.R#L515

Put a space after a comma. (commas_linter)
tolower(gsub("^_", "", tag)), "_rows")
matched <- FALSE
if (!is.null(pattern_rows[[count_name]]) &&
pattern_rows[[count_name]] > 0 &&
Expand Down Expand Up @@ -533,12 +542,14 @@
#' @param remove_empty_cols Logical indicating whether to remove columns that are entirely missing.
#' @param combine_cols Logical indicating whether to combine columns with duplicate values.
#' @param add_parents Logical indicating whether to add parent information.
#' @param parse_dates Logical indicating whether to parse date columns into Date objects.
#' @param skinny Logical indicating whether to slim down the data frame.
#' @param verbose Logical indicating whether to print progress messages.
#' @return The post-processed data frame.
postProcessGedcom <- function(df_temp,
remove_empty_cols = TRUE,
combine_cols = TRUE,
parse_dates = FALSE,
add_parents = TRUE,
skinny = TRUE,
verbose = FALSE) {
Expand All @@ -553,6 +564,30 @@
if (verbose == TRUE) message("Removing empty columns")
df_temp <- df_temp[, colSums(is.na(df_temp)) < nrow(df_temp)]
}
if(parse_dates == TRUE) {

Check notice on line 567 in R/readGedcom.R

View check run for this annotation

codefactor.io / CodeFactor

R/readGedcom.R#L567

Place a space before left parenthesis, except in a function call. (spaces_left_parentheses_linter)

date_cols <- c("birth_date", "death_date")
if (verbose == TRUE) message("Parsing date columns: ", paste(date_cols[date_cols %in% colnames(df_temp)], collapse = ", "))
# GEDCOM date qualifiers like "ABT", "BEF", "AFT" can be present in date strings. We can remove them before parsing.
date_qualifier_regex <- "\\b(?:[aA][bBfF][tT]|[bB][eE][tTfF])\\.?\\b\\s*"

if(verbose == TRUE && any(sapply(df_temp[date_cols], function(col) any(grepl(date_qualifier_regex, col, perl = TRUE))))

Check notice on line 574 in R/readGedcom.R

View check run for this annotation

codefactor.io / CodeFactor

R/readGedcom.R#L574

Place a space before left parenthesis, except in a function call. (spaces_left_parentheses_linter)
) {
message("Found date qualifiers in date columns. They will be removed before parsing.")
}

# only parse date columns that are present in the data frame
if (any(date_cols %in% colnames(df_temp))) {
df_temp[date_cols] <- lapply(df_temp[date_cols], function(x) {
if (is.character(x)) {
x <- stringr::str_replace_all(x, date_qualifier_regex, "")
as.Date(x, format = "%d %b %Y")
} else {
x
}
})
}
}
if (skinny == TRUE) {
if (verbose == TRUE) message("Slimming down the data frame")
# Remove raw family relationship columns
Expand All @@ -568,12 +603,14 @@
#'
#' @param df_temp A data frame produced by \code{readGedcom()}.
#' @param datasource Character string indicating the data source ("gedcom" or "wiki").
#' @param person_id_col Character string indicating the column name for individual IDs (default "personID").
#' @return The updated data frame with parent IDs added.
processParents <- function(df_temp, datasource) {
processParents <- function(df_temp, datasource, person_id_col = "personID"
) {
if (datasource %in% c("gedcom", "ged")) {
required_cols <- c("FAMC", "sex", "FAMS")
} else if (datasource == "wiki") {
required_cols <- c("personID")
required_cols <- c(person_id_col)
} else {
stop("Invalid datasource")
}
Expand All @@ -596,8 +633,13 @@
#' to the corresponding parent IDs.
#'
#' @param df_temp A data frame produced by \code{readGedcom()}.
#' @param mom_sex Character string indicating the value of sex that corresponds to mothers (default "F").
#' @param dad_sex Character string indicating the value of sex that corresponds to fathers (default "M").
#' @return A list mapping family IDs to parent information.
mapFAMS2parents <- function(df_temp) {
mapFAMS2parents <- function(df_temp,
mom_sex = "F",
dad_sex = "M"
) {
if (!all(c("FAMS", "sex") %in% colnames(df_temp))) {
warning("The data frame does not contain the necessary columns (FAMS, sex)")
return(NULL)
Expand All @@ -608,16 +650,16 @@
fams_ids <- unlist(strsplit(df_temp$FAMS[i], ", "))
for (fams_id in fams_ids) {
if (!is.null(family_to_parents[[fams_id]])) {
if (df_temp$sex[i] == "M") {
if (df_temp$sex[i] == dad_sex) {
family_to_parents[[fams_id]]$father <- df_temp$personID[i]
} else if (df_temp$sex[i] == "F") {
} else if (df_temp$sex[i] == mom_sex) {
family_to_parents[[fams_id]]$mother <- df_temp$personID[i]
}
} else {
family_to_parents[[fams_id]] <- list()
if (df_temp$sex[i] == "M") {
if (df_temp$sex[i] == dad_sex) {
family_to_parents[[fams_id]]$father <- df_temp$personID[i]
} else if (df_temp$sex[i] == "F") {
} else if (df_temp$sex[i] == mom_sex) {
family_to_parents[[fams_id]]$mother <- df_temp$personID[i]
}
}
Expand Down
Loading
Loading