From e9f7ca492140f948865cb3b6de2ee3a92a8f274d Mon Sep 17 00:00:00 2001 From: yvonne232 <91641067+yvonne232@users.noreply.github.com> Date: Sat, 20 Jun 2026 10:30:46 -0400 Subject: [PATCH 1/2] Added database-driven to pfb_to_zip so whitelist/blacklist can be loaded from Amanuensis project_datapoints (with local config fallback) --- pfb_to_zip/db_config.py | 94 +++++++++++++++++++++++++ pfb_to_zip/pfb_to_zip.py | 63 ++++++++++++----- pfb_to_zip/seed_datapoints.py | 128 ++++++++++++++++++++++++++++++++++ poetry.lock | 79 ++++++++++++++++++++- pyproject.toml | 1 + 5 files changed, 348 insertions(+), 17 deletions(-) create mode 100644 pfb_to_zip/db_config.py create mode 100644 pfb_to_zip/seed_datapoints.py diff --git a/pfb_to_zip/db_config.py b/pfb_to_zip/db_config.py new file mode 100644 index 0000000..0b0d27a --- /dev/null +++ b/pfb_to_zip/db_config.py @@ -0,0 +1,94 @@ +"""Load export whitelist/blacklist from amanuensis project_datapoints.""" + +import os +import sys +import types +from importlib import import_module +from pathlib import Path +from typing import Any, Dict, Optional +import psycopg2 + + + +def load_config_module(config_file_path: str): + path = Path(config_file_path).resolve() + parent = str(path.parent) + if parent not in sys.path: + sys.path.insert(0, parent) + return import_module(path.stem) + + +def get_db_kwargs( + host: str = "localhost", + port: int = 5432, + dbname: str = "amanuensis_pcdc", + user: str = "amanuensis_pcdc", + password: Optional[str] = None, +) -> Dict[str, Any]: + return { + "host": os.environ.get("AMANUENSIS_DB_HOST", host), + "port": int(os.environ.get("AMANUENSIS_DB_PORT", port)), + "dbname": os.environ.get("AMANUENSIS_DB_NAME", dbname), + "user": os.environ.get("AMANUENSIS_DB_USER", user), + "password": os.environ.get("AMANUENSIS_DB_PASSWORD", password), + } + + +def load_config(project_id: int, db_kwargs: Dict[str, Any], fallback_module): + """ + Load white_list and black_list from project_datapoints for project_id. + exclude_files and data_dictionary always come from fallback_module. + Falls back to fallback_module when the DB is unavailable or has no rows. + """ + white_list = {} + black_list = {} + + try: + conn = psycopg2.connect(**db_kwargs) + except Exception as exc: + return fallback_module, ( + f"Config source: local file (could not connect to amanuensis DB: {exc})" + ) + + try: + with conn: + with conn.cursor() as cur: + cur.execute( + """ + SELECT term, type, value_list + FROM project_datapoints + WHERE project_id = %s AND active = true + """, + (project_id,), + ) + rows = cur.fetchall() + except Exception as exc: + return fallback_module, ( + f"Config source: local file (error querying project_datapoints: {exc})" + ) + finally: + conn.close() + + if not rows: + return fallback_module, ( + f"Config source: local file (no datapoints found for project_id={project_id})" + ) + + for term, dtype, value_list in rows: + if dtype == "w": + white_list[term] = list(value_list) + elif dtype == "b": + black_list[term] = list(value_list) + + config = types.SimpleNamespace( + white_list=white_list, + black_list=black_list, + exclude_files=list(getattr(fallback_module, "exclude_files", [])), + data_dictionary=getattr(fallback_module, "data_dictionary", None), + ) + summary = ( + f"Config source: database (project_id={project_id}, " + f"{len(white_list)} whitelist and {len(black_list)} blacklist entries). " + f"exclude_files and data_dictionary from local file." + ) + return config, summary diff --git a/pfb_to_zip/pfb_to_zip.py b/pfb_to_zip/pfb_to_zip.py index d140e6b..d332ad9 100644 --- a/pfb_to_zip/pfb_to_zip.py +++ b/pfb_to_zip/pfb_to_zip.py @@ -1,6 +1,4 @@ import argparse -import sys -from importlib import import_module from pathlib import Path import os import re @@ -16,6 +14,8 @@ from pfb.exporters import tsv from dictionaryutils.utils import node_values_to_codes +from db_config import get_db_kwargs, load_config, load_config_module + def to_folder_name(value: str) -> str: value = value.lower() @@ -31,7 +31,17 @@ def __init__(self): class PFBExporter: - def __init__(self, pfb_file_path:str, tmp_folder:str, output_path:str, config_file_path:str, ontology:str=None, extra_analysis:str=None) -> None: + def __init__( + self, + pfb_file_path: str, + tmp_folder: str, + output_path: str, + config_file_path: str, + ontology: str = None, + extra_analysis: str = None, + project_id: int = None, + db_kwargs: dict = None, + ) -> None: self.pfb_file_path = pfb_file_path self.tmp_folder = tmp_folder if tmp_folder else "./tmp" self.output_path = output_path if output_path else "./" @@ -39,11 +49,14 @@ def __init__(self, pfb_file_path:str, tmp_folder:str, output_path:str, config_fi self.analysis_path = extra_analysis self.zip_file_output_path = None - # Retrieve config file module - path, file = config_file_path.rsplit('/', 1) - file = file[:-3] - sys.path.append(path) - self.config = import_module(file) + fallback_config = load_config_module(config_file_path) + if project_id is not None: + self.config, self.config_source = load_config( + project_id, db_kwargs or get_db_kwargs(), fallback_config + ) + else: + self.config = fallback_config + self.config_source = "Config source: local file (--project-id not set)" self.data_dictionary = None if self.ontology and self.ontology == "ncit": @@ -148,7 +161,6 @@ def filter_attributes(self, is_black_list=False): a list of attributes to whitelist, by default, and a black_list boolean to indicate if the list of attributes should be blacklisted instead. ''' - invalid_attributes = {} attribute_list = self.config.black_list if is_black_list else self.config.white_list for file in os.listdir(self.zip_folder + "/tsvs_original"): @@ -164,9 +176,6 @@ def filter_attributes(self, is_black_list=False): if attribute not in attribute_list[file.split(".")[0]] ] else: - invalid_attributes[file.split(".")[0]] = [ - a for a in attribute_list[file.split(".")[0]] if a not in header - ] filtered_header = [ attribute for attribute in header @@ -187,8 +196,6 @@ def filter_attributes(self, is_black_list=False): print(file + " NOT FILTERED, no config is present for it.") # just copy it over to the filtered folder copy(self.zip_folder + "/tsvs_original/" + file, self.zip_folder + "/tsvs/" + file) - if any(a for a in invalid_attributes.values()): - raise RuntimeError(f'Invalid attributes in config: {({k:v for k,v in invalid_attributes.items() if v})}') # TODO not working need to be updated @@ -348,7 +355,13 @@ def main(): # EXAMPLE: python pfb_to_zip.py -i ./export_2023-03-27T02_42_17.avro -o ./outputs/ -c ./config.py -d https://portal.pedscommons.org/api/v0/submission/_dictionary/_all -t ncit parser = argparse.ArgumentParser(description="Build ZIP bundle for data delivery after project request has been approved") - parser.add_argument('-c', '--config', help='The config file') + parser.add_argument('-c', '--config', help='Fallback config file (exclude_files, data_dictionary, and white/black lists if DB unavailable)') + parser.add_argument('-p', '--project-id', type=int, help='Load white_list and black_list from amanuensis project_datapoints for this project') + parser.add_argument('--db-host', default='localhost', help='Amanuensis Postgres host (default: localhost or AMANUENSIS_DB_HOST)') + parser.add_argument('--db-port', type=int, default=5432, help='Amanuensis Postgres port (default: 5432 or AMANUENSIS_DB_PORT)') + parser.add_argument('--db-name', default='amanuensis_pcdc', help='Amanuensis Postgres database (default: amanuensis_pcdc or AMANUENSIS_DB_NAME)') + parser.add_argument('--db-user', default='amanuensis_pcdc', help='Amanuensis Postgres user (default: amanuensis_pcdc or AMANUENSIS_DB_USER)') + parser.add_argument('--db-password', default=os.environ.get('AMANUENSIS_DB_PASSWORD'), help='Amanuensis Postgres password (default: AMANUENSIS_DB_PASSWORD env var)') parser.add_argument('-i', '--input', help='Input PFB file path') parser.add_argument('-o', '--output', help='Output ZIP directory') parser.add_argument('-t', '--terminology', help='The ontology you want to transform GEN3 values to.') @@ -359,6 +372,14 @@ def main(): input_path = args.input output_path = args.output config_file = args.config + project_id = args.project_id + db_kwargs = get_db_kwargs( + host=args.db_host, + port=args.db_port, + dbname=args.db_name, + user=args.db_user, + password=args.db_password, + ) ontology = args.terminology analysis_script_consortia = args.analysis except argparse.ArgumentError as err: @@ -367,7 +388,16 @@ def main(): tmp_folder = "./tmp" - pfb_export = PFBExporter(input_path, tmp_folder, output_path, config_file, ontology, True if analysis_script_consortia and analysis_script_consortia != "" else False) + pfb_export = PFBExporter( + input_path, + tmp_folder, + output_path, + config_file, + ontology, + True if analysis_script_consortia and analysis_script_consortia != "" else False, + project_id=project_id, + db_kwargs=db_kwargs, + ) if not pfb_export: print("One or more problems occurred during the initialization of the PFBExporter class") exit() @@ -381,6 +411,7 @@ def main(): pfb_export.add_external_references_material() pfb_export.zip() pfb_export.clean_up() + print(pfb_export.config_source) diff --git a/pfb_to_zip/seed_datapoints.py b/pfb_to_zip/seed_datapoints.py new file mode 100644 index 0000000..38f82ac --- /dev/null +++ b/pfb_to_zip/seed_datapoints.py @@ -0,0 +1,128 @@ +"""Seed project_datapoints from a pfb_to_zip config file.""" + +import argparse +import os + +import psycopg2 + +from db_config import get_db_kwargs, load_config_module + + +def ensure_project(cur, project_id, project_name): + if project_id is not None: + cur.execute("SELECT id FROM project WHERE id = %s", (project_id,)) + if cur.fetchone(): + return project_id + cur.execute( + """ + INSERT INTO project (id, name, description, institution, active) + VALUES (%s, %s, %s, %s, true) + """, + ( + project_id, + project_name, + "Seeded for pfb_to_zip export testing", + "PCDC", + ), + ) + return project_id + + cur.execute( + """ + INSERT INTO project (name, description, institution, active) + VALUES (%s, %s, %s, true) + RETURNING id + """, + ( + project_name, + "Seeded for pfb_to_zip export testing", + "PCDC", + ), + ) + return cur.fetchone()[0] + + +def seed_datapoints(cur, project_id, config_module): + inserted = 0 + for term, cols in config_module.white_list.items(): + cur.execute( + """ + INSERT INTO project_datapoints (term, value_list, type, active, project_id) + VALUES (%s, %s, 'w', true, %s) + """, + (term, cols, project_id), + ) + inserted += 1 + + for term, cols in config_module.black_list.items(): + cur.execute( + """ + INSERT INTO project_datapoints (term, value_list, type, active, project_id) + VALUES (%s, %s, 'b', true, %s) + """, + (term, cols, project_id), + ) + inserted += 1 + + return inserted + + +def main(): + parser = argparse.ArgumentParser( + description="Seed amanuensis project_datapoints from a config file" + ) + parser.add_argument( + "-c", + "--config", + default="./configs/config_inrg.py", + help="Config file with white_list and black_list", + ) + parser.add_argument( + "-p", + "--project-id", + type=int, + default=1, + help="Project id to seed (creates project if missing)", + ) + parser.add_argument( + "--project-name", + default="INRG Test Export", + help="Name for the test project if it must be created", + ) + parser.add_argument("--db-host", default=os.environ.get("AMANUENSIS_DB_HOST", "127.0.0.1")) + parser.add_argument("--db-port", type=int, default=int(os.environ.get("AMANUENSIS_DB_PORT", "5433"))) + parser.add_argument("--db-name", default=os.environ.get("AMANUENSIS_DB_NAME", "amanuensis_pcdc")) + parser.add_argument("--db-user", default=os.environ.get("AMANUENSIS_DB_USER", "amanuensis_pcdc")) + parser.add_argument("--db-password", default=os.environ.get("AMANUENSIS_DB_PASSWORD")) + args = parser.parse_args() + + config_module = load_config_module(args.config) + db_kwargs = get_db_kwargs( + host=args.db_host, + port=args.db_port, + dbname=args.db_name, + user=args.db_user, + password=args.db_password, + ) + + with psycopg2.connect(**db_kwargs) as conn: + with conn.cursor() as cur: + cur.execute( + "SELECT COUNT(*) FROM project_datapoints WHERE project_id = %s AND active = true", + (args.project_id,), + ) + existing = cur.fetchone()[0] + if existing: + print( + f"project_id={args.project_id} already has {existing} active datapoints; skipping" + ) + return + + project_id = ensure_project(cur, args.project_id, args.project_name) + count = seed_datapoints(cur, project_id, config_module) + conn.commit() + print(f"Seeded project_id={project_id} with {count} datapoint rows") + + +if __name__ == "__main__": + main() diff --git a/poetry.lock b/poetry.lock index 01631bf..bc4f5c4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1511,6 +1511,83 @@ files = [ {file = "propcache-0.5.2.tar.gz", hash = "sha256:01c4fc7480cd0598bb4b57022df55b9ca296da7fc5a8760bd8451a7e63a7d427"}, ] +[[package]] +name = "psycopg2-binary" +version = "2.9.12" +description = "psycopg2 - Python-PostgreSQL Database Adapter" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "psycopg2_binary-2.9.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9b818ceff717f98851a64bffd4c5eb5b3059ae280276dcecc52ac658dcf006a4"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d2fa0d7caca8635c56e373055094eeda3208d901d55dd0ff5abc1d4e47f82b56"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:864c261b3690e1207d14bbfe0a61e27567981b80c47a778561e49f676f7ce433"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c5ee5213445dd45312459029b8c4c0a695461eb517b753d2582315bd07995f5e"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6f9cae1f848779b5b01f417e762c40d026ea93eb0648249a604728cda991dde3"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:63a3ebbd543d3d1eda088ac99164e8c5bac15293ee91f20281fd17d050aee1c4"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d6fcbba8c9fed08a73b8ac61ea79e4821e45b1e92bb466230c5e746bbf3d5256"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:36512911ebb2b60a0c3e44d0bb5048c1980aced91235d133b7874f3d1d93487c"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:8ffdb59fe88f99589e34354a130217aa1fd2d615612402d6edc8b3dbc7a44463"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a46fe069b65255df410f856d842bc235f90e22ffdf532dda625fd4213d3fd9b1"}, + {file = "psycopg2_binary-2.9.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab29414b25dcb698bf26bf213e3348abdcd07bbd5de032a5bec15bd75b298b03"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5c8ce6c61bd1b1f6b9c24ee32211599f6166af2c55abb19456090a21fd16554b"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b4a9eaa6e7f4ff91bec10aa3fb296878e75187bced5cc4bafe17dc40915e1326"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c6528cefc8e50fcc6f4a107e27a672058b36cc5736d665476aeb413ba88dbb06"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e4e184b1fb6072bf05388aa41c697e1b2d01b3473f107e7ec44f186a32cfd0b8"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4766ab678563054d3f1d064a4db19cc4b5f9e3a8d9018592a8285cf200c248f3"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5a0253224780c978746cb9be55a946bcdaf40fe3519c0f622924cdabdafe2c39"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0dc9228d47c46bda253d2ecd6bb93b56a9f2d7ad33b684a1fa3622bf74ffe30c"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f921f3cd87035ef7df233383011d7a53ea1d346224752c1385f1edfd790ceb6a"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d999bd982a723113c1a45b55a7a6a90d64d0ed2278020ed625c490ff7bef96c"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29d4d134bd0ab46ffb04e94aa3c5fa3ef582e9026609165e2f758ff76fc3a3be"}, + {file = "psycopg2_binary-2.9.12-cp311-cp311-win_amd64.whl", hash = "sha256:cb4a1dacdd48077150dc762a9e5ddbf32c256d66cb46f80839391aa458774936"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5cdc05117180c5fa9c40eea8ea559ce64d73824c39d928b7da9fb5f6a9392433"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d3227a3bc228c10d21011a99245edca923e4e8bf461857e869a507d9a41fe9f6"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:995ce929eede89db6254b50827e2b7fd61e50d11f0b116b29fffe4a2e53c4580"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9fe06d93e72f1c048e731a2e3e7854a5bfaa58fc736068df90b352cefe66f03f"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:40e7b28b63aaf737cb3a1edc3a9bbc9a9f4ad3dcb7152e8c1130e4050eddcb7d"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:89d19a9f7899e8eb0656a2b3a08e0da04c720a06db6e0033eab5928aabe60fa9"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:612b965daee295ae2da8f8218ce1d274645dc76ef3f1abf6a0a94fd57eff876d"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:b9a339b79d37c1b45f3235265f07cdeb0cb5ad7acd2ac7720a5920989c17c24e"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:3471336e1acfd9c7fe507b8bad5af9317b6a89294f9eb37bd9a030bb7bebcdc6"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7af18183109e23502c8b2ae7f6926c0882766f35b5175a4cd737ad825e4d7a1b"}, + {file = "psycopg2_binary-2.9.12-cp312-cp312-win_amd64.whl", hash = "sha256:398fcd4db988c7d7d3713e2b8e18939776fd3fb447052daae4f24fa39daede4c"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7c729a73c7b1b84de3582f73cdd27d905121dc2c531f3d9a3c32a3011033b965"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4413d0caef93c5cf50b96863df4c2efe8c269bf2267df353225595e7e15e8df7"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:4dfcf8e45ebb0c663be34a3442f65e17311f3367089cd4e5e3a3e8e62c978777"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c41321a14dd74aceb6a9a643b9253a334521babfa763fa873e33d89cfa122fb5"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83946ba43979ebfdc99a3cd0ee775c89f221df026984ba19d46133d8d75d3cd9"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:411e85815652d13560fbe731878daa5d92378c4995a22302071890ec3397d019"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c8ad4c08e00f7679559eaed7aff1edfffc60c086b976f93972f686384a95e2c"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:00814e40fa23c2b37ef0a1e3c749d89982c73a9cb5046137f0752a22d432e82f"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:98062447aebc20ed20add1f547a364fd0ef8933640d5372ff1873f8deb9b61be"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:66a7685d7e548f10fb4ce32fb01a7b7f4aa702134de92a292c7bd9e0d3dbd290"}, + {file = "psycopg2_binary-2.9.12-cp313-cp313-win_amd64.whl", hash = "sha256:b6937f5fe4e180aeee87de907a2fa982ded6f7f15d7218f78a083e4e1d68f2a0"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:6f3b3de8a74ef8db215f22edffb19e32dc6fa41340456de7ec99efdc8a7b3ec2"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1006fb62f0f0bc5ce256a832356c6262e91be43f5e4eb15b5eaf38079464caf2"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:840066105706cd2eb29b9a1c2329620056582a4bf3e8169dec5c447042d0869f"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:863f5d12241ebe1c76a72a04c2113b6dc905f90b9cef0e9be0efd994affd9354"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a99eaab34a9010f1a086b126de467466620a750634d114d20455f3a824aae033"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ffdd7dc5463ccd61845ac37b7012d0f35a1548df9febe14f8dd549be4a0bc81e"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:54a0dfecab1b48731f934e06139dfe11e24219fb6d0ceb32177cf0375f14c7b5"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:96937c9c5d891f772430f418a7a8b4691a90c3e6b93cf72b5bd7cad8cbca32a5"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:77b348775efd4cdab410ec6609d81ccecd1139c90265fa583a7255c8064bc03d"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:527e6342b3e44c2f0544f6b8e927d60de7f163f5723b8f1dfa7d2a84298738cd"}, + {file = "psycopg2_binary-2.9.12-cp314-cp314-win_amd64.whl", hash = "sha256:f12ae41fcafadb39b2785e64a40f9db05d6de2ac114077457e0e7c597f3af980"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ee2d84ef5eb6c04702d2e9c372ad557fb027f26a5d82804f749dfb14c7fdd2ab"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cfa2517c94ea3af6deb46f81e1bbd884faa63e28481eb2f889989dd8d95e5f03"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:ba3df2fc42a1cfa45b72cf096d4acb2b885937eedc61461081d53538d4a82a86"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:718e1fc18edf573b02cb8aea868de8d8d33f99ce9620206aa9144b67b0985e94"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5c7cb4cbf894a1d36c720d713de507952c7c58f66d30834708f03dbe5c822ccf"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:049366c6d884bdcd65d66e6ca1fdbebe670b56c6c9ba46f164e6667e90881964"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fb1828cf3da68f99e45ebce1355d65d2d12b6a78fb5dfb16247aad6bdef5f5d2"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:127467c6e476dd876634f17c3d870530e73ff454ff99bff73d36e80af28e1115"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:ace94261f43850e9e79f6c56636c5e0147978ab79eda5e5e5ebf13ae146fc8fe"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a7e39a65b7d2a20e4ba2e0aaad1960b61cc2888d6ab047769f8347bd3c9ad915"}, + {file = "psycopg2_binary-2.9.12-cp39-cp39-win_amd64.whl", hash = "sha256:f625abb7020e4af3432d95342daa1aa0db3fa369eed19807aa596367ba791b10"}, + {file = "psycopg2_binary-2.9.12.tar.gz", hash = "sha256:5ac9444edc768c02a6b6a591f070b8aae28ff3a99be57560ac996001580f294c"}, +] + [[package]] name = "pygments" version = "2.20.0" @@ -2164,4 +2241,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.13,<4" -content-hash = "043ffb6aaf14cff063850677d20403570b3fc6ed9289a7112a97fc80619b969e" +content-hash = "ac7f53c082215ef00f8d2a7d0b8dabae522fd5360d339aa0e3e40855a7d247ad" diff --git a/pyproject.toml b/pyproject.toml index 855a1b2..f6b141c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ gitpython = "^3.1.44" dictionaryutils = { git = "https://github.com/chicagopcdc/dictionaryutils.git", rev = "2026.05" } pypfb = { git = "https://github.com/chicagopcdc/pypfb.git", rev = "0.6.0" } +psycopg2-binary = "^2.9.12" [tool.poetry.group.dev.dependencies] pytest = "^8.3.5" From dbdf0b53b9465f8a2b94522dc0bfa505d106b0c8 Mon Sep 17 00:00:00 2001 From: yvonne232 <91641067+yvonne232@users.noreply.github.com> Date: Fri, 3 Jul 2026 16:42:07 -0400 Subject: [PATCH 2/2] Use Amanuensis project_datapoints API --- pfb_to_zip/db_config.py | 132 +++++++++++++++++++++++----------- pfb_to_zip/pfb_to_zip.py | 26 +++---- pfb_to_zip/seed_datapoints.py | 128 --------------------------------- pyproject.toml | 1 - 4 files changed, 101 insertions(+), 186 deletions(-) delete mode 100644 pfb_to_zip/seed_datapoints.py diff --git a/pfb_to_zip/db_config.py b/pfb_to_zip/db_config.py index 0b0d27a..89914c9 100644 --- a/pfb_to_zip/db_config.py +++ b/pfb_to_zip/db_config.py @@ -1,13 +1,12 @@ -"""Load export whitelist/blacklist from amanuensis project_datapoints.""" +"""Load export whitelist/blacklist from amanuensis project_datapoints API.""" import os import sys import types from importlib import import_module from pathlib import Path -from typing import Any, Dict, Optional -import psycopg2 - +from typing import Any, Dict, List, Optional +import requests def load_config_module(config_file_path: str): @@ -18,63 +17,114 @@ def load_config_module(config_file_path: str): return import_module(path.stem) -def get_db_kwargs( - host: str = "localhost", - port: int = 5432, - dbname: str = "amanuensis_pcdc", - user: str = "amanuensis_pcdc", - password: Optional[str] = None, -) -> Dict[str, Any]: +def get_api_config( + base_url: str = "https://localhost", + token: Optional[str] = None): return { - "host": os.environ.get("AMANUENSIS_DB_HOST", host), - "port": int(os.environ.get("AMANUENSIS_DB_PORT", port)), - "dbname": os.environ.get("AMANUENSIS_DB_NAME", dbname), - "user": os.environ.get("AMANUENSIS_DB_USER", user), - "password": os.environ.get("AMANUENSIS_DB_PASSWORD", password), + "base_url": os.environ.get("AMANUENSIS_URL", base_url), + "token": os.environ.get("AMANUENSIS_ACCESS_TOKEN", token), } -def load_config(project_id: int, db_kwargs: Dict[str, Any], fallback_module): +def project_datapoints_url(base_url: str, endpoint: str) -> str: + base = base_url.rstrip("/") + if base.endswith("/amanuensis"): + return f"{base}/project-datapoints/{endpoint}" + return f"{base}/amanuensis/project-datapoints/{endpoint}" + + +def _auth_headers(token: str) -> Dict[str, str]: + return {"Authorization": f"Bearer {token}"} + + +def _request_verify(): + """SSL verification for amanuensis API requests. + + Set AMANUENSIS_INSECURE_SSL=1 for local dev when the portal uses a + self-signed cert without a localhost SAN (common with gen3-helm). + """ + if os.environ.get("AMANUENSIS_INSECURE_SSL", "").lower() in ("1", "true", "yes"): + return False + ca_bundle = os.environ.get("REQUESTS_CA_BUNDLE") + return ca_bundle if ca_bundle else True + + +def fetch_project_datapoints( + api_config: Dict[str, Any], project_id: int +) -> List[Dict[str, Any]]: + token = api_config.get("token") + if not token: + raise ValueError("AMANUENSIS_ACCESS_TOKEN is required") + + url = project_datapoints_url(api_config["base_url"], "get-datapoints") + response = requests.get( + url, + json={"project_id": project_id, "many": True}, + headers=_auth_headers(token), + timeout=30, + verify=_request_verify(), + ) + response.raise_for_status() + data = response.json() + if not data: + return [] + if isinstance(data, list): + return data + return [data] + + +def add_project_datapoint( + api_config: Dict[str, Any], + term: str, + value_list: List[str], + dtype: str, + project_id: int, +) -> None: + token = api_config.get("token") + if not token: + raise ValueError("AMANUENSIS_ACCESS_TOKEN is required") + + url = project_datapoints_url(api_config["base_url"], "add-datapoints") + response = requests.post( + url, + json={ + "term": term, + "value_list": value_list, + "type": dtype, + "project_id": project_id, + }, + headers=_auth_headers(token), + timeout=30, + verify=_request_verify(), + ) + response.raise_for_status() + + +def load_config(project_id: int, api_config: Dict[str, Any], fallback_module): """ Load white_list and black_list from project_datapoints for project_id. exclude_files and data_dictionary always come from fallback_module. - Falls back to fallback_module when the DB is unavailable or has no rows. + Falls back to fallback_module when the API is unavailable or has no rows. """ white_list = {} black_list = {} try: - conn = psycopg2.connect(**db_kwargs) - except Exception as exc: - return fallback_module, ( - f"Config source: local file (could not connect to amanuensis DB: {exc})" - ) - - try: - with conn: - with conn.cursor() as cur: - cur.execute( - """ - SELECT term, type, value_list - FROM project_datapoints - WHERE project_id = %s AND active = true - """, - (project_id,), - ) - rows = cur.fetchall() + rows = fetch_project_datapoints(api_config, project_id) except Exception as exc: return fallback_module, ( - f"Config source: local file (error querying project_datapoints: {exc})" + f"Config source: local file (could not fetch from amanuensis API: {exc})" ) - finally: - conn.close() if not rows: return fallback_module, ( f"Config source: local file (no datapoints found for project_id={project_id})" ) - for term, dtype, value_list in rows: + for row in rows: + term = row.get("term") + dtype = row.get("type") + value_list = row.get("value_list") or [] if dtype == "w": white_list[term] = list(value_list) elif dtype == "b": @@ -87,7 +137,7 @@ def load_config(project_id: int, db_kwargs: Dict[str, Any], fallback_module): data_dictionary=getattr(fallback_module, "data_dictionary", None), ) summary = ( - f"Config source: database (project_id={project_id}, " + f"Config source: amanuensis API (project_id={project_id}, " f"{len(white_list)} whitelist and {len(black_list)} blacklist entries). " f"exclude_files and data_dictionary from local file." ) diff --git a/pfb_to_zip/pfb_to_zip.py b/pfb_to_zip/pfb_to_zip.py index d332ad9..1200f02 100644 --- a/pfb_to_zip/pfb_to_zip.py +++ b/pfb_to_zip/pfb_to_zip.py @@ -14,7 +14,7 @@ from pfb.exporters import tsv from dictionaryutils.utils import node_values_to_codes -from db_config import get_db_kwargs, load_config, load_config_module +from db_config import get_api_config, load_config, load_config_module def to_folder_name(value: str) -> str: @@ -40,7 +40,7 @@ def __init__( ontology: str = None, extra_analysis: str = None, project_id: int = None, - db_kwargs: dict = None, + api_config: dict = None, ) -> None: self.pfb_file_path = pfb_file_path self.tmp_folder = tmp_folder if tmp_folder else "./tmp" @@ -52,7 +52,7 @@ def __init__( fallback_config = load_config_module(config_file_path) if project_id is not None: self.config, self.config_source = load_config( - project_id, db_kwargs or get_db_kwargs(), fallback_config + project_id, api_config or get_api_config(), fallback_config ) else: self.config = fallback_config @@ -355,13 +355,10 @@ def main(): # EXAMPLE: python pfb_to_zip.py -i ./export_2023-03-27T02_42_17.avro -o ./outputs/ -c ./config.py -d https://portal.pedscommons.org/api/v0/submission/_dictionary/_all -t ncit parser = argparse.ArgumentParser(description="Build ZIP bundle for data delivery after project request has been approved") - parser.add_argument('-c', '--config', help='Fallback config file (exclude_files, data_dictionary, and white/black lists if DB unavailable)') + parser.add_argument('-c', '--config', help='Fallback config file (exclude_files, data_dictionary, and white/black lists if API unavailable)') parser.add_argument('-p', '--project-id', type=int, help='Load white_list and black_list from amanuensis project_datapoints for this project') - parser.add_argument('--db-host', default='localhost', help='Amanuensis Postgres host (default: localhost or AMANUENSIS_DB_HOST)') - parser.add_argument('--db-port', type=int, default=5432, help='Amanuensis Postgres port (default: 5432 or AMANUENSIS_DB_PORT)') - parser.add_argument('--db-name', default='amanuensis_pcdc', help='Amanuensis Postgres database (default: amanuensis_pcdc or AMANUENSIS_DB_NAME)') - parser.add_argument('--db-user', default='amanuensis_pcdc', help='Amanuensis Postgres user (default: amanuensis_pcdc or AMANUENSIS_DB_USER)') - parser.add_argument('--db-password', default=os.environ.get('AMANUENSIS_DB_PASSWORD'), help='Amanuensis Postgres password (default: AMANUENSIS_DB_PASSWORD env var)') + parser.add_argument('--amanuensis-url', default=os.environ.get('AMANUENSIS_URL', 'https://localhost'), help='Portal base URL (default: AMANUENSIS_URL or https://localhost)') + parser.add_argument('--access-token', default=os.environ.get('AMANUENSIS_ACCESS_TOKEN'), help='Bearer token with amanuensis access (default: AMANUENSIS_ACCESS_TOKEN env var)') parser.add_argument('-i', '--input', help='Input PFB file path') parser.add_argument('-o', '--output', help='Output ZIP directory') parser.add_argument('-t', '--terminology', help='The ontology you want to transform GEN3 values to.') @@ -373,12 +370,9 @@ def main(): output_path = args.output config_file = args.config project_id = args.project_id - db_kwargs = get_db_kwargs( - host=args.db_host, - port=args.db_port, - dbname=args.db_name, - user=args.db_user, - password=args.db_password, + api_config = get_api_config( + base_url=args.amanuensis_url, + token=args.access_token, ) ontology = args.terminology analysis_script_consortia = args.analysis @@ -396,7 +390,7 @@ def main(): ontology, True if analysis_script_consortia and analysis_script_consortia != "" else False, project_id=project_id, - db_kwargs=db_kwargs, + api_config=api_config, ) if not pfb_export: print("One or more problems occurred during the initialization of the PFBExporter class") diff --git a/pfb_to_zip/seed_datapoints.py b/pfb_to_zip/seed_datapoints.py deleted file mode 100644 index 38f82ac..0000000 --- a/pfb_to_zip/seed_datapoints.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Seed project_datapoints from a pfb_to_zip config file.""" - -import argparse -import os - -import psycopg2 - -from db_config import get_db_kwargs, load_config_module - - -def ensure_project(cur, project_id, project_name): - if project_id is not None: - cur.execute("SELECT id FROM project WHERE id = %s", (project_id,)) - if cur.fetchone(): - return project_id - cur.execute( - """ - INSERT INTO project (id, name, description, institution, active) - VALUES (%s, %s, %s, %s, true) - """, - ( - project_id, - project_name, - "Seeded for pfb_to_zip export testing", - "PCDC", - ), - ) - return project_id - - cur.execute( - """ - INSERT INTO project (name, description, institution, active) - VALUES (%s, %s, %s, true) - RETURNING id - """, - ( - project_name, - "Seeded for pfb_to_zip export testing", - "PCDC", - ), - ) - return cur.fetchone()[0] - - -def seed_datapoints(cur, project_id, config_module): - inserted = 0 - for term, cols in config_module.white_list.items(): - cur.execute( - """ - INSERT INTO project_datapoints (term, value_list, type, active, project_id) - VALUES (%s, %s, 'w', true, %s) - """, - (term, cols, project_id), - ) - inserted += 1 - - for term, cols in config_module.black_list.items(): - cur.execute( - """ - INSERT INTO project_datapoints (term, value_list, type, active, project_id) - VALUES (%s, %s, 'b', true, %s) - """, - (term, cols, project_id), - ) - inserted += 1 - - return inserted - - -def main(): - parser = argparse.ArgumentParser( - description="Seed amanuensis project_datapoints from a config file" - ) - parser.add_argument( - "-c", - "--config", - default="./configs/config_inrg.py", - help="Config file with white_list and black_list", - ) - parser.add_argument( - "-p", - "--project-id", - type=int, - default=1, - help="Project id to seed (creates project if missing)", - ) - parser.add_argument( - "--project-name", - default="INRG Test Export", - help="Name for the test project if it must be created", - ) - parser.add_argument("--db-host", default=os.environ.get("AMANUENSIS_DB_HOST", "127.0.0.1")) - parser.add_argument("--db-port", type=int, default=int(os.environ.get("AMANUENSIS_DB_PORT", "5433"))) - parser.add_argument("--db-name", default=os.environ.get("AMANUENSIS_DB_NAME", "amanuensis_pcdc")) - parser.add_argument("--db-user", default=os.environ.get("AMANUENSIS_DB_USER", "amanuensis_pcdc")) - parser.add_argument("--db-password", default=os.environ.get("AMANUENSIS_DB_PASSWORD")) - args = parser.parse_args() - - config_module = load_config_module(args.config) - db_kwargs = get_db_kwargs( - host=args.db_host, - port=args.db_port, - dbname=args.db_name, - user=args.db_user, - password=args.db_password, - ) - - with psycopg2.connect(**db_kwargs) as conn: - with conn.cursor() as cur: - cur.execute( - "SELECT COUNT(*) FROM project_datapoints WHERE project_id = %s AND active = true", - (args.project_id,), - ) - existing = cur.fetchone()[0] - if existing: - print( - f"project_id={args.project_id} already has {existing} active datapoints; skipping" - ) - return - - project_id = ensure_project(cur, args.project_id, args.project_name) - count = seed_datapoints(cur, project_id, config_module) - conn.commit() - print(f"Seeded project_id={project_id} with {count} datapoint rows") - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml index f6b141c..855a1b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,6 @@ gitpython = "^3.1.44" dictionaryutils = { git = "https://github.com/chicagopcdc/dictionaryutils.git", rev = "2026.05" } pypfb = { git = "https://github.com/chicagopcdc/pypfb.git", rev = "0.6.0" } -psycopg2-binary = "^2.9.12" [tool.poetry.group.dev.dependencies] pytest = "^8.3.5"