From 8c317d2deacd3c6f8cdcbd9883339e27deef7570 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Fri, 5 Dec 2025 13:28:21 +0100 Subject: [PATCH 1/9] MAINT: Removed dependency on direct mongo plugin --- dserver_dependency_graph_plugin/__init__.py | 5 +- dserver_dependency_graph_plugin/graph.py | 13 +-- dserver_dependency_graph_plugin/utils.py | 117 ++++++++++++++++++++ dserver_dependency_graph_plugin/version.py | 34 ++++++ pyproject.toml | 2 +- 5 files changed, 161 insertions(+), 10 deletions(-) create mode 100644 dserver_dependency_graph_plugin/utils.py create mode 100644 dserver_dependency_graph_plugin/version.py diff --git a/dserver_dependency_graph_plugin/__init__.py b/dserver_dependency_graph_plugin/__init__.py index 3f9c937..c3d09af 100644 --- a/dserver_dependency_graph_plugin/__init__.py +++ b/dserver_dependency_graph_plugin/__init__.py @@ -33,7 +33,7 @@ from dservercore import AuthenticationError, ExtensionABC from dservercore.sql_models import DatasetSchema from dservercore.utils import _preprocess_privileges -from dserver_direct_mongo_plugin.utils import _dict_to_mongo_query +from .utils import _dict_to_mongo_query from .schemas import DependencyKeysSchema @@ -246,7 +246,8 @@ def dependency_graph_by_user_and_uuid(username, uuid, dependency_keys=Config.DEP mongo_aggregation = query_dependency_graph(pre_query=pre_query, post_query=post_query, dependency_keys=dependency_keys, - mongo_dependency_view=dependency_view) + mongo_dependency_view=dependency_view, + mongo_collection=current_app.config['MONGO_COLLECTION']) logger.debug("Constructed mongo aggregation: {}".format(mongo_aggregation)) cx = DependencyGraphExtension.db[current_app.config['MONGO_COLLECTION']].aggregate(mongo_aggregation) diff --git a/dserver_dependency_graph_plugin/graph.py b/dserver_dependency_graph_plugin/graph.py index 6b3aa6e..68eefd5 100644 --- a/dserver_dependency_graph_plugin/graph.py +++ b/dserver_dependency_graph_plugin/graph.py @@ -1,7 +1,6 @@ """Aggregation pipelines for graph 
operations.""" -from dserver_dependency_graph_plugin.config import Config as dependency_graph_plugin_config -from dserver_direct_mongo_plugin.config import Config as direct_mongo_plugin_config +from .config import Config # a regular expression to filter valid v4 UUIDs UUID_v4_REGEX = '[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[4][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}' @@ -9,7 +8,7 @@ # most of those 'functions' are pretty static and just wrapped in function # definitions for convenience. -def unwind_dependencies(dependency_keys=dependency_graph_plugin_config.DEPENDENCY_KEYS): +def unwind_dependencies(dependency_keys=Config.DEPENDENCY_KEYS): """Create parallel aggregation pipelines for unwinding all configured dependency keys.""" parallel_aggregations = [] @@ -41,7 +40,7 @@ def unwind_dependencies(dependency_keys=dependency_graph_plugin_config.DEPENDENC return parallel_aggregations -def merge_dependencies(dependency_keys=dependency_graph_plugin_config.DEPENDENCY_KEYS): +def merge_dependencies(dependency_keys=Config.DEPENDENCY_KEYS): """Aggregate (directed) dependency graph edges. All configured dependency keys are merged in a key-agnostic 'dependencies' @@ -117,7 +116,7 @@ def group_inverse_dependencies(): return aggregation -def build_undirected_adjecency_lists(dependency_keys=dependency_graph_plugin_config.DEPENDENCY_KEYS): +def build_undirected_adjecency_lists(dependency_keys=Config.DEPENDENCY_KEYS): """Aggregate undirected adjacency lists.""" aggregation = [ *merge_dependencies(dependency_keys), @@ -200,8 +199,8 @@ def build_undirected_adjecency_lists(dependency_keys=dependency_graph_plugin_con # behavior would be to yield all redundant dataset entries for a uuid. 
def query_dependency_graph(mongo_dependency_view, pre_query, post_query=None, - dependency_keys=dependency_graph_plugin_config.DEPENDENCY_KEYS, - mongo_collection=direct_mongo_plugin_config.MONGO_COLLECTION): + dependency_keys=Config.DEPENDENCY_KEYS, + mongo_collection=None): """Aggregation pipeline for querying dependency view on datasets collection. :param pre_query: selects all documents for whicht to query the dependency graph. diff --git a/dserver_dependency_graph_plugin/utils.py b/dserver_dependency_graph_plugin/utils.py new file mode 100644 index 0000000..4fd2de6 --- /dev/null +++ b/dserver_dependency_graph_plugin/utils.py @@ -0,0 +1,117 @@ +"""Utility functions for MongoDB query construction. + +These functions were originally part of dserver-direct-mongo-plugin but are +copied here to decouple the dependency-graph-plugin from that package. +""" + +import logging + +logger = logging.getLogger(__name__) + + +VALID_MONGO_QUERY_KEYS = ( + "free_text", + "creator_usernames", + "base_uris", + "uuids", + "tags", +) + +MONGO_QUERY_LIST_KEYS = ( + "creator_usernames", + "base_uris", + "uuids", + "tags", +) + + +def _dict_to_mongo(query_dict): + """Convert a query dictionary to a MongoDB query. 
+ + :param query_dict: Dictionary with query parameters + :returns: MongoDB query dictionary + """ + def _sanitise(query_dict): + for key in list(query_dict.keys()): + if key not in VALID_MONGO_QUERY_KEYS: + del query_dict[key] + for lk in MONGO_QUERY_LIST_KEYS: + if lk in query_dict: + if len(query_dict[lk]) == 0: + del query_dict[lk] + + def _deal_with_possible_or_statment(a_list, key): + if len(a_list) == 1: + return {key: a_list[0]} + else: + return {"$or": [{key: v} for v in a_list]} + + def _deal_with_possible_and_statement(a_list, key): + if len(a_list) == 1: + return {key: a_list[0]} + else: + return {key: {"$all": a_list}} + + _sanitise(query_dict) + + sub_queries = [] + if "free_text" in query_dict: + sub_queries.append({"$text": {"$search": query_dict["free_text"]}}) + if "creator_usernames" in query_dict: + sub_queries.append( + _deal_with_possible_or_statment( + query_dict["creator_usernames"], "creator_username" + ) + ) + if "base_uris" in query_dict: + sub_queries.append( + _deal_with_possible_or_statment(query_dict["base_uris"], "base_uri") + ) + if "uuids" in query_dict: + sub_queries.append(_deal_with_possible_or_statment(query_dict["uuids"], "uuid")) + if "tags" in query_dict: + sub_queries.append( + _deal_with_possible_and_statement(query_dict["tags"], "tags") + ) + + if len(sub_queries) == 0: + return {} + elif len(sub_queries) == 1: + return sub_queries[0] + else: + return {"$and": [q for q in sub_queries]} + + +def _dict_to_mongo_query(query_dict): + """Construct mongo query, allowing embedding of a raw mongo query. + + Converts a query dictionary to a MongoDB query format. If the query_dict + contains a 'query' key with a dict value, that raw MongoDB query is + merged with the constructed query. + + :param query_dict: Dictionary with query parameters. 
May contain: + - free_text: Text search string + - creator_usernames: List of creator usernames + - base_uris: List of base URIs + - uuids: List of UUIDs + - tags: List of tags + - query: Raw MongoDB query dict (optional) + :returns: MongoDB query dictionary + """ + if "query" in query_dict and isinstance(query_dict["query"], dict): + raw_mongo = query_dict["query"] + del query_dict["query"] + else: + raw_mongo = {} + + mongo_query = _dict_to_mongo(query_dict) + + if len(raw_mongo) > 0 and len(mongo_query) == 0: + mongo_query = raw_mongo + elif len(raw_mongo) > 0 and len(mongo_query) == 1 and "$and" in mongo_query: + mongo_query["$and"].append(raw_mongo) + elif len(raw_mongo) > 0: + mongo_query = {"$and": [mongo_query, raw_mongo]} + + logger.debug("Constructed mongo query: {}".format(mongo_query)) + return mongo_query diff --git a/dserver_dependency_graph_plugin/version.py b/dserver_dependency_graph_plugin/version.py new file mode 100644 index 0000000..88daef1 --- /dev/null +++ b/dserver_dependency_graph_plugin/version.py @@ -0,0 +1,34 @@ +# file generated by setuptools-scm +# don't change, don't track in version control + +__all__ = [ + "__version__", + "__version_tuple__", + "version", + "version_tuple", + "__commit_id__", + "commit_id", +] + +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple + from typing import Union + + VERSION_TUPLE = Tuple[Union[int, str], ...] 
+ COMMIT_ID = Union[str, None] +else: + VERSION_TUPLE = object + COMMIT_ID = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE +commit_id: COMMIT_ID +__commit_id__: COMMIT_ID + +__version__ = version = '0.4.3.dev1' +__version_tuple__ = version_tuple = (0, 4, 3, 'dev1') + +__commit_id__ = commit_id = 'gcde515931' diff --git a/pyproject.toml b/pyproject.toml index d168a75..569df7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dynamic = ["version"] dependencies = [ "dtoolcore>=3.18.0", "dservercore>=0.20.0", - "dserver-direct-mongo-plugin", + "pymongo", ] [project.optional-dependencies] From 652501f643e73bf5a1f87788d4c4b707dde34ce7 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Sun, 7 Dec 2025 23:25:45 +0100 Subject: [PATCH 2/9] BUILD: Switched build system to flit --- dserver_dependency_graph_plugin/version.py | 6 ++--- pyproject.toml | 29 +++++++++++++--------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/dserver_dependency_graph_plugin/version.py b/dserver_dependency_graph_plugin/version.py index 88daef1..6825d3b 100644 --- a/dserver_dependency_graph_plugin/version.py +++ b/dserver_dependency_graph_plugin/version.py @@ -28,7 +28,7 @@ commit_id: COMMIT_ID __commit_id__: COMMIT_ID -__version__ = version = '0.4.3.dev1' -__version_tuple__ = version_tuple = (0, 4, 3, 'dev1') +__version__ = version = '0.4.3.dev2' +__version_tuple__ = version_tuple = (0, 4, 3, 'dev2') -__commit_id__ = commit_id = 'gcde515931' +__commit_id__ = commit_id = 'g8c317d2de' diff --git a/pyproject.toml b/pyproject.toml index 569df7a..27b687f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,21 +1,22 @@ [build-system] -requires = ["setuptools>=42", "setuptools_scm[toml]>=6.3"] -build-backend = "setuptools.build_meta" +requires = ["flit_scm"] +build-backend = "flit_scm:buildapi" [project] name = "dserver-dependency-graph-plugin" description = "dserver plugin for receiving s3 notifications on 
updated objects." readme = "README.rst" -license = {file = "LICENSE"} +license = {text = "MIT"} authors = [ {name = "Johannes L. Hörmann", email = "johannes.laurin@gmail.com"}, ] dynamic = ["version"] +requires-python = ">=3.8" dependencies = [ - "dtoolcore>=3.18.0", - "dservercore>=0.20.0", - "pymongo", - ] + "dtoolcore>=3.18.0", + "dservercore>=0.20.0", + "pymongo", +] [project.optional-dependencies] test = [ @@ -31,13 +32,17 @@ Documentation = "https://github.com/livMatS/dserver-dependency-graph-plugin/blob Repository = "https://github.com/livMatS/dserver-dependency-graph-plugin" Changelog = "https://github.com/livMatS/dserver-dependency-graph-plugin/blob/main/CHANGELOG.rst" +[project.entry-points."dservercore.extension"] +DependencyGraphExtension = "dserver_dependency_graph_plugin:DependencyGraphExtension" + +[tool.flit.module] +name = "dserver_dependency_graph_plugin" + [tool.setuptools_scm] version_scheme = "guess-next-dev" local_scheme = "no-local-version" write_to = "dserver_dependency_graph_plugin/version.py" -[tool.setuptools] -packages = ["dserver_dependency_graph_plugin"] - -[project.entry-points."dservercore.extension"] -"DependencyGraphExtension" = "dserver_dependency_graph_plugin:DependencyGraphExtension" +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "--cov=dserver_dependency_graph_plugin --cov-report=term-missing" From 4d201d6b5d89c25cc41994c5470c2033d77aa73d Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Mon, 8 Dec 2025 16:16:41 +0100 Subject: [PATCH 3/9] MAINT: Mongo configuration --- dserver_dependency_graph_plugin/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dserver_dependency_graph_plugin/config.py b/dserver_dependency_graph_plugin/config.py index d50ca9c..98b9c12 100644 --- a/dserver_dependency_graph_plugin/config.py +++ b/dserver_dependency_graph_plugin/config.py @@ -8,6 +8,11 @@ class Config(object): + # MongoDB connection settings + # These are required for the dependency graph plugin to connect to 
MongoDB + MONGO_URI = os.environ.get("MONGO_URI") + MONGO_DB = os.environ.get("MONGO_DB") + MONGO_COLLECTION = os.environ.get("MONGO_COLLECTION") # If enabled, the underlying database will offer dependency graph views on # the server's default collection. Those views offer on-the-fly-generated # collections of undirected per-dataset adjacency lists in order to From 9f101897f8196f6590a5a5e74c892847bee20cc5 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Wed, 10 Jun 2026 22:15:35 +0200 Subject: [PATCH 4/9] BUG: Traverse dependencies via readme_parsed (string READMEs broke graph queries); obfuscate Mongo credentials in config route; reject JS-executing Mongo operators; timezone-aware datetimes --- dserver_dependency_graph_plugin/__init__.py | 9 ++++--- dserver_dependency_graph_plugin/config.py | 8 ++++-- dserver_dependency_graph_plugin/utils.py | 25 +++++++++++++++++ dserver_dependency_graph_plugin/version.py | 28 +++++++------------ tests/conftest.py | 18 ++++++------- tests/test_config_routes.py | 2 +- tests/test_graph_routes.py | 4 +-- tests/test_raw_query_hardening.py | 30 +++++++++++++++++++++ 8 files changed, 88 insertions(+), 36 deletions(-) create mode 100644 tests/test_raw_query_hardening.py diff --git a/dserver_dependency_graph_plugin/__init__.py b/dserver_dependency_graph_plugin/__init__.py index c3d09af..f0e8b24 100644 --- a/dserver_dependency_graph_plugin/__init__.py +++ b/dserver_dependency_graph_plugin/__init__.py @@ -116,7 +116,8 @@ def _get_dependency_view_bookkeeping_record(dependency_keys): @assert_dependency_view_bookkeeping_collection def _create_dependency_view_bookkeeping_record(name, dependency_keys): ret = DependencyGraphExtension.db[Config.MONGO_DEPENDENCY_VIEW_BOOKKEEPING].insert_one( - {'name': name, 'keys': dependency_keys, 'accessed_on': datetime.datetime.utcnow()}) + {'name': name, 'keys': dependency_keys, + 'accessed_on': datetime.datetime.now(datetime.timezone.utc)}) # drop oldest entry if number of documents exceeds allowed maximum 
count = DependencyGraphExtension.db[Config.MONGO_DEPENDENCY_VIEW_BOOKKEEPING].count_documents({}) if count > Config.MONGO_DEPENDENCY_VIEW_CACHE_SIZE: @@ -135,7 +136,8 @@ def _create_dependency_view_bookkeeping_record(name, dependency_keys): def _update_dependency_view_bookkeeping_record(name): """Updated record to dependency view bookkeeping collection or add if new.""" return DependencyGraphExtension.db[Config.MONGO_DEPENDENCY_VIEW_BOOKKEEPING].update_one( - {'name': name}, {'$set': {'accessed_on': datetime.datetime.utcnow()}}) + {'name': name}, + {'$set': {'accessed_on': datetime.datetime.now(datetime.timezone.utc)}}) # mid-level dependency view helpers @@ -146,7 +148,8 @@ def _create_dependency_view(dependency_keys): :returns: str""" # generate unique, valid name for view from prefix and ISO date string - datestring = datetime.datetime.utcnow().isoformat() + datestring = datetime.datetime.now( + datetime.timezone.utc).replace(tzinfo=None).isoformat() name = Config.MONGO_DEPENDENCY_VIEW_PREFIX + datestring if name in DependencyGraphExtension.db.list_collection_names(): diff --git a/dserver_dependency_graph_plugin/config.py b/dserver_dependency_graph_plugin/config.py index 98b9c12..c837b7b 100644 --- a/dserver_dependency_graph_plugin/config.py +++ b/dserver_dependency_graph_plugin/config.py @@ -4,7 +4,11 @@ AFFIRMATIVE_EXPRESSIONS = ['true', '1', 'y', 'yes', 'on'] -CONFIG_SECRETS_TO_OBFUSCATE = [] +CONFIG_SECRETS_TO_OBFUSCATE = [ + "MONGO_URI", + "MONGO_DB", + "MONGO_COLLECTION", +] class Config(object): @@ -39,7 +43,7 @@ class Config(object): # a single key or a JSON-formatted list of keys. # Nested fields are separated by a dot (.) 
DEPENDENCY_KEYS = [ - 'readme.derived_from.uuid', + 'readme_parsed.derived_from.uuid', 'annotations.source_dataset_uuid' ] diff --git a/dserver_dependency_graph_plugin/utils.py b/dserver_dependency_graph_plugin/utils.py index 4fd2de6..35917fc 100644 --- a/dserver_dependency_graph_plugin/utils.py +++ b/dserver_dependency_graph_plugin/utils.py @@ -82,6 +82,28 @@ def _deal_with_possible_and_statement(a_list, key): return {"$and": [q for q in sub_queries]} +# MongoDB operators that execute server-side JavaScript. A raw query +# containing any of these could run arbitrary code on the database server. +_FORBIDDEN_MONGO_OPERATORS = frozenset( + ("$where", "$function", "$accumulator")) + + +def _assert_no_forbidden_operators(raw_mongo): + """Recursively reject MongoDB operators that execute JavaScript. + + :raises: ValueError if a forbidden operator occurs anywhere in the query + """ + if isinstance(raw_mongo, dict): + for key, value in raw_mongo.items(): + if key in _FORBIDDEN_MONGO_OPERATORS: + raise ValueError( + f"Operator '{key}' is not allowed in raw MongoDB queries") + _assert_no_forbidden_operators(value) + elif isinstance(raw_mongo, (list, tuple)): + for item in raw_mongo: + _assert_no_forbidden_operators(item) + + def _dict_to_mongo_query(query_dict): """Construct mongo query, allowing embedding of a raw mongo query. 
@@ -97,9 +119,12 @@ def _dict_to_mongo_query(query_dict): - tags: List of tags - query: Raw MongoDB query dict (optional) :returns: MongoDB query dictionary + :raises: ValueError if the raw query contains JavaScript-executing + operators ($where, $function, $accumulator) """ if "query" in query_dict and isinstance(query_dict["query"], dict): raw_mongo = query_dict["query"] + _assert_no_forbidden_operators(raw_mongo) del query_dict["query"] else: raw_mongo = {} diff --git a/dserver_dependency_graph_plugin/version.py b/dserver_dependency_graph_plugin/version.py index 6825d3b..d5f9652 100644 --- a/dserver_dependency_graph_plugin/version.py +++ b/dserver_dependency_graph_plugin/version.py @@ -1,5 +1,6 @@ -# file generated by setuptools-scm +# file generated by vcs-versioning # don't change, don't track in version control +from __future__ import annotations __all__ = [ "__version__", @@ -10,25 +11,14 @@ "commit_id", ] -TYPE_CHECKING = False -if TYPE_CHECKING: - from typing import Tuple - from typing import Union - - VERSION_TUPLE = Tuple[Union[int, str], ...] - COMMIT_ID = Union[str, None] -else: - VERSION_TUPLE = object - COMMIT_ID = object - version: str __version__: str -__version_tuple__: VERSION_TUPLE -version_tuple: VERSION_TUPLE -commit_id: COMMIT_ID -__commit_id__: COMMIT_ID +__version_tuple__: tuple[int | str, ...] +version_tuple: tuple[int | str, ...] 
+commit_id: str | None +__commit_id__: str | None -__version__ = version = '0.4.3.dev2' -__version_tuple__ = version_tuple = (0, 4, 3, 'dev2') +__version__ = version = '0.4.3.dev4' +__version_tuple__ = version_tuple = (0, 4, 3, 'dev4') -__commit_id__ = commit_id = 'g8c317d2de' +__commit_id__ = commit_id = 'g4d201d6b5' diff --git a/tests/conftest.py b/tests/conftest.py index d3f7a10..9d7b8c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -163,19 +163,19 @@ def tmp_app_with_users(request): "MONGO_DEPENDENCY_VIEW_CACHE_SIZE": 10, "FORCE_REBUILD_DEPENDENCY_VIEW": False, "DEPENDENCY_KEYS": [ - 'readme.derived_from.uuid', + 'readme_parsed.derived_from.uuid', 'annotations.source_dataset_uuid' ], "DYNAMIC_DEPENDENCY_KEYS": True, - "RETRIEVE_MONGO_URI": "mongodb://localhost:27017/", + "RETRIEVE_MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "RETRIEVE_MONGO_DB": tmp_mongo_db_name, "RETRIEVE_MONGO_COLLECTION": "datasets", - "SEARCH_MONGO_URI": "mongodb://localhost:27017/", + "SEARCH_MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "SEARCH_MONGO_DB": tmp_mongo_db_name, "SEARCH_MONGO_COLLECTION": "datasets", - "MONGO_URI": "mongodb://localhost:27017/", + "MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "MONGO_DB": tmp_mongo_db_name, - "MONGO_COLLECTION": "metadata", + "MONGO_COLLECTION": "datasets", "SQLALCHEMY_DATABASE_URI": "sqlite:///:memory:", "SQLALCHEMY_TRACK_MODIFICATIONS": False, "JWT_ALGORITHM": "RS256", @@ -239,15 +239,15 @@ def tmp_app_with_dependent_data(request): "OPENAPI_VERSION": '3.0.2', "FLASK_ENV": "development", "CONFIG_SECRETS_TO_OBFUSCATE": [], - "RETRIEVE_MONGO_URI": "mongodb://localhost:27017/", + "RETRIEVE_MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "RETRIEVE_MONGO_DB": tmp_mongo_db_name, "RETRIEVE_MONGO_COLLECTION": "datasets", - "SEARCH_MONGO_URI": "mongodb://localhost:27017/", + "SEARCH_MONGO_URI": 
os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "SEARCH_MONGO_DB": tmp_mongo_db_name, "SEARCH_MONGO_COLLECTION": "datasets", - "MONGO_URI": "mongodb://localhost:27017/", + "MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "MONGO_DB": tmp_mongo_db_name, - "MONGO_COLLECTION": "metadata", + "MONGO_COLLECTION": "datasets", "SQLALCHEMY_DATABASE_URI": "sqlite:///:memory:", "SQLALCHEMY_TRACK_MODIFICATIONS": False, "JWT_ALGORITHM": "RS256", diff --git a/tests/test_config_routes.py b/tests/test_config_routes.py index 07237e5..062e9bc 100644 --- a/tests/test_config_routes.py +++ b/tests/test_config_routes.py @@ -20,7 +20,7 @@ def test_config_info_route(tmp_app_with_users, snowwhite_token): # NOQA assert r.status_code == 200 expected_response = { - 'dependency_keys': ['readme.derived_from.uuid', + 'dependency_keys': ['readme_parsed.derived_from.uuid', 'annotations.source_dataset_uuid'], 'dynamic_dependency_keys': True, 'enable_dependency_view': True, diff --git a/tests/test_graph_routes.py b/tests/test_graph_routes.py index fc1631e..8438d79 100644 --- a/tests/test_graph_routes.py +++ b/tests/test_graph_routes.py @@ -50,7 +50,7 @@ def test_query_dependency_graph_by_custom_keys(tmp_app_with_dependent_data, test uuid = "a2218059-5bd0-4690-b090-062faf08e044" # brother - dependency_keys = ["readme.derived_from.uuid", "some_nonexistant_key"] + dependency_keys = ["readme_parsed.derived_from.uuid", "some_nonexistant_key"] r = tmp_app_with_dependent_data.post( "/graph/uuids/{}".format(uuid), @@ -137,7 +137,7 @@ def test_generate_many_dependency_views(tmp_app_with_dependent_data, testing_fam uuid = "a2218059-5bd0-4690-b090-062faf08e044" # brother dependency_keys_list = [ - ["readme.derived_from.uuid", "some_nonexistant_key_{}".format(i)] for i in range(12)] + ["readme_parsed.derived_from.uuid", "some_nonexistant_key_{}".format(i)] for i in range(12)] expected_response = [] for role, p in testing_family.items(): diff --git 
a/tests/test_raw_query_hardening.py b/tests/test_raw_query_hardening.py new file mode 100644 index 0000000..50325a3 --- /dev/null +++ b/tests/test_raw_query_hardening.py @@ -0,0 +1,30 @@ +"""Audit hardening: raw MongoDB queries merged into constructed queries +must not be able to execute server-side JavaScript.""" + +import pytest + +from dserver_dependency_graph_plugin.utils import _dict_to_mongo_query + + +def test_benign_raw_query_is_merged(): + query = _dict_to_mongo_query({ + "base_uris": ["s3://bucket"], + "query": {"readme.project": "test"}, + }) + assert "readme.project" in str(query) + + +@pytest.mark.parametrize("malicious", [ + {"$where": "sleep(10000)"}, + {"$expr": {"$function": {"body": "function(){}", "args": [], + "lang": "js"}}}, + {"nested": {"$where": "this.a == 1"}}, + {"$or": [{"a": 1}, {"$where": "true"}]}, + {"group": {"$accumulator": {"init": "function(){}"}}}, +]) +def test_javascript_operators_rejected(malicious): + with pytest.raises(ValueError): + _dict_to_mongo_query({ + "base_uris": ["s3://bucket"], + "query": malicious, + }) From f52c083f2e7ecc24be243de3fdfca4319e31a177 Mon Sep 17 00:00:00 2001 From: Lars Pastewka Date: Wed, 10 Jun 2026 23:22:37 +0200 Subject: [PATCH 5/9] TST: Exclude server-stamped upload provenance fields from graph response comparisons --- tests/test_graph_routes.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_graph_routes.py b/tests/test_graph_routes.py index 8438d79..d7af43a 100644 --- a/tests/test_graph_routes.py +++ b/tests/test_graph_routes.py @@ -40,6 +40,9 @@ def test_query_dependency_graph_by_default_keys(tmp_app_with_dependent_data, tes m['frozen_at'] = False m['size_in_bytes'] = False m['number_of_items'] = False + # Server-stamped registration provenance (dynamic timestamp). 
+ m['uploaded_at'] = False + m['uploaded_by'] = False assert compare_marked_nested(response, expected_response, marker) @@ -83,6 +86,9 @@ def test_query_dependency_graph_by_custom_keys(tmp_app_with_dependent_data, test m['frozen_at'] = False m['size_in_bytes'] = False m['number_of_items'] = False + # Server-stamped registration provenance (dynamic timestamp). + m['uploaded_at'] = False + m['uploaded_by'] = False assert compare_marked_nested(response, expected_response, marker) @@ -126,6 +132,9 @@ def test_query_dependency_graph_by_custom_nonexistant_keys(tmp_app_with_dependen m['frozen_at'] = False m['size_in_bytes'] = False m['number_of_items'] = False + # Server-stamped registration provenance (dynamic timestamp). + m['uploaded_at'] = False + m['uploaded_by'] = False assert compare_marked_nested(response, expected_response, marker) @@ -161,6 +170,9 @@ def test_generate_many_dependency_views(tmp_app_with_dependent_data, testing_fam m['frozen_at'] = False m['size_in_bytes'] = False m['number_of_items'] = False + # Server-stamped registration provenance (dynamic timestamp). + m['uploaded_at'] = False + m['uploaded_by'] = False for dependency_keys in dependency_keys_list: r = tmp_app_with_dependent_data.post( From 8e6f8d5a3c35079b9a8514ed62fc4af029d0d2a5 Mon Sep 17 00:00:00 2001 From: "Johannes L. Hoermann" Date: Wed, 17 Jun 2026 16:32:19 +0900 Subject: [PATCH 6/9] DOC: Correct graph route paths in README (/graph/lookup -> /graph/uuids) The blueprint registers the dependency-graph endpoints under /graph/uuids/ (see dserver_dependency_graph_plugin/__init__.py), but the README documented a non-existent /graph/lookup/ path in all three curl examples. Co-Authored-By: Claude Opus 4.8 --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 086ae2a..cb67cc7 100644 --- a/README.rst +++ b/README.rst @@ -151,7 +151,7 @@ graph by UUID is possible, i.e. .. 
code-block:: bash $ UUID=41a2e3e2-0c01-444f-bd7d-f9bb45512373 - $ curl -H "$HEADER" http://localhost:5000/graph/lookup/$UUID + $ curl -H "$HEADER" http://localhost:5000/graph/uuids/$UUID Looking up a dependency graph by UUID will result in unique per-UUID hits. As it is possible for a dataset to be registered in more than one base @@ -172,7 +172,7 @@ of desired dependency keys attached $ curl -H "$HEADER" -H "Content-Type: application/json" \ -X POST -d \ '["annotations.source_dataset_uuid","readme.derived_from.uuid"]' - http://localhost:5000/graph/lookup/$UUID + http://localhost:5000/graph/uuids/$UUID If a view for this particular set of keys does not exist yet, the server will generate and cache it on-the-fly. This can be observed in the mongo shell @@ -217,7 +217,7 @@ and querying with a specific set of keys for the first time $ curl -H "$HEADER" -H "Content-Type: application/json" \ -X POST -d \ '["another.possibly_nested.dependency_key"]' \ - http://localhost:5000/graph/lookup/$UUID + http://localhost:5000/graph/uuids/$UUID will result in an additional view named uniquely by the current UTC time:: From e387117dde866ab6d1176f53ef87257fb1f8eb7d Mon Sep 17 00:00:00 2001 From: "Johannes L. Hoermann" Date: Wed, 17 Jun 2026 16:32:19 +0900 Subject: [PATCH 7/9] CI: Modernize workflows and target jic-dtool plugin repositories Install dservercore, the search and the retrieve plugin from the jic-dtool mains, which carry the coordinated readme_parsed feature and the dservercore.utils_auth JWT helpers this branch relies on. Drop the dserver-direct-mongo-plugin install entirely: the dependency was removed in 8c317d2 and, as a custom extension, it overwrites the readme_parsed field and breaks the graph queries. Bump actions to current majors: checkout v6, setup-python v6, mongodb-github-action 1.12.1, upload-artifact v7, download-artifact v8, sigstore v3.4.0. Update the test matrix to Python 3.10-3.13 x MongoDB 6.0/7.0/8.0 and raise requires-python to >=3.10. 
Python 3.9 (and the EOL MongoDB 4.2-5.0) are dropped because jic-dtool/dservercore now requires Python >=3.10. Verified locally with act across the full matrix: all 12 cells (3.10-3.13 x 6.0/7.0/8.0) pass, 11 tests each. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/publish.yml | 14 +++++++------- .github/workflows/test.yml | 18 ++++++++---------- pyproject.toml | 2 +- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index bf5c306..f5ebed3 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -14,12 +14,12 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 - name: Set up Python 3.12 - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: 3.12 @@ -35,7 +35,7 @@ jobs: python -m build - name: Upload artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: python-package-distributions path: dist/ @@ -56,7 +56,7 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ @@ -83,7 +83,7 @@ jobs: steps: - name: Download artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ @@ -107,13 +107,13 @@ jobs: steps: - name: Download artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ - name: Sign with Sigstore - uses: sigstore/gh-action-sigstore-python@v3.0.0 + uses: sigstore/gh-action-sigstore-python@v3.4.0 with: inputs: >- ./dist/*.tar.gz diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e61e728..261ed15 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,24 +7,23 @@ jobs: runs-on: ubuntu-latest strategy: 
matrix: - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - mongodb-version: ['4.2', '4.4', '5.0', '6.0'] + python-version: ['3.10', '3.11', '3.12', '3.13'] + mongodb-version: ['6.0', '7.0', '8.0'] dservercore-version: [main] dserver-search-plugin-mongo-version: [main] dserver-retrieve-plugin-mongo-version: [main] - dserver-direct-mongo-plugin-version: [main] steps: - name: Git checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up MongoDB ${{ matrix.mongodb-version }} - uses: supercharge/mongodb-github-action@1.11.0 + uses: supercharge/mongodb-github-action@1.12.1 with: mongodb-version: ${{ matrix.mongodb-version }} - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} @@ -36,10 +35,9 @@ jobs: - name: Install server, search and retrieve plugins run: | - pip install git+https://github.com/livMatS/dservercore.git@${{ matrix.dservercore-version }} - pip install git+https://github.com/livMatS/dserver-search-plugin-mongo.git@${{ matrix.dserver-search-plugin-mongo-version }} - pip install git+https://github.com/livMatS/dserver-retrieve-plugin-mongo.git@${{ matrix.dserver-retrieve-plugin-mongo-version }} - pip install git+https://github.com/livMatS/dserver-direct-mongo-plugin.git@${{ matrix.dserver-direct-mongo-plugin-version }} + pip install git+https://github.com/jic-dtool/dservercore.git@${{ matrix.dservercore-version }} + pip install git+https://github.com/jic-dtool/dserver-search-plugin-mongo.git@${{ matrix.dserver-search-plugin-mongo-version }} + pip install git+https://github.com/jic-dtool/dserver-retrieve-plugin-mongo.git@${{ matrix.dserver-retrieve-plugin-mongo-version }} - name: Remaining requirements run: | diff --git a/pyproject.toml b/pyproject.toml index 27b687f..e11f57a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ {name = "Johannes L. 
Hörmann", email = "johannes.laurin@gmail.com"}, ] dynamic = ["version"] -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = [ "dtoolcore>=3.18.0", "dservercore>=0.20.0", From 00f1135157c1fc872e7f9e4ffd6aa928e5e21a94 Mon Sep 17 00:00:00 2001 From: "Johannes L. Hoermann" Date: Wed, 17 Jun 2026 16:35:14 +0900 Subject: [PATCH 8/9] DOC: Add CLAUDE.md with build/test commands and architecture overview Guidance file for Claude Code sessions: how to run the suite (MongoDB required), the dependency-view/bookkeeping architecture, the readme_parsed dependency keys, and the JS-operator query hardening boundary. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..d7a99e6 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,121 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What this is + +A `dservercore` extension plugin that lets a [dserver](https://github.com/livMatS/dservercore) +instance answer "give me every dataset in the same dependency graph as this UUID." +It registers a Flask blueprint at `/graph` and is wired into the host server through the +`dservercore.extension` entry point (`pyproject.toml`): +`DependencyGraphExtension = "dserver_dependency_graph_plugin:DependencyGraphExtension"`. + +The plugin does **not** register or own dataset metadata. It reads the dataset collection +that the search/retrieve mongo plugins populate, building MongoDB *views* on top of it. +`register_dataset` is intentionally a no-op. 
+ +## Commands + +```bash +# Install with test deps (needs the sibling dserver plugins, see test.yml for git installs) +pip install .[test] + +# Run the full test suite (requires a running MongoDB) +pytest -sv + +# Point tests at a non-default mongo +TEST_MONGO_URI=mongodb://localhost:27017/ pytest -sv + +# Single test +pytest tests/test_graph_routes.py::test_query_dependency_graph_by_default_keys -sv + +# Lint (matches CI) +flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics +``` + +Tests spin up a real dserver app via `create_app` and a temporary mongo database per run +(`conftest.py`). **A MongoDB server must be reachable** — there is no mocking layer. CI +(`.github/workflows/test.yml`) runs a matrix of Python 3.10–3.13 × MongoDB 6.0–8.0 and +installs `dservercore`, `dserver-search-plugin-mongo`, and `dserver-retrieve-plugin-mongo` +from their `main` branches first. + +Build backend is `flit_scm`; version is derived from git tags via `setuptools_scm` and +written to `dserver_dependency_graph_plugin/version.py` (do not edit by hand). + +## Routes (actual) + +Defined in `__init__.py` on `graph_bp` (`url_prefix="/graph"`): + +- `GET /graph/uuids/` — graph using server-default `DEPENDENCY_KEYS`. +- `POST /graph/uuids/` — body is a `DependencyKeysSchema` (`{"dependency_keys": [...]}`); + only honored when `DYNAMIC_DEPENDENCY_KEYS` is enabled, otherwise it silently falls back + to the defaults. + +Note: `README.rst` documents an older `/graph/lookup/` path — the code uses +`/graph/uuids/`. Trust the code. + +## Architecture / request flow + +The hard part is all MongoDB aggregation-pipeline construction. Two files carry it: + +- **`graph.py`** — pure pipeline builders (no DB calls), two distinct concerns: + 1.
`build_undirected_adjecency_lists(dependency_keys)` (sic — the name is misspelled in the code) builds the **view** definition: it unwinds + each configured dependency key into directed `(uuid → derived_from)` edges, then + emits *both* directions (`group_dependencies` + `group_inverse_dependencies`) so the + graph can be traversed forward and backward. Invalid edges are dropped by a + `UUID_v4_REGEX` `$match`. + 2. `query_dependency_graph(...)` builds the **query** pipeline: a `$graphLookup` over that + view starting from the requested uuid, re-joined (`$lookup`) against the real dataset + collection, with `pre_query`/`post_query` privilege filters and a final `$project` that + strips `readme`, `manifest`, `annotations`. + +- **`__init__.py`** — the stateful half. `DependencyGraphExtension.init_app` opens the + mongo client and stashes `client`/`db`/`collection` as **class variables** (so the + module-level route functions can reach the DB — see the NOTE comment there). + `dependency_graph_by_user_and_uuid` is the orchestrator: gates on `ENABLE_DEPENDENCY_VIEW`, + resolves a cached view via `_get_dependency_view_from_keys`, applies privilege filtering + through `dservercore`'s `_preprocess_privileges` + the local `_dict_to_mongo_query`, runs + the aggregation, and converts datetimes to float timestamps for the response. + +### View caching / bookkeeping + +Each distinct *set* of dependency keys gets its own materialized view named +`dep:<timestamp>` (e.g. `dep:2020-10-05T01:22:39.581592`). A bookkeeping +collection (`dep_views`) maps `keys → view name` with an `accessed_on` timestamp and acts +as an LRU: when count exceeds `MONGO_DEPENDENCY_VIEW_CACHE_SIZE`, the least-recently-accessed +view is dropped. `FORCE_REBUILD_DEPENDENCY_VIEW=True` drops and recreates the view on every +query (needed to pick up changes to `DEPENDENCY_KEYS`). All bookkeeping helpers are wrapped +by `@assert_dependency_view_bookkeeping_collection` which lazily creates that collection.
+ +### Security boundary in `utils.py` + +`utils.py` is a vendored copy of query-building helpers from `dserver-direct-mongo-plugin` +(deliberately copied to drop the runtime dependency). `_dict_to_mongo_query` can merge a +caller-supplied raw mongo `query`; `_assert_no_forbidden_operators` **recursively rejects** +`$where`, `$function`, `$accumulator` to block server-side JavaScript execution. Tests in +`test_raw_query_hardening.py` lock this down — keep that guarantee when editing. + +## Configuration (`config.py`) + +`Config` reads env vars at import time. Key behavioral switches (all `DSERVER_`-prefixed +env vars, parsed against `AFFIRMATIVE_EXPRESSIONS`): + +- `MONGO_URI` / `MONGO_DB` / `MONGO_COLLECTION` — **required**; `init_app` raises if absent. + Listed in `CONFIG_SECRETS_TO_OBFUSCATE` so the `/config/info` route never returns them clear-text. +- `DSERVER_ENABLE_DEPENDENCY_VIEW` (default True), `DSERVER_DYNAMIC_DEPENDENCY_KEYS` (default True), + `DSERVER_FORCE_REBUILD_DEPENDENCY_VIEW` (default False). +- `DSERVER_DEPENDENCY_KEYS` — JSON list (or bare string) of dotted paths to source UUIDs. + Default: `["readme_parsed.derived_from.uuid", "annotations.source_dataset_uuid"]`. Nesting + hierarchy is irrelevant; the dot-path is just unwound. Note it traverses `readme_parsed` + (the server-parsed README), **not** raw `readme` — a string README breaks traversal. + +## Conventions / gotchas + +- A dataset is truly identified by `(uuid, base_uri)`, but the graph is keyed on `uuid` + alone — duplicate registrations of one uuid across base URIs yield one arbitrary hit + (see the `TODO` in `graph.py`). +- Privilege filtering happens **twice** (pre- and post-graph-traversal) so a user who lacks + access to part of a graph gets a truncated/disconnected result rather than a leak. 
+- When changing the response shape, update the field-exclusion markers in + `tests/test_graph_routes.py` (server-stamped fields like `created_at`, `frozen_at`, + `uploaded_at`, `uploaded_by` are excluded from comparison). From e3fa22b905aa26bf66f8fea9de31a2621b25e235 Mon Sep 17 00:00:00 2001 From: "Johannes L. Hoermann" Date: Wed, 17 Jun 2026 16:38:58 +0900 Subject: [PATCH 9/9] BUILD: Stop tracking setuptools_scm-generated version.py version.py is the setuptools_scm write_to target and is regenerated on every build, so it does not belong in VCS. Remove it from the index and add it to .gitignore. Co-Authored-By: Claude Opus 4.8 --- .gitignore | 3 +++ dserver_dependency_graph_plugin/version.py | 24 ---------------------- 2 files changed, 3 insertions(+), 24 deletions(-) delete mode 100644 dserver_dependency_graph_plugin/version.py diff --git a/.gitignore b/.gitignore index f03519f..776f3d3 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,6 @@ dist/* venv/* old-provision/* jwt-spike/* + +# Auto-generated by setuptools_scm (write_to target) +dserver_dependency_graph_plugin/version.py diff --git a/dserver_dependency_graph_plugin/version.py b/dserver_dependency_graph_plugin/version.py deleted file mode 100644 index d5f9652..0000000 --- a/dserver_dependency_graph_plugin/version.py +++ /dev/null @@ -1,24 +0,0 @@ -# file generated by vcs-versioning -# don't change, don't track in version control -from __future__ import annotations - -__all__ = [ - "__version__", - "__version_tuple__", - "version", - "version_tuple", - "__commit_id__", - "commit_id", -] - -version: str -__version__: str -__version_tuple__: tuple[int | str, ...] -version_tuple: tuple[int | str, ...] -commit_id: str | None -__commit_id__: str | None - -__version__ = version = '0.4.3.dev4' -__version_tuple__ = version_tuple = (0, 4, 3, 'dev4') - -__commit_id__ = commit_id = 'g4d201d6b5'