diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index bf5c306..f5ebed3 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -14,12 +14,12 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 - name: Set up Python 3.12 - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: 3.12 @@ -35,7 +35,7 @@ jobs: python -m build - name: Upload artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: python-package-distributions path: dist/ @@ -56,7 +56,7 @@ jobs: steps: - name: Download all the dists - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ @@ -83,7 +83,7 @@ jobs: steps: - name: Download artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ @@ -107,13 +107,13 @@ jobs: steps: - name: Download artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 with: name: python-package-distributions path: dist/ - name: Sign with Sigstore - uses: sigstore/gh-action-sigstore-python@v3.0.0 + uses: sigstore/gh-action-sigstore-python@v3.4.0 with: inputs: >- ./dist/*.tar.gz diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e61e728..261ed15 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,24 +7,23 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - mongodb-version: ['4.2', '4.4', '5.0', '6.0'] + python-version: ['3.10', '3.11', '3.12', '3.13'] + mongodb-version: ['6.0', '7.0', '8.0'] dservercore-version: [main] dserver-search-plugin-mongo-version: [main] dserver-retrieve-plugin-mongo-version: [main] - dserver-direct-mongo-plugin-version: [main] steps: - name: Git checkout - uses: actions/checkout@v4 + uses: 
actions/checkout@v6 - name: Set up MongoDB ${{ matrix.mongodb-version }} - uses: supercharge/mongodb-github-action@1.11.0 + uses: supercharge/mongodb-github-action@1.12.1 with: mongodb-version: ${{ matrix.mongodb-version }} - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} @@ -36,10 +35,9 @@ jobs: - name: Install server, search and retrieve plugins run: | - pip install git+https://github.com/livMatS/dservercore.git@${{ matrix.dservercore-version }} - pip install git+https://github.com/livMatS/dserver-search-plugin-mongo.git@${{ matrix.dserver-search-plugin-mongo-version }} - pip install git+https://github.com/livMatS/dserver-retrieve-plugin-mongo.git@${{ matrix.dserver-retrieve-plugin-mongo-version }} - pip install git+https://github.com/livMatS/dserver-direct-mongo-plugin.git@${{ matrix.dserver-direct-mongo-plugin-version }} + pip install git+https://github.com/jic-dtool/dservercore.git@${{ matrix.dservercore-version }} + pip install git+https://github.com/jic-dtool/dserver-search-plugin-mongo.git@${{ matrix.dserver-search-plugin-mongo-version }} + pip install git+https://github.com/jic-dtool/dserver-retrieve-plugin-mongo.git@${{ matrix.dserver-retrieve-plugin-mongo-version }} - name: Remaining requirements run: | diff --git a/.gitignore b/.gitignore index f03519f..776f3d3 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,6 @@ dist/* venv/* old-provision/* jwt-spike/* + +# Auto-generated by setuptools_scm (write_to target) +dserver_dependency_graph_plugin/version.py diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..d7a99e6 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,121 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
+ +## What this is + +A `dservercore` extension plugin that lets a [dserver](https://github.com/livMatS/dservercore) +instance answer "give me every dataset in the same dependency graph as this UUID." +It registers a Flask blueprint at `/graph` and is wired into the host server through the +`dservercore.extension` entry point (`pyproject.toml`): +`DependencyGraphExtension = "dserver_dependency_graph_plugin:DependencyGraphExtension"`. + +The plugin does **not** register or own dataset metadata. It reads the dataset collection +that the search/retrieve mongo plugins populate, building MongoDB *views* on top of it. +`register_dataset` is intentionally a no-op. + +## Commands + +```bash +# Install with test deps (needs the sibling dserver plugins, see test.yml for git installs) +pip install .[test] + +# Run the full test suite (requires a running MongoDB) +pytest -sv + +# Point tests at a non-default mongo +TEST_MONGO_URI=mongodb://localhost:27017/ pytest -sv + +# Single test +pytest tests/test_graph_routes.py::test_query_dependency_graph_by_default_keys -sv + +# Lint (matches CI) +flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics +``` + +Tests spin up a real dserver app via `create_app` and a temporary mongo database per run +(`conftest.py`). **A MongoDB server must be reachable** — there is no mocking layer. CI +(`.github/workflows/test.yml`) runs a matrix of Python 3.10–3.13 × MongoDB 6.0–8.0 and +installs `dservercore`, `dserver-search-plugin-mongo`, and +`dserver-retrieve-plugin-mongo` from their `main` branches first. + +Build backend is `flit_scm`; version is derived from git tags via `setuptools_scm` and +written to `dserver_dependency_graph_plugin/version.py` (do not edit by hand). + +## Routes (actual) + +Defined in `__init__.py` on `graph_bp` (`url_prefix="/graph"`): + +- `GET /graph/uuids/<uuid>` — graph using server-default `DEPENDENCY_KEYS`. 
+- `POST /graph/uuids/<uuid>` — body is a `DependencyKeysSchema` (`{"dependency_keys": [...]}`); + only honored when `DYNAMIC_DEPENDENCY_KEYS` is enabled, otherwise it silently falls back + to the defaults. + +Note: `README.rst` used to document a `/graph/lookup/` path; the routes actually live under +`/graph/uuids/`. If docs and code disagree, trust the code. + +## Architecture / request flow + +The hard part is all MongoDB aggregation-pipeline construction. Two files carry it: + +- **`graph.py`** — pure pipeline builders (no DB calls), two distinct concerns: + 1. `build_undirected_adjecency_lists(keys)` (sic) builds the **view** definition: it unwinds + each configured dependency key into directed `(uuid → derived_from)` edges, then + emits *both* directions (`group_dependencies` + `group_inverse_dependencies`) so the + graph can be traversed forward and backward. Invalid edges are dropped by a + `UUID_v4_REGEX` `$match`. + 2. `query_dependency_graph(...)` builds the **query** pipeline: a `$graphLookup` over that + view starting from the requested uuid, re-joined (`$lookup`) against the real dataset + collection, with `pre_query`/`post_query` privilege filters and a final `$project` that + strips `readme`, `manifest`, `annotations`. + +- **`__init__.py`** — the stateful half. `DependencyGraphExtension.init_app` opens the + mongo client and stashes `client`/`db`/`collection` as **class variables** (so the + module-level route functions can reach the DB — see the NOTE comment there). + `dependency_graph_by_user_and_uuid` is the orchestrator: gates on `ENABLE_DEPENDENCY_VIEW`, + resolves a cached view via `_get_dependency_view_from_keys`, applies privilege filtering + through `dservercore`'s `_preprocess_privileges` + the local `_dict_to_mongo_query`, runs + the aggregation, and converts datetimes to float timestamps for the response. + +### View caching / bookkeeping + +Each distinct *set* of dependency keys gets its own view named +`<MONGO_DEPENDENCY_VIEW_PREFIX><UTC ISO timestamp>` (e.g. `dep:2020-10-05T01:22:39.581592`). 
A bookkeeping +collection (`dep_views`) maps `keys → view name` with an `accessed_on` timestamp and acts +as an LRU: when count exceeds `MONGO_DEPENDENCY_VIEW_CACHE_SIZE`, the least-recently-accessed +view is dropped. `FORCE_REBUILD_DEPENDENCY_VIEW=True` drops and recreates the view on every +query (needed to pick up changes to `DEPENDENCY_KEYS`). All bookkeeping helpers are wrapped +by `@assert_dependency_view_bookkeeping_collection` which lazily creates that collection. + +### Security boundary in `utils.py` + +`utils.py` is a vendored copy of query-building helpers from `dserver-direct-mongo-plugin` +(deliberately copied to drop the runtime dependency). `_dict_to_mongo_query` can merge a +caller-supplied raw mongo `query`; `_assert_no_forbidden_operators` **recursively rejects** +`$where`, `$function`, `$accumulator` to block server-side JavaScript execution. Tests in +`test_raw_query_hardening.py` lock this down — keep that guarantee when editing. + +## Configuration (`config.py`) + +`Config` reads env vars at import time. Key settings (the boolean switches are `DSERVER_`-prefixed +env vars parsed against `AFFIRMATIVE_EXPRESSIONS`; the `MONGO_*` settings are read unprefixed): + +- `MONGO_URI` / `MONGO_DB` / `MONGO_COLLECTION` — **required**; `init_app` raises if absent. + Listed in `CONFIG_SECRETS_TO_OBFUSCATE` so the `/config/info` route never returns them clear-text. +- `DSERVER_ENABLE_DEPENDENCY_VIEW` (default True), `DSERVER_DYNAMIC_DEPENDENCY_KEYS` (default True), + `DSERVER_FORCE_REBUILD_DEPENDENCY_VIEW` (default False). +- `DSERVER_DEPENDENCY_KEYS` — JSON list (or bare string) of dotted paths to source UUIDs. + Default: `["readme_parsed.derived_from.uuid", "annotations.source_dataset_uuid"]`. Nesting + hierarchy is irrelevant; the dot-path is just unwound. Note it traverses `readme_parsed` + (the server-parsed README), **not** raw `readme` — a string README breaks traversal. 
+ +## Conventions / gotchas + +- A dataset is truly identified by `(uuid, base_uri)`, but the graph is keyed on `uuid` + alone — duplicate registrations of one uuid across base URIs yield one arbitrary hit + (see the `TODO` in `graph.py`). +- Privilege filtering happens **twice** (pre- and post-graph-traversal) so a user who lacks + access to part of a graph gets a truncated/disconnected result rather than a leak. +- When changing the response shape, update the field-exclusion markers in + `tests/test_graph_routes.py` (server-stamped fields like `created_at`, `frozen_at`, + `uploaded_at`, `uploaded_by` are excluded from comparison). diff --git a/README.rst b/README.rst index 086ae2a..cb67cc7 100644 --- a/README.rst +++ b/README.rst @@ -151,7 +151,7 @@ graph by UUID is possible, i.e. .. code-block:: bash $ UUID=41a2e3e2-0c01-444f-bd7d-f9bb45512373 - $ curl -H "$HEADER" http://localhost:5000/graph/lookup/$UUID + $ curl -H "$HEADER" http://localhost:5000/graph/uuids/$UUID Looking up a dependency graph by UUID will result in unique per-UUID hits. As it is possible for a dataset to be registered in more than one base @@ -172,7 +172,7 @@ of desired dependency keys attached $ curl -H "$HEADER" -H "Content-Type: application/json" \ -X POST -d \ '["annotations.source_dataset_uuid","readme.derived_from.uuid"]' - http://localhost:5000/graph/lookup/$UUID + http://localhost:5000/graph/uuids/$UUID If a view for this particular set of keys does not exist yet, the server will generate and cache it on-the-fly. 
This can be observed in the mongo shell @@ -217,7 +217,7 @@ and querying with a specific set of keys for the first time $ curl -H "$HEADER" -H "Content-Type: application/json" \ -X POST -d \ '["another.possibly_nested.dependency_key"]' \ - http://localhost:5000/graph/lookup/$UUID + http://localhost:5000/graph/uuids/$UUID will result in an additional view named uniquely by the current UTC time:: diff --git a/dserver_dependency_graph_plugin/__init__.py b/dserver_dependency_graph_plugin/__init__.py index 3f9c937..f0e8b24 100644 --- a/dserver_dependency_graph_plugin/__init__.py +++ b/dserver_dependency_graph_plugin/__init__.py @@ -33,7 +33,7 @@ from dservercore import AuthenticationError, ExtensionABC from dservercore.sql_models import DatasetSchema from dservercore.utils import _preprocess_privileges -from dserver_direct_mongo_plugin.utils import _dict_to_mongo_query +from .utils import _dict_to_mongo_query from .schemas import DependencyKeysSchema @@ -116,7 +116,8 @@ def _get_dependency_view_bookkeeping_record(dependency_keys): @assert_dependency_view_bookkeeping_collection def _create_dependency_view_bookkeeping_record(name, dependency_keys): ret = DependencyGraphExtension.db[Config.MONGO_DEPENDENCY_VIEW_BOOKKEEPING].insert_one( - {'name': name, 'keys': dependency_keys, 'accessed_on': datetime.datetime.utcnow()}) + {'name': name, 'keys': dependency_keys, + 'accessed_on': datetime.datetime.now(datetime.timezone.utc)}) # drop oldest entry if number of documents exceeds allowed maximum count = DependencyGraphExtension.db[Config.MONGO_DEPENDENCY_VIEW_BOOKKEEPING].count_documents({}) if count > Config.MONGO_DEPENDENCY_VIEW_CACHE_SIZE: @@ -135,7 +136,8 @@ def _create_dependency_view_bookkeeping_record(name, dependency_keys): def _update_dependency_view_bookkeeping_record(name): """Updated record to dependency view bookkeeping collection or add if new.""" return DependencyGraphExtension.db[Config.MONGO_DEPENDENCY_VIEW_BOOKKEEPING].update_one( - {'name': name}, {'$set': 
{'accessed_on': datetime.datetime.utcnow()}}) + {'name': name}, + {'$set': {'accessed_on': datetime.datetime.now(datetime.timezone.utc)}}) # mid-level dependency view helpers @@ -146,7 +148,8 @@ def _create_dependency_view(dependency_keys): :returns: str""" # generate unique, valid name for view from prefix and ISO date string - datestring = datetime.datetime.utcnow().isoformat() + datestring = datetime.datetime.now( + datetime.timezone.utc).replace(tzinfo=None).isoformat() name = Config.MONGO_DEPENDENCY_VIEW_PREFIX + datestring if name in DependencyGraphExtension.db.list_collection_names(): @@ -246,7 +249,8 @@ def dependency_graph_by_user_and_uuid(username, uuid, dependency_keys=Config.DEP mongo_aggregation = query_dependency_graph(pre_query=pre_query, post_query=post_query, dependency_keys=dependency_keys, - mongo_dependency_view=dependency_view) + mongo_dependency_view=dependency_view, + mongo_collection=current_app.config['MONGO_COLLECTION']) logger.debug("Constructed mongo aggregation: {}".format(mongo_aggregation)) cx = DependencyGraphExtension.db[current_app.config['MONGO_COLLECTION']].aggregate(mongo_aggregation) diff --git a/dserver_dependency_graph_plugin/config.py b/dserver_dependency_graph_plugin/config.py index d50ca9c..c837b7b 100644 --- a/dserver_dependency_graph_plugin/config.py +++ b/dserver_dependency_graph_plugin/config.py @@ -4,10 +4,19 @@ AFFIRMATIVE_EXPRESSIONS = ['true', '1', 'y', 'yes', 'on'] -CONFIG_SECRETS_TO_OBFUSCATE = [] +CONFIG_SECRETS_TO_OBFUSCATE = [ + "MONGO_URI", + "MONGO_DB", + "MONGO_COLLECTION", +] class Config(object): + # MongoDB connection settings + # These are required for the dependency graph plugin to connect to MongoDB + MONGO_URI = os.environ.get("MONGO_URI") + MONGO_DB = os.environ.get("MONGO_DB") + MONGO_COLLECTION = os.environ.get("MONGO_COLLECTION") # If enabled, the underlying database will offer dependency graph views on # the server's default collection. 
Those views offer on-the-fly-generated # collections of undirected per-dataset adjacency lists in order to @@ -34,7 +43,7 @@ class Config(object): # a single key or a JSON-formatted list of keys. # Nested fields are separated by a dot (.) DEPENDENCY_KEYS = [ - 'readme.derived_from.uuid', + 'readme_parsed.derived_from.uuid', 'annotations.source_dataset_uuid' ] diff --git a/dserver_dependency_graph_plugin/graph.py b/dserver_dependency_graph_plugin/graph.py index 6b3aa6e..68eefd5 100644 --- a/dserver_dependency_graph_plugin/graph.py +++ b/dserver_dependency_graph_plugin/graph.py @@ -1,7 +1,6 @@ """Aggregation pipelines for graph operations.""" -from dserver_dependency_graph_plugin.config import Config as dependency_graph_plugin_config -from dserver_direct_mongo_plugin.config import Config as direct_mongo_plugin_config +from .config import Config # a regular expression to filter valid v4 UUIDs UUID_v4_REGEX = '[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[4][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}' @@ -9,7 +8,7 @@ # most of those 'functions' are pretty static and just wrapped in function # definitions for convenience. -def unwind_dependencies(dependency_keys=dependency_graph_plugin_config.DEPENDENCY_KEYS): +def unwind_dependencies(dependency_keys=Config.DEPENDENCY_KEYS): """Create parallel aggregation pipelines for unwinding all configured dependency keys.""" parallel_aggregations = [] @@ -41,7 +40,7 @@ def unwind_dependencies(dependency_keys=dependency_graph_plugin_config.DEPENDENC return parallel_aggregations -def merge_dependencies(dependency_keys=dependency_graph_plugin_config.DEPENDENCY_KEYS): +def merge_dependencies(dependency_keys=Config.DEPENDENCY_KEYS): """Aggregate (directed) dependency graph edges. 
All configured dependency keys are merged in a key-agnostic 'dependencies' @@ -117,7 +116,7 @@ def group_inverse_dependencies(): return aggregation -def build_undirected_adjecency_lists(dependency_keys=dependency_graph_plugin_config.DEPENDENCY_KEYS): +def build_undirected_adjecency_lists(dependency_keys=Config.DEPENDENCY_KEYS): """Aggregate undirected adjacency lists.""" aggregation = [ *merge_dependencies(dependency_keys), @@ -200,8 +199,8 @@ def build_undirected_adjecency_lists(dependency_keys=dependency_graph_plugin_con # behavior would be to yield all redundant dataset entries for a uuid. def query_dependency_graph(mongo_dependency_view, pre_query, post_query=None, - dependency_keys=dependency_graph_plugin_config.DEPENDENCY_KEYS, - mongo_collection=direct_mongo_plugin_config.MONGO_COLLECTION): + dependency_keys=Config.DEPENDENCY_KEYS, + mongo_collection=None): """Aggregation pipeline for querying dependency view on datasets collection. :param pre_query: selects all documents for whicht to query the dependency graph. diff --git a/dserver_dependency_graph_plugin/utils.py b/dserver_dependency_graph_plugin/utils.py new file mode 100644 index 0000000..35917fc --- /dev/null +++ b/dserver_dependency_graph_plugin/utils.py @@ -0,0 +1,142 @@ +"""Utility functions for MongoDB query construction. + +These functions were originally part of dserver-direct-mongo-plugin but are +copied here to decouple the dependency-graph-plugin from that package. +""" + +import logging + +logger = logging.getLogger(__name__) + + +VALID_MONGO_QUERY_KEYS = ( + "free_text", + "creator_usernames", + "base_uris", + "uuids", + "tags", +) + +MONGO_QUERY_LIST_KEYS = ( + "creator_usernames", + "base_uris", + "uuids", + "tags", +) + + +def _dict_to_mongo(query_dict): + """Convert a query dictionary to a MongoDB query. 
+ + :param query_dict: Dictionary with query parameters + :returns: MongoDB query dictionary + """ + def _sanitise(query_dict): + for key in list(query_dict.keys()): + if key not in VALID_MONGO_QUERY_KEYS: + del query_dict[key] + for lk in MONGO_QUERY_LIST_KEYS: + if lk in query_dict: + if len(query_dict[lk]) == 0: + del query_dict[lk] + + def _deal_with_possible_or_statment(a_list, key): + if len(a_list) == 1: + return {key: a_list[0]} + else: + return {"$or": [{key: v} for v in a_list]} + + def _deal_with_possible_and_statement(a_list, key): + if len(a_list) == 1: + return {key: a_list[0]} + else: + return {key: {"$all": a_list}} + + _sanitise(query_dict) + + sub_queries = [] + if "free_text" in query_dict: + sub_queries.append({"$text": {"$search": query_dict["free_text"]}}) + if "creator_usernames" in query_dict: + sub_queries.append( + _deal_with_possible_or_statment( + query_dict["creator_usernames"], "creator_username" + ) + ) + if "base_uris" in query_dict: + sub_queries.append( + _deal_with_possible_or_statment(query_dict["base_uris"], "base_uri") + ) + if "uuids" in query_dict: + sub_queries.append(_deal_with_possible_or_statment(query_dict["uuids"], "uuid")) + if "tags" in query_dict: + sub_queries.append( + _deal_with_possible_and_statement(query_dict["tags"], "tags") + ) + + if len(sub_queries) == 0: + return {} + elif len(sub_queries) == 1: + return sub_queries[0] + else: + return {"$and": [q for q in sub_queries]} + + +# MongoDB operators that execute server-side JavaScript. A raw query +# containing any of these could run arbitrary code on the database server. +_FORBIDDEN_MONGO_OPERATORS = frozenset( + ("$where", "$function", "$accumulator")) + + +def _assert_no_forbidden_operators(raw_mongo): + """Recursively reject MongoDB operators that execute JavaScript. 
+ + :raises: ValueError if a forbidden operator occurs anywhere in the query + """ + if isinstance(raw_mongo, dict): + for key, value in raw_mongo.items(): + if key in _FORBIDDEN_MONGO_OPERATORS: + raise ValueError( + f"Operator '{key}' is not allowed in raw MongoDB queries") + _assert_no_forbidden_operators(value) + elif isinstance(raw_mongo, (list, tuple)): + for item in raw_mongo: + _assert_no_forbidden_operators(item) + + +def _dict_to_mongo_query(query_dict): + """Construct mongo query, allowing embedding of a raw mongo query. + + Converts a query dictionary to a MongoDB query format. If the query_dict + contains a 'query' key with a dict value, that raw MongoDB query is + merged with the constructed query. + + :param query_dict: Dictionary with query parameters. May contain: + - free_text: Text search string + - creator_usernames: List of creator usernames + - base_uris: List of base URIs + - uuids: List of UUIDs + - tags: List of tags + - query: Raw MongoDB query dict (optional) + :returns: MongoDB query dictionary + :raises: ValueError if the raw query contains JavaScript-executing + operators ($where, $function, $accumulator) + """ + if "query" in query_dict and isinstance(query_dict["query"], dict): + raw_mongo = query_dict["query"] + _assert_no_forbidden_operators(raw_mongo) + del query_dict["query"] + else: + raw_mongo = {} + + mongo_query = _dict_to_mongo(query_dict) + + if len(raw_mongo) > 0 and len(mongo_query) == 0: + mongo_query = raw_mongo + elif len(raw_mongo) > 0 and len(mongo_query) == 1 and "$and" in mongo_query: + mongo_query["$and"].append(raw_mongo) + elif len(raw_mongo) > 0: + mongo_query = {"$and": [mongo_query, raw_mongo]} + + logger.debug("Constructed mongo query: {}".format(mongo_query)) + return mongo_query diff --git a/pyproject.toml b/pyproject.toml index d168a75..e11f57a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,21 +1,22 @@ [build-system] -requires = ["setuptools>=42", "setuptools_scm[toml]>=6.3"] -build-backend = 
"setuptools.build_meta" +requires = ["flit_scm"] +build-backend = "flit_scm:buildapi" [project] name = "dserver-dependency-graph-plugin" description = "dserver plugin for receiving s3 notifications on updated objects." readme = "README.rst" -license = {file = "LICENSE"} +license = {text = "MIT"} authors = [ {name = "Johannes L. Hörmann", email = "johannes.laurin@gmail.com"}, ] dynamic = ["version"] +requires-python = ">=3.10" dependencies = [ - "dtoolcore>=3.18.0", - "dservercore>=0.20.0", - "dserver-direct-mongo-plugin", - ] + "dtoolcore>=3.18.0", + "dservercore>=0.20.0", + "pymongo", +] [project.optional-dependencies] test = [ @@ -31,13 +32,17 @@ Documentation = "https://github.com/livMatS/dserver-dependency-graph-plugin/blob Repository = "https://github.com/livMatS/dserver-dependency-graph-plugin" Changelog = "https://github.com/livMatS/dserver-dependency-graph-plugin/blob/main/CHANGELOG.rst" +[project.entry-points."dservercore.extension"] +DependencyGraphExtension = "dserver_dependency_graph_plugin:DependencyGraphExtension" + +[tool.flit.module] +name = "dserver_dependency_graph_plugin" + [tool.setuptools_scm] version_scheme = "guess-next-dev" local_scheme = "no-local-version" write_to = "dserver_dependency_graph_plugin/version.py" -[tool.setuptools] -packages = ["dserver_dependency_graph_plugin"] - -[project.entry-points."dservercore.extension"] -"DependencyGraphExtension" = "dserver_dependency_graph_plugin:DependencyGraphExtension" +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "--cov=dserver_dependency_graph_plugin --cov-report=term-missing" diff --git a/tests/conftest.py b/tests/conftest.py index d3f7a10..9d7b8c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -163,19 +163,19 @@ def tmp_app_with_users(request): "MONGO_DEPENDENCY_VIEW_CACHE_SIZE": 10, "FORCE_REBUILD_DEPENDENCY_VIEW": False, "DEPENDENCY_KEYS": [ - 'readme.derived_from.uuid', + 'readme_parsed.derived_from.uuid', 'annotations.source_dataset_uuid' ], 
"DYNAMIC_DEPENDENCY_KEYS": True, - "RETRIEVE_MONGO_URI": "mongodb://localhost:27017/", + "RETRIEVE_MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "RETRIEVE_MONGO_DB": tmp_mongo_db_name, "RETRIEVE_MONGO_COLLECTION": "datasets", - "SEARCH_MONGO_URI": "mongodb://localhost:27017/", + "SEARCH_MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "SEARCH_MONGO_DB": tmp_mongo_db_name, "SEARCH_MONGO_COLLECTION": "datasets", - "MONGO_URI": "mongodb://localhost:27017/", + "MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "MONGO_DB": tmp_mongo_db_name, - "MONGO_COLLECTION": "metadata", + "MONGO_COLLECTION": "datasets", "SQLALCHEMY_DATABASE_URI": "sqlite:///:memory:", "SQLALCHEMY_TRACK_MODIFICATIONS": False, "JWT_ALGORITHM": "RS256", @@ -239,15 +239,15 @@ def tmp_app_with_dependent_data(request): "OPENAPI_VERSION": '3.0.2', "FLASK_ENV": "development", "CONFIG_SECRETS_TO_OBFUSCATE": [], - "RETRIEVE_MONGO_URI": "mongodb://localhost:27017/", + "RETRIEVE_MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "RETRIEVE_MONGO_DB": tmp_mongo_db_name, "RETRIEVE_MONGO_COLLECTION": "datasets", - "SEARCH_MONGO_URI": "mongodb://localhost:27017/", + "SEARCH_MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "SEARCH_MONGO_DB": tmp_mongo_db_name, "SEARCH_MONGO_COLLECTION": "datasets", - "MONGO_URI": "mongodb://localhost:27017/", + "MONGO_URI": os.environ.get("TEST_MONGO_URI", "mongodb://localhost:27017/"), "MONGO_DB": tmp_mongo_db_name, - "MONGO_COLLECTION": "metadata", + "MONGO_COLLECTION": "datasets", "SQLALCHEMY_DATABASE_URI": "sqlite:///:memory:", "SQLALCHEMY_TRACK_MODIFICATIONS": False, "JWT_ALGORITHM": "RS256", diff --git a/tests/test_config_routes.py b/tests/test_config_routes.py index 07237e5..062e9bc 100644 --- a/tests/test_config_routes.py +++ b/tests/test_config_routes.py @@ -20,7 +20,7 @@ def test_config_info_route(tmp_app_with_users, snowwhite_token): # 
NOQA assert r.status_code == 200 expected_response = { - 'dependency_keys': ['readme.derived_from.uuid', + 'dependency_keys': ['readme_parsed.derived_from.uuid', 'annotations.source_dataset_uuid'], 'dynamic_dependency_keys': True, 'enable_dependency_view': True, diff --git a/tests/test_graph_routes.py b/tests/test_graph_routes.py index fc1631e..d7af43a 100644 --- a/tests/test_graph_routes.py +++ b/tests/test_graph_routes.py @@ -40,6 +40,9 @@ def test_query_dependency_graph_by_default_keys(tmp_app_with_dependent_data, tes m['frozen_at'] = False m['size_in_bytes'] = False m['number_of_items'] = False + # Server-stamped registration provenance (dynamic timestamp). + m['uploaded_at'] = False + m['uploaded_by'] = False assert compare_marked_nested(response, expected_response, marker) @@ -50,7 +53,7 @@ def test_query_dependency_graph_by_custom_keys(tmp_app_with_dependent_data, test uuid = "a2218059-5bd0-4690-b090-062faf08e044" # brother - dependency_keys = ["readme.derived_from.uuid", "some_nonexistant_key"] + dependency_keys = ["readme_parsed.derived_from.uuid", "some_nonexistant_key"] r = tmp_app_with_dependent_data.post( "/graph/uuids/{}".format(uuid), @@ -83,6 +86,9 @@ def test_query_dependency_graph_by_custom_keys(tmp_app_with_dependent_data, test m['frozen_at'] = False m['size_in_bytes'] = False m['number_of_items'] = False + # Server-stamped registration provenance (dynamic timestamp). + m['uploaded_at'] = False + m['uploaded_by'] = False assert compare_marked_nested(response, expected_response, marker) @@ -126,6 +132,9 @@ def test_query_dependency_graph_by_custom_nonexistant_keys(tmp_app_with_dependen m['frozen_at'] = False m['size_in_bytes'] = False m['number_of_items'] = False + # Server-stamped registration provenance (dynamic timestamp). 
+ m['uploaded_at'] = False + m['uploaded_by'] = False assert compare_marked_nested(response, expected_response, marker) @@ -137,7 +146,7 @@ def test_generate_many_dependency_views(tmp_app_with_dependent_data, testing_fam uuid = "a2218059-5bd0-4690-b090-062faf08e044" # brother dependency_keys_list = [ - ["readme.derived_from.uuid", "some_nonexistant_key_{}".format(i)] for i in range(12)] + ["readme_parsed.derived_from.uuid", "some_nonexistant_key_{}".format(i)] for i in range(12)] expected_response = [] for role, p in testing_family.items(): @@ -161,6 +170,9 @@ def test_generate_many_dependency_views(tmp_app_with_dependent_data, testing_fam m['frozen_at'] = False m['size_in_bytes'] = False m['number_of_items'] = False + # Server-stamped registration provenance (dynamic timestamp). + m['uploaded_at'] = False + m['uploaded_by'] = False for dependency_keys in dependency_keys_list: r = tmp_app_with_dependent_data.post( diff --git a/tests/test_raw_query_hardening.py b/tests/test_raw_query_hardening.py new file mode 100644 index 0000000..50325a3 --- /dev/null +++ b/tests/test_raw_query_hardening.py @@ -0,0 +1,30 @@ +"""Audit hardening: raw MongoDB queries merged into constructed queries +must not be able to execute server-side JavaScript.""" + +import pytest + +from dserver_dependency_graph_plugin.utils import _dict_to_mongo_query + + +def test_benign_raw_query_is_merged(): + query = _dict_to_mongo_query({ + "base_uris": ["s3://bucket"], + "query": {"readme.project": "test"}, + }) + assert "readme.project" in str(query) + + +@pytest.mark.parametrize("malicious", [ + {"$where": "sleep(10000)"}, + {"$expr": {"$function": {"body": "function(){}", "args": [], + "lang": "js"}}}, + {"nested": {"$where": "this.a == 1"}}, + {"$or": [{"a": 1}, {"$where": "true"}]}, + {"group": {"$accumulator": {"init": "function(){}"}}}, +]) +def test_javascript_operators_rejected(malicious): + with pytest.raises(ValueError): + _dict_to_mongo_query({ + "base_uris": ["s3://bucket"], + "query": 
malicious, + })