Add pre-baked Cassandra Docker image for faster test startup

Tommel71 · Tommel71 · commit 02ca4958ae11 · 2026-01-27T10:59:48.000+01:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -40,6 +40,7 @@ repos:
         exclude: |
             (?x)^(
                 tests/cassandra/insert.py|
+                tests/cassandra/init-schemas.py|
                 tests/tagstore/insert.py|
                 tests/txs_pagesize_tester.py|
                 tests/txs_pagesize_tests.py
diff --git a/Makefile b/Makefile
@@ -18,6 +18,14 @@ WORKTREE_DIR ?= ../.graphsense-rest-old
 test: install-dev
 	uv run pytest -x -rx -vv
 
+# Build the pre-baked Cassandra test image from tests/cassandra/Dockerfile.
+build-test-cassandra:
+	docker build -t graphsense/cassandra-test:4.1.4 tests/cassandra/
+
+# Remove the pre-baked Cassandra test image so that it has to be rebuilt.
+test-reset-cassandra-image:
+	docker rmi graphsense/cassandra-test:4.1.4
+
 test-regression:
 	@export SKIP_REST_CONTAINER_SETUP=True && uv run pytest -m "regression" -s
 
@@ -97,4 +105,4 @@ tag-version: ensure-versions-alignment
 	-git diff --exit-code && git diff --staged --exit-code && git tag -a v$(GS_REST_SERVICE_VERSIONM) -m 'Release v$(GS_REST_SERVICE_VERSION)' || (echo "Repo is dirty please commit first" && exit 1)
 	git diff --exit-code && git diff --staged --exit-code && git tag -a v$(GS_REST_SERVICE_VERSION) -m 'Release v$(GS_REST_SERVICE_VERSION)' || (echo "Repo is dirty please commit first" && exit 1)
 
-.PHONY: format lint test ensure-versions-alignment run-codegen serve serve-docker pre-commit install-dev tag-version generate-python-client build-docker test-migration setup-migration-worktree clean-migration-worktree serve-old serve-new test-regression
+.PHONY: format lint test ensure-versions-alignment run-codegen serve serve-docker pre-commit install-dev tag-version generate-python-client build-docker build-test-cassandra test-reset-cassandra-image test-migration setup-migration-worktree clean-migration-worktree serve-old serve-new test-regression
diff --git a/tests/cassandra/Dockerfile b/tests/cassandra/Dockerfile
@@ -0,0 +1,45 @@
+# Pre-baked Cassandra image with GraphSense test schemas
+# Build: docker build -t graphsense/cassandra-test:4.1.4 tests/cassandra/
+FROM cassandra:4.1.4
+
+# Install Python dependencies for schema init.
+# This runs before the COPY below so that editing init-schemas.py does not
+# invalidate the cached apt/pip layer.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends python3 python3-pip && \
+    pip3 install --no-cache-dir cassandra-driver requests && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Configure for fast single-node startup BEFORE first start
+# This is critical - num_tokens is locked after first start
+RUN sed -i 's/num_tokens: 16/num_tokens: 1/' /etc/cassandra/cassandra.yaml
+
+# Set JVM options for fast startup
+ENV JVM_EXTRA_OPTS="-Dcassandra.ring_delay_ms=100 -Dcassandra.skip_wait_for_gossip_to_settle=0"
+
+# Copy schema initialization script
+COPY init-schemas.py /init-schemas.py
+
+# Start Cassandra once at build time, run the schema script, then stop it so
+# the image ships with the schemas already created. The readiness wait is
+# bounded so a node that never comes up fails the build instead of hanging.
+RUN set -e && \
+    echo "Starting Cassandra for schema initialization..." && \
+    cassandra -R && \
+    echo "Waiting for Cassandra to be ready..." && \
+    ready=0 && \
+    for attempt in $(seq 1 300); do \
+        if cqlsh -e "SELECT now() FROM system.local" 2>/dev/null; then ready=1; break; fi; \
+        sleep 1; \
+    done && \
+    if [ "$ready" -ne 1 ]; then echo "Cassandra did not become ready in time" >&2; exit 1; fi && \
+    echo "Cassandra is ready, creating schemas..." && \
+    python3 /init-schemas.py && \
+    echo "Schemas created, stopping Cassandra..." && \
+    nodetool stopdaemon && \
+    sleep 2 && \
+    echo "Schema initialization complete."
+
+# Clean up init script (no longer needed)
+RUN rm /init-schemas.py
diff --git a/tests/cassandra/init-schemas.py b/tests/cassandra/init-schemas.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+"""Initialize Cassandra with GraphSense test schemas at Docker build time."""
+
+import sys
+
+import requests
+from cassandra.cluster import Cluster
+
+TAG = "master"
+SCHEMA_BASE = f"https://raw.githubusercontent.com/graphsense/graphsense-lib/{TAG}/src/graphsenselib/schema/resources/"
+
+SCHEMA_MAPPING = {"btc": "utxo", "ltc": "utxo", "eth": "account", "trx": "account_trx"}
+SCHEMA_MAPPING_OVERRIDE = {("trx", "transformed"): "account"}
+
+MAGIC_REPLACE_CONSTANT = "0x8BADF00D"
+MAGIC_REPLACE_CONSTANT2 = f"{MAGIC_REPLACE_CONSTANT}_REPLICATION_CONFIG"
+SIMPLE_REPLICATION_CONFIG = "{'class': 'SimpleStrategy', 'replication_factor': 1}"
+
+# Seconds to wait on the schema download before failing the build; without a
+# timeout, requests would block indefinitely on a stalled connection.
+REQUEST_TIMEOUT_SECONDS = 30
+
+
+def get_schema_file(filename):
+    """Download a schema template from SCHEMA_BASE and return its text.
+
+    Raises a requests exception on a non-success HTTP status or when the
+    server does not respond within REQUEST_TIMEOUT_SECONDS.
+    """
+    res = requests.get(SCHEMA_BASE + filename, timeout=REQUEST_TIMEOUT_SECONDS)
+    res.raise_for_status()
+    return res.text
+
+
+def main():
+    """Apply one schema file per currency and schema type to 127.0.0.1."""
+    cluster = Cluster(["127.0.0.1"])
+    try:
+        session = cluster.connect()
+
+        for currency, schema_base in SCHEMA_MAPPING.items():
+            for schema_type in ["raw", "transformed"]:
+                schema_name = SCHEMA_MAPPING_OVERRIDE.get(
+                    (currency, schema_type), schema_base
+                )
+                filename = f"{schema_type}_{schema_name}_schema.sql"
+                keyspace = f"resttest_{currency}_{schema_type}"
+
+                # Log progress during Docker build
+                sys.stdout.write(f"Creating schema: {keyspace} from {filename}\n")
+                sys.stdout.flush()
+
+                # Fill in the replication placeholder first: it contains the
+                # keyspace placeholder as a prefix.
+                schema_str = (
+                    get_schema_file(filename)
+                    .replace(MAGIC_REPLACE_CONSTANT2, SIMPLE_REPLICATION_CONFIG)
+                    .replace(MAGIC_REPLACE_CONSTANT, keyspace)
+                )
+
+                # Run each ';'-separated statement, skipping empty fragments.
+                for stmt in schema_str.split(";"):
+                    stmt = stmt.strip()
+                    if stmt:
+                        session.execute(stmt)
+
+        sys.stdout.write("All schemas created successfully!\n")
+    finally:
+        # Release the driver's connections even when a statement fails.
+        cluster.shutdown()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/cassandra/insert.py b/tests/cassandra/insert.py
@@ -1,63 +1,39 @@
-import logging
 import os
-from functools import cache
 from pathlib import Path
 
-import requests
 from cassandra.cluster import Cluster
 
 DATA_DIR = Path(__file__).parent.resolve() / "data"
 
-TAG = "master"
-SCHEMA_BASE = f"https://raw.githubusercontent.com/graphsense/graphsense-lib/{TAG}/src/graphsenselib/schema/resources/"
-
-SCHEMA_MAPPING = {"btc": "utxo", "ltc": "utxo", "eth": "account", "trx": "account_trx"}
-
-SCHEMA_MAPPING_OVERRIDE = {("trx", "transformed"): "account"}
-
-MAGIC_REPLACE_CONSTANT = "0x8BADF00D"
-MAGIC_REPLACE_CONSTANT2 = f"{MAGIC_REPLACE_CONSTANT}_REPLICATION_CONFIG"
-
-SIMPLE_REPLICATION_CONFIG = "{'class': 'SimpleStrategy', 'replication_factor': 1}"
-
-
-@cache
-def get_schema_file(file: str):
-    res = requests.get(SCHEMA_BASE + file)
-    return res.text
-
 
 def load_test_data(host, port):
+    """Load test data into pre-baked Cassandra (schemas already exist).
+
+    Every regular file in DATA_DIR is named after its target table and holds
+    one JSON row per non-empty line.
+    """
     cluster = Cluster([host], port=port)
     session = cluster.connect()
 
-    for k, v in SCHEMA_MAPPING.items():
-        for st in ["raw", "transformed"]:
-            v = SCHEMA_MAPPING_OVERRIDE.get((k, st), v)
-            filename = f"{st}_{v}_schema.sql"
-            keyspace = f"resttest_{k}_{st}"
-
-            logging.info(f"creating db tables cassandra {filename}")
-            schema_str = (
-                get_schema_file(filename)
-                .replace(MAGIC_REPLACE_CONSTANT2, SIMPLE_REPLICATION_CONFIG)
-                .replace(MAGIC_REPLACE_CONSTANT, keyspace)
-            )
-            for x in schema_str.split(";"):
-                x = x.strip()
-                if x:
-                    session.execute(x)
-
-    for x in DATA_DIR.iterdir():
-        if not x.is_file():
+    # Collect all insert statements
+    inserts = []
+    for file_path in DATA_DIR.iterdir():
+        if not file_path.is_file():
             continue
-        table_name = os.path.basename(x)
-        content = x.read_text()
-        for x in content.split("\n"):
-            x = x.strip()
-            if x:
-                session.execute(
-                    f"""
-                    INSERT INTO {table_name} JSON '{x}'
-                    """
-                )
+        table_name = os.path.basename(file_path)
+        content = file_path.read_text()
+        for line in content.split("\n"):
+            line = line.strip()
+            if line:
+                inserts.append(f"INSERT INTO {table_name} JSON '{line}'")
+
+    try:
+        # Execute inserts concurrently using async Cassandra driver
+        futures = [session.execute_async(stmt) for stmt in inserts]
+
+        # Wait for all inserts to complete; a failed insert raises here
+        for future in futures:
+            future.result()
+    finally:
+        # The cluster is local to this function, so release its connections.
+        cluster.shutdown()
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,5 +1,8 @@
+import subprocess
 from os import environ
+from pathlib import Path
 
+import docker
 import pytest
 from testcontainers.cassandra import CassandraContainer
 from testcontainers.postgres import PostgresContainer
@@ -9,7 +12,38 @@
 from tests.tagstore.insert import load_test_data as tags_load_test_data
 
 postgres = PostgresContainer("postgres:16-alpine")
-cassandra = CassandraContainer("cassandra:4.1.4")
+
+# Pre-baked Cassandra image with schemas and fast startup settings already configured
+# Build with: make build-test-cassandra
+# Baked-in optimizations: NUM_TOKENS=1, ring_delay_ms=100, skip_wait_for_gossip=0
+CASSANDRA_TEST_IMAGE = environ.get(
+    "CASSANDRA_TEST_IMAGE", "graphsense/cassandra-test:4.1.4"
+)
+
+
+def ensure_cassandra_image_exists():
+    """Build Cassandra test image if it doesn't exist locally.
+
+    Looks up CASSANDRA_TEST_IMAGE through the Docker SDK; when it is missing,
+    runs `docker build` on tests/cassandra and raises CalledProcessError if
+    the build fails.
+    """
+    client = docker.from_env()
+    try:
+        client.images.get(CASSANDRA_TEST_IMAGE)
+    except docker.errors.ImageNotFound:
+        dockerfile_path = Path(__file__).parent / "cassandra"
+        subprocess.run(
+            ["docker", "build", "-t", CASSANDRA_TEST_IMAGE, str(dockerfile_path)],
+            check=True,
+        )
+    finally:
+        # Close the SDK client's HTTP session; it is only used for the lookup.
+        client.close()
+
+
+ensure_cassandra_image_exists()
+cassandra = CassandraContainer(CASSANDRA_TEST_IMAGE)
 
 
 @pytest.fixture(scope="session", autouse=True)