From d61f9fe4bf0b6c8b67c528d0b52c7aba6181f7e3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 20 May 2026 17:24:09 -0500 Subject: [PATCH 01/28] stub expr crate --- python/sedonadb-expr/.gitignore | 60 ++++++++++ python/sedonadb-expr/README.md | 36 ++++++ python/sedonadb-expr/_version.py | 49 ++++++++ python/sedonadb-expr/hatch_build.py | 109 ++++++++++++++++++ python/sedonadb-expr/pyproject.toml | 55 +++++++++ .../python/sedonadb_expr/__init__.py | 20 ++++ python/sedonadb-expr/tests/__init__.py | 16 +++ .../sedonadb-expr/tests/test_sedonadb_expr.py | 24 ++++ 8 files changed, 369 insertions(+) create mode 100644 python/sedonadb-expr/.gitignore create mode 100644 python/sedonadb-expr/README.md create mode 100644 python/sedonadb-expr/_version.py create mode 100644 python/sedonadb-expr/hatch_build.py create mode 100644 python/sedonadb-expr/pyproject.toml create mode 100644 python/sedonadb-expr/python/sedonadb_expr/__init__.py create mode 100644 python/sedonadb-expr/tests/__init__.py create mode 100644 python/sedonadb-expr/tests/test_sedonadb_expr.py diff --git a/python/sedonadb-expr/.gitignore b/python/sedonadb-expr/.gitignore new file mode 100644 index 0000000000..8c753cedf6 --- /dev/null +++ b/python/sedonadb-expr/.gitignore @@ -0,0 +1,60 @@ + +# Generated files +python/sedonadb_expr/_version.py +python/sedonadb_expr/_generated/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +*.egg + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# IDE +.idea/ +.vscode/ +*.swp +*.swo diff --git a/python/sedonadb-expr/README.md b/python/sedonadb-expr/README.md new file mode 100644 index 0000000000..f115748051 --- /dev/null +++ b/python/sedonadb-expr/README.md @@ -0,0 +1,36 @@ + + +# SedonaDB Expr + +A standalone Python package for SedonaDB expressions. + +## Installation + +```shell +pip install sedonadb-expr +``` + +## Example + +```python +import sedonadb_expr + +print(sedonadb_expr.__version__) +``` diff --git a/python/sedonadb-expr/_version.py b/python/sedonadb-expr/_version.py new file mode 100644 index 0000000000..1bb4234a51 --- /dev/null +++ b/python/sedonadb-expr/_version.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Version source for hatchling - reads from workspace Cargo.toml. + +This file is used by hatchling at build time to determine the version. +The build hook then generates a static _version.py inside the package. +""" + +import re +from pathlib import Path + + +def get_version() -> str: + """Read version from the workspace root Cargo.toml.""" + here = Path(__file__).parent + cargo_toml = here.parent.parent / "Cargo.toml" + + if not cargo_toml.exists(): + raise FileNotFoundError( + f"Could not find workspace Cargo.toml at {cargo_toml}" + ) + + content = cargo_toml.read_text() + + match = re.search( + r'\[workspace\.package\].*?version\s*=\s*"([^"]+)"', + content, + re.DOTALL, + ) + if match: + return match.group(1) + + raise ValueError("Could not find workspace.package.version in Cargo.toml") diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py new file mode 100644 index 0000000000..8446fd948c --- /dev/null +++ b/python/sedonadb-expr/hatch_build.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Hatch build hook for sedonadb-expr. + +This hook runs during sdist and wheel builds to generate Python source +files from the docs/reference/sql documentation files. +""" + +from pathlib import Path +from typing import Any + +from hatchling.builders.hooks.plugin.interface import BuildHookInterface + + +class CustomBuildHook(BuildHookInterface): + """Custom build hook that generates Python sources from SQL docs.""" + + PLUGIN_NAME = "custom" + + def initialize(self, version: str, build_data: dict[str, Any]) -> None: + """ + Called before the build process starts. + + Args: + version: The version being built + build_data: Mutable dict to modify build behavior + """ + self._generate_version(version) + self._generate_sources() + + def _generate_version(self, version: str) -> None: + """Generate _version.py with the static version string.""" + here = Path(__file__).parent + version_file = here / "python" / "sedonadb_expr" / "_version.py" + + content = f'''# Auto-generated at build time - do not edit +__version__ = "{version}" +''' + version_file.write_text(content) + self.app.display_info(f"Generated _version.py with version {version}") + + def _generate_sources(self) -> None: + """Generate Python source files from docs/reference/sql.""" + here = Path(__file__).parent + docs_sql = here.parent.parent / "docs" / "reference" / "sql" + output_dir = here / "python" / "sedonadb_expr" / "_generated" + + # Ensure output directory exists + output_dir.mkdir(parents=True, exist_ok=True) + + # Create __init__.py for the generated module + init_file = output_dir / "__init__.py" + init_file.write_text( + "# Auto-generated module - do not edit\n" + "# Generated from docs/reference/sql\n" + ) + + if not docs_sql.exists(): + self.app.display_warning( + f"docs/reference/sql not found at {docs_sql}, skipping generation" + ) + return + + # Find all .qmd files (source files, not rendered .md) + qmd_files = sorted(docs_sql.glob("*.qmd")) + + functions = [] + for qmd_file in qmd_files: + # Skip index and special files + if qmd_file.stem in ("index", "barrier", "_quarto"): + continue + + func_name = qmd_file.stem.upper() + functions.append(func_name) + + # Generate a simple functions module listing all available functions + functions_file = output_dir / "functions.py" + functions_content = [ + "# Auto-generated - do not edit", + "# Generated from docs/reference/sql/*.qmd", + "", + "FUNCTIONS = [", + ] + for func in sorted(functions): + functions_content.append(f' "{func}",') + functions_content.append("]") + functions_content.append("") + + functions_file.write_text("\n".join(functions_content)) + + self.app.display_info( + f"Generated {len(functions)} function definitions" + ) diff --git a/python/sedonadb-expr/pyproject.toml b/python/sedonadb-expr/pyproject.toml new file mode 100644 index 0000000000..cd3f2587ee --- /dev/null +++ b/python/sedonadb-expr/pyproject.toml @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "sedonadb-expr" +readme = "README.md" +requires-python = ">=3.9" +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +dynamic = ["version"] + +[project.optional-dependencies] +test = [ + "pytest", +] + +[tool.hatch.version] +source = "code" +path = "_version.py" +expression = "get_version()" + +[tool.hatch.build.targets.wheel] +packages = ["python/sedonadb_expr"] + +[tool.hatch.build.targets.wheel.hooks.custom] +path = "hatch_build.py" + +[tool.hatch.build.targets.sdist.hooks.custom] +path = "hatch_build.py" diff --git a/python/sedonadb-expr/python/sedonadb_expr/__init__.py b/python/sedonadb-expr/python/sedonadb_expr/__init__.py new file mode 100644 index 0000000000..1f173a4ea5 --- /dev/null +++ b/python/sedonadb-expr/python/sedonadb_expr/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from sedonadb_expr._version import __version__ + +__all__ = ["__version__"] diff --git a/python/sedonadb-expr/tests/__init__.py b/python/sedonadb-expr/tests/__init__.py new file mode 100644 index 0000000000..13a83393a9 --- /dev/null +++ b/python/sedonadb-expr/tests/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/sedonadb-expr/tests/test_sedonadb_expr.py b/python/sedonadb-expr/tests/test_sedonadb_expr.py new file mode 100644 index 0000000000..58db4376f1 --- /dev/null +++ b/python/sedonadb-expr/tests/test_sedonadb_expr.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sedonadb_expr + + +def test_version(): + # Version should match workspace Cargo.toml + assert sedonadb_expr.__version__ + assert sedonadb_expr.__version__.count(".") >= 2 # semver format From be81a2ff82de3c07ef77a6f41552c333efef9de4 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 20 May 2026 17:24:56 -0500 Subject: [PATCH 02/28] fix test --- python/sedonadb-expr/tests/test_sedonadb_expr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sedonadb-expr/tests/test_sedonadb_expr.py b/python/sedonadb-expr/tests/test_sedonadb_expr.py index 58db4376f1..e693292110 100644 --- a/python/sedonadb-expr/tests/test_sedonadb_expr.py +++ b/python/sedonadb-expr/tests/test_sedonadb_expr.py @@ -21,4 +21,3 @@ def test_version(): # Version should match workspace Cargo.toml assert sedonadb_expr.__version__ - assert sedonadb_expr.__version__.count(".") >= 2 # semver format From c69c873d5f135f6e2776bb9e5fd5c5233433e9d1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 28 May 2026 23:21:04 -0500 Subject: [PATCH 03/28] prototype --- python/sedonadb-expr/_version.py | 4 +--- python/sedonadb-expr/hatch_build.py | 4 +--- python/sedonadb-expr/python/sedonadb_expr/__init__.py | 4 +++- python/sedonadb/python/sedonadb/expr/expression.py | 10 ++++++++++ python/sedonadb/python/sedonadb/functions/__init__.py | 8 ++++++++ 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/python/sedonadb-expr/_version.py b/python/sedonadb-expr/_version.py index 1bb4234a51..ff985f4c56 100644 --- a/python/sedonadb-expr/_version.py +++ b/python/sedonadb-expr/_version.py @@ -32,9 +32,7 @@ def get_version() -> str: cargo_toml = here.parent.parent / "Cargo.toml" if not cargo_toml.exists(): - raise FileNotFoundError( - f"Could not find workspace Cargo.toml at {cargo_toml}" - ) + raise FileNotFoundError(f"Could not find workspace Cargo.toml at {cargo_toml}") content = cargo_toml.read_text() diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 8446fd948c..4787ecec78 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -104,6 +104,4 @@ def _generate_sources(self) -> None: functions_file.write_text("\n".join(functions_content)) - self.app.display_info( - f"Generated {len(functions)} function definitions" - ) + self.app.display_info(f"Generated {len(functions)} function definitions") diff --git a/python/sedonadb-expr/python/sedonadb_expr/__init__.py b/python/sedonadb-expr/python/sedonadb_expr/__init__.py index 1f173a4ea5..19a4700217 100644 --- a/python/sedonadb-expr/python/sedonadb_expr/__init__.py +++ b/python/sedonadb-expr/python/sedonadb_expr/__init__.py @@ -16,5 +16,7 @@ # under the License. from sedonadb_expr._version import __version__ +from sedonadb_expr._generated.geo_functions import GeoFunctions +from sedonadb_expr._generated.geo_methods import GeoMethods -__all__ = ["__version__"] +__all__ = ["__version__", "GeoFunctions", "GeoMethods"] diff --git a/python/sedonadb/python/sedonadb/expr/expression.py b/python/sedonadb/python/sedonadb/expr/expression.py index 0ce6c57a33..2693474262 100644 --- a/python/sedonadb/python/sedonadb/expr/expression.py +++ b/python/sedonadb/python/sedonadb/expr/expression.py @@ -34,6 +34,10 @@ from sedonadb.functions import Functions +if TYPE_CHECKING: + from sedonadb_expr._generated.geo_methods import GeoMethods + + class Expr: """A column expression. @@ -212,6 +216,12 @@ def desc(self, nulls_first: bool = False) -> "SortExpr": """ return SortExpr(self._impl.desc(nulls_first)) + @property + def geo(self) -> "GeoMethods[Expr]": + from sedonadb_expr import GeoMethods + + return GeoMethods(self) + # Arithmetic operators ------------------------------------------------- # # Each binary dunder routes through the shared `_binary` helper, which diff --git a/python/sedonadb/python/sedonadb/functions/__init__.py b/python/sedonadb/python/sedonadb/functions/__init__.py index 2469becfc5..ac3e4591a3 100644 --- a/python/sedonadb/python/sedonadb/functions/__init__.py +++ b/python/sedonadb/python/sedonadb/functions/__init__.py @@ -23,6 +23,8 @@ if TYPE_CHECKING: from sedonadb.functions.table import TableFunctions + from sedonadb.expr.expression import Expr + from sedonadb_expr import GeoFunctions class Functions: @@ -46,6 +48,12 @@ def table(self) -> "TableFunctions": return TableFunctions(self._ctx) + @property + def geo(self) -> "GeoFunctions[Expr]": + from sedonadb_expr import GeoFunctions + + return GeoFunctions(self) + def __getattr__(self, name) -> Union["ScalarUdf", "AggregateUdf"]: try: return ScalarUdf(self._ctx._impl.scalar_udf(name), self._ctx, self._expr) From 9de055f7602c7753c2ebc8a30fabb44d84081daf Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 28 May 2026 23:31:50 -0500 Subject: [PATCH 04/28] generate the build --- python/sedonadb-expr/hatch_build.py | 547 ++++++++++++++++++++++++++-- python/sedonadb-expr/pyproject.toml | 2 +- 2 files changed, 524 insertions(+), 25 deletions(-) diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 4787ecec78..76d8783631 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -22,12 +22,512 @@ files from the docs/reference/sql documentation files. """ +from __future__ import annotations + +import re +import textwrap from pathlib import Path from typing import Any +import yaml from hatchling.builders.hooks.plugin.interface import BuildHookInterface +# Type to parameter name mapping (matches R version) +TYPE_TO_PARAM: dict[str, str] = { + "geometry": "geom", + "geography": "geog", + "raster": "rast", + "float64": "x", + "double": "x", + "integer": "n", + "int64": "n", + "string": "s", + "boolean": "b", + "crs": "crs", +} + +# Types that qualify for geo methods (first arg piped in) +GEO_TYPES = {"geometry", "geography"} + +DOCS_BASE_URL = "https://sedona.apache.org/sedonadb/latest/reference/sql" + +LICENSE_HEADER = '''\ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +''' + + +class ArgInfo: + """Information about a kernel argument.""" + + def __init__( + self, + type: str, + name: str | None = None, + description: str | None = None, + ): + self.type = type + self.name = name + self.description = description + + +class KernelInfo: + """Parsed kernel information.""" + + def __init__( + self, + args: list[ArgInfo] | None = None, + returns: str = "unknown", + variadic: bool = False, + kernel_signatures: list[str] | None = None, + ): + self.args = args if args is not None else [] + self.returns = returns + self.variadic = variadic + self.kernel_signatures = kernel_signatures if kernel_signatures is not None else [] + + +class FunctionInfo: + """Parsed function information from a .qmd file.""" + + def __init__( + self, + name: str, + title: str, + description: str, + kernels: list[dict[str, Any]], + is_geo_method: bool = False, + kernel_info: KernelInfo | None = None, + ): + self.name = name + self.title = title + self.description = description + self.kernels = kernels + self.is_geo_method = is_geo_method + self.kernel_info = kernel_info + + +def extract_frontmatter(file_path: Path) -> dict[str, Any]: + """Extract YAML frontmatter from a .qmd file.""" + content = file_path.read_text() + lines = content.split("\n") + + # Find YAML delimiters + delimiters = [i for i, line in enumerate(lines) if line.strip() == "---"] + if len(delimiters) < 2: + raise ValueError(f"Could not find YAML frontmatter in {file_path}") + + yaml_text = "\n".join(lines[delimiters[0] + 1 : delimiters[1]]) + return yaml.safe_load(yaml_text) + + +def extract_description_section(file_path: Path) -> str | None: + """Extract the ## Description section from the .qmd file body.""" + content = file_path.read_text() + lines = content.split("\n") + + # Find end of frontmatter + delimiters = [i for i, line in enumerate(lines) if line.strip() == "---"] + if len(delimiters) < 2: + return None + + body_lines = lines[delimiters[1] + 1 :] + + # Find ## Description section + desc_start = None + for i, line in enumerate(body_lines): + if line.startswith("## Description"): + desc_start = i + break + + if desc_start is None: + return None + + # Find next section or end + remaining = body_lines[desc_start + 1 :] + next_section = None + for i, line in enumerate(remaining): + if line.startswith("## "): + next_section = i + break + + if next_section is None: + desc_lines = remaining + else: + desc_lines = remaining[:next_section] + + # Process lines: preserve markdown lists, join paragraphs + result_lines: list[str] = [] + current_paragraph: list[str] = [] + + for line in desc_lines: + stripped = line.strip() + # Check if this is a list item (-, *, or numbered) + is_list_item = bool(re.match(r"^[-*]|\d+\.", stripped)) + + if not stripped: + # Empty line: flush current paragraph + if current_paragraph: + result_lines.append(" ".join(current_paragraph)) + current_paragraph = [] + elif is_list_item: + # List item: flush paragraph first, then add list item + if current_paragraph: + result_lines.append(" ".join(current_paragraph)) + current_paragraph = [] + result_lines.append(stripped) + else: + # Regular text: accumulate into paragraph + current_paragraph.append(stripped) + + # Flush any remaining paragraph + if current_paragraph: + result_lines.append(" ".join(current_paragraph)) + + desc_text = "\n".join(result_lines).strip() + return desc_text if desc_text else None + + +def type_to_param_name(arg_type: str, index: int = 0, needs_suffix: bool = False) -> str: + """Generate parameter name from type.""" + base_name = TYPE_TO_PARAM.get(arg_type, "arg") + if needs_suffix: + suffix = chr(ord("a") + index) # 0=a, 1=b, 2=c, ... + return f"{base_name}_{suffix}" + return base_name + + +def parse_kernel_args(kernel_args: list) -> list[ArgInfo]: + """Parse kernel arguments into ArgInfo objects.""" + result = [] + for arg in kernel_args: + if isinstance(arg, str): + result.append(ArgInfo(type=arg)) + elif isinstance(arg, dict): + result.append( + ArgInfo( + type=arg.get("type", "unknown"), + name=arg.get("name"), + description=arg.get("description"), + ) + ) + else: + result.append(ArgInfo(type="unknown")) + return result + + +def generate_arg_names(arg_info_list: list[ArgInfo]) -> list[str]: + """Generate argument names for a kernel's args.""" + types = [info.type for info in arg_info_list] + type_counts: dict[str, int] = {} + type_totals: dict[str, int] = {} + + # Count total occurrences of each type + for t in types: + type_totals[t] = type_totals.get(t, 0) + 1 + + arg_names = [] + for info in arg_info_list: + arg_type = info.type + arg_name = info.name + + if arg_name is None: + type_counts[arg_type] = type_counts.get(arg_type, 0) + 1 + needs_suffix = type_totals.get(arg_type, 0) > 1 + arg_name = type_to_param_name(arg_type, type_counts[arg_type] - 1, needs_suffix) + + arg_names.append(arg_name) + + return arg_names + + +def parse_kernel_params(kernels: list[dict], fn_name: str = "unknown") -> KernelInfo: + """Parse kernel arguments and generate parameter info.""" + if not kernels: + return KernelInfo() + + # Process all kernels + all_kernel_info = [parse_kernel_args(k.get("args", [])) for k in kernels] + all_kernel_args = [generate_arg_names(info) for info in all_kernel_info] + + # Find max args + kernel_lengths = [len(args) for args in all_kernel_args] + max_args = max(kernel_lengths) if kernel_lengths else 0 + + # Check for argument name conflicts + has_conflict = False + for pos in range(max_args): + names_at_pos = set() + for args in all_kernel_args: + if pos < len(args): + names_at_pos.add(args[pos]) + if len(names_at_pos) > 1: + has_conflict = True + break + + returns = kernels[0].get("returns", "unknown") + + if has_conflict: + # Build signature strings for documentation + kernel_signatures = [] + for i, args in enumerate(all_kernel_args): + types = [info.type for info in all_kernel_info[i]] + sig = ", ".join(f"{arg} ({t})" for arg, t in zip(args, types)) + kernel_signatures.append(sig) + + return KernelInfo( + args=[], + returns=returns, + variadic=True, + kernel_signatures=kernel_signatures, + ) + + # Use kernel with most arguments as reference + ref_idx = kernel_lengths.index(max(kernel_lengths)) if kernel_lengths else 0 + arg_info = all_kernel_info[ref_idx] if all_kernel_info else [] + arg_names = all_kernel_args[ref_idx] if all_kernel_args else [] + + # Update ArgInfo with generated names + for i, info in enumerate(arg_info): + if info.name is None: + info.name = arg_names[i] + + return KernelInfo(args=arg_info, returns=returns, variadic=False) + + +def parse_qmd_file(qmd_path: Path) -> FunctionInfo | None: + """Parse a .qmd file and return FunctionInfo.""" + fn_name = qmd_path.stem # e.g., "st_envelope" + + try: + frontmatter = extract_frontmatter(qmd_path) + except Exception: + return None + + kernels = frontmatter.get("kernels", []) + if not kernels: + return None + + # Check if first argument of any kernel is geometry/geography + is_geo_method = False + for kernel in kernels: + args = kernel.get("args", []) + if args: + first_arg = args[0] + first_type = first_arg if isinstance(first_arg, str) else first_arg.get("type", "") + if first_type in GEO_TYPES: + is_geo_method = True + break + + title = frontmatter.get("description", frontmatter.get("title", fn_name)) + description = extract_description_section(qmd_path) or "" + + kernel_info = parse_kernel_params(kernels, fn_name) + + return FunctionInfo( + name=fn_name, + title=title, + description=description, + kernels=kernels, + is_geo_method=is_geo_method, + kernel_info=kernel_info, + ) + + +def wrap_docstring(text: str, width: int = 88, indent: str = " ") -> str: + """Wrap text for docstrings, preserving markdown lists.""" + if not text: + return "" + + result_lines: list[str] = [] + for i, line in enumerate(text.split("\n")): + if not line.strip(): + result_lines.append("") + continue + + # Wrap each line separately + wrapped = textwrap.fill(line, width=width - len(indent)) + for j, wrapped_line in enumerate(wrapped.split("\n")): + if i == 0 and j == 0: + # First line of first paragraph - no indent + result_lines.append(wrapped_line) + else: + result_lines.append(indent + wrapped_line) + + return "\n".join(result_lines) + + +def generate_method_docstring(func: FunctionInfo) -> str: + """Generate docstring for a method.""" + parts = [f'"""{func.title}'] + + if func.description and func.description != func.title: + parts.append("") + parts.append(wrap_docstring(func.description, indent=" ")) + + kernel_info = func.kernel_info + if kernel_info and kernel_info.args: + # Skip first arg (piped in via self._expr) + remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] + if remaining_args: + parts.append("") + parts.append("Args:") + for arg in remaining_args: + desc = arg.description or f"Input {arg.type}" + parts.append(f" {arg.name}: {desc}") + + parts.append("") + parts.append("See Also:") + parts.append(f" {DOCS_BASE_URL}/{func.name}/") + parts.append('"""') + + return "\n ".join(parts) + + +def generate_function_docstring(func: FunctionInfo) -> str: + """Generate docstring for a standalone function property.""" + parts = [f'"""{func.title}'] + + if func.description and func.description != func.title: + parts.append("") + parts.append(wrap_docstring(func.description, indent=" ")) + + kernel_info = func.kernel_info + if kernel_info and kernel_info.args: + parts.append("") + parts.append("Args:") + for arg in kernel_info.args: + desc = arg.description or f"Input {arg.type}" + parts.append(f" {arg.name}: {desc}") + + parts.append("") + parts.append("See Also:") + parts.append(f" {DOCS_BASE_URL}/{func.name}/") + parts.append('"""') + + return "\n ".join(parts) + + +def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: + """Generate geo_methods.py content.""" + # Filter to only geo methods (first arg is geometry/geography) + geo_funcs = [f for f in functions if f.is_geo_method] + + lines = [ + LICENSE_HEADER, + "", + '"""Auto-generated geometry/geography methods - do not edit."""', + "", + "from typing import Generic, TypeVar", + "", + 'ExprT = TypeVar("ExprT")', + "", + "", + "class GeoMethods(Generic[ExprT]):", + ' """Geometry and geography methods accessible via expr.geo."""', + "", + " def __init__(self, expr: ExprT) -> None:", + " self._expr = expr", + ] + + for func in sorted(geo_funcs, key=lambda f: f.name): + # Method name: strip st_ prefix + method_name = func.name + if method_name.startswith("st_"): + method_name = method_name[3:] + + kernel_info = func.kernel_info + if not kernel_info: + continue + + # Build method signature - skip first arg (piped in) + remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] + + if kernel_info.variadic: + params = "self, *args" + call_args = "*args" + elif remaining_args: + param_strs = [f"{arg.name}" for arg in remaining_args] + params = "self, " + ", ".join(param_strs) + call_args = ", ".join(arg.name for arg in remaining_args) + else: + params = "self" + call_args = "" + + docstring = generate_method_docstring(func) + + lines.extend([ + "", + f" def {method_name}({params}) -> ExprT:", + f" {docstring}", + ]) + + if call_args: + lines.append(f' return self._expr._call("{method_name}", {call_args})') + else: + lines.append(f' return self._expr._call("{method_name}")') + + lines.append("") + return "\n".join(lines) + + +def generate_geo_functions_py(functions: list[FunctionInfo]) -> str: + """Generate geo_functions.py content.""" + # Filter to only geo methods (these become callable properties) + geo_funcs = [f for f in functions if f.is_geo_method] + + lines = [ + LICENSE_HEADER, + "", + '"""Auto-generated geometry/geography functions - do not edit."""', + "", + "from typing import Callable, Generic, TypeVar", + "", + 'ExprT = TypeVar("ExprT")', + "", + "", + "class GeoFunctions(Generic[ExprT]):", + ' """Geometry and geography functions accessible via a factory."""', + "", + " def __init__(self, factory) -> None:", + " self._factory = factory", + ] + + for func in sorted(geo_funcs, key=lambda f: f.name): + # Property name: strip st_ prefix + prop_name = func.name + if prop_name.startswith("st_"): + prop_name = prop_name[3:] + + docstring = generate_function_docstring(func) + + lines.extend([ + "", + " @property", + f" def {prop_name}(self) -> Callable[..., ExprT]:", + f" {docstring}", + f' return self._factory["{prop_name}"]', + ]) + + lines.append("") + return "\n".join(lines) + + class CustomBuildHook(BuildHookInterface): """Custom build hook that generates Python sources from SQL docs.""" @@ -77,31 +577,30 @@ def _generate_sources(self) -> None: ) return - # Find all .qmd files (source files, not rendered .md) - qmd_files = sorted(docs_sql.glob("*.qmd")) + # Find all .qmd files + qmd_files = sorted(docs_sql.glob("st_*.qmd")) - functions = [] + # Parse all function definitions + functions: list[FunctionInfo] = [] for qmd_file in qmd_files: - # Skip index and special files - if qmd_file.stem in ("index", "barrier", "_quarto"): - continue - - func_name = qmd_file.stem.upper() - functions.append(func_name) - - # Generate a simple functions module listing all available functions - functions_file = output_dir / "functions.py" - functions_content = [ - "# Auto-generated - do not edit", - "# Generated from docs/reference/sql/*.qmd", - "", - "FUNCTIONS = [", - ] - for func in sorted(functions): - functions_content.append(f' "{func}",') - functions_content.append("]") - functions_content.append("") + func = parse_qmd_file(qmd_file) + if func: + functions.append(func) - functions_file.write_text("\n".join(functions_content)) + # Generate geo_methods.py + geo_methods_content = generate_geo_methods_py(functions) + geo_methods_file = output_dir / "geo_methods.py" + geo_methods_file.write_text(geo_methods_content) - self.app.display_info(f"Generated {len(functions)} function definitions") + # Generate geo_functions.py + geo_functions_content = generate_geo_functions_py(functions) + geo_functions_file = output_dir / "geo_functions.py" + geo_functions_file.write_text(geo_functions_content) + + # Count stats + geo_method_count = sum(1 for f in functions if f.is_geo_method) + + self.app.display_info( + f"Generated {len(functions)} functions total, " + f"{geo_method_count} geo methods" + ) diff --git a/python/sedonadb-expr/pyproject.toml b/python/sedonadb-expr/pyproject.toml index cd3f2587ee..8ff2153cac 100644 --- a/python/sedonadb-expr/pyproject.toml +++ b/python/sedonadb-expr/pyproject.toml @@ -16,7 +16,7 @@ # under the License. [build-system] -requires = ["hatchling"] +requires = ["hatchling", "pyyaml"] build-backend = "hatchling.build" [project] From d332e3a9502746a694e67d38d485ee584361f671 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 29 May 2026 13:43:24 -0500 Subject: [PATCH 05/28] maybe fix ci --- .github/workflows/python.yml | 12 ++++--- python/sedonadb-expr/.gitignore | 16 +++++++++ python/sedonadb-expr/hatch_build.py | 52 ++++++++++++++++++----------- 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index f925c40d7f..b533fb9d78 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -114,14 +114,18 @@ jobs: # Update this key to force a new cache (sync with packaging.yml) prefix-key: "python-v3" - - name: Install + - name: Install sedonadb-expr + run: | + pip install -e "python/sedonadb-expr" -vv + + - name: Install sedonadb run: | # Keep this export in sync with the export in dev/release/verify-release-candidate.sh export MATURIN_PEP517_ARGS="--features s2geography" pip install -e "python/sedonadb/[test]" -vv - # Unset so `--features s2geography` (sedonadb-only) doesn't - # carry into the plugin install. - unset MATURIN_PEP517_ARGS + + - name: Install sedonadb-zarr + run: | pip install -e "python/sedonadb-zarr/[test]" -vv - name: Download minimal geoarrow-data assets diff --git a/python/sedonadb-expr/.gitignore b/python/sedonadb-expr/.gitignore index 8c753cedf6..71528ae4bf 100644 --- a/python/sedonadb-expr/.gitignore +++ b/python/sedonadb-expr/.gitignore @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. # Generated files python/sedonadb_expr/_version.py diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 76d8783631..1939efae9a 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -52,7 +52,7 @@ DOCS_BASE_URL = "https://sedona.apache.org/sedonadb/latest/reference/sql" -LICENSE_HEADER = '''\ +LICENSE_HEADER = """\ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -69,7 +69,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -''' +""" class ArgInfo: @@ -99,7 +99,9 @@ def __init__( self.args = args if args is not None else [] self.returns = returns self.variadic = variadic - self.kernel_signatures = kernel_signatures if kernel_signatures is not None else [] + self.kernel_signatures = ( + kernel_signatures if kernel_signatures is not None else [] + ) class FunctionInfo: @@ -203,7 +205,9 @@ def extract_description_section(file_path: Path) -> str | None: return desc_text if desc_text else None -def type_to_param_name(arg_type: str, index: int = 0, needs_suffix: bool = False) -> str: +def type_to_param_name( + arg_type: str, index: int = 0, needs_suffix: bool = False +) -> str: """Generate parameter name from type.""" base_name = TYPE_TO_PARAM.get(arg_type, "arg") if needs_suffix: @@ -249,7 +253,9 @@ def generate_arg_names(arg_info_list: list[ArgInfo]) -> list[str]: if arg_name is None: type_counts[arg_type] = type_counts.get(arg_type, 0) + 1 needs_suffix = type_totals.get(arg_type, 0) > 1 - arg_name = type_to_param_name(arg_type, type_counts[arg_type] - 1, needs_suffix) + arg_name = type_to_param_name( + arg_type, type_counts[arg_type] - 1, needs_suffix + ) arg_names.append(arg_name) @@ -329,7 +335,9 @@ def parse_qmd_file(qmd_path: Path) -> FunctionInfo | None: args = kernel.get("args", []) if args: first_arg = args[0] - first_type = first_arg if isinstance(first_arg, str) else first_arg.get("type", "") + first_type = ( + first_arg if isinstance(first_arg, str) else first_arg.get("type", "") + ) if first_type in GEO_TYPES: is_geo_method = True break @@ -471,14 +479,18 @@ def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: docstring = generate_method_docstring(func) - lines.extend([ - "", - f" def {method_name}({params}) -> ExprT:", - f" {docstring}", - ]) + lines.extend( + [ + "", + f" def {method_name}({params}) -> ExprT:", + f" {docstring}", + ] + ) if call_args: - lines.append(f' return self._expr._call("{method_name}", {call_args})') + lines.append( + f' return self._expr._call("{method_name}", {call_args})' + ) else: lines.append(f' return self._expr._call("{method_name}")') @@ -516,13 +528,15 @@ def generate_geo_functions_py(functions: list[FunctionInfo]) -> str: docstring = generate_function_docstring(func) - lines.extend([ - "", - " @property", - f" def {prop_name}(self) -> Callable[..., ExprT]:", - f" {docstring}", - f' return self._factory["{prop_name}"]', - ]) + lines.extend( + [ + "", + " @property", + f" def {prop_name}(self) -> Callable[..., ExprT]:", + f" {docstring}", + f' return self._factory["{prop_name}"]', + ] + ) lines.append("") return "\n".join(lines) From 793c82eccac4ce2e7b40a31a7206667cbe1497f6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 29 May 2026 15:05:58 -0500 Subject: [PATCH 06/28] see if this works to fix test collection --- python/sedonadb-expr/pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/sedonadb-expr/pyproject.toml b/python/sedonadb-expr/pyproject.toml index 8ff2153cac..6c9631657d 100644 --- a/python/sedonadb-expr/pyproject.toml +++ b/python/sedonadb-expr/pyproject.toml @@ -53,3 +53,6 @@ path = "hatch_build.py" [tool.hatch.build.targets.sdist.hooks.custom] path = "hatch_build.py" + +[tool.pytest.ini_options] +collect_ignore = ["hatch_build.py"] From f6cb7186c08f65e41a71fee2323c2fd5b014f165 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 2 Jun 2026 09:53:56 -0500 Subject: [PATCH 07/28] more specific pytest invocation --- .github/workflows/python.yml | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index b533fb9d78..d8b381fbba 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -136,18 +136,28 @@ jobs: run: | docker compose up --wait --detach postgis - - name: Run tests + - name: Run tests (sedonadb) env: # Ensure that we don't skip tests that we didn't intend to SEDONADB_PYTHON_NO_SKIP_TESTS: "true" run: | - cd python + cd python/sedonadb python -m pytest -vv - - name: Run doctests + - name: Run doctests (sedonadb) run: | - cd python - python -m pytest --doctest-modules + cd python/sedonadb + python -m pytest --doctest-modules src + + - name: Run tests (sedonadb-expr) + run: | + cd python/sedonadb-expr + python -m pytest -vv + + - name: Run doctests (sedonadb-expr) + run: | + cd python/sedonadb-expr + python -m pytest --doctest-modules src - name: Shutdown docker compose services if: always() From 66e9130430bb85467a1c4c3af3e4145a91a47b4a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 2 Jun 2026 10:05:00 -0500 Subject: [PATCH 08/28] fix build option --- python/sedonadb-expr/pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/sedonadb-expr/pyproject.toml b/python/sedonadb-expr/pyproject.toml index 6c9631657d..8ff2153cac 100644 --- a/python/sedonadb-expr/pyproject.toml +++ b/python/sedonadb-expr/pyproject.toml @@ -53,6 +53,3 @@ path = "hatch_build.py" [tool.hatch.build.targets.sdist.hooks.custom] path = "hatch_build.py" - -[tool.pytest.ini_options] -collect_ignore = ["hatch_build.py"] From 626e90872b5d2c375b333d575ecc96ee91bbbd32 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 2 Jun 2026 10:26:34 -0500 Subject: [PATCH 09/28] fix some generated function issues --- python/sedonadb-expr/hatch_build.py | 64 ++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 1939efae9a..083d725888 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -36,7 +36,7 @@ # Type to parameter name mapping (matches R version) TYPE_TO_PARAM: dict[str, str] = { "geometry": "geom", - "geography": "geog", + "geography": "geom", "raster": "rast", "float64": "x", "double": "x", @@ -183,10 +183,11 @@ def extract_description_section(file_path: Path) -> str | None: is_list_item = bool(re.match(r"^[-*]|\d+\.", stripped)) if not stripped: - # Empty line: flush current paragraph + # Empty line: flush current paragraph and add blank line for separation if current_paragraph: result_lines.append(" ".join(current_paragraph)) current_paragraph = [] + result_lines.append("") # Preserve paragraph break elif is_list_item: # List item: flush paragraph first, then add list item if current_paragraph: @@ -382,22 +383,37 @@ def wrap_docstring(text: str, width: int = 88, indent: str = " ") -> str: def generate_method_docstring(func: FunctionInfo) -> str: """Generate docstring for a method.""" - parts = [f'"""{func.title}'] + title = func.title.strip() + parts = [f'"""{title}'] - if func.description and func.description != func.title: + if func.description and func.description.strip() != title: parts.append("") parts.append(wrap_docstring(func.description, indent=" ")) kernel_info = func.kernel_info - if kernel_info and kernel_info.args: - # Skip first arg (piped in via self._expr) - remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] - if remaining_args: + if kernel_info: + if kernel_info.variadic and kernel_info.kernel_signatures: + # Variadic mode: document with bulleted list of supported combinations + # Skip the first arg (piped in via self._expr) from each signature parts.append("") - parts.append("Args:") - for arg in remaining_args: - desc = arg.description or f"Input {arg.type}" - parts.append(f" {arg.name}: {desc}") + parts.append("Variants:") + for sig in kernel_info.kernel_signatures: + # Split signature, skip first arg, rejoin + arg_parts = [p.strip() for p in sig.split(",")] + remaining = ", ".join(arg_parts[1:]) if len(arg_parts) > 1 else "" + if remaining: + parts.append(f" - {remaining}") + else: + parts.append(" - (no additional arguments)") + elif kernel_info.args: + # Skip first arg (piped in via self._expr) + remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] + if remaining_args: + parts.append("") + parts.append("Args:") + for arg in remaining_args: + desc = arg.description or f"Input {arg.type}" + parts.append(f" {arg.name}: {desc}") parts.append("") parts.append("See Also:") @@ -409,19 +425,27 @@ def generate_method_docstring(func: FunctionInfo) -> str: def generate_function_docstring(func: FunctionInfo) -> str: """Generate docstring for a standalone function property.""" - parts = [f'"""{func.title}'] + title = func.title.strip() + parts = [f'"""{title}'] - if func.description and func.description != func.title: + if func.description and func.description.strip() != title: parts.append("") parts.append(wrap_docstring(func.description, indent=" ")) kernel_info = func.kernel_info - if kernel_info and kernel_info.args: - parts.append("") - parts.append("Args:") - for arg in kernel_info.args: - desc = arg.description or f"Input {arg.type}" - parts.append(f" {arg.name}: {desc}") + if kernel_info: + if kernel_info.variadic and kernel_info.kernel_signatures: + # Variadic mode: document with bulleted list of supported combinations + parts.append("") + parts.append("Variants:") + for sig in kernel_info.kernel_signatures: + parts.append(f" - {sig}") + elif kernel_info.args: + parts.append("") + parts.append("Args:") + for arg in kernel_info.args: + desc = arg.description or f"Input {arg.type}" + parts.append(f" {arg.name}: {desc}") parts.append("") parts.append("See Also:") From d4e9107dcbb2188aa9534140eefba9a6ac7a23cb Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 2 Jun 2026 10:44:28 -0500 Subject: [PATCH 10/28] maybe fix doctest command --- .github/workflows/python.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index d8b381fbba..5d4cb16cda 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -147,7 +147,7 @@ jobs: - name: Run doctests (sedonadb) run: | cd python/sedonadb - python -m pytest --doctest-modules src + python -m pytest --doctest-modules python/ - name: Run tests (sedonadb-expr) run: | @@ -157,7 +157,7 @@ jobs: - name: Run doctests (sedonadb-expr) run: | cd python/sedonadb-expr - python -m pytest --doctest-modules src + python -m pytest --doctest-modules python/ - name: Shutdown docker compose services if: always() From e816251a0968622a3bcbbb5450631a13ca685860 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 2 Jun 2026 11:20:12 -0500 Subject: [PATCH 11/28] missing args --- python/sedonadb-expr/hatch_build.py | 40 ++++++++-- .../python/sedonadb_expr/__init__.py | 3 +- .../python/sedonadb_expr/utils.py | 73 +++++++++++++++++++ 3 files changed, 110 insertions(+), 6 deletions(-) create mode 100644 python/sedonadb-expr/python/sedonadb_expr/utils.py diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 083d725888..2b3ed2f864 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -80,10 +80,12 @@ def __init__( type: str, name: str | None = None, description: str | None = None, + optional: bool = False, ): self.type = type self.name = name self.description = description + self.optional = optional class KernelInfo: @@ -103,6 +105,11 @@ def __init__( kernel_signatures if kernel_signatures is not None else [] ) + @property + def has_optional_args(self) -> bool: + """Return True if any argument is optional.""" + return any(arg.optional for arg in self.args) + class FunctionInfo: """Parsed function information from a .qmd file.""" @@ -309,10 +316,15 @@ def parse_kernel_params(kernels: list[dict], fn_name: str = "unknown") -> Kernel arg_info = all_kernel_info[ref_idx] if all_kernel_info else [] arg_names = all_kernel_args[ref_idx] if all_kernel_args else [] - # Update ArgInfo with generated names + # Determine minimum args (args present in all kernels) + min_args = min(kernel_lengths) if kernel_lengths else 0 + + # Update ArgInfo with generated names and optional flag for i, info in enumerate(arg_info): if info.name is None: info.name = arg_names[i] + # Args beyond min_args are optional (not present in all kernels) + info.optional = i >= min_args return KernelInfo(args=arg_info, returns=returns, variadic=False) @@ -467,6 +479,8 @@ def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: "", "from typing import Generic, TypeVar", "", + "from sedonadb_expr.utils import MISSING, filter_missing_args", + "", 'ExprT = TypeVar("ExprT")', "", "", @@ -489,17 +503,28 @@ def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: # Build method signature - skip first arg (piped in) remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] + # Check if any remaining args are optional + has_optional = any(arg.optional for arg in remaining_args) if kernel_info.variadic: params = "self, *args" call_args = "*args" + use_filter = False elif remaining_args: - param_strs = [f"{arg.name}" for arg in remaining_args] + # Build param strings with MISSING default for optional args + param_strs = [] + for arg in remaining_args: + if arg.optional: + param_strs.append(f"{arg.name}=MISSING") + else: + param_strs.append(arg.name) params = "self, " + ", ".join(param_strs) call_args = ", ".join(arg.name for arg in remaining_args) + use_filter = has_optional else: params = "self" call_args = "" + use_filter = False docstring = generate_method_docstring(func) @@ -512,9 +537,14 @@ def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: ) if call_args: - lines.append( - f' return self._expr._call("{method_name}", {call_args})' - ) + if use_filter: + lines.append( + f' return self._expr._call("{method_name}", *filter_missing_args({call_args}))' + ) + else: + lines.append( + f' return self._expr._call("{method_name}", {call_args})' + ) else: lines.append(f' return self._expr._call("{method_name}")') diff --git a/python/sedonadb-expr/python/sedonadb_expr/__init__.py b/python/sedonadb-expr/python/sedonadb_expr/__init__.py index 19a4700217..e8b13e4d81 100644 --- a/python/sedonadb-expr/python/sedonadb_expr/__init__.py +++ b/python/sedonadb-expr/python/sedonadb_expr/__init__.py @@ -18,5 +18,6 @@ from sedonadb_expr._version import __version__ from sedonadb_expr._generated.geo_functions import GeoFunctions from sedonadb_expr._generated.geo_methods import GeoMethods +from sedonadb_expr.utils import MISSING, filter_missing_args -__all__ = ["__version__", "GeoFunctions", "GeoMethods"] +__all__ = ["__version__", "GeoFunctions", "GeoMethods", "MISSING", "filter_missing_args"] diff --git a/python/sedonadb-expr/python/sedonadb_expr/utils.py b/python/sedonadb-expr/python/sedonadb_expr/utils.py new file mode 100644 index 0000000000..c81ef8a5c8 --- /dev/null +++ b/python/sedonadb-expr/python/sedonadb_expr/utils.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Utility classes and functions for sedonadb-expr.""" + +from typing import Any + + +class _MissingType: + """Sentinel type for missing/omitted arguments. + + This is distinct from None, which represents a valid NULL value. + Use the MISSING singleton instance rather than creating new instances. + """ + + +MISSING = _MissingType() +"""Sentinel value for missing/omitted arguments. + +Use this as the default value for optional parameters. +""" + + +def filter_missing_args(*args: Any): + """Filter out trailing MISSING arguments, validating ordering. + + Args: + *args: Arguments to filter + + Returns: + Tuple of non-MISSING arguments + + Raises: + ValueError: If MISSING arguments appear before non-MISSING arguments + """ + if not args: + return () + + # Find indices of missing args + is_missing = [arg is MISSING for arg in args] + + if not any(is_missing): + return args + + # Find last non-missing arg + last_non_missing = -1 + for i in range(len(args) - 1, -1, -1): + if not is_missing[i]: + last_non_missing = i + break + + # Check no missing args before non-missing args + if last_non_missing >= 0 and any(is_missing[: last_non_missing + 1]): + raise ValueError("Missing arguments must be at the end of the argument list") + + # Return args up to and including last non-missing + if last_non_missing < 0: + return () + return args[: last_non_missing + 1] From 5617b3263118c6c3ebee76f0f27b60f7518e45a2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 2 Jun 2026 11:53:41 -0500 Subject: [PATCH 12/28] context aware expressions --- python/sedonadb/python/sedonadb/expr/expression.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/sedonadb/python/sedonadb/expr/expression.py b/python/sedonadb/python/sedonadb/expr/expression.py index 2693474262..b12c82e205 100644 --- a/python/sedonadb/python/sedonadb/expr/expression.py +++ b/python/sedonadb/python/sedonadb/expr/expression.py @@ -222,6 +222,9 @@ def geo(self) -> "GeoMethods[Expr]": return GeoMethods(self) + def _call(self, name, *args) -> "Expr": + return self._ctx.funcs[name](*args) + # Arithmetic operators ------------------------------------------------- # # Each binary dunder routes through the shared `_binary` helper, which From 1b7f6b269c9d589a1b90292ccb396e1c7c6fcd4d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 2 Jun 2026 11:55:24 -0500 Subject: [PATCH 13/28] nicer error for when _ctx is None --- python/sedonadb/python/sedonadb/expr/expression.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/sedonadb/python/sedonadb/expr/expression.py b/python/sedonadb/python/sedonadb/expr/expression.py index b12c82e205..1ba91d582d 100644 --- a/python/sedonadb/python/sedonadb/expr/expression.py +++ b/python/sedonadb/python/sedonadb/expr/expression.py @@ -223,6 +223,8 @@ def geo(self) -> "GeoMethods[Expr]": return GeoMethods(self) def _call(self, name, *args) -> "Expr": + if self._ctx is None: + raise ValueError("Can't _call() Expr constructed without a SedonaContext") return self._ctx.funcs[name](*args) # Arithmetic operators ------------------------------------------------- From 9e00aa05220b10b636b71676fc2d6fff40a517fa Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 2 Jun 2026 12:15:22 -0500 Subject: [PATCH 14/28] fix the function names --- python/sedonadb-expr/hatch_build.py | 8 ++++---- python/sedonadb/python/sedonadb/expr/expression.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 2b3ed2f864..3c4c13bf9c 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -539,14 +539,14 @@ def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: if call_args: if use_filter: lines.append( - f' return self._expr._call("{method_name}", *filter_missing_args({call_args}))' + f' return self._expr._call("{func.name}", *filter_missing_args({call_args}))' ) else: lines.append( - f' return self._expr._call("{method_name}", {call_args})' + f' return self._expr._call("{func.name}", {call_args})' ) else: - lines.append(f' return self._expr._call("{method_name}")') + lines.append(f' return self._expr._call("{func.name}")') lines.append("") return "\n".join(lines) @@ -588,7 +588,7 @@ def generate_geo_functions_py(functions: list[FunctionInfo]) -> str: " @property", f" def {prop_name}(self) -> Callable[..., ExprT]:", f" {docstring}", - f' return self._factory["{prop_name}"]', + f' return self._factory["{func.name}"]', ] ) diff --git a/python/sedonadb/python/sedonadb/expr/expression.py b/python/sedonadb/python/sedonadb/expr/expression.py index 1ba91d582d..338ed8e9f0 100644 --- a/python/sedonadb/python/sedonadb/expr/expression.py +++ b/python/sedonadb/python/sedonadb/expr/expression.py @@ -225,7 +225,7 @@ def geo(self) -> "GeoMethods[Expr]": def _call(self, name, *args) -> "Expr": if self._ctx is None: raise ValueError("Can't _call() Expr constructed without a SedonaContext") - return self._ctx.funcs[name](*args) + return self._ctx.funcs[name](self, *args) # Arithmetic operators ------------------------------------------------- # From bb269c9dc4806a3c95246febeb2f17c398f51c93 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 8 Jun 2026 10:27:57 -0500 Subject: [PATCH 15/28] formatting --- python/sedonadb-expr/python/sedonadb_expr/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/sedonadb-expr/python/sedonadb_expr/__init__.py b/python/sedonadb-expr/python/sedonadb_expr/__init__.py index e8b13e4d81..5aaef47adb 100644 --- a/python/sedonadb-expr/python/sedonadb_expr/__init__.py +++ b/python/sedonadb-expr/python/sedonadb_expr/__init__.py @@ -20,4 +20,10 @@ from sedonadb_expr._generated.geo_methods import GeoMethods from sedonadb_expr.utils import MISSING, filter_missing_args -__all__ = ["__version__", "GeoFunctions", "GeoMethods", "MISSING", "filter_missing_args"] +__all__ = [ + "__version__", + "GeoFunctions", + "GeoMethods", + "MISSING", + "filter_missing_args", +] From bc2826d399b99e90641adacfb7a62e41a350c031 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 8 Jun 2026 17:03:58 -0500 Subject: [PATCH 16/28] make pythonic --- python/sedonadb-expr/hatch_build.py | 47 ++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 3c4c13bf9c..357bba643c 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -52,6 +52,22 @@ DOCS_BASE_URL = "https://sedona.apache.org/sedonadb/latest/reference/sql" + +def camel_to_snake(name: str) -> str: + """Convert CamelCase/PascalCase to snake_case. + + Examples: + AsBinary -> as_binary + GeomFromWKB -> geom_from_wkb + AsEWKT -> as_ewkt + LineInterpolatePoint -> line_interpolate_point + """ + # Insert underscore before uppercase letters that follow lowercase letters + # or before uppercase letters that are followed by lowercase letters + result = re.sub(r"(?<=[a-z])(?=[A-Z])", "_", name) + result = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", "_", result) + return result.lower() + LICENSE_HEADER = """\ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -122,6 +138,7 @@ def __init__( kernels: list[dict[str, Any]], is_geo_method: bool = False, kernel_info: KernelInfo | None = None, + sql_name: str | None = None, ): self.name = name self.title = title @@ -129,6 +146,21 @@ def __init__( self.kernels = kernels self.is_geo_method = is_geo_method self.kernel_info = kernel_info + self.sql_name = sql_name or name # e.g., "ST_AsBinary" + + @property + def method_name(self) -> str: + """Return the snake_case method name derived from the SQL function name. + + e.g., ST_AsBinary -> as_binary, ST_GeomFromWKB -> geom_from_wkb + """ + sql = self.sql_name + # Strip prefix (ST_, RS_, S2_, SD_) + for prefix in ("ST_", "RS_", "S2_", "SD_"): + if sql.upper().startswith(prefix): + sql = sql[len(prefix) :] + break + return camel_to_snake(sql) def extract_frontmatter(file_path: Path) -> dict[str, Any]: @@ -355,6 +387,8 @@ def parse_qmd_file(qmd_path: Path) -> FunctionInfo | None: is_geo_method = True break + # Get properly-cased SQL function name from title field + sql_name = frontmatter.get("title", fn_name) title = frontmatter.get("description", frontmatter.get("title", fn_name)) description = extract_description_section(qmd_path) or "" @@ -367,6 +401,7 @@ def parse_qmd_file(qmd_path: Path) -> FunctionInfo | None: kernels=kernels, is_geo_method=is_geo_method, kernel_info=kernel_info, + sql_name=sql_name, ) @@ -492,10 +527,8 @@ def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: ] for func in sorted(geo_funcs, key=lambda f: f.name): - # Method name: strip st_ prefix - method_name = func.name - if method_name.startswith("st_"): - method_name = method_name[3:] + # Method name: derived from SQL function name (e.g., ST_AsBinary -> as_binary) + method_name = func.method_name kernel_info = func.kernel_info if not kernel_info: @@ -575,10 +608,8 @@ def generate_geo_functions_py(functions: list[FunctionInfo]) -> str: ] for func in sorted(geo_funcs, key=lambda f: f.name): - # Property name: strip st_ prefix - prop_name = func.name - if prop_name.startswith("st_"): - prop_name = prop_name[3:] + # Property name: derived from SQL function name (e.g., ST_AsBinary -> as_binary) + prop_name = func.method_name docstring = generate_function_docstring(func) From b29d9e791c087669c0741d5307edc31c3599b323 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 8 Jun 2026 17:13:03 -0500 Subject: [PATCH 17/28] add the builder --- python/sedonadb-expr/hatch_build.py | 643 +--------------- .../python/sedonadb_expr/_codegen.py | 721 ++++++++++++++++++ 2 files changed, 732 insertions(+), 632 deletions(-) create mode 100644 python/sedonadb-expr/python/sedonadb_expr/_codegen.py diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 357bba643c..8eec2dc0f3 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -24,609 +24,12 @@ from __future__ import annotations -import re -import textwrap from pathlib import Path from typing import Any -import yaml from hatchling.builders.hooks.plugin.interface import BuildHookInterface -# Type to parameter name mapping (matches R version) -TYPE_TO_PARAM: dict[str, str] = { - "geometry": "geom", - "geography": "geom", - "raster": "rast", - "float64": "x", - "double": "x", - "integer": "n", - "int64": "n", - "string": "s", - "boolean": "b", - "crs": "crs", -} - -# Types that qualify for geo methods (first arg piped in) -GEO_TYPES = {"geometry", "geography"} - -DOCS_BASE_URL = "https://sedona.apache.org/sedonadb/latest/reference/sql" - - -def camel_to_snake(name: str) -> str: - """Convert CamelCase/PascalCase to snake_case. - - Examples: - AsBinary -> as_binary - GeomFromWKB -> geom_from_wkb - AsEWKT -> as_ewkt - LineInterpolatePoint -> line_interpolate_point - """ - # Insert underscore before uppercase letters that follow lowercase letters - # or before uppercase letters that are followed by lowercase letters - result = re.sub(r"(?<=[a-z])(?=[A-Z])", "_", name) - result = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", "_", result) - return result.lower() - -LICENSE_HEADER = """\ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" - - -class ArgInfo: - """Information about a kernel argument.""" - - def __init__( - self, - type: str, - name: str | None = None, - description: str | None = None, - optional: bool = False, - ): - self.type = type - self.name = name - self.description = description - self.optional = optional - - -class KernelInfo: - """Parsed kernel information.""" - - def __init__( - self, - args: list[ArgInfo] | None = None, - returns: str = "unknown", - variadic: bool = False, - kernel_signatures: list[str] | None = None, - ): - self.args = args if args is not None else [] - self.returns = returns - self.variadic = variadic - self.kernel_signatures = ( - kernel_signatures if kernel_signatures is not None else [] - ) - - @property - def has_optional_args(self) -> bool: - """Return True if any argument is optional.""" - return any(arg.optional for arg in self.args) - - -class FunctionInfo: - """Parsed function information from a .qmd file.""" - - def __init__( - self, - name: str, - title: str, - description: str, - kernels: list[dict[str, Any]], - is_geo_method: bool = False, - kernel_info: KernelInfo | None = None, - sql_name: str | None = None, - ): - self.name = name - self.title = title - self.description = description - self.kernels = kernels - self.is_geo_method = is_geo_method - self.kernel_info = kernel_info - self.sql_name = sql_name or name # e.g., "ST_AsBinary" - - @property - def method_name(self) -> str: - """Return the snake_case method name derived from the SQL function name. - - e.g., ST_AsBinary -> as_binary, ST_GeomFromWKB -> geom_from_wkb - """ - sql = self.sql_name - # Strip prefix (ST_, RS_, S2_, SD_) - for prefix in ("ST_", "RS_", "S2_", "SD_"): - if sql.upper().startswith(prefix): - sql = sql[len(prefix) :] - break - return camel_to_snake(sql) - - -def extract_frontmatter(file_path: Path) -> dict[str, Any]: - """Extract YAML frontmatter from a .qmd file.""" - content = file_path.read_text() - lines = content.split("\n") - - # Find YAML delimiters - delimiters = [i for i, line in enumerate(lines) if line.strip() == "---"] - if len(delimiters) < 2: - raise ValueError(f"Could not find YAML frontmatter in {file_path}") - - yaml_text = "\n".join(lines[delimiters[0] + 1 : delimiters[1]]) - return yaml.safe_load(yaml_text) - - -def extract_description_section(file_path: Path) -> str | None: - """Extract the ## Description section from the .qmd file body.""" - content = file_path.read_text() - lines = content.split("\n") - - # Find end of frontmatter - delimiters = [i for i, line in enumerate(lines) if line.strip() == "---"] - if len(delimiters) < 2: - return None - - body_lines = lines[delimiters[1] + 1 :] - - # Find ## Description section - desc_start = None - for i, line in enumerate(body_lines): - if line.startswith("## Description"): - desc_start = i - break - - if desc_start is None: - return None - - # Find next section or end - remaining = body_lines[desc_start + 1 :] - next_section = None - for i, line in enumerate(remaining): - if line.startswith("## "): - next_section = i - break - - if next_section is None: - desc_lines = remaining - else: - desc_lines = remaining[:next_section] - - # Process lines: preserve markdown lists, join paragraphs - result_lines: list[str] = [] - current_paragraph: list[str] = [] - - for line in desc_lines: - stripped = line.strip() - # Check if this is a list item (-, *, or numbered) - is_list_item = bool(re.match(r"^[-*]|\d+\.", stripped)) - - if not stripped: - # Empty line: flush current paragraph and add blank line for separation - if current_paragraph: - result_lines.append(" ".join(current_paragraph)) - current_paragraph = [] - result_lines.append("") # Preserve paragraph break - elif is_list_item: - # List item: flush paragraph first, then add list item - if current_paragraph: - result_lines.append(" ".join(current_paragraph)) - current_paragraph = [] - result_lines.append(stripped) - else: - # Regular text: accumulate into paragraph - current_paragraph.append(stripped) - - # Flush any remaining paragraph - if current_paragraph: - result_lines.append(" ".join(current_paragraph)) - - desc_text = "\n".join(result_lines).strip() - return desc_text if desc_text else None - - -def type_to_param_name( - arg_type: str, index: int = 0, needs_suffix: bool = False -) -> str: - """Generate parameter name from type.""" - base_name = TYPE_TO_PARAM.get(arg_type, "arg") - if needs_suffix: - suffix = chr(ord("a") + index) # 0=a, 1=b, 2=c, ... - return f"{base_name}_{suffix}" - return base_name - - -def parse_kernel_args(kernel_args: list) -> list[ArgInfo]: - """Parse kernel arguments into ArgInfo objects.""" - result = [] - for arg in kernel_args: - if isinstance(arg, str): - result.append(ArgInfo(type=arg)) - elif isinstance(arg, dict): - result.append( - ArgInfo( - type=arg.get("type", "unknown"), - name=arg.get("name"), - description=arg.get("description"), - ) - ) - else: - result.append(ArgInfo(type="unknown")) - return result - - -def generate_arg_names(arg_info_list: list[ArgInfo]) -> list[str]: - """Generate argument names for a kernel's args.""" - types = [info.type for info in arg_info_list] - type_counts: dict[str, int] = {} - type_totals: dict[str, int] = {} - - # Count total occurrences of each type - for t in types: - type_totals[t] = type_totals.get(t, 0) + 1 - - arg_names = [] - for info in arg_info_list: - arg_type = info.type - arg_name = info.name - - if arg_name is None: - type_counts[arg_type] = type_counts.get(arg_type, 0) + 1 - needs_suffix = type_totals.get(arg_type, 0) > 1 - arg_name = type_to_param_name( - arg_type, type_counts[arg_type] - 1, needs_suffix - ) - - arg_names.append(arg_name) - - return arg_names - - -def parse_kernel_params(kernels: list[dict], fn_name: str = "unknown") -> KernelInfo: - """Parse kernel arguments and generate parameter info.""" - if not kernels: - return KernelInfo() - - # Process all kernels - all_kernel_info = [parse_kernel_args(k.get("args", [])) for k in kernels] - all_kernel_args = [generate_arg_names(info) for info in all_kernel_info] - - # Find max args - kernel_lengths = [len(args) for args in all_kernel_args] - max_args = max(kernel_lengths) if kernel_lengths else 0 - - # Check for argument name conflicts - has_conflict = False - for pos in range(max_args): - names_at_pos = set() - for args in all_kernel_args: - if pos < len(args): - names_at_pos.add(args[pos]) - if len(names_at_pos) > 1: - has_conflict = True - break - - returns = kernels[0].get("returns", "unknown") - - if has_conflict: - # Build signature strings for documentation - kernel_signatures = [] - for i, args in enumerate(all_kernel_args): - types = [info.type for info in all_kernel_info[i]] - sig = ", ".join(f"{arg} ({t})" for arg, t in zip(args, types)) - kernel_signatures.append(sig) - - return KernelInfo( - args=[], - returns=returns, - variadic=True, - kernel_signatures=kernel_signatures, - ) - - # Use kernel with most arguments as reference - ref_idx = kernel_lengths.index(max(kernel_lengths)) if kernel_lengths else 0 - arg_info = all_kernel_info[ref_idx] if all_kernel_info else [] - arg_names = all_kernel_args[ref_idx] if all_kernel_args else [] - - # Determine minimum args (args present in all kernels) - min_args = min(kernel_lengths) if kernel_lengths else 0 - - # Update ArgInfo with generated names and optional flag - for i, info in enumerate(arg_info): - if info.name is None: - info.name = arg_names[i] - # Args beyond min_args are optional (not present in all kernels) - info.optional = i >= min_args - - return KernelInfo(args=arg_info, returns=returns, variadic=False) - - -def parse_qmd_file(qmd_path: Path) -> FunctionInfo | None: - """Parse a .qmd file and return FunctionInfo.""" - fn_name = qmd_path.stem # e.g., "st_envelope" - - try: - frontmatter = extract_frontmatter(qmd_path) - except Exception: - return None - - kernels = frontmatter.get("kernels", []) - if not kernels: - return None - - # Check if first argument of any kernel is geometry/geography - is_geo_method = False - for kernel in kernels: - args = kernel.get("args", []) - if args: - first_arg = args[0] - first_type = ( - first_arg if isinstance(first_arg, str) else first_arg.get("type", "") - ) - if first_type in GEO_TYPES: - is_geo_method = True - break - - # Get properly-cased SQL function name from title field - sql_name = frontmatter.get("title", fn_name) - title = frontmatter.get("description", frontmatter.get("title", fn_name)) - description = extract_description_section(qmd_path) or "" - - kernel_info = parse_kernel_params(kernels, fn_name) - - return FunctionInfo( - name=fn_name, - title=title, - description=description, - kernels=kernels, - is_geo_method=is_geo_method, - kernel_info=kernel_info, - sql_name=sql_name, - ) - - -def wrap_docstring(text: str, width: int = 88, indent: str = " ") -> str: - """Wrap text for docstrings, preserving markdown lists.""" - if not text: - return "" - - result_lines: list[str] = [] - for i, line in enumerate(text.split("\n")): - if not line.strip(): - result_lines.append("") - continue - - # Wrap each line separately - wrapped = textwrap.fill(line, width=width - len(indent)) - for j, wrapped_line in enumerate(wrapped.split("\n")): - if i == 0 and j == 0: - # First line of first paragraph - no indent - result_lines.append(wrapped_line) - else: - result_lines.append(indent + wrapped_line) - - return "\n".join(result_lines) - - -def generate_method_docstring(func: FunctionInfo) -> str: - """Generate docstring for a method.""" - title = func.title.strip() - parts = [f'"""{title}'] - - if func.description and func.description.strip() != title: - parts.append("") - parts.append(wrap_docstring(func.description, indent=" ")) - - kernel_info = func.kernel_info - if kernel_info: - if kernel_info.variadic and kernel_info.kernel_signatures: - # Variadic mode: document with bulleted list of supported combinations - # Skip the first arg (piped in via self._expr) from each signature - parts.append("") - parts.append("Variants:") - for sig in kernel_info.kernel_signatures: - # Split signature, skip first arg, rejoin - arg_parts = [p.strip() for p in sig.split(",")] - remaining = ", ".join(arg_parts[1:]) if len(arg_parts) > 1 else "" - if remaining: - parts.append(f" - {remaining}") - else: - parts.append(" - (no additional arguments)") - elif kernel_info.args: - # Skip first arg (piped in via self._expr) - remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] - if remaining_args: - parts.append("") - parts.append("Args:") - for arg in remaining_args: - desc = arg.description or f"Input {arg.type}" - parts.append(f" {arg.name}: {desc}") - - parts.append("") - parts.append("See Also:") - parts.append(f" {DOCS_BASE_URL}/{func.name}/") - parts.append('"""') - - return "\n ".join(parts) - - -def generate_function_docstring(func: FunctionInfo) -> str: - """Generate docstring for a standalone function property.""" - title = func.title.strip() - parts = [f'"""{title}'] - - if func.description and func.description.strip() != title: - parts.append("") - parts.append(wrap_docstring(func.description, indent=" ")) - - kernel_info = func.kernel_info - if kernel_info: - if kernel_info.variadic and kernel_info.kernel_signatures: - # Variadic mode: document with bulleted list of supported combinations - parts.append("") - parts.append("Variants:") - for sig in kernel_info.kernel_signatures: - parts.append(f" - {sig}") - elif kernel_info.args: - parts.append("") - parts.append("Args:") - for arg in kernel_info.args: - desc = arg.description or f"Input {arg.type}" - parts.append(f" {arg.name}: {desc}") - - parts.append("") - parts.append("See Also:") - parts.append(f" {DOCS_BASE_URL}/{func.name}/") - parts.append('"""') - - return "\n ".join(parts) - - -def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: - """Generate geo_methods.py content.""" - # Filter to only geo methods (first arg is geometry/geography) - geo_funcs = [f for f in functions if f.is_geo_method] - - lines = [ - LICENSE_HEADER, - "", - '"""Auto-generated geometry/geography methods - do not edit."""', - "", - "from typing import Generic, TypeVar", - "", - "from sedonadb_expr.utils import MISSING, filter_missing_args", - "", - 'ExprT = TypeVar("ExprT")', - "", - "", - "class GeoMethods(Generic[ExprT]):", - ' """Geometry and geography methods accessible via expr.geo."""', - "", - " def __init__(self, expr: ExprT) -> None:", - " self._expr = expr", - ] - - for func in sorted(geo_funcs, key=lambda f: f.name): - # Method name: derived from SQL function name (e.g., ST_AsBinary -> as_binary) - method_name = func.method_name - - kernel_info = func.kernel_info - if not kernel_info: - continue - - # Build method signature - skip first arg (piped in) - remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] - # Check if any remaining args are optional - has_optional = any(arg.optional for arg in remaining_args) - - if kernel_info.variadic: - params = "self, *args" - call_args = "*args" - use_filter = False - elif remaining_args: - # Build param strings with MISSING default for optional args - param_strs = [] - for arg in remaining_args: - if arg.optional: - param_strs.append(f"{arg.name}=MISSING") - else: - param_strs.append(arg.name) - params = "self, " + ", ".join(param_strs) - call_args = ", ".join(arg.name for arg in remaining_args) - use_filter = has_optional - else: - params = "self" - call_args = "" - use_filter = False - - docstring = generate_method_docstring(func) - - lines.extend( - [ - "", - f" def {method_name}({params}) -> ExprT:", - f" {docstring}", - ] - ) - - if call_args: - if use_filter: - lines.append( - f' return self._expr._call("{func.name}", *filter_missing_args({call_args}))' - ) - else: - lines.append( - f' return self._expr._call("{func.name}", {call_args})' - ) - else: - lines.append(f' return self._expr._call("{func.name}")') - - lines.append("") - return "\n".join(lines) - - -def generate_geo_functions_py(functions: list[FunctionInfo]) -> str: - """Generate geo_functions.py content.""" - # Filter to only geo methods (these become callable properties) - geo_funcs = [f for f in functions if f.is_geo_method] - - lines = [ - LICENSE_HEADER, - "", - '"""Auto-generated geometry/geography functions - do not edit."""', - "", - "from typing import Callable, Generic, TypeVar", - "", - 'ExprT = TypeVar("ExprT")', - "", - "", - "class GeoFunctions(Generic[ExprT]):", - ' """Geometry and geography functions accessible via a factory."""', - "", - " def __init__(self, factory) -> None:", - " self._factory = factory", - ] - - for func in sorted(geo_funcs, key=lambda f: f.name): - # Property name: derived from SQL function name (e.g., ST_AsBinary -> as_binary) - prop_name = func.method_name - - docstring = generate_function_docstring(func) - - lines.extend( - [ - "", - " @property", - f" def {prop_name}(self) -> Callable[..., ExprT]:", - f" {docstring}", - f' return self._factory["{func.name}"]', - ] - ) - - lines.append("") - return "\n".join(lines) - - class CustomBuildHook(BuildHookInterface): """Custom build hook that generates Python sources from SQL docs.""" @@ -656,50 +59,26 @@ def _generate_version(self, version: str) -> None: def _generate_sources(self) -> None: """Generate Python source files from docs/reference/sql.""" + # Import here to avoid circular imports and allow standalone usage + import sys + here = Path(__file__).parent + # Add the package to sys.path so we can import _codegen + sys.path.insert(0, str(here / "python")) + from sedonadb_expr._codegen import generate_sources + docs_sql = here.parent.parent / "docs" / "reference" / "sql" output_dir = here / "python" / "sedonadb_expr" / "_generated" - # Ensure output directory exists - output_dir.mkdir(parents=True, exist_ok=True) - - # Create __init__.py for the generated module - init_file = output_dir / "__init__.py" - init_file.write_text( - "# Auto-generated module - do not edit\n" - "# Generated from docs/reference/sql\n" - ) + result = generate_sources(docs_sql, output_dir) - if not docs_sql.exists(): + if result.total_functions == 0 and not docs_sql.exists(): self.app.display_warning( f"docs/reference/sql not found at {docs_sql}, skipping generation" ) return - # Find all .qmd files - qmd_files = sorted(docs_sql.glob("st_*.qmd")) - - # Parse all function definitions - functions: list[FunctionInfo] = [] - for qmd_file in qmd_files: - func = parse_qmd_file(qmd_file) - if func: - functions.append(func) - - # Generate geo_methods.py - geo_methods_content = generate_geo_methods_py(functions) - geo_methods_file = output_dir / "geo_methods.py" - geo_methods_file.write_text(geo_methods_content) - - # Generate geo_functions.py - geo_functions_content = generate_geo_functions_py(functions) - geo_functions_file = output_dir / "geo_functions.py" - geo_functions_file.write_text(geo_functions_content) - - # Count stats - geo_method_count = sum(1 for f in functions if f.is_geo_method) - self.app.display_info( - f"Generated {len(functions)} functions total, " - f"{geo_method_count} geo methods" + f"Generated {result.total_functions} functions total, " + f"{result.geo_method_count} geo methods" ) diff --git a/python/sedonadb-expr/python/sedonadb_expr/_codegen.py b/python/sedonadb-expr/python/sedonadb_expr/_codegen.py new file mode 100644 index 0000000000..33b11e31e3 --- /dev/null +++ b/python/sedonadb-expr/python/sedonadb_expr/_codegen.py @@ -0,0 +1,721 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Code generation for sedonadb-expr. + +This module generates Python source files from docs/reference/sql documentation files. +It can be invoked during the build process or run as a standalone script. +""" + +from __future__ import annotations + +import re +import textwrap +from pathlib import Path +from typing import Any + +import yaml + + +# Type to parameter name mapping (matches R version) +TYPE_TO_PARAM: dict[str, str] = { + "geometry": "geom", + "geography": "geom", + "raster": "rast", + "float64": "x", + "double": "x", + "integer": "n", + "int64": "n", + "string": "s", + "boolean": "b", + "crs": "crs", +} + +# Types that qualify for geo methods (first arg piped in) +GEO_TYPES = {"geometry", "geography"} + +DOCS_BASE_URL = "https://sedona.apache.org/sedonadb/latest/reference/sql" + + +def camel_to_snake(name: str) -> str: + """Convert CamelCase/PascalCase to snake_case. + + Examples: + AsBinary -> as_binary + GeomFromWKB -> geom_from_wkb + AsEWKT -> as_ewkt + LineInterpolatePoint -> line_interpolate_point + """ + # Insert underscore before uppercase letters that follow lowercase letters + # or before uppercase letters that are followed by lowercase letters + result = re.sub(r"(?<=[a-z])(?=[A-Z])", "_", name) + result = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", "_", result) + return result.lower() + + +LICENSE_HEADER = """\ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" + + +class ArgInfo: + """Information about a kernel argument.""" + + def __init__( + self, + type: str, + name: str | None = None, + description: str | None = None, + optional: bool = False, + ): + self.type = type + self.name = name + self.description = description + self.optional = optional + + +class KernelInfo: + """Parsed kernel information.""" + + def __init__( + self, + args: list[ArgInfo] | None = None, + returns: str = "unknown", + variadic: bool = False, + kernel_signatures: list[str] | None = None, + ): + self.args = args if args is not None else [] + self.returns = returns + self.variadic = variadic + self.kernel_signatures = ( + kernel_signatures if kernel_signatures is not None else [] + ) + + @property + def has_optional_args(self) -> bool: + """Return True if any argument is optional.""" + return any(arg.optional for arg in self.args) + + +class FunctionInfo: + """Parsed function information from a .qmd file.""" + + def __init__( + self, + name: str, + title: str, + description: str, + kernels: list[dict[str, Any]], + is_geo_method: bool = False, + kernel_info: KernelInfo | None = None, + sql_name: str | None = None, + ): + self.name = name + self.title = title + self.description = description + self.kernels = kernels + self.is_geo_method = is_geo_method + self.kernel_info = kernel_info + self.sql_name = sql_name or name # e.g., "ST_AsBinary" + + @property + def method_name(self) -> str: + """Return the snake_case method name derived from the SQL function name. + + e.g., ST_AsBinary -> as_binary, ST_GeomFromWKB -> geom_from_wkb + """ + sql = self.sql_name + # Strip prefix (ST_, RS_, S2_, SD_) + for prefix in ("ST_", "RS_", "S2_", "SD_"): + if sql.upper().startswith(prefix): + sql = sql[len(prefix) :] + break + return camel_to_snake(sql) + + +def extract_frontmatter(file_path: Path) -> dict[str, Any]: + """Extract YAML frontmatter from a .qmd file.""" + content = file_path.read_text() + lines = content.split("\n") + + # Find YAML delimiters + delimiters = [i for i, line in enumerate(lines) if line.strip() == "---"] + if len(delimiters) < 2: + raise ValueError(f"Could not find YAML frontmatter in {file_path}") + + yaml_text = "\n".join(lines[delimiters[0] + 1 : delimiters[1]]) + return yaml.safe_load(yaml_text) + + +def extract_description_section(file_path: Path) -> str | None: + """Extract the ## Description section from the .qmd file body.""" + content = file_path.read_text() + lines = content.split("\n") + + # Find end of frontmatter + delimiters = [i for i, line in enumerate(lines) if line.strip() == "---"] + if len(delimiters) < 2: + return None + + body_lines = lines[delimiters[1] + 1 :] + + # Find ## Description section + desc_start = None + for i, line in enumerate(body_lines): + if line.startswith("## Description"): + desc_start = i + break + + if desc_start is None: + return None + + # Find next section or end + remaining = body_lines[desc_start + 1 :] + next_section = None + for i, line in enumerate(remaining): + if line.startswith("## "): + next_section = i + break + + if next_section is None: + desc_lines = remaining + else: + desc_lines = remaining[:next_section] + + # Process lines: preserve markdown lists, join paragraphs + result_lines: list[str] = [] + current_paragraph: list[str] = [] + + for line in desc_lines: + stripped = line.strip() + # Check if this is a list item (-, *, or numbered) + is_list_item = bool(re.match(r"^[-*]|\d+\.", stripped)) + + if not stripped: + # Empty line: flush current paragraph and add blank line for separation + if current_paragraph: + result_lines.append(" ".join(current_paragraph)) + current_paragraph = [] + result_lines.append("") # Preserve paragraph break + elif is_list_item: + # List item: flush paragraph first, then add list item + if current_paragraph: + result_lines.append(" ".join(current_paragraph)) + current_paragraph = [] + result_lines.append(stripped) + else: + # Regular text: accumulate into paragraph + current_paragraph.append(stripped) + + # Flush any remaining paragraph + if current_paragraph: + result_lines.append(" ".join(current_paragraph)) + + desc_text = "\n".join(result_lines).strip() + return desc_text if desc_text else None + + +def type_to_param_name( + arg_type: str, index: int = 0, needs_suffix: bool = False +) -> str: + """Generate parameter name from type.""" + base_name = TYPE_TO_PARAM.get(arg_type, "arg") + if needs_suffix: + suffix = chr(ord("a") + index) # 0=a, 1=b, 2=c, ... + return f"{base_name}_{suffix}" + return base_name + + +def parse_kernel_args(kernel_args: list) -> list[ArgInfo]: + """Parse kernel arguments into ArgInfo objects.""" + result = [] + for arg in kernel_args: + if isinstance(arg, str): + result.append(ArgInfo(type=arg)) + elif isinstance(arg, dict): + result.append( + ArgInfo( + type=arg.get("type", "unknown"), + name=arg.get("name"), + description=arg.get("description"), + ) + ) + else: + result.append(ArgInfo(type="unknown")) + return result + + +def generate_arg_names(arg_info_list: list[ArgInfo]) -> list[str]: + """Generate argument names for a kernel's args.""" + types = [info.type for info in arg_info_list] + type_counts: dict[str, int] = {} + type_totals: dict[str, int] = {} + + # Count total occurrences of each type + for t in types: + type_totals[t] = type_totals.get(t, 0) + 1 + + arg_names = [] + for info in arg_info_list: + arg_type = info.type + arg_name = info.name + + if arg_name is None: + type_counts[arg_type] = type_counts.get(arg_type, 0) + 1 + needs_suffix = type_totals.get(arg_type, 0) > 1 + arg_name = type_to_param_name( + arg_type, type_counts[arg_type] - 1, needs_suffix + ) + + arg_names.append(arg_name) + + return arg_names + + +def parse_kernel_params(kernels: list[dict], fn_name: str = "unknown") -> KernelInfo: + """Parse kernel arguments and generate parameter info.""" + if not kernels: + return KernelInfo() + + # Process all kernels + all_kernel_info = [parse_kernel_args(k.get("args", [])) for k in kernels] + all_kernel_args = [generate_arg_names(info) for info in all_kernel_info] + + # Find max args + kernel_lengths = [len(args) for args in all_kernel_args] + max_args = max(kernel_lengths) if kernel_lengths else 0 + + # Check for argument name conflicts + has_conflict = False + for pos in range(max_args): + names_at_pos = set() + for args in all_kernel_args: + if pos < len(args): + names_at_pos.add(args[pos]) + if len(names_at_pos) > 1: + has_conflict = True + break + + returns = kernels[0].get("returns", "unknown") + + if has_conflict: + # Build signature strings for documentation + kernel_signatures = [] + for i, args in enumerate(all_kernel_args): + types = [info.type for info in all_kernel_info[i]] + sig = ", ".join(f"{arg} ({t})" for arg, t in zip(args, types)) + kernel_signatures.append(sig) + + return KernelInfo( + args=[], + returns=returns, + variadic=True, + kernel_signatures=kernel_signatures, + ) + + # Use kernel with most arguments as reference + ref_idx = kernel_lengths.index(max(kernel_lengths)) if kernel_lengths else 0 + arg_info = all_kernel_info[ref_idx] if all_kernel_info else [] + arg_names = all_kernel_args[ref_idx] if all_kernel_args else [] + + # Determine minimum args (args present in all kernels) + min_args = min(kernel_lengths) if kernel_lengths else 0 + + # Update ArgInfo with generated names and optional flag + for i, info in enumerate(arg_info): + if info.name is None: + info.name = arg_names[i] + # Args beyond min_args are optional (not present in all kernels) + info.optional = i >= min_args + + return KernelInfo(args=arg_info, returns=returns, variadic=False) + + +def parse_qmd_file(qmd_path: Path) -> FunctionInfo | None: + """Parse a .qmd file and return FunctionInfo.""" + fn_name = qmd_path.stem # e.g., "st_envelope" + + try: + frontmatter = extract_frontmatter(qmd_path) + except Exception: + return None + + kernels = frontmatter.get("kernels", []) + if not kernels: + return None + + # Check if first argument of any kernel is geometry/geography + is_geo_method = False + for kernel in kernels: + args = kernel.get("args", []) + if args: + first_arg = args[0] + first_type = ( + first_arg if isinstance(first_arg, str) else first_arg.get("type", "") + ) + if first_type in GEO_TYPES: + is_geo_method = True + break + + # Get properly-cased SQL function name from title field + sql_name = frontmatter.get("title", fn_name) + title = frontmatter.get("description", frontmatter.get("title", fn_name)) + description = extract_description_section(qmd_path) or "" + + kernel_info = parse_kernel_params(kernels, fn_name) + + return FunctionInfo( + name=fn_name, + title=title, + description=description, + kernels=kernels, + is_geo_method=is_geo_method, + kernel_info=kernel_info, + sql_name=sql_name, + ) + + +def wrap_docstring(text: str, width: int = 88, indent: str = " ") -> str: + """Wrap text for docstrings, preserving markdown lists.""" + if not text: + return "" + + result_lines: list[str] = [] + for i, line in enumerate(text.split("\n")): + if not line.strip(): + result_lines.append("") + continue + + # Wrap each line separately + wrapped = textwrap.fill(line, width=width - len(indent)) + for j, wrapped_line in enumerate(wrapped.split("\n")): + if i == 0 and j == 0: + # First line of first paragraph - no indent + result_lines.append(wrapped_line) + else: + result_lines.append(indent + wrapped_line) + + return "\n".join(result_lines) + + +def generate_method_docstring(func: FunctionInfo) -> str: + """Generate docstring for a method.""" + title = func.title.strip() + parts = [f'"""{title}'] + + if func.description and func.description.strip() != title: + parts.append("") + parts.append(wrap_docstring(func.description, indent=" ")) + + kernel_info = func.kernel_info + if kernel_info: + if kernel_info.variadic and kernel_info.kernel_signatures: + # Variadic mode: document with bulleted list of supported combinations + # Skip the first arg (piped in via self._expr) from each signature + parts.append("") + parts.append("Variants:") + for sig in kernel_info.kernel_signatures: + # Split signature, skip first arg, rejoin + arg_parts = [p.strip() for p in sig.split(",")] + remaining = ", ".join(arg_parts[1:]) if len(arg_parts) > 1 else "" + if remaining: + parts.append(f" - {remaining}") + else: + parts.append(" - (no additional arguments)") + elif kernel_info.args: + # Skip first arg (piped in via self._expr) + remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] + if remaining_args: + parts.append("") + parts.append("Args:") + for arg in remaining_args: + desc = arg.description or f"Input {arg.type}" + parts.append(f" {arg.name}: {desc}") + + parts.append("") + parts.append("See Also:") + parts.append(f" {DOCS_BASE_URL}/{func.name}/") + parts.append('"""') + + return "\n ".join(parts) + + +def generate_function_docstring(func: FunctionInfo) -> str: + """Generate docstring for a standalone function property.""" + title = func.title.strip() + parts = [f'"""{title}'] + + if func.description and func.description.strip() != title: + parts.append("") + parts.append(wrap_docstring(func.description, indent=" ")) + + kernel_info = func.kernel_info + if kernel_info: + if kernel_info.variadic and kernel_info.kernel_signatures: + # Variadic mode: document with bulleted list of supported combinations + parts.append("") + parts.append("Variants:") + for sig in kernel_info.kernel_signatures: + parts.append(f" - {sig}") + elif kernel_info.args: + parts.append("") + parts.append("Args:") + for arg in kernel_info.args: + desc = arg.description or f"Input {arg.type}" + parts.append(f" {arg.name}: {desc}") + + parts.append("") + parts.append("See Also:") + parts.append(f" {DOCS_BASE_URL}/{func.name}/") + parts.append('"""') + + return "\n ".join(parts) + + +def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: + """Generate geo_methods.py content.""" + # Filter to only geo methods (first arg is geometry/geography) + geo_funcs = [f for f in functions if f.is_geo_method] + + lines = [ + LICENSE_HEADER, + "", + '"""Auto-generated geometry/geography methods - do not edit."""', + "", + "from typing import Generic, TypeVar", + "", + "from sedonadb_expr.utils import MISSING, filter_missing_args", + "", + 'ExprT = TypeVar("ExprT")', + "", + "", + "class GeoMethods(Generic[ExprT]):", + ' """Geometry and geography methods accessible via expr.geo."""', + "", + " def __init__(self, expr: ExprT) -> None:", + " self._expr = expr", + ] + + for func in sorted(geo_funcs, key=lambda f: f.name): + # Method name: derived from SQL function name (e.g., ST_AsBinary -> as_binary) + method_name = func.method_name + + kernel_info = func.kernel_info + if not kernel_info: + continue + + # Build method signature - skip first arg (piped in) + remaining_args = kernel_info.args[1:] if len(kernel_info.args) > 1 else [] + # Check if any remaining args are optional + has_optional = any(arg.optional for arg in remaining_args) + + if kernel_info.variadic: + params = "self, *args" + call_args = "*args" + use_filter = False + elif remaining_args: + # Build param strings with MISSING default for optional args + param_strs = [] + for arg in remaining_args: + if arg.optional: + param_strs.append(f"{arg.name}=MISSING") + else: + param_strs.append(arg.name) + params = "self, " + ", ".join(param_strs) + call_args = ", ".join(arg.name for arg in remaining_args) + use_filter = has_optional + else: + params = "self" + call_args = "" + use_filter = False + + docstring = generate_method_docstring(func) + + lines.extend( + [ + "", + f" def {method_name}({params}) -> ExprT:", + f" {docstring}", + ] + ) + + if call_args: + if use_filter: + lines.append( + f' return self._expr._call("{func.name}", *filter_missing_args({call_args}))' + ) + else: + lines.append( + f' return self._expr._call("{func.name}", {call_args})' + ) + else: + lines.append(f' return self._expr._call("{func.name}")') + + lines.append("") + return "\n".join(lines) + + +def generate_geo_functions_py(functions: list[FunctionInfo]) -> str: + """Generate geo_functions.py content.""" + # Filter to only geo methods (these become callable properties) + geo_funcs = [f for f in functions if f.is_geo_method] + + lines = [ + LICENSE_HEADER, + "", + '"""Auto-generated geometry/geography functions - do not edit."""', + "", + "from typing import Callable, Generic, TypeVar", + "", + 'ExprT = TypeVar("ExprT")', + "", + "", + "class GeoFunctions(Generic[ExprT]):", + ' """Geometry and geography functions accessible via a factory."""', + "", + " def __init__(self, factory) -> None:", + " self._factory = factory", + ] + + for func in sorted(geo_funcs, key=lambda f: f.name): + # Property name: derived from SQL function name (e.g., ST_AsBinary -> as_binary) + prop_name = func.method_name + + docstring = generate_function_docstring(func) + + lines.extend( + [ + "", + " @property", + f" def {prop_name}(self) -> Callable[..., ExprT]:", + f" {docstring}", + f' return self._factory["{func.name}"]', + ] + ) + + lines.append("") + return "\n".join(lines) + + +class GenerationResult: + """Result of code generation.""" + + def __init__( + self, + total_functions: int, + geo_method_count: int, + generated_files: list[Path], + ): + self.total_functions = total_functions + self.geo_method_count = geo_method_count + self.generated_files = generated_files + + +def generate_sources(docs_sql: Path, output_dir: Path) -> GenerationResult: + """Generate Python source files from docs/reference/sql. + + Args: + docs_sql: Path to docs/reference/sql directory containing .qmd files. + output_dir: Path to output directory for generated files. + + Returns: + GenerationResult with statistics about generated code. + """ + # Ensure output directory exists + output_dir.mkdir(parents=True, exist_ok=True) + + # Create __init__.py for the generated module + init_file = output_dir / "__init__.py" + init_file.write_text( + "# Auto-generated module - do not edit\n" + "# Generated from docs/reference/sql\n" + ) + + generated_files: list[Path] = [init_file] + + if not docs_sql.exists(): + return GenerationResult( + total_functions=0, + geo_method_count=0, + generated_files=generated_files, + ) + + # Find all .qmd files + qmd_files = sorted(docs_sql.glob("st_*.qmd")) + + # Parse all function definitions + functions: list[FunctionInfo] = [] + for qmd_file in qmd_files: + func = parse_qmd_file(qmd_file) + if func: + functions.append(func) + + # Generate geo_methods.py + geo_methods_content = generate_geo_methods_py(functions) + geo_methods_file = output_dir / "geo_methods.py" + geo_methods_file.write_text(geo_methods_content) + generated_files.append(geo_methods_file) + + # Generate geo_functions.py + geo_functions_content = generate_geo_functions_py(functions) + geo_functions_file = output_dir / "geo_functions.py" + geo_functions_file.write_text(geo_functions_content) + generated_files.append(geo_functions_file) + + # Count stats + geo_method_count = sum(1 for f in functions if f.is_geo_method) + + return GenerationResult( + total_functions=len(functions), + geo_method_count=geo_method_count, + generated_files=generated_files, + ) + + +if __name__ == "__main__": + # Allow running as a standalone script for development/debugging + import sys + + here = Path(__file__).parent + docs_sql = here.parent.parent.parent.parent / "docs" / "reference" / "sql" + output_dir = here / "_generated" + + result = generate_sources(docs_sql, output_dir) + + print(f"Generated {result.total_functions} functions total") + print(f"Generated {result.geo_method_count} geo methods") + print("Output files:") + for f in result.generated_files: + print(f" - {f}") + + sys.exit(0) From 617865e450b70d0e37cee722bd11f11c0df70c38 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 8 Jun 2026 17:18:35 -0500 Subject: [PATCH 18/28] imports --- python/sedonadb-expr/python/sedonadb_expr/__init__.py | 3 --- python/sedonadb/python/sedonadb/expr/expression.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/python/sedonadb-expr/python/sedonadb_expr/__init__.py b/python/sedonadb-expr/python/sedonadb_expr/__init__.py index 5aaef47adb..8536d706fc 100644 --- a/python/sedonadb-expr/python/sedonadb_expr/__init__.py +++ b/python/sedonadb-expr/python/sedonadb_expr/__init__.py @@ -18,12 +18,9 @@ from sedonadb_expr._version import __version__ from sedonadb_expr._generated.geo_functions import GeoFunctions from sedonadb_expr._generated.geo_methods import GeoMethods -from sedonadb_expr.utils import MISSING, filter_missing_args __all__ = [ "__version__", "GeoFunctions", "GeoMethods", - "MISSING", - "filter_missing_args", ] diff --git a/python/sedonadb/python/sedonadb/expr/expression.py b/python/sedonadb/python/sedonadb/expr/expression.py index 338ed8e9f0..019d693a74 100644 --- a/python/sedonadb/python/sedonadb/expr/expression.py +++ b/python/sedonadb/python/sedonadb/expr/expression.py @@ -35,7 +35,7 @@ if TYPE_CHECKING: - from sedonadb_expr._generated.geo_methods import GeoMethods + from sedonadb_expr import GeoMethods class Expr: From d33e2883a496b706965a9e8267dc0fe760eb1f05 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 8 Jun 2026 17:27:01 -0500 Subject: [PATCH 19/28] add toods --- .../tests/expr/test_function_expression.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/sedonadb/tests/expr/test_function_expression.py b/python/sedonadb/tests/expr/test_function_expression.py index add20624fe..f5b5300bab 100644 --- a/python/sedonadb/tests/expr/test_function_expression.py +++ b/python/sedonadb/tests/expr/test_function_expression.py @@ -27,6 +27,10 @@ def test_scalar_st_function_returns_expr(con): assert isinstance(e, Expr) assert repr(e) == 'Expr(st_geomfromwkt(Utf8("POINT (0 1)")))' + # TODO: do this for other functions too + e = con.lit("POINT (0 1)").funcs.st_geomfromwkt() + assert repr(e) == 'Expr(st_geomfromwkt(Utf8("POINT (0 1)")))' + def test_scalar_st_function_alias_returns_expr(con): st_geomfromtext = con.funcs.st_geomfromtext @@ -108,3 +112,13 @@ def test_function_expression_composed(con): repr(e) == 'Expr(st_area(st_geomfromwkt(Utf8("POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))"))))' ) + + +def test_geo_functions_accessor(con): + # TODO: + pass + + +def test_geo_methods_accessor(con): + # TODO: need to add this test + pass From 5dad2c7be2e133e64aef0c200c61f753704e37b4 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 8 Jun 2026 22:57:58 -0500 Subject: [PATCH 20/28] versions and basic testing --- python/sedonadb-expr/pyproject.toml | 3 +- .../python/sedonadb_expr/_codegen.py | 38 +-- python/sedonadb-expr/tests/test_codegen.py | 250 ++++++++++++++++++ 3 files changed, 275 insertions(+), 16 deletions(-) create mode 100644 python/sedonadb-expr/tests/test_codegen.py diff --git a/python/sedonadb-expr/pyproject.toml b/python/sedonadb-expr/pyproject.toml index 8ff2153cac..e7d207d7b3 100644 --- a/python/sedonadb-expr/pyproject.toml +++ b/python/sedonadb-expr/pyproject.toml @@ -16,7 +16,7 @@ # under the License. [build-system] -requires = ["hatchling", "pyyaml"] +requires = ["hatchling>=1.0.0", "pyyaml"] build-backend = "hatchling.build" [project] @@ -38,6 +38,7 @@ dynamic = ["version"] [project.optional-dependencies] test = [ "pytest", + "pyyaml", ] [tool.hatch.version] diff --git a/python/sedonadb-expr/python/sedonadb_expr/_codegen.py b/python/sedonadb-expr/python/sedonadb_expr/_codegen.py index 33b11e31e3..1d2309c51c 100644 --- a/python/sedonadb-expr/python/sedonadb_expr/_codegen.py +++ b/python/sedonadb-expr/python/sedonadb_expr/_codegen.py @@ -467,7 +467,8 @@ def generate_method_docstring(func: FunctionInfo) -> str: parts.append(f" {DOCS_BASE_URL}/{func.name}/") parts.append('"""') - return "\n ".join(parts) + joined = "\n ".join(parts) + return "\n".join(line.rstrip() for line in joined.split("\n")) def generate_function_docstring(func: FunctionInfo) -> str: @@ -499,7 +500,8 @@ def generate_function_docstring(func: FunctionInfo) -> str: parts.append(f" {DOCS_BASE_URL}/{func.name}/") parts.append('"""') - return "\n ".join(parts) + joined = "\n ".join(parts) + return "\n".join(line.rstrip() for line in joined.split("\n")) def generate_geo_methods_py(functions: list[FunctionInfo]) -> str: @@ -641,6 +643,24 @@ def __init__( self.generated_files = generated_files +def parse_qmd_files(docs_sql: Path, pattern: str) -> list[FunctionInfo]: + """Parse all .qmd files in a directory and return function definitions. + + Args: + docs_sql: Path to directory containing .qmd files. + + Returns: + List of parsed FunctionInfo objects. + """ + qmd_files = sorted(docs_sql.glob(pattern)) + functions: list[FunctionInfo] = [] + for qmd_file in qmd_files: + func = parse_qmd_file(qmd_file) + if func: + functions.append(func) + return functions + + def generate_sources(docs_sql: Path, output_dir: Path) -> GenerationResult: """Generate Python source files from docs/reference/sql. @@ -670,15 +690,7 @@ def generate_sources(docs_sql: Path, output_dir: Path) -> GenerationResult: generated_files=generated_files, ) - # Find all .qmd files - qmd_files = sorted(docs_sql.glob("st_*.qmd")) - - # Parse all function definitions - functions: list[FunctionInfo] = [] - for qmd_file in qmd_files: - func = parse_qmd_file(qmd_file) - if func: - functions.append(func) + functions = parse_qmd_files(docs_sql, "st_*.qmd") # Generate geo_methods.py geo_methods_content = generate_geo_methods_py(functions) @@ -704,8 +716,6 @@ def generate_sources(docs_sql: Path, output_dir: Path) -> GenerationResult: if __name__ == "__main__": # Allow running as a standalone script for development/debugging - import sys - here = Path(__file__).parent docs_sql = here.parent.parent.parent.parent / "docs" / "reference" / "sql" output_dir = here / "_generated" @@ -717,5 +727,3 @@ def generate_sources(docs_sql: Path, output_dir: Path) -> GenerationResult: print("Output files:") for f in result.generated_files: print(f" - {f}") - - sys.exit(0) diff --git a/python/sedonadb-expr/tests/test_codegen.py b/python/sedonadb-expr/tests/test_codegen.py new file mode 100644 index 0000000000..d007a7a224 --- /dev/null +++ b/python/sedonadb-expr/tests/test_codegen.py @@ -0,0 +1,250 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest + +from sedonadb_expr import _codegen + +SAMPLE_QMD = """\ +--- +title: ST_Buffer +description: Computes a buffered geometry. +kernels: + - returns: geometry + args: + - geometry + - name: distance + type: float64 + description: Radius of the buffer +--- + +## Description + +Returns a geometry covering all points within a given distance. +This paragraph could have more than one line. + +This is the second paragraph. + +- Followed by a list! +- Second bullet point + +## The Next Section + +...if there is one +""" + + +@pytest.fixture +def sample_qmd_path(): + with TemporaryDirectory() as tmpdir: + path = Path(tmpdir) / "st_buffer.qmd" + path.write_text(SAMPLE_QMD) + yield path + + +def test_camel_to_snake(): + assert _codegen.camel_to_snake("AsBinary") == "as_binary" + assert _codegen.camel_to_snake("GeomFromWKB") == "geom_from_wkb" + assert _codegen.camel_to_snake("AsEWKT") == "as_ewkt" + assert _codegen.camel_to_snake("LineInterpolatePoint") == "line_interpolate_point" + + +def test_extract_frontmatter(sample_qmd_path: Path): + fm = _codegen.extract_frontmatter(sample_qmd_path) + assert fm["title"] == "ST_Buffer" + + +def test_extract_description_section(sample_qmd_path: Path): + desc = _codegen.extract_description_section(sample_qmd_path) + expected = """\ +Returns a geometry covering all points within a given distance. This paragraph could have more than one line. + +This is the second paragraph. + +- Followed by a list! +- Second bullet point""" + assert desc == expected + + +def test_generate_method_docstring_with_args(): + # Test case: method with description and args (first arg skipped) + func = _codegen.FunctionInfo( + name="st_buffer", + title="ST_Buffer", + description="Returns a buffered geometry.", + kernels=[], + kernel_info=_codegen.KernelInfo( + args=[ + _codegen.ArgInfo( + type="geometry", name="geom", description="Input geometry" + ), + _codegen.ArgInfo( + type="float64", name="distance", description="Buffer distance" + ), + ], + returns="geometry", + ), + ) + docstring = _codegen.generate_method_docstring(func) + expected = '''\ +"""ST_Buffer + + Returns a buffered geometry. + + Args: + distance: Buffer distance + + See Also: + https://sedona.apache.org/sedonadb/latest/reference/sql/st_buffer/ + """''' + assert docstring == expected + + +def test_generate_method_docstring_variadic(): + # Variadic mode is triggered when parameter names conflict across kernels + # e.g., ST_Buffer has (geom, distance, params) vs (geog, distance, num_quad_segs) + # where position 3 conflicts: "params" vs "num_quad_segs" + func = _codegen.FunctionInfo( + name="st_buffer", + title="ST_Buffer", + description="Creates a buffer.", + kernels=[], + kernel_info=_codegen.KernelInfo( + args=[], + returns="geometry", + variadic=True, + kernel_signatures=[ + "geom (geometry), distance (float64)", + "geom (geometry), distance (float64), params (string)", + "geog (geography), distance (float64)", + "geog (geography), distance (float64), num_quad_segs (integer)", + "geog (geography), distance (float64), params (string)", + ], + ), + ) + docstring = _codegen.generate_method_docstring(func) + # Method docstring skips the first arg (piped in via self._expr) + expected = '''\ +"""ST_Buffer + + Creates a buffer. + + Variants: + - distance (float64) + - distance (float64), params (string) + - distance (float64) + - distance (float64), num_quad_segs (integer) + - distance (float64), params (string) + + See Also: + https://sedona.apache.org/sedonadb/latest/reference/sql/st_buffer/ + """''' + assert docstring == expected + + +def test_generate_function_docstring_with_args(): + # Test case: function with description and args (all args included) + func = _codegen.FunctionInfo( + name="st_buffer", + title="ST_Buffer", + description="Returns a buffered geometry.", + kernels=[], + kernel_info=_codegen.KernelInfo( + args=[ + _codegen.ArgInfo( + type="geometry", name="geom", description="Input geometry" + ), + _codegen.ArgInfo( + type="float64", name="distance", description="Buffer distance" + ), + ], + returns="geometry", + ), + ) + docstring = _codegen.generate_function_docstring(func) + expected = '''\ +"""ST_Buffer + + Returns a buffered geometry. + + Args: + geom: Input geometry + distance: Buffer distance + + See Also: + https://sedona.apache.org/sedonadb/latest/reference/sql/st_buffer/ + """''' + assert docstring == expected + + +def test_generate_function_docstring_variadic(): + # Variadic mode is triggered when parameter names conflict across kernels + func = _codegen.FunctionInfo( + name="st_buffer", + title="ST_Buffer", + description="Creates a buffer.", + kernels=[], + kernel_info=_codegen.KernelInfo( + args=[], + returns="geometry", + variadic=True, + kernel_signatures=[ + "geom (geometry), distance (float64)", + "geom (geometry), distance (float64), params (string)", + "geom (geography), distance (float64)", + "geom (geography), distance (float64), num_quad_segs (integer)", + "geom (geography), distance (float64), params (string)", + ], + ), + ) + docstring = _codegen.generate_function_docstring(func) + expected = '''\ +"""ST_Buffer + + Creates a buffer. + + Variants: + - geom (geometry), distance (float64) + - geom (geometry), distance (float64), params (string) + - geom (geography), distance (float64) + - geom (geography), distance (float64), num_quad_segs (integer) + - geom (geography), distance (float64), params (string) + + See Also: + https://sedona.apache.org/sedonadb/latest/reference/sql/st_buffer/ + """''' + assert docstring == expected + + +def test_generate_sources(sample_qmd_path: Path): + docs_sql = sample_qmd_path.parent + with TemporaryDirectory() as tmpdir: + output_dir = Path(tmpdir) / "output" + + result = _codegen.generate_sources(docs_sql, output_dir) + + assert result.total_functions == 1 + assert result.geo_method_count == 1 + assert len(result.generated_files) == 3 + + # Verify generated files compile as valid Python + for file_path in result.generated_files: + code = file_path.read_text() + compile(code, str(file_path), "exec") From 27d8b6e069ad9ffa7cae069b5654f8593da52f40 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 8 Jun 2026 23:03:39 -0500 Subject: [PATCH 21/28] format, tidy --- .../python/sedonadb_expr/_codegen.py | 3 +-- .../sedonadb-expr/tests/test_sedonadb_expr.py | 23 ------------------- .../python/sedonadb/expr/expression.py | 4 +--- .../tests/expr/test_function_expression.py | 6 ++++- 4 files changed, 7 insertions(+), 29 deletions(-) delete mode 100644 python/sedonadb-expr/tests/test_sedonadb_expr.py diff --git a/python/sedonadb-expr/python/sedonadb_expr/_codegen.py b/python/sedonadb-expr/python/sedonadb_expr/_codegen.py index 1d2309c51c..914a2e03f9 100644 --- a/python/sedonadb-expr/python/sedonadb_expr/_codegen.py +++ b/python/sedonadb-expr/python/sedonadb_expr/_codegen.py @@ -677,8 +677,7 @@ def generate_sources(docs_sql: Path, output_dir: Path) -> GenerationResult: # Create __init__.py for the generated module init_file = output_dir / "__init__.py" init_file.write_text( - "# Auto-generated module - do not edit\n" - "# Generated from docs/reference/sql\n" + "# Auto-generated module - do not edit\n# Generated from docs/reference/sql\n" ) generated_files: list[Path] = [init_file] diff --git a/python/sedonadb-expr/tests/test_sedonadb_expr.py b/python/sedonadb-expr/tests/test_sedonadb_expr.py deleted file mode 100644 index e693292110..0000000000 --- a/python/sedonadb-expr/tests/test_sedonadb_expr.py +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import sedonadb_expr - - -def test_version(): - # Version should match workspace Cargo.toml - assert sedonadb_expr.__version__ diff --git a/python/sedonadb/python/sedonadb/expr/expression.py b/python/sedonadb/python/sedonadb/expr/expression.py index 019d693a74..497b9b753b 100644 --- a/python/sedonadb/python/sedonadb/expr/expression.py +++ b/python/sedonadb/python/sedonadb/expr/expression.py @@ -223,9 +223,7 @@ def geo(self) -> "GeoMethods[Expr]": return GeoMethods(self) def _call(self, name, *args) -> "Expr": - if self._ctx is None: - raise ValueError("Can't _call() Expr constructed without a SedonaContext") - return self._ctx.funcs[name](self, *args) + return self.funcs[name](*args) # Arithmetic operators ------------------------------------------------- # diff --git a/python/sedonadb/tests/expr/test_function_expression.py b/python/sedonadb/tests/expr/test_function_expression.py index f5b5300bab..0d470f02ad 100644 --- a/python/sedonadb/tests/expr/test_function_expression.py +++ b/python/sedonadb/tests/expr/test_function_expression.py @@ -27,7 +27,7 @@ def test_scalar_st_function_returns_expr(con): assert isinstance(e, Expr) assert repr(e) == 'Expr(st_geomfromwkt(Utf8("POINT (0 1)")))' - # TODO: do this for other functions too + # Also check piped function from literal e = con.lit("POINT (0 1)").funcs.st_geomfromwkt() assert repr(e) == 'Expr(st_geomfromwkt(Utf8("POINT (0 1)")))' @@ -50,6 +50,10 @@ def test_scalar_st_function_with_column(con): assert isinstance(e, Expr) assert repr(e) == "Expr(st_area(geom))" + # Also check piped function from column + e = con.col("geom").funcs.st_geomfromwkt() + assert repr(e) == 'Expr(st_geomfromwkt(Utf8("POINT (0 1)")))' + def test_scalar_st_function_with_multiple_args(con): st_buffer = con.funcs.st_buffer From 0e626905c10dd609927aa688b5a585e8e26cceac Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 9 Jun 2026 10:46:44 -0500 Subject: [PATCH 22/28] tests --- .../sedonadb/python/sedonadb/expr/literal.py | 12 +++++++++ .../tests/expr/test_function_expression.py | 25 +++++++++++++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/python/sedonadb/python/sedonadb/expr/literal.py b/python/sedonadb/python/sedonadb/expr/literal.py index a0c21828c2..5fbbfcb045 100644 --- a/python/sedonadb/python/sedonadb/expr/literal.py +++ b/python/sedonadb/python/sedonadb/expr/literal.py @@ -20,6 +20,9 @@ from sedonadb.utility import sedona # noqa: F401 if TYPE_CHECKING: + from sedonadb_expr import GeoMethods + + from sedonadb.expr import Expr from sedonadb.functions import Functions @@ -71,6 +74,15 @@ def funcs(self) -> "Functions": return Functions(self._ctx, self) + @property + def geo(self) -> "GeoMethods[Expr]": + from sedonadb_expr import GeoMethods + + return GeoMethods(self) + + def _call(self, name, *args) -> "Expr": + return self.funcs[name](*args) + def alias(self, name: str): """Give this literal a column name. diff --git a/python/sedonadb/tests/expr/test_function_expression.py b/python/sedonadb/tests/expr/test_function_expression.py index 0d470f02ad..d15615001f 100644 --- a/python/sedonadb/tests/expr/test_function_expression.py +++ b/python/sedonadb/tests/expr/test_function_expression.py @@ -18,6 +18,9 @@ from sedonadb.expr import Expr from sedonadb.expr.expression import ScalarUdf, AggregateUdf +import shapely +import pytest + def test_scalar_st_function_returns_expr(con): st_geomfromwkt = con.funcs.st_geomfromwkt @@ -52,7 +55,7 @@ def test_scalar_st_function_with_column(con): # Also check piped function from column e = con.col("geom").funcs.st_geomfromwkt() - assert repr(e) == 'Expr(st_geomfromwkt(Utf8("POINT (0 1)")))' + assert repr(e) == "Expr(st_geomfromwkt(geom))" def test_scalar_st_function_with_multiple_args(con): @@ -119,10 +122,22 @@ def test_function_expression_composed(con): def test_geo_functions_accessor(con): - # TODO: - pass + pytest.importorskip("sedonadb_expr") + + # Check function as resolved from the geo accessor + e = con.funcs.geo.as_text(con.col("foofy")) + assert isinstance(e, Expr) + assert repr(e) == "Expr(st_astext(foofy))" def test_geo_methods_accessor(con): - # TODO: need to add this test - pass + pytest.importorskip("sedonadb_expr") + + # Check piped function from literal via .geo accessor + e = con.lit(shapely.Point(0, 1)).geo.as_text() + e = con.lit("POINT (0 1)").funcs.st_geomfromwkt() + assert repr(e) == 'Expr(st_geomfromwkt(Utf8("POINT (0 1)")))' + + # Check piped function from Expr via .geo accessor + e = con.col("foofy").geo.as_text() + assert repr(e) == "Expr(st_astext(foofy))" From 17268c86821bd216ce6c6549f19de09372ec917f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 9 Jun 2026 10:49:22 -0500 Subject: [PATCH 23/28] fix circular import on build --- python/sedonadb-expr/hatch_build.py | 13 ++++++++----- python/sedonadb-expr/pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 8eec2dc0f3..79bcc1c0b0 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -59,13 +59,16 @@ def _generate_version(self, version: str) -> None: def _generate_sources(self) -> None: """Generate Python source files from docs/reference/sql.""" - # Import here to avoid circular imports and allow standalone usage - import sys + # Import the _codegen module directly to avoid triggering __init__.py, + # which imports from _generated (which doesn't exist yet). + import importlib.util here = Path(__file__).parent - # Add the package to sys.path so we can import _codegen - sys.path.insert(0, str(here / "python")) - from sedonadb_expr._codegen import generate_sources + codegen_path = here / "python" / "sedonadb_expr" / "_codegen.py" + spec = importlib.util.spec_from_file_location("_codegen", codegen_path) + codegen_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(codegen_module) + generate_sources = codegen_module.generate_sources docs_sql = here.parent.parent / "docs" / "reference" / "sql" output_dir = here / "python" / "sedonadb_expr" / "_generated" diff --git a/python/sedonadb-expr/pyproject.toml b/python/sedonadb-expr/pyproject.toml index e7d207d7b3..e36243aa6f 100644 --- a/python/sedonadb-expr/pyproject.toml +++ b/python/sedonadb-expr/pyproject.toml @@ -16,7 +16,7 @@ # under the License. [build-system] -requires = ["hatchling>=1.0.0", "pyyaml"] +requires = ["hatchling", "pyyaml"] build-backend = "hatchling.build" [project] From 20fcb244524a242a39d0a76a5a6990c13f2b5e23 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 9 Jun 2026 13:13:41 -0500 Subject: [PATCH 24/28] no doctests yet --- .github/workflows/python.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 5d4cb16cda..d42989042b 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -154,11 +154,6 @@ jobs: cd python/sedonadb-expr python -m pytest -vv - - name: Run doctests (sedonadb-expr) - run: | - cd python/sedonadb-expr - python -m pytest --doctest-modules python/ - - name: Shutdown docker compose services if: always() run: | From d7ca4e3dd5c9376518e64a31bc4cd9d1df8bace2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 9 Jun 2026 13:16:27 -0500 Subject: [PATCH 25/28] better readme --- python/sedonadb-expr/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sedonadb-expr/README.md b/python/sedonadb-expr/README.md index f115748051..83296cf9ab 100644 --- a/python/sedonadb-expr/README.md +++ b/python/sedonadb-expr/README.md @@ -19,7 +19,9 @@ # SedonaDB Expr -A standalone Python package for SedonaDB expressions. +A standalone Python package for SedonaDB expressions. This is an optional +dependency of the `sedonadb` package that powers the type-specific accessors +without bloating the core package for non-interactive usage. ## Installation From eb0f7334742ac0b9191a471ce18b5ada728ee943 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 9 Jun 2026 14:30:13 -0500 Subject: [PATCH 26/28] fix test --- python/sedonadb-expr/hatch_build.py | 30 ++++++++++--------- .../python/sedonadb_expr/_codegen.py | 4 ++- .../tests/expr/test_function_expression.py | 6 ++-- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/python/sedonadb-expr/hatch_build.py b/python/sedonadb-expr/hatch_build.py index 79bcc1c0b0..277a82289f 100644 --- a/python/sedonadb-expr/hatch_build.py +++ b/python/sedonadb-expr/hatch_build.py @@ -43,31 +43,33 @@ def initialize(self, version: str, build_data: dict[str, Any]) -> None: version: The version being built build_data: Mutable dict to modify build behavior """ - self._generate_version(version) - self._generate_sources() + # Import the _codegen module directly to avoid triggering __init__.py, + # which imports from _generated (which doesn't exist yet). + import importlib.util + + here = Path(__file__).parent + codegen_path = here / "python" / "sedonadb_expr" / "_codegen.py" + spec = importlib.util.spec_from_file_location("_codegen", codegen_path) + codegen_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(codegen_module) + + self._generate_version(version, codegen_module.LICENSE_HEADER) + self._generate_sources(codegen_module, here) - def _generate_version(self, version: str) -> None: + def _generate_version(self, version: str, license_header: str) -> None: """Generate _version.py with the static version string.""" here = Path(__file__).parent version_file = here / "python" / "sedonadb_expr" / "_version.py" - content = f'''# Auto-generated at build time - do not edit + content = f'''{license_header} +# Auto-generated at build time - do not edit __version__ = "{version}" ''' version_file.write_text(content) self.app.display_info(f"Generated _version.py with version {version}") - def _generate_sources(self) -> None: + def _generate_sources(self, codegen_module: Any, here: Path) -> None: """Generate Python source files from docs/reference/sql.""" - # Import the _codegen module directly to avoid triggering __init__.py, - # which imports from _generated (which doesn't exist yet). - import importlib.util - - here = Path(__file__).parent - codegen_path = here / "python" / "sedonadb_expr" / "_codegen.py" - spec = importlib.util.spec_from_file_location("_codegen", codegen_path) - codegen_module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(codegen_module) generate_sources = codegen_module.generate_sources docs_sql = here.parent.parent / "docs" / "reference" / "sql" diff --git a/python/sedonadb-expr/python/sedonadb_expr/_codegen.py b/python/sedonadb-expr/python/sedonadb_expr/_codegen.py index 914a2e03f9..d1edc1c4db 100644 --- a/python/sedonadb-expr/python/sedonadb_expr/_codegen.py +++ b/python/sedonadb-expr/python/sedonadb_expr/_codegen.py @@ -677,7 +677,9 @@ def generate_sources(docs_sql: Path, output_dir: Path) -> GenerationResult: # Create __init__.py for the generated module init_file = output_dir / "__init__.py" init_file.write_text( - "# Auto-generated module - do not edit\n# Generated from docs/reference/sql\n" + f"{LICENSE_HEADER}\n" + "# Auto-generated module - do not edit\n" + "# Generated from docs/reference/sql\n" ) generated_files: list[Path] = [init_file] diff --git a/python/sedonadb/tests/expr/test_function_expression.py b/python/sedonadb/tests/expr/test_function_expression.py index d15615001f..0675a684ae 100644 --- a/python/sedonadb/tests/expr/test_function_expression.py +++ b/python/sedonadb/tests/expr/test_function_expression.py @@ -135,8 +135,10 @@ def test_geo_methods_accessor(con): # Check piped function from literal via .geo accessor e = con.lit(shapely.Point(0, 1)).geo.as_text() - e = con.lit("POINT (0 1)").funcs.st_geomfromwkt() - assert repr(e) == 'Expr(st_geomfromwkt(Utf8("POINT (0 1)")))' + assert ( + repr(e) + == """Expr(st_astext(Binary("1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,240,63") FieldMetadata { inner: {"ARROW:extension:metadata": "{}", "ARROW:extension:name": "geoarrow.wkb"} }))""" + ) # Check piped function from Expr via .geo accessor e = con.col("foofy").geo.as_text() From a3e6e33727470142480b687562b5ca86a2cc3b36 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 9 Jun 2026 14:36:53 -0500 Subject: [PATCH 27/28] add utils test --- python/sedonadb-expr/tests/test_utils.py | 109 +++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 python/sedonadb-expr/tests/test_utils.py diff --git a/python/sedonadb-expr/tests/test_utils.py b/python/sedonadb-expr/tests/test_utils.py new file mode 100644 index 0000000000..d24089c1ef --- /dev/null +++ b/python/sedonadb-expr/tests/test_utils.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest +from sedonadb_expr import GeoFunctions, GeoMethods +from sedonadb_expr.utils import MISSING, filter_missing_args + + +class MockExpr: + """Mock expression that records _call invocations.""" + + def __init__(self): + self.calls = [] + + def _call(self, name, *args): + self.calls.append((name, args)) + return self + + +def test_filter_missing_args(): + """Tests for filter_missing_args utility.""" + # Passthrough when no missing + assert filter_missing_args(1, 2, 3) == (1, 2, 3) + + # All missing returns empty + assert filter_missing_args(MISSING, MISSING) == () + + # Trailing missing are filtered + assert filter_missing_args(1, 2, MISSING, MISSING) == (1, 2) + assert filter_missing_args(1, MISSING) == (1,) + + # Empty args + assert filter_missing_args() == () + + # Missing before non-missing raises + with pytest.raises(ValueError, match="Missing arguments must be at the end"): + filter_missing_args(MISSING, 1) + + # Missing in middle raises + with pytest.raises(ValueError, match="Missing arguments must be at the end"): + filter_missing_args(1, MISSING, 2) + + +def test_geo_methods_missing_args(): + """Tests for MISSING argument handling in generated GeoMethods.""" + mock = MockExpr() + geo = GeoMethods(mock) + + # force4d with no args passes no extra arguments + geo.force4d() + assert mock.calls[-1] == ("st_force4d", ()) + + # force4d with z only passes just z + geo.force4d(z=1.0) + assert mock.calls[-1] == ("st_force4d", (1.0,)) + + # force4d with both z and m passes both + geo.force4d(z=1.0, m=2.0) + assert mock.calls[-1] == ("st_force4d", (1.0, 2.0)) + + # translate with partial missing filters correctly + geo.translate(deltaX=1.0, deltaY=2.0) + assert mock.calls[-1] == ("st_translate", (1.0, 2.0)) + + # force4d with MISSING z but non-MISSING m should raise + with pytest.raises(ValueError, match="Missing arguments must be at the end"): + geo.force4d(z=MISSING, m=2.0) + + # translate with MISSING in middle should raise + with pytest.raises(ValueError, match="Missing arguments must be at the end"): + geo.translate(deltaX=1.0, deltaY=MISSING, deltaZ=3.0) + + +def test_geo_functions(): + """Tests for GeoFunctions property access.""" + calls = [] + + def mock_factory_getitem(name): + def fn(*args): + calls.append((name, args)) + + return fn + + factory = type( + "MockFactory", (), {"__getitem__": lambda self, k: mock_factory_getitem(k)} + )() + geo_fns = GeoFunctions(factory) + + # Properties return callables + assert callable(geo_fns.affine) + assert callable(geo_fns.buffer) + + # Calling returned function invokes factory + geo_fns.envelope("geom_arg") + assert calls[-1] == ("st_envelope", ("geom_arg",)) From c2bfdfd5a505f163a8d7f5a230dc41f03ebcc20b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 9 Jun 2026 14:42:02 -0500 Subject: [PATCH 28/28] less weird mock factory --- python/sedonadb-expr/tests/test_utils.py | 27 +++++++++++++----------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/python/sedonadb-expr/tests/test_utils.py b/python/sedonadb-expr/tests/test_utils.py index d24089c1ef..103a5ecc8e 100644 --- a/python/sedonadb-expr/tests/test_utils.py +++ b/python/sedonadb-expr/tests/test_utils.py @@ -31,6 +31,19 @@ def _call(self, name, *args): return self +class MockFunctions: + """Mock function mapping that records _call invocations.""" + + def __init__(self): + self.calls = [] + + def __getitem__(self, name, *args): + def fn(*args): + self.calls.append((name, args)) + + return fn + + def test_filter_missing_args(): """Tests for filter_missing_args utility.""" # Passthrough when no missing @@ -87,17 +100,7 @@ def test_geo_methods_missing_args(): def test_geo_functions(): """Tests for GeoFunctions property access.""" - calls = [] - - def mock_factory_getitem(name): - def fn(*args): - calls.append((name, args)) - - return fn - - factory = type( - "MockFactory", (), {"__getitem__": lambda self, k: mock_factory_getitem(k)} - )() + factory = MockFunctions() geo_fns = GeoFunctions(factory) # Properties return callables @@ -106,4 +109,4 @@ def fn(*args): # Calling returned function invokes factory geo_fns.envelope("geom_arg") - assert calls[-1] == ("st_envelope", ("geom_arg",)) + assert factory.calls[-1] == ("st_envelope", ("geom_arg",))