Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions src/DataTypes/NestedUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,21 +212,37 @@ using NameToDataType = std::map<String, DataTypePtr>;

NameToDataType getSubcolumnsOfNested(const NamesAndTypesList & names_and_types)
{
std::unordered_map<String, NamesAndTypesList> nested;
/// Pass 1: count how many Array(T) columns share each dotted prefix.
/// A lone column like `a.b Array(T)` must not be collapsed into a synthetic
/// Nested parent — only genuine flat-Nested groups (n.x, n.y, ...) qualify.
std::unordered_map<String, size_t> prefix_count;
for (const auto & name_type : names_and_types)
{
/// Skip subcolumns (e.g. `c0.c2.null` derived from `c0.c2 Array(Nullable(Tuple()))`).
/// They are not real flat-nested columns like `n.a Array(T)`, `n.b Array(T)`.
if (name_type.isSubcolumn())
continue;

const auto * type_arr = typeid_cast<const DataTypeArray *>(name_type.type.get());

/// Ignore true Nested type, but try to unite flatten arrays to Nested type.
if (!isNested(name_type.type) && type_arr)
{
auto split = splitName(name_type.name);
if (!split.second.empty())
++prefix_count[split.first];
}
}

/// Pass 2: build Nested only for prefixes shared by at least two columns.
std::unordered_map<String, NamesAndTypesList> nested;
for (const auto & name_type : names_and_types)
{
if (name_type.isSubcolumn())
continue;

const auto * type_arr = typeid_cast<const DataTypeArray *>(name_type.type.get());
if (!isNested(name_type.type) && type_arr)
{
auto split = splitName(name_type.name);
if (!split.second.empty() && prefix_count[split.first] >= 2)
nested[split.first].emplace_back(split.second, type_arr->getNestedType());
}
}
Expand Down
5 changes: 1 addition & 4 deletions src/Storages/ColumnsDescription.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -976,10 +976,7 @@ std::vector<String> ColumnsDescription::getAllRegisteredNames() const
std::vector<String> names;
names.reserve(columns.size());
for (const auto & column : columns)
{
if (!column.name.contains('.'))
names.push_back(column.name);
}
names.emplace_back(column.name);
return names;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,101 @@ def test_deeply_nested_struct_with_dotted_names(started_cluster_iceberg_with_spa
).strip()
expected = "deep_value1\ndeep_value2\ndeep_value3"
assert result == expected, f"Expected:\n{expected}\nGot:\n{result}"


@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
def test_dotted_array_column(started_cluster_iceberg_with_spark, storage_type):
    """
    Regression test for issue #90731.

    Writes an Iceberg table with a single top-level ARRAY column whose name
    literally contains a dot (`a.b`) and checks that selecting it returns the
    stored values rather than an empty array. The check is done twice: once
    through the table function and once through the table engine.
    """
    cluster = started_cluster_iceberg_with_spark
    node = cluster.instances["node1"]
    session = cluster.spark_session
    table_name = f"test_dotted_array_column_{storage_type}_{get_uuid_str()}"

    from pyspark.sql.types import ArrayType

    # One row, one column: `a.b` = ['a', 'b', 'c'].
    rows = [(["a", "b", "c"],)]
    dotted_schema = StructType([StructField("a.b", ArrayType(StringType()))])
    frame = session.createDataFrame(data=rows, schema=dotted_schema)

    write_iceberg_from_df(
        session, frame, table_name, mode="overwrite", format_version="2"
    )

    # The same path is passed for both directory arguments.
    data_dir = f"/iceberg_data/default/{table_name}/"
    default_upload_directory(cluster, storage_type, data_dir, data_dir)

    # Read through the table function.
    table_function_expr = get_creation_expression(
        storage_type, table_name, cluster, table_function=True
    )
    result = node.query(f"SELECT `a.b` FROM {table_function_expr}").strip()
    assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}"

    # Read through the table engine.
    create_iceberg_table(storage_type, node, table_name, cluster)
    result = node.query(f"SELECT `a.b` FROM {table_name}").strip()
    assert result == "['a','b','c']", f"Expected ['a','b','c'], got: {result}"


@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
def test_dotted_array_alongside_real_nested(started_cluster_iceberg_with_spark, storage_type):
    """
    Regression guard: a lone dotted Array column (`a.b`) must not interfere
    with a flat-Nested group (`c.x`, `c.y`) that uses a different prefix.

    All three columns are written into one Iceberg table and must come back
    unchanged, both through the table function and through the table engine.
    """
    cluster = started_cluster_iceberg_with_spark
    node = cluster.instances["node1"]
    session = cluster.spark_session
    table_name = (
        f"test_dotted_array_alongside_real_nested_{storage_type}_{get_uuid_str()}"
    )

    from pyspark.sql.types import ArrayType, IntegerType as SparkIntegerType

    # One row: a.b = ['a','b','c'], c.x = [1,2], c.y = ['p','q'].
    rows = [(["a", "b", "c"], [1, 2], ["p", "q"])]
    fields = [
        StructField("a.b", ArrayType(StringType())),
        StructField("c.x", ArrayType(SparkIntegerType())),
        StructField("c.y", ArrayType(StringType())),
    ]
    frame = session.createDataFrame(data=rows, schema=StructType(fields))

    write_iceberg_from_df(
        session, frame, table_name, mode="overwrite", format_version="2"
    )

    # The same path is passed for both directory arguments.
    data_dir = f"/iceberg_data/default/{table_name}/"
    default_upload_directory(cluster, storage_type, data_dir, data_dir)

    # Tab-separated row expected from both access paths.
    expected_row = "['a','b','c']\t[1,2]\t['p','q']"

    # Read through the table function.
    table_function_expr = get_creation_expression(
        storage_type, table_name, cluster, table_function=True
    )
    result = node.query(
        f"SELECT `a.b`, `c.x`, `c.y` FROM {table_function_expr}"
    ).strip()
    assert result == expected_row, f"Unexpected result via table function: {result}"

    # Read through the table engine.
    create_iceberg_table(storage_type, node, table_name, cluster)
    result = node.query(f"SELECT `a.b`, `c.x`, `c.y` FROM {table_name}").strip()
    assert result == expected_row, f"Unexpected result via table engine: {result}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
['a','b','c']
['a','b','c']
[1,2] ['p','q']
18 changes: 18 additions & 0 deletions tests/queries/0_stateless/04259_dotted_array_not_nested.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
-- Regression test for #90731.
-- A lone Array(T) column with a dot in its name must not be collapsed into
-- a synthetic Nested structure and must be readable as a plain array.

-- Start from a clean state so a leftover table from an aborted run (or a
-- name clash on the generic names t1/t2) cannot make CREATE TABLE fail.
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;

CREATE TABLE t1 (`a.b` Array(String)) ENGINE = Memory;
INSERT INTO t1 VALUES (['a','b','c']);
SELECT `a.b` FROM t1;

-- In a mixed table, the lone dotted column must not interfere with the
-- genuine flat-Nested group (c.x / c.y share prefix 'c').
CREATE TABLE t2 (`a.b` Array(String), `c.x` Array(Int32), `c.y` Array(String))
ENGINE = Memory;
INSERT INTO t2 VALUES (['a','b','c'], [1,2], ['p','q']);
SELECT `a.b` FROM t2;
SELECT `c.x`, `c.y` FROM t2;

DROP TABLE t1;
DROP TABLE t2;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ok
19 changes: 19 additions & 0 deletions tests/queries/0_stateless/04260_dotted_column_in_hints.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Regression test for #90731.
# ColumnsDescription::getAllRegisteredNames must include columns whose names
# contain a dot, so they appear in IHints suggestions after a typo.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Start from a clean state so a leftover table from an aborted run cannot
# make CREATE TABLE fail.
$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_dotted_hint;"

$CLICKHOUSE_CLIENT -q "
    CREATE TABLE t_dotted_hint (\`a.b\` Array(String))
    ENGINE = MergeTree ORDER BY tuple();
"

# Misspell the column name; the error message must suggest the real name 'a.b'.
# The client output is captured first, so the client is never writing into a
# pipe that grep -q has already closed.
alter_output=$($CLICKHOUSE_CLIENT -q "ALTER TABLE t_dotted_hint MODIFY COLUMN a_b Array(String);" 2>&1)

# -F matches 'a.b' literally; as a regex, '.' would also match the misspelled
# 'a_b' from the query. An explicit if/else replaces the 'A && B || C' idiom,
# in which C also runs when B fails.
if grep -qF "a.b" <<< "$alter_output"; then
    echo "ok"
else
    echo "FAIL"
fi

$CLICKHOUSE_CLIENT -q "DROP TABLE t_dotted_hint;"
Loading