-
Notifications
You must be signed in to change notification settings - Fork 473
Expand file tree
/
Copy pathtest_replace.py
More file actions
109 lines (93 loc) · 4.04 KB
/
test_replace.py
File metadata and controls
109 lines (93 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from pyiceberg.catalog import Catalog
from pyiceberg.manifest import DataFile, DataFileContent, FileFormat, ManifestEntryStatus
from pyiceberg.schema import Schema
from pyiceberg.table.snapshots import Operation
from pyiceberg.typedef import Record
def test_replace_api(catalog: Catalog) -> None:
    """Check that ``Table.replace`` swaps one data file for another.

    The test appends a single data file, replaces it with a second one, and
    then verifies the resulting snapshot: its operation, its summary counters,
    and the status recorded for each file in the snapshot's manifests.
    """
    deleted_path = "s3://bucket/test/data/deleted.parquet"
    added_path = "s3://bucket/test/data/added.parquet"

    # Setup a basic table using the catalog fixture
    catalog.create_namespace("default")
    table = catalog.create_table(
        identifier="default.test_replace",
        schema=Schema(),
    )

    # Create mock DataFiles for the test
    file_to_delete = DataFile.from_args(
        file_path=deleted_path,
        file_format=FileFormat.PARQUET,
        partition=Record(),
        record_count=100,
        file_size_in_bytes=1024,
        content=DataFileContent.DATA,
    )
    file_to_delete.spec_id = 0

    file_to_add = DataFile.from_args(
        file_path=added_path,
        file_format=FileFormat.PARQUET,
        partition=Record(),
        record_count=100,
        file_size_in_bytes=1024,
        content=DataFileContent.DATA,
    )
    file_to_add.spec_id = 0

    # Initially append to have something to replace
    with table.transaction() as tx:
        with tx.update_snapshot().fast_append() as append_snapshot:
            append_snapshot.append_data_file(file_to_delete)

    # Verify initial append snapshot
    assert len(table.history()) == 1
    snapshot = table.current_snapshot()
    assert snapshot is not None
    assert snapshot.summary is not None
    assert snapshot.summary["operation"] == Operation.APPEND

    # Call the replace API
    table.replace(files_to_delete=[file_to_delete], files_to_add=[file_to_add])

    # Verify the replacement created a REPLACE snapshot
    assert len(table.history()) == 2
    snapshot = table.current_snapshot()
    assert snapshot is not None
    assert snapshot.summary is not None
    assert snapshot.summary["operation"] == Operation.REPLACE

    # Verify the correct files are added and deleted
    # The summary property tracks these counts
    assert snapshot.summary["added-data-files"] == "1"
    assert snapshot.summary["deleted-data-files"] == "1"
    assert snapshot.summary["added-records"] == "100"
    assert snapshot.summary["deleted-records"] == "100"

    # Verify the manifests written for the new snapshot
    manifest_files = snapshot.manifests(table.io)
    assert len(manifest_files) == 2  # One for ADDED, one for DELETED

    # Keep deleted entries so the removed file is still visible here
    entries = [
        entry
        for manifest in manifest_files
        for entry in manifest.fetch_manifest_entry(table.io, discard_deleted=False)
    ]

    # One entry for ADDED (new file), one for DELETED (old file)
    assert len(entries) == 2

    # Counting entries alone would not catch a swapped or wrong file, so pin
    # the status recorded for each path.
    status_by_path = {entry.data_file.file_path: entry.status for entry in entries}
    assert status_by_path == {
        added_path: ManifestEntryStatus.ADDED,
        deleted_path: ManifestEntryStatus.DELETED,
    }
def test_replace_empty_files(catalog: Catalog) -> None:
    """Check that replacing nothing with nothing leaves the table without snapshots."""
    # Fresh namespace and table from the catalog fixture
    catalog.create_namespace("default")
    empty_table = catalog.create_table(identifier="default.test_replace_empty", schema=Schema())

    # Empty inputs must be accepted without raising and must not change the table
    no_deletes: list = []
    no_adds: list = []
    empty_table.replace(no_deletes, no_adds)

    # No files were rewritten, so no history entry and no current snapshot
    history = empty_table.history()
    assert len(history) == 0
    current = empty_table.current_snapshot()
    assert current is None