Skip to content

Commit f92994e

Browse files
Fokko authored and sungwy committed
Fix tracing existing entries when there are deletes (#1046)
1 parent f73da80 commit f92994e

File tree

2 files changed

+85
-10
lines changed

2 files changed

+85
-10
lines changed

pyiceberg/table/__init__.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
Reference,
6363
)
6464
from pyiceberg.expressions.visitors import (
65-
ROWS_CANNOT_MATCH,
65+
ROWS_MIGHT_NOT_MATCH,
6666
ROWS_MUST_MATCH,
6767
_InclusiveMetricsEvaluator,
6868
_StrictMetricsEvaluator,
@@ -3360,13 +3360,14 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
33603360
existing_entries = []
33613361
for entry in manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True):
33623362
if strict_metrics_evaluator(entry.data_file) == ROWS_MUST_MATCH:
3363+
# Based on the metadata, it can be dropped right away
33633364
deleted_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.DELETED))
33643365
self._deleted_data_files.add(entry.data_file)
3365-
elif inclusive_metrics_evaluator(entry.data_file) == ROWS_CANNOT_MATCH:
3366-
existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING))
33673366
else:
3368-
# Based on the metadata, it is unsure to say if the file can be deleted
3369-
partial_rewrites_needed = True
3367+
# Based on the metadata, we cannot determine if it can be deleted
3368+
existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING))
3369+
if inclusive_metrics_evaluator(entry.data_file) != ROWS_MIGHT_NOT_MATCH:
3370+
partial_rewrites_needed = True
33703371

33713372
if len(deleted_entries) > 0:
33723373
total_deleted_entries += deleted_entries
@@ -3383,8 +3384,6 @@ def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) ->
33833384
for existing_entry in existing_entries:
33843385
writer.add_entry(existing_entry)
33853386
existing_manifests.append(writer.to_manifest_file())
3386-
# else:
3387-
# deleted_manifests.append()
33883387
else:
33893388
existing_manifests.append(manifest_file)
33903389
else:

tests/integration/test_writes/test_writes.py

+79-3
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from typing import Any, Dict
2424
from urllib.parse import urlparse
2525

26+
import numpy as np
2627
import pandas as pd
2728
import pyarrow as pa
2829
import pyarrow.parquet as pq
@@ -38,13 +39,20 @@
3839
from pyiceberg.catalog.rest import RestCatalog
3940
from pyiceberg.catalog.sql import SqlCatalog
4041
from pyiceberg.exceptions import NoSuchTableError
41-
from pyiceberg.expressions import In
42+
from pyiceberg.expressions import GreaterThanOrEqual, In, Not
4243
from pyiceberg.io.pyarrow import _dataframe_to_data_files
4344
from pyiceberg.partitioning import PartitionField, PartitionSpec
4445
from pyiceberg.schema import Schema
4546
from pyiceberg.table import TableProperties
46-
from pyiceberg.transforms import IdentityTransform
47-
from pyiceberg.types import IntegerType, LongType, NestedField, StringType
47+
from pyiceberg.transforms import DayTransform, IdentityTransform
48+
from pyiceberg.types import (
49+
DateType,
50+
DoubleType,
51+
IntegerType,
52+
LongType,
53+
NestedField,
54+
StringType,
55+
)
4856
from utils import _create_table
4957

5058

@@ -1331,3 +1339,71 @@ def test_overwrite_all_data_with_filter(session_catalog: Catalog) -> None:
13311339
tbl.overwrite(data, In("id", ["1", "2", "3"]))
13321340

13331341
assert len(tbl.scan().to_arrow()) == 3
1342+
1343+
1344+
@pytest.mark.integration
1345+
def test_delete_threshold() -> None:
1346+
catalog = load_catalog(
1347+
"local",
1348+
**{
1349+
"type": "rest",
1350+
"uri": "http://localhost:8181",
1351+
"s3.endpoint": "http://localhost:9000",
1352+
"s3.access-key-id": "admin",
1353+
"s3.secret-access-key": "password",
1354+
},
1355+
)
1356+
1357+
schema = Schema(
1358+
NestedField(field_id=101, name="id", field_type=LongType(), required=True),
1359+
NestedField(field_id=103, name="created_at", field_type=DateType(), required=False),
1360+
NestedField(field_id=104, name="relevancy_score", field_type=DoubleType(), required=False),
1361+
)
1362+
1363+
partition_spec = PartitionSpec(PartitionField(source_id=103, field_id=2000, transform=DayTransform(), name="created_at_day"))
1364+
1365+
try:
1366+
catalog.drop_table(
1367+
identifier="default.scores",
1368+
)
1369+
except NoSuchTableError:
1370+
pass
1371+
1372+
catalog.create_table(
1373+
identifier="default.scores",
1374+
schema=schema,
1375+
partition_spec=partition_spec,
1376+
)
1377+
1378+
# Parameters
1379+
num_rows = 100 # Number of rows in the dataframe
1380+
id_min, id_max = 1, 10000
1381+
date_start, date_end = date(2024, 1, 1), date(2024, 2, 1)
1382+
1383+
# Generate the 'id' column
1384+
id_column = np.random.randint(id_min, id_max, num_rows)
1385+
1386+
# Generate the 'created_at' column as dates only
1387+
date_range = pd.date_range(start=date_start, end=date_end, freq="D") # Daily frequency for dates
1388+
created_at_column = np.random.choice(date_range, num_rows) # Randomly sample (with replacement) num_rows dates from the range
1389+
1390+
# Generate the 'relevancy_score' column with a peak around 0.1
1391+
relevancy_score_column = np.random.beta(a=2, b=20, size=num_rows) # Adjusting parameters to peak around 0.1
1392+
1393+
# Create the dataframe
1394+
df = pd.DataFrame({"id": id_column, "created_at": created_at_column, "relevancy_score": relevancy_score_column})
1395+
1396+
iceberg_table = catalog.load_table("default.scores")
1397+
1398+
# Convert the pandas DataFrame to a PyArrow Table with the Iceberg schema
1399+
arrow_schema = iceberg_table.schema().as_arrow()
1400+
docs_table = pa.Table.from_pandas(df, schema=arrow_schema)
1401+
1402+
# Append the data to the Iceberg table
1403+
iceberg_table.append(docs_table)
1404+
1405+
delete_condition = GreaterThanOrEqual("relevancy_score", 0.1)
1406+
lower_before = len(iceberg_table.scan(row_filter=Not(delete_condition)).to_arrow())
1407+
assert len(iceberg_table.scan(row_filter=Not(delete_condition)).to_arrow()) == lower_before
1408+
iceberg_table.delete(delete_condition)
1409+
assert len(iceberg_table.scan().to_arrow()) == lower_before

0 commit comments

Comments
 (0)