From 49fb6c6fd7d878db664aec738118968284364024 Mon Sep 17 00:00:00 2001 From: Vinicius da Costa Date: Thu, 18 Sep 2025 14:32:23 +0000 Subject: [PATCH 1/7] Script to delete/migrate fuzz task events. --- .../scripts/migrate_fuzzer_task_events.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 src/local/butler/scripts/migrate_fuzzer_task_events.py diff --git a/src/local/butler/scripts/migrate_fuzzer_task_events.py b/src/local/butler/scripts/migrate_fuzzer_task_events.py new file mode 100644 index 0000000000..3405f56f67 --- /dev/null +++ b/src/local/butler/scripts/migrate_fuzzer_task_events.py @@ -0,0 +1,73 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Migrate fuzzer-based task events to the correct entity.""" + +import datetime + +from clusterfuzz._internal.datastore import data_types +from clusterfuzz._internal.datastore import ndb_utils +from clusterfuzz._internal.metrics import events +from clusterfuzz._internal.metrics import logs +from clusterfuzz._internal.system import environment + +_MIGRATE_FIELDS = [ + 'timestamp', 'source', 'clusterfuzz_version', 'clusterfuzz_config_version', + 'instance_id', 'operating_system', 'os_version', 'task_id', 'task_name', + 'task_stage', 'task_status', 'task_outcome', 'task_job', 'task_fuzzer' +] + +_BATCH_SIZE = 500 +_DELETE_ALL = False + + +def execute(args): + """Deletes/Migrates fuzzer-based task events.""" + del args + environment.set_bot_environment() + logs.configure('run_bot') + + task_event_type = events.EventTypes.TASK_EXECUTION + task_names = ['fuzz', 'corpus_pruning'] + query = data_types.TestcaseLifecycleEvent.query( + data_types.TestcaseLifecycleEvent.event_type == task_event_type, + data_types.TestcaseLifecycleEvent.task_name.IN(task_names)) + + # If we choose to delete all entities from the old model. + if _DELETE_ALL: + ndb_utils.delete_multi(ndb_utils.get_all_from_query(query, keys_only=True)) + return + + # If we choose to delete most, but migrate the latest ones. + last_month = datetime.datetime.now() - datetime.timedelta(days=30) + query_delete = query.filter( + data_types.TestcaseLifecycleEvent.timestamp < last_month) + ndb_utils.delete_multi( + ndb_utils.get_all_from_query(query_delete, keys_only=True)) + + query_update = query.filter( + data_types.TestcaseLifecycleEvent.timestamp >= last_month) + to_update = [] + to_delete = [] + for event in ndb_utils.get_all_from_query(query_update): + migrate_event = data_types.FuzzerTaskEvent( + event_type=events.EventTypes.FUZZER_TASK_EXECUTION) + for attr in _MIGRATE_FIELDS: + setattr(migrate_event, attr, getattr(event, attr)) + to_update.append(migrate_event) + to_delete.append(event.key) + if len(to_update) == _BATCH_SIZE: + ndb_utils.put_multi(to_update) + ndb_utils.delete_multi(to_delete) + to_update = [] + to_delete = [] From de9bd901f26938be8a5b2f8c69548825b59c6762 Mon Sep 17 00:00:00 2001 From: Vinicius da Costa Date: Thu, 18 Sep 2025 16:59:37 +0000 Subject: [PATCH 2/7] Add log. --- src/local/butler/scripts/migrate_fuzzer_task_events.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/local/butler/scripts/migrate_fuzzer_task_events.py b/src/local/butler/scripts/migrate_fuzzer_task_events.py index 3405f56f67..b04083f47d 100644 --- a/src/local/butler/scripts/migrate_fuzzer_task_events.py +++ b/src/local/butler/scripts/migrate_fuzzer_task_events.py @@ -59,6 +59,7 @@ def execute(args): data_types.TestcaseLifecycleEvent.timestamp >= last_month) to_update = [] to_delete = [] + total_count = 0 for event in ndb_utils.get_all_from_query(query_update): migrate_event = data_types.FuzzerTaskEvent( event_type=events.EventTypes.FUZZER_TASK_EXECUTION) @@ -66,7 +67,9 @@ def execute(args): setattr(migrate_event, attr, getattr(event, attr)) to_update.append(migrate_event) to_delete.append(event.key) + total_count += 1 if len(to_update) == _BATCH_SIZE: + logs.info(f'Migrated {total_count} fuzzer-based task events.') ndb_utils.put_multi(to_update) ndb_utils.delete_multi(to_delete) to_update = [] From 57625cbaffe36791bd2270e3a830e7874bca9c4f Mon Sep 17 00:00:00 2001 From: Vinicius da Costa Date: Thu, 18 Sep 2025 19:08:39 +0000 Subject: [PATCH 3/7] Remove timestamp filter to avoid index for now. --- .../butler/scripts/migrate_fuzzer_task_events.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/local/butler/scripts/migrate_fuzzer_task_events.py b/src/local/butler/scripts/migrate_fuzzer_task_events.py index b04083f47d..40383af598 100644 --- a/src/local/butler/scripts/migrate_fuzzer_task_events.py +++ b/src/local/butler/scripts/migrate_fuzzer_task_events.py @@ -48,22 +48,14 @@ def execute(args): ndb_utils.delete_multi(ndb_utils.get_all_from_query(query, keys_only=True)) return - # If we choose to delete most, but migrate the latest ones. - last_month = datetime.datetime.now() - datetime.timedelta(days=30) - query_delete = query.filter( - data_types.TestcaseLifecycleEvent.timestamp < last_month) - ndb_utils.delete_multi( - ndb_utils.get_all_from_query(query_delete, keys_only=True)) - - query_update = query.filter( - data_types.TestcaseLifecycleEvent.timestamp >= last_month) + # If we choose to migrate them into the new entity model. to_update = [] to_delete = [] total_count = 0 - for event in ndb_utils.get_all_from_query(query_update): + for event in ndb_utils.get_all_from_query(query): migrate_event = data_types.FuzzerTaskEvent( event_type=events.EventTypes.FUZZER_TASK_EXECUTION) - for attr in _MIGRATE_FIELDS: + for attr in _MIGRATE_FIELDS: setattr(migrate_event, attr, getattr(event, attr)) to_update.append(migrate_event) to_delete.append(event.key) From cdfa9d84dce52f892c527a1099f31c4a42ead26b Mon Sep 17 00:00:00 2001 From: Vinicius da Costa Date: Mon, 22 Sep 2025 18:36:27 +0000 Subject: [PATCH 4/7] Remove timestamp filter and add print. --- src/clusterfuzz/_internal/datastore/ndb_utils.py | 3 +++ src/local/butler/scripts/migrate_fuzzer_task_events.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/clusterfuzz/_internal/datastore/ndb_utils.py b/src/clusterfuzz/_internal/datastore/ndb_utils.py index 15a9430107..e84817e829 100644 --- a/src/clusterfuzz/_internal/datastore/ndb_utils.py +++ b/src/clusterfuzz/_internal/datastore/ndb_utils.py @@ -73,5 +73,8 @@ def put_multi(entities): def delete_multi(keys): """Delete multiple entities, working around a limitation in the NDB library with the maximum number of keys allowed.""" + total_ct = 0 for chunk in _gen_chunks(keys, _MODIFY_BATCH_SIZE): ndb.delete_multi(chunk) + total_ct += len(chunk) + print(f'Deleted #{total_ct} entities.') diff --git a/src/local/butler/scripts/migrate_fuzzer_task_events.py b/src/local/butler/scripts/migrate_fuzzer_task_events.py index 40383af598..724dd5f737 100644 --- a/src/local/butler/scripts/migrate_fuzzer_task_events.py +++ b/src/local/butler/scripts/migrate_fuzzer_task_events.py @@ -28,7 +28,7 @@ ] _BATCH_SIZE = 500 -_DELETE_ALL = False +_DELETE_ALL = True def execute(args): @@ -46,6 +46,7 @@ def execute(args): # If we choose to delete all entities from the old model. if _DELETE_ALL: ndb_utils.delete_multi(ndb_utils.get_all_from_query(query, keys_only=True)) + logs.info('Done deleting fuzzer-based task execution events!') return # If we choose to migrate them into the new entity model. From ef6d95433326b9c03df83aae7b68f4416f4425e0 Mon Sep 17 00:00:00 2001 From: Vinicius da Costa Date: Mon, 22 Sep 2025 21:40:44 +0000 Subject: [PATCH 5/7] Use fetch page for batch delete. --- .../scripts/migrate_fuzzer_task_events.py | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/local/butler/scripts/migrate_fuzzer_task_events.py b/src/local/butler/scripts/migrate_fuzzer_task_events.py index 724dd5f737..a18e58a556 100644 --- a/src/local/butler/scripts/migrate_fuzzer_task_events.py +++ b/src/local/butler/scripts/migrate_fuzzer_task_events.py @@ -13,7 +13,7 @@ # limitations under the License. """Migrate fuzzer-based task events to the correct entity.""" -import datetime +from google.cloud import ndb from clusterfuzz._internal.datastore import data_types from clusterfuzz._internal.datastore import ndb_utils @@ -38,25 +38,45 @@ def execute(args): logs.configure('run_bot') task_event_type = events.EventTypes.TASK_EXECUTION - task_names = ['fuzz', 'corpus_pruning'] - query = data_types.TestcaseLifecycleEvent.query( - data_types.TestcaseLifecycleEvent.event_type == task_event_type, - data_types.TestcaseLifecycleEvent.task_name.IN(task_names)) + task_names = ['corpus_pruning', 'fuzz'] # If we choose to delete all entities from the old model. if _DELETE_ALL: - ndb_utils.delete_multi(ndb_utils.get_all_from_query(query, keys_only=True)) + total_deleted = 0 + for task_name in task_names: + print(f'Started deleting for {task_name}.') + query = data_types.TestcaseLifecycleEvent.query( + data_types.TestcaseLifecycleEvent.event_type == task_event_type, + data_types.TestcaseLifecycleEvent.task_name == task_name) + + keys_batch, next_cursor, more = query.fetch_page( + _BATCH_SIZE, keys_only=True) + while keys_batch: + ndb.delete_multi(keys_batch) + total_deleted += len(keys_batch) + print(f"Deleted {len(keys_batch)} entities (Total: {total_deleted})") + if not more: + break + if total_deleted >= 10000: + break + keys_batch, next_cursor, more = query.fetch_page( + _BATCH_SIZE, keys_only=True, start_cursor=next_cursor) + logs.info('Done deleting fuzzer-based task execution events!') return - # If we choose to migrate them into the new entity model. + +# If we choose to migrate them into the new entity model. + query = data_types.TestcaseLifecycleEvent.query( + data_types.TestcaseLifecycleEvent.event_type == task_event_type, + data_types.TestcaseLifecycleEvent.task_name.IN(task_names)) to_update = [] to_delete = [] total_count = 0 for event in ndb_utils.get_all_from_query(query): migrate_event = data_types.FuzzerTaskEvent( event_type=events.EventTypes.FUZZER_TASK_EXECUTION) - for attr in _MIGRATE_FIELDS: + for attr in _MIGRATE_FIELDS: setattr(migrate_event, attr, getattr(event, attr)) to_update.append(migrate_event) to_delete.append(event.key) From 5d96f7792339b4aee2e1bdb814f414c53c2754a5 Mon Sep 17 00:00:00 2001 From: Vinicius da Costa Date: Mon, 22 Sep 2025 21:50:26 +0000 Subject: [PATCH 6/7] Remove print in ndb utils. --- src/clusterfuzz/_internal/datastore/ndb_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/clusterfuzz/_internal/datastore/ndb_utils.py b/src/clusterfuzz/_internal/datastore/ndb_utils.py index e84817e829..15a9430107 100644 --- a/src/clusterfuzz/_internal/datastore/ndb_utils.py +++ b/src/clusterfuzz/_internal/datastore/ndb_utils.py @@ -73,8 +73,5 @@ def put_multi(entities): def delete_multi(keys): """Delete multiple entities, working around a limitation in the NDB library with the maximum number of keys allowed.""" - total_ct = 0 for chunk in _gen_chunks(keys, _MODIFY_BATCH_SIZE): ndb.delete_multi(chunk) - total_ct += len(chunk) - print(f'Deleted #{total_ct} entities.') From ca86dc4d97941e9c5729db684d839f1771aabf54 Mon Sep 17 00:00:00 2001 From: Vinicius da Costa Date: Mon, 22 Sep 2025 21:52:40 +0000 Subject: [PATCH 7/7] Remove limit. --- src/local/butler/scripts/migrate_fuzzer_task_events.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/local/butler/scripts/migrate_fuzzer_task_events.py b/src/local/butler/scripts/migrate_fuzzer_task_events.py index a18e58a556..da648e8700 100644 --- a/src/local/butler/scripts/migrate_fuzzer_task_events.py +++ b/src/local/butler/scripts/migrate_fuzzer_task_events.py @@ -57,8 +57,6 @@ def execute(args): print(f"Deleted {len(keys_batch)} entities (Total: {total_deleted})") if not more: break - if total_deleted >= 10000: - break keys_batch, next_cursor, more = query.fetch_page( _BATCH_SIZE, keys_only=True, start_cursor=next_cursor)