@@ -12,7 +12,7 @@
 from ..archive import Archive, CacheChunkBuffer, RobustUnpacker, valid_msgpacked_dict, ITEM_KEYS, Statistics
 from ..archive import BackupOSError, backup_io, backup_io_iter, get_item_uid_gid
 from ..helpers import msgpack
-from ..item import Item, ArchiveItem
+from ..item import Item, ArchiveItem, ChunkListEntry
 from ..manifest import Manifest
 from ..platform import uid2user, gid2group, is_win32
 
@@ -132,6 +132,11 @@ def add_chunk(self, id, meta, data, stats=None, wait=True, ro_type=None):
         self.objects[id] = data
         return id, len(data)
 
+    def fetch_many(self, ids, ro_type=None):
+        """Mock implementation of fetch_many: yield the stored data for each chunk id."""
+        for id in ids:
+            yield self.objects[id]
+
 
 def test_cache_chunk_buffer():
     data = [Item(path="p1"), Item(path="p2")]
@@ -402,3 +407,167 @@ def test_reject_non_sanitized_item():
     for path in rejected_dotdot_paths:
         with pytest.raises(ValueError, match="unexpected '..' element in path"):
             Item(path=path, user="root", group="root")
+
+
+@pytest.fixture
+def setup_extractor(tmpdir):
+    """Set up the mock repository, key, cache, and Archive shared by the tests below."""
+
+    class MockCache:
+        def __init__(self):
+            self.objects = {}
+
+    repository = Mock()
+    key = PlaintextKey(repository)
+    manifest = Manifest(key, repository)
+    cache = MockCache()
+
+    extractor = Archive(manifest=manifest, name="test", create=True)
+    extractor.pipeline = cache
+    extractor.key = key
+    extractor.cwd = str(tmpdir)
+    extractor.restore_attrs = Mock()
+
+    # Track the chunk ids fetched during a test (the fixture is function-scoped)
+    fetched_chunks = []
+
+    def create_mock_chunks(item_data, chunk_size=4):
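+        # Split item_data into fixed-size chunks, register each chunk's data
+        # under its id in the mock cache, and return a mock Item describing them.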
+        chunks = []
+        for i in range(0, len(item_data), chunk_size):
+            chunk_data = item_data[i : i + chunk_size]
+            chunk_id = key.id_hash(chunk_data)
+            chunks.append(ChunkListEntry(id=chunk_id, size=len(chunk_data)))
+            cache.objects[chunk_id] = chunk_data
+
+        item = Mock(spec=["chunks", "size", "__contains__", "get"])
+        item.chunks = chunks
+        item.size = len(item_data)
+        item.__contains__ = lambda self, member: member == "size"
+
+        return item, str(tmpdir.join("test.txt"))
+
+    def mock_fetch_many(chunk_ids, ro_type=None):
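+        # Record every id requested, so tests can assert how many chunks were fetched.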
+        fetched_chunks.extend(chunk_ids)
+        return iter([cache.objects[chunk_id] for chunk_id in chunk_ids])
+
+    def clear_fetched_chunks():
+        fetched_chunks.clear()
+
+    def get_fetched_chunks():
+        return fetched_chunks
+
+    cache.fetch_many = mock_fetch_many
+
+    return extractor, key, cache, tmpdir, create_mock_chunks, get_fetched_chunks, clear_fetched_chunks
+
+
+@pytest.mark.parametrize(
+    "name, item_data, fs_data, expected_fetched_chunks",
+    [
+        (
+            "no_changes",
+            b"1111",  # One complete chunk, no changes needed
+            b"1111",  # Identical content
+            0,  # No chunks should be fetched
+        ),
+        (
+            "single_chunk_change",
+            b"11112222",  # Two chunks
+            b"1111XXXX",  # Second chunk differs
+            1,  # Only the second chunk should be fetched
+        ),
+        (
+            "cross_boundary_change",
+            b"11112222",  # Two chunks
+            b"111XX22",  # Change crosses the chunk boundary
+            2,  # Both chunks need updating
+        ),
+        (
+            "exact_multiple_chunks",
+            b"11112222333",  # Three chunks (last one partial)
+            b"1111XXXX333",  # Middle chunk differs
+            1,  # Only the middle chunk should be fetched
+        ),
+        (
+            "first_chunk_change",
+            b"11112222",  # Two chunks
+            b"XXXX2222",  # First chunk differs
+            1,  # Only the first chunk should be fetched
+        ),
+        (
+            "all_chunks_different",
+            b"11112222",  # Two chunks
+            b"XXXXYYYY",  # Both chunks differ
+            2,  # Both chunks should be fetched
+        ),
+        (
+            "partial_last_chunk",
+            b"111122",  # One full chunk plus a partial one
+            b"1111XX",  # The partial chunk differs
+            1,  # Only the second chunk should be fetched
+        ),
+        (
+            "fs_file_shorter",
+            b"11112222",  # Two chunks in the archive
+            b"111122",  # Shorter on disk: part of the second chunk is missing
+            1,  # The second chunk should be fetched
+        ),
+        (
+            "fs_file_longer",
+            b"11112222",  # Two chunks in the archive
+            b"1111222233",  # Longer on disk
+            0,  # Content matches up to the archived length, so nothing is fetched
+        ),
+        (
+            "empty_archive_file",
+            b"",  # Empty in the archive
+            b"11112222",  # Content on disk
+            0,  # No archived chunks to compare means nothing to fetch
+        ),
+        (
+            "empty_fs_file",
+            b"11112222",  # Two chunks in the archive
+            b"",  # Empty on disk
+            2,  # Every chunk must be fetched
+        ),
+    ],
+)
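+# For example, with chunk_size=4 the "single_chunk_change" case splits the
+# archived b"11112222" into [b"1111", b"2222"]; only the second chunk's bytes
+# differ on disk, so exactly one chunk id should appear in fetched_chunks.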
+def test_compare_and_extract_chunks(setup_extractor, name, item_data, fs_data, expected_fetched_chunks):
+    """Test chunk comparison and extraction"""
+    extractor, key, cache, tmpdir, create_mock_chunks, get_fetched_chunks, clear_fetched_chunks = setup_extractor
+    clear_fetched_chunks()
+
+    chunk_size = 4
+    item, target_path = create_mock_chunks(item_data, chunk_size=chunk_size)
+
+    original_chunk_ids = [chunk.id for chunk in item.chunks]
+
+    with open(target_path, "wb") as f:
+        f.write(fs_data)
+
+    st = os.stat(target_path)
+    result = extractor.compare_and_extract_chunks(item, target_path, st=st)
+    assert result
+
+    fetched_chunks = get_fetched_chunks()
+    assert len(fetched_chunks) == expected_fetched_chunks
+
+    # For single chunk changes, verify it's the correct chunk
+    if expected_fetched_chunks == 1:
+        item_chunks = [item_data[i : i + chunk_size] for i in range(0, len(item_data), chunk_size)]
+        fs_chunks = [fs_data[i : i + chunk_size] for i in range(0, len(fs_data), chunk_size)]
+
+        # Find which chunk should have changed by comparing item_data with fs_data
+        for i, (item_chunk, fs_chunk) in enumerate(zip(item_chunks, fs_chunks)):
+            if item_chunk != fs_chunk:
+                assert fetched_chunks[0] == original_chunk_ids[i]
+                break
+
+    with open(target_path, "rb") as f:
+        assert f.read() == item_data
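
The diff exercises `compare_and_extract_chunks` but never shows it; only `item.chunks`, `ChunkListEntry`, `key.id_hash`, and `pipeline.fetch_many` appear above. The expectations pin down its behavior: unchanged chunks cost zero fetches, each mismatched chunk costs exactly one, and a file that is longer on disk is cut back to the archived length. A minimal sketch consistent with that, assuming the method hashes each on-disk chunk locally and fetches only mismatches (illustrative, not borg's actual implementation):

```python
# Hypothetical sketch of an Archive method; borg's real code is not in this diff.
def compare_and_extract_chunks(self, item, target_path, st=None):
    """Re-extract only the chunks of target_path that differ from the archive."""
    with open(target_path, "rb+") as fd:
        offset = 0
        for entry in item.chunks:  # entry is a ChunkListEntry(id=..., size=...)
            fd.seek(offset)
            on_disk = fd.read(entry.size)
            # Hashing the on-disk bytes locally lets unchanged chunks be
            # verified without fetching anything from the repository.
            if self.key.id_hash(on_disk) != entry.id:
                for data in self.pipeline.fetch_many([entry.id]):
                    fd.seek(offset)
                    fd.write(data)
            offset += entry.size
        # Truncating to the archived length is why "fs_file_longer" fetches
        # nothing yet still ends up byte-identical to item_data.
        fd.truncate(offset)
    return True
```

Local hashing also explains the mock design: `cache.objects` only has to serve the ids that actually get fetched, while every comparison happens against bytes already on disk.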