@@ -132,6 +132,11 @@ def add_chunk(self, id, meta, data, stats=None, wait=True, ro_type=None):
132132 self .objects [id ] = data
133133 return id , len (data )
134134
def fetch_many(self, ids, ro_type=None):
    """Mock implementation of fetch_many.

    Lazily yields the stored raw chunk data for each requested chunk id,
    in request order.  ``ro_type`` is accepted only for signature
    compatibility with the real repository API and is ignored here.

    Raises:
        KeyError: if a requested id was never stored in ``self.objects``.
    """
    # Use ``chunk_id`` rather than ``id`` to avoid shadowing the builtin.
    for chunk_id in ids:
        yield self.objects[chunk_id]
135140
136141def test_cache_chunk_buffer ():
137142 data = [Item (path = "p1" ), Item (path = "p2" )]
@@ -402,3 +407,134 @@ def test_reject_non_sanitized_item():
402407 for path in rejected_dotdot_paths :
403408 with pytest .raises (ValueError , match = "unexpected '..' element in path" ):
404409 Item (path = path , user = "root" , group = "root" )
410+
411+
def test_compare_and_extract_chunks(tmpdir):
    """Test chunk comparison and selective extraction with fixed-size chunks.

    Exercises three scenarios against the same archived item:
      1. target file absent          -> full extraction
      2. target exists, partly stale -> stale chunks replaced with archived data
      3. target fully up to date     -> repeated extraction keeps content intact

    NOTE(review): the ``monkeypatch`` fixture was requested but never used,
    so it has been dropped from the signature.
    """
    # Mock repository/key/manifest trio plus an in-memory chunk store.
    repository = Mock()
    key = PlaintextKey(repository)
    manifest = Manifest(key, repository)
    cache = MockCache()

    # Known content split into fixed 512-byte chunks:
    # 5 * 128 = 640 bytes -> one full chunk + one partial chunk.
    chunk_size = 512
    test_data = b"block" * 128
    original_file = tmpdir.join("test.txt")
    original_file.write_binary(test_data)

    # Build the item's chunk list and seed each chunk's data into the cache.
    chunks = []
    for offset in range(0, len(test_data), chunk_size):
        chunk_data = test_data[offset : offset + chunk_size]
        chunk_id = key.id_hash(chunk_data)
        chunks.append(Mock(id=chunk_id, size=len(chunk_data)))
        cache.objects[chunk_id] = chunk_data

    item = Mock(chunks=chunks, size=len(test_data))

    extractor = Archive(manifest=manifest, name="test", create=True)
    extractor.pipeline = cache
    extractor.key = key
    extractor.cwd = str(tmpdir)

    # Case 1: target file does not exist yet -> full extraction.
    target_path = str(tmpdir.join("extracted.txt"))
    extractor.compare_and_extract_chunks(item, target_path)
    with open(target_path, "rb") as f:
        assert f.read() == test_data

    # Case 2: target exists with stale bytes inside the first chunk.
    # The 8-byte splice (256 + 8 == 264) keeps the overall length unchanged,
    # so only content — not size — differs from the archived data.
    modified_data = test_data[:256] + b"modified" + test_data[264:]
    with open(target_path, "wb") as f:
        f.write(modified_data)

    extractor.compare_and_extract_chunks(item, target_path)
    with open(target_path, "rb") as f:
        extracted = f.read()
    assert extracted == test_data
    assert extracted != modified_data

    # Case 3: target already matches every chunk -> content stays correct.
    extractor.compare_and_extract_chunks(item, target_path)
    with open(target_path, "rb") as f:
        assert f.read() == test_data
465+
466+
def test_compare_and_extract_chunks_size_mismatch(tmpdir):
    """Test chunk comparison when the existing file is smaller than expected.

    The on-disk file holds only half of the archived content; extraction
    over it must still produce the full expected data.
    """
    repository = Mock()
    key = PlaintextKey(repository)
    manifest = Manifest(key, repository)
    cache = MockCache()

    # On-disk file (320 bytes) is shorter than the archived item (640 bytes).
    test_data = b"block" * 64
    expected_data = b"block" * 128

    original_file = tmpdir.join("test.txt")
    original_file.write_binary(test_data)

    # Register the expected content as 512-byte chunks in the mock cache.
    chunks = []
    for start in range(0, len(expected_data), 512):
        piece = expected_data[start : start + 512]
        digest = key.id_hash(piece)
        cache.objects[digest] = piece
        chunks.append(Mock(id=digest, size=len(piece)))

    item = Mock(chunks=chunks, size=len(expected_data))

    extractor = Archive(manifest=manifest, name="test", create=True)
    extractor.pipeline = cache
    extractor.key = key
    extractor.cwd = str(tmpdir)

    # Extract over the too-small file; it must end up with the full data.
    target_path = str(original_file)
    extractor.compare_and_extract_chunks(item, target_path)
    with open(target_path, "rb") as f:
        assert f.read() == expected_data
503+
504+
def test_compare_and_extract_chunks_partial_chunk(tmpdir):
    """Test chunk comparison when the data ends in a partial final chunk."""
    repository = Mock()
    key = PlaintextKey(repository)
    manifest = Manifest(key, repository)
    cache = MockCache()

    # 650 bytes with 512-byte chunking: one full chunk plus a 138-byte tail,
    # so the data deliberately does not align with the chunk boundary.
    chunk_size = 512
    test_data = b"block" * 130

    tmpdir.join("test.txt").write_binary(test_data)

    # Slice the data into chunks, hashing each piece and seeding the cache.
    pieces = [test_data[pos : pos + chunk_size] for pos in range(0, len(test_data), chunk_size)]
    chunks = []
    for piece in pieces:
        digest = key.id_hash(piece)
        cache.objects[digest] = piece
        chunks.append(Mock(id=digest, size=len(piece)))

    item = Mock(chunks=chunks, size=len(test_data))

    extractor = Archive(manifest=manifest, name="test", create=True)
    extractor.pipeline = cache
    extractor.key = key
    extractor.cwd = str(tmpdir)

    # Extraction to a fresh target must reproduce the unaligned data exactly.
    target_path = str(tmpdir.join("extracted.txt"))
    extractor.compare_and_extract_chunks(item, target_path)
    with open(target_path, "rb") as f:
        assert f.read() == test_data
0 commit comments