Add support to resume incomplete download

yichi-yang · gmargaritis · commit 0617d7c02955 · 2024-09-26T21:26:44.000+03:00
diff --git a/news/11180.feature.rst b/news/11180.feature.rst
@@ -0,0 +1 @@
+Add support for resuming incomplete downloads. The behavior can be controlled using the ``--incomplete-downloads`` and ``--incomplete-download-retries`` flags.
diff --git a/src/pip/_internal/cli/cmdoptions.py b/src/pip/_internal/cli/cmdoptions.py
@@ -1028,6 +1028,25 @@ def check_list_path_option(options: Values) -> None:
     help=("Enable deprecated functionality, that will be removed in the future."),
 )
 
+incomplete_downloads: Callable[..., Option] = partial(
+    Option,
+    "--incomplete-downloads",
+    dest="resume_incomplete",
+    choices=["resume", "discard"],
+    default="discard",
+    metavar="policy",
+    help="How to handle an incomplete download: resume, discard (default to %default).",
+)
+
+incomplete_download_retries: Callable[..., Option] = partial(
+    Option,
+    "--incomplete-download-retries",
+    dest="resume_attempts",
+    type="int",
+    default=5,
+    help="Maximum number of resumption retries for incomplete download "
+    "(default %default times).",
+)
 
 ##########
 # groups #
@@ -1061,6 +1080,8 @@ def check_list_path_option(options: Values) -> None:
         no_python_version_warning,
         use_new_feature,
         use_deprecated_feature,
+        incomplete_downloads,
+        incomplete_download_retries,
     ],
 }
 
diff --git a/src/pip/_internal/cli/progress_bars.py b/src/pip/_internal/cli/progress_bars.py
@@ -26,6 +26,7 @@ def _rich_progress_bar(
     *,
     bar_type: str,
     size: int,
+    initial_progress: Optional[int] = None,
 ) -> Generator[bytes, None, None]:
     assert bar_type == "on", "This should only be used in the default mode."
 
@@ -51,6 +52,8 @@ def _rich_progress_bar(
 
     progress = Progress(*columns, refresh_per_second=5)
     task_id = progress.add_task(" " * (get_indentation() + 2), total=total)
+    if initial_progress is not None:
+        progress.update(task_id, advance=initial_progress)
     with progress:
         for chunk in iterable:
             yield chunk
@@ -80,15 +83,15 @@ def write_progress(current: int, total: int) -> None:
 
 
 def get_download_progress_renderer(
-    *, bar_type: str, size: Optional[int] = None
+    *, bar_type: str, size: Optional[int] = None, initial_progress: Optional[int] = None
 ) -> DownloadProgressRenderer:
     """Get an object that can be used to render the download progress.
 
     Returns a callable, that takes an iterable to "wrap".
     """
     if bar_type == "on":
-        return functools.partial(_rich_progress_bar, bar_type=bar_type, size=size)
+        return functools.partial(_rich_progress_bar, bar_type=bar_type, size=size, initial_progress=initial_progress,)
     elif bar_type == "raw":
-        return functools.partial(_raw_progress_bar, size=size)
+        return functools.partial(_raw_progress_bar, size=size, initial_progress=initial_progress,)
     else:
         return iter  # no-op, when passed an iterator
diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py
@@ -127,6 +127,8 @@ def make_requirement_preparer(
                     "fast-deps has no effect when used with the legacy resolver."
                 )
 
+        resume_incomplete = options.resume_incomplete == "resume"
+
         return RequirementPreparer(
             build_dir=temp_build_dir_path,
             src_dir=options.src_dir,
@@ -142,6 +144,8 @@ def make_requirement_preparer(
             lazy_wheel=lazy_wheel,
             verbosity=verbosity,
             legacy_resolver=legacy_resolver,
+            resume_incomplete=resume_incomplete,
+            resume_attempts=options.resume_attempts,
         )
 
     @classmethod
diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py
@@ -5,6 +5,7 @@
 import logging
 import mimetypes
 import os
+from http import HTTPStatus
 from typing import Iterable, Optional, Tuple
 
 from pip._vendor.requests.models import Response
@@ -28,13 +29,21 @@ def _get_http_response_size(resp: Response) -> Optional[int]:
         return None
 
 
+def _get_http_response_etag_or_date(resp: Response) -> Optional[str]:
+    """
+    Return either the ETag or Date header (or None if neither exists).
+    The return value can be used in an If-Range header.
+    """
+    return resp.headers.get("etag", resp.headers.get("date"))
+
+
 def _prepare_download(
     resp: Response,
     link: Link,
     progress_bar: str,
+    total_length: Optional[int],
+    range_start: Optional[int] = None,
 ) -> Iterable[bytes]:
-    total_length = _get_http_response_size(resp)
-
     if link.netloc == PyPI.file_storage_domain:
         url = link.show_url
     else:
@@ -43,10 +52,17 @@ def _prepare_download(
     logged_url = redact_auth_from_url(url)
 
     if total_length:
-        logged_url = f"{logged_url} ({format_size(total_length)})"
+        if range_start is not None:
+            logged_url = "{} ({}/{})".format(
+                logged_url, format_size(range_start), format_size(total_length)
+            )
+        else:
+            logged_url = "{} ({})".format(logged_url, format_size(total_length))
 
     if is_from_cache(resp):
         logger.info("Using cached %s", logged_url)
+    elif range_start is not None:
+        logger.info("Resume download %s", logged_url)
     else:
         logger.info("Downloading %s", logged_url)
 
@@ -66,7 +82,9 @@ def _prepare_download(
     if not show_progress:
         return chunks
 
-    renderer = get_download_progress_renderer(bar_type=progress_bar, size=total_length)
+    renderer = get_download_progress_renderer(
+        bar_type=progress_bar, size=total_length, initial_progress=range_start
+    )
     return renderer(chunks)
 
 
@@ -113,10 +131,27 @@ def _get_http_response_filename(resp: Response, link: Link) -> str:
     return filename
 
 
-def _http_get_download(session: PipSession, link: Link) -> Response:
+def _http_get_download(
+    session: PipSession,
+    link: Link,
+    range_start: Optional[int] = None,
+    if_range: Optional[str] = None,
+) -> Response:
     target_url = link.url.split("#", 1)[0]
-    resp = session.get(target_url, headers=HEADERS, stream=True)
-    raise_for_status(resp)
+    headers = {**HEADERS}
+    # request a partial download
+    if range_start is not None:
+        headers["Range"] = "bytes={}-".format(range_start)
+    # make sure the file hasn't changed
+    if if_range is not None:
+        headers["If-Range"] = if_range
+    try:
+        resp = session.get(target_url, headers=headers, stream=True)
+        raise_for_status(resp)
+    except NetworkConnectionError as e:
+        assert e.response is not None
+        logger.critical("HTTP error %s while getting %s", e.response.status_code, link)
+        raise
     return resp
 
 
@@ -125,28 +160,91 @@ def __init__(
         self,
         session: PipSession,
         progress_bar: str,
+        resume_incomplete: bool,
+        resume_attempts: int,
     ) -> None:
         self._session = session
         self._progress_bar = progress_bar
+        self._resume_incomplete = resume_incomplete
+        assert (
+            resume_attempts > 0
+        ), "Number of max incomplete download retries must be positive"
+        self._resume_attempts = resume_attempts
 
     def __call__(self, link: Link, location: str) -> Tuple[str, str]:
         """Download the file given by link into location."""
-        try:
-            resp = _http_get_download(self._session, link)
-        except NetworkConnectionError as e:
-            assert e.response is not None
-            logger.critical(
-                "HTTP error %s while getting %s", e.response.status_code, link
-            )
-            raise
+        resp = _http_get_download(self._session, link)
+        total_length = _get_http_response_size(resp)
+        etag_or_date = _get_http_response_etag_or_date(resp)
 
         filename = _get_http_response_filename(resp, link)
         filepath = os.path.join(location, filename)
 
-        chunks = _prepare_download(resp, link, self._progress_bar)
+        chunks = _prepare_download(resp, link, self._progress_bar, total_length)
+        bytes_received = 0
+
         with open(filepath, "wb") as content_file:
+
+            # Process the initial response
             for chunk in chunks:
+                bytes_received += len(chunk)
                 content_file.write(chunk)
+
+            if self._resume_incomplete:
+                attempts_left = self._resume_attempts
+
+                while total_length is not None and bytes_received < total_length:
+                    if attempts_left <= 0:
+                        break
+                    attempts_left -= 1
+
+                    # Attempt to resume download
+                    resume_resp = _http_get_download(
+                        self._session,
+                        link,
+                        range_start=bytes_received,
+                        if_range=etag_or_date,
+                    )
+
+                    restart = resume_resp.status_code != HTTPStatus.PARTIAL_CONTENT
+                    # If the server responded with 200 (e.g. when the file has been
+                    # modified on the server or the server doesn't support range
+                    # requests), reset the download to start from the beginning.
+                    if restart:
+                        content_file.seek(0)
+                        content_file.truncate()
+                        bytes_received = 0
+                        total_length = _get_http_response_size(resume_resp)
+                        etag_or_date = _get_http_response_etag_or_date(resume_resp)
+
+                    chunks = _prepare_download(
+                        resume_resp,
+                        link,
+                        self._progress_bar,
+                        total_length,
+                        range_start=bytes_received,
+                    )
+                    for chunk in chunks:
+                        bytes_received += len(chunk)
+                        content_file.write(chunk)
+
+        if total_length is not None and bytes_received < total_length:
+            if self._resume_incomplete:
+                logger.critical(
+                    "Failed to download %s after %d resumption attempts.",
+                    link,
+                    self._resume_attempts,
+                )
+            else:
+                logger.critical(
+                    "Failed to download %s."
+                    " Set --incomplete-downloads=resume to automatically "
+                    "resume incomplete download.",
+                    link,
+                )
+            os.remove(filepath)
+            raise RuntimeError("Incomplete download")
+
         content_type = resp.headers.get("Content-Type", "")
         return filepath, content_type
 
@@ -156,32 +254,17 @@ def __init__(
         self,
         session: PipSession,
         progress_bar: str,
+        resume_incomplete: bool,
+        resume_attempts: int,
     ) -> None:
-        self._session = session
-        self._progress_bar = progress_bar
+        self._downloader = Downloader(
+            session, progress_bar, resume_incomplete, resume_attempts
+        )
 
     def __call__(
         self, links: Iterable[Link], location: str
     ) -> Iterable[Tuple[Link, Tuple[str, str]]]:
         """Download the files given by links into location."""
         for link in links:
-            try:
-                resp = _http_get_download(self._session, link)
-            except NetworkConnectionError as e:
-                assert e.response is not None
-                logger.critical(
-                    "HTTP error %s while getting %s",
-                    e.response.status_code,
-                    link,
-                )
-                raise
-
-            filename = _get_http_response_filename(resp, link)
-            filepath = os.path.join(location, filename)
-
-            chunks = _prepare_download(resp, link, self._progress_bar)
-            with open(filepath, "wb") as content_file:
-                for chunk in chunks:
-                    content_file.write(chunk)
-            content_type = resp.headers.get("Content-Type", "")
+            filepath, content_type = self._downloader(link, location)
             yield link, (filepath, content_type)
diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py
@@ -231,15 +231,21 @@ def __init__(
         lazy_wheel: bool,
         verbosity: int,
         legacy_resolver: bool,
+        resume_incomplete: bool,
+        resume_attempts: int,
     ) -> None:
         super().__init__()
 
         self.src_dir = src_dir
         self.build_dir = build_dir
         self.build_tracker = build_tracker
         self._session = session
-        self._download = Downloader(session, progress_bar)
-        self._batch_download = BatchDownloader(session, progress_bar)
+        self._download = Downloader(
+            session, progress_bar, resume_incomplete, resume_attempts
+        )
+        self._batch_download = BatchDownloader(
+            session, progress_bar, resume_incomplete, resume_attempts
+        )
         self.finder = finder
 
         # Where still-packed archives should be written to. If None, they are
diff --git a/tests/unit/test_network_download.py b/tests/unit/test_network_download.py
diff --git a/tests/unit/test_operations_prepare.py b/tests/unit/test_operations_prepare.py
diff --git a/tests/unit/test_req.py b/tests/unit/test_req.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Add support to resume incomplete download. The behavior can be controlled using flags ``--incomplete-downloads`` and ``--incomplete-download-retries``.