From dbc6a6401f54bd6c9333fee4dbb49af6eace9123 Mon Sep 17 00:00:00 2001 From: gmargaritis Date: Fri, 4 Oct 2024 23:29:14 +0300 Subject: [PATCH] Add support for --resume-retries option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added —resume-retries option to allow resuming incomplete downloads - Setting —resume-retries=N allows pip to make N attempts to resume downloading, in case of dropped or timed out connections - Each resume attempt uses the values specified for —retries and —timeout internally Signed-off-by: gmargaritis --- news/11180.feature.rst | 1 - news/12991.feature.rst | 3 + src/pip/_internal/cli/cmdoptions.py | 23 +-- src/pip/_internal/cli/progress_bars.py | 13 +- src/pip/_internal/cli/req_command.py | 5 +- src/pip/_internal/exceptions.py | 28 +--- src/pip/_internal/network/download.py | 204 ++++++++++++++++-------- src/pip/_internal/operations/prepare.py | 11 +- tests/unit/test_network_download.py | 36 ++--- tests/unit/test_operations_prepare.py | 8 +- tests/unit/test_req.py | 3 +- 11 files changed, 183 insertions(+), 152 deletions(-) delete mode 100644 news/11180.feature.rst create mode 100644 news/12991.feature.rst diff --git a/news/11180.feature.rst b/news/11180.feature.rst deleted file mode 100644 index 24931855789..00000000000 --- a/news/11180.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add support to resume incomplete download. The behavior can be controlled using flags ``--incomplete-downloads`` and ``--incomplete-download-retries``. diff --git a/news/12991.feature.rst b/news/12991.feature.rst new file mode 100644 index 00000000000..26c2df389ed --- /dev/null +++ b/news/12991.feature.rst @@ -0,0 +1,3 @@ +Add support to enable resuming incomplete downloads. + +Control the number of retry attempts using the ``--resume-retries`` flag. diff --git a/src/pip/_internal/cli/cmdoptions.py b/src/pip/_internal/cli/cmdoptions.py index 96663602c23..ff0644ce711 100644 --- a/src/pip/_internal/cli/cmdoptions.py +++ b/src/pip/_internal/cli/cmdoptions.py @@ -1028,23 +1028,13 @@ def check_list_path_option(options: Values) -> None: help=("Enable deprecated functionality, that will be removed in the future."), ) -incomplete_downloads: Callable[..., Option] = partial( +resume_retries: Callable[..., Option] = partial( Option, - "--incomplete-downloads", - dest="resume_incomplete", - choices=["resume", "discard"], - default="discard", - metavar="policy", - help="How to handle an incomplete download: resume, discard (default to %default).", -) - -incomplete_download_retries: Callable[..., Option] = partial( - Option, - "--incomplete-download-retries", - dest="resume_attempts", + "--resume-retries", + dest="resume_retries", type="int", - default=5, - help="Maximum number of resumption retries for incomplete download " + default=0, + help="Maximum number of resumption retries for incomplete downloads" "(default %default times).", ) @@ -1080,8 +1070,7 @@ def check_list_path_option(options: Values) -> None: no_python_version_warning, use_new_feature, use_deprecated_feature, - incomplete_downloads, - incomplete_download_retries, + resume_retries, ], } diff --git a/src/pip/_internal/cli/progress_bars.py b/src/pip/_internal/cli/progress_bars.py index 42ebc83b137..41d602b087e 100644 --- a/src/pip/_internal/cli/progress_bars.py +++ b/src/pip/_internal/cli/progress_bars.py @@ -90,8 +90,17 @@ def get_download_progress_renderer( Returns a callable, that takes an iterable to "wrap". """ if bar_type == "on": - return functools.partial(_rich_progress_bar, bar_type=bar_type, size=size, initial_progress=initial_progress,) + return functools.partial( + _rich_progress_bar, + bar_type=bar_type, + size=size, + initial_progress=initial_progress, + ) elif bar_type == "raw": - return functools.partial(_raw_progress_bar, size=size, initial_progress=initial_progress,) + return functools.partial( + _raw_progress_bar, + size=size, + initial_progress=initial_progress, + ) else: return iter # no-op, when passed an iterator diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index 6dd9d224b3e..6d9431fc87c 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -127,8 +127,6 @@ def make_requirement_preparer( "fast-deps has no effect when used with the legacy resolver." ) - resume_incomplete = options.resume_incomplete == "resume" - return RequirementPreparer( build_dir=temp_build_dir_path, src_dir=options.src_dir, @@ -144,8 +142,7 @@ def make_requirement_preparer( lazy_wheel=lazy_wheel, verbosity=verbosity, legacy_resolver=legacy_resolver, - resume_incomplete=resume_incomplete, - resume_attempts=options.resume_attempts, + resume_retries=options.resume_retries, ) @classmethod diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index b74c9fe8f7d..7a06d79704d 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -776,35 +776,23 @@ def __init__(self, *, distribution: "BaseDistribution") -> None: hint_stmt=None, ) + class IncompleteDownloadError(DiagnosticPipError): """Raised when the downloader receives fewer bytes than advertised in the Content-Length header.""" reference = "incomplete-download-error" - def __init__( - self, link: str, resume_incomplete: bool, resume_attempts: int - ) -> None: - if resume_incomplete: - message = ( - "Download failed after {} attempts because not enough bytes are" - " received. The incomplete file has been cleaned up." - ).format(resume_attempts) - hint = "Use --incomplete-download-retries to configure resume retry limit." - else: - message = ( - "Download failed because not enough bytes are received." - " The incomplete file has been cleaned up." - ) - hint = ( - "Use --incomplete-downloads=resume to make pip retry failed download." - ) + def __init__(self, link: str, resume_retries: int) -> None: + message = ( + f"Download failed after {resume_retries} attempts because not enough" + " bytes were received. The incomplete file has been cleaned up." + ) + hint = "Use --resume-retries to configure resume retry limit." super().__init__( message=message, - context="File: {}\n" - "Resume failed download: {}\n" - "Resume retry limit: {}".format(link, resume_incomplete, resume_attempts), + context=f"File: {link}\nResume retry limit: {resume_retries}", hint_stmt=hint, note_stmt="This is an issue with network connectivity, not pip.", ) diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index c1acca9c262..5d2f2cbf41d 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -6,9 +6,10 @@ import mimetypes import os from http import HTTPStatus -from typing import Iterable, Optional, Tuple +from typing import BinaryIO, Iterable, Optional, Tuple from pip._vendor.requests.models import Response +from pip._vendor.urllib3.exceptions import ReadTimeoutError from pip._internal.cli.progress_bars import get_download_progress_renderer from pip._internal.exceptions import IncompleteDownloadError, NetworkConnectionError @@ -42,7 +43,7 @@ def _prepare_download( link: Link, progress_bar: str, total_length: Optional[int], - range_start: Optional[int] = None, + range_start: Optional[int] = 0, ) -> Iterable[bytes]: if link.netloc == PyPI.file_storage_domain: url = link.show_url @@ -52,17 +53,17 @@ def _prepare_download( logged_url = redact_auth_from_url(url) if total_length: - if range_start is not None: - logged_url = "{} ({}/{})".format( - logged_url, format_size(range_start), format_size(total_length) + if range_start: + logged_url = ( + f"{logged_url} ({format_size(range_start)}/{format_size(total_length)})" ) else: - logged_url = "{} ({})".format(logged_url, format_size(total_length)) + logged_url = f"{logged_url} ({format_size(total_length)})" if is_from_cache(resp): logger.info("Using cached %s", logged_url) - elif range_start is not None: - logger.info("Resume download %s", logged_url) + elif range_start: + logger.info("Resuming download %s", logged_url) else: logger.info("Downloading %s", logged_url) @@ -134,16 +135,16 @@ def _get_http_response_filename(resp: Response, link: Link) -> str: def _http_get_download( session: PipSession, link: Link, - range_start: Optional[int] = None, + range_start: Optional[int] = 0, if_range: Optional[str] = None, ) -> Response: target_url = link.url.split("#", 1)[0] headers = {**HEADERS} # request a partial download - if range_start is not None: - headers["Range"] = "bytes={}-".format(range_start) + if range_start: + headers["Range"] = f"bytes={range_start}-" # make sure the file hasn't changed - if if_range is not None: + if if_range: headers["If-Range"] = if_range try: resp = session.get(target_url, headers=headers, stream=True) @@ -160,82 +161,154 @@ def __init__( self, session: PipSession, progress_bar: str, - resume_incomplete: bool, - resume_attempts: int, + resume_retries: int, ) -> None: + assert ( + resume_retries >= 0 + ), "Number of max resume retries must be bigger or equal to zero" self._session = session self._progress_bar = progress_bar - self._resume_incomplete = resume_incomplete - assert ( - resume_attempts > 0 - ), "Number of max incomplete download retries must be positive" - self._resume_attempts = resume_attempts + self._resume_retries = resume_retries def __call__(self, link: Link, location: str) -> Tuple[str, str]: """Download the file given by link into location.""" resp = _http_get_download(self._session, link) total_length = _get_http_response_size(resp) - etag_or_date = _get_http_response_etag_or_date(resp) filename = _get_http_response_filename(resp, link) filepath = os.path.join(location, filename) - chunks = _prepare_download(resp, link, self._progress_bar, total_length) bytes_received = 0 - with open(filepath, "wb") as content_file: + bytes_received = self._process_response( + resp, link, content_file, bytes_received, total_length + ) - # Process the initial response + if not total_length: + content_type = resp.headers.get("Content-Type", "") + return filepath, content_type + + if bytes_received < total_length: + self._attempt_resume( + resp, link, content_file, total_length, bytes_received, filepath + ) + + content_type = resp.headers.get("Content-Type", "") + return filepath, content_type + + def _process_response( + self, + resp: Response, + link: Link, + content_file: BinaryIO, + bytes_received: int, + total_length: Optional[int], + ) -> int: + """Process the response and write the chunks to the file.""" + chunks = _prepare_download( + resp, link, self._progress_bar, total_length, range_start=bytes_received + ) + + bytes_received = self._write_chunks_to_file( + chunks, + content_file, + bytes_received, + total_length, + ) + + return bytes_received + + def _write_chunks_to_file( + self, + chunks: Iterable[bytes], + content_file: BinaryIO, + bytes_received: int, + total_length: Optional[int], + ) -> int: + """Write the chunks to the file and return the number of bytes received.""" + try: for chunk in chunks: bytes_received += len(chunk) content_file.write(chunk) + except ReadTimeoutError as e: + if not total_length: + # Raise exception if the Content-Length header is not provided + # and the connection times out. + raise e - if self._resume_incomplete: - attempts_left = self._resume_attempts + # Ensuring bytes_received is returned to attempt resume + logger.warning("Connection timed out while downloading.") - while total_length is not None and bytes_received < total_length: - if attempts_left <= 0: - break - attempts_left -= 1 + return bytes_received - # Attempt to resume download - resume_resp = _http_get_download( - self._session, - link, - range_start=bytes_received, - if_range=etag_or_date, - ) + def _attempt_resume( + self, + resp: Response, + link: Link, + content_file: BinaryIO, + total_length: Optional[int], + bytes_received: int, + filepath: str, + ) -> int: + """Attempt to resume the download if connection was dropped.""" + etag_or_date = _get_http_response_etag_or_date(resp) + + attempts_left = self._resume_retries + while attempts_left and total_length and bytes_received < total_length: + attempts_left -= 1 - restart = resume_resp.status_code != HTTPStatus.PARTIAL_CONTENT - # If the server responded with 200 (e.g. when the file has been - # modifiedon the server or the server doesn't support range - # requests), reset the download to start from the beginning. - if restart: - content_file.seek(0) - content_file.truncate() - bytes_received = 0 - total_length = _get_http_response_size(resume_resp) - etag_or_date = _get_http_response_etag_or_date(resume_resp) - - chunks = _prepare_download( - resume_resp, - link, - self._progress_bar, - total_length, - range_start=bytes_received, + logger.warning( + "Attempting to resume download with bytes received: %s/%s", + format_size(bytes_received), + format_size(total_length), + ) + + try: + resume_resp = _http_get_download( + self._session, + link, + range_start=bytes_received, + if_range=etag_or_date, + ) + + # If the server responded with 200 (e.g. when the file has been + # modified on the server or range requests are not supported by + # the server), reset the download to start from the beginning. + restart = resume_resp.status_code != HTTPStatus.PARTIAL_CONTENT + if restart: + bytes_received, total_length, etag_or_date = ( + self._reset_download_state(resume_resp, content_file) ) - for chunk in chunks: - bytes_received += len(chunk) - content_file.write(chunk) - if total_length is not None and bytes_received < total_length: + bytes_received = self._process_response( + resume_resp, + link, + content_file, + bytes_received, + total_length, + ) + except (ConnectionError, ReadTimeoutError): + continue + + if total_length and bytes_received < total_length: os.remove(filepath) - raise IncompleteDownloadError( - str(link), self._resume_incomplete, self._resume_attempts - ) + raise IncompleteDownloadError(str(link), self._resume_retries) - content_type = resp.headers.get("Content-Type", "") - return filepath, content_type + return bytes_received + + def _reset_download_state( + self, + resp: Response, + content_file: BinaryIO, + ) -> Tuple[int, Optional[int], Optional[str]]: + """Reset the download state to restart downloading from the beginning.""" + content_file.seek(0) + content_file.truncate() + bytes_received = 0 + total_length = _get_http_response_size(resp) + etag_or_date = _get_http_response_etag_or_date(resp) + + return bytes_received, total_length, etag_or_date class BatchDownloader: @@ -243,12 +316,9 @@ def __init__( self, session: PipSession, progress_bar: str, - resume_incomplete: bool, - resume_attempts: int, + resume_retries: int, ) -> None: - self._downloader = Downloader( - session, progress_bar, resume_incomplete, resume_attempts - ) + self._downloader = Downloader(session, progress_bar, resume_retries) def __call__( self, links: Iterable[Link], location: str diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 116d33d9450..1719799ba52 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -231,8 +231,7 @@ def __init__( lazy_wheel: bool, verbosity: int, legacy_resolver: bool, - resume_incomplete: bool, - resume_attempts: int, + resume_retries: int, ) -> None: super().__init__() @@ -240,12 +239,8 @@ def __init__( self.build_dir = build_dir self.build_tracker = build_tracker self._session = session - self._download = Downloader( - session, progress_bar, resume_incomplete, resume_attempts - ) - self._batch_download = BatchDownloader( - session, progress_bar, resume_incomplete, resume_attempts - ) + self._download = Downloader(session, progress_bar, resume_retries) + self._batch_download = BatchDownloader(session, progress_bar, resume_retries) self.finder = finder # Where still-packed archives should be written to. If None, they are diff --git a/tests/unit/test_network_download.py b/tests/unit/test_network_download.py index 1d7f6405f36..3d9b5ecd368 100644 --- a/tests/unit/test_network_download.py +++ b/tests/unit/test_network_download.py @@ -71,7 +71,7 @@ {"content-length": "200"}, False, 100, - "Resume download http://example.com/foo.tgz (100 bytes/200 bytes)", + "Resuming download http://example.com/foo.tgz (100 bytes/200 bytes)", ), ], ) @@ -184,41 +184,33 @@ def test_parse_content_disposition( @pytest.mark.parametrize( - "resume_incomplete," - "resume_attempts," - "mock_responses," - "expected_resume_args," - "expected_bytes", + "resume_retries,mock_responses,expected_resume_args,expected_bytes", [ # If content-length is not provided, the download will # always "succeed" since we don't have a way to check if # the download is complete. ( - False, - 5, + 0, [({}, 200, b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89")], [], b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89", ), # Complete download (content-length matches body) ( - False, - 5, + 0, [({"content-length": "36"}, 200, b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89")], [], b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89", ), - # Incomplete download with auto resume disabled + # Incomplete download without resume retries ( - False, - 5, + 0, [({"content-length": "36"}, 200, b"0cfa7e9d-1868-4dd7-9fb3-")], [], None, ), - # Incomplete download with auto resume enabled + # Incomplete download with resume retries ( - True, 5, [ ({"content-length": "36"}, 200, b"0cfa7e9d-1868-4dd7-9fb3-"), @@ -229,9 +221,8 @@ def test_parse_content_disposition( ), # If the server responds with 200 (e.g. no range header support or the file # has changed between the requests) the downloader should restart instead of - # resume. The downloaded file should not be affected. + # attempting to resume. The downloaded file should not be affected. ( - True, 5, [ ({"content-length": "36"}, 200, b"0cfa7e9d-1868-4dd7-9fb3-"), @@ -247,7 +238,6 @@ def test_parse_content_disposition( ), # File size could change between requests. Make sure this is handled correctly. ( - True, 5, [ ({"content-length": "36"}, 200, b"0cfa7e9d-1868-4dd7-9fb3-"), @@ -261,11 +251,10 @@ def test_parse_content_disposition( [(24, None), (36, None)], b"new-0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89", ), - # The downloader should fail after resume_attempts attempts. + # The downloader should fail after n resume_retries attempts. # This prevents the downloader from getting stuck if the connection # is unstable and the server doesn't not support range requests. ( - True, 1, [ ({"content-length": "36"}, 200, b"0cfa7e9d-1868-4dd7-9fb3-"), @@ -278,7 +267,6 @@ def test_parse_content_disposition( # request conditional if it is possible to check for modifications # (e.g. if we know the creation time of the initial response). ( - True, 5, [ ( @@ -297,7 +285,6 @@ def test_parse_content_disposition( ), # ETag is preferred over Date for the If-Range condition. ( - True, 5, [ ( @@ -325,8 +312,7 @@ def test_parse_content_disposition( ], ) def test_downloader( - resume_incomplete: bool, - resume_attempts: int, + resume_retries: int, mock_responses: List[Tuple[Dict[str, str], int, bytes]], # list of (range_start, if_range) expected_resume_args: List[Tuple[Optional[int], Optional[int]]], @@ -336,7 +322,7 @@ def test_downloader( ) -> None: session = PipSession() link = Link("http://example.com/foo.tgz") - downloader = Downloader(session, "on", resume_incomplete, resume_attempts) + downloader = Downloader(session, "on", resume_retries) responses = [] for headers, status_code, body in mock_responses: diff --git a/tests/unit/test_operations_prepare.py b/tests/unit/test_operations_prepare.py index 215744b2bf3..15e5d3569df 100644 --- a/tests/unit/test_operations_prepare.py +++ b/tests/unit/test_operations_prepare.py @@ -31,9 +31,7 @@ def _fake_session_get(*args: Any, **kwargs: Any) -> Dict[str, str]: session = Mock() session.get = _fake_session_get - download = Downloader( - session, progress_bar="on", resume_incomplete=False, resume_attempts=5 - ) + download = Downloader(session, progress_bar="on", resume_retries=0) uri = data.packages.joinpath("simple-1.0.tar.gz").as_uri() link = Link(uri) @@ -79,9 +77,7 @@ def test_download_http_url__no_directory_traversal( "content-disposition": 'attachment;filename="../out_dir_file"', } session.get.return_value = resp - download = Downloader( - session, progress_bar="on", resume_incomplete=False, resume_attempts=5 - ) + download = Downloader(session, progress_bar="on", resume_retries=0) download_dir = os.fspath(tmpdir.joinpath("download")) os.mkdir(download_dir) diff --git a/tests/unit/test_req.py b/tests/unit/test_req.py index e4850577d4b..7cd49a0a8d3 100644 --- a/tests/unit/test_req.py +++ b/tests/unit/test_req.py @@ -107,8 +107,7 @@ def _basic_resolver( lazy_wheel=False, verbosity=0, legacy_resolver=True, - resume_incomplete=False, - resume_attempts=5, + resume_retries=0, ) yield Resolver( preparer=preparer,