From 6cb37fe14f5b367d8d3f0d5abee27117d2c0a975 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Wed, 27 Aug 2025 08:44:07 -0700 Subject: [PATCH 01/35] Initial setup --- gems/aws-sdk-s3/CHANGELOG.md | 2 ++ .../lib/aws-sdk-s3/customizations.rb | 2 ++ .../lib/aws-sdk-s3/directory_downloader.rb | 19 +++++++++++++++++++ .../lib/aws-sdk-s3/directory_uploader.rb | 19 +++++++++++++++++++ .../lib/aws-sdk-s3/transfer_manager.rb | 14 ++++++++++++++ 5 files changed, 56 insertions(+) create mode 100644 gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb create mode 100644 gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb diff --git a/gems/aws-sdk-s3/CHANGELOG.md b/gems/aws-sdk-s3/CHANGELOG.md index 0ee8bd699fe..808f45f6ac6 100644 --- a/gems/aws-sdk-s3/CHANGELOG.md +++ b/gems/aws-sdk-s3/CHANGELOG.md @@ -1,6 +1,8 @@ Unreleased Changes ------------------ +* Feature - TODO + 1.198.0 (2025-08-26) ------------------ diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb index c0dba64b79c..211c20aaf05 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb @@ -18,6 +18,8 @@ module S3 autoload :ObjectMultipartCopier, 'aws-sdk-s3/object_multipart_copier' autoload :PresignedPost, 'aws-sdk-s3/presigned_post' autoload :Presigner, 'aws-sdk-s3/presigner' + autoload :DirectoryUploader, 'aws-sdk-s3/directory_uploader' + autoload :DirectoryDownloader, 'aws-sdk-s3/directory_downloader' autoload :TransferManager, 'aws-sdk-s3/transfer_manager' # s3 express session auth diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb new file mode 100644 index 00000000000..ea04904cc46 --- /dev/null +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true + +module Aws + module S3 + # @api private + class DirectoryDownloader + def initialize(options = {}) + @client = options[:client] || Client.new + @executor = options[:executor] + end + + attr_reader :client, :executor + + def download(destination, bucket:, **options) + # TODO + end + end + end +end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb new file mode 100644 index 00000000000..568f6d42e2e --- /dev/null +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true + +module Aws + module S3 + # @api private + class DirectoryUploader + def initialize(options = {}) + @client = options[:client] || Client.new + @executor = options[:executor] + end + + attr_reader :client, :executor + + def upload(source, bucket:, **options) + # TODO + end + end + end +end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb index 14443036959..714adccb6fa 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb @@ -18,11 +18,15 @@ class TransferManager # will be created automatically. def initialize(options = {}) @client = options.delete(:client) || Client.new + @executor = options.delete(:executor) end # @return [S3::Client] attr_reader :client + # @return [Object] executor + attr_reader :executor + # Downloads a file in S3 to a path on disk. 
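#
# A hedged usage sketch (this API is still being built out in this
# patch series, so the exact call shape may change):
#
#   tm = Aws::S3::TransferManager.new
#   tm.download_file('/path/to/file', bucket: 'bucket', key: 'key')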
# # # small files (< 5MB) are downloaded in a single API call @@ -101,6 +105,11 @@ def download_file(destination, bucket:, key:, **options) true end + # TODO: Docs + def download_directory(destination, bucket:, ** options) + # TODO + end + # Uploads a file from disk to S3. # # # a small file are uploaded with PutObject API @@ -179,6 +188,11 @@ def upload_file(source, bucket:, key:, **options) true end + # TODO: Docs + def upload_directory(source, bucket:, ** options) + # TODO + end + # Uploads a stream in a streaming fashion to S3. # # Passed chunks automatically split into multipart upload parts and the parts are uploaded in parallel. From d31ae4d84559ee4f14ad1519a5fb18502863ccca Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Fri, 29 Aug 2025 12:50:33 -0700 Subject: [PATCH 02/35] Minor adjustments --- gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb index 714adccb6fa..6350e339dc2 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb @@ -106,7 +106,7 @@ def download_file(destination, bucket:, key:, **options) end # TODO: Docs - def download_directory(destination, bucket:, ** options) + def download_directory(destination, bucket:, **options) # TODO end @@ -189,7 +189,7 @@ def upload_file(source, bucket:, key:, **options) end # TODO: Docs - def upload_directory(source, bucket:, ** options) + def upload_directory(source, bucket:, **options) # TODO end @@ -256,5 +256,6 @@ def upload_stream(bucket:, key:, **options, &block) true end end + end end From 74ed1891fa179f2fc91e93f69e3bed9ad2ba19be Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Fri, 29 Aug 2025 12:51:15 -0700 Subject: [PATCH 03/35] Directory downloader impl --- .../lib/aws-sdk-s3/directory_downloader.rb | 88 ++++++++++++++++++- .../lib/aws-sdk-s3/file_downloader.rb | 85 +++++++++++++----- 2 files changed, 150 insertions(+), 23 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb index ea04904cc46..e323d7a1aa7 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb @@ -2,6 +2,17 @@ module Aws module S3 + # Raised when DirectoryDownloader fails to download objects from S3 bucket + class DirectoryDownloadError < StandardError + def initialize(message, errors = []) + @errors = errors + super(message) + end + + # @return [Array] The list of errors encountered when downloading objects + attr_reader :errors + end + # @api private class DirectoryDownloader def initialize(options = {}) @@ -12,7 +23,82 @@ def initialize(options = {}) attr_reader :client, :executor def download(destination, bucket:, **options) - # TODO + if File.exist?(destination) + raise ArgumentError 'invalid destination, expected a directory' unless File.directory?(destination) + else + FileUtils.mkdir_p(destination) + end + + download_opts = options.dup + @destination = destination + @bucket = bucket + @recursive = download_opts.delete(:recursive) || false + @s3_prefix = download_opts.delete(:s3_prefix) + @s3_delimiter = download_opts.delete(:s3_delimiter) || '/' + @failure_policy = download_opts.delete(:failure_policy) || :abort + + downloader = FileDownloader.new(client: client, executor: @executor) + @download_queue = SizedQueue.new(100) + @abort_download = false + @errors = 
[] + + Thread.new do + stream_keys + @download_queue << :done + end + + download_attempts = 0 + completion_queue = Queue.new + while (queue_key = @download_queue.shift) != :done + break if @abort_download + + download_attempts += 1 + @executor.post(queue_key) do |k| + normalized_key = normalize_key(k) + full_path = File.join(@destination, normalized_key) + dir_path = File.dirname(full_path) + FileUtils.mkdir_p(dir_path) unless dir_path == @destination || Dir.exist?(dir_path) + + downloader.download(full_path, download_opts.merge(bucket: @bucket, key: k)) + rescue StandardError => e + @errors << e + @abort_download = true if @failure_policy == :abort + ensure + completion_queue << :done + end + end + + download_attempts.times { completion_queue.pop } + + if @abort_download + msg = "failed to download directory: attempt to download #{download_attempts} objects " \ + "but failed to download #{@errors.count} objects." + raise DirectoryDownloadError, msg + @errors.to_s + else + { + downloaded: download_attempts - @errors.count, + errors: @errors.count + } + end + end + + def normalize_key(key) + key = key.delete_prefix(@s3_prefix) if @s3_prefix + return key.tr('/', @s3_delimiter) if @s3_delimiter != '/' + return key if File::SEPARATOR == '/' + + key.tr('/', File::SEPARATOR) + end + + def stream_keys(continuation_token: nil) + resp = @client.list_objects_v2(bucket: @bucket, continuation_token: continuation_token) + resp.contents.each do |o| + break if @abort_download + next if o.key.end_with?('/') + + @download_queue << o.key + end + stream_keys(continuation_token: resp.next_continuation_token) if resp.next_continuation_token end end end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb index 517a06dabea..5774094ab6e 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb @@ -14,6 +14,7 @@ class FileDownloader def initialize(options = {}) @client = options[:client] || Client.new + @executor = options[:executor] end # @return [Client] @@ -25,24 +26,25 @@ def download(destination, options = {}) raise ArgumentError, "Invalid destination, expected #{valid_types.join(', ')} but got: #{destination.class}" end + @temp_path = nil @destination = destination @mode = options.delete(:mode) || 'auto' @thread_count = options.delete(:thread_count) || 10 @chunk_size = options.delete(:chunk_size) @on_checksum_validated = options.delete(:on_checksum_validated) @progress_callback = options.delete(:progress_callback) - @params = options + params = options validate! Aws::Plugins::UserAgent.metric('S3_TRANSFER') do case @mode - when 'auto' then multipart_download - when 'single_request' then single_request + when 'auto' then multipart_download(params) + when 'single_request' then single_request(params) when 'get_range' raise ArgumentError, 'In get_range mode, :chunk_size must be provided' unless @chunk_size resp = @client.head_object(@params) - multithreaded_get_by_ranges(resp.content_length, resp.etag) + multithreaded_get_by_ranges(resp.content_length, resp.etag, params) else raise ArgumentError, "Invalid mode #{@mode} provided, :mode should be single_request, get_range or auto" end @@ -60,34 +62,34 @@ def validate! 
raise ArgumentError, ':on_checksum_validated must be callable' end - def multipart_download - resp = @client.head_object(@params.merge(part_number: 1)) + def multipart_download(params) + resp = @client.head_object(params.merge(part_number: 1)) count = resp.parts_count if count.nil? || count <= 1 if resp.content_length <= MIN_CHUNK_SIZE - single_request + single_request(params) else - multithreaded_get_by_ranges(resp.content_length, resp.etag) + multithreaded_get_by_ranges(resp.content_length, resp.etag, params) end else # covers cases when given object is not uploaded via UploadPart API - resp = @client.head_object(@params) # partNumber is an option + resp = @client.head_object(params) # partNumber is an option if resp.content_length <= MIN_CHUNK_SIZE - single_request + single_request(params) else - compute_mode(resp.content_length, count, resp.etag) + compute_mode(resp.content_length, count, resp.etag, params) end end end - def compute_mode(file_size, count, etag) + def compute_mode(file_size, count, etag, params) chunk_size = compute_chunk(file_size) part_size = (file_size.to_f / count).ceil if chunk_size < part_size - multithreaded_get_by_ranges(file_size, etag) + multithreaded_get_by_ranges(file_size, etag, params) else - multithreaded_get_by_parts(count, file_size, etag) + multithreaded_get_by_parts(count, file_size, etag, params) end end @@ -97,7 +99,7 @@ def compute_chunk(file_size) @chunk_size || [(file_size.to_f / MAX_PARTS).ceil, MIN_CHUNK_SIZE].max.to_i end - def multithreaded_get_by_ranges(file_size, etag) + def multithreaded_get_by_ranges(file_size, etag, params) offset = 0 default_chunk_size = compute_chunk(file_size) chunks = [] @@ -105,19 +107,58 @@ def multithreaded_get_by_ranges(file_size, etag) while offset < file_size progress = offset + default_chunk_size progress = file_size if progress > file_size - params = @params.merge(range: "bytes=#{offset}-#{progress - 1}", if_match: etag) + params = params.merge(range: "bytes=#{offset}-#{progress - 1}", if_match: etag) chunks << Part.new(part_number: part_number, size: (progress - offset), params: params) part_number += 1 offset = progress end - download_in_threads(PartList.new(chunks), file_size) + if @executor + download_with_executor(PartList.new(chunks)) + else + download_in_threads(PartList.new(chunks), file_size) + end end - def multithreaded_get_by_parts(n_parts, total_size, etag) + def multithreaded_get_by_parts(n_parts, total_size, etag, params) parts = (1..n_parts).map do |part| - Part.new(part_number: part, params: @params.merge(part_number: part, if_match: etag)) + Part.new(part_number: part, params: params.merge(part_number: part, if_match: etag)) + end + if @executor + download_with_executor(PartList.new(parts)) + else + download_in_threads(PartList.new(parts), total_size) + end + end + + def download_with_executor(pending) + max_parts = pending.size + completion_queue = Queue.new + @abort_download = false + unless [File, Tempfile].include?(@destination.class) + @temp_path = "#{@destination}.s3tmp.#{SecureRandom.alphanumeric(8)}" + end + + while (part = pending.shift) + break if @abort_download + + @executor.post(part) do |p| + resp = @client.get_object(p.params) + range = extract_range(resp.content_range) + validate_range(range, p.params[:range]) if p.params[:range] + write(resp.body, range) + + if @on_checksum_validated && resp.checksum_validated + @on_checksum_validated.call(resp.checksum_validated, resp) + end + rescue StandardError => e + @abort_download = true + raise e + ensure + completion_queue << :done 
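# each posted part signals :done exactly once via this ensure, even
# when downloading that part raises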
+ end end - download_in_threads(PartList.new(parts), total_size) + + max_parts.times { completion_queue.pop } end def download_in_threads(pending, total_size) @@ -170,8 +211,8 @@ def write(body, range) File.write(path, body.read, range.split('-').first.to_i) end - def single_request - params = @params.merge(response_target: @destination) + def single_request(params) + params = params.merge(response_target: @destination) params[:on_chunk_received] = single_part_progress if @progress_callback resp = @client.get_object(params) return resp unless @on_checksum_validated From 4e8db178d6fb7fe3b5833e0e27c7d8f91944386b Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Fri, 29 Aug 2025 12:57:03 -0700 Subject: [PATCH 04/35] Directory uploader impl --- .../lib/aws-sdk-s3/directory_uploader.rb | 116 +++++++++++++++++- .../lib/aws-sdk-s3/multipart_file_uploader.rb | 38 +++++- 2 files changed, 151 insertions(+), 3 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index 568f6d42e2e..7efb8900552 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -1,7 +1,21 @@ # frozen_string_literal: true +require 'find' +require 'set' + module Aws module S3 + # Raised when DirectoryUploader fails to upload files to S3 bucket + class DirectoryUploadError < StandardError + def initialize(message, errors = []) + @errors = errors + super(message) + end + + # @return [Array] The list of errors encountered when uploading objects + attr_reader :errors + end + # @api private class DirectoryUploader def initialize(options = {}) @@ -9,10 +23,108 @@ def initialize(options = {}) @executor = options[:executor] end - attr_reader :client, :executor + # @return [Client] + attr_reader :client def upload(source, bucket:, **options) - # TODO + raise ArgumentError, 'Invalid directory' unless Dir.exist?(source) + + upload_opts = options.dup + @source = source + @bucket = bucket + @recursive = upload_opts.delete(:recursive) || false + @follow_symlinks = upload_opts.delete(:follow_symlinks) || false + @s3_prefix = upload_opts.delete(:s3_prefix) + @s3_delimiter = upload_opts.delete(:s3_delimiter) || '/' + @filter_callback = upload_opts.delete(:filter_callback) + + uploader = FileUploader.new( + multipart_threshold: upload_opts.delete(:multipart_threshold), + client: @client, + executor: @executor + ) + @upload_queue = SizedQueue.new(100) + @errors = [] + @abort_upload = false + + Thread.new do + if @recursive + stream_recursive_files + else + stream_direct_files + end + @upload_queue << :done + end + + upload_attempts = 0 + completion_queue = Queue.new + while (queue_file = @upload_queue.shift) != :done + break if @abort_upload + + upload_attempts += 1 + @executor.post(queue_file) do |f| + + path = File.join(@source, f) + # TODO: key to consider s3_prefix and custom delimiter + uploader.upload(path, upload_opts.merge(bucket: @bucket, key: f)) + rescue StandardError => e + @errors << e + @abort_download = true if @failure_policy == :abort + ensure + completion_queue << :done + end + end + file_count.times { completion_queue.pop } + + if @abort_upload + msg = "failed to upload directory: attempt to upload #{upload_attempts} files " \ + "but failed to upload #{@errors.count} files." 
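# abort path: the raised error summarizes how many uploads were
# attempted and how many of them failed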
+ raise DirectoryUploadError, msg + @errors.to_s + else + { + upload: upload_attempts - @errors.count, + errors: @errors.count + } + end + end + + private + + # TODO: need to optimize & handle failures + def stream_recursive_files + visited = Set.new + Find.find(@source) do |p| + break if @abort_upload + + if !@follow_symlinks && File.symlink?(p) + Find.prune + next + end + + absolute_path = File.realpath(p) + if visited.include?(absolute_path) + Find.prune + next + end + + visited << absolute_path + + # TODO: if non-default s3_delimiter is used, validate here and fail + @upload_queue << p if File.file?(p) + end + end + + # TODO: need to optimize & handle failures + def stream_direct_files + Dir.each_child(@source) do |entry| + break if @abort_upload + + path = File.join(@source, entry) + next if !@follow_symlinks && File.symlink?(path) + + # TODO: if non-default s3_delimiter is used, validate here and fail + @upload_queue << entry if File.file?(path) + end end end end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb index bcc05f1fc9e..83f0646a2e9 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb @@ -24,6 +24,7 @@ class MultipartFileUploader # @option options [Integer] :thread_count (DEFAULT_THREAD_COUNT) def initialize(options = {}) @client = options[:client] || Client.new + @executor = options[:executor] @thread_count = options[:thread_count] || DEFAULT_THREAD_COUNT end @@ -66,7 +67,13 @@ def complete_upload(upload_id, parts, source, options) def upload_parts(upload_id, source, options) completed = PartList.new pending = PartList.new(compute_parts(upload_id, source, options)) - errors = upload_in_threads(pending, completed, options) + errors = + if @executor + upload_with_executor(pending, completed, options) + else + upload_in_threads(pending, completed, options) + end + if errors.empty? 
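# CompleteMultipartUpload requires parts listed in ascending
# part-number order, hence the sort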
completed.to_a.sort_by { |part| part[:part_number] } else @@ -135,6 +142,35 @@ def upload_part_opts(options) end end + def upload_with_executor(pending, completed, _options) + max_parts = pending.count + completion_queue = Queue.new + abort_upload = false + errors = [] + + while (part = pending.shift) + break if abort_upload + + @executor.post(part) do |p| + resp = @client.upload_part(p) + p[:body].close + completed_part = { etag: resp.etag, part_number: p[:part_number] } + algorithm = resp.context.params[:checksum_algorithm] + k = "checksum_#{algorithm.downcase}".to_sym + completed_part[k] = resp.send(k) + completed.push(completed_part) + rescue StandardError => e + abort_upload = true + errors << e + ensure + completion_queue << :done + end + end + + max_parts.times { completion_queue.pop } + errors + end + def upload_in_threads(pending, completed, options) threads = [] if (callback = options[:progress_callback]) From 7749ba5bbab9a8c111847e800672826f27917284 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 9 Sep 2025 07:44:39 -0700 Subject: [PATCH 05/35] Add default executor --- .../lib/aws-sdk-s3/customizations.rb | 1 + .../lib/aws-sdk-s3/default_executor.rb | 54 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb index 211c20aaf05..a85bfd6e14c 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb @@ -6,6 +6,7 @@ module S3 autoload :BucketRegionCache, 'aws-sdk-s3/bucket_region_cache' autoload :Encryption, 'aws-sdk-s3/encryption' autoload :EncryptionV2, 'aws-sdk-s3/encryption_v2' + autoload :DefaultExecutor, 'aws-sdk-s3/default_executor' autoload :FilePart, 'aws-sdk-s3/file_part' autoload :FileUploader, 'aws-sdk-s3/file_uploader' autoload :FileDownloader, 'aws-sdk-s3/file_downloader' diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb new file mode 100644 index 00000000000..72e79bef4e3 --- /dev/null +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +module Aws + module S3 + # @api private + class DefaultExecutor + def initialize(options = {}) + @queue = Queue.new + @max_threads = options[:max_threads] || 10 + @pool = [] + @running = true + monitor_pool + end + + def post(*args, &block) + raise 'Executor is not running' unless @running + + @queue << [args, block] + end + + def shutdown + @running = false + @max_threads.times { @queue << :shutdown } + @pool.each(&:join) + @pool.clear + true + end + + private + + def monitor_pool + Thread.new do + while @running + @pool.select!(&:alive?) 
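# after pruning dead workers above, spawn another worker only while
# queued jobs outnumber live workers and the pool is under the cap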
+ + @pool << spawn_worker if @queue.size > @pool.size && @pool.size < @max_threads + sleep(0.01) + end + end + end + + def spawn_worker + Thread.new do + while (job = @queue.pop) + break if job == :shutdown + + args, block = job + block.call(*args) + end + end + end + end + end +end From 99f0de60d3af725057150bca44690219fbfb483c Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 9 Sep 2025 08:41:53 -0700 Subject: [PATCH 06/35] Add running check to default executor --- gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb index 72e79bef4e3..b9ab408b6ff 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb @@ -26,6 +26,10 @@ def shutdown true end + def running? + @running + end + private def monitor_pool From 441fa826d646c7cd7a4329a2cb4a762e0b6055eb Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 9 Sep 2025 10:13:00 -0700 Subject: [PATCH 07/35] Refactor MultipartFileUploader with executor --- .../lib/aws-sdk-s3/multipart_file_uploader.rb | 86 +++++++------------ 1 file changed, 32 insertions(+), 54 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb index 83f0646a2e9..95dc59ae938 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb @@ -7,7 +7,6 @@ module Aws module S3 # @api private class MultipartFileUploader - MIN_PART_SIZE = 5 * 1024 * 1024 # 5MB MAX_PARTS = 10_000 DEFAULT_THREAD_COUNT = 10 @@ -24,12 +23,13 @@ class MultipartFileUploader # @option options [Integer] :thread_count (DEFAULT_THREAD_COUNT) def initialize(options = {}) @client = options[:client] || Client.new - @executor = options[:executor] @thread_count = options[:thread_count] || DEFAULT_THREAD_COUNT + @custom_executor = !options[:executor].nil? + @executor = options[:executor] || DefaultExecutor.new(max_threads: @thread_count) end # @return [Client] - attr_reader :client + attr_reader :client, :executor # @param [String, Pathname, File, Tempfile] source The file to upload. # @option options [required, String] :bucket The bucket to upload to. @@ -44,6 +44,7 @@ def upload(source, options = {}) upload_id = initiate_upload(options) parts = upload_parts(upload_id, source, options) complete_upload(upload_id, parts, source, options) + shutdown_executor end private @@ -67,13 +68,7 @@ def complete_upload(upload_id, parts, source, options) def upload_parts(upload_id, source, options) completed = PartList.new pending = PartList.new(compute_parts(upload_id, source, options)) - errors = - if @executor - upload_with_executor(pending, completed, options) - else - upload_in_threads(pending, completed, options) - end - + errors = upload_with_executor(pending, completed, options) if errors.empty? completed.to_a.sort_by { |part| part[:part_number] } else @@ -81,13 +76,19 @@ def upload_parts(upload_id, source, options) end end + def shutdown_executor + @executor.shutdown if @executor.running? 
&& !@custom_executor + end + def abort_upload(upload_id, options, errors) @client.abort_multipart_upload(bucket: options[:bucket], key: options[:key], upload_id: upload_id) msg = "multipart upload failed: #{errors.map(&:message).join('; ')}" raise MultipartUploadError.new(msg, errors) - rescue MultipartUploadError => e + rescue MultipartUploadError => eq + shutdown_executor raise e rescue StandardError => e + shutdown_executor msg = "failed to abort multipart upload: #{e.message}. "\ "Multipart upload failed: #{errors.map(&:message).join('; ')}" raise MultipartUploadError.new(msg, errors + [e]) @@ -115,13 +116,13 @@ def checksum_key?(key) CHECKSUM_KEYS.include?(key) end - def has_checksum_key?(keys) + def checksum_keys?(keys) keys.any? { |key| checksum_key?(key) } end def create_opts(options) opts = { checksum_algorithm: Aws::Plugins::ChecksumAlgorithm::DEFAULT_CHECKSUM } - opts[:checksum_type] = 'FULL_OBJECT' if has_checksum_key?(options.keys) + opts[:checksum_type] = 'FULL_OBJECT' if checksum_keys?(options.keys) CREATE_OPTIONS.each_with_object(opts) do |key, hash| hash[key] = options[key] if options.key?(key) end @@ -129,7 +130,7 @@ def create_opts(options) def complete_opts(options) opts = {} - opts[:checksum_type] = 'FULL_OBJECT' if has_checksum_key?(options.keys) + opts[:checksum_type] = 'FULL_OBJECT' if checksum_keys?(options.keys) COMPLETE_OPTIONS.each_with_object(opts) do |key, hash| hash[key] = options[key] if options.key?(key) end @@ -142,21 +143,33 @@ def upload_part_opts(options) end end - def upload_with_executor(pending, completed, _options) - max_parts = pending.count + def upload_with_executor(pending, completed, options) + upload_attempts = 0 completion_queue = Queue.new abort_upload = false errors = [] + if (callback = options[:progress_callback]) + progress = MultipartProgress.new(pending, callback) + end + while (part = pending.shift) break if abort_upload + upload_attempts += 1 @executor.post(part) do |p| + if progress + p[:on_chunk_sent] = + proc do |_chunk, bytes, _total| + progress.call(p[:part_number], bytes) + end + end + resp = @client.upload_part(p) p[:body].close completed_part = { etag: resp.etag, part_number: p[:part_number] } - algorithm = resp.context.params[:checksum_algorithm] - k = "checksum_#{algorithm.downcase}".to_sym + algorithm = resp.context.params[:checksum_algorithm].downcase + k = "checksum_#{algorithm}".to_sym completed_part[k] = resp.send(k) completed.push(completed_part) rescue StandardError => e @@ -167,45 +180,10 @@ def upload_with_executor(pending, completed, _options) end end - max_parts.times { completion_queue.pop } + upload_attempts.times { completion_queue.pop } errors end - def upload_in_threads(pending, completed, options) - threads = [] - if (callback = options[:progress_callback]) - progress = MultipartProgress.new(pending, callback) - end - options.fetch(:thread_count, @thread_count).times do - thread = Thread.new do - begin - while (part = pending.shift) - if progress - part[:on_chunk_sent] = - proc do |_chunk, bytes, _total| - progress.call(part[:part_number], bytes) - end - end - resp = @client.upload_part(part) - part[:body].close - completed_part = { etag: resp.etag, part_number: part[:part_number] } - algorithm = resp.context.params[:checksum_algorithm] - k = "checksum_#{algorithm.downcase}".to_sym - completed_part[k] = resp.send(k) - completed.push(completed_part) - end - nil - rescue StandardError => e - # keep other threads from uploading other parts - pending.clear! 
- e - end - end - threads << thread - end - threads.map(&:value).compact - end - def compute_default_part_size(source_size) [(source_size.to_f / MAX_PARTS).ceil, MIN_PART_SIZE].max.to_i end From c792439ca41b35b3807ec0fb574756359e8d4793 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 9 Sep 2025 10:13:34 -0700 Subject: [PATCH 08/35] Fix typo in MultipartFileUploader --- gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb index 95dc59ae938..38443f41d07 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb @@ -84,7 +84,7 @@ def abort_upload(upload_id, options, errors) @client.abort_multipart_upload(bucket: options[:bucket], key: options[:key], upload_id: upload_id) msg = "multipart upload failed: #{errors.map(&:message).join('; ')}" raise MultipartUploadError.new(msg, errors) - rescue MultipartUploadError => eq + rescue MultipartUploadError => e shutdown_executor raise e rescue StandardError => e From adce496d3e1093949e0e47ea4aa7aa980073ffea Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 9 Sep 2025 10:23:04 -0700 Subject: [PATCH 09/35] Update TM upload file with executor --- gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb | 1 + gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb index 587066551ea..8de03e17f8e 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb @@ -41,6 +41,7 @@ def upload(source, options = {}) else # remove multipart parameters not supported by put_object options.delete(:thread_count) + options.delete(:executor) put_object(source, options) end end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb index 6350e339dc2..f819ad6fbec 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/transfer_manager.rb @@ -181,7 +181,8 @@ def upload_file(source, bucket:, key:, **options) uploading_options = options.dup uploader = FileUploader.new( multipart_threshold: uploading_options.delete(:multipart_threshold), - client: @client + client: @client, + executor: @executor ) response = uploader.upload(source, uploading_options.merge(bucket: bucket, key: key)) yield response if block_given? From 2758c4d2fe1940daac5ed7546af43a0d912151e8 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Wed, 17 Sep 2025 12:55:19 -0700 Subject: [PATCH 10/35] Update to only spawn workers when needed --- .../lib/aws-sdk-s3/default_executor.rb | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb index b9ab408b6ff..b99c822497c 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb @@ -9,13 +9,14 @@ def initialize(options = {}) @max_threads = options[:max_threads] || 10 @pool = [] @running = true - monitor_pool + @mutex = Mutex.new end def post(*args, &block) raise 'Executor is not running' unless @running @queue << [args, block] + ensure_worker_available end def shutdown @@ -32,20 +33,16 @@ def running? 
@running
end
private - def monitor_pool - Thread.new do - while @running - @pool.select!(&:alive?) - - @pool << spawn_worker if @queue.size > @pool.size && @pool.size < @max_threads - sleep(0.01) - end + def ensure_worker_available + @mutex.synchronize do + @pool.select!(&:alive?) + @pool << spawn_worker if @pool.size < @max_threads end end def spawn_worker Thread.new do - while (job = @queue.pop) + while (job = @queue.shift) break if job == :shutdown args, block = job From b92d3b35d32a94c1d5b64cbc3435e4747cfbe91a Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Thu, 18 Sep 2025 09:44:35 -0700 Subject: [PATCH 11/35] Update directory uploader --- .../lib/aws-sdk-s3/directory_uploader.rb | 118 +++++++++++++----- 1 file changed, 86 insertions(+), 32 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index 7efb8900552..e9499d3eddd 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -12,7 +12,7 @@ def initialize(message, errors = []) super(message) end - # @return [Array] The list of errors encountered when uploading objects + # @return [Array] The list of errors encountered when uploading files attr_reader :errors end @@ -20,7 +20,9 @@ def initialize(message, errors = []) class DirectoryUploader def initialize(options = {}) @client = options[:client] || Client.new - @executor = options[:executor] + @executor = options[:executor] || DefaultExecutor.new + @options = options + @mutex = Mutex.new end # @return [Client] @@ -35,17 +37,18 @@ def upload(source, bucket:, **options) @recursive = upload_opts.delete(:recursive) || false @follow_symlinks = upload_opts.delete(:follow_symlinks) || false @s3_prefix = upload_opts.delete(:s3_prefix) - @s3_delimiter = upload_opts.delete(:s3_delimiter) || '/' - @filter_callback = upload_opts.delete(:filter_callback) + # @filter_callback = upload_opts.delete(:filter_callback) # need to impl + @failure_policy = upload_opts.delete(:failure_policy) || :abort # need to add validation, available opts are :abort, :ignore uploader = FileUploader.new( multipart_threshold: upload_opts.delete(:multipart_threshold), client: @client, executor: @executor ) + + @abort_upload = false @upload_queue = SizedQueue.new(100) @errors = [] - @abort_upload = false Thread.new do if @recursive @@ -58,72 +61,123 @@ def upload(source, bucket:, **options) upload_attempts = 0 completion_queue = Queue.new - while (queue_file = @upload_queue.shift) != :done + queue_executor = DefaultExecutor.new + + while (file = @upload_queue.shift) != :done break if @abort_upload upload_attempts += 1 - @executor.post(queue_file) do |f| - - path = File.join(@source, f) - # TODO: key to consider s3_prefix and custom delimiter - uploader.upload(path, upload_opts.merge(bucket: @bucket, key: f)) + queue_executor.post(file) do |f| + uploader.upload(f[:full_path], upload_opts.merge(bucket: @bucket, key: f[:key])) rescue StandardError => e @errors << e - @abort_download = true if @failure_policy == :abort + @abort_upload = true if @failure_policy == :abort ensure completion_queue << :done end end - file_count.times { completion_queue.pop } + upload_attempts.times { completion_queue.pop } + queue_executor.shutdown if @abort_upload - msg = "failed to upload directory: attempt to upload #{upload_attempts} files " \ + msg = "failed to upload directory: uploaded #{upload_attempts - @errors.count} files " \ "but failed to upload #{@errors.count} files." 
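# on abort, no partial result is returned; the per-file errors travel
# on the raised DirectoryUploadError instead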
- raise DirectoryUploadError, msg + @errors.to_s + raise DirectoryUploadError.new(msg, @errors) else { - upload: upload_attempts - @errors.count, - errors: @errors.count + completed_uploads: upload_attempts - @errors.count, + failed_uploads: @errors.count } end + @executor.shutdown end private - # TODO: need to optimize & handle failures def stream_recursive_files - visited = Set.new - Find.find(@source) do |p| + if @follow_symlinks + puts 'streaming files with symlinks' + stream_with_symlinks + else + puts 'streaming files without symlinks' + stream_without_symlinks + end + end + + def stream_with_symlinks + source_prefix = "#{@source}/" + visited_dirs = Set.new + visited_files = Set.new + + Find.find(@source) do |path| break if @abort_upload - if !@follow_symlinks && File.symlink?(p) - Find.prune + stat = File.stat(path) + if stat.directory? + if visited_dirs.include?(stat.ino) + Find.prune + next + end + visited_dirs << stat.ino next end - absolute_path = File.realpath(p) - if visited.include?(absolute_path) + next if visited_files.include?(stat.ino) + + visited_files << stat.ino + key = path.gsub(source_prefix, '') + entry = { full_path: path } + entry[:key] = @s3_prefix ? File.join(@s3_prefix, key) : key + + puts "adding #{entry}" + @upload_queue << entry + rescue StandardError => e + @errors << e + @abort_upload = true if @failure_policy == :abort + end + end + + def stream_without_symlinks + source_prefix = "#{@source}/" + Find.find(@source) do |path| + break if @abort_upload + + stat = File.lstat(path) + if stat.symlink? Find.prune next end + next unless stat.file? - visited << absolute_path + key = path.gsub(source_prefix, '') + entry = { full_path: path } + entry[:key] = @s3_prefix ? File.join(@s3_prefix, key) : key - # TODO: if non-default s3_delimiter is used, validate here and fail - @upload_queue << p if File.file?(p) + puts "adding #{entry}" + @upload_queue << entry + rescue StandardError => e + @errors << e + @abort_upload = true if @failure_policy == :abort end end - # TODO: need to optimize & handle failures def stream_direct_files - Dir.each_child(@source) do |entry| + Dir.each_child(@source) do |key| break if @abort_upload - path = File.join(@source, entry) + path = File.join(@source, key) + next unless File.file?(path) + next if !@follow_symlinks && File.symlink?(path) - # TODO: if non-default s3_delimiter is used, validate here and fail - @upload_queue << entry if File.file?(path) + entry = { full_path: path } + entry[:key] = @s3_prefix ? 
File.join(@s3_prefix, key) : key + + puts "adding #{entry}" + @upload_queue << entry + rescue StandardError => e + @errors << e + @abort_upload = true if @failure_policy == :abort end end end From 6afb4952f794e458c9d05d50e05e59d5ff86c131 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Thu, 18 Sep 2025 14:14:31 -0700 Subject: [PATCH 12/35] Update directory uploader --- .../lib/aws-sdk-s3/directory_uploader.rb | 124 ++++++++---------- 1 file changed, 53 insertions(+), 71 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index e9499d3eddd..21b862cbc82 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -1,6 +1,5 @@ # frozen_string_literal: true -require 'find' require 'set' module Aws @@ -22,7 +21,6 @@ def initialize(options = {}) @client = options[:client] || Client.new @executor = options[:executor] || DefaultExecutor.new @options = options - @mutex = Mutex.new end # @return [Client] @@ -32,13 +30,13 @@ def upload(source, bucket:, **options) raise ArgumentError, 'Invalid directory' unless Dir.exist?(source) upload_opts = options.dup + bucket = bucket @source = source - @bucket = bucket + @s3_prefix = upload_opts.delete(:s3_prefix) @recursive = upload_opts.delete(:recursive) || false @follow_symlinks = upload_opts.delete(:follow_symlinks) || false - @s3_prefix = upload_opts.delete(:s3_prefix) - # @filter_callback = upload_opts.delete(:filter_callback) # need to impl - @failure_policy = upload_opts.delete(:failure_policy) || :abort # need to add validation, available opts are :abort, :ignore + @ignore_failure = upload_opts.delete(:failure_policy) || false + @filter_callback = upload_opts.delete(:filter_callback) uploader = FileUploader.new( multipart_threshold: upload_opts.delete(:multipart_threshold), @@ -54,7 +52,7 @@ def upload(source, bucket:, **options) if @recursive stream_recursive_files else - stream_direct_files + direct_traverse end @upload_queue << :done end @@ -68,16 +66,15 @@ def upload(source, bucket:, **options) upload_attempts += 1 queue_executor.post(file) do |f| - uploader.upload(f[:full_path], upload_opts.merge(bucket: @bucket, key: f[:key])) + uploader.upload(f[:path], upload_opts.merge(bucket: bucket, key: f[:key])) rescue StandardError => e @errors << e - @abort_upload = true if @failure_policy == :abort + @abort_upload = true unless @ignore_failure ensure completion_queue << :done end end upload_attempts.times { completion_queue.pop } - queue_executor.shutdown if @abort_upload msg = "failed to upload directory: uploaded #{upload_attempts - @errors.count} files " \ @@ -89,97 +86,82 @@ def upload(source, bucket:, **options) failed_uploads: @errors.count } end - @executor.shutdown + ensure + queue_executor.shutdown + @executor.shutdown unless @options[:executor] end private def stream_recursive_files if @follow_symlinks - puts 'streaming files with symlinks' - stream_with_symlinks + visited = Set.new + visited << File.stat(@source).ino + traverse_directory(@source, visited: visited) else - puts 'streaming files without symlinks' - stream_without_symlinks + traverse_directory(@source) end end - def stream_with_symlinks - source_prefix = "#{@source}/" - visited_dirs = Set.new - visited_files = Set.new + def traverse_directory(dir_path, relative_prefix: '', visited: nil) + return if @abort_upload - Find.find(@source) do |path| + Dir.each_child(dir_path) do |file| break if @abort_upload - stat = File.stat(path) - if 
stat.directory? - if visited_dirs.include?(stat.ino) - Find.prune - next - end - visited_dirs << stat.ino - next - end + full_path = File.join(dir_path, file) + next if @filter_callback&.call(full_path, file) + next if File.symlink?(full_path) && !@follow_symlinks - next if visited_files.include?(stat.ino) - - visited_files << stat.ino - key = path.gsub(source_prefix, '') - entry = { full_path: path } - entry[:key] = @s3_prefix ? File.join(@s3_prefix, key) : key - - puts "adding #{entry}" - @upload_queue << entry + if File.directory?(full_path) + process_directory(full_path, file, relative_prefix, visited) + elsif File.file?(full_path) || File.symlink?(full_path) + key = relative_prefix.empty? ? file : File.join(relative_prefix, file) + queue_file(full_path, key) + end rescue StandardError => e @errors << e - @abort_upload = true if @failure_policy == :abort + @abort_upload = true unless @ignore_failure end end - def stream_without_symlinks - source_prefix = "#{@source}/" - Find.find(@source) do |path| - break if @abort_upload - - stat = File.lstat(path) - if stat.symlink? - Find.prune - next - end - next unless stat.file? - - key = path.gsub(source_prefix, '') - entry = { full_path: path } - entry[:key] = @s3_prefix ? File.join(@s3_prefix, key) : key + def process_directory(path, file, relative_prefix, visited) + if @follow_symlinks && visited + stat = File.stat(path) + return if visited.include?(stat.ino) - puts "adding #{entry}" - @upload_queue << entry - rescue StandardError => e - @errors << e - @abort_upload = true if @failure_policy == :abort + visited << stat.ino end + new_prefix = relative_prefix.empty? ? file : File.join(relative_prefix, file) + traverse_directory(path, relative_prefix: new_prefix, visited: visited) end - def stream_direct_files - Dir.each_child(@source) do |key| + def direct_traverse + Dir.each_child(@source) do |file| break if @abort_upload - path = File.join(@source, key) - next unless File.file?(path) - - next if !@follow_symlinks && File.symlink?(path) + full_path = File.join(@source, file) + next if @filter_callback&.call(full_path, file) + next unless should_upload_file?(full_path) - entry = { full_path: path } - entry[:key] = @s3_prefix ? File.join(@s3_prefix, key) : key - - puts "adding #{entry}" - @upload_queue << entry + queue_file(full_path, file) rescue StandardError => e @errors << e - @abort_upload = true if @failure_policy == :abort + @abort_upload = true unless @ignore_failure end end + + def should_upload_file?(path) + return false if File.directory?(path) + return false if !@follow_symlinks && File.symlink?(path) + File.file?(path) || File.symlink?(path) + end + + def queue_file(path, key) + entry = { path: path } + entry[:key] = @s3_prefix ? 
File.join(@s3_prefix, key) : key + @upload_queue << entry + end end end end From 86b53e8014c148e427364a70e84d92085784ee5a Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Thu, 18 Sep 2025 14:19:23 -0700 Subject: [PATCH 13/35] Update uploader --- gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index 21b862cbc82..9e7e2344fac 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -30,7 +30,6 @@ def upload(source, bucket:, **options) raise ArgumentError, 'Invalid directory' unless Dir.exist?(source) upload_opts = options.dup - bucket = bucket @source = source @s3_prefix = upload_opts.delete(:s3_prefix) @recursive = upload_opts.delete(:recursive) || false @@ -152,8 +151,8 @@ def direct_traverse end def should_upload_file?(path) - return false if File.directory?(path) - return false if !@follow_symlinks && File.symlink?(path) + return false if File.directory?(path) || (!@follow_symlinks && File.symlink?(path)) + File.file?(path) || File.symlink?(path) end From eae3814d4516807f603fc094f126f60e4f6d3c70 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Thu, 18 Sep 2025 17:00:52 -0700 Subject: [PATCH 14/35] Add minor improvements to directory uploader --- .../lib/aws-sdk-s3/directory_uploader.rb | 107 +++++++++--------- 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index 9e7e2344fac..c9cd1a5fc0e 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -36,26 +36,16 @@ def upload(source, bucket:, **options) @follow_symlinks = upload_opts.delete(:follow_symlinks) || false @ignore_failure = upload_opts.delete(:failure_policy) || false @filter_callback = upload_opts.delete(:filter_callback) + @abort_upload = false + @upload_queue = SizedQueue.new(100) + @errors = [] uploader = FileUploader.new( multipart_threshold: upload_opts.delete(:multipart_threshold), client: @client, executor: @executor ) - - @abort_upload = false - @upload_queue = SizedQueue.new(100) - @errors = [] - - Thread.new do - if @recursive - stream_recursive_files - else - direct_traverse - end - @upload_queue << :done - end - + queue_files upload_attempts = 0 completion_queue = Queue.new queue_executor = DefaultExecutor.new @@ -74,25 +64,42 @@ def upload(source, bucket:, **options) end end upload_attempts.times { completion_queue.pop } + build_result(upload_attempts) + ensure + queue_executor.shutdown + @executor.shutdown unless @options[:executor] + end + + private + def build_result(upload_count) if @abort_upload - msg = "failed to upload directory: uploaded #{upload_attempts - @errors.count} files " \ + msg = "failed to upload directory: uploaded #{upload_count - @errors.count} files " \ "but failed to upload #{@errors.count} files." raise DirectoryUploadError.new(msg, @errors) else - { - completed_uploads: upload_attempts - @errors.count, - failed_uploads: @errors.count - } + result = { completed_uploads: upload_count - @errors.count, failed_uploads: @errors.count } + result[:errors] = @errors if @errors.any? 
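# errors can be present here only when failures are ignored, so they
# are attached for callers to inspect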
+ result end - ensure - queue_executor.shutdown - @executor.shutdown unless @options[:executor] end - private + def direct_traverse + Dir.each_child(@source) do |entry| + break if @abort_upload + + full_path = File.join(@source, entry) + next unless @filter_callback&.call(full_path, entry) + next unless valid_entry?(full_path) - def stream_recursive_files + queue_file(full_path, entry) + rescue StandardError => e + @errors << e + @abort_upload = true unless @ignore_failure + end + end + + def traverse_recursively if @follow_symlinks visited = Set.new visited << File.stat(@source).ino @@ -102,20 +109,20 @@ def stream_recursive_files end end - def traverse_directory(dir_path, relative_prefix: '', visited: nil) + def traverse_directory(dir_path, prefix: '', visited: nil) return if @abort_upload - Dir.each_child(dir_path) do |file| + Dir.each_child(dir_path) do |entry| break if @abort_upload - full_path = File.join(dir_path, file) - next if @filter_callback&.call(full_path, file) - next if File.symlink?(full_path) && !@follow_symlinks + full_path = File.join(dir_path, entry) + next unless @filter_callback&.call(full_path, entry) + next if !@follow_symlinks && File.symlink?(full_path) if File.directory?(full_path) - process_directory(full_path, file, relative_prefix, visited) + process_directory(full_path, entry, prefix, visited) elsif File.file?(full_path) || File.symlink?(full_path) - key = relative_prefix.empty? ? file : File.join(relative_prefix, file) + key = prefix.empty? ? entry : File.join(prefix, entry) queue_file(full_path, key) end rescue StandardError => e @@ -124,43 +131,39 @@ def traverse_directory(dir_path, relative_prefix: '', visited: nil) end end - def process_directory(path, file, relative_prefix, visited) + def process_directory(path, dir, prefix, visited) if @follow_symlinks && visited stat = File.stat(path) return if visited.include?(stat.ino) visited << stat.ino end - new_prefix = relative_prefix.empty? ? file : File.join(relative_prefix, file) - traverse_directory(path, relative_prefix: new_prefix, visited: visited) + new_prefix = prefix.empty? ? dir : File.join(prefix, dir) + traverse_directory(path, prefix: new_prefix, visited: visited) end - def direct_traverse - Dir.each_child(@source) do |file| - break if @abort_upload - - full_path = File.join(@source, file) - next if @filter_callback&.call(full_path, file) - next unless should_upload_file?(full_path) - - queue_file(full_path, file) - rescue StandardError => e - @errors << e - @abort_upload = true unless @ignore_failure + def queue_files + Thread.new do + if @recursive + traverse_recursively + else + direct_traverse + end + @upload_queue << :done end end - def should_upload_file?(path) - return false if File.directory?(path) || (!@follow_symlinks && File.symlink?(path)) - - File.file?(path) || File.symlink?(path) - end - def queue_file(path, key) entry = { path: path } entry[:key] = @s3_prefix ? 
File.join(@s3_prefix, key) : key @upload_queue << entry end + + def valid_entry?(path) + return false if File.directory?(path) || (!@follow_symlinks && File.symlink?(path)) + + File.file?(path) || File.symlink?(path) + end end end end From 8ab4edc39b972c7708ced95ad645de06a844517a Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 23 Sep 2025 11:20:05 -0700 Subject: [PATCH 15/35] Fix specs --- gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb | 2 +- gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb index 5774094ab6e..6c960cc12cd 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb @@ -43,7 +43,7 @@ def download(destination, options = {}) when 'get_range' raise ArgumentError, 'In get_range mode, :chunk_size must be provided' unless @chunk_size - resp = @client.head_object(@params) + resp = @client.head_object(params) multithreaded_get_by_ranges(resp.content_length, resp.etag, params) else raise ArgumentError, "Invalid mode #{@mode} provided, :mode should be single_request, get_range or auto" diff --git a/gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb b/gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb index 82e147986e3..d5a004b1834 100644 --- a/gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb +++ b/gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb @@ -85,7 +85,6 @@ module S3 end it 'reports progress for multipart uploads' do - allow(Thread).to receive(:new).and_yield.and_return(double(value: nil)) client.stub_responses(:create_multipart_upload, upload_id: 'id') client.stub_responses(:complete_multipart_upload) expect(client).to receive(:upload_part).exactly(24).times do |args| @@ -127,10 +126,6 @@ module S3 end it 'reports when it is unable to abort a failed multipart upload' do - allow(Thread).to receive(:new) do |_, &block| - double(value: block.call) - end - client.stub_responses( :upload_part, [ From face84d3cd762f76fa85f607e6e14880324c2832 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 23 Sep 2025 12:38:34 -0700 Subject: [PATCH 16/35] Minor updates to multipart file uploader --- .../lib/aws-sdk-s3/multipart_file_uploader.rb | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb index 38443f41d07..ae68cc00cc1 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb @@ -24,12 +24,12 @@ class MultipartFileUploader def initialize(options = {}) @client = options[:client] || Client.new @thread_count = options[:thread_count] || DEFAULT_THREAD_COUNT - @custom_executor = !options[:executor].nil? @executor = options[:executor] || DefaultExecutor.new(max_threads: @thread_count) + @options = options end # @return [Client] - attr_reader :client, :executor + attr_reader :client # @param [String, Pathname, File, Tempfile] source The file to upload. # @option options [required, String] :bucket The bucket to upload to. @@ -77,7 +77,7 @@ def upload_parts(upload_id, source, options) end def shutdown_executor - @executor.shutdown if @executor.running? && !@custom_executor + @executor.shutdown if @executor.running? && @options[:executor].nil? 
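# a caller-provided executor is owned by the caller; only the
# internally created DefaultExecutor is shut down here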
end def abort_upload(upload_id, options, errors) @@ -158,13 +158,7 @@ def upload_with_executor(pending, completed, options) upload_attempts += 1 @executor.post(part) do |p| - if progress - p[:on_chunk_sent] = - proc do |_chunk, bytes, _total| - progress.call(p[:part_number], bytes) - end - end - + update_progress(progress, p) if progress resp = @client.upload_part(p) p[:body].close completed_part = { etag: resp.etag, part_number: p[:part_number] } @@ -196,6 +190,13 @@ def part_size(total_size, part_size, offset) end end + def update_progress(progress, part) + part[:on_chunk_sent] = + proc do |_chunk, bytes, _total| + progress.call(part[:part_number], bytes) + end + end + # @api private class PartList def initialize(parts = []) From 36a1e8742fe03d70aba13df21cab9fc0498644cb Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Wed, 24 Sep 2025 09:33:14 -0700 Subject: [PATCH 17/35] Minor refactors --- .../lib/aws-sdk-s3/file_uploader.rb | 7 ++-- .../lib/aws-sdk-s3/multipart_file_uploader.rb | 38 +++++++++---------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb index 8de03e17f8e..50bf1279271 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb @@ -41,7 +41,6 @@ def upload(source, options = {}) else # remove multipart parameters not supported by put_object options.delete(:thread_count) - options.delete(:executor) put_object(source, options) end end @@ -49,9 +48,9 @@ def upload(source, options = {}) private - def open_file(source) - if String === source || Pathname === source - File.open(source, 'rb') { |file| yield(file) } + def open_file(source, &block) + if source.is_a?(String) || source.is_a?(Pathname) + File.open(source, 'rb', &block) else yield(source) end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb index ae68cc00cc1..0288aa4b253 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb @@ -22,10 +22,10 @@ class MultipartFileUploader # @option options [Client] :client # @option options [Integer] :thread_count (DEFAULT_THREAD_COUNT) def initialize(options = {}) + @options = options @client = options[:client] || Client.new @thread_count = options[:thread_count] || DEFAULT_THREAD_COUNT @executor = options[:executor] || DefaultExecutor.new(max_threads: @thread_count) - @options = options end # @return [Client] @@ -39,11 +39,12 @@ def initialize(options = {}) # It will be invoked with [bytes_read], [total_sizes] # @return [Seahorse::Client::Response] - the CompleteMultipartUploadResponse def upload(source, options = {}) - raise ArgumentError, 'unable to multipart upload files smaller than 5MB' if File.size(source) < MIN_PART_SIZE + file_size = File.size(source) + raise ArgumentError, 'unable to multipart upload files smaller than 5MB' if file_size < MIN_PART_SIZE upload_id = initiate_upload(options) - parts = upload_parts(upload_id, source, options) - complete_upload(upload_id, parts, source, options) + parts = upload_parts(upload_id, source, file_size, options) + complete_upload(upload_id, parts, file_size, options) shutdown_executor end @@ -53,21 +54,21 @@ def initiate_upload(options) @client.create_multipart_upload(create_opts(options)).upload_id end - def complete_upload(upload_id, parts, source, options) + def complete_upload(upload_id, parts, file_size, 
options) @client.complete_multipart_upload( **complete_opts(options).merge( upload_id: upload_id, multipart_upload: { parts: parts }, - mpu_object_size: File.size(source) + mpu_object_size: file_size ) ) rescue StandardError => e abort_upload(upload_id, options, [e]) end - def upload_parts(upload_id, source, options) + def upload_parts(upload_id, source, file_size, options) completed = PartList.new - pending = PartList.new(compute_parts(upload_id, source, options)) + pending = PartList.new(compute_parts(upload_id, source, file_size, options)) errors = upload_with_executor(pending, completed, options) if errors.empty? completed.to_a.sort_by { |part| part[:part_number] } @@ -94,17 +95,20 @@ def abort_upload(upload_id, options, errors) raise MultipartUploadError.new(msg, errors + [e]) end - def compute_parts(upload_id, source, options) - size = File.size(source) - default_part_size = compute_default_part_size(size) + def compute_parts(upload_id, source, file_size, options) + default_part_size = compute_default_part_size(file_size) offset = 0 part_number = 1 parts = [] - while offset < size + while offset < file_size parts << upload_part_opts(options).merge( upload_id: upload_id, part_number: part_number, - body: FilePart.new(source: source, offset: offset, size: part_size(size, default_part_size, offset)) + body: FilePart.new( + source: source, + offset: offset, + size: part_size(file_size, default_part_size, offset) + ) ) part_number += 1 offset += default_part_size @@ -123,17 +127,13 @@ def checksum_keys?(keys) def create_opts(options) opts = { checksum_algorithm: Aws::Plugins::ChecksumAlgorithm::DEFAULT_CHECKSUM } opts[:checksum_type] = 'FULL_OBJECT' if checksum_keys?(options.keys) - CREATE_OPTIONS.each_with_object(opts) do |key, hash| - hash[key] = options[key] if options.key?(key) - end + CREATE_OPTIONS.each_with_object(opts) { |k, h| h[k] = options[k] if options.key?(k) } end def complete_opts(options) opts = {} opts[:checksum_type] = 'FULL_OBJECT' if checksum_keys?(options.keys) - COMPLETE_OPTIONS.each_with_object(opts) do |key, hash| - hash[key] = options[key] if options.key?(key) - end + COMPLETE_OPTIONS.each_with_object(opts) { |k, h| h[k] = options[k] if options.key?(k) } end def upload_part_opts(options) From 7dd9f984ca877a6a3ea7a5b2c3c086f9a33410b4 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Wed, 24 Sep 2025 09:55:43 -0700 Subject: [PATCH 18/35] Fix options --- gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index c9cd1a5fc0e..619153e7aaa 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -34,7 +34,7 @@ def upload(source, bucket:, **options) @s3_prefix = upload_opts.delete(:s3_prefix) @recursive = upload_opts.delete(:recursive) || false @follow_symlinks = upload_opts.delete(:follow_symlinks) || false - @ignore_failure = upload_opts.delete(:failure_policy) || false + @ignore_failure = upload_opts.delete(:ignore_failure) || false @filter_callback = upload_opts.delete(:filter_callback) @abort_upload = false @upload_queue = SizedQueue.new(100) From 77ab1bac4456c36f15bfec25f908f93737209d25 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Wed, 24 Sep 2025 15:49:21 -0700 Subject: [PATCH 19/35] Refactor DirectoryUploader --- .../lib/aws-sdk-s3/directory_uploader.rb | 216 +++++++++++------- 1 file changed, 131 insertions(+), 85 
deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index 619153e7aaa..d7d87a58311 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -21,23 +21,21 @@ def initialize(options = {}) @client = options[:client] || Client.new @executor = options[:executor] || DefaultExecutor.new @options = options + @abort_upload = false end # @return [Client] attr_reader :client + # @return [Boolean] + attr_accessor :abort_upload + + # TODO: Need to add progress tracker def upload(source, bucket:, **options) raise ArgumentError, 'Invalid directory' unless Dir.exist?(source) upload_opts = options.dup - @source = source - @s3_prefix = upload_opts.delete(:s3_prefix) - @recursive = upload_opts.delete(:recursive) || false - @follow_symlinks = upload_opts.delete(:follow_symlinks) || false @ignore_failure = upload_opts.delete(:ignore_failure) || false - @filter_callback = upload_opts.delete(:filter_callback) - @abort_upload = false - @upload_queue = SizedQueue.new(100) @errors = [] uploader = FileUploader.new( @@ -45,17 +43,52 @@ def upload(source, bucket:, **options) client: @client, executor: @executor ) - queue_files + producer = FileProducer.new(source, build_producer_opts(upload_opts)) + producer.run + uploads = process_upload_queue(producer, uploader, upload_opts.merge(bucket: bucket)) + build_result(uploads) + ensure + @executor.shutdown unless @options[:executor] + end + + private + + def build_producer_opts(opts) + { + directory_uploader: self, + s3_prefix: opts.delete(:s3_prefix), + recursive: opts.delete(:recursive), + follow_symlinks: opts.delete(:follow_symlinks), + filter_callback: opts.delete(:filter_callback), + ignore_failure: @ignore_failure, + errors: @errors + } + end + + def build_result(upload_count) + uploads = [upload_count - @errors.count, 0].max + + if @abort_upload + msg = "failed to upload directory: uploaded #{uploads} files " \ + "and failed to upload #{@errors.count} files." + raise DirectoryUploadError.new(msg, @errors) + else + result = { completed_uploads: uploads, failed_uploads: @errors.count } + result[:errors] = @errors if @errors.any? + result + end + end + + def process_upload_queue(producer, uploader, opts) upload_attempts = 0 completion_queue = Queue.new queue_executor = DefaultExecutor.new - - while (file = @upload_queue.shift) != :done + while (file = producer.file_queue.shift) != :done break if @abort_upload upload_attempts += 1 queue_executor.post(file) do |f| - uploader.upload(f[:path], upload_opts.merge(bucket: bucket, key: f[:key])) + uploader.upload(f[:path], opts.merge(key: f[:key])) rescue StandardError => e @errors << e @abort_upload = true unless @ignore_failure @@ -64,105 +97,118 @@ def upload(source, bucket:, **options) end end upload_attempts.times { completion_queue.pop } - build_result(upload_attempts) + upload_attempts ensure queue_executor.shutdown - @executor.shutdown unless @options[:executor] end - private - def build_result(upload_count) - if @abort_upload - msg = "failed to upload directory: uploaded #{upload_count - @errors.count} files " \ - "but failed to upload #{@errors.count} files." - raise DirectoryUploadError.new(msg, @errors) - else - result = { completed_uploads: upload_count - @errors.count, failed_uploads: @errors.count } - result[:errors] = @errors if @errors.any? 
- result + # @api private + class FileProducer + def initialize(source_dir, options = {}) + @source_dir = source_dir + @s3_prefix = options[:s3_prefix] + @recursive = options[:recursive] || false + @follow_symlinks = options[:follow_symlinks] || false + @ignore_failure = options[:ignore_failure] + @filter_callback = options[:filter_callback] + @errors = options[:errors] + @directory_uploader = options[:directory_uploader] + @file_queue = SizedQueue.new(100) end - end - def direct_traverse - Dir.each_child(@source) do |entry| - break if @abort_upload - - full_path = File.join(@source, entry) - next unless @filter_callback&.call(full_path, entry) - next unless valid_entry?(full_path) + attr_accessor :file_queue - queue_file(full_path, entry) - rescue StandardError => e - @errors << e - @abort_upload = true unless @ignore_failure + def run + Thread.new do + if @recursive + find_recursively + else + find_directly + end + @file_queue << :done + end end - end - def traverse_recursively - if @follow_symlinks - visited = Set.new - visited << File.stat(@source).ino - traverse_directory(@source, visited: visited) - else - traverse_directory(@source) + private + + def build_file_entry(file_path, key) + normalized_key = @s3_prefix ? File.join(@s3_prefix, key) : key + { path: file_path, key: normalized_key } end - end - def traverse_directory(dir_path, prefix: '', visited: nil) - return if @abort_upload + def find_directly + Dir.each_child(@source_dir) do |entry| + break if @directory_uploader.abort_upload - Dir.each_child(dir_path) do |entry| - break if @abort_upload + entry_path = File.join(@source_dir, entry) + next if File.directory?(entry_path) || skip_symlink?(entry_path) + next unless include_file?(entry_path, entry) + next unless valid_file_type?(entry_path) - full_path = File.join(dir_path, entry) - next unless @filter_callback&.call(full_path, entry) - next if !@follow_symlinks && File.symlink?(full_path) + @file_queue << build_file_entry(entry_path, entry) + rescue StandardError => e + @errors << e + @directory_uploader.abort_upload = true unless @ignore_failure + end + end - if File.directory?(full_path) - process_directory(full_path, entry, prefix, visited) - elsif File.file?(full_path) || File.symlink?(full_path) - key = prefix.empty? ? entry : File.join(prefix, entry) - queue_file(full_path, key) + def find_recursively + if @follow_symlinks + visited = Set.new + visited << File.stat(@source_dir).ino + scan_directory(@source_dir, visited: visited) + else + scan_directory(@source_dir) end - rescue StandardError => e - @errors << e - @abort_upload = true unless @ignore_failure end - end - def process_directory(path, dir, prefix, visited) - if @follow_symlinks && visited - stat = File.stat(path) - return if visited.include?(stat.ino) + def valid_file_type?(path) + File.file?(path) || File.symlink?(path) + end + + def skip_symlink?(path) + !@follow_symlinks && File.symlink?(path) + end + + def include_file?(file_path, file_name) + return true unless @filter_callback - visited << stat.ino + @filter_callback.call(file_path, file_name) end - new_prefix = prefix.empty? ? 
dir : File.join(prefix, dir) - traverse_directory(path, prefix: new_prefix, visited: visited) - end - def queue_files - Thread.new do - if @recursive - traverse_recursively - else - direct_traverse + def scan_directory(dir_path, key_prefix: '', visited: nil) + return if @directory_uploader.abort_upload + + Dir.each_child(dir_path) do |entry| + break if @directory_uploader.abort_upload + + full_path = File.join(dir_path, entry) + next unless include_file?(full_path, entry) + next if !@follow_symlinks && File.symlink?(full_path) + + if File.directory?(full_path) + handle_directory(full_path, entry, key_prefix, visited) + elsif valid_file_type?(full_path) + key = key_prefix.empty? ? entry : File.join(key_prefix, entry) + @file_queue << build_file_entry(full_path, key) + end + rescue StandardError => e + @errors << e + @directory_uploader.abort_upload = true unless @ignore_failure end - @upload_queue << :done end - end - def queue_file(path, key) - entry = { path: path } - entry[:key] = @s3_prefix ? File.join(@s3_prefix, key) : key - @upload_queue << entry - end + def handle_directory(dir_path, dir_name, key_prefix, visited) + if @follow_symlinks && visited + stat = File.stat(dir_path) + return if visited.include?(stat.ino) - def valid_entry?(path) - return false if File.directory?(path) || (!@follow_symlinks && File.symlink?(path)) - - File.file?(path) || File.symlink?(path) + visited << stat.ino + end + new_prefix = key_prefix.empty? ? dir_name : File.join(key_prefix, dir_name) + scan_directory(dir_path, key_prefix: new_prefix, visited: visited) + end end end end From 009127ded406cb414b7aa368b50aa480d44c4c81 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Wed, 24 Sep 2025 16:21:36 -0700 Subject: [PATCH 20/35] Update multipartfileuploader --- .../aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb index 0288aa4b253..b8a1b3e1631 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb @@ -45,7 +45,8 @@ def upload(source, options = {}) upload_id = initiate_upload(options) parts = upload_parts(upload_id, source, file_size, options) complete_upload(upload_id, parts, file_size, options) - shutdown_executor + ensure + @executor.shutdown if @executor.running? && @options[:executor].nil? end private @@ -77,19 +78,13 @@ def upload_parts(upload_id, source, file_size, options) end end - def shutdown_executor - @executor.shutdown if @executor.running? && @options[:executor].nil? - end - def abort_upload(upload_id, options, errors) @client.abort_multipart_upload(bucket: options[:bucket], key: options[:key], upload_id: upload_id) msg = "multipart upload failed: #{errors.map(&:message).join('; ')}" raise MultipartUploadError.new(msg, errors) rescue MultipartUploadError => e - shutdown_executor raise e rescue StandardError => e - shutdown_executor msg = "failed to abort multipart upload: #{e.message}. 
"\ "Multipart upload failed: #{errors.map(&:message).join('; ')}" raise MultipartUploadError.new(msg, errors + [e]) From 39912fd5fdbb37903f904717ae0518a5bff3e9ad Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Thu, 25 Sep 2025 08:26:34 -0700 Subject: [PATCH 21/35] Refactor FileDownloader --- .../lib/aws-sdk-s3/file_downloader.rb | 235 +++++++++--------- .../lib/aws-sdk-s3/multipart_file_uploader.rb | 2 +- gems/aws-sdk-s3/spec/file_downloader_spec.rb | 6 - 3 files changed, 122 insertions(+), 121 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb index 6c960cc12cd..13f07bc538a 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb @@ -8,192 +8,199 @@ module Aws module S3 # @api private class FileDownloader - MIN_CHUNK_SIZE = 5 * 1024 * 1024 MAX_PARTS = 10_000 + DEFAULT_THREAD_COUNT = 10 def initialize(options = {}) @client = options[:client] || Client.new - @executor = options[:executor] + @thread_count = options[:thread_count] || DEFAULT_THREAD_COUNT + @executor = options[:executor] || DefaultExecutor.new(max_threads: @thread_count) + @options = options end # @return [Client] attr_reader :client def download(destination, options = {}) - valid_types = [String, Pathname, File, Tempfile] - unless valid_types.include?(destination.class) - raise ArgumentError, "Invalid destination, expected #{valid_types.join(', ')} but got: #{destination.class}" - end - - @temp_path = nil - @destination = destination - @mode = options.delete(:mode) || 'auto' - @thread_count = options.delete(:thread_count) || 10 - @chunk_size = options.delete(:chunk_size) - @on_checksum_validated = options.delete(:on_checksum_validated) - @progress_callback = options.delete(:progress_callback) - params = options - validate! + validate_destination!(destination) + opts = build_download_opts(destination, options.dup) + validate_opts!(opts) Aws::Plugins::UserAgent.metric('S3_TRANSFER') do - case @mode - when 'auto' then multipart_download(params) - when 'single_request' then single_request(params) + case opts[:mode] + when 'auto' then multipart_download(destination, opts) + when 'single_request' then single_request(destination, opts) when 'get_range' - raise ArgumentError, 'In get_range mode, :chunk_size must be provided' unless @chunk_size - - resp = @client.head_object(params) - multithreaded_get_by_ranges(resp.content_length, resp.etag, params) - else - raise ArgumentError, "Invalid mode #{@mode} provided, :mode should be single_request, get_range or auto" + resp = @client.head_object(opts[:params]) + set_temp_path(opts) + multithreaded_get_by_ranges(resp.content_length, resp.etag, opts) end end - File.rename(@temp_path, @destination) if @temp_path + File.rename(opts[:temp_path], destination) if opts[:temp_path] ensure - File.delete(@temp_path) if @temp_path && File.exist?(@temp_path) + @executor.shutdown if @executor.running? && @options[:executor].nil? + cleanup_temp_file!(opts) end private - def validate! 
- return unless @on_checksum_validated && !@on_checksum_validated.respond_to?(:call) + def build_download_opts(destination, opts) + { + destination: destination, + mode: opts.delete(:mode) || 'auto', + chunk_size: opts.delete(:chunk_size), + on_checksum_validated: opts.delete(:on_checksum_validated), + progress_callback: opts.delete(:progress_callback), + params: opts, + temp_path: nil + } + end + + def cleanup_temp_file!(opts) + return unless opts + + temp_file = opts[:temp_path] + File.delete(temp_file) if temp_file && File.exist?(temp_file) + end + + def set_temp_path(opts) + return if [File, Tempfile].include?(opts[:destination].class) - raise ArgumentError, ':on_checksum_validated must be callable' + opts[:temp_path] ||= "#{opts[:destination]}.s3tmp.#{SecureRandom.alphanumeric(8)}" end - def multipart_download(params) + def validate_destination!(destination) + valid_types = [String, Pathname, File, Tempfile] + return if valid_types.include?(destination.class) + + raise ArgumentError, "Invalid destination, expected #{valid_types.join(', ')} but got: #{destination.class}" + end + + def validate_opts!(opts) + if opts[:on_checksum_validated] && !opts[:on_checksum_validated].respond_to?(:call) + raise ArgumentError, ':on_checksum_validated must be callable' + end + + valid_modes = %w[auto get_range single_request] + unless valid_modes.include?(opts[:mode]) + msg = "Invalid mode #{opts[:mode]} provided, :mode should be single_request, get_range or auto" + raise ArgumentError, msg + end + + if opts[:mode] == 'get_range' && opts[:chunk_size].nil? + raise ArgumentError, 'In get_range mode, :chunk_size must be provided' + end + + if opts[:chunk_size] && opts[:chunk_size] <= 0 + raise ArgumentError, ':chunk_size must be positive' + end + end + + def multipart_download(destination, opts) + params = opts[:params] resp = @client.head_object(params.merge(part_number: 1)) count = resp.parts_count if count.nil? || count <= 1 if resp.content_length <= MIN_CHUNK_SIZE - single_request(params) + single_request(destination, opts) else - multithreaded_get_by_ranges(resp.content_length, resp.etag, params) + set_temp_path(opts) + multithreaded_get_by_ranges(resp.content_length, resp.etag, opts) end else # covers cases when given object is not uploaded via UploadPart API resp = @client.head_object(params) # partNumber is an option if resp.content_length <= MIN_CHUNK_SIZE - single_request(params) + single_request(destination, opts) else - compute_mode(resp.content_length, count, resp.etag, params) + compute_mode(resp.content_length, count, resp.etag, opts) end end end - def compute_mode(file_size, count, etag, params) - chunk_size = compute_chunk(file_size) + def compute_mode(file_size, count, etag, opts) + chunk_size = compute_chunk(opts[:chunk_size], file_size) part_size = (file_size.to_f / count).ceil + + set_temp_path(opts) if chunk_size < part_size - multithreaded_get_by_ranges(file_size, etag, params) + multithreaded_get_by_ranges(file_size, etag, opts) else - multithreaded_get_by_parts(count, file_size, etag, params) + multithreaded_get_by_parts(count, file_size, etag, opts) end end - def compute_chunk(file_size) - raise ArgumentError, ":chunk_size shouldn't exceed total file size." if @chunk_size && @chunk_size > file_size + def compute_chunk(chunk_size, file_size) + raise ArgumentError, ":chunk_size shouldn't exceed total file size." 
if chunk_size && chunk_size > file_size - @chunk_size || [(file_size.to_f / MAX_PARTS).ceil, MIN_CHUNK_SIZE].max.to_i + chunk_size || [(file_size.to_f / MAX_PARTS).ceil, MIN_CHUNK_SIZE].max.to_i end - def multithreaded_get_by_ranges(file_size, etag, params) + def multithreaded_get_by_ranges(file_size, etag, opts) offset = 0 - default_chunk_size = compute_chunk(file_size) + default_chunk_size = compute_chunk(opts[:chunk_size], file_size) chunks = [] part_number = 1 # parts start at 1 while offset < file_size progress = offset + default_chunk_size progress = file_size if progress > file_size - params = params.merge(range: "bytes=#{offset}-#{progress - 1}", if_match: etag) + params = opts[:params].merge(range: "bytes=#{offset}-#{progress - 1}", if_match: etag) chunks << Part.new(part_number: part_number, size: (progress - offset), params: params) part_number += 1 offset = progress end - if @executor - download_with_executor(PartList.new(chunks)) - else - download_in_threads(PartList.new(chunks), file_size) - end + download_with_executor(PartList.new(chunks), file_size, opts) end - def multithreaded_get_by_parts(n_parts, total_size, etag, params) + def multithreaded_get_by_parts(n_parts, total_size, etag, opts) parts = (1..n_parts).map do |part| - Part.new(part_number: part, params: params.merge(part_number: part, if_match: etag)) - end - if @executor - download_with_executor(PartList.new(parts)) - else - download_in_threads(PartList.new(parts), total_size) + params = opts[:params].merge(part_number: part, if_match: etag) + Part.new(part_number: part, params: params) end + download_with_executor(PartList.new(parts), total_size, opts) end - def download_with_executor(pending) - max_parts = pending.size + def download_with_executor(pending, total_size, opts) + download_attempts = 0 completion_queue = Queue.new - @abort_download = false - unless [File, Tempfile].include?(@destination.class) - @temp_path = "#{@destination}.s3tmp.#{SecureRandom.alphanumeric(8)}" - end + abort_download = false + error = nil + progress = + if (progress_callback = opts[:progress_callback]) + MultipartProgress.new(pending, total_size, progress_callback) + end while (part = pending.shift) - break if @abort_download + break if abort_download + download_attempts += 1 @executor.post(part) do |p| + if progress + p.params[:on_chunk_received] = + proc do |_chunk, bytes, total| + progress.call(p.part_number, bytes, total) + end + end resp = @client.get_object(p.params) range = extract_range(resp.content_range) validate_range(range, p.params[:range]) if p.params[:range] - write(resp.body, range) + write(resp.body, range, opts) - if @on_checksum_validated && resp.checksum_validated - @on_checksum_validated.call(resp.checksum_validated, resp) + if opts[:on_checksum_validated] && resp.checksum_validated + opts[:on_checksum_validated].call(resp.checksum_validated, resp) end rescue StandardError => e - @abort_download = true - raise e + abort_download = true + error = e ensure completion_queue << :done end end - max_parts.times { completion_queue.pop } - end - - def download_in_threads(pending, total_size) - threads = [] - progress = MultipartProgress.new(pending, total_size, @progress_callback) if @progress_callback - unless [File, Tempfile].include?(@destination.class) - @temp_path = "#{@destination}.s3tmp.#{SecureRandom.alphanumeric(8)}" - end - @thread_count.times do - thread = Thread.new do - begin - while (part = pending.shift) - if progress - part.params[:on_chunk_received] = - proc do |_chunk, bytes, total| - 
progress.call(part.part_number, bytes, total) - end - end - resp = @client.get_object(part.params) - range = extract_range(resp.content_range) - validate_range(range, part.params[:range]) if part.params[:range] - write(resp.body, range) - if @on_checksum_validated && resp.checksum_validated - @on_checksum_validated.call(resp.checksum_validated, resp) - end - end - nil - rescue StandardError => e - pending.clear! # keep other threads from downloading other parts - raise e - end - end - threads << thread - end - threads.map(&:value).compact + download_attempts.times { completion_queue.pop } + raise error unless error.nil? end def extract_range(value) @@ -206,24 +213,24 @@ def validate_range(actual, expected) raise MultipartDownloadError, "multipart download failed: expected range of #{expected} but got #{actual}" end - def write(body, range) - path = @temp_path || @destination + def write(body, range, opts) + path = opts[:temp_path] || opts[:destination] File.write(path, body.read, range.split('-').first.to_i) end - def single_request(params) - params = params.merge(response_target: @destination) - params[:on_chunk_received] = single_part_progress if @progress_callback + def single_request(destination, opts) + params = opts[:params].merge(response_target: destination) + params[:on_chunk_received] = single_part_progress(opts) if opts[:progress_callback] resp = @client.get_object(params) - return resp unless @on_checksum_validated + return resp unless opts[:on_checksum_validated] - @on_checksum_validated.call(resp.checksum_validated, resp) if resp.checksum_validated + opts[:on_checksum_validated].call(resp.checksum_validated, resp) if resp.checksum_validated resp end - def single_part_progress + def single_part_progress(opts) proc do |_chunk, bytes_read, total_size| - @progress_callback.call([bytes_read], [total_size], total_size) + opts[:progress_callback].call([bytes_read], [total_size], total_size) end end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb index b8a1b3e1631..243ddbe7848 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb @@ -22,10 +22,10 @@ class MultipartFileUploader # @option options [Client] :client # @option options [Integer] :thread_count (DEFAULT_THREAD_COUNT) def initialize(options = {}) - @options = options @client = options[:client] || Client.new @thread_count = options[:thread_count] || DEFAULT_THREAD_COUNT @executor = options[:executor] || DefaultExecutor.new(max_threads: @thread_count) + @options = options end # @return [Client] diff --git a/gems/aws-sdk-s3/spec/file_downloader_spec.rb b/gems/aws-sdk-s3/spec/file_downloader_spec.rb index 0f3dd8fb2cf..d638e0dff37 100644 --- a/gems/aws-sdk-s3/spec/file_downloader_spec.rb +++ b/gems/aws-sdk-s3/spec/file_downloader_spec.rb @@ -198,7 +198,6 @@ module S3 it 'raises when checksum validation fails on multipart object' do client.stub_responses(:get_object, { body: 'body', checksum_sha1: 'invalid' }) - expect(Thread).to receive(:new).and_yield.and_return(double(value: nil)) expect { subject.download(path, parts_params) }.to raise_error(Aws::Errors::ChecksumError) end @@ -208,7 +207,6 @@ module S3 expect(ctx.params[:if_match]).to eq('test-etag') 'PreconditionFailed' }) - expect(Thread).to receive(:new).and_yield.and_return(double(value: nil)) expect { subject.download(path, range_params.merge(chunk_size: one_meg, mode: 'get_range')) } .to 
raise_error(Aws::S3::Errors::PreconditionFailed)
       end

@@ -219,8 +217,6 @@ module S3
           expect(ctx.params[:if_match]).to eq('test-etag')
           'PreconditionFailed'
         })
-
-        expect(Thread).to receive(:new).and_yield.and_return(double(value: nil))
         expect { subject.download(path, parts_params) }.to raise_error(Aws::S3::Errors::PreconditionFailed)
       end

@@ -246,7 +242,6 @@ module S3

       it 'raises when range validation fails' do
         client.stub_responses(:get_object, { body: 'body', content_range: 'bytes 0-3/4' })
-        expect(Thread).to receive(:new).and_yield.and_return(double(value: nil))
         expect { subject.download(path, range_params.merge(mode: 'get_range', chunk_size: one_meg)) }
           .to raise_error(Aws::S3::MultipartDownloadError)
       end
@@ -263,7 +258,6 @@ module S3
           responses[context.params[:range]]
         })

-        expect(Thread).to receive(:new).and_yield.and_return(double(value: nil))
        expect { subject.download(path, range_params.merge(chunk_size: 5 * one_meg, mode: 'get_range')) }
          .to raise_error(Aws::S3::MultipartDownloadError)
        expect(File.exist?(path)).to be(true)

From f9fb117c87049f9ef18d160354fed7ca2778a186 Mon Sep 17 00:00:00 2001
From: Juli Tera
Date: Thu, 25 Sep 2025 09:11:41 -0700
Subject: [PATCH 22/35] Implement Directory Downloader

---
 .../lib/aws-sdk-s3/directory_downloader.rb | 138 ++++++++++++------
 1 file changed, 94 insertions(+), 44 deletions(-)

diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb
index e323d7a1aa7..86d849f72fd 100644
--- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb
+++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb
@@ -17,11 +17,14 @@ def initialize(message, errors = [])
     class DirectoryDownloader
       def initialize(options = {})
         @client = options[:client] || Client.new
-        @executor = options[:executor]
+        @executor = options[:executor] || DefaultExecutor.new
+        @options = options
+        @abort_download = false
       end

-      attr_reader :client, :executor
+      attr_reader :client, :abort_download

+      # TODO: need to add progress tracker
       def download(destination, bucket:, **options)
         if File.exist?(destination)
           raise ArgumentError, 'invalid destination, expected a directory' unless File.directory?(destination)
         else
           FileUtils.mkdir_p(destination)
         end

         download_opts = options.dup
-        @destination = destination
         @bucket = bucket
-        @recursive = download_opts.delete(:recursive) || false
-        @s3_prefix = download_opts.delete(:s3_prefix)
-        @s3_delimiter = download_opts.delete(:s3_delimiter) || '/'
-        @failure_policy = download_opts.delete(:failure_policy) || :abort
+        @ignore_failure = download_opts.delete(:ignore_failure) || false
+        @errors = []

         downloader = FileDownloader.new(client: client, executor: @executor)
-        @download_queue = SizedQueue.new(100)
-        @abort_download = false
-        @errors = []
+        producer = ObjectProducer.new(destination, build_producer_opts(download_opts))
+        producer.run
+
+        downloads = process_download_queue(producer, downloader, download_opts)
+        build_result(downloads)
+      ensure
+        @executor.shutdown unless @options[:executor]
+      end
+
+      def build_producer_opts(opts)
+        {
+          directory_downloader: self,
+          client: @client,
+          bucket: @bucket,
+          s3_prefix: opts.delete(:s3_prefix),
+          ignore_failure: @ignore_failure,
+          filter_callback: opts.delete(:filter_callback),
+          errors: @errors
+        }
+      end

-        Thread.new do
-          stream_keys
-          @download_queue << :done
+      def build_result(download_count)
+        downloads = [download_count - @errors.count, 0].max
+
+        if @abort_download
+          msg = "failed to download directory: downloaded
#{downloads} files " \ + "and failed to download #{@errors.count} files." + raise DirectoryDownloadError.new(msg, @errors) + else + result = { completed_downloads: downloads, failed_downloads: @errors.count } + result[:errors] = @errors if @errors.any? + result end + end + def process_download_queue(producer, downloader, opts) download_attempts = 0 completion_queue = Queue.new - while (queue_key = @download_queue.shift) != :done + queue_executor = DefaultExecutor.new + while (object = producer.object_queue.shift) != :done break if @abort_download download_attempts += 1 - @executor.post(queue_key) do |k| - normalized_key = normalize_key(k) - full_path = File.join(@destination, normalized_key) - dir_path = File.dirname(full_path) + queue_executor.post(object) do |o| + dir_path = File.dirname(o[:path]) FileUtils.mkdir_p(dir_path) unless dir_path == @destination || Dir.exist?(dir_path) - downloader.download(full_path, download_opts.merge(bucket: @bucket, key: k)) + downloader.download(o[:path], opts.merge(bucket: @bucket, key: o[:key])) rescue StandardError => e @errors << e - @abort_download = true if @failure_policy == :abort + @abort_download = true unless @ignore_failure ensure completion_queue << :done end end - download_attempts.times { completion_queue.pop } + download_attempts + ensure + queue_executor.shutdown + end - if @abort_download - msg = "failed to download directory: attempt to download #{download_attempts} objects " \ - "but failed to download #{@errors.count} objects." - raise DirectoryDownloadError, msg + @errors.to_s - else - { - downloaded: download_attempts - @errors.count, - errors: @errors.count - } + # @api private + class ObjectProducer + def initialize(destination_dir, options = {}) + @destination_dir = destination_dir + @client = options[:client] + @bucket = options[:bucket] + @s3_prefix = options[:s3_prefix] + @ignore_failure = options[:ignore_failure] + @filter_callback = options[:filter_callback] + @errors = options[:errors] + @directory_downloader = options[:directory_downloader] + @object_queue = SizedQueue.new(100) end - end - def normalize_key(key) - key = key.delete_prefix(@s3_prefix) if @s3_prefix - return key.tr('/', @s3_delimiter) if @s3_delimiter != '/' - return key if File::SEPARATOR == '/' + attr_reader :object_queue - key.tr('/', File::SEPARATOR) - end + def run + Thread.new do + stream_objects + @object_queue << :done + end + end - def stream_keys(continuation_token: nil) - resp = @client.list_objects_v2(bucket: @bucket, continuation_token: continuation_token) - resp.contents.each do |o| - break if @abort_download - next if o.key.end_with?('/') + private + + def build_object_entry(key) + { path: File.join(@destination_dir, normalize_key(key)), key: key } + end + + # TODO: need to add filter callback, double check handling of objects that ends with / + def stream_objects(continuation_token: nil) + resp = @client.list_objects_v2(bucket: @bucket, continuation_token: continuation_token) + resp.contents.each do |o| + break if @directory_downloader.abort_download + next if o.key.end_with?('/') + + @object_queue << build_object_entry(o.key) + rescue StandardError => e + @errors << e + @abort_download = true unless @ignore_failure + end + stream_objects(continuation_token: resp.next_continuation_token) if resp.next_continuation_token + end + + def normalize_key(key) + key = key.delete_prefix(@s3_prefix) if @s3_prefix + return key if File::SEPARATOR == '/' - @download_queue << o.key + key.tr('/', File::SEPARATOR) end - stream_keys(continuation_token: 
resp.next_continuation_token) if resp.next_continuation_token
        end
      end
    end
  end

From d307555eb52ce3670640d25501313b5c730a6fe3 Mon Sep 17 00:00:00 2001
From: Juli Tera
Date: Thu, 25 Sep 2025 09:16:40 -0700
Subject: [PATCH 23/35] Add TODO

---
 gems/aws-sdk-s3/spec/file_downloader_spec.rb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gems/aws-sdk-s3/spec/file_downloader_spec.rb b/gems/aws-sdk-s3/spec/file_downloader_spec.rb
index d638e0dff37..97850069b2d 100644
--- a/gems/aws-sdk-s3/spec/file_downloader_spec.rb
+++ b/gems/aws-sdk-s3/spec/file_downloader_spec.rb
@@ -48,6 +48,7 @@ module S3
         subject.download(path, single_params)
       end

+      # TODO: flaky only in JRuby? attempt to fix it
       it 'downloads a large object in parts' do
         parts = 0
         client.stub_responses(:get_object, lambda do |_ctx|

From b9231e777ea5a63350edad400d8521e4d16f33b2 Mon Sep 17 00:00:00 2001
From: Juli Tera
Date: Mon, 29 Sep 2025 09:08:25 -0700
Subject: [PATCH 24/35] Feedback - update default executor

---
 .../lib/aws-sdk-s3/default_executor.rb | 51 ++++++++++++++++---
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb
index b99c822497c..38158b343d7 100644
--- a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb
+++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb
@@ -4,37 +4,74 @@ module Aws
   module S3
     # @api private
     class DefaultExecutor
+      RUNNING = :running
+      SHUTTING_DOWN = :shutting_down
+      SHUTDOWN = :shutdown
+
       def initialize(options = {})
-        @queue = Queue.new
         @max_threads = options[:max_threads] || 10
+        @state = RUNNING
+        @queue = Queue.new
         @pool = []
-        @running = true
         @mutex = Mutex.new
       end

       def post(*args, &block)
-        raise 'Executor is not running' unless @running
+        raise 'Executor is not accepting new tasks' unless @state == RUNNING

         @queue << [args, block]
         ensure_worker_available
       end

-      def shutdown
-        @running = false
+      def kill
+        @mutex.synchronize do
+          @state = SHUTDOWN
+          @pool.each(&:kill)
+          @pool.clear
+          @queue.clear
+        end
+        true
+      end
+
+      def shutdown(timeout = nil)
+        @mutex.synchronize do
+          return true if @state == SHUTDOWN
+
+          @state = SHUTTING_DOWN
+        end
+
         @max_threads.times { @queue << :shutdown }
-        @pool.each(&:join)
+
+        if timeout
+          @pool.each { |thread| thread.join(timeout) }
+          @pool.select(&:alive?).each(&:kill)
+        else
+          @pool.each(&:join)
+        end
+
         @pool.clear
+        @state = SHUTDOWN
         true
       end

       def running?
-        @running
+        @state == RUNNING
+      end
+
+      def shutting_down?
+        @state == SHUTTING_DOWN
+      end
+
+      def shutdown?
+        @state == SHUTDOWN
       end

       private

       def ensure_worker_available
         @mutex.synchronize do
+          return unless @state == RUNNING
+
          @pool.select!(&:alive?)
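          # Prune workers that have died before the size check below, so
          # crashed threads are replaced rather than counted against max_threads.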
@pool << spawn_worker if @pool.size < @max_threads end From d991128cf0b50ed5190da8383298db81ba46367c Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Mon, 29 Sep 2025 10:32:35 -0700 Subject: [PATCH 25/35] Refactor file downloader --- .../lib/aws-sdk-s3/file_downloader.rb | 233 +++++++++--------- 1 file changed, 122 insertions(+), 111 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb index 13f07bc538a..e914a1813be 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb @@ -10,13 +10,12 @@ module S3 class FileDownloader MIN_CHUNK_SIZE = 5 * 1024 * 1024 MAX_PARTS = 10_000 - DEFAULT_THREAD_COUNT = 10 + HEAD_OPTIONS = Set.new(Client.api.operation(:head_object).input.shape.member_names) + GET_OPTIONS = Set.new(Client.api.operation(:get_object).input.shape.member_names) def initialize(options = {}) @client = options[:client] || Client.new - @thread_count = options[:thread_count] || DEFAULT_THREAD_COUNT - @executor = options[:executor] || DefaultExecutor.new(max_threads: @thread_count) - @options = options + @executor = options[:executor] end # @return [Client] @@ -29,17 +28,13 @@ def download(destination, options = {}) Aws::Plugins::UserAgent.metric('S3_TRANSFER') do case opts[:mode] - when 'auto' then multipart_download(destination, opts) - when 'single_request' then single_request(destination, opts) - when 'get_range' - resp = @client.head_object(opts[:params]) - set_temp_path(opts) - multithreaded_get_by_ranges(resp.content_length, resp.etag, opts) + when 'auto' then multipart_download(opts) + when 'single_request' then single_request(opts) + when 'get_range' then range_request(opts) end end File.rename(opts[:temp_path], destination) if opts[:temp_path] ensure - @executor.shutdown if @executor.running? && @options[:executor].nil? cleanup_temp_file!(opts) end @@ -64,78 +59,101 @@ def cleanup_temp_file!(opts) File.delete(temp_file) if temp_file && File.exist?(temp_file) end - def set_temp_path(opts) - return if [File, Tempfile].include?(opts[:destination].class) + def download_with_executor(part_list, total_size, opts) + download_attempts = 0 + completion_queue = Queue.new + abort_download = false + error = nil + progress = + if (progress_callback = opts[:progress_callback]) + MultipartProgress.new(part_list, total_size, progress_callback) + end - opts[:temp_path] ||= "#{opts[:destination]}.s3tmp.#{SecureRandom.alphanumeric(8)}" + while (part = part_list.shift) + break if abort_download + + download_attempts += 1 + @executor.post(part) do |p| + update_progress(progress, p) if progress + + resp = @client.get_object(p.params) + range = extract_range(resp.content_range) + validate_range(range, p.params[:range]) if p.params[:range] + write(resp.body, range, opts) + + if opts[:on_checksum_validated] && resp.checksum_validated + opts[:on_checksum_validated].call(resp.checksum_validated, resp) + end + rescue StandardError => e + abort_download = true + error = e + ensure + completion_queue << :done + end + end + + download_attempts.times { completion_queue.pop } + raise error unless error.nil? 
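+        # NOTE: the completion queue above drains every posted part before the
+        # first error (if any) is re-raised, so no executor task is still
+        # writing to the temp file when this method returns.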
end

-      def validate_destination!(destination)
-        valid_types = [String, Pathname, File, Tempfile]
-        return if valid_types.include?(destination.class)
+      def get_opts(opts)
+        GET_OPTIONS.each_with_object({}) { |k, h| h[k] = opts[k] if opts.key?(k) }
+      end

-        raise ArgumentError, "Invalid destination, expected #{valid_types.join(', ')} but got: #{destination.class}"
+      def head_opts(opts)
+        HEAD_OPTIONS.each_with_object({}) { |k, h| h[k] = opts[k] if opts.key?(k) }
       end

-      def validate_opts!(opts)
-        if opts[:on_checksum_validated] && !opts[:on_checksum_validated].respond_to?(:call)
-          raise ArgumentError, ':on_checksum_validated must be callable'
-        end
+      def compute_chunk(chunk_size, file_size)
+        raise ArgumentError, ":chunk_size shouldn't exceed total file size." if chunk_size && chunk_size > file_size

-        valid_modes = %w[auto get_range single_request]
-        unless valid_modes.include?(opts[:mode])
-          msg = "Invalid mode #{opts[:mode]} provided, :mode should be single_request, get_range or auto"
-          raise ArgumentError, msg
-        end
+        chunk_size || [(file_size.to_f / MAX_PARTS).ceil, MIN_CHUNK_SIZE].max.to_i
+      end

-        if opts[:mode] == 'get_range' && opts[:chunk_size].nil?
-          raise ArgumentError, 'In get_range mode, :chunk_size must be provided'
-        end
+      def compute_mode(file_size, total_parts, etag, opts)
+        chunk_size = compute_chunk(opts[:chunk_size], file_size)
+        part_size = (file_size.to_f / total_parts).ceil

-        if opts[:chunk_size] && opts[:chunk_size] <= 0
-          raise ArgumentError, ':chunk_size must be positive'
+        set_temp_path(opts)
+        if chunk_size < part_size
+          multithreaded_get_by_ranges(file_size, etag, opts)
+        else
+          multithreaded_get_by_parts(total_parts, file_size, etag, opts)
         end
       end

-      def multipart_download(destination, opts)
-        params = opts[:params]
-        resp = @client.head_object(params.merge(part_number: 1))
+      def extract_range(value)
+        value.match(%r{bytes (?<range>\d+-\d+)/\d+})[:range]
+      end
+
+      def multipart_download(opts)
+        resp = @client.head_object(head_opts(opts[:params].merge(part_number: 1)))
         count = resp.parts_count
         if count.nil? || count <= 1
           if resp.content_length <= MIN_CHUNK_SIZE
-            single_request(destination, opts)
+            single_request(opts)
           else
             set_temp_path(opts)
             multithreaded_get_by_ranges(resp.content_length, resp.etag, opts)
           end
         else
           # covers cases when given object is not uploaded via UploadPart API
-          resp = @client.head_object(params) # partNumber is an option
+          resp = @client.head_object(head_opts(opts[:params])) # partNumber is an option
           if resp.content_length <= MIN_CHUNK_SIZE
-            single_request(destination, opts)
+            single_request(opts)
           else
             compute_mode(resp.content_length, count, resp.etag, opts)
           end
         end
       end

-      def compute_mode(file_size, count, etag, opts)
-        chunk_size = compute_chunk(opts[:chunk_size], file_size)
-        part_size = (file_size.to_f / count).ceil
-
-        set_temp_path(opts)
-        if chunk_size < part_size
-          multithreaded_get_by_ranges(file_size, etag, opts)
-        else
-          multithreaded_get_by_parts(count, file_size, etag, opts)
+      def multithreaded_get_by_parts(total_parts, file_size, etag, opts)
+        parts = (1..total_parts).map do |part|
+          params = get_opts(opts[:params].merge(part_number: part, if_match: etag))
+          Part.new(part_number: part, params: params)
         end
-      end
-
-      def compute_chunk(chunk_size, file_size)
-        raise ArgumentError, ":chunk_size shouldn't exceed total file size."
if chunk_size && chunk_size > file_size - - chunk_size || [(file_size.to_f / MAX_PARTS).ceil, MIN_CHUNK_SIZE].max.to_i + download_with_executor(PartList.new(parts), file_size, opts) end def multithreaded_get_by_ranges(file_size, etag, opts) @@ -146,7 +164,7 @@ def multithreaded_get_by_ranges(file_size, etag, opts) while offset < file_size progress = offset + default_chunk_size progress = file_size if progress > file_size - params = opts[:params].merge(range: "bytes=#{offset}-#{progress - 1}", if_match: etag) + params = get_opts(opts[:params].merge(range: "bytes=#{offset}-#{progress - 1}", if_match: etag)) chunks << Part.new(part_number: part_number, size: (progress - offset), params: params) part_number += 1 offset = progress @@ -154,57 +172,66 @@ def multithreaded_get_by_ranges(file_size, etag, opts) download_with_executor(PartList.new(chunks), file_size, opts) end - def multithreaded_get_by_parts(n_parts, total_size, etag, opts) - parts = (1..n_parts).map do |part| - params = opts[:params].merge(part_number: part, if_match: etag) - Part.new(part_number: part, params: params) + def range_request(opts) + resp = @client.head_object(head_opts(opts[:params])) + set_temp_path(opts) + multithreaded_get_by_ranges(resp.content_length, resp.etag, opts) + end + + def set_temp_path(opts) + return if [File, Tempfile].include?(opts[:destination].class) + + opts[:temp_path] ||= "#{opts[:destination]}.s3tmp.#{SecureRandom.alphanumeric(8)}" + end + + def single_request(opts) + params = get_opts(opts[:params]).merge(response_target: opts[:destination]) + params[:on_chunk_received] = single_part_progress(opts) if opts[:progress_callback] + resp = @client.get_object(params) + return resp unless opts[:on_checksum_validated] + + opts[:on_checksum_validated].call(resp.checksum_validated, resp) if resp.checksum_validated + resp + end + + def single_part_progress(opts) + proc do |_chunk, bytes_read, total_size| + opts[:progress_callback].call([bytes_read], [total_size], total_size) end - download_with_executor(PartList.new(parts), total_size, opts) end - def download_with_executor(pending, total_size, opts) - download_attempts = 0 - completion_queue = Queue.new - abort_download = false - error = nil - progress = - if (progress_callback = opts[:progress_callback]) - MultipartProgress.new(pending, total_size, progress_callback) + def update_progress(progress, part) + part.params[:on_chunk_received] = + proc do |_chunk, bytes, total| + progress.call(part.part_number, bytes, total) end + end - while (part = pending.shift) - break if abort_download + def validate_destination!(destination) + valid_types = [String, Pathname, File, Tempfile] + return if valid_types.include?(destination.class) - download_attempts += 1 - @executor.post(part) do |p| - if progress - p.params[:on_chunk_received] = - proc do |_chunk, bytes, total| - progress.call(p.part_number, bytes, total) - end - end - resp = @client.get_object(p.params) - range = extract_range(resp.content_range) - validate_range(range, p.params[:range]) if p.params[:range] - write(resp.body, range, opts) + raise ArgumentError, "Invalid destination, expected #{valid_types.join(', ')} but got: #{destination.class}" + end - if opts[:on_checksum_validated] && resp.checksum_validated - opts[:on_checksum_validated].call(resp.checksum_validated, resp) - end - rescue StandardError => e - abort_download = true - error = e - ensure - completion_queue << :done - end + def validate_opts!(opts) + if opts[:on_checksum_validated] && !opts[:on_checksum_validated].respond_to?(:call) + 
raise ArgumentError, ':on_checksum_validated must be callable'
         end

-        download_attempts.times { completion_queue.pop }
-        raise error unless error.nil?
-      end
+        valid_modes = %w[auto get_range single_request]
+        unless valid_modes.include?(opts[:mode])
+          msg = "Invalid mode #{opts[:mode]} provided, :mode should be single_request, get_range or auto"
+          raise ArgumentError, msg
+        end

-      def extract_range(value)
-        value.match(%r{bytes (?<range>\d+-\d+)/\d+})[:range]
+        if opts[:mode] == 'get_range' && opts[:chunk_size].nil?
+          raise ArgumentError, 'In get_range mode, :chunk_size must be provided'
+        end
+
+        if opts[:chunk_size] && opts[:chunk_size] <= 0
+          raise ArgumentError, ':chunk_size must be positive'
+        end
       end

       def validate_range(actual, expected)
@@ -218,22 +245,6 @@ def write(body, range, opts)
         File.write(path, body.read, range.split('-').first.to_i)
       end

-      def single_request(destination, opts)
-        params = opts[:params].merge(response_target: destination)
-        params[:on_chunk_received] = single_part_progress(opts) if opts[:progress_callback]
-        resp = @client.get_object(params)
-        return resp unless opts[:on_checksum_validated]
-
-        opts[:on_checksum_validated].call(resp.checksum_validated, resp) if resp.checksum_validated
-        resp
-      end
-
-      def single_part_progress(opts)
-        proc do |_chunk, bytes_read, total_size|
-          opts[:progress_callback].call([bytes_read], [total_size], total_size)
-        end
-      end
-
       # @api private
       class Part < Struct.new(:part_number, :size, :params)
         include Aws::Structure

From bc533a00db0d157664c64316bd34c3679ea9971d Mon Sep 17 00:00:00 2001
From: Juli Tera
Date: Mon, 29 Sep 2025 10:34:15 -0700
Subject: [PATCH 26/35] Support FileDownloader changes

---
 gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb | 4 +++-
 gems/aws-sdk-s3/spec/file_downloader_spec.rb | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb
index 0a9a9b9d3ca..2965ff4fd0d 100644
--- a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb
+++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb
@@ -539,9 +539,11 @@ def upload_file(source, options = {})
     # @see Client#get_object
     # @see Client#head_object
     def download_file(destination, options = {})
-      downloader = FileDownloader.new(client: client)
+      executor = DefaultExecutor.new(max_threads: options[:thread_count])
+      downloader = FileDownloader.new(client: client, executor: executor)
       Aws::Plugins::UserAgent.metric('RESOURCE_MODEL') do
         downloader.download(destination, options.merge(bucket: bucket_name, key: key))
+        executor.shutdown
       end
       true
     end
diff --git a/gems/aws-sdk-s3/spec/file_downloader_spec.rb b/gems/aws-sdk-s3/spec/file_downloader_spec.rb
index 97850069b2d..586690a8a63 100644
--- a/gems/aws-sdk-s3/spec/file_downloader_spec.rb
+++ b/gems/aws-sdk-s3/spec/file_downloader_spec.rb
@@ -7,7 +7,8 @@ module Aws
   module S3
     describe FileDownloader do
       let(:client) { S3::Client.new(stub_responses: true) }
-      let(:subject) { FileDownloader.new(client: client) }
+      let(:executor) { DefaultExecutor.new }
+      let(:subject) { FileDownloader.new(client: client, executor: executor) }
       let(:tmpdir) { Dir.tmpdir }

       describe '#initialize' do

From 9efc77fb63a5befb1f9498a1b5f72b6839b7c062 Mon Sep 17 00:00:00 2001
From: Juli Tera
Date: Mon, 29 Sep 2025 14:06:53 -0700
Subject: [PATCH 27/35] Extra updates to FileDownloader

---
 gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb | 8 ++++----
 gems/aws-sdk-s3/spec/file_downloader_spec.rb | 1 -
 2 files changed, 4
insertions(+), 5 deletions(-)

diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb
index e914a1813be..4181c7eaeac 100644
--- a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb
+++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_downloader.rb
@@ -114,7 +114,7 @@ def compute_mode(file_size, total_parts, etag, opts)
         chunk_size = compute_chunk(opts[:chunk_size], file_size)
         part_size = (file_size.to_f / total_parts).ceil

-        set_temp_path(opts)
+        resolve_temp_path(opts)
         if chunk_size < part_size
           multithreaded_get_by_ranges(file_size, etag, opts)
         else
@@ -134,7 +134,7 @@ def multipart_download(opts)
           if resp.content_length <= MIN_CHUNK_SIZE
             single_request(opts)
           else
-            set_temp_path(opts)
+            resolve_temp_path(opts)
             multithreaded_get_by_ranges(resp.content_length, resp.etag, opts)
           end
         else
@@ -174,11 +174,11 @@ def multithreaded_get_by_ranges(file_size, etag, opts)

       def range_request(opts)
         resp = @client.head_object(head_opts(opts[:params]))
-        set_temp_path(opts)
+        resolve_temp_path(opts)
         multithreaded_get_by_ranges(resp.content_length, resp.etag, opts)
       end

-      def set_temp_path(opts)
+      def resolve_temp_path(opts)
         return if [File, Tempfile].include?(opts[:destination].class)

         opts[:temp_path] ||= "#{opts[:destination]}.s3tmp.#{SecureRandom.alphanumeric(8)}"
       end
diff --git a/gems/aws-sdk-s3/spec/file_downloader_spec.rb b/gems/aws-sdk-s3/spec/file_downloader_spec.rb
index 586690a8a63..2632377b981 100644
--- a/gems/aws-sdk-s3/spec/file_downloader_spec.rb
+++ b/gems/aws-sdk-s3/spec/file_downloader_spec.rb
@@ -49,7 +49,6 @@ module S3
         subject.download(path, single_params)
       end

-      # TODO: flaky only in JRuby? attempt to fix it
       it 'downloads a large object in parts' do
         parts = 0
         client.stub_responses(:get_object, lambda do |_ctx|

From 1cc3fcfbc123e58a9d200dd0c00ccc5d69caa95a Mon Sep 17 00:00:00 2001
From: Juli Tera
Date: Mon, 29 Sep 2025 14:07:33 -0700
Subject: [PATCH 28/35] Address feedback for FileUploader and MultipartFileUploader

---
 .../lib/aws-sdk-s3/customizations/object.rb | 10 ++++++++--
 gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb | 8 +++-----
 .../lib/aws-sdk-s3/multipart_file_uploader.rb | 17 ++++++-----------
 gems/aws-sdk-s3/spec/file_uploader_spec.rb | 6 ------
 .../spec/multipart_file_uploader_spec.rb | 3 ++-
 5 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb
index 2965ff4fd0d..69e7d6905b9 100644
--- a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb
+++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations/object.rb
@@ -459,11 +459,17 @@ def upload_stream(options = {}, &block)
     # @see Client#upload_part
     def upload_file(source, options = {})
       uploading_options = options.dup
-      uploader = FileUploader.new(multipart_threshold: uploading_options.delete(:multipart_threshold), client: client)
+      executor = DefaultExecutor.new(max_threads: uploading_options.delete(:thread_count))
+      uploader = FileUploader.new(
+        client: client,
+        executor: executor,
+        multipart_threshold: uploading_options.delete(:multipart_threshold)
+      )
       response = Aws::Plugins::UserAgent.metric('RESOURCE_MODEL') do
         uploader.upload(source, uploading_options.merge(bucket: bucket_name, key: key))
       end
      yield response if block_given?
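+      # The executor is created per upload_file call rather than cached on the
+      # object, so it is reclaimed immediately below once the caller's block runs.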
+ executor.shutdown true end deprecated(:upload_file, use: 'Aws::S3::TransferManager#upload_file', version: 'next major version') @@ -543,8 +549,8 @@ def download_file(destination, options = {}) downloader = FileDownloader.new(client: client, executor: executor) Aws::Plugins::UserAgent.metric('RESOURCE_MODEL') do downloader.download(destination, options.merge(bucket: bucket_name, key: key)) - executor.shutdown end + executor.shutdown true end deprecated(:download_file, use: 'Aws::S3::TransferManager#download_file', version: 'next major version') diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb index 50bf1279271..62dbb07f8c3 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/file_uploader.rb @@ -13,8 +13,8 @@ class FileUploader # @option options [Client] :client # @option options [Integer] :multipart_threshold (104857600) def initialize(options = {}) - @options = options @client = options[:client] || Client.new + @executor = options[:executor] @multipart_threshold = options[:multipart_threshold] || DEFAULT_MULTIPART_THRESHOLD end @@ -36,11 +36,9 @@ def initialize(options = {}) # @return [void] def upload(source, options = {}) Aws::Plugins::UserAgent.metric('S3_TRANSFER') do - if File.size(source) >= multipart_threshold - MultipartFileUploader.new(@options).upload(source, options) + if File.size(source) >= @multipart_threshold + MultipartFileUploader.new(client: @client, executor: @executor).upload(source, options) else - # remove multipart parameters not supported by put_object - options.delete(:thread_count) put_object(source, options) end end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb index 243ddbe7848..bab023e1bf7 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/multipart_file_uploader.rb @@ -20,12 +20,9 @@ class MultipartFileUploader ) # @option options [Client] :client - # @option options [Integer] :thread_count (DEFAULT_THREAD_COUNT) def initialize(options = {}) @client = options[:client] || Client.new - @thread_count = options[:thread_count] || DEFAULT_THREAD_COUNT - @executor = options[:executor] || DefaultExecutor.new(max_threads: @thread_count) - @options = options + @executor = options[:executor] end # @return [Client] @@ -45,8 +42,6 @@ def upload(source, options = {}) upload_id = initiate_upload(options) parts = upload_parts(upload_id, source, file_size, options) complete_upload(upload_id, parts, file_size, options) - ensure - @executor.shutdown if @executor.running? && @options[:executor].nil? end private @@ -115,19 +110,19 @@ def checksum_key?(key) CHECKSUM_KEYS.include?(key) end - def checksum_keys?(keys) + def has_checksum_keys?(keys) keys.any? 
{ |key| checksum_key?(key) } end def create_opts(options) opts = { checksum_algorithm: Aws::Plugins::ChecksumAlgorithm::DEFAULT_CHECKSUM } - opts[:checksum_type] = 'FULL_OBJECT' if checksum_keys?(options.keys) + opts[:checksum_type] = 'FULL_OBJECT' if has_checksum_keys?(options.keys) CREATE_OPTIONS.each_with_object(opts) { |k, h| h[k] = options[k] if options.key?(k) } end def complete_opts(options) opts = {} - opts[:checksum_type] = 'FULL_OBJECT' if checksum_keys?(options.keys) + opts[:checksum_type] = 'FULL_OBJECT' if has_checksum_keys?(options.keys) COMPLETE_OPTIONS.each_with_object(opts) { |k, h| h[k] = options[k] if options.key?(k) } end @@ -173,8 +168,8 @@ def upload_with_executor(pending, completed, options) errors end - def compute_default_part_size(source_size) - [(source_size.to_f / MAX_PARTS).ceil, MIN_PART_SIZE].max.to_i + def compute_default_part_size(file_size) + [(file_size.to_f / MAX_PARTS).ceil, MIN_PART_SIZE].max.to_i end def part_size(total_size, part_size, offset) diff --git a/gems/aws-sdk-s3/spec/file_uploader_spec.rb b/gems/aws-sdk-s3/spec/file_uploader_spec.rb index 64dbaf7cdd2..dc3ab94b052 100644 --- a/gems/aws-sdk-s3/spec/file_uploader_spec.rb +++ b/gems/aws-sdk-s3/spec/file_uploader_spec.rb @@ -81,12 +81,6 @@ module S3 subject.upload(ten_meg_file.path, params) end - - it 'does not fail when given :thread_count' do - expect(client).to receive(:put_object).with(params.merge(body: ten_meg_file)) - - subject.upload(ten_meg_file, params.merge(thread_count: 1)) - end end end end diff --git a/gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb b/gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb index d5a004b1834..54b50a13a01 100644 --- a/gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb +++ b/gems/aws-sdk-s3/spec/multipart_file_uploader_spec.rb @@ -7,7 +7,8 @@ module Aws module S3 describe MultipartFileUploader do let(:client) { S3::Client.new(stub_responses: true) } - let(:subject) { MultipartFileUploader.new(client: client) } + let(:executor) { DefaultExecutor.new } + let(:subject) { MultipartFileUploader.new(client: client, executor: executor) } let(:params) { { bucket: 'bucket', key: 'key' } } describe '#initialize' do From 45d2f5d11bf7ff9fb512463355f4aeb9590fd4d2 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Mon, 6 Oct 2025 10:06:45 -0700 Subject: [PATCH 29/35] Add improvements to directory uploader --- .../lib/aws-sdk-s3/directory_uploader.rb | 166 ++++++++++-------- 1 file changed, 96 insertions(+), 70 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index d7d87a58311..733625d9f1f 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -18,63 +18,63 @@ def initialize(message, errors = []) # @api private class DirectoryUploader def initialize(options = {}) - @client = options[:client] || Client.new - @executor = options[:executor] || DefaultExecutor.new - @options = options + @client = options[:client] + @executor = options[:executor] @abort_upload = false + @mutex = Mutex.new end - # @return [Client] - attr_reader :client - - # @return [Boolean] - attr_accessor :abort_upload + attr_reader :abort_upload # TODO: Need to add progress tracker - def upload(source, bucket:, **options) - raise ArgumentError, 'Invalid directory' unless Dir.exist?(source) - - upload_opts = options.dup - @ignore_failure = upload_opts.delete(:ignore_failure) || false - @errors = [] + def upload(source_directory, bucket:, 
**options) + raise ArgumentError, 'Invalid directory' unless Dir.exist?(source_directory) + upload_opts = build_upload_opts(source_directory, bucket, options) uploader = FileUploader.new( - multipart_threshold: upload_opts.delete(:multipart_threshold), + multipart_threshold: options[:multipart_threshold], client: @client, executor: @executor ) - producer = FileProducer.new(source, build_producer_opts(upload_opts)) - producer.run - uploads = process_upload_queue(producer, uploader, upload_opts.merge(bucket: bucket)) - build_result(uploads) + producer = FileProducer.new(build_producer_opts(upload_opts)) + uploads, errors = process_upload_queue(producer, uploader, upload_opts) + build_result(uploads, errors) ensure - @executor.shutdown unless @options[:executor] + set_abort_flag(value: false) end private - def build_producer_opts(opts) + def set_abort_flag(value: true) + @mutex.synchronize { @abort_upload = value } + end + + def build_upload_opts(source_directory, bucket, opts) { - directory_uploader: self, + source_dir: source_directory, + bucket: bucket, s3_prefix: opts.delete(:s3_prefix), - recursive: opts.delete(:recursive), - follow_symlinks: opts.delete(:follow_symlinks), + recursive: opts.delete(:recursive) || false, + follow_symlinks: opts.delete(:follow_symlinks) || false, filter_callback: opts.delete(:filter_callback), - ignore_failure: @ignore_failure, - errors: @errors + ignore_failure: opts.delete(:ignore_failure) || false, } end - def build_result(upload_count) - uploads = [upload_count - @errors.count, 0].max + def build_producer_opts(opts) + opts.merge(client: @client, directory_uploader: self) + end + + def build_result(upload_count, errors) + uploads = [upload_count - errors.count, 0].max if @abort_upload msg = "failed to upload directory: uploaded #{uploads} files " \ - "and failed to upload #{@errors.count} files." - raise DirectoryUploadError.new(msg, @errors) + "and failed to upload #{errors.count} files." + raise DirectoryUploadError.new(msg, errors) else - result = { completed_uploads: uploads, failed_uploads: @errors.count } - result[:errors] = @errors if @errors.any? + result = { completed_uploads: uploads, failed_uploads: errors.count } + result[:errors] = errors if errors.any? 
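        # A sketch of the happy-path return value (counts purely illustrative):
        #   { completed_uploads: 10, failed_uploads: 2, errors: [...] }
        # the :errors key is only present when at least one upload failed.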
result end end @@ -83,51 +83,69 @@ def process_upload_queue(producer, uploader, opts) upload_attempts = 0 completion_queue = Queue.new queue_executor = DefaultExecutor.new - while (file = producer.file_queue.shift) != :done + errors = [] + producer.each do |file| break if @abort_upload + if file.is_a?(StandardError) + errors << file + next + end + upload_attempts += 1 queue_executor.post(file) do |f| - uploader.upload(f[:path], opts.merge(key: f[:key])) + uploader.upload(f[:path], bucket: opts[:bucket], key: f[:key]) + puts 'yay this file uploaded' rescue StandardError => e - @errors << e - @abort_upload = true unless @ignore_failure + errors << e + set_abort_flag unless opts[:ignore_failure] ensure completion_queue << :done end end upload_attempts.times { completion_queue.pop } - upload_attempts + [upload_attempts, errors] ensure queue_executor.shutdown end - # @api private class FileProducer - def initialize(source_dir, options = {}) - @source_dir = source_dir + include Enumerable + + DEFAULT_QUEUE_SIZE = 100 + + def initialize(options = {}) + @source_dir = options[:source_dir] @s3_prefix = options[:s3_prefix] - @recursive = options[:recursive] || false - @follow_symlinks = options[:follow_symlinks] || false + @recursive = options[:recursive] + @follow_symlinks = options[:follow_symlinks] @ignore_failure = options[:ignore_failure] @filter_callback = options[:filter_callback] - @errors = options[:errors] @directory_uploader = options[:directory_uploader] - @file_queue = SizedQueue.new(100) + @file_queue = SizedQueue.new(DEFAULT_QUEUE_SIZE) end - attr_accessor :file_queue - - def run - Thread.new do - if @recursive - find_recursively - else - find_directly + def each + producer_thread = Thread.new do + begin + if @recursive + find_recursively + else + find_directly + end + ensure + @file_queue << :done end - @file_queue << :done end + + while (file = @file_queue.shift) != :done + break if @directory_uploader.abort_upload + + yield file + end + ensure + producer_thread.join end private @@ -142,14 +160,20 @@ def find_directly break if @directory_uploader.abort_upload entry_path = File.join(@source_dir, entry) - next if File.directory?(entry_path) || skip_symlink?(entry_path) + if @follow_symlinks + stat = File.stat(entry_path) + next if stat.directory? + else + stat = File.lstat(entry_path) + next if stat.symlink? || stat.directory? + end next unless include_file?(entry_path, entry) - next unless valid_file_type?(entry_path) @file_queue << build_file_entry(entry_path, entry) rescue StandardError => e - @errors << e - @directory_uploader.abort_upload = true unless @ignore_failure + raise unless @ignore_failure + + @file_queue << e end end @@ -163,14 +187,6 @@ def find_recursively end end - def valid_file_type?(path) - File.file?(path) || File.symlink?(path) - end - - def skip_symlink?(path) - !@follow_symlinks && File.symlink?(path) - end - def include_file?(file_path, file_name) return true unless @filter_callback @@ -185,17 +201,27 @@ def scan_directory(dir_path, key_prefix: '', visited: nil) full_path = File.join(dir_path, entry) next unless include_file?(full_path, entry) - next if !@follow_symlinks && File.symlink?(full_path) - if File.directory?(full_path) + stat = + if @follow_symlinks + File.stat(full_path) + else + lstat = File.lstat(full_path) + next if lstat.symlink? + + lstat + end + + if stat.directory? handle_directory(full_path, entry, key_prefix, visited) - elsif valid_file_type?(full_path) + else key = key_prefix.empty? ? 
entry : File.join(key_prefix, entry)
           @file_queue << build_file_entry(full_path, key)
         end
       rescue StandardError => e
-        @errors << e
-        @directory_uploader.abort_upload = true unless @ignore_failure
+        raise unless @ignore_failure
+
+        @file_queue << e
       end
     end

From 64d481ebb6be9372ae3065619cf8a4a1b058f76d Mon Sep 17 00:00:00 2001
From: Juli Tera
Date: Mon, 6 Oct 2025 12:02:43 -0700
Subject: [PATCH 30/35] Update DirectoryDownloader based on feedback

---
 .../lib/aws-sdk-s3/directory_downloader.rb    | 142 +++++++++++-------
 1 file changed, 90 insertions(+), 52 deletions(-)

diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb
index 86d849f72fd..ed43b49caea 100644
--- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb
+++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb
@@ -16,60 +16,64 @@ def initialize(message, errors = [])
     # @api private
     class DirectoryDownloader
       def initialize(options = {})
-        @client = options[:client] || Client.new
-        @executor = options[:executor] || DefaultExecutor.new
-        @options = options
+        @client = options[:client]
+        @executor = options[:executor]
         @abort_download = false
+        @mutex = Mutex.new
       end

-      attr_reader :client, :abort_download
+      attr_reader :abort_download

+      # TODO: need to add progress tracker
       def download(destination, bucket:, **options)
         if File.exist?(destination)
-          raise ArgumentError 'invalid destination, expected a directory' unless File.directory?(destination)
+          raise ArgumentError, 'invalid destination, expected a directory' unless File.directory?(destination)
         else
           FileUtils.mkdir_p(destination)
         end

-        download_opts = options.dup
-        @bucket = bucket
-        @ignore_failure = download_opts.delete(:ignore_failure) || false
-        @errors = []
+        download_opts = build_download_opts(destination, bucket, options)

-        downloader = FileDownloader.new(client: client, executor: @executor)
-        producer = ObjectProducer.new(destination, build_producer_opts(download_opts))
-        producer.run
-
-        downloads = process_download_queue(producer, downloader, download_opts)
-        build_result(downloads)
+        downloader = FileDownloader.new(client: @client, executor: @executor)
+        producer = ObjectProducer.new(build_producer_opts(download_opts))
+        downloads, errors = process_download_queue(producer, downloader, download_opts)
+        build_result(downloads, errors)
       ensure
-        @executor.shutdown unless @options[:executor]
+        set_abort_flag(value: false)
       end

+      private
+
+      def set_abort_flag(value: true)
+        @mutex.synchronize { @abort_download = value }
+      end
+
-      def build_producer_opts(opts)
+      def build_download_opts(destination, bucket, opts)
         {
-          directory_downloader: self,
-          client: @client,
-          bucket: @bucket,
+          destination: destination,
+          bucket: bucket,
           s3_prefix: opts.delete(:s3_prefix),
-          ignore_failure: @ignore_failure,
+          ignore_failure: opts.delete(:ignore_failure) || false,
           filter_callback: opts.delete(:filter_callback),
-          errors: @errors
+          progress_callback: opts.delete(:progress_callback)
         }
       end

-      def build_result(download_count)
-        downloads = [download_count - @errors.count, 0].max
+      def build_producer_opts(opts)
+        opts.merge(client: @client, directory_downloader: self)
+      end
+
+      def build_result(download_count, errors)
+        downloads = [download_count - errors.count, 0].max
         if @abort_download
-          msg = "failed to download directory: downloaded #{downloads} files " \
-                "and failed to download #{@errors.count} files." 
- raise DirectoryDownloadError.new(msg, @errors) + msg = "failed to download directory: downloaded #{downloads} files, failed #{errors.count} files" + raise DirectoryDownloadError.new(msg, errors) else - result = { completed_downloads: downloads, failed_downloads: @errors.count } - result[:errors] = @errors if @errors.any? - result + { + completed_downloads: downloads, + failed_downloads: errors.count, + errors: errors.any? ? errors : nil + }.compact end end @@ -77,49 +81,62 @@ def process_download_queue(producer, downloader, opts) download_attempts = 0 completion_queue = Queue.new queue_executor = DefaultExecutor.new - while (object = producer.object_queue.shift) != :done + progress = DirectoryProgress.new(opts[:progress_callback]) if opts[:progress_callback] + errors = [] + producer.each do |object| break if @abort_download download_attempts += 1 queue_executor.post(object) do |o| dir_path = File.dirname(o[:path]) - FileUtils.mkdir_p(dir_path) unless dir_path == @destination || Dir.exist?(dir_path) + FileUtils.mkdir_p(dir_path) unless dir_path == opts[:destination] || Dir.exist?(dir_path) - downloader.download(o[:path], opts.merge(bucket: @bucket, key: o[:key])) + downloader.download(o[:path], bucket: opts[:bucket], key: o[:key]) + progress&.call(File.size(o[:path])) rescue StandardError => e - @errors << e - @abort_download = true unless @ignore_failure + errors << e + set_abort_flag unless opts[:ignore_failure] ensure completion_queue << :done end end download_attempts.times { completion_queue.pop } - download_attempts + [download_attempts, errors] ensure queue_executor.shutdown end # @api private class ObjectProducer - def initialize(destination_dir, options = {}) - @destination_dir = destination_dir + include Enumerable + + DEFAULT_QUEUE_SIZE = 100 + + def initialize(options = {}) + @destination_dir = options[:destination] @client = options[:client] @bucket = options[:bucket] @s3_prefix = options[:s3_prefix] - @ignore_failure = options[:ignore_failure] @filter_callback = options[:filter_callback] - @errors = options[:errors] @directory_downloader = options[:directory_downloader] - @object_queue = SizedQueue.new(100) + @object_queue = SizedQueue.new(DEFAULT_QUEUE_SIZE) end - attr_reader :object_queue - - def run - Thread.new do + def each + producer_thread = Thread.new do stream_objects + ensure @object_queue << :done end + + # Yield objects from internal queue + while (object = @object_queue.shift) != :done + break if @directory_downloader.abort_download + + yield object + end + ensure + producer_thread.join end private @@ -128,26 +145,47 @@ def build_object_entry(key) { path: File.join(@destination_dir, normalize_key(key)), key: key } end - # TODO: need to add filter callback, double check handling of objects that ends with / + # TODO: double check handling of objects that ends with / def stream_objects(continuation_token: nil) - resp = @client.list_objects_v2(bucket: @bucket, continuation_token: continuation_token) + resp = @client.list_objects_v2(bucket: @bucket, prefix: @s3_prefix, continuation_token: continuation_token) resp.contents.each do |o| break if @directory_downloader.abort_download next if o.key.end_with?('/') + next unless include_object?(o.key) @object_queue << build_object_entry(o.key) - rescue StandardError => e - @errors << e - @abort_download = true unless @ignore_failure end stream_objects(continuation_token: resp.next_continuation_token) if resp.next_continuation_token end + def include_object?(key) + return true unless @filter_callback + + 
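        # The filter receives the raw object key (prefix included) and keeps it
        # on a truthy return; a hypothetical caller-side example, names
        # illustrative only:
        #   filter_callback: ->(key) { key.end_with?('.csv') }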
@filter_callback.call(key) + end + def normalize_key(key) key = key.delete_prefix(@s3_prefix) if @s3_prefix - return key if File::SEPARATOR == '/' + File::SEPARATOR == '/' ? key : key.tr('/', File::SEPARATOR) + end + end + + # @api private + class DirectoryProgress + def initialize(progress_callback) + @transferred_bytes = 0 + @transferred_files = 0 + @progress_callback = progress_callback + @mutex = Mutex.new + end + + def call(bytes_received) + @mutex.synchronize do + @transferred_bytes += bytes_received + @transferred_files += 1 - key.tr('/', File::SEPARATOR) + @progress_callback.call(@transferred_bytes, @transferred_files) + end end end end From 2ab63fbdaf2041d6f286ffb5e2a6fdd253257277 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Mon, 6 Oct 2025 13:21:28 -0700 Subject: [PATCH 31/35] Minor feedback updates --- .../lib/aws-sdk-s3/customizations.rb | 1 + .../lib/aws-sdk-s3/directory_downloader.rb | 52 +++++------------ .../lib/aws-sdk-s3/directory_progress.rb | 24 ++++++++ .../lib/aws-sdk-s3/directory_uploader.rb | 56 +++++++++---------- 4 files changed, 67 insertions(+), 66 deletions(-) create mode 100644 gems/aws-sdk-s3/lib/aws-sdk-s3/directory_progress.rb diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb index a85bfd6e14c..3114b192128 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/customizations.rb @@ -19,6 +19,7 @@ module S3 autoload :ObjectMultipartCopier, 'aws-sdk-s3/object_multipart_copier' autoload :PresignedPost, 'aws-sdk-s3/presigned_post' autoload :Presigner, 'aws-sdk-s3/presigner' + autoload :DirectoryProgress, '../aws-sdk-s3/directory_progress' autoload :DirectoryUploader, 'aws-sdk-s3/directory_uploader' autoload :DirectoryDownloader, 'aws-sdk-s3/directory_downloader' autoload :TransferManager, 'aws-sdk-s3/transfer_manager' diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb index ed43b49caea..404759f5a66 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb @@ -18,13 +18,12 @@ class DirectoryDownloader def initialize(options = {}) @client = options[:client] @executor = options[:executor] - @abort_download = false + @abort_requested = false @mutex = Mutex.new end - attr_reader :abort_download + attr_reader :abort_requested - # TODO: need to add progress tracker def download(destination, bucket:, **options) if File.exist?(destination) raise ArgumentError, 'invalid destination, expected a directory' unless File.directory?(destination) @@ -38,15 +37,14 @@ def download(destination, bucket:, **options) downloads, errors = process_download_queue(producer, downloader, download_opts) build_result(downloads, errors) ensure - set_abort_flag(value: false) + @abort_requested = false end private - def set_abort_flag(value: true) - @mutex.synchronize { @abort_download = value } + def request_abort + @mutex.synchronize { @abort_requested = true } end - def build_download_opts(destination, bucket, opts) { destination: destination, @@ -65,7 +63,7 @@ def build_producer_opts(opts) def build_result(download_count, errors) downloads = [download_count - errors.count, 0].max - if @abort_download + if @abort_requested msg = "failed to download directory: downloaded #{downloads} files, failed #{errors.count} files" raise DirectoryDownloadError.new(msg, errors) else @@ -78,13 +76,14 @@ def build_result(download_count, errors) end def 
process_download_queue(producer, downloader, opts) - download_attempts = 0 - completion_queue = Queue.new + # Separate executor for lightweight queuing tasks, + # avoiding interference with main @executor lifecycle queue_executor = DefaultExecutor.new progress = DirectoryProgress.new(opts[:progress_callback]) if opts[:progress_callback] + download_attempts = 0 errors = [] producer.each do |object| - break if @abort_download + break if @abort_requested download_attempts += 1 queue_executor.post(object) do |o| @@ -95,15 +94,13 @@ def process_download_queue(producer, downloader, opts) progress&.call(File.size(o[:path])) rescue StandardError => e errors << e - set_abort_flag unless opts[:ignore_failure] - ensure - completion_queue << :done + request_abort unless opts[:ignore_failure] end end - download_attempts.times { completion_queue.pop } + queue_executor.shutdown [download_attempts, errors] ensure - queue_executor.shutdown + queue_executor.shutdown if queue_executor.running? end # @api private @@ -131,7 +128,7 @@ def each # Yield objects from internal queue while (object = @object_queue.shift) != :done - break if @directory_downloader.abort_download + break if @directory_downloader.abort_requested yield object end @@ -149,7 +146,7 @@ def build_object_entry(key) def stream_objects(continuation_token: nil) resp = @client.list_objects_v2(bucket: @bucket, prefix: @s3_prefix, continuation_token: continuation_token) resp.contents.each do |o| - break if @directory_downloader.abort_download + break if @directory_downloader.abort_requested next if o.key.end_with?('/') next unless include_object?(o.key) @@ -169,25 +166,6 @@ def normalize_key(key) File::SEPARATOR == '/' ? key : key.tr('/', File::SEPARATOR) end end - - # @api private - class DirectoryProgress - def initialize(progress_callback) - @transferred_bytes = 0 - @transferred_files = 0 - @progress_callback = progress_callback - @mutex = Mutex.new - end - - def call(bytes_received) - @mutex.synchronize do - @transferred_bytes += bytes_received - @transferred_files += 1 - - @progress_callback.call(@transferred_bytes, @transferred_files) - end - end - end end end end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_progress.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_progress.rb new file mode 100644 index 00000000000..d2100782977 --- /dev/null +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_progress.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +module Aws + module S3 + # @api private + class DirectoryProgress + def initialize(progress_callback) + @transferred_bytes = 0 + @transferred_files = 0 + @progress_callback = progress_callback + @mutex = Mutex.new + end + + def call(bytes_transferred) + @mutex.synchronize do + @transferred_bytes += bytes_transferred + @transferred_files += 1 + + @progress_callback.call(@transferred_bytes, @transferred_files) + end + end + end + end +end diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index 733625d9f1f..8f8059739d8 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -20,13 +20,12 @@ class DirectoryUploader def initialize(options = {}) @client = options[:client] @executor = options[:executor] - @abort_upload = false + @abort_requested = false @mutex = Mutex.new end - attr_reader :abort_upload + attr_reader :abort_requested - # TODO: Need to add progress tracker def upload(source_directory, bucket:, **options) raise ArgumentError, 'Invalid 
directory' unless Dir.exist?(source_directory) @@ -40,13 +39,13 @@ def upload(source_directory, bucket:, **options) uploads, errors = process_upload_queue(producer, uploader, upload_opts) build_result(uploads, errors) ensure - set_abort_flag(value: false) + @abort_requested = false end private - def set_abort_flag(value: true) - @mutex.synchronize { @abort_upload = value } + def request_abort + @mutex.synchronize { @abort_requested = true } end def build_upload_opts(source_directory, bucket, opts) @@ -58,6 +57,7 @@ def build_upload_opts(source_directory, bucket, opts) follow_symlinks: opts.delete(:follow_symlinks) || false, filter_callback: opts.delete(:filter_callback), ignore_failure: opts.delete(:ignore_failure) || false, + progress_callback: opts.delete(:progress_callback) } end @@ -68,7 +68,7 @@ def build_producer_opts(opts) def build_result(upload_count, errors) uploads = [upload_count - errors.count, 0].max - if @abort_upload + if @abort_requested msg = "failed to upload directory: uploaded #{uploads} files " \ "and failed to upload #{errors.count} files." raise DirectoryUploadError.new(msg, errors) @@ -80,33 +80,33 @@ def build_result(upload_count, errors) end def process_upload_queue(producer, uploader, opts) - upload_attempts = 0 - completion_queue = Queue.new + # Separate executor for lightweight queuing tasks, + # avoiding interference with main @executor lifecycle queue_executor = DefaultExecutor.new + progress = DirectoryProgress.new(opts[:progress_callback]) if opts[:progress_callback] + upload_attempts = 0 errors = [] producer.each do |file| - break if @abort_upload + break if @abort_requested + upload_attempts += 1 if file.is_a?(StandardError) errors << file next end - upload_attempts += 1 queue_executor.post(file) do |f| uploader.upload(f[:path], bucket: opts[:bucket], key: f[:key]) - puts 'yay this file uploaded' + progress&.call(File.size(f[:path])) rescue StandardError => e errors << e - set_abort_flag unless opts[:ignore_failure] - ensure - completion_queue << :done + request_abort unless opts[:ignore_failure] end end - upload_attempts.times { completion_queue.pop } + queue_executor.shutdown [upload_attempts, errors] ensure - queue_executor.shutdown + queue_executor.shutdown if queue_executor.running? 
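        # DefaultExecutor#shutdown with no timeout joins every worker, so the
        # unguarded call earlier in this method doubles as the wait for
        # in-flight uploads; the guarded call above only covers the
        # early-raise path where that first call was never reached.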
end # @api private @@ -128,19 +128,17 @@ def initialize(options = {}) def each producer_thread = Thread.new do - begin - if @recursive - find_recursively - else - find_directly - end - ensure - @file_queue << :done + if @recursive + find_recursively + else + find_directly end + ensure + @file_queue << :done end while (file = @file_queue.shift) != :done - break if @directory_uploader.abort_upload + break if @directory_uploader.abort_requested yield file end @@ -157,7 +155,7 @@ def build_file_entry(file_path, key) def find_directly Dir.each_child(@source_dir) do |entry| - break if @directory_uploader.abort_upload + break if @directory_uploader.abort_requested entry_path = File.join(@source_dir, entry) if @follow_symlinks @@ -194,10 +192,10 @@ def include_file?(file_path, file_name) end def scan_directory(dir_path, key_prefix: '', visited: nil) - return if @directory_uploader.abort_upload + return if @directory_uploader.abort_requested Dir.each_child(dir_path) do |entry| - break if @directory_uploader.abort_upload + break if @directory_uploader.abort_requested full_path = File.join(dir_path, entry) next unless include_file?(full_path, entry) From 747965fce92369a426b97f6df3836cf158dc076a Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 7 Oct 2025 08:10:12 -0700 Subject: [PATCH 32/35] Update executor --- .../lib/aws-sdk-s3/default_executor.rb | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb index 38158b343d7..e3a60a3a647 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb @@ -17,10 +17,13 @@ def initialize(options = {}) end def post(*args, &block) - raise 'Executor is not accepting new tasks' unless @state == RUNNING + @mutex.synchronize do + raise 'Executor has been shutdown and is no longer accepting tasks' unless @state == RUNNING - @queue << [args, block] - ensure_worker_available + @queue << [args, block] + ensure_worker_available + end + true end def kill @@ -43,7 +46,13 @@ def shutdown(timeout = nil) @max_threads.times { @queue << :shutdown } if timeout - @pool.each { |thread| thread.join(timeout) } + deadline = Time.now + timeout + @pool.each do |thread| + remaining = deadline - Time.now + break if remaining <= 0 + + thread.join([remaining, 0].max) + end @pool.select(&:alive?).each(&:kill) else @pool.each(&:join) @@ -69,12 +78,10 @@ def shutdown? private def ensure_worker_available - @mutex.synchronize do - return unless @state == RUNNING + return unless @state == RUNNING - @pool.select!(&:alive?) - @pool << spawn_worker if @pool.size < @max_threads - end + @pool.select!(&:alive?) 
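      # post now holds @mutex around this call, so the prune-then-spawn pair is
      # race-free: dead workers are dropped in place and at most one new thread
      # is added per submitted task, capped at max_threads.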
+ @pool << spawn_worker if @pool.size < @max_threads end def spawn_worker From 22304784db6e7e5ee366bfef57befc273e651107 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 7 Oct 2025 09:59:44 -0700 Subject: [PATCH 33/35] Improve Directory Uploader --- .../lib/aws-sdk-s3/directory_uploader.rb | 61 ++++++++++--------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index 8f8059739d8..6888565f441 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -35,7 +35,7 @@ def upload(source_directory, bucket:, **options) client: @client, executor: @executor ) - producer = FileProducer.new(build_producer_opts(upload_opts)) + producer = FileProducer.new(upload_opts.merge(client: @client, directory_uploader: self)) uploads, errors = process_upload_queue(producer, uploader, upload_opts) build_result(uploads, errors) ensure @@ -61,24 +61,26 @@ def build_upload_opts(source_directory, bucket, opts) } end - def build_producer_opts(opts) - opts.merge(client: @client, directory_uploader: self) - end - def build_result(upload_count, errors) - uploads = [upload_count - errors.count, 0].max - if @abort_requested - msg = "failed to upload directory: uploaded #{uploads} files " \ - "and failed to upload #{errors.count} files." + msg = "directory upload failed: #{errors.map(&:message).join('; ')}" raise DirectoryUploadError.new(msg, errors) else - result = { completed_uploads: uploads, failed_uploads: errors.count } - result[:errors] = errors if errors.any? - result + { + completed_uploads: upload_count - errors.count, + failed_uploads: errors.count, + errors: errors.any? ? errors : nil + }.compact end end + def handle_error(executor, opts) + return if opts[:ignore_failure] + + request_abort + executor.kill + end + def process_upload_queue(producer, uploader, opts) # Separate executor for lightweight queuing tasks, # avoiding interference with main @executor lifecycle @@ -86,27 +88,30 @@ def process_upload_queue(producer, uploader, opts) progress = DirectoryProgress.new(opts[:progress_callback]) if opts[:progress_callback] upload_attempts = 0 errors = [] - producer.each do |file| - break if @abort_requested - - upload_attempts += 1 - if file.is_a?(StandardError) - errors << file - next - end + begin + producer.each do |file| + break if @abort_requested + + upload_attempts += 1 + if file.is_a?(StandardError) + errors << file + next + end - queue_executor.post(file) do |f| - uploader.upload(f[:path], bucket: opts[:bucket], key: f[:key]) - progress&.call(File.size(f[:path])) - rescue StandardError => e - errors << e - request_abort unless opts[:ignore_failure] + queue_executor.post(file) do |f| + uploader.upload(f[:path], bucket: opts[:bucket], key: f[:key]) + progress&.call(File.size(f[:path])) + rescue StandardError => e + errors << e + handle_error(queue_executor, opts) + end end + rescue StandardError => e + errors << e + handle_error(queue_executor, opts) end queue_executor.shutdown [upload_attempts, errors] - ensure - queue_executor.shutdown if queue_executor.running? 
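      # The ensure clause is gone because the begin/rescue added above funnels
      # producer failures through handle_error, and both the normal and the
      # rescued path still fall through to the unconditional shutdown before
      # the return value.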
end # @api private From cb145a0a51fb24aa33aa12d7d8dab2bc75e67d9c Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 7 Oct 2025 10:27:29 -0700 Subject: [PATCH 34/35] Handle failure cases correctly --- .../lib/aws-sdk-s3/directory_downloader.rb | 52 ++++++++++--------- .../lib/aws-sdk-s3/directory_uploader.rb | 2 +- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb index 404759f5a66..a8f9aca32dd 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_downloader.rb @@ -33,7 +33,7 @@ def download(destination, bucket:, **options) download_opts = build_download_opts(destination, bucket, options) downloader = FileDownloader.new(client: @client, executor: @executor) - producer = ObjectProducer.new(build_producer_opts(download_opts)) + producer = ObjectProducer.new(download_opts.merge(client: @client, directory_downloader: self)) downloads, errors = process_download_queue(producer, downloader, download_opts) build_result(downloads, errors) ensure @@ -56,25 +56,26 @@ def build_download_opts(destination, bucket, opts) } end - def build_producer_opts(opts) - opts.merge(client: @client, directory_downloader: self) - end - def build_result(download_count, errors) - downloads = [download_count - errors.count, 0].max - if @abort_requested - msg = "failed to download directory: downloaded #{downloads} files, failed #{errors.count} files" + msg = "directory download failed: #{errors.map(&:message).join('; ')}" raise DirectoryDownloadError.new(msg, errors) else { - completed_downloads: downloads, + completed_downloads: [download_count - errors.count, 0].max, failed_downloads: errors.count, errors: errors.any? ? errors : nil }.compact end end + def handle_error(executor, opts) + return if opts[:ignore_failure] + + request_abort + executor.kill + end + def process_download_queue(producer, downloader, opts) # Separate executor for lightweight queuing tasks, # avoiding interference with main @executor lifecycle @@ -82,25 +83,28 @@ def process_download_queue(producer, downloader, opts) progress = DirectoryProgress.new(opts[:progress_callback]) if opts[:progress_callback] download_attempts = 0 errors = [] - producer.each do |object| - break if @abort_requested - - download_attempts += 1 - queue_executor.post(object) do |o| - dir_path = File.dirname(o[:path]) - FileUtils.mkdir_p(dir_path) unless dir_path == opts[:destination] || Dir.exist?(dir_path) - - downloader.download(o[:path], bucket: opts[:bucket], key: o[:key]) - progress&.call(File.size(o[:path])) - rescue StandardError => e - errors << e - request_abort unless opts[:ignore_failure] + begin + producer.each do |object| + break if @abort_requested + + download_attempts += 1 + queue_executor.post(object) do |o| + dir_path = File.dirname(o[:path]) + FileUtils.mkdir_p(dir_path) unless dir_path == opts[:destination] || Dir.exist?(dir_path) + + downloader.download(o[:path], bucket: opts[:bucket], key: o[:key]) + progress&.call(File.size(o[:path])) + rescue StandardError => e + errors << e + handle_error(queue_executor, opts) + end end + rescue StandardError => e + errors << e + handle_error(queue_executor, opts) end queue_executor.shutdown [download_attempts, errors] - ensure - queue_executor.shutdown if queue_executor.running? 
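      # With ignore_failure false, handle_error kills the queue executor on the
      # first failure, so work still sitting in its queue is dropped rather
      # than drained before build_result raises DirectoryDownloadError.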
end # @api private diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb index 6888565f441..68aa50c48f1 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/directory_uploader.rb @@ -67,7 +67,7 @@ def build_result(upload_count, errors) raise DirectoryUploadError.new(msg, errors) else { - completed_uploads: upload_count - errors.count, + completed_uploads: [upload_count - errors.count, 0].max, failed_uploads: errors.count, errors: errors.any? ? errors : nil }.compact From 0cb35cd018319b024ff1e6522c53211b5a002bd2 Mon Sep 17 00:00:00 2001 From: Juli Tera Date: Tue, 7 Oct 2025 11:12:53 -0700 Subject: [PATCH 35/35] Improve Executor --- gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb index e3a60a3a647..0fbdde2f61f 100644 --- a/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb +++ b/gems/aws-sdk-s3/lib/aws-sdk-s3/default_executor.rb @@ -41,10 +41,9 @@ def shutdown(timeout = nil) return true if @state == SHUTDOWN @state = SHUTTING_DOWN + @pool.size.times { @queue << :shutdown } end - @max_threads.times { @queue << :shutdown } - if timeout deadline = Time.now + timeout @pool.each do |thread|
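
Taken together, the series leaves TransferManager with directory-level transfers backed by the classes above. A minimal usage sketch, assuming the upload_directory/download_directory entry points forward these options to DirectoryUploader and DirectoryDownloader (that wiring lands in patches not shown here; bucket, paths, and callbacks are illustrative):

    require 'aws-sdk-s3'

    tm = Aws::S3::TransferManager.new

    # Recursive upload that records failures instead of aborting
    result = tm.upload_directory(
      '/tmp/reports',
      bucket: 'example-bucket',
      recursive: true,
      ignore_failure: true,
      progress_callback: ->(bytes, files) { puts "#{files} files, #{bytes} bytes" }
    )
    puts "uploaded #{result[:completed_uploads]}, failed #{result[:failed_uploads]}"

    # Mirror a prefix back down, keeping only .csv keys; an aborted run raises
    begin
      tm.download_directory(
        '/tmp/restore',
        bucket: 'example-bucket',
        s3_prefix: 'reports/',
        filter_callback: ->(key) { key.end_with?('.csv') }
      )
    rescue Aws::S3::DirectoryDownloadError => e
      warn "#{e.message} (#{e.errors.count} files failed)"
    end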