From 884eaf489e212fef31df1636d7b2bb877dc4eadd Mon Sep 17 00:00:00 2001 From: Morgan Killik Date: Sun, 16 Nov 2025 10:09:55 -0500 Subject: [PATCH 01/13] Update AWS SDK version --- cpp/thirdparty/versions.txt | 48 ++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index fd596f9a24d..bb17cc7d41e 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -25,34 +25,34 @@ ARROW_ABSL_BUILD_VERSION=20211102.0 ARROW_ABSL_BUILD_SHA256_CHECKSUM=dcf71b9cba8dc0ca9940c4b316a0c796be8fab42b070bb6b7cab62b48f0e66c4 -ARROW_AWS_C_AUTH_BUILD_VERSION=v0.9.0 -ARROW_AWS_C_AUTH_BUILD_SHA256_CHECKSUM=aa6e98864fefb95c249c100da4ae7aed36ba13a8a91415791ec6fad20bec0427 -ARROW_AWS_C_CAL_BUILD_VERSION=v0.9.2 -ARROW_AWS_C_CAL_BUILD_SHA256_CHECKSUM=f9f3bc6a069e2efe25fcdf73e4d2b16b5608c327d2eb57c8f7a8524e9e1fcad0 -ARROW_AWS_C_COMMON_BUILD_VERSION=v0.12.4 -ARROW_AWS_C_COMMON_BUILD_SHA256_CHECKSUM=0b7705a4d115663c3f485d353a75ed86e37583157585e5825d851af634b57fe3 +ARROW_AWS_C_AUTH_BUILD_VERSION=v0.9.1 +ARROW_AWS_C_AUTH_BUILD_SHA256_CHECKSUM=adae1e725d9725682366080b8bf8e49481650c436b846ceeb5efe955d5e03273 +ARROW_AWS_C_CAL_BUILD_VERSION=v0.9.10 +ARROW_AWS_C_CAL_BUILD_SHA256_CHECKSUM=a41b389e942fadd599a6a0f692b75480d663f1e702c0301177f00f365e0c9b94 +ARROW_AWS_C_COMMON_BUILD_VERSION=v0.12.5 +ARROW_AWS_C_COMMON_BUILD_SHA256_CHECKSUM=02d1ab905d43a33008a63f273b27dbe4859e9f090eac6f0e3eeaf8c64a083937 ARROW_AWS_C_COMPRESSION_BUILD_VERSION=v0.3.1 ARROW_AWS_C_COMPRESSION_BUILD_SHA256_CHECKSUM=d89fca17a37de762dc34f332d2da402343078da8dbd2224c46a11a88adddf754 -ARROW_AWS_C_EVENT_STREAM_BUILD_VERSION=v0.5.4 -ARROW_AWS_C_EVENT_STREAM_BUILD_SHA256_CHECKSUM=cef8b78e362836d15514110fb43a0a0c7a86b0a210d5fe25fd248a82027a7272 -ARROW_AWS_C_HTTP_BUILD_VERSION=v0.10.2 -ARROW_AWS_C_HTTP_BUILD_SHA256_CHECKSUM=048d9d683459ade363fd7cc448c2b6329c78f67a2a0c0cb61c16de4634a2fc6b -ARROW_AWS_C_IO_BUILD_VERSION=v0.19.1 -ARROW_AWS_C_IO_BUILD_SHA256_CHECKSUM=f2fea0c066924f7fe3c2b1c7b2fa9be640f5b16a6514854226330e63a1faacd0 -ARROW_AWS_C_MQTT_BUILD_VERSION=v0.13.1 -ARROW_AWS_C_MQTT_BUILD_SHA256_CHECKSUM=c54d02c1e46f55bae8d5e6f9c4b0d78d84c1c9d9ac16ba8d78c3361edcd8b5bb -ARROW_AWS_C_S3_BUILD_VERSION=v0.8.1 -ARROW_AWS_C_S3_BUILD_SHA256_CHECKSUM=c8b09780691d2b94e50d101c68f01fa2d1c3debb0ff3aed313d93f0d3c9af663 +ARROW_AWS_C_EVENT_STREAM_BUILD_VERSION=v0.5.7 +ARROW_AWS_C_EVENT_STREAM_BUILD_SHA256_CHECKSUM=5d92abed2ed89cc1efaba3963e888d9df527296f1dbfe21c569f84ea731aa3c2 +ARROW_AWS_C_HTTP_BUILD_VERSION=v0.10.7 +ARROW_AWS_C_HTTP_BUILD_SHA256_CHECKSUM=ce9e71c3eae67b1c6c0149278e0d0929a7d928c3547de64999430c8592864ad4 +ARROW_AWS_C_IO_BUILD_VERSION=v0.23.3 +ARROW_AWS_C_IO_BUILD_SHA256_CHECKSUM=cdcb31b694fc28ba96237ee33a742679daf2dcabfd41464f8a68fbd913907085 +ARROW_AWS_C_MQTT_BUILD_VERSION=v0.13.3 +ARROW_AWS_C_MQTT_BUILD_SHA256_CHECKSUM=1dfc11d6b3dc1a6d408df64073e8238739b4c50374078d36d3f2d30491d15527 +ARROW_AWS_C_S3_BUILD_VERSION=v0.10.1 +ARROW_AWS_C_S3_BUILD_SHA256_CHECKSUM=cc656740e8e566b5c0233b1e96c8a91fc10d31872b3fcea6831c57902a746386 ARROW_AWS_C_SDKUTILS_BUILD_VERSION=v0.2.4 ARROW_AWS_C_SDKUTILS_BUILD_SHA256_CHECKSUM=493cbed4fa57e0d4622fcff044e11305eb4fc12445f32c8861025597939175fc ARROW_AWS_CHECKSUMS_BUILD_VERSION=v0.2.7 ARROW_AWS_CHECKSUMS_BUILD_SHA256_CHECKSUM=178e8398d98111f29150f7813a70c20ad97ab30be0de02525440355fe84ccb1d -ARROW_AWS_CRT_CPP_BUILD_VERSION=v0.32.8 
-ARROW_AWS_CRT_CPP_BUILD_SHA256_CHECKSUM=db44260452a0296341fb8e7b987e4c328f08f7829b9f1c740fed9c963e081e93 -ARROW_AWS_LC_BUILD_VERSION=v1.52.1 -ARROW_AWS_LC_BUILD_SHA256_CHECKSUM=fe552e3c3522f73afc3c30011745c431c633f7b4e25dcd7b38325f194a7b3b75 -ARROW_AWSSDK_BUILD_VERSION=1.11.587 -ARROW_AWSSDK_BUILD_SHA256_CHECKSUM=b9944ba9905a68d6e53abb4f36ab2b3bd18ac88d8571647bb9f2b8026b76f8cd +ARROW_AWS_CRT_CPP_BUILD_VERSION=v0.35.2 +ARROW_AWS_CRT_CPP_BUILD_SHA256_CHECKSUM=9d53d7018994a5f7fc879d397032b72ad88b1585a8cc07e2c8c339ae427f0577 +ARROW_AWS_LC_BUILD_VERSION=v1.64.0 +ARROW_AWS_LC_BUILD_SHA256_CHECKSUM=54646e5956f5394473ebe32741d2bf1509f2b556424899aed116647856f1e041 +ARROW_AWSSDK_BUILD_VERSION=1.11.691 +ARROW_AWSSDK_BUILD_SHA256_CHECKSUM=1904c0c0306944fdd7be55ff1d3337f272b5281bd7e7d967eb4b2ae1a99e2ae0 # Despite the confusing version name this is still the whole Azure SDK for C++ including core, keyvault, storage-common, etc. ARROW_AZURE_SDK_BUILD_VERSION=azure-identity_1.9.0 ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM=97065bfc971ac8df450853ce805f820f52b59457bd7556510186a1569502e4a1 @@ -108,8 +108,8 @@ ARROW_SNAPPY_BUILD_VERSION=1.2.2 ARROW_SNAPPY_BUILD_SHA256_CHECKSUM=90f74bc1fbf78a6c56b3c4a082a05103b3a56bb17bca1a27e052ea11723292dc ARROW_SUBSTRAIT_BUILD_VERSION=v0.44.0 ARROW_SUBSTRAIT_BUILD_SHA256_CHECKSUM=f989a862f694e7dbb695925ddb7c4ce06aa6c51aca945105c075139aed7e55a2 -ARROW_S2N_TLS_BUILD_VERSION=v1.5.23 -ARROW_S2N_TLS_BUILD_SHA256_CHECKSUM=81961ea5ae9313c987edfa579306ad4500bedfbf10caf84d8a5dcfc42aaf591f +ARROW_S2N_TLS_BUILD_VERSION=v1.6.0 +ARROW_S2N_TLS_BUILD_SHA256_CHECKSUM=25f1f14092438d0919d60c4357990e1d2b734e3ffa9d8ecd86590abfd9407b00 ARROW_THRIFT_BUILD_VERSION=0.22.0 ARROW_THRIFT_BUILD_SHA256_CHECKSUM=794a0e455787960d9f27ab92c38e34da27e8deeda7a5db0e59dc64a00df8a1e5 ARROW_UTF8PROC_BUILD_VERSION=v2.10.0 From 6cd28290758f167b144a3102abb9e9bfc7b4b912 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 17 Nov 2025 02:45:32 -0800 Subject: [PATCH 02/13] GH-48142: [CI] Disallow scheduled GitHub Actions run on forked repos (#48143) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Closes #48142 Prevents Github workflows to periodically run on forked repos ### What changes are included in this PR? Add a gate to the two github workflows to prevent them from periodically running on forked repos. ### Are these changes tested? Yes, manually ### Are there any user-facing changes? No **This PR includes breaking changes to public APIs.** (If there are any breaking changes to public APIs, please explain which changes are breaking. If not, you can remove this.) **This PR contains a "Critical Fix".** (If the changes fix either (a) a security vulnerability, (b) a bug that caused incorrect or invalid data to be produced, or (c) a bug that causes a crash (even when the API contract is upheld), please provide explanation. If not, you can remove this.) 
* GitHub Issue: #48142 Authored-by: Kevin Liu Signed-off-by: Raúl Cumplido --- .github/workflows/cpp_extra.yml | 1 + .github/workflows/package_linux.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 8ee50017138..984441d78db 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -79,6 +79,7 @@ permissions: jobs: check-labels: + if: github.event_name != 'schedule' || github.repository == 'apache/arrow' uses: ./.github/workflows/check_labels.yml secrets: inherit with: diff --git a/.github/workflows/package_linux.yml b/.github/workflows/package_linux.yml index ba863894283..92889ab6cb5 100644 --- a/.github/workflows/package_linux.yml +++ b/.github/workflows/package_linux.yml @@ -66,6 +66,7 @@ permissions: jobs: check-labels: + if: github.event_name != 'schedule' || github.repository == 'apache/arrow' uses: ./.github/workflows/check_labels.yml secrets: inherit with: From 857e1746002fe70a3bc3ad61b843783265b0f1b9 Mon Sep 17 00:00:00 2001 From: Bogdan Romenskii <77816165+rmnskb@users.noreply.github.com> Date: Mon, 17 Nov 2025 11:47:11 +0100 Subject: [PATCH 03/13] GH-48096 [Python][Parquet] Expose new WriterProperties::max_rows_per_page to Python bindings (#48101) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change See #48096, exposes `parquet.WriterProperties max_rows_per_page` argument to Python's API. ### What changes are included in this PR? Added the argument ### Are these changes tested? Yes, since the metadata doesn't have any info about the number of pages, a naive end-to-end test was used to ensure the implementation correctness. ### Are there any user-facing changes? The ability to set the `max_rows_per_page` directly from PyArrow. 
* GitHub Issue: #48096 Authored-by: Bogdan Romenskii Signed-off-by: Raúl Cumplido --- python/pyarrow/_dataset_parquet.pyx | 2 + python/pyarrow/_parquet.pxd | 1 + python/pyarrow/_parquet.pyx | 6 ++ python/pyarrow/includes/libparquet.pxd | 1 + python/pyarrow/parquet/core.py | 8 +++ .../tests/parquet/test_parquet_writer.py | 57 +++++++++++++++++++ 6 files changed, 75 insertions(+) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 9405b5d8c54..534f7790923 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -646,6 +646,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): version=self._properties["version"], write_statistics=self._properties["write_statistics"], data_page_size=self._properties["data_page_size"], + max_rows_per_page=self._properties["max_rows_per_page"], compression_level=self._properties["compression_level"], use_byte_stream_split=( self._properties["use_byte_stream_split"] @@ -695,6 +696,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): version="2.6", write_statistics=None, data_page_size=None, + max_rows_per_page=None, compression_level=None, use_byte_stream_split=False, column_encoding=None, diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 704eb06cc38..ef9ed576570 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -44,6 +44,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( version=*, write_statistics=*, data_page_size=*, + max_rows_per_page=*, compression_level=*, use_byte_stream_split=*, column_encoding=*, diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 14cd3e363a4..66e02940103 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1984,6 +1984,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( version=None, write_statistics=None, data_page_size=None, + max_rows_per_page=None, compression_level=None, use_byte_stream_split=False, column_encoding=None, @@ -2129,6 +2130,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( if data_page_size is not None: props.data_pagesize(data_page_size) + if max_rows_per_page is not None: + props.max_rows_per_page(max_rows_per_page) + if write_batch_size is not None: props.write_batch_size(write_batch_size) @@ -2300,6 +2304,7 @@ cdef class ParquetWriter(_Weakrefable): use_deprecated_int96_timestamps=False, coerce_timestamps=None, data_page_size=None, + max_rows_per_page=None, allow_truncated_timestamps=False, compression_level=None, use_byte_stream_split=False, @@ -2340,6 +2345,7 @@ cdef class ParquetWriter(_Weakrefable): version=version, write_statistics=write_statistics, data_page_size=data_page_size, + max_rows_per_page=max_rows_per_page, compression_level=compression_level, use_byte_stream_split=use_byte_stream_split, column_encoding=column_encoding, diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd index 42d48ba050f..81901a00acd 100644 --- a/python/pyarrow/includes/libparquet.pxd +++ b/python/pyarrow/includes/libparquet.pxd @@ -492,6 +492,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* enable_store_decimal_as_integer() Builder* disable_store_decimal_as_integer() Builder* data_pagesize(int64_t size) + Builder* max_rows_per_page(int64_t max_rows) Builder* encoding(ParquetEncoding encoding) Builder* encoding(const c_string& path, ParquetEncoding encoding) diff --git a/python/pyarrow/parquet/core.py 
b/python/pyarrow/parquet/core.py index 5f62a3fc4f1..676bc445238 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -795,6 +795,10 @@ def _sanitize_table(table, new_schema, flavor): Set a target threshold for the approximate encoded size of data pages within a column chunk (in bytes). If None, use the default data page size of 1MByte. +max_rows_per_page : int, default None + Maximum number of rows per page within a column chunk. + If None, use the default of 20000. + Smaller values reduce memory usage during reads but increase metadata overhead. flavor : {'spark'}, default None Sanitize schema or set other compatibility options to work with various target systems. @@ -1042,6 +1046,7 @@ def __init__(self, where, schema, filesystem=None, sorting_columns=None, store_decimal_as_integer=False, write_time_adjusted_to_utc=False, + max_rows_per_page=None, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark @@ -1096,6 +1101,7 @@ def __init__(self, where, schema, filesystem=None, sorting_columns=sorting_columns, store_decimal_as_integer=store_decimal_as_integer, write_time_adjusted_to_utc=write_time_adjusted_to_utc, + max_rows_per_page=max_rows_per_page, **options) self.is_open = True @@ -1971,6 +1977,7 @@ def write_table(table, where, row_group_size=None, version='2.6', sorting_columns=None, store_decimal_as_integer=False, write_time_adjusted_to_utc=False, + max_rows_per_page=None, **kwargs): # Implementor's note: when adding keywords here / updating defaults, also # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions @@ -2003,6 +2010,7 @@ def write_table(table, where, row_group_size=None, version='2.6', sorting_columns=sorting_columns, store_decimal_as_integer=store_decimal_as_integer, write_time_adjusted_to_utc=write_time_adjusted_to_utc, + max_rows_per_page=max_rows_per_page, **kwargs) as writer: writer.write_table(table, row_group_size=row_group_size) except Exception: diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index 3e7352428c9..a49441f09f4 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -487,3 +487,60 @@ def test_arrow_writer_props_time_adjusted_to_utc( result.validate(full=True) assert result.equals(table) + + +@pytest.mark.parametrize( + "max_rows_per_page", + [1, 10, 100, 1_000, None], +) +def test_writer_props_max_rows_per_page(tempdir, max_rows_per_page): + # GH-48096 + filename = tempdir / "max_rows_per_page.parquet" + + table = pa.table({ + "x": pa.array([1, 2, 3, 4, 5, 6, 7], type=pa.int8()), + "y": pa.array([11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0], type=pa.float16()), + }) + + schema = pa.schema([ + ("x", pa.int8()), + ("y", pa.float16()), + ]) + + with pq.ParquetWriter( + where=filename, + schema=schema, + max_rows_per_page=max_rows_per_page, + ) as writer: + writer.write_table(table) + + result = pq.read_table(filename, schema=schema) + + result.validate(full=True) + + assert result.equals(table) + + +def test_writer_props_max_rows_per_page_file_size(tempdir): + # GH-48096 + table = pa.table({ + "x": pa.array(range(1_000_000)) + }) + + local = fs.LocalFileSystem() + file_infos = [] + + for max_rows in (1_000, 10_000): + path = f"{tempdir}/max_rows_per_page_{max_rows}.parquet" + + with pq.ParquetWriter( + where=path, + schema=table.schema, + max_rows_per_page=max_rows, + ) as writer: + writer.write_table(table) + + 
file_infos.append(local.get_file_info(path)) + + # A smaller maximum rows parameter should produce a larger file + assert file_infos[0].size > file_infos[1].size From a63203c43f7282c701359ee7e5d0795a0420012c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 17 Nov 2025 16:03:46 +0100 Subject: [PATCH 04/13] GH-48112: [C++][Parquet] Use more accurate data length estimate when decoding PLAIN BYTE_ARRAY data (#48113) ### Rationale for this change Avoid reserving too many data bytes when decoding a PLAIN BYTE_ARRAY Parquet column as Arrow Binary or LargeBinary. ### Are these changes tested? By existing tests. ### Are there any user-facing changes? No. * GitHub Issue: #48112 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/decoder.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index d0a857dd22a..431f2d26042 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -754,6 +754,12 @@ class PlainByteArrayDecoder : public PlainDecoder { int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out, int* out_values_decoded) { + // We're going to decode up to `num_values - null_count` PLAIN values, + // and each value has a 4-byte length header that doesn't count for the + // Arrow binary data length. + int64_t estimated_data_length = + std::max(0, len_ - 4 * (num_values - null_count)); + auto visit_binary_helper = [&](auto* helper) { int values_decoded = 0; @@ -772,11 +778,12 @@ class PlainByteArrayDecoder : public PlainDecoder { "Invalid or truncated PLAIN-encoded BYTE_ARRAY data"); } RETURN_NOT_OK( - helper->AppendValue(data_ + 4, value_len, - /*estimated_remaining_data_length=*/len_)); + helper->AppendValue(data_ + 4, value_len, estimated_data_length)); auto increment = value_len + 4; data_ += increment; len_ -= increment; + estimated_data_length -= value_len; + DCHECK_GE(estimated_data_length, 0); } values_decoded += static_cast(run_length); return Status::OK(); @@ -790,8 +797,8 @@ class PlainByteArrayDecoder : public PlainDecoder { return Status::OK(); }; - return DispatchArrowBinaryHelper(out, num_values, len_, - visit_binary_helper); + return DispatchArrowBinaryHelper( + out, num_values, estimated_data_length, visit_binary_helper); } template From f55b5864a9599ae2a330872c294ce52ae5c14259 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Mon, 17 Nov 2025 19:02:40 -0500 Subject: [PATCH 05/13] GH-48152: [CI][MATLAB] Bump MATLAB release to R2025b in the MATLAB GitHub Actions Workflow (#48153) Thanks for opening a pull request! ### Rationale for this change MATLAB R2025b is the latest available version of MATLAB as of November 2025. We are currently building against MATLAB R2025a in CI, and would like to build and test the MATLAB Interface to Apache Arrow against the latest version of MATLAB. ### What changes are included in this PR? Updated `.github/workfows/matlab.yml` to build/test the MATLAB Interface to Apache Arrow against MATLAB `R2025b`. ### Are these changes tested? Yes. MATLAB CI workflow [successfully passed on all platforms in mathworks/arrow](https://github.com/apache/arrow/actions/runs/19440402671). ### Are there any user-facing changes? Yes, the MATLAB Interface to Apache Arrow will now be built against `R2025b` in CI. 
* GitHub Issue: #48152 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- .github/workflows/matlab.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index fbdac4a8b06..41a50d7e8c7 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -59,7 +59,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2025a + release: R2025b - name: Install ccache run: sudo apt-get install ccache - name: Setup ccache @@ -107,7 +107,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2025a + release: R2025b - name: Install ccache run: brew install ccache - name: Setup ccache @@ -146,7 +146,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2025a + release: R2025b - name: Download Timezone Database shell: bash run: ci/scripts/download_tz_database.sh From 1352de1c58de85ffbb2ef2bb4099d817056cc164 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Mon, 17 Nov 2025 19:03:09 -0500 Subject: [PATCH 06/13] GH-48154: [MATAB][Packaging] Update MATLAB crossbow workflow to build against MATLAB `R2025b` (#48155) Thanks for opening a pull request! ### Rationale for this change MATLAB [R2025b](https://www.mathworks.com/products/new_products/latest_features.html) is now available for use with the [matlab-actions/setup-matlab](https://github.com/matlab-actions/setup-matlab) GitHub Action. We should update the [crossbow packaging workflows for the MATLAB MLTBX files](https://github.com/apache/arrow/blob/main/dev/tasks/matlab/github.yml) to build against R2025b. ### What changes are included in this PR? 1. Updated the `dev/tasks/matlab/github.yml` crossbow packaging workflow to build the MATLAB MLTBX files against MATLAB R2025b. ### Are these changes tested? Yes. The MATLAB crossbow packaging workflow [passed on all three platforms](https://github.com/ursacomputing/crossbow/actions/runs/19440978027/job/55623923047). ### Are there any user-facing changes? Yes. The MATLAB MLTBX release artifacts will now be built against R2025b. 
* GitHub Issue: #48154 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- dev/tasks/matlab/github.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml index 6fdb313cfb2..77f3056c362 100644 --- a/dev/tasks/matlab/github.yml +++ b/dev/tasks/matlab/github.yml @@ -31,7 +31,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2025a + release: R2025b - name: Build MATLAB Interface env: {{ macros.github_set_sccache_envvars()|indent(8) }} @@ -73,7 +73,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2025a + release: R2025b - name: Build MATLAB Interface env: {{ macros.github_set_sccache_envvars()|indent(8) }} @@ -99,7 +99,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2025a + release: R2025b - name: Install sccache shell: bash run: arrow/ci/scripts/install_sccache.sh pc-windows-msvc $(pwd)/sccache @@ -146,7 +146,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2025a + release: R2025b - name: Run commands env: MATLABPATH: arrow/matlab/tools From 99c1315ad8a8aad50a044221202d9c7d9905b3a8 Mon Sep 17 00:00:00 2001 From: feuillatte <97880140+feuillatte@users.noreply.github.com> Date: Mon, 17 Nov 2025 16:26:59 -0800 Subject: [PATCH 07/13] GH-48139: [C++] Allow compilation for QNX versions up to 8 (#48140) ### Rationale for this change The endianness header inclusion preprocessor selection logic in `cpp/src/arrow/util/endian.h` currently prevents compling Arrow C++ libraries for the QNX operating system. Modern QNX operating system toolchains are detectable via the preprocessor defines `__QNXNTO__` (versions 5 - 7) and `__QNX__` (version 8), which are not currently considered, but the basis for the logic is already implemented for AIX. ### What changes are included in this PR? Extend the current preprocessor `!defined()` check for AIX with the equivalent for QNX in `cpp/src/arrow/util/endian.h`. ### Are these changes tested? The change is verified using the proprietary QNX SDP 7.1 GCC 8 based toolchain and tested on QNX 7. Applying the patch allows building for QNX. ### Are there any user-facing changes? No; the changes purely relate to OS-specific toolchain detection. * GitHub Issue: #48139 Authored-by: Lilja Tamminen Signed-off-by: Sutou Kouhei --- cpp/src/arrow/util/endian.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/endian.h b/cpp/src/arrow/util/endian.h index fcc138828e7..1edb828f9b5 100644 --- a/cpp/src/arrow/util/endian.h +++ b/cpp/src/arrow/util/endian.h @@ -24,7 +24,7 @@ # include // IWYU pragma: keep # elif defined(sun) || defined(__sun) # include // IWYU pragma: keep -# elif !defined(_AIX) +# elif !defined(_AIX) && !defined(__QNXNTO__) && !defined(__QNX__) # include // IWYU pragma: keep # endif # From 0178d912a0850fec462b946fc087f04ab530fe7f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Nov 2025 11:03:52 +0000 Subject: [PATCH 08/13] GH-48129: [CI] Stale issues bot only looks at 30 issues at a time (#48130) ### Rationale for this change Stale issues bot only analyses 30 issues at a time due to defaults. ### What changes are included in this PR? Increase them to 1000 and run daily to catch any beyond this value. ### Are these changes tested? Yeah, I temporarily ran on this PR and looks like everything which is permitted on PRs did run. ### Are there any user-facing changes? 
No * GitHub Issue: #48129 Authored-by: Nic Crane Signed-off-by: Nic Crane --- .github/workflows/stale.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 7dfa5fd02aa..45627dc88f8 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -18,7 +18,8 @@ name: "Close stale PRs" on: schedule: - - cron: "0 11 * * 3" # Run once per week on Wednesday at 11:00 AM UTC + - cron: "10 11 * * *" # Run daily at 11:10 AM UTC + workflow_dispatch: jobs: close-stale-prs: @@ -36,6 +37,7 @@ jobs: # exclude issues days-before-issue-stale: -1 days-before-issue-close: -1 + operations-per-run: 1000 repo-token: ${{ secrets.GITHUB_TOKEN }} close-stale-issues-usage: runs-on: ubuntu-latest @@ -53,6 +55,7 @@ jobs: stale-issue-label: "Status: stale-warning" days-before-issue-stale: 365 days-before-issue-close: 14 + operations-per-run: 1000 repo-token: ${{ secrets.GITHUB_TOKEN }} close-stale-issues-enhancement: runs-on: ubuntu-latest @@ -71,4 +74,5 @@ jobs: stale-issue-label: "Status: stale-warning" days-before-issue-stale: 365 days-before-issue-close: 14 + operations-per-run: 1000 repo-token: ${{ secrets.GITHUB_TOKEN }} From 7609ecda2db5121c17d774137868fa528876001e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Nov 2025 17:43:18 +0000 Subject: [PATCH 09/13] GH-48162: [CI] Stale issues bot hit secondary rate limit and did not complete (#48165) ### Rationale for this change Stale issues bot hits rate limit on API and doesn't post comments ### What changes are included in this PR? Reduce how many operations it can complete (I went and manually added comments myself to the issues/PRs with labels but no comments) ### Are these changes tested? No ### Are there any user-facing changes? No * GitHub Issue: #48162 Authored-by: Nic Crane Signed-off-by: Nic Crane --- .github/workflows/stale.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 45627dc88f8..2e28538e3e6 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -18,7 +18,7 @@ name: "Close stale PRs" on: schedule: - - cron: "10 11 * * *" # Run daily at 11:10 AM UTC + - cron: "10 11 * * *" # Run daily at 11:10 UTC workflow_dispatch: jobs: @@ -37,7 +37,6 @@ jobs: # exclude issues days-before-issue-stale: -1 days-before-issue-close: -1 - operations-per-run: 1000 repo-token: ${{ secrets.GITHUB_TOKEN }} close-stale-issues-usage: runs-on: ubuntu-latest @@ -55,7 +54,6 @@ jobs: stale-issue-label: "Status: stale-warning" days-before-issue-stale: 365 days-before-issue-close: 14 - operations-per-run: 1000 repo-token: ${{ secrets.GITHUB_TOKEN }} close-stale-issues-enhancement: runs-on: ubuntu-latest @@ -74,5 +72,4 @@ jobs: stale-issue-label: "Status: stale-warning" days-before-issue-stale: 365 days-before-issue-close: 14 - operations-per-run: 1000 repo-token: ${{ secrets.GITHUB_TOKEN }} From b8207575bfb49c4554cb38249ab1c2c9d15c9261 Mon Sep 17 00:00:00 2001 From: Shadab Zafar Date: Wed, 19 Nov 2025 06:31:57 +0530 Subject: [PATCH 10/13] MINOR: [C++][Parquet][Docs] Increase chunk_size in docs (#40705) ### Rationale for this change Is there a reason we're using a low value here. All other examples use `128*1024` or `64*1024`. I was stumped by this as I used it without really reading about the parameter and spent a day figuring out why my parquet writes were so slow. ### What changes are included in this PR? 
Increase `chunk_size` to `64*1024` ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. Authored-by: Shadab Zafar Signed-off-by: Sutou Kouhei --- cpp/examples/arrow/parquet_read_write.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/arrow/parquet_read_write.cc b/cpp/examples/arrow/parquet_read_write.cc index 24650189663..b0765e1da61 100644 --- a/cpp/examples/arrow/parquet_read_write.cc +++ b/cpp/examples/arrow/parquet_read_write.cc @@ -120,7 +120,7 @@ arrow::Status WriteFullFile(std::string path_to_file) { ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table.get(), arrow::default_memory_pool(), outfile, - /*chunk_size=*/3, props, arrow_props)); + /*chunk_size=*/64*1024, props, arrow_props)); return arrow::Status::OK(); } From e0876ae793fe392e1017fd61e0c09c00325f6e6a Mon Sep 17 00:00:00 2001 From: paultiq <104510378+paultiq@users.noreply.github.com> Date: Wed, 19 Nov 2025 03:19:01 -0500 Subject: [PATCH 11/13] GH-47823: [Python] Use PyWeakref_GetRef instead of PyWeakref_GET_OBJECT (Python 3.15) (#48027) ### Rationale for this change pyarrow builds fail on CPython 3.15 due to the 3.15's removal of PyWeakref_GetObject, used in extension_types.cc. This was deprecated in 3.13. A backport is available in the already-vendored [pythoncapi_compat](https://github.com/apache/arrow/blob/main/python/pyarrow/src/arrow/python/vendored/pythoncapi_compat.h). Fixes https://github.com/apache/arrow/issues/47823. To be clear: this fixes only the build issue reported in the issue, not "3.15 support". ### What changes are included in this PR? Replaces the sole use of PyWeakref_GET_OBJECT with the backported version of PyWeakref_GetRef. This follows the recommendation from [Pending removal in Python 3.15](https://docs.python.org/3/deprecations/c-api-pending-removal-in-3.15.html) > [PyWeakref_GetObject()](https://docs.python.org/3/c-api/weakref.html#c.PyWeakref_GetObject) and [PyWeakref_GET_OBJECT()](https://docs.python.org/3/c-api/weakref.html#c.PyWeakref_GET_OBJECT): Use [PyWeakref_GetRef()](https://docs.python.org/3/c-api/weakref.html#c.PyWeakref_GetRef) instead. The [pythoncapi-compat project](https://github.com/python/pythoncapi-compat/) can be used to get [PyWeakref_GetRef()](https://docs.python.org/3/c-api/weakref.html#c.PyWeakref_GetRef) on Python 3.12 and older. ### Are these changes tested? Build and tested on a local Ubuntu 24.04 build. ### Are there any user-facing changes? No user-facing changes. 
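As a rough pure-Python analogy (not Arrow code; the actual change is the C++ diff below), the two outcomes that the `PyWeakref_GetRef`-based code distinguishes look like this:

```python
import weakref

class Storage:
    pass

obj = Storage()
ref = weakref.ref(obj)

# Referent still alive: dereferencing the weakref yields a strong
# reference, analogous to PyWeakref_GetRef() returning 1 and handing
# back a new strong reference to the cached instance.
assert ref() is obj

del obj
# Referent collected (CPython's reference counting frees it immediately):
# the weakref now yields None, analogous to PyWeakref_GetRef() returning
# 0, in which case the extension type instance has to be reconstructed
# from its serialized form.
assert ref() is None
```
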
* GitHub Issue: #47823 Authored-by: paultiq <104510378+paultiq@users.noreply.github.com> Signed-off-by: AlenkaF --- .../pyarrow/src/arrow/python/extension_type.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/src/arrow/python/extension_type.cc b/python/pyarrow/src/arrow/python/extension_type.cc index 8439ecf8583..8c3f3f1d8d0 100644 --- a/python/pyarrow/src/arrow/python/extension_type.cc +++ b/python/pyarrow/src/arrow/python/extension_type.cc @@ -22,6 +22,7 @@ #include "arrow/python/extension_type.h" #include "arrow/python/helpers.h" #include "arrow/python/pyarrow.h" +#include "arrow/python/vendored/pythoncapi_compat.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -164,15 +165,18 @@ PyObject* PyExtensionType::GetInstance() const { return nullptr; } ARROW_DCHECK(PyWeakref_CheckRef(type_instance_.obj())); - PyObject* inst = PyWeakref_GET_OBJECT(type_instance_.obj()); - if (inst != Py_None) { - // Cached instance still alive - Py_INCREF(inst); + PyObject* inst = NULL; + int result = PyWeakref_GetRef(type_instance_.obj(), &inst); + if (result == 1) { + // Alive: inst is a new strong reference return inst; - } else { - // Must reconstruct from serialized form + } else if (result == 0) { + // Weakref is dead, must reconstruct from serialized form // XXX cache again? return DeserializeExtInstance(type_class_.obj(), storage_type_, serialized_); + } else { + // -1 = exception + return nullptr; } } From fab52dbd033009a83e4e2c3a98bb85e72f4ddedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 19 Nov 2025 11:41:41 +0100 Subject: [PATCH 12/13] GH-48163: [CI][Docs] Update preview docs task S3 secret to use (#48164) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change The voltron data AWS account will be closed soon, we have to update to newly created bucket under the AWS account for Arrow. ### What changes are included in this PR? Migrate credentials for newly created bucket and update URL where dashboard will be deployed. ### Are these changes tested? Yes, via archery. ### Are there any user-facing changes? 
No, only devs where the URL is different * GitHub Issue: #48163 Lead-authored-by: Raúl Cumplido Co-authored-by: Rok Mihevc Signed-off-by: Rok Mihevc --- .../fixtures/chat-report-extra-message-failure.txt | 2 +- .../fixtures/chat-report-extra-message-success.txt | 2 +- .../archery/crossbow/tests/fixtures/chat-report.txt | 2 +- .../archery/crossbow/tests/fixtures/email-report.txt | 2 +- .../archery/templates/chat_nightly_report.txt.j2 | 2 +- .../archery/templates/email_nightly_report.txt.j2 | 2 +- dev/tasks/docs/github.linux.yml | 11 ++++++----- r/PACKAGING.md | 2 +- 8 files changed, 13 insertions(+), 12 deletions(-) diff --git a/dev/archery/archery/crossbow/tests/fixtures/chat-report-extra-message-failure.txt b/dev/archery/archery/crossbow/tests/fixtures/chat-report-extra-message-failure.txt index e5565eb8754..25cbdb8672c 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/chat-report-extra-message-failure.txt +++ b/dev/archery/archery/crossbow/tests/fixtures/chat-report-extra-message-failure.txt @@ -1,5 +1,5 @@ -* for * +* for * :x: *1 failed jobs* - diff --git a/dev/archery/archery/crossbow/tests/fixtures/chat-report-extra-message-success.txt b/dev/archery/archery/crossbow/tests/fixtures/chat-report-extra-message-success.txt index b5a0d48805c..721d530b686 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/chat-report-extra-message-success.txt +++ b/dev/archery/archery/crossbow/tests/fixtures/chat-report-extra-message-success.txt @@ -1,5 +1,5 @@ -* for * +* for * :tada: *4 successful jobs* diff --git a/dev/archery/archery/crossbow/tests/fixtures/chat-report.txt b/dev/archery/archery/crossbow/tests/fixtures/chat-report.txt index e4c6bd8a521..367ca742e1c 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/chat-report.txt +++ b/dev/archery/archery/crossbow/tests/fixtures/chat-report.txt @@ -1,5 +1,5 @@ -* for * +* for * :x: *1 failed jobs* - diff --git a/dev/archery/archery/crossbow/tests/fixtures/email-report.txt b/dev/archery/archery/crossbow/tests/fixtures/email-report.txt index 4480b5b515d..c29cafd3938 100644 --- a/dev/archery/archery/crossbow/tests/fixtures/email-report.txt +++ b/dev/archery/archery/crossbow/tests/fixtures/email-report.txt @@ -4,7 +4,7 @@ Subject: [NIGHTLY] Arrow Build Report for Job ursabot-1: 2 failed, 1 pending Arrow Build Report for Job ursabot-1 -See http://crossbow.voltrondata.com/ for more information. +See https://s3.amazonaws.com/arrow-data/index.html for more information. All tasks: https://github.com/apache/crossbow/branches/all?query=ursabot-1 diff --git a/dev/archery/archery/templates/chat_nightly_report.txt.j2 b/dev/archery/archery/templates/chat_nightly_report.txt.j2 index ab68f201ab2..3989df9b3b0 100644 --- a/dev/archery/archery/templates/chat_nightly_report.txt.j2 +++ b/dev/archery/archery/templates/chat_nightly_report.txt.j2 @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. 
#} -* for <{{ report.url(report.job.branch) }}|{{ report.job.branch }}>* +* for <{{ report.url(report.job.branch) }}|{{ report.job.branch }}>* {% if report.tasks_by_state["failure"] %} :x: *{{ report.tasks_by_state["failure"] | length }} failed jobs* {% for task_name, task in report.tasks_by_state["failure"] | dictsort -%} diff --git a/dev/archery/archery/templates/email_nightly_report.txt.j2 b/dev/archery/archery/templates/email_nightly_report.txt.j2 index e3345bdb073..bc040734b03 100644 --- a/dev/archery/archery/templates/email_nightly_report.txt.j2 +++ b/dev/archery/archery/templates/email_nightly_report.txt.j2 @@ -24,7 +24,7 @@ Subject: [NIGHTLY] Arrow Build Report for Job {{report.job.branch}}: {{ (report. Arrow Build Report for Job {{ report.job.branch }} -See http://crossbow.voltrondata.com/ for more information. +See https://s3.amazonaws.com/arrow-data/index.html for more information. All tasks: {{ report.url(report.job.branch) }} {% if report.tasks_by_state["failure"] %} diff --git a/dev/tasks/docs/github.linux.yml b/dev/tasks/docs/github.linux.yml index 5863d68d2c8..29b7b76c690 100644 --- a/dev/tasks/docs/github.linux.yml +++ b/dev/tasks/docs/github.linux.yml @@ -57,14 +57,15 @@ jobs: - name: Upload preview to S3 env: {%- raw %} - AWS_ACCESS_KEY_ID: ${{ secrets.CROSSBOW_DOCS_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.CROSSBOW_DOCS_AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: ${{ secrets.CROSSBOW_DOCS_S3_BUCKET_REGION }} - BUCKET: ${{ secrets.CROSSBOW_DOCS_S3_BUCKET }} + AWS_ACCESS_KEY_ID: ${{ secrets.CROSSBOW_DOCS_ARROW_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.CROSSBOW_DOCS_ARROW_AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: ${{ secrets.CROSSBOW_DOCS_ARROW_S3_BUCKET_REGION }} + BUCKET: ${{ secrets.CROSSBOW_DOCS_ARROW_S3_BUCKET }} {% endraw %} run: | aws s3 cp build/docs/ $BUCKET/pr_docs/{{ pr_number }}/ --recursive - echo ":open_book: You can find the preview here: http://crossbow.voltrondata.com/pr_docs/{{ pr_number }}" >> $GITHUB_STEP_SUMMARY + echo ":open_book: You can find the preview here: https://s3.amazonaws.com/arrow-data/pr_docs/{{ pr_number }}/index.html" >> $GITHUB_STEP_SUMMARY + {% endif %} - name: Prepare Docs artifacts run: | diff --git a/r/PACKAGING.md b/r/PACKAGING.md index b8815e6a526..56410c74208 100644 --- a/r/PACKAGING.md +++ b/r/PACKAGING.md @@ -26,7 +26,7 @@ For a high-level overview of the Arrow release process see the [Apache Arrow Rel - [ ] [Create a GitHub issue](https://github.com/apache/arrow/issues/new/) entitled `[R] CRAN packaging checklist for version X.Y.Z` and copy this checklist to the issue. - [ ] Review deprecated functions to advance their deprecation status. -- [ ] Evaluate the status of any failing [nightly tests and nightly packaging builds](http://crossbow.voltrondata.com). These checks replicate most of the checks that CRAN runs, so we need them all to be passing or to understand that the failures may (though won't necessarily) result in a rejection from CRAN. +- [ ] Evaluate the status of any failing [nightly tests and nightly packaging builds](https://s3.amazonaws.com/arrow-data/index.html). These checks replicate most of the checks that CRAN runs, so we need them all to be passing or to understand that the failures may (though won't necessarily) result in a rejection from CRAN. - [ ] Check [current CRAN check results](https://cran.rstudio.org/web/checks/check_results_arrow.html). - [ ] Ensure the contents of the README are accurate and up to date. 
- [ ] Run `urlchecker::url_check()` on the R directory at the release candidate. From 78b9053d2dca508246f666526e32a80f0729646f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 19 Nov 2025 14:11:18 +0100 Subject: [PATCH 13/13] GH-48091: [C++] Use FetchContent for bundled c-ares (#48092) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change As a follow up of requiring a minimum CMake version >= 3.25 we discussed moving our dependencies from ExternalProject to FetchContent. This can heavily simplify our third party dependency management. Moving c-ares is the next step before moving grpc. ### What changes are included in this PR? The general change is moving from `ExternalProject` to `FetchContent`. It also add some required integration due to other dependencies, like grpc, using `ExternalProject`. We not only have to build but also install in order for those other dependencies to find c-ares. This causes some timing issues between config, build, install that requires us to create a custom target to depend on so the other dependencies find abseil. ### Are these changes tested? Yes, the changes are tested locally and on CI. ### Are there any user-facing changes? No * GitHub Issue: #48091 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 107 ++++++++++++++------ 1 file changed, 76 insertions(+), 31 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 910f256c81e..835baec87ba 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2950,33 +2950,67 @@ if(ARROW_WITH_UTF8PROC) resolve_dependency(${utf8proc_resolve_dependency_args}) endif() -macro(build_cares) - message(STATUS "Building c-ares from source") - set(CARES_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/cares_ep-install") - set(CARES_INCLUDE_DIR "${CARES_PREFIX}/include") - - # If you set -DCARES_SHARED=ON then the build system names the library - # libcares_static.a - set(CARES_STATIC_LIB - "${CARES_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}cares${CMAKE_STATIC_LIBRARY_SUFFIX}" - ) +function(build_cares) + list(APPEND CMAKE_MESSAGE_INDENT "c-ares: ") + message(STATUS "Building c-ares from source using FetchContent") + set(CARES_VENDORED + TRUE + PARENT_SCOPE) + set(CARES_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/cares_fc-install") + set(CARES_PREFIX + "${CARES_PREFIX}" + PARENT_SCOPE) - set(CARES_CMAKE_ARGS "${EP_COMMON_CMAKE_ARGS}" "-DCMAKE_INSTALL_PREFIX=${CARES_PREFIX}" - -DCARES_SHARED=OFF -DCARES_STATIC=ON) + fetchcontent_declare(cares + URL ${CARES_SOURCE_URL} + URL_HASH "SHA256=${ARROW_CARES_BUILD_SHA256_CHECKSUM}") - externalproject_add(cares_ep - ${EP_COMMON_OPTIONS} - URL ${CARES_SOURCE_URL} - URL_HASH "SHA256=${ARROW_CARES_BUILD_SHA256_CHECKSUM}" - CMAKE_ARGS ${CARES_CMAKE_ARGS} - BUILD_BYPRODUCTS "${CARES_STATIC_LIB}") + prepare_fetchcontent() - file(MAKE_DIRECTORY ${CARES_INCLUDE_DIR}) + set(CARES_SHARED OFF) + set(CARES_STATIC ON) + set(CARES_INSTALL ON) + set(CARES_BUILD_TOOLS OFF) + set(CARES_BUILD_TESTS OFF) + fetchcontent_makeavailable(cares) + + # gRPC requires c-ares to be installed to a known location. + # We have to do this in two steps to avoid double installation of c-ares + # when Arrow is installed. 
+ # This custom target ensures c-ares is built before we install + add_custom_target(cares_built DEPENDS c-ares::cares) + + # Disable c-ares's install script after it's built to prevent double installation + add_custom_command(OUTPUT "${cares_BINARY_DIR}/cmake_install.cmake.saved" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${cares_BINARY_DIR}/cmake_install.cmake" + "${cares_BINARY_DIR}/cmake_install.cmake.saved" + COMMAND ${CMAKE_COMMAND} -E echo + "# c-ares install disabled to prevent double installation with Arrow" + > "${cares_BINARY_DIR}/cmake_install.cmake" + DEPENDS cares_built + COMMENT "Disabling c-ares install to prevent double installation" + VERBATIM) + + add_custom_target(cares_install_disabled ALL + DEPENDS "${cares_BINARY_DIR}/cmake_install.cmake.saved") + + # Install c-ares to CARES_PREFIX for gRPC to find + add_custom_command(OUTPUT "${CARES_PREFIX}/.cares_installed" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${cares_BINARY_DIR}/cmake_install.cmake.saved" + "${cares_BINARY_DIR}/cmake_install.cmake.tmp" + COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${CARES_PREFIX} + -DCMAKE_INSTALL_CONFIG_NAME=$ -P + "${cares_BINARY_DIR}/cmake_install.cmake.tmp" || + ${CMAKE_COMMAND} -E true + COMMAND ${CMAKE_COMMAND} -E touch "${CARES_PREFIX}/.cares_installed" + DEPENDS cares_install_disabled + COMMENT "Installing c-ares to ${CARES_PREFIX} for gRPC" + VERBATIM) - add_library(c-ares::cares STATIC IMPORTED) - set_target_properties(c-ares::cares PROPERTIES IMPORTED_LOCATION "${CARES_STATIC_LIB}") - target_include_directories(c-ares::cares BEFORE INTERFACE "${CARES_INCLUDE_DIR}") - add_dependencies(c-ares::cares cares_ep) + # Make cares_fc depend on the install completion marker + add_custom_target(cares_fc DEPENDS "${CARES_PREFIX}/.cares_installed") if(APPLE) # libresolv must be linked from c-ares version 1.16.1 @@ -2985,10 +3019,11 @@ macro(build_cares) "${LIBRESOLV_LIBRARY}") endif() - set(CARES_VENDORED TRUE) - - list(APPEND ARROW_BUNDLED_STATIC_LIBS c-ares::cares) -endmacro() + set(ARROW_BUNDLED_STATIC_LIBS + ${ARROW_BUNDLED_STATIC_LIBS} c-ares::cares + PARENT_SCOPE) + list(POP_BACK CMAKE_MESSAGE_INDENT) +endfunction() # ---------------------------------------------------------------------- # Dependencies for Arrow Flight RPC @@ -3136,7 +3171,9 @@ function(build_absl) # This is due to upstream absl::cctz issue # https://github.com/abseil/abseil-cpp/issues/283 find_library(CoreFoundation CoreFoundation) - set_property(TARGET absl::time + # When ABSL_ENABLE_INSTALL is ON, the real target is "time" not "absl_time" + # Cannot use set_property on alias targets (absl::time is an alias) + set_property(TARGET time APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${CoreFoundation}) endif() @@ -3189,7 +3226,7 @@ macro(build_grpc) add_dependencies(grpc_dependencies absl_fc) endif() if(CARES_VENDORED) - add_dependencies(grpc_dependencies cares_ep) + add_dependencies(grpc_dependencies cares_fc) endif() if(GFLAGS_VENDORED) @@ -3208,8 +3245,16 @@ macro(build_grpc) get_filename_component(GRPC_PB_ROOT "${GRPC_PROTOBUF_INCLUDE_DIR}" DIRECTORY) get_target_property(GRPC_Protobuf_PROTOC_LIBRARY ${ARROW_PROTOBUF_LIBPROTOC} IMPORTED_LOCATION) - get_target_property(GRPC_CARES_INCLUDE_DIR c-ares::cares INTERFACE_INCLUDE_DIRECTORIES) - get_filename_component(GRPC_CARES_ROOT "${GRPC_CARES_INCLUDE_DIR}" DIRECTORY) + + # For FetchContent c-ares, use the install prefix directly + if(CARES_VENDORED) + set(GRPC_CARES_ROOT "${CARES_PREFIX}") + else() + get_target_property(GRPC_CARES_INCLUDE_DIR c-ares::cares + 
INTERFACE_INCLUDE_DIRECTORIES) + get_filename_component(GRPC_CARES_ROOT "${GRPC_CARES_INCLUDE_DIR}" DIRECTORY) + endif() + get_target_property(GRPC_RE2_INCLUDE_DIR re2::re2 INTERFACE_INCLUDE_DIRECTORIES) get_filename_component(GRPC_RE2_ROOT "${GRPC_RE2_INCLUDE_DIR}" DIRECTORY)