EnzymeAD
diff --git a/‎Project.toml‎
Lines changed: 1 addition & 1 deletion b/‎Project.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ext/ReactantCUDAExt.jl‎
Lines changed: 1 addition & 1 deletion b/‎ext/ReactantCUDAExt.jl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/Compiler.jl‎
Lines changed: 36 additions & 33 deletions b/‎src/Compiler.jl‎
Lines changed: 36 additions & 33 deletions
diff --git a/‎src/ConcreteRArray.jl‎
Lines changed: 12 additions & 16 deletions b/‎src/ConcreteRArray.jl‎
Lines changed: 12 additions & 16 deletions
diff --git a/‎src/Devices.jl‎
Lines changed: 8 additions & 8 deletions b/‎src/Devices.jl‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎src/Precompile.jl‎
Lines changed: 1 addition & 1 deletion b/‎src/Precompile.jl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/Reactant.jl‎
Lines changed: 1 addition & 1 deletion b/‎src/Reactant.jl‎
Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ PythonCall = "0.9"
 Random = "1.10"
 Random123 = "1.7"
 ReactantCore = "0.1.5"
-Reactant_jll = "0.0.70"
+Reactant_jll = "0.0.71"
 Scratch = "1.2"
 Sockets = "1.10"
 SpecialFunctions = "2.4"
 
@@ -1155,7 +1155,7 @@ end
 @static if !Sys.isapple()
     Reactant.PrecompileTools.@setup_workload begin
         Reactant.initialize_dialect()
-        client = Reactant.XLA.CPUClient(; checkcount=false)
+        client = Reactant.XLA.PJRT.CPUClient(; checkcount=false)
         Reactant.PrecompileTools.@compile_workload begin
             @static if Reactant.precompilation_supported() && VERSION != v"1.11.3"
                 function square_kernel!(x)
 
@@ -478,9 +478,9 @@ function compile_mlir(f, args; client=nothing, kwargs...)
     @ccall MLIR.API.mlir_c.RegisterDialects(ctx::MLIR.API.MlirContext)::Cvoid
 
     if client !== nothing
-        backend = XLA.ClientGetPlatformName(client)
+        backend = XLA.platform_name(client)
     else
-        backend = XLA.ClientGetPlatformName(XLA.default_backend[])
+        backend = XLA.platform_name(XLA.default_backend[])
     end
     if backend == "CUDA"
         backend = "GPU"
@@ -1076,9 +1076,7 @@ function codegen_flatten!(
 
         if is_sharded
             carg = inv_seen_args[arg]
-            condensed_op_sharding = Reactant.Sharding.XLA.CondensedOpSharding(
-                linear_parameter_shardings[i]
-            )
+            device_ids = mesh.sorted_device_ids
             if Reactant.Sharding.is_sharded(carg)
                 # Currently disabling the error since we roundtrip from MHLO to generate
                 # the shardings
@@ -1090,29 +1088,30 @@ function codegen_flatten!(
 
                 push!(flatten_code, :($usbuf = $flatcode.data))
                 for j in 1:length(mesh)
-                    sbuf = Symbol(:sbuf_, i, "_", j)
+                    sbuf = Symbol(:sbuf_, i, "_", device_ids[j])
                     push!(flatten_names, sbuf)
                     push!(flatten_code, :($sbuf = XLA.synced_buffer(getindex($usbuf, $j))))
                 end
             else
+                condensed_op_sharding = convert(
+                    Reactant.Sharding.XLA.CondensedOpSharding, linear_parameter_shardings[i]
+                )
                 push!(flatten_code, :($usbuf = $flatcode))
                 device_to_array_slices = XLA.sharding_to_concrete_array_indices(
                     condensed_op_sharding, size(carg), mesh
                 )
-                device_ids = vec(mesh)
                 for j in 1:length(mesh)
-                    buf = Symbol(:buf_, i, :_, j)
-                    device_id = device_ids[j]
+                    local_device_id = device_ids[j]
+                    buf = Symbol(:buf_, i, :_, local_device_id)
                     slice = device_to_array_slices[j]
                     push!(
                         flatten_code,
                         :($buf = XLA.synced_buffer(only($usbuf[$(slice)...].data))),
                     )
-                    device_ordinal = XLA.device_ordinal(client, device_id)
-                    sbuf = Symbol(:sbuf_, i, :_, j)
-                    device = XLA.ClientGetAddressableDevice(client, device_ordinal)
+                    sbuf = Symbol(:sbuf_, i, :_, local_device_id)
+                    device = XLA.get_addressable_device(client, local_device_id)
                     push!(flatten_names, sbuf)
-                    push!(flatten_code, :($sbuf = XLA.CopyBufferToDevice($buf, $device)))
+                    push!(flatten_code, :($sbuf = XLA.copy_buffer_to_device($buf, $device)))
                 end
             end
         else
@@ -1308,12 +1307,17 @@ Generate Julia code to call the XLA executable.
 - `nresults`: The number of results to expect.
 """
 function codegen_xla_call(
-    exec, device, flatten_names, donated_args_mask, nresults, is_sharded::Bool, mesh_ids
+    exec,
+    device,
+    flatten_names,
+    donated_args_mask,
+    nresults,
+    is_sharded::Bool,
+    ndevices::Int,
 )
     flatten_buffer_refs = map(n -> :($n.buffer), flatten_names)
 
-    base_symbol_name =
-        is_sharded ? Symbol(:result_buffer_m, length(mesh_ids), :_) : :result_buffer_
+    base_symbol_name = is_sharded ? Symbol(:result_buffer_m, ndevices, :_) : :result_buffer_
     concretized_res_names = Symbol[Symbol(base_symbol_name, i) for i in 1:nresults]
     concretized_res_code = map(enumerate(concretized_res_names)) do (i, varname)
         :($varname = linearized_results[$i])
@@ -1325,21 +1329,20 @@ function codegen_xla_call(
         if is_sharded
             quote
                 GC.@preserve $(flatten_names...) begin
-                    linearized_results = XLA.ExecutableCall(
+                    linearized_results = XLA.execute(
                         $exec,
-                        $(mesh_ids),
                         ($(flatten_buffer_refs...),),
                         $(Tuple(donated_args_mask)),
                         Val($nresults),
-                        Val($(length(mesh_ids))),
+                        Val($ndevices),
                     )
                 end
                 $(concretized_res_code...)
             end
         else
             quote
                 GC.@preserve $(flatten_names...) begin
-                    linearized_results = XLA.ExecutableCallSharded(
+                    linearized_results = XLA.execute_sharded(
                         $exec,
                         $(device),
                         ($(flatten_buffer_refs...),),
@@ -1393,7 +1396,7 @@ function __resolve_device_and_client(client, seen_args, linear_args, is_sharded)
             if !allequal(devices_list)
                 msg = "Expected all arguments to be on the same device, got:\n"
                 for (i, device) in enumerate(devices_list)
-                    msg *= "    Device $(i): $(XLA.DeviceToString(device))\n"
+                    msg *= "    Device $(i): $(string(device))\n"
                 end
                 throw(ArgumentError(msg))
             end
@@ -1407,17 +1410,13 @@ function __resolve_device_and_client(client, seen_args, linear_args, is_sharded)
             client = XLA.client(device)
         else
             client = XLA.default_backend[]
-            device = XLA.ClientGetAddressableDevice(
-                client, XLA.device_ordinal(client, XLA.default_device_idx[])
-            )
+            device = XLA.get_addressable_device(client, XLA.default_device_idx[])
         end
     else
         if device !== nothing
             @assert client == XLA.client(device) "client ($(client)) and XLA.client(device) ($(XLA.client(device))) must be the same"
         else
-            device = XLA.ClientGetAddressableDevice(
-                client, XLA.device_ordinal(client, XLA.default_device_idx[])
-            )
+            device = XLA.get_addressable_device(client, XLA.default_device_idx[])
         end
     end
 
@@ -1431,9 +1430,9 @@ function compile_xla(f, args; client=nothing, kwargs...)
     @ccall MLIR.API.mlir_c.RegisterDialects(ctx::MLIR.API.MlirContext)::Cvoid
 
     if client !== nothing
-        backend = XLA.ClientGetPlatformName(client)
+        backend = XLA.platform_name(client)
     else
-        backend = XLA.ClientGetPlatformName(XLA.default_backend[])
+        backend = XLA.platform_name(XLA.default_backend[])
     end
     if backend == "CUDA"
         backend = "GPU"
@@ -1461,17 +1460,21 @@ function compile_xla(f, args; client=nothing, kwargs...)
         )
 
         # compile MLIR module to XLA executable
-        device_ids = mlir_fn_res.is_sharded ? vec(mlir_fn_res.sharding_mesh) : Int64[]
+        local_device_ids = if mlir_fn_res.is_sharded
+            collect(Int64, mlir_fn_res.sharding_mesh.sorted_device_ids)
+        else
+            Int64[]
+        end
         mlir_fn_res.is_sharded && (device = nothing)
 
-        exec = XLA.Compile(
+        exec = XLA.compile(
             client,
             device,
             mod;
             num_outputs=length(mlir_fn_res.linear_results),
             num_parameters=length(mlir_fn_res.linear_args),
             mlir_fn_res.is_sharded,
-            device_ids,
+            local_device_ids,
         )
 
         return mod, exec, mlir_fn_res, device, client
@@ -1514,7 +1517,7 @@ function compile(f, args; sync=false, kwargs...)
         donated_args_mask,
         length(linear_results),
         mlir_fn_res.is_sharded,
-        mlir_fn_res.is_sharded ? vec(mlir_fn_res.sharding_mesh) : Int64[],
+        mlir_fn_res.is_sharded ? length(mlir_fn_res.sharding_mesh) : 1,
     )
 
     linear_result_shard_info = if mlir_fn_res.is_sharded
 
@@ -37,15 +37,13 @@ Adapt.adapt_storage(::Type{T}, x::AbstractArray) where {T<:ConcreteRArray} = T(x
 
 Base.size(x::ConcreteRArray) = x.shape
 
-function Base.isempty(x::Union{ConcreteRArray,ConcreteRNumber})
-    return any(==(XLA.AsyncEmptyBuffer), x.data)
-end
+Base.isempty(x::Union{ConcreteRArray,ConcreteRNumber}) = any(isempty, x.data)
 Base.isempty(x::WrappedConcreteRArray) = isempty(ancestor(x))
 
 function Base.convert(::Type{<:Array}, X::ConcreteRArray{T,N}) where {T,N}
-    data = Array{T,N}(undef, size(X)...)
-
     if Sharding.is_sharded(X)
+        data = Array{T,N}(undef, size(X)...)
+
         completed = Set{eltype(X.sharding.device_to_array_slices)}()
         for idx in 1:length(X.data)
             slice = X.sharding.device_to_array_slices[idx]
@@ -56,14 +54,14 @@ function Base.convert(::Type{<:Array}, X::ConcreteRArray{T,N}) where {T,N}
             end
             data[slice...] = convert(Array{T}, X.data[idx])
         end
+
+        return data
     else
         buf = XLA.synced_buffer(only(X.data))
-        GC.@preserve data buf begin
-            XLA.BufferToHost(buf, pointer(data))
+        GC.@preserve buf begin
+            return convert(Array{T}, buf)
         end
     end
-
-    return data
 end
 function Base.convert(::Type{<:Array}, X::WrappedConcreteRArray)
     fn = compile(TracedUtils.materialize_traced_array, (X,))
@@ -82,7 +80,7 @@ function to_number(X::ConcreteRScalar{T}) where {T}
     XLA.await(X)
     buf = get_buffer(X; no_error_for_scalar=true)
     GC.@preserve data buf begin
-        XLA.BufferToHost(buf, data)
+        XLA.to_host(buf, data)
     end
     return data[]
 end
@@ -184,7 +182,7 @@ function Base.getindex(a::ConcreteRArray{T}, args::Vararg{Int,N}) where {T,N}
     if buffer_on_cpu(a) && !Sharding.is_sharded(a)
         buf = get_buffer(a)
         GC.@preserve buf begin
-            ptr = Base.unsafe_convert(Ptr{T}, XLA.UnsafeBufferPointer(buf))
+            ptr = Base.unsafe_convert(Ptr{T}, XLA.unsafe_buffer_pointer(buf))
             start = 0
             for i in 1:N
                 start *= size(a, N - i + 1)
@@ -211,7 +209,7 @@ function Base.setindex!(a::ConcreteRArray{T}, v, args::Vararg{Int,N}) where {T,N
     if buffer_on_cpu(a) && !Sharding.is_sharded(a)
         buf = get_buffer(a)
         GC.@preserve buf begin
-            ptr = Base.unsafe_convert(Ptr{T}, XLA.UnsafeBufferPointer(buf))
+            ptr = Base.unsafe_convert(Ptr{T}, XLA.unsafe_buffer_pointer(buf))
             start = 0
             for i in 1:N
                 start *= size(a, N - i + 1)
@@ -303,9 +301,7 @@ end
 (f::CallMapReduce)(A) = Base.mapreduce(f.f, f.op, A; f.dims, f.init)
 
 buffer_on_cpu(::Any) = true
-function buffer_on_cpu(x::ConcreteRArray)
-    return all(XLA.BufferOnCPU ∘ Base.Fix2(getproperty, :buffer), x.data)
-end
+buffer_on_cpu(x::ConcreteRArray) = all(XLA.buffer_on_cpu, x.data)
 
 function Ops.constant(x::ConcreteRArray; kwargs...)
     return Ops.constant(Base.convert(Array, x); kwargs...)
@@ -328,7 +324,7 @@ function Base.fill!(a::ConcreteRArray{T,N}, val) where {T,N}
     if buffer_on_cpu(a) && !Sharding.is_sharded(a)
         buf = get_buffer(a)
         GC.@preserve buf begin
-            ptr = Base.unsafe_convert(Ptr{T}, XLA.UnsafeBufferPointer(buf))
+            ptr = Base.unsafe_convert(Ptr{T}, XLA.unsafe_buffer_pointer(buf))
             for start in 1:length(a)
                 unsafe_store!(ptr, val, start)
             end
 
@@ -1,27 +1,27 @@
 """
     devices(backend::String)
-    devices(backend::XLA.Client = XLA.default_backend[])
+    devices(backend::XLA.AbstractClient = XLA.default_backend[])
 
 Return a list of devices available on the backend.
 """
 devices(backend::String) = devices(XLA.backends[backend])
 
-function devices(client::XLA.Client=XLA.default_backend[])
-    ndevices = XLA.ClientNumDevices(client)
-    return [XLA.ClientGetDevice(client, i - 1) for i in 1:ndevices]
+function devices(client::XLA.AbstractClient=XLA.default_backend[])
+    ndevices = XLA.num_devices(client)
+    return [XLA.get_device(client, i - 1) for i in 1:ndevices]
 end
 
 """
     addressable_devices(backend::String)
-    addressable_devices(backend::XLA.Client = XLA.default_backend[])
+    addressable_devices(backend::XLA.AbstractClient = XLA.default_backend[])
 
 Return a list of addressable devices available on the backend.
 """
 addressable_devices(backend::String) = addressable_devices(XLA.backends[backend])
 
-function addressable_devices(client::XLA.Client=XLA.default_backend[])
-    ndevices = XLA.ClientNumAddressableDevices(client)
-    return [XLA.ClientGetAddressableDevice(client, i - 1) for i in 1:ndevices]
+function addressable_devices(client::XLA.AbstractClient=XLA.default_backend[])
+    ndevices = XLA.num_addressable_devices(client)
+    return [XLA.get_addressable_device(client, i - 1) for i in 1:ndevices]
 end
 
 # https://github.com/jax-ml/jax/blob/152099ee0ef31119f16f4c2dac50d84fcb1575ef/jax/_src/hardware_utils.py#L19-L55
 
@@ -59,7 +59,7 @@ end
 
 @setup_workload begin
     initialize_dialect()
-    client = XLA.CPUClient(; checkcount=false)
+    client = XLA.PJRT.CPUClient(; checkcount=false)
     @compile_workload begin
         @static if precompilation_supported()
             x = ConcreteRNumber(2.0; client)
 
@@ -221,7 +221,7 @@ function __init__()
     return initialize_dialect()
 end
 
-function set_default_backend(backend::XLA.Client)
+function set_default_backend(backend::XLA.AbstractClient)
     return XLA.default_backend[] = backend
 end