@@ -231,10 +231,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
231231 #
232232 # threads in a group work together to reduce values across the reduction dimensions;
233233 # we want as many as possible to improve algorithm efficiency and execution occupancy.
234- wanted_threads = shuffle ? nextwarp(kernel.kern.pipeline, length(Rreduce)) : length(Rreduce)
235- function compute_threads(max_threads)
234+ function compute_threads(kern)
235+ max_threads = KI.kernel_max_work_group_size(backend, kern)
236+ wanted_threads = shuffle ? nextwarp(kern.kern.pipeline, length(Rreduce)) : length(Rreduce)
236237 if wanted_threads > max_threads
237- shuffle ? prevwarp(kernel.kern.pipeline, max_threads) : max_threads
238+ shuffle ? prevwarp(kern.kern.pipeline, max_threads) : max_threads
238239 else
239240 wanted_threads
240241 end
@@ -244,7 +245,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
244245 # kernel above may be greater than the maxTotalThreadsPerThreadgroup of the eventually launched
245246 # kernel below, causing errors
246247 # reduce_threads = compute_threads(kernel.pipeline.maxTotalThreadsPerThreadgroup)
247- reduce_threads = compute_threads(KI.kernel_max_work_group_size(backend, kernel))
248+ reduce_threads = compute_threads(kernel)
248249
249250 # how many groups should we launch?
250251 #
@@ -265,21 +266,33 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
265266 Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
266267 numworkgroups=groups, workgroupsize=threads)
267268 else
268- # we need multiple steps to cover all values to reduce
269- partial = similar(R, (size(R)..., reduce_groups))
269+ # temporary empty array whose type will match the final partial array
270+ partial = similar(R, ntuple(_ -> 0, Val(ndims(R) + 1)))
271+
272+ # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
273+ # since the type of `partial` might not match the original output container
274+ # (e.g. if that was a view).
275+ partial_kernel = KI.KIKernel(backend, partial_mapreduce_device,
276+ f, op, init, Val(threads), Val(Rreduce),
277+ Val(Rother), Val(UInt64(length(Rother))),
278+ Val(grain), Val(shuffle), partial, A)
279+ partial_reduce_threads = compute_threads(partial_kernel)
280+ partial_reduce_groups = cld(length(Rreduce), partial_reduce_threads * grain)
281+
282+ partial_threads = partial_reduce_threads
283+ partial_groups = partial_reduce_groups * other_groups
284+
285+ partial = similar(R, (size(R)..., partial_reduce_groups))
270286 if init === nothing
271287 # without an explicit initializer we need to copy from the output container
272- # use broadcasting to extend singleton dimensions
273288 partial .= R
274289 end
275- # NOTE: we can't use the previously-compiled kernel, since the type of `partial`
276- # might not match the original output container (e.g. if that was a view).
277- KI.KIKernel(backend, partial_mapreduce_device,
278- f, op, init, Val(threads), Val(Rreduce), Val(Rother),
279- Val(UInt64(length(Rother))), Val(grain), Val(shuffle), partial, A)(
280- f, op, init, Val(threads), Val(Rreduce), Val(Rother),
281- Val(UInt64(length(Rother))), Val(grain), Val(shuffle), partial, A;
282- numworkgroups=groups, workgroupsize=threads)
290+
291+ partial_kernel(f, op, init, Val(threads), Val(Rreduce),
292+ Val(Rother), Val(UInt64(length(Rother))),
293+ Val(grain), Val(shuffle), partial, A;
294+ numworkgroups=partial_groups, workgroupsize=partial_threads)
295+
283296
284297 GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
285298 end
0 commit comments