@@ -162,9 +162,10 @@ static void _setTensorStack(std::vector<Tensor*>& result, const std::vector<Tens
     id<MTLComputePipelineState> mPipeline;
     std::vector<Tensor*> mTensors;
     bool mHasBias = false;
+    int mCmdIndex = 0;
 
 public:
-    MetalBatchMatMul(const LoopParam* loop, Backend *bn) : MetalExecution(bn) {
+    MetalBatchMatMul(const LoopParam* loop, Backend *bn, int index = 0) : MetalExecution(bn) {
         mLoop = loop;
         auto mtbn = static_cast<MetalBackend *>(bn);
         auto context = (__bridge MNNMetalContext *)mtbn->context();
@@ -180,7 +181,7 @@ static void _setTensorStack(std::vector<Tensor*>& result, const std::vector<Tens
             std::string([T UTF8String]),
             "matmulunit"
         };
-        auto cmd = loop->commands()->GetAs<RegionCommand>(0);
+        auto cmd = loop->commands()->GetAs<RegionCommand>(index);
         mHasBias = cmd->indexes()->size() > 3;
         if (mHasBias) {
             keys.emplace_back("BIAS");
@@ -200,17 +201,19 @@ static void _setTensorStack(std::vector<Tensor*>& result, const std::vector<Tens
             }
             pipeline = mtbn->makeComputePipelineWithSourceOption(gMatMulUnitTemplate, "loop_matmul", compileOptions);
             mtbn->runtime()->insertPipeline(keys, pipeline);
+
         }
         if (nil == pipeline) {
             MNN_ERROR("Create batch matmul pipeline error\n");
         }
         mPipeline = pipeline;
         mTensors.resize(mLoop->tensorNumber());
+        mCmdIndex = index;
     }
     virtual ~MetalBatchMatMul() = default;
     virtual ErrorCode onResize(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs) override {
         _setTensorStack(mTensors, inputs, outputs, mLoop);
-        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
+        auto cmd = mLoop->commands()->GetAs<RegionCommand>(mCmdIndex);
         auto size = cmd->size()->data();
         auto AStride = cmd->view()->GetAs<View>(1)->stride()->data();
         auto BStride = cmd->view()->GetAs<View>(2)->stride()->data();
@@ -234,7 +237,7 @@ virtual ErrorCode onResize(const std::vector<Tensor *>& inputs, const std::vecto
         return NO_ERROR;
     }
     virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override {
-        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
+        auto cmd = mLoop->commands()->GetAs<RegionCommand>(mCmdIndex);
         auto size = cmd->size()->data();
         auto AStride = cmd->view()->GetAs<View>(1)->stride()->data();
         auto BStride = cmd->view()->GetAs<View>(2)->stride()->data();
@@ -252,6 +255,8 @@ virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Ten
                 MetalBackend::setTensor(inputs[0], encoder, cmd->indexes()->size() + i);
             }
         }
+        // printf("loop_matmul out dequant BMNK: %d %d %d %d\n", mLoop->loopNumber(), size[0], size[2], size[1]);
+
         [encoder setBuffer:mParam offset:0 atIndex:cmd->indexes()->size() * 2];
         [encoder dispatchThreadgroups:MTLSizeMake(UP_DIV(totalSize, 256), 1, 1) threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
     }
@@ -394,23 +399,21 @@ kernel void set_copy(device T *out [[buffer(0)]],
 class MetalGather : public MetalExecution {
 private:
     const LoopParam* mLoop;
-    bool mNeedInit = false;
-    std::pair<MTLSize, MTLSize> mInitThreads;
+    int mCmdIndex = 0;
     id<MTLBuffer> mParam;
     id<MTLComputePipelineState> mPipeline;
-    id<MTLComputePipelineState> mInitPipeline;
-    id<MTLBuffer> mInitParam;
     std::vector<Tensor*> mTensors;
 public:
-    MetalGather(const LoopParam* loop, Backend *bn, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) : MetalExecution(bn) {
+    MetalGather(const LoopParam* loop, Backend *bn, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, int index = 0) : MetalExecution(bn) {
         mLoop = loop;
         auto mtbn = static_cast<MetalBackend *>(bn);
         auto context = (__bridge MNNMetalContext *)mtbn->context();
 
         mParam = [context newDeviceBuffer:sizeof(GatherInfo) access:CPUWriteOnly];
         bool useFp16 = mtbn->useFp16InsteadFp32();
         mTensors.resize(mLoop->tensorNumber());
-        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
+        mCmdIndex = index;
+        auto cmd = mLoop->commands()->GetAs<RegionCommand>(mCmdIndex);
         _setTensorStack(mTensors, inputs, outputs, mLoop);
         auto dstTensor = mTensors[cmd->indexes()->data()[0]];
 
@@ -435,40 +438,10 @@ kernel void set_copy(device T *out [[buffer(0)]],
             }
             mPipeline = pipeline;
         }
-
-        // scatter need init command pipeline
-        if (mLoop->initCommand() != nullptr) {
-            mNeedInit = true;
-            std::string shader = "set_copy";
-            auto cmd = mLoop->initCommand()->GetAs<RegionCommand>(0);
-            if (cmd->op() == nullptr) {
-                shader = "set_zero";
-            } else {
-                mInitParam = [context newDeviceBuffer:sizeof(InitInfo) access:CPUWriteOnly];
-            }
-            std::vector<std::string> keys = {
-                std::string([T UTF8String]),
-                "init_region",
-                shader
-            };
-            auto pipeline = mtbn->runtime()->findPipeline(keys);
-            if (nil == pipeline) {
-                MTLCompileOptions *compileOptions = [[MTLCompileOptions alloc] init];
-                compileOptions.preprocessorMacros = @{
-                    @"T" : T,
-                };
-                pipeline = mtbn->makeComputePipelineWithSourceOption(gInitRegion, shader.c_str(), compileOptions);
-                mtbn->runtime()->insertPipeline(keys, pipeline);
-            }
-            if (nil == pipeline) {
-                MNN_ERROR("Create gather init pipeline error\n");
-            }
-            mInitPipeline = pipeline;
-        }
     }
     virtual ~MetalGather() = default;
     virtual ErrorCode onResize(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs) override {
-        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
+        auto cmd = mLoop->commands()->GetAs<RegionCommand>(mCmdIndex);
         _setTensorStack(mTensors, inputs, outputs, mLoop);
 
         auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
@@ -504,51 +477,11 @@ virtual ErrorCode onResize(const std::vector<Tensor *>& inputs, const std::vecto
         param->totalSize[0] = inputSize;
         param->totalSize[1] = outputSize;
 
-        if (mNeedInit) {
-            auto initCmd = mLoop->initCommand()->GetAs<RegionCommand>(0);
-            auto data = reinterpret_cast<InitInfo*>([mInitParam contents]);
-
-            auto srcStride = initCmd->view()->GetAs<View>(1)->stride()->data();
-            auto dstStride = initCmd->view()->GetAs<View>(0)->stride()->data();
-            auto dataSize = initCmd->size()->data();
-            for (int i = 0; i < 3; ++i) {
-                data->srcStride[i] = srcStride[i];
-                data->dstStride[i] = dstStride[i];
-                data->size[i] = dataSize[i];
-            }
-
-            auto initDstTensor = mTensors[initCmd->indexes()->data()[0]];
-            auto initSrcTensor = mTensors[initCmd->indexes()->data()[1]];
-            auto initInputSize = initSrcTensor->usize() / initSrcTensor->buffer().type.bytes();
-            auto initOutputSize = initDstTensor->usize() / initDstTensor->buffer().type.bytes();
-            data->totalSize[0] = initInputSize;
-            data->totalSize[1] = initOutputSize;
-
-            auto backend = static_cast<MetalBackend *>(this->backend());
-            auto context = (__bridge MNNMetalContext *)backend->context();
-            mInitThreads = [context computeBestGroupAndLocal:mInitPipeline threads:MTLSizeMake(data->size[0], data->size[1], data->size[2])];
-        }
         return NO_ERROR;
     }
     virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs,
                           id<MTLComputeCommandEncoder> encoder) override {
-        if (mNeedInit) {
-            auto initCmd = mLoop->initCommand()->GetAs<RegionCommand>(0);
-            int x = initCmd->size()->data()[0];
-            int y = initCmd->size()->data()[1];
-            int z = initCmd->size()->data()[2];
-
-            [encoder setComputePipelineState:mInitPipeline];
-            auto dstTensor = mTensors[initCmd->indexes()->data()[0]];
-            auto srcTensor = mTensors[initCmd->indexes()->data()[1]];
-            MetalBackend::setTensor(dstTensor, encoder, 0);
-            MetalBackend::setTensor(srcTensor, encoder, 1);
-            [encoder setBuffer:mInitParam offset:0 atIndex:2];
-
-            [encoder dispatchThreadgroups:mInitThreads.first threadsPerThreadgroup:mInitThreads.second];
-        }
-
-        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
+        auto cmd = mLoop->commands()->GetAs<RegionCommand>(mCmdIndex);
         auto size = cmd->size()->data();
         auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
         auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
@@ -631,13 +564,14 @@ kernel void loop_binary(device T1* uOutput [[buffer(0)]], const device T0* uInpu
 
 class MetalBinaryBroadCast : public MetalExecution {
 public:
-    MetalBinaryBroadCast(const LoopParam* loop, Backend *bn, std::vector<Tensor*>&& tensors, NSString* CUSTOM) : MetalExecution(bn) {
+    MetalBinaryBroadCast(const LoopParam* loop, Backend *bn, const std::vector<Tensor*>& tensors, NSString* CUSTOM, int index = 0) : MetalExecution(bn) {
         mLoop = loop;
         auto mtbn = static_cast<MetalBackend *>(bn);
         auto context = (__bridge MNNMetalContext *)mtbn->context();
         mParam = mtbn->getConstBuffer(sizeof(BinaryBroadCastInfo));
-        mTensors = std::move(tensors);
-        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
+        mTensors = tensors;
+        mCmdIndex = index;
+        auto cmd = mLoop->commands()->GetAs<RegionCommand>(mCmdIndex);
         auto dstTensor = mTensors[cmd->indexes()->data()[0]];
         auto srcTensor = mTensors[cmd->indexes()->data()[1]];
         auto srcTensor1 = mTensors[cmd->indexes()->data()[2]];
@@ -672,7 +606,7 @@ kernel void loop_binary(device T1* uOutput [[buffer(0)]], const device T0* uInpu
     }
     virtual ErrorCode onResize(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs) override {
         _setTensorStack(mTensors, inputs, outputs, mLoop);
-        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
+        auto cmd = mLoop->commands()->GetAs<RegionCommand>(mCmdIndex);
         auto size = cmd->size()->data();
         auto srcStride0 = cmd->view()->GetAs<View>(1)->stride()->data();
         auto srcStride1 = cmd->view()->GetAs<View>(2)->stride()->data();
@@ -694,7 +628,7 @@ virtual ErrorCode onResize(const std::vector<Tensor *>& inputs, const std::vecto
     virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs,
                           id<MTLComputeCommandEncoder> encoder) override {
 
-        auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
+        auto cmd = mLoop->commands()->GetAs<RegionCommand>(mCmdIndex);
         auto dstTensor = mTensors[cmd->indexes()->data()[0]];
         auto srcTensor = mTensors[cmd->indexes()->data()[1]];
         auto srcTensor1 = mTensors[cmd->indexes()->data()[2]];
@@ -711,6 +645,139 @@ virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Ten
     id<MTLBuffer> mParam;
     std::vector<Tensor*> mTensors;
     int mTotalSize;
+    int mCmdIndex = 0;
+};
+
+class MetalLoop : public MetalExecution {
+public:
+    MetalLoop(const LoopParam* loop, Backend *bn, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : MetalExecution(bn) {
+        mLoop = loop;
+        auto mtbn = static_cast<MetalBackend *>(bn);
+        auto context = (__bridge MNNMetalContext *)mtbn->context();
+        mTensors.resize(mLoop->tensorNumber());
+        _setTensorStack(mTensors, inputs, outputs, mLoop);
+
+        // Init
+        if (mLoop->initCommand() != nullptr) {
+            mNeedInit = true;
+            std::string shader = "set_copy";
+            auto dstTensor = mTensors[mLoop->initCommand()->GetAs<RegionCommand>(0)->indexes()->data()[0]];
+            NSString* T = MetalCast::getScalarType(dstTensor->getType(), mtbn->useFp16InsteadFp32());
+
+            auto cmd = mLoop->initCommand()->GetAs<RegionCommand>(0);
+            if (cmd->op() == nullptr) {
+                shader = "set_zero";
+            } else {
+                mInitParam = [context newDeviceBuffer:sizeof(InitInfo) access:CPUWriteOnly];
+            }
+            std::vector<std::string> keys = {
+                std::string([T UTF8String]),
+                "init_region",
+                shader
+            };
+            auto pipeline = mtbn->runtime()->findPipeline(keys);
+            if (nil == pipeline) {
+                MTLCompileOptions *compileOptions = [[MTLCompileOptions alloc] init];
+                compileOptions.preprocessorMacros = @{
+                    @"T" : T,
+                };
+                pipeline = mtbn->makeComputePipelineWithSourceOption(gInitRegion, shader.c_str(), compileOptions);
+                mtbn->runtime()->insertPipeline(keys, pipeline);
+            }
+            if (nil == pipeline) {
+                MNN_ERROR("Create gather init pipeline error\n");
+            }
+            mInitPipeline = pipeline;
+        }
+
+        bool valid = true;
+        for (int i = 0; i < loop->commands()->size(); ++i) {
+            auto cmd = loop->commands()->GetAs<RegionCommand>(i);
+            auto subop = cmd->op();
+            if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0) {
+                mExecutions.emplace_back(std::make_shared<MetalGather>(loop, bn, inputs, outputs, i));
+            } else if (OpType_MatMul == subop->type() && loop->parallel()) {
+                mExecutions.emplace_back(std::make_shared<MetalBatchMatMul>(loop, bn, i));
+            } else if (OpType_BinaryOp == subop->type() && cmd->fuse() < 0 && 1 == loop->loopNumber()) {
+                mExecutions.emplace_back(std::make_shared<MetalBinaryBroadCast>(loop, bn, mTensors, MetalBinary::convert(cmd->op()->main_as_BinaryOp()->opType(), mTensors[cmd->indexes()->data()[1]]->getType().code == halide_type_float), i));
+            } else {
+                valid = false;
+                break;
+            }
+        }
+        if (!valid) {
+            mExecutions.clear();
+        }
+    }
+    virtual ~MetalLoop() = default;
+
+    virtual ErrorCode onResize(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs) override {
+        // Init
+        if (mNeedInit) {
+            _setTensorStack(mTensors, inputs, outputs, mLoop);
+            auto initCmd = mLoop->initCommand()->GetAs<RegionCommand>(0);
+            auto data = reinterpret_cast<InitInfo*>([mInitParam contents]);
+
+            auto srcStride = initCmd->view()->GetAs<View>(1)->stride()->data();
+            auto dstStride = initCmd->view()->GetAs<View>(0)->stride()->data();
+            auto dataSize = initCmd->size()->data();
+            for (int i = 0; i < 3; ++i) {
+                data->srcStride[i] = srcStride[i];
+                data->dstStride[i] = dstStride[i];
+                data->size[i] = dataSize[i];
+            }
+
+            auto initDstTensor = mTensors[initCmd->indexes()->data()[0]];
+            auto initSrcTensor = mTensors[initCmd->indexes()->data()[1]];
+            auto initInputSize = initSrcTensor->usize() / initSrcTensor->buffer().type.bytes();
+            auto initOutputSize = initDstTensor->usize() / initDstTensor->buffer().type.bytes();
+            data->totalSize[0] = initInputSize;
+            data->totalSize[1] = initOutputSize;
+
+            auto backend = static_cast<MetalBackend *>(this->backend());
+            auto context = (__bridge MNNMetalContext *)backend->context();
+            mInitThreads = [context computeBestGroupAndLocal:mInitPipeline threads:MTLSizeMake(data->size[0], data->size[1], data->size[2])];
+        }
+
+        // Loop commands
+        for (auto& exe : mExecutions) {
+            auto code = exe->onResize(inputs, outputs);
+            if (NO_ERROR != code) {
+                return code;
+            }
+        }
+        return NO_ERROR;
+    }
+    virtual void onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) override {
+        // Init
+        if (mNeedInit) {
+            auto initCmd = mLoop->initCommand()->GetAs<RegionCommand>(0);
+            [encoder setComputePipelineState:mInitPipeline];
+            auto dstTensor = mTensors[initCmd->indexes()->data()[0]];
+            auto srcTensor = mTensors[initCmd->indexes()->data()[1]];
+            MetalBackend::setTensor(dstTensor, encoder, 0);
+            MetalBackend::setTensor(srcTensor, encoder, 1);
+            [encoder setBuffer:mInitParam offset:0 atIndex:2];
+            [encoder dispatchThreadgroups:mInitThreads.first threadsPerThreadgroup:mInitThreads.second];
+        }
+        // Loop commands
+        for (auto& exe : mExecutions) {
+            exe->onEncode(inputs, outputs, encoder);
+        }
+    }
+    bool isValid() {
+        return !mExecutions.empty();
+    }
+private:
+    const LoopParam* mLoop;
+    std::vector<std::shared_ptr<MetalExecution>> mExecutions;
+    std::vector<Tensor*> mTensors;
+    // For Init
+    bool mNeedInit = false;
+    std::pair<MTLSize, MTLSize> mInitThreads;
+    id<MTLComputePipelineState> mInitPipeline;
+    id<MTLBuffer> mInitParam;
+
 };
 
 class MetalLoopCreator : public MetalBackend::Creator {
@@ -725,7 +792,7 @@ virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Ten
         if (1 == loop->commands()->size()) {
             auto cmd = loop->commands()->GetAs<RegionCommand>(0);
             auto subop = cmd->op();
-            if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0) {
+            if (OpType_UnaryOp == subop->type() && nullptr == subop->main() && cmd->fuse() < 0 && nullptr == loop->initCommand()) {
                 return new MetalGather(loop, bn, inputs, outputs);
             }
             if (OpType_MatMul == subop->type() && loop->parallel() && nullptr == loop->initCommand()) {
@@ -741,10 +808,17 @@ virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Ten
                     MNN_ERROR("Metal Don't support binary - %d\n", cmd->op()->main_as_BinaryOp()->opType());
                     return nullptr;
                 }
-                return new MetalBinaryBroadCast(loop, bn, std::move(tensors), CUSTOM);
+                return new MetalBinaryBroadCast(loop, bn, tensors, CUSTOM);
             }
         }
+        // General Case
+        auto exe = new MetalLoop(loop, bn, inputs, outputs);
+        if (exe->isValid()) {
+            return exe;
+        }
+        delete exe;
         return nullptr;
+
     }
 };
 REGISTER_METAL_OP_CREATOR(MetalLoopCreator, OpType_While);