文档
注册

更多样例

样例模板

为了方便您快速运行具体指令中的参考样例,本章节提供标量双目指令的样例模板。

您可以将以下样例模板作为代码框架,只需将具体指令中的样例片段拷贝替换下文代码段中的加粗内容即可。

// Sample template for scalar-binary (tensor + scalar) instructions.
// Pipeline: copy 512 int16_t elements GM -> UB, compute dst = src + scalar
// with the tensor high-dim slicing interface of Adds, then copy UB -> GM.
#include "kernel_operator.h"
namespace AscendC {
class KernelBinaryScalar {
public:
    __aicore__ inline KernelBinaryScalar() {}
    // Bind the GM input/output addresses and reserve one 512-element
    // int16_t buffer in each of the VECIN and VECOUT queues (queue depth 1).
    __aicore__ inline void Init(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
    {
        srcGlobal.SetGlobalBuffer((__gm__ int16_t*)src);
        dstGlobal.SetGlobalBuffer((__gm__ int16_t*)dstGm);
        pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(int16_t));
        pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(int16_t));
    }
    // Standard three-stage flow: CopyIn -> Compute -> CopyOut.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }
private:
    // Stage 1: move 512 input elements from GM into a UB tensor and
    // publish it on the input queue for the compute stage.
    __aicore__ inline void CopyIn()
    {
        LocalTensor<int16_t> srcLocal = inQueueSrc.AllocTensor<int16_t>();
        DataCopy(srcLocal, srcGlobal, 512);
        inQueueSrc.EnQue(srcLocal);
    }
    // Stage 2: dst[i] = src[i] + scalar for all 512 elements, using the
    // tensor high-dim slicing (mask/repeat/stride) form of Adds.
    __aicore__ inline void Compute()
    {
        LocalTensor<int16_t> srcLocal = inQueueSrc.DeQue<int16_t>();
        LocalTensor<int16_t> dstLocal = outQueueDst.AllocTensor<int16_t>();

        uint64_t mask = 128;
        int16_t scalar = 2;
        // repeatTimes = 4: 128 elements per repeat, 512 elements total
        // dstBlkStride, srcBlkStride = 1: no gap between blocks in one repeat
        // dstRepStride, srcRepStride = 8: no gap between repeats
        Adds(dstLocal, srcLocal, scalar, mask, 4, {1, 1, 8, 8});
        
        outQueueDst.EnQue<int16_t>(dstLocal);
        inQueueSrc.FreeTensor(srcLocal);
    }
    // Stage 3: drain the output queue and move 512 results back to GM.
    __aicore__ inline void CopyOut()
    {
        LocalTensor<int16_t> dstLocal = outQueueDst.DeQue<int16_t>();
        DataCopy(dstGlobal, dstLocal, 512);
        outQueueDst.FreeTensor(dstLocal);
    }
private:
    TPipe pipe;
    TQue<QuePosition::VECIN, 1> inQueueSrc;
    TQue<QuePosition::VECOUT, 1> outQueueDst;
    GlobalTensor<int16_t> srcGlobal, dstGlobal;
};
} // namespace AscendC
// Kernel entry point: constructs the operator object and runs the pipeline.
extern "C" __global__ __aicore__ void binary_scalar_simple_kernel(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
{
    AscendC::KernelBinaryScalar op;
    op.Init(src, dstGm);
    op.Process();
}

更多样例

您可以参考以下样例,了解如何使用标量双目指令的tensor高维切分计算接口,进行更灵活的操作、实现更高级的功能。

如果您需要运行样例代码,请将代码段拷贝并替换上述模板中Compute函数的加粗部分代码即可。

  • 通过tensor高维切分计算接口中的mask连续模式,实现数据非连续计算。
    uint64_t mask = 64;  // 每个迭代内只计算前64个数
    int16_t scalar = 2;
    Adds(dstLocal, srcLocal, scalar, mask, 4, { 1, 1, 8, 8 });

    结果示例如下:

    输入数据(srcLocal): [1 2 3 ... 512]
    输入数据(scalar): 2
    输出数据(dstLocal): 
    [3 4 5 ... 66 undefined ... undefined
     131 132 133 ... 194 undefined ... undefined
     259 260 261 ... 322 undefined ... undefined
     387 388 389 ... 450 undefined ... undefined]
  • 通过tensor高维切分计算接口中的mask逐比特模式,实现数据非连续计算。
    uint64_t mask[2] = { UINT64_MAX, 0 };  // mask[0]满,mask[1]空,每次只计算前64个数
    int16_t scalar = 2;
    Adds(dstLocal, srcLocal, scalar, mask, 4, { 1, 1, 8, 8 });
    结果示例如下:
    输入数据(srcLocal): [1 2 3 ... 512]
    输入数据(scalar): 2
    输出数据(dstLocal): 
    [3 4 5 ... 66 undefined ... undefined
     131 132 133 ... 194 undefined ... undefined
     259 260 261 ... 322 undefined ... undefined
     387 388 389 ... 450 undefined ... undefined]
  • 通过控制tensor高维切分计算接口的Repeat stride参数,实现数据非连续计算。
    uint64_t mask = 128;
    int16_t scalar = 2;
    // repeatTimes设置为2,表示一共需要进行2次迭代
    // dstBlkStride/srcBlkStride设置为1,表示每个迭代内dst/src参与计算的数据地址间隔为1个block
    // dstRepStride设置为8,表示相邻迭代之间dst起始地址间隔为8个block
    // srcRepStride设置为16, 表示相邻迭代之间src起始地址间隔为16个block
    Adds(dstLocal, srcLocal, scalar, mask, 2, { 1, 1, 8, 16 });
    结果示例如下:
    输入数据(srcLocal): [1 2 3 ... 512]
    输入数据(scalar): 2
    输出数据(dstLocal):
    [3 4 5 ...130 259 260 261 ... 386  undefined ... undefined]
  • 通过控制tensor高维切分计算接口的Block stride和 Repeat stride参数,实现数据非连续计算。
    uint64_t mask = 128;
    int16_t scalar = 2;
    // repeatTimes设置为2,表示一共需要进行2次迭代
    // dstBlkStride设置为2,表示每个迭代内dst参与计算的数据地址间隔为2个block
    // srcBlkStride设置为1,表示每个迭代内src参与计算的数据地址间隔为1个block
    // dstRepStride设置为16, 表示相邻迭代之间dst起始地址间隔为16个block
    // srcRepStride设置为8, 表示相邻迭代之间src起始地址间隔为8个block
    Adds(dstLocal, srcLocal, scalar, mask, 2, { 2, 1, 16, 8 });
    结果示例如下:
    输入数据(srcLocal): [1 2 3 ... 512]
    输入数据(scalar): 2
    输出数据(dstLocal):
    [3 4 5 ... 18  undefined ... undefined 19 20 21 ... 34  undefined ... undefined 
    35 36 37 ... 50 undefined ... undefined 51 52 53 ... 66 undefined ... undefined 
    67 68 69 ... 82 undefined ... undefined 83 84 85 ... 98 undefined ... undefined  
    99 100 101 ... 114 undefined ... undefined 115 116 117 ... 130 undefined ... undefined 
    131 132 133 ... 146 undefined ... undefined 147 148 149 ... 162 undefined ... undefined 
    163 164 165 ... 178 undefined ... undefined 179 180 181 ... 194 undefined ... undefined
    195 196 197 ... 210 undefined ... undefined 211 212 213 ... 226 undefined ... undefined
    227 228 229 ... 242 undefined ... undefined 243 244 245 ... 258 undefined ... undefined](每次undefined长度为16个数据)
  • 当标量双目指令使用的scalar参数需要从kernel函数外部输入时,kernel函数可以参考如下形式做修改。
    // Double buffering: two buffers per queue so that CopyIn/Compute/CopyOut
    // of adjacent tiles can overlap.
    constexpr int32_t BUFFER_NUM = 2;
    // Variant of the template where the scalar operand is passed into the
    // kernel from outside (via tiling data) instead of being hard-coded.
    // NOTE(review): DTYPE_X, DTYPE_Z, GM_ADDR, GET_TILING_DATA and
    // TILING_KEY_IS are provided by the operator build framework — they are
    // not defined in this snippet.
    class KernelBinaryScalar {
    public:
        __aicore__ inline KernelBinaryScalar() {}
        // x/z: GM input/output; scalar: value added to every element;
        // totalLength: element count across all cores; tileNum: tiles per core.
        // Each core processes blockLength = totalLength / blockNum elements,
        // split into tileNum * BUFFER_NUM tiles of tileLength elements each.
        __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, float scalar, uint32_t totalLength, uint32_t tileNum)
        {
            ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
            this->blockLength = totalLength / GetBlockNum();
            this->scalar = scalar;
            this->tileNum = tileNum;
            ASSERT(tileNum != 0 && "tile num can not be zero!");
            this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
            // Offset each core's GM view by its block index.
            xGm.SetGlobalBuffer((__gm__ DTYPE_X*)x + this->blockLength * GetBlockIdx(), this->blockLength);
            zGm.SetGlobalBuffer((__gm__ DTYPE_Z*)z + this->blockLength * GetBlockIdx(), this->blockLength);
            pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
            pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z));
        }
        // Process all tiles assigned to this core, one tile per loop iteration.
        __aicore__ inline void Process()
        {
            int32_t loopCount = this->tileNum * BUFFER_NUM;
            for (int32_t i = 0; i < loopCount; i++) {
                CopyIn(i);
                Compute(i);
                CopyOut(i);
            }
        }
    private:
        // Stage 1: copy tile `progress` from GM into UB and enqueue it.
        __aicore__ inline void CopyIn(int32_t progress)
        {
            LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
            DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
            inQueueX.EnQue(xLocal);
        }
        // Stage 2: z = x + scalar over the whole tile, using the simple
        // (element-count) interface of Adds; the scalar is cast to the
        // input data type before the vector op.
        __aicore__ inline void Compute(int32_t progress)
        {
            LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
            LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
            Adds(zLocal, xLocal, (DTYPE_X)scalar, this->tileLength);
            outQueueZ.EnQue<DTYPE_Z>(zLocal);
            inQueueX.FreeTensor(xLocal);
        }
        // Stage 3: copy the computed tile from UB back to GM.
        __aicore__ inline void CopyOut(int32_t progress)
        {
            LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
            DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
            outQueueZ.FreeTensor(zLocal);
        }
    private:
        TPipe pipe;
        TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX;
        TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
        GlobalTensor<DTYPE_X> xGm;
        GlobalTensor<DTYPE_Z> zGm;
        float scalar;       // scalar operand received from tiling data
        uint32_t blockLength; // elements handled by this core
        uint32_t tileNum;     // tiles per core (before double buffering)
        uint32_t tileLength;  // elements per tile
    };
    // Kernel entry: unpacks tiling data and dispatches on the tiling key.
    extern "C" __global__ __aicore__ void binary_scalar_simple_kernel(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
    {
        GET_TILING_DATA(tilingData, tiling);
        KernelBinaryScalar op;
        op.Init(x, z, tilingData.scalar, tilingData.totalLength, tilingData.tileNum);
        if (TILING_KEY_IS(1)) {
            op.Process();
        }
    }
搜索结果
找到“0”个结果

当前产品无相关内容

未找到相关内容,请尝试其他搜索词