昇腾社区首页
中文
注册

更多样例

样例模板

为了方便您快速运行具体指令中的参考样例,本章节提供标量三目指令的样例模板。

  • 完整样例一:srcLocal、scalar、dstLocal的数据类型均为half。
    #include "kernel_operator.h"
    class KernelAxpy {
    public:
        __aicore__ inline KernelAxpy() {}
        __aicore__ inline void Init(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
        {
            srcGlobal.SetGlobalBuffer((__gm__ half*)srcGm);
            dstGlobal.SetGlobalBuffer((__gm__ half*)dstGm);
            pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(half));
            pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(half));
        }
        __aicore__ inline void Process()
        {
            CopyIn();
            Compute();
            CopyOut();
        }
    private:
        __aicore__ inline void CopyIn()
        {
            AscendC::LocalTensor<half> srcLocal = inQueueSrc.AllocTensor<half>();
            AscendC::DataCopy(srcLocal, srcGlobal, 512);
            inQueueSrc.EnQue(srcLocal);
        }
        __aicore__ inline void Compute()
        {
            AscendC::LocalTensor<half> srcLocal = inQueueSrc.DeQue<half>();
            AscendC::LocalTensor<half> dstLocal = outQueueDst.AllocTensor<half>();
     
            AscendC::Axpy(dstLocal, srcLocal, (half)2.0, 512);
     
            outQueueDst.EnQue<half>(dstLocal);
            inQueueSrc.FreeTensor(srcLocal);
        }
        __aicore__ inline void CopyOut()
        {
            AscendC::LocalTensor<half> dstLocal = outQueueDst.DeQue<half>();
            AscendC::DataCopy(dstGlobal, dstLocal, 512);
            outQueueDst.FreeTensor(dstLocal);
        }
    private:
        AscendC::TPipe pipe;
        AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc;
        AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst;
        AscendC::GlobalTensor<half> srcGlobal, dstGlobal;
    };
    // Kernel entry point for the half -> half Axpy sample: builds the operator,
    // binds the two global-memory buffers, then runs the pipeline once.
    extern "C" __global__ __aicore__ void kernel_vec_ternary_scalar_Axpy_half_2_half(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
    {
        KernelAxpy kernel;
        kernel.Init(srcGm, dstGm);
        kernel.Process();
    }

    结果示例如下:

    输入数据(srcGm):
    [1. 1. 1. 1. 1. 1. ... 1.]
    输出数据(dstGm):
    [2. 2. 2. 2. 2. 2. ... 2.]
  • 完整样例二:srcLocal、scalar的数据类型为half,dstLocal的数据类型为float。
    #include "kernel_operator.h"
    class KernelAxpy {
    public:
        __aicore__ inline KernelAxpy() {}
        __aicore__ inline void Init(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
        {
            srcGlobal.SetGlobalBuffer((__gm__ half*)srcGm);
            dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm);
            pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(float));
            pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(half));
        }
        __aicore__ inline void Process()
        {
            CopyIn();
            Compute();
            CopyOut();
        }
    private:
        __aicore__ inline void CopyIn()
        {
            AscendC::LocalTensor<half> srcLocal = inQueueSrc.AllocTensor<half>();
            AscendC::DataCopy(srcLocal, srcGlobal, 512);
            inQueueSrc.EnQue(srcLocal);
        }
        __aicore__ inline void Compute()
        {
            AscendC::LocalTensor<half> srcLocal = inQueueSrc.DeQue<half>();
            AscendC::LocalTensor<float> dstLocal = outQueueDst.AllocTensor<float>();
     
            AscendC::Axpy(dstLocal, srcLocal, (half)2.0, 64, 8,{ 1, 1, 8, 4 });
     
            outQueueDst.EnQue<float>(dstLocal);
            inQueueSrc.FreeTensor(srcLocal);
        }
        __aicore__ inline void CopyOut()
        {
            AscendC::LocalTensor<float> dstLocal = outQueueDst.DeQue<float>();
            AscendC::DataCopy(dstGlobal, dstLocal, 512);
            outQueueDst.FreeTensor(dstLocal);
        }
    private:
        AscendC::TPipe pipe;
        AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc;
        AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst;
        AscendC::GlobalTensor<half> srcGlobal;
        AscendC::GlobalTensor<float> dstGlobal;
    };
    // Kernel entry point for the half -> float Axpy sample: builds the operator,
    // binds the two global-memory buffers, then runs the pipeline once.
    extern "C" __global__ __aicore__ void kernel_vec_ternary_scalar_Axpy_half_2_float(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
    {
        KernelAxpy kernel;
        kernel.Init(srcGm, dstGm);
        kernel.Process();
    }

    结果示例如下:

    输入数据(srcGm):
    [1. 1. 1. 1. 1. 1. ... 1.]
    输出数据(dstGm):
    [2. 2. 2. 2. 2. 2. ... 2.]