昇腾社区首页
中文
注册

更多样例

样例模板

为了方便您快速运行具体指令中的参考样例,本章节提供标量三目指令的样例模板。

  • 完整样例一:srcLocal、scalar、dstLocal的数据类型均为half。
     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    #include "kernel_operator.h"
    class KernelAxpy {
    public:
        __aicore__ inline KernelAxpy() {}
        __aicore__ inline void Init(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
        {
            srcGlobal.SetGlobalBuffer((__gm__ half*)srcGm);
            dstGlobal.SetGlobalBuffer((__gm__ half*)dstGm);
            pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(half));
            pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(half));
        }
        __aicore__ inline void Process()
        {
            CopyIn();
            Compute();
            CopyOut();
        }
    private:
        __aicore__ inline void CopyIn()
        {
            AscendC::LocalTensor<half> srcLocal = inQueueSrc.AllocTensor<half>();
            AscendC::DataCopy(srcLocal, srcGlobal, 512);
            inQueueSrc.EnQue(srcLocal);
        }
        __aicore__ inline void Compute()
        {
            AscendC::LocalTensor<half> srcLocal = inQueueSrc.DeQue<half>();
            AscendC::LocalTensor<half> dstLocal = outQueueDst.AllocTensor<half>();
     
            AscendC::Axpy(dstLocal, srcLocal, (half)2.0, 512);
     
            outQueueDst.EnQue<half>(dstLocal);
            inQueueSrc.FreeTensor(srcLocal);
        }
        __aicore__ inline void CopyOut()
        {
            AscendC::LocalTensor<half> dstLocal = outQueueDst.DeQue<half>();
            AscendC::DataCopy(dstGlobal, dstLocal, 512);
            outQueueDst.FreeTensor(dstLocal);
        }
    private:
        AscendC::TPipe pipe;
        AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc;
        AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst;
        AscendC::GlobalTensor<half> srcGlobal, dstGlobal;
    };
    extern "C" __global__ __aicore__ void kernel_vec_ternary_scalar_Axpy_half_2_half(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
    {
        KernelAxpy op;
        op.Init(srcGm, dstGm);
        op.Process();
    }
    

    结果示例如下:

    输入数据(src1Gm):
    [1. 1. 1. 1. 1. 1. ... 1.]
    输出数据(dstGm):
    [2. 2. 2. 2. 2. 2. ... 2.]
  • 完整样例二:srcLocal、scalar的数据类型为half,dstLocal的数据类型为float。
     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    #include "kernel_operator.h"
    class KernelAxpy {
    public:
        __aicore__ inline KernelAxpy() {}
        __aicore__ inline void Init(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
        {
            srcGlobal.SetGlobalBuffer((__gm__ half*)srcGm);
            dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm);
            pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(float));
            pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(half));
        }
        __aicore__ inline void Process()
        {
            CopyIn();
            Compute();
            CopyOut();
        }
    private:
        __aicore__ inline void CopyIn()
        {
            AscendC::LocalTensor<half> srcLocal = inQueueSrc.AllocTensor<half>();
            AscendC::DataCopy(srcLocal, srcGlobal, 512);
            inQueueSrc.EnQue(srcLocal);
        }
        __aicore__ inline void Compute()
        {
            AscendC::LocalTensor<half> srcLocal = inQueueSrc.DeQue<half>();
            AscendC::LocalTensor<float> dstLocal = outQueueDst.AllocTensor<float>();
     
            AscendC::Axpy(dstLocal, srcLocal, (half)2.0, 64, 8,{ 1, 1, 8, 4 });
     
            outQueueDst.EnQue<float>(dstLocal);
            inQueueSrc.FreeTensor(srcLocal);
        }
        __aicore__ inline void CopyOut()
        {
            AscendC::LocalTensor<float> dstLocal = outQueueDst.DeQue<float>();
            AscendC::DataCopy(dstGlobal, dstLocal, 512);
            outQueueDst.FreeTensor(dstLocal);
        }
    private:
        AscendC::TPipe pipe;
        AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc;
        AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst;
        AscendC::GlobalTensor<half> srcGlobal;
        AscendC::GlobalTensor<float> dstGlobal;
    };
    extern "C" __global__ __aicore__ void kernel_vec_ternary_scalar_Axpy_half_2_float(__gm__ uint8_t* srcGm, __gm__ uint8_t* dstGm)
    {
        KernelAxpy op;
        op.Init(srcGm, dstGm);
        op.Process();
    }
    

    结果示例如下:

    输入数据(src1Gm):
    [1. 1. 1. 1. 1. 1. ... 1.]
    输出数据(dstGm):
    [2. 2. 2. 2. 2. 2. ... 2.]