更多样例

样例一

下面的样例展示了数学库kernel侧API和Tiling API GetXxxMaxMinTmpSize的配套使用方法，具体流程如下：

Host侧调用Tiling接口，获取所需临时空间的大小，并将其写入tiling data中；kernel侧再读取tiling data，获取相应的临时空间大小，并根据此分配临时空间。

Host侧Tiling API使用样例:

#include <vector>

#include "register/op_def_registry.h"
#include "register/tilingdata_base.h"
#include "tiling/tiling_api.h"

namespace optiling {

BEGIN_TILING_DATA_DEF(AsinCustomTilingData)
  TILING_DATA_FIELD_DEF(uint32_t, srcSize);
  TILING_DATA_FIELD_DEF(uint32_t, tmpBufferSize);
END_TILING_DATA_DEF;

static ge::graphStatus TilingFunc(gert::TilingContext* context)
{
    // Input source shapes.
    std::vector<int64_t> srcDims = {16, 128};
    uint32_t srcSize = 1;
    for (auto dim : srcDims) {
        srcSize *= dim;
    }
    uint32_t typeSize = 2;
    ge::Shape shape(srcDims);
    uint32_t minValue = 0;
    uint32_t maxValue = 0;
    AscendC::GetAsinMaxMinTmpSize(shape, typeSize, false, maxValue, minValue);

    auto platformInfo = context->GetPlatformInfo();
    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo);
    uint64_t tailSize = 0; // ub剩余空间大小
    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, tailSize); // 本样例中使用完整的ub空间，实际情况下tailSize需要减掉用户已使用的ub空间
    auto tmpSize = tailSize >= maxValue ? maxValue : tailSize;

    AsinCustomTilingData tiling;
    tiling.set_srcSize(srcSize);
    tiling.set_tmpBufferSize(tmpSize);
    context->SetBlockDim(1);
    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
    context->SetTilingKey(1);

    return ge::GRAPH_SUCCESS;
}
} // namespace optiling

kernel侧读取tiling data，获取相应的临时空间大小，并根据此分配临时空间:

#include "kernel_operator.h"

template <typename srcType>
class KernelAsin
{
public:
    __aicore__ inline KernelAsin() {}
    __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t srcSize, uint32_t tmpBufferSize)
    {
        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(srcGm), srcSize);
        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dstGm), srcSize);

        pipe.InitBuffer(inQueue, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(outQueue, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(tmpQueue, 1, tmpBufferSize);
        bufferSize = srcSize;
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<srcType> srcLocal = inQueue.AllocTensor<srcType>();
        AscendC::DataCopy(srcLocal, srcGlobal, bufferSize);
        inQueue.EnQue(srcLocal);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<srcType> dstLocal = outQueue.AllocTensor<srcType>();

        AscendC::LocalTensor<srcType> srcLocal = inQueue.DeQue<srcType>();
        AscendC::LocalTensor<uint8_t> sharedTmpBuffer = tmpQueue.AllocTensor<uint8_t>();
        AscendC::Asin<srcType, false>(dstLocal, srcLocal, sharedTmpBuffer, bufferSize);

        outQueue.EnQue<srcType>(dstLocal);
        inQueue.FreeTensor(srcLocal);
        tmpQueue.FreeTensor(sharedTmpBuffer);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<srcType> dstLocal = outQueue.DeQue<srcType>();
        AscendC::DataCopy(dstGlobal, dstLocal, bufferSize);
        outQueue.FreeTensor(dstLocal);
    }

private:
    AscendC::GlobalTensor<srcType> srcGlobal;
    AscendC::GlobalTensor<srcType> dstGlobal;

    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueue;
    AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue;
    uint32_t bufferSize = 0;
};

extern "C" __global__ __aicore__ void asin_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR workspace, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);
    KernelAsin<half> op;
    op.Init(srcGm, dstGm, tilingData.srcSize, tilingData.tmpBufferSize);
    if (TILING_KEY_IS(1))
    {
        op.Process();
    }
}

样例二

下面的样例展示了数学库kernel侧API和Tiling API GetXxxTmpBufferFactorSize的配套使用方法，具体流程如下：

Host侧调用Tiling接口，获取maxLiveNodeCnt和extraBuf，并推算算子单次最大计算元素数量，将其写入tiling data中；kernel侧再读取tiling data，获取该值，基于该值分配临时空间。

Host侧Tiling API使用样例:

#include <vector>
#include <cassert>
#include "register/op_def_registry.h"
#include "register/tilingdata_base.h"
#include "tiling/tiling_api.h"

namespace optiling {
BEGIN_TILING_DATA_DEF(AsinCustomTilingData)
TILING_DATA_FIELD_DEF(uint32_t, srcSize);
TILING_DATA_FIELD_DEF(uint32_t, tmpBufferSize);
END_TILING_DATA_DEF;

static ge::graphStatus TilingFunc(gert::TilingContext *context)
{
    // Input source shapes.
    std::vector<int64_t> srcDims = { 16, 128 };
    uint32_t srcSize = 1;
    uint32_t srcCurSize = 1;
    for (auto dim : srcDims) {
        srcSize *= dim;
    }
    uint32_t typeSize = 2;

    auto platformInfo = context->GetPlatformInfo();
    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo);
    uint64_t tailSize = 0; // ub剩余空间大小
    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, tailSize);

    uint32_t asinMaxLiveNodeCnt = 0;
    uint32_t asinExtraBuf = 0;

    uint32_t acosMaxLiveNodeCnt = 0;
    uint32_t acosExtraBuf = 0;

    AscendC::GetAsinTmpBufferFactorSize(typeSize, asinMaxLiveNodeCnt, asinExtraBuf);
    AscendC::GetAcosTmpBufferFactorSize(typeSize, acosMaxLiveNodeCnt, acosExtraBuf);
    // tmp的大小需要减去UB上调用api接口输入和输出占用的大小
    // 该示例中包括Asin接口的输入输出，以及Acos的输入输出，其中Asin接口的输出作为Acos的输入，因此一共需要3份src的空间大小
    auto tmpSize = tailSize - srcSize * typeSize * 3;
    assert(tmpSize >= asinExtraBuf);
    assert(tmpSize >= acosExtraBuf);
    // 计算Asin算子单次最大计算元素数量
    if (asinMaliveNodeCnt != 0) {
        srcAsinCurSize = (tmpSize - asinExtraBuf) / asinMaxLiveNodeCnt / typeSize;
    } else {
        srcAsinCurSize = srcSize;
    }
    // 计算Acos算子单次最大计算元素数量
    if (acosMaxLiveNodeCnt != 0) {
        srcAcosCurSize = (tmpSize - acosExtraBuf) / acosMaxLiveNodeCnt / typeSize; 
    } else {
        srcAcosCurSize = srcSize;
    }
    srcCurSize = min(srcAsinCurSize, srcAcosCurSize);

    AsinCustomTilingData tiling;
    tiling.set_srcSize(srcSize);
    tiling.set_srcCurSize(srcCurSize);
    tiling.set_tmpBufferSize(tmpSize);
    context->SetBlockDim(1);
    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
    context->SetTilingKey(1);

    return ge::GRAPH_SUCCESS;
}
} // namespace optiling

kernel侧样例:

#include "kernel_operator.h"
template <typename srcType>
class KernelAsin {
public:
    __aicore__ inline KernelAsin(){}
    __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t srcSizeIn, uint32_t srcCurSizeIn, uint32_t tmpBufferSize)
    {
        srcSize = srcSizeIn;
        srcCurSize = srcCurSizeIn;
        srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(srcGm), srcSize);
        dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dstGm), srcSize);

        pipe.InitBuffer(inQueue, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(outQueue, 1, srcSize * sizeof(srcType));
        pipe.InitBuffer(tmpQueue1, 1, srcCurSize * sizeof(srcType));
        pipe.InitBuffer(tmpQueue, 1, tmpBufferSize);
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<srcType> srcLocal = inQueue.AllocTensor<srcType>();
        AscendC::DataCopy(srcLocal, srcGlobal, srcSize);
        inQueue.EnQue(srcLocal);
    }
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<srcType> dstLocal = outQueue.AllocTensor<srcType>();
        AscendC::LocalTensor<srcType> srcLocal = inQueue.DeQue<srcType>();
        AscendC::LocalTensor<uint8_t> sharedTmpBuffer = tmpQueue.AllocTensor<uint8_t>();
        AscendC::LocalTensor<srcType> tmpresBuffer = tmpQueue1.AllocTensor<srcType>();

        for (int32_t offset = 0; offset < srcSize; offset += srcCurSize) {
            AscendC::Asin<srcType, false>(tmpresBuffer, srcLocal[offset], sharedTmpBuffer, srcCurSize);
            AscendC::PipeBarrier<PIPE_V>();
            AscendC::Acos<srcType, false>(dstLocal[offset], tmpresBuffer, sharedTmpBuffer, srcCurSize);
            AscendC::PipeBarrier<PIPE_V>();
        }
        outQueue.EnQue<srcType>(dstLocal);
        inQueue.FreeTensor(srcLocal);
        tmpQueue.FreeTensor(sharedTmpBuffer);
    }
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<srcType> dstLocal = outQueue.DeQue<srcType>();
        AscendC::DataCopy(dstGlobal, dstLocal, srcSize);
        outQueue.FreeTensor(dstLocal);
    }

private:
    AscendC::GlobalTensor<srcType> srcGlobal;
    AscendC::GlobalTensor<srcType> dstGlobal;

    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueue;
    AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue;
    AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue1;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue;
    uint32_t srcSize, srcCurSize;
};

extern "C" __global__ __aicore__ void kernel_asin_operator(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);
    KernelAsin<half> op;
    op.Init(srcGm, dstGm, tilingData.srcSize, tilingData.srcCurSize, tilingData.tmpBufferSize);
    if (TILING_KEY_IS(1)) {
        op.Process();
    }
}

样例三

下面以Exp接口的关键调用代码为例，辅以调用前后数据打印结果，展示模板参数isReuseSource的使用及其相关影响。

模板参数isReuseSource为bool类型；若isReuseSource为false，则接口内部计算时不复用源操作数的内存空间；若isReuseSource为true，接口内部计算时会复用源操作数的内存空间，存放一些中间结果，节省内存空间，开发者需要注意接口执行完成后，源操作数的内存空间不再是原始值。

isReuseSource = false

// 调用Exp前dstLocal、srcLocal数值
// dstLocal: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
// srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]

AscendC::Exp<float, 15, false>(dstLocal, srcLocal, 16);

// 调用Exp后dstLocal、srcLocal数值
// dstLocal: [0.000553084, 0.00150344, 0.00408677, 0.011109, 0.0301974, 0.082085, 0.22313, 0.606531, 1.64872, 4.48169, 12.1825, 33.1155, 90.0171, 244.692, 665.142, 1808.04]
// srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]

isReuseSource = true

// 调用Exp前dstLocal、srcLocal数值
// dstLocal: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
// srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]

AscendC::Exp<float, 15, true>(dstLocal, srcLocal, 16);

// 调用Exp后dstLocal、srcLocal数值
// dstLocal: [0.000553084, 0.00150344, 0.00408677, 0.011109, 0.0301974, 0.082085, 0.22313, 0.606531, 1.64872, 4.48169, 12.1825, 33.1155, 90.0171, 244.692, 665.142, 1808.04]
// srcLocal: [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

父主题： 数学库