更多样例
样例一
下面的样例展示了数学库kernel侧API和Tiling API GetXxxMaxMinTmpSize的配套使用方法,具体流程如下:
Host侧调用Tiling接口,获取所需临时空间的大小,并将其写入tiling data中;kernel侧再读取tiling data,获取相应的临时空间大小,并根据此分配临时空间。
Host侧Tiling API使用样例:
#include <vector> #include "register/op_def_registry.h" #include "register/tilingdata_base.h" #include "tiling/tiling_api.h" namespace optiling { BEGIN_TILING_DATA_DEF(AsinCustomTilingData) TILING_DATA_FIELD_DEF(uint32_t, srcSize); TILING_DATA_FIELD_DEF(uint32_t, tmpBufferSize); END_TILING_DATA_DEF; static ge::graphStatus TilingFunc(gert::TilingContext* context) { // Input source shapes. std::vector<int64_t> srcDims = {16, 128}; uint32_t srcSize = 1; for (auto dim : srcDims) { srcSize *= dim; } uint32_t typeSize = 2; ge::Shape shape(srcDims); uint32_t minValue = 0; uint32_t maxValue = 0; AscendC::GetAsinMaxMinTmpSize(shape, typeSize, false, maxValue, minValue); auto platformInfo = context->GetPlatformInfo(); auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo); uint64_t tailSize = 0; // ub剩余空间大小 ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, tailSize); // 本样例中使用完整的ub空间,实际情况下tailSize需要减掉用户已使用的ub空间 auto tmpSize = tailSize >= maxValue ? maxValue : tailSize; AsinCustomTilingData tiling; tiling.set_srcSize(srcSize); tiling.set_tmpBufferSize(tmpSize); context->SetBlockDim(1); tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); context->SetTilingKey(1); return ge::GRAPH_SUCCESS; } } // namespace optiling
kernel侧读取tiling data,获取相应的临时空间大小,并根据此分配临时空间:
#include "kernel_operator.h" template <typename srcType> class KernelAsin { public: __aicore__ inline KernelAsin() {} __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t srcSize, uint32_t tmpBufferSize) { srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(srcGm), srcSize); dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dstGm), srcSize); pipe.InitBuffer(inQueue, 1, srcSize * sizeof(srcType)); pipe.InitBuffer(outQueue, 1, srcSize * sizeof(srcType)); pipe.InitBuffer(tmpQueue, 1, tmpBufferSize); bufferSize = srcSize; } __aicore__ inline void Process() { CopyIn(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<srcType> srcLocal = inQueue.AllocTensor<srcType>(); AscendC::DataCopy(srcLocal, srcGlobal, bufferSize); inQueue.EnQue(srcLocal); } __aicore__ inline void Compute() { AscendC::LocalTensor<srcType> dstLocal = outQueue.AllocTensor<srcType>(); AscendC::LocalTensor<srcType> srcLocal = inQueue.DeQue<srcType>(); AscendC::LocalTensor<uint8_t> sharedTmpBuffer = tmpQueue.AllocTensor<uint8_t>(); AscendC::Asin<srcType, false>(dstLocal, srcLocal, sharedTmpBuffer, bufferSize); outQueue.EnQue<srcType>(dstLocal); inQueue.FreeTensor(srcLocal); tmpQueue.FreeTensor(sharedTmpBuffer); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<srcType> dstLocal = outQueue.DeQue<srcType>(); AscendC::DataCopy(dstGlobal, dstLocal, bufferSize); outQueue.FreeTensor(dstLocal); } private: AscendC::GlobalTensor<srcType> srcGlobal; AscendC::GlobalTensor<srcType> dstGlobal; AscendC::TPipe pipe; AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueue; AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue; AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue; uint32_t bufferSize = 0; }; extern "C" __global__ __aicore__ void asin_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); KernelAsin<half> op; op.Init(srcGm, dstGm, tilingData.srcSize, tilingData.tmpBufferSize); if (TILING_KEY_IS(1)) { op.Process(); } }
样例二
下面的样例展示了数学库kernel侧API和Tiling API GetXxxTmpBufferFactorSize的配套使用方法,具体流程如下:
Host侧调用Tiling接口,获取maxLiveNodeCnt和extraBuf,并推算算子单次最大计算元素数量,将其写入tiling data中;kernel侧再读取tiling data,获取该值,基于该值分配临时空间。
Host侧Tiling API使用样例:
#include <vector> #include <cassert> #include "register/op_def_registry.h" #include "register/tilingdata_base.h" #include "tiling/tiling_api.h" namespace optiling { BEGIN_TILING_DATA_DEF(AsinCustomTilingData) TILING_DATA_FIELD_DEF(uint32_t, srcSize); TILING_DATA_FIELD_DEF(uint32_t, tmpBufferSize); END_TILING_DATA_DEF; static ge::graphStatus TilingFunc(gert::TilingContext *context) { // Input source shapes. std::vector<int64_t> srcDims = { 16, 128 }; uint32_t srcSize = 1; uint32_t srcCurSize = 1; for (auto dim : srcDims) { srcSize *= dim; } uint32_t typeSize = 2; auto platformInfo = context->GetPlatformInfo(); auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfo); uint64_t tailSize = 0; // ub剩余空间大小 ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, tailSize); uint32_t asinMaxLiveNodeCnt = 0; uint32_t asinExtraBuf = 0; uint32_t acosMaxLiveNodeCnt = 0; uint32_t acosExtraBuf = 0; AscendC::GetAsinTmpBufferFactorSize(typeSize, asinMaxLiveNodeCnt, asinExtraBuf); AscendC::GetAcosTmpBufferFactorSize(typeSize, acosMaxLiveNodeCnt, acosExtraBuf); // tmp的大小需要减去UB上调用api接口输入和输出占用的大小 // 该示例中包括Asin接口的输入输出,以及Acos的输入输出,其中Asin接口的输出作为Acos的输入,因此一共需要3份src的空间大小 auto tmpSize = tailSize - srcSize * typeSize * 3; assert(tmpSize >= asinExtraBuf); assert(tmpSize >= acosExtraBuf); // 计算Asin算子单次最大计算元素数量 if (asinMaliveNodeCnt != 0) { srcAsinCurSize = (tmpSize - asinExtraBuf) / asinMaxLiveNodeCnt / typeSize; } else { srcAsinCurSize = srcSize; } // 计算Acos算子单次最大计算元素数量 if (acosMaxLiveNodeCnt != 0) { srcAcosCurSize = (tmpSize - acosExtraBuf) / acosMaxLiveNodeCnt / typeSize; } else { srcAcosCurSize = srcSize; } srcCurSize = min(srcAsinCurSize, srcAcosCurSize); AsinCustomTilingData tiling; tiling.set_srcSize(srcSize); tiling.set_srcCurSize(srcCurSize); tiling.set_tmpBufferSize(tmpSize); context->SetBlockDim(1); tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); context->SetTilingKey(1); return ge::GRAPH_SUCCESS; } } // namespace optiling
kernel侧样例:
#include "kernel_operator.h" template <typename srcType> class KernelAsin { public: __aicore__ inline KernelAsin(){} __aicore__ inline void Init(GM_ADDR srcGm, GM_ADDR dstGm, uint32_t srcSizeIn, uint32_t srcCurSizeIn, uint32_t tmpBufferSize) { srcSize = srcSizeIn; srcCurSize = srcCurSizeIn; srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(srcGm), srcSize); dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ srcType *>(dstGm), srcSize); pipe.InitBuffer(inQueue, 1, srcSize * sizeof(srcType)); pipe.InitBuffer(outQueue, 1, srcSize * sizeof(srcType)); pipe.InitBuffer(tmpQueue1, 1, srcCurSize * sizeof(srcType)); pipe.InitBuffer(tmpQueue, 1, tmpBufferSize); } __aicore__ inline void Process() { CopyIn(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<srcType> srcLocal = inQueue.AllocTensor<srcType>(); AscendC::DataCopy(srcLocal, srcGlobal, srcSize); inQueue.EnQue(srcLocal); } __aicore__ inline void Compute() { AscendC::LocalTensor<srcType> dstLocal = outQueue.AllocTensor<srcType>(); AscendC::LocalTensor<srcType> srcLocal = inQueue.DeQue<srcType>(); AscendC::LocalTensor<uint8_t> sharedTmpBuffer = tmpQueue.AllocTensor<uint8_t>(); AscendC::LocalTensor<srcType> tmpresBuffer = tmpQueue1.AllocTensor<srcType>(); for (int32_t offset = 0; offset < srcSize; offset += srcCurSize) { AscendC::Asin<srcType, false>(tmpresBuffer, srcLocal[offset], sharedTmpBuffer, srcCurSize); AscendC::PipeBarrier<PIPE_V>(); AscendC::Acos<srcType, false>(dstLocal[offset], tmpresBuffer, sharedTmpBuffer, srcCurSize); AscendC::PipeBarrier<PIPE_V>(); } outQueue.EnQue<srcType>(dstLocal); inQueue.FreeTensor(srcLocal); tmpQueue.FreeTensor(sharedTmpBuffer); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<srcType> dstLocal = outQueue.DeQue<srcType>(); AscendC::DataCopy(dstGlobal, dstLocal, srcSize); outQueue.FreeTensor(dstLocal); } private: AscendC::GlobalTensor<srcType> srcGlobal; AscendC::GlobalTensor<srcType> dstGlobal; AscendC::TPipe pipe; AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueue; AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue; AscendC::TQue<AscendC::QuePosition::VECCALC, 1> tmpQueue1; AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueue; uint32_t srcSize, srcCurSize; }; extern "C" __global__ __aicore__ void kernel_asin_operator(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); KernelAsin<half> op; op.Init(srcGm, dstGm, tilingData.srcSize, tilingData.srcCurSize, tilingData.tmpBufferSize); if (TILING_KEY_IS(1)) { op.Process(); } }
样例三
下面以Exp接口的关键调用代码为例,辅以调用前后数据打印结果,展示模板参数isReuseSource的使用及其相关影响。
模板参数isReuseSource为bool类型;若isReuseSource为false,则接口内部计算时不复用源操作数的内存空间;若isReuseSource为true,接口内部计算时会复用源操作数的内存空间,存放一些中间结果,节省内存空间,开发者需要注意接口执行完成后,源操作数的内存空间不再是原始值。
- isReuseSource = false
// 调用Exp前dstLocal、srcLocal数值 // dstLocal: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] // srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5] AscendC::Exp<float, 15, false>(dstLocal, srcLocal, 16); // 调用Exp后dstLocal、srcLocal数值 // dstLocal: [0.000553084, 0.00150344, 0.00408677, 0.011109, 0.0301974, 0.082085, 0.22313, 0.606531, 1.64872, 4.48169, 12.1825, 33.1155, 90.0171, 244.692, 665.142, 1808.04] // srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]
- isReuseSource = true
// 调用Exp前dstLocal、srcLocal数值 // dstLocal: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] // srcLocal: [-7.5, -6.5, -5.5, -4.5, -3.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5] AscendC::Exp<float, 15, true>(dstLocal, srcLocal, 16); // 调用Exp后dstLocal、srcLocal数值 // dstLocal: [0.000553084, 0.00150344, 0.00408677, 0.011109, 0.0301974, 0.082085, 0.22313, 0.606531, 1.64872, 4.48169, 12.1825, 33.1155, 90.0171, 244.692, 665.142, 1808.04] // srcLocal: [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
父主题: 数学库