InitConstValue
功能说明
初始化LocalTensor(TPosition为A1/A2/B1/B2)为某一个具体的数值。
函数原型
template <typename T>
__aicore__ inline void InitConstValue(const LocalTensor<T> &dstLocal, const InitConstValueParams<T> &InitConstValueParams);
参数说明
参数名称 |
输入/输出 |
含义 |
---|---|---|
dstLocal |
输出 |
目的操作数,结果矩阵,类型为LocalTensor,支持的TPosition为A1/A2/B1/B2。 如果TPosition为A1/B1,起始地址需要满足32B对齐;如果TPosition为A2/B2,起始地址需要满足512B对齐。 Atlas 训练系列产品,支持的数据类型为:half Atlas推理系列产品AI Core,支持的数据类型为:half/int16_t/uint16_t Atlas A2训练系列产品/Atlas 800I A2推理产品,支持的数据类型为:half/int16_t/uint16_t/bfloat16_t/float/int32_t/uint32_t |
InitConstValueParams |
输入 |
初始化相关参数,类型为InitConstValueParams,结构体具体定义为: struct InitConstValueParams { uint16_t repeatTimes; uint16_t blockNum; uint16_t dstGap; T initValue; }; 参数说明请参考表2。 Atlas 训练系列产品只支持配置repeatTimes,initValue,其他参数配置无效 Atlas推理系列产品AI Core只支持配置repeatTimes,initValue,其他参数配置无效 Atlas A2训练系列产品/Atlas 800I A2推理产品支持配置所有参数 |
参数名称 |
含义 |
---|---|
repeatTimes |
迭代次数。默认值为0。 Atlas 训练系列产品:repeatTimes∈[0, 255],每次迭代处理512B数据。 Atlas推理系列产品AI Core:repeatTimes∈[0, 255],每次迭代处理512B数据。 Atlas A2训练系列产品/Atlas 800I A2推理产品:repeatTimes∈[0, 32767] 。
|
blockNum |
每次迭代初始化的数据块个数,取值范围:blockNum∈[0, 32767] 。默认值为0。
|
dstGap |
目的操作数前一个迭代结束地址到后一个迭代起始地址之间的距离。
取值范围:dstGap∈[0, 32767] 。默认值为0。 |
initValue |
初始化的value值,支持的数据类型与dstLocal保持一致。 |
注意事项
- 操作数地址偏移对齐要求请参见通用约束。
支持的型号
Atlas 训练系列产品
Atlas推理系列产品AI Core
Atlas A2训练系列产品/Atlas 800I A2推理产品
调用示例
#include "kernel_operator.h" namespace AscendC { template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T> class KernelCubeMmad { public: __aicore__ inline KernelCubeMmad() { C0 = 32 / sizeof(fmap_T); C1 = channelSize / C0; coutBlocks = (Cout + 16 - 1) / 16; ho = H - dilationH * (Kh - 1); wo = W - dilationW * (Kw - 1); howo = ho * wo; howoRound = ((howo + 16 - 1) / 16) * 16; featureMapA1Size = C1 * H * W * C0; // shape: [C1, H, W, C0] weightA1Size = C1 * Kh * Kw * Cout * C0; // shape: [C1, Kh, Kw, Cout, C0] featureMapA2Size = howoRound * (C1 * Kh * Kw * C0); weightB2Size = (C1 * Kh * Kw * C0) * coutBlocks * 16; m = howo; k = C1 * Kh * Kw * C0; n = Cout; biasSize = Cout; // shape: [Cout] dstSize = coutBlocks * howo * 16; // shape: [coutBlocks, howo, 16] dstCO1Size = coutBlocks * howoRound * 16; fmRepeat = featureMapA2Size / (16 * C0); weRepeat = weightB2Size / (16 * C0); } __aicore__ inline void Init(__gm__ uint8_t* fmGm, __gm__ uint8_t* weGm, __gm__ uint8_t* biasGm, __gm__ uint8_t* dstGm) { fmGlobal.SetGlobalBuffer((__gm__ fmap_T*)fmGm); weGlobal.SetGlobalBuffer((__gm__ weight_T*)weGm); biasGlobal.SetGlobalBuffer((__gm__ dstCO1_T*)biasGm); dstGlobal.SetGlobalBuffer((__gm__ dst_T*)dstGm); pipe.InitBuffer(inQueueFmA1, 1, featureMapA1Size * sizeof(fmap_T)); pipe.InitBuffer(inQueueFmA2, 1, featureMapA2Size * sizeof(fmap_T)); pipe.InitBuffer(inQueueWeB1, 1, weightA1Size * sizeof(weight_T)); pipe.InitBuffer(inQueueWeB2, 1, weightB2Size * sizeof(weight_T)); pipe.InitBuffer(inQueueBiasA1, 1, biasSize * sizeof(dstCO1_T)); pipe.InitBuffer(outQueueCO1, 1, dstCO1Size * sizeof(dstCO1_T)); } __aicore__ inline void Process() { CopyIn(); Split(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { LocalTensor<fmap_T> featureMapA1 = inQueueFmA1.AllocTensor<fmap_T>(); LocalTensor<weight_T> weightB1 = inQueueWeB1.AllocTensor<weight_T>(); LocalTensor<dstCO1_T> biasA1 = inQueueBiasA1.AllocTensor<dstCO1_T>(); InitConstValue(featureMapA1, {1, static_cast<uint16_t>(featureMapA1Size * sizeof(fmap_T) / 32), 0, 1}); InitConstValue(weightB1, {1, static_cast<uint16_t>(weightA1Size * sizeof(weight_T) / 32), 0, 2}); DataCopy(biasA1, biasGlobal, { 1, static_cast<uint16_t>(biasSize * sizeof(dstCO1_T) / 32), 0, 0 }); inQueueFmA1.EnQue(featureMapA1); inQueueWeB1.EnQue(weightB1); inQueueBiasA1.EnQue(biasA1); } __aicore__ inline void Split() { LocalTensor<fmap_T> featureMapA1 = inQueueFmA1.DeQue<fmap_T>(); LocalTensor<weight_T> weightB1 = inQueueWeB1.DeQue<weight_T>(); LocalTensor<fmap_T> featureMapA2 = inQueueFmA2.AllocTensor<fmap_T>(); LocalTensor<weight_T> weightB2 = inQueueWeB2.AllocTensor<weight_T>(); InitConstValue(featureMapA2, {1, static_cast<uint16_t>(featureMapA2Size * sizeof(fmap_T) / 512), 0, 1}); InitConstValue(weightB2, { 1, static_cast<uint16_t>(weightB2Size * sizeof(weight_T) / 512), 0, 2}); inQueueFmA2.EnQue<fmap_T>(featureMapA2); inQueueWeB2.EnQue<weight_T>(weightB2); inQueueFmA1.FreeTensor(featureMapA1); inQueueWeB1.FreeTensor(weightB1); } __aicore__ inline void Compute() { LocalTensor<fmap_T> featureMapA2 = inQueueFmA2.DeQue<fmap_T>(); LocalTensor<weight_T> weightB2 = inQueueWeB2.DeQue<weight_T>(); LocalTensor<dstCO1_T> dstCO1 = outQueueCO1.AllocTensor<dstCO1_T>(); LocalTensor<dstCO1_T> biasA1 = inQueueBiasA1.DeQue<dstCO1_T>(); Mmad(dstCO1, featureMapA2, weightB2, biasA1, { m, n, k, true, 0, false, false, false }); outQueueCO1.EnQue<dstCO1_T>(dstCO1); inQueueFmA2.FreeTensor(featureMapA2); inQueueWeB2.FreeTensor(weightB2); inQueueBiasA1.FreeTensor(biasA1); } __aicore__ inline void CopyOut() { LocalTensor<dstCO1_T> dstCO1 = outQueueCO1.DeQue<dstCO1_T>(); FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = coutBlocks * 16; fixpipeParams.mSize = howo; fixpipeParams.srcStride = howo; fixpipeParams.dstStride = howo * BLOCK_CUBE * sizeof(dst_T) / ONE_BLK_SIZE; fixpipeParams.quantPre = deqMode; Fixpipe<dst_T, dstCO1_T, CFG_NZ>(dstGlobal, dstCO1, fixpipeParams); outQueueCO1.FreeTensor(dstCO1); } private: TPipe pipe; // feature map queue TQue<QuePosition::A1, 1> inQueueFmA1; TQue<QuePosition::A2, 1> inQueueFmA2; // weight queue TQue<QuePosition::B1, 1> inQueueWeB1; TQue<QuePosition::B2, 1> inQueueWeB2; // bias queue TQue<QuePosition::A1, 1> inQueueBiasA1; // dst queue TQue<QuePosition::CO1, 1> outQueueCO1; GlobalTensor<fmap_T> fmGlobal; GlobalTensor<weight_T> weGlobal; GlobalTensor<dst_T> dstGlobal; GlobalTensor<dstCO1_T> biasGlobal; uint16_t channelSize = 32; uint16_t H = 4, W = 4; uint8_t Kh = 2, Kw = 2; uint16_t Cout = 16; uint16_t C0, C1; uint8_t dilationH = 2, dilationW = 2; uint16_t coutBlocks, ho, wo, howo, howoRound; uint32_t featureMapA1Size, weightA1Size, featureMapA2Size, weightB2Size, biasSize, dstSize, dstCO1Size; uint16_t m, k, n; uint8_t fmRepeat, weRepeat; QuantMode_t deqMode = QuantMode_t::F322F16; }; } // namespace AscendC extern "C" __global__ __aicore__ void cube_mmad_simple_kernel(__gm__ uint8_t *fmGm, __gm__ uint8_t *weGm, __gm__ uint8_t *biasGm, __gm__ uint8_t *dstGm) { AscendC::KernelCubeMmad<dst_type, fmap_type, weight_type, dstCO1_type> op; op.Init(fmGm, weGm, biasGm, dstGm); op.Process(); }