InitConstValue
功能说明
初始化LocalTensor(TPosition为A1/A2/B1/B2)为某一个具体的数值。
函数原型
1 2 |
template <typename T> __aicore__ inline void InitConstValue(const LocalTensor<T> &dstLocal, const InitConstValueParams<T> &InitConstValueParams) |
参数说明
参数名称 |
输入/输出 |
含义 |
||
---|---|---|---|---|
dstLocal |
输出 |
目的操作数,结果矩阵,类型为LocalTensor,支持的TPosition为A1/A2/B1/B2。 如果TPosition为A1/B1,起始地址需要满足32B对齐;如果TPosition为A2/B2,起始地址需要满足512B对齐。 Atlas 训练系列产品,支持的数据类型为:half Atlas推理系列产品AI Core,支持的数据类型为:half/int16_t/uint16_t Atlas A2训练系列产品/Atlas 800I A2推理产品,支持的数据类型为:half/int16_t/uint16_t/bfloat16_t/float/int32_t/uint32_t Atlas 200/500 A2推理产品, 支持的数据类型为:half/int16_t/uint16_t/bfloat16_t/float/int32_t/uint32_t |
||
InitConstValueParams |
输入 |
初始化相关参数,类型为InitConstValueParams,结构体具体定义为:
参数说明请参考表2。 Atlas 训练系列产品只支持配置repeatTimes、initValue,其他参数配置无效。 Atlas推理系列产品AI Core只支持配置repeatTimes、initValue,其他参数配置无效。 Atlas A2训练系列产品/Atlas 800I A2推理产品支持配置所有参数。 Atlas 200/500 A2推理产品, 支持配置所有参数。 |
参数名称 |
含义 |
---|---|
repeatTimes |
迭代次数。默认值为0。 Atlas 训练系列产品:repeatTimes∈[0, 255],每次迭代处理512B数据。 Atlas推理系列产品AI Core:repeatTimes∈[0, 255],每次迭代处理512B数据。 Atlas A2训练系列产品/Atlas 800I A2推理产品:repeatTimes∈[0, 32767] 。
Atlas 200/500 A2推理产品:repeatTimes∈[0, 32767] 。
|
blockNum |
每次迭代初始化的数据块个数,取值范围:blockNum∈[0, 32767] 。默认值为0。
|
dstGap |
目的操作数前一个迭代结束地址到后一个迭代起始地址之间的距离。
取值范围:dstGap∈[0, 32767] 。默认值为0。 |
initValue |
初始化的value值,支持的数据类型与dstLocal保持一致。 |
注意事项
- 操作数地址偏移对齐要求请参见通用约束。
支持的型号
Atlas 训练系列产品
Atlas推理系列产品AI Core
Atlas A2训练系列产品/Atlas 800I A2推理产品
Atlas 200/500 A2推理产品
调用示例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
#include "kernel_operator.h" template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T> class KernelCubeMmad { public: __aicore__ inline KernelCubeMmad() { C0 = 32 / sizeof(fmap_T); C1 = channelSize / C0; coutBlocks = (Cout + 16 - 1) / 16; ho = H - dilationH * (Kh - 1); wo = W - dilationW * (Kw - 1); howo = ho * wo; howoRound = ((howo + 16 - 1) / 16) * 16; featureMapA1Size = C1 * H * W * C0; // shape: [C1, H, W, C0] weightA1Size = C1 * Kh * Kw * Cout * C0; // shape: [C1, Kh, Kw, Cout, C0] featureMapA2Size = howoRound * (C1 * Kh * Kw * C0); weightB2Size = (C1 * Kh * Kw * C0) * coutBlocks * 16; m = howo; k = C1 * Kh * Kw * C0; n = Cout; biasSize = Cout; // shape: [Cout] dstSize = coutBlocks * howo * 16; // shape: [coutBlocks, howo, 16] dstCO1Size = coutBlocks * howoRound * 16; fmRepeat = featureMapA2Size / (16 * C0); weRepeat = weightB2Size / (16 * C0); } __aicore__ inline void Init(__gm__ uint8_t* fmGm, __gm__ uint8_t* weGm, __gm__ uint8_t* biasGm, __gm__ uint8_t* dstGm) { fmGlobal.SetGlobalBuffer((__gm__ fmap_T*)fmGm); weGlobal.SetGlobalBuffer((__gm__ weight_T*)weGm); biasGlobal.SetGlobalBuffer((__gm__ dstCO1_T*)biasGm); dstGlobal.SetGlobalBuffer((__gm__ dst_T*)dstGm); pipe.InitBuffer(inQueueFmA1, 1, featureMapA1Size * sizeof(fmap_T)); pipe.InitBuffer(inQueueFmA2, 1, featureMapA2Size * sizeof(fmap_T)); pipe.InitBuffer(inQueueWeB1, 1, weightA1Size * sizeof(weight_T)); pipe.InitBuffer(inQueueWeB2, 1, weightB2Size * sizeof(weight_T)); pipe.InitBuffer(inQueueBiasA1, 1, biasSize * sizeof(dstCO1_T)); pipe.InitBuffer(outQueueCO1, 1, dstCO1Size * sizeof(dstCO1_T)); } __aicore__ inline void Process() { CopyIn(); Split(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<fmap_T> featureMapA1 = inQueueFmA1.AllocTensor<fmap_T>(); AscendC::LocalTensor<weight_T> weightB1 = inQueueWeB1.AllocTensor<weight_T>(); AscendC::LocalTensor<dstCO1_T> biasA1 = inQueueBiasA1.AllocTensor<dstCO1_T>(); AscendC::InitConstValue(featureMapA1, {1, static_cast<uint16_t>(featureMapA1Size * sizeof(fmap_T) / 32), 0, 1}); AscendC::InitConstValue(weightB1, {1, static_cast<uint16_t>(weightA1Size * sizeof(weight_T) / 32), 0, 2}); AscendC::DataCopy(biasA1, biasGlobal, { 1, static_cast<uint16_t>(biasSize * sizeof(dstCO1_T) / 32), 0, 0 }); inQueueFmA1.EnQue(featureMapA1); inQueueWeB1.EnQue(weightB1); inQueueBiasA1.EnQue(biasA1); } __aicore__ inline void Split() { AscendC::LocalTensor<fmap_T> featureMapA1 = inQueueFmA1.DeQue<fmap_T>(); AscendC::LocalTensor<weight_T> weightB1 = inQueueWeB1.DeQue<weight_T>(); AscendC::LocalTensor<fmap_T> featureMapA2 = inQueueFmA2.AllocTensor<fmap_T>(); AscendC::LocalTensor<weight_T> weightB2 = inQueueWeB2.AllocTensor<weight_T>(); AscendC::InitConstValue(featureMapA2, {1, static_cast<uint16_t>(featureMapA2Size * sizeof(fmap_T) / 512), 0, 1}); AscendC::InitConstValue(weightB2, { 1, static_cast<uint16_t>(weightB2Size * sizeof(weight_T) / 512), 0, 2}); inQueueFmA2.EnQue<fmap_T>(featureMapA2); inQueueWeB2.EnQue<weight_T>(weightB2); inQueueFmA1.FreeTensor(featureMapA1); inQueueWeB1.FreeTensor(weightB1); } __aicore__ inline void Compute() { AscendC::LocalTensor<fmap_T> featureMapA2 = inQueueFmA2.DeQue<fmap_T>(); AscendC::LocalTensor<weight_T> weightB2 = inQueueWeB2.DeQue<weight_T>(); AscendC::LocalTensor<dstCO1_T> dstCO1 = outQueueCO1.AllocTensor<dstCO1_T>(); AscendC::LocalTensor<dstCO1_T> biasA1 = inQueueBiasA1.DeQue<dstCO1_T>(); AscendC::Mmad(dstCO1, featureMapA2, weightB2, biasA1, { m, n, k, true, 0, false, false, false }); outQueueCO1.EnQue<dstCO1_T>(dstCO1); inQueueFmA2.FreeTensor(featureMapA2); inQueueWeB2.FreeTensor(weightB2); inQueueBiasA1.FreeTensor(biasA1); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<dstCO1_T> dstCO1 = outQueueCO1.DeQue<dstCO1_T>(); AscendC::FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = coutBlocks * 16; fixpipeParams.mSize = howo; fixpipeParams.srcStride = howo; fixpipeParams.dstStride = howo * AscendC::BLOCK_CUBE * sizeof(dst_T) / AscendC::ONE_BLK_SIZE; fixpipeParams.quantPre = deqMode; AscendC::Fixpipe<dst_T, dstCO1_T, AscendC::CFG_NZ>(dstGlobal, dstCO1, fixpipeParams); outQueueCO1.FreeTensor(dstCO1); } private: AscendC::TPipe pipe; // feature map queue AscendC::TQue<AscendC::QuePosition::A1, 1> inQueueFmA1; AscendC::TQue<AscendC::QuePosition::A2, 1> inQueueFmA2; // weight queue AscendC::TQue<AscendC::QuePosition::B1, 1> inQueueWeB1; AscendC::TQue<AscendC::QuePosition::B2, 1> inQueueWeB2; // bias queue AscendC::TQue<AscendC::QuePosition::A1, 1> inQueueBiasA1; // dst queue AscendC::TQue<AscendC::QuePosition::CO1, 1> outQueueCO1; AscendC::GlobalTensor<fmap_T> fmGlobal; AscendC::GlobalTensor<weight_T> weGlobal; AscendC::GlobalTensor<dst_T> dstGlobal; AscendC::GlobalTensor<dstCO1_T> biasGlobal; uint16_t channelSize = 32; uint16_t H = 4, W = 4; uint8_t Kh = 2, Kw = 2; uint16_t Cout = 16; uint16_t C0, C1; uint8_t dilationH = 2, dilationW = 2; uint16_t coutBlocks, ho, wo, howo, howoRound; uint32_t featureMapA1Size, weightA1Size, featureMapA2Size, weightB2Size, biasSize, dstSize, dstCO1Size; uint16_t m, k, n; uint8_t fmRepeat, weRepeat; AscendC::QuantMode_t deqMode = AscendC::QuantMode_t::F322F16; }; extern "C" __global__ __aicore__ void cube_mmad_simple_kernel(__gm__ uint8_t *fmGm, __gm__ uint8_t *weGm, __gm__ uint8_t *biasGm, __gm__ uint8_t *dstGm) { KernelCubeMmad<half, half, half, half> op; op.Init(fmGm, weGm, biasGm, dstGm); op.Process(); } |