实现样例
Ascend C Matmul算子实现文件:matmul_custom.cpp
/* * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. * * Function : c = a * b (matrix multiplication) * This sample is a very basic sample that implements Matmul on Ascend platform. * In this sample: * Shape of matrix a is [m, k]: [32, 32] * Shape of matrix b is [k, n]: [32, 32] * Shape of matrix c is [m, n]: [32, 32] */ #include "kernel_operator.h" class KernelMatmul { public: __aicore__ inline KernelMatmul() { aSize = m * k; bSize = k * n; cSize = m * m; mBlocks = m / 16; nBlocks = n / 16; kBlocks = k / 16; } __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR c) { aGM.SetGlobalBuffer((__gm__ half *)a); bGM.SetGlobalBuffer((__gm__ half *)b); cGM.SetGlobalBuffer((__gm__ float *)c); pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(half)); pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(half)); pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(half) / 2); pipe.InitBuffer(outQueueCO1, 2, cSize * sizeof(float) / 2); pipe.InitBuffer(outQueueCO2, 1, cSize * sizeof(float)); } __aicore__ inline void Process() { CopyIn(); SplitA(); AscendC::LocalTensor<half> b1Local = inQueueB1.DeQue<half>(); AscendC::LocalTensor<half> a2Local = inQueueA2.DeQue<half>(); AscendC::LocalTensor<float> c2Local = outQueueCO2.AllocTensor<float>(); // split matrix b into 2 parts, [32, 16] and [32, 16] for (int i = 0; i < 2; ++i) { SplitB(b1Local, i); Compute(a2Local); Aggregate(c2Local, i); } inQueueB1.FreeTensor(b1Local); inQueueA2.FreeTensor(a2Local); outQueueCO2.EnQue<float>(c2Local); CopyOut(); } private: __aicore__ inline void CopyND2NZ(const AscendC::LocalTensor<half> &dst, const AscendC::GlobalTensor<half> &src, const uint16_t height, const uint16_t width) { for (int i = 0; i < width / 16; ++i) { int srcOffset = i * 16; int dstOffset = i * 16 * height; AscendC::DataCopy(dst[dstOffset], src[srcOffset], {height, 1, uint16_t(width / 16 - 1), 0}); } } __aicore__ inline void CopyIn() { AscendC::LocalTensor<half> a1Local = inQueueA1.AllocTensor<half>(); AscendC::LocalTensor<half> b1Local = inQueueB1.AllocTensor<half>(); CopyND2NZ(a1Local, aGM, m, k); CopyND2NZ(b1Local, bGM, k, n); inQueueA1.EnQue(a1Local); inQueueB1.EnQue(b1Local); } __aicore__ inline void SplitA() { int srcOffset = 0; int dstOffset = 0; AscendC::LocalTensor<half> a1Local = inQueueA1.DeQue<half>(); AscendC::LocalTensor<half> a2Local = inQueueA2.AllocTensor<half>(); // transform nz to zz for (int i = 0; i < mBlocks; ++i) { AscendC::LoadData2dParams loadDataParams; loadDataParams.repeatTimes = kBlocks; loadDataParams.srcStride = mBlocks; loadDataParams.ifTranspose = false; LoadData(a2Local[dstOffset], a1Local[srcOffset], loadDataParams); srcOffset += 16 * 16; dstOffset += k * 16; } inQueueA2.EnQue<half>(a2Local); inQueueA1.FreeTensor(a1Local); } __aicore__ inline void SplitB(const AscendC::LocalTensor<half> &b1Local, const int bSplitIdx) { AscendC::LocalTensor<half> b2Local = inQueueB2.AllocTensor<half>(); // transform nz to zn AscendC::LoadData2dParams loadDataParams; loadDataParams.repeatTimes = kBlocks; loadDataParams.srcStride = 1; loadDataParams.ifTranspose = true; AscendC::LoadData(b2Local, b1Local[bSplitIdx * bSize / 2], loadDataParams); inQueueB2.EnQue<half>(b2Local); } __aicore__ inline void Compute(const AscendC::LocalTensor<half> &a2Local) { AscendC::LocalTensor<half> b2Local = inQueueB2.DeQue<half>(); AscendC::LocalTensor<float> c1Local = outQueueCO1.AllocTensor<float>(); AscendC::Mmad(c1Local, a2Local, b2Local, {m, uint16_t(n / 2), k, false, 0, false, false, false}); outQueueCO1.EnQue<float>(c1Local); inQueueB2.FreeTensor(b2Local); } __aicore__ inline void Aggregate(const AscendC::LocalTensor<float> &c2Local, const int bSplitIdx) { AscendC::LocalTensor<float> c1Local = outQueueCO1.DeQue<float>(); AscendC::DataCopyParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = 2; AscendC::DataCopyEnhancedParams enhancedParams; enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX; AscendC::DataCopy(c2Local[bSplitIdx * cSize / 2], c1Local, dataCopyParams, enhancedParams); outQueueCO1.FreeTensor(c1Local); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<float> c2Local = outQueueCO2.DeQue<float>(); // transform nz to nd for (int i = 0; i < nBlocks; ++i) { AscendC::DataCopy(cGM[i * 16], c2Local[i * m * 16], {m, 2, 0, uint16_t((nBlocks - 1) * 2)}); } outQueueCO2.FreeTensor(c2Local); } private: AscendC::TPipe pipe; AscendC::TQue<AscendC::QuePosition::A1, 1> inQueueA1; AscendC::TQue<AscendC::QuePosition::A2, 1> inQueueA2; AscendC::TQue<AscendC::QuePosition::B1, 1> inQueueB1; AscendC::TQue<AscendC::QuePosition::B2, 2> inQueueB2; // dst queue AscendC::TQue<AscendC::QuePosition::CO1, 2> outQueueCO1; AscendC::TQue<AscendC::QuePosition::CO2, 1> outQueueCO2; AscendC::GlobalTensor<half> aGM, bGM; AscendC::GlobalTensor<float> cGM; uint16_t m = 32; uint16_t n = 32; uint16_t k = 32; uint16_t aSize, bSize, cSize, mBlocks, nBlocks, kBlocks; }; extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c) { KernelMatmul op; op.Init(a, b, c); op.Process(); } #ifndef ASCENDC_CPU_DEBUG // call of kernel function void matmul_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *a, uint8_t *b, uint8_t *c) { matmul_custom<<<blockDim, l2ctrl, stream>>>(a, b, c); } #endif
父主题: 矩阵编程(基础API)