LoadDataUnzip

功能说明

将GM上的数据解压并搬运到A1/B1/B2上。

函数原型

/**
 * Decompresses data located in global memory (GM) and moves it to A1/B1/B2.
 *
 * @tparam T        Element type; per the parameter table below, int8_t is supported.
 * @param dstLocal  Output. Destination LocalTensor with TPosition A1, B1 or B2.
 * @param srcLocal  Input. Source operand; note that despite its name it is a GlobalTensor,
 *                  and its data type must match dstLocal.
 */
template <typename T>
__aicore__ inline void LoadDataUnzip(const LocalTensor<T>& dstLocal, const GlobalTensor<T>& srcLocal)

参数说明

表1 参数说明
参数名称	输入/输出	含义
dstLocal	输出	目的操作数，类型为LocalTensor，支持的TPosition为A1/B1/B2。LocalTensor的起始地址需要保证：TPosition为A1/B1时，32字节对齐；TPosition为B2时，512字节对齐。支持的数据类型为：int8_t。
srcLocal	输入	源操作数，类型为GlobalTensor。数据类型需要与dstLocal保持一致。

支持的型号

Atlas推理系列产品AI Core

注意事项

操作数地址偏移对齐要求请参见通用约束。

返回值

无

调用示例

该调用示例支持的运行平台为Atlas推理系列产品AI Core。

#include "kernel_operator.h"

/**
 * Example kernel demonstrating LoadDataUnzip.
 *
 * The data flow is: global memory --LoadDataUnzip--> B1 queue buffer --DataCopy-->
 * VECOUT queue buffer --DataCopy--> global memory.
 */
class KernelLoadUnzip {
public:
    __aicore__ inline KernelLoadUnzip() {}

    /**
     * Binds the global-memory buffers and reserves the queue buffers.
     *
     * @param weGm     Global-memory address used as the source of LoadDataUnzip.
     * @param indexGm  Global-memory address passed to LoadUnzipIndex.
     * @param dstGm    Global-memory address that receives the final result.
     */
    __aicore__ inline void Init(__gm__ int8_t *weGm, __gm__ int8_t *indexGm, __gm__ int8_t *dstGm)
    {
        // The parameters already have type `__gm__ int8_t *`, so no cast is required.
        weGlobal.SetGlobalBuffer(weGm);
        indexGlobal.SetGlobalBuffer(indexGm);
        dstGlobal.SetGlobalBuffer(dstGm);

        // One buffer of dstLen int8_t elements per queue.
        pipe.InitBuffer(inQueueB1, 1, dstLen * sizeof(int8_t));
        pipe.InitBuffer(outQueueUB, 1, dstLen * sizeof(int8_t));
    }

    /** Runs the three pipeline stages once, in order. */
    __aicore__ inline void Process()
    {
        CopyIn();
        CopyToUB();
        CopyOut();
    }

private:
    /** Stage 1: decompress from global memory into a B1 tensor and enqueue it. */
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<int8_t> weightB1 = inQueueB1.AllocTensor<int8_t>();
        // LoadUnzipIndex is issued with the index buffer before LoadDataUnzip;
        // presumably the decompression depends on it - confirm against the LoadUnzipIndex docs.
        AscendC::LoadUnzipIndex(indexGlobal, numOfIndexTabEntry);
        AscendC::LoadDataUnzip(weightB1, weGlobal);
        inQueueB1.EnQue(weightB1);
    }

    /** Stage 2: copy dstLen elements from the B1 tensor into a VECOUT tensor. */
    __aicore__ inline void CopyToUB()
    {
        AscendC::LocalTensor<int8_t> weightB1 = inQueueB1.DeQue<int8_t>();
        AscendC::LocalTensor<int8_t> featureMapUB = outQueueUB.AllocTensor<int8_t>();
        AscendC::DataCopy(featureMapUB, weightB1, dstLen);
        outQueueUB.EnQue(featureMapUB);
        inQueueB1.FreeTensor(weightB1);
    }

    /** Stage 3: copy dstLen elements from the VECOUT tensor to global memory. */
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<int8_t> featureMapUB = outQueueUB.DeQue<int8_t>();
        // Explicit MTE1 -> MTE3 event: set and immediately waited on before the copy out.
        // NOTE(review): presumably this orders the preceding B1 -> VECOUT copy ahead of the
        // copy to global memory, which the VECOUT queue alone may not guarantee - confirm.
        event_t eventIdMTE1ToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE1_MTE3));
        AscendC::SetFlag<AscendC::HardEvent::MTE1_MTE3>(eventIdMTE1ToMTE3);
        AscendC::WaitFlag<AscendC::HardEvent::MTE1_MTE3>(eventIdMTE1ToMTE3);
        AscendC::DataCopy(dstGlobal, featureMapUB, dstLen);
        outQueueUB.FreeTensor(featureMapUB);
    }

    // Not referenced by the kernel; presumably the compressed source length - TODO confirm.
    static constexpr uint32_t srcLen = 896;
    // Element count of the queue buffers and of both DataCopy calls.
    static constexpr uint32_t dstLen = 1024;
    // Entry count handed to LoadUnzipIndex.
    static constexpr uint32_t numOfIndexTabEntry = 1;

    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::QuePosition::B1, 1> inQueueB1;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueUB;
    AscendC::GlobalTensor<int8_t> weGlobal;
    AscendC::GlobalTensor<int8_t> dstGlobal;
    AscendC::GlobalTensor<int8_t> indexGlobal;
};
/**
 * Kernel entry point: builds a KernelLoadUnzip, binds it to the three global-memory
 * buffers and runs it once.
 *
 * @param weightGm  Forwarded to KernelLoadUnzip::Init as the LoadDataUnzip source.
 * @param indexGm   Forwarded to KernelLoadUnzip::Init as the index buffer.
 * @param dstGm     Forwarded to KernelLoadUnzip::Init as the output buffer.
 */
extern "C" __global__ __aicore__ void cube_load_unzip_simple_kernel(
    __gm__ int8_t *weightGm, __gm__ int8_t *indexGm, __gm__ int8_t *dstGm)
{
    KernelLoadUnzip unzipKernel;
    unzipKernel.Init(weightGm, indexGm, dstGm);
    unzipKernel.Process();
}

父主题： 矩阵计算