CheckLocalMemoryIA

功能说明

监视设定范围内的UB读写行为：如果监视到设定范围内的读写行为，则会出现EXCEPTION报错；如果未监视到设定范围内的读写行为，则不会报错。

函数原型

__aicore__ inline void CheckLocalMemoryIA(const CheckLocalMemoryIAParam& checkParams);

参数说明

表1 参数说明
参数名称	输入/输出	含义
checkParams	输入	UB读写监视的配置参数，类型为CheckLocalMemoryIAParam，结构体具体定义为： struct CheckLocalMemoryIAParam { uint8_t enableBit = 0; uint32_t startAddr = 0; uint32_t endAddr = 0; bool isScalarRead = false; bool isScalarWrite = false; bool isVectorRead = false; bool isVectorWrite = false; bool isMteRead = false; bool isMteWrite = false; bool isEnable = false; }; 参数说明请参考表2。

表1 参数说明

参数名称

输入/输出

含义

checkParams

输入

UB读写监视的配置参数，类型为CheckLocalMemoryIAParam，结构体具体定义为：

// Configuration passed to CheckLocalMemoryIA: selects one exception register
// and describes the address range and access kinds it should watch.
// Addresses are expressed in 32-byte units and the watched range is
// (startAddr, endAddr] -- startAddr itself is excluded, endAddr is included.
struct CheckLocalMemoryIAParam {
    // Exception register to configure; valid values are 0..3.
    uint8_t enableBit = 0;
    // Start of the checked range in 32-byte units, valid range 0..65535
    // (e.g. LocalTensor.GetPhyAddr() / 32).
    uint32_t startAddr = 0;
    // End of the checked range in 32-byte units, valid range 0..65535.
    uint32_t endAddr = 0;
    // Check scalar read accesses when true.
    bool isScalarRead = false;
    // Check scalar write accesses when true.
    bool isScalarWrite = false;
    // Check vector read accesses when true.
    bool isVectorRead = false;
    // Check vector write accesses when true.
    bool isVectorWrite = false;
    // Check MTE read accesses when true.
    bool isMteRead = false;
    // Check MTE write accesses when true.
    bool isMteWrite = false;
    // Enables the exception register selected by enableBit when true;
    // calling the interface again with false resets it.
    bool isEnable = false;
};

参数说明请参考表2。

表2 CheckLocalMemoryIAParam结构体内参数说明
参数名称	含义
enableBit	配置的异常寄存器，取值范围：enableBit∈[0,3]，默认为0。 0：异常寄存器0。 1：异常寄存器1。 2：异常寄存器2。 3：异常寄存器3。
startAddr	Check的起始地址，32B对齐，取值范围：startAddr∈[0, 65535]，默认值为0。比如，可通过LocalTensor.GetPhyAddr()/32来获取startAddr。
endAddr	Check的结束地址，32B对齐，取值范围：endAddr∈[0, 65535]，默认值为0。
isScalarRead	Check标量读访问。 false：不开启，默认为false。 true：开启。
isScalarWrite	Check标量写访问。 false：不开启，默认为false。 true：开启。
isVectorRead	Check矢量读访问。 false：不开启，默认为false。 true：开启。
isVectorWrite	Check矢量写访问。 false：不开启，默认为false。 true：开启。
isMteRead	Check Mte读访问。 false：不开启，默认为false。 true：开启。
isMteWrite	Check Mte写访问。 false：不开启，默认为false。 true：开启。
isEnable	是否使能enableBit参数配置的异常寄存器。 false：不使能，默认为false。 true：使能。

约束说明

startAddr/endAddr的单位是32B，监控范围不包含startAddr，包含endAddr，即(startAddr, endAddr]。
每次调用完该接口需要进行复位（配置isEnable为false进行复位）。
操作数地址偏移对齐要求请参见通用约束。

支持的型号

Atlas A2训练系列产品/Atlas 800I A2推理产品

Atlas推理系列产品AI Core

调用示例

该示例check矢量写访问是否在设定的(startAddr, endAddr]范围内。当前示例check到矢量写在设定的范围内，结果会报错（ACL_ERROR_RT_VECTOR_CORE_EXCEPTION）。

#include "kernel_operator.h"

namespace AscendC {
class KernelAdd {
public:
    __aicore__ inline KernelAdd() {}
    __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm)
    {
        src0Global.SetGlobalBuffer((__gm__ half*)src0Gm);
        src1Global.SetGlobalBuffer((__gm__ half*)src1Gm);
        dstGlobal.SetGlobalBuffer((__gm__ half*)dstGm);
        pipe.InitBuffer(inQueueSrc0, 1, 512 * sizeof(half));
        pipe.InitBuffer(inQueueSrc1, 1, 512 * sizeof(half));
        pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(half));
    }
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    __aicore__ inline void CopyIn()
    {
        LocalTensor<half> src0Local = inQueueSrc0.AllocTensor<half>();
        LocalTensor<half> src1Local = inQueueSrc1.AllocTensor<half>();
        DataCopy(src0Local, src0Global, 512);
        DataCopy(src1Local, src1Global, 512);
        inQueueSrc0.EnQue(src0Local);
        inQueueSrc1.EnQue(src1Local);
    }
    __aicore__ inline void Compute()
    {
        LocalTensor<half> src0Local = inQueueSrc0.DeQue<half>();
        LocalTensor<half> src1Local = inQueueSrc1.DeQue<half>();
        LocalTensor<half> dstLocal = outQueueDst.AllocTensor<half>();
        CheckLocalMemoryIA({ 0, (uint32_t)(dstLocal.GetPhyAddr() / 32),
            (uint32_t)((dstLocal.GetPhyAddr() + 512 * sizeof(half)) / 32), false, false, false, true, false, false,
            true });
        Add(dstLocal, src0Local, src1Local, 512);

        outQueueDst.EnQue<half>(dstLocal);
        inQueueSrc0.FreeTensor(src0Local);
        inQueueSrc1.FreeTensor(src1Local);
    }
    __aicore__ inline void CopyOut()
    {
        LocalTensor<half> dstLocal = outQueueDst.DeQue<half>();
        DataCopy(dstGlobal, dstLocal, 512);
        outQueueDst.FreeTensor(dstLocal);
    }

private:
    TPipe pipe;
    TQue<QuePosition::VECIN, 1> inQueueSrc0, inQueueSrc1;
    TQue<QuePosition::VECOUT, 1> outQueueDst;
    GlobalTensor<half> src0Global, src1Global, dstGlobal;
};
} // namespace AscendC

// Kernel entry point: builds a KernelAdd over the three global-memory buffers
// and runs it once.
extern "C" __global__ __aicore__ void add_simple_kernel(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm,
    __gm__ uint8_t* dstGm)
{
    AscendC::KernelAdd addKernel;
    addKernel.Init(src0Gm, src1Gm, dstGm);
    addKernel.Process();
}

父主题： 矩阵计算