本章节列出的接口均为预留接口，后续有可能变更或废弃，不建议开发者使用，开发者无需关注。

Matmul

__aicore__ inline void SetSubBlockIdx(uint8_t subBlockIdx);
__aicore__ inline const handle SyncGroupJoin(uint32_t groupID)
__aicore__ inline const handle SyncGroupJoin(uint32_t groupID)
__aicore__ inline void SetAntiQuantScalar(const SrcT offsetScalar, const SrcT scaleScalar);
__aicore__ inline void SetAntiQuantVector(const LocalTensor<SrcT> &offsetTensor, const LocalTensor<SrcT> &scaleTensor);
__aicore__ inline void SetTensorAWithCopy(const GlobalTensor<SrcAT>& gm, const LocalTensor<SrcAT> &leftMatrix, bool isTransposeA = false);
__aicore__ inline void SetTensorBWithCopy(const GlobalTensor<SrcBT>& gm, const LocalTensor<SrcBT> &rightMatrix, bool isTransposeB = false);

SoftMax

template <typename T1, typename T2> __aicore__ inline bool AdjustSoftMaxRes(const LocalTensor<T1>& softMaxRes, const LocalTensor<T2>& maxTensor, const uint32_t from, const T1 to, const SoftMaxShapeInfo& softmaxShapeInfo)

LayerNorm

template <typename T, bool isReuseSource = false> __aicore__ inline void LayerNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean, const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamma, const LocalTensor<T>& beta, const T epsilon, LayerNormTiling& tiling)

template <typename T, bool isReuseSource = false> __aicore__ inline void LayerNorm(const LocalTensor<T>& output, const LocalTensor<T>& outputMean, const LocalTensor<T>& outputVariance, const LocalTensor<T>& inputX, const LocalTensor<T>& gamma, const LocalTensor<T>& beta, const LocalTensor<uint8_t>& sharedTmpBuffer, const T epsilon, LayerNormTiling& tiling)
template <typename T, bool isReuseSource = false>
__aicore__ inline void LayerNormGrad(const LocalTensor<T> &outputPdX, const LocalTensor<T> &resForGamma, const LocalTensor<T> &inputDy, const LocalTensor<T> &inputX, const LocalTensor<T> &inputVariance, const LocalTensor<T> &inputMean, const LocalTensor<T> &inputGamma, T epsilon, LayerNormGradTiling &tiling)
template <typename T, bool isReuseSource = false>
__aicore__ inline void LayerNormGrad(const LocalTensor<T> &outputPdX, const LocalTensor<T> &resForGamma, const LocalTensor<T> &inputDy, const LocalTensor<T> &inputX, const LocalTensor<T> &inputVariance, const LocalTensor<T> &inputMean, const LocalTensor<T> &inputGamma, LocalTensor<uint8_t> &sharedTmpBuffer, T epsilon, LayerNormGradTiling &tiling)
template <typename T, bool isReuseSource = false>
__aicore__ inline void LayerNormGradBeta(const LocalTensor<T>& outputPdGamma, const LocalTensor<T>& outputPdBeta, const LocalTensor<T>& resForGamma, const LocalTensor<T>& inputDy, LayerNormGradBetaTiling& tiling)
template <typename T, bool isReuseSource = false>
__aicore__ inline void LayerNormGradBeta(const LocalTensor<T>& outputPdGamma, const LocalTensor<T>& outputPdBeta, const LocalTensor<T>& resForGamma, const LocalTensor<T>& inputDy, const LocalTensor<uint8_t>& sharedTmpBuffer, const LayerNormGradBetaTiling& tiling)
void GetLayerNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource, uint32_t& maxValue, uint32_t& minValue)
void GetLayerNormRedNDTillingInfo(const uint32_t stackBufferSize, const uint32_t typeSize, optiling::LayerNormRedTiling& tilling, optiling::LayerNormRedParams& redParams)
inline void GetLayerNormGradMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, uint32_t &maxValue, uint32_t &minValue)
inline void GetLayerNormGradNDTilingInfo(const ge::Shape srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, const bool isReuseSource, optiling::LayerNormGradTiling &tiling)
inline void GetLayerNormGradReduceTilingInfo(const uint32_t stackBufferSize, const uint32_t typeSize, optiling::LayerNormGradReduceTiling &tiling, optiling::LayerNormGradReduceParams &reduceParams, const bool isReuseSource = false)
void GetLayerNormGradBetaMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource, uint32_t& maxValue, uint32_t& minValue)
void GetLayerNormGradBetaNDTilingInfo(const ge::Shape srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, const bool isReuseSource, optiling::LayerNormGradBetaTiling& tiling)
void GetLayerNormGradBetaReduceTilingInfo(const uint32_t stackBufferSize, const uint32_t typeSize, optiling::LayerNormGradBetaReduceTiling& tiling, optiling::LayerNormGradBetaReduceParams& params)

ConfusionTranspose

void GetConfusionTransposeOnlyTilingInfo(const ge::Shape &srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, optiling::ConfusionTransposeTiling &tiling);

AscendAntiQuant

inline uint32_t GetAscendAntiQuantMaxTmpSize(const ge::Shape &srcShape, const ge::Shape &scaleShape, bool isTranspose, ge::DataType inputDataType, ge::DataType outputDataType)
inline uint32_t GetAscendAntiQuantMinTmpSize(const ge::Shape &srcShape, const ge::Shape &scaleShape, bool isTranspose, ge::DataType inputDataType, ge::DataType outputDataType)

SwiGLU

void GetSwiGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer)

GeGLU

void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)

数学库

void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf)
void GetAsinTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer)
void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetAcosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer)
void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetCoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer)
void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)

void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetPowerTmpBufferFactorSize(const bool baseIsTensor, const bool expIsTensor, const bool typeIsInt, const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuffer)
void GetErfTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer)
void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
inline uint32_t GetSignMaxTmpSize(const ge::Shape srcShape, const uint32_t typeSize, const bool isReuseSource)
inline uint32_t GetSignMinTmpSize(const ge::Shape srcShape, const uint32_t typeSize, const bool isReuseSource)
inline uint32_t GetXorMaxTmpSize(const ge::Shape srcShape, const uint32_t typeSize, const bool isReuseSource)
inline uint32_t GetXorMinTmpSize(const ge::Shape srcShape, const uint32_t typeSize, const bool isReuseSource)
void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf)
void GetExpTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuffer)

数据类型定义

LocalTensor
- SetAddrWithOffset
- Print [CPU ONLY]
- ToFile [CPU ONLY]
- GetBufferHandle
- GetPosition
- GetLength
- SetBufferLen
- SetShapeInfo
- GetShapeInfo
- SetAddr
- GetPhyAddr
- operator()
GetShapeSize
GlobalTensor
- SetAddr
- GetValue
- SetValue
- SetShapeInfo
- GetShapeInfo
- operator()

TensorDesc类
ListTensorDesc类

内存管理和同步控制

TPipe
- Init
- GetAbsAddr
- InitShareBufStart
- InitShareBufEnd
- GetQueueEndAddress
- Reset
- Destroy
- GetBaseAddr
- ReleaseEvent
- IsAivTscm
- GetBaseAddr [CPU ONLY]

TQueBind
TBuf
GetWithOffset

SetTpipeBuf
__aicore__ constexpr Hardware GetPhyType(TPosition pos);
template <typename T, TPosition pos> __aicore__ inline bool PopStackBuffer(LocalTensor<T>& popLocal);
template <TPosition pos> __aicore__ inline bool PopStackBuffer(TBuf<pos>& popBuffer, TBufType& bufStart);

矢量计算

template <typename T, bool isSetMask = true> __aicore__ inline void RepeatReduceSum(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal, const int32_t repeat, const int32_t elemsInOneRepeate, const int32_t dstBlkStride, const int32_t srcBlkStride, const int32_t dstRepStride, const int32_t srcRepStride);
template <typename T, typename U> __aicore__ inline void Select(const LocalTensor<T>& dstLocal, const LocalTensor<U>& selMask, const LocalTensor<T>& src0Local, uint8_t repeatTimes, const BinaryRepeatParams& repeatParams);
template <typename T, SELMODE selMode> __aicore__ inline void Select(const LocalTensor<T>& dstLocal, const LocalTensor<T>& src0Local, const LocalTensor<T>& src1Local, uint8_t repeatTimes, const BinaryRepeatParams& repeatParams);
template <typename T> __aicore__ inline void GatherMask(const LocalTensor<T>& dstLocal, const LocalTensor<T>& src0Local, const LocalTensor<T>& src1Local, const uint8_t patternMode, const GatherMaskParams& gatherMaskParams);
template <typename T> __aicore__ inline void GatherMask(const LocalTensor<T>& dstLocal, const LocalTensor<T>& src0Local, const uint8_t patternMode, const GatherMaskParams& gatherMaskParams);

矩阵计算

template <typename T> __aicore__ inline void InitConstValue(const LocalTensor<T> &dstLocal, const InitConstValueParams<T> &initConstValueParams);
template <typename T> __aicore__ inline void LoadDataWithTranspose(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal, const LoadData2dTransposeParams& loadDataParams);
__aicore__ inline void SetFmatrix(uint16_t l1H, uint16_t l1W, const uint8_t padList[4], const FmatrixMode &fmatrixMode);
__aicore__ inline void SetLoadDataRepeat(const LoadDataRepeatParam& repeatParams);
template <typename T> __aicore__ inline void SetLoadDataPaddingValue(const T padValue);
__aicore__ inline void SetLoadDataBoundary(uint32_t boundaryValue);
template <typename dst_T, typename src_T> __aicore__ inline void Fixpipe(const LocalTensor<dst_T>& dstLocal, const LocalTensor<src_T>& srcLocal, const FixpipeParams<src_T>& intriParams);
template <typename dst_T, typename src_T> __aicore__ inline void Fixpipe(const LocalTensor<dst_T>& dstLocal, const LocalTensor<src_T>& srcLocal, const LocalTensor<uint64_t>& cbufWorkspace, const FixpipeParams<src_T>& intriParams);
template <typename dst_T, typename src_T> __aicore__ inline void Fixpipe(const GlobalTensor<dst_T>& dstGlobal, const LocalTensor<src_T>& srcLocal, const FixpipeParams<src_T>& intriParams);
template <typename dst_T, typename src_T> __aicore__ inline void Fixpipe(const GlobalTensor<dst_T>& dstGlobal, const LocalTensor<src_T>& srcLocal, const LocalTensor<uint64_t>& cbufWorkspace, const FixpipeParams<src_T>& intriParams);
template <typename T> __aicore__ inline void SetFixPipeConfig(const LocalTensor<T> &reluPre, const LocalTensor<T> &quantPre, bool isUnitFlag = false);
template <typename T, bool setRelu = false> __aicore__ inline void SetFixPipeConfig(const LocalTensor<T> &preTensor, bool isUnitFlag = false);
__aicore__ inline void SetFixpipeNz2ndFlag(uint16_t ndNum, uint16_t srcNdStride, uint16_t dstNdStride);
__aicore__ inline void SetFixpipePreQuantFlag(uint64_t config);
template<typename T> __aicore__ inline void SetLeakyReluAlpha(T scaleValue);
template <typename T, typename U> __aicore__ inline __inout_pipe__(V) void BroadCastVecToMM(const LocalTensor<T> &dstLocal, const LocalTensor<U> &srcLocal, const int32_t blockCount, const uint8_t blockLen, const uint8_t srcGap, const uint8_t dstGap);

其他

__aicore__ inline __gm__ uint8_t* __gm__ SetDumpWorkSpacePtr(__gm__ uint8_t* workspace)
__aicore__ inline __gm__ uint8_t* __gm__ GetDumpWorkSpacePtr()
__aicore__ void SetSysWorkSpacePtr(__gm__ uint8_t* workspace)
__aicore__ inline AscendC::RpcCommClient* GetRpcClient()
__aicore__ inline void ResetMask()
__aicore__ inline void SetLoadDataBoundary(uint32_t boundaryValue)
template <MemDsbT arg0> __aicore__ inline void DataSyncBarrier()
template <HardEvent event, MemoryT memT, bool isVirtual> __aicore__ inline void HSetFlag(int32_t eventID)
template <HardEvent event, MemoryT memT, bool isVirtual> __aicore__ inline void HWaitFlag(int32_t eventID)
__aicore__ inline void PreLoad(const int64_t prefetchLen)
template <typename T> __aicore__ inline __inout_pipe__(S) void InitSyncID(GlobalTensor<T> gmWorkspace);
template <typename T> __aicore__ inline __in_pipe__(V) __out_pipe__(MTE3) void InitOutput(GlobalTensor<T> gmWorkspaceAddr, uint32_t size, T value = 0);
__aicore__ inline void InitDetermineComputeWorkspace(GlobalTensor<int32_t> &gmWorkspace, LocalTensor<int32_t> &ubWorkspace);

__aicore__ inline void NotifyNextBlock(GlobalTensor<int32_t> &gmWorkspace, LocalTensor<int32_t> &ubWorkspace);
__aicore__ inline void WaitPreBlock(GlobalTensor<int32_t> &gmWorkspace, LocalTensor<int32_t> &ubWorkspace);
template <typename T> __aicore__ inline void Concat(LocalTensor<T> &concatLocal, const LocalTensor<T> &srcLocal, const LocalTensor<T> &tmpLocal, const int32_t repeatTimes);
template <typename T> __aicore__ inline void Extract(const LocalTensor<T> &dstValueLocal, const LocalTensor<uint32_t> &dstIndexLocal, const LocalTensor<T> &sortedLocal, const int32_t repeatTimes);
template <typename T, bool isFullSort> __aicore__ inline void Sort(const LocalTensor<T> &dstLocal, const LocalTensor<T> &concatLocal, const LocalTensor<uint32_t> &indexLocal, LocalTensor<T> &tmpLocal, const int32_t repeatTimes);

template <typename T> __aicore__ inline void MrgSort(const LocalTensor<T>& dstLocal, const MrgSortSrcList<T>& srcLocal, const MrgSort4Info& params);
template <typename T, bool isExhaustedSuspension = false> __aicore__ inline void MrgSort(const LocalTensor<T> &dstLocal, const MrgSortSrcList<T> &sortList, const uint16_t elementCountList[4], uint32_t sortedNum[4], uint16_t validBit, const int32_t repeatTimes);
template <typename T> __aicore__ inline uint32_t GetSortLen(const uint32_t elemCount);
template <typename T> __aicore__ inline uint32_t GetSortOffset(const uint32_t elemOffset);
inline uint32_t GetSortTmpSize(const platform_ascendc::PlatformAscendC &ascendcPlatform, const uint32_t elemCount, const uint32_t dataTypeSize);
inline uint32_t GetConcatTmpSize(const platform_ascendc::PlatformAscendC &ascendcPlatform, const uint32_t elemCount, const uint32_t dataTypeSize);
__aicore__ inline void WaitEvent(uint16_t flagId);
template<pipe_t pipe> __aicore__ inline void NotifyEvent(uint16_t flagId);
__aicore__ inline int64_t GetStoreAtomicConfig();
template <AtomicDtype type, AtomicOp op> __aicore__ inline void SetStoreAtomicConfig();
__aicore__ inline int64_t GetAccVal();
__aicore__ inline int64_t GetReduceMaxMinCount();
__aicore__ inline void InitDump(uint32_t gmLen);
__aicore__ inline void InitDump(GM_ADDR dumpStartAddr, uint32_t gmLen);
template <typename T> __aicore__ inline void DumpAccChkPoint(LocalTensor<T> &tensor, uint32_t index, uint32_t countOff, uint32_t dumpSize);
template <typename T> __aicore__ inline void DumpAccChkPoint(GlobalTensor<T> &tensor, uint32_t index, uint32_t countOff, uint32_t dumpSize);
#define GET_TILING_DATA_WITH_STRUCT(tiling_struct, tiling_data, tiling_arg) tiling_struct tiling_data
#define GET_TILING_DATA_MEMBER(tiling_type, member, var, tiling) tiling_type point##var
#define KERNEL_TASK_TYPE(key, value) ENABLE_FEATURE_FOR_COMPILE(key, value)
template<typename T> void TransDataTo5HD(uint64_t dstLocalList[16], uint64_t srcLocalList[16], const TransDataTo5HDParams& transDataParams);
template <typename T> __aicore__ inline void SetCmpMask(const LocalTensor<T>& src);