下载
中文
注册

进阶样例

本节设计了一个张量加法的Sample,通过综合应用文档介绍的TIK优化机制,增强用户对TIK编程与优化的理解。

为保证张量加法Sample的结构清晰性和易读性,本章采用类的形式对Sample进行组织和实现,类的定义如下所示。

class Vadd():
    # 接收数据,并完成相关初始化计算
    def __init__(self, input_x, input_y, output_z, kernel_name="vadd_sample"):

    # 完成算子计算与编译
    def vadd_compute(self):

    # 定义每个AI Core上的运算
    def vadd_compute_each_core(self, move_offset, move_num):

    # 定义AI Core上的分片计算
    def vadd_compute_each_loop(self, move_offset, move_num):

# 用于功能和性能测试
def vadd_sample(input_x, input_y, output_z, kernel_name):

完整Sample如下所示。

import math
from functools import reduce as functools_reduce
import numpy as np
from tbe import tik
import tbe.common.platform as tbe_platform
from tbe.common.utils import para_check

def get_bit_len(dtype):
    """Return the bit width encoded in a dtype string, e.g. "float16" -> 16.

    Skips the leading alphabetic prefix and parses the trailing digits.
    Raises ValueError if the string carries no digits.
    """
    prefix_len = 0
    while prefix_len < len(dtype) and not dtype[prefix_len].isdigit():
        prefix_len += 1
    return int(dtype[prefix_len:])

class Vadd():
    """Element-wise tensor addition (output_z = input_x + input_y) in TIK.

    The work is split evenly over all AI Cores; each core streams its
    slice from Global Memory through the Unified Buffer in chunks and
    adds the chunks with vector instructions.
    """

    def __init__(self, input_x, input_y, output_z, kernel_name="vadd_sample"):
        """Record tensor descriptions and precompute tiling parameters.

        input_x / input_y / output_z: dicts carrying "shape" and "dtype".
        kernel_name: name for the generated kernel.
        """
        self.shape_x = input_x.get("shape")
        self.dtype_x = input_x.get("dtype")
        self.shape_y = input_y.get("shape")
        self.dtype_y = input_y.get("dtype")
        self.shape_z = output_z.get("shape")
        self.dtype_z = output_z.get("dtype")
        self.kernel_name = kernel_name
        # Create the TIK container with the debug facility enabled.
        self.tik_instance = tik.Tik(disable_debug=False)

        # Set according to the actual Ascend AI processor model in use.
        soc_version="xxx"
        tbe_platform.set_current_compile_soc_info(soc_version,core_type="AiCore")

        # Number of available AI Cores.
        self.aicore_num = tbe_platform.get_soc_spec("CORE_NUM")
        # Reads/writes on the Unified Buffer must be 32-byte aligned; this
        # constant drives tensor tiling and data-move instruction parameters.
        block_byte_size = 32

        # Unified Buffer capacity in bytes.
        ub_size_bytes = tbe_platform.get_soc_spec("UB_SIZE")

        # Elements of this dtype that fit in one 32-byte block.
        dtype_bytes_size = get_bit_len(self.dtype_x) // 8
        self.data_each_block = block_byte_size // dtype_bytes_size

        # Unified Buffer elements reserved for each of the two inputs (the
        # result overwrites input_x's buffer, so their addresses overlap),
        # rounded down to a 32-byte-aligned element count.
        self.ub_tensor_size = (
            ub_size_bytes // dtype_bytes_size // 2 // self.data_each_block *
            self.data_each_block)

        # Total number of input elements.
        self.input_num = functools_reduce(lambda x, y: x * y, self.shape_x)

        # Elements handled by each AI Core; only the even-split case is
        # considered here, and each per-core slice is assumed to stay
        # 32-byte aligned.
        self.data_num_each_core = self.input_num // self.aicore_num

        # One vector-instruction repeat covers at most 8 blocks; this is
        # the maximum mask value for vec_add below.
        self.vector_mask_max = 8 * self.data_each_block

        self.input_x_gm = self.tik_instance.Tensor(
            self.dtype_x, self.shape_x, name="input_x_gm", scope=tik.scope_gm)
        self.input_y_gm = self.tik_instance.Tensor(
            self.dtype_y, self.shape_y, name="input_y_gm", scope=tik.scope_gm)
        self.output_z_gm = self.tik_instance.Tensor(
            self.dtype_z, self.shape_z, name="output_z_gm", scope=tik.scope_gm)

    def vadd_compute(self):
        """Emit the multi-core computation and build the kernel.

        Returns the TIK container after BuildCCE so callers can run or
        debug the compiled kernel.
        """
        with self.tik_instance.for_range(
                0, self.aicore_num, block_num=self.aicore_num) as index:
            # Unified Buffer tensors that hold each input chunk.
            self.input_x_ub = self.tik_instance.Tensor(
                self.dtype_x, (self.ub_tensor_size,),
                name="input_x_ub",
                scope=tik.scope_ubuf)
            self.input_y_ub = self.tik_instance.Tensor(
                self.dtype_y, (self.ub_tensor_size,),
                name="input_y_ub",
                scope=tik.scope_ubuf)

            # Offset of this core's slice in Global Memory: the number of
            # elements owned by the preceding cores.
            move_offset = index * self.data_num_each_core

            # Each AI Core computes its own data slice.
            self.vadd_compute_each_core(move_offset, self.data_num_each_core)

        self.tik_instance.BuildCCE(
            kernel_name=self.kernel_name,
            inputs=[self.input_x_gm, self.input_y_gm],
            outputs=[self.output_z_gm])

        return self.tik_instance

    def vadd_compute_each_core(self, move_offset, move_num):
        """Process one core's slice in Unified-Buffer-sized chunks.

        move_offset: element offset of the slice in Global Memory.
        move_num: number of elements in the slice.
        """
        # Full chunks that fit entirely in the Unified Buffer allocation.
        loop_time = move_num // self.ub_tensor_size
        move_offset_init = move_offset
        if loop_time > 0:
            with self.tik_instance.for_range(0, loop_time) as loop_index:
                move_offset += loop_index * self.ub_tensor_size
                self.vadd_compute_each_loop(move_offset, self.ub_tensor_size)
            move_offset = move_offset_init + loop_time * self.ub_tensor_size

        # Tail that does not fill a whole chunk.
        last_num = move_num % self.ub_tensor_size
        if last_num > 0:
            self.vadd_compute_each_loop(move_offset, last_num)

    def vadd_compute_each_loop(self, move_offset, move_num):
        """Add one chunk: GM -> UB, vector add in place, UB -> GM.

        move_offset: element offset of the chunk in Global Memory.
        move_num: number of elements in the chunk (fits in the UB tensors).
        """
        # Burst length, in 32-byte blocks, for the data-move instructions.
        burst_len = math.ceil(move_num / self.data_each_block)

        self.tik_instance.data_move(self.input_x_ub,
                                    self.input_x_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
        self.tik_instance.data_move(self.input_y_ub,
                                    self.input_y_gm[move_offset], 0, 1,
                                    burst_len, 0, 0)
        # vec_add executes at most 255 repeats per call; first handle
        # whole groups of 255 full-mask repeats.
        vadd_loop = move_num // (self.vector_mask_max * 255)
        add_offset = 0
        if vadd_loop > 0:
            with self.tik_instance.for_range(0, vadd_loop) as add_index:
                add_offset = add_index * self.vector_mask_max * 255
                self.tik_instance.vec_add(self.vector_mask_max,
                                       self.input_x_ub[add_offset],
                                       self.input_x_ub[add_offset],
                                       self.input_y_ub[add_offset], 
                                       255, 8, 8, 8)
            add_offset = vadd_loop * self.vector_mask_max * 255
        # Remaining full-mask repeats (fewer than 255).
        repeat_time = (
            move_num % (self.vector_mask_max * 255) // self.vector_mask_max)
        if repeat_time > 0:
            self.tik_instance.vec_add(self.vector_mask_max,
                                   self.input_x_ub[add_offset],
                                   self.input_x_ub[add_offset],
                                   self.input_y_ub[add_offset], 
                                   repeat_time, 8, 8, 8)
            add_offset += repeat_time * self.vector_mask_max
        # Tail of fewer than one full mask of elements.
        last_num = move_num % self.vector_mask_max
        if last_num > 0:
            self.tik_instance.vec_add(last_num, 
                                   self.input_x_ub[add_offset],
                                   self.input_x_ub[add_offset],
                                   self.input_y_ub[add_offset], 
                                   1, 8, 8, 8)

        # The sum was accumulated in place into input_x_ub; copy it to GM.
        self.tik_instance.data_move(self.output_z_gm[move_offset],
                                    self.input_x_ub, 0, 1, burst_len, 0, 0)


@para_check.check_input_type(dict, dict, dict, str)
def vadd_sample(input_x, input_y, output_z, kernel_name):
    """Build and compile the element-wise addition kernel.

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    input_y : dict
        shape and dtype of input
    output_z : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "vadd_sample"

    Returns
    -------
    the built TIK container, ready to run or debug
    """
    return Vadd(input_x, input_y, output_z, kernel_name).vadd_compute()


if __name__ == "__main__":
    # Build and compile the kernel for a (32, 16384) float16 addition.
    tik_instance = vadd_sample({"shape":(32, 16384), "dtype":"float16"},
                               {"shape":(32, 16384), "dtype":"float16"},
                               {"shape":(32, 16384), "dtype":"float16"},
                               "vadd_32_16384_float16")

    # Random host-side inputs fed to the TIK functional debugger; keys
    # must match the GM tensor names declared in Vadd.__init__.
    input_x = np.random.uniform(-10, 5, (32, 16384)).astype(np.float16)
    input_y = np.random.uniform(-10, 5, (32, 16384)).astype(np.float16)
    feed_dict = {
        'input_x_gm': input_x,
        'input_y_gm': input_y
    }
    # Run the kernel under the debugger and print the computed output.
    result = tik_instance.tikdb.start_debug(feed_dict, interactive=True)
    print(result)