核函数NPU上板精度验证
NPU调测支持一键PIPE_ALL等调试功能,更多相关介绍参见NPU调测功能。
本场景以FlashAttentionScore算子为例,假设输入数据和标杆数据是用户自行提供的bin文件,核函数NPU上板调测过程如下。请根据自身实际情况,按需修改示例代码。
# Example: on-board (NPU) precision verification of the FlashAttentionScore
# kernel with ascendebug. Input data and the golden (benchmark) output are
# user-provided .bin files; adapt paths, shapes and attributes as needed.
import os

import torch
import numpy as np
import ascendebug

# Set the log file and clear its previous contents.
ascendebug.set_log_file('ops_adv.log', clean=True)

# Adjust these paths to match your environment.
CANN_INSTALL_PATH = "/usr/local/Ascend/"
DATA_PATH = '/user_data_path/'
REPO_PATH = "ops_adv_code_path/"

# 1. Load the input/golden data and build the operator description.
#    Optional inputs that are not supplied are declared with shape and data
#    path set to None. 'attention_out' is the only output that is given a
#    golden .bin file here.
debug_op = (
    ascendebug.create_debug_op('FlashAttentionScore', 'MixCore', 'Ascendxxx')
    .custom_input('query', 'float16', [24, 144, 1280], os.path.join(DATA_PATH, 'q.bin'))
    .custom_input('key', 'float16', [24, 144, 1280], os.path.join(DATA_PATH, 'k.bin'))
    .custom_input('value', 'float16', [24, 144, 1280], os.path.join(DATA_PATH, 'v.bin'))
    .custom_input('real_shift', 'float16', None, None, ['optional'])
    .custom_input('drop_mask', 'uint8', [1244160],
                  os.path.join(DATA_PATH, 'drop_mask.bin'), ['optional'])
    .custom_input('padding_mask', 'float16', None, None, ['optional'])
    .custom_input('atten_mask', 'bool', None, None, ['optional'])
    .custom_input('prefix', 'int64', None, None, ['optional'])
    .custom_input('actual_seq_qlen', 'int64', None, None, ['optional'])
    .custom_input('actual_seq_kvlen', 'int64', None, None, ['optional'])
    .custom_input('q_start_idx', 'int64', None, None, ['optional'])
    .custom_input('kv_start_idx', 'int64', None, None, ['optional'])
    .custom_output('softmax_max', 'float32', [24, 20, 144, 8], None)
    .custom_output('softmax_sum', 'float32', [24, 20, 144, 8], None)
    .custom_output('softmax_out', 'float16', [24, 20, 144, 144], None)
    .custom_output('attention_out', 'float16', [24, 20, 144, 64],
                   os.path.join(DATA_PATH, 'attention_out.bin'))
    .attr('scale_value', 'float', 1.0)
    .attr('keep_prob', 'float', 0.8)
    .attr('pre_tockens', 'int', 2147483647)
    .attr('next_tockens', 'int', 2147483647)
    .attr('head_num', 'int', 20)
    .attr('input_layout', 'string', 'BSH')
    .attr('inner_precise', 'int', 0)
)

# 2. Create the debug executor and initialise its workspace.
op_executor = ascendebug.create_op_executor(debug_op=debug_op, install_path=CANN_INSTALL_PATH)

# 3. Run the Tiling debug interface (optional; skip this step if a Tiling
#    bin file already exists).
#    Use method 1 if you need to debug Tiling locally, otherwise use method 2.
# Method 1: build the Tiling .so from the local ops_adv repository, then run
# the Tiling computation with it.
tiling_so = op_executor.compile_ops_adv_tiling(REPO_PATH)
tiling_info = op_executor.run_tiling(tiling_so)
# Method 2: let the tool fetch the Tiling .so from the CANN package and run
# the Tiling computation.
# tiling_info = op_executor.run_ops_adv_tiling()
print(tiling_info.tiling_bin, tiling_info.tiling_workspace,
      tiling_info.block_num, tiling_info.tiling_key)

# 4. Compile the kernel for the NPU.
compile_npu_options = ascendebug.CompileNpuOptions()
name, kernel_file, extern = op_executor.compile_ops_adv_npu(
    REPO_PATH, tiling_info.tiling_key, compile_npu_options)

# 5. Run the kernel on the NPU and perform the on-board precision comparison.
#    The cross-core sync flag and task ratio reported by the compile step are
#    forwarded to the run step through NpuCompileInfo.
run_npu_options = ascendebug.RunNpuOptions()
compile_info = ascendebug.NpuCompileInfo(syncall=extern['cross_core_sync'],
                                         task_ration=extern['task_ration'])
op_executor.run_npu(kernel_file, run_npu_options,
                    npu_compile_info=compile_info, tiling_info=tiling_info)
算子在NPU板端运行调测的精度比对结果示例可以参见“NPU调测功能 > 调测产物”。
父主题: 精度调试