Once the PyTorch adapter plugin development is complete, the Ascend C custom operator can be invoked from the PyTorch framework. The following uses a custom Add operator as an example to walk through operator registration and operator adaptation under PyTorch 2.1.0.
"ENABLE_BINARY_PACKAGE": { "type": "BOOL", "value": "True" },
After deploying the operator package, add the custom operator's op_api library to the dynamic library search path:

export LD_LIBRARY_PATH=$ASCEND_OPP_PATH/vendors/customize/op_api/lib/:$LD_LIBRARY_PATH
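As an optional sanity check (assuming the default vendor name customize), you can confirm that the library was deployed to the path added above:

ls $ASCEND_OPP_PATH/vendors/customize/op_api/lib/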
Next, clone the torch_npu source at the branch matching PyTorch 2.1.0 and enter the op-plugin directory:

git clone https://gitee.com/ascend/pytorch.git -b v6.0.rc2.alpha002-pytorch2.1.0 --recursive
cd pytorch/third_party/op-plugin
Register the operator by editing the operator configuration file for PyTorch 2.1 (the v2r1 directory):

vi op_plugin/config/v2r1/op_plugin_functions.yaml
Add the operator's function prototype under the custom field:

- func: npu_add_custom(Tensor x, Tensor y) -> Tensor
  impl_ns: op_api
After copying, the section should look like this:
custom:
  - func: npu_add_custom(Tensor x, Tensor y) -> Tensor
    impl_ns: op_api
#include "op_plugin/OpApiInterface.h" #include "op_plugin/AclOpsInterface.h" #include "op_plugin/utils/op_api_common.h" namespace op_api { using npu_preparation = at_npu::native::OpPreparation; at::Tensor npu_add_custom(const at::Tensor& x, const at::Tensor& y) { // 构造输出tensor at::Tensor result = npu_preparation::apply_tensor_without_format(x); // 计算输出结果 // 调用EXEC_NPU_CMD接口,完成输出结果的计算 // 第一个入参格式为aclnn+Optype,之后的参数分别为输入输出 EXEC_NPU_CMD(aclnnAddCustom, x, y, result); return result; } } // namespace op_api
# Back to the torch_npu root directory
cd ../../
bash ci/build.sh --python=3.8
pip3 install dist/*.whl --force-reinstall
The package is built for a specific Python version; Python 3.8 is used here as an example. For other Python versions, pass --python=3.9 or --python=3.10.
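For example, to build and reinstall a Python 3.9 package:

bash ci/build.sh --python=3.9
pip3 install dist/*.whl --force-reinstall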
Once the steps above are complete, you can run the following script, test_ops_custom.py, to verify the functionality of torch_npu.npu_add_custom():
import torch
import torch_npu
from torch_npu.testing.testcase import TestCase, run_tests

torch.npu.config.allow_internal_format = False


class TestCustomAdd(TestCase):
    def test_add_custom(self):
        length = [8, 2048]
        x = torch.rand(length, device='cpu', dtype=torch.float16)
        y = torch.rand(length, device='cpu', dtype=torch.float16)
        print(x, '\n', y)
        prof_path = "./prof_total"
        with torch.npu.profile(prof_path) as prof:
            torch.npu.synchronize()
            output = torch_npu.npu_add_custom(x.npu(), y.npu()).cpu()
            torch.npu.synchronize()
        print(output)
        self.assertRtolEqual(output, x + y)


if __name__ == "__main__":
    run_tests()
Run it with:
python3 test_ops_custom.py
Output like the following indicates that the test ran correctly:
tensor([[0.1152, 0.9385, 0.7095,  ..., 0.7500, 0.3130, 0.0044],
        [0.2759, 0.1240, 0.3550,  ..., 0.7183, 0.3540, 0.5127],
        [0.6475, 0.8037, 0.6343,  ..., 0.0840, 0.3560, 0.8677],
        ...,
        [0.7900, 0.2070, 0.7319,  ..., 0.2363, 0.2803, 0.2510],
        [0.2993, 0.3140, 0.4355,  ..., 0.8130, 0.3618, 0.5693],
        [0.3540, 0.7471, 0.9448,  ..., 0.8877, 0.8691, 0.0869]],
       dtype=torch.float16)
 tensor([[0.6689, 0.2119, 0.3105,  ..., 0.6313, 0.9546, 0.7935],
        [0.0757, 0.8447, 0.2329,  ..., 0.7256, 0.9160, 0.3975],
        [0.1968, 0.6567, 0.5322,  ..., 0.3071, 0.8501, 0.0947],
        ...,
        [0.6748, 0.4189, 0.7202,  ..., 0.0103, 0.6133, 0.3706],
        [0.1079, 0.3457, 0.7505,  ..., 0.5947, 0.4390, 0.4434],
        [0.4102, 0.1792, 0.9648,  ..., 0.6333, 0.5381, 0.6646]],
       dtype=torch.float16)
tensor([[0.7842, 1.1504, 1.0195,  ..., 1.3809, 1.2676, 0.7979],
        [0.3516, 0.9688, 0.5879,  ..., 1.4434, 1.2695, 0.9102],
        [0.8442, 1.4609, 1.1660,  ..., 0.3911, 1.2061, 0.9624],
        ...,
        [1.4648, 0.6260, 1.4521,  ..., 0.2466, 0.8936, 0.6216],
        [0.4072, 0.6597, 1.1855,  ..., 1.4082, 0.8008, 1.0127],
        [0.7642, 0.9263, 1.9102,  ..., 1.5215, 1.4072, 0.7515]],
       dtype=torch.float16)
.
----------------------------------------------------------------------
Ran 1 test in 0.669s

OK