异步多流推理样例
前提条件
以Resnet50为例进行样例展示。运行该样例需要用户自行安装Torchvision;如未安装,请通过官方渠道获取Torchvision安装包(whl格式),并参照如下方法进行安装:
使用以下命令安装Torchvision。
pip install 软件包名.whl
软件包名.whl表示Torchvision安装包torchvision-{version}-cp310-cp310-{os}_{arch}.whl,请根据实际包名进行替换。

Torchvision版本与Torch版本严格配套使用,推荐使用与Torch2.1.0配套的0.16.0版本。
Python开发环境
Python开发环境样例如下所示:
# NOTE: Torch must be imported before MindIE Torch.
import torch
import mindietorch
import torchvision

# Minimum acceptable cosine similarity between NPU output and CPU reference.
COSINE_THRESHOLD = 0.99


def cosine_similarity(gt_tensor, pred_tensor):
    """Return the cosine similarity between two tensors as a Python float.

    Both tensors are flattened and cast to float32 first. If either tensor
    sums to zero, an allclose comparison decides whether they count as
    identical (similarity 1.0); cosine similarity is undefined for a zero
    vector, so this guards that case.
    """
    flat_gt = gt_tensor.flatten().to(torch.float32)
    flat_pred = pred_tensor.flatten().to(torch.float32)
    if torch.sum(flat_gt) == 0.0 or torch.sum(flat_pred) == 0.0:
        if torch.allclose(flat_gt, flat_pred, atol=1e-4, rtol=1e-4, equal_nan=True):
            return 1.0
    similarity = torch.nn.functional.cosine_similarity(
        flat_gt, flat_pred, dim=0, eps=1e-6)
    return similarity.cpu().detach().item()


if __name__ == "__main__":
    mindietorch.set_device(0)

    # 1. Build a ResNet-50 model and a pinned host input buffer.
    batch = 32
    host_input = torch.randn([batch, 3, 224, 224], pin_memory=True)
    model_torchvision = torchvision.models.resnet50().eval()
    input = [
        mindietorch.Input((batch, 3, 224, 224)),
    ]

    # 2. Compile the model (see mindietorch.compile in the Python API reference).
    # Option A: trace to TorchScript first, then compile.
    model = torch.jit.trace(model_torchvision, host_input)
    compile_model = mindietorch.compile(model, inputs=input)
    # Option B: compile via ExportedProgram — pass the nn.Module directly and
    # let MindIE Torch perform the export.
    compile_model = mindietorch.compile(model_torchvision, inputs=input, ir="dynamo")

    # 3. Inference. First obtain the CPU reference result with plain torch.
    torch_result = model_torchvision(host_input)

    # Create one stream per pipeline stage (H2D copy, forward, D2H copy).
    stream_h2d = mindietorch.npu.Stream()
    stream_forward = mindietorch.npu.Stream()
    stream_d2h = mindietorch.npu.Stream()

    # Warm-up: populate the 3-slot rotating device buffers.
    input_npus = []
    output_npus = []
    output_cpu = torch.randn([batch, 1000], pin_memory=True)
    for _ in range(3):
        device_input = host_input.to("npu:0")
        device_output = compile_model(device_input)[0]
        output_cpu.copy_(device_output)
        input_npus.append(device_input)
        output_npus.append(device_output)

    # Pipelined loop: at step i the forward consumes buffer (i-1)%3, the H2D
    # copy fills buffer i%3, and the D2H copy drains buffer (i-2)%3, so the
    # three stages overlap on their dedicated streams. Two extra iterations
    # drain the pipeline.
    loop_time = 100
    result = True
    for i in range(loop_time + 2):
        # Dispatch stage work onto each stream.
        if 1 <= i < loop_time + 1:
            with mindietorch.npu.stream(stream_forward):
                output_npus[(i - 1) % 3] = compile_model(input_npus[(i - 1) % 3])[0]
        if 0 < i < loop_time:
            with mindietorch.npu.stream(stream_h2d):
                input_npus[i % 3] = host_input.to("npu:0", non_blocking=True)
        if i >= 2:
            with mindietorch.npu.stream(stream_d2h):
                output_cpu.copy_(output_npus[(i - 2) % 3], non_blocking=True)

        # Stream synchronization (see mindietorch.npu.Stream.synchronize()
        # in the Python API class reference).
        if i >= 2:
            stream_d2h.synchronize()
            cos_sim = cosine_similarity(output_cpu, torch_result)
            if cos_sim < COSINE_THRESHOLD:
                result = False
        if 0 < i < loop_time:
            stream_h2d.synchronize()
        if 1 <= i < loop_time + 1:
            stream_forward.synchronize()

    if result:
        print("test success")
    else:
        print("test error")
C++开发环境
C++开发环境样例如下所示:
#include <torch/torch.h>
#include <torch/script.h>
#include "cpp/include/torch_aie.h"

int main()
{
    // 1. Load the ResNet-50 TorchScript module.
    const std::string modelPath = "Resnet50.pth";
    torch_aie::set_device(0);
    torch::jit::script::Module module = torch::jit::load(modelPath);
    module.eval();
    int batch = 64;

    // 2. Compile the model for the NPU.
    std::vector<int64_t> shape = { batch, 3, 224, 224 };
    std::vector<torch_aie::Input> inputs;
    inputs.emplace_back(torch_aie::Input(shape, torch_aie::DataType::FLOAT,
        torch_aie::TensorFormat::NCHW));
    torch_aie::torchscript::CompileSpec compileSpec(inputs);
    auto compiledModule = torch_aie::torchscript::compile(module, compileSpec);

    // 3. Create one stream per pipeline stage (H2D copy, forward, D2H copy).
    c10::Stream streamH2d = c10::Stream(c10::Stream::UNSAFE, at::Device("npu:0"), 0);
    c10::Stream streamForward = c10::Stream(c10::Stream::UNSAFE, at::Device("npu:0"), 1);
    c10::Stream streamD2h = c10::Stream(c10::Stream::UNSAFE, at::Device("npu:0"), 2);
    c10::StreamGuard streamGuard(streamH2d);

    // 4. Prepare pinned host buffers and the rotating device buffers.
    std::vector<at::Tensor> inputNpus;
    std::vector<at::Tensor> outputNpus;
    auto optionCpu = torch::TensorOptions().device(at::Device("cpu")).layout(torch::kStrided)
        .pinned_memory(true);
    auto inputCpu = torch::ones({ batch, 3, 224, 224 }, optionCpu) * 0.5;
    auto outputCpu = at::empty({ batch, 1000 }, optionCpu);

    // 5. Warm-up: populate the 3-slot rotating buffers.
    for (int i = 0; i < 3; i++) {
        auto inputNpu = inputCpu.to("npu:0");
        auto outputNpu = compiledModule({ inputNpu }).toTensor();
        outputCpu.copy_(outputNpu);
        inputNpus.push_back(inputNpu);
        outputNpus.push_back(outputNpu);
    }

    // 6. Pipelined forward loop: at step i the forward consumes buffer
    // (i-1)%3, the H2D copy fills buffer i%3, and the D2H copy drains buffer
    // (i-2)%3, each dispatched on its own stream via the StreamGuard. Two
    // extra iterations drain the pipeline.
    int loop_time = 50;
    for (int i = 0; i < loop_time + 2; i++) {
        // Dispatch stage work onto each stream.
        if (i >= 1 && i < loop_time + 1) {
            streamGuard.reset_stream(streamForward);
            outputNpus[(i - 1) % 3] = compiledModule(
                { inputNpus[(i - 1) % 3] }).toTensor();
        }
        if (i > 0 && i < loop_time) {
            streamGuard.reset_stream(streamH2d);
            inputNpus[i % 3] = inputCpu.to("npu:0", true);
        }
        if (i >= 2) {
            streamGuard.reset_stream(streamD2h);
            outputCpu.copy_(outputNpus[(i - 2) % 3], true);
        }

        // Stream synchronization (see c10::Stream::synchronize()).
        if (i >= 2) {
            streamD2h.synchronize();
            // process data after synchronize
            // ...
        }
        if (i > 0 && i < loop_time) {
            streamH2d.synchronize();
        }
        if (i >= 1 && i < loop_time + 1) {
            streamForward.synchronize();
        }
    }

    // 7. Release resources.
    torch_aie::finalize();
    return 0;
}
父主题: 样例参考