本节为用户提供了将单卡训练脚本修改为多卡训练脚本的核心步骤指导,以自动混合精度(AMP)章节中的DDP场景(one NPU per process)的代码为样例进行介绍。
# DistributedDataParallel (DDP) setup: one NPU per process, HCCL backend.
# Local rank of this process on the node, injected by the launcher (e.g. torchrun).
local_rank = int(os.environ["LOCAL_RANK"])
device = torch.device('npu', local_rank)
# Bind this process to its own NPU *before* initializing the process group,
# so collective-communication state is not allocated on device 0 by every rank.
torch_npu.npu.set_device(device)
# HCCL is the Ascend NPU collective backend; master address/port and world size
# are taken from the launcher-provided environment variables.
torch.distributed.init_process_group(backend="hccl", rank=local_rank)
# Partition the dataset so each process draws a distinct shard every epoch.
train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
# NOTE(review): `model` is assumed to already reside on `device`
# (i.e. model.to(device) ran earlier) — confirm against the full AMP example.
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
# Pass the sampler instead of shuffle=True: DistributedSampler owns shuffling,
# and the two options are mutually exclusive in DataLoader.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)