The manual migration sample in this document is based on the ImageNet training script from the official PyTorch examples, using PyTorch 1.8.1 as an example.
Code location: the import statements at the top of main.py. Add the torch_npu import after the existing torch import:

import torch
import torch_npu
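As a quick sanity check after this change (a hypothetical snippet, not part of main.py), the NPU counterparts of the familiar torch.cuda queries confirm the device is visible:

import torch
import torch_npu

# These mirror torch.cuda.is_available() / torch.cuda.device_count().
print(torch_npu.npu.is_available())   # True when an Ascend NPU is usable
print(torch_npu.npu.device_count())   # number of NPUs on this node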
Code location: the main() function in main.py.
Original code:
if torch.cuda.is_available():
    ngpus_per_node = torch.cuda.device_count()
else:
    ngpus_per_node = 1
Modified code:

if torch_npu.npu.is_available():
    ngpus_per_node = torch_npu.npu.device_count()
else:
    ngpus_per_node = 1
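For context, ngpus_per_node then drives the multiprocessing launch a few lines further down in the upstream main(); that launch code needs no cuda-to-npu change itself. A sketch assuming the upstream names (main_worker, args.multiprocessing_distributed, args.world_size):

import torch.multiprocessing as mp

if args.multiprocessing_distributed:
    # One worker process per device, so scale world_size accordingly.
    args.world_size = ngpus_per_node * args.world_size
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
else:
    main_worker(args.gpu, ngpus_per_node, args)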
Code location: the main_worker() function in main.py.
Original code:

if not torch.cuda.is_available() and not torch.backends.mps.is_available():
    print('using CPU, this will be slow')
elif args.distributed:
    # For multiprocessing distributed, DistributedDataParallel constructor
    # should always set the single device scope, otherwise,
    # DistributedDataParallel will use all available devices.
    if torch.cuda.is_available():
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs of the current node.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
elif args.gpu is not None and torch.cuda.is_available():
    torch.cuda.set_device(args.gpu)
    model = model.cuda(args.gpu)
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
#    model = model.to(device)
else:
    # DataParallel will divide and allocate batch_size to all available GPUs
    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

if torch.cuda.is_available():
    if args.gpu:
        device = torch.device('cuda:{}'.format(args.gpu))
    else:
        device = torch.device("cuda")
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
else:
    device = torch.device("cpu")
# define loss function (criterion), optimizer, and learning rate scheduler
criterion = nn.CrossEntropyLoss().to(device)
Modified code:

if not torch_npu.npu.is_available() and not torch.backends.mps.is_available():
    print('using CPU, this will be slow')
elif args.distributed:
    # For multiprocessing distributed, DistributedDataParallel constructor
    # should always set the single device scope, otherwise,
    # DistributedDataParallel will use all available devices.
    if torch_npu.npu.is_available():
        if args.gpu is not None:
            torch_npu.npu.set_device(args.gpu)
            model.npu(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs of the current node.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.npu()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
elif args.gpu is not None and torch_npu.npu.is_available():
    torch_npu.npu.set_device(args.gpu)
    model = model.npu(args.gpu)
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
#    model = model.to(device)
else:
    # DataParallel will divide and allocate batch_size to all available GPUs
    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.npu()
    else:
        model = torch.nn.DataParallel(model).npu()

if torch_npu.npu.is_available():
    if args.gpu:
        device = torch.device('npu:{}'.format(args.gpu))
    else:
        device = torch.device("npu")
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
else:
    device = torch.device("cpu")
# define loss function (criterion), optimizer, and learning rate scheduler
criterion = nn.CrossEntropyLoss().to(device)
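A related change that this excerpt does not show: DistributedDataParallel on Ascend initializes the process group with the hccl collective backend instead of nccl. A minimal, hypothetical sketch (the init_method and the rank/world_size values are illustrative; the real script derives them from args):

import torch.distributed as dist

# 'hccl' is the Ascend collective-communication backend (the counterpart of nccl).
dist.init_process_group(backend='hccl', init_method='env://',
                        world_size=args.world_size, rank=args.rank)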
Code location: the resume-from-checkpoint logic in the main_worker() function in main.py.
Original code:
if args.gpu is None:
    checkpoint = torch.load(args.resume)
elif torch.cuda.is_available():
    # Map model to be loaded to specified single gpu.
    loc = 'cuda:{}'.format(args.gpu)
    checkpoint = torch.load(args.resume, map_location=loc)
Modified code:
if args.gpu is None:
    checkpoint = torch.load(args.resume)
elif torch_npu.npu.is_available():
    # Map model to be loaded to specified single gpu.
    loc = 'npu:{}'.format(args.gpu)
    checkpoint = torch.load(args.resume, map_location=loc)
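map_location is what remaps the cuda storage tags inside a GPU-saved checkpoint onto the NPU at load time. A small hedged example (the file name is hypothetical; both forms are standard torch.load behavior):

# Remap every stored tensor onto NPU device 0:
checkpoint = torch.load('checkpoint.pth.tar', map_location='npu:0')
# Or remap only tensors that were saved on a specific source device:
checkpoint = torch.load('checkpoint.pth.tar', map_location={'cuda:0': 'npu:0'})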
Code location: the run_validate() function inside validate() in main.py.
In the original code, the data batches are loaded onto and computed on the GPU. Original code:
if args.gpu is not None and torch.cuda.is_available():
    images = images.cuda(args.gpu, non_blocking=True)
#if torch.backends.mps.is_available():
#    images = images.to('mps')
#    target = target.to('mps')
if torch.cuda.is_available():
    target = target.cuda(args.gpu, non_blocking=True)
Modified code:

if args.gpu is not None and torch_npu.npu.is_available():
    images = images.npu(args.gpu, non_blocking=True)
#if torch.backends.mps.is_available():
#    images = images.to('mps')
#    target = target.to('mps')
if torch_npu.npu.is_available():
    target = target.npu(args.gpu, non_blocking=True)
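non_blocking=True only overlaps the host-to-device copy with computation when the source batch lives in pinned host memory, which the upstream script arranges with pin_memory=True on the DataLoader. A hedged sketch (val_dataset and args are assumed to exist, as in main.py):

val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=args.batch_size, shuffle=False,
    num_workers=args.workers, pin_memory=True)  # pinned memory enables async copies

for images, target in val_loader:
    images = images.npu(args.gpu, non_blocking=True)
    target = target.npu(args.gpu, non_blocking=True)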
Code location: the all_reduce() function in class AverageMeter(object).
Original code:
def all_reduce(self):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    ……
Modified code:
def all_reduce(self):
    if torch_npu.npu.is_available():
        device = torch.device("npu")
    ……
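For reference, the elided remainder of this method in the upstream example sums the meter's running sum and count across all ranks and recomputes the average. A hedged sketch of the complete NPU-side method (assumed from the upstream ImageNet example, not shown in this excerpt; dist is torch.distributed):

import torch
import torch.distributed as dist
import torch_npu

class AverageMeter(object):
    # val/avg/sum/count bookkeeping omitted; as in the upstream example.
    def all_reduce(self):
        if torch_npu.npu.is_available():
            device = torch.device("npu")
        else:
            device = torch.device("cpu")
        total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device)
        # Sum the local statistics across all workers, then recompute the average.
        dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
        self.sum, self.count = total.tolist()
        self.avg = self.sum / self.count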