Code Samples
compare_data.py
from ptdbg_ascend import compare

pkl_path = "/home/npu_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump.pkl"
dump_data_dir = "/home/npu_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump"
dump_path_param = {
    "npu_pkl_path": pkl_path,
    "bench_pkl_path": "/home/bench_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump.pkl",
    "npu_dump_data_dir": dump_data_dir,
    "bench_dump_data_dir": "/home/bench_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump",
    "is_print_compare_log": True
}
compare(dump_path_param, output_path="/home/output", stack_mode=True)
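Before running the comparison it can help to confirm that both dump runs actually produced the expected artifacts. The following is a minimal pre-flight sketch reusing the paths from the sample above; the check itself is not part of ptdbg_ascend.

import os

# Sketch: verify that every dump artifact referenced in dump_path_param exists.
for path in (
    "/home/npu_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump.pkl",
    "/home/npu_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump",
    "/home/bench_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump.pkl",
    "/home/bench_dump_path/ptdbg_dump_v3.2/rank0/api_stack_dump",
):
    if not os.path.exists(path):
        raise FileNotFoundError(f"dump artifact not found: {path}")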
ddp_basic.py
import os
import sys

import torch
import torch_npu
import torch.multiprocessing as mp
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group, barrier
from torch_npu.contrib import transfer_to_npu
from ptdbg_ascend import *

from model import SimpleNet, ResNetOverflow
from utils import MyTrainDataset

# dump
debugger = PrecisionDebugger(dump_path="./dump_path", hook_name="dump", rank=0, step=[0], enable_dataloader=True)
# Dump statistics (pkl) only
#debugger.configure_hook(summary_only=True)
# Dump a specified list of APIs
#debugger.configure_hook(mode="list", scope=["Functional_batch_norm_1_forward", "Functional_conv2d_5_backward", "Tensor___iadd___2_forward"])
# Dump a specified range of APIs
#debugger.configure_hook(mode="range", scope=["Functional_conv2d_5_forward", "Tensor___iadd___2_backward"])
# Dump API-level input/output data for a specified category of APIs
#debugger.configure_hook(mode="api_list", api_list=["relu"])
# Overflow detection
#debugger = PrecisionDebugger(dump_path="./dump_overflow_path", hook_name="overflow_check")


def ddp_setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    init_process_group(backend='hccl', rank=rank, world_size=world_size)


class Trainer:
    def __init__(self, model, train_loader, optimizer, gpu_id, save_every, world_size):
        self.gpu_id = gpu_id
        self.model = model.to(f"npu:{gpu_id}")
        self.train_loader = train_loader
        self.optimizer = optimizer
        self.save_every = save_every
        self.world_size = world_size
        if world_size != -1:
            self.model = DDP(self.model, device_ids=[self.gpu_id])

    def _run_batch(self, source, targets):
        self.optimizer.zero_grad()
        output = self.model(source)
        loss = torch.nn.CrossEntropyLoss()(output, targets)
        loss.backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        b_sz = len(next(iter(self.train_loader))[0])
        print(f"[NPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_loader)}")
        for i, (source, targets) in enumerate(self.train_loader):
            if i == 0:
                print(i)
                # When enable_dataloader is False, start the dump manually here
                #PrecisionDebugger.start()
            source = source.to(self.gpu_id)
            targets = targets.to(self.gpu_id).long()
            self._run_batch(source, targets)
            if i == 0:
                print(i)
                # When enable_dataloader is False, stop the dump manually here
                #PrecisionDebugger.stop()

    def _save_checkpoint(self, epoch):
        if self.world_size != -1 and self.gpu_id == 0:
            ckp = self.model.module.state_dict()
        else:
            ckp = self.model.state_dict()
        torch.save(ckp, "checkpoint.pt")
        print(f"Epoch {epoch} | checkpoint saved")

    def train(self, max_epochs):
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            if self.gpu_id == 0 and epoch % self.save_every == 0:
                self._save_checkpoint(epoch)


def load_train_objs():
    train_set = MyTrainDataset(2048, shape=(3, 64, 64))
    model = ResNetOverflow()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    return train_set, model, optimizer


def prepare_dataloader(dataset: Dataset, batch_size: int, world_size: int):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False if world_size != -1 else True,
        sampler=DistributedSampler(dataset) if world_size != -1 else None
    )


def main(rank, world_size, total_epochs, save_every):
    torch.npu.set_device(f"npu:{rank}")
    if world_size != -1:
        ddp_setup(rank, world_size)
    dataset, model, optimizer = load_train_objs()
    train_data = prepare_dataloader(dataset, batch_size=32, world_size=world_size)
    trainer = Trainer(model, train_data, optimizer, rank, save_every, world_size)
    trainer.train(total_epochs)
    if world_size != -1:
        destroy_process_group()


if __name__ == "__main__":
    total_epochs = 1
    save_every = 5
    n_device = int(sys.argv[1])
    if n_device >= 2:
        world_size = n_device
        mp.spawn(main, args=(world_size, total_epochs, save_every), nprocs=world_size)
    else:
        device = 0
        main(device, -1, total_epochs, save_every)
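How the script is launched (inferred from the __main__ block): the single command-line argument is the device count. A value of 2 or more spawns one process per device via mp.spawn, which automatically prepends the process rank as the first argument of main(); a value of 1 takes the single-device branch with world_size == -1, so DDP and the process group are skipped. Assumed invocation:

python ddp_basic.py 1    # single NPU, no DDP
python ddp_basic.py 8    # 8-process DDP over HCCL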
model.py
import torch
import torch.nn as nn
from torchvision.models import resnet18


class ModelParallelNet(nn.Module):
    def __init__(self):
        super(ModelParallelNet, self).__init__()
        self.linear1 = nn.Linear(20, 10).to("npu:0")
        self.linear2 = nn.Linear(10, 1).to("npu:1")

    def forward(self, x):
        x = self.linear1(x.to("npu:0"))
        x = self.linear2(x.to("npu:1"))
        return x


class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.linear1 = nn.Linear(20, 10)
        self.linear2 = nn.Linear(10, 1)

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        x = x.half()
        x += 65536 * 2  # exceeds the float16 maximum (~65504) to force an overflow
        return x


class ResNetOverflow(nn.Module):
    def __init__(self):
        super().__init__()
        self.resnet = resnet18()
        self.linear = nn.Linear(1000, 100)

    def forward(self, x):
        x = self.resnet(x)
        x = self.linear(x)
        x = x.half()
        x += 65536 * 2  # exceeds the float16 maximum (~65504) to force an overflow
        return x
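Both SimpleNet and ResNetOverflow end their forward pass by casting to float16 and adding 65536 * 2. Since the largest finite float16 value is about 65504, the addition produces inf, which is exactly the kind of anomaly the overflow_check hook is meant to flag. A CPU-only sketch of the same effect:

import torch

x = torch.tensor([1.0]).half()  # float16: largest finite value is ~65504
x += 65536 * 2                  # overflows to inf
print(x)                        # tensor([inf], dtype=torch.float16)
print(torch.isinf(x).any())     # tensor(True)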
utils.py
import torch
import torch_npu
from torch.utils.data import Dataset


class MyTrainDataset(Dataset):
    def __init__(self, size, shape=(20,)):
        super().__init__()
        self.data = torch.randn(size, *shape)
        # Random class indices in [0, 100) so CrossEntropyLoss sees valid
        # targets for ResNetOverflow's 100-way output.
        self.label = torch.randint(0, 100, (size,))
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]
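A quick CPU-only sanity check of the dataset (a sketch; the size and shape mirror the values used in load_train_objs above):

from torch.utils.data import DataLoader
from utils import MyTrainDataset

dataset = MyTrainDataset(2048, shape=(3, 64, 64))
loader = DataLoader(dataset, batch_size=32)
images, labels = next(iter(loader))
print(images.shape)  # torch.Size([32, 3, 64, 64])
print(labels.shape)  # torch.Size([32])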