def fuse_add_softmax_dropout(training, dropout, attn_mask, attn_scores, attn_head_size, p=0.5, dim=-1):
使用NPU自定义算子替换原生写法,以提高性能。
from torch_npu.contrib.function import fuse_add_softmax_dropout fuse_add_softmax_dropout(training, dropout, npu_input1, npu_input2, alpha, dim=axis)
>>> training = True >>> dropout = nn.DropoutWithByteMask(0.1) >>> npu_input1 = torch.rand(96, 12, 384, 384).half().npu() >>> npu_input2 = torch.rand(96, 12, 384, 384).half().npu() >>> alpha = 0.125 >>> axis = -1 >>> output = fuse_add_softmax_dropout(training, dropout, npu_input1, npu_input2, alpha, dim=axis)
def npu_diou(boxes1,boxes2,trans=True, is_cross=False, mode=0):
应用基于NPU的DIoU操作。考虑到目标之间距离,以及距离和范围的重叠率,不同目标或边界需趋于稳定。
到目前为止,DIoU向后只支持当前版本中的trans==True、is_cross==False、mode==0('iou')。如果需要反向传播,确保参数正确。
from torch_npu.contrib.function import npu_diou diou = npu_diou(box1, box2)
>>> box1 = torch.randn(4, 32).npu() >>> box1.requires_grad = True >>> box2 = torch.randn(4, 32).npu() >>> box2.requires_grad = True >>> diou = npu_diou(box1, box2) >>> l = diou.sum() >>> l.backward()
def npu_ciou(boxes1,boxes2,trans=True, is_cross=False, mode=0):
应用基于NPU的CIoU操作。在DIoU的基础上增加了penalty item,并propose CIoU。
到目前为止,CIoU向后只支持当前版本中的trans==True、is_cross==False、mode==0('iou')。如果需要反向传播,确保参数正确。
from torch_npu.contrib.function import npu_ciou ciou = npu_ciou(box1, box2)
>>> box1 = torch.randn(4, 32).npu() >>> box1.requires_grad = True >>> box2 = torch.randn(4, 32).npu() >>> box2.requires_grad = True >>> ciou = npu_ciou(box1, box2) >>> l = ciou.sum() >>> l.backward()
def npu_single_level_responsible_flags(featmap_size,gt_bboxes,stride,num_base_anchors):
使用NPU OP在单个特征图中生成锚点的responsible flags。
from torch_npu.contrib.function import npu_single_level_responsible_flags out = npu_single_level_responsible_flags(featmap_sizes[i],gt_bboxes,stride[i],num_base_anchors)
torch.Tensor - 单层特征图中每个锚点的有效标志。输出大小为[featmap_size[0] * featmap_size[1] * num_base_anchors]。
>>> featmap_sizes = [[10, 10], [20, 20], [40, 40]] >>> stride = [[32, 32], [16, 16], [8, 8]] >>> gt_bboxes = torch.randint(0, 512, size=(128, 4)) >>> num_base_anchors = 3 >>> featmap_level = len(featmap_sizes) >>> torch.npu.set_device(0) >>> for i in range(featmap_level): gt_bboxes = gt_bboxes.npu() >>> out = npu_single_level_responsible_flags(featmap_sizes[i],gt_bboxes,stride[i],num_base_anchors) >>> print(out.shape, out.max(), out.min())
def npu_bbox_coder_encode_yolo(bboxes, gt_bboxes, stride):
使用NPU OP获取将bbox转换为gt_bbox的框回归转换deltas。
from torch_npu.contrib.function import npu_bbox_coder_encode_yolo out = npu_bbox_coder_encode_yolo(bboxes, gt_bboxes, stride)
>>> A = 1024 >>> bboxes = torch.randint(0, 512, size=(A, 4)) >>> gt_bboxes = torch.randint(0, 512, size=(A, 4)) >>> stride = torch.randint(0, 32, size=(A,)) >>> torch.npu.set_device(0) >>> bboxes = bboxes.npu() >>> gt_bboxes = gt_bboxes.npu() >>> stride = stride.npu() >>> out = npu_bbox_coder_encode_yolo(bboxes, gt_bboxes, stride) >>> torch.npu.synchronize() >>> print('_npu_bbox_coder_encode_yolo done. output shape is ', out.shape)
def npu_bbox_coder_encode_xyxy2xywh(bboxes,gt_bboxes,means=None,stds=None,is_normalized=False,normalized_scale=10000.):
应用基于NPU的bbox格式编码操作,将格式从xyxy编码为xywh。
from torch_npu.contrib.function import npu_bbox_coder_encode_xyxy2xywh out = npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes)
不支持动态shape。由于算子语义限制,仅支持二维(n, 4)场景。Bboxes和gt_bboxes的shape和dtype必须相同, dtype只可为float16和float32。
>>> A = 1024 >>> bboxes = torch.randint(0, 512, size=(A, 4)) >>> gt_bboxes = torch.randint(0, 512, size=(A, 4)) >>> torch.npu.set_device(0) >>> bboxes = bboxes.npu() >>> gt_bboxes = gt_bboxes.npu() >>> out = npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes) >>> torch.npu.synchronize() >>> print('_npu_bbox_coder_encode_xyxy2xywh done. output shape is ', out.shape)
def npu_bbox_coder_decode_xywh2xyxy(bboxes,pred_bboxes,means=None,stds=None,max_shape=[9999, 9999],wh_ratio_clip=16 / 1000):
应用基于NPU的bbox格式解码操作,将格式从xywh解码为xyxy。
from torch_npu.contrib.function import npu_bbox_coder_decode_xywh2xyxy out = npu_bbox_coder_decode_xywh2xyxy(bboxes, pred_bboxes, max_shape=(max_shape, max_shape))
>>> A = 1024 >>> max_shape = 512 >>> bboxes = torch.randint(0, max_shape, size=(A, 4)) >>> pred_bboxes = torch.randn(A, 4) >>> torch.npu.set_device(0) >>> bboxes = bboxes.npu() >>> pred_bboxes = pred_bboxes.npu() >>> out = npu_bbox_coder_decode_xywh2xyxy(bboxes, pred_bboxes, max_shape=(max_shape, max_shape)) >>> torch.npu.synchronize() >>> print('_npu_bbox_coder_decode_xywh2xyxy done. output shape is ', out.shape)
def npu_fast_condition_index_put(x, condition, value):
使用NPU亲和写法替换bool型index_put函数中的原生写法。
from torch_npu.contrib.function import npu_fast_condition_index_put x1_opt = npu_fast_condition_index_put(x, condition, value)
>>> x = torch.randn(128, 8192) >>> condition = x < 0.5 >>> value = 0. >>> x1 = copy.deepcopy(x) >>> x1[condition] = value >>> x1_opt = npu_fast_condition_index_put(x, condition, value)
class matmul_transpose(torch.autograd.Function):
使用NPU自定义算子替换原生写法,以提高性能。
from torch_npu.contrib.function import
matmul_transpose
output = matmul_transpose(tensor1, tensor2)
>>> tensor1 = torch.randn(68, 5, 75, 16).npu() >>> tensor1.requires_grad_(True) >>> tensor2 = torch.randn(68, 5, 75, 16).npu() >>> tensor2.requires_grad_(True) >>> output = matmul_transpose(tensor1, tensor2) >>> output.sum().backward()
def npu_multiclass_nms(multi_bboxes,multi_scores, score_thr=0.05,nms_thr=0.45,max_num=50,score_factors=None):
使用NPU API的多类bbox NMS。
from torch_npu.contrib.function import npu_multiclass_nms det_bboxes, det_labels = npu_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
在动态shape条件下,由于NPU op的限制,最多支持20个类别(nmsed_classes)和10000个框(nmsed_boxes)。
>>> boxes = torch.randint(1, 255, size=(1000, 4)) >>> scores = torch.randn(1000, 81) >>> boxes = boxes.npu().half() >>> scores = scores.npu().half() >>> det_bboxes, det_labels = npu_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3) >>> expect_det_bboxes = torch.tensor([[ 57.0000, 198.8750, 45.9688, 221.8750, 4.1484],[215.0000, 155.0000, 236.8750, 137.0000,3.9023], [208.8750, 221.0000, 228.0000, 17.0000, 3.8867]],dtype=torch.float16)
def npu_batched_multiclass_nms(multi_bboxes,multi_scores,score_thr=0.05,nms_thr=0.45,max_num=50,score_factors=None):
使用NPU API的批量多类bbox NMS。
from torch_npu.contrib.function import npu_batched_multiclass_nms det_bboxes, det_labels = npu_batched_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3)
在动态shape条件下,由于NPU op的限制,最多支持20个类别(nmsed_classes)和10000个框(nmsed_boxes)。
Tuple - (bboxes, labels),shape为(bs, k, 5)和(bs, k, 1)的张量。标签以0为基础。
>>> boxes = torch.randint(1, 255, size=(4, 200, 80, 4)) >>> scores = torch.randn(4, 200, 81) >>> boxes = boxes.npu().half() >>> scores = scores.npu().half() >>> det_bboxes, det_labels = npu_batched_multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5, max_num=3) >>> expect_det_bboxes = torch.tensor([[[221.8750, 60.0000, 183.0000, 22.0000, 3.8867], [167.0000, 250.0000, 136.0000, 144.0000, 3.6445], [ 45.9688, 147.0000, 67.0000, 241.8750, 3.4844]], [[ 5.0000, 178.0000, 243.8750, 138.0000, 3.7344], [238.0000, 132.0000, 47.0000, 84.0000, 3.6836], [ 32.0000, 110.0000, 131.0000, 73.0000, 3.6309]], [[111.9375, 120.9375, 54.0000, 231.0000, 3.9219], [147.0000, 162.0000, 78.0000, 1.0010, 3.9219], [157.0000, 118.0000, 57.0000, 115.0000, 3.6523]], [[ 80.0000, 126.9375, 54.0000, 246.8750, 3.7344], [ 31.0000, 253.8750, 19.0000, 138.0000, 3.6328], [ 54.0000, 253.8750, 78.0000, 75.0000, 3.5586]]],dtype=torch.float16)
def dropout_with_byte_mask(input1, p=0.5, training=True, inplace=False):
此dropout_with_byte_mask方法生成无状态随机uint8掩码,并根据掩码执行dropout。
from torch_npu.contrib.function import dropout_with_byte_mask dropout_with_byte_mask(input1, p, training)
class roll():
使用NPU亲和写法替换swin-transformer中的原生roll。
from torch_npu.contrib.function import roll
shifted_x_npu = roll(input1, shifts=(-shift_size, -shift_size), dims=(1, 2))
>>> input1 = torch.randn(32, 56, 56, 16).npu() >>> shift_size = 3 >>> shifted_x_npu = roll(input1, shifts=(-shift_size, -shift_size), dims=(1, 2))
class Mish(nn.Module):
应用基于NPU的Mish操作。
>>> m = nn.Mish() >>> input_tensor = torch.randn(2, 32, 5, 5) >>> output = m(input_tensor)
class SiLU(nn.Module):
按元素应用基于NPU的Sigmoid线性单元(SiLU)函数。SiLU函数也称为Swish函数。
>>> m = nn.SiLU() >>> input_tensor = torch.randn(2, 32, 5, 5) >>> output = m(input_tensor)
class ChannelShuffle(nn.Module):
应用NPU兼容的通道shuffle操作。为避免NPU上效率不高的连续操作,我们用相同语义重写替换原始操作。以下两个不连续操作已被替换:transpose和chunk。
from torch_npu.contrib.module import ChannelShuffle m = ChannelShuffle(64, split_shuffle=True)
>>> x1 = torch.randn(2,32,7,7) >>> x2 = torch.randn(2,32,7,7) >>> m = ChannelShuffle(64, split_shuffle=True) >>> output = m(x1, x2)
class LabelSmoothingCrossEntropy(nn.Module):
使用NPU API进行LabelSmoothing Cross Entropy。
from torch_npu.contrib.module import LabelSmoothingCrossEntropy m = LabelSmoothingCrossEntropy(10)
>>> x = torch.randn(2, 10) >>> y = torch.randint(0, 10, size=(2,)) >>> x = x.npu() >>> y = y.npu() >>> x.requires_grad = True >>> m = LabelSmoothingCrossEntropy(10) >>> npu_output = m(x, y) >>> npu_output.backward()
class ModulatedDeformConv(nn.Module):
应用基于NPU的Modulated Deformable 2D卷积操作。
from torch_npu.contrib.module import ModulatedDeformConv m = ModulatedDeformConv(32, 32, 1)
>>> m = ModulatedDeformConv(32, 32, 1) >>> input_tensor = torch.randn(2, 32, 5, 5) >>> output = m(input_tensor) >>> x = torch.randn(2, 32, 7, 7) >>> model = ModulatedDeformConv(32, 32, 3, 2, 1) >>> torch.npu.set_device(0) >>> x = x.npu() >>> model = model.npu() >>> o = model(x) >>> l = o.sum() >>> l.backward() >>> print(l)
class NpuDropPath(nn.Module):
使用NPU亲和写法替换swin_transformer.py中的原生Drop路径。丢弃每个样本(应用于residual blocks的主路径)的路径(随机深度)。
from torch_npu.contrib.module import NpuDropPath fast_drop_path = NpuDropPath(0).npu()
>>> input1 = torch.randn(68, 5).npu() >>> input1.requires_grad_(True) >>> input2 = torch.randn(68, 5).npu() >>> input2.requires_grad_(True) >>> fast_drop_path = NpuDropPath(0).npu() >>> output = input1 + fast_drop_path(input2) >>> output.sum().backward()
class NpuCachedDropout(torch.nn.Dropout):
在NPU设备上使用FairseqDropout。
from torch_npu.contrib.module import NpuCachedDropout NpuCachedDropout.enable_dropout_ensemble(model)
>>> model = NpuMNIST().to("npu") >>> x = torch.randn(2,10,16,16).to("npu") >>> NpuCachedDropout.enable_dropout_ensemble(model) >>> output = model(x)
class Focus(nn.Module):
使用NPU亲和写法替换YOLOv5中的原生Focus。
from torch_npu.contrib.module import Focus fast_focus = Focus(8, 13).npu()
>>> input = torch.randn(4, 8, 300, 40).npu() >>> input.requires_grad_(True) >>> fast_focus = Focus(8, 13).npu() >>> output = fast_focus(input) >>> output.sum().backward()
class FusedColorJitter(torch.nn.Module):
随机更改图像的亮度、对比度、饱和度和色调。
from torch_npu.contrib.module import FusedColorJitter fcj = FusedColorJitter(0.1, 0.1, 0.1, 0.1)
from PIL import Image image = Image.fromarray(torch.randint(0, 256, size=(224, 224, 3)).numpy().astype(np.uint8)) fcj = FusedColorJitter(0.1, 0.1, 0.1, 0.1) image = fcj(image)
class MultiheadAttention(nn.Module):
Multi-headed attention.
from torch_npu.contrib.module import MultiheadAttention model = MultiheadAttention(embed_dim=1024,num_heads=16,dropout=0.1,kdim=1024,vdim=1024,self_attention=True,encoder_decoder_attention=True)
>>> model = MultiheadAttention(embed_dim=1024,num_heads=16,dropout=0.1,kdim=1024,vdim=1024,self_attention=True,encoder_decoder_attention=True) >>> _, query = create_common_tensor([np.float16, FORMAT_NZ, (1024,1024)], -1, 1) >>> _, key = create_common_tensor([np.float16, FORMAT_NZ, (1024, 1024)], -1, 1) >>> _, value = create_common_tensor([np.float16, FORMAT_NZ, (1024, 1024)], -1, 1) >>> _, key_padding_mask = create_common_tensor([np.float16, FORMAT_NZ, (16,16,64,64)], -65504, 65504) >>> bsz = 16 >>> tgt_len = 64 >>> s_len=64 >>> model = model.to("npu") >>> output = model(query, key, value, bsz, tgt_len, s_len, key_padding_mask)
class DropoutWithByteMask(Module):
应用NPU兼容的DropoutWithByteMask操作。
from torch_npu.contrib.module.npu_modules import DropoutWithByteMask
m = DropoutWithByteMask(p=0.5)
>>> m = DropoutWithByteMask(p=0.5) >>> input = torch.randn(16, 16) >>> output = m(input)
def dropout_with_byte_mask(input1, p=0.5, training=True, inplace=False):
class PSROIPool(nn.Module):
使用NPU API进行PSROIPool。
from torch_npu.contrib.module import PSROIPool model = PSROIPool(pooled_height=7, pooled_width=7, spatial_scale=1 / 16.0, group_size=7, output_dim=22)
class ROIAlign(nn.Module):
使用NPU API进行ROIAlign。
给定一个连续坐标c,使用floor(c - 0.5) 和ceil(c - 0.5)对它的两个相邻像素索引(像素模型中)进行计算。例如,c=1.3具有离散索引为[0]和[1] (从连续坐标0.5到1.5的底层信号采样)的像素邻域。但原始ROIAlign(aligned=False)在计算相邻像素索引时不会减去0.5,因此在执行双线性插值时,它使用的是未完全对齐的像素(相对于我们的像素模型有一点不对齐)。当aligned=True,首先适当缩放ROI,然后在调用ROIAlign之前将其移动-0.5。这样可以生成正确的邻域。相关验证请参见detectron2/tests/testroialign.py。如果ROIAlign与conv层一起使用,差异也不会对模型的性能产生影响。
from torch_npu.contrib.module import ROIAlign roi_align(input_tensor.float(), rois, output_size, spatial_scale, sampling_ratio, aligned)