LinearWeightQuant是对torch_npu接口torch_npu.npu_weight_quant_batchmatmul的封装类,完成矩阵乘计算中weight输入和输出的量化操作,支持per-tensor、per-channel、per-group多场景量化。
torch_npu.contrib.module.LinearWeightQuant(in_features, out_features, bias=True, device=None, dtype=None, antiquant_offset=False, quant_scale=False, quant_offset=False, antiquant_group_size=0)
输出为Device侧Tensor类型,代表计算结果。当输入存在quant_scale时输出数据类型为INT8,当输入不存在quant_scale时输出数据类型和输入x一致。
Atlas A2 训练系列产品
单算子模式:

import torch
import torch_npu
import torchair as tng
from torch_npu.contrib.module import LinearWeightQuant

x = torch.randn((8192, 320),device='npu',dtype=torch.bfloat16)
weight = torch.randn((320, 256),device='npu',dtype=torch.int8)
antiquantscale = torch.randn((1, 256),device='npu',dtype=torch.bfloat16)
antiquantoffset = torch.randn((1, 256),device='npu',dtype=torch.bfloat16)
quantscale = torch.randn((1, 256),device='npu',dtype=torch.float)
quantoffset = torch.randn((1, 256),device='npu',dtype=torch.float)
model = LinearWeightQuant(in_features=320,
                          out_features=256,
                          bias=False,
                          dtype=torch.bfloat16,
                          antiquant_offset=True,
                          quant_scale=True,
                          quant_offset=True,
                          antiquant_group_size=0,
                          device=torch.device(f'npu:0')
                          )
model.npu()
model.weight.data = weight
model.antiquant_scale.data = antiquantscale
model.antiquant_offset.data = antiquantoffset
model.quant_scale.data = quantscale
model.quant_offset.data = quantoffset
tng.experimental.inference.use_internal_format_weight(model)
out = model(x)

图模式:

import torch
import torch_npu
import torchair as tng
from torch_npu.contrib.module import LinearWeightQuant
from torchair.configs.compiler_config import CompilerConfig

config = CompilerConfig()
config.debug.graph_dump.type = "pbtxt"
npu_backend = tng.get_npu_backend(compiler_config=config)
x = torch.randn((8192, 320),device='npu',dtype=torch.bfloat16)
weight = torch.randn((320, 256),device='npu',dtype=torch.int8)
antiquantscale = torch.randn((1, 256),device='npu',dtype=torch.bfloat16)
antiquantoffset = torch.randn((1, 256),device='npu',dtype=torch.bfloat16)
quantscale = torch.randn((1, 256),device='npu',dtype=torch.float)
quantoffset = torch.randn((1, 256),device='npu',dtype=torch.float)
model = LinearWeightQuant(in_features=320,
                          out_features=256,
                          bias=False,
                          dtype=torch.bfloat16,
                          antiquant_offset=True,
                          quant_scale=True,
                          quant_offset=True,
                          antiquant_group_size=0,
                          device=torch.device(f'npu:0')
                          )
model.npu()
model.weight.data = weight
model.antiquant_scale.data = antiquantscale
model.antiquant_offset.data = antiquantoffset
model.quant_scale.data = quantscale
model.quant_offset.data = quantoffset
tng.experimental.inference.use_internal_format_weight(model)
model = torch.compile(model, backend=npu_backend, dynamic=False)
out = model(x)