示例:
>>> import torch
>>> import torch_npu
>>> query_layer = torch_npu.npu_format_cast(torch.rand(24, 16, 512, 64).npu(), 29).half()
>>> key_layer = torch_npu.npu_format_cast(torch.rand(24, 16, 512, 64).npu(), 29).half()
>>> value_layer = torch_npu.npu_format_cast(torch.rand(24, 16, 512, 64).npu(), 29).half()
>>> attention_mask = torch_npu.npu_format_cast(torch.rand(24, 16, 512, 512).npu(), 29).half()
>>> scale = 0.125
>>> keep_prob = 0.5
>>> context_layer = torch_npu.npu_fused_attention_score(query_layer, key_layer, value_layer, attention_mask, scale, keep_prob)
>>> print(context_layer)
tensor([[0.5063, 0.4900, 0.4951, ..., 0.5493, 0.5249, 0.5400],
[0.4844, 0.4724, 0.4927, ..., 0.5176, 0.4702, 0.4790],
[0.4683, 0.4771, 0.5054, ..., 0.4917, 0.4614, 0.4739],
...,
[0.5137, 0.5010, 0.5078, ..., 0.4656, 0.4592, 0.5034],
[0.5425, 0.5732, 0.5347, ..., 0.5054, 0.5024, 0.4924],