开发示例
本章节将详细介绍如何新增自定义告警,以新增告警ID为001A0000的自定义告警为例,指导用户进行二次开发。
文件说明
开发示例中涉及到的文件路径为“{project_dir}/src/app/add_customized_alarm”,目录结构如下:
├── alarm_info_en.json // 后端告警信息配置文件,用于上报告警到FusionDirector
├── alarm_info_solution_en.json // 前端告警信息配置文件,用于页面展示
├── alarm_info_solution_zh.json // 前端告警信息配置文件,用于页面展示
├── all_alarm_for_manager.json // 后端告警屏蔽规则配置文件,用于检验创建的告警屏蔽规则是否合法
├── all_alarm_for_manager_web.json // 前端告警屏蔽规则配置文件,用于页面展示
├── build_customized_alarm.sh // 编译脚本
├── validate_alarm_config.py // 配置文件校验脚本
├── CMakeLists.txt // CMake配置文件
├── customized_alarm_check.c // 自定义告警实现源文件
└── customized_alarm_check.h // 自定义告警实现头文件
配置文件说明
自定义告警涉及到的配置文件如文件说明所示,配置文件的相关字段说明如下。
- alarm_info_en.json、alarm_info_solution_en.json和alarm_info_solution_zh.json等配置文件格式一致,此处以alarm_info_en.json为例进行介绍。
{ "MAJOR_VERSION": "2", # 主版本 "MINOR_VERSION": "8", # 次版本 "AUX_VERSION": "0", # 辅助版本 "EventSuggestion": [ # 告警信息列表 { "id": "00000000", # 告警ID "name": "Drive Overtemperature", # 告警名称 "dealSuggestion": "1. Check whether a TEC alarm is generated. @#AB2. Check whether the ambient temperature of the device exceeds 60°C.@#AB3. Restart the system. Then check whether the alarm is cleared.@#AB4. Contact Vendor technical support.", # 处理建议 "detailedInformation": "The component temperature exceeds the threshold.", # 详细信息 "reason": " The ambient temperature is excessively high.", # 产生原因 "impact": " The system reliability may be affected." # 影响 } ] }
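- all_alarm_for_manager.json和all_alarm_for_manager_web.json中每条告警屏蔽规则的字段含义相同,区别在于:all_alarm_for_manager.json的唯一标识字段为@innerid,规则列表位于LANG.EventSuggestion.item下;all_alarm_for_manager_web.json的唯一标识字段为innerid,规则列表直接位于EventSuggestion下。此处以all_alarm_for_manager.json为例进行介绍。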
{ "LANG": { "MAJOR_VERSION": "2", # 主版本 "MINOR_VERSION": "8", # 次版本 "AUX_VERSION": "0", # 辅助版本 "EventSuggestion": { # 告警屏蔽规则信息 "item": [ # 告警屏蔽规则列表 { "@innerid": "a000000000", # 唯一标识,all_alarm_for_manager_web.json的唯一标识字段为innerid "id": "00000000", # 告警ID "level": "2", # 告警级别 "AlarmInstance": "M.2" # 告警主体 } ] } } }
操作步骤
- 打开新增自定义告警功能开关。
将project.conf文件的_NeedCustomizedAlarmCheck字段取值改为yes(默认值为“no”)。
- 在customized_alarm_check.h中定义告警初始化接口、回调函数注册接口以及相关结构体。
- 告警消息由两部分拼接组成,分别是消息头(FAULT_MSG_HEAD_STRU,定义了消息长度、消息来源、告警数量)和消息体(FAULT_ITEM_STRU类型的数组,每个FAULT_ITEM_STRU表示一条告警)。FAULT_MSG_HEAD_STRU和FAULT_ITEM_STRU必须按照下面样例里的结构定义,否则OM SDK解析告警信息将会失败。
- 消息头FAULT_MSG_HEAD_STRU里的cmd字段必须设置为2,表明是自定义告警,如果设置为其他值,可能导致自定义告警功能失效或影响OM SDK已支持的告警。
- 消息体FAULT_ITEM_STRU里的fault_id和sub_fault_id字段的取值范围为0~49,如果超过49,会导致告警上报失败;相同fault_id的告警类型不能超过50。
- 下面示例中的CUSTOMIZED_ALARM_CHECK_CALLBACK_PFN表示OM SDK提供的回调函数,必须按照示例中的函数定义来定义,否则注册OM SDK的回调函数将失败,告警信息将上报失败。
- OM SDK启动时,将调用自定义告警检测组件提供的初始化接口drv_fault_check_init和回调函数注册接口drv_fault_check_register_callback。请完全按照示例中的接口定义来定义这两个接口,否则会导致自定义告警检测组件启动失败。
#ifndef CUSTOMIZED_ALARM_CHECK_H
#define CUSTOMIZED_ALARM_CHECK_H

#include <time.h> /* time_t, used by FAULT_ITEM_STRU; keeps this header self-contained */

/*
 * Maximum size, in bytes, of one alarm message (head + items).
 * Parenthesized so the macro keeps its value inside larger expressions
 * such as `x / ALARM_INFO_MAX_SIZE`.
 */
#define ALARM_INFO_MAX_SIZE (64 * 1024)

/*
 * Callback provided by OM SDK for reporting an alarm message.
 * data     - message buffer: one FAULT_MSG_HEAD_STRU followed by FAULT_ITEM_STRU entries
 * data_len - length of the buffer in bytes
 * The sample implementation treats a return value of 0 as success.
 */
typedef int (*CUSTOMIZED_ALARM_CHECK_CALLBACK_PFN)(unsigned char *data, unsigned int data_len);

/* Message head. The layout must not be changed, otherwise OM SDK cannot parse the message. */
typedef struct {
    unsigned int data_len; /* message length */
    unsigned int cmd;      /* must be set to 2 (customized alarm); any other value may break the
                              customized alarm feature or affect alarms already supported by OM SDK */
    unsigned int item_num; /* number of alarms carried in the message */
} FAULT_MSG_HEAD_STRU;

/* Message body entry: one alarm. The layout must not be changed. */
typedef struct {
    unsigned short fault_id;     /* alarm id */
    unsigned short sub_fault_id; /* sub alarm id */
    unsigned short fault_level;  /* alarm level */
    unsigned short reserved;     /* padding for 4-byte alignment */
    time_t raise_time_stamp;     /* alarm timestamp: seconds from the epoch to the moment the alarm was raised */
    char fault_name[64];         /* alarm name */
    char resource[32];           /* alarm entity */
} FAULT_ITEM_STRU;

/* Initialization entry point of the customized alarm component; the sample returns 0 on success. */
int drv_fault_check_init(void);

/* Thread routine that performs the alarm check and reports alarms through the registered callback. */
void *customized_alarm_check(void *para);

/* Builds one alarm message buffer; returns NULL on failure. The caller releases it with free(). */
unsigned char *generate_customized_alarm(void);

/* Stores the OM SDK report callback; the sample returns 0. */
int drv_fault_check_register_callback(CUSTOMIZED_ALARM_CHECK_CALLBACK_PFN pfnRx);

#endif
- 在customized_alarm_check.c中实现自定义告警初始化接口和回调函数注册接口。
用户实现自定义告警检测初始化接口时,最好是通过新建线程实现,此处以打桩构造告警为例。
/* * Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. */ #include "unistd.h" #include "stdlib.h" #include "string.h" #include <time.h> #include "pthread.h" #include "customized_alarm_check.h" CUSTOMIZED_ALARM_CHECK_CALLBACK_PFN g_customized_alarm_check_rx_pfn = NULL; // 告警检测函数,此处是打桩构造自定义告警 void* customized_alarm_check(void *para) { int ret; unsigned char *alarmBuff = NULL; unsigned int alarmSize = 0; sleep(5); // 打桩构造告警 do { alarmSize = sizeof(FAULT_MSG_HEAD_STRU) + sizeof(FAULT_ITEM_STRU); alarmBuff = generate_customized_alarm(); if (alarmBuff == NULL) { continue; } FAULT_MSG_HEAD_STRU *msg_head = (FAULT_MSG_HEAD_STRU *)alarmBuff; msg_head->data_len = alarmSize; msg_head->item_num = 1; msg_head->cmd = 2; if(g_customized_alarm_check_rx_pfn(alarmBuff, alarmSize) != 0) { free(alarmBuff); continue; } sleep(30); free(alarmBuff); } while (1); return ((void *)0); } // 生成告警信息 unsigned char *generate_customized_alarm() { unsigned int head_size = sizeof(FAULT_MSG_HEAD_STRU); unsigned int body_size = sizeof(FAULT_ITEM_STRU); if (head_size + body_size > ALARM_INFO_MAX_SIZE) { return NULL; } unsigned char *alarmBuff = (unsigned char *)malloc(head_size + body_size); if (alarmBuff == NULL) { return NULL; } (void)memset_s(alarmBuff, sizeof(head_size + body_size), 0, sizeof(head_size + body_size)); FAULT_ITEM_STRU *item = (FAULT_ITEM_STRU *)(alarmBuff + head_size); item->fault_id = 26; // 告警id item->fault_level = 1; // 告警级别 FAULT_LEVEL_ENUM :紧急告警 严重告警 轻微告警 item->raise_time_stamp = time(NULL); // 告警时间戳(元年到告警产生的秒数) (void)strncpy_s(item->fault_name, 64, "TEST_ERROR", strlen("TEST_ERROR")); // 告警名称 (void)strncpy_s(item->resource, 32, "TEST", strlen("TEST")); // 告警实体 return alarmBuff; } // 自定义告警初始化,为避免初始化过程耗时过长,新建线程启动 int drv_fault_check_init() { int ret = 0; unsigned long customized_alarm_check_thread = 0; ret = pthread_create(&customized_alarm_check_thread, NULL, customized_alarm_check, NULL); return ret; } // 注册回调函数,将回调函数保存在全局变量中 int 
drv_fault_check_register_callback(CUSTOMIZED_ALARM_CHECK_CALLBACK_PFN pfnRx) { g_customized_alarm_check_rx_pfn = pfnRx; return 0; }
- 将OM SDK中的告警配置文件拷贝到“{project_dir}/src/app/add_customized_alarm”路径下,并在此基础上新增自定义告警信息。相应的配置文件路径如下:
- alarm_info_en.json:config/alarm_info_en.json
- alarm_info_solution_zh.json:software/nginx/html/manager/config/alarm_info_solution_zh.json
- alarm_info_solution_en.json:software/nginx/html/manager/config/alarm_info_solution_en.json
- all_alarm_for_manager.json:software/ibma/config/all_alarm_for_manager.json
- all_alarm_for_manager_web.json:software/nginx/html/manager/config/all_alarm_for_manager.json
OM SDK软件包中的告警屏蔽规则配置文件都叫all_alarm_for_manager.json,为了区分,因此将前端使用的告警屏蔽规则配置文件命名为all_alarm_for_manager_web.json。
- 修改配置文件。此处以新增告警ID为001A0000的告警为例,在对应配置文件中新增告警信息。以下仅为配置示例,只做参考,不能直接复制使用。
告警ID是由fault_id左移十六位再加上sub_fault_id后,转为16进制得到的,如果转换后的数字不足八位,则在高位补0补齐八位。以告警ID001A0000为例,fault_id等于26,sub_fault_id等于0,26左移十六位后得到二进制数110100000000000000000,加上sub_fault_id后仍为110100000000000000000,再将其转为16进制,得到数值1A0000,由于其不足八位,补齐八位得到告警ID为001A0000。
- 在alarm_info_en.json中新增自定义告警信息。
{ "MAJOR_VERSION": "2", "MINOR_VERSION": "8", "AUX_VERSION": "0", "EventSuggestion": [ { "id": "00000000", "name": "Drive Overtemperature", "dealSuggestion": "1. Check whether a TEC alarm is generated. @#AB2. Check whether the ambient temperature of the device exceeds 60°C.@#AB3. Restart the system. Then check whether the alarm is cleared.@#AB4. Contact Vendor technical support.", "detailedInformation": "The component temperature exceeds the threshold.", "reason": " The ambient temperature is excessively high.", "impact": " The system reliability may be affected." }, { "id": "00000001", "name": "Drive Service Life Prewarning", "dealSuggestion": "1. Restart the system. Then check whether the alarm is cleared. @#AB2. Back up data and replace the drive. Then check whether the alarm is cleared. @#AB3. Contact Vendor technical support.", "detailedInformation": "The drive is severely worn.", "reason": " The hard drive has bad blocks.", "impact": "Data may be lost." }, ... # 以上为配置文件原本的告警信息,新增的告警信息如下 { "id": "001A0000", "name": "Test customized alarm", "dealSuggestion": "do nothing", "detailedInformation": "Test customized alarm.", "reason": "Test customized alarm.", "impact": "no impact." } ] }
- 在alarm_info_solution_zh.json和alarm_info_solution_en.json中新增自定义告警信息,两者只是中英文的区别。
{ "MAJOR_VERSION": "2", "MINOR_VERSION": "8", "AUX_VERSION": "0", "EventSuggestion": [ { "id": "00000000", "name": "硬盘温度过高", "dealSuggestion": "1、检查是否存在TEC告警。@#AB2、使用测温工具检测设备周围环境是否超过60度。@#AB3、重启智能小站,查看告警是否消失。@#AB4、联系供应商技术支持", "detailedInformation": "硬盘温度超过门限。", "reason": "环境温度过高。", "impact": "可能影响系统运行的可靠性。" }, { "id": "00000001", "name": "硬盘寿命到期预警", "dealSuggestion": "1、重启智能小站,查看告警是否消失。@#AB2、备份数据后更换硬盘,查看告警是否消失。@#AB3、联系供应商技术支持。", "detailedInformation": "硬盘磨损严重。", "reason": "硬盘坏块。", "impact": "可能导致数据丢失。" }, ... # 以上为配置文件原本的告警信息,新增的告警信息如下 { "id": "001A0000", "name": "测试自定义告警", "dealSuggestion": "什么都不用做。", "detailedInformation": "什么都不用做。", "reason": "测试自定义告警", "impact": "没有影响。" } ] }
- 在all_alarm_for_manager.json中新增告警屏蔽规则。
{ "LANG": { "MAJOR_VERSION": "2", "MINOR_VERSION": "8", "AUX_VERSION": "0", "EventSuggestion": { "item": [ { "@innerid": "a000000000", "id": "00000000", "level": "2", "AlarmInstance": "M.2" }, { "@innerid": "a000000001", "id": "00000001", "level": "2", "AlarmInstance": "M.2" }, ... # 以上为配置文件原本的告警信息,新增的告警信息如下 { "@innerid": "x000000000", "id": "001A0000", "level": "1", "AlarmInstance": "TEST" } ] } } }
- 在all_alarm_for_manager_web.json中新增告警屏蔽规则。
{ "MAJOR_VERSION": "2", "MINOR_VERSION": "8", "AUX_VERSION": "0", "EventSuggestion": [ { "innerid": "a000000000", "id": "00000000", "level": "2", "AlarmInstance": "M.2" }, { "innerid": "a000000001", "id": "00000001", "level": "2", "AlarmInstance": "M.2" }, ... # 以上为配置文件原本的告警信息,新增的告警信息如下 { "innerid": "x000000000", "id": "001A0000", "level": "1", "AlarmInstance": "TEST" } ] }
- 在alarm_info_en.json中新增自定义告警信息。
- 校验配置文件是否配置正确。
validate_alarm_config.py脚本会校验告警屏蔽配置文件(all_alarm_for_manager.json和all_alarm_for_manager_web.json)和告警信息配置文件(alarm_info_en.json、alarm_info_solution_en.json、alarm_info_solution_zh.json)。对于告警屏蔽配置文件,校验内容包括是否存在重复的@innerid、是否存在重复的innerid、是否存在重复的id+AlarmInstance组合、配置文件格式是否正确以及字段是否有缺失或不在要求的范围内;对于告警信息配置文件,校验内容包括是否存在重复的ID、配置文件格式是否正确以及字段是否有缺失或不在要求的范围内。
- 执行以下命令,进入配置文件所在路径。
cd {project_dir}/src/app/add_customized_alarm
- 执行以下命令,校验配置文件。
python3 validate_alarm_config.py ./
validate_alarm_config.py脚本参考示例如下:
import json
import os
import sys

ALARM_INFO_CONFIG = ("alarm_info_en.json", "alarm_info_solution_en.json", "alarm_info_solution_zh.json")
ALARM_INFO_FIELDS = {"id", "name", "dealSuggestion", "detailedInformation", "reason", "impact"}
ALARM_SHIELD_FIELDS = {"@innerid", "id", "level", "AlarmInstance"}
ALARM_SHIELD_WEB_FIELDS = {"innerid", "id", "level", "AlarmInstance"}
ALARM_ID_LENGTH = 8
HEX_DIGITS = frozenset("0123456789abcdefABCDEF")


class Result:
    """Outcome of one check: truthy on success, carries an error message on failure."""

    def __init__(self, result: bool, err_msg: str = ""):
        self._result = result
        self._err_msg = err_msg

    def __bool__(self):
        return self._result

    @property
    def error(self) -> str:
        return self._err_msg


def check_alarm_id(filename: str, alarm_id: str):
    """
    Check that an alarm id is exactly 8 hexadecimal digits.

    :param filename: configuration file name, used in error messages
    :param alarm_id: alarm id read from the configuration file
    :raises ValueError: if the id is not a string, has the wrong length or is not hexadecimal
    """
    if not isinstance(alarm_id, str):
        raise ValueError(f"{filename}: alarm id type is wrong, should be string")
    if len(alarm_id) != ALARM_ID_LENGTH:
        raise ValueError(f"{filename}: alarm id length wrong, should be 8")
    # int(x, 16) would also accept "0x" prefixes, signs, underscores and surrounding
    # whitespace, so every character is checked explicitly instead.
    if not HEX_DIGITS.issuperset(alarm_id):
        raise ValueError(f"{filename}: alarm id is invalid, reason: {alarm_id!r} is not a hexadecimal string")


def check_alarm_field_range(alarms: list, filename: str, filed_range: set):
    """
    Check that every entry is a map whose keys are exactly ``filed_range`` and whose id is valid.

    :param alarms: entries read from the configuration file
    :param filename: configuration file name, used in error messages
    :param filed_range: the set of field names every entry must have
    :raises ValueError: on the first entry that violates a rule
    """
    for alarm in alarms:
        if not isinstance(alarm, dict):
            raise ValueError(f"{filename}: alarm info type is wrong, should be map")
        if len(alarm.keys()) != len(filed_range):
            raise ValueError(f"{filename}: alarm info miss some fields")
        if not set(alarm.keys()).issubset(filed_range):
            raise ValueError(f"{filename}: alarm info fields is not in range of {filed_range}")
        check_alarm_id(filename, alarm.get("id"))


def check_alarm_info_config(content: dict, filename: str) -> Result:
    """
    Check an alarm information configuration file: duplicated alarm ids, entry format,
    and missing or unexpected fields.

    :param content: parsed configuration file content
    :param filename: configuration file name
    :return: check result
    """
    alarms = content.get("EventSuggestion")
    if not alarms:
        return Result(False, f"{filename}: EventSuggestion is null")
    if not isinstance(alarms, list):
        return Result(False, f"{filename}: EventSuggestion type is wrong, should be list")

    try:
        check_alarm_field_range(alarms, filename, ALARM_INFO_FIELDS)
    except Exception as err:
        return Result(False, f'{err}')

    # Every id was validated as a string above, so building a set cannot fail here.
    ids = [alarm.get("id") for alarm in alarms]
    if len(set(ids)) != len(alarms):
        return Result(False, f"{filename}: have same id, please check")
    return Result(True)


def check_alarm_shield(inner_id: str, alarm_shields: list, filename: str, filed_range: set) -> Result:
    """
    Check a list of alarm shield rules: entry format, duplicated inner ids and
    duplicated id + AlarmInstance combinations.

    :param inner_id: name of the unique identifier field ("@innerid" or "innerid")
    :param alarm_shields: alarm shield rules read from the configuration file
    :param filename: configuration file name
    :param filed_range: the set of field names every rule must have
    :return: check result
    """
    try:
        check_alarm_field_range(alarm_shields, filename, filed_range)
    except Exception as err:
        return Result(False, f'{err}')

    ids = [alarm_shield.get(inner_id) for alarm_shield in alarm_shields]
    try:
        unique_ids = set(ids)
    except TypeError:
        # A list or map used as identifier is unhashable; report it instead of crashing.
        return Result(False, f"{filename}: {inner_id} type is wrong, should be string")
    if len(unique_ids) != len(alarm_shields):
        return Result(False, f"{filename}: have same {inner_id}, please check")

    unique_keys = [f"{alarm_shield.get('id')}-{alarm_shield.get('AlarmInstance')}" for alarm_shield in alarm_shields]
    if len(set(unique_keys)) != len(alarm_shields):
        return Result(False, f"{filename}: have same unique_key(id-AlarmInstance), please check")
    return Result(True)


def check_alarm_shield_config(content: dict, filename: str) -> Result:
    """
    Check the backend alarm shield configuration file (rules under LANG.EventSuggestion.item):
    duplicated @innerid, duplicated alarm id + entity, entry format, and missing or unexpected fields.

    :param content: parsed configuration file content
    :param filename: configuration file name
    :return: check result
    """
    lang = content.get("LANG")
    if not isinstance(lang, dict):
        return Result(False, f"{filename}: LANG type is wrong, should be map")

    event_suggestion = lang.get("EventSuggestion")
    if not event_suggestion:
        return Result(False, f"{filename}: EventSuggestion is null")
    if not isinstance(event_suggestion, dict):
        return Result(False, f"{filename}: EventSuggestion type is wrong, should be map")

    alarm_shields = event_suggestion.get("item")
    if not alarm_shields:
        return Result(False, f"{filename}: item is null")
    if not isinstance(alarm_shields, list):
        return Result(False, f"{filename}: item type is wrong, should be list")
    return check_alarm_shield("@innerid", alarm_shields, filename, ALARM_SHIELD_FIELDS)


def check_alarm_shield_web_config(content: dict, filename: str) -> Result:
    """
    Check the frontend alarm shield configuration file (rules directly under EventSuggestion):
    duplicated innerid, duplicated alarm id + entity, entry format, and missing or unexpected fields.

    :param content: parsed configuration file content
    :param filename: configuration file name
    :return: check result
    """
    alarm_shields = content.get("EventSuggestion")
    if not alarm_shields:
        return Result(False, f"{filename}: EventSuggestion is null")
    if not isinstance(alarm_shields, list):
        return Result(False, f"{filename}: EventSuggestion type is wrong, should be list")
    return check_alarm_shield("innerid", alarm_shields, filename, ALARM_SHIELD_WEB_FIELDS)


def check(config_file_dir: str) -> bool:
    """
    Check every alarm configuration file found in a directory.

    JSON files that are not alarm configuration files are skipped with a notice.
    Checking stops at the first file that fails.

    :param config_file_dir: directory that holds the alarm configuration files
    :return: True if every alarm configuration file is correct, otherwise False
    """
    checkers = {name: check_alarm_info_config for name in ALARM_INFO_CONFIG}
    checkers["all_alarm_for_manager.json"] = check_alarm_shield_config
    checkers["all_alarm_for_manager_web.json"] = check_alarm_shield_web_config

    # Sorted so that the order of the reported messages does not depend on the file system.
    for filename in sorted(os.listdir(config_file_dir)):
        if not filename.endswith(".json"):
            continue
        checker = checkers.get(filename)
        if checker is None:
            print(f"skip {filename}: not an alarm configuration file")
            continue

        try:
            # The configuration files contain non-ASCII text, so do not rely on the locale encoding.
            with open(os.path.join(config_file_dir, filename), encoding="utf-8") as stream:
                document = json.load(stream)
        except Exception as err:
            print(f"read file content of {filename} failed, reason: {err}")
            return False

        if not isinstance(document, dict):
            print(f"check file {filename} failed, reason:{filename}: content type is wrong, should be map")
            return False

        ret = checker(document, filename)
        if not ret:
            print(f"check file {filename} failed, reason:{ret.error}")
            return False

    print(f"The collection of configurations in {config_file_dir} is correct. ")
    return True


if __name__ == "__main__":
    if len(sys.argv) == 2:
        # A non-zero exit status lets calling scripts detect a failed validation.
        sys.exit(0 if check(sys.argv[1]) else 1)
    print("The input argument should be the directory of the configuration files")
    sys.exit(1)
- 执行以下命令,进入配置文件所在路径。
- 在CMakeLists.txt中编写构建方式。
生成的二进制文件名称必须是libcustomized_alarm.so,否则可能导致OM SDK启动自定义告警组件失败。
# Minimum CMake version required
cmake_minimum_required(VERSION 3.16)

# Cross-compilation settings, enabled with -DCROSSCOMPILE_ENABLED=ON
if (CROSSCOMPILE_ENABLED)
    set(CMAKE_SYSTEM_NAME Linux)
    set(CMAKE_SYSTEM_PROCESSOR aarch64)
    set(target_arch aarch64-linux-gnu)
    set(CMAKE_C_COMPILER /usr/bin/aarch64-linux-gnu-gcc)
    set(CMAKE_CXX_COMPILER /usr/bin/aarch64-linux-gnu-g++)
    set(CMAKE_LIBRARY_ARCHITECTURE ${target_arch} CACHE STRING "" FORCE)
    set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
    set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
    set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
    set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
endif()

# Name of the project being built
project(customized_alarm)

# Build the sources into libcustomized_alarm.so (STATIC: static library, SHARED: shared library)
add_library(customized_alarm SHARED
    customized_alarm_check.c
    customized_alarm_check.h)
- 在build_customized_alarm.sh脚本中实现构建逻辑。
#!/bin/bash
# Builds libcustomized_alarm.so in "<script dir>/build".
# Exit status: 0 on success, 1 if any build step fails.

CUR_DIR=$(dirname "$(readlink -f "$0")")

function build_customized_alarm()
{
    local build_dir="${CUR_DIR}/build"

    echo "build customized alarm ..."

    # Start from an empty build directory.
    if [ ! -d "${build_dir}" ]; then
        mkdir -p "${build_dir}" || return 1
    else
        # ":?" aborts the expansion if build_dir is ever empty, so this can never become "rm -rf /*".
        rm -rf "${build_dir:?}"/* || return 1
    fi

    cd "${build_dir}" || return 1

    # Stop at the first failing step so that the caller sees a non-zero exit status.
    if ! cmake -DCROSSCOMPILE_ENABLED=ON ..; then
        echo "build customized alarm failed: cmake error"
        return 1
    fi
    if ! make; then
        echo "build customized alarm failed: make error"
        return 1
    fi

    echo "build customized alarm success"
    return 0
}

build_customized_alarm
RESULT=$?
exit "${RESULT}"
- 在“{project_dir}/build/build.sh”中增加自定义构建脚本调用。
# Add the customized alarm check component
if [[ "${_NeedCustomizedAlarmCheck}" == "yes" ]]; then
    if ! bash "${TOP_DIR}/src/app/add_customized_alarm/build_customized_alarm.sh"; then
        return 1
    fi
    # Paths are quoted so directories containing spaces work; a failed copy aborts the build
    # instead of producing a package that silently lacks the library or a configuration file.
    cp -rf "${TOP_DIR}/src/app/add_customized_alarm/build/libcustomized_alarm.so" "${OMSDK_TAR_PATH}/lib/" || return 1

    # Overwrite the alarm related configuration files in the OM SDK package
    cp -rf "${TOP_DIR}/src/app/add_customized_alarm/alarm_info_en.json" "${OMSDK_TAR_PATH}/config/alarm_info_en.json" || return 1
    cp -rf "${TOP_DIR}/src/app/add_customized_alarm/all_alarm_for_manager.json" "${OMSDK_TAR_PATH}/software/ibma/config/all_alarm_for_manager.json" || return 1
    # The frontend shield file is named all_alarm_for_manager.json inside the package
    cp -rf "${TOP_DIR}/src/app/add_customized_alarm/all_alarm_for_manager_web.json" "${OMSDK_TAR_PATH}/software/nginx/html/manager/config/all_alarm_for_manager.json" || return 1
    cp -rf "${TOP_DIR}/src/app/add_customized_alarm/alarm_info_solution_en.json" "${OMSDK_TAR_PATH}/software/nginx/html/manager/config/alarm_info_solution_en.json" || return 1
    cp -rf "${TOP_DIR}/src/app/add_customized_alarm/alarm_info_solution_zh.json" "${OMSDK_TAR_PATH}/software/nginx/html/manager/config/alarm_info_solution_zh.json" || return 1
fi