使用Atlas 300I Duo卡的推理服务场景,推理请求的后处理参数不支持top_k。如果并发发送推理请求(包括通过EndPoint提供的RESTful接口和Engine提供的Forward接口),并且部分推理请求的后处理参数设置了top_k,部分推理请求的后处理参数未设置top_k,会造成推理服务异常,导致后续推理请求执行失败,需要重启推理服务。
http/https请求的URL的IP地址和端口号在config.json中进行配置,详情请参见表3。
curl -H "Accept: application/json" -H "Content-type: application/json" -X POST -d '{ "inputs": "My name is Olivier and I", "parameters": { "details": true, "do_sample": true, "repetition_penalty": 1.1, "return_full_text": false, "seed": null, "temperature": 1, "top_n_tokens": 5, "top_p": 0.99 }, "stream": false }' http://{ip}:{port}/generate
curl --location --request POST 'https://{ip}:{port}/generate' \
--header 'Content-Type: application/json' \
--cacert /home/runs/static_conf/ca/ca.pem \
--cert /home/runs/static_conf/cert/client.pem \
--key /home/runs/static_conf/cert/client.key.pem \
--data-raw '{ "inputs": "My name is Olivier and I", "parameters": { "best_of": 1, "decoder_input_details": false, "details": false, "do_sample": true, "max_new_tokens": 20, "repetition_penalty": 2, "return_full_text": false, "seed": 12, "stop": [ "photographer" ], "temperature": 0.1, "top_k": 1, "top_n_tokens": 5, "top_p": 0.9, "truncate": 1024, "typical_p": 0.95, "watermark": true }, "stream": true }'
请用户根据实际情况对相应参数进行修改。
API |
接口类型 |
URL |
说明 |
支持框架 |
---|---|---|---|---|
Server Live |
GET |
/v2/health/live |
检查服务器是否在线。 |
Triton |
Server Ready |
GET |
/v2/health/ready |
检查服务器是否准备就绪。 |
Triton |
Model Ready |
GET |
/v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/ready |
检查模型是否准备就绪。 |
Triton |
health |
GET |
/health |
服务健康检查。 |
TGI/vLLM |
查询TGI EndPoint信息 |
GET |
/info |
查询TGI EndPoint信息。 |
TGI |
slot统计 |
GET |
/v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/getSlotCount |
参考Triton格式,自定义的slot统计信息查询接口。 |
华为自研 |
API |
接口类型 |
URL |
说明 |
支持框架 |
---|---|---|---|---|
models列表 |
GET |
/v1/models |
列举当前可用模型列表。 |
OpenAI |
model详情 |
GET |
/v1/models/{model} |
查询模型信息。 |
OpenAI |
服务元数据查询 |
GET |
/v2 |
获取服务元数据。 |
Triton |
模型元数据查询 |
GET |
/v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}] |
查询模型元数据信息。 |
Triton |
查询模型配置 |
GET |
/v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/config |
查询模型配置。 |
Triton |
API |
接口类型 |
URL |
说明 |
支持框架 |
---|---|---|---|---|
推理任务 |
POST |
/ |
TGI推理接口,stream==false返回文本推理结果,stream==true返回流式推理结果。 |
TGI |
POST |
/generate |
TGI和vLLM的推理接口,通过请求参数来区分是哪种服务的接口。 |
TGI/vLLM |
|
POST |
/generate_stream |
TGI流式推理接口,使用Server-Sent Events格式返回结果。 |
TGI |
|
POST |
/v1/chat/completions |
OpenAI文本推理接口。 |
OpenAI |
|
POST |
/infer |
华为自研推理接口,支持文本/流式返回结果。 |
华为自研 |
|
POST |
/v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/infer |
Triton的token推理接口。 |
Triton |
|
POST |
/v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/stopInfer |
参考Triton接口定义,提供提前终止请求接口。 |
华为自研 |
|
POST |
/v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate |
Triton文本推理接口。 |
Triton |
|
POST |
/v2/models/${MODEL_NAME}[/versions/${MODEL_VERSION}]/generate_stream |
Triton流式推理接口。 |
Triton |
使用Engine模块提供的C++接口时,需要开发代码进行集成。以下代码为接口使用样例,仅供参考。
/* * Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. */ #include <iostream> #include <memory> #include <map> #include <thread> #include <algorithm> #include <wait.h> #include "infer_engine.h" #include "infer_request.h" #include "metrics.h" #include "data_loader.h" #include "util.h" #include "io_manager.h" using namespace SimpleLLMInference; using SC = std::chrono::steady_clock; IOManager g_Manager; Statistics g_Statistics; std::map<std::string, Metrics> g_Metrics; volatile int g_CompleteNum = false; std::mutex g_Mutex; std::mutex g_MetricsMutex; static std::mutex g_ExitMtx; static bool g_ProcessExit = false; bool g_RecordOutput = false; namespace SimpleLLMInference { /** * * @param response */ Status ParseEosAttr(std::shared_ptr<InferenceResponse> &response, int64_t *flag, int64_t *outputLen) { InferenceResponse::Output *output; auto status = response->ImmutableOutput("IBIS_EOS_ATTR", &output); if (!status.IsOk()) { return status; } auto *eosData = static_cast<int64_t *>(output->Buffer()); *flag = eosData[0]; *outputLen = eosData[1]; return Status(infrastructure::Error::Code::OK, "Success"); } /** * 解析返回的token id * @param response */ Status ParseOutputId(std::shared_ptr<InferenceResponse> &response, std::vector<int64_t> &outputIds) { InferenceResponse::Output *output; auto status = response->ImmutableOutput("OUTPUT_IDS", &output); if (!status.IsOk()) { return status; } if (outputIds.empty()) { outputIds.reserve(128); } // 获取输出长度 auto len = output->Shape()[0]; auto *data = static_cast<int64_t *>(output->Buffer()); for (int i = 0; i < len; ++i) { outputIds.push_back(data[i]); } return Status(infrastructure::Error::Code::OK, "Success"); } /** * 请求回调 * @param response */ void ResponseCallback(std::shared_ptr<InferenceResponse> &response) { auto reqId = response->GetRequestId().StringValue(); size_t decodeTime; auto now = SC::now(); g_Manager.SetOutputData(reqId); { std::unique_lock lock(g_MetricsMutex); // 生成token数 int64_t flag; int64_t 
outputLen; auto ret = ParseEosAttr(response, &flag, &outputLen); if (!ret.IsOk()) { std::cout << "ReqId:" << reqId << ", Error:" << ret.StatusMsg() << std::endl; return; } g_Metrics[reqId].tokensOutput += outputLen; if (g_Metrics[reqId].firstTokenCost == 0) { // prefill 记录首token时间 decodeTime = GetDuration(now, g_Metrics[reqId].startingTime); g_Metrics[reqId].firstTokenCost = decodeTime; } else { // decode 记录每次decode的时间 decodeTime = GetDuration(now, g_Metrics[reqId].lastTokenTime); // 针对投机场景适配,decode返回小于等于gamma个token,四舍五入 auto avgDecodeTime = (decodeTime + outputLen / 2) / outputLen; for (int i = 0; i < outputLen; ++i) { g_Metrics[reqId].decodeTime.push_back(avgDecodeTime); } } g_Metrics[reqId].lastTokenTime = now; // 生成token id if (g_RecordOutput) { ret = ParseOutputId(response, g_Metrics[reqId].outputTokenIds); if (!ret.IsOk()) { std::cout << "ReqId:" << reqId << ", Error:" << ret.StatusMsg() << std::endl; return; } } if (response->IsEOS()) { g_Metrics[reqId].endingTime = now; // 最后一个Token耗时 g_Metrics[reqId].lastTokenCost = decodeTime; } } if (response->IsEOS()) { std::unique_lock lock(g_Mutex); g_CompleteNum++; std::cout << "ReqId:" << reqId << " Finished" << std::endl; } } void SendRequest(InferenceEngine &engine, uint64_t maxBatchSize) { uint64_t processingNum = 0; engine.GetProcessingRequest(&processingNum); std::cout << "the processing request num is " << processingNum << " at first." << std::endl; uint64_t slotNum = 0; uint64_t remainBlocks = 0; uint64_t remainPrefillSlots = 0; uint64_t remainPrefillTokens = 0; while (!g_Manager.Empty()) { // 2. 获取可用的slot数目 engine.GetRequestBlockQuotas(&remainBlocks, &remainPrefillSlots, &remainPrefillTokens); engine.GetProcessingRequest(&processingNum); slotNum = maxBatchSize - processingNum; if (remainBlocks > 0 && remainPrefillSlots > 0 && remainPrefillTokens > 0) { // 3. 
Set input std::vector<std::shared_ptr<Data>> data = g_Manager.GetInputDataByQuotas(remainBlocks, remainPrefillSlots, remainPrefillTokens, slotNum); if (!data.empty()) { std::vector<std::shared_ptr<InferenceRequest>> requests = Data2Request(data); g_Statistics.requestNumber += requests.size(); // total num // 4. forward(异步) for (size_t i = 0; i < requests.size(); ++i) { auto reqId = requests[i]->GetRequestId().StringValue(); { std::unique_lock lock(g_MetricsMutex); g_Metrics[reqId].startingTime = SC::now(); g_Metrics[reqId].tokensInput = data[i]->size; } engine.Forward(requests[i]); } } } std::this_thread::sleep_for(std::chrono::milliseconds(20L)); } engine.GetProcessingRequest(&processingNum); std::cout << "the processing request num is " << processingNum << " when all requests dispatched." << std::endl; } void RunEngine(std::string dataset) { TtimeT start; TtimeT end; g_Manager.SetInputData(dataset); // 初始化engine InferenceEngine engine; auto ret = engine.Init(ResponseCallback); if (!ret.IsOk()) { std::cout << "engine init failed: " << ret.StatusMsg() << std::endl; return; } uint64_t maxBatchSize; ret = engine.GetMaxBatchSize(&maxBatchSize); if (!ret.IsOk()) { std::cout << "GetMaxBatchSize failed: " << ret.StatusMsg() << std::endl; return; } start = SC::now(); SendRequest(engine, maxBatchSize); while (g_CompleteNum < g_Statistics.requestNumber) { std::this_thread::sleep_for(std::chrono::milliseconds(10L)); } end = SC::now(); // 5. 统计打点信息 g_Statistics.modelFullName = ""; g_Statistics.tp = 8; g_Statistics.pp = 1; g_Statistics.latencyForAll = GetDuration(end, start); FormatMetrics(g_Metrics, g_Statistics); PrintStatistics(g_Statistics); if (g_RecordOutput) { std::map<std::string, std::vector<int64_t>> outputTokensId; for (auto &m : g_Metrics) { outputTokensId[m.first] = m.second.outputTokenIds; } WriteOutputIds(outputTokensId, "./token_output.csv"); } // 6. 
释放资源 auto res = engine.Finalize(); std::cout << "inferenceEngine finalize message is : " << res.StatusMsg() << std::endl; { std::unique_lock<std::mutex> lock(g_ExitMtx); g_ProcessExit = true; } } } int main(int argc, char *argv[]) { // 数据集管理 std::string dataset = argc > 1 ? argv[1] : "token_input_gsm.csv"; g_RecordOutput = argc > 2 && std::stoi(argv[2]); std::thread businessThread(RunEngine, dataset); businessThread.detach(); int status; while (true) { pid_t childPid = wait(&status); if (childPid == -1) { sleep(10u); } else { std::cout << "Child Process:" << childPid << " Exited" << std::endl; } std::unique_lock<std::mutex> lock(g_ExitMtx); if (g_ProcessExit) { break; } } return 0; }