Comparing changes

base repository: m5stack/StackFlow
base: main
head repository: m5stack/StackFlow
compare: dev
Able to merge. These branches can be automatically merged.
  • 12 commits
  • 17 files changed
  • 2 contributors

Commits on Mar 18, 2025

  1. [fix] Fix python tokenizer server cannot exit

    LittleMouse committed Mar 18, 2025 · 3ad41bc
  2. aa6013d

Commits on Mar 20, 2025

  1. [update] Added logic to stop inference before unit exit

    LittleMouse committed Mar 20, 2025 · 5b1cdb1
  2. [update] Avoid using the same tokenizer service port for LLM and VLLM at the same time.

    LittleMouse committed Mar 20, 2025 · 9a2d338
  3. [update] move the main_vlm model config file

    LittleMouse committed Mar 20, 2025 · d5eb3c1
  4. [fix] Fix melotts model config

    LittleMouse committed Mar 20, 2025 · 9e95d23

Commits on Mar 21, 2025

  1. [fix] Fix model name

    LittleMouse committed Mar 21, 2025 · 9274c00
  2. a4e0cca
  3. [fix] inference_async error

    dianjixz committed Mar 21, 2025 · b0cce96
  4. [update] tokenizer

    dianjixz committed Mar 21, 2025 · d0b3a36

Commits on Mar 24, 2025

  1. [update] update llm_llm doc

    LittleMouse committed Mar 24, 2025 · 57e9f64

Commits on Mar 26, 2025

  1. [update] add whisper-base model

    LittleMouse committed Mar 26, 2025 · 78fe031
69 changes: 69 additions & 0 deletions doc/projects_llm_framework_doc/llm_llm_en.md
@@ -55,6 +55,75 @@ Response json:
- created: Message creation time, unix time.
- work_id: The successfully created work_id unit.

## inference

### streaming input

```json
{
"request_id": "2",
"work_id": "llm.1003",
"action": "inference",
"object": "llm.utf-8.stream",
"data": {
"delta": "What's ur name?",
"index": 0,
"finish": true
}
}
```

- object: The transmitted data type is `llm.utf-8.stream`, indicating streaming UTF-8 input from the user.
- delta: Segment data of the streaming input.
- index: Index of the segment in the streaming input.
- finish: A flag indicating whether the streaming input has completed.
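
For illustration, the sketch below splits a prompt into `llm.utf-8.stream` segments and sends them to the unit. It assumes the StackFlow JSON service is reachable over TCP at `localhost:10001` and accepts one JSON object per line; the helper name, port, and chunk size are assumptions for this example, not part of the protocol.

```python
# Minimal sketch (not from the repo): send a prompt as llm.utf-8.stream
# segments, newline-delimited JSON over TCP. Host/port and chunk size are
# assumptions; adjust them to match how your StackFlow unit is exposed.
import json
import socket

def send_streaming_inference(prompt, work_id="llm.1003", request_id="2",
                             host="localhost", port=10001, chunk=16):
    segments = [prompt[i:i + chunk] for i in range(0, len(prompt), chunk)]
    with socket.create_connection((host, port)) as sock:
        for idx, seg in enumerate(segments):
            payload = {
                "request_id": request_id,
                "work_id": work_id,
                "action": "inference",
                "object": "llm.utf-8.stream",
                "data": {
                    "delta": seg,
                    "index": idx,
                    # the last segment carries finish = true
                    "finish": idx == len(segments) - 1,
                },
            }
            sock.sendall((json.dumps(payload) + "\n").encode("utf-8"))

send_streaming_inference("What's ur name?")
```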

### non-streaming input

```json
{
"request_id": "2",
"work_id": "llm.1003",
"action": "inference",
"object": "llm.utf-8",
"data": "What's ur name?"
}
```

- object: The transmitted data type is `llm.utf-8`, indicating non-streaming UTF-8 input from the user.
- data: The full input text for non-streaming inference.

streaming response json:

```json
{"created":1742779468,"data":{"delta":"I am not","finish":false,"index":0},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779469,"data":{"delta":" a person,","finish":false,"index":1},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779469,"data":{"delta":" but I'm","finish":false,"index":2},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779470,"data":{"delta":" here to assist","finish":false,"index":3},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779471,"data":{"delta":" you with any","finish":false,"index":4},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779472,"data":{"delta":" questions or tasks","finish":false,"index":5},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779473,"data":{"delta":" you may have","finish":false,"index":6},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779474,"data":{"delta":". How can","finish":false,"index":7},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779474,"data":{"delta":" I help you","finish":false,"index":8},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779475,"data":{"delta":" today?","finish":false,"index":9},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779475,"data":{"delta":"","finish":true,"index":10},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
```
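
A receiving side can rebuild the streamed reply by concatenating the `delta` fields until `finish` is true. The sketch below is a minimal example of that, assuming replies arrive in index order as newline-delimited JSON on a file-like `reader` (for example, `sock.makefile()`).

```python
# Minimal sketch: reassemble an llm.utf-8.stream reply.
# Assumes one JSON object per line and that segments arrive in index order.
import json

def read_streaming_reply(reader, request_id="2"):
    parts = []
    for line in reader:
        msg = json.loads(line)
        if msg.get("request_id") != request_id:
            continue  # reply belongs to another request
        if msg["error"]["code"] != 0:
            raise RuntimeError(msg["error"]["message"])
        data = msg["data"]
        parts.append(data["delta"])
        if data["finish"]:
            break
    return "".join(parts)
```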

non-streaming response json:

```json
{
"created": 1742780120,
"data": "As an artificial intelligence, I don't have a name in the traditional sense. However, I am here to assist you with any questions or information you may need. How can I help you today?",
"error": {
"code": 0,
"message": ""
},
"object": "llm.utf-8",
"request_id": "2",
"work_id": "llm.1003"
}
```
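
Putting both directions together for the non-streaming case, the following sketch sends one `llm.utf-8` request and reads the single reply. As above, the TCP address and newline framing are assumptions made for the example.

```python
# Minimal sketch: one-shot (non-streaming) inference round trip.
import json
import socket

def inference_once(prompt, work_id="llm.1003", request_id="2",
                   host="localhost", port=10001):
    payload = {
        "request_id": request_id,
        "work_id": work_id,
        "action": "inference",
        "object": "llm.utf-8",
        "data": prompt,
    }
    with socket.create_connection((host, port)) as sock:
        sock.sendall((json.dumps(payload) + "\n").encode("utf-8"))
        reply = json.loads(sock.makefile().readline())
    if reply["error"]["code"] != 0:
        raise RuntimeError(reply["error"]["message"])
    return reply["data"]

print(inference_once("What's ur name?"))
```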

## link

Link the output of the upstream unit.
68 changes: 68 additions & 0 deletions doc/projects_llm_framework_doc/llm_llm_zh.md
@@ -55,6 +55,74 @@
- created: Message creation time, unix time.
- work_id: The successfully created work_id unit.

## inference

### Streaming input

```json
{
"request_id": "2",
"work_id": "llm.1003",
"action": "inference",
"object": "llm.utf-8.stream",
"data": {
"delta": "What's ur name?",
"index": 0,
"finish": true
}
}
```

- object: The transmitted data type is `llm.utf-8.stream`, indicating streaming UTF-8 input from the user.
- delta: Segment data of the streaming input.
- index: Index of the segment in the streaming input.
- finish: A flag indicating whether the streaming input has completed.

### Non-streaming input

```json
{
"request_id": "2",
"work_id": "llm.1003",
"action": "inference",
"object": "llm.utf-8",
"data": "What's ur name?"
}
```

- object: The transmitted data type is `llm.utf-8`, indicating non-streaming UTF-8 input from the user.
- data: The full input text for non-streaming inference.

streaming response json:

```json
{"created":1742779468,"data":{"delta":"I am not","finish":false,"index":0},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779469,"data":{"delta":" a person,","finish":false,"index":1},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779469,"data":{"delta":" but I'm","finish":false,"index":2},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779470,"data":{"delta":" here to assist","finish":false,"index":3},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779471,"data":{"delta":" you with any","finish":false,"index":4},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779472,"data":{"delta":" questions or tasks","finish":false,"index":5},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779473,"data":{"delta":" you may have","finish":false,"index":6},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779474,"data":{"delta":". How can","finish":false,"index":7},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779474,"data":{"delta":" I help you","finish":false,"index":8},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779475,"data":{"delta":" today?","finish":false,"index":9},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
{"created":1742779475,"data":{"delta":"","finish":true,"index":10},"error":{"code":0,"message":""},"object":"llm.utf-8.stream","request_id":"2","work_id":"llm.1003"}
```

non-streaming response json:

```json
{
"created": 1742780120,
"data": "As an artificial intelligence, I don't have a name in the traditional sense. However, I am here to assist you with any questions or information you may need. How can I help you today?",
"error": {
"code": 0,
"message": ""
},
"object": "llm.utf-8",
"request_id": "2",
"work_id": "llm.1003"
}
```

## link

Link the output of the upstream unit.
56 changes: 54 additions & 2 deletions projects/llm_framework/main_llm/scripts/llm-llm_tokenizer_auto.py
@@ -4,6 +4,43 @@
import json
import argparse
import sys
import base64

def replace_base64_in_jrpcobj(jrpc_obj):
"""
Traverse and replace specific structures in a JRpcObj object.
If {"type": "bytes.base64", "encode": "bmloYW8K"} is found, replace it with the decoded bytes b"nihao\n";
{"type": "str.base64", ...} nodes are likewise replaced with the decoded UTF-8 string.
"""
if isinstance(jrpc_obj, dict):
# If it is a dictionary, check if it meets the condition
if jrpc_obj.get("type") == "bytes.base64" and "encode" in jrpc_obj:
try:
# Attempt to decode and replace
decoded_value = base64.b64decode(jrpc_obj["encode"])
return decoded_value # Replace with the decoded value
except Exception as e:
# print(f"Decoding error: {e}")
return jrpc_obj # If decoding fails, return the original object
elif jrpc_obj.get("type") == "str.base64" and "encode" in jrpc_obj:
try:
# Attempt to decode and replace
decoded_value = base64.b64decode(jrpc_obj["encode"]).decode("utf-8")
return decoded_value # Replace with the decoded value
except Exception as e:
# print(f"Decoding error: {e}")
return jrpc_obj # If decoding fails, return the original object
else:
# If conditions are not met, recursively process the dictionary's values
return {
key: replace_base64_in_jrpcobj(value) for key, value in jrpc_obj.items()
}
elif isinstance(jrpc_obj, list):
# If it is a list, recursively process each element
return [replace_base64_in_jrpcobj(item) for item in jrpc_obj]
else:
# If it is another type, return it directly
return jrpc_obj


def send_msg(content, chunk_size=1024):
data = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
@@ -13,12 +50,26 @@ def send_msg(content, chunk_size=1024):
sys.stdout.write(data[-1]+'\n')
sys.stdout.flush()

def str2bool(value):
if isinstance(value, bool):
return value
if value.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif value.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')

args = argparse.ArgumentParser()
args.add_argument('--model_id', type=str, default='qwen2.5_coder_tokenizer')
args.add_argument('--content', type=str, default='You are Qwen, created by Alibaba Cloud. You are a helpful assistant.')
args.add_argument('--trust_remote_code', type=str2bool, nargs='?', const=True, default=None)
args.add_argument('--use_fast', type=str2bool, nargs='?', const=False, default=None)
args = args.parse_args()
tokenizer = AutoTokenizer.from_pretrained(args.model_id)
if args.trust_remote_code is None or args.use_fast is None:
tokenizer = AutoTokenizer.from_pretrained(args.model_id)
else:
tokenizer = AutoTokenizer.from_pretrained(args.model_id, trust_remote_code=args.trust_remote_code, use_fast=args.use_fast)
JRpcResultObj = {
'jsonrpc': "2.0",
'result': {"bos_id": tokenizer.bos_token_id, "eos_id": tokenizer.eos_token_id},
@@ -34,7 +85,8 @@ def send_msg(content, chunk_size=1024):
else:
line += part
try:
JRpcObj = json.loads(line)
JRpcObjsrc = json.loads(line)
JRpcObj = replace_base64_in_jrpcobj(JRpcObjsrc)
RpcMethod = getattr(tokenizer, JRpcObj['method'])
if callable(RpcMethod):
result = RpcMethod(*JRpcObj['params'][0],**JRpcObj['params'][1])
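
As a hedged illustration of how the new base64 handling above can be exercised, the sketch below builds a JSON-RPC request whose string argument is wrapped as `str.base64`, matching the `params[0]` / `params[1]` calling convention in the script; the helper name and the chosen method are hypothetical, not part of the repository.

```python
# Hypothetical client-side sketch: wrap a string argument as str.base64 so
# replace_base64_in_jrpcobj() above decodes it before the tokenizer call.
import base64
import json

def make_jsonrpc_request(method, text, request_id=1):
    wrapped = {
        "type": "str.base64",
        "encode": base64.b64encode(text.encode("utf-8")).decode("ascii"),
    }
    return json.dumps({
        "jsonrpc": "2.0",
        "method": method,           # dispatched via getattr(tokenizer, method)
        "params": [[wrapped], {}],  # RpcMethod(*params[0], **params[1])
        "id": request_id,
    })

print(make_jsonrpc_request("encode", "What's ur name?"))
```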
57 changes: 43 additions & 14 deletions projects/llm_framework/main_llm/src/main.cpp
@@ -9,6 +9,7 @@
#include <signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <base64.h>
#include <fstream>
@@ -41,7 +42,13 @@ typedef std::function<void(const std::string &data, bool finish)> task_callback_

class llm_task {
private:
static std::atomic<unsigned int> next_port_;
std::atomic_bool tokenizer_server_flage_;
unsigned int port_;
pid_t tokenizer_pid_ = -1;

public:
enum inference_status { INFERENCE_NONE = 0, INFERENCE_RUNNING };
LLMAttrType mode_config_;
std::unique_ptr<LLM> lLaMa_;
std::string model_;
@@ -51,9 +58,8 @@ class llm_task {
task_callback_t out_callback_;
bool enoutput_;
bool enstream_;
std::atomic_bool tokenizer_server_flage_;
unsigned int port_ = 8080;
sem_t inference_semaphore;
std::atomic_int inference_status_;
std::unique_ptr<std::thread> inference_run_;
std::atomic_bool is_running_;
std::string _inference_msg;
@@ -132,6 +138,7 @@ class llm_task {
CONFIG_AUTO_SET(file_body["mode_param"], top_p);

if (mode_config_.filename_tokenizer_model.find("http:") != std::string::npos) {
mode_config_.filename_tokenizer_model = "http://localhost:" + std::to_string(port_);
std::string tokenizer_file;
if (file_exists(std::string("/opt/m5stack/scripts/") + model_ + std::string("_tokenizer.py"))) {
tokenizer_file = std::string("/opt/m5stack/scripts/") + model_ + std::string("_tokenizer.py");
@@ -146,16 +153,16 @@ class llm_task {
__log += " not found!";
SLOGE("%s", __log.c_str());
}
if (!tokenizer_server_flage_) {
pid_t pid = fork();
if (pid == 0) {
if (!tokenizer_server_flage_.load()) {
tokenizer_pid_ = fork();
if (tokenizer_pid_ == 0) {
execl("/usr/bin/python3", "python3", tokenizer_file.c_str(), "--host", "localhost", "--port",
std::to_string(port_).c_str(), "--model_id", (base_model + "tokenizer").c_str(),
"--content", ("'" + prompt_ + "'").c_str(), nullptr);
perror("execl failed");
exit(1);
}
tokenizer_server_flage_ = true;
tokenizer_server_flage_.store(true);
SLOGI("port_=%s model_id=%s content=%s", std::to_string(port_).c_str(),
(base_model + "tokenizer").c_str(), ("'" + prompt_ + "'").c_str());
std::this_thread::sleep_for(std::chrono::seconds(15));
@@ -219,23 +226,21 @@ class llm_task {
sem_wait(&inference_semaphore);
while (is_running_) {
{
sem_wait(&inference_semaphore);
inference(_inference_msg);
inference_status_--;
sem_wait(&inference_semaphore);
}
}
}

int inference_async(const std::string &msg)
{
int count = 0;
sem_getvalue(&inference_semaphore, &count);
if (count == 0) {
_inference_msg = msg;
sem_post(&inference_semaphore);
if (inference_status_ == INFERENCE_NONE) {
_inference_msg = msg;
inference_status_ = INFERENCE_RUNNING;
sem_post(&inference_semaphore);
}
return count;
return inference_status_;
}

void inference(const std::string &msg)
@@ -287,13 +292,29 @@ class llm_task {

bool delete_model()
{
if (tokenizer_pid_ != -1) {
kill(tokenizer_pid_, SIGTERM);
waitpid(tokenizer_pid_, nullptr, 0);
tokenizer_pid_ = -1;
}
lLaMa_->Deinit();
lLaMa_.reset();
return true;
}

llm_task(const std::string &workid)
static unsigned int getNextPort()
{
unsigned int port = next_port_++;
if (port > 8089) {
next_port_ = 8080;
port = 8080;
}
return port;
}

llm_task(const std::string &workid) : tokenizer_server_flage_(false), port_(getNextPort())
{
inference_status_ = INFERENCE_NONE;
sem_init(&inference_semaphore, 0, 0);
is_running_ = true;
inference_run_ = std::make_unique<std::thread>(std::bind(&llm_task::run, this));
@@ -304,12 +325,19 @@ class llm_task {
is_running_ = false;
sem_post(&inference_semaphore);
if (inference_run_) inference_run_->join();
if (tokenizer_pid_ != -1) {
kill(tokenizer_pid_, SIGTERM);
waitpid(tokenizer_pid_, nullptr, WNOHANG);
}
if (lLaMa_) {
lLaMa_->Deinit();
}
sem_destroy(&inference_semaphore);
}
};

std::atomic<unsigned int> llm_task::next_port_{8080};

#undef CONFIG_AUTO_SET

class llm_llm : public StackFlow {
@@ -619,6 +647,7 @@ class llm_llm : public StackFlow {
send("None", "None", error_body, work_id);
return -1;
}
task_pause(llm_task_[work_id_num], get_channel(work_id_num));
auto llm_channel = get_channel(work_id_num);
llm_channel->stop_subscriber("");
llm_task_[work_id_num]->lLaMa_->Stop();
10 changes: 10 additions & 0 deletions projects/llm_framework/main_llm/src/runner/Tokenizer/Tokenizer.cpp
@@ -860,6 +860,16 @@ class Tokenizer_Auto : public BaseTokenizer {
msg["content"] = message.second;
messages_list.push_back(msg);
} break;
case ROLE_TOOL: {
msg["role"] = "tool";
msg["content"] = message.second;
messages_list.push_back(msg);
} break;
case ROLE_IPYTHON: {
msg["role"] = "ipython";
msg["content"] = message.second;
messages_list.push_back(msg);
} break;
case ROLE_ASSISTANT: {
msg["role"] = "assistant";
msg["content"] = message.second;