Adding a New Model

Developers should first create a folder named after the model under models, then add the new model by following these steps:

  1. Add the Tokenizer/Processor
  2. Add the configuration file
  3. Add the model file
  4. Use the model

Adding the Model's Tokenizer/Processor

  1. Add tokenization_XXX.hpp or processing_XXX.hpp under the "models/<model name>" folder.
  2. Implement XXXTokenizer/XXXProcessor.

An example:

#ifndef TOKENIZATION_XXX_HPP
#define TOKENIZATION_XXX_HPP

namespace mllm {

class XXXTokenizer final {
    ... ...

public:
    explicit XXXTokenizer(const std::string &vocab_file) {
        ... ...
    }
    Tensor tokenize(std::string &text, int str_i = 0) const {
        ... ...
    }
    std::pair<std::string, unsigned> detokenize(Tensor &result) {
        ... ...
    }
};

} // namespace mllm
#endif // TOKENIZATION_XXX_HPP
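
For reference, the three member functions above are the ones a demo program will call. Below is a minimal usage sketch; the header path, vocab file path, and variable names are placeholders rather than files that actually exist in the repository:

#include "models/xxx/tokenization_xxx.hpp" // hypothetical header path
using namespace mllm;

int main() {
    // "xxx_vocab.mllm" is a placeholder; pass the converted vocab file for your model.
    auto tokenizer = XXXTokenizer("../vocab/xxx_vocab.mllm");
    std::string text = "Hello, who are you?";
    Tensor input_tensor = tokenizer.tokenize(text, 0); // text -> token-id Tensor
    // After a forward pass produces an output Tensor `result`, detokenize() maps it back:
    // auto out = tokenizer.detokenize(result);  // (decoded string, last token id)
    return 0;
}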

Adding the Configuration File

  1. Add configuration_XXX.hpp under the "models/<model name>" folder.
  2. Implement XXXNameConfig and XXXConfig.

An example:

#ifndef CONFIG_XXX_HPP
#define CONFIG_XXX_HPP
#include "models/transformer/configuration_transformer.hpp"

using namespace mllm;

class XXXNameConfig : public TransformerNameConfig {
public:
    std::string xxx_name;
    ... ...

    void init() {
        ... ...
    }
};

class XXXConfig {
public:
    XXXNameConfig names_config;
    ... ...

    explicit XXXConfig() {
        names_config.init(type);
        ... ...
    }
};
#endif // CONFIG_XXX_HPP

Note: The name parameters in XXXNameConfig identify Ops in the model. They must be unique and must match the corresponding names in the original, not-yet-converted pytorch/safetensors model.

For example, you can inspect the Op names of a pth model as follows:

import torch

model_path = "/path/to/model.pth"
model = torch.load(model_path)
for key in model:
    print(f"{key} Shape: {model[key].shape}")

The output looks like this:

tok_embeddings.weight Shape: torch.Size([32000, 4096])
norm.weight Shape: torch.Size([4096])
output.weight Shape: torch.Size([32000, 4096])
layers.0.attention.wq.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wk.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wv.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wo.weight Shape: torch.Size([4096, 4096])
layers.0.feed_forward.w1.weight Shape: torch.Size([11008, 4096])
... ...

The code above prints the "key" of every Op in the model. The portion of each key before ".weight" must match the corresponding name parameter in mllm. For example, the mllm name parameter corresponding to "layers.0.attention.wq.weight" should be "layers.0.attention.wq".
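
As an illustration, a NameConfig whose init() assigns names matching the keys printed above might look like the sketch below. The member names mirror those used by LLaMAModel later on this page (token_embd_name, post_norm_name, lm_head_name, blk_name); the exact set of members your model needs may differ:

class XXXNameConfig : public TransformerNameConfig {
public:
    // Each string must equal a pth key with the ".weight" suffix removed.
    std::string token_embd_name;
    std::string post_norm_name;
    std::string lm_head_name;
    std::string blk_name;

    void init() {
        token_embd_name = "tok_embeddings"; // tok_embeddings.weight
        post_norm_name = "norm";            // norm.weight
        lm_head_name = "output";            // output.weight
        blk_name = "layers.";               // per-block prefix: layers.0., layers.1., ...
        // Op names inside each block are composed from this prefix, e.g.
        //   "layers." + std::to_string(i) + ".attention.wq"  ->  layers.0.attention.wq
    }
};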

Adding the Model File

  1. Add modeling_XXX.hpp under the "models/<model name>" folder.
  2. Implement XXXModel.

An example for LLaMA:

class LLaMAModel final : public Module {
    Layer embedding;
    vector<LLaMABlock> blocks;
    Layer norm;
    Layer lm_head;

public:
    explicit LLaMAModel(const LLaMAConfig &config) :
        LLaMAModel(config.vocab_size, config.hidden_dim, config.head_size, config.ffn_hidden, config.block_num,
                   config.RoPE_type, config.cache_limit, config.names_config, config.names_config.blk_name) {
    }
    LLaMAModel(int vocab_size, int hidden_dim, int head_size, int ffn_hidden, int block_num, RoPEType RoPE_type, int cache_limit,
               const LLaMANameConfig &names, const string &base_name) {
        embedding = Embedding(vocab_size, hidden_dim, names.token_embd_name);
        blocks = List<LLaMABlock>(block_num, hidden_dim, head_size, ffn_hidden, RoPE_type, cache_limit, names, base_name);
        norm = RMSNorm(hidden_dim, 1e-6, names.post_norm_name);
        lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
    }
    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        auto x = embedding(inputs[0]);
        for (auto &block : blocks) {
            x = block({x})[0];
        }
        x = norm(x);
        x = lm_head(x);
        return {x};
    }
};
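
The LLaMABlock constructed by List<LLaMABlock>(...) above is itself a Module with a pre-norm residual structure. The following is a simplified sketch of its shape only; the sub-module types and their construction are abbreviated here, so refer to modeling_llama.hpp for the actual definitions:

class LLaMABlock final : public Module {
    // Sub-modules; their construction (with names taken from LLaMANameConfig) is omitted in this sketch.
    MultiHeadAttention attention;
    FeedForward mlp;
    Layer norm1;
    Layer norm2;

public:
    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        auto x = norm1(inputs[0]);
        x = attention({x, x, x})[0]; // self-attention (query = key = value)
        auto tmp = x + inputs[0];    // first residual connection
        x = norm2(tmp);
        x = mlp({x})[0];             // feed-forward network
        x = x + tmp;                 // second residual connection
        return {x};
    }
};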

Using the Model

Create a new file demo_XXX.cpp in the examples directory.

An example for LLaMA:

#include <iostream>
#include "cmdline.h"
#include "models/llama/modeling_llama.hpp"
#include "models/llama/tokenization_llama.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
    cmdline::parser cmdParser;
    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
    cmdParser.parse_check(argc, argv);

    string vocab_path = cmdParser.get<string>("vocab");
    string model_path = cmdParser.get<string>("model");
    int tokens_limit = cmdParser.get<int>("limits");
    Layer::cpu_thread = cmdParser.get<int>("thread");

    auto tokenizer = LLaMATokenizer(vocab_path);
    LLaMAConfig config(tokens_limit, "7B", LLAMAROPE);
    auto model = LLaMAModel(config);
    model.load(model_path);

    vector<string> in_strs = {
        " Hello, who are you?",
        " What can you do?",
        "Please introduce Beijing University of Posts and Telecommunications."};

    for (int i = 0; i < in_strs.size(); ++i) {
        auto in_str = in_strs[i];
        auto input_tensor = tokenizer.tokenize(in_str, i);
        std::cout << "[Q] " << in_str << std::endl;
        std::cout << "[A] " << std::flush;
        for (int step = 0; step < 100; step++) {
            auto result = model({input_tensor});
            auto outputs = tokenizer.detokenize(result[0]);
            auto out_string = outputs.first;
            auto out_token = outputs.second;
            if (out_token == 2) {
                break;
            }
            std::cout << out_string << std::flush;
            chatPostProcessing(out_token, input_tensor, {});
        }
        printf("\n");
    }
    return 0;
}
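
Once demo_XXX.cpp has been added to the build in the same way as the existing demos, it can be run against the converted model and vocab files. For example, assuming the resulting binary is named demo_llama, the flags and default paths below come directly from the cmdline parser above:

./demo_llama -m ../models/llama-2-7b-chat-q4_k.mllm -v ../vocab/llama_vocab.mllm -t 4 -l 400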