Adding a New Model

Developers should first create a folder named after the model under models, then add the new model by following these steps:

  1. Add the Tokenizer/Processor
  2. Add the configuration file
  3. Add the model file
  4. Use the model

Adding the Model's Tokenizer/Processor

  1. Add tokenization_XXX.hpp or processing_XXX.hpp under the "models/<model name>" folder.
  2. Implement XXXTokenizer/XXXProcessor.

An example:

#ifndef TOKENIZATION_XXX_HPP
#define TOKENIZATION_XXX_HPP

namespace mllm {

class XXXTokenizer final {
    ... ...

public:
    explicit XXXTokenizer(const std::string &vocab_file) {
        ... ...
    }
    Tensor tokenize(std::string &text, int str_i = 0) const {
        ... ...
    }
    std::pair<std::string, unsigned> detokenize(Tensor &result) {
        ... ...
    }
};

} // namespace mllm
#endif // TOKENIZATION_XXX_HPP
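
For reference, the three member functions above are the ones a demo program will call. Below is a minimal usage sketch; the header path, vocab file path, and variable names are placeholders rather than files that actually exist in the repository:

#include "models/xxx/tokenization_xxx.hpp" // hypothetical header path
using namespace mllm;

int main() {
    // "xxx_vocab.mllm" is a placeholder; pass the converted vocab file for your model.
    auto tokenizer = XXXTokenizer("../vocab/xxx_vocab.mllm");
    std::string text = "Hello, who are you?";
    Tensor input_tensor = tokenizer.tokenize(text, 0); // text -> token-id Tensor
    // After a forward pass produces an output Tensor `result`, detokenize() maps it back:
    // auto out = tokenizer.detokenize(result);  // (decoded string, last token id)
    return 0;
}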

Adding the Configuration File

  1. Add configuration_XXX.hpp under the "models/<model name>" folder.
  2. Implement XXXNameConfig and XXXConfig.

An example:

#ifndef CONFIG_XXX_HPP
#define CONFIG_XXX_HPP
#include "models/transformer/configuration_transformer.hpp"

using namespace mllm;

class XXXNameConfig : public TransformerNameConfig {
public:
    std::string xxx_name;
    ... ...

    void init() {
        ... ...
    }
};

class XXXConfig {
public:
    XXXNameConfig names_config;
    ... ...

    explicit XXXConfig() {
        names_config.init(type);
        ... ...
    }
};
#endif // CONFIG_XXX_HPP

Note: The name parameters in XXXNameConfig identify Ops in the model. They must be unique and must match the corresponding names in the original, not-yet-converted pytorch/safetensors model.

For example, you can inspect the Op names of a pth model as follows:

import torch

model_path = "/path/to/model.pth"
model = torch.load(model_path)
for key in model:
    print(f"{key} Shape: {model[key].shape}")

The output looks like this:

tok_embeddings.weight Shape: torch.Size([32000, 4096])
norm.weight Shape: torch.Size([4096])
output.weight Shape: torch.Size([32000, 4096])
layers.0.attention.wq.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wk.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wv.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wo.weight Shape: torch.Size([4096, 4096])
layers.0.feed_forward.w1.weight Shape: torch.Size([11008, 4096])
... ...

The code above prints the "key" of every Op in the model. The portion of each key before ".weight" must match the corresponding name parameter in mllm. For example, the mllm name parameter corresponding to "layers.0.attention.wq.weight" should be "layers.0.attention.wq".
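
As an illustration, a NameConfig whose init() assigns names matching the keys printed above might look like the sketch below. The member names mirror those used by LLaMAModel later on this page (token_embd_name, post_norm_name, lm_head_name, blk_name); the exact set of members your model needs may differ:

class XXXNameConfig : public TransformerNameConfig {
public:
    // Each string must equal a pth key with the ".weight" suffix removed.
    std::string token_embd_name;
    std::string post_norm_name;
    std::string lm_head_name;
    std::string blk_name;

    void init() {
        token_embd_name = "tok_embeddings"; // tok_embeddings.weight
        post_norm_name = "norm";            // norm.weight
        lm_head_name = "output";            // output.weight
        blk_name = "layers.";               // per-block prefix: layers.0., layers.1., ...
        // Op names inside each block are composed from this prefix, e.g.
        //   "layers." + std::to_string(i) + ".attention.wq"  ->  layers.0.attention.wq
    }
};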

Adding the Model File

  1. Add modeling_XXX.hpp under the "models/<model name>" folder.
  2. Implement XXXModel.

An example for LLaMA:

class LLaMAModel final : public Module {
    Layer embedding;
    vector<LLaMABlock> blocks;
    Layer norm;
    Layer lm_head;

public:
    explicit LLaMAModel(const LLaMAConfig &config) :
        LLaMAModel(config.vocab_size, config.hidden_dim, config.head_size, config.ffn_hidden, config.block_num,
                   config.RoPE_type, config.cache_limit, config.names_config, config.names_config.blk_name) {
    }
    LLaMAModel(int vocab_size, int hidden_dim, int head_size, int ffn_hidden, int block_num, RoPEType RoPE_type, int cache_limit,
               const LLaMANameConfig &names, const string &base_name) {
        embedding = Embedding(vocab_size, hidden_dim, names.token_embd_name);
        blocks = List<LLaMABlock>(block_num, hidden_dim, head_size, ffn_hidden, RoPE_type, cache_limit, names, base_name);
        norm = RMSNorm(hidden_dim, 1e-6, names.post_norm_name);
        lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
    }
    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        auto x = embedding(inputs[0]);
        for (auto &block : blocks) {
            x = block({x})[0];
        }
        x = norm(x);
        x = lm_head(x);
        return {x};
    }
};
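
The LLaMABlock constructed by List<LLaMABlock>(...) above is itself a Module with a pre-norm residual structure. The following is a simplified sketch of its shape only; the sub-module types and their construction are abbreviated here, so refer to modeling_llama.hpp for the actual definitions:

class LLaMABlock final : public Module {
    // Sub-modules; their construction (with names taken from LLaMANameConfig) is omitted in this sketch.
    MultiHeadAttention attention;
    FeedForward mlp;
    Layer norm1;
    Layer norm2;

public:
    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        auto x = norm1(inputs[0]);
        x = attention({x, x, x})[0]; // self-attention (query = key = value)
        auto tmp = x + inputs[0];    // first residual connection
        x = norm2(tmp);
        x = mlp({x})[0];             // feed-forward network
        x = x + tmp;                 // second residual connection
        return {x};
    }
};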

Using the Model

Create a new file demo_XXX.cpp in the examples directory.

An example for LLaMA:

#include <iostream>
#include "cmdline.h"
#include "models/llama/modeling_llama.hpp"
#include "models/llama/tokenization_llama.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
    cmdline::parser cmdParser;
    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
    cmdParser.parse_check(argc, argv);

    string vocab_path = cmdParser.get<string>("vocab");
    string model_path = cmdParser.get<string>("model");
    int tokens_limit = cmdParser.get<int>("limits");
    Layer::cpu_thread = cmdParser.get<int>("thread");

    auto tokenizer = LLaMATokenizer(vocab_path);
    LLaMAConfig config(tokens_limit, "7B", LLAMAROPE);
    auto model = LLaMAModel(config);
    model.load(model_path);

    vector<string> in_strs = {
        " Hello, who are you?",
        " What can you do?",
        "Please introduce Beijing University of Posts and Telecommunications."};

    for (int i = 0; i < in_strs.size(); ++i) {
        auto in_str = in_strs[i];
        auto input_tensor = tokenizer.tokenize(in_str, i);
        std::cout << "[Q] " << in_str << std::endl;
        std::cout << "[A] " << std::flush;
        for (int step = 0; step < 100; step++) {
            auto result = model({input_tensor});
            auto outputs = tokenizer.detokenize(result[0]);
            auto out_string = outputs.first;
            auto out_token = outputs.second;
            if (out_token == 2) {
                break;
            }
            std::cout << out_string << std::flush;
            chatPostProcessing(out_token, input_tensor, {});
        }
        printf("\n");
    }
    return 0;
}
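
Once demo_XXX.cpp has been added to the build in the same way as the existing demos, it can be run against the converted model and vocab files. For example, assuming the resulting binary is named demo_llama, the flags and default paths below come directly from the cmdline parser above:

./demo_llama -m ../models/llama-2-7b-chat-q4_k.mllm -v ../vocab/llama_vocab.mllm -t 4 -l 400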