Adding a New Model
Developers should first create a folder named after the model under models, then follow the steps below to add the new model:
Add the Model's Tokenizer/Processor
- Add tokenization_XXX.hpp or processing_XXX.hpp under the "models/model_name" folder.
- Implement XXXTokenizer/Processor.
Example:
```cpp
#ifndef TOKENIZATION_XXX_HPP
#define TOKENIZATION_XXX_HPP

namespace mllm {

class XXXTokenizer final {
    // ...
public:
    explicit XXXTokenizer(const std::string &vocab_file) {
        // ...
    }
    Tensor tokenize(std::string &text, int str_i = 0) const {
        // ...
    }
    std::pair<std::string, unsigned> detokenize(Tensor &result) {
        // ...
    }
};

} // namespace mllm

#endif // TOKENIZATION_XXX_HPP
```
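For orientation, the two entry points are used together roughly like this (a minimal sketch; the vocab path and prompt are placeholders, not real files):

```cpp
// Minimal usage sketch; the vocab path and prompt are placeholders.
XXXTokenizer tokenizer("../vocab/xxx_vocab.mllm");
std::string prompt = "Hello, who are you?";
Tensor input = tokenizer.tokenize(prompt, 0); // text -> token-id tensor
// ... run the model on `input`, then map its output tensor back to text:
// auto [piece, token_id] = tokenizer.detokenize(result);
```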
Add the Configuration File
- Add configuration_XXX.hpp under the "models/model_name" folder.
- Implement XXXNameConfig and XXXConfig.
Example:
```cpp
#ifndef CONFIG_XXX_HPP
#define CONFIG_XXX_HPP

#include "models/transformer/configuration_transformer.hpp"

using namespace mllm;

class XXXNameConfig : public TransformerNameConfig {
public:
    std::string xxx_name;
    // ...
    void init() {
        // ...
    }
};

class XXXConfig {
public:
    XXXNameConfig names_config;
    // ...
    explicit XXXConfig() {
        names_config.init(type);
        // ...
    }
};

#endif // CONFIG_XXX_HPP
```
Note: The name parameters in XXXNameConfig identify Ops within the model. Each must be unique, and each must match the corresponding name in the original, unconverted PyTorch/safetensors model.
For example, you can inspect the Op names in a .pth model as follows:
```python
import torch

model_path = "/path/to/model.pth"
model = torch.load(model_path)
for key in model:
    print(f"{key} Shape: {model[key].shape}")
```
The output looks like:
```
tok_embeddings.weight Shape: torch.Size([32000, 4096])
norm.weight Shape: torch.Size([4096])
output.weight Shape: torch.Size([32000, 4096])
layers.0.attention.wq.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wk.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wv.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wo.weight Shape: torch.Size([4096, 4096])
layers.0.feed_forward.w1.weight Shape: torch.Size([11008, 4096])
... ...
```
The code above prints the "key" of every Op in the model. The part of each key before ".weight" must match the corresponding name parameter in mllm. For example, the key "layers.0.attention.wq.weight" corresponds to the mllm name parameter "layers.0.attention.wq".
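To make the mapping concrete, here is a minimal sketch of how init() could assign these names so they line up with the keys printed above. The fields blk_name, token_embd_name, post_norm_name, and lm_head_name also appear in the LLaMA model below; the attention-projection fields are illustrative assumptions, not mllm's actual members:

```cpp
// Sketch only: the attention-projection fields are assumptions for
// illustration. Composed names must equal the .pth keys minus ".weight",
// e.g. "layers.0.attention.wq".
void XXXNameConfig::init() {
    token_embd_name = "tok_embeddings"; // -> "tok_embeddings.weight"
    post_norm_name = "norm";            // -> "norm.weight"
    lm_head_name = "output";            // -> "output.weight"
    blk_name = "layers.";               // the block index is appended per layer
    _attn_base_name = "attention.";     // hypothetical field
    _q_proj_name = "wq";                // hypothetical field
    _k_proj_name = "wk";
    _v_proj_name = "wv";
    _o_proj_name = "wo";
}
```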
Add the Model File
- Add modeling_XXX.hpp under the "models/model_name" folder.
- Implement XXXModel.
An example from LLaMA:
```cpp
class LLaMAModel final : public Module {
    Layer embedding;
    vector<LLaMABlock> blocks;
    Layer norm;
    Layer lm_head;

public:
    explicit LLaMAModel(const LLaMAConfig &config) :
        LLaMAModel(config.vocab_size, config.hidden_dim, config.head_size,
                   config.ffn_hidden, config.block_num, config.RoPE_type,
                   config.cache_limit, config.names_config,
                   config.names_config.blk_name) {
    }
    LLaMAModel(int vocab_size, int hidden_dim, int head_size, int ffn_hidden,
               int block_num, RoPEType RoPE_type, int cache_limit,
               const LLaMANameConfig &names, const string &base_name) {
        embedding = Embedding(vocab_size, hidden_dim, names.token_embd_name);
        blocks = List<LLaMABlock>(block_num, hidden_dim, head_size, ffn_hidden,
                                  RoPE_type, cache_limit, names, base_name);
        norm = RMSNorm(hidden_dim, 1e-6, names.post_norm_name);
        lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
    }

    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        auto x = embedding(inputs[0]);
        for (auto &block : blocks) {
            x = block({x})[0];
        }
        x = norm(x);
        x = lm_head(x);
        return {x};
    }
};
```
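LLaMABlock used above is itself a Module composed of attention and feed-forward sub-layers. The following sketch shows the general pattern only; the MultiHeadAttention/FeedForward sub-modules, their constructor signatures, and the name fields here are assumptions for illustration, so consult modeling_llama.hpp for the real code:

```cpp
// Illustrative sketch of a transformer block as an mllm Module.
// Sub-module types, constructor parameters, and name fields are
// assumptions, not mllm's exact API.
class XXXBlock final : public Module {
    MultiHeadAttention attention; // hypothetical sub-module
    FeedForward mlp;              // hypothetical sub-module
    Layer norm1, norm2;

public:
    XXXBlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type,
             int cache_limit, const XXXNameConfig &names, const string &base_name) {
        // Each sub-layer's name is built from base_name so that its weights
        // resolve to keys like "layers.0.attention.wq".
        attention = MultiHeadAttention(hidden_dim, head_size, RoPE_type, cache_limit,
                                       names, base_name + names._attn_base_name);
        mlp = FeedForward(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name);
        norm1 = RMSNorm(hidden_dim, 1e-6, base_name + names._attn_norm_name);
        norm2 = RMSNorm(hidden_dim, 1e-6, base_name + names._ffn_norm_name);
    }

    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        // Pre-norm residual structure: x = x + Attn(norm(x)); x = x + FFN(norm(x)).
        auto x = attention({norm1(inputs[0])})[0] + inputs[0];
        x = mlp({norm2(x)})[0] + x;
        return {x};
    }
};
```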
Use the Model
Create a new file demo_XXX.cpp in the example folder.
An example from LLaMA:
```cpp
#include <iostream>
#include "cmdline.h"
#include "models/llama/modeling_llama.hpp"
#include "models/llama/tokenization_llama.hpp"
#include "processor/PostProcess.hpp"

using namespace mllm;

int main(int argc, char **argv) {
    cmdline::parser cmdParser;
    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
    cmdParser.parse_check(argc, argv);

    string vocab_path = cmdParser.get<string>("vocab");
    string model_path = cmdParser.get<string>("model");
    int tokens_limit = cmdParser.get<int>("limits");
    Layer::cpu_thread = cmdParser.get<int>("thread");

    auto tokenizer = LLaMATokenizer(vocab_path);

    LLaMAConfig config(tokens_limit, "7B", LLAMAROPE);
    auto model = LLaMAModel(config);
    model.load(model_path);

    vector<string> in_strs = {
        " Hello, who are you?",
        " What can you do?",
        "Please introduce Beijing University of Posts and Telecommunications."};

    for (int i = 0; i < in_strs.size(); ++i) {
        auto in_str = in_strs[i];
        auto input_tensor = tokenizer.tokenize(in_str, i);
        std::cout << "[Q] " << in_str << std::endl;
        std::cout << "[A] " << std::flush;
        for (int step = 0; step < 100; step++) {
            auto result = model({input_tensor});
            auto outputs = tokenizer.detokenize(result[0]);
            auto out_string = outputs.first;
            auto out_token = outputs.second;
            if (out_token == 2) {
                break;
            }
            std::cout << out_string << std::flush;
            chatPostProcessing(out_token, input_tensor, {});
        }
        printf("\n");
    }
    return 0;
}
```
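In the generation loop above, out_token == 2 is LLaMA's end-of-sequence token id, so decoding stops once the model emits it (or after the 100-step cap). Otherwise, chatPostProcessing writes the newly generated token back into input_tensor, so the next forward pass continues the sequence autoregressively.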