Model Development Guide

Developers first need to create a folder named after the model in the models directory. Then add the new model by following the steps below:

  1. Add Tokenizer or Processor
  2. Add Configuration
  3. Add Model
  4. Use Model

Add Tokenizer or Processor

  1. Add tokenization_XXX.hpp or processing_XXX.hpp in the “models/XXX” folder.
  2. Build XXXTokenizer/Processor.

Example:

#ifndef TOKENIZATION_XXX_HPP
#define TOKENIZATION_XXX_HPP
namespace mllm {
class XXXTokenizer final {
    // ... member variables (vocabulary, etc.) ...
public:
    explicit XXXTokenizer(const std::string &vocab_file) {
        // ... load the vocabulary from vocab_file ...
    }
    Tensor tokenize(std::string &text, int str_i = 0) const {
        // ... convert the input text into a token-id Tensor ...
    }
    std::pair<std::string, unsigned> detokenize(Tensor &result) {
        // ... map the model output back to a string and the last token id ...
    }
};
} // namespace mllm
#endif // TOKENIZATION_XXX_HPP
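
Once the tokenizer is in place, it is used roughly as follows. This is only a sketch that relies on the interface declared above; the vocabulary path and the surrounding model code are placeholders.

auto tokenizer = XXXTokenizer("/path/to/xxx_vocab.mllm");
std::string prompt = "Hello, who are you?";
Tensor input_tensor = tokenizer.tokenize(prompt); // text -> token-id Tensor
// ... run the model on input_tensor to obtain a result Tensor ...
// auto outputs = tokenizer.detokenize(result);   // -> {decoded text, last token id}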

Add Configuration

  1. Add configuration_XXX.hpp in the “models/XXX” folder.
  2. Build XXXNameConfig and XXXConfig.

Example:

#ifndef CONFIG_XXX_HPP
#define CONFIG_XXX_HPP
#include "models/transformer/configuration_transformer.hpp"
using namespace mllm;

class XXXNameConfig : public TransformerNameConfig {
public:
    std::string xxx_name;
    // ... other operation names ...
    void init() {
        // ... assign the name strings ...
    }
};

class XXXConfig {
public:
    XXXNameConfig names_config;
    // ... model hyper-parameters (hidden size, head count, etc.) ...
    explicit XXXConfig() {
        names_config.init();
        // ... set the hyper-parameters ...
    }
};
#endif // CONFIG_XXX_HPP

Note: The “name” parameters in “XXXNameConfig” identify the operations in the model. Each name must be unique and must match the corresponding name in the original (unconverted) PyTorch/SafeTensors model.

For example, you can inspect the names of operations in a “.pth” model using the following code:

import torch

model_path = "/path/to/model.pth"
model = torch.load(model_path)
for key in model:
    print(f"{key} Shape: {model[key].shape}")

The output will be as follows:

tok_embeddings.weight Shape: torch.Size([32000, 4096])
norm.weight Shape: torch.Size([4096])
output.weight Shape: torch.Size([32000, 4096])
layers.0.attention.wq.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wk.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wv.weight Shape: torch.Size([4096, 4096])
layers.0.attention.wo.weight Shape: torch.Size([4096, 4096])
layers.0.feed_forward.w1.weight Shape: torch.Size([11008, 4096])
... ...

The code above prints the keys of all weights in the model. The part of each key before “.weight” is what the “name” parameter in the MLLM code must match. For instance, if the key is "layers.0.attention.wq.weight", the corresponding “name” parameter in MLLM should be "layers.0.attention.wq".
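
For illustration, such a mapping could be written into the name configuration roughly as follows. This is a minimal sketch based on the checkpoint keys above: the members token_embd_name, post_norm_name, lm_head_name, and blk_name are the ones used by the LLaMAModel example in the next step, _wq_name is a hypothetical placeholder, and the exact way per-layer prefixes ("layers.0.", "layers.1.", ...) are composed depends on how the block list uses blk_name as its base name.

class XXXNameConfig : public TransformerNameConfig {
public:
    std::string token_embd_name;
    std::string post_norm_name;
    std::string lm_head_name;
    std::string blk_name;
    std::string _wq_name; // hypothetical: query-projection sub-name
    void init() {
        // Each string is a checkpoint key with the ".weight" suffix removed.
        token_embd_name = "tok_embeddings"; // tok_embeddings.weight
        post_norm_name  = "norm";           // norm.weight
        lm_head_name    = "output";         // output.weight
        blk_name        = "layers.";        // per-layer prefix
        _wq_name        = "attention.wq";   // layers.0.attention.wq.weight
    }
};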

Add Model

  1. Add modeling_XXX.hpp in the “models/XXX” folder.
  2. Build XXXModel.

Using LLaMA as an example:

class LLaMAModel final : public Module {
    Layer embedding;
    vector<LLaMABlock> blocks;
    Layer norm;
    Layer lm_head;

public:
    explicit LLaMAModel(const LLaMAConfig &config) :
        LLaMAModel(config.vocab_size, config.hidden_dim, config.head_size, config.ffn_hidden, config.block_num,
                   config.RoPE_type, config.cache_limit, config.names_config, config.names_config.blk_name) {
    }
    LLaMAModel(int vocab_size, int hidden_dim, int head_size, int ffn_hidden, int block_num, RoPEType RoPE_type, int cache_limit,
               const LLaMANameConfig &names, const string &base_name) {
        embedding = Embedding(vocab_size, hidden_dim, names.token_embd_name);
        blocks = List<LLaMABlock>(block_num, hidden_dim, head_size, ffn_hidden, RoPE_type, cache_limit, names, base_name);
        norm = RMSNorm(hidden_dim, 1e-6, names.post_norm_name);
        lm_head = Linear(hidden_dim, vocab_size, false, names.lm_head_name);
    }
    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        // token embedding -> transformer blocks -> final norm -> LM head
        auto x = embedding(inputs[0]);
        for (auto &block : blocks) {
            x = block({x})[0];
        }
        x = norm(x);
        x = lm_head(x);
        return {x};
    }
};

Use Model

Create a new file demo_XXX.cpp in the example directory.

Using LLaMA as an example:

#include <iostream>
#include "cmdline.h"
#include "models/llama/modeling_llama.hpp"
#include "models/llama/tokenization_llama.hpp"
#include "processor/PostProcess.hpp"
using namespace mllm;

int main(int argc, char **argv) {
    cmdline::parser cmdParser;
    cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/llama_vocab.mllm");
    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/llama-2-7b-chat-q4_k.mllm");
    cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
    cmdParser.add<int>("thread", 't', "num of threads", false, 4);
    cmdParser.parse_check(argc, argv);
    string vocab_path = cmdParser.get<string>("vocab");
    string model_path = cmdParser.get<string>("model");
    int tokens_limit = cmdParser.get<int>("limits");
    Layer::cpu_thread = cmdParser.get<int>("thread");

    auto tokenizer = LLaMATokenizer(vocab_path);
    LLaMAConfig config(tokens_limit, "7B", LLAMAROPE);
    auto model = LLaMAModel(config);
    model.load(model_path);

    vector<string> in_strs = {
        " Hello, who are you?",
        " What can you do?",
        "Please introduce Beijing University of Posts and Telecommunications."};
    for (int i = 0; i < in_strs.size(); ++i) {
        auto in_str = in_strs[i];
        auto input_tensor = tokenizer.tokenize(in_str, i);
        std::cout << "[Q] " << in_str << std::endl;
        std::cout << "[A] " << std::flush;
        for (int step = 0; step < 100; step++) {
            auto result = model({input_tensor});
            auto outputs = tokenizer.detokenize(result[0]);
            auto out_string = outputs.first;
            auto out_token = outputs.second;
            if (out_token == 2) { // token id 2 is the end-of-sequence token
                break;
            }
            std::cout << out_string << std::flush;
            chatPostProcessing(out_token, input_tensor, {}); // prepare input_tensor for the next decoding step
        }
        printf("\n");
    }
    return 0;
}
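
Assuming the demo is built into an executable (the name demo_llama below is only an example; the actual target name depends on the project's build configuration), it can be run with the options defined above:

./demo_llama -m ../models/llama-2-7b-chat-q4_k.mllm -v ../vocab/llama_vocab.mllm -l 400 -t 4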