pymllm.configs.server_config

Classes

ServerConfig

Centralized runtime configuration for the MLLM server.

Module Contents

class pymllm.configs.server_config.ServerConfig

Centralized runtime configuration for the MLLM server.

model_path: pathlib.Path | None = None
tokenizer_path: pathlib.Path | None = None
tokenizer_mode: Literal['auto', 'slow', 'fast'] = 'auto'
load_format: Literal['auto', 'safetensors'] = 'auto'
trust_remote_code: bool = False
download_dir: pathlib.Path | None = None
context_length: int | None = None
dtype: Literal['auto', 'float16', 'bfloat16', 'float32'] = 'auto'
host: str = '127.0.0.1'
port: int = 30000
fastapi_root_path: str = ''
api_key: str | None = None
admin_api_key: str | None = None
served_model_name: str | None = None
file_storage_path: pathlib.Path
cors_allow_origins: list[str] = ['*']
mem_fraction_static: float | None = None
max_running_requests: int | None = 1
max_queued_requests: int | None = None
max_total_tokens: int | None = None
chunked_prefill_size: int | None = None
max_prefill_tokens: int | None = None
schedule_policy: Literal['auto', 'fcfs'] = 'fcfs'
schedule_conservativeness: float = 1.0
sleep_on_idle: bool = False
stream_interval: int = 1
stream_output: bool = True
base_gpu_id: int = 0
attention_backend: Literal['auto', 'flashinfer'] = 'auto'
gdn_decode_backend: Literal['auto', 'flashinfer', 'mllm_kernel', 'pytorch'] = 'auto'
sampling_backend: str | None = None
disable_cuda_graph: bool = False
enable_torch_compile: bool = False
torch_compile_max_bs: int = 32
random_seed: int | None = 42
reasoning_parser: str | None = None
tool_call_parser: str | None = None
log_level: Literal['debug', 'info', 'warning', 'error', 'critical'] = 'info'
enable_metrics: bool = False
show_time_cost: bool = False
decode_log_interval: int = 40
enable_shared_queue: bool = False
disable_radix_cache: bool = False
radix_cache_page_size: int = 1
enable_mamba_cache: bool = False
tensor_transport_mode: str = 'default'
cuda_ipc_pool_size_mb: int = 512
cuda_ipc_recycle_interval: float = 0.1
extra_options: dict[str, Any]
__post_init__()

Standard dataclass post-initialization hook, invoked automatically after the generated __init__; presumably validates and normalizes the configuration fields (e.g. paths and derived defaults) — confirm against the implementation.

Return type:

None