Index _ | A | B | C | D | E | F | G | H | I | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Z _ __add__() (pymllm.mobile.ffi.Tensor method) __class_vars__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __contains__() (pymllm.mem_cache.mamba_radix_cache.LRUList method) __del__() (pymllm.orchestrator.cuda_ipc_transport.ShmSyncBuffer method) __div__() (pymllm.mobile.ffi.Tensor method) __enter__() (pymllm.mobile.utils.adb.ShellContext method) __exit__() (pymllm.mobile.utils.adb.ShellContext method) __getattr__() (pymllm.configs.model_config.ModelConfig method) __getitem__() (pymllm.engine.io_struct.BatchTokenizedGenerateReqInput method) (pymllm.engine.io_struct.GenerateReqInput method) (pymllm.mem_cache.base_prefix_cache.RadixKey method) __getstate__() (pymllm.orchestrator.cuda_ipc_transport.TransportProxyTensor method) __iter__() (pymllm.engine.io_struct.BatchTokenizedGenerateReqInput method) (pymllm.mem_cache.base_prefix_cache.RadixKey method) __len__() (pymllm.engine.io_struct.BatchTokenizedGenerateReqInput method) (pymllm.mem_cache.base_prefix_cache.RadixKey method) (pymllm.mem_cache.mamba_radix_cache.LRUList method) __lt__() (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode method) (pymllm.mem_cache.radix_cache.TreeNode method) __mul__() (pymllm.mobile.ffi.Tensor method) __neg__() (pymllm.mobile.ffi.Tensor method) __post_init__() (pymllm.configs.server_config.ServerConfig method) __private_attributes__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_complete__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_computed_fields__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_core_schema__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_custom_init__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_decorators__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_extra__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_fields__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_fields_set__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_generic_metadata__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_parent_namespace__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_post_init__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_private__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_root_model__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_serializer__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __pydantic_validator__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __repr__() (pymllm.mem_cache.base_prefix_cache.RadixKey method) (pymllm.orchestrator.scheduler_process.Req method) (pymllm.orchestrator.scheduler_process.ScheduleBatch method) (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig method) __setstate__() (pymllm.orchestrator.cuda_ipc_transport.TransportProxyTensor method) __signature__ (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.ImageUrl attribute) (pymllm.server.launch.StreamOptions attribute) (pymllm.server.launch.Tool attribute) (pymllm.server.launch.ToolFunction attribute) __slots__ (pymllm.mem_cache.base_prefix_cache.RadixKey attribute) (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) (pymllm.mem_cache.radix_cache.TreeNode attribute) (pymllm.orchestrator.scheduler_process.Req attribute) __str__() (pymllm.mobile.ffi.Tensor method) __sub__() (pymllm.mobile.ffi.Tensor method) A A_log (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) abort() (pymllm.orchestrator.scheduler_process.Req method) abort_request() (in module pymllm.server.launch) (pymllm.orchestrator.request_response_process.RequestResponseProcess method) AbortRequest (class in pymllm.server.launch) abs() (pymllm.mobile.ffi.Tensor method) act_fn (pymllm.models.qwen3_vl.Qwen3VLVisionPatchMerger attribute) act_quant (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinear attribute) activation (pymllm.layers.mlp.MLP attribute) (pymllm.layers.mlp.ParallelMLP attribute) activation_post_process (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize attribute) ActivationQDQ (class in pymllm.mobile.backends.qualcomm.transformers.core.qdq) adb_path (pymllm.mobile.utils.adb.ADBToolkit attribute) (pymllm.mobile.utils.adb.ShellContext attribute) ADBToolkit (class in pymllm.mobile.utils.adb) add_observer() (pymllm.mobile.backends.qualcomm.transformers.core.observer.ConcatObserver method) add_request() (pymllm.orchestrator.request_response_process.RequestResponseProcess method) admin_api_key (pymllm.configs.server_config.ServerConfig attribute) all_gather() (pymllm.orchestrator.group_coordinator.GroupCoordinator method) all_reduce() (pymllm.orchestrator.group_coordinator.GroupCoordinator method) alloc() (pymllm.mem_cache.memory_pool.ReqToTokenPool method) (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator method) alloc_track_slot() (pymllm.mem_cache.memory_pool.GDNPool method) api_key (pymllm.configs.server_config.ServerConfig attribute) app (in module pymllm.mobile.service.network) (in module pymllm.server.launch) apply() (pymllm.layers.quantize_base.LinearMethodBase method) (pymllm.layers.quantize_base.QuantizeMethodBase method) (pymllm.layers.quantize_base.UnquantizedLinearMethod method) (pymllm.quantization.methods.awq_marlin.AWQMarlinLinearMethod method) apply_llama31_rope() (in module pymllm.layers.rope) apply_llama31_rope_pos_ids() (in module pymllm.layers.rope) apply_mrope() (in module pymllm.layers.rope) apply_rope() (in module pymllm.layers.rope) apply_rope_pos_ids() (in module pymllm.layers.rope) apply_rope_with_cos_sin_cache() (in module pymllm.layers.rope) arange() (in module pymllm.mobile.ffi) area (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryChunk attribute) ARGeneration (C++ class) ARGeneration::categoricalSample (C++ function) ARGeneration::do_sample_ (C++ member) ARGeneration::eos_token_id_ (C++ member) ARGeneration::forward (C++ function) ARGeneration::generate (C++ function) ARGeneration::getLastLogits (C++ function) ARGeneration::max_length_ (C++ member) ARGeneration::sampleFromDistribution (C++ function) ARGeneration::sampleGreedy (C++ function) ARGeneration::sampleTemperature (C++ function) ARGeneration::sampleTopK (C++ function) ARGeneration::sampleTopP (C++ function) ARGeneration::streamGenerate (C++ function) ARGeneration::trace (C++ function) arguments (pymllm.parsers.tool_call_parser.ToolCallItem attribute) attention_backend (pymllm.configs.server_config.ServerConfig attribute) AttentionBackend (class in pymllm.layers.attention.attention_backend) AttentionType (class in pymllm.layers.attention.radix_attention) attn (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VisionBlock attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) attn_backend (pymllm.engine.forward_batch.ForwardBatch attribute) (pymllm.executor.model_runner.ModelRunner attribute) attn_output_gate (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) attn_type (pymllm.layers.attention.radix_attention.RadixAttention attribute) audio_data (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.server.launch.GenerateRequest attribute) available_chunks (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryPool attribute) available_size() (pymllm.mem_cache.memory_pool.ReqToTokenPool method) (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator method) awq_to_marlin_zero_points() (in module pymllm.quantization.methods.awq_marlin) AWQMarlinConfig (class in pymllm.quantization.methods.awq_marlin) AWQMarlinLinearMethod (class in pymllm.quantization.methods.awq_marlin) B backend (pymllm.orchestrator.group_coordinator.GroupCoordinator attribute) base_gpu_id (pymllm.configs.server_config.ServerConfig attribute) base_model_prefix (modeling_llama.LlamaForQuestionAnswering attribute) (modeling_llama.LlamaPreTrainedModel attribute) (modeling_qwen2.Qwen2ForQuestionAnswering attribute) (modeling_qwen2.Qwen2PreTrainedModel attribute) (modeling_qwen3.Qwen3ForQuestionAnswering attribute) (modeling_qwen3.Qwen3PreTrainedModel attribute) BaseBatchReq (class in pymllm.engine.io_struct) BaseOp (class in pymllm.mobile.ffi) BasePrefixCache (class in pymllm.mem_cache.base_prefix_cache) BaseReq (class in pymllm.engine.io_struct) batch_size (pymllm.engine.forward_batch.ForwardBatch attribute) (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.ScheduleBatch property) BatchStrOutput (class in pymllm.engine.io_struct) BatchTokenIDOutput (class in pymllm.engine.io_struct) BatchTokenizedGenerateReqInput (class in pymllm.engine.io_struct) bfloat16 (in module pymllm.mobile.ffi) bfloat16_() (in module pymllm.mobile.ffi) bias (pymllm.layers.layer_norm.LayerNorm attribute) bits (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ attribute) bitwidth_of_scale (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamObserver attribute) block_size (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize attribute) (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamObserver attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearLPBQ attribute) blocks (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) bool_() (in module pymllm.mobile.ffi) boolean (in module pymllm.mobile.ffi) broadcast() (pymllm.orchestrator.group_coordinator.GroupCoordinator method) buffer (pymllm.orchestrator.cuda_ipc_transport.ShmSyncBuffer attribute) buffers (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) build_cast2fp32_pipeline() (in module pymllm.mobile.quantize.pipeline) build_raw_pipeline() (in module pymllm.mobile.quantize.pipeline) build_w4a32_kai_pipeline() (in module pymllm.mobile.quantize.pipeline) BUILTIN_QUANTIZE_PASS (in module pymllm.mobile.quantize.pipeline) BUILTIN_QUANTIZE_PIPELINE (in module pymllm.mobile.quantize.pipeline) C cache_indices (pymllm.layers.attention.gdn_backend.GDNForwardMetadata attribute) calculate_qparams() (pymllm.mobile.backends.qualcomm.transformers.core.observer.ConcatObserver method) (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize method) (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamObserver method) calibrate() (runner.LlamaQuantizer method) calibrated (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamObserver attribute) can_run() (pymllm.executor.cuda_graph_runner.CudaGraphRunner method) capture() (pymllm.executor.cuda_graph_runner.CudaGraphRunner method) capture_bs (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) Cast2Fp32QuantizePass (class in pymllm.mobile.quantize.cast2fp32_pass) CausalMask (C++ class) CausalMask::CausalMask::CausalMask (C++ function), [1], [2] chain_speculative_sampling() (in module pymllm.layers.sampling) channels (pymllm.layers.gated_delta_net.GDNConv1d attribute) chat_template_kwargs (pymllm.server.launch.ChatCompletionRequest attribute) ChatCompletionRequest (class in pymllm.mobile.service.network) (class in pymllm.server.launch) ChatMessage (class in pymllm.server.launch) check_finished() (pymllm.orchestrator.scheduler_process.Req method) children (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) (pymllm.mem_cache.radix_cache.TreeNode attribute) ChunkCache (class in pymllm.mem_cache.chunk_cache) chunked_prefill_size (pymllm.configs.server_config.ServerConfig attribute) cleanup() (pymllm.orchestrator.shared_memory_queue.SharedMemoryManager static method) cleanup_ipc_files() (in module pymllm.orchestrator.ipc_utils) clear() (pymllm.mem_cache.memory_pool.ReqToTokenPool method) (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator method) cli_app (in module pymllm.mobile.service.tools) clip() (pymllm.mobile.ffi.Tensor method) clone() (pymllm.mobile.ffi.Tensor method) close() (pymllm.mobile.utils.adb.ShellContext method) (pymllm.orchestrator.shared_memory_queue.TensorQueue method) close_zmq_socket() (in module pymllm.orchestrator.ipc_utils) ColumnParallelLinear (class in pymllm.layers.linear) compile() (runner.LlamaQuantizer method) compile_bs (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) completion_tokens (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) CompletionRequest (class in pymllm.server.launch) compute_node_hash() (pymllm.mem_cache.radix_cache.RadixCache method) ConcatObserver (class in pymllm.mobile.backends.qualcomm.transformers.core.observer) config (modeling_llama.LlamaPreTrainedModel attribute) (modeling_qwen2.Qwen2ForCausalLM attribute) (modeling_qwen2.Qwen2PreTrainedModel attribute) (modeling_qwen3.Qwen3ForCausalLM attribute) (modeling_qwen3.Qwen3PreTrainedModel attribute) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration attribute) (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration attribute) content (pymllm.server.launch.ChatMessage attribute) ContentPart (class in pymllm.server.launch) context_len (pymllm.executor.model_runner.ModelRunner attribute) context_length (pymllm.configs.server_config.ServerConfig attribute) contiguous() (pymllm.mobile.ffi.Tensor method) continuous_usage_stats (pymllm.server.launch.StreamOptions attribute) conv1d (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) Conv3D (C++ class) Conv3D::Conv3D::bias (C++ function) Conv3D::Conv3D::Conv3D (C++ function), [1], [2] Conv3D::Conv3D::weight (C++ function) conv_dim (pymllm.mem_cache.memory_pool.GDNPool attribute) conv_kernel_size (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.mem_cache.memory_pool.GDNPool attribute) conv_state (pymllm.mem_cache.memory_pool.GDNPool attribute) conv_weight (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) convert() (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize method) (runner.LlamaQuantizer method) convert_rope_for_deploy() (modeling_llama.LlamaModel method) (modeling_qwen2.Qwen2Model method) (modeling_qwen3.Qwen3Model method) convert_to_conv2d_deploy_hwio() (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearLPBQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearW8A16_PerChannelSym method) convert_to_deploy() (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding method) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearW8A16_PerChannelSym method) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm method) convert_weight() (in module runner) copy_lm_head_weight_from_embed_tokens() (modeling_llama.LlamaForCausalLM method) (modeling_qwen2.Qwen2ForCausalLM method) (modeling_qwen3.Qwen3ForCausalLM method) copy_states() (pymllm.mem_cache.memory_pool.GDNPool method) cors_allow_origins (pymllm.configs.server_config.ServerConfig attribute) cos_embedding_input_qdq (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) cpu (in module pymllm.mobile.ffi) cpu() (pymllm.mobile.ffi.Tensor method) cpu_() (in module pymllm.mobile.ffi) create() (pymllm.mobile.ffi.SoftmaxOp static method) create_session() (in module pymllm.mobile.service.models_hub) create_weights() (pymllm.layers.quantize_base.LinearMethodBase method) (pymllm.layers.quantize_base.QuantizeMethodBase method) (pymllm.layers.quantize_base.UnquantizedLinearMethod method) (pymllm.quantization.methods.awq_marlin.AWQMarlinLinearMethod method) create_zmq_socket() (in module pymllm.orchestrator.ipc_utils) created_at (pymllm.orchestrator.request_response_process.ReqState attribute) CROSS_ATTENTION (pymllm.layers.attention.flashinfer_backend.WrapperDispatch attribute) cu_seqlens (pymllm.layers.attention.gdn_backend.GDNForwardMetadata attribute) cuda (in module pymllm.mobile.ffi) cuda() (pymllm.mobile.ffi.Tensor method) cuda_() (in module pymllm.mobile.ffi) cuda_ipc_pool_size_mb (pymllm.configs.server_config.ServerConfig attribute) cuda_ipc_recycle_interval (pymllm.configs.server_config.ServerConfig attribute) CudaGraphRunner (class in pymllm.executor.cuda_graph_runner) CudaIpcTensorTransportProxy (class in pymllm.orchestrator.cuda_ipc_transport) D data_parallel_all_reduce() (in module pymllm.orchestrator.parallel_state) dec_lock_ref() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.chunk_cache.ChunkCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) DECODE (pymllm.engine.forward_batch.ForwardMode attribute) decode_cuda_graph_metadata (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) decode_ids (pymllm.engine.io_struct.BatchTokenIDOutput attribute) decode_log_interval (pymllm.configs.server_config.ServerConfig attribute) decode_use_tensor_cores (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) decode_wrappers (pymllm.layers.attention.flashinfer_backend.DecodeMetadata attribute) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) DecodeMetadata (class in pymllm.layers.attention.flashinfer_backend) DECODER (pymllm.layers.attention.radix_attention.AttentionType attribute) DECODER_BIDIRECTIONAL (pymllm.layers.attention.radix_attention.AttentionType attribute) deepstack_merger_list (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) deepstack_visual_indexes (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) DEFAULT_EPS_16BIT (in module pymllm.mobile.backends.qualcomm.transformers.core.qdq) DEFAULT_EPS_8BIT (in module pymllm.mobile.backends.qualcomm.transformers.core.qdq) deploy_mode (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinear attribute) description (pymllm.server.launch.ToolFunction attribute) detail (pymllm.server.launch.ImageUrl attribute) DetokenizerProcess (class in pymllm.orchestrator.detokenizer_process) Device (class in pymllm.mobile.ffi) device (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) (pymllm.layers.attention.gdn_backend.GDNAttnBackend attribute) (pymllm.mem_cache.chunk_cache.ChunkCache attribute) (pymllm.mem_cache.memory_pool.GDNPool attribute) (pymllm.mem_cache.memory_pool.KVPool attribute) (pymllm.mem_cache.memory_pool.ReqToTokenPool attribute) (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator attribute) (pymllm.mobile.ffi.Tensor property) (pymllm.models.qwen3_vl.Qwen3VLVisionModel property) device() (in module pymllm.mobile.ffi) device_id (pymllm.mobile.utils.adb.ShellContext attribute) disable_cuda_graph (pymllm.configs.server_config.ServerConfig attribute) disable_fake_quant() (in module runner) (runner.LlamaQuantizer method) disable_fakequant() (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearLPBQ method) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm method) disable_observer() (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ method) disable_qdq_observer() (in module runner) disable_radix_cache (pymllm.configs.server_config.ServerConfig attribute) divide() (in module pymllm.orchestrator.group_coordinator) down_proj (pymllm.layers.mlp.MLP attribute) (pymllm.layers.mlp.ParallelMLP attribute) download_dir (pymllm.configs.server_config.ServerConfig attribute) download_mllm_model() (in module pymllm.mobile.service.models_hub) dt_bias (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) DType (class in pymllm.mobile.ffi) dtype (pymllm.configs.server_config.ServerConfig attribute) (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.mem_cache.memory_pool.GDNPool attribute) (pymllm.mem_cache.memory_pool.KVPool attribute) (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize attribute) (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamObserver attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ attribute) (pymllm.mobile.ffi.Tensor property) (pymllm.models.qwen3_vl.Qwen3VLVisionModel property) E echo (pymllm.server.launch.CompletionRequest attribute) echo() (in module pymllm.mobile.ffi) embed_dim (pymllm.models.qwen3_vl.Qwen3VisionAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchEmbed attribute) embed_tokens (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) (pymllm.models.qwen3_vl.Qwen3VLTextModel attribute) Embedding (C++ class) Embedding::Embedding::Embedding (C++ function), [1], [2] Embedding::Embedding::weight (C++ function) embedding_dim (pymllm.layers.embedding.VocabParallelEmbedding attribute) (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding attribute) embedding_forward_tp8_worker_cuda() (in module test_vocab_parallel_embedding) empty() (in module pymllm.mobile.ffi) (pymllm.orchestrator.shared_memory_queue.TensorQueue method) enable_activation_update() (runner.LlamaQuantizer method) enable_fake_quant() (in module runner) (runner.LlamaQuantizer method) enable_fakequant() (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearLPBQ method) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm method) enable_faulthandler() (in module pymllm.mobile.utils.error_handler) enable_mamba_cache (pymllm.configs.server_config.ServerConfig attribute) enable_metrics (pymllm.configs.server_config.ServerConfig attribute) enable_observer() (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ method) enable_pdl (pymllm.layers.mlp.MLP attribute) (pymllm.layers.mlp.ParallelMLP attribute) enable_qdq_observer() (in module runner) enable_shared_queue (pymllm.configs.server_config.ServerConfig attribute) enable_thinking (pymllm.mobile.service.network.ChatCompletionRequest attribute) enable_torch_compile (pymllm.configs.server_config.ServerConfig attribute) (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) ENCODER_ONLY (pymllm.layers.attention.radix_attention.AttentionType attribute) end (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryChunk property) Engine (class in pymllm.engine.launch) eps (pymllm.layers.layer_norm.LayerNorm attribute) (pymllm.layers.rms_norm.GemmaRMSNorm attribute) (pymllm.layers.rms_norm.RMSNorm attribute) (pymllm.layers.rms_norm_gated.RMSNormGated attribute) (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize attribute) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm attribute) event (pymllm.orchestrator.request_response_process.ReqState attribute) event_loop() (pymllm.orchestrator.detokenizer_process.DetokenizerProcess method) (pymllm.orchestrator.scheduler_process.SchedulerProcess method) (pymllm.orchestrator.tokenizer_process.TokenizerProcess method) evict() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.chunk_cache.ChunkCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) evictable_size() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) evicted (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode property) (pymllm.mem_cache.radix_cache.TreeNode property) EvictResult (class in pymllm.mem_cache.base_prefix_cache) execute() (pymllm.mobile.utils.adb.ShellContext method) execute_command() (pymllm.mobile.utils.adb.ADBToolkit method) EXTEND (pymllm.engine.forward_batch.ForwardMode attribute) extend_no_prefix (pymllm.layers.attention.flashinfer_backend.PrefillMetadata attribute) extend_num_tokens (pymllm.engine.forward_batch.ForwardBatch attribute) extend_prefix_lens (pymllm.engine.forward_batch.ForwardBatch attribute) extend_prefix_lens_cpu (pymllm.engine.forward_batch.ForwardBatch attribute) extend_seq_lens (pymllm.engine.forward_batch.ForwardBatch attribute) extend_seq_lens_cpu (pymllm.engine.forward_batch.ForwardBatch attribute) extend_start_loc (pymllm.engine.forward_batch.ForwardBatch attribute) extra_key (pymllm.mem_cache.base_prefix_cache.RadixKey attribute) extra_options (pymllm.configs.server_config.ServerConfig attribute) (pymllm.engine.io_struct.GenerateReqInput attribute) extra_repr() (pymllm.layers.attention.radix_attention.RadixAttention method) (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention method) (pymllm.layers.rms_norm_gated.RMSNormGated method) (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm method) F fake_quant (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ attribute) fast_pos_embed_interpolate() (pymllm.models.qwen3_vl.Qwen3VLVisionModel method) fastapi_root_path (pymllm.configs.server_config.ServerConfig attribute) file_handler (pymllm.mobile.convertor.model_file_v2.ModelFileV2 attribute) file_path (pymllm.mobile.convertor.model_file_v2.ModelFileV2 attribute) file_storage_path (pymllm.configs.server_config.ServerConfig attribute) finalize() (pymllm.mobile.convertor.model_file_v2.ModelFileV2 method) finished (pymllm.orchestrator.request_response_process.ReqState attribute) finished_reason (pymllm.orchestrator.scheduler_process.Req attribute) finished_reasons (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) FixedActivationQDQ (class in pymllm.mobile.backends.qualcomm.transformers.core.qdq) FlashInferAttnBackend (class in pymllm.layers.attention.flashinfer_backend) float16 (in module pymllm.mobile.ffi) float16_() (in module pymllm.mobile.ffi) float32 (in module pymllm.mobile.ffi) float32_() (in module pymllm.mobile.ffi) flush() (pymllm.parsers.tool_call_parser.ToolCallParser method) flush_cache() (in module pymllm.server.launch) FooModule (class in test_nn) forward() (modeling_llama.LlamaForCausalLM method) (modeling_llama.LlamaModel method) (modeling_qwen2.Qwen2ForCausalLM method) (modeling_qwen2.Qwen2Model method) (modeling_qwen3.Qwen3ForCausalLM method) (modeling_qwen3.Qwen3Model method) (pymllm.executor.model_runner.ModelRunner method) (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.radix_attention.RadixAttention method) (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention method) (pymllm.layers.base.MllmBaseLayer method) (pymllm.layers.embedding.VocabParallelEmbedding method) (pymllm.layers.gated_delta_net.GatedDeltaNet method) (pymllm.layers.layer_norm.LayerNorm method) (pymllm.layers.linear.ColumnParallelLinear method) (pymllm.layers.linear.Linear method) (pymllm.layers.linear.RowParallelLinear method) (pymllm.layers.mlp.MLP method) (pymllm.layers.mlp.ParallelMLP method) (pymllm.layers.rms_norm.GemmaRMSNorm method) (pymllm.layers.rms_norm.RMSNorm method) (pymllm.layers.rms_norm_gated.RMSNormGated method) (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding method) (pymllm.mobile.backends.qualcomm.transformers.core.observer.ConcatObserver method) (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize method) (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamObserver method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinear method) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearLPBQ method) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearW8A16_PerChannelSym method) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm method) (pymllm.models.qwen3_5.Qwen3_5AttentionDecoderLayer method) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM method) (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration method) (pymllm.models.qwen3_5.Qwen3_5FullAttention method) (pymllm.models.qwen3_5.Qwen3_5LinearDecoderLayer method) (pymllm.models.qwen3_vl.Qwen3VisionAttention method) (pymllm.models.qwen3_vl.Qwen3VisionBlock method) (pymllm.models.qwen3_vl.Qwen3VisionMLP method) (pymllm.models.qwen3_vl.Qwen3VLAttention method) (pymllm.models.qwen3_vl.Qwen3VLDecoderLayer method) (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration method) (pymllm.models.qwen3_vl.Qwen3VLTextModel method) (pymllm.models.qwen3_vl.Qwen3VLVisionModel method) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchEmbed method) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchMerger method) (test_nn.FooModule method) forward_decode() (pymllm.executor.model_runner.ModelRunner method) (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend method) (pymllm.layers.attention.gdn_backend.GDNAttnBackend method) (pymllm.layers.attention.hybrid_backend.HybridAttnBackend method) forward_extend() (pymllm.executor.model_runner.ModelRunner method) (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend method) (pymllm.layers.attention.gdn_backend.GDNAttnBackend method) (pymllm.layers.attention.hybrid_backend.HybridAttnBackend method) forward_gdn() (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.gdn_backend.GDNAttnBackend method) (pymllm.layers.attention.hybrid_backend.HybridAttnBackend method) forward_metadata (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) (pymllm.layers.attention.gdn_backend.GDNAttnBackend attribute) forward_mode (pymllm.engine.forward_batch.ForwardBatch attribute) (pymllm.orchestrator.scheduler_process.ScheduleBatch attribute) forward_pass_id (pymllm.executor.model_runner.ModelRunner attribute) ForwardBatch (class in pymllm.engine.forward_batch) ForwardMode (class in pymllm.engine.forward_batch) free() (pymllm.mem_cache.memory_pool.ReqToTokenPool method) (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator method) free_group_begin() (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator method) free_group_end() (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator method) free_track_slot() (pymllm.mem_cache.memory_pool.GDNPool method) freeze_activation() (runner.LlamaQuantizer method) freeze_gc() (in module pymllm.executor.cuda_graph_runner) freeze_llama_embed_tokens_weight() (in module runner) freeze_llama_linear_weight() (in module runner) freeze_llama_rmsnorm_weight() (in module runner) freeze_weight() (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding method) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinear method) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm method) frequency_penalty (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) from_config() (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig class method) (pymllm.quantization.quant_config.QuantizationConfig class method) from_numpy() (in module pymllm.mobile.ffi) from_torch() (in module pymllm.mobile.ffi) full_attn_backend (pymllm.layers.attention.hybrid_backend.HybridAttnBackend attribute) full_attn_layer_ids (pymllm.layers.attention.hybrid_backend.HybridAttnBackend attribute) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration attribute) full_evicted (pymllm.mem_cache.base_prefix_cache.EvictResult attribute) full_lock_ref (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) full_lru (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache attribute) function (pymllm.server.launch.Tool attribute) G gate_proj (pymllm.layers.mlp.ParallelMLP attribute) GatedDeltaNet (class in pymllm.layers.gated_delta_net) gather_output (pymllm.layers.linear.ColumnParallelLinear attribute) gdn_backend (pymllm.layers.attention.hybrid_backend.HybridAttnBackend attribute) gdn_decode_backend (pymllm.configs.server_config.ServerConfig attribute) gdn_layer_idx (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) gdn_pool (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.layers.attention.gdn_backend.GDNAttnBackend attribute) GDNAttnBackend (class in pymllm.layers.attention.gdn_backend) GDNConv1d (class in pymllm.layers.gated_delta_net) GDNForwardMetadata (class in pymllm.layers.attention.gdn_backend) GDNPool (class in pymllm.mem_cache.memory_pool) GELU (C++ class) GELU::GELU::GELU (C++ function), [1] GemmaRMSNorm (class in pymllm.layers.rms_norm) generate() (in module pymllm.server.launch) (pymllm.engine.launch.Engine method) generate_async() (pymllm.engine.launch.Engine method) GenerateReqInput (class in pymllm.engine.io_struct) GenerateRequest (class in pymllm.server.launch) get() (pymllm.orchestrator.shared_memory_queue.TensorQueue method) get_available_gpu_memory() (in module pymllm.executor.model_runner) get_config_filenames() (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig static method) (pymllm.quantization.quant_config.QuantizationConfig static method) get_cuda_graph_seq_len_fill_value() (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend method) (pymllm.layers.attention.hybrid_backend.HybridAttnBackend method) get_data_parallel_rank() (in module pymllm.orchestrator.parallel_state) get_data_parallel_world_size() (in module pymllm.orchestrator.parallel_state) get_device_info() (pymllm.mobile.utils.adb.ADBToolkit method) get_devices() (pymllm.mobile.utils.adb.ADBToolkit method) get_download_model_path() (in module pymllm.mobile.service.models_hub) get_dp_group() (in module pymllm.orchestrator.parallel_state) get_global_config() (in module pymllm.configs.global_config) get_global_graph_memory_pool() (in module pymllm.executor.cuda_graph_runner) get_input_embeddings() (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration method) get_instance() (pymllm.configs.global_config.GlobalConfig class method) get_key_buffer() (pymllm.mem_cache.memory_pool.KVPool method) get_kv_buffer() (pymllm.mem_cache.memory_pool.KVPool method) get_layer_state() (pymllm.mem_cache.memory_pool.GDNPool method) get_lru_leaf_unlocked() (pymllm.mem_cache.mamba_radix_cache.LRUList method) get_lru_unlocked() (pymllm.mem_cache.mamba_radix_cache.LRUList method) get_min_capability() (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig class method) (pymllm.quantization.quant_config.QuantizationConfig class method) get_model_class() (in module pymllm.models) get_name() (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig method) (pymllm.quantization.quant_config.QuantizationConfig method) get_next_batch_to_run() (pymllm.orchestrator.scheduler_process.SchedulerProcess method) get_pipeline_model_parallel_rank() (in module pymllm.orchestrator.parallel_state) get_pipeline_model_parallel_world_size() (in module pymllm.orchestrator.parallel_state) get_pp_group() (in module pymllm.orchestrator.parallel_state) get_quant_method() (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig method) (pymllm.quantization.quant_config.QuantizationConfig method) get_quantization_config() (in module pymllm.quantization.quant_config) get_response() (in module pymllm.mobile.service.rr_process) get_rope_index() (in module pymllm.models.qwen3_vl) get_scale_perms() (in module pymllm.quantization.methods.awq_marlin) get_shell_context() (pymllm.mobile.utils.adb.ADBToolkit method) get_slice_with_flag() (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryPool method) get_supported_act_dtypes() (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig method) (pymllm.quantization.quant_config.QuantizationConfig method) get_tensor_model_parallel_rank() (in module pymllm.orchestrator.parallel_state) get_tensor_model_parallel_world_size() (in module pymllm.orchestrator.parallel_state) get_total_gpu_memory() (in module pymllm.executor.model_runner) get_tp_group() (in module pymllm.orchestrator.parallel_state) get_value_buffer() (pymllm.mem_cache.memory_pool.KVPool method) GlobalConfig (class in pymllm.configs.global_config) GPTQ_MARLIN_MIN_THREAD_K (in module pymllm.quantization.methods.awq_marlin) GPTQ_MARLIN_MIN_THREAD_N (in module pymllm.quantization.methods.awq_marlin) GPTQ_MARLIN_TILE (in module pymllm.quantization.methods.awq_marlin) gpu_id (pymllm.executor.model_runner.ModelRunner attribute) gradient_checkpointing (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) graph_runner (pymllm.executor.model_runner.ModelRunner attribute) graphs (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) group_size (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig attribute) GroupCoordinator (class in pymllm.orchestrator.group_coordinator) H HAS_BANNER_LIBS (in module pymllm.engine.launch) has_sliding_layers (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) has_tool_call() (pymllm.parsers.tool_call_parser.ToolCallParser method) hash_bytes() (in module pymllm.mem_cache.base_prefix_cache) hash_to_int64() (in module pymllm.mem_cache.base_prefix_cache) hash_token_ids() (in module pymllm.mem_cache.base_prefix_cache) hash_values (pymllm.mem_cache.radix_cache.TreeNode attribute) head (pymllm.mem_cache.mamba_radix_cache.LRUList attribute) head_dim (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) (pymllm.layers.attention.radix_attention.RadixAttention attribute) (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VisionAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) head_k_dim (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.mem_cache.memory_pool.GDNPool attribute) head_v_dim (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.mem_cache.memory_pool.GDNPool attribute) health() (in module pymllm.server.launch) hf_config (pymllm.configs.model_config.ModelConfig attribute) hidden_size (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.layers.layer_norm.LayerNorm attribute) (pymllm.layers.mlp.MLP attribute) (pymllm.layers.mlp.ParallelMLP attribute) (pymllm.layers.rms_norm.GemmaRMSNorm attribute) (pymllm.layers.rms_norm.RMSNorm attribute) (pymllm.layers.rms_norm_gated.RMSNormGated attribute) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLTextModel attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchMerger attribute) hidden_states (pymllm.executor.model_runner.LogitsProcessorOutput attribute), [1] hit_count (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) (pymllm.mem_cache.radix_cache.TreeNode attribute) host (pymllm.configs.server_config.ServerConfig attribute) http_exception_handler() (in module pymllm.server.launch) HybridAttnBackend (class in pymllm.layers.attention.hybrid_backend) I id (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) (pymllm.mem_cache.radix_cache.TreeNode attribute) (pymllm.mobile.service.network.ChatCompletionRequest attribute) IDLE (pymllm.engine.forward_batch.ForwardMode attribute) IdleSleeper (class in pymllm.orchestrator.scheduler_process) image_data (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.server.launch.GenerateRequest attribute) image_grid_thw (pymllm.engine.forward_batch.ForwardBatch attribute) image_token_id (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration attribute) (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration attribute) image_url (pymllm.server.launch.ContentPart attribute) ImageUrl (class in pymllm.server.launch) in_channels (pymllm.models.qwen3_vl.Qwen3VLVisionPatchEmbed attribute) in_features (pymllm.layers.linear.ColumnParallelLinear attribute) (pymllm.layers.linear.Linear attribute) (pymllm.layers.linear.RowParallelLinear attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinear attribute) in_features_per_partition (pymllm.layers.linear.RowParallelLinear attribute) in_proj_a (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) in_proj_b (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) in_proj_qkv (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) in_proj_z (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) inc_lock_ref() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.chunk_cache.ChunkCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) include_usage (pymllm.server.launch.StreamOptions attribute) index (pymllm.parsers.tool_call_parser.ToolCallItem attribute) indices (pymllm.mem_cache.base_prefix_cache.MatchResult attribute) indices_updater_decode (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) infer() (runner.LlamaQuantizer method) init_attention_backend() (pymllm.executor.model_runner.ModelRunner method) init_cuda_graph_state() (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend method) (pymllm.layers.attention.gdn_backend.GDNAttnBackend method) (pymllm.layers.attention.hybrid_backend.HybridAttnBackend method) init_cuda_graphs() (pymllm.executor.model_runner.ModelRunner method) init_forward_metadata() (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend method) (pymllm.layers.attention.gdn_backend.GDNAttnBackend method) (pymllm.layers.attention.hybrid_backend.HybridAttnBackend method) init_forward_metadata_capture_cuda_graph() (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend method) (pymllm.layers.attention.gdn_backend.GDNAttnBackend method) (pymllm.layers.attention.hybrid_backend.HybridAttnBackend method) init_forward_metadata_replay_cuda_graph() (pymllm.layers.attention.attention_backend.AttentionBackend method) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend method) (pymllm.layers.attention.gdn_backend.GDNAttnBackend method) (pymllm.layers.attention.hybrid_backend.HybridAttnBackend method) init_memory_pool() (pymllm.executor.model_runner.ModelRunner method) init_model() (pymllm.orchestrator.model_runner_process.ModelRunnerProcess method) (pymllm.orchestrator.scheduler_process.SchedulerProcess method) init_sockets() (pymllm.orchestrator.detokenizer_process.DetokenizerProcess method) (pymllm.orchestrator.scheduler_process.SchedulerProcess method) (pymllm.orchestrator.tokenizer_process.TokenizerProcess method) init_tokenizer() (pymllm.orchestrator.detokenizer_process.DetokenizerProcess method) initialize() (pymllm.executor.model_runner.ModelRunner method) initialize_context() (in module pymllm.mobile.ffi) initialize_model_parallel() (in module pymllm.orchestrator.parallel_state) input_end_index (pymllm.layers.linear.RowParallelLinear attribute) input_ids (pymllm.engine.forward_batch.ForwardBatch attribute) (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.GenerateRequest attribute) input_layernorm (pymllm.models.qwen3_5.Qwen3_5AttentionDecoderLayer attribute) (pymllm.models.qwen3_5.Qwen3_5LinearDecoderLayer attribute) (pymllm.models.qwen3_vl.Qwen3VLDecoderLayer attribute) input_observers (pymllm.mobile.backends.qualcomm.transformers.core.observer.ConcatObserver attribute) input_start_index (pymllm.layers.linear.RowParallelLinear attribute) input_text (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.Req attribute) input_token_logprobs_idx (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) input_token_logprobs_val (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) input_top_logprobs_idx (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) input_top_logprobs_val (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) inputs_dict (pymllm.mobile.quantize.quantize_pass.QuantizePlanPayload attribute) inputs_num (pymllm.mobile.quantize.quantize_pass.QuantizePlanPayload attribute) insert() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.chunk_cache.ChunkCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) insert_mru() (pymllm.mem_cache.mamba_radix_cache.LRUList method) insert_session() (in module pymllm.mobile.service.rr_process) InsertResult (class in pymllm.mem_cache.base_prefix_cache) install_apk() (pymllm.mobile.utils.adb.ADBToolkit method) int16 (in module pymllm.mobile.ffi) int16_() (in module pymllm.mobile.ffi) int32 (in module pymllm.mobile.ffi) int32_() (in module pymllm.mobile.ffi) int64 (in module pymllm.mobile.ffi) int64_() (in module pymllm.mobile.ffi) int8 (in module pymllm.mobile.ffi) int8_() (in module pymllm.mobile.ffi) intermediate_size (pymllm.layers.mlp.MLP attribute) (pymllm.layers.mlp.ParallelMLP attribute) is_alive() (pymllm.mobile.utils.adb.ShellContext method) is_capture_mode() (in module pymllm.executor.cuda_graph_runner) is_contiguous() (pymllm.mobile.ffi.Tensor method) is_cross_attention (pymllm.layers.attention.radix_attention.RadixAttention attribute) is_decode() (pymllm.engine.forward_batch.ForwardMode method) is_decode_or_idle() (pymllm.engine.forward_batch.ForwardMode method) is_extend() (pymllm.engine.forward_batch.ForwardMode method) is_finished (pymllm.orchestrator.scheduler_process.Req property) is_generation (pymllm.executor.model_runner.ModelRunner property) is_healthy (pymllm.engine.launch.Engine property) is_idle() (pymllm.engine.forward_batch.ForwardMode method) is_mixed() (pymllm.engine.forward_batch.ForwardMode method) is_numpy_available() (in module pymllm.mobile.ffi) is_prefill() (pymllm.engine.forward_batch.ForwardMode method) is_prefilled (pymllm.orchestrator.scheduler_process.Req attribute) is_qnn_aot_on_x86_enabled() (in module pymllm.mobile.ffi) is_single (pymllm.engine.io_struct.GenerateReqInput attribute) is_torch_available() (in module pymllm.mobile.ffi) K k_buffer (pymllm.mem_cache.memory_pool.KVPool attribute) k_head_dim (pymllm.mem_cache.memory_pool.KVPool attribute) k_head_num (pymllm.mem_cache.memory_pool.KVPool attribute) k_norm (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) k_proj (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) kernel_size (pymllm.layers.gated_delta_net.GDNConv1d attribute) key (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) (pymllm.mem_cache.radix_cache.TreeNode attribute) key_dim (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) kv_cache_dtype (pymllm.configs.quantization_config.QuantizationConfig attribute) (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) kv_size (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) KVCache (C++ class) KVCache::KVCache::KVCache (C++ function), [1], [2] KVCache::KVCache::setLayerIndex (C++ function) KVPool (class in pymllm.mem_cache.memory_pool) L last_access_time (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) (pymllm.mem_cache.radix_cache.TreeNode attribute) last_node (pymllm.mem_cache.base_prefix_cache.InsertResult attribute) (pymllm.mem_cache.base_prefix_cache.MatchResult attribute) launch() (pymllm.engine.launch.Engine method) launch_server() (in module pymllm.server.launch) Layer (C++ class) Layer::__fmt_print (C++ function) Layer::__main (C++ function) Layer::impl (C++ function) Layer::Layer (C++ function), [1] Layer::opType (C++ function) Layer::refOptions (C++ function) Layer::to (C++ function) layer_id (pymllm.layers.attention.radix_attention.RadixAttention attribute) (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) layer_num (pymllm.mem_cache.memory_pool.KVPool attribute) layer_types (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) LayerNorm (C++ class) (class in pymllm.layers.layer_norm) LayerNorm::LayerNorm::LayerNorm (C++ function), [1], [2] layers (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) (pymllm.models.qwen3_vl.Qwen3VLTextModel attribute) lifespan() (in module pymllm.mobile.service.network) (in module pymllm.server.launch) Linear (C++ class) (class in pymllm.layers.linear) Linear::Linear::bias (C++ function) Linear::Linear::Linear (C++ function), [1], [2] Linear::Linear::weight (C++ function) linear_attn (pymllm.models.qwen3_5.Qwen3_5LinearDecoderLayer attribute) linear_fc1 (pymllm.models.qwen3_vl.Qwen3VisionMLP attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchMerger attribute) linear_fc2 (pymllm.models.qwen3_vl.Qwen3VisionMLP attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchMerger attribute) LinearMethodBase (class in pymllm.layers.quantize_base) list_models() (in module pymllm.server.launch) list_quantization_methods() (in module pymllm.quantization.quant_config) listen() (pymllm.orchestrator.request_response_process.RequestResponseProcess method) LlamaForCausalLM (class in modeling_llama) LlamaForQuestionAnswering (class in modeling_llama) LlamaForSequenceClassification (class in modeling_llama) LlamaForTokenClassification (class in modeling_llama) LlamaModel (class in modeling_llama) LlamaPreTrainedModel (class in modeling_llama) LlamaQuantizer (class in runner) lm_head (modeling_llama.LlamaForCausalLM attribute) (modeling_qwen2.Qwen2ForCausalLM attribute) (modeling_qwen3.Qwen3ForCausalLM attribute) (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration attribute) lm_head_input_qdq (modeling_llama.LlamaForCausalLM attribute) (modeling_qwen2.Qwen2ForCausalLM attribute) (modeling_qwen3.Qwen3ForCausalLM attribute) lm_head_output_qdq (modeling_llama.LlamaForCausalLM attribute) (modeling_qwen2.Qwen2ForCausalLM attribute) (modeling_qwen3.Qwen3ForCausalLM attribute) load() (pymllm.mobile.ffi.BaseOp method) load_format (pymllm.configs.server_config.ServerConfig attribute) load_model() (in module pymllm.mobile.convertor) (pymllm.executor.model_runner.ModelRunner method) load_weight() (in module test_vocab_parallel_embedding) load_weights() (pymllm.models.qwen3_5.Qwen3_5ForCausalLM method) (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration method) (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration method) local_rank (pymllm.orchestrator.group_coordinator.GroupCoordinator attribute) lock_ref (pymllm.mem_cache.radix_cache.TreeNode attribute) log_level (pymllm.configs.server_config.ServerConfig attribute) logger (in module pymllm.engine.launch) (in module pymllm.executor.cuda_graph_runner) (in module pymllm.executor.model_runner) (in module pymllm.layers.attention.flashinfer_backend) (in module pymllm.layers.attention.gdn_backend) (in module pymllm.layers.attention.hybrid_backend) (in module pymllm.layers.gated_delta_net) (in module pymllm.layers.mlp) (in module pymllm.layers.rms_norm_gated) (in module pymllm.layers.sampling) (in module pymllm.mem_cache.mamba_radix_cache) (in module pymllm.mem_cache.memory_pool) (in module pymllm.mem_cache.radix_cache) (in module pymllm.models) (in module pymllm.models.qwen3_5) (in module pymllm.models.qwen3_vl) (in module pymllm.orchestrator.cuda_ipc_transport) (in module pymllm.orchestrator.detokenizer_process) (in module pymllm.orchestrator.model_runner_process) (in module pymllm.orchestrator.parallel_state) (in module pymllm.orchestrator.request_response_process) (in module pymllm.orchestrator.scheduler_process) (in module pymllm.orchestrator.shared_memory_queue) (in module pymllm.orchestrator.tokenizer_process) (in module pymllm.quantization.methods.awq_marlin) (in module pymllm.server.launch) logit_cap (pymllm.layers.attention.radix_attention.RadixAttention attribute) LogitsProcessorOutput (class in pymllm.executor.model_runner) logprob_start_len (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.GenerateRequest attribute) logprobs (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) lora_path (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.server.launch.GenerateRequest attribute) LRUList (class in pymllm.mem_cache.mamba_radix_cache) M magic (pymllm.mobile.convertor.model_file_v2.ModelFileV2Descriptor attribute) main() (in module pymllm.__main__) (in module pymllm.mobile.service.tools) (in module pymllm.mobile.utils.mllm_convertor) (in module pymllm.server.launch) (in module train) make_args() (in module pymllm.configs.global_config) make_full_attention_net_mem_pool() (in module pymllm.mem_cache.memory_pool) make_ipc_address() (in module pymllm.orchestrator.ipc_utils) make_req_to_token_pool() (in module pymllm.mem_cache.memory_pool) mamba (pymllm.mem_cache.mamba_radix_cache.LRUList attribute) mamba_branching_seqlen (pymllm.mem_cache.base_prefix_cache.MatchResult attribute) mamba_evictable_size() (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) mamba_evicted (pymllm.mem_cache.base_prefix_cache.EvictResult attribute) mamba_exist (pymllm.mem_cache.base_prefix_cache.InsertResult attribute) mamba_lock_ref (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) mamba_lru (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache attribute) mamba_next (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) mamba_pool (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache attribute) mamba_prev (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) mamba_protected_size() (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) mamba_tombstone (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode property) mamba_value (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) MambaRadixCache (class in pymllm.mem_cache.mamba_radix_cache) MambaTreeNode (class in pymllm.mem_cache.mamba_radix_cache) marlin_make_empty_g_idx() (in module pymllm.quantization.methods.awq_marlin) marlin_make_workspace() (in module pymllm.quantization.methods.awq_marlin) marlin_permute_scales() (in module pymllm.quantization.methods.awq_marlin) MARLIN_SUPPORTED_GROUP_SIZES (in module pymllm.quantization.methods.awq_marlin) marlin_zero_points() (in module pymllm.quantization.methods.awq_marlin) match() (pymllm.mobile.quantize.cast2fp32_pass.Cast2Fp32QuantizePass method) (pymllm.mobile.quantize.kai.w4a32.W4A32KAIQuantizePass method) (pymllm.mobile.quantize.quantize_pass.QuantizeBasePass method) match_prefix() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.chunk_cache.ChunkCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) MatchResult (class in pymllm.mem_cache.base_prefix_cache) matmul() (in module pymllm.mobile.nn.functional) matmul_impl_blas (in module pymllm.mobile.nn.functional) matmul_impl_default (in module pymllm.mobile.nn.functional) matmul_impl_gguf (in module pymllm.mobile.nn.functional) matmul_impl_mllmblas (in module pymllm.mobile.nn.functional) max() (pymllm.mobile.ffi.Tensor method) max_bs (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) max_completion_tokens (pymllm.server.launch.ChatCompletionRequest attribute) max_context_len (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) (pymllm.mem_cache.memory_pool.ReqToTokenPool attribute) max_new_tokens (pymllm.orchestrator.scheduler_process.Req attribute) max_prefill_tokens (pymllm.configs.server_config.ServerConfig attribute) max_queued_requests (pymllm.configs.server_config.ServerConfig attribute) max_reqs (pymllm.mem_cache.memory_pool.GDNPool attribute) max_running_requests (pymllm.configs.server_config.ServerConfig attribute) (pymllm.executor.model_runner.ModelRunner attribute) max_tokens (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) max_total_num_tokens (pymllm.executor.model_runner.ModelRunner attribute) max_total_tokens (pymllm.configs.server_config.ServerConfig attribute) max_track_slots (pymllm.mem_cache.memory_pool.GDNPool attribute) mean() (pymllm.mobile.ffi.Tensor method) mem_bytes() (pymllm.mem_cache.memory_pool.GDNPool method) mem_fraction_static (pymllm.configs.server_config.ServerConfig attribute) mem_size (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryChunk property) merge_and_sort_free() (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator method) merger (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) messages (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.server.launch.ChatCompletionRequest attribute) meta_data (pymllm.orchestrator.cuda_ipc_transport.ShmSyncBuffer attribute) method (pymllm.configs.quantization_config.QuantizationConfig attribute) min() (pymllm.mobile.ffi.Tensor method) min_p_sampling_from_probs() (in module pymllm.layers.sampling) MIXED (pymllm.engine.forward_batch.ForwardMode attribute) mllm::__mllm_exception_main (C++ function) mllm::__setup_signal_handler (C++ function) mllm::__signal_handler (C++ function) mllm::async::fork (C++ function) mllm::async::wait (C++ function), [1] mllm::cleanThisThread (C++ function) mllm::initializeContext (C++ function) mllm::isOpenCLAvailable (C++ function) mllm::isQnnAvailable (C++ function) mllm::load (C++ function) mllm::memoryReport (C++ function) mllm::nn::functional::clip (C++ function) mllm::nn::functional::concat (C++ function) mllm::nn::functional::flashAttention2 (C++ function) mllm::nn::functional::interpolate (C++ function), [1] mllm::nn::functional::log (C++ function) mllm::nn::functional::matmul (C++ function) mllm::nn::functional::max (C++ function) mllm::nn::functional::mean (C++ function) mllm::nn::functional::min (C++ function) mllm::nn::functional::pad (C++ function) mllm::nn::functional::softmax (C++ function) mllm::nn::functional::split (C++ function), [1], [2], [3] mllm::nn::functional::sum (C++ function) mllm::nn::functional::topk (C++ function) mllm::nn::functional::view (C++ function) mllm::perf::warmup (C++ function) mllm::print (C++ function) mllm::save (C++ function) mllm::setMaximumNumThreads (C++ function) mllm::setPrintMaxElementsPerDim (C++ function) mllm::setPrintPrecision (C++ function) mllm::setRandomSeed (C++ function) mllm::shutdownContext (C++ function) mllm::signal_description (C++ function) mllm::test::allClose (C++ function) mllm::test::AllCloseResult (C++ class) mllm::test::AllCloseResult::mllm::test::AllCloseResult::is_close (C++ member) mllm::test::AllCloseResult::mllm::test::AllCloseResult::max_absolute_diff (C++ member) mllm::test::AllCloseResult::mllm::test::AllCloseResult::max_relative_diff (C++ member) mllm::test::AllCloseResult::mllm::test::AllCloseResult::mismatched_elements (C++ member) mllm::test::AllCloseResult::mllm::test::AllCloseResult::total_elements (C++ member) mllm::thisThread (C++ function) MLLM_FIND_NUMPY_AVAILABLE (in module pymllm.mobile.ffi) MLLM_FIND_SAFETENSORS_AVAILABLE (in module pymllm.mobile.convertor) MLLM_FIND_TORCH_AVAILABLE (in module pymllm.mobile.ffi) MLLM_LAYER_ANY_INPUTS_1_OUTPUTS_FORWARD (C macro) MLLM_LAYER_ANY_INPUTS_2_OUTPUTS_FORWARD (C macro) MLLM_LAYER_ANY_INPUTS_3_OUTPUTS_FORWARD (C macro) MLLM_MAIN (C macro) MLLM_MODEL_FILE_V2_MAGIC_NUMBER (in module pymllm.mobile.convertor.model_file_v2) MLLM_MODEL_FILE_V2_MODEL_NAME_LENGTH (in module pymllm.mobile.convertor.model_file_v2) MLLM_MODEL_FILE_V2_PARAMS_NAME_LENGTH (in module pymllm.mobile.convertor.model_file_v2) MLLM_MODEL_FILE_V2_TENSOR_SHAPE_LENGTH (in module pymllm.mobile.convertor.model_file_v2) MLLM_MODEL_FILE_V2_VERSION (in module pymllm.mobile.convertor.model_file_v2) mllm_qualcomm_max_length (modeling_llama.LlamaForCausalLM attribute) (modeling_qwen2.Qwen2ForCausalLM attribute) (modeling_qwen3.Qwen3ForCausalLM attribute) (runner.LlamaQuantizer attribute) MLLM_TYPE_MAPPING (in module pymllm.mobile.convertor.mllm_type_mapping) MllmBaseLayer (class in pymllm.layers.base) MLP (class in pymllm.layers.mlp) mlp (pymllm.models.qwen3_5.Qwen3_5AttentionDecoderLayer attribute) (pymllm.models.qwen3_5.Qwen3_5LinearDecoderLayer attribute) (pymllm.models.qwen3_vl.Qwen3VisionBlock attribute) (pymllm.models.qwen3_vl.Qwen3VLDecoderLayer attribute) MLPActivation (in module pymllm.layers.mlp) mm_inputs (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.Req attribute) MmItemMemoryChunk (class in pymllm.orchestrator.cuda_ipc_transport) MmItemMemoryPool (class in pymllm.orchestrator.cuda_ipc_transport) model (modeling_llama.LlamaForCausalLM attribute) (modeling_qwen2.Qwen2ForCausalLM attribute) (modeling_qwen3.Qwen3ForCausalLM attribute) (pymllm.configs.global_config.GlobalConfig attribute) (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration attribute) (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration attribute) (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) (runner.LlamaQuantizer attribute) model_capture_mode() (in module pymllm.executor.cuda_graph_runner) model_config (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.CompletionRequest attribute) (pymllm.server.launch.GenerateRequest attribute) MODEL_HUB_LOOKUP_TABLE (in module pymllm.mobile.service.models_hub) model_info() (in module pymllm.server.launch) model_name (pymllm.mobile.convertor.model_file_v2.ModelFileV2 attribute) (pymllm.mobile.convertor.model_file_v2.ModelFileV2Descriptor attribute) model_parallel_is_initialized() (in module pymllm.orchestrator.parallel_state) model_path (pymllm.configs.server_config.ServerConfig attribute) model_runner (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) MODEL_SESSION_CREATED (in module pymllm.mobile.service.network) ModelConfig (class in pymllm.configs.model_config) ModelFileV2 (class in pymllm.mobile.convertor.model_file_v2) ModelFileV2Descriptor (class in pymllm.mobile.convertor.model_file_v2) ModelFileV2ParamsDescriptor (class in pymllm.mobile.convertor.model_file_v2) modeling_llama module modeling_qwen2 module modeling_qwen3 module ModelRunner (class in pymllm.executor.model_runner) ModelRunnerProcess (class in pymllm.orchestrator.model_runner_process) module modeling_llama modeling_qwen2 modeling_qwen3 pymllm pymllm.__main__ pymllm.configs pymllm.configs.global_config pymllm.configs.model_config pymllm.configs.quantization_config pymllm.configs.server_config pymllm.engine pymllm.engine.forward_batch pymllm.engine.io_struct pymllm.engine.launch pymllm.executor pymllm.executor.cuda_graph_runner pymllm.executor.model_runner pymllm.layers pymllm.layers.attention pymllm.layers.attention.attention_backend pymllm.layers.attention.flashinfer_backend pymllm.layers.attention.gdn pymllm.layers.attention.gdn_backend pymllm.layers.attention.hybrid_backend pymllm.layers.attention.radix_attention pymllm.layers.attention.radix_linear_attention pymllm.layers.base pymllm.layers.custom_event pymllm.layers.embedding pymllm.layers.gated_delta_net pymllm.layers.layer_norm pymllm.layers.linear pymllm.layers.mlp pymllm.layers.quantize_base pymllm.layers.rms_norm pymllm.layers.rms_norm_gated pymllm.layers.rope pymllm.layers.sampling pymllm.layers.utils pymllm.mem_cache pymllm.mem_cache.base_prefix_cache pymllm.mem_cache.chunk_cache pymllm.mem_cache.mamba_radix_cache pymllm.mem_cache.memory_pool pymllm.mem_cache.radix_cache pymllm.mobile pymllm.mobile.backends pymllm.mobile.backends.qualcomm pymllm.mobile.backends.qualcomm.nn pymllm.mobile.backends.qualcomm.qnn_aot_env pymllm.mobile.backends.qualcomm.transformers pymllm.mobile.backends.qualcomm.transformers.core pymllm.mobile.backends.qualcomm.transformers.core.embedding pymllm.mobile.backends.qualcomm.transformers.core.observer pymllm.mobile.backends.qualcomm.transformers.core.qdq pymllm.mobile.backends.qualcomm.transformers.core.qlinear pymllm.mobile.backends.qualcomm.transformers.core.rms_norm pymllm.mobile.convertor pymllm.mobile.convertor.mllm_type_mapping pymllm.mobile.convertor.model_file_v1 pymllm.mobile.convertor.model_file_v2 pymllm.mobile.ffi pymllm.mobile.ffi.base pymllm.mobile.nn pymllm.mobile.nn.functional pymllm.mobile.quantize pymllm.mobile.quantize.cast2fp32_pass pymllm.mobile.quantize.gguf pymllm.mobile.quantize.kai pymllm.mobile.quantize.kai.w4a32 pymllm.mobile.quantize.pipeline pymllm.mobile.quantize.quantize_pass pymllm.mobile.quantize.solver pymllm.mobile.quantize.spinquant pymllm.mobile.service pymllm.mobile.service.models_hub pymllm.mobile.service.network pymllm.mobile.service.rr_process pymllm.mobile.service.tools pymllm.mobile.utils pymllm.mobile.utils.adb pymllm.mobile.utils.error_handler pymllm.mobile.utils.mllm_convertor pymllm.models pymllm.models.qwen3_5 pymllm.models.qwen3_moe pymllm.models.qwen3_vl pymllm.orchestrator pymllm.orchestrator.cuda_ipc_transport pymllm.orchestrator.detokenizer_process pymllm.orchestrator.group_coordinator pymllm.orchestrator.ipc_utils pymllm.orchestrator.model_runner_process pymllm.orchestrator.parallel_state pymllm.orchestrator.request_response_process pymllm.orchestrator.scheduler_process pymllm.orchestrator.shared_memory_queue pymllm.orchestrator.tokenizer_process pymllm.parsers pymllm.parsers.reasoning_parser pymllm.parsers.tool_call_parser pymllm.quantization pymllm.quantization.methods pymllm.quantization.methods.awq_marlin pymllm.quantization.quant_config pymllm.server pymllm.server.launch runner test_context_create test_nn test_tensor test_vocab_parallel_embedding train Module (C++ class) Module::__fmt_print (C++ function) Module::__main (C++ function) Module::__send_graph_begin (C++ function) Module::__send_graph_end (C++ function) Module::__trace (C++ function) Module::forward (C++ function) Module::getBuffer (C++ function) Module::getModuleName (C++ function) Module::impl (C++ function) Module::load (C++ function) Module::Module (C++ function), [1], [2] Module::operator() (C++ function) Module::params (C++ function) Module::reg (C++ function) Module::registerBuffer (C++ function) Module::to (C++ function) mrope_interleaved (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) mrope_position_deltas (pymllm.engine.forward_batch.ForwardBatch attribute) mrope_section (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) MultimodalRoPE (C++ class) MultimodalRoPE::MultimodalRoPE::MultimodalRoPE (C++ function), [1], [2] N n (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) name (pymllm.mobile.convertor.model_file_v2.ModelFileV2ParamsDescriptor attribute) (pymllm.mobile.ffi.Tensor property) (pymllm.parsers.tool_call_parser.ToolCallItem attribute) (pymllm.server.launch.ChatMessage attribute) (pymllm.server.launch.ToolFunction attribute) need_sort (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator attribute) next (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) next_token_logits (pymllm.executor.model_runner.LogitsProcessorOutput attribute), [1] norm (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) (pymllm.models.qwen3_vl.Qwen3VLTextModel attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchMerger attribute) norm1 (pymllm.models.qwen3_vl.Qwen3VisionBlock attribute) norm2 (pymllm.models.qwen3_vl.Qwen3VisionBlock attribute) norm_before_gate (pymllm.layers.rms_norm_gated.RMSNormGated attribute) norm_input_qdq (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) normalize_batch_and_arguments() (pymllm.engine.io_struct.GenerateReqInput method) num_attention_heads (pymllm.executor.model_runner.ModelRunner attribute) num_consumers (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryPool attribute) num_embeddings (pymllm.layers.embedding.VocabParallelEmbedding attribute) (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding attribute) num_embeddings_per_partition (pymllm.layers.embedding.VocabParallelEmbedding attribute) num_gdn_layers (pymllm.mem_cache.memory_pool.GDNPool attribute) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration attribute) num_grid_per_side (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) num_heads (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VisionAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) num_hidden_layers (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.models.qwen3_vl.Qwen3VLTextModel attribute) num_k_heads (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) num_kv_heads (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) num_params (pymllm.mobile.convertor.model_file_v2.ModelFileV2Descriptor attribute) num_position_embeddings (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) num_steps (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamObserver attribute) num_v_heads (pymllm.layers.attention.radix_linear_attention.RadixLinearAttention attribute) (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.mem_cache.memory_pool.GDNPool attribute) numel() (pymllm.mobile.ffi.Tensor method) O o_proj (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) occupied_chunks (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryPool attribute) on_node_evict (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache attribute) (pymllm.mem_cache.radix_cache.RadixCache attribute) ones() (in module pymllm.mobile.ffi) open() (pymllm.orchestrator.cuda_ipc_transport.ShmSyncBuffer static method) openai_chat_completions() (in module pymllm.server.launch) openai_completions() (in module pymllm.server.launch) out_cache_loc (pymllm.engine.forward_batch.ForwardBatch attribute) out_features (pymllm.layers.linear.ColumnParallelLinear attribute) (pymllm.layers.linear.Linear attribute) (pymllm.layers.linear.RowParallelLinear attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinear attribute) out_features_per_partition (pymllm.layers.linear.ColumnParallelLinear attribute) out_hidden_size (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) out_list (pymllm.orchestrator.request_response_process.ReqState attribute) out_proj (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) (pymllm.models.qwen3_vl.Qwen3VisionAttention attribute) output_buffers (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) output_end_index (pymllm.layers.linear.ColumnParallelLinear attribute) output_ids (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) (pymllm.orchestrator.scheduler_process.Req attribute) output_start_index (pymllm.layers.linear.ColumnParallelLinear attribute) output_strs (pymllm.engine.io_struct.BatchStrOutput attribute) output_token_logprobs_idx (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) output_token_logprobs_val (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) output_top_logprobs_idx (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) output_top_logprobs_val (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) outputs_dict (pymllm.mobile.quantize.quantize_pass.QuantizePlanPayload attribute) outputs_num (pymllm.mobile.quantize.quantize_pass.QuantizePlanPayload attribute) P pack_cols() (in module pymllm.quantization.methods.awq_marlin) pack_factor (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig attribute) padding_idx (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) (pymllm.layers.embedding.VocabParallelEmbedding attribute) (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding attribute) page_size (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache attribute) (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator attribute) (pymllm.mem_cache.radix_cache.RadixCache attribute) ParallelMLP (class in pymllm.layers.mlp) Param (C++ class) Param::Param::Param (C++ function), [1], [2] Param::Param::weight (C++ function) param_id (pymllm.mobile.convertor.model_file_v2.ModelFileV2ParamsDescriptor attribute) param_offset (pymllm.mobile.convertor.model_file_v2.ModelFileV2ParamsDescriptor attribute) param_size (pymllm.mobile.convertor.model_file_v2.ModelFileV2ParamsDescriptor attribute) param_type (pymllm.mobile.convertor.model_file_v2.ModelFileV2ParamsDescriptor attribute) ParameterFile (class in pymllm.mobile.ffi) parameters (pymllm.server.launch.ToolFunction attribute) params_desc_offset (pymllm.mobile.convertor.model_file_v2.ModelFileV2Descriptor attribute) parent (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) (pymllm.mem_cache.radix_cache.TreeNode attribute) parse_non_stream() (pymllm.parsers.reasoning_parser.ReasoningParser method) (pymllm.parsers.tool_call_parser.ToolCallParser method) parse_stream_chunk() (pymllm.parsers.reasoning_parser.ReasoningParser method) (pymllm.parsers.tool_call_parser.ToolCallParser method) partial_rotary_factor (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) passes (pymllm.mobile.quantize.solver.QuantizeSolver attribute) patch_embed (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) patch_size (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchEmbed attribute) PerBlockParamFakeQuantize (class in pymllm.mobile.backends.qualcomm.transformers.core.observer) PerBlockParamObserver (class in pymllm.mobile.backends.qualcomm.transformers.core.observer) permute() (pymllm.mobile.ffi.Tensor method) pixel_values (pymllm.engine.forward_batch.ForwardBatch attribute) poll_timeout_ms (pymllm.orchestrator.scheduler_process.IdleSleeper attribute) poller (pymllm.orchestrator.scheduler_process.IdleSleeper attribute) pool (pymllm.mem_cache.chunk_cache.ChunkCache attribute) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache attribute) (pymllm.mem_cache.radix_cache.RadixCache attribute) port (pymllm.configs.server_config.ServerConfig attribute) pos_embed (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) positions (pymllm.engine.forward_batch.ForwardBatch attribute) post_attention_layernorm (pymllm.models.qwen3_5.Qwen3_5AttentionDecoderLayer attribute) (pymllm.models.qwen3_5.Qwen3_5LinearDecoderLayer attribute) (pymllm.models.qwen3_vl.Qwen3VLDecoderLayer attribute) predict() (in module pymllm.mobile.service.network) prefill_cuda_graph_metadata (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) prefill_wrapper_ragged (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) prefill_wrappers (pymllm.layers.attention.flashinfer_backend.PrefillMetadata attribute) prefill_wrappers_paged (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) PrefillMetadata (class in pymllm.layers.attention.flashinfer_backend) prefix_len (pymllm.mem_cache.base_prefix_cache.InsertResult attribute) (pymllm.mem_cache.base_prefix_cache.MatchResult attribute) (pymllm.orchestrator.scheduler_process.Req attribute) prepare() (pymllm.mobile.quantize.cast2fp32_pass.Cast2Fp32QuantizePass method) (pymllm.mobile.quantize.kai.w4a32.W4A32KAIQuantizePass method) (pymllm.mobile.quantize.quantize_pass.QuantizeBasePass method) prepare_for_decode() (pymllm.orchestrator.scheduler_process.ScheduleBatch method) prepare_for_extend() (pymllm.orchestrator.scheduler_process.ScheduleBatch method) prepare_forward_batch_decode() (pymllm.executor.model_runner.ModelRunner method) prepare_forward_batch_extend() (pymllm.executor.model_runner.ModelRunner method) presence_penalty (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) pretty_print() (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) prev (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) process_batch_result() (pymllm.orchestrator.scheduler_process.SchedulerProcess method) process_input_requests() (pymllm.orchestrator.scheduler_process.SchedulerProcess method) process_weights_after_loading() (pymllm.layers.quantize_base.QuantizeMethodBase method) (pymllm.quantization.methods.awq_marlin.AWQMarlinLinearMethod method) proj (pymllm.models.qwen3_vl.Qwen3VLVisionPatchEmbed attribute) prompt (pymllm.server.launch.CompletionRequest attribute) prompt_len (pymllm.orchestrator.scheduler_process.Req attribute) prompt_tokens (pymllm.engine.io_struct.BatchStrOutput attribute) (pymllm.engine.io_struct.BatchTokenIDOutput attribute) protected_size() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) pull_file() (pymllm.mobile.utils.adb.ADBToolkit method) push_file() (pymllm.mobile.utils.adb.ADBToolkit method) put() (pymllm.orchestrator.shared_memory_queue.TensorQueue method) pymllm module pymllm.__main__ module pymllm.configs module pymllm.configs.global_config module pymllm.configs.model_config module pymllm.configs.quantization_config module pymllm.configs.server_config module pymllm.engine module pymllm.engine.forward_batch module pymllm.engine.io_struct module pymllm.engine.launch module pymllm.executor module pymllm.executor.cuda_graph_runner module pymllm.executor.model_runner module pymllm.layers module pymllm.layers.attention module pymllm.layers.attention.attention_backend module pymllm.layers.attention.flashinfer_backend module pymllm.layers.attention.gdn module pymllm.layers.attention.gdn_backend module pymllm.layers.attention.hybrid_backend module pymllm.layers.attention.radix_attention module pymllm.layers.attention.radix_linear_attention module pymllm.layers.base module pymllm.layers.custom_event module pymllm.layers.embedding module pymllm.layers.gated_delta_net module pymllm.layers.layer_norm module pymllm.layers.linear module pymllm.layers.mlp module pymllm.layers.quantize_base module pymllm.layers.rms_norm module pymllm.layers.rms_norm_gated module pymllm.layers.rope module pymllm.layers.sampling module pymllm.layers.utils module pymllm.mem_cache module pymllm.mem_cache.base_prefix_cache module pymllm.mem_cache.chunk_cache module pymllm.mem_cache.mamba_radix_cache module pymllm.mem_cache.memory_pool module pymllm.mem_cache.radix_cache module pymllm.mobile module pymllm.mobile.backends module pymllm.mobile.backends.qualcomm module pymllm.mobile.backends.qualcomm.nn module pymllm.mobile.backends.qualcomm.qnn_aot_env module pymllm.mobile.backends.qualcomm.transformers module pymllm.mobile.backends.qualcomm.transformers.core module pymllm.mobile.backends.qualcomm.transformers.core.embedding module pymllm.mobile.backends.qualcomm.transformers.core.observer module pymllm.mobile.backends.qualcomm.transformers.core.qdq module pymllm.mobile.backends.qualcomm.transformers.core.qlinear module pymllm.mobile.backends.qualcomm.transformers.core.rms_norm module pymllm.mobile.convertor module pymllm.mobile.convertor.mllm_type_mapping module pymllm.mobile.convertor.model_file_v1 module pymllm.mobile.convertor.model_file_v2 module pymllm.mobile.ffi module pymllm.mobile.ffi.base module pymllm.mobile.nn module pymllm.mobile.nn.functional module pymllm.mobile.quantize module pymllm.mobile.quantize.cast2fp32_pass module pymllm.mobile.quantize.gguf module pymllm.mobile.quantize.kai module pymllm.mobile.quantize.kai.w4a32 module pymllm.mobile.quantize.pipeline module pymllm.mobile.quantize.quantize_pass module pymllm.mobile.quantize.solver module pymllm.mobile.quantize.spinquant module pymllm.mobile.service module pymllm.mobile.service.models_hub module pymllm.mobile.service.network module pymllm.mobile.service.rr_process module pymllm.mobile.service.tools module pymllm.mobile.utils module pymllm.mobile.utils.adb module pymllm.mobile.utils.error_handler module pymllm.mobile.utils.mllm_convertor module pymllm.models module pymllm.models.qwen3_5 module pymllm.models.qwen3_moe module pymllm.models.qwen3_vl module pymllm.orchestrator module pymllm.orchestrator.cuda_ipc_transport module pymllm.orchestrator.detokenizer_process module pymllm.orchestrator.group_coordinator module pymllm.orchestrator.ipc_utils module pymllm.orchestrator.model_runner_process module pymllm.orchestrator.parallel_state module pymllm.orchestrator.request_response_process module pymllm.orchestrator.scheduler_process module pymllm.orchestrator.shared_memory_queue module pymllm.orchestrator.tokenizer_process module pymllm.parsers module pymllm.parsers.reasoning_parser module pymllm.parsers.tool_call_parser module pymllm.quantization module pymllm.quantization.methods module pymllm.quantization.methods.awq_marlin module pymllm.quantization.quant_config module pymllm.server module pymllm.server.launch module Q q_dtype (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) q_norm (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) q_proj (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) q_size (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) QEmbedding (class in pymllm.mobile.backends.qualcomm.transformers.core.embedding) qk_head_dim (pymllm.layers.attention.radix_attention.RadixAttention attribute) qkv_proj (pymllm.models.qwen3_vl.Qwen3VisionAttention attribute) QLinear (class in pymllm.mobile.backends.qualcomm.transformers.core.qlinear) QLinearLPBQ (class in pymllm.mobile.backends.qualcomm.transformers.core.qlinear) QLinearW8A16_PerChannelSym (class in pymllm.mobile.backends.qualcomm.transformers.core.qlinear) qnn (in module pymllm.mobile.ffi) qnn_() (in module pymllm.mobile.ffi) qnn_aot_env (in module test_context_create) qnn_context (in module test_context_create) QnnDeviceAndContext (class in pymllm.mobile.ffi) (in module pymllm.mobile.backends.qualcomm.qnn_aot_env) QnnRoPE (class in pymllm.mobile.backends.qualcomm.nn) QnnSoftmax (class in pymllm.mobile.backends.qualcomm.nn) QRMSNorm (class in pymllm.mobile.backends.qualcomm.transformers.core.rms_norm) qscheme (pymllm.mobile.backends.qualcomm.transformers.core.qdq.ActivationQDQ attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ attribute) qsize() (pymllm.orchestrator.shared_memory_queue.TensorQueue method) quant_bits (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding attribute) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm attribute) quant_config (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration attribute) (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration attribute) (pymllm.quantization.methods.awq_marlin.AWQMarlinLinearMethod attribute) quant_max (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize attribute) quant_method (pymllm.layers.base.MllmBaseLayer attribute) (pymllm.layers.linear.ColumnParallelLinear attribute) (pymllm.layers.linear.Linear attribute) (pymllm.layers.linear.RowParallelLinear attribute) quant_min (pymllm.mobile.backends.qualcomm.transformers.core.observer.PerBlockParamFakeQuantize attribute) quant_type (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig attribute) quantization (pymllm.configs.global_config.GlobalConfig attribute) QuantizationConfig (class in pymllm.configs.quantization_config) (class in pymllm.quantization.quant_config) QuantizeBasePass (class in pymllm.mobile.quantize.quantize_pass) QuantizeMethodBase (class in pymllm.layers.quantize_base) QuantizePlanPayload (class in pymllm.mobile.quantize.quantize_pass) QuantizeSolver (class in pymllm.mobile.quantize.solver) QuickGELU (C++ class) QuickGELU::QuickGELU::QuickGELU (C++ function), [1] Qwen2ForCausalLM (class in modeling_qwen2) Qwen2ForQuestionAnswering (class in modeling_qwen2) Qwen2ForSequenceClassification (class in modeling_qwen2) Qwen2ForTokenClassification (class in modeling_qwen2) Qwen2Model (class in modeling_qwen2) Qwen2PreTrainedModel (class in modeling_qwen2) Qwen2RMSNorm (class in modeling_qwen2) Qwen3_5AttentionDecoderLayer (class in pymllm.models.qwen3_5) Qwen3_5ForCausalLM (class in pymllm.models.qwen3_5) Qwen3_5ForConditionalGeneration (class in pymllm.models.qwen3_5) Qwen3_5FullAttention (class in pymllm.models.qwen3_5) Qwen3_5LinearDecoderLayer (class in pymllm.models.qwen3_5) Qwen3ForCausalLM (class in modeling_qwen3) Qwen3ForQuestionAnswering (class in modeling_qwen3) Qwen3ForSequenceClassification (class in modeling_qwen3) Qwen3ForTokenClassification (class in modeling_qwen3) Qwen3Model (class in modeling_qwen3) Qwen3PreTrainedModel (class in modeling_qwen3) Qwen3VisionAttention (class in pymllm.models.qwen3_vl) Qwen3VisionBlock (class in pymllm.models.qwen3_vl) Qwen3VisionMLP (class in pymllm.models.qwen3_vl) Qwen3VLAttention (class in pymllm.models.qwen3_vl) Qwen3VLDecoderLayer (class in pymllm.models.qwen3_vl) Qwen3VLForConditionalGeneration (class in pymllm.models.qwen3_vl) Qwen3VLTextModel (class in pymllm.models.qwen3_vl) Qwen3VLVisionModel (class in pymllm.models.qwen3_vl) Qwen3VLVisionPatchEmbed (class in pymllm.models.qwen3_vl) Qwen3VLVisionPatchMerger (class in pymllm.models.qwen3_vl) R radix_cache_page_size (pymllm.configs.server_config.ServerConfig attribute) RadixAttention (class in pymllm.layers.attention.radix_attention) RadixCache (class in pymllm.mem_cache.radix_cache) RadixKey (class in pymllm.mem_cache.base_prefix_cache) RadixLinearAttention (class in pymllm.layers.attention.radix_linear_attention) random() (in module pymllm.mobile.ffi) random_seed (pymllm.configs.server_config.ServerConfig attribute) rank (pymllm.mobile.ffi.Tensor property) rank_in_group (pymllm.orchestrator.group_coordinator.GroupCoordinator attribute) ranks (pymllm.orchestrator.group_coordinator.GroupCoordinator attribute) read_args() (in module pymllm.configs.global_config) read_metadata() (pymllm.orchestrator.shared_memory_queue.SharedMemoryManager static method) read_offset (pymllm.orchestrator.scheduler_process.Req attribute) read_offsets (pymllm.engine.io_struct.BatchTokenIDOutput attribute) reasoning_parser (pymllm.configs.server_config.ServerConfig attribute) ReasoningParser (class in pymllm.parsers.reasoning_parser) reboot_device() (pymllm.mobile.utils.adb.ADBToolkit method) recompute_scale_zp() (in module runner) (runner.LlamaQuantizer method) reconstruct_on_device() (pymllm.orchestrator.cuda_ipc_transport.CudaIpcTensorTransportProxy method) record_screen() (pymllm.mobile.utils.adb.ADBToolkit method) recurrent_state (pymllm.mem_cache.memory_pool.GDNPool attribute) recv_requests() (pymllm.orchestrator.scheduler_process.SchedulerProcess method) reduce_output (pymllm.layers.linear.RowParallelLinear attribute) regenerate_rid() (pymllm.engine.io_struct.BaseReq method) regenerate_rids() (pymllm.engine.io_struct.BaseBatchReq method) register_pass() (pymllm.mobile.quantize.solver.QuantizeSolver method) register_quantization() (in module pymllm.quantization.quant_config) ReLU (C++ class) ReLU::ReLU::ReLU (C++ function), [1] remove() (pymllm.mem_cache.mamba_radix_cache.LRUList method) remove_state() (pymllm.orchestrator.request_response_process.RequestResponseProcess method) repeat() (pymllm.mobile.ffi.Tensor method) repetition_penalty (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) replace_parameter() (in module pymllm.quantization.methods.awq_marlin) replay() (pymllm.executor.cuda_graph_runner.CudaGraphRunner method) Req (class in pymllm.orchestrator.scheduler_process) req_pool_idx (pymllm.orchestrator.scheduler_process.Req attribute) req_pool_indices (pymllm.engine.forward_batch.ForwardBatch attribute) req_to_token (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) (pymllm.mem_cache.memory_pool.ReqToTokenPool attribute) req_to_token_pool (pymllm.engine.forward_batch.ForwardBatch attribute) (pymllm.executor.model_runner.ModelRunner attribute) reqs (pymllm.engine.io_struct.BatchTokenizedGenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.ScheduleBatch attribute) ReqState (class in pymllm.orchestrator.request_response_process) ReqToTokenPool (class in pymllm.mem_cache.memory_pool) RequestResponseProcess (class in pymllm.orchestrator.request_response_process) reset() (pymllm.configs.global_config.GlobalConfig class method) (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.chunk_cache.ChunkCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) reset_states() (pymllm.mem_cache.memory_pool.GDNPool method) retrieve_model() (in module pymllm.server.launch) return_logprob (pymllm.engine.forward_batch.ForwardBatch attribute) (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.GenerateRequest attribute) rid (pymllm.engine.io_struct.BaseReq attribute) (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.AbortRequest attribute) (pymllm.server.launch.GenerateRequest attribute) rids (pymllm.engine.io_struct.BaseBatchReq attribute) rms_norm_gated() (in module pymllm.layers.rms_norm_gated) RMSNorm (C++ class) (class in pymllm.layers.rms_norm) RMSNorm::RMSNorm::RMSNorm (C++ function), [1], [2] RMSNorm::RMSNorm::weight (C++ function) RMSNormGated (class in pymllm.layers.rms_norm_gated) role (pymllm.server.launch.ChatMessage attribute) rope_theta (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) rot_pos_emb() (pymllm.models.qwen3_vl.Qwen3VLVisionModel method) rotary_dim (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) rotary_emb (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) RowParallelLinear (class in pymllm.layers.linear) run() (pymllm.mobile.quantize.cast2fp32_pass.Cast2Fp32QuantizePass method) (pymllm.mobile.quantize.kai.w4a32.W4A32KAIQuantizePass method) (pymllm.mobile.quantize.quantize_pass.QuantizeBasePass method) run_batch() (pymllm.orchestrator.scheduler_process.SchedulerProcess method) run_detokenizer_process() (in module pymllm.orchestrator.detokenizer_process) run_scheduler_process() (in module pymllm.orchestrator.scheduler_process) run_tokenizer_process() (in module pymllm.orchestrator.tokenizer_process) run_worker_tp8_cuda() (in module test_vocab_parallel_embedding) runner module S sample() (pymllm.executor.model_runner.ModelRunner method) sampling_backend (pymllm.configs.server_config.ServerConfig attribute) sampling_from_logits() (in module pymllm.layers.sampling) sampling_from_probs() (in module pymllm.layers.sampling) sampling_params (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.GenerateRequest attribute) SCALAR_TYPE_UINT4 (in module pymllm.quantization.methods.awq_marlin) SCALAR_TYPE_UINT8 (in module pymllm.quantization.methods.awq_marlin) scale (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ property) scaling (pymllm.layers.attention.radix_attention.RadixAttention attribute) (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) schedule_conservativeness (pymllm.configs.server_config.ServerConfig attribute) schedule_policy (pymllm.configs.server_config.ServerConfig attribute) ScheduleBatch (class in pymllm.orchestrator.scheduler_process) SchedulerProcess (class in pymllm.orchestrator.scheduler_process) seed (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) self_attn (pymllm.models.qwen3_5.Qwen3_5AttentionDecoderLayer attribute) (pymllm.models.qwen3_vl.Qwen3VLDecoderLayer attribute) send_request() (in module pymllm.mobile.service.rr_process) separate_reasoning (pymllm.server.launch.ChatCompletionRequest attribute) seq_len (pymllm.orchestrator.scheduler_process.Req attribute) seq_len_fill_value (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) seq_lens (pymllm.engine.forward_batch.ForwardBatch attribute) seq_lens_cpu (pymllm.engine.forward_batch.ForwardBatch attribute) seq_lens_sum (pymllm.engine.forward_batch.ForwardBatch attribute) served_model_name (pymllm.configs.server_config.ServerConfig attribute) server (pymllm.configs.global_config.GlobalConfig attribute) server_config (pymllm.executor.model_runner.ModelRunner attribute) server_info() (in module pymllm.server.launch) ServerConfig (class in pymllm.configs.server_config) Session (class in pymllm.mobile.ffi) session_params (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.server.launch.GenerateRequest attribute) session_qwen3() (in module pymllm.mobile.service.models_hub) set_global_graph_memory_pool() (in module pymllm.executor.cuda_graph_runner) set_kv_buffer() (pymllm.mem_cache.memory_pool.KVPool method) set_name() (pymllm.mobile.ffi.Tensor method) set_weight_attrs() (in module pymllm.layers.utils) setup_config() (test_vocab_parallel_embedding.TestVocabParallelEmbeddingCUDA method) setup_subprocess_logging() (in module pymllm.orchestrator.ipc_utils) sf (test_nn.FooModule attribute) shape (pymllm.mobile.convertor.model_file_v2.ModelFileV2ParamsDescriptor attribute) (pymllm.mobile.ffi.Tensor property) shape_len (pymllm.mobile.convertor.model_file_v2.ModelFileV2ParamsDescriptor attribute) SharedMemoryManager (class in pymllm.orchestrator.shared_memory_queue) ShellContext (class in pymllm.mobile.utils.adb) ShmSyncBuffer (class in pymllm.orchestrator.cuda_ipc_transport) should_use_tensor_core() (in module pymllm.layers.attention.flashinfer_backend) show_config() (in module pymllm.__main__) show_time_cost (pymllm.configs.server_config.ServerConfig attribute) shutdown() (pymllm.engine.launch.Engine method) (pymllm.executor.cuda_graph_runner.CudaGraphRunner method) (pymllm.executor.model_runner.ModelRunner method) (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryPool method) (pymllm.orchestrator.detokenizer_process.DetokenizerProcess method) (pymllm.orchestrator.model_runner_process.ModelRunnerProcess method) (pymllm.orchestrator.request_response_process.RequestResponseProcess method) (pymllm.orchestrator.scheduler_process.SchedulerProcess method) (pymllm.orchestrator.tokenizer_process.TokenizerProcess method) shutdown_context() (in module pymllm.mobile.ffi) SiLU (C++ class) SiLU::SiLU::SiLU (C++ function), [1] sin_embedding_input_qdq (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3Model attribute) size (pymllm.mem_cache.memory_pool.KVPool attribute) (pymllm.mem_cache.memory_pool.ReqToTokenPool attribute) (pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator attribute) SIZE (pymllm.mobile.convertor.model_file_v2.ModelFileV2Descriptor attribute) (pymllm.mobile.convertor.model_file_v2.ModelFileV2ParamsDescriptor attribute) skip_prefill (pymllm.layers.attention.flashinfer_backend.FlashInferAttnBackend attribute) skip_special_tokens (pymllm.engine.io_struct.BatchTokenIDOutput attribute) sleep() (pymllm.orchestrator.scheduler_process.IdleSleeper method) sleep_on_idle (pymllm.configs.server_config.ServerConfig attribute) SLIDING_WINDOW (pymllm.layers.attention.flashinfer_backend.WrapperDispatch attribute) sliding_window_size (pymllm.executor.model_runner.ModelRunner property) (pymllm.layers.attention.radix_attention.RadixAttention attribute) (pymllm.mem_cache.radix_cache.RadixCache attribute) Softmax (C++ class) softmax() (in module pymllm.layers.sampling) Softmax::Softmax::Softmax (C++ function), [1], [2] SoftmaxOp (class in pymllm.mobile.ffi) SoftmaxOpOptions (class in pymllm.mobile.ffi) spatial_merge_size (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) split_tensor_along_dim() (in module pymllm.orchestrator.group_coordinator) squeeze() (pymllm.mobile.ffi.Tensor method) start (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryChunk property) start() (pymllm.orchestrator.request_response_process.RequestResponseProcess method) start_service() (in module pymllm.mobile.service.rr_process) static_write() (pymllm.mobile.convertor.model_file_v2.ModelFileV2 method) STFT (C++ class) STFT::STFT::STFT (C++ function), [1], [2] stop (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) stop_service() (in module pymllm.mobile.service.rr_process) stop_token_ids (pymllm.orchestrator.scheduler_process.Req attribute) stream (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.mobile.service.network.ChatCompletionRequest attribute) (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) (pymllm.server.launch.GenerateRequest attribute) stream_interval (pymllm.configs.server_config.ServerConfig attribute) stream_options (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) stream_output (pymllm.configs.server_config.ServerConfig attribute) stream_output() (pymllm.orchestrator.scheduler_process.SchedulerProcess method) stream_quantize() (pymllm.mobile.quantize.solver.QuantizeSolver method) stream_quantize_params_size() (pymllm.mobile.quantize.solver.QuantizeSolver method) stream_reasoning (pymllm.server.launch.ChatCompletionRequest attribute) streaming_write() (pymllm.mobile.convertor.model_file_v2.ModelFileV2 method) StreamOptions (class in pymllm.server.launch) sum() (pymllm.mobile.ffi.Tensor method) SUPPORTED (pymllm.parsers.reasoning_parser.ReasoningParser attribute) (pymllm.parsers.tool_call_parser.ToolCallParser attribute) supports_gradient_checkpointing (modeling_llama.LlamaPreTrainedModel attribute) (modeling_qwen2.Qwen2PreTrainedModel attribute) (modeling_qwen3.Qwen3PreTrainedModel attribute) supports_swa (pymllm.mem_cache.radix_cache.RadixCache property) swa_boundary_id (pymllm.mem_cache.radix_cache.TreeNode attribute) swa_evictable_size() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) swa_evicted (pymllm.mem_cache.base_prefix_cache.EvictResult attribute) swa_lock_ref (pymllm.mem_cache.radix_cache.TreeNode attribute) swa_protected_size() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) swa_tombstone (pymllm.mem_cache.radix_cache.TreeNode attribute) sync_data_meta (pymllm.orchestrator.cuda_ipc_transport.CudaIpcTensorTransportProxy attribute) sync_flag (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryChunk attribute) T T (pymllm.mobile.ffi.Tensor property) tail (pymllm.mem_cache.mamba_radix_cache.LRUList attribute) take_screenshot() (pymllm.mobile.utils.adb.ADBToolkit method) temperature (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) temporal_patch_size (pymllm.models.qwen3_vl.Qwen3VLVisionModel attribute) (pymllm.models.qwen3_vl.Qwen3VLVisionPatchEmbed attribute) Tensor (class in pymllm.mobile.ffi) Tensor::abs (C++ function) Tensor::alloc (C++ function) Tensor::allocExtraTensorView (C++ function) Tensor::arange (C++ function) Tensor::at (C++ function) Tensor::bytes (C++ function) Tensor::clip (C++ function) Tensor::clone (C++ function) Tensor::coffsettedPtr (C++ function) Tensor::constAt (C++ function) Tensor::contiguous (C++ function) Tensor::copy2 (C++ function) Tensor::cptrAt (C++ function) Tensor::cpu (C++ function) Tensor::cuda (C++ function) Tensor::delete_ (C++ function) Tensor::device (C++ function) Tensor::dtype (C++ function) Tensor::empty (C++ function) Tensor::fromVector (C++ function) Tensor::getExtraTensorViewInTensor (C++ function) Tensor::isContiguous (C++ function) Tensor::isContiguousN (C++ function) Tensor::isNil (C++ function) Tensor::max (C++ function) Tensor::mean (C++ function) Tensor::memType (C++ function) Tensor::min (C++ function) Tensor::name (C++ function) Tensor::nil (C++ function) Tensor::numel (C++ function) Tensor::offsettedPtr (C++ function) Tensor::ones (C++ function) Tensor::operator bool (C++ function) Tensor::operator delete (C++ function) Tensor::operator* (C++ function), [1] Tensor::operator+ (C++ function), [1] Tensor::operator- (C++ function), [1], [2] Tensor::operator/ (C++ function), [1] Tensor::operator[] (C++ function), [1] Tensor::permute (C++ function) Tensor::ptr (C++ function) Tensor::ptrAt (C++ function) Tensor::random (C++ function) Tensor::repeat (C++ function) Tensor::reshape (C++ function) Tensor::setMemType (C++ function) Tensor::setName (C++ function) Tensor::shape (C++ function) Tensor::stride (C++ function) Tensor::sum (C++ function) Tensor::T (C++ function) Tensor::to (C++ function), [1] Tensor::topk (C++ function) Tensor::transpose (C++ function) Tensor::unsqueeze (C++ function) Tensor::uuid (C++ function) Tensor::view (C++ function) Tensor::zeros (C++ function) tensor_model_parallel_all_gather() (in module pymllm.orchestrator.parallel_state) tensor_model_parallel_all_reduce() (in module pymllm.orchestrator.parallel_state) tensor_transport_mode (pymllm.configs.server_config.ServerConfig attribute) TensorQueue (class in pymllm.orchestrator.shared_memory_queue) TensorTransportMode (in module pymllm.orchestrator.cuda_ipc_transport) test_context_create module test_cuda_forward() (test_vocab_parallel_embedding.TestVocabParallelEmbeddingCUDA method) test_cuda_weight_loader() (test_vocab_parallel_embedding.TestVocabParallelEmbeddingCUDA method) test_empty_tensor_create() (in module test_tensor) test_forward_pass_tp8_real() (test_vocab_parallel_embedding.TestVocabParallelEmbeddingRealTP8 method) test_is_torch_available() (in module test_tensor) test_nn module test_tensor module test_vocab_parallel_embedding module test_weight_loading_tp8_real() (test_vocab_parallel_embedding.TestVocabParallelEmbeddingRealTP8 method) TestVocabParallelEmbeddingCUDA (class in test_vocab_parallel_embedding) TestVocabParallelEmbeddingRealTP8 (class in test_vocab_parallel_embedding) text (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.GenerateRequest attribute) to() (pymllm.mobile.ffi.Tensor method) to_batch_dict() (pymllm.orchestrator.scheduler_process.ScheduleBatch method) to_openai_dict() (pymllm.parsers.tool_call_parser.ToolCallItem method) to_pod() (pymllm.mobile.ffi.Device method) (pymllm.mobile.ffi.DType method) to_request_dict() (pymllm.engine.io_struct.GenerateReqInput method) tobytes() (pymllm.mobile.ffi.Tensor method) token_ids (pymllm.mem_cache.base_prefix_cache.RadixKey attribute) token_to_kv_pool (pymllm.engine.forward_batch.ForwardBatch attribute) (pymllm.executor.model_runner.ModelRunner attribute) token_to_kv_pool_allocator (pymllm.executor.model_runner.ModelRunner attribute) TokenizedGenerateReqInput (class in pymllm.engine.io_struct) tokenizer (runner.LlamaQuantizer attribute) tokenizer_mode (pymllm.configs.server_config.ServerConfig attribute) tokenizer_path (pymllm.configs.server_config.ServerConfig attribute) TokenizerProcess (class in pymllm.orchestrator.tokenizer_process) TokenToKVPoolAllocator (class in pymllm.mem_cache.memory_pool) Tool (class in pymllm.server.launch) tool_call_id (pymllm.parsers.tool_call_parser.ToolCallItem attribute) (pymllm.server.launch.ChatMessage attribute) tool_call_parser (pymllm.configs.server_config.ServerConfig attribute) tool_calls (pymllm.server.launch.ChatMessage attribute) tool_choice (pymllm.server.launch.ChatCompletionRequest attribute) ToolCallItem (class in pymllm.parsers.tool_call_parser) ToolCallParser (class in pymllm.parsers.tool_call_parser) ToolFunction (class in pymllm.server.launch) tools (pymllm.server.launch.ChatCompletionRequest attribute) top_k (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) top_k_mask_logits() (in module pymllm.layers.sampling) top_k_renorm_prob (in module pymllm.layers.sampling) top_k_renorm_probs() (in module pymllm.layers.sampling) top_k_sampling_from_probs() (in module pymllm.layers.sampling) top_k_top_p_sampling_from_logits() (in module pymllm.layers.sampling) top_k_top_p_sampling_from_probs() (in module pymllm.layers.sampling) top_logprobs (pymllm.server.launch.ChatCompletionRequest attribute) top_logprobs_num (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.engine.io_struct.TokenizedGenerateReqInput attribute) (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.GenerateRequest attribute) top_logprobs_nums (pymllm.engine.forward_batch.ForwardBatch attribute) top_p (pymllm.orchestrator.scheduler_process.Req attribute) (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) top_p_renorm_prob (in module pymllm.layers.sampling) top_p_renorm_probs() (in module pymllm.layers.sampling) top_p_sampling_from_probs() (in module pymllm.layers.sampling) torch_compile_max_bs (pymllm.configs.server_config.ServerConfig attribute) (pymllm.executor.cuda_graph_runner.CudaGraphRunner attribute) total_size() (pymllm.mem_cache.base_prefix_cache.BasePrefixCache method) (pymllm.mem_cache.mamba_radix_cache.MambaRadixCache method) (pymllm.mem_cache.radix_cache.RadixCache method) touch_mru() (pymllm.mem_cache.mamba_radix_cache.LRUList method) touch_node_and_parents_mru() (pymllm.mem_cache.mamba_radix_cache.LRUList method) tp_k_head_num (pymllm.layers.attention.radix_attention.RadixAttention attribute) tp_q_head_num (pymllm.layers.attention.radix_attention.RadixAttention attribute) tp_rank (pymllm.layers.embedding.VocabParallelEmbedding attribute) (pymllm.layers.linear.ColumnParallelLinear attribute) (pymllm.layers.linear.RowParallelLinear attribute) tp_size (pymllm.layers.embedding.VocabParallelEmbedding attribute) (pymllm.layers.linear.ColumnParallelLinear attribute) (pymllm.layers.linear.RowParallelLinear attribute) tp_v_head_num (pymllm.layers.attention.radix_attention.RadixAttention attribute) train module transport_mode (pymllm.orchestrator.cuda_ipc_transport.TransportProxyTensor property) TransportProxyTensor (class in pymllm.orchestrator.cuda_ipc_transport) transpose() (pymllm.mobile.ffi.Tensor method) TreeNode (class in pymllm.mem_cache.radix_cache) trust_remote_code (pymllm.configs.server_config.ServerConfig attribute) try_to_recycle() (pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryChunk method) type (pymllm.server.launch.ContentPart attribute) (pymllm.server.launch.Tool attribute) U uint16 (in module pymllm.mobile.ffi) uint16_() (in module pymllm.mobile.ffi) uint32 (in module pymllm.mobile.ffi) uint32_() (in module pymllm.mobile.ffi) uint64 (in module pymllm.mobile.ffi) uint64_() (in module pymllm.mobile.ffi) uint8 (in module pymllm.mobile.ffi) uint8_() (in module pymllm.mobile.ffi) uninstall_app() (pymllm.mobile.utils.adb.ADBToolkit method) unpack_cols() (in module pymllm.quantization.methods.awq_marlin) UnquantizedLinearMethod (class in pymllm.layers.quantize_base) unsqueeze() (pymllm.mobile.ffi.Tensor method) unwrap_mm_inputs_from_ipc() (in module pymllm.orchestrator.cuda_ipc_transport) up_proj (pymllm.layers.mlp.ParallelMLP attribute) update_mode (pymllm.mobile.convertor.model_file_v2.ModelFileV2 attribute) url (pymllm.server.launch.ImageUrl attribute) use_fused_gate_up_proj (pymllm.layers.mlp.MLP attribute) use_fused_qkv (pymllm.models.qwen3_vl.Qwen3VLAttention attribute) use_postshuffle_norm (pymllm.models.qwen3_vl.Qwen3VLVisionPatchMerger attribute) use_ragged (pymllm.layers.attention.flashinfer_backend.PrefillMetadata attribute) user (pymllm.server.launch.ChatCompletionRequest attribute) (pymllm.server.launch.CompletionRequest attribute) V v2_file_header (pymllm.mobile.convertor.model_file_v2.ModelFileV2 attribute) v2_param_descriptor (pymllm.mobile.convertor.model_file_v2.ModelFileV2 attribute) v_buffer (pymllm.mem_cache.memory_pool.KVPool attribute) v_head_dim (pymllm.layers.attention.radix_attention.RadixAttention attribute) (pymllm.mem_cache.memory_pool.KVPool attribute) v_head_num (pymllm.mem_cache.memory_pool.KVPool attribute) v_proj (pymllm.models.qwen3_5.Qwen3_5FullAttention attribute) validate_concat_observer() (runner.LlamaQuantizer method) validate_concat_observer_fn() (in module runner) value (pymllm.mem_cache.mamba_radix_cache.MambaTreeNode attribute) (pymllm.mem_cache.radix_cache.TreeNode attribute) value_dim (pymllm.layers.gated_delta_net.GatedDeltaNet attribute) verify_marlin_supported() (in module pymllm.quantization.methods.awq_marlin) verify_marlin_supports_shape() (in module pymllm.quantization.methods.awq_marlin) version (pymllm.mobile.convertor.model_file_v2.ModelFileV2Descriptor attribute) video_data (pymllm.engine.io_struct.GenerateReqInput attribute) (pymllm.server.launch.GenerateRequest attribute) video_token_id (pymllm.models.qwen3_5.Qwen3_5ForConditionalGeneration attribute) (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration attribute) view() (pymllm.mobile.ffi.Tensor method) vision_start_token_id (pymllm.models.qwen3_vl.Qwen3VLForConditionalGeneration attribute) VisionRoPE (C++ class) VisionRoPE::VisionRoPE::VisionRoPE (C++ function), [1], [2] vocab_end_index (pymllm.layers.embedding.VocabParallelEmbedding attribute) vocab_size (modeling_llama.LlamaForCausalLM attribute) (modeling_llama.LlamaModel attribute) (modeling_qwen2.Qwen2ForCausalLM attribute) (modeling_qwen2.Qwen2Model attribute) (modeling_qwen3.Qwen3ForCausalLM attribute) (modeling_qwen3.Qwen3Model attribute) (pymllm.executor.model_runner.ModelRunner attribute) (pymllm.models.qwen3_5.Qwen3_5ForCausalLM attribute) vocab_start_index (pymllm.layers.embedding.VocabParallelEmbedding attribute) VocabParallelEmbedding (class in pymllm.layers.embedding) W W4A32KAIQuantizePass (class in pymllm.mobile.quantize.kai.w4a32) weight (pymllm.layers.embedding.VocabParallelEmbedding attribute) (pymllm.layers.gated_delta_net.GDNConv1d attribute) (pymllm.layers.layer_norm.LayerNorm attribute) (pymllm.layers.rms_norm.GemmaRMSNorm attribute) (pymllm.layers.rms_norm.RMSNorm attribute) (pymllm.layers.rms_norm_gated.RMSNormGated attribute) (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinear attribute) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm attribute) weight_bits (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig attribute) weight_fake_quant (pymllm.mobile.backends.qualcomm.transformers.core.embedding.QEmbedding attribute) (pymllm.mobile.backends.qualcomm.transformers.core.rms_norm.QRMSNorm attribute) weight_loader() (pymllm.layers.base.MllmBaseLayer method) (pymllm.layers.embedding.VocabParallelEmbedding method) (pymllm.layers.linear.ColumnParallelLinear method) (pymllm.layers.linear.RowParallelLinear method) weight_loading_tp8_worker_cuda() (in module test_vocab_parallel_embedding) weight_quant (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinear attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearLPBQ attribute) (pymllm.mobile.backends.qualcomm.transformers.core.qlinear.QLinearW8A16_PerChannelSym attribute) world_size (pymllm.orchestrator.group_coordinator.GroupCoordinator attribute) wrap_mm_inputs_for_ipc() (in module pymllm.orchestrator.cuda_ipc_transport) WrapperDispatch (class in pymllm.layers.attention.flashinfer_backend) write() (pymllm.mem_cache.memory_pool.ReqToTokenPool method) write_metadata() (pymllm.orchestrator.shared_memory_queue.SharedMemoryManager static method) X x (in module test_nn) Z zero_point (pymllm.mobile.backends.qualcomm.transformers.core.qdq.FixedActivationQDQ property) (pymllm.quantization.methods.awq_marlin.AWQMarlinConfig attribute) zeros() (in module pymllm.mobile.ffi)