lmdeploy.pytorch New Model Support#
lmdeploy.pytorch is designed to simplify the support of new models and the development of prototypes. Users can adapt new models according to their own needs.
Model Support#
Configuration Loading (Optional)#
lmdeploy.pytorch initializes the engine based on the model's configuration file. If the parameter naming of the model to be integrated differs from common models in transformers, parsing errors may occur. A custom ConfigBuilder can be added to parse the configuration.
# lmdeploy/pytorch/configurations/gemma.py

from lmdeploy.pytorch.config import ModelConfig

from .builder import AutoModelConfigBuilder


class GemmaModelConfigBuilder(AutoModelConfigBuilder):

    @classmethod
    def condition(cls, hf_config):
        # Check if hf_config is suitable for this builder
        return hf_config.model_type in ['gemma', 'gemma2']

    @classmethod
    def build(cls, hf_config, model_path: str = None):
        # Use the hf_config loaded by transformers
        # Construct the ModelConfig for the pytorch engine
        return ModelConfig(hidden_size=hf_config.hidden_size,
                           num_layers=hf_config.num_hidden_layers,
                           num_attention_heads=hf_config.num_attention_heads,
                           num_key_value_heads=hf_config.num_key_value_heads,
                           bos_token_id=hf_config.bos_token_id,
                           eos_token_id=hf_config.eos_token_id,
                           head_dim=hf_config.head_dim,
                           vocab_size=hf_config.vocab_size)
The lmdeploy.pytorch.check_env.check_model function can be used to verify whether the configuration can be parsed correctly.
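For example, a minimal sketch of such a check (the exact signature of check_model may differ between versions; here it is assumed to accept the local model path):

from lmdeploy.pytorch.check_env import check_model

# assumed usage: pass the local path (or hub id) of the model whose
# configuration should be parsed and validated
check_model('/path/to/gemma/model')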
Implementing the Model#
After ensuring that the configuration can be parsed correctly, you can start implementing the model logic. Taking the implementation of llama as an example, we need to create the model using the configuration file from transformers.
class LlamaForCausalLM(nn.Module):

    # Constructor, builds the model with the given config
    # ctx_mgr is the context manager, which can be used to pass engine
    # configurations or additional parameters
    def __init__(self,
                 config: LlamaConfig,
                 ctx_mgr: StepContextManager,
                 dtype: torch.dtype = None,
                 device: torch.device = None):
        super().__init__()
        self.config = config
        self.ctx_mgr = ctx_mgr
        # build LlamaModel
        self.model = LlamaModel(config, dtype=dtype, device=device)
        # build lm_head
        self.lm_head = build_rowwise_linear(config.hidden_size,
                                            config.vocab_size,
                                            bias=False,
                                            dtype=dtype,
                                            device=device)

    # Model inference function
    # It is recommended to use the same parameters as below
    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor,
        past_key_values: List[List[torch.Tensor]],
        attn_metadata: Any = None,
        inputs_embeds: torch.Tensor = None,
        **kwargs,
    ):
        hidden_states = self.model(
            input_ids=input_ids,
            position_ids=position_ids,
            past_key_values=past_key_values,
            attn_metadata=attn_metadata,
            inputs_embeds=inputs_embeds,
        )

        logits = self.lm_head(hidden_states)
        logits = logits.float()
        return logits
In addition to these, the following content also needs to be added:
class LlamaForCausalLM(nn.Module):

    ...

    # Indicates whether the model supports cudagraph
    # Can also be a callable object that receives the forward inputs and
    # dynamically determines whether cudagraph is supported
    support_cuda_graph = True

    # Builds model inputs
    # Returns a dictionary whose keys must be inputs to forward
    def prepare_inputs_for_generation(
        self,
        past_key_values: List[List[torch.Tensor]],
        inputs_embeds: Optional[torch.Tensor] = None,
        context: StepContext = None,
    ):
        ...

    # Loads weights
    # The input is an iterable over the key-value pairs of the state dict
    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        ...
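As a reference, below is a simplified sketch of what load_weights could look like for a llama-style model: it iterates over the checkpoint's (name, tensor) pairs, remaps the original q_proj/k_proj/v_proj and gate_proj/up_proj names onto the fused qkv_proj/gate_up_proj parameters, and hands each tensor to the load_weight helper. The mapping layout and the load_weight import mirror the in-tree llama implementation; verify both against the lmdeploy version you are using.

from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight


class LlamaForCausalLM(nn.Module):
    ...

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        # (fused param suffix, checkpoint suffix, shard id)
        stacked_params_mapping = [
            ('.qkv_proj', '.q_proj', 'q'),
            ('.qkv_proj', '.k_proj', 'k'),
            ('.qkv_proj', '.v_proj', 'v'),
            ('.gate_up_proj', '.gate_proj', 0),
            ('.gate_up_proj', '.up_proj', 1),
        ]
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            # rotary embedding buffers are rebuilt at runtime, skip them
            if 'rotary_emb.inv_freq' in name:
                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # redirect the checkpoint entry to the fused parameter
                name = name.replace(weight_name, param_name)
                param = params_dict[name]
                load_weight(param, loaded_weight, shard_id=shard_id)
                break
            else:
                # regular parameters are loaded as-is
                param = params_dict[name]
                load_weight(param, loaded_weight)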
We have encapsulated many fused operators to simplify model construction. These operators provide better support for features such as tensor parallelism and quantization. We encourage developers to use them whenever possible.
# Using the predefined build_merged_colwise_linear, SiluAndMul and
# build_rowwise_linear helps us build the model faster, without worrying
# about tensor parallelism, quantization, etc.
class LlamaMLP(nn.Module):

    def __init__(self,
                 config: LlamaConfig,
                 dtype: torch.dtype = None,
                 device: torch.device = None):
        super().__init__()
        quantization_config = getattr(config, 'quantization_config', None)
        # gate up
        self.gate_up_proj = build_merged_colwise_linear(
            config.hidden_size,
            [config.intermediate_size, config.intermediate_size],
            bias=config.mlp_bias,
            dtype=dtype,
            device=device,
            quant_config=quantization_config,
            is_tp=True,
        )

        # silu and mul
        self.act_fn = SiluAndMul(inplace=True)

        # down
        self.down_proj = build_rowwise_linear(config.intermediate_size,
                                              config.hidden_size,
                                              bias=config.mlp_bias,
                                              quant_config=quantization_config,
                                              dtype=dtype,
                                              device=device,
                                              is_tp=True)

    def forward(self, x):
        """forward."""
        gate_up = self.gate_up_proj(x)
        act = self.act_fn(gate_up)
        return self.down_proj(act)
Model Registration#
To ensure that the developed model implementation can be used properly, we also need to register the model in lmdeploy/pytorch/models/module_map.py.
MODULE_MAP.update({
    'LlamaForCausalLM':
    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.llama.LlamaForCausalLM',
})
If you do not wish to modify the model source code, you can also pass a custom module map from the outside, which makes it easier to integrate into other projects.
from lmdeploy import PytorchEngineConfig, pipeline
backend_config = PytorchEngineConfig(custom_module_map='/path/to/custom/module_map.py')
generator = pipeline(model_path, backend_config=backend_config)
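As a sketch of what such a file could contain (the MODULE_MAP variable name and the dotted-path values follow the format of the built-in module_map.py; my_package and CustomLlamaForCausalLM are placeholders for your own importable module and class):

# /path/to/custom/module_map.py (illustrative contents)
MODULE_MAP = {
    # architecture name from config.json -> importable path of your implementation
    'LlamaForCausalLM': 'my_package.modeling_llama.CustomLlamaForCausalLM',
}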