feat: add MiniCPM4 Eagle speculative decoding#470
Conversation
|
/test |
|
✅ Started CI workflow run 28506016186 for commit |
| handle_oom_and_exit(e) | ||
| raise | ||
|
|
||
| def forward_raw( |
There was a problem hiding this comment.
只有 forward_raw 才有 visual_token_ranges，是这样吗？
There was a problem hiding this comment.
forward_raw 和 forward 两个函数的代码重复度太高了。
There was a problem hiding this comment.
这是因为 rank worker 依赖 forward_raw 这个函数名来执行 run_raw 操作。
There was a problem hiding this comment.
已改了一版不再使用 forward_raw 的实现。
| namespace infinilm::models::minicpm_eagle { | ||
|
|
||
| namespace { | ||
| float eagle_residual_scale(const std::shared_ptr<infinilm::config::ModelConfig> &model_config) { | ||
| const float scale_depth = model_config->get_or<float>("scale_depth", 1.0f); | ||
| const float mup_denominator = model_config->get_or<float>("mup_denominator", 1.0f); | ||
| return scale_depth / std::sqrt(mup_denominator); | ||
| } | ||
| } // namespace | ||
|
|
||
| MiniCPMEagleAttention::MiniCPMEagleAttention(std::shared_ptr<infinilm::config::ModelConfig> model_config, | ||
| size_t layer_idx, | ||
| const infinicore::Device &device) | ||
| : Attention(model_config, layer_idx, device) { | ||
| o_proj_->set_alpha(eagle_residual_scale(model_config)); | ||
| } | ||
|
|
||
| MiniCPMEagleMLP::MiniCPMEagleMLP(std::shared_ptr<infinilm::config::ModelConfig> model_config, | ||
| const infinicore::Device &device) | ||
| : MLP(model_config, device) { | ||
| down_proj_->set_alpha(eagle_residual_scale(model_config)); | ||
| } | ||
|
|
||
| MiniCPMEagleDecoderLayer::MiniCPMEagleDecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config, | ||
| size_t layer_idx, | ||
| const infinicore::Device &device) { | ||
| const auto &dtype = model_config->get_dtype(); | ||
| const size_t hidden_size = model_config->get<size_t>("hidden_size"); | ||
| const double rms_norm_eps = model_config->get<double>("rms_norm_eps"); | ||
|
|
||
| INFINICORE_NN_MODULE_INIT(input_layernorm, hidden_size, rms_norm_eps, dtype, device); | ||
| INFINICORE_NN_MODULE_INIT(self_attn, model_config, layer_idx, device); | ||
| INFINICORE_NN_MODULE_INIT(post_attention_layernorm, hidden_size, rms_norm_eps, dtype, device); | ||
| INFINICORE_NN_MODULE_INIT(mlp, model_config, device); | ||
| } | ||
|
|
||
| infinicore::Tensor MiniCPMEagleDecoderLayer::forward(const infinicore::Tensor &positions, | ||
| const infinicore::Tensor &hidden_states) const { | ||
| auto residual = hidden_states; | ||
| auto x = input_layernorm_->forward(hidden_states); | ||
| x = self_attn_->forward(positions, x); | ||
| x = infinicore::op::add(residual, x); | ||
|
|
||
| residual = x; | ||
| x = post_attention_layernorm_->forward(x); | ||
| x = mlp_->forward(x); | ||
| return infinicore::op::add(residual, x); | ||
| } | ||
|
|
||
| MiniCPMEagleModel::MiniCPMEagleModel(std::shared_ptr<infinilm::config::ModelConfig> model_config, | ||
| const infinicore::Device &device) | ||
| : dtype_(model_config->get_dtype()), | ||
| device_(device), | ||
| hidden_size_(model_config->get<size_t>("hidden_size")) { | ||
| const size_t vocab_size = model_config->get<size_t>("vocab_size"); | ||
| const size_t num_hidden_layers = model_config->get<size_t>("num_hidden_layers"); | ||
| const double rms_norm_eps = model_config->get<double>("rms_norm_eps"); | ||
|
|
||
| INFINICORE_NN_MODULE_INIT(embed_tokens, vocab_size, hidden_size_, std::nullopt, dtype_, device_); | ||
| INFINICORE_NN_MODULE_INIT(input_norm1, hidden_size_, rms_norm_eps, dtype_, device_); | ||
| INFINICORE_NN_MODULE_INIT(input_norm2, hidden_size_, rms_norm_eps, dtype_, device_); | ||
| INFINICORE_NN_MODULE_INIT(fc, hidden_size_ * 2, hidden_size_, false, dtype_, device_); | ||
|
|
||
| eagle_layers_.reserve(num_hidden_layers); | ||
| for (size_t i = 0; i < num_hidden_layers; ++i) { | ||
| eagle_layers_.push_back(this->register_module<MiniCPMEagleDecoderLayer>("eagle_layers." + std::to_string(i), model_config, i, device_)); | ||
| } | ||
|
|
||
| INFINICORE_NN_MODULE_INIT(norm, hidden_size_, rms_norm_eps, dtype_, device_); | ||
| } | ||
|
|
||
| infinicore::Tensor MiniCPMEagleModel::embed_input_ids(const infinicore::Tensor &input_ids) const { | ||
| return embed_tokens_->forward(input_ids); | ||
| } | ||
|
|
||
| infinicore::Tensor MiniCPMEagleModel::forward_with_hidden(const infinicore::Tensor &input_ids, | ||
| const infinicore::Tensor &position_ids, | ||
| const infinicore::Tensor &target_hidden_states) const { | ||
| auto input_embeds = input_norm1_->forward(embed_input_ids(input_ids)); | ||
| auto target_hidden = input_norm2_->forward(target_hidden_states); | ||
| auto fused_shape = input_embeds->shape(); | ||
| fused_shape.back() = hidden_size_ * 2; | ||
| auto fused_input = infinicore::Tensor::empty(fused_shape, input_embeds->dtype(), input_embeds->device()); | ||
| fused_input->narrow({{fused_shape.size() - 1, 0, hidden_size_}})->copy_from(input_embeds); | ||
| fused_input->narrow({{fused_shape.size() - 1, hidden_size_, hidden_size_}})->copy_from(target_hidden); | ||
| auto hidden_states = fc_->forward(fused_input); | ||
|
|
||
| for (const auto &layer : eagle_layers_) { | ||
| hidden_states = layer->forward(position_ids, hidden_states); | ||
| } | ||
|
|
||
| return hidden_states; | ||
| } | ||
|
|
||
| infinicore::Tensor MiniCPMEagleModel::forward(const infinilm::InfinilmModel::Input &input) const { | ||
| auto input_ids = input.input_ids.value(); | ||
| auto positions = input.position_ids.value(); | ||
| auto zero_hidden_states = infinicore::Tensor::zeros({input_ids->shape()[0], input_ids->shape()[1], hidden_size_}, dtype_, device_); | ||
| return forward_with_hidden(input_ids, positions, zero_hidden_states); | ||
| } | ||
|
|
||
| MiniCPMEagleForCausalLM::MiniCPMEagleForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> model_config, | ||
| const infinicore::Device &device) { | ||
| model_config_ = model_config; | ||
| const auto &dtype = model_config->get_dtype(); | ||
| const size_t hidden_size = model_config->get<size_t>("hidden_size"); | ||
| const size_t vocab_size = model_config->get<size_t>("vocab_size"); | ||
|
|
||
| INFINICORE_NN_MODULE_INIT(model, model_config, device); | ||
| INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device); | ||
|
|
||
| if (model_config->get_config_json().contains("dim_model_base")) { | ||
| const float dim_model_base = model_config->get<float>("dim_model_base"); | ||
| lm_head_->set_alpha(dim_model_base / static_cast<float>(hidden_size)); | ||
| } | ||
| } | ||
|
|
||
| infinilm::InfinilmModel::Output MiniCPMEagleForCausalLM::forward(const infinilm::InfinilmModel::Input &input) const { |
There was a problem hiding this comment.
为什么在 MiniCPMEagleForCausalLM 中没有看到 MTP 相关的权重？
There was a problem hiding this comment.
真正与 Eagle/MTP 相关的权重主要在 MiniCPMEagleModel 里面，而不是在外层的 MiniCPMEagleForCausalLM 中直接展开（外层只额外持有 lm_head）：
embed_tokens
input_norm1
input_norm2
fc
eagle_layers
norm
lm_head
其中比较关键的 MTP/Eagle 特有部分是:
input_norm1
input_norm2
fc
它们对应 Eagle draft 的输入融合路径:把 draft token embedding 和 target/base model hidden states 拼起来,然后通过 fc 投影回 hidden size:
auto input_embeds = input_norm1_->forward(embed_input_ids(input_ids));
auto target_hidden = input_norm2_->forward(target_hidden_states);
auto fused_input = Tensor::empty(... hidden_size * 2 ...);
fused_input[..., :hidden_size] = input_embeds;
fused_input[..., hidden_size:] = target_hidden;
auto hidden_states = fc_->forward(fused_input);
也就是说,MTP/Eagle 的核心不是一个单独叫 mtp_* 的权重,而是这组 “token embedding + target hidden fusion” 权重,以及后面的 eagle_layers 和 lm_head。
| from infinilm.modeling_utils import load_model_state_dict_by_file | ||
|
|
||
|
|
||
| class SpeculativeRunner: |
There was a problem hiding this comment.
SpeculativeRunner 和 Runner 这两个类的关系和代码层级需要先重新梳理一下，再做修改。
bde1010 to
8f7b58d
Compare
| block_table, start_num_tokens + offset, total_token_ids | ||
| ) | ||
| slots.append(slot) | ||
| return block_table, slots |
There was a problem hiding this comment.
预测 token 所在 block 的 hash 也可能会被更新，进而被同一 batch 的后续请求复用，而这些 token 还不确定会被接受。
There was a problem hiding this comment.
当追加 token 后跨到新 block 时,说明前一个 block 已经满了。普通 decode 下,这个 full block 可以进入 prefix cache:
self.hash_to_block_id[current_hash] = last_block_id
但是 speculative verify 阶段不一样。它会临时把 draft tokens 追加进 KV cache，用 target 一次 verify 多个 token。此时这些 token 还没最终确认，后面可能被 reject，也可能被 correction（修正）。review 说的就是这个问题：如果这个临时的 full block 立刻更新 hash，同一 batch 的后续请求可能复用到“还没最终确认、甚至可能会被改掉”的预测 token KV。
所以我没有新写一套 append 逻辑,而是在原来的 append_slot 上加了一个开关:
update_hash: bool = True
默认还是 True,普通 decode 行为不变。
然后 append_slots 也透传这个开关:
append_slots(..., update_hash=False)
SpeculativeRunner 里现在这样调用:
cache_manager.append_slots(
...,
update_hash=False,
)
这样做的原因是:speculative verify 用的批量追加最终还是通过 append_slot 跨 block 分配 slot;如果不改 append_slot,它内部仍然会在跨 block 时更新 hash,污染 prefix cache。
改原来的函数是为了让这条共享底层逻辑能“分配 slot 但不注册 prefix hash”。
普通推理还是:
append_slot(..., update_hash=True) # 默认
所以其他模型和普通服务路径不受影响。
There was a problem hiding this comment.
该问题已在代码中通过 update_hash=False 和 SpeculativeCacheOps.append_verify_slots() 处理。
77e3098 to
633940e
Compare
| is_finished = self._check_request_finished(req, token_id) | ||
| if not holds_back: | ||
| req.generated_text = last_committed_text + delta | ||
| req._token_decode_offset = len(req.generated_token_ids) |
There was a problem hiding this comment.
为什么把 last_committed_text = req.generated_text 移到 if not holds_back 之外？请确认逻辑是否正确。
There was a problem hiding this comment.
已经按评论改到 if not holds_back 内,逻辑正确
633940e to
0f2ec50
Compare
| self.scheduled_requests = scheduled_requests | ||
| self.num_requests = len(scheduled_requests) | ||
| self.is_prefill = is_prefill | ||
| self.cache_manager = cache_manager |
There was a problem hiding this comment.
把 cache_manager 传给 model runner 是要做什么处理？非必要不应该传递。
There was a problem hiding this comment.
cache_manager 传给 model runner：已经改成 SpeculativeCacheOps，不再透传完整的 BlockManager。
0f2ec50 to
ee90857
Compare
Qwen 3

MiniCPM V

VideoNSA
