feat: add MiniCPM4 Eagle speculative decoding by wooway777 · Pull Request #470 · InfiniTensor/InfiniLM

wooway777 · 2026-07-01T08:35:59Z

python examples/test_infer.py   --model=/data-aisoft/mechdancer/models/openbmb_MiniCPM4-8B/   --draft-model=/data-aisoft/mechdancer/models/openbmb_MiniCPM4-8B-Eagle-vLLM/   --enable-paged-attn --attn=flash-attn --num-draft-tokens=4

python examples/test_infer.py   --model=/data-aisoft/mechdancer/models/openbmb_MiniCPM4-8B/    --enable-paged-attn --attn=flash-attn

Qwen 3

MiniCPM V

VideoNSA

wooway777 · 2026-07-01T09:00:32Z

/test

github-actions · 2026-07-01T09:00:51Z

✅ Started CI workflow run 28506016186 for commit ca4f321 on branch feat/minicpm4-and-eagle-speculative (triggered by /test).

pengcheng888 · 2026-07-02T04:40:33Z

            handle_oom_and_exit(e)
            raise

+    def forward_raw(


只有forward_raw才有 visual_token_ranges, 是这样么

forward_raw 和 forward 两个函数的代码重复度太高了

这个是因为rank worker依赖forward_raw函数名去做run_raw的操作

改了一版不用forward_raw的

pengcheng888 · 2026-07-02T05:17:12Z

+namespace infinilm::models::minicpm_eagle {
+
+namespace {
+float eagle_residual_scale(const std::shared_ptr<infinilm::config::ModelConfig> &model_config) {
+    const float scale_depth = model_config->get_or<float>("scale_depth", 1.0f);
+    const float mup_denominator = model_config->get_or<float>("mup_denominator", 1.0f);
+    return scale_depth / std::sqrt(mup_denominator);
+}
+} // namespace
+
+MiniCPMEagleAttention::MiniCPMEagleAttention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
+                                             size_t layer_idx,
+                                             const infinicore::Device &device)
+    : Attention(model_config, layer_idx, device) {
+    o_proj_->set_alpha(eagle_residual_scale(model_config));
+}
+
+MiniCPMEagleMLP::MiniCPMEagleMLP(std::shared_ptr<infinilm::config::ModelConfig> model_config,
+                                 const infinicore::Device &device)
+    : MLP(model_config, device) {
+    down_proj_->set_alpha(eagle_residual_scale(model_config));
+}
+
+MiniCPMEagleDecoderLayer::MiniCPMEagleDecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
+                                                   size_t layer_idx,
+                                                   const infinicore::Device &device) {
+    const auto &dtype = model_config->get_dtype();
+    const size_t hidden_size = model_config->get<size_t>("hidden_size");
+    const double rms_norm_eps = model_config->get<double>("rms_norm_eps");
+
+    INFINICORE_NN_MODULE_INIT(input_layernorm, hidden_size, rms_norm_eps, dtype, device);
+    INFINICORE_NN_MODULE_INIT(self_attn, model_config, layer_idx, device);
+    INFINICORE_NN_MODULE_INIT(post_attention_layernorm, hidden_size, rms_norm_eps, dtype, device);
+    INFINICORE_NN_MODULE_INIT(mlp, model_config, device);
+}
+
+infinicore::Tensor MiniCPMEagleDecoderLayer::forward(const infinicore::Tensor &positions,
+                                                     const infinicore::Tensor &hidden_states) const {
+    auto residual = hidden_states;
+    auto x = input_layernorm_->forward(hidden_states);
+    x = self_attn_->forward(positions, x);
+    x = infinicore::op::add(residual, x);
+
+    residual = x;
+    x = post_attention_layernorm_->forward(x);
+    x = mlp_->forward(x);
+    return infinicore::op::add(residual, x);
+}
+
+MiniCPMEagleModel::MiniCPMEagleModel(std::shared_ptr<infinilm::config::ModelConfig> model_config,
+                                     const infinicore::Device &device)
+    : dtype_(model_config->get_dtype()),
+      device_(device),
+      hidden_size_(model_config->get<size_t>("hidden_size")) {
+    const size_t vocab_size = model_config->get<size_t>("vocab_size");
+    const size_t num_hidden_layers = model_config->get<size_t>("num_hidden_layers");
+    const double rms_norm_eps = model_config->get<double>("rms_norm_eps");
+
+    INFINICORE_NN_MODULE_INIT(embed_tokens, vocab_size, hidden_size_, std::nullopt, dtype_, device_);
+    INFINICORE_NN_MODULE_INIT(input_norm1, hidden_size_, rms_norm_eps, dtype_, device_);
+    INFINICORE_NN_MODULE_INIT(input_norm2, hidden_size_, rms_norm_eps, dtype_, device_);
+    INFINICORE_NN_MODULE_INIT(fc, hidden_size_ * 2, hidden_size_, false, dtype_, device_);
+
+    eagle_layers_.reserve(num_hidden_layers);
+    for (size_t i = 0; i < num_hidden_layers; ++i) {
+        eagle_layers_.push_back(this->register_module<MiniCPMEagleDecoderLayer>("eagle_layers." + std::to_string(i), model_config, i, device_));
+    }
+
+    INFINICORE_NN_MODULE_INIT(norm, hidden_size_, rms_norm_eps, dtype_, device_);
+}
+
+infinicore::Tensor MiniCPMEagleModel::embed_input_ids(const infinicore::Tensor &input_ids) const {
+    return embed_tokens_->forward(input_ids);
+}
+
+infinicore::Tensor MiniCPMEagleModel::forward_with_hidden(const infinicore::Tensor &input_ids,
+                                                          const infinicore::Tensor &position_ids,
+                                                          const infinicore::Tensor &target_hidden_states) const {
+    auto input_embeds = input_norm1_->forward(embed_input_ids(input_ids));
+    auto target_hidden = input_norm2_->forward(target_hidden_states);
+    auto fused_shape = input_embeds->shape();
+    fused_shape.back() = hidden_size_ * 2;
+    auto fused_input = infinicore::Tensor::empty(fused_shape, input_embeds->dtype(), input_embeds->device());
+    fused_input->narrow({{fused_shape.size() - 1, 0, hidden_size_}})->copy_from(input_embeds);
+    fused_input->narrow({{fused_shape.size() - 1, hidden_size_, hidden_size_}})->copy_from(target_hidden);
+    auto hidden_states = fc_->forward(fused_input);
+
+    for (const auto &layer : eagle_layers_) {
+        hidden_states = layer->forward(position_ids, hidden_states);
+    }
+
+    return hidden_states;
+}
+
+infinicore::Tensor MiniCPMEagleModel::forward(const infinilm::InfinilmModel::Input &input) const {
+    auto input_ids = input.input_ids.value();
+    auto positions = input.position_ids.value();
+    auto zero_hidden_states = infinicore::Tensor::zeros({input_ids->shape()[0], input_ids->shape()[1], hidden_size_}, dtype_, device_);
+    return forward_with_hidden(input_ids, positions, zero_hidden_states);
+}
+
+MiniCPMEagleForCausalLM::MiniCPMEagleForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> model_config,
+                                                 const infinicore::Device &device) {
+    model_config_ = model_config;
+    const auto &dtype = model_config->get_dtype();
+    const size_t hidden_size = model_config->get<size_t>("hidden_size");
+    const size_t vocab_size = model_config->get<size_t>("vocab_size");
+
+    INFINICORE_NN_MODULE_INIT(model, model_config, device);
+    INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device);
+
+    if (model_config->get_config_json().contains("dim_model_base")) {
+        const float dim_model_base = model_config->get<float>("dim_model_base");
+        lm_head_->set_alpha(dim_model_base / static_cast<float>(hidden_size));
+    }
+}
+
+infinilm::InfinilmModel::Output MiniCPMEagleForCausalLM::forward(const infinilm::InfinilmModel::Input &input) const {


为什么在MiniCPMEagleForCausalLM中没有看到mtp相关的权重

真正 Eagle/MTP 相关权重在 MiniCPMEagleModel 里面，不是在 MiniCPMEagleForCausalLM 外层直接展开：
embed_tokens
input_norm1
input_norm2
fc
eagle_layers
norm
lm_head（注意：lm_head 注册在外层 MiniCPMEagleForCausalLM 上，不在 MiniCPMEagleModel 内）
其中比较关键的 MTP/Eagle 特有部分是：
input_norm1
input_norm2
fc
它们对应 Eagle draft 的输入融合路径：把 draft token embedding 和 target/base model hidden states 拼起来，然后通过 fc 投影回 hidden size：
auto input_embeds = input_norm1_->forward(embed_input_ids(input_ids));
auto target_hidden = input_norm2_->forward(target_hidden_states);

auto fused_input = Tensor::empty(... hidden_size * 2 ...);
fused_input[..., :hidden_size] = input_embeds;
fused_input[..., hidden_size:] = target_hidden;

auto hidden_states = fc_->forward(fused_input);
也就是说，MTP/Eagle 的核心不是一个单独叫 mtp_* 的权重，而是这组 “token embedding + target hidden fusion” 权重，以及后面的 eagle_layers 和 lm_head。

我再理解理解

pengcheng888 · 2026-07-02T05:29:08Z

+from infinilm.modeling_utils import load_model_state_dict_by_file
+
+
+class SpeculativeRunner:


SpeculativeRunner和Runner这两个类的关系和代码层级，需要重新梳理一下，再改。

ma-hang · 2026-07-02T06:53:03Z

+                block_table, start_num_tokens + offset, total_token_ids
+            )
+            slots.append(slot)
+        return block_table, slots


预测token所在block的hash也可能会被更新，进而被同batch的后续请求复用，而这些token还不确定会被接受

当追加 token 后跨到新 block 时，说明前一个 block 已经满了。普通 decode 下，这个 full block 可以进入 prefix cache：
self.hash_to_block_id[current_hash] = last_block_id
但是 speculative verify 阶段不一样。它会临时把 draft tokens 追加进 KV cache，用 target 一次 verify 多个 token。此时这些 token 还没最终确认，后面可能被 reject，也可能被 correction token 替换。review 评论指出的就是这个问题：如果这个临时 full block 立刻更新 hash，同 batch 的后续请求可能复用到“还没最终确认、甚至可能会被改掉”的预测 token KV。
所以我没有新写一套 append 逻辑，而是在原来的 append_slot 上加了一个开关：
update_hash: bool = True
默认还是 True，普通 decode 行为不变。
然后 append_slots 也透传这个开关：
append_slots(..., update_hash=False)
SpeculativeRunner 里现在这样调用：
cache_manager.append_slots(
...,
update_hash=False,
)
这样做的原因是：speculative verify 用的批量追加最终还是通过 append_slot 跨 block 分配 slot；如果不改 append_slot，它内部仍然会在跨 block 时更新 hash，污染 prefix cache。
改原来的函数是为了让这条共享底层逻辑能“分配 slot 但不注册 prefix hash”。
普通推理还是：
append_slot(..., update_hash=True) # 默认
所以其他模型和普通服务路径不受影响。

代码已通过 update_hash=False 和 SpeculativeCacheOps.append_verify_slots() 处理

ma-hang · 2026-07-02T07:40:02Z

-            is_finished = self._check_request_finished(req, token_id)
+                if not holds_back:
+                    req.generated_text = last_committed_text + delta
+                    req._token_decode_offset = len(req.generated_token_ids)


last_committed_text = req.generated_text 为什么移到 if not holds_back 外？请确认逻辑正确。

已经按评论改到 if not holds_back 内，逻辑正确

ma-hang · 2026-07-02T07:42:44Z

        self.scheduled_requests = scheduled_requests
        self.num_requests = len(scheduled_requests)
        self.is_prefill = is_prefill
+        self.cache_manager = cache_manager


cache_manager 传给 model runner 是要做什么处理？非必要不应该传递。

cache_manager 传给 model runner：已经改成 SpeculativeCacheOps，不再透传完整 BlockManager

ma-hang

需要修改

wooway777 requested a review from a team July 1, 2026 08:35