Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
to_device_vec(image_bound),
to_device_vec(tgt_sizes),
visual_token_ranges,
to_device(target_hidden_states),
};

infinilm::global_state::get_forward_context().attn_metadata = {
Expand Down
2 changes: 1 addition & 1 deletion csrc/engine/infer_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class InferEngine {

std::vector<std::string> state_dict_keys();

// Run a single forward pass on all workers and return the outputs from all ranks
// Run a single forward pass on all workers and return sampled token IDs.
Output forward(const Input &input);

void compile();
Expand Down
24 changes: 17 additions & 7 deletions csrc/engine/rank_worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,8 +401,10 @@ void RankWorker::thread_loop() {
std::lock_guard<std::mutex> lk(mutex_);

infinicore::Tensor logits;
// Try to get compiled graph
if (compiler_ != nullptr) {
infinicore::Tensor hidden_states;
// All-position speculative/MTP runs need eager mode because
// hidden states are not part of compiled graph outputs.
if (!local_args.sample_all_positions && compiler_ != nullptr) {
auto [graph, output] = compiler_->get_compiled(local_args.to_model_input(infinicore::Device::cpu()));
if (graph != nullptr && output != nullptr) {
graph->run();
Expand All @@ -412,7 +414,9 @@ void RankWorker::thread_loop() {
// Fall back to eager mode
if (!logits) {
auto model_args = local_args.to_model_input(rank_info_.device);
logits = model_->forward(model_args).logits;
auto model_output = model_->forward(model_args);
logits = model_output.logits;
hidden_states = model_output.hidden_states;
}

// Random sampling (rank 0 only)
Expand All @@ -429,10 +433,16 @@ void RankWorker::thread_loop() {
auto n_req = local_args.input_offsets.value()->size(0) - 1;
int32_t *input_offsets = (int32_t *)local_args.input_offsets.value()->data();

auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)};
const bool sample_all_positions = local_args.sample_all_positions;
const size_t n_out = sample_all_positions ? static_cast<size_t>(input_offsets[n_req]) : n_req;
auto output_ids{infinicore::Tensor::empty({n_out}, infinicore::DataType::I64, rank_info_.device)};

for (auto i{decltype(n_req)(0)}; i < n_req; ++i) {
auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(input_offsets[i + 1] - 1), 1}})->view({vocab_size})};
for (size_t i{0}; i < n_out; ++i) {
size_t score_idx = i;
if (!sample_all_positions) {
score_idx = static_cast<size_t>(input_offsets[i + 1] - 1);
}
auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, score_idx, 1}})->view({vocab_size})};
auto out{output_ids->narrow({{0, i, 1}})->view({})};
float random_val = std::uniform_real_distribution<float>(0, 1)(rng_);
infinicore::op::random_sample_(
Expand All @@ -443,7 +453,7 @@ void RankWorker::thread_loop() {

infinicore::context::syncStream();

auto out{Output{output_ids}};
auto out{Output{output_ids, logits, hidden_states}};

output_ = std::move(out);
}
Expand Down
8 changes: 7 additions & 1 deletion csrc/engine/rank_worker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ class RankWorker {
std::optional<std::vector<size_t>> image_req_ids;
/// Flattened [start, end) visual token ranges in the packed language sequence.
std::optional<std::vector<size_t>> visual_token_ranges;
/// Target model hidden states for draft/MTP models.
std::optional<infinicore::Tensor> target_hidden_states;
/// Sample logits at every packed input position instead of one token per request.
bool sample_all_positions{false};

float temperature{1};

Expand All @@ -74,6 +78,8 @@ class RankWorker {

/// Result of one forward + sampling job, built by the worker thread.
struct Output {
/// Sampled token IDs (an I64 tensor filled by the sampling loop).
infinicore::Tensor output_ids;
/// Logits the token IDs were sampled from.
infinicore::Tensor logits;
/// Hidden states returned by the eager model forward.
/// NOTE(review): appears to stay default-constructed when a compiled graph
/// produced the logits — confirm before relying on it.
infinicore::Tensor hidden_states;
};

RankWorker(std::shared_ptr<infinilm::global_state::InfinilmConfig> infinilm_config,
Expand All @@ -96,7 +102,7 @@ class RankWorker {

std::vector<std::string> state_dict_keys();

// Submit a run (forward) job.
// Submit a run (forward + sampling) job.
void run(const Input &args);

// Reset the internal cache with a new configuration
Expand Down
4 changes: 4 additions & 0 deletions csrc/models/infinilm_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,15 @@ class InfinilmModel : public infinicore::nn::Module {
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
/// Flattened [start, end) visual token ranges in the packed language sequence.
std::optional<std::vector<size_t>> visual_token_ranges;
/// Target model hidden states consumed by draft/MTP models.
std::optional<infinicore::Tensor> target_hidden_states;
};

/// Values returned by a model's forward pass.
struct Output {
/// Logits produced by the forward pass.
infinicore::Tensor logits;
/// Optional final hidden states, used by MTP/Eagle draft models.
/// NOTE(review): "optional" here means the tensor may be left
/// default-constructed by models that do not provide it — confirm per model.
infinicore::Tensor hidden_states;
};

virtual ~InfinilmModel() = default;
Expand Down
173 changes: 173 additions & 0 deletions csrc/models/minicpm4/minicpm4_for_causal_lm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#include "minicpm4_for_causal_lm.hpp"
#include "../../global_state/global_state.hpp"
#include "../models_registry.hpp"

#include <cmath>
#include <stdexcept>
#include <string>

namespace infinilm::models::minicpm4 {

namespace {
/// Scale factor applied to the residual branches (attention and MLP outputs).
///
/// Evaluates `scale_depth / sqrt(D)`, where `D` is `mup_denominator` when the
/// config's `model_type` is "minicpm_eagle" and `num_hidden_layers` otherwise.
/// `scale_depth` and `mup_denominator` fall back to 1.0 when absent;
/// `num_hidden_layers` is read with `get()` and therefore has no fallback.
float residual_scale(const std::shared_ptr<infinilm::config::ModelConfig> &model_config) {
    const float depth = model_config->get_or<float>("scale_depth", 1.0f);
    const bool is_eagle = model_config->get_or<std::string>("model_type", "") == "minicpm_eagle";

    // Only the branch that applies is read, so the non-eagle key stays untouched for eagle configs.
    const float denominator = is_eagle
                                  ? model_config->get_or<float>("mup_denominator", 1.0f)
                                  : static_cast<float>(model_config->get<size_t>("num_hidden_layers"));
    return depth / std::sqrt(denominator);
}
} // namespace

/// Constructs the base Attention layer and then sets the output projection's
/// alpha to the MiniCPM residual scale derived from the config.
/// NOTE(review): set_alpha presumably scales the projection output — confirm.
MiniCPM4Attention::MiniCPM4Attention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
                                     size_t layer_idx,
                                     const infinicore::Device &device)
    : Attention(model_config, layer_idx, device) {
    const float o_proj_alpha = residual_scale(model_config);
    o_proj_->set_alpha(o_proj_alpha);
}

/// Constructs the base MLP and then sets the down projection's alpha to the
/// MiniCPM residual scale derived from the config.
/// NOTE(review): set_alpha presumably scales the projection output — confirm.
MiniCPM4MLP::MiniCPM4MLP(std::shared_ptr<infinilm::config::ModelConfig> model_config,
                         const infinicore::Device &device)
    : MLP(model_config, device) {
    const float down_proj_alpha = residual_scale(model_config);
    down_proj_->set_alpha(down_proj_alpha);
}

/// Builds one decoder layer from the model config.
///
/// Reads `hidden_size` and `rms_norm_eps` (both required) plus the dtype, then
/// initializes the four sub-modules in this order: input_layernorm, self_attn,
/// post_attention_layernorm, mlp.
///
/// @param model_config Shared model configuration.
/// @param layer_idx    Index of this layer; forwarded to the attention module.
/// @param device       Device the sub-modules are created on.
MiniCPM4DecoderLayer::MiniCPM4DecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
size_t layer_idx,
const infinicore::Device &device) {
const auto &dtype = model_config->get_dtype();
const size_t hidden_size = model_config->get<size_t>("hidden_size");
const double rms_norm_eps = model_config->get<double>("rms_norm_eps");

// NOTE(review): the init order presumably determines module registration order — keep it stable.
INFINICORE_NN_MODULE_INIT(input_layernorm, hidden_size, rms_norm_eps, dtype, device);
INFINICORE_NN_MODULE_INIT(self_attn, model_config, layer_idx, device);
INFINICORE_NN_MODULE_INIT(post_attention_layernorm, hidden_size, rms_norm_eps, dtype, device);
INFINICORE_NN_MODULE_INIT(mlp, model_config, device);
}

/// Runs the layer using the in-place norm path that carries an explicit residual.
///
/// `hidden_states` is reassigned through the reference, so the caller sees the
/// MLP output after the call. No residual add is performed on the MLP output
/// here; presumably the next `forward_inplace` call (next layer or the final
/// norm) folds it in — TODO confirm against RMSNorm::forward_inplace.
///
/// @param positions     Position ids forwarded to attention.
/// @param hidden_states In/out activation tensor handle.
/// @param residual      In/out residual tensor handle shared across layers.
/// @return Tuple of (hidden_states, residual) as they stand after the layer.
std::tuple<infinicore::Tensor, infinicore::Tensor> MiniCPM4DecoderLayer::forward(const infinicore::Tensor &positions,
infinicore::Tensor &hidden_states,
infinicore::Tensor &residual) {
// Norm before attention; also receives the running residual.
input_layernorm_->forward_inplace(hidden_states, residual);
hidden_states = self_attn_->forward(positions, hidden_states);
// Norm before the MLP; also receives the running residual.
post_attention_layernorm_->forward_inplace(hidden_states, residual);
hidden_states = mlp_->forward(hidden_states);
return std::make_tuple(hidden_states, residual);
}

/// Runs the layer with explicit residual adds (no shared residual tensor).
///
/// Computes `x = x + attn(norm(x))` followed by `x = x + mlp(norm(x))`.
/// `hidden_states` is reassigned through the reference, so the caller's handle
/// also ends up pointing at the returned tensor.
///
/// @param positions     Position ids forwarded to attention.
/// @param hidden_states In/out activation tensor handle.
/// @return The layer output (same handle value as `hidden_states` on exit).
infinicore::Tensor MiniCPM4DecoderLayer::forward(const infinicore::Tensor &positions,
                                                 infinicore::Tensor &hidden_states) {
    // Attention sub-block.
    {
        auto skip = hidden_states;
        hidden_states = input_layernorm_->forward(hidden_states);
        hidden_states = self_attn_->forward(positions, hidden_states);
        hidden_states = infinicore::op::add(skip, hidden_states);
    }

    // Feed-forward sub-block.
    {
        auto skip = hidden_states;
        hidden_states = post_attention_layernorm_->forward(hidden_states);
        hidden_states = mlp_->forward(hidden_states);
        hidden_states = infinicore::op::add(skip, hidden_states);
    }

    return hidden_states;
}

/// Builds the decoder stack: token embedding, `num_hidden_layers` decoder
/// layers registered as "layers.<i>", and the final RMSNorm.
///
/// Required config keys read here: vocab_size, hidden_size,
/// num_hidden_layers, rms_norm_eps.
///
/// @param model_config Shared model configuration.
/// @param device       Device the sub-modules are created on.
MiniCPM4Model::MiniCPM4Model(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device) {
const auto &dtype = model_config->get_dtype();
const size_t vocab_size = model_config->get<size_t>("vocab_size");
const size_t hidden_size = model_config->get<size_t>("hidden_size");
const size_t num_hidden_layers = model_config->get<size_t>("num_hidden_layers");
const double rms_norm_eps = model_config->get<double>("rms_norm_eps");

INFINICORE_NN_MODULE_INIT(embed_tokens, vocab_size, hidden_size, std::nullopt, dtype, device);
// Reserve up front so registering the layers does not reallocate.
layers_.reserve(num_hidden_layers);
for (size_t i = 0; i < num_hidden_layers; ++i) {
// The registered name encodes the layer index; the same index is passed to the layer.
layers_.push_back(this->register_module<MiniCPM4DecoderLayer>("layers." + std::to_string(i), model_config, i, device));
}
INFINICORE_NN_MODULE_INIT(norm, hidden_size, rms_norm_eps, dtype, device);
}

/// Runs the decoder stack and returns the final normalized hidden states.
///
/// Reads `input_ids` and `position_ids` from `input` via `.value()`, so both
/// must be set. Each layer is called through its three-argument overload,
/// which updates `hidden_states` and `residual` by reference; the returned
/// tuple is intentionally unused.
///
/// NOTE(review): no `scale_emb` factor is applied to the embeddings here —
/// confirm it is handled elsewhere (e.g. folded into the weights) if the
/// checkpoint expects it.
infinicore::Tensor MiniCPM4Model::forward(const infinilm::InfinilmModel::Input &input) const {
    auto token_ids = input.input_ids.value();
    auto position_ids = input.position_ids.value();

    auto hidden_states = embed_tokens_->forward(token_ids);

    // Starts default-constructed; the layers' in-place norms receive and update it.
    infinicore::Tensor residual;
    for (const auto &decoder_layer : layers_) {
        decoder_layer->forward(position_ids, hidden_states, residual);
    }

    norm_->forward_inplace(hidden_states, residual);
    return hidden_states;
}

/// Looks up token embeddings for `input_ids` without running the decoder layers.
infinicore::Tensor MiniCPM4Model::embed_tokens(const infinicore::Tensor &input_ids) const {
    auto embeddings = embed_tokens_->forward(input_ids);
    return embeddings;
}

/// Builds the decoder stack and the LM head.
///
/// When the config JSON contains `dim_model_base`, the LM head's alpha is set
/// to `dim_model_base / hidden_size`; otherwise alpha is left at its default.
///
/// @param model_config Shared model configuration (stored in model_config_).
/// @param device       Device the sub-modules are created on.
MiniCPM4ForCausalLM::MiniCPM4ForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device) {
model_config_ = model_config;
const auto &dtype = model_config->get_dtype();
const size_t hidden_size = model_config->get<size_t>("hidden_size");
const size_t vocab_size = model_config->get<size_t>("vocab_size");

INFINICORE_NN_MODULE_INIT(model, model_config, device);
// NOTE(review): the `false` argument is presumably the bias flag (LM head without bias) — confirm.
INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device);

// Logit scaling is optional: only applied when the checkpoint declares dim_model_base.
if (model_config->get_config_json().contains("dim_model_base")) {
const float dim_model_base = model_config->get<float>("dim_model_base");
lm_head_->set_alpha(dim_model_base / static_cast<float>(hidden_size));
}
}

/// Full forward pass: decoder stack followed by the LM head.
///
/// @return Output whose `hidden_states` is the decoder output and whose
///         `logits` is the LM head applied to those hidden states.
infinilm::InfinilmModel::Output MiniCPM4ForCausalLM::forward(const infinilm::InfinilmModel::Input &input) const {
    infinilm::InfinilmModel::Output result;
    result.hidden_states = forward_hidden(input);
    result.logits = lm_head_->forward(result.hidden_states);
    return result;
}

/// Runs only the decoder stack and returns its hidden states (no LM head).
infinicore::Tensor MiniCPM4ForCausalLM::forward_hidden(const Input &input) const {
    auto final_hidden = model_->forward(input);
    return final_hidden;
}

/// Applies the LM head to already-computed hidden states.
///
/// @param hidden_states Hidden states to project; never modified by this call.
/// @return Logits produced by the LM head.
infinicore::Tensor MiniCPM4ForCausalLM::logits_from_hidden(const infinicore::Tensor &hidden_states) const {
    // Hand the LM head a local, mutable handle instead of const_cast-ing the
    // caller's reference: if forward() ever rebinds or mutates its argument,
    // the caller's const tensor stays untouched.
    // NOTE(review): copying a Tensor is presumably a cheap handle copy — confirm.
    infinicore::Tensor lm_head_input = hidden_states;
    return lm_head_->forward(lm_head_input);
}

/// Validates a MiniCPM config and fills in the keys the shared layers expect.
///
/// Accepts `model_type` "minicpm4" or "minicpm". The config JSON is edited in
/// place:
///   - `head_dim` defaults to `hidden_size / num_attention_heads`;
///   - `rope_theta` defaults to 10000.0;
///   - a `bias` key, when present, overwrites `attention_bias` and `mlp_bias`;
///   - `attention_bias`, `mlp_bias`, `attention_output_bias` default to false.
///
/// @param model_config Config to validate and normalize.
/// @return The same config pointer that was passed in.
/// @throws std::runtime_error if `model_type` is unsupported, or if `head_dim`
///         must be derived and `num_attention_heads` is zero.
std::shared_ptr<infinilm::config::ModelConfig> create_minicpm4_model_config(std::shared_ptr<infinilm::config::ModelConfig> model_config) {
    const std::string &model_type = model_config->get<std::string>("model_type");
    if ("minicpm4" != model_type && "minicpm" != model_type) {
        throw std::runtime_error("infinilm::models::minicpm4::create_minicpm4_model_config: model_type is not minicpm4");
    }

    auto &json = model_config->get_config_json();
    if (!json.contains("head_dim")) {
        // Integer division by zero is undefined behavior, so reject a
        // malformed config explicitly instead of crashing.
        const size_t num_attention_heads = model_config->get<size_t>("num_attention_heads");
        if (num_attention_heads == 0) {
            throw std::runtime_error("infinilm::models::minicpm4::create_minicpm4_model_config: num_attention_heads must be non-zero to derive head_dim");
        }
        json["head_dim"] = model_config->get<size_t>("hidden_size") / num_attention_heads;
    }
    if (!json.contains("rope_theta")) {
        json["rope_theta"] = 10000.0;
    }
    // A single legacy `bias` flag takes precedence over the per-module flags.
    if (json.contains("bias")) {
        json["attention_bias"] = json["bias"];
        json["mlp_bias"] = json["bias"];
    }
    if (!json.contains("attention_bias")) {
        json["attention_bias"] = false;
    }
    if (!json.contains("mlp_bias")) {
        json["mlp_bias"] = false;
    }
    if (!json.contains("attention_output_bias")) {
        json["attention_output_bias"] = false;
    }
    return model_config;
}

} // namespace infinilm::models::minicpm4

// Associates the name `minicpm4` with the MiniCPM4ForCausalLM class and its
// config-normalization function.
// NOTE(review): the macro comes from models_registry.hpp and presumably
// registers during static initialization — confirm there.
namespace {
INFINILM_REGISTER_CAUSAL_LM_MODEL(
minicpm4,
infinilm::models::minicpm4::MiniCPM4ForCausalLM,
infinilm::models::minicpm4::create_minicpm4_model_config);
} // namespace
89 changes: 89 additions & 0 deletions csrc/models/minicpm4/minicpm4_for_causal_lm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#pragma once

#include "../../layers/common_modules.hpp"
#include "../../models/infinilm_model.hpp"
#include "infinicore/nn/embedding.hpp"
#include "infinicore/nn/rmsnorm.hpp"

#include <memory>
#include <tuple>
#include <vector>

namespace infinilm::models::minicpm4 {

/// Attention layer for MiniCPM4: the shared Attention implementation with a
/// MiniCPM-specific constructor (defined in the .cpp).
class MiniCPM4Attention : public infinilm::layers::attention::Attention {
public:
/// @param model_config Shared model configuration.
/// @param layer_idx    Index of the decoder layer owning this attention.
/// @param device       Device the layer is created on.
MiniCPM4Attention(std::shared_ptr<infinilm::config::ModelConfig> model_config,
size_t layer_idx,
const infinicore::Device &device);
};

/// MLP for MiniCPM4: the shared MLP implementation with a MiniCPM-specific
/// constructor (defined in the .cpp).
class MiniCPM4MLP : public infinilm::layers::mlp::MLP {
public:
/// @param model_config Shared model configuration.
/// @param device       Device the layer is created on.
MiniCPM4MLP(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device);
};

/// One MiniCPM4 decoder layer: norm, attention, norm, MLP.
class MiniCPM4DecoderLayer : public infinicore::nn::Module {
public:
/// @param model_config Shared model configuration.
/// @param layer_idx    Index of this layer within the stack.
/// @param device       Device the sub-modules are created on.
MiniCPM4DecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
size_t layer_idx,
const infinicore::Device &device);

/// Forward pass carrying an explicit residual tensor.
/// Both tensor arguments are in/out references; returns (hidden_states, residual).
std::tuple<infinicore::Tensor, infinicore::Tensor> forward(const infinicore::Tensor &positions,
infinicore::Tensor &hidden_states,
infinicore::Tensor &residual);

/// Forward pass without a separate residual tensor.
/// `hidden_states` is an in/out reference; the layer output is also returned.
infinicore::Tensor forward(const infinicore::Tensor &positions,
infinicore::Tensor &hidden_states);

/// Forwards the post-load hook to the attention and MLP sub-modules.
void process_weights_after_loading() override {
self_attn_->process_weights_after_loading();
mlp_->process_weights_after_loading();
}

/// Forwards the runtime-state reset to the attention and MLP sub-modules.
void reset_runtime_state() const override {
self_attn_->reset_runtime_state();
mlp_->reset_runtime_state();
}

protected:
// Sub-modules; each macro is expected to declare a `<name>_` member.
INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm);
INFINICORE_NN_MODULE(MiniCPM4Attention, self_attn);
INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm);
INFINICORE_NN_MODULE(MiniCPM4MLP, mlp);
};

/// MiniCPM4 decoder stack: token embedding, decoder layers, final norm.
class MiniCPM4Model : public infinicore::nn::Module {
public:
/// @param model_config Shared model configuration.
/// @param device       Device the sub-modules are created on.
MiniCPM4Model(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device);

/// Runs embedding, all decoder layers and the final norm; returns hidden states.
infinicore::Tensor forward(const infinilm::InfinilmModel::Input &input) const;

/// Returns the token embeddings for `input_ids` without running the layers.
infinicore::Tensor embed_tokens(const infinicore::Tensor &input_ids) const;

protected:
// Sub-modules; each macro is expected to declare a `<name>_` member.
INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens);
INFINICORE_NN_MODULE_VEC(MiniCPM4DecoderLayer, layers);
INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm);
};

/// MiniCPM4 causal language model: decoder stack plus LM head.
class MiniCPM4ForCausalLM : public InfinilmModel {
public:
/// @param model_config Shared model configuration.
/// @param device       Device the sub-modules are created on.
MiniCPM4ForCausalLM(std::shared_ptr<infinilm::config::ModelConfig> model_config,
const infinicore::Device &device);

/// Full forward pass; returns both logits and the final hidden states.
Output forward(const Input &input) const override;

/// Runs only the decoder stack and returns the hidden states.
infinicore::Tensor forward_hidden(const Input &input) const;

/// Applies the LM head to precomputed hidden states.
infinicore::Tensor logits_from_hidden(const infinicore::Tensor &hidden_states) const;

protected:
// Sub-modules; each macro is expected to declare a `<name>_` member.
INFINICORE_NN_MODULE(MiniCPM4Model, model);
INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head);
};

/// Validates that `model_type` is "minicpm4" or "minicpm" (throws
/// std::runtime_error otherwise), fills in missing defaults in the config JSON
/// (head_dim, rope_theta, attention_bias, mlp_bias, attention_output_bias),
/// and returns the same config pointer.
std::shared_ptr<infinilm::config::ModelConfig> create_minicpm4_model_config(std::shared_ptr<infinilm::config::ModelConfig> model_config);

} // namespace infinilm::models::minicpm4
Loading