Skip to content
Draft
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@
python scripts/test_perf.py --verbose
```

- 单请求推理服务测试
```bash
python test/service/request.py --content="text:Image 1:" --content="image_url:xxx.jpg" --content="text:Image 2:" --content="image_url:xxxx.jpg" --content="text:Compare the 2 images."
```

- 运行推理基准测试(C-Eval/MMLU)

```bash
Expand Down
25 changes: 20 additions & 5 deletions csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,19 +89,31 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
-> std::optional<infinicore::Tensor> {
return t.has_value() ? t.value()->to(device) : t;
};
// Helper for optional lists of tensors: transfers every element to `device`.
// - An empty optional is returned unchanged (as a copy of the argument).
// - Otherwise a new vector of the same length is built, in the same order,
//   where each element is the result of calling `to(device)` on the source tensor.
auto to_device_vec = [&](const std::optional<std::vector<infinicore::Tensor>> &vec)
-> std::optional<std::vector<infinicore::Tensor>> {
if (!vec.has_value()) {
return vec;
}
std::vector<infinicore::Tensor> result;
// Reserve up front so the push_back loop below does not reallocate.
result.reserve(vec->size());
for (const auto &t : vec.value()) {
result.push_back(t->to(device));
}
return result;
};

infinilm::InfinilmModel::Input input = {
to_device(input_ids), // @todo: on device in the future
to_device(pixel_values),
to_device(position_ids),
to_device(past_sequence_lengths), // @todo: on device in the future
to_device(total_sequence_lengths),
to_device(input_offsets),
to_device(cu_seqlens),
to_device(block_tables),
to_device(slot_mapping),
to_device(image_bound),
to_device(tgt_sizes),
to_device_vec(pixel_values),
to_device_vec(image_bound),
to_device_vec(tgt_sizes),
};

infinilm::global_state::get_forward_context().attn_metadata = {
Expand All @@ -110,8 +122,11 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
input.input_offsets,
input.cu_seqlens,
input.block_tables,
input.slot_mapping,
};
input.slot_mapping};

global_state::get_forward_context().mm_metadata = {
image_req_ids};

return input;
}

Expand Down
10 changes: 6 additions & 4 deletions csrc/engine/rank_worker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ class RankWorker {
struct Input {
/// Token IDs tensor of shape `[batch, seq_len]`.
std::optional<infinicore::Tensor> input_ids;
/// Image pixel values for multi-modal models.
std::optional<infinicore::Tensor> pixel_values;
/// Position IDs tensor of shape `[batch, seq_len]` or `[seq_len]`.
std::optional<infinicore::Tensor> position_ids;
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
Expand All @@ -52,10 +50,14 @@ class RankWorker {
std::optional<infinicore::Tensor> block_tables;
/// Slot ids for each token `[seq]`. Used for paged cache.
std::optional<infinicore::Tensor> slot_mapping;
/// Image pixel values for multi-modal models.
std::optional<std::vector<infinicore::Tensor>> pixel_values;
/// Image placeholder bounds for MiniCPM-V style replacement.
std::optional<infinicore::Tensor> image_bound;
std::optional<std::vector<infinicore::Tensor>> image_bound;
/// Target patch sizes for each image (MiniCPM-V).
std::optional<infinicore::Tensor> tgt_sizes;
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
/// Request id of each `pixel_values` entry within the batch.
std::optional<std::vector<size_t>> image_req_ids;

float temperature{1};

Expand Down
5 changes: 5 additions & 0 deletions csrc/global_state/forward_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,13 @@ struct AttentionMetadata {
input.slot_mapping) {}
};

/// Multi-modal metadata stored in the ForwardContext next to the attention metadata.
struct MultiModalMetadata {
/// Optional list of request ids associated with the image inputs of a batch.
/// NOTE(review): presumably these are indices of requests within the current batch — confirm against the producer.
std::optional<std::vector<size_t>> image_req_ids;
};

/// Aggregate of the state made available through the forward context.
struct ForwardContext {
/// Attention-related metadata.
AttentionMetadata attn_metadata;
/// Multi-modal metadata (image request ids).
MultiModalMetadata mm_metadata;
/// KV cache tensors. NOTE(review): presumably one entry per layer — confirm.
std::vector<infinicore::Tensor> kv_cache_vec;
};

Expand Down
28 changes: 19 additions & 9 deletions csrc/layers/linear/fused_linear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size,
const infinicore::Device &device,
engine::distributed::RankInfo rank_info)
: infinilm::nn::ColumnParallelLinear(
hidden_size,
calculate_out_feature_size(num_q_head, q_dim, num_k_head, k_dim, num_v_head, v_dim, rank_info),
quantization,
(q_bias || k_bias || v_bias),
dtype,
device,
rank_info.tp_rank,
rank_info.tp_size),
hidden_size,
calculate_out_feature_size(num_q_head, q_dim, num_k_head, k_dim, num_v_head, v_dim, rank_info),
quantization == nullptr ? std::make_shared<infinilm::quantization::NoneQuantization>() : quantization,
(q_bias || k_bias || v_bias),
dtype,
device,
rank_info.tp_rank,
rank_info.tp_size),
q_dim_(q_dim),
k_dim_(k_dim),
v_dim_(v_dim),
Expand Down Expand Up @@ -120,7 +120,17 @@ GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermedia
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
const infinicore::DataType &dtype, const infinicore::Device &device,
engine::distributed::RankInfo rank_info)
: infinilm::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) {
: infinilm::nn::ColumnParallelLinear(
hidden_size,
intermediate_size * 2,
quantization == nullptr ? std::make_shared<infinilm::quantization::NoneQuantization>() : quantization,
gate_bias || up_bias,
dtype,
device,
rank_info.tp_rank,
rank_info.tp_size),
gate_bias_(gate_bias),
up_bias_(up_bias) {
if (gate_bias_ != up_bias_) {
throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time");
}
Expand Down
15 changes: 8 additions & 7 deletions csrc/layers/linear/fused_linear.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once
#include "../../engine/distributed/communication_group.hpp"
#include "linear.hpp"
#include "../quantization/quantization.hpp"
#include "linear.hpp"
#include <functional>

namespace infinilm::layers::linear {
Expand All @@ -13,15 +13,15 @@ class QKVParallelLinear : public infinilm::nn::ColumnParallelLinear {
size_t q_dim, size_t k_dim, size_t v_dim,
size_t num_q_head, size_t num_k_head, size_t num_v_head,
bool q_bias, bool k_bias, bool v_bias,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
const infinicore::DataType &dtype = infinicore::DataType::F32,
const infinicore::Device &device = infinicore::Device(),
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());

explicit QKVParallelLinear(size_t hidden_size,
size_t head_dim,
size_t num_q_head, size_t num_kv_head,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
bool bias = false,
const infinicore::DataType &dtype = infinicore::DataType::F32,
const infinicore::Device &device = infinicore::Device(),
Expand All @@ -32,7 +32,7 @@ class QKVParallelLinear : public infinilm::nn::ColumnParallelLinear {
size_t num_q_head, size_t num_kv_head,
const std::string &q_name, const std::string &k_name, const std::string &v_name,
RegisterParamFn register_fn,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
bool bias = false,
const infinicore::DataType &dtype = infinicore::DataType::F32,
const infinicore::Device &device = infinicore::Device(),
Expand Down Expand Up @@ -84,21 +84,22 @@ class QKVParallelLinear : public infinilm::nn::ColumnParallelLinear {

class GateUpParallelLinear : public infinilm::nn::ColumnParallelLinear {
public:
GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
GateUpParallelLinear(size_t hidden_size, size_t intermediate_size,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
bool bias = false,
const infinicore::DataType &dtype = infinicore::DataType::F32,
const infinicore::Device &device = infinicore::Device(),
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());

GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(),
engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());

GateUpParallelLinear(size_t hidden_size, size_t intermediate_size,
const std::string &gate_name, const std::string &up_name,
RegisterParamFn register_fn,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization,
std::shared_ptr<infinilm::quantization::BaseQuantization> quantization = nullptr,
bool bias = false,
const infinicore::DataType &dtype = infinicore::DataType::F32,
const infinicore::Device &device = infinicore::Device(),
Expand Down
5 changes: 3 additions & 2 deletions csrc/layers/quantization/none_quantization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

namespace infinilm::quantization {

/// Default constructor: delegates to the JSON-config constructor, passing a default-constructed nlohmann::json.
NoneQuantization::NoneQuantization() : NoneQuantization(nlohmann::json()) {}

std::vector<ParamDescriptor> NoneQuantization::get_param_layout(
size_t in_features, size_t out_features,
int split_dim, int tp_rank, int tp_size,
Expand All @@ -14,8 +16,7 @@ std::vector<ParamDescriptor> NoneQuantization::get_param_layout(
std::vector<ParamDescriptor> descs;
descs.push_back({"weight", {out_features, in_features}, dtype, split_dim, tp_rank, tp_size});
if (bias) {
descs.push_back({"bias", {out_features}, dtype, split_dim >= 0 ? 0 : -1,
split_dim >= 0 ? tp_rank : 0, split_dim >= 0 ? tp_size : 1});
descs.push_back({"bias", {out_features}, dtype, split_dim >= 0 ? 0 : -1, split_dim >= 0 ? tp_rank : 0, split_dim >= 0 ? tp_size : 1});
}
return descs;
}
Expand Down
4 changes: 3 additions & 1 deletion csrc/layers/quantization/none_quantization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ namespace infinilm::quantization {
class NoneQuantization : public BaseQuantization {
public:
explicit NoneQuantization(const nlohmann::json &quant_config)
: BaseQuantization(quant_config) {};
: BaseQuantization(quant_config){};

NoneQuantization();

QuantScheme get_quant_scheme() const override {
return QuantScheme::NONE;
Expand Down
14 changes: 7 additions & 7 deletions csrc/models/infinilm_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@ class InfinilmModel : public infinicore::nn::Module {
struct Input {
/// Token IDs tensor of shape `[batch, seq_len]`.
std::optional<infinicore::Tensor> input_ids;
/// Image pixel values for multi-modal models.
/// Shape is model-specific (e.g. LLaVA: [batch, 3, H, W], MiniCPM-V: [batch, 3, patch, seq_len * patch]).
std::optional<infinicore::Tensor> pixel_values;
/// Position IDs tensor of shape `[batch, seq_len]` or `[seq_len]`.
std::optional<infinicore::Tensor> position_ids;
/// Past Lengths of cached sequence for each request, of shape `[num_requests]`.
Expand All @@ -38,12 +35,15 @@ class InfinilmModel : public infinicore::nn::Module {
std::optional<infinicore::Tensor> block_tables;
/// Slot ids for each token `[seq]`. Used for paged cache.
std::optional<infinicore::Tensor> slot_mapping;
/// Image pixel values for multi-modal models.
/// Vector of tensors. Shape is model-specific (e.g. LLaVA: [batch, 3, H, W], MiniCPM-V: [n_patch, 3, filter_H, H * W / filter_H]).
std::optional<std::vector<infinicore::Tensor>> pixel_values;
/// Image placeholder bounds for MiniCPM-V style replacement.
/// Tensor shape: [batch, max_ranges, 2] (start, end).
std::optional<infinicore::Tensor> image_bound;
/// Vector of tensors shape: [n_patch, 2].
std::optional<std::vector<infinicore::Tensor>> image_bound;
/// Target patch sizes for each image (MiniCPM-V).
/// Tensor shape: [batch, 2] or [batch, max_slices, 2] if pre-flattened.
std::optional<infinicore::Tensor> tgt_sizes;
/// Vector of tensors shape: [n_patch, 2] if pre-flattened.
std::optional<std::vector<infinicore::Tensor>> tgt_sizes;
};

struct Output {
Expand Down
57 changes: 24 additions & 33 deletions csrc/models/minicpmv/minicpmv_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,21 @@ MiniCPMVModel::MiniCPMVModel(std::shared_ptr<infinilm::config::ModelConfig> mode
embed_dim,
num_heads,
vision_cfg.value("hidden_size", 768),
vision_cfg.value("image_size", 224),
vision_cfg.value("patch_size", 16),
dtype,
device);
}

infinicore::Tensor MiniCPMVModel::replace_embeddings(const infinicore::Tensor &inputs_embeds,
const infinicore::Tensor &vision_hidden,
const infinicore::Tensor &image_bound) const {
auto out = infinicore::Tensor::empty(inputs_embeds->shape(), inputs_embeds->dtype(), inputs_embeds->device());
out->copy_from(inputs_embeds);

void MiniCPMVModel::replace_embeddings(infinicore::Tensor inputs_embeds,
const infinicore::Tensor &vision_hidden,
const infinicore::Tensor &image_bound) const {
auto bounds_cpu = image_bound->to(infinicore::Device::cpu());
auto batch_size = inputs_embeds->size(0);

ASSERT_EQ(batch_size, 1);
ASSERT_EQ(bounds_cpu->size(0), 1);
auto out_slice = out->squeeze(0);
auto out_slice = inputs_embeds->squeeze(0);
auto bound_slice = bounds_cpu->squeeze(0);
auto vision_len = vision_hidden->size(0);
for (size_t patch = 0; patch < vision_len; ++patch) {
Expand All @@ -60,8 +59,6 @@ infinicore::Tensor MiniCPMVModel::replace_embeddings(const infinicore::Tensor &i

out_slice->narrow({{0, size_t(start), size_t(end - start)}})->copy_from(patch_embed);
}

return out;
}

InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input) const {
Expand All @@ -70,36 +67,30 @@ InfinilmModel::Output MiniCPMVModel::forward(const InfinilmModel::Input &input)
}
auto input_ids = input.input_ids.value();

if (input.pixel_values.has_value() && input_ids->size(1) > 1) {
if (!input.image_bound.has_value()) {
throw std::runtime_error("MiniCPMVModel: image_bound required for multimodal input");
if (input.pixel_values.has_value() && input.pixel_values.value().size() > 0) {
if (!input.image_bound.has_value() or !input.tgt_sizes.has_value()) {
throw std::runtime_error("MiniCPMVModel: image_bound and tgt_sizes must be provided with pixel_values");
}
if (input.pixel_values->size() != input.image_bound->size() || input.pixel_values->size() != input.tgt_sizes->size()) {
throw std::runtime_error("MiniCPMVModel: pixel_values, image_bound and tgt_sizes must have the same number of elements");
}
auto pixel_values = input.pixel_values.value();
auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes);
auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes);

auto inputs_embeds = llm_->model().embed_tokens(input_ids);
auto merged_embeds = replace_embeddings(inputs_embeds, vision_hidden, input.image_bound.value());

infinicore::Tensor position_ids;
if (input.position_ids.has_value()) {
position_ids = input.position_ids.value();
} else {
auto batch = merged_embeds->size(0);
auto seq_len = merged_embeds->size(1);
auto pos_cpu = infinicore::Tensor::zeros({batch, seq_len}, infinicore::DataType::I64, infinicore::Device::cpu());
auto *pos_ptr = reinterpret_cast<int64_t *>(pos_cpu->data());
for (size_t b = 0; b < batch; ++b) {
for (size_t i = 0; i < seq_len; ++i) {
pos_ptr[b * seq_len + i] = static_cast<int64_t>(i);
}
}
position_ids = pos_cpu->to(merged_embeds->device());

// inputs_embeds concat tokens from all requests, while images are processed per request
// slice inputs_embeds using request offsets to get the embedding of each request
infinicore::Tensor input_offsets_cpu = input.input_offsets.value()->to(infinicore::Device::cpu());
int32_t *offsets = (int32_t *)(input_offsets_cpu->data());
for (size_t i : global_state::get_forward_context().mm_metadata.image_req_ids.value()) {
auto pixel_values = input.pixel_values.value().at(i);
auto vision_embedding = vpm_->forward(pixel_values, input.tgt_sizes.value().at(i));
auto vision_hidden = resampler_->forward(vision_embedding, input.tgt_sizes.value().at(i));
replace_embeddings(inputs_embeds->narrow({{1, size_t(offsets[i]), size_t(offsets[i + 1] - offsets[i])}}), vision_hidden, input.image_bound.value().at(i));
}

auto hidden_states = llm_->model().forward_embeds(
merged_embeds,
position_ids);
inputs_embeds,
input.position_ids.value());

auto logits = llm_->logits_from_hidden(hidden_states);
return {logits};
Expand Down
6 changes: 3 additions & 3 deletions csrc/models/minicpmv/minicpmv_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ class MiniCPMVModel : public InfinilmModel {
void reset_cache(const cache::CacheConfig *cache_config) override;

private:
infinicore::Tensor replace_embeddings(const infinicore::Tensor &inputs_embeds,
const infinicore::Tensor &vision_hidden,
const infinicore::Tensor &image_bound) const;
void replace_embeddings(infinicore::Tensor inputs_embeds,
const infinicore::Tensor &vision_hidden,
const infinicore::Tensor &image_bound) const;

std::shared_ptr<infinilm::config::ModelConfig> config_;

Expand Down
Loading