Commit

remove the use of ConcateSlice
irexyc committed Sep 3, 2024
1 parent 7a0c087 commit df2c846
Showing 3 changed files with 39 additions and 138 deletions.
2 changes: 1 addition & 1 deletion lmdeploy/turbomind/deploy/target_model/base.py
@@ -232,7 +232,7 @@ def pad_weight(tensor):
         if emb is not None:
             emb = pad_weight(emb)
             # try split along hidden dim
-            if emb.shape[1] % self.cfg.tensor_para_size == 0:
+            if emb.shape[1] % self.tensor_para_size == 0:
                 self.save_split(emb, 'tok_embeddings.weight', 1)
             else:
                 self.export_weight(emb, 'tok_embeddings.weight')
135 changes: 26 additions & 109 deletions src/turbomind/models/llama/LlamaDecoderLayerWeight.cc
@@ -281,111 +281,45 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
 }
 
 template<typename T>
-void loadWeights(LlamaDenseWeight<T>& w,
-                 std::string prefix,
-                 int rank,
-                 FtCudaDataType model_file_type,
-                 size_t tensor_para_size,
-                 int slice_dim = 0,
-                 std::vector<size_t> slice_shape = {})
+void loadWeights(
+    LlamaDenseWeight<T>& w, std::string prefix, int rank, FtCudaDataType model_file_type, size_t tensor_para_size)
 {
-    auto max_prefix = prefix + "." + std::to_string(tensor_para_size - 1);
-
-    bool enable_slice = true;
-    // Disable slice if tensor param rank is 1
-    if (tensor_para_size <= 1) {
-        enable_slice = false;
-    }
-    else {
-        // Disable slice if weight has already been sliced
-        if (std::filesystem::exists(max_prefix + ".weight") || std::filesystem::exists(max_prefix + ".qweight")) {
-            TM_LOG_DEBUG("TP weight exists. Disable runtime TP.");
-            enable_slice = false;
-        }
-    }
-
-    size_t dim0 = w.input_dims;
-    size_t dim1 = w.output_dims;
-    if (enable_slice) {
-        // multiple tp size for slice stride
-        if (slice_dim == 0) {
-            dim0 = dim0 * tensor_para_size;
-            if (slice_shape.size() == 0) {
-                slice_shape = {dim0};
-            }
-        }
-        else {
-            dim1 = dim1 * tensor_para_size;
-            if (slice_shape.size() == 0) {
-                slice_shape = {dim1};
-            }
-        }
-
-        prefix += "." + std::to_string(0);
-    }
-    else {
-        prefix += "." + std::to_string(rank);
-    }
-
-    const auto type = model_file_type;
+    const auto type = model_file_type;
+
+    // check converted file with tp
+    auto check_exist = [&](size_t max_index) {
+        auto weight_file  = prefix + "." + std::to_string(max_index) + ".weight";
+        auto qweight_file = prefix + "." + std::to_string(max_index) + ".qweight";
+        return std::filesystem::exists(weight_file) || std::filesystem::exists(qweight_file);
+    };
+    if (!check_exist(tensor_para_size - 1) || check_exist(tensor_para_size)) {
+        TM_LOG_ERROR("please make sure the tp parameter is same when you convert the model.");
+        FT_CHECK(false);
+    }
+
+    size_t dim0 = w.input_dims;
+    size_t dim1 = w.output_dims;
+
+    prefix += "." + std::to_string(rank);
 
     if (w.bias) {
-        std::vector<ConcateSlice> bias_slices{};
-        if (enable_slice) {
-            if (slice_dim == 1) {
-                size_t start = 0;
-                ConcateSlice slice0{{{0, 1}}};
-                ConcateSlice slice1{{{}}};
-                for (auto len : slice_shape) {
-                    size_t stride = len / tensor_para_size;
-                    slice1.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
-                    start += len;
-                }
-                bias_slices = {slice0, slice1};
-            }
-        }
-        loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type, bias_slices);
+        loadWeightFromBin((T*)w.bias, {1, dim1}, prefix + ".bias", type);
     }
     const size_t bit_size = getBitSize(w.type);
     if (bit_size >= 16) { // fp16, fp32
-        std::vector<ConcateSlice> weight_slices{};
-        if (enable_slice) {
-            if (slice_dim == 1) {
-                size_t start = 0;
-                ConcateSlice slice0{{{0, dim0}}};
-                ConcateSlice slice1{{{}}};
-                for (auto len : slice_shape) {
-                    size_t stride = len / tensor_para_size;
-                    slice1.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
-                    start += len;
-                }
-                weight_slices = {slice0, slice1};
-            }
-            else {
-                size_t start = 0;
-                ConcateSlice slice0{{}};
-                ConcateSlice slice1{{{0, dim1}}};
-                for (auto len : slice_shape) {
-                    size_t stride = len / tensor_para_size;
-                    slice0.slices.push_back({start + stride * rank, start + stride * (rank + 1)});
-                    start += len;
-                }
-                weight_slices = {slice0, slice1};
-            }
-        }
-        loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type, weight_slices);
+        loadWeightFromBin((T*)w.kernel, {dim0, dim1}, prefix + ".weight", type);
     }
     else { // int8, int4
         const int factor = sizeof(float) * 8 / bit_size;
 
         FT_CHECK(dim1 % factor == 0);
 
         std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
-        loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
+        loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8);
 
         const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1;
 
-        loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type, {});
-        loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type, {});
+        loadWeightFromBin((half*)w.scales, {group_count, dim1}, prefix + ".scales", type);
+        loadWeightFromBin((half*)w.zeros, {group_count, dim1}, prefix + ".zeros", type);
     }
 }
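
Note: after this change, loadWeights no longer slices a full checkpoint at runtime via ConcateSlice; it expects files already split by the converter and only validates that the shard count matches the current tensor-parallel size. A minimal standalone sketch of that validation, mirroring the check_exist lambda above (the helper name and the "weights/..." path are hypothetical):

    // Sketch only -- mirrors the commit's consistency check, not repo code.
    // Shards are named "<prefix>.<rank>.weight" (".qweight" when quantized):
    // for tp ranks, shard tp-1 must exist and shard tp must not.
    #include <cstddef>
    #include <filesystem>
    #include <iostream>
    #include <string>

    bool tp_files_consistent(const std::string& prefix, std::size_t tp)
    {
        auto exists = [&](std::size_t rank) {
            return std::filesystem::exists(prefix + "." + std::to_string(rank) + ".weight")
                   || std::filesystem::exists(prefix + "." + std::to_string(rank) + ".qweight");
        };
        // Last expected shard present, and no extra shard beyond it.
        return exists(tp - 1) && !exists(tp);
    }

    int main()
    {
        if (!tp_files_consistent("weights/layers.0.attention.w_qkv", 2)) {
            std::cerr << "convert the model with the same --tp used at runtime\n";
            return 1;
        }
    }
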

@@ -430,29 +364,12 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
         (T*)self_attn_norm_weights, {hidden_units_}, dir_path + ".attention_norm.weight", model_file_type);
     loadWeightFromBin((T*)ffn_norm_weights, {hidden_units_}, dir_path + ".ffn_norm.weight", model_file_type);
 
-    loadWeights(self_attn_weights.qkv,
-                dir_path + ".attention.w_qkv",
-                tensor_para_rank_,
-                type,
-                tensor_para_size_,
-                1,
-                {head_num_ * size_per_head_, kv_head_num_ * size_per_head_, kv_head_num_ * size_per_head_});
-
-    loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type, tensor_para_size_, 0);
-
-    // if (fused_up_and_gate_) {
-    //     loadWeights(ffn_weights.fused_gating_intermediate,
-    //                 dir_path + ".feed_forward.w13",
-    //                 tensor_para_rank_,
-    //                 type,
-    //                 tensor_para_size_,
-    //                 1);
-    // }
-    // else {
-    loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_, 1);
-    loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_, 1);
-    // }
-    loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_, 0);
+    loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", tensor_para_rank_, type, tensor_para_size_);
+    loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type, tensor_para_size_);
+
+    loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type, tensor_para_size_);
+    loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type, tensor_para_size_);
+    loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_);
 }
 
 template<typename T>
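
Note: with the slice_dim and slice_shape parameters gone, every call site above passes the same five arguments; how a tensor is split across ranks is now decided entirely at convert time. As an illustration of the resulting naming scheme, a sketch of the per-rank filenames for one layer (directory name and layer index hypothetical):

    // Sketch only -- filenames implied by loadModel/loadWeights above.
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
        const int rank = 1;  // tensor-parallel rank
        const std::string dir = "weights/layers.0";
        const std::vector<std::string> tensors = {
            ".attention.w_qkv", ".attention.wo",
            ".feed_forward.w1", ".feed_forward.w3", ".feed_forward.w2"};
        for (const auto& t : tensors) {
            // loadWeights appends "." + rank before the ".weight" suffix
            std::cout << dir << t << "." << rank << ".weight\n";
        }
    }
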
40 changes: 12 additions & 28 deletions src/turbomind/models/llama/LlamaWeight.cc
@@ -118,37 +118,21 @@ void loadLinearWeights(T* weights,
                        size_t split_dim)
 {
     FT_CHECK(split_dim == 0 || split_dim == 1);
-    auto max_prefix = prefix + "." + std::to_string(tensor_para_size - 1);
-    bool enable_slice = true;
-    if (tensor_para_size <= 1 || std::filesystem::exists(max_prefix + ".weight")) {
-        enable_slice = false;
-    }
 
     // the weight could be split along split_dim
     std::vector<std::reference_wrapper<size_t>> dims = {dim0, dim1};
-    if (dims[split_dim] % tensor_para_size != 0) {
-        enable_slice = false;
-    }
-    else if (!enable_slice && dims[split_dim] % tensor_para_size == 0) {
-        dims[split_dim] /= tensor_para_size;
-    }
-
-    prefix += "." + (enable_slice ? std::to_string(0) : std::to_string(rank));
-    std::vector<ConcateSlice> weight_slices{};
-    if (enable_slice) {
-        if (split_dim == 0) {
-            size_t stride = dim0 / tensor_para_size;
-            ConcateSlice slice0{{{stride * rank, stride * (rank + 1)}}};
-            ConcateSlice slice1{{{0, dim1}}};
-            weight_slices = {slice0, slice1};
-        }
-        else if (split_dim == 1) {
-            size_t stride = dim1 / tensor_para_size;
-            ConcateSlice slice0{{{0, dim0}}};
-            ConcateSlice slice1{{{stride * rank, stride * (rank + 1)}}};
-            weight_slices = {slice0, slice1};
-        }
-    }
-    loadWeightFromBin(weights, {dim0, dim1}, prefix + ".weight", type, weight_slices);
+    if (dims[split_dim] % tensor_para_size == 0) {
+        // check converted file with tp
+        auto should_exist     = prefix + "." + std::to_string(tensor_para_size - 1) + ".weight";
+        auto should_not_exist = prefix + "." + std::to_string(tensor_para_size) + ".weight";
+        if (!std::filesystem::exists(should_exist) || std::filesystem::exists(should_not_exist)) {
+            TM_LOG_ERROR("please make sure the tp parameter is same when you convert the model.");
+            FT_CHECK(false);
+        }
+
+        dims[split_dim] /= tensor_para_size;
+        prefix += "." + std::to_string(rank);
+    }
+    loadWeightFromBin(weights, {dim0, dim1}, prefix + ".weight", type);
 }
 
 template<typename T>
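
Note: loadLinearWeights keeps dim0 and dim1 in a std::vector<std::reference_wrapper<size_t>> so that the division can be written back through dims[split_dim] and the final loadWeightFromBin call sees the per-rank shape. A self-contained sketch of that idiom (the shapes are made up):

    // Sketch only -- demonstrates the reference_wrapper write-through used above.
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <vector>

    int main()
    {
        std::size_t dim0 = 4096, dim1 = 11008;
        std::vector<std::reference_wrapper<std::size_t>> dims = {dim0, dim1};

        const std::size_t tensor_para_size = 2;
        const std::size_t split_dim = 1;  // split along the output dimension

        if (dims[split_dim] % tensor_para_size == 0) {
            dims[split_dim] /= tensor_para_size;  // mutates dim1 through the reference
        }
        std::cout << dim0 << " x " << dim1 << "\n";  // prints "4096 x 5504"
    }
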
