From df0b1ffcc03e73d44fe700ebfb378d8f6f408b42 Mon Sep 17 00:00:00 2001
From: irexyc
Date: Tue, 3 Dec 2024 07:30:20 +0000
Subject: [PATCH] add missing use_dynamic_ntk

---
 src/turbomind/models/llama/unified_attention_layer.cc  | 2 +-
 src/turbomind/triton_backend/llama/LlamaTritonModel.cc | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/turbomind/models/llama/unified_attention_layer.cc b/src/turbomind/models/llama/unified_attention_layer.cc
index 77d53afd5e..bcabb8ca2c 100644
--- a/src/turbomind/models/llama/unified_attention_layer.cc
+++ b/src/turbomind/models/llama/unified_attention_layer.cc
@@ -187,7 +187,7 @@ inline void UnifiedAttentionLayer::forward(TensorMap* outputs, const TensorMa
     bool*  is_finished = inputs->getPtr<bool>("finished");
     float* rope_theta  = inputs->getPtr<float>("rope_theta");
-    float* cos_sin     = inputs->at("cos_sin", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<float>();
+    float* cos_sin     = inputs->getPtr<float>("cos_sin");

     void** block_ptrs     = outputs->getPtr<void*>("block_ptrs");
     int*   cu_block_count = inputs->getPtr<int>("cu_block_counts");
diff --git a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
index f95ee84109..7842fcb210 100644
--- a/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+++ b/src/turbomind/triton_backend/llama/LlamaTritonModel.cc
@@ -280,7 +280,12 @@ LlamaTritonModel::LlamaTritonModel(size_t tensor_para_size,
     attn_param_.softmax_scale = attention_reader["softmax_scale"].as<float>(0);
     attn_param_.use_logn_attn = attention_reader["use_logn_attn"].as<int>(0);
     // rotary embedding parameters
-    attn_param_.rope.type = GetRoPEType(attention_reader["rope_scaling_type"].as<std::string>(""));
+    if (attention_reader["use_dynamic_ntk"].as<int>(0) == 1) {
+        attn_param_.rope.type = RopeType::kDynamic;
+    }
+    else {
+        attn_param_.rope.type = GetRoPEType(attention_reader["rope_scaling_type"].as<std::string>(""));
+    }
     attn_param_.rope.dim = attention_reader["rotary_embedding"].as<int>();
     attn_param_.rope.base = attention_reader["rope_theta"].as<float>(10000.0f);
     attn_param_.rope.max_position_embeddings = attention_reader["max_position_embeddings"].as<int>(0);
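
The net effect of the patch is that use_dynamic_ntk from the attention section of the model config now takes precedence over rope_scaling_type when resolving the RoPE variant, so configs that only set use_dynamic_ntk = 1 still resolve to RopeType::kDynamic, and the attention layer reads cos_sin directly from the input tensor map. The standalone C++ sketch below illustrates only that precedence: RopeType::kDynamic and GetRoPEType mirror names from the patch, while AttentionConfig, ResolveRopeType, the extra enum values, and the string mappings are hypothetical stand-ins rather than the actual turbomind definitions.

// Standalone sketch of the RoPE-type selection introduced by this patch.
// Only RopeType::kDynamic and GetRoPEType mirror names from the patch; the
// remaining names are illustrative stand-ins.
#include <iostream>
#include <string>

enum class RopeType
{
    kDefault,
    kLinear,
    kDynamic,
    kYarn,
};

// Simplified stand-in for the GetRoPEType call in LlamaTritonModel.cc:
// maps the "rope_scaling_type" string from the config to a RopeType.
RopeType GetRoPEType(const std::string& rope_scaling_type)
{
    if (rope_scaling_type == "linear") {
        return RopeType::kLinear;
    }
    if (rope_scaling_type == "dynamic") {
        return RopeType::kDynamic;
    }
    if (rope_scaling_type == "yarn") {
        return RopeType::kYarn;
    }
    return RopeType::kDefault;
}

// Hypothetical subset of the attention section of the model config.
struct AttentionConfig {
    int         use_dynamic_ntk;
    std::string rope_scaling_type;
};

// use_dynamic_ntk == 1 wins over rope_scaling_type, matching the patched branch.
RopeType ResolveRopeType(const AttentionConfig& cfg)
{
    if (cfg.use_dynamic_ntk == 1) {
        return RopeType::kDynamic;
    }
    return GetRoPEType(cfg.rope_scaling_type);
}

int main()
{
    AttentionConfig legacy{1, ""};        // older config: only use_dynamic_ntk is set
    AttentionConfig scaled{0, "linear"};  // newer config: rope_scaling_type is set

    std::cout << (ResolveRopeType(legacy) == RopeType::kDynamic) << "\n";  // prints 1
    std::cout << (ResolveRopeType(scaled) == RopeType::kLinear) << "\n";   // prints 1
    return 0;
}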