Commit

fix embedding copy size
irexyc committed Jan 24, 2024
1 parent da190ef · commit f50f343
Showing 1 changed file with 1 addition and 1 deletion.
src/turbomind/models/llama/LlamaV2.cc (1 addition, 1 deletion)
@@ -182,7 +182,7 @@ void LlamaV2<T>::updateEmbedding(T* decoder_input, const int bsz, const int* h_i
             }
             int    off_dst   = std::max(0, begin - seq.cache_len);
             int    off_src   = std::max(0, seq.cache_len - begin);
-            size_t byte_size = (end - begin) * hidden_units_ * sizeof(T);
+            size_t byte_size = (end - begin - off_src) * hidden_units_ * sizeof(T);
             T*     dst_ptr   = decoder_input + off_dst * hidden_units_;
             auto   src_ptr   = embeddings[j].data() + off_src * hidden_units_ * sizeof(T);
             cudaMemcpyAsync(dst_ptr, src_ptr, byte_size, cudaMemcpyDefault, stream_);
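Why the one-line change matters, as far as the diff shows: when part of the embedding range is already covered by the sequence's cache (seq.cache_len > begin), the source pointer is advanced by off_src rows, but the old byte count still spanned the full (end - begin) rows, so the cudaMemcpyAsync read off_src rows past the end of the embedding buffer. Below is a minimal host-side sketch of the arithmetic, using illustrative names and a plain memcpy in place of the CUDA copy; it is not the TurboMind code itself.

    // Sketch only: hypothetical sizes/names, host memory instead of device buffers.
    #include <algorithm>
    #include <cassert>
    #include <cstring>
    #include <vector>

    int main()
    {
        const int hidden_units = 4;    // hypothetical hidden size
        const int begin = 2, end = 8;  // token range this embedding covers
        const int cache_len = 5;       // tokens already processed for this sequence

        // The embedding buffer holds exactly (end - begin) rows.
        std::vector<float> embedding((end - begin) * hidden_units, 1.f);
        std::vector<float> decoder_input(16 * hidden_units, 0.f);

        const int off_dst = std::max(0, begin - cache_len);  // rows to skip in the destination
        const int off_src = std::max(0, cache_len - begin);  // rows of the embedding already consumed

        // Old size: (end - begin) rows. With src advanced by off_src rows, that copy
        // would read off_src rows past the end of `embedding`.
        // Fixed size: only the remaining (end - begin - off_src) rows.
        const size_t rows      = end - begin - off_src;
        const size_t byte_size = rows * hidden_units * sizeof(float);

        const float* src = embedding.data() + off_src * hidden_units;
        float*       dst = decoder_input.data() + off_dst * hidden_units;

        // The corrected size keeps the read inside the embedding buffer.
        assert(src + rows * hidden_units <= embedding.data() + embedding.size());
        std::memcpy(dst, src, byte_size);
        return 0;
    }

With the old size, the bounds check above would fail whenever off_src > 0; with the corrected size the copy stays within both buffers.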
