remove ConcateSlice class
irexyc committed Sep 3, 2024
1 parent df2c846 commit f8a1289
Showing 2 changed files with 72 additions and 234 deletions.
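
In short: this commit deletes the ConcateSlice struct and the slice-based partial-loading branch from memory_utils, so loadWeightFromBinHelper, loadWeightFromBinFunc, loadArrayFromBin and loadWeightFromBin now always read a whole (at most 2-D) tensor from a binary file. A minimal caller sketch against the simplified interface after this commit — the shape, file path, include path and namespace qualification are illustrative assumptions, not taken from the diff:

#include <string>
#include <vector>

#include "src/turbomind/utils/memory_utils.h"  // assumed include path

void load_whole_tensor_example(float* d_weight /* device buffer, dim0 * dim1 elements */)
{
    // Whole-tensor load only: there is no ConcateSlice / slices argument any more.
    const std::vector<size_t> shape{4096, 4096};                  // hypothetical 2-D weight
    const std::string         path = "weights/layer0.weight.bin"; // hypothetical file

    // Host-side load into std::vector<float>.
    std::vector<float> host = turbomind::loadArrayFromBin(shape, path);

    // Device-side load, converting from the on-disk dtype (FP16 here) to float.
    turbomind::loadWeightFromBin<float>(d_weight, shape, path, turbomind::FtCudaDataType::FP16);
}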
289 changes: 67 additions & 222 deletions src/turbomind/utils/memory_utils.cu
@@ -302,8 +302,7 @@ template void cudaRandomUniform(__nv_fp8_e4m3* buffer, const size_t size);
// loads data from binary file. If it succeeds, returns a non-empty vector. If loading fails or
// the product of the elements in shape is 0, this function will return an empty vector.
template<typename T>
std::vector<T>
loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::vector<ConcateSlice> slices = {})
std::vector<T> loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename)
{
if (shape.size() > 2) {
printf("[ERROR] shape should have less than two dims \n");
@@ -315,145 +314,48 @@ loadWeightFromBinHelper(std::vector<size_t> shape, std::string filename, std::ve
dim1 = shape[1];
}

if (slices.size() == 0) {
size_t size = dim0 * dim1;
if (size == 0) {
TM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
return std::vector<T>();
}

std::vector<T> host_array(size);
std::ifstream in(filename, std::ios::in | std::ios::binary);
if (!in.is_open()) {
TM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
return std::vector<T>();
}

size_t loaded_data_size = sizeof(T) * size;
in.seekg(0, in.end);
const auto file_size_in_bytes = (size_t)in.tellg();
in.seekg(0, in.beg);

TM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename);
in.read((char*)host_array.data(), loaded_data_size);

if (file_size_in_bytes != loaded_data_size) {
TM_LOG_WARNING("file %s has %ld, but request %ld, loading model fails!",
filename.c_str(),
file_size_in_bytes,
loaded_data_size);
return std::vector<T>();
}
in.close();
// If we succeed, return an array with values.
return host_array;
size_t size = dim0 * dim1;
if (size == 0) {
TM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
return std::vector<T>();
}
else {
// concate all slices on the same dims

if (slices.size() != shape.size()) {
printf("[ERROR] slices should have same dims as shape \n");
return std::vector<T>();
}

// get slices
ConcateSlice slice0{{{0, dim0}}};
ConcateSlice slice1{{{0, dim1}}};
if (slices.size() > 0 && slices[0].slices.size() > 0) {
slice0 = slices[0];
}
if (shape.size() == 2 && slices[1].slices.size() > 0) {
slice1 = slices[1];
}

size_t w0 = 0;
for (auto& s : slice0.slices) {
if (s.second > dim0) {
s.second = dim0;
}
if (s.second < s.first) {
printf("[ERROR] slice0: end < start \n");
return std::vector<T>();
}
w0 += s.second - s.first;
}

size_t w1 = 0;
for (auto& s : slice1.slices) {
if (s.second > dim1) {
s.second = dim1;
}
if (s.second < s.first) {
printf("[ERROR] slice1: end < start \n");
return std::vector<T>();
}
w1 += s.second - s.first;
}

size_t size = w0 * w1;
size_t loaded_data_size = size * sizeof(T);

TM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename + " with slice.");
if (size == 0) {
TM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
return std::vector<T>();
}
std::vector<T> host_array(size);
std::ifstream in(filename, std::ios::in | std::ios::binary);
if (!in.is_open()) {
TM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
return std::vector<T>();
}

std::vector<T> host_array(size);
std::ifstream in(filename, std::ios::in | std::ios::binary);
if (!in.is_open()) {
TM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
return std::vector<T>();
}
size_t loaded_data_size = sizeof(T) * size;
in.seekg(0, in.end);
const auto file_size_in_bytes = (size_t)in.tellg();
in.seekg(0, in.beg);

char* host_ptr = (char*)host_array.data();
if (slice1.slices.size() == 0
|| (slice1.slices.size() == 1 && slice1.slices[0].second - slice1.slices[0].first == dim1)) {
for (auto& s : slice0.slices) {
size_t read_size = (s.second - s.first) * dim1 * sizeof(T);
size_t pos = s.first * dim1;
in.seekg(pos * sizeof(T));
in.read((char*)host_ptr, read_size);
host_ptr += read_size;
}
in.close();
return host_array;
}
TM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename);
in.read((char*)host_array.data(), loaded_data_size);

{
for (auto& s0 : slice0.slices) {
// loop over outer slice
for (size_t line_id = s0.first; line_id < s0.second; ++line_id) {
// loop over lines
size_t pos0 = line_id * dim1;
for (auto& s1 : slice1.slices) {
// loop over inner slice
size_t pos = pos0 + s1.first;
size_t read_size = (s1.second - s1.first) * sizeof(T);
in.seekg(pos * sizeof(T));
in.read(host_ptr, read_size);
host_ptr += read_size;
}
}
}
in.close();
}
return host_array;
if (file_size_in_bytes != loaded_data_size) {
TM_LOG_WARNING("file %s has %ld, but request %ld, loading model fails!",
filename.c_str(),
file_size_in_bytes,
loaded_data_size);
return std::vector<T>();
}
in.close();
// If we succeed, return an array with values.
return host_array;
}

std::vector<float> loadArrayFromBin(std::vector<size_t> shape, std::string filename, std::vector<ConcateSlice> slices)
std::vector<float> loadArrayFromBin(std::vector<size_t> shape, std::string filename)
{
return loadWeightFromBinHelper<float>(shape, filename, slices);
return loadWeightFromBinHelper<float>(shape, filename);
}

template<typename T, typename T_IN>
int loadWeightFromBinFunc(T* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices = std::vector<ConcateSlice>())
int loadWeightFromBinFunc(T* ptr, std::vector<size_t> shape, std::string filename)
{
std::vector<T_IN> host_array = loadWeightFromBinHelper<T_IN>(shape, filename, slices);
std::vector<T_IN> host_array = loadWeightFromBinHelper<T_IN>(shape, filename);

if (host_array.empty()) {
return 0;
@@ -472,84 +374,49 @@ int loadWeightFromBinFunc(T* ptr,
return 0;
}

template int loadWeightFromBinFunc<float, float>(float* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<half, float>(half* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<float, half>(float* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<half, half>(half* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<int8_t, int8_t>(int8_t* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<float, float>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, float>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<float, half>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, half>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<int8_t, int8_t>(int8_t* ptr, std::vector<size_t> shape, std::string filename);
#ifdef ENABLE_BF16
template int loadWeightFromBinFunc<__nv_bfloat16, float>(__nv_bfloat16* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<__nv_bfloat16, half>(__nv_bfloat16* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<float, __nv_bfloat16>(float* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<half, __nv_bfloat16>(half* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int
loadWeightFromBinFunc<__nv_bfloat16, float>(__nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
template int
loadWeightFromBinFunc<__nv_bfloat16, half>(__nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<float, __nv_bfloat16>(float* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<half, __nv_bfloat16>(half* ptr, std::vector<size_t> shape, std::string filename);
template int loadWeightFromBinFunc<__nv_bfloat16, __nv_bfloat16>(__nv_bfloat16* ptr,
std::vector<size_t> shape,
std::string filename);
#endif // ENABLE_BF16
template int loadWeightFromBinFunc<int, int>(int* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int loadWeightFromBinFunc<int, int>(int* ptr, std::vector<size_t> shape, std::string filename);
#ifdef ENABLE_FP8
template int loadWeightFromBinFunc<__nv_fp8_e4m3, float>(__nv_fp8_e4m3* ptr,
std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices);
template int
loadWeightFromBinFunc<__nv_fp8_e4m3, float>(__nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename);
#endif // ENABLE_FP8

template<typename T>
int loadWeightFromBin(T* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type,
std::vector<ConcateSlice> slices)
int loadWeightFromBin(T* ptr, std::vector<size_t> shape, std::string filename, FtCudaDataType model_file_type)
{
switch (model_file_type) {
case FtCudaDataType::FP32:
loadWeightFromBinFunc<T, float>(ptr, shape, filename, slices);
loadWeightFromBinFunc<T, float>(ptr, shape, filename);
break;
case FtCudaDataType::FP16:
loadWeightFromBinFunc<T, half>(ptr, shape, filename, slices);
loadWeightFromBinFunc<T, half>(ptr, shape, filename);
break;
case FtCudaDataType::INT8:
loadWeightFromBinFunc<T, int8_t>(ptr, shape, filename, slices);
loadWeightFromBinFunc<T, int8_t>(ptr, shape, filename);
break;
#ifdef ENABLE_BF16
case FtCudaDataType::BF16:
loadWeightFromBinFunc<T, __nv_bfloat16>(ptr, shape, filename, slices);
loadWeightFromBinFunc<T, __nv_bfloat16>(ptr, shape, filename);
break;
#endif
#ifdef ENABLE_FP8
case FtCudaDataType::FP8:
loadWeightFromBinFunc<T, float>(ptr, shape, filename, slices);
loadWeightFromBinFunc<T, float>(ptr, shape, filename);
break;
#endif
default:
@@ -560,50 +427,28 @@ int loadWeightFromBin(T* ptr,
}

template<>
int loadWeightFromBin(int* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type,
std::vector<ConcateSlice> slices)
int loadWeightFromBin(int* ptr, std::vector<size_t> shape, std::string filename, FtCudaDataType model_file_type)
{
loadWeightFromBinFunc<int, int>(ptr, shape, filename, slices);
loadWeightFromBinFunc<int, int>(ptr, shape, filename);
return 0;
}

template int loadWeightFromBin(float* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type,
std::vector<ConcateSlice> slices);
template int loadWeightFromBin(half* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type,
std::vector<ConcateSlice> slices);
template int loadWeightFromBin(int8_t* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type,
std::vector<ConcateSlice> slices);
template int
loadWeightFromBin(float* ptr, std::vector<size_t> shape, std::string filename, FtCudaDataType model_file_type);
template int
loadWeightFromBin(half* ptr, std::vector<size_t> shape, std::string filename, FtCudaDataType model_file_type);
template int
loadWeightFromBin(int8_t* ptr, std::vector<size_t> shape, std::string filename, FtCudaDataType model_file_type);
#ifdef ENABLE_BF16
template int loadWeightFromBin(__nv_bfloat16* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type,
std::vector<ConcateSlice> slices);
template int
loadWeightFromBin(__nv_bfloat16* ptr, std::vector<size_t> shape, std::string filename, FtCudaDataType model_file_type);
#endif
#ifdef ENABLE_FP8
template int loadWeightFromBin(__nv_fp8_e4m3* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type,
std::vector<ConcateSlice> slices);
template int
loadWeightFromBin(__nv_fp8_e4m3* ptr, std::vector<size_t> shape, std::string filename, FtCudaDataType model_file_type);
#endif
template int loadWeightFromBin(int* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type,
std::vector<ConcateSlice> slices);
template int
loadWeightFromBin(int* ptr, std::vector<size_t> shape, std::string filename, FtCudaDataType model_file_type);

template<typename T_IN, typename T_OUT>
__global__ void cudaD2DcpyConvert(T_OUT* dst, const T_IN* src, const size_t size)
17 changes: 5 additions & 12 deletions src/turbomind/utils/memory_utils.h
@@ -49,20 +49,13 @@ void cudaAutoCpy(T* tgt, const T* src, const size_t size, cudaStream_t stream =
template<typename T>
void cudaRandomUniform(T* buffer, const size_t size);

struct ConcateSlice {
std::vector<std::pair<size_t, size_t>> slices;
};

template<typename T>
int loadWeightFromBin(T* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type = FtCudaDataType::FP32,
std::vector<ConcateSlice> slices = std::vector<ConcateSlice>());
int loadWeightFromBin(T* ptr,
std::vector<size_t> shape,
std::string filename,
FtCudaDataType model_file_type = FtCudaDataType::FP32);

std::vector<float> loadArrayFromBin(std::vector<size_t> shape,
std::string filename,
std::vector<ConcateSlice> slices = std::vector<ConcateSlice>());
std::vector<float> loadArrayFromBin(std::vector<size_t> shape, std::string filename);

// template<typename T>
// int loadWeightFromBinAndQuantizeForWeightOnly(int8_t* quantized_weight_ptr,
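
For reference, callers that still need the behaviour of the removed row-slice path (reading only selected row ranges of a row-major 2-D tensor and concatenating them) can reproduce it outside the loader with plain std::ifstream seeks, mirroring the read loop deleted above. A standalone sketch, not part of this commit; the helper name and the float element type are hypothetical:

#include <cstdio>
#include <fstream>
#include <string>
#include <utility>
#include <vector>

// Reads the row ranges [first, second) of a row-major dim0 x dim1 float tensor stored
// as a raw binary file and concatenates them into one host buffer, similar to the
// row-only slice branch removed from loadWeightFromBinHelper.
std::vector<float> loadRowSlices(const std::string&                            filename,
                                 size_t                                        dim0,
                                 size_t                                        dim1,
                                 const std::vector<std::pair<size_t, size_t>>& row_ranges)
{
    size_t rows = 0;
    for (const auto& r : row_ranges) {
        if (r.first > r.second || r.second > dim0) {
            std::printf("[ERROR] invalid row range\n");
            return {};
        }
        rows += r.second - r.first;
    }

    std::vector<float> host(rows * dim1);
    std::ifstream      in(filename, std::ios::in | std::ios::binary);
    if (!in.is_open()) {
        std::printf("[ERROR] cannot open %s\n", filename.c_str());
        return {};
    }

    char* dst = reinterpret_cast<char*>(host.data());
    for (const auto& r : row_ranges) {
        const size_t bytes = (r.second - r.first) * dim1 * sizeof(float);
        in.seekg(static_cast<std::streamoff>(r.first * dim1 * sizeof(float)));
        in.read(dst, static_cast<std::streamsize>(bytes));
        dst += bytes;
    }
    return host;
}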
