
Commit 5761450: merge

HappyAmazonian committed Dec 18, 2024
2 parents ab7e10b + 92a075f
Showing 10 changed files with 50 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/nightly.yml
@@ -25,7 +25,7 @@ on:
permissions:
  id-token: write
  contents: read

jobs:
  build:
    uses: ./.github/workflows/docker-nightly-publish.yml
2 changes: 1 addition & 1 deletion engines/python/setup/djl_python/huggingface.py
@@ -409,7 +409,7 @@ def _init_tokenizer(self, model_id_or_path: str):
        path_to_use = model_id_or_path if self.peft_config is None else self.peft_config.base_model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(
            path_to_use,
-           padding_size="left",
+           padding_side="left",
            trust_remote_code=self.hf_configs.trust_remote_code,
            revision=self.hf_configs.revision,
        )
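
The fix above corrects a silent bug: AutoTokenizer.from_pretrained ignores
unrecognized keyword arguments, so the misspelled padding_size never took
effect and the tokenizer kept its default right padding. A minimal sketch of
the corrected call (the checkpoint and prompts are illustrative, not from
this repo):

    from transformers import AutoTokenizer

    # "gpt2" is only an illustrative checkpoint; the real code passes path_to_use.
    tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

    # Decoder-only models generate from the last position, so batched prompts
    # are left-padded to keep the real tokens adjacent to the new ones.
    batch = tokenizer(["short prompt", "a noticeably longer prompt"],
                      padding=True, return_tensors="pt")
    print(batch["input_ids"][0])  # pad ids appear on the left of the short row
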
engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py

@@ -30,8 +30,7 @@
from djl_python.telemetry import telemetry_manager
from djl_python.properties_manager.lmi_dist_rb_properties import LmiDistRbProperties

-LMI_DIST_GENERATION_PARAMS = set(RequestParams().__dict__.keys()).union(
-    set(SamplingParams().__struct_fields__)) - {"sampling_params"}
+LMI_DIST_GENERATION_PARAMS = set(RequestParams().__struct_fields__)


class LmiDistRollingBatch(RollingBatch):
@@ -188,7 +187,7 @@ def inference(self, new_requests: List[Request]) -> List:
        new_lmi_dist_requests = []
        for request in new_requests:
            request_id = str(request.id)
-           llm_input = get_prompt_inputs(request)
+           prompt_inputs = get_prompt_inputs(request)
            params = self.translate_lmi_dist_params(request.parameters)
            request_params = RequestParams(**params)
            lora_request_params = dict()
@@ -197,13 +196,10 @@
                lora_request_params["lora_request"] = get_lora_request(
                    adapter_name, self.lora_requests)
            # Constructing Request in lmi-dist library
-           lmi_dist_request = Request(
-               id=request_id,
-               prompt=llm_input.get("prompt"),
-               prompt_token_ids=llm_input.get("prompt_token_ids"),
-               multi_modal_input=llm_input.get("multi_modal_data"),
-               params=request_params,
-               **lora_request_params)
+           lmi_dist_request = Request(id=request_id,
+                                      prompt=prompt_inputs,
+                                      params=request_params,
+                                      **lora_request_params)
            new_lmi_dist_requests.append(lmi_dist_request)
            self.request_cache[request_id] = {
                "request_output": request.request_output
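
The simplified constant works because lmi-dist's RequestParams is a msgspec
Struct whose declared fields now cover the sampling parameters directly, so
__struct_fields__ alone yields the full set of accepted generation keys. A
sketch of the mechanism with a stand-in Struct (the real RequestParams lives
in lmi-dist and has many more fields):

    import msgspec

    # Stand-in for lmi-dist's RequestParams, purely for illustration.
    class RequestParams(msgspec.Struct):
        max_new_tokens: int = 30
        temperature: float = 1.0
        top_p: float = 1.0

    # msgspec exposes declared field names as a tuple on the class and its
    # instances, so no __dict__ merging or post-hoc exclusions are needed.
    LMI_DIST_GENERATION_PARAMS = set(RequestParams().__struct_fields__)
    print(sorted(LMI_DIST_GENERATION_PARAMS))
    # ['max_new_tokens', 'temperature', 'top_p']
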
engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py

@@ -304,7 +304,7 @@ def get_engine_args_from_config(config: VllmRbProperties) -> EngineArgs:
    )


-def get_multi_modal_data(request: Request) -> dict:
+def get_multi_modal_data(request: Request) -> Optional[dict]:
    parameters = request.parameters
    images = parameters.pop("images", None)
    multi_modal_data = None
@@ -320,8 +320,10 @@ def get_prompt_inputs(request: Request):
    # In both HuggingFace and mistral cases, that process can also yield token-ids directly
    # that we may want to consider passing directly to the engine
    if isinstance(text_prompt, list):
-       return TokensPrompt(prompt_token_ids=text_prompt,
-                           multi_modal_data=multi_modal_data)
+       prompt = TokensPrompt(prompt_token_ids=text_prompt)
    else:
-       return TextPrompt(prompt=text_prompt,
-                         multi_modal_data=multi_modal_data)
+       prompt = TextPrompt(prompt=text_prompt)
+
+   if multi_modal_data is not None:
+       prompt["multi_modal_data"] = multi_modal_data
+   return prompt
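
The refactor works because vLLM's TextPrompt and TokensPrompt are TypedDicts:
plain dicts at runtime, so the optional multi_modal_data key can be attached
once after construction instead of being passed to both constructors, where a
None value would otherwise leak into the prompt. A self-contained sketch of
the same shape, outside the Request plumbing:

    from vllm.inputs import TextPrompt, TokensPrompt

    def build_prompt(text_prompt, multi_modal_data=None):
        # Upstream tokenization may hand us token ids (a list) or raw text.
        if isinstance(text_prompt, list):
            prompt = TokensPrompt(prompt_token_ids=text_prompt)
        else:
            prompt = TextPrompt(prompt=text_prompt)
        # Optional keys are set only when present, keeping the dict minimal.
        if multi_modal_data is not None:
            prompt["multi_modal_data"] = multi_modal_data
        return prompt

    print(build_prompt("hello"))    # {'prompt': 'hello'}
    print(build_prompt([1, 2, 3]))  # {'prompt_token_ids': [1, 2, 3]}
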
28 changes: 6 additions & 22 deletions engines/python/setup/djl_python/seq_scheduler/lm_block.py
@@ -15,6 +15,7 @@
from typing import Tuple, Union

import torch
+from transformers import DynamicCache


class LMBlock(ABC):
@@ -107,34 +108,17 @@ def forward(self, input_ids: torch.tensor, position_ids: torch.tensor,

        # Pre-process
        if past_key_values is not None:
-           _, num_head, seq_len, kv_dim = past_key_values[0][0].shape
-           new_kv_list = []
-           for k, v in past_key_values:
-               k_new = torch.permute(
-                   k.view(batch_size * num_head, seq_len, kv_dim), (0, 2, 1))
-               v_new = v.view(batch_size * num_head, seq_len, kv_dim)
-               new_kv_list.append((k_new, v_new))
-           past_key_values = tuple(new_kv_list)
+           cache = DynamicCache.from_legacy_cache(past_key_values)
+       else:
+           cache = DynamicCache()

        # Forward
        output = self.model.forward(input_ids=input_ids,
                                    position_ids=position_ids,
                                    attention_mask=attention_mask,
-                                   past_key_values=past_key_values,
+                                   past_key_values=cache,
                                    **self.config)
-       past_key_values = output.past_key_values

        # Post-process
-       _, kv_dim, seq_len = past_key_values[0][0].shape
-       new_kv_list = []
-       for k, v in past_key_values:
-           k_new = torch.permute(k, (0, 2, 1)).view(batch_size, -1, seq_len,
-                                                    kv_dim)
-           v_new = v.view(batch_size, -1, seq_len, kv_dim)
-           new_kv_list.append((k_new, v_new))
-       past_key_values = tuple(new_kv_list)
-       output.past_key_values = past_key_values
+       output.past_key_values = output.past_key_values.to_legacy_cache()
        return output


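The rewrite delegates the key/value cache layout juggling to transformers'
DynamicCache, which converts the legacy tuple-of-tuples format in one call
each way, replacing both hand-rolled reshape loops. A minimal round-trip
sketch (shapes are illustrative):

    import torch
    from transformers import DynamicCache

    # Legacy format: a tuple over layers of (key, value) tensors, each shaped
    # [batch, num_heads, seq_len, head_dim].
    legacy = tuple(
        (torch.randn(2, 4, 5, 8), torch.randn(2, 4, 5, 8)) for _ in range(3))

    cache = DynamicCache.from_legacy_cache(legacy)
    print(cache.get_seq_length())  # 5

    # to_legacy_cache() restores the tuple-of-tuples layout unchanged.
    round_tripped = cache.to_legacy_cache()
    assert torch.equal(round_tripped[0][0], legacy[0][0])
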
4 changes: 2 additions & 2 deletions engines/python/setup/setup.py
@@ -56,8 +56,8 @@ def run(self):
requirements = ['psutil', 'packaging', 'wheel']

test_requirements = [
-    'numpy<2', 'requests', 'Pillow', 'transformers==4.43.4', 'torch',
-    'einops', 'accelerate', 'sentencepiece', 'protobuf', "peft", 'yapf',
+    'numpy<2', 'requests', 'Pillow', 'transformers', 'torch', 'einops',
+    'accelerate', 'sentencepiece', 'protobuf', "peft", 'yapf',
    'pydantic>=2.0', "objgraph"
]
2 changes: 1 addition & 1 deletion serving/docker/partition/utils.py
@@ -128,7 +128,7 @@ def remove_option_from_properties(properties: dict):
def init_hf_tokenizer(model_id_or_path: str, hf_configs):
    tokenizer = AutoTokenizer.from_pretrained(
        model_id_or_path,
-       padding_size="left",
+       padding_side="left",
        trust_remote_code=hf_configs.trust_remote_code,
        revision=hf_configs.revision,
    )
18 changes: 12 additions & 6 deletions tests/integration/llm/prepare.py
@@ -629,6 +629,12 @@
        "option.tensor_parallel_degree": 4,
        "option.device_map": "auto"
    },
+   "llama-3.1-8b": {
+       "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
+       "option.task": "text-generation",
+       "option.tensor_parallel_degree": 4,
+       "option.max_rolling_batch_size": 4
+   },
    "llava_v1.6-mistral": {
        "option.model_id": "s3://djl-llm/llava-v1.6-mistral-7b-hf/",
        "option.limit_mm_per_prompt": "image=4",
@@ -643,12 +649,6 @@
        "option.trust_remote_code": True,
        "option.max_model_len": 8192,
    },
-   "llama-3.1-8b": {
-       "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
-       "option.task": "text-generation",
-       "option.tensor_parallel_degree": 4,
-       "option.max_rolling_batch_size": 4
-   },
    "pixtral-12b": {
        "option.model_id": "s3://djl-llm/pixtral-12b/",
        "option.max_model_len": 8192,
@@ -657,6 +657,12 @@
        "option.limit_mm_per_prompt": "image=4",
        "option.entryPoint": "djl_python.huggingface"
    },
+   "llama32-11b-multimodal": {
+       "option.model_id": "s3://djl-llm/llama-3-2-11b-vision-instruct/",
+       "option.max_model_len": 8192,
+       "option.max_rolling_batch_size": 16,
+       "option.enforce_eager": True,
+   },
    "llama32-3b-multi-worker-tp1-pp1": {
        "option.model_id": "s3://djl-llm/llama-3-2-3b-instruct/",
        "option.tensor_parallel_degree": 1,
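
Each entry above is a flat dict of "option.*" keys for one test model; the
net effect of this hunk is that llama-3.1-8b moves earlier in the table and a
new llama32-11b-multimodal entry is added. Assuming the harness renders an
entry into a serving.properties file (the writer itself is outside this
diff), the shape would be roughly:

    # Hypothetical writer mirroring what prepare.py presumably does per entry.
    def write_serving_properties(options: dict, path: str = "serving.properties"):
        with open(path, "w") as f:
            for key, value in options.items():
                f.write(f"{key}={value}\n")

    # e.g. the new entry would render lines such as
    # option.model_id=s3://djl-llm/llama-3-2-11b-vision-instruct/
    # option.enforce_eager=True
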
6 changes: 6 additions & 0 deletions tests/integration/tests.py
@@ -983,6 +983,12 @@ def test_pixtral_12b(self):
            r.launch()
            client.run("multimodal pixtral-12b".split())

+   def test_mllama_11b(self):
+       with Runner('lmi', 'llama32-11b-multimodal') as r:
+           prepare.build_lmi_dist_model('llama32-11b-multimodal')
+           r.launch()
+           client.run("multimodal llama32-11b-multimodal".split())
+

class TestMultiModalVllm:
12 changes: 8 additions & 4 deletions wlm/src/main/java/ai/djl/serving/wlm/ModelInfo.java
@@ -250,9 +250,16 @@ public void load(Device device) throws ModelException, IOException {
            if (translatorFactory != null) {
                builder.optArgument("translatorFactory", translatorFactory);
            }
-           if (batchSize > 1) {
+           if (batchSize > 1 && !arguments.containsKey("batchifier")) {
                builder.optArgument("batchifier", "stack");
            }
+           if (translator == null
+                   && translatorFactory == null
+                   && "Python".equals(engineName)
+                   && !arguments.containsKey("translator")
+                   && !arguments.containsKey("translatorFactory")) {
+               builder.optTranslatorFactory(new NoopServingTranslatorFactory());
+           }
        }
        logger.info("Loading model {} {} on {}", id, uid, device);
        if ("nc".equals(device.getDeviceType()) && "PyTorch".equals(engineName)) {
@@ -265,9 +272,6 @@
            // override model_id
            builder.optOption("model_id", downloadDir.toAbsolutePath().toString());
        }
-       if (translator == null && translatorFactory == null && "Python".equals(engineName)) {
-           builder.optTranslatorFactory(new NoopServingTranslatorFactory());
-       }
        ZooModel<I, O> m = builder.build().loadModel();
        m.setProperty("metric_dimension", id);
