A problem when modifying the GPT example to fit Llama2-7b-chat #1146

Open
Noblezhong opened this issue Nov 1, 2024 · 0 comments
Hi, I am a student interested in pipeline parallelism for LLM inference. I have successfully run the GPT example from the PyTorch documentation, so I wanted to modify it to fit the Llama-2 model on a single server with multiple GPUs. Here is my script:

# Copyright (c) Meta Platforms, Inc. and affiliates

# Minimum effort to run this example:
# $ torchrun --nproc-per-node 4 pipeline_inference.py

import argparse
import os

import torch
import torch.distributed as dist
from torch.distributed.pipelining import pipeline, PipelineStage, ScheduleGPipe, SplitPoint

from transformers import AutoModelForCausalLM, AutoTokenizer


def run(args):
    # Grab the model
    llama = AutoModelForCausalLM.from_pretrained(
        "/zt/model/Llama-2-7b-chat-hf", low_cpu_mem_usage=True, local_files_only=True
    )
    # print(llama)

    tokenizer = AutoTokenizer.from_pretrained("/zt/model/Llama-2-7b-chat-hf", local_files_only=True)
    tokenizer.pad_token = tokenizer.eos_token
    mb_prompts = (
        "How do you", "I like to",
    )  # microbatch size = 2

    llama.to(args.device).eval()

    # Cut model by equal number of layers per rank
    layers_per_rank = llama.config.num_hidden_layers // args.world_size
    print(f"layers_per_rank = {layers_per_rank}")
    split_spec = {
        f"model.layers.{i * layers_per_rank}": SplitPoint.BEGINNING
        for i in range(1, args.world_size)
    }

    # Create a pipeline representation from the model
    mb_inputs = tokenizer(mb_prompts, return_tensors="pt", padding=True).to(args.device)
    pipe = pipeline(
        module=llama,
        mb_args=(mb_inputs["input_ids"],),
        split_spec=split_spec,
    )

    # Create pipeline stage for each rank
    stage = pipe.build_stage(args.rank, device=args.device)

    # Run time inputs
    full_batch_prompts = (
        "How do you", "I like to", "Can I help", "You need to",
        "The weather is", "I found a", "What is your", "You are so",
    )  # full batch size = 8
    inputs = tokenizer(full_batch_prompts, return_tensors="pt", padding=True).to(args.device)

    # Attach to a schedule
    # number of microbatches = 8 // 2 = 4
    num_mbs = 4
    schedule = ScheduleGPipe(stage, num_mbs)

    # Run
    if args.rank == 0:
        tmp = inputs["input_ids"]
    else:
        tmp = None

    output = schedule.step(tmp)

    # Decode
    if output is not None:
        next_token_logits = output[0][:, -1, :]
        next_token = torch.argmax(next_token_logits, dim=-1)
        print(tokenizer.batch_decode(next_token))
   



if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--world_size', type=int, default=int(os.getenv("WORLD_SIZE", 4)))
    parser.add_argument('--rank', type=int, default=int(os.getenv("RANK", -1)))
    parser.add_argument('--master_addr', type=str, default=os.getenv('MASTER_ADDR', 'localhost'))
    parser.add_argument('--master_port', type=str, default=os.getenv('MASTER_PORT', '29500'))
    parser.add_argument('--schedule', type=str, default="FillDrain")  # this may be related to the LLM scheduling strategy
    parser.add_argument('--cuda', type=int, default=int(torch.cuda.is_available()))
    parser.add_argument("--chunks", type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=4)
    parser.add_argument('--batches', type=int, default=1)

    args = parser.parse_args()

    if args.cuda:
        # For multi-GPU runs, make sure each process is assigned to an available GPU
        dev_id = args.rank % torch.cuda.device_count()
        args.device = torch.device(f"cuda:{dev_id}")
    else:
        args.device = torch.device("cpu")

    # Init process group
    backend = "nccl" if args.cuda else "gloo"
    dist.init_process_group(
        backend=backend,
        rank=args.rank,
        world_size=args.world_size,
    )

    run(args)

    # Destroy the process group
    dist.destroy_process_group()
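
For reference, Llama-2-7b has 32 decoder layers, so with a world size of 4 I expect the split spec above to resolve to something like the following (a minimal sketch on my part, assuming num_hidden_layers = 32):

# Sketch only: what split_spec should expand to for Llama-2-7b (32 layers) on 4 ranks,
# since layers_per_rank = 32 // 4 = 8.
from torch.distributed.pipelining import SplitPoint

split_spec = {
    "model.layers.8": SplitPoint.BEGINNING,   # rank 0 keeps layers 0-7
    "model.layers.16": SplitPoint.BEGINNING,  # rank 1 keeps layers 8-15
    "model.layers.24": SplitPoint.BEGINNING,  # rank 2 keeps layers 16-23; rank 3 gets 24-31
}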

The idea of my script is simply to combine the GPT and Llama-2 examples from the PyTorch documentation. But it fails with the error below:

(pippy) root@678c7278cb2d:/zt/code/my_dev# torchrun --nproc-per-node 4 pipeline_inference.py

W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] 
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] *****************************************
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] *****************************************
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
  _torch_pytree._register_pytree_node(
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00,  9.26s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00,  9.27s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00,  9.33s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00,  9.37s/it]
layers_per_rank = 8
layers_per_rank = 8
layers_per_rank = 8
layers_per_rank = 8
[rank1]:[E1101 15:32:04.207518491 ProcessGroupNCCL.cpp:616] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
[rank1]:[E1101 15:32:04.208047840 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 1] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]:[E1101 15:32:04.252849132 ProcessGroupNCCL.cpp:616] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600051 milliseconds before timing out.
[rank3]:[E1101 15:32:04.253307351 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 3] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]: Traceback (most recent call last):
[rank3]:   File "/zt/code/my_dev/pipeline_inference.py", line 108, in <module>
[rank3]:     run(args)
[rank3]:   File "/zt/code/my_dev/pipeline_inference.py", line 68, in run
[rank3]:     output = schedule.step(tmp)
[rank3]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 615, in step
[rank3]:     self._step_microbatches(args_split, kwargs_split, targets_split, losses)
[rank3]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 702, in _step_microbatches
[rank3]:     works = _sorted_batch_p2p(ops, desc="fwd_recv")
[rank3]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 549, in _sorted_batch_p2p
[rank3]:     work_by_peer[peer] = _batch_p2p(ops, desc=desc)
[rank3]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 524, in _batch_p2p
[rank3]:     return dist.batch_isend_irecv(p2p_ops).pop()
[rank3]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2370, in batch_isend_irecv
[rank3]:     with _coalescing_manager(group, device, async_ops=True) as cm:
[rank3]:   File "/root/miniconda3/envs/pippy/lib/python3.10/contextlib.py", line 142, in __exit__
[rank3]:     next(self.gen)
[rank3]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2317, in _coalescing_manager
[rank3]:     work = group._end_coalescing(device)
[rank3]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:4409, internal error - please report this issue to the NCCL developers, NCCL version 2.21.5
[rank3]: ncclInternalError: Internal check failed.
[rank3]: Last error:

[rank2]:[E1101 15:32:04.276254351 ProcessGroupNCCL.cpp:616] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
[rank2]:[E1101 15:32:04.276611660 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 2] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank2]: Traceback (most recent call last):
[rank2]:   File "/zt/code/my_dev/pipeline_inference.py", line 108, in <module>
[rank2]:     run(args)
[rank2]:   File "/zt/code/my_dev/pipeline_inference.py", line 68, in run
[rank2]:     output = schedule.step(tmp)
[rank2]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 615, in step
[rank2]:     self._step_microbatches(args_split, kwargs_split, targets_split, losses)
[rank2]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 702, in _step_microbatches
[rank2]:     works = _sorted_batch_p2p(ops, desc="fwd_recv")
[rank2]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 549, in _sorted_batch_p2p
[rank2]:     work_by_peer[peer] = _batch_p2p(ops, desc=desc)
[rank2]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 524, in _batch_p2p
[rank2]:     return dist.batch_isend_irecv(p2p_ops).pop()
[rank2]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2370, in batch_isend_irecv
[rank2]:     with _coalescing_manager(group, device, async_ops=True) as cm:
[rank2]:   File "/root/miniconda3/envs/pippy/lib/python3.10/contextlib.py", line 142, in __exit__
[rank2]:     next(self.gen)
[rank2]:   File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2317, in _coalescing_manager
[rank2]:     work = group._end_coalescing(device)
[rank2]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:4409, internal error - please report this issue to the NCCL developers, NCCL version 2.21.5
[rank2]: ncclInternalError: Internal check failed.
[rank2]: Last error:

[rank1]:[E1101 15:32:04.403228258 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 1] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank1]:[E1101 15:32:04.403366148 ProcessGroupNCCL.cpp:630] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank1]:[E1101 15:32:04.403379158 ProcessGroupNCCL.cpp:636] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
[rank1]:[E1101 15:32:04.410046876 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x756efac36446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x756eb042a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x756eb0431ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x756eb043360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x756efad9d5c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x756efb65aac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x756efb6ebbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)

[rank2]:[E1101 15:32:05.919908176 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 2] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank2]:[E1101 15:32:05.919949036 ProcessGroupNCCL.cpp:630] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank2]:[E1101 15:32:05.919961456 ProcessGroupNCCL.cpp:636] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
[rank2]:[E1101 15:32:05.921183834 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7b15cd469446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7b1582c2a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7b1582c31ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7b1582c3360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x7b15cd5d05c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x7b15cde8dac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x7b15cdf1ebf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)

[rank3]:[E1101 15:32:05.943737394 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 3] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]:[E1101 15:32:05.943765914 ProcessGroupNCCL.cpp:630] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank3]:[E1101 15:32:05.943772314 ProcessGroupNCCL.cpp:636] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
[rank3]:[E1101 15:32:05.944944992 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600051 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x740327cb2446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7402dd42a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7402dd431ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7402dd43360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x740327e195c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x7403286d6ac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x740328767bf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)

W1101 15:32:05.280000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 9692 closing signal SIGTERM
W1101 15:32:05.281000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 9693 closing signal SIGTERM
E1101 15:32:05.960000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -6) local_rank: 1 (pid: 9691) of binary: /root/miniconda3/envs/pippy/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/envs/pippy/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.5.0', 'console_scripts', 'torchrun')())
  File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
    return f(*args, **kwargs)
  File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
    run(args)
  File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
    elastic_launch(
  File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
=====================================================
pipeline_inference.py FAILED
-----------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
-----------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-11-01_15:32:05
  host      : 678c7278cb2d
  rank      : 1 (local_rank: 1)
  exitcode  : -6 (pid: 9691)
  error_file: <N/A>
  traceback : Signal 6 (SIGABRT) received by PID 9691
=====================================================

It seems that the NCCL communication times out, even though the GPT example runs successfully on the same setup. How can I fix it? Thank you!
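
One thing I am not sure about is whether each rank needs to be bound to its GPU before the process group is created. The snippet below is just my guess at what might be missing, not something I have verified (and the device_id argument to init_process_group may depend on the torch version):

# My guess (not verified): pin the CUDA device for this rank before any NCCL
# communication, and tell the process group which device it should use.
import os

import torch
import torch.distributed as dist

rank = int(os.getenv("RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "4"))

dev_id = rank % torch.cuda.device_count()
torch.cuda.set_device(dev_id)          # bind this process to its GPU before NCCL starts
device = torch.device(f"cuda:{dev_id}")

dist.init_process_group(
    backend="nccl",
    rank=rank,
    world_size=world_size,
    device_id=device,                  # may require a recent torch version
)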
