This repository was archived by the owner on Aug 5, 2025. It is now read-only.
A problem when modifying the GPT example to fit Llama2-7b-chat #1146
Hi, I am a student interested in pipeline parallelism for LLM inference. I have successfully run the GPT example from the PyTorch documentation, so I wanted to modify it to work with the Llama2 model on a single server with multiple GPUs. Here is my script:
```python
# Copyright (c) Meta Platforms, Inc. and affiliates
# Minimum effort to run this example:
# $ torchrun --nproc-per-node 4 pipeline_inference.py
import argparse
import os

import torch
import torch.distributed as dist
from torch.distributed.pipelining import pipeline, PipelineStage, ScheduleGPipe, SplitPoint
from transformers import AutoModelForCausalLM, AutoTokenizer


def run(args):
    # Grab the model
    llama = AutoModelForCausalLM.from_pretrained(
        "/zt/model/Llama-2-7b-chat-hf", low_cpu_mem_usage=True, local_files_only=True
    )
    # print(llama)
    tokenizer = AutoTokenizer.from_pretrained("/zt/model/Llama-2-7b-chat-hf", local_files_only=True)
    tokenizer.pad_token = tokenizer.eos_token

    mb_prompts = (
        "How do you", "I like to",
    )  # microbatch size = 2

    llama.to(args.device).eval()

    # Cut model by equal number of layers per rank
    layers_per_rank = llama.config.num_hidden_layers // args.world_size
    print(f"layers_per_rank = {layers_per_rank}")
    split_spec = {
        f"model.layers.{i * layers_per_rank}": SplitPoint.BEGINNING
        for i in range(1, args.world_size)
    }

    # Create a pipeline representation from the model
    mb_inputs = tokenizer(mb_prompts, return_tensors="pt", padding=True).to(args.device)
    pipe = pipeline(
        module=llama,
        mb_args=(mb_inputs["input_ids"],),
        split_spec=split_spec,
    )

    # Create pipeline stage for each rank
    stage = pipe.build_stage(args.rank, device=args.device)

    # Run time inputs
    full_batch_prompts = (
        "How do you", "I like to", "Can I help", "You need to",
        "The weather is", "I found a", "What is your", "You are so",
    )  # full batch size = 8
    inputs = tokenizer(full_batch_prompts, return_tensors="pt", padding=True).to(args.device)

    # Attach to a schedule
    # number of microbatches = 8 // 2 = 4
    num_mbs = 4
    schedule = ScheduleGPipe(stage, num_mbs)

    # Run
    if args.rank == 0:
        tmp = inputs["input_ids"]
    else:
        tmp = None
    output = schedule.step(tmp)

    # Decode
    if output is not None:
        next_token_logits = output[0][:, -1, :]
        next_token = torch.argmax(next_token_logits, dim=-1)
        print(tokenizer.batch_decode(next_token))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--world_size', type=int, default=int(os.getenv("WORLD_SIZE", 4)))
    parser.add_argument('--rank', type=int, default=int(os.getenv("RANK", -1)))
    parser.add_argument('--master_addr', type=str, default=os.getenv('MASTER_ADDR', 'localhost'))
    parser.add_argument('--master_port', type=str, default=os.getenv('MASTER_PORT', '29500'))
    parser.add_argument('--schedule', type=str, default="FillDrain")  # this may be related to the LLM scheduling policy
    parser.add_argument('--cuda', type=int, default=int(torch.cuda.is_available()))
    parser.add_argument("--chunks", type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=4)
    parser.add_argument('--batches', type=int, default=1)
    args = parser.parse_args()

    if args.cuda:
        # For multi-device runs, make sure each process is pinned to an available GPU
        dev_id = args.rank % torch.cuda.device_count()
        args.device = torch.device(f"cuda:{dev_id}")
    else:
        args.device = torch.device("cpu")

    # Init process group
    backend = "nccl" if args.cuda else "gloo"
    dist.init_process_group(
        backend=backend,
        rank=args.rank,
        world_size=args.world_size,
    )

    run(args)

    # Destroy the process group
    dist.destroy_process_group()
```
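For reference, Llama-2-7b-chat has 32 decoder layers, so with `world_size = 4` the dict comprehension above should resolve to three cut points, one at the start of each non-first stage (the `layers_per_rank = 8` printed in the log below matches this):

```python
# What split_spec should evaluate to for Llama-2-7b-chat (32 layers) on 4 ranks:
split_spec = {
    "model.layers.8": SplitPoint.BEGINNING,   # stage 1 starts here
    "model.layers.16": SplitPoint.BEGINNING,  # stage 2 starts here
    "model.layers.24": SplitPoint.BEGINNING,  # stage 3 starts here
}
```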
The idea of my script is simply to combine the GPT and Llama2 examples mentioned in the PyTorch documentation. But it produced the error below:
```
(pippy) root@678c7278cb2d:/zt/code/my_dev# torchrun --nproc-per-node 4 pipeline_inference.py
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793]
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] *****************************************
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] *****************************************
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.26s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.27s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.33s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.37s/it]
layers_per_rank = 8
layers_per_rank = 8
layers_per_rank = 8
layers_per_rank = 8
[rank1]:[E1101 15:32:04.207518491 ProcessGroupNCCL.cpp:616] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
[rank1]:[E1101 15:32:04.208047840 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 1] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]:[E1101 15:32:04.252849132 ProcessGroupNCCL.cpp:616] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600051 milliseconds before timing out.
[rank3]:[E1101 15:32:04.253307351 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 3] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]: Traceback (most recent call last):
[rank3]: File "/zt/code/my_dev/pipeline_inference.py", line 108, in <module>
[rank3]: run(args)
[rank3]: File "/zt/code/my_dev/pipeline_inference.py", line 68, in run
[rank3]: output = schedule.step(tmp)
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 615, in step
[rank3]: self._step_microbatches(args_split, kwargs_split, targets_split, losses)
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 702, in _step_microbatches
[rank3]: works = _sorted_batch_p2p(ops, desc="fwd_recv")
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 549, in _sorted_batch_p2p
[rank3]: work_by_peer[peer] = _batch_p2p(ops, desc=desc)
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 524, in _batch_p2p
[rank3]: return dist.batch_isend_irecv(p2p_ops).pop()
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2370, in batch_isend_irecv
[rank3]: with _coalescing_manager(group, device, async_ops=True) as cm:
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/contextlib.py", line 142, in __exit__
[rank3]: next(self.gen)
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2317, in _coalescing_manager
[rank3]: work = group._end_coalescing(device)
[rank3]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:4409, internal error - please report this issue to the NCCL developers, NCCL version 2.21.5
[rank3]: ncclInternalError: Internal check failed.
[rank3]: Last error:
[rank2]:[E1101 15:32:04.276254351 ProcessGroupNCCL.cpp:616] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
[rank2]:[E1101 15:32:04.276611660 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 2] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank2]: Traceback (most recent call last):
[rank2]: File "/zt/code/my_dev/pipeline_inference.py", line 108, in <module>
[rank2]: run(args)
[rank2]: File "/zt/code/my_dev/pipeline_inference.py", line 68, in run
[rank2]: output = schedule.step(tmp)
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 615, in step
[rank2]: self._step_microbatches(args_split, kwargs_split, targets_split, losses)
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 702, in _step_microbatches
[rank2]: works = _sorted_batch_p2p(ops, desc="fwd_recv")
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 549, in _sorted_batch_p2p
[rank2]: work_by_peer[peer] = _batch_p2p(ops, desc=desc)
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 524, in _batch_p2p
[rank2]: return dist.batch_isend_irecv(p2p_ops).pop()
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2370, in batch_isend_irecv
[rank2]: with _coalescing_manager(group, device, async_ops=True) as cm:
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/contextlib.py", line 142, in __exit__
[rank2]: next(self.gen)
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2317, in _coalescing_manager
[rank2]: work = group._end_coalescing(device)
[rank2]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:4409, internal error - please report this issue to the NCCL developers, NCCL version 2.21.5
[rank2]: ncclInternalError: Internal check failed.
[rank2]: Last error:
[rank1]:[E1101 15:32:04.403228258 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 1] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank1]:[E1101 15:32:04.403366148 ProcessGroupNCCL.cpp:630] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank1]:[E1101 15:32:04.403379158 ProcessGroupNCCL.cpp:636] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
[rank1]:[E1101 15:32:04.410046876 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x756efac36446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x756eb042a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x756eb0431ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x756eb043360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x756efad9d5c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x756efb65aac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x756efb6ebbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
[rank2]:[E1101 15:32:05.919908176 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 2] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank2]:[E1101 15:32:05.919949036 ProcessGroupNCCL.cpp:630] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank2]:[E1101 15:32:05.919961456 ProcessGroupNCCL.cpp:636] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
[rank2]:[E1101 15:32:05.921183834 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7b15cd469446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7b1582c2a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7b1582c31ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7b1582c3360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x7b15cd5d05c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x7b15cde8dac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x7b15cdf1ebf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
[rank3]:[E1101 15:32:05.943737394 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 3] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]:[E1101 15:32:05.943765914 ProcessGroupNCCL.cpp:630] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank3]:[E1101 15:32:05.943772314 ProcessGroupNCCL.cpp:636] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
[rank3]:[E1101 15:32:05.944944992 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600051 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x740327cb2446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7402dd42a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7402dd431ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7402dd43360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x740327e195c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x7403286d6ac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x740328767bf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
W1101 15:32:05.280000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 9692 closing signal SIGTERM
W1101 15:32:05.281000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 9693 closing signal SIGTERM
E1101 15:32:05.960000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -6) local_rank: 1 (pid: 9691) of binary: /root/miniconda3/envs/pippy/bin/python
Traceback (most recent call last):
File "/root/miniconda3/envs/pippy/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.5.0', 'console_scripts', 'torchrun')())
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
run(args)
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
elastic_launch(
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
=====================================================
pipeline_inference.py FAILED
-----------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
-----------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-11-01_15:32:05
host : 678c7278cb2d
rank : 1 (local_rank: 1)
exitcode : -6 (pid: 9691)
error_file: <N/A>
traceback : Signal 6 (SIGABRT) received by PID 9691
=====================================================
```
It seems that the NCCL communication timed out, even though I did run the GPT example successfully. How can I fix this? Thank you!
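In case it is relevant, one change I am considering (not yet tested, so this is only a guess at the cause) is to pin each rank to its GPU and pass the device to `init_process_group`, so the NCCL communicator is created eagerly rather than inside the first batched P2P (`dist.batch_isend_irecv`) where the hang shows up:

```python
# Untested sketch: bind the process to its GPU before creating the process group,
# and tell init_process_group which device to use (the device_id argument is
# supported in recent PyTorch releases, including the torch 2.5.0 shown above).
torch.cuda.set_device(dev_id)
dist.init_process_group(
    backend="nccl",
    rank=args.rank,
    world_size=args.world_size,
    device_id=torch.device(f"cuda:{dev_id}"),
)
```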