This repository was archived by the owner on Aug 5, 2025. It is now read-only.
A problem when modifying the GPT example to fit Llama2-7b-chat #1146
Hi, I am a student interested in pipeline parallelism for LLM inference. I have successfully run the GPT example from the PyTorch documentation, so I wanted to modify it to work with the Llama2 model on a single server with multiple GPUs. Here is my script:
```python
# Copyright (c) Meta Platforms, Inc. and affiliates
# Minimum effort to run this example:
# $ torchrun --nproc-per-node 4 pipeline_inference.py
import argparse
import os

import torch
import torch.distributed as dist
from torch.distributed.pipelining import pipeline, PipelineStage, ScheduleGPipe, SplitPoint
from transformers import AutoModelForCausalLM, AutoTokenizer


def run(args):
    # Grab the model
    llama = AutoModelForCausalLM.from_pretrained(
        "/zt/model/Llama-2-7b-chat-hf", low_cpu_mem_usage=True, local_files_only=True
    )
    # print(llama)
    tokenizer = AutoTokenizer.from_pretrained("/zt/model/Llama-2-7b-chat-hf", local_files_only=True)
    tokenizer.pad_token = tokenizer.eos_token

    mb_prompts = (
        "How do you", "I like to",
    )  # microbatch size = 2

    llama.to(args.device).eval()

    # Cut model by equal number of layers per rank
    layers_per_rank = llama.config.num_hidden_layers // args.world_size
    print(f"layers_per_rank = {layers_per_rank}")
    split_spec = {
        f"model.layers.{i * layers_per_rank}": SplitPoint.BEGINNING
        for i in range(1, args.world_size)
    }

    # Create a pipeline representation from the model
    mb_inputs = tokenizer(mb_prompts, return_tensors="pt", padding=True).to(args.device)
    pipe = pipeline(
        module=llama,
        mb_args=(mb_inputs["input_ids"],),
        split_spec=split_spec,
    )

    # Create pipeline stage for each rank
    stage = pipe.build_stage(args.rank, device=args.device)

    # Run time inputs
    full_batch_prompts = (
        "How do you", "I like to", "Can I help", "You need to",
        "The weather is", "I found a", "What is your", "You are so",
    )  # full batch size = 8
    inputs = tokenizer(full_batch_prompts, return_tensors="pt", padding=True).to(args.device)

    # Attach to a schedule
    # number of microbatches = 8 // 2 = 4
    num_mbs = 4
    schedule = ScheduleGPipe(stage, num_mbs)

    # Run
    if args.rank == 0:
        tmp = inputs["input_ids"]
    else:
        tmp = None
    output = schedule.step(tmp)

    # Decode
    if output is not None:
        next_token_logits = output[0][:, -1, :]
        next_token = torch.argmax(next_token_logits, dim=-1)
        print(tokenizer.batch_decode(next_token))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--world_size', type=int, default=int(os.getenv("WORLD_SIZE", 4)))
    parser.add_argument('--rank', type=int, default=int(os.getenv("RANK", -1)))
    parser.add_argument('--master_addr', type=str, default=os.getenv('MASTER_ADDR', 'localhost'))
    parser.add_argument('--master_port', type=str, default=os.getenv('MASTER_PORT', '29500'))
    parser.add_argument('--schedule', type=str, default="FillDrain")  # this may be related to the LLM scheduling policy
    parser.add_argument('--cuda', type=int, default=int(torch.cuda.is_available()))
    parser.add_argument("--chunks", type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=4)
    parser.add_argument('--batches', type=int, default=1)
    args = parser.parse_args()

    if args.cuda:
        # For multi-device runs, make sure each process is pinned to an available GPU
        dev_id = args.rank % torch.cuda.device_count()
        args.device = torch.device(f"cuda:{dev_id}")
    else:
        args.device = torch.device("cpu")

    # Init process group
    backend = "nccl" if args.cuda else "gloo"
    dist.init_process_group(
        backend=backend,
        rank=args.rank,
        world_size=args.world_size,
    )

    run(args)

    # Destroy the process group
    dist.destroy_process_group()
```
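For reference, Llama-2-7b-chat has 32 decoder layers, so with `world_size = 4` the dict comprehension above should resolve to three cut points, one at the start of each non-first stage (the `layers_per_rank = 8` printed in the log below matches this):

```python
# What split_spec should evaluate to for Llama-2-7b-chat (32 layers) on 4 ranks:
split_spec = {
    "model.layers.8": SplitPoint.BEGINNING,   # stage 1 starts here
    "model.layers.16": SplitPoint.BEGINNING,  # stage 2 starts here
    "model.layers.24": SplitPoint.BEGINNING,  # stage 3 starts here
}
```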
The idea of my script is simply to combine the GPT and Llama2 examples mentioned in the PyTorch documentation. But it produced the error below:
```
(pippy) root@678c7278cb2d:/zt/code/my_dev# torchrun --nproc-per-node 4 pipeline_inference.py
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793]
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] *****************************************
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W1101 15:21:21.465000 9688 site-packages/torch/distributed/run.py:793] *****************************************
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
/root/miniconda3/envs/pippy/lib/python3.10/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead.
_torch_pytree._register_pytree_node(
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.26s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.27s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.33s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00, 9.37s/it]
layers_per_rank = 8
layers_per_rank = 8
layers_per_rank = 8
layers_per_rank = 8
[rank1]:[E1101 15:32:04.207518491 ProcessGroupNCCL.cpp:616] [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
[rank1]:[E1101 15:32:04.208047840 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 1] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]:[E1101 15:32:04.252849132 ProcessGroupNCCL.cpp:616] [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600051 milliseconds before timing out.
[rank3]:[E1101 15:32:04.253307351 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 3] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]: Traceback (most recent call last):
[rank3]: File "/zt/code/my_dev/pipeline_inference.py", line 108, in <module>
[rank3]: run(args)
[rank3]: File "/zt/code/my_dev/pipeline_inference.py", line 68, in run
[rank3]: output = schedule.step(tmp)
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 615, in step
[rank3]: self._step_microbatches(args_split, kwargs_split, targets_split, losses)
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 702, in _step_microbatches
[rank3]: works = _sorted_batch_p2p(ops, desc="fwd_recv")
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 549, in _sorted_batch_p2p
[rank3]: work_by_peer[peer] = _batch_p2p(ops, desc=desc)
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 524, in _batch_p2p
[rank3]: return dist.batch_isend_irecv(p2p_ops).pop()
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2370, in batch_isend_irecv
[rank3]: with _coalescing_manager(group, device, async_ops=True) as cm:
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/contextlib.py", line 142, in __exit__
[rank3]: next(self.gen)
[rank3]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2317, in _coalescing_manager
[rank3]: work = group._end_coalescing(device)
[rank3]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:4409, internal error - please report this issue to the NCCL developers, NCCL version 2.21.5
[rank3]: ncclInternalError: Internal check failed.
[rank3]: Last error:
[rank2]:[E1101 15:32:04.276254351 ProcessGroupNCCL.cpp:616] [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
[rank2]:[E1101 15:32:04.276611660 ProcessGroupNCCL.cpp:1785] [PG ID 0 PG GUID 0(default_pg) Rank 2] Exception (either an error or timeout) detected by watchdog at work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank2]: Traceback (most recent call last):
[rank2]: File "/zt/code/my_dev/pipeline_inference.py", line 108, in <module>
[rank2]: run(args)
[rank2]: File "/zt/code/my_dev/pipeline_inference.py", line 68, in run
[rank2]: output = schedule.step(tmp)
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 615, in step
[rank2]: self._step_microbatches(args_split, kwargs_split, targets_split, losses)
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 702, in _step_microbatches
[rank2]: works = _sorted_batch_p2p(ops, desc="fwd_recv")
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 549, in _sorted_batch_p2p
[rank2]: work_by_peer[peer] = _batch_p2p(ops, desc=desc)
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/pipelining/schedules.py", line 524, in _batch_p2p
[rank2]: return dist.batch_isend_irecv(p2p_ops).pop()
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2370, in batch_isend_irecv
[rank2]: with _coalescing_manager(group, device, async_ops=True) as cm:
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/contextlib.py", line 142, in __exit__
[rank2]: next(self.gen)
[rank2]: File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2317, in _coalescing_manager
[rank2]: work = group._end_coalescing(device)
[rank2]: torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:4409, internal error - please report this issue to the NCCL developers, NCCL version 2.21.5
[rank2]: ncclInternalError: Internal check failed.
[rank2]: Last error:
[rank1]:[E1101 15:32:04.403228258 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 1] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank1]:[E1101 15:32:04.403366148 ProcessGroupNCCL.cpp:630] [Rank 1] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank1]:[E1101 15:32:04.403379158 ProcessGroupNCCL.cpp:636] [Rank 1] To avoid data inconsistency, we are taking the entire process down.
[rank1]:[E1101 15:32:04.410046876 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 1] Process group watchdog thread terminated with exception: [Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600002 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x756efac36446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x756eb042a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x756eb0431ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x756eb043360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x756efad9d5c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x756efb65aac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x756efb6ebbf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
[rank2]:[E1101 15:32:05.919908176 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 2] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank2]:[E1101 15:32:05.919949036 ProcessGroupNCCL.cpp:630] [Rank 2] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank2]:[E1101 15:32:05.919961456 ProcessGroupNCCL.cpp:636] [Rank 2] To avoid data inconsistency, we are taking the entire process down.
[rank2]:[E1101 15:32:05.921183834 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 2] Process group watchdog thread terminated with exception: [Rank 2] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600073 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x7b15cd469446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7b1582c2a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7b1582c31ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7b1582c3360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x7b15cd5d05c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x7b15cde8dac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x7b15cdf1ebf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
[rank3]:[E1101 15:32:05.943737394 ProcessGroupNCCL.cpp:1834] [PG ID 0 PG GUID 0(default_pg) Rank 3] Timeout at NCCL work: 1, last enqueued NCCL work: 1, last completed NCCL work: -1.
[rank3]:[E1101 15:32:05.943765914 ProcessGroupNCCL.cpp:630] [Rank 3] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank3]:[E1101 15:32:05.943772314 ProcessGroupNCCL.cpp:636] [Rank 3] To avoid data inconsistency, we are taking the entire process down.
[rank3]:[E1101 15:32:05.944944992 ProcessGroupNCCL.cpp:1595] [PG ID 0 PG GUID 0(default_pg) Rank 3] Process group watchdog thread terminated with exception: [Rank 3] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=COALESCED, NumelIn=18446744073709551615, NumelOut=18446744073709551615, Timeout(ms)=600000) ran for 600051 milliseconds before timing out.
Exception raised from checkTimeout at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:618 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x96 (0x740327cb2446 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::checkTimeout(std::optional<std::chrono::duration<long, std::ratio<1l, 1000l> > >) + 0x282 (0x7402dd42a762 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #2: c10d::ProcessGroupNCCL::watchdogHandler() + 0x233 (0x7402dd431ba3 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #3: c10d::ProcessGroupNCCL::ncclCommWatchdog() + 0x14d (0x7402dd43360d in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x145c0 (0x740327e195c0 in /root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/lib/libtorch.so)
frame #5: <unknown function> + 0x94ac3 (0x7403286d6ac3 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #6: clone + 0x44 (0x740328767bf4 in /usr/lib/x86_64-linux-gnu/libc.so.6)
W1101 15:32:05.280000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 9692 closing signal SIGTERM
W1101 15:32:05.281000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 9693 closing signal SIGTERM
E1101 15:32:05.960000 9688 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -6) local_rank: 1 (pid: 9691) of binary: /root/miniconda3/envs/pippy/bin/python
Traceback (most recent call last):
File "/root/miniconda3/envs/pippy/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.5.0', 'console_scripts', 'torchrun')())
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
run(args)
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
elastic_launch(
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/envs/pippy/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
=====================================================
pipeline_inference.py FAILED
-----------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
-----------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-11-01_15:32:05
host : 678c7278cb2d
rank : 1 (local_rank: 1)
exitcode : -6 (pid: 9691)
error_file: <N/A>
traceback : Signal 6 (SIGABRT) received by PID 9691
=====================================================
```
It seems that the NCCL communication timed out, even though I did run the GPT example successfully. How can I fix this? Thank you!
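In case it is relevant, one change I am considering (not yet tested, so this is only a guess at the cause) is to pin each rank to its GPU and pass the device to `init_process_group`, so the NCCL communicator is created eagerly rather than inside the first batched P2P (`dist.batch_isend_irecv`) where the hang shows up:

```python
# Untested sketch: bind the process to its GPU before creating the process group,
# and tell init_process_group which device to use (the device_id argument is
# supported in recent PyTorch releases, including the torch 2.5.0 shown above).
torch.cuda.set_device(dev_id)
dist.init_process_group(
    backend="nccl",
    rank=args.rank,
    world_size=args.world_size,
    device_id=torch.device(f"cuda:{dev_id}"),
)
```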