I'm not sure if this is a kineto issue, or maybe an issue on the NVIDIA side. On some very CPU overhead bound profiles, I'm seeing this artifact where we don't have the ac2g to the nccl a2a kernel:
It seems to happen because the cuda kernel's start time is earlier than the host side launch's end time, so they're not displayed in perfetto. This script profiles comm launches and is able to repro this situation on 8xH100:
"""Standalone repro for the NCCL/CUPTI launch-latency skew artifact.
Run with:
torchrun --nproc-per-node=8 cupti_skew_repro.py
Issues many `_c10d_functional::all_to_all_single` ops back-to-back. Each
call expands to ~world_size `ncclDevKernel_SendRecv` kernels on the NCCL
side stream. When launches are tightly clustered (sub-microsecond apart),
CUPTI's reported `kernel_start_ts` lands BEFORE the corresponding
`cuLaunchKernelEx` start ts — i.e. negative apparent launch latency.
"""
import json
import os
import torch
import torch.distributed as dist
N_COLLECTIVES = 64
TENSOR_NUMEL = 256 * 1024 # 256K floats — tiny, kernels finish before next launches
TRACE_DIR = "/tmp/cupti_skew_repro"
def setup():
dist.init_process_group("nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank % torch.cuda.device_count())
return rank
def issue_collectives(inputs, group_name, world_size):
handles = []
for t in inputs:
splits = [t.shape[0] // world_size] * world_size
h = torch.ops._c10d_functional.all_to_all_single.default(
t, splits, splits, group_name
)
handles.append(h)
return [torch.ops._c10d_functional.wait_tensor(h) for h in handles]
def profile_and_dump(inputs, group_name, world_size, rank, warmup=5, active=3):
out_path = f"{TRACE_DIR}/repro_rank{rank}.json"
os.makedirs(TRACE_DIR, exist_ok=True)
for _ in range(warmup):
issue_collectives(inputs, group_name, world_size)
torch.cuda.synchronize()
activities = [
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
]
with torch.profiler.profile(activities=activities) as prof:
for _ in range(active):
issue_collectives(inputs, group_name, world_size)
torch.cuda.synchronize()
prof.export_chrome_trace(out_path)
return out_path
def analyze(path):
with open(path) as f:
data = json.load(f)
events = data["traceEvents"]
fs = [e for e in events if e.get("ph") == "s"]
fe = [e for e in events if e.get("ph") == "f"]
start_by_id = {e["id"]: e for e in fs}
end_at = {(e["pid"], e["tid"], e["ts"]): e["id"] for e in fe}
nccl = [
e for e in events
if e.get("cat") == "kernel" and e.get("ph") == "X"
and "ncclDevKernel" in e.get("name", "")
]
lats = []
for k in nccl:
fid = end_at.get((k["pid"], k["tid"], k["ts"]))
if not fid:
continue
s = start_by_id.get(fid)
if not s:
continue
lats.append(k["ts"] - s["ts"])
if not lats:
print("[analyze] no NCCL kernels found in trace")
return
lats.sort()
neg = [l for l in lats if l < 0]
print(
f"NCCL kernels: {len(lats)}\n"
f" Negative-latency: {len(neg)} ({100*len(neg)/len(lats):.0f}%)\n"
f" Latency (us): min={lats[0]:.1f} median={lats[len(lats)//2]:.1f} max={lats[-1]:.1f}"
)
if neg:
print(f" Worst negatives: {[round(l, 1) for l in sorted(neg)[:5]]}")
def main():
rank = setup()
world_size = dist.get_world_size()
group_name = dist.group.WORLD.group_name
inputs = [torch.randn(TENSOR_NUMEL, device="cuda") for _ in range(N_COLLECTIVES)]
trace_path = profile_and_dump(inputs, group_name, world_size, rank)
if rank == 0:
print(f"Trace: {trace_path}")
print()
analyze(trace_path)
dist.destroy_process_group()
if __name__ == "__main__":
main()
"""
NCCL kernels: 192
Negative-latency: 3 (2%)
Latency (us): min=-82.4 median=1078.9 max=5832.7
Worst negatives: [-82.4, -51.5, -10.1]
"""
I'm not sure if this is a kineto issue, or maybe an issue on the NVIDIA side. On some very CPU overhead bound profiles, I'm seeing this artifact where we don't have the ac2g to the nccl a2a kernel:
It seems to happen because the cuda kernel's start time is earlier than the host side launch's end time, so they're not displayed in perfetto. This script profiles comm launches and is able to repro this situation on 8xH100: