Missing ac2g for nccl A2A on idle gpu

I'm not sure if this is a kineto issue, or maybe an issue on the NVIDIA side. On some very CPU overhead bound profiles, I'm seeing this artifact where we don't have the ac2g to the nccl a2a kernel:

<img width="782" height="996" alt="Image" src="https://github.com/user-attachments/assets/fb723731-3bc5-4120-a1a7-15f933c70bdb" />

It seems to happen because the cuda kernel's start time is earlier than the host side launch's end time, so they're not displayed in perfetto. This script profiles comm launches and is able to repro this situation on 8xH100:

```python
"""Standalone repro for the NCCL/CUPTI launch-latency skew artifact.

Run with:
    torchrun --nproc-per-node=8 cupti_skew_repro.py

Issues many `_c10d_functional::all_to_all_single` ops back-to-back. Each
call expands to ~world_size `ncclDevKernel_SendRecv` kernels on the NCCL
side stream. When launches are tightly clustered (sub-microsecond apart),
CUPTI's reported `kernel_start_ts` lands BEFORE the corresponding
`cuLaunchKernelEx` start ts — i.e. negative apparent launch latency.
"""
import json
import os

import torch
import torch.distributed as dist


N_COLLECTIVES = 64
TENSOR_NUMEL = 256 * 1024  # 256K floats — tiny, kernels finish before next launches
TRACE_DIR = "/tmp/cupti_skew_repro"


def setup():
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank % torch.cuda.device_count())
    return rank


def issue_collectives(inputs, group_name, world_size):
    handles = []
    for t in inputs:
        splits = [t.shape[0] // world_size] * world_size
        h = torch.ops._c10d_functional.all_to_all_single.default(
            t, splits, splits, group_name
        )
        handles.append(h)
    return [torch.ops._c10d_functional.wait_tensor(h) for h in handles]


def profile_and_dump(inputs, group_name, world_size, rank, warmup=5, active=3):
    out_path = f"{TRACE_DIR}/repro_rank{rank}.json"
    os.makedirs(TRACE_DIR, exist_ok=True)

    for _ in range(warmup):
        issue_collectives(inputs, group_name, world_size)
        torch.cuda.synchronize()

    activities = [
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
    with torch.profiler.profile(activities=activities) as prof:
        for _ in range(active):
            issue_collectives(inputs, group_name, world_size)
            torch.cuda.synchronize()

    prof.export_chrome_trace(out_path)
    return out_path


def analyze(path):
    with open(path) as f:
        data = json.load(f)
    events = data["traceEvents"]
    fs = [e for e in events if e.get("ph") == "s"]
    fe = [e for e in events if e.get("ph") == "f"]
    start_by_id = {e["id"]: e for e in fs}
    end_at = {(e["pid"], e["tid"], e["ts"]): e["id"] for e in fe}

    nccl = [
        e for e in events
        if e.get("cat") == "kernel" and e.get("ph") == "X"
        and "ncclDevKernel" in e.get("name", "")
    ]
    lats = []
    for k in nccl:
        fid = end_at.get((k["pid"], k["tid"], k["ts"]))
        if not fid:
            continue
        s = start_by_id.get(fid)
        if not s:
            continue
        lats.append(k["ts"] - s["ts"])

    if not lats:
        print("[analyze] no NCCL kernels found in trace")
        return

    lats.sort()
    neg = [l for l in lats if l < 0]
    print(
        f"NCCL kernels: {len(lats)}\n"
        f"  Negative-latency: {len(neg)} ({100*len(neg)/len(lats):.0f}%)\n"
        f"  Latency (us): min={lats[0]:.1f}  median={lats[len(lats)//2]:.1f}  max={lats[-1]:.1f}"
    )
    if neg:
        print(f"  Worst negatives: {[round(l, 1) for l in sorted(neg)[:5]]}")


def main():
    rank = setup()
    world_size = dist.get_world_size()
    group_name = dist.group.WORLD.group_name

    inputs = [torch.randn(TENSOR_NUMEL, device="cuda") for _ in range(N_COLLECTIVES)]

    trace_path = profile_and_dump(inputs, group_name, world_size, rank)

    if rank == 0:
        print(f"Trace: {trace_path}")
        print()
        analyze(trace_path)

    dist.destroy_process_group()


if __name__ == "__main__":
    main()


"""
NCCL kernels: 192
  Negative-latency: 3 (2%)
  Latency (us): min=-82.4  median=1078.9  max=5832.7
  Worst negatives: [-82.4, -51.5, -10.1]
"""
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Missing ac2g for nccl A2A on idle gpu #1380

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Missing ac2g for nccl A2A on idle gpu #1380

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions