Skip to content

Missing ac2g for nccl A2A on idle gpu #1380

@xmfan

Description

@xmfan

I'm not sure if this is a kineto issue, or maybe an issue on the NVIDIA side. On some very CPU overhead bound profiles, I'm seeing this artifact where we don't have the ac2g to the nccl a2a kernel:

Image

It seems to happen because the cuda kernel's start time is earlier than the host side launch's end time, so they're not displayed in perfetto. This script profiles comm launches and is able to repro this situation on 8xH100:

"""Standalone repro for the NCCL/CUPTI launch-latency skew artifact.

Run with:
    torchrun --nproc-per-node=8 cupti_skew_repro.py

Issues many `_c10d_functional::all_to_all_single` ops back-to-back. Each
call expands to ~world_size `ncclDevKernel_SendRecv` kernels on the NCCL
side stream. When launches are tightly clustered (sub-microsecond apart),
CUPTI's reported `kernel_start_ts` lands BEFORE the corresponding
`cuLaunchKernelEx` start ts — i.e. negative apparent launch latency.
"""
import json
import os

import torch
import torch.distributed as dist


N_COLLECTIVES = 64
TENSOR_NUMEL = 256 * 1024  # 256K floats — tiny, kernels finish before next launches
TRACE_DIR = "/tmp/cupti_skew_repro"


def setup():
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    torch.cuda.set_device(rank % torch.cuda.device_count())
    return rank


def issue_collectives(inputs, group_name, world_size):
    handles = []
    for t in inputs:
        splits = [t.shape[0] // world_size] * world_size
        h = torch.ops._c10d_functional.all_to_all_single.default(
            t, splits, splits, group_name
        )
        handles.append(h)
    return [torch.ops._c10d_functional.wait_tensor(h) for h in handles]


def profile_and_dump(inputs, group_name, world_size, rank, warmup=5, active=3):
    out_path = f"{TRACE_DIR}/repro_rank{rank}.json"
    os.makedirs(TRACE_DIR, exist_ok=True)

    for _ in range(warmup):
        issue_collectives(inputs, group_name, world_size)
        torch.cuda.synchronize()

    activities = [
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ]
    with torch.profiler.profile(activities=activities) as prof:
        for _ in range(active):
            issue_collectives(inputs, group_name, world_size)
            torch.cuda.synchronize()

    prof.export_chrome_trace(out_path)
    return out_path


def analyze(path):
    with open(path) as f:
        data = json.load(f)
    events = data["traceEvents"]
    fs = [e for e in events if e.get("ph") == "s"]
    fe = [e for e in events if e.get("ph") == "f"]
    start_by_id = {e["id"]: e for e in fs}
    end_at = {(e["pid"], e["tid"], e["ts"]): e["id"] for e in fe}

    nccl = [
        e for e in events
        if e.get("cat") == "kernel" and e.get("ph") == "X"
        and "ncclDevKernel" in e.get("name", "")
    ]
    lats = []
    for k in nccl:
        fid = end_at.get((k["pid"], k["tid"], k["ts"]))
        if not fid:
            continue
        s = start_by_id.get(fid)
        if not s:
            continue
        lats.append(k["ts"] - s["ts"])

    if not lats:
        print("[analyze] no NCCL kernels found in trace")
        return

    lats.sort()
    neg = [l for l in lats if l < 0]
    print(
        f"NCCL kernels: {len(lats)}\n"
        f"  Negative-latency: {len(neg)} ({100*len(neg)/len(lats):.0f}%)\n"
        f"  Latency (us): min={lats[0]:.1f}  median={lats[len(lats)//2]:.1f}  max={lats[-1]:.1f}"
    )
    if neg:
        print(f"  Worst negatives: {[round(l, 1) for l in sorted(neg)[:5]]}")


def main():
    rank = setup()
    world_size = dist.get_world_size()
    group_name = dist.group.WORLD.group_name

    inputs = [torch.randn(TENSOR_NUMEL, device="cuda") for _ in range(N_COLLECTIVES)]

    trace_path = profile_and_dump(inputs, group_name, world_size, rank)

    if rank == 0:
        print(f"Trace: {trace_path}")
        print()
        analyze(trace_path)

    dist.destroy_process_group()


if __name__ == "__main__":
    main()


"""
NCCL kernels: 192
  Negative-latency: 3 (2%)
  Latency (us): min=-82.4  median=1078.9  max=5832.7
  Worst negatives: [-82.4, -51.5, -10.1]
"""

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions