pytorch - 💡(How to fix) Fix # [inductor] KeyError in `Scheduler.compute_ancestors` due to self-edge created by `MutationLayoutSHOULDREMOVE` + reinplace pass for user-defined Triton kernels

Official PRs (…)
ON THIS PAGE

Recommended Tools

×6

Utilities matched from this issue’s tags and category — try them while you read without losing context.

GitHub issue graph ai analysis

Paste a GitHub issue URL. We fetch that issue, discover linked issues from bodies/comments/timeline, collect linked pull requests, and produce a structured English report.

The report is written in English Markdown for sharing and archival.

Helpful · Quick feedback

Loading…

Error Message

File ".../torch/_inductor/graph.py", line 2254, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
File ".../torch/_inductor/scheduler.py", line 2321, in _init
    self.compute_ancestors()
File ".../torch/_inductor/scheduler.py", line 2964, in compute_ancestors
    ancestors |= name_to_ancestors[dep_node_name]
                 ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^
torch._inductor.exc.InductorError: KeyError: 'op3'

Code Example

import sys
import torch
import triton
import triton.language as tl


@triton.jit
def transpose_first_two_dims_kernel(
    src_ptr, dst_ptr, M, N, INNER, BLOCK: tl.constexpr,
):
    # Element-wise copy that swaps the two leading dimensions: the flat index
    # is decomposed as (m, n, k) over a row-major (M, N, INNER) layout for the
    # source and re-linearized as (n, m, k) over (N, M, INNER) for the
    # destination. Each program instance handles BLOCK consecutive flat indices.
    pid = tl.program_id(0)
    # Flat element indices covered by this program instance.
    block = pid * BLOCK + tl.arange(0, BLOCK)
    total = M * N * INNER
    # Guard the tail: indices at or beyond `total` are neither loaded nor stored.
    mask = block < total
    # Decompose the flat index: k is the innermost coordinate, then n, then m.
    k = block % INNER
    rest = block // INNER
    n = rest % N
    m = rest // N
    # Source offset re-linearizes (m, n, k), so it equals `block` itself.
    src_off = (m * N + n) * INNER + k
    # Destination offset uses (n, m, k): the first two coordinates are swapped.
    dst_off = (n * M + m) * INNER + k
    x = tl.load(src_ptr + src_off, mask=mask)
    tl.store(dst_ptr + dst_off, x, mask=mask)


def fn(x: torch.Tensor, s40: torch.SymInt) -> torch.Tensor:
    """Reproducer body: view chain + expand/contiguous feeding a user Triton kernel.

    Takes the left half of ``x``'s last dimension, reshapes and expands it,
    materializes the expansion, passes the result through
    ``transpose_first_two_dims_kernel`` into a freshly allocated ``dst`` buffer,
    and returns ``dst`` flattened to ``(-1, head_dim)`` plus 1.0.

    Args:
        x: Input tensor; ``x.shape[1]`` is used as ``S`` and ``x.shape[-1]`` is
            split into two halves of size ``D``.
        s40: Integer used as the number of kv heads (``n_kv``) and as the
            divisor for ``head_dim`` and ``n_rep``.

    Returns:
        A tensor with ``head_dim`` columns, computed as ``dst.view(-1, head_dim) + 1.0``.
    """
    # Derived sizes. E.g. with x.shape[-1] == 512 and s40 == 2:
    # D = 256, head_dim = 128, n_rep = 2, n_kv = 2, n_total = 4, head_grp = 1.
    S = x.shape[1]
    D = x.shape[-1] // 2
    head_dim = D // s40
    n_rep = 4 // s40
    n_kv = s40
    n_total = n_kv * n_rep
    head_grp = n_total // 4

    # Only the left half of the last dimension is used; the right half is discarded.
    left, _right = torch.split(x, [D, D], dim=-1)
    v18 = left.view(1, S, n_kv, head_dim)
    # Insert a size-1 axis after n_kv and expand it to n_rep, repeating each kv head.
    expanded = v18.unsqueeze(3).expand(1, S, n_kv, n_rep, head_dim)
    # Explicit materialization of the expanded view (condition 1 in the bug report).
    clone_1 = expanded.contiguous()
    v26 = clone_1.view(1, S, n_total, head_dim)
    sq = v26.squeeze(0)
    v28 = sq.view(S, 4, head_grp, head_dim)

    # Uninitialized output buffer that the Triton kernel writes into
    # (condition 2 in the bug report).
    dst = torch.empty(
        n_total, S, head_grp, head_dim, dtype=x.dtype, device=x.device
    )

    # One program instance per BLOCK elements of v28.
    total = S * 4 * head_grp * head_dim
    BLOCK = 1024
    grid = (triton.cdiv(total, BLOCK),)
    # NOTE(review): the kernel indexes src as (M, N, INNER) and dst as
    # (N, M, INNER). Here M=n_total and N=S, while v28 is viewed as (S, 4, ...)
    # and dst is allocated as (n_total, S, ...), which looks swapped relative to
    # the kernel's indexing. Left as-is because this is the reported repro;
    # confirm whether the argument order is intentional.
    transpose_first_two_dims_kernel[grid](
        v28, dst, n_total, S, head_grp * head_dim, BLOCK=BLOCK
    )
    # The kernel output feeds a pointwise op directly, with no copy_ epilogue
    # in between (condition 3 in the bug report).
    out = dst.view(-1, head_dim) + 1.0
    return out


def main() -> int:
    """Build the repro input on CUDA, compile ``fn`` with Inductor, and run it once.

    Returns:
        0 after printing the output shape. According to the accompanying report
        (torch 2.9.0+cu128), the compiled call raises ``InductorError``
        (``KeyError: 'op3'``) before reaching the print.
    """
    torch.set_default_device("cuda")
    s40 = 2
    S = 65528
    D = 128 * s40
    # Last dimension is 2 * D so that fn can split it into two halves of size D.
    x = torch.randn(1, S, 2 * D, dtype=torch.bfloat16, device="cuda")
    # Mark dimension 1 (the S axis) as dynamic so it is traced symbolically.
    torch._dynamo.mark_dynamic(x, 1)
    compiled = torch.compile(fn, backend="inductor", dynamic=True)
    y = compiled(x, s40)
    print(f"[ok] y.shape={tuple(y.shape)}")
    return 0


# Script entry point: run the repro and use main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())

---

File ".../torch/_inductor/graph.py", line 2254, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
File ".../torch/_inductor/scheduler.py", line 2321, in _init
    self.compute_ancestors()
File ".../torch/_inductor/scheduler.py", line 2964, in compute_ancestors
    ancestors |= name_to_ancestors[dep_node_name]
                 ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^
torch._inductor.exc.InductorError: KeyError: 'op3'

---

torch: 2.9.0+cu128
Python: 3.11
RAW_BUFFER · Click to expand / collapse

🐛 Describe the bug

When a torch.compile'd region contains:

  1. A view chain that ends with expand(...).contiguous() (so an explicit materialization is needed), and
  2. A user-defined Triton kernel whose dst_ptr is a freshly-allocated torch.empty(...), and
  3. No aten.copy_ epilogue that puts the Triton kernel result back into a separate buffer (the kernel's output is fed straight into another pointwise op),

Inductor lowers this into a SchedulerNode that both reads and writes the same buffer, i.e. a self-edge in the dependency DAG. Scheduler.compute_ancestors does not handle self-edges and crashes with KeyError: 'op<n>'.

Minimal repro

import sys
import torch
import triton
import triton.language as tl


@triton.jit
def transpose_first_two_dims_kernel(
    src_ptr, dst_ptr, M, N, INNER, BLOCK: tl.constexpr,
):
    # Element-wise copy that swaps the two leading dimensions: the flat index
    # is decomposed as (m, n, k) over a row-major (M, N, INNER) layout for the
    # source and re-linearized as (n, m, k) over (N, M, INNER) for the
    # destination. Each program instance handles BLOCK consecutive flat indices.
    pid = tl.program_id(0)
    # Flat element indices covered by this program instance.
    block = pid * BLOCK + tl.arange(0, BLOCK)
    total = M * N * INNER
    # Guard the tail: indices at or beyond `total` are neither loaded nor stored.
    mask = block < total
    # Decompose the flat index: k is the innermost coordinate, then n, then m.
    k = block % INNER
    rest = block // INNER
    n = rest % N
    m = rest // N
    # Source offset re-linearizes (m, n, k), so it equals `block` itself.
    src_off = (m * N + n) * INNER + k
    # Destination offset uses (n, m, k): the first two coordinates are swapped.
    dst_off = (n * M + m) * INNER + k
    x = tl.load(src_ptr + src_off, mask=mask)
    tl.store(dst_ptr + dst_off, x, mask=mask)


def fn(x: torch.Tensor, s40: torch.SymInt) -> torch.Tensor:
    """Reproducer body: view chain + expand/contiguous feeding a user Triton kernel.

    Takes the left half of ``x``'s last dimension, reshapes and expands it,
    materializes the expansion, passes the result through
    ``transpose_first_two_dims_kernel`` into a freshly allocated ``dst`` buffer,
    and returns ``dst`` flattened to ``(-1, head_dim)`` plus 1.0.

    Args:
        x: Input tensor; ``x.shape[1]`` is used as ``S`` and ``x.shape[-1]`` is
            split into two halves of size ``D``.
        s40: Integer used as the number of kv heads (``n_kv``) and as the
            divisor for ``head_dim`` and ``n_rep``.

    Returns:
        A tensor with ``head_dim`` columns, computed as ``dst.view(-1, head_dim) + 1.0``.
    """
    # Derived sizes. E.g. with x.shape[-1] == 512 and s40 == 2:
    # D = 256, head_dim = 128, n_rep = 2, n_kv = 2, n_total = 4, head_grp = 1.
    S = x.shape[1]
    D = x.shape[-1] // 2
    head_dim = D // s40
    n_rep = 4 // s40
    n_kv = s40
    n_total = n_kv * n_rep
    head_grp = n_total // 4

    # Only the left half of the last dimension is used; the right half is discarded.
    left, _right = torch.split(x, [D, D], dim=-1)
    v18 = left.view(1, S, n_kv, head_dim)
    # Insert a size-1 axis after n_kv and expand it to n_rep, repeating each kv head.
    expanded = v18.unsqueeze(3).expand(1, S, n_kv, n_rep, head_dim)
    # Explicit materialization of the expanded view (condition 1 in the bug report).
    clone_1 = expanded.contiguous()
    v26 = clone_1.view(1, S, n_total, head_dim)
    sq = v26.squeeze(0)
    v28 = sq.view(S, 4, head_grp, head_dim)

    # Uninitialized output buffer that the Triton kernel writes into
    # (condition 2 in the bug report).
    dst = torch.empty(
        n_total, S, head_grp, head_dim, dtype=x.dtype, device=x.device
    )

    # One program instance per BLOCK elements of v28.
    total = S * 4 * head_grp * head_dim
    BLOCK = 1024
    grid = (triton.cdiv(total, BLOCK),)
    # NOTE(review): the kernel indexes src as (M, N, INNER) and dst as
    # (N, M, INNER). Here M=n_total and N=S, while v28 is viewed as (S, 4, ...)
    # and dst is allocated as (n_total, S, ...), which looks swapped relative to
    # the kernel's indexing. Left as-is because this is the reported repro;
    # confirm whether the argument order is intentional.
    transpose_first_two_dims_kernel[grid](
        v28, dst, n_total, S, head_grp * head_dim, BLOCK=BLOCK
    )
    # The kernel output feeds a pointwise op directly, with no copy_ epilogue
    # in between (condition 3 in the bug report).
    out = dst.view(-1, head_dim) + 1.0
    return out


def main() -> int:
    """Build the repro input on CUDA, compile ``fn`` with Inductor, and run it once.

    Returns:
        0 after printing the output shape. According to the accompanying report
        (torch 2.9.0+cu128), the compiled call raises ``InductorError``
        (``KeyError: 'op3'``) before reaching the print.
    """
    torch.set_default_device("cuda")
    s40 = 2
    S = 65528
    D = 128 * s40
    # Last dimension is 2 * D so that fn can split it into two halves of size D.
    x = torch.randn(1, S, 2 * D, dtype=torch.bfloat16, device="cuda")
    # Mark dimension 1 (the S axis) as dynamic so it is traced symbolically.
    torch._dynamo.mark_dynamic(x, 1)
    compiled = torch.compile(fn, backend="inductor", dynamic=True)
    y = compiled(x, s40)
    print(f"[ok] y.shape={tuple(y.shape)}")
    return 0


# Script entry point: run the repro and use main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())

Error thrown

File ".../torch/_inductor/graph.py", line 2254, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
File ".../torch/_inductor/scheduler.py", line 2321, in _init
    self.compute_ancestors()
File ".../torch/_inductor/scheduler.py", line 2964, in compute_ancestors
    ancestors |= name_to_ancestors[dep_node_name]
                 ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^
torch._inductor.exc.InductorError: KeyError: 'op3'

Versions

torch: 2.9.0+cu128
Python: 3.11

cc @chauhang @penguinwu @voznesenskym @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @wenzhe-nrv @jiayisunx @ipiszy @kadeng @muchulee8 @amjames @aakhundov @coconutruben @jataylo @bdhirsh @bobrenjc93 @aorenste @oulgen @davidberard98 @jansel @Chillee @zou3519

Vote matrix · Quick signals

Works
Did the solution work? Tap to confirm.
Easy Fix
Was it a quick fix?
Time Saver
Did it save you time?
Blocking
Was it severely blocking?
Common Issue
Are others likely hitting this too?
Flaky / Intermittent
Is it intermittent?
Verified / Reproducible
Can you reproduce it reliably?
Loading…

Still need to ship something?

×6

Another batch ranked right after the header list — different links, same matching logic.

Back to top recommendations

TRENDING

pytorch - 💡(How to fix) Fix # [inductor] KeyError in `Scheduler.compute_ancestors` due to self-edge created by `MutationLayoutSHOULDREMOVE` + reinplace pass for user-defined Triton kernels