@@ -435,6 +435,7 @@ def __init__(self, kernel: _AutotunableKernel, args: Sequence[object]) -> None:
435435 self ._precompile_tmpdir : tempfile .TemporaryDirectory [str ] | None = None
436436 self ._precompile_args_path : str | None = None
437437 self ._precompile_result_counter = count ()
438+ self ._crashed_config_strs : set [str ] = set ()
438439
439440 def _prepare (self ) -> None :
440441 """Some initialization deferred until autotuning actually runs.
@@ -534,6 +535,26 @@ def _try_load_checkpoint(self) -> bool:
534535 self .log (f"Resumed at generation { self ._current_generation } " )
535536 return True
536537
def _load_crashed_configs(self) -> None:
    """Merge previously crashed configs into ``self._crashed_config_strs``.

    Reads ``{stable_hash}.crashed_configs`` from the configured autotune
    checkpoint directory (the file is written by the crash-recovery
    script). Every non-blank line, stripped of surrounding whitespace, is
    added to ``self._crashed_config_strs``; ``benchmark_function`` compares
    ``str(config)`` against that set to skip configs known to crash.

    Does nothing when no checkpoint directory is configured. A missing
    file is not an error: it simply means no crash has been recorded.
    """
    checkpoint_dir_str = self.settings.autotune_checkpoint_dir
    if checkpoint_dir_str is None:
        return

    crashed_configs_path = (
        Path(checkpoint_dir_str) / f"{self._get_stable_hash()}.crashed_configs"
    )

    # Read first and handle absence afterwards (EAFP) instead of checking
    # exists() up front: the file can be removed between a check and the
    # read (e.g. checkpoint cleanup unlinks this same artifact), and that
    # must not abort autotuning.
    try:
        contents = crashed_configs_path.read_text()
    except FileNotFoundError:
        contents = ""

    # One config string per line; blank / whitespace-only lines are dropped.
    self._crashed_config_strs.update(
        filter(None, map(str.strip, contents.splitlines()))
    )

    if self._crashed_config_strs:
        self.log(
            f"Loaded {len(self._crashed_config_strs)} crashed config(s) to skip"
        )
557+
537558 def _compute_baseline (
538559 self ,
539560 ) -> tuple [object , Sequence [int ], Sequence [object ] | None ]:
@@ -752,6 +773,12 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
752773 Returns:
753774 The performance of the configuration in ms.
754775 """
776+ # Skip configs that previously crashed the subprocess
777+ config_str = str (config )
778+ if config_str in self ._crashed_config_strs :
779+ self .log .warning (f"Skipping known-crashed config: { config } " )
780+ return inf
781+
755782 self ._autotune_metrics .num_configs_tested += 1
756783 self .counters ["benchmark" ] += 1
757784 self .log .debug (lambda : f"Running benchmark for { config !r} " )
@@ -1016,10 +1043,23 @@ def _benchmark(
10161043 fns : list [Callable [..., object ]] = []
10171044 valid_configs : list [Config ] = []
10181045 futures : list [PrecompileFuture ] | None = None
1046+ # Compute pending config path once for breadcrumb writes.
1047+ checkpoint_dir_str = self .settings .autotune_checkpoint_dir
1048+ pending_path = (
1049+ Path (checkpoint_dir_str ) / f"{ self ._get_stable_hash ()} .pending_config"
1050+ if checkpoint_dir_str is not None
1051+ else None
1052+ )
10191053 for i , config in enumerate (configs ):
1054+ # Write breadcrumb before compile so a hard crash (SIGKILL /
1055+ # CUDA IMA) leaves a trace the bash recovery script can find.
1056+ if pending_path is not None :
1057+ pending_path .write_text (str (config ))
10201058 try :
10211059 fn = self .kernel .compile_config (config , allow_print = False )
10221060 except Exception :
1061+ if pending_path is not None :
1062+ pending_path .unlink (missing_ok = True )
10231063 # If all configs failed, raise error
10241064 if not valid_configs and i == len (configs ) - 1 :
10251065 raise
@@ -1029,6 +1069,8 @@ def _benchmark(
10291069 exc_info = True ,
10301070 )
10311071 continue
1072+ if pending_path is not None :
1073+ pending_path .unlink (missing_ok = True )
10321074 fns .append (fn )
10331075 valid_configs .append (config )
10341076 configs = valid_configs
@@ -1089,7 +1131,14 @@ def _benchmark(
10891131 )
10901132 )
10911133 # benchmark one-by-one to avoid noisy results
1134+ # Write pending-config breadcrumb; cleared after benchmark.
1135+ # On crash the file stays so the bash recovery script can
1136+ # detect which config caused the failure.
1137+ if pending_path is not None :
1138+ pending_path .write_text (str (config ))
10921139 perf = self .benchmark_function (config , fn )
1140+ if pending_path is not None :
1141+ pending_path .unlink (missing_ok = True )
10931142 status = "ok" if math .isfinite (perf ) else "error"
10941143 # Log completion after benchmarking
10951144 self .log .record_autotune_entry (
@@ -1194,6 +1243,7 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
11941243
11951244 if not self ._try_load_checkpoint ():
11961245 self ._init_search ()
1246+ self ._load_crashed_configs ()
11971247 try :
11981248 best = self ._autotune ()
11991249 self ._cleanup_checkpoint ()
@@ -1296,6 +1346,12 @@ def _cleanup_checkpoint(self) -> None:
12961346 checkpoint_file .unlink ()
12971347 self .log (f"Checkpoint cleaned up: { checkpoint_file } " )
12981348
1349+ # Clean up crash-recovery artifacts
1350+ for suffix in (".pending_config" , ".crashed_configs" ):
1351+ artifact = Path (checkpoint_dir_str ) / f"{ stable_hash } { suffix } "
1352+ if artifact .exists ():
1353+ artifact .unlink ()
1354+
12991355 @staticmethod
13001356 def _serialize_numpy_rng_state (
13011357 state : tuple [str , Any , int , int , float ],
0 commit comments