Skip to content

GH-133231: Changes to executor management to support proposed sys._jit module #133287

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Include/cpython/pystate.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ struct _ts {
/* The thread's exception stack entry. (Always the last entry.) */
_PyErr_StackItem exc_state;

PyObject *previous_executor;
PyObject *current_executor;

uint64_t dict_global_version;

Expand Down
2 changes: 2 additions & 0 deletions Include/internal/pycore_interp_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,8 @@ struct _is {
PyObject *common_consts[NUM_COMMON_CONSTANTS];
bool jit;
struct _PyExecutorObject *executor_list_head;
struct _PyExecutorObject *executor_deletion_list_head;
int executor_deletion_list_remaining_capacity;
size_t trace_run_counter;
_rare_events rare_events;
PyDict_WatchCallback builtins_dict_watcher;
Expand Down
2 changes: 1 addition & 1 deletion Include/internal/pycore_opcode_metadata.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion Include/internal/pycore_optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ typedef struct {
typedef struct {
uint32_t target;
_Py_BackoffCounter temperature;
const struct _PyExecutorObject *executor;
struct _PyExecutorObject *executor;
} _PyExitData;

typedef struct _PyExecutorObject {
Expand All @@ -84,6 +84,10 @@ typedef struct _PyExecutorObject {
_PyExitData exits[1];
} _PyExecutorObject;

/* If the pending deletion list gets large enough, scan it
* and free any executors that aren't currently executing,
* i.e. any that aren't some thread's current_executor. */
#define EXECUTOR_DELETE_LIST_MAX 100

// Export for '_opcode' shared extension (JIT compiler).
PyAPI_FUNC(_PyExecutorObject*) _Py_GetExecutor(PyCodeObject *code, int offset);
Expand Down Expand Up @@ -304,6 +308,9 @@ static inline int is_terminator(const _PyUOpInstruction *uop)
}

PyAPI_FUNC(int) _PyDumpExecutors(FILE *out);
#ifdef _Py_TIER2
extern void _Py_ClearExecutorDeletionList(PyInterpreterState *interp);
#endif

#ifdef __cplusplus
}
Expand Down
2 changes: 1 addition & 1 deletion Include/internal/pycore_uop_metadata.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 24 additions & 10 deletions Python/bytecodes.c
Original file line number Diff line number Diff line change
Expand Up @@ -1169,6 +1169,17 @@ dummy_func(
tstate->current_frame = frame->previous;
assert(!_PyErr_Occurred(tstate));
PyObject *result = PyStackRef_AsPyObjectSteal(retval);
#if !Py_TAIL_CALL_INTERP
assert(frame == &entry.frame);
#endif
#ifdef _Py_TIER2
_PyStackRef executor = frame->localsplus[0];
assert(tstate->current_executor == NULL);
if (!PyStackRef_IsNull(executor)) {
tstate->current_executor = PyStackRef_AsPyObjectBorrow(executor);
PyStackRef_CLOSE(executor);
}
#endif
LLTRACE_RESUME_FRAME();
return result;
}
Expand Down Expand Up @@ -2912,8 +2923,7 @@ dummy_func(
}
else {
this_instr[1].counter = initial_jump_backoff_counter();
assert(tstate->previous_executor == NULL);
tstate->previous_executor = Py_None;
assert(tstate->current_executor == NULL);
GOTO_TIER_TWO(executor);
}
}
Expand Down Expand Up @@ -2965,7 +2975,7 @@ dummy_func(
assert(executor->vm_data.index == INSTR_OFFSET() - 1);
assert(executor->vm_data.code == code);
assert(executor->vm_data.valid);
assert(tstate->previous_executor == NULL);
assert(tstate->current_executor == NULL);
/* If the eval breaker is set then stay in tier 1.
* This avoids any potentially infinite loops
* involving _RESUME_CHECK */
Expand All @@ -2978,8 +2988,6 @@ dummy_func(
}
DISPATCH_GOTO();
}
tstate->previous_executor = Py_None;
Py_INCREF(executor);
GOTO_TIER_TWO(executor);
#else
Py_FatalError("ENTER_EXECUTOR is not supported in this build");
Expand Down Expand Up @@ -5254,7 +5262,6 @@ dummy_func(
exit->temperature = initial_temperature_backoff_counter();
Py_CLEAR(exit->executor);
}
tstate->previous_executor = (PyObject *)current_executor;
if (exit->executor == NULL) {
_Py_BackoffCounter temperature = exit->temperature;
if (!backoff_counter_triggers(temperature)) {
Expand All @@ -5277,7 +5284,6 @@ dummy_func(
}
exit->executor = executor;
}
Py_INCREF(exit->executor);
GOTO_TIER_TWO(exit->executor);
}

Expand Down Expand Up @@ -5316,7 +5322,6 @@ dummy_func(
}

tier2 op(_START_EXECUTOR, (executor/4 --)) {
Copy link
Member

@brandtbucher brandtbucher May 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Side note, this instruction now exists solely to set the current_executor in the tier two interpreter. We should try to remove it as a follow-up (doesn't need to be in this PR).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can replace all uses of current_executor with an operand, which will remove the current_executor local variable from the tier 2 interpreter and the _JIT_EXECUTOR patch from the JIT template.

We can also shrink _CHECK_VALIDITY a bit by replacing current_executor->vm_data.valid with *validity_ptr:

tier2 op(_CHECK_VALIDITY, (validity_ptr/4 --)) {
    DEOPT_IF(*validity_ptr == 0);
}

For another PR.

Py_CLEAR(tstate->previous_executor);
#ifndef _Py_JIT
current_executor = (_PyExecutorObject*)executor;
#endif
Expand All @@ -5337,12 +5342,10 @@ dummy_func(
}

tier2 op(_DEOPT, (--)) {
tstate->previous_executor = (PyObject *)current_executor;
GOTO_TIER_ONE(_PyFrame_GetBytecode(frame) + CURRENT_TARGET());
}

tier2 op(_ERROR_POP_N, (target/2 --)) {
tstate->previous_executor = (PyObject *)current_executor;
assert(oparg == 0);
frame->instr_ptr = _PyFrame_GetBytecode(frame) + target;
SYNC_SP();
Expand Down Expand Up @@ -5463,6 +5466,17 @@ dummy_func(
if (frame->owner == FRAME_OWNED_BY_INTERPRETER) {
/* Restore previous frame and exit */
tstate->current_frame = frame->previous;
#if !Py_TAIL_CALL_INTERP
assert(frame == &entry.frame);
#endif
#ifdef _Py_TIER2
_PyStackRef executor = frame->localsplus[0];
assert(tstate->current_executor == NULL);
if (!PyStackRef_IsNull(executor)) {
tstate->current_executor = PyStackRef_AsPyObjectBorrow(executor);
PyStackRef_CLOSE(executor);
}
#endif
return NULL;
}
next_instr = frame->instr_ptr;
Expand Down
50 changes: 31 additions & 19 deletions Python/ceval.c
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,11 @@ _PyObjectArray_Free(PyObject **array, PyObject **scratch)
#define DONT_SLP_VECTORIZE
#endif

/* Entry frame that _PyEval_EvalFrameDefault links in as the `previous`
 * frame of the frame being evaluated.
 *
 * Two slots are needed:
 *   - frame.localsplus[0], the single local, holds the thread's
 *     current executor (if any) while this evaluation runs; it is
 *     moved out of tstate->current_executor on entry and put back
 *     when the entry frame is exited.
 *   - stack[0], the single stack slot, is for the return value;
 *     frame.stackpointer is initialised to point at it.
 */
typedef struct {
_PyInterpreterFrame frame;
_PyStackRef stack[1];
} _PyEntryFrame;
Comment on lines +993 to +996
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this needed? We only need one slot in localsplus, which _PyInterpreterFrame already has.

Copy link
Member Author

@markshannon markshannon May 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We now need two slots. One on the stack for the return value, as before, and one local variable to hold the current executor.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see. Forgot about the return value.

Maybe add a comment that the one stack slot is for the return value, and the one local is for the executor.


PyObject* _Py_HOT_FUNCTION DONT_SLP_VECTORIZE
_PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int throwflag)
{
Expand All @@ -1009,7 +1014,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
int oparg; /* Current opcode argument, if any */
assert(tstate->current_frame == NULL || tstate->current_frame->stackpointer != NULL);
#endif
_PyInterpreterFrame entry_frame;
_PyEntryFrame entry;

if (_Py_EnterRecursiveCallTstate(tstate, "")) {
assert(frame->owner != FRAME_OWNED_BY_INTERPRETER);
Expand All @@ -1021,30 +1026,37 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
* These are cached values from the frame and code object. */
_Py_CODEUNIT *next_instr;
_PyStackRef *stack_pointer;
entry_frame.localsplus[0] = PyStackRef_NULL;
entry.stack[0] = PyStackRef_NULL;
#ifdef Py_STACKREF_DEBUG
entry_frame.f_funcobj = PyStackRef_None;
entry.frame.f_funcobj = PyStackRef_None;
#elif defined(Py_DEBUG)
/* Set these to invalid but identifiable values for debugging. */
entry_frame.f_funcobj = (_PyStackRef){.bits = 0xaaa0};
entry_frame.f_locals = (PyObject*)0xaaa1;
entry_frame.frame_obj = (PyFrameObject*)0xaaa2;
entry_frame.f_globals = (PyObject*)0xaaa3;
entry_frame.f_builtins = (PyObject*)0xaaa4;
entry.frame.f_funcobj = (_PyStackRef){.bits = 0xaaa0};
entry.frame.f_locals = (PyObject*)0xaaa1;
entry.frame.frame_obj = (PyFrameObject*)0xaaa2;
entry.frame.f_globals = (PyObject*)0xaaa3;
entry.frame.f_builtins = (PyObject*)0xaaa4;
#endif
entry_frame.f_executable = PyStackRef_None;
entry_frame.instr_ptr = (_Py_CODEUNIT *)_Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS + 1;
entry_frame.stackpointer = entry_frame.localsplus;
entry_frame.owner = FRAME_OWNED_BY_INTERPRETER;
entry_frame.visited = 0;
entry_frame.return_offset = 0;
entry.frame.f_executable = PyStackRef_None;
entry.frame.instr_ptr = (_Py_CODEUNIT *)_Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS + 1;
entry.frame.stackpointer = entry.stack;
entry.frame.owner = FRAME_OWNED_BY_INTERPRETER;
entry.frame.visited = 0;
entry.frame.return_offset = 0;
#ifdef Py_DEBUG
entry_frame.lltrace = 0;
entry.frame.lltrace = 0;
#endif
/* Push frame */
entry_frame.previous = tstate->current_frame;
frame->previous = &entry_frame;
entry.frame.previous = tstate->current_frame;
frame->previous = &entry.frame;
tstate->current_frame = frame;
entry.frame.localsplus[0] = PyStackRef_NULL;
#ifdef _Py_TIER2
if (tstate->current_executor != NULL) {
entry.frame.localsplus[0] = PyStackRef_FromPyObjectNew(tstate->current_executor);
tstate->current_executor = NULL;
}
#endif

/* support for generator.throw() */
if (throwflag) {
Expand All @@ -1071,9 +1083,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
stack_pointer = _PyFrame_GetStackPointer(frame);
#if Py_TAIL_CALL_INTERP
# if Py_STATS
return _TAIL_CALL_error(frame, stack_pointer, tstate, next_instr, 0, lastopcode);
return _TAIL_CALL_error(frame, stack_pointer, tstate, next_instr, 0, lastopcode);
# else
return _TAIL_CALL_error(frame, stack_pointer, tstate, next_instr, 0);
return _TAIL_CALL_error(frame, stack_pointer, tstate, next_instr, 0);
# endif
#else
goto error;
Expand Down
9 changes: 6 additions & 3 deletions Python/ceval_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,12 +359,12 @@ _PyFrame_SetStackPointer(frame, stack_pointer)
do { \
OPT_STAT_INC(traces_executed); \
_PyExecutorObject *_executor = (EXECUTOR); \
tstate->current_executor = (PyObject *)_executor; \
jit_func jitted = _executor->jit_code; \
/* Keep the shim frame alive via the executor: */ \
Py_INCREF(_executor); \
next_instr = jitted(frame, stack_pointer, tstate); \
Py_DECREF(_executor); \
Py_CLEAR(tstate->previous_executor); \
frame = tstate->current_frame; \
stack_pointer = _PyFrame_GetStackPointer(frame); \
if (next_instr == NULL) { \
Expand All @@ -377,7 +377,9 @@ do { \
/* Transfer control to EXECUTOR: record it as the thread's
 * current_executor, point next_uop at the start of its trace (which is
 * asserted to begin with _START_EXECUTOR), and jump to enter_tier_two.
 * NOTE(review): this view of the diff appears to show both the removed
 * `next_uop = (EXECUTOR)->trace;` line and its replacement via
 * `_executor` -- confirm against the merged file which lines are live. */
#define GOTO_TIER_TWO(EXECUTOR) \
do { \
OPT_STAT_INC(traces_executed); \
next_uop = (EXECUTOR)->trace; \
_PyExecutorObject *_executor = (EXECUTOR); \
tstate->current_executor = (PyObject *)_executor; \
next_uop = _executor->trace; \
assert(next_uop->opcode == _START_EXECUTOR); \
goto enter_tier_two; \
} while (0)
Expand All @@ -386,10 +388,11 @@ do { \
#define GOTO_TIER_ONE(TARGET) \
do \
{ \
tstate->current_executor = NULL; \
next_instr = (TARGET); \
assert(tstate->current_executor == NULL); \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: this seems redundant, given that we're setting it one line above.

OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); \
_PyFrame_SetStackPointer(frame, stack_pointer); \
Py_CLEAR(tstate->previous_executor); \
stack_pointer = _PyFrame_GetStackPointer(frame); \
if (next_instr == NULL) \
{ \
Expand Down
7 changes: 0 additions & 7 deletions Python/executor_cases.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading