Commit 91234a16 authored by Inada Naoki's avatar Inada Naoki Committed by GitHub

bpo-26219: per opcode cache for LOAD_GLOBAL (GH-12884)

This patch implements per opcode cache mechanism, and use it in
only LOAD_GLOBAL opcode.

Based on Yury's opcache3.patch in bpo-26219.
parent 29ec4228
...@@ -860,6 +860,10 @@ Optimizations ...@@ -860,6 +860,10 @@ Optimizations
methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`, methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`,
:issue:`35582` and :issue:`36127`.) :issue:`35582` and :issue:`36127`.)
* ``LOAD_GLOBAL`` instruction now uses new "per opcode cache" mechanism.
It is about 40% faster now. (Contributed by Yury Selivanov and Inada Naoki in
:issue:`26219`.)
Build and C API Changes Build and C API Changes
======================= =======================
......
...@@ -17,6 +17,8 @@ typedef uint16_t _Py_CODEUNIT; ...@@ -17,6 +17,8 @@ typedef uint16_t _Py_CODEUNIT;
# define _Py_OPARG(word) ((word) >> 8) # define _Py_OPARG(word) ((word) >> 8)
#endif #endif
typedef struct _PyOpcache _PyOpcache;
/* Bytecode object */ /* Bytecode object */
typedef struct { typedef struct {
PyObject_HEAD PyObject_HEAD
...@@ -49,6 +51,21 @@ typedef struct { ...@@ -49,6 +51,21 @@ typedef struct {
Type is a void* to keep the format private in codeobject.c to force Type is a void* to keep the format private in codeobject.c to force
people to go through the proper APIs. */ people to go through the proper APIs. */
void *co_extra; void *co_extra;
/* Per opcodes just-in-time cache
*
* To reduce cache size, we use indirect mapping from opcode index to
* cache object:
* cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
*/
// co_opcache_map is indexed by (next_instr - first_instr).
// * 0 means there is no cache for this opcode.
// * n > 0 means there is cache in co_opcache[n-1].
unsigned char *co_opcache_map;
_PyOpcache *co_opcache;
int co_opcache_flag; // used to determine when create a cache.
unsigned char co_opcache_size; // length of co_opcache.
} PyCodeObject; } PyCodeObject;
/* Masks for co_flags above */ /* Masks for co_flags above */
......
...@@ -31,6 +31,9 @@ PyAPI_FUNC(void) _PyEval_SignalAsyncExc( ...@@ -31,6 +31,9 @@ PyAPI_FUNC(void) _PyEval_SignalAsyncExc(
PyAPI_FUNC(void) _PyEval_ReInitThreads( PyAPI_FUNC(void) _PyEval_ReInitThreads(
_PyRuntimeState *runtime); _PyRuntimeState *runtime);
/* Private function */
void _PyEval_Fini(void);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
#ifndef Py_INTERNAL_CODE_H
#define Py_INTERNAL_CODE_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
PyObject *ptr; /* Cached pointer (borrowed reference) */
uint64_t globals_ver; /* ma_version of global dict */
uint64_t builtins_ver; /* ma_version of builtin dict */
} _PyOpcache_LoadGlobal;
struct _PyOpcache {
union {
_PyOpcache_LoadGlobal lg;
} u;
char optimized;
};
/* Private API */
int _PyCode_InitOpcache(PyCodeObject *co);
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_CODE_H */
...@@ -80,14 +80,14 @@ class DictVersionTests(unittest.TestCase): ...@@ -80,14 +80,14 @@ class DictVersionTests(unittest.TestCase):
# setting a key to the same value with dict.__setitem__ # setting a key to the same value with dict.__setitem__
# must change the version # must change the version
self.check_version_changed(d, d.__setitem__, 'key', value) self.check_version_dont_change(d, d.__setitem__, 'key', value)
# setting a key to the same value with dict.update # setting a key to the same value with dict.update
# must change the version # must change the version
self.check_version_changed(d, d.update, key=value) self.check_version_dont_change(d, d.update, key=value)
d2 = self.new_dict(key=value) d2 = self.new_dict(key=value)
self.check_version_changed(d, d.update, d2) self.check_version_dont_change(d, d.update, d2)
def test_setitem_equal(self): def test_setitem_equal(self):
class AlwaysEqual: class AlwaysEqual:
......
...@@ -1070,6 +1070,7 @@ PYTHON_HEADERS= \ ...@@ -1070,6 +1070,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_accu.h \ $(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_atomic.h \ $(srcdir)/Include/internal/pycore_atomic.h \
$(srcdir)/Include/internal/pycore_ceval.h \ $(srcdir)/Include/internal/pycore_ceval.h \
$(srcdir)/Include/internal/pycore_code.h \
$(srcdir)/Include/internal/pycore_condvar.h \ $(srcdir)/Include/internal/pycore_condvar.h \
$(srcdir)/Include/internal/pycore_context.h \ $(srcdir)/Include/internal/pycore_context.h \
$(srcdir)/Include/internal/pycore_fileutils.h \ $(srcdir)/Include/internal/pycore_fileutils.h \
......
Implemented per opcode cache mechanism and ``LOAD_GLOBAL`` instruction use
it. ``LOAD_GLOBAL`` is now about 40% faster. Contributed by Yury Selivanov,
and Inada Naoki.
...@@ -2,7 +2,9 @@ ...@@ -2,7 +2,9 @@
#include "Python.h" #include "Python.h"
#include "code.h" #include "code.h"
#include "opcode.h"
#include "structmember.h" #include "structmember.h"
#include "pycore_code.h"
#include "pycore_pystate.h" #include "pycore_pystate.h"
#include "pycore_tupleobject.h" #include "pycore_tupleobject.h"
#include "clinic/codeobject.c.h" #include "clinic/codeobject.c.h"
...@@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount, ...@@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount,
co->co_zombieframe = NULL; co->co_zombieframe = NULL;
co->co_weakreflist = NULL; co->co_weakreflist = NULL;
co->co_extra = NULL; co->co_extra = NULL;
co->co_opcache_map = NULL;
co->co_opcache = NULL;
co->co_opcache_flag = 0;
co->co_opcache_size = 0;
return co; return co;
} }
int
_PyCode_InitOpcache(PyCodeObject *co)
{
Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
if (co->co_opcache_map == NULL) {
return -1;
}
_Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
Py_ssize_t opts = 0;
for (Py_ssize_t i = 0; i < co_size;) {
unsigned char opcode = _Py_OPCODE(opcodes[i]);
i++; // 'i' is now aligned to (next_instr - first_instr)
// TODO: LOAD_METHOD, LOAD_ATTR
if (opcode == LOAD_GLOBAL) {
co->co_opcache_map[i] = ++opts;
if (opts > 254) {
break;
}
}
}
if (opts) {
co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
if (co->co_opcache == NULL) {
PyMem_FREE(co->co_opcache_map);
return -1;
}
}
else {
PyMem_FREE(co->co_opcache_map);
co->co_opcache_map = NULL;
co->co_opcache = NULL;
}
co->co_opcache_size = opts;
return 0;
}
PyCodeObject * PyCodeObject *
PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno) PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
{ {
...@@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw) ...@@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw)
static void static void
code_dealloc(PyCodeObject *co) code_dealloc(PyCodeObject *co)
{ {
if (co->co_opcache != NULL) {
PyMem_FREE(co->co_opcache);
}
if (co->co_opcache_map != NULL) {
PyMem_FREE(co->co_opcache_map);
}
co->co_opcache_flag = 0;
co->co_opcache_size = 0;
if (co->co_extra != NULL) { if (co->co_extra != NULL) {
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
_PyCodeObjectExtra *co_extra = co->co_extra; _PyCodeObjectExtra *co_extra = co->co_extra;
...@@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args)) ...@@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args))
res += sizeof(_PyCodeObjectExtra) + res += sizeof(_PyCodeObjectExtra) +
(co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]); (co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
} }
if (co->co_opcache != NULL) {
assert(co->co_opcache_map != NULL);
// co_opcache_map
res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
// co_opcache
res += co->co_opcache_size * sizeof(_PyOpcache);
}
return PyLong_FromSsize_t(res); return PyLong_FromSsize_t(res);
} }
......
...@@ -1080,20 +1080,21 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value) ...@@ -1080,20 +1080,21 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
return 0; return 0;
} }
if (_PyDict_HasSplitTable(mp)) { if (old_value != value) {
mp->ma_values[ix] = value; if (_PyDict_HasSplitTable(mp)) {
if (old_value == NULL) { mp->ma_values[ix] = value;
/* pending state */ if (old_value == NULL) {
assert(ix == mp->ma_used); /* pending state */
mp->ma_used++; assert(ix == mp->ma_used);
mp->ma_used++;
}
} }
else {
assert(old_value != NULL);
DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
}
mp->ma_version_tag = DICT_NEXT_VERSION();
} }
else {
assert(old_value != NULL);
DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
}
mp->ma_version_tag = DICT_NEXT_VERSION();
Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */ Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
ASSERT_CONSISTENT(mp); ASSERT_CONSISTENT(mp);
Py_DECREF(key); Py_DECREF(key);
......
...@@ -158,6 +158,7 @@ ...@@ -158,6 +158,7 @@
<ClInclude Include="..\Include\import.h" /> <ClInclude Include="..\Include\import.h" />
<ClInclude Include="..\Include\internal\pycore_accu.h" /> <ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h" /> <ClInclude Include="..\Include\internal\pycore_atomic.h" />
<ClInclude Include="..\Include\internal\pycore_code.h" />
<ClInclude Include="..\Include\internal\pycore_ceval.h" /> <ClInclude Include="..\Include\internal\pycore_ceval.h" />
<ClInclude Include="..\Include\internal\pycore_condvar.h" /> <ClInclude Include="..\Include\internal\pycore_condvar.h" />
<ClInclude Include="..\Include\internal\pycore_context.h" /> <ClInclude Include="..\Include\internal\pycore_context.h" />
......
...@@ -177,6 +177,9 @@ ...@@ -177,6 +177,9 @@
<ClInclude Include="..\Include\internal\pycore_atomic.h"> <ClInclude Include="..\Include\internal\pycore_atomic.h">
<Filter>Include</Filter> <Filter>Include</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\Include\internal\pycore_code.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_ceval.h"> <ClInclude Include="..\Include\internal\pycore_ceval.h">
<Filter>Include</Filter> <Filter>Include</Filter>
</ClInclude> </ClInclude>
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "Python.h" #include "Python.h"
#include "pycore_ceval.h" #include "pycore_ceval.h"
#include "pycore_code.h"
#include "pycore_object.h" #include "pycore_object.h"
#include "pycore_pyerrors.h" #include "pycore_pyerrors.h"
#include "pycore_pylifecycle.h" #include "pycore_pylifecycle.h"
...@@ -101,6 +102,20 @@ static long dxp[256]; ...@@ -101,6 +102,20 @@ static long dxp[256];
#endif #endif
#endif #endif
/* per opcode cache */
#define OPCACHE_MIN_RUNS 1024 /* create opcache when code executed this time */
#define OPCACHE_STATS 0 /* Enable stats */
#if OPCACHE_STATS
static size_t opcache_code_objects = 0;
static size_t opcache_code_objects_extra_mem = 0;
static size_t opcache_global_opts = 0;
static size_t opcache_global_hits = 0;
static size_t opcache_global_misses = 0;
#endif
/* This can set eval_breaker to 0 even though gil_drop_request became /* This can set eval_breaker to 0 even though gil_drop_request became
1. We believe this is all right because the eval loop will release 1. We believe this is all right because the eval loop will release
the GIL eventually anyway. */ the GIL eventually anyway. */
...@@ -225,6 +240,35 @@ exit_thread_if_finalizing(PyThreadState *tstate) ...@@ -225,6 +240,35 @@ exit_thread_if_finalizing(PyThreadState *tstate)
} }
} }
void
_PyEval_Fini(void)
{
#if OPCACHE_STATS
fprintf(stderr, "-- Opcode cache number of objects = %zd\n",
opcache_code_objects);
fprintf(stderr, "-- Opcode cache total extra mem = %zd\n",
opcache_code_objects_extra_mem);
fprintf(stderr, "\n");
fprintf(stderr, "-- Opcode cache LOAD_GLOBAL hits = %zd (%d%%)\n",
opcache_global_hits,
(int) (100.0 * opcache_global_hits /
(opcache_global_hits + opcache_global_misses)));
fprintf(stderr, "-- Opcode cache LOAD_GLOBAL misses = %zd (%d%%)\n",
opcache_global_misses,
(int) (100.0 * opcache_global_misses /
(opcache_global_hits + opcache_global_misses)));
fprintf(stderr, "-- Opcode cache LOAD_GLOBAL opts = %zd\n",
opcache_global_opts);
fprintf(stderr, "\n");
#endif
}
void void
PyEval_AcquireLock(void) PyEval_AcquireLock(void)
{ {
...@@ -799,6 +843,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag) ...@@ -799,6 +843,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
const _Py_CODEUNIT *first_instr; const _Py_CODEUNIT *first_instr;
PyObject *names; PyObject *names;
PyObject *consts; PyObject *consts;
_PyOpcache *co_opcache;
#ifdef LLTRACE #ifdef LLTRACE
_Py_IDENTIFIER(__ltrace__); _Py_IDENTIFIER(__ltrace__);
...@@ -1061,6 +1106,49 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag) ...@@ -1061,6 +1106,49 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
Py_XDECREF(traceback); \ Py_XDECREF(traceback); \
} while(0) } while(0)
/* macros for opcode cache */
#define OPCACHE_CHECK() \
do { \
co_opcache = NULL; \
if (co->co_opcache != NULL) { \
unsigned char co_opt_offset = \
co->co_opcache_map[next_instr - first_instr]; \
if (co_opt_offset > 0) { \
assert(co_opt_offset <= co->co_opcache_size); \
co_opcache = &co->co_opcache[co_opt_offset - 1]; \
assert(co_opcache != NULL); \
if (co_opcache->optimized < 0) { \
co_opcache = NULL; \
} \
} \
} \
} while (0)
#if OPCACHE_STATS
#define OPCACHE_STAT_GLOBAL_HIT() \
do { \
if (co->co_opcache != NULL) opcache_global_hits++; \
} while (0)
#define OPCACHE_STAT_GLOBAL_MISS() \
do { \
if (co->co_opcache != NULL) opcache_global_misses++; \
} while (0)
#define OPCACHE_STAT_GLOBAL_OPT() \
do { \
if (co->co_opcache != NULL) opcache_global_opts++; \
} while (0)
#else /* OPCACHE_STATS */
#define OPCACHE_STAT_GLOBAL_HIT()
#define OPCACHE_STAT_GLOBAL_MISS()
#define OPCACHE_STAT_GLOBAL_OPT()
#endif
/* Start of code */ /* Start of code */
/* push frame */ /* push frame */
...@@ -1142,6 +1230,20 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag) ...@@ -1142,6 +1230,20 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */ f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */
f->f_executing = 1; f->f_executing = 1;
if (co->co_opcache_flag < OPCACHE_MIN_RUNS) {
co->co_opcache_flag++;
if (co->co_opcache_flag == OPCACHE_MIN_RUNS) {
if (_PyCode_InitOpcache(co) < 0) {
return NULL;
}
#if OPCACHE_STATS
opcache_code_objects_extra_mem +=
PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT) +
sizeof(_PyOpcache) * co->co_opcache_size;
opcache_code_objects++;
#endif
}
}
#ifdef LLTRACE #ifdef LLTRACE
lltrace = _PyDict_GetItemId(f->f_globals, &PyId___ltrace__) != NULL; lltrace = _PyDict_GetItemId(f->f_globals, &PyId___ltrace__) != NULL;
...@@ -2451,11 +2553,30 @@ main_loop: ...@@ -2451,11 +2553,30 @@ main_loop:
} }
case TARGET(LOAD_GLOBAL): { case TARGET(LOAD_GLOBAL): {
PyObject *name = GETITEM(names, oparg); PyObject *name;
PyObject *v; PyObject *v;
if (PyDict_CheckExact(f->f_globals) if (PyDict_CheckExact(f->f_globals)
&& PyDict_CheckExact(f->f_builtins)) && PyDict_CheckExact(f->f_builtins))
{ {
OPCACHE_CHECK();
if (co_opcache != NULL && co_opcache->optimized > 0) {
_PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
if (lg->globals_ver ==
((PyDictObject *)f->f_globals)->ma_version_tag
&& lg->builtins_ver ==
((PyDictObject *)f->f_builtins)->ma_version_tag)
{
PyObject *ptr = lg->ptr;
OPCACHE_STAT_GLOBAL_HIT();
assert(ptr != NULL);
Py_INCREF(ptr);
PUSH(ptr);
DISPATCH();
}
}
name = GETITEM(names, oparg);
v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals, v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
(PyDictObject *)f->f_builtins, (PyDictObject *)f->f_builtins,
name); name);
...@@ -2468,12 +2589,32 @@ main_loop: ...@@ -2468,12 +2589,32 @@ main_loop:
} }
goto error; goto error;
} }
if (co_opcache != NULL) {
_PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
if (co_opcache->optimized == 0) {
/* Wasn't optimized before. */
OPCACHE_STAT_GLOBAL_OPT();
} else {
OPCACHE_STAT_GLOBAL_MISS();
}
co_opcache->optimized = 1;
lg->globals_ver =
((PyDictObject *)f->f_globals)->ma_version_tag;
lg->builtins_ver =
((PyDictObject *)f->f_builtins)->ma_version_tag;
lg->ptr = v; /* borrowed */
}
Py_INCREF(v); Py_INCREF(v);
} }
else { else {
/* Slow-path if globals or builtins is not a dict */ /* Slow-path if globals or builtins is not a dict */
/* namespace 1: globals */ /* namespace 1: globals */
name = GETITEM(names, oparg);
v = PyObject_GetItem(f->f_globals, name); v = PyObject_GetItem(f->f_globals, name);
if (v == NULL) { if (v == NULL) {
if (!_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) { if (!_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) {
......
...@@ -1242,6 +1242,9 @@ Py_FinalizeEx(void) ...@@ -1242,6 +1242,9 @@ Py_FinalizeEx(void)
/* Destroy all modules */ /* Destroy all modules */
PyImport_Cleanup(); PyImport_Cleanup();
/* Print debug stats if any */
_PyEval_Fini();
/* Flush sys.stdout and sys.stderr (again, in case more was printed) */ /* Flush sys.stdout and sys.stderr (again, in case more was printed) */
if (flush_std_files() < 0) { if (flush_std_files() < 0) {
status = -1; status = -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment