Commit 91234a16 authored by Inada Naoki's avatar Inada Naoki Committed by GitHub

bpo-26219: per opcode cache for LOAD_GLOBAL (GH-12884)

This patch implements per opcode cache mechanism, and use it in
only LOAD_GLOBAL opcode.

Based on Yury's opcache3.patch in bpo-26219.
parent 29ec4228
......@@ -860,6 +860,10 @@ Optimizations
methods up to 20--50%. (Contributed by Serhiy Storchaka in :issue:`23867`,
:issue:`35582` and :issue:`36127`.)
* ``LOAD_GLOBAL`` instruction now uses new "per opcode cache" mechanism.
It is about 40% faster now. (Contributed by Yury Selivanov and Inada Naoki in
:issue:`26219`.)
Build and C API Changes
=======================
......
......@@ -17,6 +17,8 @@ typedef uint16_t _Py_CODEUNIT;
# define _Py_OPARG(word) ((word) >> 8)
#endif
typedef struct _PyOpcache _PyOpcache;
/* Bytecode object */
typedef struct {
PyObject_HEAD
......@@ -49,6 +51,21 @@ typedef struct {
Type is a void* to keep the format private in codeobject.c to force
people to go through the proper APIs. */
void *co_extra;
/* Per opcodes just-in-time cache
*
* To reduce cache size, we use indirect mapping from opcode index to
* cache object:
* cache = co_opcache[co_opcache_map[next_instr - first_instr] - 1]
*/
// co_opcache_map is indexed by (next_instr - first_instr).
// * 0 means there is no cache for this opcode.
// * n > 0 means there is cache in co_opcache[n-1].
unsigned char *co_opcache_map;
_PyOpcache *co_opcache;
int co_opcache_flag; // used to determine when create a cache.
unsigned char co_opcache_size; // length of co_opcache.
} PyCodeObject;
/* Masks for co_flags above */
......
......@@ -31,6 +31,9 @@ PyAPI_FUNC(void) _PyEval_SignalAsyncExc(
PyAPI_FUNC(void) _PyEval_ReInitThreads(
_PyRuntimeState *runtime);
/* Private function */
void _PyEval_Fini(void);
#ifdef __cplusplus
}
#endif
......
#ifndef Py_INTERNAL_CODE_H
#define Py_INTERNAL_CODE_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
PyObject *ptr; /* Cached pointer (borrowed reference) */
uint64_t globals_ver; /* ma_version of global dict */
uint64_t builtins_ver; /* ma_version of builtin dict */
} _PyOpcache_LoadGlobal;
struct _PyOpcache {
union {
_PyOpcache_LoadGlobal lg;
} u;
char optimized;
};
/* Private API */
int _PyCode_InitOpcache(PyCodeObject *co);
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_CODE_H */
......@@ -80,14 +80,14 @@ class DictVersionTests(unittest.TestCase):
# setting a key to the same value with dict.__setitem__
# must change the version
self.check_version_changed(d, d.__setitem__, 'key', value)
self.check_version_dont_change(d, d.__setitem__, 'key', value)
# setting a key to the same value with dict.update
# must change the version
self.check_version_changed(d, d.update, key=value)
self.check_version_dont_change(d, d.update, key=value)
d2 = self.new_dict(key=value)
self.check_version_changed(d, d.update, d2)
self.check_version_dont_change(d, d.update, d2)
def test_setitem_equal(self):
class AlwaysEqual:
......
......@@ -1070,6 +1070,7 @@ PYTHON_HEADERS= \
$(srcdir)/Include/internal/pycore_accu.h \
$(srcdir)/Include/internal/pycore_atomic.h \
$(srcdir)/Include/internal/pycore_ceval.h \
$(srcdir)/Include/internal/pycore_code.h \
$(srcdir)/Include/internal/pycore_condvar.h \
$(srcdir)/Include/internal/pycore_context.h \
$(srcdir)/Include/internal/pycore_fileutils.h \
......
Implemented per opcode cache mechanism and ``LOAD_GLOBAL`` instruction use
it. ``LOAD_GLOBAL`` is now about 40% faster. Contributed by Yury Selivanov,
and Inada Naoki.
......@@ -2,7 +2,9 @@
#include "Python.h"
#include "code.h"
#include "opcode.h"
#include "structmember.h"
#include "pycore_code.h"
#include "pycore_pystate.h"
#include "pycore_tupleobject.h"
#include "clinic/codeobject.c.h"
......@@ -233,9 +235,56 @@ PyCode_New(int argcount, int posonlyargcount, int kwonlyargcount,
co->co_zombieframe = NULL;
co->co_weakreflist = NULL;
co->co_extra = NULL;
co->co_opcache_map = NULL;
co->co_opcache = NULL;
co->co_opcache_flag = 0;
co->co_opcache_size = 0;
return co;
}
int
_PyCode_InitOpcache(PyCodeObject *co)
{
Py_ssize_t co_size = PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT);
co->co_opcache_map = (unsigned char *)PyMem_Calloc(co_size, 1);
if (co->co_opcache_map == NULL) {
return -1;
}
_Py_CODEUNIT *opcodes = (_Py_CODEUNIT*)PyBytes_AS_STRING(co->co_code);
Py_ssize_t opts = 0;
for (Py_ssize_t i = 0; i < co_size;) {
unsigned char opcode = _Py_OPCODE(opcodes[i]);
i++; // 'i' is now aligned to (next_instr - first_instr)
// TODO: LOAD_METHOD, LOAD_ATTR
if (opcode == LOAD_GLOBAL) {
co->co_opcache_map[i] = ++opts;
if (opts > 254) {
break;
}
}
}
if (opts) {
co->co_opcache = (_PyOpcache *)PyMem_Calloc(opts, sizeof(_PyOpcache));
if (co->co_opcache == NULL) {
PyMem_FREE(co->co_opcache_map);
return -1;
}
}
else {
PyMem_FREE(co->co_opcache_map);
co->co_opcache_map = NULL;
co->co_opcache = NULL;
}
co->co_opcache_size = opts;
return 0;
}
PyCodeObject *
PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno)
{
......@@ -458,6 +507,15 @@ code_new(PyTypeObject *type, PyObject *args, PyObject *kw)
static void
code_dealloc(PyCodeObject *co)
{
if (co->co_opcache != NULL) {
PyMem_FREE(co->co_opcache);
}
if (co->co_opcache_map != NULL) {
PyMem_FREE(co->co_opcache_map);
}
co->co_opcache_flag = 0;
co->co_opcache_size = 0;
if (co->co_extra != NULL) {
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
_PyCodeObjectExtra *co_extra = co->co_extra;
......@@ -504,6 +562,13 @@ code_sizeof(PyCodeObject *co, PyObject *Py_UNUSED(args))
res += sizeof(_PyCodeObjectExtra) +
(co_extra->ce_size-1) * sizeof(co_extra->ce_extras[0]);
}
if (co->co_opcache != NULL) {
assert(co->co_opcache_map != NULL);
// co_opcache_map
res += PyBytes_GET_SIZE(co->co_code) / sizeof(_Py_CODEUNIT);
// co_opcache
res += co->co_opcache_size * sizeof(_PyOpcache);
}
return PyLong_FromSsize_t(res);
}
......
......@@ -1080,6 +1080,7 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
return 0;
}
if (old_value != value) {
if (_PyDict_HasSplitTable(mp)) {
mp->ma_values[ix] = value;
if (old_value == NULL) {
......@@ -1092,8 +1093,8 @@ insertdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject *value)
assert(old_value != NULL);
DK_ENTRIES(mp->ma_keys)[ix].me_value = value;
}
mp->ma_version_tag = DICT_NEXT_VERSION();
}
Py_XDECREF(old_value); /* which **CAN** re-enter (see issue #22653) */
ASSERT_CONSISTENT(mp);
Py_DECREF(key);
......
......@@ -158,6 +158,7 @@
<ClInclude Include="..\Include\import.h" />
<ClInclude Include="..\Include\internal\pycore_accu.h" />
<ClInclude Include="..\Include\internal\pycore_atomic.h" />
<ClInclude Include="..\Include\internal\pycore_code.h" />
<ClInclude Include="..\Include\internal\pycore_ceval.h" />
<ClInclude Include="..\Include\internal\pycore_condvar.h" />
<ClInclude Include="..\Include\internal\pycore_context.h" />
......
......@@ -177,6 +177,9 @@
<ClInclude Include="..\Include\internal\pycore_atomic.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_code.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\internal\pycore_ceval.h">
<Filter>Include</Filter>
</ClInclude>
......
......@@ -11,6 +11,7 @@
#include "Python.h"
#include "pycore_ceval.h"
#include "pycore_code.h"
#include "pycore_object.h"
#include "pycore_pyerrors.h"
#include "pycore_pylifecycle.h"
......@@ -101,6 +102,20 @@ static long dxp[256];
#endif
#endif
/* per opcode cache */
#define OPCACHE_MIN_RUNS 1024 /* create opcache when code executed this time */
#define OPCACHE_STATS 0 /* Enable stats */
#if OPCACHE_STATS
static size_t opcache_code_objects = 0;
static size_t opcache_code_objects_extra_mem = 0;
static size_t opcache_global_opts = 0;
static size_t opcache_global_hits = 0;
static size_t opcache_global_misses = 0;
#endif
/* This can set eval_breaker to 0 even though gil_drop_request became
1. We believe this is all right because the eval loop will release
the GIL eventually anyway. */
......@@ -225,6 +240,35 @@ exit_thread_if_finalizing(PyThreadState *tstate)
}
}
void
_PyEval_Fini(void)
{
#if OPCACHE_STATS
fprintf(stderr, "-- Opcode cache number of objects = %zd\n",
opcache_code_objects);
fprintf(stderr, "-- Opcode cache total extra mem = %zd\n",
opcache_code_objects_extra_mem);
fprintf(stderr, "\n");
fprintf(stderr, "-- Opcode cache LOAD_GLOBAL hits = %zd (%d%%)\n",
opcache_global_hits,
(int) (100.0 * opcache_global_hits /
(opcache_global_hits + opcache_global_misses)));
fprintf(stderr, "-- Opcode cache LOAD_GLOBAL misses = %zd (%d%%)\n",
opcache_global_misses,
(int) (100.0 * opcache_global_misses /
(opcache_global_hits + opcache_global_misses)));
fprintf(stderr, "-- Opcode cache LOAD_GLOBAL opts = %zd\n",
opcache_global_opts);
fprintf(stderr, "\n");
#endif
}
void
PyEval_AcquireLock(void)
{
......@@ -799,6 +843,7 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
const _Py_CODEUNIT *first_instr;
PyObject *names;
PyObject *consts;
_PyOpcache *co_opcache;
#ifdef LLTRACE
_Py_IDENTIFIER(__ltrace__);
......@@ -1061,6 +1106,49 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
Py_XDECREF(traceback); \
} while(0)
/* macros for opcode cache */
#define OPCACHE_CHECK() \
do { \
co_opcache = NULL; \
if (co->co_opcache != NULL) { \
unsigned char co_opt_offset = \
co->co_opcache_map[next_instr - first_instr]; \
if (co_opt_offset > 0) { \
assert(co_opt_offset <= co->co_opcache_size); \
co_opcache = &co->co_opcache[co_opt_offset - 1]; \
assert(co_opcache != NULL); \
if (co_opcache->optimized < 0) { \
co_opcache = NULL; \
} \
} \
} \
} while (0)
#if OPCACHE_STATS
#define OPCACHE_STAT_GLOBAL_HIT() \
do { \
if (co->co_opcache != NULL) opcache_global_hits++; \
} while (0)
#define OPCACHE_STAT_GLOBAL_MISS() \
do { \
if (co->co_opcache != NULL) opcache_global_misses++; \
} while (0)
#define OPCACHE_STAT_GLOBAL_OPT() \
do { \
if (co->co_opcache != NULL) opcache_global_opts++; \
} while (0)
#else /* OPCACHE_STATS */
#define OPCACHE_STAT_GLOBAL_HIT()
#define OPCACHE_STAT_GLOBAL_MISS()
#define OPCACHE_STAT_GLOBAL_OPT()
#endif
/* Start of code */
/* push frame */
......@@ -1142,6 +1230,20 @@ _PyEval_EvalFrameDefault(PyFrameObject *f, int throwflag)
f->f_stacktop = NULL; /* remains NULL unless yield suspends frame */
f->f_executing = 1;
if (co->co_opcache_flag < OPCACHE_MIN_RUNS) {
co->co_opcache_flag++;
if (co->co_opcache_flag == OPCACHE_MIN_RUNS) {
if (_PyCode_InitOpcache(co) < 0) {
return NULL;
}
#if OPCACHE_STATS
opcache_code_objects_extra_mem +=
PyBytes_Size(co->co_code) / sizeof(_Py_CODEUNIT) +
sizeof(_PyOpcache) * co->co_opcache_size;
opcache_code_objects++;
#endif
}
}
#ifdef LLTRACE
lltrace = _PyDict_GetItemId(f->f_globals, &PyId___ltrace__) != NULL;
......@@ -2451,11 +2553,30 @@ main_loop:
}
case TARGET(LOAD_GLOBAL): {
PyObject *name = GETITEM(names, oparg);
PyObject *name;
PyObject *v;
if (PyDict_CheckExact(f->f_globals)
&& PyDict_CheckExact(f->f_builtins))
{
OPCACHE_CHECK();
if (co_opcache != NULL && co_opcache->optimized > 0) {
_PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
if (lg->globals_ver ==
((PyDictObject *)f->f_globals)->ma_version_tag
&& lg->builtins_ver ==
((PyDictObject *)f->f_builtins)->ma_version_tag)
{
PyObject *ptr = lg->ptr;
OPCACHE_STAT_GLOBAL_HIT();
assert(ptr != NULL);
Py_INCREF(ptr);
PUSH(ptr);
DISPATCH();
}
}
name = GETITEM(names, oparg);
v = _PyDict_LoadGlobal((PyDictObject *)f->f_globals,
(PyDictObject *)f->f_builtins,
name);
......@@ -2468,12 +2589,32 @@ main_loop:
}
goto error;
}
if (co_opcache != NULL) {
_PyOpcache_LoadGlobal *lg = &co_opcache->u.lg;
if (co_opcache->optimized == 0) {
/* Wasn't optimized before. */
OPCACHE_STAT_GLOBAL_OPT();
} else {
OPCACHE_STAT_GLOBAL_MISS();
}
co_opcache->optimized = 1;
lg->globals_ver =
((PyDictObject *)f->f_globals)->ma_version_tag;
lg->builtins_ver =
((PyDictObject *)f->f_builtins)->ma_version_tag;
lg->ptr = v; /* borrowed */
}
Py_INCREF(v);
}
else {
/* Slow-path if globals or builtins is not a dict */
/* namespace 1: globals */
name = GETITEM(names, oparg);
v = PyObject_GetItem(f->f_globals, name);
if (v == NULL) {
if (!_PyErr_ExceptionMatches(tstate, PyExc_KeyError)) {
......
......@@ -1242,6 +1242,9 @@ Py_FinalizeEx(void)
/* Destroy all modules */
PyImport_Cleanup();
/* Print debug stats if any */
_PyEval_Fini();
/* Flush sys.stdout and sys.stderr (again, in case more was printed) */
if (flush_std_files() < 0) {
status = -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment