Commit 45af76e6 authored by Kirill Smelkov's avatar Kirill Smelkov

bigfile/pagemap: specialized {} uint64 -> void * mapping

For BigFiles we'll need to maintain a `{} offset-in-file -> void *` mapping. A
hash or a binary tree could be used there, but since we know files are
most of the time accessed sequentially and locally in pages-batches, we
can also organize the mapping in batches of keys.

Specifically, offset bits are divided into parts such that every part
addresses 1 entry in a table that is one hardware page in size. To get to the
actual value, the system looks up the first table by the first part of the
offset, then uses the entry found there together with the next part of the
offset to look up the second table, etc.

To clients this looks like a dictionary with get/set/del & clear methods,
but lookups are O(1) time always, and in contrast to hashes values are
stored with locality (= adjacent lookups almost always access the same tables).
parent 8114ad6c
/* Wendelin.bigfile | Pgoffset -> page mapping
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
* ~~~~~~~~
*
* Implementation of get/set/del/clear for PageMap.
* See wendelin/bigfile/pagemap.h for general PageMap description.
*/
#include <wendelin/bigfile/pagemap.h>
#include <wendelin/bug.h>
#include <sys/mman.h>
/* Ask the OS for one private, anonymous, read/write mapping of PAGE_SIZE bytes.
 *
 * @return address of the mapping, or NULL if mmap() failed.
 */
static void *os_alloc_page(void)
{
    // -> xmmap ?
    const int prot  = PROT_READ | PROT_WRITE;
    const int flags = MAP_PRIVATE | MAP_ANONYMOUS /* | MAP_POPULATE */;
    void *page = mmap(NULL, PAGE_SIZE, prot, flags, -1, 0);

    /* translate mmap's failure sentinel into the NULL convention */
    return (page != MAP_FAILED) ? page : NULL;
}
/* Like os_alloc_page(), but allocation failure is reported through BUGe()
 * instead of being returned to the caller.
 */
static void *os_xalloc_page(void)
{
    void *page = os_alloc_page();

    if (page != NULL)
        return page;

    BUGe();
    return NULL;
}
/* deallocate 1 hw page to OS
 *
 * addr - start of a PAGE_SIZE-long mapping; in this file it is always a
 *        table page previously obtained via os_xalloc_page().
 *
 * Unmapping is delegated to xmunmap(), so no status is returned here.
 */
static void os_free_page(void *addr)
{
/* must not fail */
xmunmap(addr, PAGE_SIZE);
}
/* Prepare an empty pagemap for pages of (1 << pageshift) bytes.
 *
 * No table is allocated here: the root pointer starts as 0, and rshift0 is set
 * to the total number of key bits covered by all levels, i.e.
 * ceil((bits in pgoff_t - pageshift) / PAGEMAP_LEVEL_BITS) * PAGEMAP_LEVEL_BITS.
 */
void pagemap_init(PageMap *pmap, unsigned pageshift)
{
    /* key bits that have to be addressable */
    size_t keybits = 8*sizeof(pgoff_t) - pageshift;

    /* number of levels, rounding up so that every key bit is covered */
    unsigned nlevels = (keybits + (PAGEMAP_LEVEL_BITS - 1)) / PAGEMAP_LEVEL_BITS;

    pmap->rshift0 = nlevels * PAGEMAP_LEVEL_BITS;
    pmap->pmap0   = 0ULL;
}
/* Look up the page stored for pgoffset.
 *
 * Walks the tables from the root down, consuming PAGEMAP_LEVEL_BITS of
 * pgoffset per level:
 *
 *   slot  - entry in the previous-level table (initially the root pointer),
 *   table - current-level table that slot refers to,
 *   i     - index of pgoffset's entry inside table.
 *
 * @return the stored page, or NULL when a table on the path or the final
 *         entry is absent.
 */
Page *pagemap_get(const PageMap *pmap, pgoff_t pgoffset)
{
    unsigned   shift = pmap->rshift0;
    void    ***slot  = (void ***)&pmap->pmap0;
    void     **table;
    unsigned   i;

    /* pgoffset must fit into the bits covered by the tables */
    BUG_ON(pgoffset >> shift);

    for (;;) {
        shift -= PAGEMAP_LEVEL_BITS;
        i = (pgoffset >> shift) & PAGEMAP_NR_MASK;

        table = *slot;
        if (table == NULL)
            return NULL;    /* table not present - not found */

        /* drop the entry counter kept in the low bits */
        table = PTR_POINTER(table);

        /* all key bits consumed - table is the final-level one */
        if (shift == 0)
            return table[i];

        // XXX move up?
        slot = (void ***)&table[i];
    }
}
/* Store page under pgoffset, creating intermediate tables on demand.
 *
 * page must not be NULL. Each table pointer keeps "entries - 1" of the table
 * it points to in its low bits; that counter is bumped whenever a previously
 * empty entry of an existing table is about to be filled.
 */
void pagemap_set(PageMap *pmap, pgoff_t pgoffset, Page *page)
{
    unsigned   shift = pmap->rshift0;
    void    ***slot;
    void     **table;
    unsigned   i;

    BUG_ON(pgoffset >> shift);
    BUG_ON(!page);  // XXX or better call pagemap_del() ?

    /* slot / table / i have the same roles as in pagemap_get */
    slot = (void ***)&pmap->pmap0;
    do {
        shift -= PAGEMAP_LEVEL_BITS;
        i = (pgoffset >> shift) & PAGEMAP_NR_MASK;

        table = *slot;
        if (table) {
            table = PTR_POINTER(table);

            /* entry is empty and will be filled next (with a table or with
             * the page) - account for it in the counter stored in *slot */
            if (!table[i])
                *slot = PTR_XCOUNT_ADD(*slot, +1);
        }
        else {
            table = os_xalloc_page();   /* NOTE - is hw page aligned */
            BUG_ON(table != PTR_POINTER(table));

            /* a fresh table needs no counter adjustment: xcount=0 already
             * means "1 entry", which is the one we are about to set */
            *slot = table;
        }

        // XXX move up?
        slot = (void ***)&table[i];
    } while (shift);

    table[i] = page;
}
/* Remove the entry stored under pgoffset.
 *
 * The path from the root to the leaf is recorded while walking down, so that
 * counters can be decremented and emptied tables freed on the way back up.
 *
 * @return 0 - entry was already missing; 1 - entry was there and was removed.
 */
int pagemap_del(PageMap *pmap, pgoff_t pgoffset)
{
    unsigned   shift   = pmap->rshift0;
    unsigned   nlevels = shift / PAGEMAP_LEVEL_BITS;
    unsigned   i, depth = 0 /* current level */;
    /* path[d] is the slot holding the pointer to the table visited at depth d */
    void    ***path[nlevels], **table;  // XXX height -> height+1 ?

    BUG_ON(pgoffset >> shift);

    /* descend to the leaf table (roles of slot/table/i as in pagemap_get) */
    path[0] = (void ***)&pmap->pmap0;
    for (;;) {
        shift -= PAGEMAP_LEVEL_BITS;
        i = (pgoffset >> shift) & PAGEMAP_NR_MASK;

        table = *path[depth];
        if (!table)
            return 0;   /* entry already missing */
        table = PTR_POINTER(table);

        if (!shift)
            break;

        path[depth + 1] = (void ***)&table[i];
        depth++;
    }

    if (!table[i])
        return 0;       /* not found in last-level */

    /* entry present - clear it, then unwind */
    table[i] = NULL;

    for (;;) {
        void ***slot = path[depth];

        if (PTR_XCOUNT(*slot)) {
            /* other entries remain in this table - just drop the counter */
            *slot = PTR_XCOUNT_ADD(*slot, -1);
            break;
        }

        /* table became empty - free & forget it */
        os_free_page(*slot);
        *slot = NULL;

        if (depth == 0)
            break;
        depth--;
    }

    return 1;
}
/* Remove all entries from pmap, returning every table page to the OS.
 *
 * The whole function body is one use of __pagemap_for_each_leaftab:
 *
 *   - the braced block passed as the last macro argument is CODE_ON_UP; it runs
 *     each time the walker has stepped one level up, when *tailv[l] is the
 *     (counter-tagged) pointer to the table that was just left. That table is
 *     freed and its slot reset;
 *   - the statement following the macro invocation is the per-leaf-table body;
 *     nothing has to be done with leaf contents, so it only marks `tab` as used.
 */
void pagemap_clear(PageMap *pmap)
{
/* leverage pagemap walker logic, but nothing to do with leaf table
* contents - the freeing is done when going each level up */
__pagemap_for_each_leaftab(tab, tailv, l, pmap,
/* when going level up: */
{
/* strip the counter bits to get the real page address */
os_free_page(PTR_POINTER(*tailv[l]));
/* clearing not needed - we'll free this memory at next step
* anyway, but easier for debugging for now */
*tailv[l] = NULL;
}
)
(void)tab; /* unused */
}
/* Wendelin.bigfile | pagemap tests
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*/
// XXX better link with it
#include "../pagemap.c"
#include <ccan/tap/tap.h>
#include <ccan/array_size/array_size.h>
/* TAP test program for PageMap.
 *
 * Exercises, in order: pagemap_init() height computation, get/set, del
 * (including table deallocation), the counter-in-pointer range for a fully
 * populated leaf table, pagemap_for_each() (plain, with break, without {})
 * and pagemap_clear().
 *
 * Relies on GNU C nested functions for the small helpers bound to the local
 * `pmap`.
 */
int main()
{
PageMap pmap;
Page *pageptr;
/* helpers bound to pmap; also translate pointers to uintptr_t to simplify testing */
uintptr_t get(pgoff_t pgoffset) { return (uintptr_t)pagemap_get(&pmap, pgoffset); }
void set(pgoff_t pgoffset, uintptr_t page) { pagemap_set(&pmap, pgoffset, (Page *)page); }
int del(pgoff_t pgoffset) { return pagemap_del(&pmap, pgoffset); }
void init(unsigned pageshift) { pagemap_init(&pmap, pageshift); }
diag("Testing pagemap");
tap_fail_callback = abort; // XXX to catch failure immediately
int N = PAGEMAP_LEVEL_BITS;
int I1 = 1<<PAGEMAP_LEVEL_BITS; /* pgoffset -> T2[1] */
int I2 = 2*I1; /* pgoffset -> T2[2] */
int i;
/* rshift0 must be the number of key bits rounded up to whole levels */
// FIXME numbers hardcoded
init(64- N); ok1(pmap.rshift0 == N);
init(64-2*N); ok1(pmap.rshift0 == 2*N);
init(12); ok1(pmap.rshift0 == 6*N); /* 4K */
init(21); ok1(pmap.rshift0 == 5*N); /* 2M */
/* go on testing with 2-level pagetab */
init(64-2*N); ok1(pmap.rshift0 == 2*N);
/* access to p: pointer n: counter at appropriate level && v: value in final entry */
/* pgidx selects which second-level table / final entry __x1..__v look at */
pgoff_t pgidx;
void **__x0() { return (void **)pmap.pmap0; }
void **__p0() { return PTR_POINTER(__x0()); }
unsigned __n0() { return PTR_XCOUNT(__x0()); }
void **__x1() { return __p0()[pgidx >> PAGEMAP_LEVEL_BITS]; }
void **__p1() { return PTR_POINTER(__x1()); }
unsigned __n1() { return PTR_XCOUNT(__x1()); }
uintptr_t __v() { return (uintptr_t)__p1()[pgidx & PAGEMAP_NR_MASK]; }
/* n0/n1 convert the stored "count-1" into a real entry count: +1 when the
 * table exists, 0 when the pointer is NULL */
#define p0 (__p0())
#define n0 (__n0() + (__p0() ? 1 : 0) )
#define p1 (__p1())
#define n1 (__n1() + (__p1() ? 1 : 0) )
#define v (__v())
/* check that first level has n0 entries */
#define CHECK0(N0) do { \
/* no pgidx setup needed - n0 & p0 are independent of it */ \
ok(n0 == N0 && (!!N0 == !!p0), \
"CHECK0(%i)", N0); \
} while (0)
/* check that first & second levels have n0 & n1 entries */
#define CHECK1(PGIDX,N0,N1) do { \
pgidx = PGIDX; \
ok(n0 == N0 && p0 && n1 == N1 && (!!N1 == !!p1),\
"CHECK1(0x%x, %i, %i)", PGIDX, N0, N1); \
} while (0)
/* check first & second levels for #entries, and also final entry for value */
#define CHECK(PGIDX,N0,N1,V) do { \
pgidx = PGIDX; \
ok(n0 == N0 && p0 && n1 == N1 && p1 && v == V, \
"CHECK(0x%x, %u, %u, %u)", PGIDX, N0, N1, V); \
} while (0)
/* like check, but report only on error */
#define __CHECK(PGIDX,N0,N1,V) do { \
pgidx = PGIDX; \
if (!(n0 == N0 && p0 && n1 == N1 && p1 && v == V)) \
fail("CHECK(0x%x, %u, %u, %u)", PGIDX, N0, N1, V); \
} while (0)
/* get/set */
diag("get/set");
/* lookups in a freshly initialized map find nothing and allocate nothing */
ok1(!get(0));
ok1(!get(123));
/* pgidx n0 n1 v */
CHECK0 ( 0);
/* I0[0] */
set(0, 77);
ok1(get(0) == 77);
CHECK1 (0, 1, 1);
CHECK1 (I1, 1, 0);
CHECK (0, 1, 1, 77);
/* I0[1] */
ok1(!get(1));
set(1, 88);
ok1(get(1) == 88);
CHECK (0, 1, 2, 77);
CHECK (1, 1, 2, 88);
/* I1[0] */
ok1(!get(I1));
set(I1, 99);
ok1(get(I1) == 99);
CHECK (0, 2, 2, 77);
CHECK (1, 2, 2, 88);
CHECK (I1, 2, 1, 99);
CHECK1 (I2, 2, 0);
/* del */
diag("del");
/* if no entry was there - should not change anything */
ok1(!get(I2));
ok1(!del(I2));
CHECK (0, 2, 2, 77);
CHECK (1, 2, 2, 88);
CHECK (I1, 2, 1, 99);
CHECK1 (I2, 2, 0);
/* del I0[0] - only one counter changes and no tables are deallocated */
ok1(get(0) == 77);
ok1(del(0));
ok1(!get(0));
CHECK (0, 2, 1, 0); /* 0 = NULL */
CHECK (1, 2, 1, 88);
CHECK (I1, 2, 1, 99);
CHECK1 (I2, 2, 0);
/* del I0[1] - I0 should disappear */
ok1(get(1) == 88);
ok1(del(1));
ok1(!get(1));
CHECK1 (0, 1, 0);
CHECK1 (1, 1, 0);
CHECK (I1, 1, 1, 99);
CHECK1 (I2, 1, 0);
/* del I1[0] - I1 & first-level table should disappear */
ok1(get(I1) == 99);
ok1(del(I1));
ok1(!get(I1));
CHECK0 ( 0);
/* set so that all entries are set in T2 table (checks for counter-in-ptr
* overflow) */
diag("xcounter overflow");
# define XVALUE(i) (0x8000 + (i)) /* to distinguish pgoffset from set "pointer" */
/* fill one leaf table completely, checking the counter after every set */
for (i = 0; i < PAGEMAP_LEVEL_ENTRIES; ++i) {
set(i, XVALUE(i));
ASSERT(get(i) == XVALUE(i));
__CHECK (i, 1, i+1, XVALUE(i));
}
CHECK (0, 1, (unsigned)PAGEMAP_LEVEL_ENTRIES, XVALUE(0));
/* and empty it again from the top; after the last del (i == 0) the tables
 * are gone, so the per-entry check is skipped there */
for (i = PAGEMAP_LEVEL_ENTRIES-1; i >= 0; --i) {
ASSERT(del(i));
ASSERT(!get(i));
if (i)
__CHECK (i, 1, i, 0);
}
CHECK0 ( 0);
/* pagemap_for_each */
diag("pagemap_for_each");
/* iterate over empty - empty */
i = 0;
pagemap_for_each(pageptr, &pmap) {
i = 1;
}
ok1(i == 0);
/* wrestle with test data */
/* NOTE(review): these offsets land in first-level slots 0, 1 and 2 only, i.e.
 * adjacent ones; iteration over a map with empty first-level slots in between
 * is not exercised here. */
struct { pgoff_t pgoffset; uintptr_t page; }
testv[] = {
{0, 1},
{1, 2},
{32, 32},
{I1-1, 99},
{I1, 100},
{I1+1, 101},
{I1+32, 132},
{I2-1, 199},
{I2, 200},
{I2+1, 201},
};
for (i = 0; i < ARRAY_SIZE(testv); ++i)
set(testv[i].pgoffset, testv[i].page);
for (i = 0; i < ARRAY_SIZE(testv); ++i)
ok( get(testv[i].pgoffset) == testv[i].page,
"get(%lu) == %lu", testv[i].pgoffset, testv[i].page);
/* iteration must yield the pages in testv (= increasing pgoffset) order */
i = 0;
pagemap_for_each(pageptr, &pmap) {
ok( (uintptr_t)pageptr == testv[i].page,
"pagemap_for_each(%i) == %lu", i, testv[i].page );
++i;
}
ok1(i == ARRAY_SIZE(testv));
/* test for break - breaking out entirely from pagemap_for_each() */
i = 0;
pagemap_for_each(pageptr, &pmap) {
ok( (uintptr_t)pageptr == testv[i].page,
"pagemap_for_each(%i) == %lu", i, testv[i].page );
if (i == 2)
break;
++i;
}
ok1(i == 2);
/* test for pagemap_for_each without {} */
i = 0;
pagemap_for_each(pageptr, &pmap)
if (++i, (uintptr_t)pageptr == 99)
break;
/* page 99 is the 4th element of testv */
ok1(i == 4);
diag("pagemap_clear()");
pagemap_clear(&pmap);
i = 0;
pagemap_for_each(pageptr, &pmap) {
i = 1;
}
// TODO check it did not leak
ok(i == 0, "cleared ok (a)");
CHECK0 ( 0);
return 0;
}
#ifndef _WENDELIN_BIGFILE_PAGEMAP_H_
#define _WENDELIN_BIGFILE_PAGEMAP_H_
/* Wendelin.bigfile | Pgoffset -> page mapping
* Copyright (C) 2014-2015 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
* it under the terms of the GNU General Public License version 3, or (at your
* option) any later version, as published by the Free Software Foundation.
*
* You can also Link and Combine this program with other software covered by
* the terms of any of the Open Source Initiative approved licenses and Convey
* the resulting work. Corresponding source of such a combination shall include
* the source code for all other software used.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See COPYING file for full licensing terms.
*
* ~~~~~~~~
*
* BigFileH needs to maintain `{} f_pgoffset -> page` mapping. A hash or a
* binary tree could be used there, but since we know files are most of the
* time accessed sequentially and locally in pages-batches, we can also
* organize the mapping in batches of keys.
*
* Specifically, f_pgoffset bits are divided into parts such that every part
* addresses 1 entry in a table that is one hardware page in size. To get to
* the actual value, the system looks up the first table by the first part of
* f_pgoffset, then uses the entry found there together with the next part of
* f_pgoffset to look up the second table, etc.
*
* To clients this looks like a dictionary with get/set/del & clear methods,
* but lookups are O(1) time always, and in contrast to hashes values are
* stored with locality (= adjacent lookups almost always access the same tables).
*/
#include <wendelin/utils.h> /* BUILD_ILOG2_EXACT */
#include <sys/user.h> /* PAGE_SIZE & friends */
/* FIXME Debian before Wheezy patches libc and replaces PAGE_SIZE with
* `sysconf(...)` and removes PAGE_SHIFT. Temporary workaround follows: */
#ifndef PAGE_SHIFT
# if defined(__x86_64__) || defined(__i386__)
# define PAGE_SHIFT 12
# else
# error TODO: please specify PAGE_SHIFT for your arch
# endif
# undef PAGE_SIZE
# define PAGE_SIZE (1UL << PAGE_SHIFT)
#endif
#include <wendelin/bigfile/types.h>
typedef struct Page Page;
/*
* >>> `pgoffset -> page` lookup pagetable
*
* Looking up page by pgoffset is done via sparsely-organized lookup tables -
* each 9 bits in pgoffset index one such table.
*
* pointers in all levels except last contain two parts:
*
* - pointer to next-level table (upper bits)
* - count of entries in next-level table minus 1 (lower bits)
*/
/* how many bits of key each level encodes (table fits exactly in 1 hw page)
*
* TODO it is possible to use near pointers and on amd64 reduce entry size from
* 8 bytes to e.g. 4 bytes */
#define PAGEMAP_LEVEL_BITS (PAGE_SHIFT - BUILD_ILOG2_EXACT(sizeof(void *)))
/* how many entries in each level */
#define PAGEMAP_LEVEL_ENTRIES (1UL << PAGEMAP_LEVEL_BITS)
/* mask extracting nr from pointer to next-level table */
#define PAGEMAP_NR_MASK (PAGEMAP_LEVEL_ENTRIES - 1)
/* "or" & "and" bits in pointer */
#define PTR_OR(ptr, mask) ( (typeof(ptr)) ((uintptr_t)(ptr) | (mask)) )
#define PTR_AND(ptr, mask) ( (typeof(ptr)) ((uintptr_t)(ptr) & (mask)) )
/* pointer without count bits */
#define PTR_POINTER(ptr) PTR_AND(ptr, ~PAGEMAP_NR_MASK)
/* "count-1" extracted from pointer */
#define PTR_XCOUNT(ptr) ((unsigned)((uintptr_t)(ptr) & PAGEMAP_NR_MASK))
/* compute pointer with adjusted counter
 *
 * ptr - table pointer carrying "count-1" in its low PAGEMAP_NR_MASK bits
 * v   - signed adjustment to apply to that counter (e.g. +1 or -1)
 *
 * Evaluates to a pointer of the same type as `ptr`, with the same address
 * bits and the counter changed by `v`. It is a bug (BUG_ON) if the adjusted
 * counter does not fit into the counter bits, which also catches going
 * below 0.
 *
 * `ptr` is evaluated exactly once, and the internal temporaries are named so
 * that they cannot capture identifiers appearing in `ptr` or `v`.
 */
#define PTR_XCOUNT_ADD(ptr, v) ({                               \
    typeof(ptr) __xca_ptr = (ptr);                              \
    unsigned    __xca_cnt = PTR_XCOUNT(__xca_ptr);              \
    __xca_cnt += (v);                                           \
    BUG_ON(__xca_cnt & ~PAGEMAP_NR_MASK);                       \
    PTR_OR(PTR_POINTER(__xca_ptr), __xca_cnt);                  \
})
/* Root of a pgoffset -> page lookup structure. */
typedef struct PageMap {
    /* tagged pointer to the 0-level table: table address in the upper bits,
     * number of entries in that table (stored as count-1) in the lower bits;
     * 0 when there is no table */
    uintptr_t pmap0;

    /* total number of pgoffset bits covered by all levels
     * (= PAGEMAP_LEVEL_BITS * height) */
    unsigned rshift0;
} PageMap;
/* initialize new pagemap for looking up pages of (1 << pageshift) in size
 *
 * The map starts empty: no table memory is allocated until the first
 * pagemap_set().
 */
void pagemap_init(PageMap *pmap, unsigned pageshift);
/* pmap[pgoffset] -> page
 *
 * @return the stored page, or NULL if nothing is stored under pgoffset.
 *
 * It is a bug (BUG_ON) to pass pgoffset that does not fit into the key bits
 * the map was initialized for; the same holds for pagemap_set/pagemap_del.
 */
Page *pagemap_get(const PageMap *pmap, pgoff_t pgoffset);
/* pmap[pgoffset] <- page
 *
 * page must not be NULL (BUG_ON) - use pagemap_del() to remove an entry.
 * Missing tables on the path are allocated; allocation failure is treated
 * as a bug, not reported to the caller.
 */
void pagemap_set(PageMap *pmap, pgoff_t pgoffset, Page *page);
/* del pmap[pgoffset]
*
* Tables which become empty because of the removal are freed.
*
* @return 0 - entry was already missing; 1 - entry was there
*/
int pagemap_del(PageMap *pmap, pgoff_t pgoffset);
/* remove all entries from pagemap
*
* after such removal pagemap frees all memory allocated for tables.
* The only memory left allocated is `struct PageMap` itself.
*/
void pagemap_clear(PageMap *pmap);
/* (internal helper) iterate over all (!empty leaf) tables in pagemap
 *
 * This macro expands to a loop header; the statement that follows the macro
 * invocation is executed once for every leaf table found.
 *
 * tab, tailv, l    - names of variables which will be declared internally:
 *
 *      void **tab;                 iterates !empty leaf tables
 *      void ***tailv[height+1];    tailv[0] is &pmap->pmap0; for 0 < l < height
 *                                  tailv[l] points at the entry of the level-l
 *                                  table through which the walk descended (or
 *                                  from which scanning resumes); tailv[height]
 *                                  is the current leaf table
 *      unsigned l;                 current level
 *
 * CODE_ON_UP
 *
 *      code to execute right after going up one level
 *      (on all levels, not only on leafs).
 *      At that moment *tailv[l] is the (counter-tagged) pointer to the table
 *      which has just been left.
 *
 * PageMap *pmap;
 *
 * The walk is an explicit-stack depth-first traversal: at a non-leaf level the
 * current table is scanned for the next non-empty entry and the walk descends
 * through it; when a table is exhausted the walk goes one level up and resumes
 * right after the entry it had descended through.
 *
 * For that to work tailv[l] has to be moved to the entry being descended
 * through. Otherwise, when empty entries precede it, resuming would restart
 * before that entry and visit the same subtree again, and CODE_ON_UP would see
 * an empty slot instead of the table just left.
 *
 * Uses GNU C statement expressions.
 */
#define __pagemap_for_each_leaftab(tab, tailv, l, pmap, CODE_ON_UP)         \
    /* loop-once just to declare 'unsigned' variables */                    \
    /* (this is not a loop - just a workaround for C not allowing to */     \
    /*  declare variables of different types in "for") */                   \
    for (unsigned                                                           \
            __height = (pmap)->rshift0 / PAGEMAP_LEVEL_BITS,                \
            l,              /* current level */                             \
            __cont;         /* whether to continue looping */               \
                            /* (vs client saying "break") */                \
                                                                            \
            __height;                                                       \
            __height = 0                                                    \
        )                                                                   \
                                                                            \
    /* recursion loop - going down/up through levels */                     \
    for (void **tab,                                                        \
              ***tailv[__height+1], /* tail for each level XXX verify height+1?*/ \
              ***__down = (         /* whether down/up on next iter */      \
                                                                            \
                /* loop start init */                                       \
                tailv[0] = (void ***)&(pmap)->pmap0,                        \
                tailv[1] = (void ***)PTR_POINTER(*tailv[0]),                \
                l = tailv[1] ? 1 : 0,                                       \
                __cont=1,                                                   \
                                                                            \
                NULL);                                                      \
                                                                            \
            l && __cont;    /* !__cont - inner client code said "break" */  \
                                                                            \
            /* by default go up 1 level, after current tab scanned to the end */\
            /* The inner loop may go ask us go down to next level. */       \
            ({  if (__down)                                                 \
                    tailv[++l] = __down;                                    \
                else {                                                      \
                    --l;                                                    \
                    { CODE_ON_UP }                                          \
                    ++tailv[l];                                             \
                }                                                           \
                                                                            \
                __down = NULL; })                                           \
        )                                                                   \
                                                                            \
        /* go down 1 level with tab */                                      \
        if (l < __height) {                                                 \
                                                                            \
            /* loop over entries in tab - continuing from where we stopped */ \
            for (void ***__tail = tailv[l],                                 \
                      **__tab_prev = PTR_POINTER(*tailv[l-1]),              \
                      **__tab;                                              \
                                                                            \
                    (void **)__tail - __tab_prev < PAGEMAP_LEVEL_ENTRIES;   \
                                                                            \
                    ++__tail                                                \
                )                                                           \
                                                                            \
                /* load entry; no entry -> next entry */                    \
                if (!(__tab = *__tail))                                     \
                    continue;                                               \
                                                                            \
                /* go down 1 level with tab */                              \
                else {                                                      \
                    /* remember the entry we descend through, so that   */  \
                    /* after coming back up CODE_ON_UP sees it and the  */  \
                    /* scan resumes right after it                      */  \
                    tailv[l] = __tail;                                      \
                    __down = (void ***)PTR_POINTER(__tab);                  \
                    break;  /* to control loop */                           \
                }                                                           \
        }                                                                   \
                                                                            \
        /* tailv[l] points at leaf table and is !NULL */                    \
        else if (tab = (void **)tailv[l], 1)                                \
            /* client code goes here - any C statement - with or without {} */ \
            /* with break being special - to break the whole iteration you */ \
            /* should break with __cont=0 set */
/* iterate over all !NULL entries in pagemap
*
* Page *page - variable that will iterate.
* PageMap *pmap;
*
* Expands to a loop header; the statement following the macro invocation is
* the loop body and runs once per stored page, with `page` set to it.
*
* Built on __pagemap_for_each_leaftab: for every leaf table an inner `for`
* scans all PAGEMAP_LEVEL_ENTRIES slots and skips empty ones.
*
* `break` handling: __cont is cleared when a leaf scan starts and set back
* to 1 only when that scan reaches the end of the table. A `break` in client
* code leaves it at 0, which makes the outer walker stop as well.
*
* The names __tab, __tailv, __l, __idx and __cont are used internally.
*/
#define pagemap_for_each(page, pmap) \
__pagemap_for_each_leaftab(__tab, __tailv, __l, pmap, {}) \
for (unsigned __idx = ( \
__cont = 0, /* for break - to break the whole iter */ \
page=__tab[0], \
0); \
\
__idx < PAGEMAP_LEVEL_ENTRIES \
/* turn off break handling on loop exit */ \
|| ((__cont=1), 0); \
\
++__idx \
) \
if ((page = __tab[__idx])) \
/* */ \
/* CLIENT CODE GOES HERE: */ \
/* */ \
/* - any C statement allowed - with or without {} */ \
/* - also it can say break and break the whole pagemap */ \
/* iteration */
#endif
......@@ -27,6 +27,7 @@ import os
_bigfile = Extension('wendelin.bigfile._bigfile',
sources = [
'bigfile/_bigfile.c',
'bigfile/pagemap.c',
'lib/bug.c',
'lib/utils.c',
],
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment