Re: [Monetdb-developers] [Monetdb-checkins] MonetDB/src/gdk gdk.mx, , 1.279, 1.280 gdk_atoms.mx, , 1.161, 1.162 gdk_bat.mx, , 1.214, 1.215 gdk_batop.mx, , 1.170, 1.171 gdk_bbp.mx, , 1.252, 1.253 gdk_heap.mx, , 1.117, 1.118 gdk_qsort.mx, , 1.34, 1.35 gdk_ssort.mx, , 1.15, 1.16 gdk_utils.mx, , 1.246, 1.247
Is this a backward compatible change? In other words, can databases created before this change be read by the new version?
I really want backward compatibility, so if it isn't, some kind of conversion would be needed.
Peter Boncz wrote:
Update of /cvsroot/monetdb/MonetDB/src/gdk In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv5363/src/gdk
Modified Files: gdk.mx gdk_atoms.mx gdk_bat.mx gdk_batop.mx gdk_bbp.mx gdk_heap.mx gdk_qsort.mx gdk_ssort.mx gdk_utils.mx Log Message: support for 32GB string heaps in 64bits/oid32 mode (implies bat format change but ONLY for 64bits/oid32)
src/gdk/gdk.mx
- introduce GDK_VARSHIFT, var_t (str indices) are to be shifted that amount of bits to get to the real offset (padding is 8, in case of 64-bits and oid32 -- otherwise it is 0 => no change)
- clean up usage of var_t in HEAP_* interface
src/gdk/gdk_atoms.mx
- str heaps with difficult double lim distrubution do not maintain a linked list (direct open hashing only)
- allow internal string heap hash table pointers to be unsigned shorts instead of var_t (only switched on for 64bits/oid32)
- double-elim string heaps scaled back to 64KB (from 256-512KB)
- support for GDK_VARSHIFT
src/gdk/gdk_bat.mx
- bugfix in 64bits/oid32 offset calculation (precision loss averted)
src/gdk/gdk_batop.mx src/gdk/gdk_bbp.mx src/gdk/gdk_qsort.mx src/gdk/gdk_ssort.mx
- support for GDK_VARSHIFT
src/gdk/gdk_heap.mx
- HEAPmargin allocates large VM area after a block alloc in 64-bits (this to stimulate in-place HEAPextend without memcpy)
- clean up use of var_t/size_t in HEAP_* functions, and support for GDK_VARSHIFT
src/gdk/gdk_utils.mx
- increase VM size for 64-bits systems to 4TB
U gdk.mx Index: gdk.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk.mx,v retrieving revision 1.279 retrieving revision 1.280 diff -u -d -r1.279 -r1.280 --- gdk.mx 3 Apr 2009 09:14:51 -0000 1.279 +++ gdk.mx 13 Apr 2009 23:36:24 -0000 1.280 @@ -1068,9 +1068,9 @@ @item void HEAP_destroy (Heap* h) @item var_t -HEAP_malloc (Heap* heap, var_t nbytes) +HEAP_malloc (Heap* heap, size_t nbytes) @item void -HEAP_free (Heap *heap, size_t block) +HEAP_free (Heap *heap, var_t block) @item int HEAP_private (Heap* h) @item void @@ -1111,7 +1111,7 @@ int (*sizefcn) (ptr) /* BATatoms[].atomLen function */ );
-gdk_export var_t HEAP_malloc(Heap *heap, var_t nbytes); +gdk_export var_t HEAP_malloc(Heap *heap, size_t nbytes); gdk_export void HEAP_free(Heap *heap, var_t block); gdk_export var_t HEAP_private(Heap *h); gdk_export void HEAP_checkformat(Heap *h); @@ -1350,12 +1350,37 @@ #define Hloc(b,p) ((b)->H->heap.base+((p)<<(b)->H->shift)) #define Tloc(b,p) ((b)->T->heap.base+((p)<<(b)->T->shift))
+#if SIZEOF_VAR_T < SIZEOF_VOID_P +/* NEW 11/4/2009: when compiled with 32-bits oids/var_t on 64-bits systems, align heap strings
- on 8 byte boundaries always (wasting 4 padding bytes on avg). Note that in heaps where
- duplicate elimination is succesful, such padding occurs anyway (as an aside, a better
- implementation with two-bytes pointers in the string heap hash table, could reduce that
- padding to avg 1 byte wasted -- see TODO below).
- This 8 byte alignment allows the offset in the fixed part of the BAT string column to be
- interpreted as an index, which should be multiplied by 8 to get the position (VARSHIFT). The
- overall effect is that 32GB heaps can be addressed even when oids are limited to 4Gtuples.
- In the future, we might extend this such that the string alignment is set in the BAT header
- (columns with long strings take more storage space, but could tolerate more padding).
- It would mostly work, only the sort routine and strPut/strLocate (which do not see the BAT
- header) extra parameters would be needed in their APIs.
- */
+typedef unsigned short stridx_t; +#define GDK_VARSHIFT 3 +#define GDK_VARALIGN (1<<GDK_VARSHIFT) +#else +typedef var_t stridx_t; /* TODO: should also be unsigned short, but kept at var_t not to break BAT images */ +#define GDK_VARSHIFT 0 +#define GDK_VARALIGN sizeof(stridx_t) +#endif
#define BUNhloc(bi,p) Hloc((bi).b,p) #define BUNtloc(bi,p) Tloc((bi).b,p) #define BUNhpos(bi,p) (Hpos(&(bi),p)) #define BUNtpos(bi,p) (Tpos(&(bi),p)) -#define BUNhvar(bi,p) ((bi).b->htype?(Hbase((bi).b)+*((var_t*)BUNhloc(bi,p))):BUNhpos(bi,p)) -#define BUNtvar(bi,p) ((bi).b->ttype?(Tbase((bi).b)+*((var_t*)BUNtloc(bi,p))):BUNtpos(bi,p)) +#define BUNhvar(bi,p) ((bi).b->htype?(Hbase((bi).b)+((*(var_t*)BUNhloc(bi,p))<<GDK_VARSHIFT)):BUNhpos(bi,p)) +#define BUNtvar(bi,p) ((bi).b->ttype?(Tbase((bi).b)+((*(var_t*)BUNtloc(bi,p))<<GDK_VARSHIFT)):BUNtpos(bi,p)) #define BUNhead(bi,p) ((bi).b->hvarsized?BUNhvar(bi,p):BUNhloc(bi,p)) #define BUNtail(bi,p) ((bi).b->tvarsized?BUNtvar(bi,p):BUNtloc(bi,p))
U gdk_atoms.mx Index: gdk_atoms.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_atoms.mx,v retrieving revision 1.161 retrieving revision 1.162 diff -u -d -r1.161 -r1.162 --- gdk_atoms.mx 7 Apr 2009 11:53:22 -0000 1.161 +++ gdk_atoms.mx 13 Apr 2009 23:36:24 -0000 1.162 @@ -175,7 +175,6 @@ gdk_export int strLen(const char *s); gdk_export int strCmp(str l, str r); gdk_export int strNil(str s); -gdk_export void strHeapConvert(Heap *h, int directon); gdk_export int strElimDoubles(Heap *h); gdk_export var_t strLocate(Heap *h, str v); gdk_export int strCmpNoNil(unsigned char *l, unsigned char *r); @@ -457,7 +456,7 @@ 0, 0, (var_t (*)(Heap *, var_t *, ptr)) strPut, 0, (int (*)(ptr)) strLen, strHeap,
(void (*)(Heap *, int)) strHeapConvert, 0},
(void (*)(Heap *, int)) 0, 0},
}; int GDKatomcnt = TYPE_str + 1;
@@ -931,24 +930,25 @@ } \ } while (0)
-#define GDK_STRHASHTABLE (1<<10) +/* string heaps:
- strings are 8 byte aligned
- start with a 1024 bucket hash table
- heaps < 64KB are fully duplicate eliminated with this hash tables
- heaps >= 64KB are opportunistically (imperfect) duplicate eliminated
- as only the last 128KB chunk is considered and there is no linked list
- buckets and next pointers are unsigned short "indices"
- indices should be multiplied by 8 and takes from ELIMBASE to get an offset
- Note that a 64KB chunk of the heap contains at most 8K 8-byte aligned
- strings. The 1K bucket list means that in worst load, the list length is 8 (OK).
- */
+#define GDK_STRHASHTABLE (1<<10) /* 1024 */ #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1) -#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(var_t)) -#define GDK_STRHASHCREDIT(h) (((var_t*) (h)->base)[GDK_STRHASHTABLE]) +#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t)) +#define GDK_ELIMPOWER 16 /* 64KB is the threshold */ #define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT) -#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) +#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently: ELIMBASE == 0 */ #define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) << GDK_ELIMPOWER) -#if SIZEOF_SIZE_T == SIZEOF_INT -#define GDK_ELIMPOWER 16 /* makes for a max 64KB hash table
* ie 256 string bytes per hash bucket
* ~ 4 strings of UP4(8<=len<=11)=12 + 4 bytes
*/
-#else -#define GDK_ELIMPOWER 17 /* makes for a max 128KB hash table
* ie 512 string bytes per hash bucket
* ~ 5 strings of UP8(8<=len<=15)=16 + 8 bytes
*/
-#endif
@- Atomic ADT functions @c @@ -1767,46 +1767,34 @@ Because in many typical situations lots of double string values occur in tables, the string insertion provides automatic double elimination. To do this, a GDK_STRHASHTABLE(=1024) bucket hashtable is hidden in the first -4096 (8192 on 64-bit architectures) bytes of the string heap, consisting of integer offsets of the first -string hashing to that bucket in the heap. Furthermore, the first 4(8) bytes -before each string in the heap is an integer offset to the next string hashing -to the same number. +4096 bytes of the string heap, consisting an offset to the first string +hashing to that bucket in the heap. +These offsets are made small (stridx_t is an unsigned short) by exploiting +the fact that the double elimination chunks are (now) 64KB, hence a short +suffices.
-However, in many other situations the cardinality of string columns is large, +In many other situations the cardinality of string columns is large, or the string values might even be unique. In those cases, our fixed-size hash table will start to overflow quickly. Therefore, after the hash table is full (this is measured very simplistically by looking whether the string heap exceeds a -heap size = GDK_ELIMLIMIT -- done this way to keep compatibility with old bat images) -we flush the hash table. If one views the string heaps as consecutive chunks -of size GDK_ELIMLIMIT bytes, then all strings within one chunk are double-eliminated. -There is a macro GDK_ELIMBASE(offset) that computes the base of the chunk in which -a certain byte-offset falls. -@- -This is a departure from our previous policy of not looking at the hash tables at -all after overflow occurred. The advantage of the new approach is that if we have -a value distribution that is skewed (ie some values are very frequent), these -values will always be double eliminated, saving a considerable amount of space. -Disadvantage of the approach is that we always have to reserve space for the next -pointer (4(8) byte integer offset) that is stored right in front of the string (and -consequently have to keep all string chunks and offsets aligned to 4(8)). All this -translates into some wasted space. However, if there are that many different strings -that the hash table overflows, the strings must be relatively long and the relative -storage overhead should be low. +heap size = GDK_ELIMLIMIT = 64KB) we flush the hash table. Even more, from that moment +on, we do not use a linked list, but a lossy hash table that just contains +the last position for each bucket. Basically, after exceeding GDK_ELIMLIMIT, +we get a probabilistic/opportunistic duplicate elimination mechanism, +that only looks at the last GDK_ELIMLIMIT chunk in the heap, in a lossy way.
+When comparing with the previous string implementation, the biggest difference +is that on 64-nbits bt with 32-bits oids, strings are always 8-byte aligned +and var_t numbers are multiplied by 8 to get the true offset. The goal to do +this is to allow 32-bits var_t on 64-bits systems to address 32GB (using string +alignment=8). For large database, the cost of padding (4 bytes avg) is offset +by the savings in var_t (allowing to go from 64- to 32-bits). Nothing lost there, +and 32-bits var_t also pay in smaller OIDs and smaller hash tables, reducing memory +pressure. For small duplicate eliminated heaps, the short indices +used in the hash table have now allowed more buckets (2K instead of 1K) +and average 2 bytes overhead for the next pointers instead of 6-12. Therefore +small heaps are now more compact than before. @- -Notice that this mechanism enables to keep a certain linear storage property -in the string heaps. This is important if we want to take a BATslice on a BAT -by simply loading or @strong{mmap()}ping slices of the BAT files on disk into memory. -This is relevant in order to process a very large BAT iteratively by taking slices -in order to reduce memory consumption. Notice that if there are few different string -values, the hash table has not overflowed, and the string heap size will be small -(i.e. < GDK_ELIMLIMIT), so in those cases it is not a problem to load the entire string heap. -If the hash table @strong{has} overflowed, we want to be able to only map a slice of the -string heap as well. Now, given that the first string in the BAT-slice is called F1 -and its heap offset is O1 and the last string in the slice is F2 and its -offset is O2, then the slice we should take from the string heap is: -@example -GDK_ELIMBASE(F1) .. MAX(GDK_ELIMBASE(F2)+GDK_ELIMLIMIT), O2+strlen(F2)) -@end example The routine strElimDoubles() can be used to check whether all strings are still being double-eliminated in the original hash-table. Only then we know that unequal offset-integers in the BUN array means @@ -1877,16 +1865,12 @@ strHeap(Heap *d, size_t cap) { size_t size;
var_t *h, *e;
cap = MAX(cap, BATTINY);
size = (GDK_STRHASHTABLE + 1) * sizeof(var_t) + MIN(GDK_ELIMLIMIT, cap * 12);
- size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN); if (HEAPalloc(d, size, 1) >= 0) {
d->free = GDK_STRHASHTABLE * sizeof(var_t);
h = (var_t *) d->base;
for (e = h; e < h + GDK_STRHASHTABLE; e++) {
*e = 0;
}
d->free = GDK_STRHASHTABLE * sizeof(stridx_t);
}memset(d->base, 0, d->free);
}
@@ -1923,42 +1907,10 @@ void strCleanHash(Heap *h, int rebuild) {
- (void) rebuild; if (!GDK_ELIMDOUBLES(h)) { /* flush hash table for security */ memset(h->base, 0, GDK_STRHASHSIZE);
- } else if (rebuild) {
var_t xx, cur = 0, end = 0;
str hash = (str) h->base;
-/* int cnt[GDK_STRHASHTABLE]; */
/* collect all values in one big linked list */
for (xx = 0; xx < GDK_STRHASHTABLE; xx++) {
var_t yy = ((var_t *) hash)[xx];
-/* cnt[xx]=0; */
((var_t *) hash)[xx] = 0; /* clear hash table slot */
if (end) {
*(var_t *) (hash + end) = yy;
} else {
cur = yy;
}
for (; yy; yy = *(var_t *) (hash + yy))
end = yy;
}
/* process the linked list, inserting the values again */
for (; cur; cur = end) {
str val = hash + cur;
GDK_STRHASH(val + sizeof(var_t), xx);
xx &= GDK_STRHASHMASK;
end = *(var_t *) val;
*(var_t *) val = ((var_t *) hash)[xx];
((var_t *) hash)[xx] = cur;
-/* cnt[xx]++; */
}
-/* for(xx=0; xx<GDK_STRHASHTABLE; xx++)
- if(cnt[xx]) stream_printf(GDKout, "%5d %6d", xx, cnt[xx]); */ }
}
@@ -1970,90 +1922,63 @@ var_t strLocate(Heap *h, str v) {
- var_t *htab = (var_t *) h->base;
- var_t *l, *e;
- BUN idx;
- GDK_STRHASH(v, idx);
- e = htab + (idx & GDK_STRHASHMASK);
- if (*e) {
for (l = e; *l && *l < h->free; l = (var_t *) ((char *) h->base + *l)) {
str x = (str) ((char *) h->base + *l + sizeof(var_t));
if (GDK_STRCMP(v, x) == 0) {
return *l + (var_t) sizeof(var_t);
}
}
- }
- return 0;
-}
- stridx_t *ref, *next;
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define normal_vart_SWAP(x) ((var_t) normal_int_SWAP((int)x)) -#else -#define normal_vart_SWAP(x) ((var_t) long_long_SWAP((lng)x)) -#endif
- /* search hash-table, if double-elimination is still in place */
- BUN off; GDK_STRHASH(v, off);
- off &= GDK_STRHASHMASK;
-/* convert the integers in the implicit hash table structure */ -void -strHeapConvert(Heap *h, int dir) -{
- var_t *htab = (var_t *) h->base;
- var_t *l, i, j;
- /* should only use strLocate iff fully double eliminated */
- assert(GDK_ELIMBASE(h->free) == 0);
- if (dir == CONV_HTON) {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l = (var_t *) (h->base + j)) {
*l = normal_vart_SWAP(j);
}
}
- } else {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l = (var_t *) (h->base + *l)) {
*l = normal_vart_SWAP(j);
}
}
- /* search the linked list */
- for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0)
}return (sizeof(stridx_t) + *ref) >> GDK_VARSHIFT; /* found */
- return 0;
}
var_t strPut(Heap *h, var_t *dst, str v) {
- var_t *l;
- size_t i = GDK_STRLEN(v);
- BUN off;
- /* round up to var_t-byte alignment + var_t (next pointer) */
- size_t len = ((i + sizeof(var_t) - 1) & ~(sizeof(var_t) - 1)) + sizeof(var_t);
- size_t elimlimit = GDK_ELIMBASE(h->free) + GDK_ELIMLIMIT;
size_t elimbase = GDK_ELIMBASE(h->free);
size_t pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN-1));
size_t pos, len = GDK_STRLEN(v);
stridx_t *bucket, *ref, *next;
/* search hash-table, if double-elimination is still in place */
- GDK_STRHASH(v, off);
- BUN off; GDK_STRHASH(v, off); off &= GDK_STRHASHMASK;
- for (l = ((var_t *) h->base) + off; *l && *l < h->free; l = (var_t *) (h->base + *l)) {
str x = (str) (h->base + *l + sizeof(var_t));
- bucket = ((stridx_t *) h->base) + off;
if (GDK_STRCMP(v, x) == 0) {
*dst = *l + (var_t) sizeof(var_t); /* already in heap; do not insert! */
if (GDK_ELIMDOUBLES(h) == 0 && GDK_STRHASHCREDIT(h))
GDK_STRHASHCREDIT(h) += 4;
return *dst;
- if (elimbase == 0) { /* small string heap (<64KB) -- fully double eliminated */
for (ref = bucket; *ref; ref = next) { /* search the linked list */
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0) { /* found */
pos = sizeof(stridx_t) + *ref;
return *dst = (pos >> GDK_VARSHIFT);
}}
- }
- /* flush the hash table if it becomes too big (implies !GDK_ELIMDOUBLES) */
- if (h->free + len >= elimlimit) {
/* if we are not hash-inserting anymore, h->free may no longer be var_t aligned */
size_t mask = h->free & (sizeof(var_t) - 1);
if (mask)
h->free += sizeof(var_t) - mask; /* realign */
memset(h->base, 0, GDK_STRHASHSIZE); /* start over hash inserting in a pristine hash table */
GDK_STRHASHCREDIT(h) = 32; /* only tolerate limited misses in the future */
/* is there room for the next pointer in the padding space? */
if (pad < sizeof(stridx_t))
pad += GDK_VARALIGN; /* if not, pad more */
- } else {
/* large string heap (>=64KB) -- opportunistic/probabilistic double elimination */
pos = elimbase + *bucket;
if (GDK_STRCMP(v, (str) (h->base + pos)) == 0) {
return *dst = (pos >> GDK_VARSHIFT); /* already in heap; do not insert! */
}
+#if SIZEOF_VAR_T < SIZEOF_VOID_P
pad &= (GDK_VARALIGN-1); /* a full VARALIGN pad can be omitted */
+#else
pad = 0; /* only 32-bits var_t in 64-bits needs padding */
+#endif }
/* check heap for space (limited to a certain maximum after which nils are inserted) */
- if (h->free + len >= h->size) {
- if (h->free + pad + len >= h->size) { /* Something really strange happens here, In a special case (Pentium II Klamath, gcc version 2.96 20000731, GNU assembler version 2.10.90 using BFD version 2.10.0.18)
@@ -2064,11 +1989,15 @@ */ float batmargin = (float) BATMARGIN; float hnewsize = h->size * batmargin;
size_t newsize = len + (size_t) hnewsize;
size_t newsize = pad + len + (size_t) hnewsize;
assert(newsize);
if (h->free + len < h->maxsize) {
if (h->free + pad + len >= (((size_t) VAR_MAX) << GDK_VARSHIFT)) {
GDKerror("strPut: string heaps gets larger than %dGB.\n", (((size_t) VAR_MAX) << GDK_VARSHIFT) >> 30);
return 0;
}
}if (h->free + pad + len < h->maxsize) { /* if there is reserved space, first use the reserved space */ newsize = MIN(newsize, h->maxsize);
@@ -2077,32 +2006,27 @@ } /* fill should solve initialisation problems within valgrind */ memset(h->base + h->free, 0, h->size - h->free);
}
if (!GDK_ELIMDOUBLES(h)) {
if (GDK_STRHASHCREDIT(h) == 0) {
/* if credits are gone, we do not hash insert at all */
if (h->free > VAR_MAX) {
GDKerror("strPut: string heaps gets larger than 2GB -- too large for 32-bits oids.\n");
return 0;
}
*dst = (var_t) h->free;
memcpy(h->base + h->free, v, i);
h->free += i; /* in this case, we do not round to var_t either */
return *dst;
}
GDK_STRHASHCREDIT(h)--;
/* make bucket point into the enw heap */
}bucket = ((stridx_t *) h->base) + off;
- /* insert string in hash table and copy into the heap */
- l = (var_t *) (h->base + h->free);
- *(l++) = ((var_t *) h->base)[off];
- assert(h->free + sizeof(var_t) <= VAR_MAX);
- ((var_t *) h->base)[off] = (var_t) h->free;
- *dst = (var_t) (h->free + sizeof(var_t));
- h->free += len;
- memcpy((char *) l, v, i);
/* insert string */
pos = h->free + pad;
*dst = pos >> GDK_VARSHIFT;
memcpy(h->base + pos, v, len);
h->free += pad + len;
/* maintain hash table */
if (elimbase == 0) { /* small string heap: link the next pointer */
pos -= sizeof(stridx_t); /* the stridx_t next pointer directly precedes the string */
*(stridx_t*) (h->base + pos) = *bucket;
}
*bucket = (stridx_t) (pos - elimbase); /* set bucket to the new string */
if (h->free >= elimbase + GDK_ELIMLIMIT) {
memset(h->base, 0, GDK_STRHASHSIZE); /* flush hash table */
} return *dst;
}
U gdk_bbp.mx Index: gdk_bbp.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bbp.mx,v retrieving revision 1.252 retrieving revision 1.253 diff -u -d -r1.252 -r1.253 --- gdk_bbp.mx 9 Apr 2009 15:29:18 -0000 1.252 +++ gdk_bbp.mx 13 Apr 2009 23:36:24 -0000 1.253 @@ -2965,9 +2965,9 @@ BATsetcount(&bs->B, 0);
if (bs->H.vheap)
memset(bs->H.vheap->base, 0, bs->H.vheap->free = GDK_STRHASHTABLE * sizeof(var_t));
if (bs->T.vheap)memset(bs->H.vheap->base, 0, bs->H.vheap->free = GDK_STRHASHTABLE * sizeof(stridx_t));
memset(bs->T.vheap->base, 0, bs->T.vheap->free = GDK_STRHASHTABLE * sizeof(var_t));
return bs; } BATDEBUG THRprintf(GDKout, "#BBPrecycle %d %d "SZFMT" (%d, %d %d %d) N2\n", ht, tt, cap, BATCACHE_NOTYPE(ht), BATCACHE_NOTYPE(tt), batcache_maxbuckets, bin);memset(bs->T.vheap->base, 0, bs->T.vheap->free = GDK_STRHASHTABLE * sizeof(stridx_t));
U gdk_batop.mx Index: gdk_batop.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_batop.mx,v retrieving revision 1.170 retrieving revision 1.171 diff -u -d -r1.170 -r1.171 --- gdk_batop.mx 7 Jan 2009 14:13:48 -0000 1.170 +++ gdk_batop.mx 13 Apr 2009 23:36:24 -0000 1.171 @@ -1656,7 +1656,7 @@ if (b->hkey == 0) { /* sorted and key? */ while (cur < end) {
char *val = base + *(((var_t *)b->H->heap.base)+ cur);
char *val = base + (*(((var_t *)b->H->heap.base)+ cur) << GDK_VARSHIFT); if (cmp(prv, val) @5= 0) { key = FALSE;
@@ -1668,7 +1668,7 @@ } /* sorted? */ while (cur < end) {
char *val = base + *(((var_t *)b->H->heap.base)+ cur);
char *val = base + (*(((var_t *)b->H->heap.base)+ cur) << GDK_VARSHIFT); if (cmp(prv, val) @5 0) { /* record negative position info */
U gdk_utils.mx Index: gdk_utils.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_utils.mx,v retrieving revision 1.246 retrieving revision 1.247 diff -u -d -r1.246 -r1.247 --- gdk_utils.mx 9 Apr 2009 18:48:25 -0000 1.246 +++ gdk_utils.mx 13 Apr 2009 23:36:24 -0000 1.247 @@ -382,7 +382,7 @@ #define GDK_MEM_NULLALLOWED
#if SIZEOF_VOID_P==8 -#define GDK_VM_MAXSIZE LL_CONSTANT(137438953472) /* :-) a 64-bit OS: 128 GB */ +#define GDK_VM_MAXSIZE LL_CONSTANT(4398046511104) /* :-) a 64-bit OS: 4TB */ #else #define GDK_VM_MAXSIZE LL_CONSTANT(1610612736) /* :-| a 32-bit OS: 1.5GB */ #endif
U gdk_heap.mx Index: gdk_heap.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_heap.mx,v retrieving revision 1.117 retrieving revision 1.118 diff -u -d -r1.117 -r1.118 --- gdk_heap.mx 20 Mar 2009 11:59:57 -0000 1.117 +++ gdk_heap.mx 13 Apr 2009 23:36:24 -0000 1.118 @@ -75,8 +75,20 @@ nice). @{ @c -int -HEAPalloc(Heap *h, size_t nitems, size_t itemsize) +size_t HEAPmargin(size_t maxsize) {
- size_t ret;
+#if SIZEOF_VOID_P == 8
- /* in 64-bits systems, try to enforce in-place realloc, but provoke the memcpy on 256MB, then 4GB */
- size_t use = GDKvm_cursize();
- ret = MIN(GDK_mem_maxsize, MAX(((size_t) 1) << 26, 16*maxsize));
- if ((ret+ret) > (GDK_vm_maxsize-MIN(GDK_vm_maxsize,use))) /* only if room */
+#endif
- ret = ((double) BATMARGIN)*maxsize - 1; /* do not waste VM on 32-bits */
- return (1 + (MAX(maxsize,ret) >> 16)) << 16; /* round up to 64K */
+}
+/* in 64-bits space, use very large margins to accomodate reallocations */ +int HEAPalloc(Heap *h, size_t nitems, size_t itemsize) { char nme[PATHLENGTH], *ext = NULL;
@@ -98,8 +110,7 @@ /* when using anonymous vm we malloc we need 64K chunks, also we * 20% extra malloc */ if (h->size > GDK_mem_bigsize) {
h->maxsize = (size_t) ((double) h->maxsize * BATMARGIN) - 1;
h->maxsize = (1 + (h->maxsize >> 16)) << 16;
} if (h->filename == NULL || (h->size < GDK_mmap_minsize)) { h->storage = STORE_MEM;h->maxsize = HEAPmargin(h->maxsize);
@@ -169,17 +180,14 @@ /* extend a malloced heap, possibly switching over to file-mapped storage */ Heap bak = *h; int can_mmap = h->filename && (size >= GDK_mem_bigsize || h->newstorage != STORE_MEM);
int must_mmap = can_mmap && (size >= GDK_mmap_minsize || h->newstorage != STORE_MEM);
int small_cpy = (h->size*4 < size) && (size >= GDK_mmap_minsize);
int must_mmap = can_mmap && (small_cpy || h->newstorage != STORE_MEM);
h->size = size;
if (can_mmap) { /* in anonymous vm, if have to realloc anyway, we reserve some extra space */
if (size > h->maxsize) {
h->maxsize = (size_t) ((double) size * BATMARGIN);
}
/* when using anonymous vm we malloc we need 64K chunks */
h->maxsize = (1 + ((h->maxsize - 1) >> 16)) << 16;
} else { h->maxsize = size; /* for normal GDKmalloc, maxsize = size */ }h->maxsize = HEAPmargin(MAX(size,h->maxsize));
@@ -514,9 +522,9 @@ #define HEAPVERSION 20030408
typedef struct heapheader {
- var_t head; /* index to first free block */
- size_t head; /* index to first free block */ int alignment; /* alignment of objects on heap */
- var_t firstblock; /* first block in heap */
- size_t firstblock; /* first block in heap */ int version; int (*sizefcn) (ptr); /* ADT function to ask length */
} HEADER32; @@ -524,8 +532,8 @@ typedef struct { int version; int alignment;
- var_t head;
- var_t firstblock;
- size_t head;
- size_t firstblock; int (*sizefcn) (ptr);
} HEADER64;
@@ -537,8 +545,8 @@ typedef HEADER64 HEADER_OTHER; #endif typedef struct hfblock {
- var_t size; /* Size of this block in freelist */
- var_t next; /* index of next block */
- size_t size; /* Size of this block in freelist */
- size_t next; /* index of next block */
} CHUNK;
@c @@ -546,13 +554,13 @@ #define roundup_4(x) (((x)+3)&~3) #define blocksize(h,p) ((p)->size)
-static INLINE var_t -roundup_num(var_t number, int alignment) +static INLINE size_t +roundup_num(size_t number, int alignment) {
- var_t rval;
- size_t rval;
- rval = number + (var_t) alignment - 1;
- rval -= (rval % (var_t) alignment);
- rval = number + (size_t) alignment - 1;
- rval -= (rval % (size_t) alignment); return rval;
}
@@ -560,7 +568,7 @@ HEAP_private(Heap *h) { (void) h;
- return (var_t) roundup_8(sizeof(HEADER));
- return (var_t) (roundup_8(sizeof(HEADER)) >> GDK_VARSHIFT);
}
#ifdef TRACE @@ -568,7 +576,7 @@ HEAP_printstatus(Heap *heap) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t block, cur_free = hheader->head;
size_t block, cur_free = hheader->head; CHUNK *blockp;
THRprintf(GDKout, "#HEAP has head " SZFMT " and alignment %d and size " SZFMT "\n",
@@ -591,7 +599,7 @@ cur_free = blockp->next; block += blockp->size; } else {
var_t size = blocksize(hheader, blockp);
size_t size = blocksize(hheader, blockp); THRprintf(GDKout, "# block at " SZFMT " with size " SZFMT "\n", block, size); block += size;
@@ -611,7 +619,7 @@ /* // Calculate position of first and only free block. */
- var_t head = roundup_num((var_t) (roundup_8(sizeof(HEADER)) + roundup_8(nprivate)), alignment);
size_t head = roundup_num((size_t) (roundup_8(sizeof(HEADER)) + roundup_8(nprivate)), alignment); CHUNK *headp = HEAP_index(heap, head, CHUNK);
assert(roundup_8(sizeof(HEADER)) + roundup_8(nprivate) <= VAR_MAX);
@@ -629,7 +637,7 @@ // Fill first free block. */ assert(heap->size - head <= VAR_MAX);
- headp->size = (var_t) (heap->size - head);
- headp->size = (size_t) (heap->size - head); headp->next = 0;
#ifdef TRACE THRprintf(GDKout, "#We created the following heap\n"); @@ -669,9 +677,9 @@
var_t -HEAP_malloc(Heap *heap, var_t nbytes) +HEAP_malloc(Heap *heap, size_t nbytes) {
- var_t block, trail, ttrail;
- size_t block, trail, ttrail; CHUNK *blockp; CHUNK *trailp; HEADER *hheader = HEAP_index(heap, 0, HEADER);
@@ -682,15 +690,9 @@
/* add space for size field */ nbytes += hheader->alignment;
- if (hheader->alignment == 8) {
nbytes = roundup_8(nbytes);
- } else if (hheader->alignment == 4) {
nbytes = roundup_4(nbytes);
- } else {
GDKfatal("HEAP_malloc: Heap structure corrupt\n");
- }
- nbytes = roundup_8(nbytes); if (nbytes < sizeof(CHUNK))
nbytes = (var_t) sizeof(CHUNK);
nbytes = (size_t) sizeof(CHUNK);
/* // block -- points to block with acceptable size (if available).
@@ -718,12 +720,12 @@ // If no block of acceptable size is found we try to enlarge the heap. */ if (block == 0) {
var_t newsize;
size_t newsize;
assert(heap->free + MAX(heap->free, nbytes) <= VAR_MAX);
newsize = (var_t) roundup_8(heap->free + MAX(heap->free, nbytes));
assert(heap->free <= VAR_MAX);newsize = (size_t) roundup_8(heap->free + MAX(heap->free, nbytes));
block = (var_t) heap->free; /* current end-of-heap */
block = (size_t) heap->free; /* current end-of-heap */
#ifdef TRACE THRprintf(GDKout, "#No block found\n"); @@ -747,7 +749,7 @@
blockp->next = 0; assert(heap->free - block <= VAR_MAX);
blockp->size = (var_t) (heap->free - block); /* determine size of allocated block */
blockp->size = (size_t) (heap->free - block); /* determine size of allocated block */
/* // Try to join the last block in the freelist and the newly allocated
@@ -778,7 +780,7 @@ // TUNE: use different amount than 2*sizeof(CHUNK) */ if (blockp->size >= nbytes + 2 * sizeof(CHUNK)) {
var_t newblock = block + nbytes;
size_t newblock = block + nbytes;
CHUNK *newblockp = HEAP_index(heap, newblock, CHUNK);
newblockp->size = blockp->size - nbytes;
@@ -800,17 +802,17 @@ }
block += hheader->alignment;
- return block;
- return block >> GDK_VARSHIFT;
}
void -HEAP_free(Heap *heap, var_t block) +HEAP_free(Heap *heap, var_t mem) { HEADER *hheader = HEAP_index(heap, 0, HEADER); CHUNK *beforep; CHUNK *blockp; CHUNK *afterp;
- var_t after, before;
size_t after, before, block = mem << GDK_VARSHIFT;
if (hheader->alignment != 8 && hheader->alignment != 4) { GDKfatal("HEAP_free: Heap structure corrupt\n");
@@ -884,10 +886,10 @@ HEAP_check(Heap *heap, HeapRepair *hr) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t head = hheader->head, alignshift = 2;
- var_t block, nwords = (var_t) ((heap->free - 1) >> 7);
- size_t head = hheader->head, alignshift = 2;
- size_t block, nwords = (size_t) ((heap->free - 1) >> 7); int *freemask;
- var_t prevblock = 0;
size_t prevblock = 0; CHUNK *blockp;
hr->alignment = hheader->alignment;
@@ -922,8 +924,8 @@ // Walk the freelist; register them in freemask */ for (block = hheader->head; block != 0; block = HEAP_index(heap, block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if ((block <= prevblock) && (block != 0)) {
@@ -942,8 +944,8 @@ */ block = hheader->firstblock; while (block < heap->free) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
hr->validmask[pos] |= mask;
@@ -965,8 +967,8 @@ // Check if there are left over free blocks */ for (block = hheader->head; block != 0; block = HEAP_index(heap, block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if (freemask[pos] & mask) {
@@ -1046,14 +1048,14 @@ if (hheader->head > heap->free) { hheader->head = 0; /* cut off free block */ } else if (hheader->head) {
var_t idx = hheader->head;
size_t idx = hheader->head;
while (idx) { CHUNK *blk = HEAP_index(heap, idx, CHUNK);
if (idx + blk->size > heap->free) { assert(heap->free - idx <= VAR_MAX);
blk->size = (var_t) (heap->free - idx); /* cut off illegal tail of block */
blk->size = (size_t) (heap->free - idx); /* cut off illegal tail of block */ } if (blk->next > heap->free || blk->next < (idx + blk->size) || (blk->next & (hheader->alignment - 1))) { blk->next = 0; /* cut off next block */
U gdk_qsort.mx Index: gdk_qsort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_qsort.mx,v retrieving revision 1.34 retrieving revision 1.35 diff -u -d -r1.34 -r1.35 --- gdk_qsort.mx 7 Jan 2009 14:13:48 -0000 1.34 +++ gdk_qsort.mx 13 Apr 2009 23:36:24 -0000 1.35 @@ -86,6 +86,7 @@ char *offst; /* NULL or start of varsized heap */ int hshift; /* log2 of hs */ int tshift; /* log2 of hs */
- int vshift; /* shift to be applied on var_ offsets */ unsigned hs; /* width of head record */ unsigned ts; /* width of tail record */ void *h; /* start of head buns */
@@ -189,7 +190,7 @@ #endif #endif
-#define offset(p) (buf->offst + *(var_t*) (p)) +#define offset(p) (buf->offst + ((*(var_t*) (p)) << buf->vshift)) #define direct(p) (p)
#define any_LE(a,b) ((buf->cmp)(a,b) <= 0) @@ -432,6 +433,7 @@ buf.ts = (unsigned) ts; buf.hshift = ATOMelmshift(hs); buf.tshift = ATOMelmshift(ts);
- buf.vshift = ATOMvarsized(tpe)?GDK_VARSHIFT:0; buf.h = h; if (!t) t = &x;
U gdk_bat.mx Index: gdk_bat.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bat.mx,v retrieving revision 1.214 retrieving revision 1.215 diff -u -d -r1.214 -r1.215 --- gdk_bat.mx 3 Apr 2009 09:14:52 -0000 1.214 +++ gdk_bat.mx 13 Apr 2009 23:36:24 -0000 1.215 @@ -434,7 +434,7 @@ BAT * BATextend(BAT *b, BUN newcap) {
- size_t hheap_size, theap_size;
size_t hheap_size = newcap, theap_size = newcap;
assert(newcap <= BUN_MAX); BATcheck(b, "BATextend");
@@ -453,10 +453,10 @@
b->batCapacity = newcap;
- hheap_size = Hsize(b) * newcap;
- hheap_size *= Hsize(b); if (b->H->heap.base && HEAPextend(&b->H->heap, hheap_size) < 0) return NULL;
- theap_size = Tsize(b) * newcap;
- theap_size *= Tsize(b); if (b->T->heap.base && HEAPextend(&b->T->heap, theap_size) < 0) return NULL; HASHdestroy(b);
U gdk_ssort.mx Index: gdk_ssort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_ssort.mx,v retrieving revision 1.15 retrieving revision 1.16 diff -u -d -r1.15 -r1.16 --- gdk_ssort.mx 7 Jan 2009 14:13:48 -0000 1.15 +++ gdk_ssort.mx 13 Apr 2009 23:36:24 -0000 1.16 @@ -203,8 +203,8 @@ } \ } while (0)
-#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap + * (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)->compare)((X), (Y))) < 0) -#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap + * (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)->compare)((X), (Y))) > 0) +#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap + ((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) << GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) < 0) +#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap + ((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) << GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) > 0) @= islt #define ISLT_@1@2(X, Y, ms) (* (@1 *) (X) @3 * (@1 *) (Y))
This SF.net email is sponsored by: High Quality Requirements in a Collaborative Environment. Download a free trial of Rational Requirements Composer Now! http://p.sf.net/sfu/www-ibm-com _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
Hi Sjoerd,
Thanks for the question. I wrote earlier last night an email to this list, which I thought covers this.
The answer regarding backward compatibility is: (1) NO, on repositories created by MonetDB configures with --enable-bits=64 --enable-oid32 (2) YES, in all other cases.
It is my understanding that few people use (1). If the MonetDB team agrees, I would propose not to take additional migration measures. If otherwise, ie if we go to a migration anyway, we might also consider changing the new type stridx_t in the (2) cases from var_t to unsigned short. It reduces the fixed overhead of string heaps of small bats (2KB instead of 4KB on 32-bits, 8KB on 64 bits). I refrained from doing so to keep (2) backward compatible.
Peter
-----Original Message----- From: Sjoerd Mullender [mailto:sjoerd@acm.org] Sent: Tuesday, April 14, 2009 1:41 PM To: monetdb-developers@lists.sourceforge.net Cc: monetdb-checkins@lists.sourceforge.net Subject: Re: [Monetdb-checkins] MonetDB/src/gdk gdk.mx, , 1.279, 1.280 gdk_atoms.mx, , 1.161, 1.162 gdk_bat.mx, , 1.214, 1.215 gdk_batop.mx, , 1.170, 1.171 gdk_bbp.mx, , 1.252, 1.253 gdk_heap.mx, , 1.117, 1.118 gdk_qsort.mx, , 1.34, 1.35 gdk_ssort.mx, , 1.15, 1...
Is this a backward compatible change? In other words, can databases created before this change be read by the new version?
I really want backward compatibility, so if it isn't, some kind of conversion would be needed.
Peter Boncz wrote:
Update of /cvsroot/monetdb/MonetDB/src/gdk In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv5363/src/gdk
Modified Files: gdk.mx gdk_atoms.mx gdk_bat.mx gdk_batop.mx gdk_bbp.mx gdk_heap.mx gdk_qsort.mx gdk_ssort.mx gdk_utils.mx Log Message: support for 32GB string heaps in 64bits/oid32 mode (implies bat format change but ONLY for 64bits/oid32)
src/gdk/gdk.mx
- introduce GDK_VARSHIFT, var_t (str indices) are to be shifted that amount of bits to get to the real offset (padding is 8, in case of 64-bits and oid32 -- otherwise it is 0 => no change)
- clean up usage of var_t in HEAP_* interface
src/gdk/gdk_atoms.mx
- str heaps with difficult double lim distrubution do not maintain a linked list (direct open hashing only)
- allow internal string heap hash table pointers to be unsigned shorts instead of var_t (only switched on for 64bits/oid32)
- double-elim string heaps scaled back to 64KB (from 256-512KB)
- support for GDK_VARSHIFT
src/gdk/gdk_bat.mx
- bugfix in 64bits/oid32 offset calculation (precision loss averted)
src/gdk/gdk_batop.mx src/gdk/gdk_bbp.mx src/gdk/gdk_qsort.mx src/gdk/gdk_ssort.mx
- support for GDK_VARSHIFT
src/gdk/gdk_heap.mx
- HEAPmargin allocates large VM area after a block alloc in 64-bits (this to stimulate in-place HEAPextend without memcpy)
- clean up use of var_t/size_t in HEAP_* functions, and support for
GDK_VARSHIFT
src/gdk/gdk_utils.mx
- increase VM size for 64-bits systems to 4TB
U gdk.mx Index: gdk.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk.mx,v retrieving revision 1.279 retrieving revision 1.280 diff -u -d -r1.279 -r1.280 --- gdk.mx 3 Apr 2009 09:14:51 -0000 1.279 +++ gdk.mx 13 Apr 2009 23:36:24 -0000 1.280 @@ -1068,9 +1068,9 @@ @item void HEAP_destroy (Heap* h) @item var_t -HEAP_malloc (Heap* heap, var_t nbytes) +HEAP_malloc (Heap* heap, size_t nbytes) @item void -HEAP_free (Heap *heap, size_t block) +HEAP_free (Heap *heap, var_t block) @item int HEAP_private (Heap* h) @item void @@ -1111,7 +1111,7 @@ int (*sizefcn) (ptr) /*
BATatoms[].atomLen
function */
);
-gdk_export var_t HEAP_malloc(Heap *heap, var_t nbytes); +gdk_export var_t HEAP_malloc(Heap *heap, size_t nbytes); gdk_export void HEAP_free(Heap *heap, var_t block); gdk_export var_t HEAP_private(Heap *h); gdk_export void HEAP_checkformat(Heap *h); @@ -1350,12 +1350,37 @@ #define Hloc(b,p) ((b)->H->heap.base+((p)<<(b)->H->shift)) #define Tloc(b,p) ((b)->T->heap.base+((p)<<(b)->T->shift))
+#if SIZEOF_VAR_T < SIZEOF_VOID_P +/* NEW 11/4/2009: when compiled with 32-bits oids/var_t on 64-bits
systems, align heap strings
- on 8 byte boundaries always (wasting 4 padding bytes on avg). Note
that in heaps where
- duplicate elimination is succesful, such padding occurs anyway (as an
aside, a better
- implementation with two-bytes pointers in the string heap hash table,
could reduce that
- padding to avg 1 byte wasted -- see TODO below).
- This 8 byte alignment allows the offset in the fixed part of the BAT
string column to be
- interpreted as an index, which should be multiplied by 8 to get the
position (VARSHIFT). The
- overall effect is that 32GB heaps can be addressed even when oids are
limited to 4Gtuples.
- In the future, we might extend this such that the string alignment is
set in the BAT header
- (columns with long strings take more storage space, but could
tolerate more padding).
- It would mostly work, only the sort routine and strPut/strLocate
(which do not see the BAT
- header) extra parameters would be needed in their APIs.
- */
+typedef unsigned short stridx_t; +#define GDK_VARSHIFT 3 +#define GDK_VARALIGN (1<<GDK_VARSHIFT) +#else +typedef var_t stridx_t; /* TODO: should also be unsigned short, but kept
at var_t not to break BAT images */
+#define GDK_VARSHIFT 0 +#define GDK_VARALIGN sizeof(stridx_t) +#endif
#define BUNhloc(bi,p) Hloc((bi).b,p) #define BUNtloc(bi,p) Tloc((bi).b,p) #define BUNhpos(bi,p) (Hpos(&(bi),p)) #define BUNtpos(bi,p) (Tpos(&(bi),p)) -#define BUNhvar(bi,p) ((bi).b- htype?(Hbase((bi).b)+*((var_t*)BUNhloc(bi,p))):BUNhpos(bi,p)) -#define BUNtvar(bi,p) ((bi).b- ttype?(Tbase((bi).b)+*((var_t*)BUNtloc(bi,p))):BUNtpos(bi,p)) +#define BUNhvar(bi,p) ((bi).b- htype?(Hbase((bi).b)+((*(var_t*)BUNhloc(bi,p))<<GDK_VARSHIFT)):BUNhpos(bi,
p))
+#define BUNtvar(bi,p) ((bi).b- ttype?(Tbase((bi).b)+((*(var_t*)BUNtloc(bi,p))<<GDK_VARSHIFT)):BUNtpos(bi,
p))
#define BUNhead(bi,p) ((bi).b- hvarsized?BUNhvar(bi,p):BUNhloc(bi,p)) #define BUNtail(bi,p) ((bi).b- tvarsized?BUNtvar(bi,p):BUNtloc(bi,p))
U gdk_atoms.mx Index: gdk_atoms.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_atoms.mx,v retrieving revision 1.161 retrieving revision 1.162 diff -u -d -r1.161 -r1.162 --- gdk_atoms.mx 7 Apr 2009 11:53:22 -0000 1.161 +++ gdk_atoms.mx 13 Apr 2009 23:36:24 -0000 1.162 @@ -175,7 +175,6 @@ gdk_export int strLen(const char *s); gdk_export int strCmp(str l, str r); gdk_export int strNil(str s); -gdk_export void strHeapConvert(Heap *h, int directon); gdk_export int strElimDoubles(Heap *h); gdk_export var_t strLocate(Heap *h, str v); gdk_export int strCmpNoNil(unsigned char *l, unsigned char *r); @@ -457,7 +456,7 @@ 0, 0, (var_t (*)(Heap *, var_t *, ptr)) strPut, 0, (int (*)(ptr)) strLen, strHeap,
(void (*)(Heap *, int)) strHeapConvert, 0},
(void (*)(Heap *, int)) 0, 0},
}; int GDKatomcnt = TYPE_str + 1;
@@ -931,24 +930,25 @@ } \ } while (0)
-#define GDK_STRHASHTABLE (1<<10) +/* string heaps:
- strings are 8 byte aligned
- start with a 1024 bucket hash table
- heaps < 64KB are fully duplicate eliminated with this hash tables
- heaps >= 64KB are opportunistically (imperfect) duplicate
eliminated
- as only the last 128KB chunk is considered and there is no linked
list
- buckets and next pointers are unsigned short "indices"
- indices should be multiplied by 8 and takes from ELIMBASE to get an
offset
- Note that a 64KB chunk of the heap contains at most 8K 8-byte aligned
- strings. The 1K bucket list means that in worst load, the list length
is 8 (OK).
- */
+#define GDK_STRHASHTABLE (1<<10) /* 1024 */ #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1) -#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(var_t)) -#define GDK_STRHASHCREDIT(h) (((var_t*) (h)- base)[GDK_STRHASHTABLE]) +#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t)) +#define GDK_ELIMPOWER 16 /* 64KB is the threshold */ #define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT) -#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) +#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently:
ELIMBASE == 0 */
#define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) <<
GDK_ELIMPOWER)
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define GDK_ELIMPOWER 16 /* makes for a max 64KB hash
table
* ie 256 string bytes per hash bucket
* ~ 4 strings of UP4(8<=len<=11)=12 + 4
bytes
*/
-#else -#define GDK_ELIMPOWER 17 /* makes for a max 128KB hash
table
* ie 512 string bytes per hash bucket
* ~ 5 strings of UP8(8<=len<=15)=16 + 8
bytes
*/
-#endif
@- Atomic ADT functions @c @@ -1767,46 +1767,34 @@ Because in many typical situations lots of double string values occur in tables, the string insertion provides automatic double elimination. To do this, a GDK_STRHASHTABLE(=1024) bucket hashtable is hidden in the
first
-4096 (8192 on 64-bit architectures) bytes of the string heap, consisting
of integer offsets of the first
-string hashing to that bucket in the heap. Furthermore, the first 4(8)
bytes
-before each string in the heap is an integer offset to the next string
hashing
-to the same number. +4096 bytes of the string heap, consisting an offset to the first string +hashing to that bucket in the heap. +These offsets are made small (stridx_t is an unsigned short) by
exploiting
+the fact that the double elimination chunks are (now) 64KB, hence a
short
+suffices.
-However, in many other situations the cardinality of string columns is
large,
+In many other situations the cardinality of string columns is large, or the string values might even be unique. In those cases, our fixed-
size hash
table will start to overflow quickly. Therefore, after the hash table is
full
(this is measured very simplistically by looking whether the string heap
exceeds a
-heap size = GDK_ELIMLIMIT -- done this way to keep compatibility with
old bat images)
-we flush the hash table. If one views the string heaps as consecutive
chunks
-of size GDK_ELIMLIMIT bytes, then all strings within one chunk are
double-eliminated.
-There is a macro GDK_ELIMBASE(offset) that computes the base of the
chunk in which
-a certain byte-offset falls. -@- -This is a departure from our previous policy of not looking at the hash
tables at
-all after overflow occurred. The advantage of the new approach is that
if we have
-a value distribution that is skewed (ie some values are very frequent),
these
-values will always be double eliminated, saving a considerable amount of
space.
-Disadvantage of the approach is that we always have to reserve space for
the next
-pointer (4(8) byte integer offset) that is stored right in front of the
string (and
-consequently have to keep all string chunks and offsets aligned to
4(8)). All this
-translates into some wasted space. However, if there are that many
different strings
-that the hash table overflows, the strings must be relatively long and
the relative
-storage overhead should be low. +heap size = GDK_ELIMLIMIT = 64KB) we flush the hash table. Even more,
from that moment
+on, we do not use a linked list, but a lossy hash table that just
contains
+the last position for each bucket. Basically, after exceeding
GDK_ELIMLIMIT,
+we get a probabilistic/opportunistic duplicate elimination mechanism, +that only looks at the last GDK_ELIMLIMIT chunk in the heap, in a lossy
way.
+When comparing with the previous string implementation, the biggest
difference
+is that on 64-nbits bt with 32-bits oids, strings are always 8-byte
aligned
+and var_t numbers are multiplied by 8 to get the true offset. The goal
to do
+this is to allow 32-bits var_t on 64-bits systems to address 32GB (using
string
+alignment=8). For large database, the cost of padding (4 bytes avg) is
offset
+by the savings in var_t (allowing to go from 64- to 32-bits). Nothing
lost there,
+and 32-bits var_t also pay in smaller OIDs and smaller hash tables,
reducing memory
+pressure. For small duplicate eliminated heaps, the short indices +used in the hash table have now allowed more buckets (2K instead of 1K) +and average 2 bytes overhead for the next pointers instead of 6-12.
Therefore
+small heaps are now more compact than before. @- -Notice that this mechanism enables to keep a certain linear storage
property
-in the string heaps. This is important if we want to take a BATslice on
a BAT
-by simply loading or @strong{mmap()}ping slices of the BAT files on disk
into memory.
-This is relevant in order to process a very large BAT iteratively by
taking slices
-in order to reduce memory consumption. Notice that if there are few
different string
-values, the hash table has not overflowed, and the string heap size will
be small
-(i.e. < GDK_ELIMLIMIT), so in those cases it is not a problem to load
the entire string heap.
-If the hash table @strong{has} overflowed, we want to be able to only
map a slice of the
-string heap as well. Now, given that the first string in the BAT-slice
is called F1
-and its heap offset is O1 and the last string in the slice is F2 and its -offset is O2, then the slice we should take from the string heap is: -@example -GDK_ELIMBASE(F1) .. MAX(GDK_ELIMBASE(F2)+GDK_ELIMLIMIT), O2+strlen(F2)) -@end example The routine strElimDoubles() can be used to check whether all strings are still being double-eliminated in the original hash-table. Only then we know that unequal offset-integers in the BUN array means @@ -1877,16 +1865,12 @@ strHeap(Heap *d, size_t cap) { size_t size;
var_t *h, *e;
cap = MAX(cap, BATTINY);
size = (GDK_STRHASHTABLE + 1) * sizeof(var_t) + MIN(GDK_ELIMLIMIT,
cap * 12);
- size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap *
GDK_VARALIGN);
if (HEAPalloc(d, size, 1) >= 0) {
d->free = GDK_STRHASHTABLE * sizeof(var_t);
h = (var_t *) d->base;
for (e = h; e < h + GDK_STRHASHTABLE; e++) {
*e = 0;
}
d->free = GDK_STRHASHTABLE * sizeof(stridx_t);
}memset(d->base, 0, d->free);
}
@@ -1923,42 +1907,10 @@ void strCleanHash(Heap *h, int rebuild) {
- (void) rebuild; if (!GDK_ELIMDOUBLES(h)) { /* flush hash table for security */ memset(h->base, 0, GDK_STRHASHSIZE);
- } else if (rebuild) {
var_t xx, cur = 0, end = 0;
str hash = (str) h->base;
-/* int cnt[GDK_STRHASHTABLE]; */
/* collect all values in one big linked list */
for (xx = 0; xx < GDK_STRHASHTABLE; xx++) {
var_t yy = ((var_t *) hash)[xx];
-/* cnt[xx]=0; */
((var_t *) hash)[xx] = 0; /* clear hash table slot
*/
if (end) {
*(var_t *) (hash + end) = yy;
} else {
cur = yy;
}
for (; yy; yy = *(var_t *) (hash + yy))
end = yy;
}
/* process the linked list, inserting the values again */
for (; cur; cur = end) {
str val = hash + cur;
GDK_STRHASH(val + sizeof(var_t), xx);
xx &= GDK_STRHASHMASK;
end = *(var_t *) val;
*(var_t *) val = ((var_t *) hash)[xx];
((var_t *) hash)[xx] = cur;
-/* cnt[xx]++; */
}
-/* for(xx=0; xx<GDK_STRHASHTABLE; xx++)
- if(cnt[xx]) stream_printf(GDKout, "%5d %6d", xx, cnt[xx]); */ }
}
@@ -1970,90 +1922,63 @@ var_t strLocate(Heap *h, str v) {
- var_t *htab = (var_t *) h->base;
- var_t *l, *e;
- BUN idx;
- GDK_STRHASH(v, idx);
- e = htab + (idx & GDK_STRHASHMASK);
- if (*e) {
for (l = e; *l && *l < h->free; l = (var_t *) ((char *) h->base
- *l)) {
str x = (str) ((char *) h->base + *l + sizeof(var_t));
if (GDK_STRCMP(v, x) == 0) {
return *l + (var_t) sizeof(var_t);
}
}
- }
- return 0;
-}
- stridx_t *ref, *next;
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define normal_vart_SWAP(x) ((var_t) normal_int_SWAP((int)x)) -#else -#define normal_vart_SWAP(x) ((var_t) long_long_SWAP((lng)x)) -#endif
- /* search hash-table, if double-elimination is still in place */
- BUN off; GDK_STRHASH(v, off);
- off &= GDK_STRHASHMASK;
-/* convert the integers in the implicit hash table structure */ -void -strHeapConvert(Heap *h, int dir) -{
- var_t *htab = (var_t *) h->base;
- var_t *l, i, j;
- /* should only use strLocate iff fully double eliminated */
- assert(GDK_ELIMBASE(h->free) == 0);
- if (dir == CONV_HTON) {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l =
(var_t *) (h->base + j)) {
*l = normal_vart_SWAP(j);
}
}
- } else {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l =
(var_t *) (h->base + *l)) {
*l = normal_vart_SWAP(j);
}
}
- /* search the linked list */
- for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0)
return (sizeof(stridx_t) + *ref) >> GDK_VARSHIFT; /*
found */
}
- return 0;
}
var_t strPut(Heap *h, var_t *dst, str v) {
- var_t *l;
- size_t i = GDK_STRLEN(v);
- BUN off;
- /* round up to var_t-byte alignment + var_t (next pointer) */
- size_t len = ((i + sizeof(var_t) - 1) & ~(sizeof(var_t) - 1)) +
sizeof(var_t);
- size_t elimlimit = GDK_ELIMBASE(h->free) + GDK_ELIMLIMIT;
size_t elimbase = GDK_ELIMBASE(h->free);
size_t pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN-1));
size_t pos, len = GDK_STRLEN(v);
stridx_t *bucket, *ref, *next;
/* search hash-table, if double-elimination is still in place */
- GDK_STRHASH(v, off);
- BUN off; GDK_STRHASH(v, off); off &= GDK_STRHASHMASK;
- for (l = ((var_t *) h->base) + off; *l && *l < h->free; l = (var_t *)
(h->base + *l)) {
str x = (str) (h->base + *l + sizeof(var_t));
- bucket = ((stridx_t *) h->base) + off;
if (GDK_STRCMP(v, x) == 0) {
*dst = *l + (var_t) sizeof(var_t); /* already in
heap;
do not insert! */
if (GDK_ELIMDOUBLES(h) == 0 && GDK_STRHASHCREDIT(h))
GDK_STRHASHCREDIT(h) += 4;
return *dst;
- if (elimbase == 0) { /* small string heap (<64KB) -- fully double
eliminated */
for (ref = bucket; *ref; ref = next) { /* search the linked
list */
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0) { /* found */
pos = sizeof(stridx_t) + *ref;
return *dst = (pos >> GDK_VARSHIFT);
}}
- }
- /* flush the hash table if it becomes too big (implies
!GDK_ELIMDOUBLES) */
- if (h->free + len >= elimlimit) {
/* if we are not hash-inserting anymore, h->free may no longer
be var_t aligned */
size_t mask = h->free & (sizeof(var_t) - 1);
if (mask)
h->free += sizeof(var_t) - mask; /* realign */
memset(h->base, 0, GDK_STRHASHSIZE); /* start over hash
inserting in a pristine hash table */
GDK_STRHASHCREDIT(h) = 32; /* only tolerate limited misses
in the future */
/* is there room for the next pointer in the padding space? */
if (pad < sizeof(stridx_t))
pad += GDK_VARALIGN; /* if not, pad more */
- } else {
/* large string heap (>=64KB) -- opportunistic/probabilistic
double elimination */
pos = elimbase + *bucket;
if (GDK_STRCMP(v, (str) (h->base + pos)) == 0) {
return *dst = (pos >> GDK_VARSHIFT); /* already in heap;
do not insert! */
}
+#if SIZEOF_VAR_T < SIZEOF_VOID_P
pad &= (GDK_VARALIGN-1); /* a full VARALIGN pad can be omitted
*/
+#else
pad = 0; /* only 32-bits var_t in 64-bits needs padding */
+#endif }
/* check heap for space (limited to a certain maximum after which
nils are inserted) */
- if (h->free + len >= h->size) {
- if (h->free + pad + len >= h->size) { /* Something really strange happens here, In a special case (Pentium II Klamath, gcc version 2.96 20000731, GNU assembler version 2.10.90 using BFD version 2.10.0.18)
@@ -2064,11 +1989,15 @@ */ float batmargin = (float) BATMARGIN; float hnewsize = h->size * batmargin;
size_t newsize = len + (size_t) hnewsize;
size_t newsize = pad + len + (size_t) hnewsize;
assert(newsize);
if (h->free + len < h->maxsize) {
if (h->free + pad + len >= (((size_t) VAR_MAX) <<
GDK_VARSHIFT)) {
GDKerror("strPut: string heaps gets larger than
%dGB.\n",
(((size_t) VAR_MAX) << GDK_VARSHIFT) >> 30);
return 0;
}
if (h->free + pad + len < h->maxsize) { /* if there is reserved space, first use the reserved
space */
newsize = MIN(newsize, h->maxsize); }
@@ -2077,32 +2006,27 @@ } /* fill should solve initialisation problems within valgrind */ memset(h->base + h->free, 0, h->size - h->free);
}
if (!GDK_ELIMDOUBLES(h)) {
if (GDK_STRHASHCREDIT(h) == 0) {
/* if credits are gone, we do not hash insert at all */
if (h->free > VAR_MAX) {
GDKerror("strPut: string heaps gets larger than
2GB
-- too large for 32-bits oids.\n");
return 0;
}
*dst = (var_t) h->free;
memcpy(h->base + h->free, v, i);
h->free += i; /* in this case, we do not round to
var_t either */
return *dst;
}
GDK_STRHASHCREDIT(h)--;
/* make bucket point into the enw heap */
}bucket = ((stridx_t *) h->base) + off;
- /* insert string in hash table and copy into the heap */
- l = (var_t *) (h->base + h->free);
- *(l++) = ((var_t *) h->base)[off];
- assert(h->free + sizeof(var_t) <= VAR_MAX);
- ((var_t *) h->base)[off] = (var_t) h->free;
- *dst = (var_t) (h->free + sizeof(var_t));
- h->free += len;
- memcpy((char *) l, v, i);
- /* insert string */
- pos = h->free + pad;
- *dst = pos >> GDK_VARSHIFT;
- memcpy(h->base + pos, v, len);
- h->free += pad + len;
- /* maintain hash table */
- if (elimbase == 0) { /* small string heap: link the next pointer */
pos -= sizeof(stridx_t); /* the stridx_t next pointer directly
precedes the string */
*(stridx_t*) (h->base + pos) = *bucket;
- }
- *bucket = (stridx_t) (pos - elimbase); /* set bucket to the new
string */
- if (h->free >= elimbase + GDK_ELIMLIMIT) {
memset(h->base, 0, GDK_STRHASHSIZE); /* flush hash table */
- } return *dst;
}
U gdk_bbp.mx Index: gdk_bbp.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bbp.mx,v retrieving revision 1.252 retrieving revision 1.253 diff -u -d -r1.252 -r1.253 --- gdk_bbp.mx 9 Apr 2009 15:29:18 -0000 1.252 +++ gdk_bbp.mx 13 Apr 2009 23:36:24 -0000 1.253 @@ -2965,9 +2965,9 @@ BATsetcount(&bs->B, 0);
if (bs->H.vheap)
memset(bs->H.vheap->base, 0, bs->H.vheap->free =
GDK_STRHASHTABLE * sizeof(var_t));
memset(bs->H.vheap->base, 0, bs->H.vheap->free =
GDK_STRHASHTABLE * sizeof(stridx_t));
if (bs->T.vheap)
memset(bs->T.vheap->base, 0, bs->T.vheap->free =
GDK_STRHASHTABLE * sizeof(var_t));
memset(bs->T.vheap->base, 0, bs->T.vheap->free =
GDK_STRHASHTABLE * sizeof(stridx_t));
return bs;
} BATDEBUG THRprintf(GDKout, "#BBPrecycle %d %d "SZFMT" (%d, %d %d %d)
N2\n", ht, tt, cap, BATCACHE_NOTYPE(ht), BATCACHE_NOTYPE(tt), batcache_maxbuckets, bin);
U gdk_batop.mx Index: gdk_batop.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_batop.mx,v retrieving revision 1.170 retrieving revision 1.171 diff -u -d -r1.170 -r1.171 --- gdk_batop.mx 7 Jan 2009 14:13:48 -0000 1.170 +++ gdk_batop.mx 13 Apr 2009 23:36:24 -0000 1.171 @@ -1656,7 +1656,7 @@ if (b->hkey == 0) { /* sorted and key? */ while (cur < end) {
char *val = base + *(((var_t
*)b->H-
heap.base)+ cur);
char *val = base + (*(((var_t
*)b->H-
heap.base)+ cur) << GDK_VARSHIFT);
if (cmp(prv, val) @5= 0) { key = FALSE;
@@ -1668,7 +1668,7 @@ } /* sorted? */ while (cur < end) {
char *val = base + *(((var_t *)b->H-
heap.base)+ cur);
char *val = base + (*(((var_t *)b->H-
heap.base)+ cur) << GDK_VARSHIFT);
if (cmp(prv, val) @5 0) { /* record negative position info
*/
U gdk_utils.mx Index: gdk_utils.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_utils.mx,v retrieving revision 1.246 retrieving revision 1.247 diff -u -d -r1.246 -r1.247 --- gdk_utils.mx 9 Apr 2009 18:48:25 -0000 1.246 +++ gdk_utils.mx 13 Apr 2009 23:36:24 -0000 1.247 @@ -382,7 +382,7 @@ #define GDK_MEM_NULLALLOWED
#if SIZEOF_VOID_P==8 -#define GDK_VM_MAXSIZE LL_CONSTANT(137438953472) /* :-) a 64-bit
OS: 128 GB */
+#define GDK_VM_MAXSIZE LL_CONSTANT(4398046511104) /* :-) a 64-bit
OS: 4TB */
#else #define GDK_VM_MAXSIZE LL_CONSTANT(1610612736) /* :-| a 32-bit OS:
1.5GB */
#endif
U gdk_heap.mx Index: gdk_heap.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_heap.mx,v retrieving revision 1.117 retrieving revision 1.118 diff -u -d -r1.117 -r1.118 --- gdk_heap.mx 20 Mar 2009 11:59:57 -0000 1.117 +++ gdk_heap.mx 13 Apr 2009 23:36:24 -0000 1.118 @@ -75,8 +75,20 @@ nice). @{ @c -int -HEAPalloc(Heap *h, size_t nitems, size_t itemsize) +size_t HEAPmargin(size_t maxsize) {
- size_t ret;
+#if SIZEOF_VOID_P == 8
- /* in 64-bits systems, try to enforce in-place realloc, but provoke
the memcpy on 256MB, then 4GB */
- size_t use = GDKvm_cursize();
- ret = MIN(GDK_mem_maxsize, MAX(((size_t) 1) << 26, 16*maxsize));
- if ((ret+ret) > (GDK_vm_maxsize-MIN(GDK_vm_maxsize,use))) /* only if
room */
+#endif
- ret = ((double) BATMARGIN)*maxsize - 1; /* do not waste VM on 32-bits
*/
- return (1 + (MAX(maxsize,ret) >> 16)) << 16; /* round up to 64K */
+}
+/* in 64-bits space, use very large margins to accomodate reallocations
*/
+int HEAPalloc(Heap *h, size_t nitems, size_t itemsize) { char nme[PATHLENGTH], *ext = NULL;
@@ -98,8 +110,7 @@ /* when using anonymous vm we malloc we need 64K chunks, also we * 20% extra malloc */ if (h->size > GDK_mem_bigsize) {
h->maxsize = (size_t) ((double) h->maxsize * BATMARGIN) - 1;
h->maxsize = (1 + (h->maxsize >> 16)) << 16;
} if (h->filename == NULL || (h->size < GDK_mmap_minsize)) { h->storage = STORE_MEM;h->maxsize = HEAPmargin(h->maxsize);
@@ -169,17 +180,14 @@ /* extend a malloced heap, possibly switching over to file-
mapped storage */
Heap bak = *h; int can_mmap = h->filename && (size >= GDK_mem_bigsize || h-
newstorage != STORE_MEM);
int must_mmap = can_mmap && (size >= GDK_mmap_minsize || h-
newstorage != STORE_MEM);
int small_cpy = (h->size*4 < size) && (size >=
GDK_mmap_minsize);
int must_mmap = can_mmap && (small_cpy || h->newstorage !=
STORE_MEM);
h->size = size; if (can_mmap) { /* in anonymous vm, if have to realloc anyway, we
reserve
some extra space */
if (size > h->maxsize) {
h->maxsize = (size_t) ((double) size *
BATMARGIN);
}
/* when using anonymous vm we malloc we need 64K chunks
*/
h->maxsize = (1 + ((h->maxsize - 1) >> 16)) << 16;
} else { h->maxsize = size; /* for normal GDKmalloc, maxsizeh->maxsize = HEAPmargin(MAX(size,h->maxsize));
= size */
}
@@ -514,9 +522,9 @@ #define HEAPVERSION 20030408
typedef struct heapheader {
- var_t head; /* index to first free block */
- size_t head; /* index to first free block */ int alignment; /* alignment of objects on heap */
- var_t firstblock; /* first block in heap */
- size_t firstblock; /* first block in heap */ int version; int (*sizefcn) (ptr); /* ADT function to ask length */
} HEADER32; @@ -524,8 +532,8 @@ typedef struct { int version; int alignment;
- var_t head;
- var_t firstblock;
- size_t head;
- size_t firstblock; int (*sizefcn) (ptr);
} HEADER64;
@@ -537,8 +545,8 @@ typedef HEADER64 HEADER_OTHER; #endif typedef struct hfblock {
- var_t size; /* Size of this block in freelist */
- var_t next; /* index of next block */
- size_t size; /* Size of this block in freelist */
- size_t next; /* index of next block */
} CHUNK;
@c @@ -546,13 +554,13 @@ #define roundup_4(x) (((x)+3)&~3) #define blocksize(h,p) ((p)->size)
-static INLINE var_t -roundup_num(var_t number, int alignment) +static INLINE size_t +roundup_num(size_t number, int alignment) {
- var_t rval;
- size_t rval;
- rval = number + (var_t) alignment - 1;
- rval -= (rval % (var_t) alignment);
- rval = number + (size_t) alignment - 1;
- rval -= (rval % (size_t) alignment); return rval;
}
@@ -560,7 +568,7 @@ HEAP_private(Heap *h) { (void) h;
- return (var_t) roundup_8(sizeof(HEADER));
- return (var_t) (roundup_8(sizeof(HEADER)) >> GDK_VARSHIFT);
}
#ifdef TRACE @@ -568,7 +576,7 @@ HEAP_printstatus(Heap *heap) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t block, cur_free = hheader->head;
size_t block, cur_free = hheader->head; CHUNK *blockp;
THRprintf(GDKout, "#HEAP has head " SZFMT " and alignment %d and size
" SZFMT "\n",
@@ -591,7 +599,7 @@ cur_free = blockp->next; block += blockp->size; } else {
var_t size = blocksize(hheader, blockp);
size_t size = blocksize(hheader, blockp); THRprintf(GDKout, "# block at " SZFMT " with size "
SZFMT "\n", block, size);
block += size;
@@ -611,7 +619,7 @@ /* // Calculate position of first and only free block. */
- var_t head = roundup_num((var_t) (roundup_8(sizeof(HEADER)) +
roundup_8(nprivate)), alignment);
- size_t head = roundup_num((size_t) (roundup_8(sizeof(HEADER)) +
roundup_8(nprivate)), alignment);
CHUNK *headp = HEAP_index(heap, head, CHUNK);
assert(roundup_8(sizeof(HEADER)) + roundup_8(nprivate) <= VAR_MAX); @@ -629,7 +637,7 @@ // Fill first free block. */ assert(heap->size - head <= VAR_MAX);
- headp->size = (var_t) (heap->size - head);
- headp->size = (size_t) (heap->size - head); headp->next = 0;
#ifdef TRACE THRprintf(GDKout, "#We created the following heap\n"); @@ -669,9 +677,9 @@
var_t -HEAP_malloc(Heap *heap, var_t nbytes) +HEAP_malloc(Heap *heap, size_t nbytes) {
- var_t block, trail, ttrail;
- size_t block, trail, ttrail; CHUNK *blockp; CHUNK *trailp; HEADER *hheader = HEAP_index(heap, 0, HEADER);
@@ -682,15 +690,9 @@
/* add space for size field */ nbytes += hheader->alignment;
- if (hheader->alignment == 8) {
nbytes = roundup_8(nbytes);
- } else if (hheader->alignment == 4) {
nbytes = roundup_4(nbytes);
- } else {
GDKfatal("HEAP_malloc: Heap structure corrupt\n");
- }
- nbytes = roundup_8(nbytes); if (nbytes < sizeof(CHUNK))
nbytes = (var_t) sizeof(CHUNK);
nbytes = (size_t) sizeof(CHUNK);
/* // block -- points to block with acceptable size (if available).
@@ -718,12 +720,12 @@ // If no block of acceptable size is found we try to enlarge the
heap.
*/
if (block == 0) {
var_t newsize;
size_t newsize;
assert(heap->free + MAX(heap->free, nbytes) <= VAR_MAX);
newsize = (var_t) roundup_8(heap->free + MAX(heap->free,
nbytes));
newsize = (size_t) roundup_8(heap->free + MAX(heap->free,
nbytes));
assert(heap->free <= VAR_MAX);
block = (var_t) heap->free; /* current end-of-heap */
block = (size_t) heap->free; /* current end-of-heap */
#ifdef TRACE THRprintf(GDKout, "#No block found\n"); @@ -747,7 +749,7 @@
blockp->next = 0; assert(heap->free - block <= VAR_MAX);
blockp->size = (var_t) (heap->free - block); /* determine
size of allocated block */
blockp->size = (size_t) (heap->free - block); /* determine
size of allocated block */
/* // Try to join the last block in the freelist and the newly
allocated
@@ -778,7 +780,7 @@ // TUNE: use different amount than 2*sizeof(CHUNK) */ if (blockp->size >= nbytes + 2 * sizeof(CHUNK)) {
var_t newblock = block + nbytes;
size_t newblock = block + nbytes;
CHUNK *newblockp = HEAP_index(heap, newblock, CHUNK);
newblockp->size = blockp->size - nbytes;
@@ -800,17 +802,17 @@ }
block += hheader->alignment;
- return block;
- return block >> GDK_VARSHIFT;
}
void -HEAP_free(Heap *heap, var_t block) +HEAP_free(Heap *heap, var_t mem) { HEADER *hheader = HEAP_index(heap, 0, HEADER); CHUNK *beforep; CHUNK *blockp; CHUNK *afterp;
- var_t after, before;
size_t after, before, block = mem << GDK_VARSHIFT;
if (hheader->alignment != 8 && hheader->alignment != 4) { GDKfatal("HEAP_free: Heap structure corrupt\n");
@@ -884,10 +886,10 @@ HEAP_check(Heap *heap, HeapRepair *hr) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t head = hheader->head, alignshift = 2;
- var_t block, nwords = (var_t) ((heap->free - 1) >> 7);
- size_t head = hheader->head, alignshift = 2;
- size_t block, nwords = (size_t) ((heap->free - 1) >> 7); int *freemask;
- var_t prevblock = 0;
size_t prevblock = 0; CHUNK *blockp;
hr->alignment = hheader->alignment;
@@ -922,8 +924,8 @@ // Walk the freelist; register them in freemask */ for (block = hheader->head; block != 0; block = HEAP_index(heap,
block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if ((block <= prevblock) && (block != 0)) {
@@ -942,8 +944,8 @@ */ block = hheader->firstblock; while (block < heap->free) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
hr->validmask[pos] |= mask;
@@ -965,8 +967,8 @@ // Check if there are left over free blocks */ for (block = hheader->head; block != 0; block = HEAP_index(heap,
block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if (freemask[pos] & mask) {
@@ -1046,14 +1048,14 @@ if (hheader->head > heap->free) { hheader->head = 0; /* cut off free block */ } else if (hheader->head) {
var_t idx = hheader->head;
size_t idx = hheader->head;
while (idx) { CHUNK *blk = HEAP_index(heap, idx, CHUNK);
if (idx + blk->size > heap->free) { assert(heap->free - idx <= VAR_MAX);
blk->size = (var_t) (heap->free - idx); /* cut
off illegal tail of block */
blk->size = (size_t) (heap->free - idx);
/* cut
off illegal tail of block */
} if (blk->next > heap->free || blk->next < (idx + blk-
size) || (blk->next & (hheader->alignment - 1))) { blk->next = 0; /* cut off next block */
U gdk_qsort.mx Index: gdk_qsort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_qsort.mx,v retrieving revision 1.34 retrieving revision 1.35 diff -u -d -r1.34 -r1.35 --- gdk_qsort.mx 7 Jan 2009 14:13:48 -0000 1.34 +++ gdk_qsort.mx 13 Apr 2009 23:36:24 -0000 1.35 @@ -86,6 +86,7 @@ char *offst; /* NULL or start of varsized heap */ int hshift; /* log2 of hs */ int tshift; /* log2 of hs */
- int vshift; /* shift to be applied on var_ offsets */ unsigned hs; /* width of head record */ unsigned ts; /* width of tail record */ void *h; /* start of head buns */
@@ -189,7 +190,7 @@ #endif #endif
-#define offset(p) (buf->offst + *(var_t*) (p)) +#define offset(p) (buf->offst + ((*(var_t*) (p)) << buf- vshift)) #define direct(p) (p)
#define any_LE(a,b) ((buf->cmp)(a,b) <= 0) @@ -432,6 +433,7 @@ buf.ts = (unsigned) ts; buf.hshift = ATOMelmshift(hs); buf.tshift = ATOMelmshift(ts);
- buf.vshift = ATOMvarsized(tpe)?GDK_VARSHIFT:0; buf.h = h; if (!t) t = &x;
U gdk_bat.mx Index: gdk_bat.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bat.mx,v retrieving revision 1.214 retrieving revision 1.215 diff -u -d -r1.214 -r1.215 --- gdk_bat.mx 3 Apr 2009 09:14:52 -0000 1.214 +++ gdk_bat.mx 13 Apr 2009 23:36:24 -0000 1.215 @@ -434,7 +434,7 @@ BAT * BATextend(BAT *b, BUN newcap) {
- size_t hheap_size, theap_size;
size_t hheap_size = newcap, theap_size = newcap;
assert(newcap <= BUN_MAX); BATcheck(b, "BATextend");
@@ -453,10 +453,10 @@
b->batCapacity = newcap;
- hheap_size = Hsize(b) * newcap;
- hheap_size *= Hsize(b); if (b->H->heap.base && HEAPextend(&b->H->heap, hheap_size) < 0) return NULL;
- theap_size = Tsize(b) * newcap;
- theap_size *= Tsize(b); if (b->T->heap.base && HEAPextend(&b->T->heap, theap_size) < 0) return NULL; HASHdestroy(b);
U gdk_ssort.mx Index: gdk_ssort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_ssort.mx,v retrieving revision 1.15 retrieving revision 1.16 diff -u -d -r1.15 -r1.16 --- gdk_ssort.mx 7 Jan 2009 14:13:48 -0000 1.15 +++ gdk_ssort.mx 13 Apr 2009 23:36:24 -0000 1.16 @@ -203,8 +203,8 @@ } \ } while (0)
-#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap +
- (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)->compare)((X),
(Y))) < 0)
-#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)- heap + * (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)- compare)((X), (Y))) > 0) +#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap +
((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) << GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) < 0)
+#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)- heap + ((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) <<
GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) > 0)
@= islt #define ISLT_@1@2(X, Y, ms) (* (@1 *) (X) @3 * (@1 *) (Y))
This SF.net email is sponsored by: High Quality Requirements in a Collaborative Environment. Download a free trial of Rational Requirements Composer Now! http://p.sf.net/sfu/www-ibm-com _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
-- Sjoerd Mullender
No virus found in this incoming message. Checked by AVG - www.avg.com Version: 8.0.238 / Virus Database: 270.11.20/2013 - Release Date: 04/14/09 06:17:00
i just wanted to add here, that most tijah-users use the
--enable-bits=64 --enable-oid32
configuration. however, this is not a really big group still, so it should be easy to tell them to re-index their collections. -henning
Peter Boncz wrote:
Hi Sjoerd,
Thanks for the question. I wrote earlier last night an email to this list, which I thought covers this.
The answer regarding backward compatibility is: (1) NO, on repositories created by MonetDB configures with --enable-bits=64 --enable-oid32 (2) YES, in all other cases.
It is my understanding that few people use (1). If the MonetDB team agrees, I would propose not to take additional migration measures. If otherwise, ie if we go to a migration anyway, we might also consider changing the new type stridx_t in the (2) cases from var_t to unsigned short. It reduces the fixed overhead of string heaps of small bats (2KB instead of 4KB on 32-bits, 8KB on 64 bits). I refrained from doing so to keep (2) backward compatible.
Peter
-----Original Message----- From: Sjoerd Mullender [mailto:sjoerd@acm.org] Sent: Tuesday, April 14, 2009 1:41 PM To: monetdb-developers@lists.sourceforge.net Cc: monetdb-checkins@lists.sourceforge.net Subject: Re: [Monetdb-checkins] MonetDB/src/gdk gdk.mx, , 1.279, 1.280 gdk_atoms.mx, , 1.161, 1.162 gdk_bat.mx, , 1.214, 1.215 gdk_batop.mx, , 1.170, 1.171 gdk_bbp.mx, , 1.252, 1.253 gdk_heap.mx, , 1.117, 1.118 gdk_qsort.mx, , 1.34, 1.35 gdk_ssort.mx, , 1.15, 1...
Is this a backward compatible change? In other words, can databases created before this change be read by the new version?
I really want backward compatibility, so if it isn't, some kind of conversion would be needed.
Peter Boncz wrote:
Update of /cvsroot/monetdb/MonetDB/src/gdk In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv5363/src/gdk
Modified Files: gdk.mx gdk_atoms.mx gdk_bat.mx gdk_batop.mx gdk_bbp.mx gdk_heap.mx gdk_qsort.mx gdk_ssort.mx gdk_utils.mx Log Message: support for 32GB string heaps in 64bits/oid32 mode (implies bat format change but ONLY for 64bits/oid32)
src/gdk/gdk.mx
- introduce GDK_VARSHIFT, var_t (str indices) are to be shifted that amount of bits to get to the real offset (padding is 8, in case of 64-bits and oid32 -- otherwise it is 0 => no change)
- clean up usage of var_t in HEAP_* interface
src/gdk/gdk_atoms.mx
- str heaps with difficult double lim distrubution do not maintain a linked list (direct open hashing only)
- allow internal string heap hash table pointers to be unsigned shorts instead of var_t (only switched on for 64bits/oid32)
- double-elim string heaps scaled back to 64KB (from 256-512KB)
- support for GDK_VARSHIFT
src/gdk/gdk_bat.mx
- bugfix in 64bits/oid32 offset calculation (precision loss averted)
src/gdk/gdk_batop.mx src/gdk/gdk_bbp.mx src/gdk/gdk_qsort.mx src/gdk/gdk_ssort.mx
- support for GDK_VARSHIFT
src/gdk/gdk_heap.mx
- HEAPmargin allocates large VM area after a block alloc in 64-bits (this to stimulate in-place HEAPextend without memcpy)
- clean up use of var_t/size_t in HEAP_* functions, and support for
GDK_VARSHIFT
src/gdk/gdk_utils.mx
- increase VM size for 64-bits systems to 4TB
U gdk.mx Index: gdk.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk.mx,v retrieving revision 1.279 retrieving revision 1.280 diff -u -d -r1.279 -r1.280 --- gdk.mx 3 Apr 2009 09:14:51 -0000 1.279 +++ gdk.mx 13 Apr 2009 23:36:24 -0000 1.280 @@ -1068,9 +1068,9 @@ @item void HEAP_destroy (Heap* h) @item var_t -HEAP_malloc (Heap* heap, var_t nbytes) +HEAP_malloc (Heap* heap, size_t nbytes) @item void -HEAP_free (Heap *heap, size_t block) +HEAP_free (Heap *heap, var_t block) @item int HEAP_private (Heap* h) @item void @@ -1111,7 +1111,7 @@ int (*sizefcn) (ptr) /*
BATatoms[].atomLen
function */
);
-gdk_export var_t HEAP_malloc(Heap *heap, var_t nbytes); +gdk_export var_t HEAP_malloc(Heap *heap, size_t nbytes); gdk_export void HEAP_free(Heap *heap, var_t block); gdk_export var_t HEAP_private(Heap *h); gdk_export void HEAP_checkformat(Heap *h); @@ -1350,12 +1350,37 @@ #define Hloc(b,p) ((b)->H->heap.base+((p)<<(b)->H->shift)) #define Tloc(b,p) ((b)->T->heap.base+((p)<<(b)->T->shift))
+#if SIZEOF_VAR_T < SIZEOF_VOID_P +/* NEW 11/4/2009: when compiled with 32-bits oids/var_t on 64-bits
systems, align heap strings
- on 8 byte boundaries always (wasting 4 padding bytes on avg). Note
that in heaps where
- duplicate elimination is succesful, such padding occurs anyway (as an
aside, a better
- implementation with two-bytes pointers in the string heap hash table,
could reduce that
- padding to avg 1 byte wasted -- see TODO below).
- This 8 byte alignment allows the offset in the fixed part of the BAT
string column to be
- interpreted as an index, which should be multiplied by 8 to get the
position (VARSHIFT). The
- overall effect is that 32GB heaps can be addressed even when oids are
limited to 4Gtuples.
- In the future, we might extend this such that the string alignment is
set in the BAT header
- (columns with long strings take more storage space, but could
tolerate more padding).
- It would mostly work, only the sort routine and strPut/strLocate
(which do not see the BAT
- header) extra parameters would be needed in their APIs.
- */
+typedef unsigned short stridx_t; +#define GDK_VARSHIFT 3 +#define GDK_VARALIGN (1<<GDK_VARSHIFT) +#else +typedef var_t stridx_t; /* TODO: should also be unsigned short, but kept
at var_t not to break BAT images */
+#define GDK_VARSHIFT 0 +#define GDK_VARALIGN sizeof(stridx_t) +#endif
#define BUNhloc(bi,p) Hloc((bi).b,p) #define BUNtloc(bi,p) Tloc((bi).b,p) #define BUNhpos(bi,p) (Hpos(&(bi),p)) #define BUNtpos(bi,p) (Tpos(&(bi),p)) -#define BUNhvar(bi,p) ((bi).b- htype?(Hbase((bi).b)+*((var_t*)BUNhloc(bi,p))):BUNhpos(bi,p)) -#define BUNtvar(bi,p) ((bi).b- ttype?(Tbase((bi).b)+*((var_t*)BUNtloc(bi,p))):BUNtpos(bi,p)) +#define BUNhvar(bi,p) ((bi).b- htype?(Hbase((bi).b)+((*(var_t*)BUNhloc(bi,p))<<GDK_VARSHIFT)):BUNhpos(bi,
p))
+#define BUNtvar(bi,p) ((bi).b- ttype?(Tbase((bi).b)+((*(var_t*)BUNtloc(bi,p))<<GDK_VARSHIFT)):BUNtpos(bi,
p))
#define BUNhead(bi,p) ((bi).b- hvarsized?BUNhvar(bi,p):BUNhloc(bi,p)) #define BUNtail(bi,p) ((bi).b- tvarsized?BUNtvar(bi,p):BUNtloc(bi,p))
U gdk_atoms.mx Index: gdk_atoms.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_atoms.mx,v retrieving revision 1.161 retrieving revision 1.162 diff -u -d -r1.161 -r1.162 --- gdk_atoms.mx 7 Apr 2009 11:53:22 -0000 1.161 +++ gdk_atoms.mx 13 Apr 2009 23:36:24 -0000 1.162 @@ -175,7 +175,6 @@ gdk_export int strLen(const char *s); gdk_export int strCmp(str l, str r); gdk_export int strNil(str s); -gdk_export void strHeapConvert(Heap *h, int directon); gdk_export int strElimDoubles(Heap *h); gdk_export var_t strLocate(Heap *h, str v); gdk_export int strCmpNoNil(unsigned char *l, unsigned char *r); @@ -457,7 +456,7 @@ 0, 0, (var_t (*)(Heap *, var_t *, ptr)) strPut, 0, (int (*)(ptr)) strLen, strHeap,
(void (*)(Heap *, int)) strHeapConvert, 0},
(void (*)(Heap *, int)) 0, 0},
}; int GDKatomcnt = TYPE_str + 1;
@@ -931,24 +930,25 @@ } \ } while (0)
-#define GDK_STRHASHTABLE (1<<10) +/* string heaps:
- strings are 8 byte aligned
- start with a 1024 bucket hash table
- heaps < 64KB are fully duplicate eliminated with this hash tables
- heaps >= 64KB are opportunistically (imperfect) duplicate
eliminated
- as only the last 128KB chunk is considered and there is no linked
list
- buckets and next pointers are unsigned short "indices"
- indices should be multiplied by 8 and takes from ELIMBASE to get an
offset
- Note that a 64KB chunk of the heap contains at most 8K 8-byte aligned
- strings. The 1K bucket list means that in worst load, the list length
is 8 (OK).
- */
+#define GDK_STRHASHTABLE (1<<10) /* 1024 */ #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1) -#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(var_t)) -#define GDK_STRHASHCREDIT(h) (((var_t*) (h)- base)[GDK_STRHASHTABLE]) +#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t)) +#define GDK_ELIMPOWER 16 /* 64KB is the threshold */ #define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT) -#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) +#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently:
ELIMBASE == 0 */
#define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) <<
GDK_ELIMPOWER)
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define GDK_ELIMPOWER 16 /* makes for a max 64KB hash
table
* ie 256 string bytes per hash bucket
* ~ 4 strings of UP4(8<=len<=11)=12 + 4
bytes
*/
-#else -#define GDK_ELIMPOWER 17 /* makes for a max 128KB hash
table
* ie 512 string bytes per hash bucket
* ~ 5 strings of UP8(8<=len<=15)=16 + 8
bytes
*/
-#endif
@- Atomic ADT functions @c @@ -1767,46 +1767,34 @@ Because in many typical situations lots of double string values occur in tables, the string insertion provides automatic double elimination. To do this, a GDK_STRHASHTABLE(=1024) bucket hashtable is hidden in the
first
-4096 (8192 on 64-bit architectures) bytes of the string heap, consisting
of integer offsets of the first
-string hashing to that bucket in the heap. Furthermore, the first 4(8)
bytes
-before each string in the heap is an integer offset to the next string
hashing
-to the same number. +4096 bytes of the string heap, consisting an offset to the first string +hashing to that bucket in the heap. +These offsets are made small (stridx_t is an unsigned short) by
exploiting
+the fact that the double elimination chunks are (now) 64KB, hence a
short
+suffices.
-However, in many other situations the cardinality of string columns is
large,
+In many other situations the cardinality of string columns is large, or the string values might even be unique. In those cases, our fixed-
size hash
table will start to overflow quickly. Therefore, after the hash table is
full
(this is measured very simplistically by looking whether the string heap
exceeds a
-heap size = GDK_ELIMLIMIT -- done this way to keep compatibility with
old bat images)
-we flush the hash table. If one views the string heaps as consecutive
chunks
-of size GDK_ELIMLIMIT bytes, then all strings within one chunk are
double-eliminated.
-There is a macro GDK_ELIMBASE(offset) that computes the base of the
chunk in which
-a certain byte-offset falls. -@- -This is a departure from our previous policy of not looking at the hash
tables at
-all after overflow occurred. The advantage of the new approach is that
if we have
-a value distribution that is skewed (ie some values are very frequent),
these
-values will always be double eliminated, saving a considerable amount of
space.
-Disadvantage of the approach is that we always have to reserve space for
the next
-pointer (4(8) byte integer offset) that is stored right in front of the
string (and
-consequently have to keep all string chunks and offsets aligned to
4(8)). All this
-translates into some wasted space. However, if there are that many
different strings
-that the hash table overflows, the strings must be relatively long and
the relative
-storage overhead should be low. +heap size = GDK_ELIMLIMIT = 64KB) we flush the hash table. Even more,
from that moment
+on, we do not use a linked list, but a lossy hash table that just
contains
+the last position for each bucket. Basically, after exceeding
GDK_ELIMLIMIT,
+we get a probabilistic/opportunistic duplicate elimination mechanism, +that only looks at the last GDK_ELIMLIMIT chunk in the heap, in a lossy
way.
+When comparing with the previous string implementation, the biggest
difference
+is that on 64-nbits bt with 32-bits oids, strings are always 8-byte
aligned
+and var_t numbers are multiplied by 8 to get the true offset. The goal
to do
+this is to allow 32-bits var_t on 64-bits systems to address 32GB (using
string
+alignment=8). For large database, the cost of padding (4 bytes avg) is
offset
+by the savings in var_t (allowing to go from 64- to 32-bits). Nothing
lost there,
+and 32-bits var_t also pay in smaller OIDs and smaller hash tables,
reducing memory
+pressure. For small duplicate eliminated heaps, the short indices +used in the hash table have now allowed more buckets (2K instead of 1K) +and average 2 bytes overhead for the next pointers instead of 6-12.
Therefore
+small heaps are now more compact than before. @- -Notice that this mechanism enables to keep a certain linear storage
property
-in the string heaps. This is important if we want to take a BATslice on
a BAT
-by simply loading or @strong{mmap()}ping slices of the BAT files on disk
into memory.
-This is relevant in order to process a very large BAT iteratively by
taking slices
-in order to reduce memory consumption. Notice that if there are few
different string
-values, the hash table has not overflowed, and the string heap size will
be small
-(i.e. < GDK_ELIMLIMIT), so in those cases it is not a problem to load
the entire string heap.
-If the hash table @strong{has} overflowed, we want to be able to only
map a slice of the
-string heap as well. Now, given that the first string in the BAT-slice
is called F1
-and its heap offset is O1 and the last string in the slice is F2 and its -offset is O2, then the slice we should take from the string heap is: -@example -GDK_ELIMBASE(F1) .. MAX(GDK_ELIMBASE(F2)+GDK_ELIMLIMIT), O2+strlen(F2)) -@end example The routine strElimDoubles() can be used to check whether all strings are still being double-eliminated in the original hash-table. Only then we know that unequal offset-integers in the BUN array means @@ -1877,16 +1865,12 @@ strHeap(Heap *d, size_t cap) { size_t size;
var_t *h, *e;
cap = MAX(cap, BATTINY);
size = (GDK_STRHASHTABLE + 1) * sizeof(var_t) + MIN(GDK_ELIMLIMIT,
cap * 12);
- size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap *
GDK_VARALIGN);
if (HEAPalloc(d, size, 1) >= 0) {
d->free = GDK_STRHASHTABLE * sizeof(var_t);
h = (var_t *) d->base;
for (e = h; e < h + GDK_STRHASHTABLE; e++) {
*e = 0;
}
d->free = GDK_STRHASHTABLE * sizeof(stridx_t);
}memset(d->base, 0, d->free);
}
@@ -1923,42 +1907,10 @@ void strCleanHash(Heap *h, int rebuild) {
- (void) rebuild; if (!GDK_ELIMDOUBLES(h)) { /* flush hash table for security */ memset(h->base, 0, GDK_STRHASHSIZE);
- } else if (rebuild) {
var_t xx, cur = 0, end = 0;
str hash = (str) h->base;
-/* int cnt[GDK_STRHASHTABLE]; */
/* collect all values in one big linked list */
for (xx = 0; xx < GDK_STRHASHTABLE; xx++) {
var_t yy = ((var_t *) hash)[xx];
-/* cnt[xx]=0; */
((var_t *) hash)[xx] = 0; /* clear hash table slot
*/
if (end) {
*(var_t *) (hash + end) = yy;
} else {
cur = yy;
}
for (; yy; yy = *(var_t *) (hash + yy))
end = yy;
}
/* process the linked list, inserting the values again */
for (; cur; cur = end) {
str val = hash + cur;
GDK_STRHASH(val + sizeof(var_t), xx);
xx &= GDK_STRHASHMASK;
end = *(var_t *) val;
*(var_t *) val = ((var_t *) hash)[xx];
((var_t *) hash)[xx] = cur;
-/* cnt[xx]++; */
}
-/* for(xx=0; xx<GDK_STRHASHTABLE; xx++)
- if(cnt[xx]) stream_printf(GDKout, "%5d %6d", xx, cnt[xx]); */ }
}
@@ -1970,90 +1922,63 @@ var_t strLocate(Heap *h, str v) {
- var_t *htab = (var_t *) h->base;
- var_t *l, *e;
- BUN idx;
- GDK_STRHASH(v, idx);
- e = htab + (idx & GDK_STRHASHMASK);
- if (*e) {
for (l = e; *l && *l < h->free; l = (var_t *) ((char *) h->base
- *l)) {
str x = (str) ((char *) h->base + *l + sizeof(var_t));
if (GDK_STRCMP(v, x) == 0) {
return *l + (var_t) sizeof(var_t);
}
}
- }
- return 0;
-}
- stridx_t *ref, *next;
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define normal_vart_SWAP(x) ((var_t) normal_int_SWAP((int)x)) -#else -#define normal_vart_SWAP(x) ((var_t) long_long_SWAP((lng)x)) -#endif
- /* search hash-table, if double-elimination is still in place */
- BUN off; GDK_STRHASH(v, off);
- off &= GDK_STRHASHMASK;
-/* convert the integers in the implicit hash table structure */ -void -strHeapConvert(Heap *h, int dir) -{
- var_t *htab = (var_t *) h->base;
- var_t *l, i, j;
- /* should only use strLocate iff fully double eliminated */
- assert(GDK_ELIMBASE(h->free) == 0);
- if (dir == CONV_HTON) {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l =
(var_t *) (h->base + j)) {
*l = normal_vart_SWAP(j);
}
}
- } else {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l =
(var_t *) (h->base + *l)) {
*l = normal_vart_SWAP(j);
}
}
- /* search the linked list */
- for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0)
return (sizeof(stridx_t) + *ref) >> GDK_VARSHIFT; /*
found */
}
- return 0;
}
var_t strPut(Heap *h, var_t *dst, str v) {
- var_t *l;
- size_t i = GDK_STRLEN(v);
- BUN off;
- /* round up to var_t-byte alignment + var_t (next pointer) */
- size_t len = ((i + sizeof(var_t) - 1) & ~(sizeof(var_t) - 1)) +
sizeof(var_t);
- size_t elimlimit = GDK_ELIMBASE(h->free) + GDK_ELIMLIMIT;
size_t elimbase = GDK_ELIMBASE(h->free);
size_t pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN-1));
size_t pos, len = GDK_STRLEN(v);
stridx_t *bucket, *ref, *next;
/* search hash-table, if double-elimination is still in place */
- GDK_STRHASH(v, off);
- BUN off; GDK_STRHASH(v, off); off &= GDK_STRHASHMASK;
- for (l = ((var_t *) h->base) + off; *l && *l < h->free; l = (var_t *)
(h->base + *l)) {
str x = (str) (h->base + *l + sizeof(var_t));
- bucket = ((stridx_t *) h->base) + off;
if (GDK_STRCMP(v, x) == 0) {
*dst = *l + (var_t) sizeof(var_t); /* already in
heap;
do not insert! */
if (GDK_ELIMDOUBLES(h) == 0 && GDK_STRHASHCREDIT(h))
GDK_STRHASHCREDIT(h) += 4;
return *dst;
- if (elimbase == 0) { /* small string heap (<64KB) -- fully double
eliminated */
for (ref = bucket; *ref; ref = next) { /* search the linked
list */
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0) { /* found */
pos = sizeof(stridx_t) + *ref;
return *dst = (pos >> GDK_VARSHIFT);
}}
- }
- /* flush the hash table if it becomes too big (implies
!GDK_ELIMDOUBLES) */
- if (h->free + len >= elimlimit) {
/* if we are not hash-inserting anymore, h->free may no longer
be var_t aligned */
size_t mask = h->free & (sizeof(var_t) - 1);
if (mask)
h->free += sizeof(var_t) - mask; /* realign */
memset(h->base, 0, GDK_STRHASHSIZE); /* start over hash
inserting in a pristine hash table */
GDK_STRHASHCREDIT(h) = 32; /* only tolerate limited misses
in the future */
/* is there room for the next pointer in the padding space? */
if (pad < sizeof(stridx_t))
pad += GDK_VARALIGN; /* if not, pad more */
- } else {
/* large string heap (>=64KB) -- opportunistic/probabilistic
double elimination */
pos = elimbase + *bucket;
if (GDK_STRCMP(v, (str) (h->base + pos)) == 0) {
return *dst = (pos >> GDK_VARSHIFT); /* already in heap;
do not insert! */
}
+#if SIZEOF_VAR_T < SIZEOF_VOID_P
pad &= (GDK_VARALIGN-1); /* a full VARALIGN pad can be omitted
*/
+#else
pad = 0; /* only 32-bits var_t in 64-bits needs padding */
+#endif }
/* check heap for space (limited to a certain maximum after which
nils are inserted) */
- if (h->free + len >= h->size) {
- if (h->free + pad + len >= h->size) { /* Something really strange happens here, In a special case (Pentium II Klamath, gcc version 2.96 20000731, GNU assembler version 2.10.90 using BFD version 2.10.0.18)
@@ -2064,11 +1989,15 @@ */ float batmargin = (float) BATMARGIN; float hnewsize = h->size * batmargin;
size_t newsize = len + (size_t) hnewsize;
size_t newsize = pad + len + (size_t) hnewsize;
assert(newsize);
if (h->free + len < h->maxsize) {
if (h->free + pad + len >= (((size_t) VAR_MAX) <<
GDK_VARSHIFT)) {
GDKerror("strPut: string heaps gets larger than
%dGB.\n",
(((size_t) VAR_MAX) << GDK_VARSHIFT) >> 30);
return 0;
}
if (h->free + pad + len < h->maxsize) { /* if there is reserved space, first use the reserved
space */
newsize = MIN(newsize, h->maxsize); }
@@ -2077,32 +2006,27 @@ } /* fill should solve initialisation problems within valgrind */ memset(h->base + h->free, 0, h->size - h->free);
}
if (!GDK_ELIMDOUBLES(h)) {
if (GDK_STRHASHCREDIT(h) == 0) {
/* if credits are gone, we do not hash insert at all */
if (h->free > VAR_MAX) {
GDKerror("strPut: string heaps gets larger than
2GB
-- too large for 32-bits oids.\n");
return 0;
}
*dst = (var_t) h->free;
memcpy(h->base + h->free, v, i);
h->free += i; /* in this case, we do not round to
var_t either */
return *dst;
}
GDK_STRHASHCREDIT(h)--;
/* make bucket point into the enw heap */
}bucket = ((stridx_t *) h->base) + off;
- /* insert string in hash table and copy into the heap */
- l = (var_t *) (h->base + h->free);
- *(l++) = ((var_t *) h->base)[off];
- assert(h->free + sizeof(var_t) <= VAR_MAX);
- ((var_t *) h->base)[off] = (var_t) h->free;
- *dst = (var_t) (h->free + sizeof(var_t));
- h->free += len;
- memcpy((char *) l, v, i);
- /* insert string */
- pos = h->free + pad;
- *dst = pos >> GDK_VARSHIFT;
- memcpy(h->base + pos, v, len);
- h->free += pad + len;
- /* maintain hash table */
- if (elimbase == 0) { /* small string heap: link the next pointer */
pos -= sizeof(stridx_t); /* the stridx_t next pointer directly
precedes the string */
*(stridx_t*) (h->base + pos) = *bucket;
- }
- *bucket = (stridx_t) (pos - elimbase); /* set bucket to the new
string */
- if (h->free >= elimbase + GDK_ELIMLIMIT) {
memset(h->base, 0, GDK_STRHASHSIZE); /* flush hash table */
- } return *dst;
}
U gdk_bbp.mx Index: gdk_bbp.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bbp.mx,v retrieving revision 1.252 retrieving revision 1.253 diff -u -d -r1.252 -r1.253 --- gdk_bbp.mx 9 Apr 2009 15:29:18 -0000 1.252 +++ gdk_bbp.mx 13 Apr 2009 23:36:24 -0000 1.253 @@ -2965,9 +2965,9 @@ BATsetcount(&bs->B, 0);
if (bs->H.vheap)
memset(bs->H.vheap->base, 0, bs->H.vheap->free =
GDK_STRHASHTABLE * sizeof(var_t));
memset(bs->H.vheap->base, 0, bs->H.vheap->free =
GDK_STRHASHTABLE * sizeof(stridx_t));
if (bs->T.vheap)
memset(bs->T.vheap->base, 0, bs->T.vheap->free =
GDK_STRHASHTABLE * sizeof(var_t));
memset(bs->T.vheap->base, 0, bs->T.vheap->free =
GDK_STRHASHTABLE * sizeof(stridx_t));
return bs;
} BATDEBUG THRprintf(GDKout, "#BBPrecycle %d %d "SZFMT" (%d, %d %d %d)
N2\n", ht, tt, cap, BATCACHE_NOTYPE(ht), BATCACHE_NOTYPE(tt), batcache_maxbuckets, bin);
U gdk_batop.mx Index: gdk_batop.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_batop.mx,v retrieving revision 1.170 retrieving revision 1.171 diff -u -d -r1.170 -r1.171 --- gdk_batop.mx 7 Jan 2009 14:13:48 -0000 1.170 +++ gdk_batop.mx 13 Apr 2009 23:36:24 -0000 1.171 @@ -1656,7 +1656,7 @@ if (b->hkey == 0) { /* sorted and key? */ while (cur < end) {
char *val = base + *(((var_t
*)b->H-
heap.base)+ cur);
char *val = base + (*(((var_t
*)b->H-
heap.base)+ cur) << GDK_VARSHIFT);
if (cmp(prv, val) @5= 0) { key = FALSE;
@@ -1668,7 +1668,7 @@ } /* sorted? */ while (cur < end) {
char *val = base + *(((var_t *)b->H-
heap.base)+ cur);
char *val = base + (*(((var_t *)b->H-
heap.base)+ cur) << GDK_VARSHIFT);
if (cmp(prv, val) @5 0) { /* record negative position info
*/
U gdk_utils.mx Index: gdk_utils.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_utils.mx,v retrieving revision 1.246 retrieving revision 1.247 diff -u -d -r1.246 -r1.247 --- gdk_utils.mx 9 Apr 2009 18:48:25 -0000 1.246 +++ gdk_utils.mx 13 Apr 2009 23:36:24 -0000 1.247 @@ -382,7 +382,7 @@ #define GDK_MEM_NULLALLOWED
#if SIZEOF_VOID_P==8 -#define GDK_VM_MAXSIZE LL_CONSTANT(137438953472) /* :-) a 64-bit
OS: 128 GB */
+#define GDK_VM_MAXSIZE LL_CONSTANT(4398046511104) /* :-) a 64-bit
OS: 4TB */
#else #define GDK_VM_MAXSIZE LL_CONSTANT(1610612736) /* :-| a 32-bit OS:
1.5GB */
#endif
U gdk_heap.mx Index: gdk_heap.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_heap.mx,v retrieving revision 1.117 retrieving revision 1.118 diff -u -d -r1.117 -r1.118 --- gdk_heap.mx 20 Mar 2009 11:59:57 -0000 1.117 +++ gdk_heap.mx 13 Apr 2009 23:36:24 -0000 1.118 @@ -75,8 +75,20 @@ nice). @{ @c -int -HEAPalloc(Heap *h, size_t nitems, size_t itemsize) +size_t HEAPmargin(size_t maxsize) {
- size_t ret;
+#if SIZEOF_VOID_P == 8
- /* in 64-bits systems, try to enforce in-place realloc, but provoke
the memcpy on 256MB, then 4GB */
- size_t use = GDKvm_cursize();
- ret = MIN(GDK_mem_maxsize, MAX(((size_t) 1) << 26, 16*maxsize));
- if ((ret+ret) > (GDK_vm_maxsize-MIN(GDK_vm_maxsize,use))) /* only if
room */
+#endif
- ret = ((double) BATMARGIN)*maxsize - 1; /* do not waste VM on 32-bits
*/
- return (1 + (MAX(maxsize,ret) >> 16)) << 16; /* round up to 64K */
+}
+/* in 64-bits space, use very large margins to accomodate reallocations
*/
+int HEAPalloc(Heap *h, size_t nitems, size_t itemsize) { char nme[PATHLENGTH], *ext = NULL;
@@ -98,8 +110,7 @@ /* when using anonymous vm we malloc we need 64K chunks, also we * 20% extra malloc */ if (h->size > GDK_mem_bigsize) {
h->maxsize = (size_t) ((double) h->maxsize * BATMARGIN) - 1;
h->maxsize = (1 + (h->maxsize >> 16)) << 16;
} if (h->filename == NULL || (h->size < GDK_mmap_minsize)) { h->storage = STORE_MEM;h->maxsize = HEAPmargin(h->maxsize);
@@ -169,17 +180,14 @@ /* extend a malloced heap, possibly switching over to file-
mapped storage */
Heap bak = *h; int can_mmap = h->filename && (size >= GDK_mem_bigsize || h-
newstorage != STORE_MEM);
int must_mmap = can_mmap && (size >= GDK_mmap_minsize || h-
newstorage != STORE_MEM);
int small_cpy = (h->size*4 < size) && (size >=
GDK_mmap_minsize);
int must_mmap = can_mmap && (small_cpy || h->newstorage !=
STORE_MEM);
h->size = size; if (can_mmap) { /* in anonymous vm, if have to realloc anyway, we
reserve
some extra space */
if (size > h->maxsize) {
h->maxsize = (size_t) ((double) size *
BATMARGIN);
}
/* when using anonymous vm we malloc we need 64K chunks
*/
h->maxsize = (1 + ((h->maxsize - 1) >> 16)) << 16;
} else { h->maxsize = size; /* for normal GDKmalloc, maxsizeh->maxsize = HEAPmargin(MAX(size,h->maxsize));
= size */
}
@@ -514,9 +522,9 @@ #define HEAPVERSION 20030408
typedef struct heapheader {
- var_t head; /* index to first free block */
- size_t head; /* index to first free block */ int alignment; /* alignment of objects on heap */
- var_t firstblock; /* first block in heap */
- size_t firstblock; /* first block in heap */ int version; int (*sizefcn) (ptr); /* ADT function to ask length */
} HEADER32; @@ -524,8 +532,8 @@ typedef struct { int version; int alignment;
- var_t head;
- var_t firstblock;
- size_t head;
- size_t firstblock; int (*sizefcn) (ptr);
} HEADER64;
@@ -537,8 +545,8 @@ typedef HEADER64 HEADER_OTHER; #endif typedef struct hfblock {
- var_t size; /* Size of this block in freelist */
- var_t next; /* index of next block */
- size_t size; /* Size of this block in freelist */
- size_t next; /* index of next block */
} CHUNK;
@c @@ -546,13 +554,13 @@ #define roundup_4(x) (((x)+3)&~3) #define blocksize(h,p) ((p)->size)
-static INLINE var_t -roundup_num(var_t number, int alignment) +static INLINE size_t +roundup_num(size_t number, int alignment) {
- var_t rval;
- size_t rval;
- rval = number + (var_t) alignment - 1;
- rval -= (rval % (var_t) alignment);
- rval = number + (size_t) alignment - 1;
- rval -= (rval % (size_t) alignment); return rval;
}
@@ -560,7 +568,7 @@ HEAP_private(Heap *h) { (void) h;
- return (var_t) roundup_8(sizeof(HEADER));
- return (var_t) (roundup_8(sizeof(HEADER)) >> GDK_VARSHIFT);
}
#ifdef TRACE @@ -568,7 +576,7 @@ HEAP_printstatus(Heap *heap) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t block, cur_free = hheader->head;
size_t block, cur_free = hheader->head; CHUNK *blockp;
THRprintf(GDKout, "#HEAP has head " SZFMT " and alignment %d and size
" SZFMT "\n",
@@ -591,7 +599,7 @@ cur_free = blockp->next; block += blockp->size; } else {
var_t size = blocksize(hheader, blockp);
size_t size = blocksize(hheader, blockp); THRprintf(GDKout, "# block at " SZFMT " with size "
SZFMT "\n", block, size);
block += size;
@@ -611,7 +619,7 @@ /* // Calculate position of first and only free block. */
- var_t head = roundup_num((var_t) (roundup_8(sizeof(HEADER)) +
roundup_8(nprivate)), alignment);
- size_t head = roundup_num((size_t) (roundup_8(sizeof(HEADER)) +
roundup_8(nprivate)), alignment);
CHUNK *headp = HEAP_index(heap, head, CHUNK);
assert(roundup_8(sizeof(HEADER)) + roundup_8(nprivate) <= VAR_MAX); @@ -629,7 +637,7 @@ // Fill first free block. */ assert(heap->size - head <= VAR_MAX);
- headp->size = (var_t) (heap->size - head);
- headp->size = (size_t) (heap->size - head); headp->next = 0;
#ifdef TRACE THRprintf(GDKout, "#We created the following heap\n"); @@ -669,9 +677,9 @@
var_t -HEAP_malloc(Heap *heap, var_t nbytes) +HEAP_malloc(Heap *heap, size_t nbytes) {
- var_t block, trail, ttrail;
- size_t block, trail, ttrail; CHUNK *blockp; CHUNK *trailp; HEADER *hheader = HEAP_index(heap, 0, HEADER);
@@ -682,15 +690,9 @@
/* add space for size field */ nbytes += hheader->alignment;
- if (hheader->alignment == 8) {
nbytes = roundup_8(nbytes);
- } else if (hheader->alignment == 4) {
nbytes = roundup_4(nbytes);
- } else {
GDKfatal("HEAP_malloc: Heap structure corrupt\n");
- }
- nbytes = roundup_8(nbytes); if (nbytes < sizeof(CHUNK))
nbytes = (var_t) sizeof(CHUNK);
nbytes = (size_t) sizeof(CHUNK);
/* // block -- points to block with acceptable size (if available).
@@ -718,12 +720,12 @@ // If no block of acceptable size is found we try to enlarge the
heap.
*/
if (block == 0) {
var_t newsize;
size_t newsize;
assert(heap->free + MAX(heap->free, nbytes) <= VAR_MAX);
newsize = (var_t) roundup_8(heap->free + MAX(heap->free,
nbytes));
newsize = (size_t) roundup_8(heap->free + MAX(heap->free,
nbytes));
assert(heap->free <= VAR_MAX);
block = (var_t) heap->free; /* current end-of-heap */
block = (size_t) heap->free; /* current end-of-heap */
#ifdef TRACE THRprintf(GDKout, "#No block found\n"); @@ -747,7 +749,7 @@
blockp->next = 0; assert(heap->free - block <= VAR_MAX);
blockp->size = (var_t) (heap->free - block); /* determine
size of allocated block */
blockp->size = (size_t) (heap->free - block); /* determine
size of allocated block */
/* // Try to join the last block in the freelist and the newly
allocated
@@ -778,7 +780,7 @@ // TUNE: use different amount than 2*sizeof(CHUNK) */ if (blockp->size >= nbytes + 2 * sizeof(CHUNK)) {
var_t newblock = block + nbytes;
size_t newblock = block + nbytes;
CHUNK *newblockp = HEAP_index(heap, newblock, CHUNK);
newblockp->size = blockp->size - nbytes;
@@ -800,17 +802,17 @@ }
block += hheader->alignment;
- return block;
- return block >> GDK_VARSHIFT;
}
void -HEAP_free(Heap *heap, var_t block) +HEAP_free(Heap *heap, var_t mem) { HEADER *hheader = HEAP_index(heap, 0, HEADER); CHUNK *beforep; CHUNK *blockp; CHUNK *afterp;
- var_t after, before;
size_t after, before, block = mem << GDK_VARSHIFT;
if (hheader->alignment != 8 && hheader->alignment != 4) { GDKfatal("HEAP_free: Heap structure corrupt\n");
@@ -884,10 +886,10 @@ HEAP_check(Heap *heap, HeapRepair *hr) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t head = hheader->head, alignshift = 2;
- var_t block, nwords = (var_t) ((heap->free - 1) >> 7);
- size_t head = hheader->head, alignshift = 2;
- size_t block, nwords = (size_t) ((heap->free - 1) >> 7); int *freemask;
- var_t prevblock = 0;
size_t prevblock = 0; CHUNK *blockp;
hr->alignment = hheader->alignment;
@@ -922,8 +924,8 @@ // Walk the freelist; register them in freemask */ for (block = hheader->head; block != 0; block = HEAP_index(heap,
block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if ((block <= prevblock) && (block != 0)) {
@@ -942,8 +944,8 @@ */ block = hheader->firstblock; while (block < heap->free) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
hr->validmask[pos] |= mask;
@@ -965,8 +967,8 @@ // Check if there are left over free blocks */ for (block = hheader->head; block != 0; block = HEAP_index(heap,
block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if (freemask[pos] & mask) {
@@ -1046,14 +1048,14 @@ if (hheader->head > heap->free) { hheader->head = 0; /* cut off free block */ } else if (hheader->head) {
var_t idx = hheader->head;
size_t idx = hheader->head;
while (idx) { CHUNK *blk = HEAP_index(heap, idx, CHUNK);
if (idx + blk->size > heap->free) { assert(heap->free - idx <= VAR_MAX);
blk->size = (var_t) (heap->free - idx); /* cut
off illegal tail of block */
blk->size = (size_t) (heap->free - idx);
/* cut
off illegal tail of block */
} if (blk->next > heap->free || blk->next < (idx + blk-
size) || (blk->next & (hheader->alignment - 1))) { blk->next = 0; /* cut off next block */
U gdk_qsort.mx Index: gdk_qsort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_qsort.mx,v retrieving revision 1.34 retrieving revision 1.35 diff -u -d -r1.34 -r1.35 --- gdk_qsort.mx 7 Jan 2009 14:13:48 -0000 1.34 +++ gdk_qsort.mx 13 Apr 2009 23:36:24 -0000 1.35 @@ -86,6 +86,7 @@ char *offst; /* NULL or start of varsized heap */ int hshift; /* log2 of hs */ int tshift; /* log2 of hs */
- int vshift; /* shift to be applied on var_ offsets */ unsigned hs; /* width of head record */ unsigned ts; /* width of tail record */ void *h; /* start of head buns */
@@ -189,7 +190,7 @@ #endif #endif
-#define offset(p) (buf->offst + *(var_t*) (p)) +#define offset(p) (buf->offst + ((*(var_t*) (p)) << buf- vshift)) #define direct(p) (p)
#define any_LE(a,b) ((buf->cmp)(a,b) <= 0) @@ -432,6 +433,7 @@ buf.ts = (unsigned) ts; buf.hshift = ATOMelmshift(hs); buf.tshift = ATOMelmshift(ts);
- buf.vshift = ATOMvarsized(tpe)?GDK_VARSHIFT:0; buf.h = h; if (!t) t = &x;
U gdk_bat.mx Index: gdk_bat.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bat.mx,v retrieving revision 1.214 retrieving revision 1.215 diff -u -d -r1.214 -r1.215 --- gdk_bat.mx 3 Apr 2009 09:14:52 -0000 1.214 +++ gdk_bat.mx 13 Apr 2009 23:36:24 -0000 1.215 @@ -434,7 +434,7 @@ BAT * BATextend(BAT *b, BUN newcap) {
- size_t hheap_size, theap_size;
size_t hheap_size = newcap, theap_size = newcap;
assert(newcap <= BUN_MAX); BATcheck(b, "BATextend");
@@ -453,10 +453,10 @@
b->batCapacity = newcap;
- hheap_size = Hsize(b) * newcap;
- hheap_size *= Hsize(b); if (b->H->heap.base && HEAPextend(&b->H->heap, hheap_size) < 0) return NULL;
- theap_size = Tsize(b) * newcap;
- theap_size *= Tsize(b); if (b->T->heap.base && HEAPextend(&b->T->heap, theap_size) < 0) return NULL; HASHdestroy(b);
U gdk_ssort.mx Index: gdk_ssort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_ssort.mx,v retrieving revision 1.15 retrieving revision 1.16 diff -u -d -r1.15 -r1.16 --- gdk_ssort.mx 7 Jan 2009 14:13:48 -0000 1.15 +++ gdk_ssort.mx 13 Apr 2009 23:36:24 -0000 1.16 @@ -203,8 +203,8 @@ } \ } while (0)
-#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap +
- (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)->compare)((X),
(Y))) < 0)
-#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)- heap + * (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)- compare)((X), (Y))) > 0) +#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap +
((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) << GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) < 0)
+#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)- heap + ((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) <<
GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) > 0)
@= islt #define ISLT_@1@2(X, Y, ms) (* (@1 *) (X) @3 * (@1 *) (Y))
This SF.net email is sponsored by: High Quality Requirements in a Collaborative Environment. Download a free trial of Rational Requirements Composer Now! http://p.sf.net/sfu/www-ibm-com _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
-- Sjoerd Mullender
No virus found in this incoming message. Checked by AVG - www.avg.com Version: 8.0.238 / Virus Database: 270.11.20/2013 - Release Date: 04/14/09 06:17:00
This SF.net email is sponsored by: High Quality Requirements in a Collaborative Environment. Download a free trial of Rational Requirements Composer Now! http://p.sf.net/sfu/www-ibm-com _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
i just wanted to add here, that most tijah-users use the --enable-bits=64 --enable-oid32 configuration. however, this is not a really big group still, so it should be easy to tell them to re-index their collections. -henning
On 14.04.2009, at 14:04, Peter Boncz wrote:
Hi Sjoerd,
Thanks for the question. I wrote earlier last night an email to this list, which I thought covers this.
The answer regarding backward compatibility is: (1) NO, on repositories created by MonetDB configures with --enable- bits=64 --enable-oid32 (2) YES, in all other cases.
It is my understanding that few people use (1). If the MonetDB team agrees, I would propose not to take additional migration measures. If otherwise, ie if we go to a migration anyway, we might also consider changing the new type stridx_t in the (2) cases from var_t to unsigned short. It reduces the fixed overhead of string heaps of small bats (2KB instead of 4KB on 32-bits, 8KB on 64 bits). I refrained from doing so to keep (2) backward compatible.
Peter
-----Original Message----- From: Sjoerd Mullender [mailto:sjoerd@acm.org] Sent: Tuesday, April 14, 2009 1:41 PM To: monetdb-developers@lists.sourceforge.net Cc: monetdb-checkins@lists.sourceforge.net Subject: Re: [Monetdb-checkins] MonetDB/src/gdk gdk.mx, , 1.279, 1.280 gdk_atoms.mx, , 1.161, 1.162 gdk_bat.mx, , 1.214, 1.215 gdk_batop.mx, , 1.170, 1.171 gdk_bbp.mx, , 1.252, 1.253 gdk_heap.mx, , 1.117, 1.118 gdk_qsort.mx, , 1.34, 1.35 gdk_ssort.mx, , 1.15, 1...
Is this a backward compatible change? In other words, can databases created before this change be read by the new version?
I really want backward compatibility, so if it isn't, some kind of conversion would be needed.
Peter Boncz wrote:
Update of /cvsroot/monetdb/MonetDB/src/gdk In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv5363/src/gdk
Modified Files: gdk.mx gdk_atoms.mx gdk_bat.mx gdk_batop.mx gdk_bbp.mx gdk_heap.mx gdk_qsort.mx gdk_ssort.mx gdk_utils.mx Log Message: support for 32GB string heaps in 64bits/oid32 mode (implies bat format change but ONLY for 64bits/oid32)
src/gdk/gdk.mx
- introduce GDK_VARSHIFT, var_t (str indices) are to be shifted that
amount of bits to get to the real offset (padding is 8, in case of 64-bits and oid32 -- otherwise it is 0 => no change)
- clean up usage of var_t in HEAP_* interface
src/gdk/gdk_atoms.mx
- str heaps with difficult double lim distrubution do not maintain
a linked list (direct open hashing only)
- allow internal string heap hash table pointers to be unsigned
shorts instead of var_t (only switched on for 64bits/oid32)
- double-elim string heaps scaled back to 64KB (from 256-512KB)
- support for GDK_VARSHIFT
src/gdk/gdk_bat.mx
- bugfix in 64bits/oid32 offset calculation (precision loss averted)
src/gdk/gdk_batop.mx src/gdk/gdk_bbp.mx src/gdk/gdk_qsort.mx src/gdk/gdk_ssort.mx
- support for GDK_VARSHIFT
src/gdk/gdk_heap.mx
- HEAPmargin allocates large VM area after a block alloc in 64-bits
(this to stimulate in-place HEAPextend without memcpy)
- clean up use of var_t/size_t in HEAP_* functions, and support for
GDK_VARSHIFT
src/gdk/gdk_utils.mx
- increase VM size for 64-bits systems to 4TB
U gdk.mx Index: gdk.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk.mx,v retrieving revision 1.279 retrieving revision 1.280 diff -u -d -r1.279 -r1.280 --- gdk.mx 3 Apr 2009 09:14:51 -0000 1.279 +++ gdk.mx 13 Apr 2009 23:36:24 -0000 1.280 @@ -1068,9 +1068,9 @@ @item void HEAP_destroy (Heap* h) @item var_t -HEAP_malloc (Heap* heap, var_t nbytes) +HEAP_malloc (Heap* heap, size_t nbytes) @item void -HEAP_free (Heap *heap, size_t block) +HEAP_free (Heap *heap, var_t block) @item int HEAP_private (Heap* h) @item void @@ -1111,7 +1111,7 @@ int (*sizefcn) (ptr) /*
BATatoms[].atomLen
function */
);
-gdk_export var_t HEAP_malloc(Heap *heap, var_t nbytes); +gdk_export var_t HEAP_malloc(Heap *heap, size_t nbytes); gdk_export void HEAP_free(Heap *heap, var_t block); gdk_export var_t HEAP_private(Heap *h); gdk_export void HEAP_checkformat(Heap *h); @@ -1350,12 +1350,37 @@ #define Hloc(b,p) ((b)->H->heap.base+((p)<<(b)->H->shift)) #define Tloc(b,p) ((b)->T->heap.base+((p)<<(b)->T->shift))
+#if SIZEOF_VAR_T < SIZEOF_VOID_P +/* NEW 11/4/2009: when compiled with 32-bits oids/var_t on 64-bits
systems, align heap strings
- on 8 byte boundaries always (wasting 4 padding bytes on avg).
Note
that in heaps where
- duplicate elimination is succesful, such padding occurs anyway
(as an
aside, a better
- implementation with two-bytes pointers in the string heap hash
table,
could reduce that
- padding to avg 1 byte wasted -- see TODO below).
- This 8 byte alignment allows the offset in the fixed part of
the BAT
string column to be
- interpreted as an index, which should be multiplied by 8 to
get the
position (VARSHIFT). The
- overall effect is that 32GB heaps can be addressed even when
oids are
limited to 4Gtuples.
- In the future, we might extend this such that the string
alignment is
set in the BAT header
- (columns with long strings take more storage space, but could
tolerate more padding).
- It would mostly work, only the sort routine and strPut/strLocate
(which do not see the BAT
- header) extra parameters would be needed in their APIs.
- */
+typedef unsigned short stridx_t; +#define GDK_VARSHIFT 3 +#define GDK_VARALIGN (1<<GDK_VARSHIFT) +#else +typedef var_t stridx_t; /* TODO: should also be unsigned short, but kept
at var_t not to break BAT images */
+#define GDK_VARSHIFT 0 +#define GDK_VARALIGN sizeof(stridx_t) +#endif
#define BUNhloc(bi,p) Hloc((bi).b,p) #define BUNtloc(bi,p) Tloc((bi).b,p) #define BUNhpos(bi,p) (Hpos(&(bi),p)) #define BUNtpos(bi,p) (Tpos(&(bi),p)) -#define BUNhvar(bi,p) ((bi).b- htype?(Hbase((bi).b)+*((var_t*)BUNhloc(bi,p))):BUNhpos(bi,p)) -#define BUNtvar(bi,p) ((bi).b- ttype?(Tbase((bi).b)+*((var_t*)BUNtloc(bi,p))):BUNtpos(bi,p)) +#define BUNhvar(bi,p) ((bi).b- htype?(Hbase((bi).b)+ ((*(var_t*)BUNhloc(bi,p))<<GDK_VARSHIFT)):BUNhpos(bi,
p))
+#define BUNtvar(bi,p) ((bi).b- ttype?(Tbase((bi).b)+ ((*(var_t*)BUNtloc(bi,p))<<GDK_VARSHIFT)):BUNtpos(bi,
p))
#define BUNhead(bi,p) ((bi).b- hvarsized?BUNhvar(bi,p):BUNhloc(bi,p)) #define BUNtail(bi,p) ((bi).b- tvarsized?BUNtvar(bi,p):BUNtloc(bi,p))
U gdk_atoms.mx Index: gdk_atoms.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_atoms.mx,v retrieving revision 1.161 retrieving revision 1.162 diff -u -d -r1.161 -r1.162 --- gdk_atoms.mx 7 Apr 2009 11:53:22 -0000 1.161 +++ gdk_atoms.mx 13 Apr 2009 23:36:24 -0000 1.162 @@ -175,7 +175,6 @@ gdk_export int strLen(const char *s); gdk_export int strCmp(str l, str r); gdk_export int strNil(str s); -gdk_export void strHeapConvert(Heap *h, int directon); gdk_export int strElimDoubles(Heap *h); gdk_export var_t strLocate(Heap *h, str v); gdk_export int strCmpNoNil(unsigned char *l, unsigned char *r); @@ -457,7 +456,7 @@ 0, 0, (var_t (*)(Heap *, var_t *, ptr)) strPut, 0, (int (*)(ptr)) strLen, strHeap,
(void (*)(Heap *, int)) strHeapConvert, 0},
(void (*)(Heap *, int)) 0, 0},
}; int GDKatomcnt = TYPE_str + 1;
@@ -931,24 +930,25 @@ } \ } while (0)
-#define GDK_STRHASHTABLE (1<<10) +/* string heaps:
- strings are 8 byte aligned
- start with a 1024 bucket hash table
- heaps < 64KB are fully duplicate eliminated with this hash
tables
- heaps >= 64KB are opportunistically (imperfect) duplicate
eliminated
- as only the last 128KB chunk is considered and there is no
linked
list
- buckets and next pointers are unsigned short "indices"
- indices should be multiplied by 8 and takes from ELIMBASE to
get an
offset
- Note that a 64KB chunk of the heap contains at most 8K 8-byte
aligned
- strings. The 1K bucket list means that in worst load, the list
length
is 8 (OK).
- */
+#define GDK_STRHASHTABLE (1<<10) /* 1024 */ #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1) -#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(var_t)) -#define GDK_STRHASHCREDIT(h) (((var_t*) (h)- base)[GDK_STRHASHTABLE]) +#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t)) +#define GDK_ELIMPOWER 16 /* 64KB is the threshold */ #define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT) -#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) +#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently:
ELIMBASE == 0 */
#define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) <<
GDK_ELIMPOWER)
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define GDK_ELIMPOWER 16 /* makes for a max 64KB hash
table
* ie 256 string bytes per hash bucket
* ~ 4 strings of UP4(8<=len<=11)=12 + 4
bytes
*/
-#else -#define GDK_ELIMPOWER 17 /* makes for a max 128KB hash
table
* ie 512 string bytes per hash bucket
* ~ 5 strings of UP8(8<=len<=15)=16 + 8
bytes
*/
-#endif
@- Atomic ADT functions @c @@ -1767,46 +1767,34 @@ Because in many typical situations lots of double string values occur in tables, the string insertion provides automatic double elimination. To do this, a GDK_STRHASHTABLE(=1024) bucket hashtable is hidden in the
first
-4096 (8192 on 64-bit architectures) bytes of the string heap, consisting
of integer offsets of the first
-string hashing to that bucket in the heap. Furthermore, the first 4(8)
bytes
-before each string in the heap is an integer offset to the next string
hashing
-to the same number. +4096 bytes of the string heap, consisting an offset to the first string +hashing to that bucket in the heap. +These offsets are made small (stridx_t is an unsigned short) by
exploiting
+the fact that the double elimination chunks are (now) 64KB, hence a
short
+suffices.
-However, in many other situations the cardinality of string columns is
large,
+In many other situations the cardinality of string columns is large, or the string values might even be unique. In those cases, our fixed-
size hash
table will start to overflow quickly. Therefore, after the hash table is
full
(this is measured very simplistically by looking whether the string heap
exceeds a
-heap size = GDK_ELIMLIMIT -- done this way to keep compatibility with
old bat images)
-we flush the hash table. If one views the string heaps as consecutive
chunks
-of size GDK_ELIMLIMIT bytes, then all strings within one chunk are
double-eliminated.
-There is a macro GDK_ELIMBASE(offset) that computes the base of the
chunk in which
-a certain byte-offset falls. -@- -This is a departure from our previous policy of not looking at the hash
tables at
-all after overflow occurred. The advantage of the new approach is that
if we have
-a value distribution that is skewed (ie some values are very frequent),
these
-values will always be double eliminated, saving a considerable amount of
space.
-Disadvantage of the approach is that we always have to reserve space for
the next
-pointer (4(8) byte integer offset) that is stored right in front of the
string (and
-consequently have to keep all string chunks and offsets aligned to
4(8)). All this
-translates into some wasted space. However, if there are that many
different strings
-that the hash table overflows, the strings must be relatively long and
the relative
-storage overhead should be low. +heap size = GDK_ELIMLIMIT = 64KB) we flush the hash table. Even more,
from that moment
+on, we do not use a linked list, but a lossy hash table that just
contains
+the last position for each bucket. Basically, after exceeding
GDK_ELIMLIMIT,
+we get a probabilistic/opportunistic duplicate elimination mechanism, +that only looks at the last GDK_ELIMLIMIT chunk in the heap, in a lossy
way.
+When comparing with the previous string implementation, the biggest
difference
+is that on 64-nbits bt with 32-bits oids, strings are always 8-byte
aligned
+and var_t numbers are multiplied by 8 to get the true offset. The goal
to do
+this is to allow 32-bits var_t on 64-bits systems to address 32GB (using
string
+alignment=8). For large database, the cost of padding (4 bytes avg) is
offset
+by the savings in var_t (allowing to go from 64- to 32-bits). Nothing
lost there,
+and 32-bits var_t also pay in smaller OIDs and smaller hash tables,
reducing memory
+pressure. For small duplicate eliminated heaps, the short indices +used in the hash table have now allowed more buckets (2K instead of 1K) +and average 2 bytes overhead for the next pointers instead of 6-12.
Therefore
+small heaps are now more compact than before. @- -Notice that this mechanism enables to keep a certain linear storage
property
-in the string heaps. This is important if we want to take a BATslice on
a BAT
-by simply loading or @strong{mmap()}ping slices of the BAT files on disk
into memory.
-This is relevant in order to process a very large BAT iteratively by
taking slices
-in order to reduce memory consumption. Notice that if there are few
different string
-values, the hash table has not overflowed, and the string heap size will
be small
-(i.e. < GDK_ELIMLIMIT), so in those cases it is not a problem to load
the entire string heap.
-If the hash table @strong{has} overflowed, we want to be able to only
map a slice of the
-string heap as well. Now, given that the first string in the BAT- slice
is called F1
-and its heap offset is O1 and the last string in the slice is F2 and its -offset is O2, then the slice we should take from the string heap is: -@example -GDK_ELIMBASE(F1) .. MAX(GDK_ELIMBASE(F2)+GDK_ELIMLIMIT), O2+strlen(F2)) -@end example The routine strElimDoubles() can be used to check whether all strings are still being double-eliminated in the original hash- table. Only then we know that unequal offset-integers in the BUN array means @@ -1877,16 +1865,12 @@ strHeap(Heap *d, size_t cap) { size_t size;
var_t *h, *e;
cap = MAX(cap, BATTINY);
size = (GDK_STRHASHTABLE + 1) * sizeof(var_t) + MIN(GDK_ELIMLIMIT,
cap * 12);
- size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT,
cap *
GDK_VARALIGN);
if (HEAPalloc(d, size, 1) >= 0) {
d->free = GDK_STRHASHTABLE * sizeof(var_t);
h = (var_t *) d->base;
for (e = h; e < h + GDK_STRHASHTABLE; e++) {
*e = 0;
}
d->free = GDK_STRHASHTABLE * sizeof(stridx_t);
}memset(d->base, 0, d->free);
}
@@ -1923,42 +1907,10 @@ void strCleanHash(Heap *h, int rebuild) {
- (void) rebuild; if (!GDK_ELIMDOUBLES(h)) { /* flush hash table for security */ memset(h->base, 0, GDK_STRHASHSIZE);
- } else if (rebuild) {
var_t xx, cur = 0, end = 0;
str hash = (str) h->base;
-/* int cnt[GDK_STRHASHTABLE]; */
/* collect all values in one big linked list */
for (xx = 0; xx < GDK_STRHASHTABLE; xx++) {
var_t yy = ((var_t *) hash)[xx];
-/* cnt[xx]=0; */
((var_t *) hash)[xx] = 0; /* clear hash table slot
*/
if (end) {
*(var_t *) (hash + end) = yy;
} else {
cur = yy;
}
for (; yy; yy = *(var_t *) (hash + yy))
end = yy;
}
/* process the linked list, inserting the values again */
for (; cur; cur = end) {
str val = hash + cur;
GDK_STRHASH(val + sizeof(var_t), xx);
xx &= GDK_STRHASHMASK;
end = *(var_t *) val;
*(var_t *) val = ((var_t *) hash)[xx];
((var_t *) hash)[xx] = cur;
-/* cnt[xx]++; */
}
-/* for(xx=0; xx<GDK_STRHASHTABLE; xx++)
- if(cnt[xx]) stream_printf(GDKout, "%5d %6d", xx, cnt[xx]); */ }
}
@@ -1970,90 +1922,63 @@ var_t strLocate(Heap *h, str v) {
- var_t *htab = (var_t *) h->base;
- var_t *l, *e;
- BUN idx;
- GDK_STRHASH(v, idx);
- e = htab + (idx & GDK_STRHASHMASK);
- if (*e) {
for (l = e; *l && *l < h->free; l = (var_t *) ((char *) h->base
- *l)) {
str x = (str) ((char *) h->base + *l + sizeof(var_t));
if (GDK_STRCMP(v, x) == 0) {
return *l + (var_t) sizeof(var_t);
}
}
- }
- return 0;
-}
- stridx_t *ref, *next;
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define normal_vart_SWAP(x) ((var_t) normal_int_SWAP((int)x)) -#else -#define normal_vart_SWAP(x) ((var_t) long_long_SWAP((lng)x)) -#endif
- /* search hash-table, if double-elimination is still in place */
- BUN off; GDK_STRHASH(v, off);
- off &= GDK_STRHASHMASK;
-/* convert the integers in the implicit hash table structure */ -void -strHeapConvert(Heap *h, int dir) -{
- var_t *htab = (var_t *) h->base;
- var_t *l, i, j;
- /* should only use strLocate iff fully double eliminated */
- assert(GDK_ELIMBASE(h->free) == 0);
- if (dir == CONV_HTON) {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l =
(var_t *) (h->base + j)) {
*l = normal_vart_SWAP(j);
}
}
- } else {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l =
(var_t *) (h->base + *l)) {
*l = normal_vart_SWAP(j);
}
}
- /* search the linked list */
- for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0)
return (sizeof(stridx_t) + *ref) >> GDK_VARSHIFT; /*
found */
}
- return 0;
}
var_t strPut(Heap *h, var_t *dst, str v) {
- var_t *l;
- size_t i = GDK_STRLEN(v);
- BUN off;
- /* round up to var_t-byte alignment + var_t (next pointer) */
- size_t len = ((i + sizeof(var_t) - 1) & ~(sizeof(var_t) - 1)) +
sizeof(var_t);
- size_t elimlimit = GDK_ELIMBASE(h->free) + GDK_ELIMLIMIT;
size_t elimbase = GDK_ELIMBASE(h->free);
size_t pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN-1));
size_t pos, len = GDK_STRLEN(v);
stridx_t *bucket, *ref, *next;
/* search hash-table, if double-elimination is still in place */
- GDK_STRHASH(v, off);
- BUN off; GDK_STRHASH(v, off); off &= GDK_STRHASHMASK;
- for (l = ((var_t *) h->base) + off; *l && *l < h->free; l =
(var_t *)
(h->base + *l)) {
str x = (str) (h->base + *l + sizeof(var_t));
- bucket = ((stridx_t *) h->base) + off;
if (GDK_STRCMP(v, x) == 0) {
*dst = *l + (var_t) sizeof(var_t); /* already in
heap;
do not insert! */
if (GDK_ELIMDOUBLES(h) == 0 && GDK_STRHASHCREDIT(h))
GDK_STRHASHCREDIT(h) += 4;
return *dst;
- if (elimbase == 0) { /* small string heap (<64KB) -- fully double
eliminated */
for (ref = bucket; *ref; ref = next) { /* search the linked
list */
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0) { /* found */
pos = sizeof(stridx_t) + *ref;
return *dst = (pos >> GDK_VARSHIFT);
}}
- }
- /* flush the hash table if it becomes too big (implies
!GDK_ELIMDOUBLES) */
- if (h->free + len >= elimlimit) {
/* if we are not hash-inserting anymore, h->free may no longer
be var_t aligned */
size_t mask = h->free & (sizeof(var_t) - 1);
if (mask)
h->free += sizeof(var_t) - mask; /* realign */
memset(h->base, 0, GDK_STRHASHSIZE); /* start over hash
inserting in a pristine hash table */
GDK_STRHASHCREDIT(h) = 32; /* only tolerate limited misses
in the future */
/* is there room for the next pointer in the padding space? */
if (pad < sizeof(stridx_t))
pad += GDK_VARALIGN; /* if not, pad more */
- } else {
/* large string heap (>=64KB) -- opportunistic/
probabilistic
double elimination */
pos = elimbase + *bucket;
if (GDK_STRCMP(v, (str) (h->base + pos)) == 0) {
return *dst = (pos >> GDK_VARSHIFT); /* already in heap;
do not insert! */
}
+#if SIZEOF_VAR_T < SIZEOF_VOID_P
pad &= (GDK_VARALIGN-1); /* a full VARALIGN pad can be omitted
*/
+#else
pad = 0; /* only 32-bits var_t in 64-bits needs padding */
+#endif }
/* check heap for space (limited to a certain maximum after which
nils are inserted) */
- if (h->free + len >= h->size) {
- if (h->free + pad + len >= h->size) { /* Something really strange happens here, In a special case (Pentium II Klamath, gcc version 2.96 20000731, GNU assembler version 2.10.90 using BFD version 2.10.0.18)
@@ -2064,11 +1989,15 @@ */ float batmargin = (float) BATMARGIN; float hnewsize = h->size * batmargin;
size_t newsize = len + (size_t) hnewsize;
size_t newsize = pad + len + (size_t) hnewsize;
assert(newsize);
if (h->free + len < h->maxsize) {
if (h->free + pad + len >= (((size_t) VAR_MAX) <<
GDK_VARSHIFT)) {
GDKerror("strPut: string heaps gets larger than
%dGB.\n",
(((size_t) VAR_MAX) << GDK_VARSHIFT) >> 30);
return 0;
}
if (h->free + pad + len < h->maxsize) { /* if there is reserved space, first use the reserved
space */
newsize = MIN(newsize, h->maxsize); }
@@ -2077,32 +2006,27 @@ } /* fill should solve initialisation problems within valgrind */ memset(h->base + h->free, 0, h->size - h->free);
}
if (!GDK_ELIMDOUBLES(h)) {
if (GDK_STRHASHCREDIT(h) == 0) {
/* if credits are gone, we do not hash insert at all */
if (h->free > VAR_MAX) {
GDKerror("strPut: string heaps gets larger than
2GB
-- too large for 32-bits oids.\n");
return 0;
}
*dst = (var_t) h->free;
memcpy(h->base + h->free, v, i);
h->free += i; /* in this case, we do not round to
var_t either */
return *dst;
}
GDK_STRHASHCREDIT(h)--;
/* make bucket point into the enw heap */
}bucket = ((stridx_t *) h->base) + off;
- /* insert string in hash table and copy into the heap */
- l = (var_t *) (h->base + h->free);
- *(l++) = ((var_t *) h->base)[off];
- assert(h->free + sizeof(var_t) <= VAR_MAX);
- ((var_t *) h->base)[off] = (var_t) h->free;
- *dst = (var_t) (h->free + sizeof(var_t));
- h->free += len;
- memcpy((char *) l, v, i);
- /* insert string */
- pos = h->free + pad;
- *dst = pos >> GDK_VARSHIFT;
- memcpy(h->base + pos, v, len);
- h->free += pad + len;
- /* maintain hash table */
- if (elimbase == 0) { /* small string heap: link the next pointer
*/
pos -= sizeof(stridx_t); /* the stridx_t next pointer directly
precedes the string */
*(stridx_t*) (h->base + pos) = *bucket;
- }
- *bucket = (stridx_t) (pos - elimbase); /* set bucket to the new
string */
- if (h->free >= elimbase + GDK_ELIMLIMIT) {
memset(h->base, 0, GDK_STRHASHSIZE); /* flush hash table */
- } return *dst;
}
U gdk_bbp.mx Index: gdk_bbp.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bbp.mx,v retrieving revision 1.252 retrieving revision 1.253 diff -u -d -r1.252 -r1.253 --- gdk_bbp.mx 9 Apr 2009 15:29:18 -0000 1.252 +++ gdk_bbp.mx 13 Apr 2009 23:36:24 -0000 1.253 @@ -2965,9 +2965,9 @@ BATsetcount(&bs->B, 0);
if (bs->H.vheap)
memset(bs->H.vheap->base, 0, bs->H.vheap->free =
GDK_STRHASHTABLE * sizeof(var_t));
memset(bs->H.vheap->base, 0, bs->H.vheap->free =
GDK_STRHASHTABLE * sizeof(stridx_t));
if (bs->T.vheap)
memset(bs->T.vheap->base, 0, bs->T.vheap->free =
GDK_STRHASHTABLE * sizeof(var_t));
memset(bs->T.vheap->base, 0, bs->T.vheap->free =
GDK_STRHASHTABLE * sizeof(stridx_t));
return bs;
} BATDEBUG THRprintf(GDKout, "#BBPrecycle %d %d "SZFMT" (%d, %d %d %d)
N2\n", ht, tt, cap, BATCACHE_NOTYPE(ht), BATCACHE_NOTYPE(tt), batcache_maxbuckets, bin);
U gdk_batop.mx Index: gdk_batop.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_batop.mx,v retrieving revision 1.170 retrieving revision 1.171 diff -u -d -r1.170 -r1.171 --- gdk_batop.mx 7 Jan 2009 14:13:48 -0000 1.170 +++ gdk_batop.mx 13 Apr 2009 23:36:24 -0000 1.171 @@ -1656,7 +1656,7 @@ if (b->hkey == 0) { /* sorted and key? */ while (cur < end) {
char *val = base + *(((var_t
*)b->H-
heap.base)+ cur);
char *val = base + (*(((var_t
*)b->H-
heap.base)+ cur) << GDK_VARSHIFT);
if (cmp(prv, val) @5= 0) { key = FALSE;
@@ -1668,7 +1668,7 @@ } /* sorted? */ while (cur < end) {
char *val = base + *(((var_t *)b->H-
heap.base)+ cur);
char *val = base + (*(((var_t *)b->H-
heap.base)+ cur) << GDK_VARSHIFT);
if (cmp(prv, val) @5 0) { /* record negative position info
*/
U gdk_utils.mx Index: gdk_utils.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_utils.mx,v retrieving revision 1.246 retrieving revision 1.247 diff -u -d -r1.246 -r1.247 --- gdk_utils.mx 9 Apr 2009 18:48:25 -0000 1.246 +++ gdk_utils.mx 13 Apr 2009 23:36:24 -0000 1.247 @@ -382,7 +382,7 @@ #define GDK_MEM_NULLALLOWED
#if SIZEOF_VOID_P==8 -#define GDK_VM_MAXSIZE LL_CONSTANT(137438953472) /* :-) a 64-bit
OS: 128 GB */
+#define GDK_VM_MAXSIZE LL_CONSTANT(4398046511104) /* :-) a 64-bit
OS: 4TB */
#else #define GDK_VM_MAXSIZE LL_CONSTANT(1610612736) /* :-| a 32-bit OS:
1.5GB */
#endif
U gdk_heap.mx Index: gdk_heap.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_heap.mx,v retrieving revision 1.117 retrieving revision 1.118 diff -u -d -r1.117 -r1.118 --- gdk_heap.mx 20 Mar 2009 11:59:57 -0000 1.117 +++ gdk_heap.mx 13 Apr 2009 23:36:24 -0000 1.118 @@ -75,8 +75,20 @@ nice). @{ @c -int -HEAPalloc(Heap *h, size_t nitems, size_t itemsize) +size_t HEAPmargin(size_t maxsize) {
- size_t ret;
+#if SIZEOF_VOID_P == 8
- /* in 64-bits systems, try to enforce in-place realloc, but
provoke
the memcpy on 256MB, then 4GB */
- size_t use = GDKvm_cursize();
- ret = MIN(GDK_mem_maxsize, MAX(((size_t) 1) << 26, 16*maxsize));
- if ((ret+ret) > (GDK_vm_maxsize-MIN(GDK_vm_maxsize,use))) /*
only if
room */
+#endif
- ret = ((double) BATMARGIN)*maxsize - 1; /* do not waste VM on 32-
bits
*/
- return (1 + (MAX(maxsize,ret) >> 16)) << 16; /* round up to 64K */
+}
+/* in 64-bits space, use very large margins to accomodate reallocations
*/
+int HEAPalloc(Heap *h, size_t nitems, size_t itemsize) { char nme[PATHLENGTH], *ext = NULL;
@@ -98,8 +110,7 @@ /* when using anonymous vm we malloc we need 64K chunks, also we * 20% extra malloc */ if (h->size > GDK_mem_bigsize) {
h->maxsize = (size_t) ((double) h->maxsize * BATMARGIN) - 1;
h->maxsize = (1 + (h->maxsize >> 16)) << 16;
} if (h->filename == NULL || (h->size < GDK_mmap_minsize)) { h->storage = STORE_MEM;h->maxsize = HEAPmargin(h->maxsize);
@@ -169,17 +180,14 @@ /* extend a malloced heap, possibly switching over to file-
mapped storage */
Heap bak = *h; int can_mmap = h->filename && (size >= GDK_mem_bigsize || h-
newstorage != STORE_MEM);
int must_mmap = can_mmap && (size >= GDK_mmap_minsize || h-
newstorage != STORE_MEM);
int small_cpy = (h->size*4 < size) && (size >=
GDK_mmap_minsize);
int must_mmap = can_mmap && (small_cpy || h->newstorage !=
STORE_MEM);
h->size = size; if (can_mmap) { /* in anonymous vm, if have to realloc anyway, we
reserve
some extra space */
if (size > h->maxsize) {
h->maxsize = (size_t) ((double) size *
BATMARGIN);
}
/* when using anonymous vm we malloc we need 64K chunks
*/
h->maxsize = (1 + ((h->maxsize - 1) >> 16)) << 16;
} else { h->maxsize = size; /* for normal GDKmalloc, maxsizeh->maxsize = HEAPmargin(MAX(size,h->maxsize));
= size */
}
@@ -514,9 +522,9 @@ #define HEAPVERSION 20030408
typedef struct heapheader {
- var_t head; /* index to first free block */
- size_t head; /* index to first free block */ int alignment; /* alignment of objects on heap */
- var_t firstblock; /* first block in heap */
- size_t firstblock; /* first block in heap */ int version; int (*sizefcn) (ptr); /* ADT function to ask length */
} HEADER32; @@ -524,8 +532,8 @@ typedef struct { int version; int alignment;
- var_t head;
- var_t firstblock;
- size_t head;
- size_t firstblock; int (*sizefcn) (ptr);
} HEADER64;
@@ -537,8 +545,8 @@ typedef HEADER64 HEADER_OTHER; #endif typedef struct hfblock {
- var_t size; /* Size of this block in freelist */
- var_t next; /* index of next block */
- size_t size; /* Size of this block in freelist */
- size_t next; /* index of next block */
} CHUNK;
@c @@ -546,13 +554,13 @@ #define roundup_4(x) (((x)+3)&~3) #define blocksize(h,p) ((p)->size)
-static INLINE var_t -roundup_num(var_t number, int alignment) +static INLINE size_t +roundup_num(size_t number, int alignment) {
- var_t rval;
- size_t rval;
- rval = number + (var_t) alignment - 1;
- rval -= (rval % (var_t) alignment);
- rval = number + (size_t) alignment - 1;
- rval -= (rval % (size_t) alignment); return rval;
}
@@ -560,7 +568,7 @@ HEAP_private(Heap *h) { (void) h;
- return (var_t) roundup_8(sizeof(HEADER));
- return (var_t) (roundup_8(sizeof(HEADER)) >> GDK_VARSHIFT);
}
#ifdef TRACE @@ -568,7 +576,7 @@ HEAP_printstatus(Heap *heap) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t block, cur_free = hheader->head;
size_t block, cur_free = hheader->head; CHUNK *blockp;
THRprintf(GDKout, "#HEAP has head " SZFMT " and alignment %d and
size
" SZFMT "\n",
@@ -591,7 +599,7 @@ cur_free = blockp->next; block += blockp->size; } else {
var_t size = blocksize(hheader, blockp);
size_t size = blocksize(hheader, blockp); THRprintf(GDKout, "# block at " SZFMT " with size "
SZFMT "\n", block, size);
block += size;
@@ -611,7 +619,7 @@ /* // Calculate position of first and only free block. */
- var_t head = roundup_num((var_t) (roundup_8(sizeof(HEADER)) +
roundup_8(nprivate)), alignment);
- size_t head = roundup_num((size_t) (roundup_8(sizeof(HEADER)) +
roundup_8(nprivate)), alignment);
CHUNK *headp = HEAP_index(heap, head, CHUNK);
assert(roundup_8(sizeof(HEADER)) + roundup_8(nprivate) <= VAR_MAX); @@ -629,7 +637,7 @@ // Fill first free block. */ assert(heap->size - head <= VAR_MAX);
- headp->size = (var_t) (heap->size - head);
- headp->size = (size_t) (heap->size - head); headp->next = 0;
#ifdef TRACE THRprintf(GDKout, "#We created the following heap\n"); @@ -669,9 +677,9 @@
var_t -HEAP_malloc(Heap *heap, var_t nbytes) +HEAP_malloc(Heap *heap, size_t nbytes) {
- var_t block, trail, ttrail;
- size_t block, trail, ttrail; CHUNK *blockp; CHUNK *trailp; HEADER *hheader = HEAP_index(heap, 0, HEADER);
@@ -682,15 +690,9 @@
/* add space for size field */ nbytes += hheader->alignment;
- if (hheader->alignment == 8) {
nbytes = roundup_8(nbytes);
- } else if (hheader->alignment == 4) {
nbytes = roundup_4(nbytes);
- } else {
GDKfatal("HEAP_malloc: Heap structure corrupt\n");
- }
- nbytes = roundup_8(nbytes); if (nbytes < sizeof(CHUNK))
nbytes = (var_t) sizeof(CHUNK);
nbytes = (size_t) sizeof(CHUNK);
/* // block -- points to block with acceptable size (if
available). @@ -718,12 +720,12 @@ // If no block of acceptable size is found we try to enlarge the
heap.
*/
if (block == 0) {
var_t newsize;
size_t newsize;
assert(heap->free + MAX(heap->free, nbytes) <= VAR_MAX);
newsize = (var_t) roundup_8(heap->free + MAX(heap->free,
nbytes));
newsize = (size_t) roundup_8(heap->free + MAX(heap->free,
nbytes));
assert(heap->free <= VAR_MAX);
block = (var_t) heap->free; /* current end-of-heap */
block = (size_t) heap->free; /* current end-of-heap */
#ifdef TRACE THRprintf(GDKout, "#No block found\n"); @@ -747,7 +749,7 @@
blockp->next = 0; assert(heap->free - block <= VAR_MAX);
blockp->size = (var_t) (heap->free - block); /* determine
size of allocated block */
blockp->size = (size_t) (heap->free - block); /* determine
size of allocated block */
/* // Try to join the last block in the freelist and the newly
allocated
@@ -778,7 +780,7 @@ // TUNE: use different amount than 2*sizeof(CHUNK) */ if (blockp->size >= nbytes + 2 * sizeof(CHUNK)) {
var_t newblock = block + nbytes;
size_t newblock = block + nbytes;
CHUNK *newblockp = HEAP_index(heap, newblock, CHUNK);
newblockp->size = blockp->size - nbytes;
@@ -800,17 +802,17 @@ }
block += hheader->alignment;
- return block;
- return block >> GDK_VARSHIFT;
}
void -HEAP_free(Heap *heap, var_t block) +HEAP_free(Heap *heap, var_t mem) { HEADER *hheader = HEAP_index(heap, 0, HEADER); CHUNK *beforep; CHUNK *blockp; CHUNK *afterp;
- var_t after, before;
size_t after, before, block = mem << GDK_VARSHIFT;
if (hheader->alignment != 8 && hheader->alignment != 4) { GDKfatal("HEAP_free: Heap structure corrupt\n");
@@ -884,10 +886,10 @@ HEAP_check(Heap *heap, HeapRepair *hr) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t head = hheader->head, alignshift = 2;
- var_t block, nwords = (var_t) ((heap->free - 1) >> 7);
- size_t head = hheader->head, alignshift = 2;
- size_t block, nwords = (size_t) ((heap->free - 1) >> 7); int *freemask;
- var_t prevblock = 0;
size_t prevblock = 0; CHUNK *blockp;
hr->alignment = hheader->alignment;
@@ -922,8 +924,8 @@ // Walk the freelist; register them in freemask */ for (block = hheader->head; block != 0; block = HEAP_index(heap,
block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if ((block <= prevblock) && (block != 0)) {
@@ -942,8 +944,8 @@ */ block = hheader->firstblock; while (block < heap->free) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
hr->validmask[pos] |= mask;
@@ -965,8 +967,8 @@ // Check if there are left over free blocks */ for (block = hheader->head; block != 0; block = HEAP_index(heap,
block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if (freemask[pos] & mask) {
@@ -1046,14 +1048,14 @@ if (hheader->head > heap->free) { hheader->head = 0; /* cut off free block */ } else if (hheader->head) {
var_t idx = hheader->head;
size_t idx = hheader->head;
while (idx) { CHUNK *blk = HEAP_index(heap, idx, CHUNK);
if (idx + blk->size > heap->free) { assert(heap->free - idx <= VAR_MAX);
blk->size = (var_t) (heap->free - idx); /* cut
off illegal tail of block */
blk->size = (size_t) (heap->free - idx);
/* cut
off illegal tail of block */
} if (blk->next > heap->free || blk->next < (idx + blk-
size) || (blk->next & (hheader->alignment - 1))) { blk->next = 0; /* cut off next block */
U gdk_qsort.mx Index: gdk_qsort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_qsort.mx,v retrieving revision 1.34 retrieving revision 1.35 diff -u -d -r1.34 -r1.35 --- gdk_qsort.mx 7 Jan 2009 14:13:48 -0000 1.34 +++ gdk_qsort.mx 13 Apr 2009 23:36:24 -0000 1.35 @@ -86,6 +86,7 @@ char *offst; /* NULL or start of varsized heap */ int hshift; /* log2 of hs */ int tshift; /* log2 of hs */
- int vshift; /* shift to be applied on var_ offsets */ unsigned hs; /* width of head record */ unsigned ts; /* width of tail record */ void *h; /* start of head buns */
@@ -189,7 +190,7 @@ #endif #endif
-#define offset(p) (buf->offst + *(var_t*) (p)) +#define offset(p) (buf->offst + ((*(var_t*) (p)) << buf- vshift)) #define direct(p) (p)
#define any_LE(a,b) ((buf->cmp)(a,b) <= 0) @@ -432,6 +433,7 @@ buf.ts = (unsigned) ts; buf.hshift = ATOMelmshift(hs); buf.tshift = ATOMelmshift(ts);
- buf.vshift = ATOMvarsized(tpe)?GDK_VARSHIFT:0; buf.h = h; if (!t) t = &x;
U gdk_bat.mx Index: gdk_bat.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bat.mx,v retrieving revision 1.214 retrieving revision 1.215 diff -u -d -r1.214 -r1.215 --- gdk_bat.mx 3 Apr 2009 09:14:52 -0000 1.214 +++ gdk_bat.mx 13 Apr 2009 23:36:24 -0000 1.215 @@ -434,7 +434,7 @@ BAT * BATextend(BAT *b, BUN newcap) {
- size_t hheap_size, theap_size;
size_t hheap_size = newcap, theap_size = newcap;
assert(newcap <= BUN_MAX); BATcheck(b, "BATextend");
@@ -453,10 +453,10 @@
b->batCapacity = newcap;
- hheap_size = Hsize(b) * newcap;
- hheap_size *= Hsize(b); if (b->H->heap.base && HEAPextend(&b->H->heap, hheap_size) < 0) return NULL;
- theap_size = Tsize(b) * newcap;
- theap_size *= Tsize(b); if (b->T->heap.base && HEAPextend(&b->T->heap, theap_size) < 0) return NULL; HASHdestroy(b);
U gdk_ssort.mx Index: gdk_ssort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_ssort.mx,v retrieving revision 1.15 retrieving revision 1.16 diff -u -d -r1.15 -r1.16 --- gdk_ssort.mx 7 Jan 2009 14:13:48 -0000 1.15 +++ gdk_ssort.mx 13 Apr 2009 23:36:24 -0000 1.16 @@ -203,8 +203,8 @@ } \ } while (0)
-#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)-
heap +
- (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)->compare)
((X), (Y))) < 0)
-#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare) ((ms)- heap + * (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)- compare)((X), (Y))) > 0) +#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)-
heap +
((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) << GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) < 0)
+#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare) ((ms)- heap + ((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) <<
GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) > 0)
@= islt #define ISLT_@1@2(X, Y, ms) (* (@1 *) (X) @3 * (@1 *) (Y))
This SF.net email is sponsored by: High Quality Requirements in a Collaborative Environment. Download a free trial of Rational Requirements Composer Now! http://p.sf.net/sfu/www-ibm-com _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
-- Sjoerd Mullender
No virus found in this incoming message. Checked by AVG - www.avg.com Version: 8.0.238 / Virus Database: 270.11.20/2013 - Release Date: 04/14/09 06:17:00
This SF.net email is sponsored by: High Quality Requirements in a Collaborative Environment. Download a free trial of Rational Requirements Composer Now! http://p.sf.net/sfu/www-ibm-com _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
On Tue, Apr 14, 2009 at 02:04:36PM +0200, Peter Boncz wrote:
Hi Sjoerd,
Thanks for the question. I wrote earlier last night an email to this list, which I thought covers this.
The answer regarding backward compatibility is: (1) NO, on repositories created by MonetDB configures with --enable-bits=64 --enable-oid32 (2) YES, in all other cases.
It is my understanding that few people use (1).
FYI, (AFAIR following your (Peter's) suggestion,) all people that use the 64-bit Windows Installers use (1) (IIRC "to reduce memory requirements and prevent/postpone addressspace fragmentation --- even on 64-bit Windows), i.e., the 64-bit Windows Installers are by default (and only) built, released and distributed with 32-bit OIDs.
I cannot give any counts, though ...
Stefan
If the MonetDB team agrees, I would propose not to take additional migration measures. If otherwise, ie if we go to a migration anyway, we might also consider changing the new type stridx_t in the (2) cases from var_t to unsigned short. It reduces the fixed overhead of string heaps of small bats (2KB instead of 4KB on 32-bits, 8KB on 64 bits). I refrained from doing so to keep (2) backward compatible.
Peter
-----Original Message----- From: Sjoerd Mullender [mailto:sjoerd@acm.org] Sent: Tuesday, April 14, 2009 1:41 PM To: monetdb-developers@lists.sourceforge.net Cc: monetdb-checkins@lists.sourceforge.net Subject: Re: [Monetdb-checkins] MonetDB/src/gdk gdk.mx, , 1.279, 1.280 gdk_atoms.mx, , 1.161, 1.162 gdk_bat.mx, , 1.214, 1.215 gdk_batop.mx, , 1.170, 1.171 gdk_bbp.mx, , 1.252, 1.253 gdk_heap.mx, , 1.117, 1.118 gdk_qsort.mx, , 1.34, 1.35 gdk_ssort.mx, , 1.15, 1...
Is this a backward compatible change? In other words, can databases created before this change be read by the new version?
I really want backward compatibility, so if it isn't, some kind of conversion would be needed.
Peter Boncz wrote:
Update of /cvsroot/monetdb/MonetDB/src/gdk In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv5363/src/gdk
Modified Files: gdk.mx gdk_atoms.mx gdk_bat.mx gdk_batop.mx gdk_bbp.mx gdk_heap.mx gdk_qsort.mx gdk_ssort.mx gdk_utils.mx Log Message: support for 32GB string heaps in 64bits/oid32 mode (implies bat format change but ONLY for 64bits/oid32)
src/gdk/gdk.mx
- introduce GDK_VARSHIFT, var_t (str indices) are to be shifted that amount of bits to get to the real offset (padding is 8, in case of 64-bits and oid32 -- otherwise it is 0 => no change)
- clean up usage of var_t in HEAP_* interface
src/gdk/gdk_atoms.mx
- str heaps with difficult double lim distrubution do not maintain a linked list (direct open hashing only)
- allow internal string heap hash table pointers to be unsigned shorts instead of var_t (only switched on for 64bits/oid32)
- double-elim string heaps scaled back to 64KB (from 256-512KB)
- support for GDK_VARSHIFT
src/gdk/gdk_bat.mx
- bugfix in 64bits/oid32 offset calculation (precision loss averted)
src/gdk/gdk_batop.mx src/gdk/gdk_bbp.mx src/gdk/gdk_qsort.mx src/gdk/gdk_ssort.mx
- support for GDK_VARSHIFT
src/gdk/gdk_heap.mx
- HEAPmargin allocates large VM area after a block alloc in 64-bits (this to stimulate in-place HEAPextend without memcpy)
- clean up use of var_t/size_t in HEAP_* functions, and support for
GDK_VARSHIFT
src/gdk/gdk_utils.mx
- increase VM size for 64-bits systems to 4TB
U gdk.mx Index: gdk.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk.mx,v retrieving revision 1.279 retrieving revision 1.280 diff -u -d -r1.279 -r1.280 --- gdk.mx 3 Apr 2009 09:14:51 -0000 1.279 +++ gdk.mx 13 Apr 2009 23:36:24 -0000 1.280 @@ -1068,9 +1068,9 @@ @item void HEAP_destroy (Heap* h) @item var_t -HEAP_malloc (Heap* heap, var_t nbytes) +HEAP_malloc (Heap* heap, size_t nbytes) @item void -HEAP_free (Heap *heap, size_t block) +HEAP_free (Heap *heap, var_t block) @item int HEAP_private (Heap* h) @item void @@ -1111,7 +1111,7 @@ int (*sizefcn) (ptr) /*
BATatoms[].atomLen
function */
);
-gdk_export var_t HEAP_malloc(Heap *heap, var_t nbytes); +gdk_export var_t HEAP_malloc(Heap *heap, size_t nbytes); gdk_export void HEAP_free(Heap *heap, var_t block); gdk_export var_t HEAP_private(Heap *h); gdk_export void HEAP_checkformat(Heap *h); @@ -1350,12 +1350,37 @@ #define Hloc(b,p) ((b)->H->heap.base+((p)<<(b)->H->shift)) #define Tloc(b,p) ((b)->T->heap.base+((p)<<(b)->T->shift))
+#if SIZEOF_VAR_T < SIZEOF_VOID_P +/* NEW 11/4/2009: when compiled with 32-bits oids/var_t on 64-bits
systems, align heap strings
- on 8 byte boundaries always (wasting 4 padding bytes on avg). Note
that in heaps where
- duplicate elimination is succesful, such padding occurs anyway (as an
aside, a better
- implementation with two-bytes pointers in the string heap hash table,
could reduce that
- padding to avg 1 byte wasted -- see TODO below).
- This 8 byte alignment allows the offset in the fixed part of the BAT
string column to be
- interpreted as an index, which should be multiplied by 8 to get the
position (VARSHIFT). The
- overall effect is that 32GB heaps can be addressed even when oids are
limited to 4Gtuples.
- In the future, we might extend this such that the string alignment is
set in the BAT header
- (columns with long strings take more storage space, but could
tolerate more padding).
- It would mostly work, only the sort routine and strPut/strLocate
(which do not see the BAT
- header) extra parameters would be needed in their APIs.
- */
+typedef unsigned short stridx_t; +#define GDK_VARSHIFT 3 +#define GDK_VARALIGN (1<<GDK_VARSHIFT) +#else +typedef var_t stridx_t; /* TODO: should also be unsigned short, but kept
at var_t not to break BAT images */
+#define GDK_VARSHIFT 0 +#define GDK_VARALIGN sizeof(stridx_t) +#endif
#define BUNhloc(bi,p) Hloc((bi).b,p) #define BUNtloc(bi,p) Tloc((bi).b,p) #define BUNhpos(bi,p) (Hpos(&(bi),p)) #define BUNtpos(bi,p) (Tpos(&(bi),p)) -#define BUNhvar(bi,p) ((bi).b- htype?(Hbase((bi).b)+*((var_t*)BUNhloc(bi,p))):BUNhpos(bi,p)) -#define BUNtvar(bi,p) ((bi).b- ttype?(Tbase((bi).b)+*((var_t*)BUNtloc(bi,p))):BUNtpos(bi,p)) +#define BUNhvar(bi,p) ((bi).b- htype?(Hbase((bi).b)+((*(var_t*)BUNhloc(bi,p))<<GDK_VARSHIFT)):BUNhpos(bi,
p))
+#define BUNtvar(bi,p) ((bi).b- ttype?(Tbase((bi).b)+((*(var_t*)BUNtloc(bi,p))<<GDK_VARSHIFT)):BUNtpos(bi,
p))
#define BUNhead(bi,p) ((bi).b- hvarsized?BUNhvar(bi,p):BUNhloc(bi,p)) #define BUNtail(bi,p) ((bi).b- tvarsized?BUNtvar(bi,p):BUNtloc(bi,p))
U gdk_atoms.mx Index: gdk_atoms.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_atoms.mx,v retrieving revision 1.161 retrieving revision 1.162 diff -u -d -r1.161 -r1.162 --- gdk_atoms.mx 7 Apr 2009 11:53:22 -0000 1.161 +++ gdk_atoms.mx 13 Apr 2009 23:36:24 -0000 1.162 @@ -175,7 +175,6 @@ gdk_export int strLen(const char *s); gdk_export int strCmp(str l, str r); gdk_export int strNil(str s); -gdk_export void strHeapConvert(Heap *h, int directon); gdk_export int strElimDoubles(Heap *h); gdk_export var_t strLocate(Heap *h, str v); gdk_export int strCmpNoNil(unsigned char *l, unsigned char *r); @@ -457,7 +456,7 @@ 0, 0, (var_t (*)(Heap *, var_t *, ptr)) strPut, 0, (int (*)(ptr)) strLen, strHeap,
(void (*)(Heap *, int)) strHeapConvert, 0},
(void (*)(Heap *, int)) 0, 0},
}; int GDKatomcnt = TYPE_str + 1;
@@ -931,24 +930,25 @@ } \ } while (0)
-#define GDK_STRHASHTABLE (1<<10) +/* string heaps:
- strings are 8 byte aligned
- start with a 1024 bucket hash table
- heaps < 64KB are fully duplicate eliminated with this hash tables
- heaps >= 64KB are opportunistically (imperfect) duplicate
eliminated
- as only the last 128KB chunk is considered and there is no linked
list
- buckets and next pointers are unsigned short "indices"
- indices should be multiplied by 8 and takes from ELIMBASE to get an
offset
- Note that a 64KB chunk of the heap contains at most 8K 8-byte aligned
- strings. The 1K bucket list means that in worst load, the list length
is 8 (OK).
- */
+#define GDK_STRHASHTABLE (1<<10) /* 1024 */ #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1) -#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(var_t)) -#define GDK_STRHASHCREDIT(h) (((var_t*) (h)- base)[GDK_STRHASHTABLE]) +#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t)) +#define GDK_ELIMPOWER 16 /* 64KB is the threshold */ #define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT) -#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) +#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently:
ELIMBASE == 0 */
#define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) <<
GDK_ELIMPOWER)
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define GDK_ELIMPOWER 16 /* makes for a max 64KB hash
table
* ie 256 string bytes per hash bucket
* ~ 4 strings of UP4(8<=len<=11)=12 + 4
bytes
*/
-#else -#define GDK_ELIMPOWER 17 /* makes for a max 128KB hash
table
* ie 512 string bytes per hash bucket
* ~ 5 strings of UP8(8<=len<=15)=16 + 8
bytes
*/
-#endif
@- Atomic ADT functions @c @@ -1767,46 +1767,34 @@ Because in many typical situations lots of double string values occur in tables, the string insertion provides automatic double elimination. To do this, a GDK_STRHASHTABLE(=1024) bucket hashtable is hidden in the
first
-4096 (8192 on 64-bit architectures) bytes of the string heap, consisting
of integer offsets of the first
-string hashing to that bucket in the heap. Furthermore, the first 4(8)
bytes
-before each string in the heap is an integer offset to the next string
hashing
-to the same number. +4096 bytes of the string heap, consisting an offset to the first string +hashing to that bucket in the heap. +These offsets are made small (stridx_t is an unsigned short) by
exploiting
+the fact that the double elimination chunks are (now) 64KB, hence a
short
+suffices.
-However, in many other situations the cardinality of string columns is
large,
+In many other situations the cardinality of string columns is large, or the string values might even be unique. In those cases, our fixed-
size hash
table will start to overflow quickly. Therefore, after the hash table is
full
(this is measured very simplistically by looking whether the string heap
exceeds a
-heap size = GDK_ELIMLIMIT -- done this way to keep compatibility with
old bat images)
-we flush the hash table. If one views the string heaps as consecutive
chunks
-of size GDK_ELIMLIMIT bytes, then all strings within one chunk are
double-eliminated.
-There is a macro GDK_ELIMBASE(offset) that computes the base of the
chunk in which
-a certain byte-offset falls. -@- -This is a departure from our previous policy of not looking at the hash
tables at
-all after overflow occurred. The advantage of the new approach is that
if we have
-a value distribution that is skewed (ie some values are very frequent),
these
-values will always be double eliminated, saving a considerable amount of
space.
-Disadvantage of the approach is that we always have to reserve space for
the next
-pointer (4(8) byte integer offset) that is stored right in front of the
string (and
-consequently have to keep all string chunks and offsets aligned to
4(8)). All this
-translates into some wasted space. However, if there are that many
different strings
-that the hash table overflows, the strings must be relatively long and
the relative
-storage overhead should be low. +heap size = GDK_ELIMLIMIT = 64KB) we flush the hash table. Even more,
from that moment
+on, we do not use a linked list, but a lossy hash table that just
contains
+the last position for each bucket. Basically, after exceeding
GDK_ELIMLIMIT,
+we get a probabilistic/opportunistic duplicate elimination mechanism, +that only looks at the last GDK_ELIMLIMIT chunk in the heap, in a lossy
way.
+When comparing with the previous string implementation, the biggest
difference
+is that on 64-nbits bt with 32-bits oids, strings are always 8-byte
aligned
+and var_t numbers are multiplied by 8 to get the true offset. The goal
to do
+this is to allow 32-bits var_t on 64-bits systems to address 32GB (using
string
+alignment=8). For large database, the cost of padding (4 bytes avg) is
offset
+by the savings in var_t (allowing to go from 64- to 32-bits). Nothing
lost there,
+and 32-bits var_t also pay in smaller OIDs and smaller hash tables,
reducing memory
+pressure. For small duplicate eliminated heaps, the short indices +used in the hash table have now allowed more buckets (2K instead of 1K) +and average 2 bytes overhead for the next pointers instead of 6-12.
Therefore
+small heaps are now more compact than before. @- -Notice that this mechanism enables to keep a certain linear storage
property
-in the string heaps. This is important if we want to take a BATslice on
a BAT
-by simply loading or @strong{mmap()}ping slices of the BAT files on disk
into memory.
-This is relevant in order to process a very large BAT iteratively by
taking slices
-in order to reduce memory consumption. Notice that if there are few
different string
-values, the hash table has not overflowed, and the string heap size will
be small
-(i.e. < GDK_ELIMLIMIT), so in those cases it is not a problem to load
the entire string heap.
-If the hash table @strong{has} overflowed, we want to be able to only
map a slice of the
-string heap as well. Now, given that the first string in the BAT-slice
is called F1
-and its heap offset is O1 and the last string in the slice is F2 and its -offset is O2, then the slice we should take from the string heap is: -@example -GDK_ELIMBASE(F1) .. MAX(GDK_ELIMBASE(F2)+GDK_ELIMLIMIT), O2+strlen(F2)) -@end example The routine strElimDoubles() can be used to check whether all strings are still being double-eliminated in the original hash-table. Only then we know that unequal offset-integers in the BUN array means @@ -1877,16 +1865,12 @@ strHeap(Heap *d, size_t cap) { size_t size;
var_t *h, *e;
cap = MAX(cap, BATTINY);
size = (GDK_STRHASHTABLE + 1) * sizeof(var_t) + MIN(GDK_ELIMLIMIT,
cap * 12);
- size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap *
GDK_VARALIGN);
if (HEAPalloc(d, size, 1) >= 0) {
d->free = GDK_STRHASHTABLE * sizeof(var_t);
h = (var_t *) d->base;
for (e = h; e < h + GDK_STRHASHTABLE; e++) {
*e = 0;
}
d->free = GDK_STRHASHTABLE * sizeof(stridx_t);
}memset(d->base, 0, d->free);
}
@@ -1923,42 +1907,10 @@ void strCleanHash(Heap *h, int rebuild) {
- (void) rebuild; if (!GDK_ELIMDOUBLES(h)) { /* flush hash table for security */ memset(h->base, 0, GDK_STRHASHSIZE);
- } else if (rebuild) {
var_t xx, cur = 0, end = 0;
str hash = (str) h->base;
-/* int cnt[GDK_STRHASHTABLE]; */
/* collect all values in one big linked list */
for (xx = 0; xx < GDK_STRHASHTABLE; xx++) {
var_t yy = ((var_t *) hash)[xx];
-/* cnt[xx]=0; */
((var_t *) hash)[xx] = 0; /* clear hash table slot
*/
if (end) {
*(var_t *) (hash + end) = yy;
} else {
cur = yy;
}
for (; yy; yy = *(var_t *) (hash + yy))
end = yy;
}
/* process the linked list, inserting the values again */
for (; cur; cur = end) {
str val = hash + cur;
GDK_STRHASH(val + sizeof(var_t), xx);
xx &= GDK_STRHASHMASK;
end = *(var_t *) val;
*(var_t *) val = ((var_t *) hash)[xx];
((var_t *) hash)[xx] = cur;
-/* cnt[xx]++; */
}
-/* for(xx=0; xx<GDK_STRHASHTABLE; xx++)
- if(cnt[xx]) stream_printf(GDKout, "%5d %6d", xx, cnt[xx]); */ }
}
@@ -1970,90 +1922,63 @@ var_t strLocate(Heap *h, str v) {
- var_t *htab = (var_t *) h->base;
- var_t *l, *e;
- BUN idx;
- GDK_STRHASH(v, idx);
- e = htab + (idx & GDK_STRHASHMASK);
- if (*e) {
for (l = e; *l && *l < h->free; l = (var_t *) ((char *) h->base
- *l)) {
str x = (str) ((char *) h->base + *l + sizeof(var_t));
if (GDK_STRCMP(v, x) == 0) {
return *l + (var_t) sizeof(var_t);
}
}
- }
- return 0;
-}
- stridx_t *ref, *next;
-#if SIZEOF_SIZE_T == SIZEOF_INT -#define normal_vart_SWAP(x) ((var_t) normal_int_SWAP((int)x)) -#else -#define normal_vart_SWAP(x) ((var_t) long_long_SWAP((lng)x)) -#endif
- /* search hash-table, if double-elimination is still in place */
- BUN off; GDK_STRHASH(v, off);
- off &= GDK_STRHASHMASK;
-/* convert the integers in the implicit hash table structure */ -void -strHeapConvert(Heap *h, int dir) -{
- var_t *htab = (var_t *) h->base;
- var_t *l, i, j;
- /* should only use strLocate iff fully double eliminated */
- assert(GDK_ELIMBASE(h->free) == 0);
- if (dir == CONV_HTON) {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l =
(var_t *) (h->base + j)) {
*l = normal_vart_SWAP(j);
}
}
- } else {
for (i = 0; i < GDK_STRHASHTABLE; i++) {
for (l = htab + i; (j = *l) != 0 && j < h->free; l =
(var_t *) (h->base + *l)) {
*l = normal_vart_SWAP(j);
}
}
- /* search the linked list */
- for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0)
return (sizeof(stridx_t) + *ref) >> GDK_VARSHIFT; /*
found */
}
- return 0;
}
var_t strPut(Heap *h, var_t *dst, str v) {
- var_t *l;
- size_t i = GDK_STRLEN(v);
- BUN off;
- /* round up to var_t-byte alignment + var_t (next pointer) */
- size_t len = ((i + sizeof(var_t) - 1) & ~(sizeof(var_t) - 1)) +
sizeof(var_t);
- size_t elimlimit = GDK_ELIMBASE(h->free) + GDK_ELIMLIMIT;
size_t elimbase = GDK_ELIMBASE(h->free);
size_t pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN-1));
size_t pos, len = GDK_STRLEN(v);
stridx_t *bucket, *ref, *next;
/* search hash-table, if double-elimination is still in place */
- GDK_STRHASH(v, off);
- BUN off; GDK_STRHASH(v, off); off &= GDK_STRHASHMASK;
- for (l = ((var_t *) h->base) + off; *l && *l < h->free; l = (var_t *)
(h->base + *l)) {
str x = (str) (h->base + *l + sizeof(var_t));
- bucket = ((stridx_t *) h->base) + off;
if (GDK_STRCMP(v, x) == 0) {
*dst = *l + (var_t) sizeof(var_t); /* already in
heap;
do not insert! */
if (GDK_ELIMDOUBLES(h) == 0 && GDK_STRHASHCREDIT(h))
GDK_STRHASHCREDIT(h) += 4;
return *dst;
- if (elimbase == 0) { /* small string heap (<64KB) -- fully double
eliminated */
for (ref = bucket; *ref; ref = next) { /* search the linked
list */
next = (stridx_t*) (h->base + *ref);
if (GDK_STRCMP(v, (str) (next+1)) == 0) { /* found */
pos = sizeof(stridx_t) + *ref;
return *dst = (pos >> GDK_VARSHIFT);
}}
- }
- /* flush the hash table if it becomes too big (implies
!GDK_ELIMDOUBLES) */
- if (h->free + len >= elimlimit) {
/* if we are not hash-inserting anymore, h->free may no longer
be var_t aligned */
size_t mask = h->free & (sizeof(var_t) - 1);
if (mask)
h->free += sizeof(var_t) - mask; /* realign */
memset(h->base, 0, GDK_STRHASHSIZE); /* start over hash
inserting in a pristine hash table */
GDK_STRHASHCREDIT(h) = 32; /* only tolerate limited misses
in the future */
/* is there room for the next pointer in the padding space? */
if (pad < sizeof(stridx_t))
pad += GDK_VARALIGN; /* if not, pad more */
- } else {
/* large string heap (>=64KB) -- opportunistic/probabilistic
double elimination */
pos = elimbase + *bucket;
if (GDK_STRCMP(v, (str) (h->base + pos)) == 0) {
return *dst = (pos >> GDK_VARSHIFT); /* already in heap;
do not insert! */
}
+#if SIZEOF_VAR_T < SIZEOF_VOID_P
pad &= (GDK_VARALIGN-1); /* a full VARALIGN pad can be omitted
*/
+#else
pad = 0; /* only 32-bits var_t in 64-bits needs padding */
+#endif }
/* check heap for space (limited to a certain maximum after which
nils are inserted) */
- if (h->free + len >= h->size) {
- if (h->free + pad + len >= h->size) { /* Something really strange happens here, In a special case (Pentium II Klamath, gcc version 2.96 20000731, GNU assembler version 2.10.90 using BFD version 2.10.0.18)
@@ -2064,11 +1989,15 @@ */ float batmargin = (float) BATMARGIN; float hnewsize = h->size * batmargin;
size_t newsize = len + (size_t) hnewsize;
size_t newsize = pad + len + (size_t) hnewsize;
assert(newsize);
if (h->free + len < h->maxsize) {
if (h->free + pad + len >= (((size_t) VAR_MAX) <<
GDK_VARSHIFT)) {
GDKerror("strPut: string heaps gets larger than
%dGB.\n",
(((size_t) VAR_MAX) << GDK_VARSHIFT) >> 30);
return 0;
}
if (h->free + pad + len < h->maxsize) { /* if there is reserved space, first use the reserved
space */
newsize = MIN(newsize, h->maxsize); }
@@ -2077,32 +2006,27 @@ } /* fill should solve initialisation problems within valgrind */ memset(h->base + h->free, 0, h->size - h->free);
}
if (!GDK_ELIMDOUBLES(h)) {
if (GDK_STRHASHCREDIT(h) == 0) {
/* if credits are gone, we do not hash insert at all */
if (h->free > VAR_MAX) {
GDKerror("strPut: string heaps gets larger than
2GB
-- too large for 32-bits oids.\n");
return 0;
}
*dst = (var_t) h->free;
memcpy(h->base + h->free, v, i);
h->free += i; /* in this case, we do not round to
var_t either */
return *dst;
}
GDK_STRHASHCREDIT(h)--;
/* make bucket point into the enw heap */
}bucket = ((stridx_t *) h->base) + off;
- /* insert string in hash table and copy into the heap */
- l = (var_t *) (h->base + h->free);
- *(l++) = ((var_t *) h->base)[off];
- assert(h->free + sizeof(var_t) <= VAR_MAX);
- ((var_t *) h->base)[off] = (var_t) h->free;
- *dst = (var_t) (h->free + sizeof(var_t));
- h->free += len;
- memcpy((char *) l, v, i);
- /* insert string */
- pos = h->free + pad;
- *dst = pos >> GDK_VARSHIFT;
- memcpy(h->base + pos, v, len);
- h->free += pad + len;
- /* maintain hash table */
- if (elimbase == 0) { /* small string heap: link the next pointer */
pos -= sizeof(stridx_t); /* the stridx_t next pointer directly
precedes the string */
*(stridx_t*) (h->base + pos) = *bucket;
- }
- *bucket = (stridx_t) (pos - elimbase); /* set bucket to the new
string */
- if (h->free >= elimbase + GDK_ELIMLIMIT) {
memset(h->base, 0, GDK_STRHASHSIZE); /* flush hash table */
- } return *dst;
}
U gdk_bbp.mx Index: gdk_bbp.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bbp.mx,v retrieving revision 1.252 retrieving revision 1.253 diff -u -d -r1.252 -r1.253 --- gdk_bbp.mx 9 Apr 2009 15:29:18 -0000 1.252 +++ gdk_bbp.mx 13 Apr 2009 23:36:24 -0000 1.253 @@ -2965,9 +2965,9 @@ BATsetcount(&bs->B, 0);
if (bs->H.vheap)
memset(bs->H.vheap->base, 0, bs->H.vheap->free =
GDK_STRHASHTABLE * sizeof(var_t));
memset(bs->H.vheap->base, 0, bs->H.vheap->free =
GDK_STRHASHTABLE * sizeof(stridx_t));
if (bs->T.vheap)
memset(bs->T.vheap->base, 0, bs->T.vheap->free =
GDK_STRHASHTABLE * sizeof(var_t));
memset(bs->T.vheap->base, 0, bs->T.vheap->free =
GDK_STRHASHTABLE * sizeof(stridx_t));
return bs;
} BATDEBUG THRprintf(GDKout, "#BBPrecycle %d %d "SZFMT" (%d, %d %d %d)
N2\n", ht, tt, cap, BATCACHE_NOTYPE(ht), BATCACHE_NOTYPE(tt), batcache_maxbuckets, bin);
U gdk_batop.mx Index: gdk_batop.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_batop.mx,v retrieving revision 1.170 retrieving revision 1.171 diff -u -d -r1.170 -r1.171 --- gdk_batop.mx 7 Jan 2009 14:13:48 -0000 1.170 +++ gdk_batop.mx 13 Apr 2009 23:36:24 -0000 1.171 @@ -1656,7 +1656,7 @@ if (b->hkey == 0) { /* sorted and key? */ while (cur < end) {
char *val = base + *(((var_t
*)b->H-
heap.base)+ cur);
char *val = base + (*(((var_t
*)b->H-
heap.base)+ cur) << GDK_VARSHIFT);
if (cmp(prv, val) @5= 0) { key = FALSE;
@@ -1668,7 +1668,7 @@ } /* sorted? */ while (cur < end) {
char *val = base + *(((var_t *)b->H-
heap.base)+ cur);
char *val = base + (*(((var_t *)b->H-
heap.base)+ cur) << GDK_VARSHIFT);
if (cmp(prv, val) @5 0) { /* record negative position info
*/
U gdk_utils.mx Index: gdk_utils.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_utils.mx,v retrieving revision 1.246 retrieving revision 1.247 diff -u -d -r1.246 -r1.247 --- gdk_utils.mx 9 Apr 2009 18:48:25 -0000 1.246 +++ gdk_utils.mx 13 Apr 2009 23:36:24 -0000 1.247 @@ -382,7 +382,7 @@ #define GDK_MEM_NULLALLOWED
#if SIZEOF_VOID_P==8 -#define GDK_VM_MAXSIZE LL_CONSTANT(137438953472) /* :-) a 64-bit
OS: 128 GB */
+#define GDK_VM_MAXSIZE LL_CONSTANT(4398046511104) /* :-) a 64-bit
OS: 4TB */
#else #define GDK_VM_MAXSIZE LL_CONSTANT(1610612736) /* :-| a 32-bit OS:
1.5GB */
#endif
U gdk_heap.mx Index: gdk_heap.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_heap.mx,v retrieving revision 1.117 retrieving revision 1.118 diff -u -d -r1.117 -r1.118 --- gdk_heap.mx 20 Mar 2009 11:59:57 -0000 1.117 +++ gdk_heap.mx 13 Apr 2009 23:36:24 -0000 1.118 @@ -75,8 +75,20 @@ nice). @{ @c -int -HEAPalloc(Heap *h, size_t nitems, size_t itemsize) +size_t HEAPmargin(size_t maxsize) {
- size_t ret;
+#if SIZEOF_VOID_P == 8
- /* in 64-bits systems, try to enforce in-place realloc, but provoke
the memcpy on 256MB, then 4GB */
- size_t use = GDKvm_cursize();
- ret = MIN(GDK_mem_maxsize, MAX(((size_t) 1) << 26, 16*maxsize));
- if ((ret+ret) > (GDK_vm_maxsize-MIN(GDK_vm_maxsize,use))) /* only if
room */
+#endif
- ret = ((double) BATMARGIN)*maxsize - 1; /* do not waste VM on 32-bits
*/
- return (1 + (MAX(maxsize,ret) >> 16)) << 16; /* round up to 64K */
+}
+/* in 64-bits space, use very large margins to accomodate reallocations
*/
+int HEAPalloc(Heap *h, size_t nitems, size_t itemsize) { char nme[PATHLENGTH], *ext = NULL;
@@ -98,8 +110,7 @@ /* when using anonymous vm we malloc we need 64K chunks, also we * 20% extra malloc */ if (h->size > GDK_mem_bigsize) {
h->maxsize = (size_t) ((double) h->maxsize * BATMARGIN) - 1;
h->maxsize = (1 + (h->maxsize >> 16)) << 16;
} if (h->filename == NULL || (h->size < GDK_mmap_minsize)) { h->storage = STORE_MEM;h->maxsize = HEAPmargin(h->maxsize);
@@ -169,17 +180,14 @@ /* extend a malloced heap, possibly switching over to file-
mapped storage */
Heap bak = *h; int can_mmap = h->filename && (size >= GDK_mem_bigsize || h-
newstorage != STORE_MEM);
int must_mmap = can_mmap && (size >= GDK_mmap_minsize || h-
newstorage != STORE_MEM);
int small_cpy = (h->size*4 < size) && (size >=
GDK_mmap_minsize);
int must_mmap = can_mmap && (small_cpy || h->newstorage !=
STORE_MEM);
h->size = size; if (can_mmap) { /* in anonymous vm, if have to realloc anyway, we
reserve
some extra space */
if (size > h->maxsize) {
h->maxsize = (size_t) ((double) size *
BATMARGIN);
}
/* when using anonymous vm we malloc we need 64K chunks
*/
h->maxsize = (1 + ((h->maxsize - 1) >> 16)) << 16;
} else { h->maxsize = size; /* for normal GDKmalloc, maxsizeh->maxsize = HEAPmargin(MAX(size,h->maxsize));
= size */
}
@@ -514,9 +522,9 @@ #define HEAPVERSION 20030408
typedef struct heapheader {
- var_t head; /* index to first free block */
- size_t head; /* index to first free block */ int alignment; /* alignment of objects on heap */
- var_t firstblock; /* first block in heap */
- size_t firstblock; /* first block in heap */ int version; int (*sizefcn) (ptr); /* ADT function to ask length */
} HEADER32; @@ -524,8 +532,8 @@ typedef struct { int version; int alignment;
- var_t head;
- var_t firstblock;
- size_t head;
- size_t firstblock; int (*sizefcn) (ptr);
} HEADER64;
@@ -537,8 +545,8 @@ typedef HEADER64 HEADER_OTHER; #endif typedef struct hfblock {
- var_t size; /* Size of this block in freelist */
- var_t next; /* index of next block */
- size_t size; /* Size of this block in freelist */
- size_t next; /* index of next block */
} CHUNK;
@c @@ -546,13 +554,13 @@ #define roundup_4(x) (((x)+3)&~3) #define blocksize(h,p) ((p)->size)
-static INLINE var_t -roundup_num(var_t number, int alignment) +static INLINE size_t +roundup_num(size_t number, int alignment) {
- var_t rval;
- size_t rval;
- rval = number + (var_t) alignment - 1;
- rval -= (rval % (var_t) alignment);
- rval = number + (size_t) alignment - 1;
- rval -= (rval % (size_t) alignment); return rval;
}
@@ -560,7 +568,7 @@ HEAP_private(Heap *h) { (void) h;
- return (var_t) roundup_8(sizeof(HEADER));
- return (var_t) (roundup_8(sizeof(HEADER)) >> GDK_VARSHIFT);
}
#ifdef TRACE @@ -568,7 +576,7 @@ HEAP_printstatus(Heap *heap) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t block, cur_free = hheader->head;
size_t block, cur_free = hheader->head; CHUNK *blockp;
THRprintf(GDKout, "#HEAP has head " SZFMT " and alignment %d and size
" SZFMT "\n",
@@ -591,7 +599,7 @@ cur_free = blockp->next; block += blockp->size; } else {
var_t size = blocksize(hheader, blockp);
size_t size = blocksize(hheader, blockp); THRprintf(GDKout, "# block at " SZFMT " with size "
SZFMT "\n", block, size);
block += size;
@@ -611,7 +619,7 @@ /* // Calculate position of first and only free block. */
- var_t head = roundup_num((var_t) (roundup_8(sizeof(HEADER)) +
roundup_8(nprivate)), alignment);
- size_t head = roundup_num((size_t) (roundup_8(sizeof(HEADER)) +
roundup_8(nprivate)), alignment);
CHUNK *headp = HEAP_index(heap, head, CHUNK);
assert(roundup_8(sizeof(HEADER)) + roundup_8(nprivate) <= VAR_MAX); @@ -629,7 +637,7 @@ // Fill first free block. */ assert(heap->size - head <= VAR_MAX);
- headp->size = (var_t) (heap->size - head);
- headp->size = (size_t) (heap->size - head); headp->next = 0;
#ifdef TRACE THRprintf(GDKout, "#We created the following heap\n"); @@ -669,9 +677,9 @@
var_t -HEAP_malloc(Heap *heap, var_t nbytes) +HEAP_malloc(Heap *heap, size_t nbytes) {
- var_t block, trail, ttrail;
- size_t block, trail, ttrail; CHUNK *blockp; CHUNK *trailp; HEADER *hheader = HEAP_index(heap, 0, HEADER);
@@ -682,15 +690,9 @@
/* add space for size field */ nbytes += hheader->alignment;
- if (hheader->alignment == 8) {
nbytes = roundup_8(nbytes);
- } else if (hheader->alignment == 4) {
nbytes = roundup_4(nbytes);
- } else {
GDKfatal("HEAP_malloc: Heap structure corrupt\n");
- }
- nbytes = roundup_8(nbytes); if (nbytes < sizeof(CHUNK))
nbytes = (var_t) sizeof(CHUNK);
nbytes = (size_t) sizeof(CHUNK);
/* // block -- points to block with acceptable size (if available).
@@ -718,12 +720,12 @@ // If no block of acceptable size is found we try to enlarge the
heap.
*/
if (block == 0) {
var_t newsize;
size_t newsize;
assert(heap->free + MAX(heap->free, nbytes) <= VAR_MAX);
newsize = (var_t) roundup_8(heap->free + MAX(heap->free,
nbytes));
newsize = (size_t) roundup_8(heap->free + MAX(heap->free,
nbytes));
assert(heap->free <= VAR_MAX);
block = (var_t) heap->free; /* current end-of-heap */
block = (size_t) heap->free; /* current end-of-heap */
#ifdef TRACE THRprintf(GDKout, "#No block found\n"); @@ -747,7 +749,7 @@
blockp->next = 0; assert(heap->free - block <= VAR_MAX);
blockp->size = (var_t) (heap->free - block); /* determine
size of allocated block */
blockp->size = (size_t) (heap->free - block); /* determine
size of allocated block */
/* // Try to join the last block in the freelist and the newly
allocated
@@ -778,7 +780,7 @@ // TUNE: use different amount than 2*sizeof(CHUNK) */ if (blockp->size >= nbytes + 2 * sizeof(CHUNK)) {
var_t newblock = block + nbytes;
size_t newblock = block + nbytes;
CHUNK *newblockp = HEAP_index(heap, newblock, CHUNK);
newblockp->size = blockp->size - nbytes;
@@ -800,17 +802,17 @@ }
block += hheader->alignment;
- return block;
- return block >> GDK_VARSHIFT;
}
void -HEAP_free(Heap *heap, var_t block) +HEAP_free(Heap *heap, var_t mem) { HEADER *hheader = HEAP_index(heap, 0, HEADER); CHUNK *beforep; CHUNK *blockp; CHUNK *afterp;
- var_t after, before;
size_t after, before, block = mem << GDK_VARSHIFT;
if (hheader->alignment != 8 && hheader->alignment != 4) { GDKfatal("HEAP_free: Heap structure corrupt\n");
@@ -884,10 +886,10 @@ HEAP_check(Heap *heap, HeapRepair *hr) { HEADER *hheader = HEAP_index(heap, 0, HEADER);
- var_t head = hheader->head, alignshift = 2;
- var_t block, nwords = (var_t) ((heap->free - 1) >> 7);
- size_t head = hheader->head, alignshift = 2;
- size_t block, nwords = (size_t) ((heap->free - 1) >> 7); int *freemask;
- var_t prevblock = 0;
size_t prevblock = 0; CHUNK *blockp;
hr->alignment = hheader->alignment;
@@ -922,8 +924,8 @@ // Walk the freelist; register them in freemask */ for (block = hheader->head; block != 0; block = HEAP_index(heap,
block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if ((block <= prevblock) && (block != 0)) {
@@ -942,8 +944,8 @@ */ block = hheader->firstblock; while (block < heap->free) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
hr->validmask[pos] |= mask;
@@ -965,8 +967,8 @@ // Check if there are left over free blocks */ for (block = hheader->head; block != 0; block = HEAP_index(heap,
block, CHUNK)->next) {
var_t idx = block >> alignshift;
var_t pos = idx >> 5;
size_t idx = block >> alignshift;
size_t pos = idx >> 5;
int mask = 1 << (idx & 31);
if (freemask[pos] & mask) {
@@ -1046,14 +1048,14 @@ if (hheader->head > heap->free) { hheader->head = 0; /* cut off free block */ } else if (hheader->head) {
var_t idx = hheader->head;
size_t idx = hheader->head;
while (idx) { CHUNK *blk = HEAP_index(heap, idx, CHUNK);
if (idx + blk->size > heap->free) { assert(heap->free - idx <= VAR_MAX);
blk->size = (var_t) (heap->free - idx); /* cut
off illegal tail of block */
blk->size = (size_t) (heap->free - idx);
/* cut
off illegal tail of block */
} if (blk->next > heap->free || blk->next < (idx + blk-
size) || (blk->next & (hheader->alignment - 1))) { blk->next = 0; /* cut off next block */
U gdk_qsort.mx Index: gdk_qsort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_qsort.mx,v retrieving revision 1.34 retrieving revision 1.35 diff -u -d -r1.34 -r1.35 --- gdk_qsort.mx 7 Jan 2009 14:13:48 -0000 1.34 +++ gdk_qsort.mx 13 Apr 2009 23:36:24 -0000 1.35 @@ -86,6 +86,7 @@ char *offst; /* NULL or start of varsized heap */ int hshift; /* log2 of hs */ int tshift; /* log2 of hs */
- int vshift; /* shift to be applied on var_ offsets */ unsigned hs; /* width of head record */ unsigned ts; /* width of tail record */ void *h; /* start of head buns */
@@ -189,7 +190,7 @@ #endif #endif
-#define offset(p) (buf->offst + *(var_t*) (p)) +#define offset(p) (buf->offst + ((*(var_t*) (p)) << buf- vshift)) #define direct(p) (p)
#define any_LE(a,b) ((buf->cmp)(a,b) <= 0) @@ -432,6 +433,7 @@ buf.ts = (unsigned) ts; buf.hshift = ATOMelmshift(hs); buf.tshift = ATOMelmshift(ts);
- buf.vshift = ATOMvarsized(tpe)?GDK_VARSHIFT:0; buf.h = h; if (!t) t = &x;
U gdk_bat.mx Index: gdk_bat.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_bat.mx,v retrieving revision 1.214 retrieving revision 1.215 diff -u -d -r1.214 -r1.215 --- gdk_bat.mx 3 Apr 2009 09:14:52 -0000 1.214 +++ gdk_bat.mx 13 Apr 2009 23:36:24 -0000 1.215 @@ -434,7 +434,7 @@ BAT * BATextend(BAT *b, BUN newcap) {
- size_t hheap_size, theap_size;
size_t hheap_size = newcap, theap_size = newcap;
assert(newcap <= BUN_MAX); BATcheck(b, "BATextend");
@@ -453,10 +453,10 @@
b->batCapacity = newcap;
- hheap_size = Hsize(b) * newcap;
- hheap_size *= Hsize(b); if (b->H->heap.base && HEAPextend(&b->H->heap, hheap_size) < 0) return NULL;
- theap_size = Tsize(b) * newcap;
- theap_size *= Tsize(b); if (b->T->heap.base && HEAPextend(&b->T->heap, theap_size) < 0) return NULL; HASHdestroy(b);
U gdk_ssort.mx Index: gdk_ssort.mx =================================================================== RCS file: /cvsroot/monetdb/MonetDB/src/gdk/gdk_ssort.mx,v retrieving revision 1.15 retrieving revision 1.16 diff -u -d -r1.15 -r1.16 --- gdk_ssort.mx 7 Jan 2009 14:13:48 -0000 1.15 +++ gdk_ssort.mx 13 Apr 2009 23:36:24 -0000 1.16 @@ -203,8 +203,8 @@ } \ } while (0)
-#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap +
- (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)->compare)((X),
(Y))) < 0)
-#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)- heap + * (var_t *) (X), (ms)->heap + * (var_t *) (Y)) : (*(ms)- compare)((X), (Y))) > 0) +#define ISLT_any(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)->heap +
((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) << GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) < 0)
+#define ISLT_any_rev(X, Y, ms) (((ms)->heap ? (*(ms)->compare)((ms)- heap + ((*(var_t*) (X)) << GDK_VARSHIFT), (ms)->heap + ((*(var_t*) (Y)) <<
GDK_VARSHIFT)) : (*(ms)->compare)((X), (Y))) > 0)
@= islt #define ISLT_@1@2(X, Y, ms) (* (@1 *) (X) @3 * (@1 *) (Y))
This SF.net email is sponsored by: High Quality Requirements in a Collaborative Environment. Download a free trial of Rational Requirements Composer Now! http://p.sf.net/sfu/www-ibm-com _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
-- Sjoerd Mullender
No virus found in this incoming message. Checked by AVG - www.avg.com Version: 8.0.238 / Virus Database: 270.11.20/2013 - Release Date: 04/14/09 06:17:00
This SF.net email is sponsored by: High Quality Requirements in a Collaborative Environment. Download a free trial of Rational Requirements Composer Now! http://p.sf.net/sfu/www-ibm-com _______________________________________________ Monetdb-checkins mailing list Monetdb-checkins@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/monetdb-checkins
participants (5)
-
Henning Rode
-
Henning Rode
-
Peter Boncz
-
Sjoerd Mullender
-
Stefan Manegold