Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * @a Lefteris Sidirourgos, Hannes Muehleisen
15 : * @* Low level sample facilities
16 : *
17 : * This sampling implementation generates a sorted set of OIDs by
18 : * calling the random number generator, and uses a binary tree to
19 : * eliminate duplicates. The elements of the tree are then used to
20 : * create a sorted sample BAT. This implementation has a logarithmic
21 : * complexity that only depends on the sample size.
22 : *
23 : * There is a pathological case when the sample size is almost the
24 : * size of the BAT. Then, many collisions occur and performance
25 : * degrades. To catch this, we switch to antiset semantics when the
26 : * sample size is larger than half the BAT size. Then, we generate the
27 : * values that should be omitted from the sample.
28 : */
29 :
30 : #include "monetdb_config.h"
31 : #include "gdk.h"
32 : #include "gdk_private.h"
33 : #include "xoshiro256starstar.h"
34 :
35 : /* this is a straightforward implementation of a binary tree */
36 : struct oidtreenode {
37 : union {
38 : struct { /* use as a binary tree */
39 : oid o;
40 : struct oidtreenode *left;
41 : struct oidtreenode *right;
42 : };
43 : uint64_t r; /* temporary storage for random numbers */
44 : };
45 : };
46 :
47 : static bool
48 185684138 : OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
49 : {
50 185684138 : struct oidtreenode **nodep;
51 :
52 185684138 : if (allocated == 0) {
53 342595 : tree->left = tree->right = NULL;
54 342595 : tree->o = o;
55 342595 : return true;
56 : }
57 : nodep = &tree;
58 2002786313 : while (*nodep) {
59 1845712423 : if (o == (*nodep)->o)
60 : return false;
61 1817444770 : if (o < (*nodep)->o)
62 908866404 : nodep = &(*nodep)->left;
63 : else
64 908578366 : nodep = &(*nodep)->right;
65 : }
66 157073890 : *nodep = &tree[allocated];
67 157073890 : tree[allocated].left = tree[allocated].right = NULL;
68 157073890 : tree[allocated].o = o;
69 157073890 : return true;
70 : }
71 :
72 : /* inorder traversal, gives us a sorted BAT */
73 : static void
74 26317643 : OIDTreeToBAT(struct oidtreenode *node, BAT *bn)
75 : {
76 52588410 : if (node->left != NULL)
77 26265049 : OIDTreeToBAT(node->left, bn);
78 52588410 : ((oid *) bn->theap->base)[bn->batCount++] = node->o;
79 52588410 : if (node->right != NULL )
80 : OIDTreeToBAT(node->right, bn);
81 26317643 : }
82 :
83 : /* Antiset traversal, give us all values but the ones in the tree */
84 : static void
85 52558475 : OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bn, oid start, oid stop)
86 : {
87 104828075 : oid noid;
88 :
89 104828075 : if (node->left != NULL)
90 52268474 : OIDTreeToBATAntiset(node->left, bn, start, node->o);
91 : else
92 197574173 : for (noid = start; noid < node->o; noid++)
93 145014572 : ((oid *) bn->theap->base)[bn->batCount++] = noid;
94 :
95 104828075 : if (node->right != NULL)
96 52269600 : OIDTreeToBATAntiset(node->right, bn, node->o + 1, stop);
97 : else
98 197541023 : for (noid = node->o+1; noid < stop; noid++)
99 144982548 : ((oid *) bn->theap->base)[bn->batCount++] = noid;
100 52558475 : }
101 :
102 : static BAT *
103 1060176 : do_batsample(oid hseq, BUN cnt, BUN n, random_state_engine rse, MT_Lock *lock)
104 : {
105 1060176 : BAT *bn;
106 1060176 : BUN slen;
107 1060176 : BUN rescnt;
108 1060176 : struct oidtreenode *tree = NULL;
109 :
110 1060176 : ERRORcheck(n > BUN_MAX, "sample size larger than BUN_MAX\n", NULL);
111 : /* empty sample size */
112 1060176 : if (n == 0) {
113 2 : bn = BATdense(0, 0, 0);
114 1060174 : } else if (cnt <= n) {
115 : /* sample size is larger than the input BAT, return
116 : * all oids */
117 717579 : bn = BATdense(0, hseq, cnt);
118 : } else {
119 342595 : oid minoid = hseq;
120 342595 : oid maxoid = hseq + cnt;
121 :
122 : /* if someone samples more than half of our tree, we
123 : * do the antiset */
124 342595 : bool antiset = n > cnt / 2;
125 342595 : slen = n;
126 342595 : if (antiset)
127 290001 : n = cnt - n;
128 :
129 342595 : tree = GDKmalloc(n * sizeof(struct oidtreenode));
130 342595 : if (tree == NULL) {
131 : return NULL;
132 : }
133 342595 : bn = COLnew(0, TYPE_oid, slen, TRANSIENT);
134 342594 : if (bn == NULL) {
135 0 : GDKfree(tree);
136 0 : return NULL;
137 : }
138 :
139 342594 : if (lock)
140 342584 : MT_lock_set(lock);
141 : /* generate a list of random numbers; note we use the
142 : * "tree" array, but we use the value from each location
143 : * before it is overwritten by the use as part of the
144 : * binary tree */
145 157759080 : for (rescnt = 0; rescnt < n; rescnt++)
146 157416485 : tree[rescnt].r = next(rse);
147 :
148 : /* while we do not have enough sample OIDs yet */
149 : BUN rnd = 0;
150 157759080 : for (rescnt = 0; rescnt < n; rescnt++) {
151 185684138 : oid candoid;
152 185684138 : do {
153 185684138 : if (rnd == n) {
154 : /* we ran out of random numbers,
155 : * so generate more */
156 29504172 : for (rnd = rescnt; rnd < n; rnd++)
157 28267653 : tree[rnd].r = next(rse);
158 : rnd = rescnt;
159 : }
160 185684138 : candoid = minoid + tree[rnd++].r % cnt;
161 : /* if that candidate OID was already
162 : * generated, try again */
163 185684138 : } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
164 : }
165 342595 : if (lock)
166 342585 : MT_lock_unset(lock);
167 342595 : if (!antiset) {
168 52594 : OIDTreeToBAT(tree, bn);
169 : } else {
170 290001 : OIDTreeToBATAntiset(tree, bn, minoid, maxoid);
171 : }
172 342595 : GDKfree(tree);
173 :
174 342595 : BATsetcount(bn, slen);
175 342595 : bn->trevsorted = bn->batCount <= 1;
176 342595 : bn->tsorted = true;
177 342595 : bn->tkey = true;
178 342595 : bn->tseqbase = bn->batCount == 0 ? 0 : bn->batCount == 1 ? *(oid *) Tloc(bn, 0) : oid_nil;
179 : }
180 : return bn;
181 : }
182 :
183 : /* BATsample implements sampling for BATs */
184 : BAT *
185 10 : BATsample_with_seed(BAT *b, BUN n, uint64_t seed)
186 : {
187 10 : random_state_engine rse;
188 :
189 10 : init_random_state_engine(rse, seed);
190 :
191 10 : BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, NULL);
192 10 : TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
193 : ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
194 10 : return bn;
195 : }
196 :
197 : static MT_Lock rse_lock = MT_LOCK_INITIALIZER(rse_lock);
198 : BAT *
199 1060166 : BATsample(BAT *b, BUN n)
200 : {
201 1060166 : static random_state_engine rse;
202 :
203 1060166 : MT_lock_set(&rse_lock);
204 1060166 : if (rse[0] == 0 && rse[1] == 0 && rse[2] == 0 && rse[3] == 0)
205 325 : init_random_state_engine(rse, (uint64_t) GDKusec());
206 1060166 : MT_lock_unset(&rse_lock);
207 1060166 : BAT *bn = do_batsample(b->hseqbase, BATcount(b), n, rse, &rse_lock);
208 1060164 : TRC_DEBUG(ALGO, ALGOBATFMT "," BUNFMT " -> " ALGOOPTBATFMT "\n",
209 : ALGOBATPAR(b), n, ALGOOPTBATPAR(bn));
210 1060164 : return bn;
211 : }
|