Line data Source code
1 : /*
2 : * SPDX-License-Identifier: MPL-2.0
3 : *
4 : * This Source Code Form is subject to the terms of the Mozilla Public
5 : * License, v. 2.0. If a copy of the MPL was not distributed with this
6 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 : *
8 : * Copyright 2024 MonetDB Foundation;
9 : * Copyright August 2008 - 2023 MonetDB B.V.;
10 : * Copyright 1997 - July 2008 CWI.
11 : */
12 :
13 : /*
14 : * N. Nes
15 : * PCRE library interface
16 : * The PCRE library is a set of functions that implement regular
17 : * expression pattern matching using the same syntax and semantics as Perl,
18 : * with just a few differences. The current implementation of PCRE
19 : * (release 4.x) corresponds approximately with Perl 5.8, including support
20 : * for UTF-8 encoded strings. However, this support has to be
21 : * explicitly enabled; it is not the default.
22 : *
23 : * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
24 : */
25 : #include "monetdb_config.h"
26 : #include <string.h>
27 :
28 : #include "mal.h"
29 : #include "mal_client.h"
30 : #include "mal_interpreter.h"
31 : #include "mal_exception.h"
32 :
33 : #include <wchar.h>
34 : #include <wctype.h>
35 :
36 : #ifdef HAVE_LIBPCRE
37 : #include <pcre.h>
38 : #ifndef PCRE_STUDY_JIT_COMPILE
39 : /* old library version on e.g. EPEL 6 */
40 : #define pcre_free_study(x) pcre_free(x)
41 : #define PCRE_STUDY_JIT_COMPILE 0
42 : #endif
43 : #define JIT_COMPILE_MIN 1024 /* when to try JIT compilation of patterns */
44 :
45 : #else
46 :
47 : #include <regex.h>
48 :
49 : typedef regex_t pcre;
50 : #endif
51 :
/* One segment of a simple LIKE pattern; the current implementation
 * assumes patterns of the form %keyword% [keyw%]*.  A compiled pattern
 * is a singly linked list of these segments (see re_create). */
struct RE {
	char *k;				/* segment text as bytes, used for byte-wise matching */
	uint32_t *w;			/* segment text as code points, used for
							 * case-insensitive matching of non-ASCII patterns */
	bool search:1;			/* a % precedes the segment: it may start anywhere */
	bool atend:1;			/* no % follows the segment: it must reach the end */
	bool is_ascii:1;		/* copied from re_create's ascii_pattern argument */
	bool case_ignore:1;		/* byte-wise matching must ignore ASCII case */
	size_t len;				/* segment length: bytes of k, or code points of w */
	struct RE *n;			/* next segment, NULL for the last one */
};
60 :
61 : /* We cannot use strcasecmp and strncasecmp since they work byte for
62 : * byte and don't deal with multibyte encodings (such as UTF-8).
63 : *
64 : * We implement our own conversion from UTF-8 encoding to Unicode code
65 : * points which we store in uint32_t. The reason for this is,
66 : * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
67 : * locale to use them), and on Windows, wchar_t is only 2 bytes and
68 : * therefore cannot hold all Unicode code points. We do use functions
69 : * such as towlower to convert a Unicode code point to its lower-case
70 : * equivalent, but again on Windows, if the code point doesn't fit in
71 : * 2 bytes, we skip this conversion and compare the unconverted code
72 : * points.
73 : *
74 : * Note, towlower is also locale-dependent, but we don't need a UTF-8
75 : * locale in order to use it. */
76 :
/* Decode the UTF-8 sequence at the start of src into one Unicode code
 * point, which is stored in *dest.
 *
 * Returns the number of bytes consumed (1 to 4), 0 if src starts with
 * the terminating NUL byte (*dest is then set to 0), or (size_t) -1 if
 * src does not start with a valid UTF-8 sequence (*dest is then left
 * untouched).  Overlong encodings, UTF-16 surrogates (U+D800..U+DFFF)
 * and values above U+10FFFF are rejected.
 *
 * A continuation byte is only inspected after all bytes before it have
 * been found to be non-NUL, so the terminating NUL of src is never
 * passed. */
static size_t
utfc8touc(uint32_t *restrict dest, const char *restrict src)
{
	/* use unsigned bytes so the masks below do not depend on whether
	 * plain char is signed */
	const unsigned char *s = (const unsigned char *) src;
	uint32_t c;

	if ((s[0] & 0x80) == 0) {
		*dest = s[0];
		return s[0] != 0;
	}
	if ((s[0] & 0xE0) == 0xC0) {
		/* lead bytes 0xC0 and 0xC1 can only start overlong encodings */
		if ((s[0] & 0x1E) == 0 || (s[1] & 0xC0) != 0x80)
			return (size_t) -1;
		*dest = (uint32_t) (s[0] & 0x1F) << 6 | (s[1] & 0x3F);
		return 2;
	}
	if ((s[0] & 0xF0) == 0xE0) {
		if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (s[0] & 0x0F) << 12
			| (uint32_t) (s[1] & 0x3F) << 6 | (s[2] & 0x3F);
		/* reject overlong encodings and the surrogate range, which can
		 * only be expressed with three bytes */
		if (c < 0x800 || (c >= 0xD800 && c <= 0xDFFF))
			return (size_t) -1;
		*dest = c;
		return 3;
	}
	if ((s[0] & 0xF8) == 0xF0) {
		if ((s[1] & 0xC0) != 0x80
			|| (s[2] & 0xC0) != 0x80 || (s[3] & 0xC0) != 0x80)
			return (size_t) -1;
		c = (uint32_t) (s[0] & 0x07) << 18
			| (uint32_t) (s[1] & 0x3F) << 12
			| (uint32_t) (s[2] & 0x3F) << 6 | (s[3] & 0x3F);
		/* reject overlong encodings and values beyond Unicode */
		if (c < 0x10000 || c > 0x10FFFF)
			return (size_t) -1;
		*dest = c;
		return 4;
	}
	/* stray continuation byte or invalid lead byte */
	return (size_t) -1;
}
108 :
/* Convert the NUL-terminated UTF-8 string src into a newly allocated,
 * zero-terminated array of Unicode code points.
 *
 * Returns NULL if src contains a sequence that utfc8touc rejects or if
 * the allocation fails.  The result is allocated with GDKmalloc; the
 * caller is responsible for releasing it with GDKfree.
 *
 * Decoding is delegated to utfc8touc so that both functions always
 * agree on what counts as valid UTF-8. */
static uint32_t *
utf8stoucs(const char *src)
{
	uint32_t *dest;
	uint32_t c;
	size_t ncodepoints = 0;
	size_t nbytes;

	/* first pass: validate the input and count the code points */
	for (const char *p = src; *p; p += nbytes) {
		nbytes = utfc8touc(&c, p);
		/* *p is not NUL, so utfc8touc cannot return 0 here */
		if (nbytes == (size_t) -1)
			return NULL;
		ncodepoints++;
	}
	dest = GDKmalloc((ncodepoints + 1) * sizeof(uint32_t));
	if (dest == NULL)
		return NULL;
	/* second pass: the input is known to be valid, just decode it */
	ncodepoints = 0;
	for (const char *p = src; *p; p += nbytes)
		nbytes = utfc8touc(&dest[ncodepoints++], p);
	dest[ncodepoints] = 0;
	return dest;
}
173 :
/* Return the number of code points in the zero-terminated array ucs,
 * not counting the terminating zero. */
static size_t
myucslen(const uint32_t *ucs)
{
	const uint32_t *end = ucs;

	while (*end != 0)
		end++;
	return (size_t) (end - ucs);
}
183 :
/* Case-insensitively compare the first n2 code points of the UTF-8
 * string s1 with the code point array s2.
 *
 * Returns true if they are equal and, when atend is set, s1 ends
 * right after them.  If s1 ends or becomes undecodable before n2 code
 * points were compared, the result is true only if s2 ends at that
 * same position. */
static inline bool
mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2,
			  bool atend)
{
	uint32_t cp;

	for (; n2 > 0; n2--, s2++) {
		size_t nbytes = utfc8touc(&cp, s1);

		if (nbytes == 0 || nbytes == (size_t) -1)
			return *s2 == 0;
		if (*s2 == 0)
			return false;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit in a
		 * 2-byte wchar_t: compare those unconverted */
		if (cp > 0xFFFF || *s2 > 0xFFFF) {
			if (cp != *s2)
				return false;
		} else
#endif
		if (towlower((wint_t) cp) != towlower((wint_t) *s2))
			return false;
		s1 += nbytes;
	}
	if (atend)
		return *s1 == 0;
	return true;
}
210 :
/* Case-insensitively compare two UTF-8 strings code point by code
 * point.
 *
 * Returns 0 if both strings end (or become undecodable) at the same
 * position, a negative value if s1 ends first or sorts before s2, and
 * a positive value otherwise. */
static inline int
mystrcasecmp(const char *s1, const char *s2)
{
	uint32_t cp1 = 0, cp2 = 0;
	size_t nb1, nb2;

	for (;; s1 += nb1, s2 += nb2) {
		nb1 = utfc8touc(&cp1, s1);
		nb2 = utfc8touc(&cp2, s2);

		const bool end1 = nb1 == 0 || nb1 == (size_t) -1;
		const bool end2 = nb2 == 0 || nb2 == (size_t) -1;

		if (end1)
			return end2 ? 0 : -1;
		if (end2)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit in a
		 * 2-byte wchar_t: compare those unconverted */
		if (cp1 > 0xFFFF || cp2 > 0xFFFF) {
			if (cp1 != cp2)
				return cp1 - cp2;
		} else
#endif
		if (towlower((wint_t) cp1) != towlower((wint_t) cp2))
			return towlower((wint_t) cp1) - towlower((wint_t) cp2);
	}
}
235 :
/* Case-insensitively compare the UTF-8 string s1 with the
 * zero-terminated code point array s2.
 *
 * Returns 0 if s2 ends where s1 ends (or becomes undecodable), a
 * negative value if s1 ends first or sorts before s2, and a positive
 * value otherwise. */
static inline int
mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
{
	uint32_t cp = 0;
	size_t nbytes;

	while ((nbytes = utfc8touc(&cp, s1)) != 0 && nbytes != (size_t) -1) {
		if (*s2 == 0)
			return 1;
#if SIZEOF_WCHAR_T == 2
		/* towlower cannot handle code points that do not fit in a
		 * 2-byte wchar_t: compare those unconverted */
		if (cp > 0xFFFF || *s2 > 0xFFFF) {
			if (cp != *s2)
				return cp - *s2;
		} else
#endif
		if (towlower((wint_t) cp) != towlower((wint_t) *s2))
			return towlower((wint_t) cp) - towlower((wint_t) *s2);
		s1 += nbytes;
		s2++;
	}
	/* s1 is exhausted: equal only if s2 is exhausted as well */
	return -(*s2 != 0);
}
259 :
/* Find the first case-insensitive occurrence of the code point array
 * wneedle in the UTF-8 string haystack.  When atend is set, only an
 * occurrence that ends exactly at the end of haystack counts.
 *
 * Returns a pointer to the start of the occurrence, or NULL if there
 * is none or if haystack cannot be decoded while searching.  An empty
 * needle matches at the start of haystack, or at its terminating NUL
 * when atend is set. */
static inline const char *
mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle,
			  bool atend)
{
	const size_t nlen = myucslen(wneedle);

	if (nlen == 0) {
		if (atend)
			return haystack + strlen(haystack);
		return haystack;
	}

	/* try every character position of haystack in turn; step is the
	 * byte length of the character at the current position */
	for (size_t step = 0; *haystack; haystack += step) {
		size_t matched = 0;		/* needle code points matched so far */
		size_t used = 0;		/* haystack bytes they occupy */

		step = 0;
		while (matched < nlen) {
			uint32_t cp = 0;
			size_t nbytes = utfc8touc(&cp, haystack + used);

			if (nbytes == 0 || nbytes == (size_t) -1)
				return NULL;
			if (matched == 0)
				step = nbytes;
#if SIZEOF_WCHAR_T == 2
			/* towlower cannot handle code points that do not fit in
			 * a 2-byte wchar_t: compare those unconverted */
			if (cp > 0xFFFF || wneedle[matched] > 0xFFFF) {
				if (cp != wneedle[matched])
					break;
			} else
#endif
			if (towlower((wint_t) cp) != towlower((wint_t) wneedle[matched]))
				break;
			used += nbytes;
			matched++;
		}
		if (matched == nlen && (!atend || haystack[used] == 0))
			return haystack;
	}
	return NULL;
}
297 :
/* Returns true if the pattern does not contain an unescaped `_'
 * (single character match); returns false for a NULL pattern.
 *
 * A `%' in the very first position is skipped before escape
 * processing starts, so it is never taken for the escape character.
 * Note that the pattern is not checked for a trailing `%'. */
static inline bool
re_simple(const char *pat, unsigned char esc)
{
	if (pat == NULL)
		return false;
	if (*pat == '%')
		pat++;
	for (bool escaped = false; *pat; pat++) {
		if (escaped)
			escaped = false;	/* this character is taken literally */
		else if ((unsigned char) *pat == esc)
			escaped = true;
		else if (*pat == '_')
			return false;
	}
	return true;
}
323 :
/* Returns false if the pattern ends with a dangling escape character,
 * i.e. an unescaped escape character with nothing after it; returns
 * true otherwise, including for a NULL pattern. */
static inline bool
re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
{
	bool pending = false;		/* previous character was an active escape */

	if (pat == NULL)
		return true;
	for (; *pat; pat++)
		pending = !pending && (unsigned char) *pat == esc;
	return !pending;
}
341 :
/* Returns true if the pattern can be matched with a plain string
 * comparison: it contains no wildcard character (`%' or `_') and the
 * escape string is empty, nil, or does not occur in the pattern. */
static inline bool
is_strcmpable(const char *pat, const char *esc)
{
	if (strpbrk(pat, "%_") != NULL)
		return false;
	if (strlen(esc) == 0 || strNil(esc))
		return true;
	return strstr(pat, esc) == NULL;
}
352 :
/* Compare two strings byte by byte, ignoring the case of the ASCII
 * letters A-Z.  Returns 0 if the strings are equal and otherwise the
 * difference between the first pair of (lower-cased) bytes that
 * differ. */
static int
istrcmp(const char *s1, const char *s2)
{
	while (*s1 != 0 && *s2 != 0) {
		char a = *s1++;
		char b = *s2++;

		if (a >= 'A' && a <= 'Z')
			a += 'a' - 'A';
		if (b >= 'A' && b <= 'Z')
			b += 'a' - 'A';
		if (a != b)
			return a - b;
	}
	/* at least one string is exhausted; this is 0 only if both are */
	return *s1 - *s2;
}
380 :
/* Compare at most len bytes of two strings, ignoring the case of the
 * ASCII letters A-Z.  Returns 0 if the compared parts are equal and
 * otherwise the difference between the first pair of (lower-cased)
 * bytes that differ. */
static int
istrncmp(const char *s1, const char *s2, size_t len)
{
	size_t left = len;			/* bytes still allowed to be compared */

	while (left > 0 && *s1 != 0 && *s2 != 0) {
		char a = *s1;
		char b = *s2;

		if (a >= 'A' && a <= 'Z')
			a += 'a' - 'A';
		if (b >= 'A' && b <= 'Z')
			b += 'a' - 'A';
		if (a != b)
			return a - b;
		s1++;
		s2++;
		left--;
	}
	/* one string ended before len bytes were compared */
	if (left > 0 && *s1 != *s2)
		return *s1 - *s2;
	return 0;
}
411 :
412 :
/* Find the first occurrence of the substring needle in haystack,
 * ignoring the case of the ASCII letters A-Z in haystack.
 *
 * NOTE: This function assumes that the needle is already lowercase;
 * an upper case letter in needle never matches.
 *
 * Returns a pointer to the start of the occurrence or NULL if there
 * is none.  An empty haystack never matches, not even an empty
 * needle. */
static const char *
istrstr(const char *haystack, const char *needle)
{
	for (const char *start = haystack; *start; start++) {
		const char *h = start;
		const char *n = needle;

		/* advance for as long as needle and haystack agree */
		while (*n && *h) {
			char hc = *h;

			if (hc >= 'A' && hc <= 'Z')
				hc = hc - 'A' + 'a';
			if (*n != hc)
				break;
			n++;
			h++;
		}
		if (*n == 0)
			return start;		/* the whole needle was matched */
		if (*h == 0)
			return NULL;		/* haystack ran out while needle did not:
								 * later start positions are shorter still */
	}
	return NULL;
}
451 :
452 : /* Match regular expression by comparing bytes.
453 : *
454 : * This is faster than re_match_ignore, because it does not
455 : * need to decode characters. This function should be used
456 : * in all cases except when we need to perform UTF-8
457 : * comparisons ignoring case.
458 : *
459 : * TODO: The name of the function is no longer accurate and
460 : * needs to change.
461 : */
462 : static inline bool
463 170781 : re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern)
464 : {
465 170781 : const struct RE *r;
466 170781 : size_t l;
467 :
468 228613 : for (r = pattern; r; r = r->n) {
469 171236 : if (*r->k == 0 && (r->search || *s == 0))
470 : return true;
471 150421 : if (!*s ||
472 : (r->search
473 150348 : ? (r->atend
474 136807 : ? (r->case_ignore
475 6055 : ? (l = strlen(s)) < r->len || istrcmp(s + l - r->len, r->k) != 0
476 5971 : : (l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0)
477 130752 : : (r->case_ignore ? (s = istrstr(s, r->k)) == NULL
478 124504 : : (s = strstr(s, r->k)) == NULL))
479 : : (r->atend
480 13541 : ? (r->case_ignore ? istrcmp(s, r->k) != 0
481 95 : : strcmp(s, r->k) != 0)
482 13446 : : (r->case_ignore ? istrncmp(s, r->k, r->len) != 0
483 13430 : : strncmp(s, r->k, r->len) != 0))))
484 : return false;
485 57832 : s += r->len;
486 : }
487 : return true;
488 : }
489 :
490 : /* Match a regular expression by comparing wide characters.
491 : *
492 : * This needs to be used when we need to perform a
493 : * case-ignoring comparions involving UTF-8 characters.
494 : */
495 : static inline bool
496 44 : re_match_ignore(const char *restrict s, const struct RE *restrict pattern)
497 : {
498 44 : const struct RE *r;
499 :
500 : /* Since the pattern is ascii, do the cheaper comparison */
501 44 : if (pattern->is_ascii) {
502 0 : return re_match_no_ignore(s, pattern);
503 : }
504 :
505 66 : for (r = pattern; r; r = r->n) {
506 47 : if (*r->w == 0 && (r->search || *s == 0))
507 : return true;
508 47 : if (!*s ||
509 : (r->search
510 47 : ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL
511 14 : : !mywstrncaseeq(s, r->w, r->len, r->atend)))
512 : return false;
513 22 : s += r->len;
514 : }
515 : return true;
516 : }
517 :
518 : static void
519 5889 : re_destroy(struct RE *p)
520 : {
521 5889 : if (p) {
522 5889 : GDKfree(p->k);
523 5889 : GDKfree(p->w);
524 5983 : do {
525 5983 : struct RE *n = p->n;
526 :
527 5983 : GDKfree(p);
528 5984 : p = n;
529 5984 : } while (p);
530 : }
531 5890 : }
532 :
533 : /* Create a linked list of RE structures. Depending on the
534 : * caseignore and the ascii_pattern flags, the w
535 : * (if caseignore == true && ascii_pattern == false) or the k
536 : * (in every other case) field is used. These in the first
537 : * structure are allocated, whereas in all subsequent
538 : * structures the fields point into the allocated buffer of
539 : * the first.
540 : */
541 : static struct RE *
542 5890 : re_create(const char *pat, bool caseignore, bool ascii_pattern, uint32_t esc)
543 : {
544 5890 : struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
545 5890 : bool escaped = false;
546 :
547 5890 : if (r == NULL)
548 : return NULL;
549 5890 : *r = (struct RE) {.atend = true };
550 :
551 11271 : while (esc != '%' && *pat == '%') {
552 5381 : pat++; /* skip % */
553 5381 : r->search = true;
554 : }
555 5890 : if (caseignore && !ascii_pattern) {
556 20 : uint32_t *wp;
557 20 : uint32_t *wq;
558 20 : wp = utf8stoucs(pat);
559 20 : if (wp == NULL) {
560 0 : GDKfree(r);
561 0 : return NULL;
562 : }
563 20 : r->w = wp;
564 20 : wq = wp;
565 68 : while (*wp) {
566 48 : if (escaped) {
567 0 : *wq++ = *wp;
568 0 : n->len++;
569 0 : escaped = false;
570 48 : } else if (*wp == esc) {
571 : escaped = true;
572 48 : } else if (*wp == '%') {
573 16 : n->atend = false;
574 16 : while (wp[1] == '%')
575 0 : wp++;
576 16 : if (wp[1]) {
577 4 : n = n->n = GDKmalloc(sizeof(struct RE));
578 4 : if (n == NULL)
579 0 : goto bailout;
580 4 : *n = (struct RE) {
581 : .search = true,
582 : .atend = true,
583 4 : .w = wp + 1,
584 : };
585 : }
586 16 : *wq = 0;
587 16 : wq = wp + 1;
588 : } else {
589 32 : *wq++ = *wp;
590 32 : n->len++;
591 : }
592 48 : wp++;
593 : }
594 20 : *wq = 0;
595 : } else {
596 5870 : char *p, *q;
597 5870 : if ((p = GDKstrdup(pat)) == NULL) {
598 0 : GDKfree(r);
599 0 : return NULL;
600 : }
601 5870 : if (ascii_pattern)
602 5867 : n->is_ascii = true;
603 5870 : if (caseignore)
604 59 : n->case_ignore = true;
605 :
606 59 : if (ascii_pattern && caseignore) {
607 568 : for (q = p; *q != 0; q++) {
608 509 : if ('A' <= *q && *q <= 'Z')
609 18 : *q += 'a' - 'A';
610 : }
611 : }
612 :
613 5870 : r->k = p;
614 5870 : q = p;
615 44731 : while (*p) {
616 38861 : if (escaped) {
617 136 : *q++ = *p;
618 136 : n->len++;
619 136 : escaped = false;
620 38725 : } else if ((unsigned char) *p == esc) {
621 : escaped = true;
622 38589 : } else if (*p == '%') {
623 5638 : n->atend = false;
624 5666 : while (p[1] == '%')
625 28 : p++;
626 5638 : if (p[1]) {
627 90 : n = n->n = GDKmalloc(sizeof(struct RE));
628 90 : if (n == NULL)
629 0 : goto bailout;
630 90 : *n = (struct RE) {
631 : .search = true,
632 : .atend = true,
633 90 : .k = p + 1
634 : };
635 90 : if (ascii_pattern) {
636 87 : n->is_ascii = true;
637 : }
638 90 : if (caseignore) {
639 16 : n->case_ignore = true;
640 : }
641 : }
642 5638 : *q = 0;
643 5638 : q = p + 1;
644 : } else {
645 32951 : char c = *p;
646 32951 : if (ascii_pattern && caseignore && 'A' <= c && c <= 'Z') {
647 0 : c += 'a' - 'A';
648 : }
649 32951 : *q++ = c;
650 32951 : n->len++;
651 : }
652 38861 : p++;
653 : }
654 5870 : *q = 0;
655 : }
656 : return r;
657 0 : bailout:
658 0 : re_destroy(r);
659 0 : return NULL;
660 : }
661 :
662 : #ifdef HAVE_LIBPCRE
663 : static str
664 25 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
665 : {
666 25 : pcre *r;
667 25 : const char *err_p = NULL;
668 25 : int errpos = 0;
669 25 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
670 25 : if (insensitive)
671 0 : options |= PCRE_CASELESS;
672 :
673 25 : if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
674 0 : throw(MAL, "pcre.compile", OPERATION_FAILED
675 : " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
676 : }
677 25 : *res = r;
678 25 : return MAL_SUCCEED;
679 : }
680 : #endif
681 :
682 : /* maximum number of back references and quoted \ or $ in replacement string */
683 : #define MAX_NR_REFS 20
684 :
/* One special sequence found in a replacement string by
 * parse_replacement: either a back reference or a doubled $ or \. */
struct backref {
	int idx;					/* capture group number; INT_MAX for a doubled $ or \ */
	int start;					/* offset in the replacement string of the first
								 * character of the sequence */
	int end;					/* offset in the replacement string at which
								 * literal copying resumes */
};
690 :
691 : #ifdef HAVE_LIBPCRE
692 : /* fill in parameter backrefs (length maxrefs) with information about
693 : * back references in the replacement string; a back reference is a
694 : * dollar or backslash followed by a number */
695 : static int
696 60 : parse_replacement(const char *replacement, int len_replacement,
697 : struct backref *backrefs, int maxrefs)
698 : {
699 60 : int nbackrefs = 0;
700 :
701 108 : for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
702 48 : if (replacement[i] == '$' || replacement[i] == '\\') {
703 6 : char *endptr;
704 6 : backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
705 6 : if (endptr > replacement + i + 1) {
706 6 : int k = (int) (endptr - (replacement + i + 1));
707 6 : backrefs[nbackrefs].start = i;
708 6 : backrefs[nbackrefs].end = i + k + 1;
709 6 : nbackrefs++;
710 0 : } else if (replacement[i] == replacement[i + 1]) {
711 : /* doubled $ or \, we must copy just one to the output */
712 0 : backrefs[nbackrefs].idx = INT_MAX; /* impossible value > 0 */
713 0 : backrefs[nbackrefs].start = i;
714 0 : backrefs[nbackrefs].end = i + 1;
715 0 : i++; /* don't look at second $ or \ again */
716 0 : nbackrefs++;
717 : }
718 : /* else: $ or \ followed by something we don't recognize,
719 : * so just leave it */
720 : }
721 : }
722 60 : return nbackrefs;
723 : }
724 :
725 : static char *
726 28316 : single_replace(pcre *pcre_code, pcre_extra *extra,
727 : const char *origin_str, int len_origin_str,
728 : int exec_options, int *ovector, int ovecsize,
729 : const char *replacement, int len_replacement,
730 : struct backref *backrefs, int nbackrefs,
731 : bool global, char *result, int *max_result)
732 : {
733 28316 : int offset = 0;
734 28316 : int len_result = 0;
735 104799 : int addlen;
736 104799 : char *tmp;
737 :
738 104799 : do {
739 104799 : int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
740 : exec_options, ovector, ovecsize);
741 104908 : if (j <= 0)
742 : break;
743 78653 : addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
744 78653 : if (len_result + addlen >= *max_result) {
745 6840 : tmp = GDKrealloc(result, len_result + addlen + 1);
746 6840 : if (tmp == NULL) {
747 0 : GDKfree(result);
748 0 : return NULL;
749 : }
750 6840 : result = tmp;
751 6840 : *max_result = len_result + addlen + 1;
752 : }
753 78653 : if (ovector[0] > offset) {
754 76482 : strncpy(result + len_result, origin_str + offset,
755 76482 : ovector[0] - offset);
756 76482 : len_result += ovector[0] - offset;
757 : }
758 78653 : if (nbackrefs == 0) {
759 76486 : strncpy(result + len_result, replacement, len_replacement);
760 76486 : len_result += len_replacement;
761 : } else {
762 : int prevend = 0;
763 4334 : for (int i = 0; i < nbackrefs; i++) {
764 2167 : int off, len;
765 2167 : if (backrefs[i].idx >= ovecsize / 3) {
766 : /* out of bounds, replace with empty string */
767 : off = 0;
768 : len = 0;
769 : } else {
770 2167 : off = ovector[backrefs[i].idx * 2];
771 2167 : len = ovector[backrefs[i].idx * 2 + 1] - off;
772 : }
773 2167 : addlen = backrefs[i].start - prevend + len;
774 2167 : if (len_result + addlen >= *max_result) {
775 21 : tmp = GDKrealloc(result, len_result + addlen + 1);
776 21 : if (tmp == NULL) {
777 0 : GDKfree(result);
778 0 : return NULL;
779 : }
780 21 : result = tmp;
781 21 : *max_result = len_result + addlen + 1;
782 : }
783 2167 : if (backrefs[i].start > prevend) {
784 2 : strncpy(result + len_result, replacement + prevend,
785 2 : backrefs[i].start - prevend);
786 2 : len_result += backrefs[i].start - prevend;
787 : }
788 2167 : if (len > 0) {
789 2167 : strncpy(result + len_result, origin_str + off, len);
790 2167 : len_result += len;
791 : }
792 2167 : prevend = backrefs[i].end;
793 : }
794 : /* copy rest of replacement string (after last backref) */
795 2167 : addlen = len_replacement - prevend;
796 2167 : if (addlen > 0) {
797 2 : if (len_result + addlen >= *max_result) {
798 1 : tmp = GDKrealloc(result, len_result + addlen + 1);
799 1 : if (tmp == NULL) {
800 0 : GDKfree(result);
801 0 : return NULL;
802 : }
803 1 : result = tmp;
804 1 : *max_result = len_result + addlen + 1;
805 : }
806 2 : strncpy(result + len_result, replacement + prevend, addlen);
807 2 : len_result += addlen;
808 : }
809 : }
810 78653 : offset = ovector[1];
811 78653 : } while (offset < len_origin_str && global);
812 28425 : if (offset < len_origin_str) {
813 26257 : addlen = len_origin_str - offset;
814 26257 : if (len_result + addlen >= *max_result) {
815 328 : tmp = GDKrealloc(result, len_result + addlen + 1);
816 328 : if (tmp == NULL) {
817 0 : GDKfree(result);
818 0 : return NULL;
819 : }
820 328 : result = tmp;
821 328 : *max_result = len_result + addlen + 1;
822 : }
823 26257 : strncpy(result + len_result, origin_str + offset, addlen);
824 26257 : len_result += addlen;
825 : }
826 : /* null terminate string */
827 28425 : result[len_result] = '\0';
828 28425 : return result;
829 : }
830 : #endif
831 :
832 : static str
833 10 : pcre_replace(str *res, const char *origin_str, const char *pattern,
834 : const char *replacement, const char *flags, bool global)
835 : {
836 : #ifdef HAVE_LIBPCRE
837 10 : const char *err_p = NULL;
838 10 : pcre *pcre_code = NULL;
839 10 : pcre_extra *extra;
840 10 : char *tmpres;
841 10 : int max_result;
842 10 : int i, errpos = 0;
843 10 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
844 10 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
845 10 : int *ovector, ovecsize;
846 10 : int len_origin_str = (int) strlen(origin_str);
847 10 : int len_replacement = (int) strlen(replacement);
848 10 : struct backref backrefs[MAX_NR_REFS];
849 10 : int nbackrefs = 0;
850 :
851 14 : while (*flags) {
852 4 : switch (*flags) {
853 : case 'e':
854 : exec_options &= ~PCRE_NOTEMPTY;
855 : break;
856 1 : case 'i':
857 1 : compile_options |= PCRE_CASELESS;
858 1 : break;
859 1 : case 'm':
860 1 : compile_options |= PCRE_MULTILINE;
861 1 : break;
862 1 : case 's':
863 1 : compile_options |= PCRE_DOTALL;
864 1 : break;
865 1 : case 'x':
866 1 : compile_options |= PCRE_EXTENDED;
867 1 : break;
868 0 : default:
869 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
870 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
871 : *flags);
872 : }
873 4 : flags++;
874 : }
875 :
876 10 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
877 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
878 : OPERATION_FAILED
879 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
880 : pattern, errpos, err_p);
881 : }
882 :
883 : /* Since the compiled pattern is going to be used several times, it is
884 : * worth spending more time analyzing it in order to speed up the time
885 : * taken for matching.
886 : */
887 10 : extra = pcre_study(pcre_code, 0, &err_p);
888 10 : if (err_p != NULL) {
889 0 : pcre_free(pcre_code);
890 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
891 : OPERATION_FAILED
892 : ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
893 : err_p);
894 : }
895 10 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
896 10 : ovecsize = (i + 1) * 3;
897 10 : if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
898 0 : pcre_free_study(extra);
899 0 : pcre_free(pcre_code);
900 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
901 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
902 : }
903 :
904 : /* identify back references in the replacement string */
905 10 : nbackrefs = parse_replacement(replacement, len_replacement,
906 : backrefs, MAX_NR_REFS);
907 :
908 10 : max_result = len_origin_str + 1;
909 10 : tmpres = GDKmalloc(max_result);
910 10 : if (tmpres == NULL) {
911 0 : GDKfree(ovector);
912 0 : pcre_free_study(extra);
913 0 : pcre_free(pcre_code);
914 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
915 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
916 : }
917 :
918 10 : tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
919 : exec_options, ovector, ovecsize, replacement,
920 : len_replacement, backrefs, nbackrefs, global,
921 : tmpres, &max_result);
922 10 : GDKfree(ovector);
923 10 : pcre_free_study(extra);
924 10 : pcre_free(pcre_code);
925 10 : if (tmpres == NULL)
926 0 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
927 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
928 :
929 10 : *res = tmpres;
930 10 : return MAL_SUCCEED;
931 : #else
932 : (void) res;
933 : (void) origin_str;
934 : (void) pattern;
935 : (void) replacement;
936 : (void) flags;
937 : (void) global;
938 : throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
939 : "Database was compiled without PCRE support.");
940 : #endif
941 : }
942 :
943 : static str
944 50 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
945 : const char *replacement, const char *flags, bool global)
946 : {
947 : #ifdef HAVE_LIBPCRE
948 50 : const char *err_p = NULL;
949 50 : char *tmpres;
950 50 : int i, errpos = 0;
951 50 : int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
952 50 : int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
953 50 : pcre *pcre_code = NULL;
954 50 : pcre_extra *extra;
955 50 : BAT *tmpbat;
956 50 : BUN p, q;
957 50 : int *ovector, ovecsize;
958 50 : int len_replacement = (int) strlen(replacement);
959 50 : struct backref backrefs[MAX_NR_REFS];
960 50 : int nbackrefs = 0;
961 50 : const char *origin_str;
962 50 : int max_dest_size = 0;
963 :
964 70 : while (*flags) {
965 20 : switch (*flags) {
966 : case 'e':
967 : exec_options &= ~PCRE_NOTEMPTY;
968 : break;
969 5 : case 'i':
970 5 : compile_options |= PCRE_CASELESS;
971 5 : break;
972 10 : case 'm':
973 10 : compile_options |= PCRE_MULTILINE;
974 10 : break;
975 5 : case 's':
976 5 : compile_options |= PCRE_DOTALL;
977 5 : break;
978 0 : case 'x':
979 0 : compile_options |= PCRE_EXTENDED;
980 0 : break;
981 0 : default:
982 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
983 : ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
984 : *flags);
985 : }
986 20 : flags++;
987 : }
988 :
989 50 : if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
990 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
991 : OPERATION_FAILED
992 : ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
993 : pattern, errpos, err_p);
994 : }
995 :
996 : /* Since the compiled pattern is going to be used several times,
997 : * it is worth spending more time analyzing it in order to speed
998 : * up the time taken for matching.
999 : */
1000 100 : extra = pcre_study(pcre_code,
1001 50 : BATcount(origin_strs) >
1002 : JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
1003 50 : if (err_p != NULL) {
1004 0 : pcre_free(pcre_code);
1005 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1006 : OPERATION_FAILED);
1007 : }
1008 50 : pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
1009 50 : ovecsize = (i + 1) * 3;
1010 50 : if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
1011 0 : pcre_free_study(extra);
1012 0 : pcre_free(pcre_code);
1013 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1014 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1015 : }
1016 :
1017 : /* identify back references in the replacement string */
1018 50 : nbackrefs = parse_replacement(replacement, len_replacement,
1019 : backrefs, MAX_NR_REFS);
1020 :
1021 50 : tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
1022 : TRANSIENT);
1023 :
1024 : /* the buffer for all destination strings is allocated only once,
1025 : * and extended when needed */
1026 50 : max_dest_size = len_replacement + 1;
1027 50 : tmpres = GDKmalloc(max_dest_size);
1028 50 : if (tmpbat == NULL || tmpres == NULL) {
1029 0 : pcre_free_study(extra);
1030 0 : pcre_free(pcre_code);
1031 0 : GDKfree(ovector);
1032 0 : BBPreclaim(tmpbat);
1033 0 : GDKfree(tmpres);
1034 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1035 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1036 : }
1037 50 : BATiter origin_strsi = bat_iterator(origin_strs);
1038 28304 : BATloop(origin_strs, p, q) {
1039 28254 : origin_str = BUNtvar(origin_strsi, p);
1040 56657 : tmpres = single_replace(pcre_code, extra, origin_str,
1041 28254 : (int) strlen(origin_str), exec_options,
1042 : ovector, ovecsize, replacement,
1043 : len_replacement, backrefs, nbackrefs, global,
1044 : tmpres, &max_dest_size);
1045 28403 : if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
1046 0 : bat_iterator_end(&origin_strsi);
1047 0 : pcre_free_study(extra);
1048 0 : pcre_free(pcre_code);
1049 0 : GDKfree(ovector);
1050 0 : GDKfree(tmpres);
1051 0 : BBPreclaim(tmpbat);
1052 0 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1053 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1054 : }
1055 : }
1056 50 : bat_iterator_end(&origin_strsi);
1057 50 : pcre_free_study(extra);
1058 50 : pcre_free(pcre_code);
1059 50 : GDKfree(ovector);
1060 50 : GDKfree(tmpres);
1061 50 : *res = tmpbat;
1062 50 : return MAL_SUCCEED;
1063 : #else
1064 : (void) res;
1065 : (void) origin_strs;
1066 : (void) pattern;
1067 : (void) replacement;
1068 : (void) flags;
1069 : (void) global;
1070 : throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
1071 : "Database was compiled without PCRE support.");
1072 : #endif
1073 : }
1074 :
1075 : static str
1076 74 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
1077 : const char *flags)
1078 : {
1079 74 : int pos;
1080 : #ifdef HAVE_LIBPCRE
1081 74 : const char *err_p = NULL;
1082 74 : int errpos = 0;
1083 74 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
1084 74 : pcre *re;
1085 : #else
1086 : int options = REG_NOSUB;
1087 : regex_t re;
1088 : int errcode;
1089 : int retval;
1090 : #endif
1091 :
1092 148 : while (*flags) {
1093 74 : switch (*flags) {
1094 0 : case 'i':
1095 : #ifdef HAVE_LIBPCRE
1096 0 : options |= PCRE_CASELESS;
1097 : #else
1098 : options |= REG_ICASE;
1099 : #endif
1100 0 : break;
1101 0 : case 'm':
1102 : #ifdef HAVE_LIBPCRE
1103 0 : options |= PCRE_MULTILINE;
1104 : #else
1105 : options |= REG_NEWLINE;
1106 : #endif
1107 0 : break;
1108 : #ifdef HAVE_LIBPCRE
1109 74 : case 's':
1110 74 : options |= PCRE_DOTALL;
1111 74 : break;
1112 : #endif
1113 0 : case 'x':
1114 : #ifdef HAVE_LIBPCRE
1115 0 : options |= PCRE_EXTENDED;
1116 : #else
1117 : options |= REG_EXTENDED;
1118 : #endif
1119 0 : break;
1120 0 : default:
1121 0 : throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
1122 : ": unsupported flag character '%c'\n", *flags);
1123 : }
1124 74 : flags++;
1125 : }
1126 74 : if (strNil(val)) {
1127 0 : *ret = FALSE;
1128 0 : return MAL_SUCCEED;
1129 : }
1130 :
1131 : #ifdef HAVE_LIBPCRE
1132 74 : if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
1133 : #else
1134 : if ((errcode = regcomp(&re, pat, options)) != 0)
1135 : #endif
1136 : {
1137 0 : throw(MAL, "pcre.match", OPERATION_FAILED
1138 : ": compilation of regular expression (%s) failed "
1139 : #ifdef HAVE_LIBPCRE
1140 : "at %d with '%s'", pat, errpos, err_p
1141 : #else
1142 : , pat
1143 : #endif
1144 : );
1145 : }
1146 : #ifdef HAVE_LIBPCRE
1147 74 : pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
1148 : NULL, 0);
1149 74 : pcre_free(re);
1150 : #else
1151 : retval = regexec(&re, val, (size_t) 0, NULL, 0);
1152 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
1153 : regfree(&re);
1154 : #endif
1155 74 : if (pos >= 0)
1156 10 : *ret = TRUE;
1157 64 : else if (pos == -1)
1158 64 : *ret = FALSE;
1159 : else
1160 0 : throw(MAL, "pcre.match", OPERATION_FAILED
1161 : ": matching of regular expression (%s) failed with %d", pat, pos);
1162 : return MAL_SUCCEED;
1163 : }
1164 :
/* Characters that carry a special meaning in the regular expression
 * dialect in use and therefore need to be escaped when they occur
 * literally in a translated pattern. */
static const char *pcre_specials =
#ifdef HAVE_LIBPCRE
	/* special characters in PCRE */
	".+?*()[]{}|^$\\";
#else
	/* special characters in POSIX regular expressions */
	"^.[$()|*+?{\\";
#endif
1173 :
1174 : /* change SQL LIKE pattern into PCRE pattern */
1175 : static str
1176 587 : sql2pcre(str *r, const char *pat, const char *esc_str)
1177 : {
1178 587 : int escaped = 0;
1179 587 : int hasWildcard = 0;
1180 587 : char *ppat;
1181 1174 : int esc = strNil(esc_str) ? 0 : esc_str[0]; /* should change to utf8_convert() */
1182 587 : int specials;
1183 587 : int c;
1184 :
1185 587 : if (strlen(esc_str) > 1)
1186 0 : throw(MAL, "pcre.sql2pcre",
1187 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1188 : ": ESCAPE string must have length 1");
1189 587 : if (pat == NULL)
1190 0 : throw(MAL, "pcre.sql2pcre",
1191 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1192 : ": (I)LIKE pattern must not be NULL");
1193 587 : ppat = GDKmalloc(strlen(pat) * 3 +
1194 : 3 /* 3 = "^'the translated regexp'$0" */ );
1195 587 : if (ppat == NULL)
1196 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1197 :
1198 587 : *r = ppat;
1199 : /* The escape character can be a char which is special in a PCRE
1200 : * expression. If the user used the "+" char as escape and has "++"
1201 : * in their pattern, then replacing this with "+" is not correct and
1202 : * should be "\+" instead. */
1203 587 : specials = (esc && strchr(pcre_specials, esc) != NULL);
1204 :
1205 587 : *ppat++ = '^';
1206 5950 : while ((c = *pat++) != 0) {
1207 5363 : if (c == esc) {
1208 15 : if (escaped) {
1209 1 : if (specials) { /* change ++ into \+ */
1210 1 : *ppat++ = esc;
1211 : } else { /* do not escape simple escape symbols */
1212 0 : ppat[-1] = esc; /* overwrite backslash */
1213 : }
1214 : escaped = 0;
1215 : } else {
1216 14 : *ppat++ = '\\';
1217 14 : escaped = 1;
1218 : }
1219 : hasWildcard = 1;
1220 5348 : } else if (strchr(pcre_specials, c) != NULL) {
1221 : /* escape PCRE special chars, avoid double backslash if the
1222 : * user uses an invalid escape sequence */
1223 28 : if (!escaped)
1224 28 : *ppat++ = '\\';
1225 28 : *ppat++ = c;
1226 28 : hasWildcard = 1;
1227 28 : escaped = 0;
1228 5320 : } else if (c == '%' && !escaped) {
1229 721 : *ppat++ = '.';
1230 721 : *ppat++ = '*';
1231 721 : *ppat++ = '?';
1232 721 : hasWildcard = 1;
1233 : /* collapse multiple %, but only if it isn't the escape */
1234 721 : if (esc != '%')
1235 721 : while (*pat == '%')
1236 0 : pat++;
1237 4599 : } else if (c == '_' && !escaped) {
1238 694 : *ppat++ = '.';
1239 694 : hasWildcard = 1;
1240 : } else {
1241 3905 : if (escaped) {
1242 13 : ppat[-1] = c; /* overwrite backslash of invalid escape */
1243 : } else {
1244 3892 : *ppat++ = c;
1245 : }
1246 : escaped = 0;
1247 : }
1248 : }
1249 : /* no wildcard or escape character at end of string */
1250 587 : if (!hasWildcard || escaped) {
1251 1 : GDKfree(*r);
1252 1 : *r = NULL;
1253 1 : if (escaped)
1254 0 : throw(MAL, "pcre.sql2pcre",
1255 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1256 : ": (I)LIKE pattern must not end with escape character");
1257 1 : *r = GDKstrdup(str_nil);
1258 1 : if (*r == NULL)
1259 0 : throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1260 : } else {
1261 586 : *ppat++ = '$';
1262 586 : *ppat = 0;
1263 : }
1264 : return MAL_SUCCEED;
1265 : }
1266 :
#ifdef HAVE_LIBPCRE
/* change SQL PATINDEX pattern into PCRE pattern
 *
 * r receives a newly allocated, NUL-terminated regular expression that
 * the caller must release with GDKfree.  Regex-special characters are
 * backslash-escaped, '_' becomes '.', and '%' becomes ".*" except for
 * the first '%' encountered and for a '%' that is the last character
 * of the pattern, which are both dropped.  No anchors are added.
 *
 * NOTE(review): the first '%' is dropped wherever it occurs, not only
 * when it is the first character ("a%b" is translated to "ab") --
 * confirm whether that is intended.
 *
 * Returns an exception only when the allocation fails.
 */
static str
pat2pcre(str *r, const char *pat)
{
	size_t len = strlen(pat);
	/* each input byte expands to at most two output bytes */
	char *ppat = GDKmalloc(len * 2 + 3 /* 3 = "^'the translated regexp'$0" */ );
	int start = 0;		/* number of '%' characters seen so far */

	if (ppat == NULL)
		throw(MAL, "pcre.pat2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
	*r = ppat;
	while (*pat) {
		int c = *pat++;

		if (strchr(pcre_specials, c) != NULL) {
			*ppat++ = '\\';
			*ppat++ = c;
		} else if (c == '%') {
			if (start && *pat) {
				*ppat++ = '.';
				*ppat++ = '*';
			}
			start++;
		} else if (c == '_') {
			*ppat++ = '.';
		} else {
			*ppat++ = c;
		}
	}
	*ppat = 0;
	return MAL_SUCCEED;
}
#endif
1301 :
1302 : /*
1303 : * @+ Wrapping
1304 : */
1305 :
1306 : static str
1307 10 : PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl,
1308 : const str *flags)
1309 : {
1310 10 : return pcre_replace(res, *or, *pat, *repl, *flags, true);
1311 : }
1312 :
1313 : static str
1314 0 : PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl,
1315 : const str *flags)
1316 : {
1317 0 : return pcre_replace(res, *or, *pat, *repl, *flags, false);
1318 : }
1319 :
1320 : static str
1321 50 : PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl,
1322 : const str *flags)
1323 : {
1324 50 : BAT *b, *bn = NULL;
1325 50 : str msg;
1326 50 : if ((b = BATdescriptor(*bid)) == NULL)
1327 0 : throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1328 :
1329 50 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
1330 50 : if (msg == MAL_SUCCEED) {
1331 50 : *res = bn->batCacheid;
1332 50 : BBPkeepref(bn);
1333 : }
1334 50 : BBPunfix(b->batCacheid);
1335 50 : return msg;
1336 : }
1337 :
1338 : static str
1339 0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat,
1340 : const str *repl, const str *flags)
1341 : {
1342 0 : BAT *b, *bn = NULL;
1343 0 : str msg;
1344 0 : if ((b = BATdescriptor(*bid)) == NULL)
1345 0 : throw(MAL, "batpcre.replace_first", RUNTIME_OBJECT_MISSING);
1346 :
1347 0 : msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
1348 0 : if (msg == MAL_SUCCEED) {
1349 0 : *res = bn->batCacheid;
1350 0 : BBPkeepref(bn);
1351 : }
1352 0 : BBPunfix(b->batCacheid);
1353 0 : return msg;
1354 : }
1355 :
1356 : static str
1357 74 : PCREmatch(bit *ret, const str *val, const str *pat)
1358 : {
1359 4 : return pcre_match_with_flags(ret, *val, *pat,
1360 : #ifdef HAVE_LIBPCRE
1361 : "s"
1362 : #else
1363 : "x"
1364 : #endif
1365 : );
1366 : }
1367 :
1368 : static str
1369 0 : PCREimatch(bit *ret, const str *val, const str *pat)
1370 : {
1371 0 : return pcre_match_with_flags(ret, *val, *pat, "i"
1372 : #ifndef HAVE_LIBPCRE
1373 : "x"
1374 : #endif
1375 : );
1376 : }
1377 :
1378 : static str
1379 25 : PCREindex(int *res, const pcre *pattern, const str *s)
1380 : {
1381 : #ifdef HAVE_LIBPCRE
1382 25 : int v[3];
1383 :
1384 25 : v[0] = v[1] = *res = 0;
1385 25 : if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
1386 : PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
1387 23 : *res = v[1];
1388 : }
1389 25 : return MAL_SUCCEED;
1390 : #else
1391 : (void) res;
1392 : (void) pattern;
1393 : (void) s;
1394 : throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
1395 : #endif
1396 : }
1397 :
1398 : static str
1399 27 : PCREpatindex(int *ret, const str *pat, const str *val)
1400 : {
1401 : #ifdef HAVE_LIBPCRE
1402 27 : pcre *re = NULL;
1403 27 : char *ppat = NULL, *msg;
1404 :
1405 53 : if (strNil(*pat) || strNil(*val)) {
1406 2 : *ret = int_nil;
1407 2 : return MAL_SUCCEED;
1408 : }
1409 :
1410 25 : if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
1411 : return msg;
1412 25 : if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
1413 0 : GDKfree(ppat);
1414 0 : return msg;
1415 : }
1416 25 : GDKfree(ppat);
1417 25 : msg = PCREindex(ret, re, val);
1418 25 : pcre_free(re);
1419 25 : return msg;
1420 : #else
1421 : (void) ret;
1422 : (void) pat;
1423 : (void) val;
1424 : throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
1425 : #endif
1426 : }
1427 :
1428 : static str
1429 0 : PCREquote(str *ret, const str *val)
1430 : {
1431 0 : char *p;
1432 0 : const char *s = *val;
1433 :
1434 0 : *ret = p = GDKmalloc(strlen(s) * 2 + 1); /* certainly long enough */
1435 0 : if (p == NULL)
1436 0 : throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
1437 : /* quote all non-alphanumeric ASCII characters (i.e. leave
1438 : non-ASCII and alphanumeric alone) */
1439 0 : while (*s) {
1440 0 : if (!((*s & 0x80) != 0 ||
1441 0 : ('a' <= *s && *s <= 'z') ||
1442 0 : ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
1443 0 : *p++ = '\\';
1444 0 : *p++ = *s++;
1445 : }
1446 0 : *p = 0;
1447 0 : return MAL_SUCCEED;
1448 : }
1449 :
1450 : static str
1451 6 : PCREsql2pcre(str *ret, const str *pat, const str *esc)
1452 : {
1453 6 : return sql2pcre(ret, *pat, *esc);
1454 : }
1455 :
/* Return true when no byte of the NUL-terminated string pat has its
 * high bit set, i.e. when the string is pure 7-bit ASCII. */
static bool
is_ascii_str(const char *pat)
{
	for (const char *cp = pat; *cp != '\0'; cp++) {
		if ((*cp & 0x80) != 0)
			return false;
	}
	return true;
}
1467 :
1468 : static inline str
1469 7840 : choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty,
1470 : bool *ascii_pattern, const char *pat, const char *esc)
1471 : {
1472 7840 : str res = MAL_SUCCEED;
1473 7840 : *use_re = false;
1474 7840 : *use_strcmp = false;
1475 7840 : *empty = false;
1476 :
1477 :
1478 7840 : *ascii_pattern = is_ascii_str(pat);
1479 :
1480 15192 : if (strNil(pat) || strNil(esc)) {
1481 488 : *empty = true;
1482 : } else {
1483 7352 : if (!re_is_pattern_properly_escaped(pat, (unsigned char) *esc))
1484 5 : throw(MAL, "pcre.sql2pcre",
1485 : SQLSTATE(22019) ILLEGAL_ARGUMENT
1486 : ": (I)LIKE pattern must not end with escape character");
1487 7347 : if (is_strcmpable(pat, esc)) {
1488 876 : *use_re = true;
1489 876 : *use_strcmp = true;
1490 6471 : } else if (re_simple(pat, (unsigned char) *esc)) {
1491 5890 : *use_re = true;
1492 : } else {
1493 581 : if ((res = sql2pcre(ppat, pat, esc)) != MAL_SUCCEED)
1494 : return res;
1495 581 : if (strNil(*ppat)) {
1496 0 : GDKfree(*ppat);
1497 0 : *ppat = NULL;
1498 0 : *use_re = true;
1499 0 : *use_strcmp = true;
1500 : }
1501 : }
1502 : }
1503 : return res;
1504 : }
1505 :
1506 : static str
1507 234 : PCRElike_imp(bit *ret, const str *s, const str *pat, const str *esc,
1508 : const bit *isens)
1509 : {
1510 234 : str res = MAL_SUCCEED;
1511 234 : char *ppat = NULL;
1512 234 : bool use_re = false, use_strcmp = false, empty = false, ascii_pattern = false;
1513 234 : struct RE *re = NULL;
1514 :
1515 234 : if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
1516 : *pat, *esc)) != MAL_SUCCEED)
1517 : return res;
1518 :
1519 459 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
1520 225 : "pcrelike: pattern matching using strcmp" : use_re ?
1521 : "pcrelike: pattern matching using RE" :
1522 : "pcrelike: pattern matching using pcre");
1523 :
1524 468 : if (strNil(*s) || empty) {
1525 0 : *ret = bit_nil;
1526 234 : } else if (use_re) {
1527 164 : if (use_strcmp) {
1528 9 : *ret = *isens ? (ascii_pattern
1529 2 : ? istrcmp(*s, *pat) == 0
1530 0 : : mystrcasecmp(*s, *pat) == 0)
1531 7 : : strcmp(*s, *pat) == 0;
1532 : } else {
1533 155 : if (!(re = re_create(*pat, *isens, ascii_pattern, (unsigned char) **esc)))
1534 0 : res = createException(MAL, "pcre.like4",
1535 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1536 : else
1537 310 : *ret = (*isens && !re->is_ascii)
1538 0 : ? re_match_ignore(*s, re)
1539 155 : : re_match_no_ignore(*s, re);
1540 : }
1541 : } else {
1542 70 : res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat);
1543 : }
1544 :
1545 164 : if (re)
1546 155 : re_destroy(re);
1547 234 : GDKfree(ppat);
1548 234 : return res;
1549 : }
1550 :
1551 : static str
1552 234 : PCRElike(bit *ret, const str *s, const str *pat, const str *esc,
1553 : const bit *isens)
1554 : {
1555 229 : return PCRElike_imp(ret, s, pat, esc, isens);
1556 : }
1557 :
1558 : static str
1559 5 : PCREnotlike(bit *ret, const str *s, const str *pat, const str *esc,
1560 : const bit *isens)
1561 : {
1562 5 : str tmp;
1563 5 : bit r;
1564 :
1565 5 : rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
1566 5 : *ret = r == bit_nil ? bit_nil : !r;
1567 5 : return MAL_SUCCEED;
1568 : }
1569 :
1570 : static inline str
1571 6603 : re_like_build(struct RE **re, uint32_t **wpat, const char *pat, bool caseignore,
1572 : bool use_strcmp, bool ascii_pattern, uint32_t esc)
1573 : {
1574 6603 : if (!use_strcmp) {
1575 5735 : if (!(*re = re_create(pat, caseignore, ascii_pattern, esc)))
1576 0 : return createException(MAL, "pcre.re_like_build",
1577 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1578 868 : } else if (caseignore && !ascii_pattern) {
1579 30 : if (!(*wpat = utf8stoucs(pat)))
1580 0 : return createException(MAL, "pcre.re_like_build",
1581 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1582 : }
1583 : return MAL_SUCCEED;
1584 : }
1585 :
/* Return from the enclosing function: bit_nil when the local string s
 * is nil, otherwise the value of TEST. */
#define proj_scanloop(TEST)		\
	do {						\
		if (!strNil(s))			\
			return TEST;		\
		return bit_nil;			\
	} while (0)
1593 :
1594 : static inline bit
1595 5084 : re_like_proj_apply(const char *s, const struct RE *restrict re,
1596 : const uint32_t *restrict wpat, const char *pat,
1597 : bool caseignore, bool anti, bool use_strcmp, bool is_ascii)
1598 : {
1599 5084 : if (use_strcmp) {
1600 1163 : if (caseignore) {
1601 537 : if (is_ascii) {
1602 518 : if (anti)
1603 950 : proj_scanloop(istrcmp(s, pat) != 0);
1604 : else
1605 86 : proj_scanloop(istrcmp(s, pat) == 0);
1606 : } else {
1607 19 : if (anti)
1608 28 : proj_scanloop(mywstrcasecmp(s, wpat) != 0);
1609 : else
1610 10 : proj_scanloop(mywstrcasecmp(s, wpat) == 0);
1611 : }
1612 : } else {
1613 626 : if (anti)
1614 608 : proj_scanloop(strcmp(s, pat) != 0);
1615 : else
1616 644 : proj_scanloop(strcmp(s, pat) == 0);
1617 : }
1618 : } else {
1619 : /* Use re_match_ignore only if the pattern is UTF-8
1620 : * and we need to ignore case
1621 : */
1622 3921 : if (caseignore && !is_ascii) {
1623 3 : if (anti)
1624 6 : proj_scanloop(!re_match_ignore(s, re));
1625 : else
1626 0 : proj_scanloop(re_match_ignore(s, re));
1627 : } else {
1628 3918 : if (anti)
1629 180 : proj_scanloop(!re_match_no_ignore(s, re));
1630 : else
1631 7656 : proj_scanloop(re_match_no_ignore(s, re));
1632 : }
1633 : }
1634 : }
1635 :
1636 : static inline void
1637 6847 : re_like_clean(struct RE **re, uint32_t **wpat)
1638 : {
1639 6847 : if (*re) {
1640 5734 : re_destroy(*re);
1641 5735 : *re = NULL;
1642 : }
1643 6848 : if (*wpat) {
1644 30 : GDKfree(*wpat);
1645 30 : *wpat = NULL;
1646 : }
1647 6848 : }
1648 :
1649 : #ifdef HAVE_LIBPCRE
1650 : static inline str
1651 511 : pcre_like_build(pcre **res, pcre_extra **ex, const char *ppat, bool caseignore,
1652 : BUN count)
1653 : {
1654 511 : const char *err_p = NULL;
1655 511 : int errpos = 0;
1656 511 : int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL;
1657 511 : int pcrestopt = count > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
1658 :
1659 511 : *res = NULL;
1660 511 : *ex = NULL;
1661 :
1662 511 : if (caseignore) {
1663 18 : options |= PCRE_CASELESS;
1664 : }
1665 511 : if ((*res = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL)
1666 0 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1667 : ": compilation of regular expression (%s) failed"
1668 : " at %d with '%s'", ppat, errpos, err_p);
1669 509 : *ex = pcre_study(*res, pcrestopt, &err_p);
1670 511 : if (err_p != NULL)
1671 0 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1672 : ": pcre study of pattern (%s) "
1673 : "failed with '%s'", ppat, err_p);
1674 : return MAL_SUCCEED;
1675 : }
1676 : #else
1677 : static inline str
1678 : pcre_like_build(regex_t *res, void *ex, const char *ppat, bool caseignore,
1679 : BUN count)
1680 : {
1681 : int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
1682 : int errcode;
1683 :
1684 : *res = (regex_t) {
1685 : 0};
1686 : (void) count;
1687 :
1688 : if (caseignore) {
1689 : options |= REG_ICASE;
1690 : }
1691 : if ((errcode = regcomp(res, ppat, options)) != 0)
1692 : return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
1693 : ": compilation of regular expression (%s) failed",
1694 : ppat);
1695 : (void) ex;
1696 : return MAL_SUCCEED;
1697 : }
1698 : #endif
1699 :
/* Run LOOP_BODY (which must set the local pos) and store the outcome in
 * *ret: bit_nil for a nil s, RES1 when pos >= 0, RES2 when pos == -1.
 * Any other pos on a non-nil s makes the enclosing function return an
 * exception. */
#define PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2)							\
	do {																\
		LOOP_BODY														\
		if (!strNil(s) && pos < -1)										\
			return createException(MAL, "pcre.match", OPERATION_FAILED ": matching of regular expression (%s) failed with %d", ppat, pos); \
		if (strNil(s))													\
			*ret = bit_nil;												\
		else															\
			*ret = (pos >= 0) ? (RES1) : (RES2);						\
	} while(0)
1712 :
1713 : static inline str
1714 1153 : pcre_like_apply(bit *ret, const char *s,
1715 : #ifdef HAVE_LIBPCRE
1716 : const pcre *re, const pcre_extra *ex
1717 : #else
1718 : regex_t re, void *ex
1719 : #endif
1720 : , const char *ppat, bool anti)
1721 : {
1722 1153 : int pos;
1723 :
1724 : #ifdef HAVE_LIBPCRE
1725 : #define LOOP_BODY \
1726 : pos = pcre_exec(re, ex, s, (int) strlen(s), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
1727 : #else
1728 : #define LOOP_BODY \
1729 : int retval = regexec(&re, s, (size_t) 0, NULL, 0); \
1730 : (void) ex; \
1731 : pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
1732 : #endif
1733 :
1734 1153 : if (anti)
1735 43 : PCRE_LIKE_BODY(LOOP_BODY, FALSE, TRUE);
1736 : else
1737 1110 : PCRE_LIKE_BODY(LOOP_BODY, TRUE, FALSE);
1738 :
1739 : return MAL_SUCCEED;
1740 : }
1741 :
#ifdef HAVE_LIBPCRE
/* Free the compiled pattern and its study data, if any, and reset both
 * pointers to NULL. */
static inline void
pcre_clean(pcre **re, pcre_extra **ex)
{
	if (*re != NULL)
		pcre_free(*re);
	if (*ex != NULL)
		pcre_free_study(*ex);
	*re = NULL;
	*ex = NULL;
}
#else
/* Release the POSIX regex and reset it to all zeroes; ex is unused. */
static inline void
pcre_clean(regex_t *re, void *ex)
{
	(void) ex;
	regfree(re);
	*re = (regex_t) {
		0};
}
#endif
1762 :
1763 : static str
1764 667 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
1765 : const str *esc, const bit *isens, const bit *not)
1766 : {
1767 667 : str msg = MAL_SUCCEED;
1768 667 : BAT *b = NULL, *pbn = NULL, *bn = NULL;
1769 667 : char *ppat = NULL;
1770 667 : const char *input = NULL;
1771 667 : bool use_re = false,
1772 667 : use_strcmp = false,
1773 667 : empty = false,
1774 667 : isensitive = (bool) *isens,
1775 667 : anti = (bool) *not,
1776 667 : has_nil = false,
1777 667 : ascii_pattern = false,
1778 667 : input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
1779 667 : pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
1780 667 : bat *r = getArgReference_bat(stk, pci, 0);
1781 667 : BUN q = 0;
1782 667 : bit *restrict ret = NULL;
1783 : #ifdef HAVE_LIBPCRE
1784 667 : pcre *re = NULL;
1785 667 : pcre_extra *ex = NULL;
1786 : #else
1787 : regex_t re = (regex_t) { 0 };
1788 : void *ex = NULL;
1789 : #endif
1790 667 : struct RE *re_simple = NULL;
1791 667 : uint32_t *wpat = NULL;
1792 667 : BATiter bi = (BATiter) { 0 }, pi;
1793 :
1794 667 : (void) cntxt;
1795 667 : if (input_is_a_bat) {
1796 667 : bat *bid = getArgReference_bat(stk, pci, 1);
1797 667 : if (!(b = BATdescriptor(*bid))) {
1798 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1799 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1800 0 : goto bailout;
1801 : }
1802 : }
1803 667 : if (pattern_is_a_bat) {
1804 84 : bat *pb = getArgReference_bat(stk, pci, 2);
1805 84 : if (!(pbn = BATdescriptor(*pb))) {
1806 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1807 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
1808 0 : goto bailout;
1809 : }
1810 : }
1811 667 : assert((!b || ATOMstorage(b->ttype) == TYPE_str)
1812 : && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
1813 :
1814 667 : q = BATcount(b ? b : pbn);
1815 667 : if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
1816 0 : msg = createException(MAL, "batalgebra.batpcrelike3",
1817 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
1818 0 : goto bailout;
1819 : }
1820 667 : ret = (bit *) Tloc(bn, 0);
1821 :
1822 667 : if (pattern_is_a_bat) {
1823 84 : pi = bat_iterator(pbn);
1824 84 : if (b)
1825 84 : bi = bat_iterator(b);
1826 : else
1827 0 : input = *getArgReference_str(stk, pci, 1);
1828 :
1829 1173 : for (BUN p = 0; p < q; p++) {
1830 1090 : const char *next_input = b ? BUNtvar(bi, p) : input,
1831 1090 : *np = BUNtvar(pi, p);
1832 :
1833 1090 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
1834 : &ascii_pattern, np, *esc)) != MAL_SUCCEED) {
1835 0 : bat_iterator_end(&pi);
1836 0 : if (b)
1837 0 : bat_iterator_end(&bi);
1838 0 : goto bailout;
1839 : }
1840 :
1841 1091 : if (use_re) {
1842 627 : if ((msg = re_like_build(&re_simple, &wpat, np, isensitive,
1843 : use_strcmp, ascii_pattern,
1844 626 : (unsigned char) **esc)) != MAL_SUCCEED) {
1845 0 : bat_iterator_end(&pi);
1846 0 : if (b)
1847 0 : bat_iterator_end(&bi);
1848 0 : goto bailout;
1849 : }
1850 627 : ret[p] = re_like_proj_apply(next_input, re_simple, wpat, np,
1851 : isensitive, anti, use_strcmp,
1852 : ascii_pattern);
1853 625 : re_like_clean(&re_simple, &wpat);
1854 465 : } else if (empty) {
1855 459 : ret[p] = bit_nil;
1856 : } else {
1857 6 : if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, 1)) != MAL_SUCCEED) {
1858 0 : bat_iterator_end(&pi);
1859 0 : if (b)
1860 0 : bat_iterator_end(&bi);
1861 0 : goto bailout;
1862 : }
1863 6 : if ((msg = pcre_like_apply(&(ret[p]), next_input, re, ex, ppat, anti)) != MAL_SUCCEED) {
1864 0 : bat_iterator_end(&pi);
1865 0 : if (b)
1866 0 : bat_iterator_end(&bi);
1867 0 : goto bailout;
1868 : }
1869 6 : pcre_clean(&re, &ex);
1870 : }
1871 1089 : has_nil |= is_bit_nil(ret[p]);
1872 1089 : GDKfree(ppat);
1873 1089 : ppat = NULL;
1874 : }
1875 83 : bat_iterator_end(&pi);
1876 84 : if (b)
1877 84 : bat_iterator_end(&bi);
1878 : } else {
1879 583 : const char *pat = *getArgReference_str(stk, pci, 2);
1880 583 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
1881 : &ascii_pattern, pat, *esc)) != MAL_SUCCEED)
1882 5 : goto bailout;
1883 :
1884 578 : bi = bat_iterator(b);
1885 1095 : MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
1886 : ? "pcrelike: pattern matching using strcmp" :
1887 517 : use_re ? "pcrelike: pattern matching using RE" :
1888 : "pcrelike: pattern matching using pcre");
1889 :
1890 578 : if (use_re) {
1891 424 : if ((msg = re_like_build(&re_simple, &wpat, pat, isensitive, use_strcmp,
1892 424 : ascii_pattern, (unsigned char) **esc)) != MAL_SUCCEED) {
1893 0 : bat_iterator_end(&bi);
1894 0 : goto bailout;
1895 : }
1896 4881 : for (BUN p = 0; p < q; p++) {
1897 4457 : const char *s = BUNtvar(bi, p);
1898 4458 : ret[p] = re_like_proj_apply(s, re_simple, wpat, pat, isensitive,
1899 : anti, use_strcmp, ascii_pattern);
1900 4457 : has_nil |= is_bit_nil(ret[p]);
1901 : }
1902 154 : } else if (empty) {
1903 43 : for (BUN p = 0; p < q; p++)
1904 26 : ret[p] = bit_nil;
1905 : has_nil = true;
1906 : } else {
1907 137 : if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, q)) != MAL_SUCCEED) {
1908 0 : bat_iterator_end(&bi);
1909 0 : goto bailout;
1910 : }
1911 1284 : for (BUN p = 0; p < q; p++) {
1912 1147 : const char *s = BUNtvar(bi, p);
1913 1147 : if ((msg = pcre_like_apply(&(ret[p]), s, re, ex, ppat, anti)) != MAL_SUCCEED) {
1914 0 : bat_iterator_end(&bi);
1915 0 : goto bailout;
1916 : }
1917 1147 : has_nil |= is_bit_nil(ret[p]);
1918 : }
1919 : }
1920 578 : bat_iterator_end(&bi);
1921 : }
1922 :
1923 667 : bailout:
1924 667 : GDKfree(ppat);
1925 667 : re_like_clean(&re_simple, &wpat);
1926 667 : pcre_clean(&re, &ex);
1927 666 : if (bn && !msg) {
1928 661 : BATsetcount(bn, q);
1929 662 : bn->tnil = has_nil;
1930 662 : bn->tnonil = !has_nil;
1931 662 : bn->tkey = BATcount(bn) <= 1;
1932 662 : bn->tsorted = BATcount(bn) <= 1;
1933 662 : bn->trevsorted = BATcount(bn) <= 1;
1934 662 : *r = bn->batCacheid;
1935 662 : BBPkeepref(bn);
1936 5 : } else if (bn)
1937 5 : BBPreclaim(bn);
1938 666 : BBPreclaim(b);
1939 667 : BBPreclaim(pbn);
1940 666 : return msg;
1941 : }
1942 :
1943 : static str
1944 528 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1945 : {
1946 528 : const str *esc = getArgReference_str(stk, pci, 3);
1947 528 : const bit *ci = getArgReference_bit(stk, pci, 4);
1948 528 : bit no = FALSE;
1949 :
1950 528 : return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &no);
1951 : }
1952 :
1953 : static str
1954 139 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
1955 : {
1956 139 : const str *esc = getArgReference_str(stk, pci, 3);
1957 139 : const bit *ci = getArgReference_bit(stk, pci, 4);
1958 139 : bit yes = TRUE;
1959 :
1960 139 : return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &yes);
1961 : }
1962 :
/* scan select loop with or without candidates
 *
 * Relies on these names in the expanding function: b, s, ci, bi, p, q,
 * ncands, off, vals, cnt, anti, counter, qry_ctx and a label bailout.
 * TEST is evaluated with the current string bound to v; a row is kept
 * when TEST holds, or when KEEP_NULLS is set and v is nil.  Without a
 * candidate list, or with a dense one, positions p..q-1 are scanned;
 * otherwise ncands candidates are taken from ci. */
#define pcrescanloop(TEST, KEEP_NULLS)									\
	do {																\
		TRC_DEBUG(ALGO,													\
				  "PCREselect(b=%s#"BUNFMT",anti=%d): "					\
				  "scanselect %s\n", BATgetId(b), BATcount(b),			\
				  anti, #TEST);											\
		if (s && !BATtdense(s)) {										\
			for (; p < ncands; p++) {									\
				GDK_CHECK_TIMEOUT(qry_ctx, counter,						\
								  GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
				oid o = canditer_next(ci);								\
				const char *restrict v = BUNtvar(bi, o - off);			\
				if ((TEST) || ((KEEP_NULLS) && strNil(v)))				\
					vals[cnt++] = o;									\
			}															\
		} else {														\
			for (; p < q; p++) {										\
				GDK_CHECK_TIMEOUT(qry_ctx, counter,						\
								  GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
				const char *restrict v = BUNtvar(bi, p - off);			\
				if ((TEST) || ((KEEP_NULLS) && strNil(v)))				\
					vals[cnt++] = p;									\
			}															\
		}																\
	} while (0)
1989 :
1990 : #ifdef HAVE_LIBPCRE
1991 : #define PCRE_LIKESELECT_BODY (pcre_exec(re, ex, v, (int) strlen(v), 0, PCRE_NO_UTF8_CHECK, NULL, 0) >= 0)
1992 : #else
1993 : #define PCRE_LIKESELECT_BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
1994 : #endif
1995 :
1996 : static str
1997 362 : pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
1998 : BUN *rcnt, const char *pat, bool caseignore, bool anti,
1999 : bool keep_nulls)
2000 : {
2001 : #ifdef HAVE_LIBPCRE
2002 362 : pcre *re = NULL;
2003 362 : pcre_extra *ex = NULL;
2004 : #else
2005 : regex_t re = (regex_t) { 0 };
2006 : void *ex = NULL;
2007 : #endif
2008 362 : BATiter bi = bat_iterator(b);
2009 362 : BUN cnt = 0, ncands = ci->ncand;
2010 362 : oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
2011 362 : str msg = MAL_SUCCEED;
2012 :
2013 362 : size_t counter = 0;
2014 362 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2015 :
2016 362 : if ((msg = pcre_like_build(&re, &ex, pat, caseignore, ci->ncand)) != MAL_SUCCEED)
2017 0 : goto bailout;
2018 :
2019 362 : if (anti)
2020 0 : pcrescanloop(!strNil(v) && !PCRE_LIKESELECT_BODY, keep_nulls);
2021 : else
2022 37760 : pcrescanloop(!strNil(v) && PCRE_LIKESELECT_BODY, keep_nulls);
2023 :
2024 4 : bailout:
2025 361 : bat_iterator_end(&bi);
2026 362 : pcre_clean(&re, &ex);
2027 362 : *rcnt = cnt;
2028 362 : return msg;
2029 : }
2030 :
2031 : static str
2032 5433 : re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
2033 : BUN *rcnt, const char *pat, bool caseignore, bool anti,
2034 : bool use_strcmp, uint32_t esc, bool keep_nulls,
2035 : bool ascii_pattern)
2036 : {
2037 5433 : BATiter bi = bat_iterator(b);
2038 5433 : BUN cnt = 0, ncands = ci->ncand;
2039 5433 : oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
2040 5433 : struct RE *re = NULL;
2041 5433 : uint32_t *wpat = NULL;
2042 5433 : str msg = MAL_SUCCEED;
2043 :
2044 5433 : size_t counter = 0;
2045 5433 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2046 :
2047 5433 : if ((msg = re_like_build(&re, &wpat, pat, caseignore, use_strcmp, ascii_pattern,
2048 : esc)) != MAL_SUCCEED)
2049 0 : goto bailout;
2050 :
2051 5433 : if (use_strcmp) {
2052 90 : if (caseignore) {
2053 30 : if (ascii_pattern) {
2054 22 : if (anti)
2055 64 : pcrescanloop(!strNil(v)
2056 : && istrcmp(v, pat) != 0, keep_nulls);
2057 : else
2058 597 : pcrescanloop(!strNil(v)
2059 : && istrcmp(v, pat) == 0, keep_nulls);
2060 : } else {
2061 8 : if (anti)
2062 0 : pcrescanloop(!strNil(v)
2063 : && mywstrcasecmp(v, wpat) != 0, keep_nulls);
2064 : else
2065 36 : pcrescanloop(!strNil(v)
2066 : && mywstrcasecmp(v, wpat) == 0, keep_nulls);
2067 : }
2068 : } else {
2069 60 : if (anti)
2070 54 : pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
2071 : else
2072 9115 : pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
2073 : }
2074 : } else {
2075 5343 : if (caseignore) {
2076 : /* ascii_pattern == true is encoded in re */
2077 53 : if (anti) {
2078 1 : if (ascii_pattern)
2079 42 : pcrescanloop(!strNil(v)
2080 : && !re_match_no_ignore(v, re), keep_nulls);
2081 : else
2082 0 : pcrescanloop(!strNil(v)
2083 : && !re_match_ignore(v, re), keep_nulls);
2084 : } else {
2085 52 : if (ascii_pattern)
2086 6311 : pcrescanloop(!strNil(v)
2087 : && re_match_no_ignore(v, re), keep_nulls);
2088 : else
2089 72 : pcrescanloop(!strNil(v)
2090 : && re_match_ignore(v, re), keep_nulls);
2091 : }
2092 : } else {
2093 5290 : if (anti)
2094 42604 : pcrescanloop(!strNil(v)
2095 : && !re_match_no_ignore(v, re), keep_nulls);
2096 : else
2097 132170 : pcrescanloop(!strNil(v)
2098 : && re_match_no_ignore(v, re), keep_nulls);
2099 : }
2100 : }
2101 :
2102 80 : bailout:
2103 5433 : bat_iterator_end(&bi);
2104 5432 : re_like_clean(&re, &wpat);
2105 5433 : *rcnt = cnt;
2106 5433 : return msg;
2107 : }
2108 :
2109 : static str
2110 5795 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const str *pat,
2111 : const str *esc, const bit *caseignore, const bit *anti)
2112 : {
2113 5795 : BAT *b, *s = NULL, *bn = NULL, *old_s = NULL;
2114 5795 : str msg = MAL_SUCCEED;
2115 5795 : char *ppat = NULL;
2116 5795 : bool use_re = false,
2117 5795 : use_strcmp = false,
2118 5795 : empty = false,
2119 5795 : ascii_pattern = false;
2120 5795 : bool with_strimps = false;
2121 5795 : bool with_strimps_anti = false;
2122 5795 : BUN p = 0, q = 0, rcnt = 0;
2123 5795 : struct canditer ci;
2124 :
2125 5795 : if ((b = BATdescriptor(*bid)) == NULL) {
2126 0 : msg = createException(MAL, "algebra.likeselect",
2127 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2128 0 : goto bailout;
2129 : }
2130 5795 : if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) {
2131 0 : msg = createException(MAL, "algebra.likeselect",
2132 : SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2133 0 : goto bailout;
2134 : }
2135 :
2136 5794 : assert(ATOMstorage(b->ttype) == TYPE_str);
2137 :
2138 5794 : if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
2139 : *pat, *esc)) != MAL_SUCCEED)
2140 0 : goto bailout;
2141 :
2142 5795 : if (empty) {
2143 0 : if (!(bn = BATdense(0, 0, 0)))
2144 0 : msg = createException(MAL, "algebra.likeselect",
2145 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2146 :
2147 0 : goto bailout;
2148 : }
2149 : /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
2150 : * set will necessarily reject some of the matching entries in the NOT LIKE query.
2151 : *
2152 : * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
2153 : * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
2154 : * the BAT contains NULLs.
2155 : */
2156 5795 : if (BAThasstrimps(b)) {
2157 24 : if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
2158 24 : BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
2159 24 : if (tmp_s) {
2160 24 : old_s = s;
2161 24 : s = tmp_s;
2162 24 : if (!*anti)
2163 : with_strimps = true;
2164 : else
2165 0 : with_strimps_anti = true;
2166 : }
2167 : } else { /* If we cannot filter with the strimp just continue normally */
2168 0 : GDKclrerr();
2169 : }
2170 : }
2171 :
2172 :
2173 5795 : MT_thread_setalgorithm(use_strcmp
2174 5795 : ? (with_strimps ?
2175 : "pcrelike: pattern matching using strcmp with strimps"
2176 : : (with_strimps_anti ?
2177 : "pcrelike: pattern matching using strcmp with strimps anti"
2178 5795 : : "pcrelike: pattern matching using strcmp")) :
2179 5705 : use_re ? (with_strimps ?
2180 : "pcrelike: pattern matching using RE with strimps"
2181 : : (with_strimps_anti ?
2182 : "pcrelike: patterm matching using RE with strimps anti"
2183 : :
2184 : "pcrelike: pattern matching using RE"))
2185 : : (with_strimps ?
2186 : "pcrelike: pattern matching using pcre with strimps"
2187 : : (with_strimps_anti ?
2188 : "pcrelike: pattermatching using pcre with strimps anti"
2189 : : "pcrelike: pattern matching using pcre")));
2190 :
2191 5795 : canditer_init(&ci, b, s);
2192 5795 : if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) {
2193 0 : msg = createException(MAL, "algebra.likeselect",
2194 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2195 0 : goto bailout;
2196 : }
2197 :
2198 5795 : if (!s || BATtdense(s)) {
2199 1469 : if (s) {
2200 4242 : assert(BATtdense(s));
2201 4242 : p = (BUN) s->tseqbase;
2202 4242 : q = p + BATcount(s);
2203 4242 : if ((oid) p < b->hseqbase)
2204 : p = b->hseqbase;
2205 4242 : if ((oid) q > b->hseqbase + BATcount(b))
2206 : q = b->hseqbase + BATcount(b);
2207 : } else {
2208 1469 : p = b->hseqbase;
2209 1469 : q = BATcount(b) + b->hseqbase;
2210 : }
2211 : }
2212 :
2213 5795 : if (use_re) {
2214 5433 : msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti
2215 791 : && !with_strimps_anti, use_strcmp,
2216 5433 : (unsigned char) **esc, with_strimps_anti,
2217 : ascii_pattern);
2218 : } else {
2219 362 : msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, *caseignore,
2220 362 : *anti && !with_strimps_anti, with_strimps_anti);
2221 : }
2222 :
2223 5795 : if (!msg) { /* set some properties */
2224 5795 : BATsetcount(bn, rcnt);
2225 5795 : bn->tsorted = true;
2226 5795 : bn->trevsorted = bn->batCount <= 1;
2227 5795 : bn->tkey = true;
2228 5795 : bn->tnil = false;
2229 5795 : bn->tnonil = true;
2230 5795 : bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil;
2231 5795 : if (with_strimps_anti) {
2232 : /* Reverse the result taking into account the original candidate list. */
2233 : // BAT *rev = BATdiffcand(BATdense(b->hseqbase, 0, b->batCount), bn);
2234 0 : BAT *rev;
2235 0 : if (old_s) {
2236 0 : rev = BATdiffcand(old_s, bn);
2237 : #ifndef NDEBUG
2238 0 : BAT *is = BATintersectcand(old_s, bn);
2239 0 : if (is) {
2240 0 : assert(is->batCount == bn->batCount);
2241 0 : BBPreclaim(is);
2242 : }
2243 0 : assert(rev->batCount == old_s->batCount - bn->batCount);
2244 : #endif
2245 : }
2246 :
2247 : else
2248 0 : rev = BATnegcands(b->batCount, bn);
2249 : /* BAT *rev = BATnegcands(b->batCount, bn); */
2250 0 : BBPunfix(bn->batCacheid);
2251 0 : bn = rev;
2252 : }
2253 : }
2254 :
2255 :
2256 5795 : bailout:
2257 5795 : BBPreclaim(b);
2258 5795 : BBPreclaim(s);
2259 5794 : BBPreclaim(old_s);
2260 5794 : GDKfree(ppat);
2261 5794 : if (bn && !msg) {
2262 5794 : *ret = bn->batCacheid;
2263 5794 : BBPkeepref(bn);
2264 0 : } else if (bn)
2265 0 : BBPreclaim(bn);
2266 5794 : return msg;
2267 : }
2268 :
/* Store oid `o' in the next free slot of the tail array of BAT `b' and
 * increment the BAT's count.  No capacity check is made here.  `b' is
 * evaluated twice, so it must not have side effects. */
#define APPEND(b, o)	(((oid *) (b)->theap->base)[(b)->batCount++] = (o))
/* Address of the string at position `x' on side `s' (token-pasted, so the
 * expansion uses the locals <s>vars, <s>vals and <s>i). */
#define VALUE(s, x)		(s##vars + VarHeapVal(s##vals, (x), s##i.width))
2271 :
/*
 * PCRE_EXEC runs the compiled pattern `pcrere' on the string `vl' and
 * leaves the library's return code in `retval'; PCRE_EXEC_COND is true
 * when that return code means "no match".  All three names are locals of
 * the code in which the macros are expanded.
 */
#ifndef HAVE_LIBPCRE
/* POSIX regex fallback */
#define PCRE_EXEC													\
	((void) (retval = regexec(&pcrere, vl, (size_t) 0, NULL, 0)))
#define PCRE_EXEC_COND	(retval == REG_NOMATCH || retval == REG_ENOSYS)
#else
/* libpcre */
#define PCRE_EXEC													\
	((void) (retval = pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, PCRE_NO_UTF8_CHECK, NULL, 0)))
#define PCRE_EXEC_COND	(retval < 0)
#endif
2285 :
2286 : /* Nested-loop implementation for the PCRE (LIKE) join.
 *
 * The outer loop walks the candidates of the right-hand side; each right
 * value vr is used as a pattern.  choose_like_path decides per pattern
 * whether it is matched with a string comparison / the simple RE matcher
 * (use_re) or with a compiled PCRE pattern, and the matcher is built with
 * re_like_build or pcre_like_build.  Unless the pattern was flagged
 * `empty', the inner loop then walks all left candidates, skipping nil
 * strings, and appends (lo, ro) to r1 (and to r2 when it is not NULL) for
 * every pair that is not rejected.  r1/r2 are grown with BATextend when
 * full, and their tsorted/trevsorted/tkey/tseqbase properties are kept up
 * to date as oids are appended.  The matcher is released again after each
 * pattern.
 *
 * The three arguments are REJECTION tests; a true value skips the pair:
 *   STRCMP    used when use_re && use_strcmp
 *   RE_MATCH  used when use_re && !use_strcmp
 *   PCRE_COND used otherwise, evaluated after PCRE_EXEC has set `retval'
 *
 * The expansion relies on these names from the enclosing function: lci,
 * rci, lo, ro, lbase, rbase, vl, vr, nl, newcap, lastl, rskipped, r1, r2,
 * re, wpat, pcrere, pcreex, pcrepat, use_re, use_strcmp, empty,
 * ascii_pattern, esc, caseignore, msg, counter, qry_ctx, the locals used
 * by VALUE(l, ...) and VALUE(r, ...), and the label `bailout'.
 *
 * On failure of choose_like_path, re_like_build, pcre_like_build or
 * BATextend, control jumps to `bailout' with msg set.  The timeout check
 * is done once per right-hand value and also targets `bailout'.
 */
2287 : #define pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND) \
2288 :         do { \
2289 : for (BUN ridx = 0; ridx < rci.ncand; ridx++) { \
2290 : GDK_CHECK_TIMEOUT(qry_ctx, counter, \
2291 : GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
2292 : ro = canditer_next(&rci); \
2293 : vr = VALUE(r, ro - rbase); \
2294 : nl = 0; \
2295 : ascii_pattern = use_re = use_strcmp = empty = false; \
2296 : if ((msg = choose_like_path(&pcrepat, &use_re, &use_strcmp, &empty, &ascii_pattern, vr, esc))) \
2297 : goto bailout; \
2298 : if (!empty) { \
2299 : if (use_re) { \
2300 : if ((msg = re_like_build(&re, &wpat, vr, caseignore, use_strcmp, ascii_pattern, (unsigned char) *esc)) != MAL_SUCCEED) \
2301 : goto bailout; \
2302 : } else if (pcrepat) { \
2303 : if ((msg = pcre_like_build(&pcrere, &pcreex, pcrepat, caseignore, lci.ncand)) != MAL_SUCCEED) \
2304 : goto bailout; \
2305 : GDKfree(pcrepat); \
2306 : pcrepat = NULL; \
2307 : } \
2308 : canditer_reset(&lci); \
2309 : for (BUN lidx = 0; lidx < lci.ncand; lidx++) { \
2310 : lo = canditer_next(&lci); \
2311 : vl = VALUE(l, lo - lbase); \
2312 : if (strNil(vl)) { \
2313 : continue; \
2314 : } else if (use_re) { \
2315 : if (use_strcmp) { \
2316 : if (STRCMP) \
2317 : continue; \
2318 : } else { \
2319 : assert(re); \
2320 : if (RE_MATCH) \
2321 : continue; \
2322 : } \
2323 : } else { \
2324 : int retval; \
2325 : PCRE_EXEC; \
2326 : if (PCRE_COND) \
2327 : continue; \
2328 : } \
2329 : if (BATcount(r1) == BATcapacity(r1)) { \
2330 : newcap = BATgrows(r1); \
2331 : BATsetcount(r1, BATcount(r1)); \
2332 : if (r2) \
2333 : BATsetcount(r2, BATcount(r2)); \
2334 : if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
2335 : msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
2336 : goto bailout; \
2337 : } \
2338 : assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
2339 : } \
2340 : if (BATcount(r1) > 0) { \
2341 : if (lastl + 1 != lo) \
2342 : r1->tseqbase = oid_nil; \
2343 : if (nl == 0) { \
2344 : if (r2) \
2345 : r2->trevsorted = false; \
2346 : if (lastl > lo) { \
2347 : r1->tsorted = false; \
2348 : r1->tkey = false; \
2349 : } else if (lastl < lo) { \
2350 : r1->trevsorted = false; \
2351 : } else { \
2352 : r1->tkey = false; \
2353 : } \
2354 : } \
2355 : } \
2356 : APPEND(r1, lo); \
2357 : if (r2) \
2358 : APPEND(r2, ro); \
2359 : lastl = lo; \
2360 : nl++; \
2361 : } \
2362 : re_like_clean(&re, &wpat); \
2363 : pcre_clean(&pcrere, &pcreex); \
2364 : } \
2365 : if (r2) { \
2366 : if (nl > 1) { \
2367 : r2->tkey = false; \
2368 : r2->tseqbase = oid_nil; \
2369 : r1->trevsorted = false; \
2370 : } else if (nl == 0) { \
2371 : rskipped = BATcount(r2) > 0; \
2372 : } else if (rskipped) { \
2373 : r2->tseqbase = oid_nil; \
2374 : } \
2375 : } else if (nl > 1) { \
2376 : r1->trevsorted = false; \
2377 : } \
2378 : } \
2379 :         } while (0)
2380 :
2381 : static char *
2382 43 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
2383 : bit caseignore, bit anti)
2384 : {
2385 43 : struct canditer lci, rci;
2386 43 : const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
2387 43 : int rskipped = 0; /* whether we skipped values in r */
2388 43 : oid lbase, rbase, lo, ro, lastl = 0; /* last value inserted into r1 */
2389 43 : BUN nl, newcap;
2390 43 : char *pcrepat = NULL, *msg = MAL_SUCCEED;
2391 43 : struct RE *re = NULL;
2392 43 : bool use_re = false,
2393 43 : use_strcmp = false,
2394 43 : empty = false,
2395 43 : ascii_pattern = false;
2396 43 : uint32_t *wpat = NULL;
2397 : #ifdef HAVE_LIBPCRE
2398 43 : pcre *pcrere = NULL;
2399 43 : pcre_extra *pcreex = NULL;
2400 : #else
2401 : regex_t pcrere = (regex_t) { 0 };
2402 : void *pcreex = NULL;
2403 : #endif
2404 :
2405 43 : size_t counter = 0;
2406 43 : QryCtx *qry_ctx = MT_thread_get_qry_ctx();
2407 :
2408 43 : TRC_DEBUG(ALGO,
2409 : "pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
2410 : "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
2411 : "sr=%s#" BUNFMT "%s%s)\n",
2412 : BATgetId(l), BATcount(l), ATOMname(l->ttype),
2413 : l->tsorted ? "-sorted" : "",
2414 : l->trevsorted ? "-revsorted" : "",
2415 : BATgetId(r), BATcount(r), ATOMname(r->ttype),
2416 : r->tsorted ? "-sorted" : "",
2417 : r->trevsorted ? "-revsorted" : "",
2418 : sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
2419 : sl && sl->tsorted ? "-sorted" : "",
2420 : sl && sl->trevsorted ? "-revsorted" : "",
2421 : sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
2422 : sr && sr->tsorted ? "-sorted" : "",
2423 : sr && sr->trevsorted ? "-revsorted" : "");
2424 :
2425 129 : assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
2426 43 : assert(ATOMtype(l->ttype) == TYPE_str);
2427 :
2428 43 : canditer_init(&lci, l, sl);
2429 43 : canditer_init(&rci, r, sr);
2430 :
2431 43 : BATiter li = bat_iterator(l);
2432 43 : BATiter ri = bat_iterator(r);
2433 43 : lbase = l->hseqbase;
2434 43 : rbase = r->hseqbase;
2435 43 : lvals = (const char *) li.base;
2436 43 : rvals = (const char *) ri.base;
2437 43 : assert(ri.vh && r->ttype);
2438 43 : lvars = li.vh->base;
2439 43 : rvars = ri.vh->base;
2440 :
2441 43 : r1->tkey = true;
2442 43 : r1->tsorted = true;
2443 43 : r1->trevsorted = true;
2444 43 : r1->tnil = false;
2445 43 : r1->tnonil = true;
2446 43 : if (r2) {
2447 26 : r2->tkey = true;
2448 26 : r2->tsorted = true;
2449 26 : r2->trevsorted = true;
2450 26 : r2->tnil = false;
2451 26 : r2->tnonil = true;
2452 : }
2453 :
2454 43 : if (anti) {
2455 23 : if (caseignore) {
2456 123 : pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) == 0 : mywstrcasecmp(vl, wpat) == 0,
2457 : re_match_ignore(vl, re), !PCRE_EXEC_COND);
2458 : } else {
2459 328 : pcre_join_loop(strcmp(vl, vr) == 0, re_match_no_ignore(vl, re), !PCRE_EXEC_COND);
2460 : }
2461 : } else {
2462 20 : if (caseignore) {
2463 5 : pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) != 0 : mywstrcasecmp(vl, wpat) != 0,
2464 : !re_match_ignore(vl, re), PCRE_EXEC_COND);
2465 : } else {
2466 381 : pcre_join_loop(strcmp(vl, vr) != 0, !re_match_no_ignore(vl, re), PCRE_EXEC_COND);
2467 : }
2468 : }
2469 43 : bat_iterator_end(&li);
2470 43 : bat_iterator_end(&ri);
2471 :
2472 43 : assert(!r2 || BATcount(r1) == BATcount(r2));
2473 : /* also set other bits of heap to correct value to indicate size */
2474 43 : BATsetcount(r1, BATcount(r1));
2475 43 : if (r2)
2476 26 : BATsetcount(r2, BATcount(r2));
2477 43 : if (BATcount(r1) > 0) {
2478 30 : if (BATtdense(r1))
2479 7 : r1->tseqbase = ((oid *) r1->theap->base)[0];
2480 30 : if (r2 && BATtdense(r2))
2481 14 : r2->tseqbase = ((oid *) r2->theap->base)[0];
2482 : } else {
2483 13 : r1->tseqbase = 0;
2484 13 : if (r2)
2485 6 : r2->tseqbase = 0;
2486 : }
2487 20 : if (r2)
2488 26 : TRC_DEBUG(ALGO,
2489 : "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
2490 : BATgetId(l), BATgetId(r),
2491 : BATgetId(r1), BATcount(r1),
2492 : r1->tsorted ? "-sorted" : "",
2493 : r1->trevsorted ? "-revsorted" : "",
2494 : BATgetId(r2), BATcount(r2),
2495 : r2->tsorted ? "-sorted" : "",
2496 : r2->trevsorted ? "-revsorted" : "");
2497 : else
2498 17 : TRC_DEBUG(ALGO,
2499 : "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s\n",
2500 : BATgetId(l), BATgetId(r),
2501 : BATgetId(r1), BATcount(r1),
2502 : r1->tsorted ? "-sorted" : "",
2503 : r1->trevsorted ? "-revsorted" : "");
2504 : return MAL_SUCCEED;
2505 :
2506 0 : bailout:
2507 0 : bat_iterator_end(&li);
2508 0 : bat_iterator_end(&ri);
2509 0 : GDKfree(pcrepat);
2510 0 : re_like_clean(&re, &wpat);
2511 0 : pcre_clean(&pcrere, &pcreex);
2512 0 : assert(msg != MAL_SUCCEED);
2513 : return msg;
2514 : }
2515 :
2516 : static str
2517 43 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
2518 : bat ciid, bit anti)
2519 : {
2520 43 : BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
2521 43 : *candleft = NULL, *candright = NULL;
2522 43 : BAT *result1 = NULL, *result2 = NULL;
2523 43 : char *msg = MAL_SUCCEED;
2524 43 : const char *esc = "";
2525 43 : bit ci;
2526 43 : BATiter bi;
2527 :
2528 43 : if ((left = BATdescriptor(lid)) == NULL)
2529 0 : goto fail;
2530 43 : if ((right = BATdescriptor(rid)) == NULL)
2531 0 : goto fail;
2532 43 : if ((escape = BATdescriptor(elid)) == NULL)
2533 0 : goto fail;
2534 43 : if ((caseignore = BATdescriptor(ciid)) == NULL)
2535 0 : goto fail;
2536 43 : if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
2537 0 : goto fail;
2538 43 : if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
2539 0 : goto fail;
2540 43 : result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
2541 43 : if (r2)
2542 26 : result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
2543 43 : if (!result1 || (r2 && !result2)) {
2544 0 : msg = createException(MAL, "pcre.join",
2545 : SQLSTATE(HY013) MAL_MALLOC_FAIL);
2546 0 : goto fail;
2547 : }
2548 43 : result1->tnil = false;
2549 43 : result1->tnonil = true;
2550 43 : result1->tkey = true;
2551 43 : result1->tsorted = true;
2552 43 : result1->trevsorted = true;
2553 43 : result1->tseqbase = 0;
2554 43 : if (r2) {
2555 26 : result2->tnil = false;
2556 26 : result2->tnonil = true;
2557 26 : result2->tkey = true;
2558 26 : result2->tsorted = true;
2559 26 : result2->trevsorted = true;
2560 26 : result2->tseqbase = 0;
2561 : }
2562 43 : if (BATcount(escape) != 1) {
2563 0 : msg = createException(MAL, "pcre.join",
2564 : SQLSTATE(42000)
2565 : "At the moment, only one value is allowed for the escape input at pcre join");
2566 0 : goto fail;
2567 : }
2568 43 : if (BATcount(caseignore) != 1) {
2569 0 : msg = createException(MAL, "pcre.join",
2570 : SQLSTATE(42000)
2571 : "At the moment, only one value is allowed for the case ignore input at pcre join");
2572 0 : goto fail;
2573 : }
2574 43 : bi = bat_iterator(caseignore);
2575 43 : ci = *(bit *) BUNtloc(bi, 0);
2576 43 : bat_iterator_end(&bi);
2577 43 : bi = bat_iterator(escape);
2578 43 : esc = BUNtvar(bi, 0);
2579 43 : msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
2580 : anti);
2581 43 : bat_iterator_end(&bi);
2582 43 : if (msg)
2583 0 : goto fail;
2584 43 : *r1 = result1->batCacheid;
2585 43 : BBPkeepref(result1);
2586 43 : if (r2) {
2587 26 : *r2 = result2->batCacheid;
2588 26 : BBPkeepref(result2);
2589 : }
2590 43 : BBPunfix(left->batCacheid);
2591 43 : BBPunfix(right->batCacheid);
2592 43 : BBPreclaim(escape);
2593 43 : BBPreclaim(caseignore);
2594 43 : BBPreclaim(candleft);
2595 43 : BBPreclaim(candright);
2596 : return MAL_SUCCEED;
2597 :
2598 0 : fail:
2599 0 : BBPreclaim(left);
2600 0 : BBPreclaim(right);
2601 0 : BBPreclaim(escape);
2602 0 : BBPreclaim(caseignore);
2603 0 : BBPreclaim(candleft);
2604 0 : BBPreclaim(candright);
2605 0 : BBPreclaim(result1);
2606 0 : BBPreclaim(result2);
2607 0 : if (msg)
2608 : return msg;
2609 0 : throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
2610 : }
2611 :
2612 : static str
2613 26 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
2614 : const bat *cid, const bat *slid, const bat *srid,
2615 : const bit *nil_matches, const lng *estimate, const bit *anti)
2616 : {
2617 26 : (void) nil_matches;
2618 26 : (void) estimate;
2619 26 : return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
2620 26 : *elid, *cid, *anti);
2621 : }
2622 :
2623 : static str
2624 17 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
2625 : const bat *cid, const bat *slid, const bat *srid,
2626 : const bit *nil_matches, const lng *estimate, const bit *anti)
2627 : {
2628 17 : (void) nil_matches;
2629 17 : (void) estimate;
2630 17 : return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
2631 17 : *elid, *cid, *anti);
2632 : }
2633 :
2634 : #include "mel.h"
2635 : mel_atom pcre_init_atoms[] = {
2636 : { .name="pcre", }, { .cmp=NULL }
2637 : };
2638 : mel_func pcre_init_funcs[] = {
2639 : command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
2640 : command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2641 : command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
2642 : command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
2643 : command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2644 : command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2645 : command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
2646 : command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
2647 : command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
2648 : command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2649 : command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
2650 : command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2651 : command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2652 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2653 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2654 : pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2655 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
2656 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2657 : pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
2658 : command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds. The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
2659 : command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
2660 : command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
2661 : { .imp=NULL }
2662 : };
2663 : #include "mal_import.h"
2664 : #ifdef _MSC_VER
2665 : #undef read
2666 : #pragma section(".CRT$XCU",read)
2667 : #endif
2668 334 : LIB_STARTUP_FUNC(init_pcre_mal)
2669 334 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }
|