LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - pcre.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1019 1289 79.1 %
Date: 2024-04-25 23:25:41 Functions: 51 56 91.1 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : /*
      14             :  * N. Nes
      15             :  * PCRE library interface
      16             :  * The PCRE library is a set of functions that implement regular
      17             :  * expression pattern matching using the same syntax and semantics as Perl,
      18             :  * with just a few differences.  The current implementation of PCRE
      19             :  * (release 4.x) corresponds approximately with Perl 5.8, including support
      20             :  * for UTF-8 encoded strings.  However, this support has to be
      21             :  * explicitly enabled; it is not the default.
      22             :  *
      23             :  * ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre
      24             :  */
      25             : #include "monetdb_config.h"
      26             : #include <string.h>
      27             : 
      28             : #include "mal.h"
      29             : #include "mal_client.h"
      30             : #include "mal_interpreter.h"
      31             : #include "mal_exception.h"
      32             : 
      33             : #include <wchar.h>
      34             : #include <wctype.h>
      35             : 
      36             : #ifdef HAVE_LIBPCRE
      37             : #include <pcre.h>
      38             : #ifndef PCRE_STUDY_JIT_COMPILE
      39             : /* old library version on e.g. EPEL 6 */
      40             : #define pcre_free_study(x)              pcre_free(x)
      41             : #define PCRE_STUDY_JIT_COMPILE  0
      42             : #endif
      43             : #define JIT_COMPILE_MIN 1024    /* when to try JIT compilation of patterns */
      44             : 
      45             : #else
      46             : 
      47             : #include <regex.h>
      48             : 
      49             : typedef regex_t pcre;
      50             : #endif
      51             : 
      52             : /* current implementation assumes simple %keyword% [keyw%]* */
      53             : struct RE {
      54             :         char *k;
      55             :         uint32_t *w;
      56             :         bool search:1, atend:1, is_ascii:1, case_ignore:1;
      57             :         size_t len;
      58             :         struct RE *n;
      59             : };
      60             : 
      61             : /* We cannot use strcasecmp and strncasecmp since they work byte for
      62             :  * byte and don't deal with multibyte encodings (such as UTF-8).
      63             :  *
      64             :  * We implement our own conversion from UTF-8 encoding to Unicode code
      65             :  * points which we store in uint32_t.  The reason for this is,
      66             :  * functions like mbsrtowcs are locale-dependent (so we need a UTF-8
      67             :  * locale to use them), and on Windows, wchar_t is only 2 bytes and
      68             :  * therefore cannot hold all Unicode code points.  We do use functions
      69             :  * such as towlower to convert a Unicode code point to its lower-case
      70             :  * equivalent, but again on Windows, if the code point doesn't fit in
      71             :  * 2 bytes, we skip this conversion and compare the unconverted code
      72             :  * points.
      73             :  *
      74             :  * Note, towlower is also locale-dependent, but we don't need a UTF-8
      75             :  * locale in order to use it. */
      76             : 
      77             : /* helper function to convert a UTF-8 multibyte character to a wide
      78             :  * character */
      79             : static size_t
      80         274 : utfc8touc(uint32_t *restrict dest, const char *restrict src)
      81             : {
      82         274 :         if ((src[0] & 0x80) == 0) {
      83         217 :                 *dest = src[0];
      84         217 :                 return src[0] != 0;
      85          57 :         } else if ((src[0] & 0xE0) == 0xC0
      86          40 :                            && (src[1] & 0xC0) == 0x80 && (src[0] & 0x1E) != 0) {
      87          40 :                 *dest = (src[0] & 0x1F) << 6 | (src[1] & 0x3F);
      88          40 :                 return 2;
      89          17 :         } else if ((src[0] & 0xF0) == 0xE0
      90          17 :                            && (src[1] & 0xC0) == 0x80
      91          17 :                            && (src[2] & 0xC0) == 0x80
      92          17 :                            && ((src[0] & 0x0F) != 0 || (src[1] & 0x20) != 0)) {
      93          17 :                 *dest = (src[0] & 0x0F) << 12 | (src[1] & 0x3F) << 6 | (src[2] & 0x3F);
      94          17 :                 return 3;
      95           0 :         } else if ((src[0] & 0xF8) == 0xF0
      96           0 :                            && (src[1] & 0xC0) == 0x80
      97           0 :                            && (src[2] & 0xC0) == 0x80 && (src[3] & 0xC0) == 0x80) {
      98           0 :                 uint32_t c = (src[0] & 0x07) << 18
      99           0 :                                 | (src[1] & 0x3F) << 12
     100           0 :                                 | (src[2] & 0x3F) << 6 | (src[3] & 0x3F);
     101           0 :                 if (c < 0x10000 || c > 0x10FFFF || (c & 0x1FF800) == 0x00D800)
     102             :                         return (size_t) -1;
     103           0 :                 *dest = c;
     104           0 :                 return 4;
     105             :         }
     106             :         return (size_t) -1;
     107             : }
     108             : 
     109             : /* helper function to convert a UTF-8 string to a wide character
     110             :  * string, the wide character string is allocated */
     111             : static uint32_t *
     112          50 : utf8stoucs(const char *src)
     113             : {
     114          50 :         uint32_t *dest;
     115          50 :         size_t i = 0;
     116          50 :         size_t j = 0;
     117             : 
     118             :         /* count how many uint32_t's we need, while also checking for
     119             :          * correctness of the input */
     120         272 :         while (src[j]) {
     121         222 :                 i++;
     122         222 :                 if ((src[j + 0] & 0x80) == 0) {
     123         172 :                         j += 1;
     124          50 :                 } else if ((src[j + 0] & 0xE0) == 0xC0
     125          25 :                                    && (src[j + 1] & 0xC0) == 0x80 && (src[j + 0] & 0x1E) != 0) {
     126          25 :                         j += 2;
     127          25 :                 } else if ((src[j + 0] & 0xF0) == 0xE0
     128          25 :                                    && (src[j + 1] & 0xC0) == 0x80
     129          25 :                                    && (src[j + 2] & 0xC0) == 0x80
     130          25 :                                    && ((src[j + 0] & 0x0F) != 0 || (src[j + 1] & 0x20) != 0)) {
     131          25 :                         j += 3;
     132           0 :                 } else if ((src[j + 0] & 0xF8) == 0xF0
     133           0 :                                    && (src[j + 1] & 0xC0) == 0x80
     134           0 :                                    && (src[j + 2] & 0xC0) == 0x80
     135           0 :                                    && (src[j + 3] & 0xC0) == 0x80) {
     136           0 :                         uint32_t c = (src[j + 0] & 0x07) << 18
     137           0 :                                         | (src[j + 1] & 0x3F) << 12
     138           0 :                                         | (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
     139           0 :                         if (c < 0x10000 || c > 0x10FFFF || (c & 0x1FF800) == 0x00D800)
     140             :                                 return NULL;
     141           0 :                         j += 4;
     142             :                 } else {
     143             :                         return NULL;
     144             :                 }
     145             :         }
     146          50 :         dest = GDKmalloc((i + 1) * sizeof(uint32_t));
     147          50 :         if (dest == NULL)
     148             :                 return NULL;
     149             :         /* go through the source string again, this time we can skip
     150             :          * the correctness tests */
     151             :         i = j = 0;
     152         272 :         while (src[j]) {
     153         222 :                 if ((src[j + 0] & 0x80) == 0) {
     154         172 :                         dest[i++] = src[j + 0];
     155         172 :                         j += 1;
     156          50 :                 } else if ((src[j + 0] & 0xE0) == 0xC0) {
     157          25 :                         dest[i++] = (src[j + 0] & 0x1F) << 6 | (src[j + 1] & 0x3F);
     158          25 :                         j += 2;
     159          25 :                 } else if ((src[j + 0] & 0xF0) == 0xE0) {
     160          25 :                         dest[i++] = (src[j + 0] & 0x0F) << 12
     161          25 :                                         | (src[j + 1] & 0x3F) << 6 | (src[j + 2] & 0x3F);
     162          25 :                         j += 3;
     163           0 :                 } else if ((src[j + 0] & 0xF8) == 0xF0) {
     164           0 :                         dest[i++] = (src[j + 0] & 0x07) << 18
     165           0 :                                         | (src[j + 1] & 0x3F) << 12
     166           0 :                                         | (src[j + 2] & 0x3F) << 6 | (src[j + 3] & 0x3F);
     167           0 :                         j += 4;
     168             :                 }
     169             :         }
     170          50 :         dest[i] = 0;
     171          50 :         return dest;
     172             : }
     173             : 
     174             : static size_t
     175          33 : myucslen(const uint32_t *ucs)
     176             : {
     177          33 :         size_t i = 0;
     178             : 
     179          66 :         while (ucs[i])
     180          33 :                 i++;
     181          33 :         return i;
     182             : }
     183             : 
     184             : static inline bool
     185          14 : mywstrncaseeq(const char *restrict s1, const uint32_t *restrict s2, size_t n2,
     186             :                           bool atend)
     187             : {
     188          14 :         uint32_t c1;
     189             : 
     190          27 :         while (n2 > 0) {
     191          20 :                 size_t nn1 = utfc8touc(&c1, s1);
     192          20 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     193           0 :                         return (*s2 == 0);
     194          20 :                 if (*s2 == 0)
     195             :                         return false;
     196             : #if SIZEOF_WCHAR_T == 2
     197             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     198             :                         if (c1 != *s2)
     199             :                                 return false;
     200             :                 } else
     201             : #endif
     202          20 :                 if (towlower((wint_t) c1) != towlower((wint_t) * s2))
     203             :                         return false;
     204          13 :                 s1 += nn1;
     205          13 :                 n2--;
     206          13 :                 s2++;
     207             :         }
     208          14 :         return !atend || *s1 == 0;
     209             : }
     210             : 
     211             : static inline int
     212           0 : mystrcasecmp(const char *s1, const char *s2)
     213             : {
     214           0 :         uint32_t c1 = 0, c2 = 0;
     215             : 
     216           0 :         for (;;) {
     217           0 :                 size_t nn1 = utfc8touc(&c1, s1);
     218           0 :                 size_t nn2 = utfc8touc(&c2, s2);
     219           0 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     220           0 :                         return -(nn2 != 0 && nn2 != (size_t) -1);
     221           0 :                 if (nn2 == 0 || nn2 == (size_t) -1)
     222             :                         return 1;
     223             : #if SIZEOF_WCHAR_T == 2
     224             :                 if (c1 > 0xFFFF || c2 > 0xFFFF) {
     225             :                         if (c1 != c2)
     226             :                                 return c1 - c2;
     227             :                 } else
     228             : #endif
     229           0 :                 if (towlower((wint_t) c1) != towlower((wint_t) c2))
     230           0 :                         return towlower((wint_t) c1) - towlower((wint_t) c2);
     231           0 :                 s1 += nn1;
     232           0 :                 s2 += nn2;
     233             :         }
     234             : }
     235             : 
     236             : static inline int
     237          42 : mywstrcasecmp(const char *restrict s1, const uint32_t *restrict s2)
     238             : {
     239          42 :         uint32_t c1 = 0;
     240             : 
     241         330 :         for (;;) {
     242         186 :                 size_t nn1 = utfc8touc(&c1, s1);
     243         186 :                 if (nn1 == 0 || nn1 == (size_t) -1)
     244          22 :                         return -(*s2 != 0);
     245         164 :                 if (*s2 == 0)
     246             :                         return 1;
     247             : #if SIZEOF_WCHAR_T == 2
     248             :                 if (c1 > 0xFFFF || *s2 > 0xFFFF) {
     249             :                         if (c1 != *s2)
     250             :                                 return c1 - *s2;
     251             :                 } else
     252             : #endif
     253         164 :                 if (towlower((wint_t) c1) != towlower((wint_t) * s2))
     254          20 :                         return towlower((wint_t) c1) - towlower((wint_t) * s2);
     255         144 :                 s1 += nn1;
     256         144 :                 s2++;
     257             :         }
     258             : }
     259             : 
     260             : static inline const char *
     261          33 : mywstrcasestr(const char *restrict haystack, const uint32_t *restrict wneedle,
     262             :                           bool atend)
     263             : {
     264          33 :         size_t nlen = myucslen(wneedle);
     265             : 
     266          33 :         if (nlen == 0)
     267           0 :                 return atend ? haystack + strlen(haystack) : haystack;
     268             : 
     269          86 :         while (*haystack) {
     270             :                 size_t i;
     271             :                 size_t h;
     272             :                 size_t step = 0;
     273          83 :                 for (i = h = 0; i < nlen; i++) {
     274          68 :                         uint32_t c = 0;
     275          68 :                         size_t j = utfc8touc(&c, haystack + h);
     276          68 :                         if (j == 0 || j == (size_t) -1)
     277           0 :                                 return NULL;
     278          68 :                         if (i == 0) {
     279          68 :                                 step = j;
     280             :                         }
     281             : #if SIZEOF_WCHAR_T == 2
     282             :                         if (c > 0xFFFF || wneedle[i] > 0xFFFF) {
     283             :                                 if (c != wneedle[i])
     284             :                                         break;
     285             :                         } else
     286             : #endif
     287          68 :                         if (towlower((wint_t) c) != towlower((wint_t) wneedle[i]))
     288             :                                 break;
     289          15 :                         h += j;
     290             :                 }
     291          68 :                 if (i == nlen && (!atend || haystack[h] == 0))
     292          15 :                         return haystack;
     293          53 :                 haystack += step;
     294             :         }
     295             :         return NULL;
     296             : }
     297             : 
     298             : /* returns true if the pattern does not contain an unescaped `_'
     299             :  * (single character match); one leading `%' (any sequence match)
     300             :  * is skipped before the scan */
     301             : static inline bool
     302        6471 : re_simple(const char *pat, unsigned char esc)
     303             : {
     304        6471 :         bool escaped = false;
     305             : 
     306        6471 :         if (pat == 0)
     307             :                 return false;
     308        6471 :         if (*pat == '%') {
     309        5700 :                 pat++;
     310             :         }
     311       46910 :         while (*pat) {
     312       41020 :                 if (escaped) {
     313             :                         escaped = false;
     314       40877 :                 } else if ((unsigned char) *pat == esc) {
     315             :                         escaped = true;
     316       40734 :                 } else if (*pat == '_') {
     317             :                         return false;
     318             :                 }
     319       40439 :                 pat++;
     320             :         }
     321             :         return true;
     322             : }
     323             : 
     324             : static inline bool
     325        7344 : re_is_pattern_properly_escaped(const char *pat, unsigned char esc)
     326             : {
     327        7344 :         bool escaped = false;
     328             : 
     329        7344 :         if (pat == 0)
     330             :                 return true;
     331       61605 :         while (*pat) {
     332       54261 :                 if (escaped) {
     333             :                         escaped = false;
     334       54109 :                 } else if ((unsigned char) *pat == esc) {
     335       54261 :                         escaped = true;
     336             :                 }
     337       54261 :                 pat++;
     338             :         }
     339        7344 :         return escaped ? false : true;
     340             : }
     341             : 
     342             : /* returns true if the pattern does not contain wildcard
     343             :  * characters ('%' or '_') and no character is escaped
     344             :  */
     345             : static inline bool
     346        7345 : is_strcmpable(const char *pat, const char *esc)
     347             : {
     348        7345 :         if (pat[strcspn(pat, "%_")])
     349             :                 return false;
     350        1814 :         return strlen(esc) == 0 || strNil(esc) || strstr(pat, esc) == NULL;
     351             : }
     352             : 
     353             : /* Compare two strings ignoring case. When both strings are
     354             :  * lower case this function returns the same result as strcmp.
     355             :  */
     356             : static int
     357        1204 : istrcmp(const char *s1, const char *s2)
     358             : {
     359        1204 :         char c1, c2;
     360        1204 :         const char *p1, *p2;
     361        1629 :         for (p1 = s1, p2 = s2; *p1 && *p2; p1++, p2++) {
     362        1113 :                 c1 = *p1;
     363        1113 :                 c2 = *p2;
     364             : 
     365        1113 :                 if ('A' <= c1 && c1 <= 'Z')
     366         611 :                         c1 += 'a' - 'A';
     367             : 
     368        1113 :                 if ('A' <= c2 && c2 <= 'Z')
     369         656 :                         c2 += 'a' - 'A';
     370             : 
     371        1113 :                 if (c1 != c2)
     372         688 :                         return (c1 - c2);
     373             :         }
     374             : 
     375         516 :         if (*p1 != *p2)
     376         443 :                 return *p1 - *p2;
     377             : 
     378             :         return 0;
     379             : }
     380             : 
     381             : /* Compare at most len characters of two strings ignoring
     382             :  * case. When both strings are lowercase this function
     383             :  * returns the same result as strncmp.
     384             :  */
     385             : static int
     386          16 : istrncmp(const char *s1, const char *s2, size_t len)
     387             : {
     388          16 :         char c1, c2;
     389          16 :         const char *p1, *p2;
     390          16 :         size_t n = 0;
     391             : 
     392          32 :         for (p1 = s1, p2 = s2; *p1 && *p2 && (n < len); p1++, p2++, n++) {
     393          16 :                 c1 = *p1;
     394          16 :                 c2 = *p2;
     395             : 
     396          16 :                 if ('A' <= c1 && c1 <= 'Z')
     397           4 :                         c1 += 'a' - 'A';
     398             : 
     399          16 :                 if ('A' <= c2 && c2 <= 'Z')
     400           0 :                         c2 += 'a' - 'A';
     401             : 
     402          16 :                 if (c1 != c2)
     403           0 :                         return c1 - c2;
     404             :         }
     405             : 
     406          16 :         if (*p1 != *p2 && n < len)
     407           0 :                 return *p1 - *p2;
     408             : 
     409             :         return 0;
     410             : }
     411             : 
     412             : 
     413             : /* Find the first occurrence of the substring needle in
     414             :  * haystack ignoring case.
     415             :  *
     416             :  * NOTE: This function assumes that the needle is already
     417             :  * lowercase.
     418             :  */
     419             : static const char *
     420        6249 : istrstr(const char *haystack, const char *needle)
     421             : {
     422        6249 :         const char *ph;
     423        6249 :         const char *pn;
     424        6249 :         const char *p1;
     425        6249 :         bool match = true;
     426             : 
     427      298316 :         for (ph = haystack; *ph; ph++) {
     428      355689 :                 match = true;
     429      355689 :                 for (pn = needle, p1 = ph; *pn && *p1; pn++, p1++) {
     430      353498 :                         char c1 = *pn;
     431      353498 :                         char c2 = ('A' <= *p1 && *p1 <= 'Z') ? *p1 - 'A' + 'a' : *p1;
     432      353498 :                         if (c1 != c2) {
     433             :                                 match = false;
     434             :                                 break;
     435             :                         }
     436             :                 }
     437             : 
     438             :                 /* We reached the end of the haystack, but we still have characters in
     439             :                  * needle. None of the future iterations will match.
     440             :                  */
     441      294258 :                 if (*p1 == 0 && *pn != 0) {
     442             :                         break;
     443             :                 }
     444             : 
     445      294258 :                 if (match) {
     446        2191 :                         return ph;
     447             :                 }
     448             :         }
     449             :         return NULL;
     450             : }
     451             : 
     452             : /* Match regular expression by comparing bytes.
     453             :  *
     454             :  * This is faster than re_match_ignore, because it does not
     455             :  * need to decode characters. This function should be used
     456             :  * in all cases except when we need to perform UTF-8
     457             :  * comparisons ignoring case.
     458             :  *
     459             :  * TODO: The name of the function is no longer accurate and
     460             :  * needs to change.
     461             :  */
     462             : static inline bool
     463      170781 : re_match_no_ignore(const char *restrict s, const struct RE *restrict pattern)
     464             : {
     465      170781 :         const struct RE *r;
     466      170781 :         size_t l;
     467             : 
     468      228613 :         for (r = pattern; r; r = r->n) {
     469      171236 :                 if (*r->k == 0 && (r->search || *s == 0))
     470             :                         return true;
     471      150421 :                 if (!*s ||
     472             :                         (r->search
     473      150348 :                          ? (r->atend
     474      136807 :                                 ? (r->case_ignore
     475        6055 :                                    ? (l = strlen(s)) < r->len || istrcmp(s + l - r->len, r->k) != 0
     476        5971 :                                    : (l = strlen(s)) < r->len || strcmp(s + l - r->len, r->k) != 0)
     477      130752 :                                 : (r->case_ignore ? (s = istrstr(s, r->k)) == NULL
     478      124504 :                                    : (s = strstr(s, r->k)) == NULL))
     479             :                          : (r->atend
     480       13541 :                                 ? (r->case_ignore ? istrcmp(s, r->k) != 0
     481          95 :                                    : strcmp(s, r->k) != 0)
     482       13446 :                                 : (r->case_ignore ? istrncmp(s, r->k, r->len) != 0
     483       13430 :                                    : strncmp(s, r->k, r->len) != 0))))
     484             :                         return false;
     485       57832 :                 s += r->len;
     486             :         }
     487             :         return true;
     488             : }
     489             : 
     490             : /* Match a regular expression by comparing wide characters.
     491             :  *
     492             :  * This needs to be used when we need to perform a
     493             :  * case-ignoring comparisons involving UTF-8 characters.
     494             :  */
     495             : static inline bool
     496          44 : re_match_ignore(const char *restrict s, const struct RE *restrict pattern)
     497             : {
     498          44 :         const struct RE *r;
     499             : 
     500             :         /* Since the pattern is ascii, do the cheaper comparison */
     501          44 :         if (pattern->is_ascii) {
     502           0 :                 return re_match_no_ignore(s, pattern);
     503             :         }
     504             : 
     505          66 :         for (r = pattern; r; r = r->n) {
     506          47 :                 if (*r->w == 0 && (r->search || *s == 0))
     507             :                         return true;
     508          47 :                 if (!*s ||
     509             :                         (r->search
     510          47 :                          ? (s = mywstrcasestr(s, r->w, r->atend)) == NULL
     511          14 :                          : !mywstrncaseeq(s, r->w, r->len, r->atend)))
     512             :                         return false;
     513          22 :                 s += r->len;
     514             :         }
     515             :         return true;
     516             : }
     517             : 
     518             : static void
     519        5889 : re_destroy(struct RE *p)
     520             : {
     521        5889 :         if (p) {
     522        5889 :                 GDKfree(p->k);
     523        5889 :                 GDKfree(p->w);
     524        5983 :                 do {
     525        5983 :                         struct RE *n = p->n;
     526             : 
     527        5983 :                         GDKfree(p);
     528        5984 :                         p = n;
     529        5984 :                 } while (p);
     530             :         }
     531        5890 : }
     532             : 
     533             : /* Create a linked list of RE structures.  Depending on the
     534             :  * caseignore and the ascii_pattern flags, the w
     535             :  * (if caseignore == true && ascii_pattern == false) or the k
     536             :  * (in every other case) field is used.  These in the first
     537             :  * structure are allocated, whereas in all subsequent
     538             :  * structures the fields point into the allocated buffer of
     539             :  * the first.
     540             :  */
     541             : static struct RE *
     542        5890 : re_create(const char *pat, bool caseignore, bool ascii_pattern, uint32_t esc)
     543             : {
     544        5890 :         struct RE *r = GDKmalloc(sizeof(struct RE)), *n = r;
     545        5890 :         bool escaped = false;
     546             : 
     547        5890 :         if (r == NULL)
     548             :                 return NULL;
     549        5890 :         *r = (struct RE) {.atend = true };
     550             : 
     551       11271 :         while (esc != '%' && *pat == '%') {
     552        5381 :                 pat++;                                  /* skip % */
     553        5381 :                 r->search = true;
     554             :         }
     555        5890 :         if (caseignore && !ascii_pattern) {
     556          20 :                 uint32_t *wp;
     557          20 :                 uint32_t *wq;
     558          20 :                 wp = utf8stoucs(pat);
     559          20 :                 if (wp == NULL) {
     560           0 :                         GDKfree(r);
     561           0 :                         return NULL;
     562             :                 }
     563          20 :                 r->w = wp;
     564          20 :                 wq = wp;
     565          68 :                 while (*wp) {
     566          48 :                         if (escaped) {
     567           0 :                                 *wq++ = *wp;
     568           0 :                                 n->len++;
     569           0 :                                 escaped = false;
     570          48 :                         } else if (*wp == esc) {
     571             :                                 escaped = true;
     572          48 :                         } else if (*wp == '%') {
     573          16 :                                 n->atend = false;
     574          16 :                                 while (wp[1] == '%')
     575           0 :                                         wp++;
     576          16 :                                 if (wp[1]) {
     577           4 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     578           4 :                                         if (n == NULL)
     579           0 :                                                 goto bailout;
     580           4 :                                         *n = (struct RE) {
     581             :                                                 .search = true,
     582             :                                                 .atend = true,
     583           4 :                                                 .w = wp + 1,
     584             :                                         };
     585             :                                 }
     586          16 :                                 *wq = 0;
     587          16 :                                 wq = wp + 1;
     588             :                         } else {
     589          32 :                                 *wq++ = *wp;
     590          32 :                                 n->len++;
     591             :                         }
     592          48 :                         wp++;
     593             :                 }
     594          20 :                 *wq = 0;
     595             :         } else {
     596        5870 :                 char *p, *q;
     597        5870 :                 if ((p = GDKstrdup(pat)) == NULL) {
     598           0 :                         GDKfree(r);
     599           0 :                         return NULL;
     600             :                 }
     601        5870 :                 if (ascii_pattern)
     602        5867 :                         n->is_ascii = true;
     603        5870 :                 if (caseignore)
     604          59 :                         n->case_ignore = true;
     605             : 
     606          59 :                 if (ascii_pattern && caseignore) {
     607         568 :                         for (q = p; *q != 0; q++) {
     608         509 :                                 if ('A' <= *q && *q <= 'Z')
     609          18 :                                         *q += 'a' - 'A';
     610             :                         }
     611             :                 }
     612             : 
     613        5870 :                 r->k = p;
     614        5870 :                 q = p;
     615       44731 :                 while (*p) {
     616       38861 :                         if (escaped) {
     617         136 :                                 *q++ = *p;
     618         136 :                                 n->len++;
     619         136 :                                 escaped = false;
     620       38725 :                         } else if ((unsigned char) *p == esc) {
     621             :                                 escaped = true;
     622       38589 :                         } else if (*p == '%') {
     623        5638 :                                 n->atend = false;
     624        5666 :                                 while (p[1] == '%')
     625          28 :                                         p++;
     626        5638 :                                 if (p[1]) {
     627          90 :                                         n = n->n = GDKmalloc(sizeof(struct RE));
     628          90 :                                         if (n == NULL)
     629           0 :                                                 goto bailout;
     630          90 :                                         *n = (struct RE) {
     631             :                                                 .search = true,
     632             :                                                 .atend = true,
     633          90 :                                                 .k = p + 1
     634             :                                         };
     635          90 :                                         if (ascii_pattern) {
     636          87 :                                                 n->is_ascii = true;
     637             :                                         }
     638          90 :                                         if (caseignore) {
     639          16 :                                                 n->case_ignore = true;
     640             :                                         }
     641             :                                 }
     642        5638 :                                 *q = 0;
     643        5638 :                                 q = p + 1;
     644             :                         } else {
     645       32951 :                                 char c = *p;
     646       32951 :                                 if (ascii_pattern && caseignore && 'A' <= c && c <= 'Z') {
     647           0 :                                         c += 'a' - 'A';
     648             :                                 }
     649       32951 :                                 *q++ = c;
     650       32951 :                                 n->len++;
     651             :                         }
     652       38861 :                         p++;
     653             :                 }
     654        5870 :                 *q = 0;
     655             :         }
     656             :         return r;
     657           0 :   bailout:
     658           0 :         re_destroy(r);
     659           0 :         return NULL;
     660             : }
     661             : 
     662             : #ifdef HAVE_LIBPCRE
     663             : static str
     664          25 : pcre_compile_wrap(pcre **res, const char *pattern, bit insensitive)
     665             : {
     666          25 :         pcre *r;
     667          25 :         const char *err_p = NULL;
     668          25 :         int errpos = 0;
     669          25 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE;
     670          25 :         if (insensitive)
     671           0 :                 options |= PCRE_CASELESS;
     672             : 
     673          25 :         if ((r = pcre_compile(pattern, options, &err_p, &errpos, NULL)) == NULL) {
     674           0 :                 throw(MAL, "pcre.compile", OPERATION_FAILED
     675             :                           " with\n'%s'\nat %d in\n'%s'.\n", err_p, errpos, pattern);
     676             :         }
     677          25 :         *res = r;
     678          25 :         return MAL_SUCCEED;
     679             : }
     680             : #endif
     681             : 
     682             : /* maximum number of back references and quoted \ or $ in replacement string */
     683             : #define MAX_NR_REFS             20
     684             : 
     685             : struct backref {
     686             :         int idx;
     687             :         int start;
     688             :         int end;
     689             : };
     690             : 
     691             : #ifdef HAVE_LIBPCRE
     692             : /* fill in parameter backrefs (length maxrefs) with information about
     693             :  * back references in the replacement string; a back reference is a
     694             :  * dollar or backslash followed by a number */
     695             : static int
     696          60 : parse_replacement(const char *replacement, int len_replacement,
     697             :                                   struct backref *backrefs, int maxrefs)
     698             : {
     699          60 :         int nbackrefs = 0;
     700             : 
     701         108 :         for (int i = 0; i < len_replacement && nbackrefs < maxrefs; i++) {
     702          48 :                 if (replacement[i] == '$' || replacement[i] == '\\') {
     703           6 :                         char *endptr;
     704           6 :                         backrefs[nbackrefs].idx = strtol(replacement + i + 1, &endptr, 10);
     705           6 :                         if (endptr > replacement + i + 1) {
     706           6 :                                 int k = (int) (endptr - (replacement + i + 1));
     707           6 :                                 backrefs[nbackrefs].start = i;
     708           6 :                                 backrefs[nbackrefs].end = i + k + 1;
     709           6 :                                 nbackrefs++;
     710           0 :                         } else if (replacement[i] == replacement[i + 1]) {
     711             :                                 /* doubled $ or \, we must copy just one to the output */
     712           0 :                                 backrefs[nbackrefs].idx = INT_MAX;      /* impossible value > 0 */
     713           0 :                                 backrefs[nbackrefs].start = i;
     714           0 :                                 backrefs[nbackrefs].end = i + 1;
     715           0 :                                 i++;                    /* don't look at second $ or \ again */
     716           0 :                                 nbackrefs++;
     717             :                         }
     718             :                         /* else: $ or \ followed by something we don't recognize,
     719             :                          * so just leave it */
     720             :                 }
     721             :         }
     722          60 :         return nbackrefs;
     723             : }
     724             : 
     725             : static char *
     726       28316 : single_replace(pcre *pcre_code, pcre_extra *extra,
     727             :                            const char *origin_str, int len_origin_str,
     728             :                            int exec_options, int *ovector, int ovecsize,
     729             :                            const char *replacement, int len_replacement,
     730             :                            struct backref *backrefs, int nbackrefs,
     731             :                            bool global, char *result, int *max_result)
     732             : {
     733       28316 :         int offset = 0;
     734       28316 :         int len_result = 0;
     735      104799 :         int addlen;
     736      104799 :         char *tmp;
     737             : 
     738      104799 :         do {
     739      104799 :                 int j = pcre_exec(pcre_code, extra, origin_str, len_origin_str, offset,
     740             :                                                   exec_options, ovector, ovecsize);
     741      104908 :                 if (j <= 0)
     742             :                         break;
     743       78653 :                 addlen = ovector[0] - offset + (nbackrefs == 0 ? len_replacement : 0);
     744       78653 :                 if (len_result + addlen >= *max_result) {
     745        6840 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     746        6840 :                         if (tmp == NULL) {
     747           0 :                                 GDKfree(result);
     748           0 :                                 return NULL;
     749             :                         }
     750        6840 :                         result = tmp;
     751        6840 :                         *max_result = len_result + addlen + 1;
     752             :                 }
     753       78653 :                 if (ovector[0] > offset) {
     754       76482 :                         strncpy(result + len_result, origin_str + offset,
     755       76482 :                                         ovector[0] - offset);
     756       76482 :                         len_result += ovector[0] - offset;
     757             :                 }
     758       78653 :                 if (nbackrefs == 0) {
     759       76486 :                         strncpy(result + len_result, replacement, len_replacement);
     760       76486 :                         len_result += len_replacement;
     761             :                 } else {
     762             :                         int prevend = 0;
     763        4334 :                         for (int i = 0; i < nbackrefs; i++) {
     764        2167 :                                 int off, len;
     765        2167 :                                 if (backrefs[i].idx >= ovecsize / 3) {
     766             :                                         /* out of bounds, replace with empty string */
     767             :                                         off = 0;
     768             :                                         len = 0;
     769             :                                 } else {
     770        2167 :                                         off = ovector[backrefs[i].idx * 2];
     771        2167 :                                         len = ovector[backrefs[i].idx * 2 + 1] - off;
     772             :                                 }
     773        2167 :                                 addlen = backrefs[i].start - prevend + len;
     774        2167 :                                 if (len_result + addlen >= *max_result) {
     775          21 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     776          21 :                                         if (tmp == NULL) {
     777           0 :                                                 GDKfree(result);
     778           0 :                                                 return NULL;
     779             :                                         }
     780          21 :                                         result = tmp;
     781          21 :                                         *max_result = len_result + addlen + 1;
     782             :                                 }
     783        2167 :                                 if (backrefs[i].start > prevend) {
     784           2 :                                         strncpy(result + len_result, replacement + prevend,
     785           2 :                                                         backrefs[i].start - prevend);
     786           2 :                                         len_result += backrefs[i].start - prevend;
     787             :                                 }
     788        2167 :                                 if (len > 0) {
     789        2167 :                                         strncpy(result + len_result, origin_str + off, len);
     790        2167 :                                         len_result += len;
     791             :                                 }
     792        2167 :                                 prevend = backrefs[i].end;
     793             :                         }
     794             :                         /* copy rest of replacement string (after last backref) */
     795        2167 :                         addlen = len_replacement - prevend;
     796        2167 :                         if (addlen > 0) {
     797           2 :                                 if (len_result + addlen >= *max_result) {
     798           1 :                                         tmp = GDKrealloc(result, len_result + addlen + 1);
     799           1 :                                         if (tmp == NULL) {
     800           0 :                                                 GDKfree(result);
     801           0 :                                                 return NULL;
     802             :                                         }
     803           1 :                                         result = tmp;
     804           1 :                                         *max_result = len_result + addlen + 1;
     805             :                                 }
     806           2 :                                 strncpy(result + len_result, replacement + prevend, addlen);
     807           2 :                                 len_result += addlen;
     808             :                         }
     809             :                 }
     810       78653 :                 offset = ovector[1];
     811       78653 :         } while (offset < len_origin_str && global);
     812       28425 :         if (offset < len_origin_str) {
     813       26257 :                 addlen = len_origin_str - offset;
     814       26257 :                 if (len_result + addlen >= *max_result) {
     815         328 :                         tmp = GDKrealloc(result, len_result + addlen + 1);
     816         328 :                         if (tmp == NULL) {
     817           0 :                                 GDKfree(result);
     818           0 :                                 return NULL;
     819             :                         }
     820         328 :                         result = tmp;
     821         328 :                         *max_result = len_result + addlen + 1;
     822             :                 }
     823       26257 :                 strncpy(result + len_result, origin_str + offset, addlen);
     824       26257 :                 len_result += addlen;
     825             :         }
     826             :         /* null terminate string */
     827       28425 :         result[len_result] = '\0';
     828       28425 :         return result;
     829             : }
     830             : #endif
     831             : 
     832             : static str
     833          10 : pcre_replace(str *res, const char *origin_str, const char *pattern,
     834             :                          const char *replacement, const char *flags, bool global)
     835             : {
     836             : #ifdef HAVE_LIBPCRE
     837          10 :         const char *err_p = NULL;
     838          10 :         pcre *pcre_code = NULL;
     839          10 :         pcre_extra *extra;
     840          10 :         char *tmpres;
     841          10 :         int max_result;
     842          10 :         int i, errpos = 0;
     843          10 :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     844          10 :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     845          10 :         int *ovector, ovecsize;
     846          10 :         int len_origin_str = (int) strlen(origin_str);
     847          10 :         int len_replacement = (int) strlen(replacement);
     848          10 :         struct backref backrefs[MAX_NR_REFS];
     849          10 :         int nbackrefs = 0;
     850             : 
     851          14 :         while (*flags) {
     852           4 :                 switch (*flags) {
     853             :                 case 'e':
     854             :                         exec_options &= ~PCRE_NOTEMPTY;
     855             :                         break;
     856           1 :                 case 'i':
     857           1 :                         compile_options |= PCRE_CASELESS;
     858           1 :                         break;
     859           1 :                 case 'm':
     860           1 :                         compile_options |= PCRE_MULTILINE;
     861           1 :                         break;
     862           1 :                 case 's':
     863           1 :                         compile_options |= PCRE_DOTALL;
     864           1 :                         break;
     865           1 :                 case 'x':
     866           1 :                         compile_options |= PCRE_EXTENDED;
     867           1 :                         break;
     868           0 :                 default:
     869           0 :                         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     870             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     871             :                                   *flags);
     872             :                 }
     873           4 :                 flags++;
     874             :         }
     875             : 
     876          10 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     877           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     878             :                           OPERATION_FAILED
     879             :                           ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     880             :                           pattern, errpos, err_p);
     881             :         }
     882             : 
     883             :         /* Since the compiled pattern is going to be used several times, it is
     884             :          * worth spending more time analyzing it in order to speed up the time
     885             :          * taken for matching.
     886             :          */
     887          10 :         extra = pcre_study(pcre_code, 0, &err_p);
     888          10 :         if (err_p != NULL) {
     889           0 :                 pcre_free(pcre_code);
     890           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     891             :                           OPERATION_FAILED
     892             :                           ": pcre study of pattern (%s) failed with '%s'.\n", pattern,
     893             :                           err_p);
     894             :         }
     895          10 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
     896          10 :         ovecsize = (i + 1) * 3;
     897          10 :         if ((ovector = (int *) GDKmalloc(sizeof(int) * ovecsize)) == NULL) {
     898           0 :                 pcre_free_study(extra);
     899           0 :                 pcre_free(pcre_code);
     900           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     901             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     902             :         }
     903             : 
     904             :         /* identify back references in the replacement string */
     905          10 :         nbackrefs = parse_replacement(replacement, len_replacement,
     906             :                                                                   backrefs, MAX_NR_REFS);
     907             : 
     908          10 :         max_result = len_origin_str + 1;
     909          10 :         tmpres = GDKmalloc(max_result);
     910          10 :         if (tmpres == NULL) {
     911           0 :                 GDKfree(ovector);
     912           0 :                 pcre_free_study(extra);
     913           0 :                 pcre_free(pcre_code);
     914           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     915             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     916             :         }
     917             : 
     918          10 :         tmpres = single_replace(pcre_code, extra, origin_str, len_origin_str,
     919             :                                                         exec_options, ovector, ovecsize, replacement,
     920             :                                                         len_replacement, backrefs, nbackrefs, global,
     921             :                                                         tmpres, &max_result);
     922          10 :         GDKfree(ovector);
     923          10 :         pcre_free_study(extra);
     924          10 :         pcre_free(pcre_code);
     925          10 :         if (tmpres == NULL)
     926           0 :                 throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     927             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
     928             : 
     929          10 :         *res = tmpres;
     930          10 :         return MAL_SUCCEED;
     931             : #else
     932             :         (void) res;
     933             :         (void) origin_str;
     934             :         (void) pattern;
     935             :         (void) replacement;
     936             :         (void) flags;
     937             :         (void) global;
     938             :         throw(MAL, global ? "pcre.replace" : "pcre.replace_first",
     939             :                   "Database was compiled without PCRE support.");
     940             : #endif
     941             : }
     942             : 
     943             : static str
     944          50 : pcre_replace_bat(BAT **res, BAT *origin_strs, const char *pattern,
     945             :                                  const char *replacement, const char *flags, bool global)
     946             : {
     947             : #ifdef HAVE_LIBPCRE
     948          50 :         const char *err_p = NULL;
     949          50 :         char *tmpres;
     950          50 :         int i, errpos = 0;
     951          50 :         int compile_options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
     952          50 :         int exec_options = PCRE_NOTEMPTY | PCRE_NO_UTF8_CHECK;
     953          50 :         pcre *pcre_code = NULL;
     954          50 :         pcre_extra *extra;
     955          50 :         BAT *tmpbat;
     956          50 :         BUN p, q;
     957          50 :         int *ovector, ovecsize;
     958          50 :         int len_replacement = (int) strlen(replacement);
     959          50 :         struct backref backrefs[MAX_NR_REFS];
     960          50 :         int nbackrefs = 0;
     961          50 :         const char *origin_str;
     962          50 :         int max_dest_size = 0;
     963             : 
     964          70 :         while (*flags) {
     965          20 :                 switch (*flags) {
     966             :                 case 'e':
     967             :                         exec_options &= ~PCRE_NOTEMPTY;
     968             :                         break;
     969           5 :                 case 'i':
     970           5 :                         compile_options |= PCRE_CASELESS;
     971           5 :                         break;
     972          10 :                 case 'm':
     973          10 :                         compile_options |= PCRE_MULTILINE;
     974          10 :                         break;
     975           5 :                 case 's':
     976           5 :                         compile_options |= PCRE_DOTALL;
     977           5 :                         break;
     978           0 :                 case 'x':
     979           0 :                         compile_options |= PCRE_EXTENDED;
     980           0 :                         break;
     981           0 :                 default:
     982           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     983             :                                   ILLEGAL_ARGUMENT ": unsupported flag character '%c'\n",
     984             :                                   *flags);
     985             :                 }
     986          20 :                 flags++;
     987             :         }
     988             : 
     989          50 :         if ((pcre_code = pcre_compile(pattern, compile_options, &err_p, &errpos, NULL)) == NULL) {
     990           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
     991             :                           OPERATION_FAILED
     992             :                           ": pcre compile of pattern (%s) failed at %d with\n'%s'.\n",
     993             :                           pattern, errpos, err_p);
     994             :         }
     995             : 
     996             :         /* Since the compiled pattern is going to be used several times,
     997             :          * it is worth spending more time analyzing it in order to speed
     998             :          * up the time taken for matching.
     999             :          */
    1000         100 :         extra = pcre_study(pcre_code,
    1001          50 :                                            BATcount(origin_strs) >
    1002             :                                            JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0, &err_p);
    1003          50 :         if (err_p != NULL) {
    1004           0 :                 pcre_free(pcre_code);
    1005           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1006             :                           OPERATION_FAILED);
    1007             :         }
    1008          50 :         pcre_fullinfo(pcre_code, extra, PCRE_INFO_CAPTURECOUNT, &i);
    1009          50 :         ovecsize = (i + 1) * 3;
    1010          50 :         if ((ovector = (int *) GDKzalloc(sizeof(int) * ovecsize)) == NULL) {
    1011           0 :                 pcre_free_study(extra);
    1012           0 :                 pcre_free(pcre_code);
    1013           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1014             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1015             :         }
    1016             : 
    1017             :         /* identify back references in the replacement string */
    1018          50 :         nbackrefs = parse_replacement(replacement, len_replacement,
    1019             :                                                                   backrefs, MAX_NR_REFS);
    1020             : 
    1021          50 :         tmpbat = COLnew(origin_strs->hseqbase, TYPE_str, BATcount(origin_strs),
    1022             :                                         TRANSIENT);
    1023             : 
    1024             :         /* the buffer for all destination strings is allocated only once,
    1025             :          * and extended when needed */
    1026          50 :         max_dest_size = len_replacement + 1;
    1027          50 :         tmpres = GDKmalloc(max_dest_size);
    1028          50 :         if (tmpbat == NULL || tmpres == NULL) {
    1029           0 :                 pcre_free_study(extra);
    1030           0 :                 pcre_free(pcre_code);
    1031           0 :                 GDKfree(ovector);
    1032           0 :                 BBPreclaim(tmpbat);
    1033           0 :                 GDKfree(tmpres);
    1034           0 :                 throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1035             :                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1036             :         }
    1037          50 :         BATiter origin_strsi = bat_iterator(origin_strs);
    1038       28304 :         BATloop(origin_strs, p, q) {
    1039       28254 :                 origin_str = BUNtvar(origin_strsi, p);
    1040       56657 :                 tmpres = single_replace(pcre_code, extra, origin_str,
    1041       28254 :                                                                 (int) strlen(origin_str), exec_options,
    1042             :                                                                 ovector, ovecsize, replacement,
    1043             :                                                                 len_replacement, backrefs, nbackrefs, global,
    1044             :                                                                 tmpres, &max_dest_size);
    1045       28403 :                 if (tmpres == NULL || BUNappend(tmpbat, tmpres, false) != GDK_SUCCEED) {
    1046           0 :                         bat_iterator_end(&origin_strsi);
    1047           0 :                         pcre_free_study(extra);
    1048           0 :                         pcre_free(pcre_code);
    1049           0 :                         GDKfree(ovector);
    1050           0 :                         GDKfree(tmpres);
    1051           0 :                         BBPreclaim(tmpbat);
    1052           0 :                         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1053             :                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1054             :                 }
    1055             :         }
    1056          50 :         bat_iterator_end(&origin_strsi);
    1057          50 :         pcre_free_study(extra);
    1058          50 :         pcre_free(pcre_code);
    1059          50 :         GDKfree(ovector);
    1060          50 :         GDKfree(tmpres);
    1061          50 :         *res = tmpbat;
    1062          50 :         return MAL_SUCCEED;
    1063             : #else
    1064             :         (void) res;
    1065             :         (void) origin_strs;
    1066             :         (void) pattern;
    1067             :         (void) replacement;
    1068             :         (void) flags;
    1069             :         (void) global;
    1070             :         throw(MAL, global ? "batpcre.replace" : "batpcre.replace_first",
    1071             :                   "Database was compiled without PCRE support.");
    1072             : #endif
    1073             : }
    1074             : 
    1075             : static str
    1076          74 : pcre_match_with_flags(bit *ret, const char *val, const char *pat,
    1077             :                                           const char *flags)
    1078             : {
    1079          74 :         int pos;
    1080             : #ifdef HAVE_LIBPCRE
    1081          74 :         const char *err_p = NULL;
    1082          74 :         int errpos = 0;
    1083          74 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
    1084          74 :         pcre *re;
    1085             : #else
    1086             :         int options = REG_NOSUB;
    1087             :         regex_t re;
    1088             :         int errcode;
    1089             :         int retval;
    1090             : #endif
    1091             : 
    1092         148 :         while (*flags) {
    1093          74 :                 switch (*flags) {
    1094           0 :                 case 'i':
    1095             : #ifdef HAVE_LIBPCRE
    1096           0 :                         options |= PCRE_CASELESS;
    1097             : #else
    1098             :                         options |= REG_ICASE;
    1099             : #endif
    1100           0 :                         break;
    1101           0 :                 case 'm':
    1102             : #ifdef HAVE_LIBPCRE
    1103           0 :                         options |= PCRE_MULTILINE;
    1104             : #else
    1105             :                         options |= REG_NEWLINE;
    1106             : #endif
    1107           0 :                         break;
    1108             : #ifdef HAVE_LIBPCRE
    1109          74 :                 case 's':
    1110          74 :                         options |= PCRE_DOTALL;
    1111          74 :                         break;
    1112             : #endif
    1113           0 :                 case 'x':
    1114             : #ifdef HAVE_LIBPCRE
    1115           0 :                         options |= PCRE_EXTENDED;
    1116             : #else
    1117             :                         options |= REG_EXTENDED;
    1118             : #endif
    1119           0 :                         break;
    1120           0 :                 default:
    1121           0 :                         throw(MAL, "pcre.match", ILLEGAL_ARGUMENT
    1122             :                                   ": unsupported flag character '%c'\n", *flags);
    1123             :                 }
    1124          74 :                 flags++;
    1125             :         }
    1126          74 :         if (strNil(val)) {
    1127           0 :                 *ret = FALSE;
    1128           0 :                 return MAL_SUCCEED;
    1129             :         }
    1130             : 
    1131             : #ifdef HAVE_LIBPCRE
    1132          74 :         if ((re = pcre_compile(pat, options, &err_p, &errpos, NULL)) == NULL)
    1133             : #else
    1134             :         if ((errcode = regcomp(&re, pat, options)) != 0)
    1135             : #endif
    1136             :         {
    1137           0 :                 throw(MAL, "pcre.match", OPERATION_FAILED
    1138             :                           ": compilation of regular expression (%s) failed "
    1139             : #ifdef HAVE_LIBPCRE
    1140             :                           "at %d with '%s'", pat, errpos, err_p
    1141             : #else
    1142             :                           , pat
    1143             : #endif
    1144             :                                 );
    1145             :         }
    1146             : #ifdef HAVE_LIBPCRE
    1147          74 :         pos = pcre_exec(re, NULL, val, (int) strlen(val), 0, PCRE_NO_UTF8_CHECK,
    1148             :                                         NULL, 0);
    1149          74 :         pcre_free(re);
    1150             : #else
    1151             :         retval = regexec(&re, val, (size_t) 0, NULL, 0);
    1152             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1153             :         regfree(&re);
    1154             : #endif
    1155          74 :         if (pos >= 0)
    1156          10 :                 *ret = TRUE;
    1157          64 :         else if (pos == -1)
    1158          64 :                 *ret = FALSE;
    1159             :         else
    1160           0 :                 throw(MAL, "pcre.match", OPERATION_FAILED
    1161             :                           ": matching of regular expression (%s) failed with %d", pat, pos);
    1162             :         return MAL_SUCCEED;
    1163             : }
    1164             : 
    1165             : #ifdef HAVE_LIBPCRE
    1166             : /* special characters in PCRE that need to be escaped */
    1167             : static const char *pcre_specials = ".+?*()[]{}|^$\\";
    1168             : #else
    1169             : /* special characters in POSIX basic regular expressions that need to
    1170             :  * be escaped */
    1171             : static const char *pcre_specials = "^.[$()|*+?{\\";
    1172             : #endif
    1173             : 
    1174             : /* change SQL LIKE pattern into PCRE pattern */
    1175             : static str
    1176         587 : sql2pcre(str *r, const char *pat, const char *esc_str)
    1177             : {
    1178         587 :         int escaped = 0;
    1179         587 :         int hasWildcard = 0;
    1180         587 :         char *ppat;
    1181        1174 :         int esc = strNil(esc_str) ? 0 : esc_str[0];     /* should change to utf8_convert() */
    1182         587 :         int specials;
    1183         587 :         int c;
    1184             : 
    1185         587 :         if (strlen(esc_str) > 1)
    1186           0 :                 throw(MAL, "pcre.sql2pcre",
    1187             :                           SQLSTATE(22019) ILLEGAL_ARGUMENT
    1188             :                           ": ESCAPE string must have length 1");
    1189         587 :         if (pat == NULL)
    1190           0 :                 throw(MAL, "pcre.sql2pcre",
    1191             :                           SQLSTATE(22019) ILLEGAL_ARGUMENT
    1192             :                           ": (I)LIKE pattern must not be NULL");
    1193         587 :         ppat = GDKmalloc(strlen(pat) * 3 +
    1194             :                                          3 /* 3 = "^'the translated regexp'$0" */ );
    1195         587 :         if (ppat == NULL)
    1196           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1197             : 
    1198         587 :         *r = ppat;
    1199             :         /* The escape character can be a char which is special in a PCRE
    1200             :          * expression.  If the user used the "+" char as escape and has "++"
    1201             :          * in their pattern, then replacing this with "+" is not correct and
    1202             :          * should be "\+" instead. */
    1203         587 :         specials = (esc && strchr(pcre_specials, esc) != NULL);
    1204             : 
    1205         587 :         *ppat++ = '^';
    1206        5950 :         while ((c = *pat++) != 0) {
    1207        5363 :                 if (c == esc) {
    1208          15 :                         if (escaped) {
    1209           1 :                                 if (specials) { /* change ++ into \+ */
    1210           1 :                                         *ppat++ = esc;
    1211             :                                 } else {                /* do not escape simple escape symbols */
    1212           0 :                                         ppat[-1] = esc; /* overwrite backslash */
    1213             :                                 }
    1214             :                                 escaped = 0;
    1215             :                         } else {
    1216          14 :                                 *ppat++ = '\\';
    1217          14 :                                 escaped = 1;
    1218             :                         }
    1219             :                         hasWildcard = 1;
    1220        5348 :                 } else if (strchr(pcre_specials, c) != NULL) {
    1221             :                         /* escape PCRE special chars, avoid double backslash if the
    1222             :                          * user uses an invalid escape sequence */
    1223          28 :                         if (!escaped)
    1224          28 :                                 *ppat++ = '\\';
    1225          28 :                         *ppat++ = c;
    1226          28 :                         hasWildcard = 1;
    1227          28 :                         escaped = 0;
    1228        5320 :                 } else if (c == '%' && !escaped) {
    1229         721 :                         *ppat++ = '.';
    1230         721 :                         *ppat++ = '*';
    1231         721 :                         *ppat++ = '?';
    1232         721 :                         hasWildcard = 1;
    1233             :                         /* collapse multiple %, but only if it isn't the escape */
    1234         721 :                         if (esc != '%')
    1235         721 :                                 while (*pat == '%')
    1236           0 :                                         pat++;
    1237        4599 :                 } else if (c == '_' && !escaped) {
    1238         694 :                         *ppat++ = '.';
    1239         694 :                         hasWildcard = 1;
    1240             :                 } else {
    1241        3905 :                         if (escaped) {
    1242          13 :                                 ppat[-1] = c;   /* overwrite backslash of invalid escape */
    1243             :                         } else {
    1244        3892 :                                 *ppat++ = c;
    1245             :                         }
    1246             :                         escaped = 0;
    1247             :                 }
    1248             :         }
    1249             :         /* no wildcard or escape character at end of string */
    1250         587 :         if (!hasWildcard || escaped) {
    1251           1 :                 GDKfree(*r);
    1252           1 :                 *r = NULL;
    1253           1 :                 if (escaped)
    1254           0 :                         throw(MAL, "pcre.sql2pcre",
    1255             :                                   SQLSTATE(22019) ILLEGAL_ARGUMENT
    1256             :                                   ": (I)LIKE pattern must not end with escape character");
    1257           1 :                 *r = GDKstrdup(str_nil);
    1258           1 :                 if (*r == NULL)
    1259           0 :                         throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1260             :         } else {
    1261         586 :                 *ppat++ = '$';
    1262         586 :                 *ppat = 0;
    1263             :         }
    1264             :         return MAL_SUCCEED;
    1265             : }
    1266             : 
    1267             : #ifdef HAVE_LIBPCRE
    1268             : /* change SQL PATINDEX pattern into PCRE pattern */
    1269             : static str
    1270          25 : pat2pcre(str *r, const char *pat)
    1271             : {
    1272          25 :         size_t len = strlen(pat);
    1273          25 :         char *ppat = GDKmalloc(len * 2 + 3 /* 3 = "^'the translated regexp'$0" */ );
    1274          25 :         int start = 0;
    1275             : 
    1276          25 :         if (ppat == NULL)
    1277           0 :                 throw(MAL, "pcre.sql2pcre", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1278          25 :         *r = ppat;
    1279          77 :         while (*pat) {
    1280          52 :                 int c = *pat++;
    1281             : 
    1282          52 :                 if (strchr(pcre_specials, c) != NULL) {
    1283          17 :                         *ppat++ = '\\';
    1284          17 :                         *ppat++ = c;
    1285          35 :                 } else if (c == '%') {
    1286           3 :                         if (start && *pat) {
    1287           0 :                                 *ppat++ = '.';
    1288           0 :                                 *ppat++ = '*';
    1289             :                         }
    1290           3 :                         start++;
    1291          32 :                 } else if (c == '_') {
    1292           0 :                         *ppat++ = '.';
    1293             :                 } else {
    1294          32 :                         *ppat++ = c;
    1295             :                 }
    1296             :         }
    1297          25 :         *ppat = 0;
    1298          25 :         return MAL_SUCCEED;
    1299             : }
    1300             : #endif
    1301             : 
    1302             : /*
    1303             :  * @+ Wrapping
    1304             :  */
    1305             : 
    1306             : static str
    1307          10 : PCREreplace_wrap(str *res, const str *or, const str *pat, const str *repl,
    1308             :                                  const str *flags)
    1309             : {
    1310          10 :         return pcre_replace(res, *or, *pat, *repl, *flags, true);
    1311             : }
    1312             : 
    1313             : static str
    1314           0 : PCREreplacefirst_wrap(str *res, const str *or, const str *pat, const str *repl,
    1315             :                                           const str *flags)
    1316             : {
    1317           0 :         return pcre_replace(res, *or, *pat, *repl, *flags, false);
    1318             : }
    1319             : 
    1320             : static str
    1321          50 : PCREreplace_bat_wrap(bat *res, const bat *bid, const str *pat, const str *repl,
    1322             :                                          const str *flags)
    1323             : {
    1324          50 :         BAT *b, *bn = NULL;
    1325          50 :         str msg;
    1326          50 :         if ((b = BATdescriptor(*bid)) == NULL)
    1327           0 :                 throw(MAL, "batpcre.replace", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1328             : 
    1329          50 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, true);
    1330          50 :         if (msg == MAL_SUCCEED) {
    1331          50 :                 *res = bn->batCacheid;
    1332          50 :                 BBPkeepref(bn);
    1333             :         }
    1334          50 :         BBPunfix(b->batCacheid);
    1335          50 :         return msg;
    1336             : }
    1337             : 
    1338             : static str
    1339           0 : PCREreplacefirst_bat_wrap(bat *res, const bat *bid, const str *pat,
    1340             :                                                   const str *repl, const str *flags)
    1341             : {
    1342           0 :         BAT *b, *bn = NULL;
    1343           0 :         str msg;
    1344           0 :         if ((b = BATdescriptor(*bid)) == NULL)
    1345           0 :                 throw(MAL, "batpcre.replace_first", RUNTIME_OBJECT_MISSING);
    1346             : 
    1347           0 :         msg = pcre_replace_bat(&bn, b, *pat, *repl, *flags, false);
    1348           0 :         if (msg == MAL_SUCCEED) {
    1349           0 :                 *res = bn->batCacheid;
    1350           0 :                 BBPkeepref(bn);
    1351             :         }
    1352           0 :         BBPunfix(b->batCacheid);
    1353           0 :         return msg;
    1354             : }
    1355             : 
    1356             : static str
    1357          74 : PCREmatch(bit *ret, const str *val, const str *pat)
    1358             : {
    1359           4 :         return pcre_match_with_flags(ret, *val, *pat,
    1360             : #ifdef HAVE_LIBPCRE
    1361             :                                                                  "s"
    1362             : #else
    1363             :                                                                  "x"
    1364             : #endif
    1365             :                         );
    1366             : }
    1367             : 
    1368             : static str
    1369           0 : PCREimatch(bit *ret, const str *val, const str *pat)
    1370             : {
    1371           0 :         return pcre_match_with_flags(ret, *val, *pat, "i"
    1372             : #ifndef HAVE_LIBPCRE
    1373             :                                                                  "x"
    1374             : #endif
    1375             :                         );
    1376             : }
    1377             : 
    1378             : static str
    1379          25 : PCREindex(int *res, const pcre *pattern, const str *s)
    1380             : {
    1381             : #ifdef HAVE_LIBPCRE
    1382          25 :         int v[3];
    1383             : 
    1384          25 :         v[0] = v[1] = *res = 0;
    1385          25 :         if (pcre_exec(pattern, NULL, *s, (int) strlen(*s), 0,
    1386             :                                   PCRE_NO_UTF8_CHECK, v, 3) >= 0) {
    1387          23 :                 *res = v[1];
    1388             :         }
    1389          25 :         return MAL_SUCCEED;
    1390             : #else
    1391             :         (void) res;
    1392             :         (void) pattern;
    1393             :         (void) s;
    1394             :         throw(MAL, "pcre.index", "Database was compiled without PCRE support.");
    1395             : #endif
    1396             : }
    1397             : 
    1398             : static str
    1399          27 : PCREpatindex(int *ret, const str *pat, const str *val)
    1400             : {
    1401             : #ifdef HAVE_LIBPCRE
    1402          27 :         pcre *re = NULL;
    1403          27 :         char *ppat = NULL, *msg;
    1404             : 
    1405          53 :         if (strNil(*pat) || strNil(*val)) {
    1406           2 :                 *ret = int_nil;
    1407           2 :                 return MAL_SUCCEED;
    1408             :         }
    1409             : 
    1410          25 :         if ((msg = pat2pcre(&ppat, *pat)) != MAL_SUCCEED)
    1411             :                 return msg;
    1412          25 :         if ((msg = pcre_compile_wrap(&re, ppat, FALSE)) != MAL_SUCCEED) {
    1413           0 :                 GDKfree(ppat);
    1414           0 :                 return msg;
    1415             :         }
    1416          25 :         GDKfree(ppat);
    1417          25 :         msg = PCREindex(ret, re, val);
    1418          25 :         pcre_free(re);
    1419          25 :         return msg;
    1420             : #else
    1421             :         (void) ret;
    1422             :         (void) pat;
    1423             :         (void) val;
    1424             :         throw(MAL, "pcre.patindex", "Database was compiled without PCRE support.");
    1425             : #endif
    1426             : }
    1427             : 
    1428             : static str
    1429           0 : PCREquote(str *ret, const str *val)
    1430             : {
    1431           0 :         char *p;
    1432           0 :         const char *s = *val;
    1433             : 
    1434           0 :         *ret = p = GDKmalloc(strlen(s) * 2 + 1);        /* certainly long enough */
    1435           0 :         if (p == NULL)
    1436           0 :                 throw(MAL, "pcre.quote", SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1437             :         /* quote all non-alphanumeric ASCII characters (i.e. leave
    1438             :            non-ASCII and alphanumeric alone) */
    1439           0 :         while (*s) {
    1440           0 :                 if (!((*s & 0x80) != 0 ||
    1441           0 :                           ('a' <= *s && *s <= 'z') ||
    1442           0 :                           ('A' <= *s && *s <= 'Z') || isdigit((unsigned char) *s)))
    1443           0 :                         *p++ = '\\';
    1444           0 :                 *p++ = *s++;
    1445             :         }
    1446           0 :         *p = 0;
    1447           0 :         return MAL_SUCCEED;
    1448             : }
    1449             : 
    1450             : static str
    1451           6 : PCREsql2pcre(str *ret, const str *pat, const str *esc)
    1452             : {
    1453           6 :         return sql2pcre(ret, *pat, *esc);
    1454             : }
    1455             : 
    1456             : static bool
    1457        7840 : is_ascii_str(const char *pat)
    1458             : {
    1459        7840 :         size_t len = strlen(pat);
    1460       61058 :         for (size_t i = 0; i < len; i++) {
    1461       53882 :                 if (pat[i] & 0x80)
    1462             :                         return false;
    1463             :         }
    1464             : 
    1465             :         return true;
    1466             : }
    1467             : 
    1468             : static inline str
    1469        7840 : choose_like_path(char **ppat, bool *use_re, bool *use_strcmp, bool *empty,
    1470             :                                  bool *ascii_pattern, const char *pat, const char *esc)
    1471             : {
    1472        7840 :         str res = MAL_SUCCEED;
    1473        7840 :         *use_re = false;
    1474        7840 :         *use_strcmp = false;
    1475        7840 :         *empty = false;
    1476             : 
    1477             : 
    1478        7840 :         *ascii_pattern = is_ascii_str(pat);
    1479             : 
    1480       15192 :         if (strNil(pat) || strNil(esc)) {
    1481         488 :                 *empty = true;
    1482             :         } else {
    1483        7352 :                 if (!re_is_pattern_properly_escaped(pat, (unsigned char) *esc))
    1484           5 :                         throw(MAL, "pcre.sql2pcre",
    1485             :                                   SQLSTATE(22019) ILLEGAL_ARGUMENT
    1486             :                                   ": (I)LIKE pattern must not end with escape character");
    1487        7347 :                 if (is_strcmpable(pat, esc)) {
    1488         876 :                         *use_re = true;
    1489         876 :                         *use_strcmp = true;
    1490        6471 :                 } else if (re_simple(pat, (unsigned char) *esc)) {
    1491        5890 :                         *use_re = true;
    1492             :                 } else {
    1493         581 :                         if ((res = sql2pcre(ppat, pat, esc)) != MAL_SUCCEED)
    1494             :                                 return res;
    1495         581 :                         if (strNil(*ppat)) {
    1496           0 :                                 GDKfree(*ppat);
    1497           0 :                                 *ppat = NULL;
    1498           0 :                                 *use_re = true;
    1499           0 :                                 *use_strcmp = true;
    1500             :                         }
    1501             :                 }
    1502             :         }
    1503             :         return res;
    1504             : }
    1505             : 
    1506             : static str
    1507         234 : PCRElike_imp(bit *ret, const str *s, const str *pat, const str *esc,
    1508             :                          const bit *isens)
    1509             : {
    1510         234 :         str res = MAL_SUCCEED;
    1511         234 :         char *ppat = NULL;
    1512         234 :         bool use_re = false, use_strcmp = false, empty = false, ascii_pattern = false;
    1513         234 :         struct RE *re = NULL;
    1514             : 
    1515         234 :         if ((res = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern,
    1516             :                                                                 *pat, *esc)) != MAL_SUCCEED)
    1517             :                 return res;
    1518             : 
    1519         459 :         MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp ?
    1520         225 :                                                    "pcrelike: pattern matching using strcmp" : use_re ?
    1521             :                                                    "pcrelike: pattern matching using RE" :
    1522             :                                                    "pcrelike: pattern matching using pcre");
    1523             : 
    1524         468 :         if (strNil(*s) || empty) {
    1525           0 :                 *ret = bit_nil;
    1526         234 :         } else if (use_re) {
    1527         164 :                 if (use_strcmp) {
    1528           9 :                         *ret = *isens ? (ascii_pattern
    1529           2 :                                                          ? istrcmp(*s, *pat) == 0
    1530           0 :                                                          : mystrcasecmp(*s, *pat) == 0)
    1531           7 :                                 : strcmp(*s, *pat) == 0;
    1532             :                 } else {
    1533         155 :                         if (!(re = re_create(*pat, *isens, ascii_pattern, (unsigned char) **esc)))
    1534           0 :                                 res = createException(MAL, "pcre.like4",
    1535             :                                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1536             :                         else
    1537         310 :                                 *ret = (*isens && !re->is_ascii)
    1538           0 :                                         ? re_match_ignore(*s, re)
    1539         155 :                                         : re_match_no_ignore(*s, re);
    1540             :                 }
    1541             :         } else {
    1542          70 :                 res = *isens ? PCREimatch(ret, s, &ppat) : PCREmatch(ret, s, &ppat);
    1543             :         }
    1544             : 
    1545         164 :         if (re)
    1546         155 :                 re_destroy(re);
    1547         234 :         GDKfree(ppat);
    1548         234 :         return res;
    1549             : }
    1550             : 
    1551             : static str
    1552         234 : PCRElike(bit *ret, const str *s, const str *pat, const str *esc,
    1553             :                  const bit *isens)
    1554             : {
    1555         229 :         return PCRElike_imp(ret, s, pat, esc, isens);
    1556             : }
    1557             : 
    1558             : static str
    1559           5 : PCREnotlike(bit *ret, const str *s, const str *pat, const str *esc,
    1560             :                         const bit *isens)
    1561             : {
    1562           5 :         str tmp;
    1563           5 :         bit r;
    1564             : 
    1565           5 :         rethrow("str.not_like", tmp, PCRElike(&r, s, pat, esc, isens));
    1566           5 :         *ret = r == bit_nil ? bit_nil : !r;
    1567           5 :         return MAL_SUCCEED;
    1568             : }
    1569             : 
    1570             : static inline str
    1571        6603 : re_like_build(struct RE **re, uint32_t **wpat, const char *pat, bool caseignore,
    1572             :                           bool use_strcmp, bool ascii_pattern, uint32_t esc)
    1573             : {
    1574        6603 :         if (!use_strcmp) {
    1575        5735 :                 if (!(*re = re_create(pat, caseignore, ascii_pattern, esc)))
    1576           0 :                         return createException(MAL, "pcre.re_like_build",
    1577             :                                                                    SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1578         868 :         } else if (caseignore && !ascii_pattern) {
    1579          30 :                 if (!(*wpat = utf8stoucs(pat)))
    1580           0 :                         return createException(MAL, "pcre.re_like_build",
    1581             :                                                                    SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1582             :         }
    1583             :         return MAL_SUCCEED;
    1584             : }
    1585             : 
    1586             : #define proj_scanloop(TEST)     \
    1587             :         do {                                    \
    1588             :                 if (strNil(s))          \
    1589             :                         return bit_nil; \
    1590             :                 else                            \
    1591             :                         return TEST;    \
    1592             :         } while (0)
    1593             : /*
                     :  * Evaluate one LIKE test for the string s and return it as a bit.
                     :  *
                     :  * A nil s always yields bit_nil (every branch goes through
                     :  * proj_scanloop).  Otherwise:
                     :  *  - use_strcmp: s is compared as a whole against the pattern, using
                     :  *    istrcmp(s, pat) when caseignore and is_ascii, mywstrcasecmp(s, wpat)
                     :  *    when caseignore and !is_ascii, and strcmp(s, pat) when !caseignore;
                     :  *  - !use_strcmp: s is matched against re, with re_match_ignore only
                     :  *    when caseignore && !is_ascii and re_match_no_ignore in all other
                     :  *    cases.
                     :  * anti inverts the outcome of the comparison / match.
                     :  */
    1594             : static inline bit
    1595        5084 : re_like_proj_apply(const char *s, const struct RE *restrict re,
    1596             :                                    const uint32_t *restrict wpat, const char *pat,
    1597             :                                    bool caseignore, bool anti, bool use_strcmp, bool is_ascii)
    1598             : {
    1599        5084 :         if (use_strcmp) {
    1600        1163 :                 if (caseignore) {
    1601         537 :                         if (is_ascii) {
    1602         518 :                                 if (anti)
    1603         950 :                                         proj_scanloop(istrcmp(s, pat) != 0);
    1604             :                                 else
    1605          86 :                                         proj_scanloop(istrcmp(s, pat) == 0);
    1606             :                         } else {
    1607          19 :                                 if (anti)
    1608          28 :                                         proj_scanloop(mywstrcasecmp(s, wpat) != 0);
    1609             :                                 else
    1610          10 :                                         proj_scanloop(mywstrcasecmp(s, wpat) == 0);
    1611             :                         }
    1612             :                 } else {
    1613         626 :                         if (anti)
    1614         608 :                                 proj_scanloop(strcmp(s, pat) != 0);
    1615             :                         else
    1616         644 :                                 proj_scanloop(strcmp(s, pat) == 0);
    1617             :                 }
    1618             :         } else {
    1619             :                 /* Use re_match_ignore only if the pattern is not pure ASCII
    1620             :                  * and we need to ignore case
    1621             :                  */
    1622        3921 :                 if (caseignore && !is_ascii) {
    1623           3 :                         if (anti)
    1624           6 :                                 proj_scanloop(!re_match_ignore(s, re));
    1625             :                         else
    1626           0 :                                 proj_scanloop(re_match_ignore(s, re));
    1627             :                 } else {
    1628        3918 :                         if (anti)
    1629         180 :                                 proj_scanloop(!re_match_no_ignore(s, re));
    1630             :                         else
    1631        7656 :                                 proj_scanloop(re_match_no_ignore(s, re));
    1632             :                 }
    1633             :         }
    1634             : }
    1635             : /*
                     :  * Release the objects that callers obtain from re_like_build: *re is
                     :  * destroyed with re_destroy and *wpat is freed with GDKfree, each only
                     :  * when non-NULL.  Both pointers are reset to NULL afterwards, so calling
                     :  * this again on the same variables does nothing.
                     :  */
    1636             : static inline void
    1637        6847 : re_like_clean(struct RE **re, uint32_t **wpat)
    1638             : {
    1639        6847 :         if (*re) {
    1640        5734 :                 re_destroy(*re);
    1641        5735 :                 *re = NULL;
    1642             :         }
    1643        6848 :         if (*wpat) {
    1644          30 :                 GDKfree(*wpat);
    1645          30 :                 *wpat = NULL;
    1646             :         }
    1647        6848 : }
    1648             : /*
                     :  * Compile the regular expression ppat for later matching.
                     :  *
                     :  * With libpcre: the pattern is compiled with PCRE_UTF8,
                     :  * PCRE_NO_UTF8_CHECK, PCRE_MULTILINE and PCRE_DOTALL, plus PCRE_CASELESS
                     :  * when caseignore is set, and then studied.  JIT compilation is only
                     :  * requested from pcre_study when count > JIT_COMPILE_MIN.  On success
                     :  * *res holds the compiled pattern and *ex the study data.  Both are
                     :  * reset to NULL on entry.  When pcre_study reports an error, *res is
                     :  * left allocated, so the caller still has to run pcre_clean.
                     :  *
                     :  * Without libpcre: the pattern is compiled into *res with regcomp using
                     :  * REG_NEWLINE | REG_NOSUB | REG_EXTENDED, plus REG_ICASE when caseignore
                     :  * is set; ex and count are unused.
                     :  *
                     :  * Returns MAL_SUCCEED, or a MAL exception naming the pattern when
                     :  * compilation (or, with libpcre, the study step) fails.
                     :  */
    1649             : #ifdef HAVE_LIBPCRE
    1650             : static inline str
    1651         511 : pcre_like_build(pcre **res, pcre_extra **ex, const char *ppat, bool caseignore,
    1652             :                                 BUN count)
    1653             : {
    1654         511 :         const char *err_p = NULL;
    1655         511 :         int errpos = 0;
    1656         511 :         int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK | PCRE_MULTILINE | PCRE_DOTALL;
    1657         511 :         int pcrestopt = count > JIT_COMPILE_MIN ? PCRE_STUDY_JIT_COMPILE : 0;
    1658             : 
    1659         511 :         *res = NULL;
    1660         511 :         *ex = NULL;
    1661             : 
    1662         511 :         if (caseignore) {
    1663          18 :                 options |= PCRE_CASELESS;
    1664             :         }
    1665         511 :         if ((*res = pcre_compile(ppat, options, &err_p, &errpos, NULL)) == NULL)
    1666           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1667             :                                                            ": compilation of regular expression (%s) failed"
    1668             :                                                            " at %d with '%s'", ppat, errpos, err_p);
    1669         509 :         *ex = pcre_study(*res, pcrestopt, &err_p);
    1670         511 :         if (err_p != NULL)
    1671           0 :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1672             :                                                            ": pcre study of pattern (%s) "
    1673             :                                                            "failed with '%s'", ppat, err_p);
    1674             :         return MAL_SUCCEED;
    1675             : }
    1676             : #else
    1677             : static inline str
    1678             : pcre_like_build(regex_t *res, void *ex, const char *ppat, bool caseignore,
    1679             :                                 BUN count)
    1680             : {
    1681             :         int options = REG_NEWLINE | REG_NOSUB | REG_EXTENDED;
    1682             :         int errcode;
    1683             : 
    1684             :         *res = (regex_t) {
    1685             :         0};
    1686             :         (void) count;
    1687             : 
    1688             :         if (caseignore) {
    1689             :                 options |= REG_ICASE;
    1690             :         }
    1691             :         if ((errcode = regcomp(res, ppat, options)) != 0)
    1692             :                 return createException(MAL, "pcre.pcre_like_build", OPERATION_FAILED
    1693             :                                                            ": compilation of regular expression (%s) failed",
    1694             :                                                            ppat);
    1695             :         (void) ex;
    1696             :         return MAL_SUCCEED;
    1697             : }
    1698             : #endif
    1699             : /*
                     :  * PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2): run LOOP_BODY, which has to
                     :  * assign the match result to `pos`, and translate it into *ret:
                     :  *   s is nil   -> bit_nil
                     :  *   pos >= 0   -> RES1
                     :  *   pos == -1  -> RES2
                     :  *   otherwise  -> the enclosing function returns a MAL exception.
                     :  * Uses s, pos, ret and ppat from the enclosing scope.  LOOP_BODY is
                     :  * expanded before the nil test, so it is executed for nil strings too.
                     :  */
    1700             : #define PCRE_LIKE_BODY(LOOP_BODY, RES1, RES2) \
    1701             :         do { \
    1702             :                 LOOP_BODY  \
    1703             :                 if (strNil(s))          \
    1704             :                         *ret = bit_nil; \
    1705             :                 else if (pos >= 0) \
    1706             :                         *ret = RES1; \
    1707             :                 else if (pos == -1) \
    1708             :                         *ret = RES2; \
    1709             :                 else \
    1710             :                         return createException(MAL, "pcre.match", OPERATION_FAILED ": matching of regular expression (%s) failed with %d", ppat, pos); \
    1711             :         } while(0)
    1712             : /*
                     :  * Match one string against a pattern compiled by pcre_like_build and
                     :  * store the outcome in *ret: bit_nil for a nil s, otherwise TRUE or
                     :  * FALSE, inverted when anti is set.
                     :  *
                     :  * With libpcre the match is a pcre_exec call; a result >= 0 is a match,
                     :  * -1 is no match, and any other value is reported as a MAL exception
                     :  * that mentions ppat.  Without libpcre the regexec result is mapped onto
                     :  * the same scheme: REG_NOMATCH -> -1, REG_ENOSYS -> -2 (exception),
                     :  * every other return value -> 0, i.e. treated as a match.
                     :  *
                     :  * NOTE(review): LOOP_BODY is expanded before the strNil(s) test, so the
                     :  * matcher also runs on nil strings (with PCRE_NO_UTF8_CHECK set in the
                     :  * libpcre build); confirm the nil representation is safe to pass.
                     :  */
    1713             : static inline str
    1714        1153 : pcre_like_apply(bit *ret, const char *s,
    1715             : #ifdef HAVE_LIBPCRE
    1716             :                                 const pcre *re, const pcre_extra *ex
    1717             : #else
    1718             :                                 regex_t re, void *ex
    1719             : #endif
    1720             :                                 , const char *ppat, bool anti)
    1721             : {
    1722        1153 :         int pos;
    1723             : 
    1724             : #ifdef HAVE_LIBPCRE
    1725             : #define LOOP_BODY       \
    1726             :         pos = pcre_exec(re, ex, s, (int) strlen(s), 0, PCRE_NO_UTF8_CHECK, NULL, 0);
    1727             : #else
    1728             : #define LOOP_BODY       \
    1729             :         int retval = regexec(&re, s, (size_t) 0, NULL, 0); \
    1730             :         (void) ex; \
    1731             :         pos = retval == REG_NOMATCH ? -1 : (retval == REG_ENOSYS ? -2 : 0);
    1732             : #endif
    1733             : 
    1734        1153 :         if (anti)
    1735          43 :                 PCRE_LIKE_BODY(LOOP_BODY, FALSE, TRUE);
    1736             :         else
    1737        1110 :                 PCRE_LIKE_BODY(LOOP_BODY, TRUE, FALSE);
    1738             : 
    1739             :         return MAL_SUCCEED;
    1740             : }
    1741             : /*
                     :  * Release a pattern set up by pcre_like_build.
                     :  *
                     :  * With libpcre: *re is freed with pcre_free and *ex with
                     :  * pcre_free_study, each only when non-NULL, and both are reset to NULL.
                     :  * Without libpcre: regfree(re) is called and *re is zeroed; ex is
                     :  * ignored.
                     :  *
                     :  * NOTE(review): in the non-libpcre build regfree is called
                     :  * unconditionally, also on a zero-initialised regex_t that was never
                     :  * compiled (BATPCRElike_imp runs this on every exit path); confirm that
                     :  * regfree tolerates that.
                     :  */
    1742             : static inline void
    1743        1160 : pcre_clean(
    1744             : #ifdef HAVE_LIBPCRE
    1745             :                           pcre **re, pcre_extra **ex)
    1746             : {
    1747        1160 :         if (*re)
    1748         511 :                 pcre_free(*re);
    1749        1160 :         if (*ex)
    1750         511 :                 pcre_free_study(*ex);
    1751        1158 :         *re = NULL;
    1752        1158 :         *ex = NULL;
    1753             : #else
    1754             :                           regex_t *re, void *ex)
    1755             : {
    1756             :         regfree(re);
    1757             :         *re = (regex_t) {
    1758             :         0};
    1759             :         (void) ex;
    1760             : #endif
    1761        1158 : }
    1762             : /*
                     :  * Shared implementation behind BATPCRElike and BATPCREnotlike: computes
                     :  * a bit column with the LIKE result for every row.
                     :  *
                     :  * Stack arguments: 0 is the result BAT id, 1 the input strings and 2 the
                     :  * pattern.  Arguments 1 and 2 are opened as BATs when their MAL type is
                     :  * a BAT type; otherwise they are read as plain strings.
                     :  *   esc    escape string; handed to choose_like_path, and its first byte
                     :  *          to re_like_build
                     :  *   isens  flag forwarded as the caseignore argument of re_like_build,
                     :  *          re_like_proj_apply and pcre_like_build
                     :  *   not    when true the match outcome is inverted (NOT LIKE)
                     :  *
                     :  * choose_like_path selects one of three strategies per pattern:
                     :  *   use_re  -> re_like_build + re_like_proj_apply
                     :  *   empty   -> the result is bit_nil
                     :  *   else    -> pcre_like_build + pcre_like_apply
                     :  * With a pattern BAT the strategy is chosen and the pattern compiled and
                     :  * released again for every row; with a single pattern this happens once
                     :  * and the compiled pattern is applied to all rows.
                     :  *
                     :  * On success the result BAT id is stored in *r and MAL_SUCCEED is
                     :  * returned; on failure the MAL exception is returned.  All exits go
                     :  * through the bailout label, which releases ppat, the compiled patterns
                     :  * and the BAT references.
                     :  *
                     :  * NOTE(review): the single-pattern branch calls bat_iterator(b) without
                     :  * checking b, so it assumes the input is a BAT whenever the pattern is
                     :  * not -- confirm against the callers.
                     :  */
    1763             : static str
    1764         667 : BATPCRElike_imp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci,
    1765             :                                 const str *esc, const bit *isens, const bit *not)
    1766             : {
    1767         667 :         str msg = MAL_SUCCEED;
    1768         667 :         BAT *b = NULL, *pbn = NULL, *bn = NULL;
    1769         667 :         char *ppat = NULL;
    1770         667 :         const char *input = NULL;
    1771         667 :         bool use_re = false,
    1772         667 :                 use_strcmp = false,
    1773         667 :                 empty = false,
    1774         667 :                 isensitive = (bool) *isens,     /* used as the caseignore argument below */
    1775         667 :                 anti = (bool) *not,
    1776         667 :                 has_nil = false,
    1777         667 :                 ascii_pattern = false,
    1778         667 :                 input_is_a_bat = isaBatType(getArgType(mb, pci, 1)),
    1779         667 :                 pattern_is_a_bat = isaBatType(getArgType(mb, pci, 2));
    1780         667 :         bat *r = getArgReference_bat(stk, pci, 0);
    1781         667 :         BUN q = 0;
    1782         667 :         bit *restrict ret = NULL;
    1783             : #ifdef HAVE_LIBPCRE
    1784         667 :         pcre *re = NULL;
    1785         667 :         pcre_extra *ex = NULL;
    1786             : #else
    1787             :         regex_t re = (regex_t) { 0 };
    1788             :         void *ex = NULL;
    1789             : #endif
    1790         667 :         struct RE *re_simple = NULL;
    1791         667 :         uint32_t *wpat = NULL;
    1792         667 :         BATiter bi = (BATiter) { 0 }, pi;
    1793             : 
    1794         667 :         (void) cntxt;
    1795         667 :         if (input_is_a_bat) {
    1796         667 :                 bat *bid = getArgReference_bat(stk, pci, 1);
    1797         667 :                 if (!(b = BATdescriptor(*bid))) {
    1798           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3",
    1799             :                                                                   SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1800           0 :                         goto bailout;
    1801             :                 }
    1802             :         }
    1803         667 :         if (pattern_is_a_bat) {
    1804          84 :                 bat *pb = getArgReference_bat(stk, pci, 2);
    1805          84 :                 if (!(pbn = BATdescriptor(*pb))) {
    1806           0 :                         msg = createException(MAL, "batalgebra.batpcrelike3",
    1807             :                                                                   SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    1808           0 :                         goto bailout;
    1809             :                 }
    1810             :         }
    1811         667 :         assert((!b || ATOMstorage(b->ttype) == TYPE_str)
    1812             :                    && (!pbn || ATOMstorage(pbn->ttype) == TYPE_str));
    1813             : 
    1814         667 :         q = BATcount(b ? b : pbn);      /* number of result rows */
    1815         667 :         if (!(bn = COLnew(b ? b->hseqbase : pbn->hseqbase, TYPE_bit, q, TRANSIENT))) {
    1816           0 :                 msg = createException(MAL, "batalgebra.batpcrelike3",
    1817             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    1818           0 :                 goto bailout;
    1819             :         }
    1820         667 :         ret = (bit *) Tloc(bn, 0);
    1821             : 
    1822         667 :         if (pattern_is_a_bat) { /* pattern differs per row: choose and compile for every row */
    1823          84 :                 pi = bat_iterator(pbn);
    1824          84 :                 if (b)
    1825          84 :                         bi = bat_iterator(b);
    1826             :                 else
    1827           0 :                         input = *getArgReference_str(stk, pci, 1);
    1828             : 
    1829        1173 :                 for (BUN p = 0; p < q; p++) {
    1830        1090 :                         const char *next_input = b ? BUNtvar(bi, p) : input,
    1831        1090 :                                 *np = BUNtvar(pi, p);
    1832             : 
    1833        1090 :                         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
    1834             :                                                                                 &ascii_pattern, np, *esc)) != MAL_SUCCEED) {
    1835           0 :                                 bat_iterator_end(&pi);
    1836           0 :                                 if (b)
    1837           0 :                                         bat_iterator_end(&bi);
    1838           0 :                                 goto bailout;
    1839             :                         }
    1840             : 
    1841        1091 :                         if (use_re) {
    1842         627 :                                 if ((msg = re_like_build(&re_simple, &wpat, np, isensitive,
    1843             :                                                                                  use_strcmp, ascii_pattern,
    1844         626 :                                                                                  (unsigned char) **esc)) != MAL_SUCCEED) {
    1845           0 :                                         bat_iterator_end(&pi);
    1846           0 :                                         if (b)
    1847           0 :                                                 bat_iterator_end(&bi);
    1848           0 :                                         goto bailout;
    1849             :                                 }
    1850         627 :                                 ret[p] = re_like_proj_apply(next_input, re_simple, wpat, np,
    1851             :                                                                                         isensitive, anti, use_strcmp,
    1852             :                                                                                         ascii_pattern);
    1853         625 :                                 re_like_clean(&re_simple, &wpat);
    1854         465 :                         } else if (empty) {
    1855         459 :                                 ret[p] = bit_nil;
    1856             :                         } else {
    1857           6 :                                 if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, 1)) != MAL_SUCCEED) {
    1858           0 :                                         bat_iterator_end(&pi);
    1859           0 :                                         if (b)
    1860           0 :                                                 bat_iterator_end(&bi);
    1861           0 :                                         goto bailout;
    1862             :                                 }
    1863           6 :                                 if ((msg = pcre_like_apply(&(ret[p]), next_input, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1864           0 :                                         bat_iterator_end(&pi);
    1865           0 :                                         if (b)
    1866           0 :                                                 bat_iterator_end(&bi);
    1867           0 :                                         goto bailout;
    1868             :                                 }
    1869           6 :                                 pcre_clean(&re, &ex);
    1870             :                         }
    1871        1089 :                         has_nil |= is_bit_nil(ret[p]);
    1872        1089 :                         GDKfree(ppat);  /* ppat is produced anew by choose_like_path for the next row */
    1873        1089 :                         ppat = NULL;
    1874             :                 }
    1875          83 :                 bat_iterator_end(&pi);
    1876          84 :                 if (b)
    1877          84 :                         bat_iterator_end(&bi);
    1878             :         } else {        /* single pattern: choose the strategy and compile once */
    1879         583 :                 const char *pat = *getArgReference_str(stk, pci, 2);
    1880         583 :                 if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty,
    1881             :                                                                         &ascii_pattern, pat, *esc)) != MAL_SUCCEED)
    1882           5 :                         goto bailout;
    1883             : 
    1884         578 :                 bi = bat_iterator(b);
    1885        1095 :                 MT_thread_setalgorithm(empty ? "pcrelike: trivially empty" : use_strcmp
    1886             :                                                            ? "pcrelike: pattern matching using strcmp" :
    1887         517 :                                                            use_re ? "pcrelike: pattern matching using RE" :
    1888             :                                                            "pcrelike: pattern matching using pcre");
    1889             : 
    1890         578 :                 if (use_re) {
    1891         424 :                         if ((msg = re_like_build(&re_simple, &wpat, pat, isensitive, use_strcmp,
    1892         424 :                                                                          ascii_pattern, (unsigned char) **esc)) != MAL_SUCCEED) {
    1893           0 :                                 bat_iterator_end(&bi);
    1894           0 :                                 goto bailout;
    1895             :                         }
    1896        4881 :                         for (BUN p = 0; p < q; p++) {
    1897        4457 :                                 const char *s = BUNtvar(bi, p);
    1898        4458 :                                 ret[p] = re_like_proj_apply(s, re_simple, wpat, pat, isensitive,
    1899             :                                                                                         anti, use_strcmp, ascii_pattern);
    1900        4457 :                                 has_nil |= is_bit_nil(ret[p]);
    1901             :                         }
    1902         154 :                 } else if (empty) {
    1903          43 :                         for (BUN p = 0; p < q; p++)
    1904          26 :                                 ret[p] = bit_nil;
    1905             :                         has_nil = true; /* set unconditionally, also when q == 0 */
    1906             :                 } else {
    1907         137 :                         if ((msg = pcre_like_build(&re, &ex, ppat, isensitive, q)) != MAL_SUCCEED) {
    1908           0 :                                 bat_iterator_end(&bi);
    1909           0 :                                 goto bailout;
    1910             :                         }
    1911        1284 :                         for (BUN p = 0; p < q; p++) {
    1912        1147 :                                 const char *s = BUNtvar(bi, p);
    1913        1147 :                                 if ((msg = pcre_like_apply(&(ret[p]), s, re, ex, ppat, anti)) != MAL_SUCCEED) {
    1914           0 :                                         bat_iterator_end(&bi);
    1915           0 :                                         goto bailout;
    1916             :                                 }
    1917        1147 :                                 has_nil |= is_bit_nil(ret[p]);
    1918             :                         }
    1919             :                 }
    1920         578 :                 bat_iterator_end(&bi);
    1921             :         }
    1922             : 
    1923         667 :   bailout:      /* common exit: release everything, hand out bn only when msg is unset */
    1924         667 :         GDKfree(ppat);
    1925         667 :         re_like_clean(&re_simple, &wpat);
    1926         667 :         pcre_clean(&re, &ex);
    1927         666 :         if (bn && !msg) {
    1928         661 :                 BATsetcount(bn, q);
    1929         662 :                 bn->tnil = has_nil;
    1930         662 :                 bn->tnonil = !has_nil;
    1931         662 :                 bn->tkey = BATcount(bn) <= 1;
    1932         662 :                 bn->tsorted = BATcount(bn) <= 1;
    1933         662 :                 bn->trevsorted = BATcount(bn) <= 1;
    1934         662 :                 *r = bn->batCacheid;
    1935         662 :                 BBPkeepref(bn);
    1936           5 :         } else if (bn)
    1937           5 :                 BBPreclaim(bn);
    1938         666 :         BBPreclaim(b);
    1939         667 :         BBPreclaim(pbn);
    1940         666 :         return msg;
    1941             : }
    1942             : /*
                     :  * LIKE on BATs: reads the escape string (argument 3) and the case flag
                     :  * (argument 4) from the stack and runs BATPCRElike_imp with the negation
                     :  * flag set to FALSE.
                     :  */
    1943             : static str
    1944         528 : BATPCRElike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1945             : {
    1946         528 :         const str *esc = getArgReference_str(stk, pci, 3);
    1947         528 :         const bit *ci = getArgReference_bit(stk, pci, 4);
    1948         528 :         bit no = FALSE;
    1949             : 
    1950         528 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &no);
    1951             : }
    1952             : /*
                     :  * NOT LIKE on BATs: reads the escape string (argument 3) and the case
                     :  * flag (argument 4) from the stack and runs BATPCRElike_imp with the
                     :  * negation flag set to TRUE.
                     :  */
    1953             : static str
    1954         139 : BATPCREnotlike(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
    1955             : {
    1956         139 :         const str *esc = getArgReference_str(stk, pci, 3);
    1957         139 :         const bit *ci = getArgReference_bit(stk, pci, 4);
    1958         139 :         bit yes = TRUE;
    1959             : 
    1960         139 :         return BATPCRElike_imp(cntxt, mb, stk, pci, esc, ci, &yes);
    1961             : }
    1962             : 
    1963             : /* scan select loop with or without candidates
                     :  *
                     :  * pcrescanloop(TEST, KEEP_NULLS) appends to vals[cnt++] the oid of every
                     :  * value v for which TEST holds, or which is nil while KEEP_NULLS is set.
                     :  * TEST may refer to the current value as v.
                     :  *  - no candidate list, or a dense one (!s || BATtdense(s)): the oids
                     :  *    from p up to but excluding q are visited and the value is read at
                     :  *    position p - off;
                     :  *  - otherwise: ncands oids are taken from canditer_next(ci) and the
                     :  *    value is read at position o - off.
                     :  * Every iteration runs GDK_CHECK_TIMEOUT, which is handed the label
                     :  * `bailout` (presumably jumped to on a query timeout -- TODO confirm).
                     :  * The enclosing function has to provide b, s, ci, bi, p, q, ncands, off,
                     :  * vals, cnt, anti, counter, qry_ctx and the label bailout.
                     :  */
    1964             : #define pcrescanloop(TEST, KEEP_NULLS)                                                                  \
    1965             :         do {                                                                                                                            \
    1966             :                 TRC_DEBUG(ALGO,                                                                                                 \
    1967             :                                   "PCREselect(b=%s#"BUNFMT",anti=%d): "                                     \
    1968             :                                   "scanselect %s\n", BATgetId(b), BATcount(b),                        \
    1969             :                                   anti, #TEST);                                                                                 \
    1970             :                 if (!s || BATtdense(s)) {                                                                               \
    1971             :                         for (; p < q; p++) {                                                                         \
    1972             :                                 GDK_CHECK_TIMEOUT(qry_ctx, counter,                                             \
    1973             :                                                                   GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
    1974             :                                 const char *restrict v = BUNtvar(bi, p - off);                  \
    1975             :                                 if ((TEST) || ((KEEP_NULLS) && strNil(v)))                              \
    1976             :                                         vals[cnt++] = p;                                                                        \
    1977             :                         }                                                                                                                       \
    1978             :                 } else {                                                                                                                \
    1979             :                         for (; p < ncands; p++) {                                                                    \
    1980             :                                 GDK_CHECK_TIMEOUT(qry_ctx, counter,                                             \
    1981             :                                                                   GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
    1982             :                                 oid o = canditer_next(ci);                                                              \
    1983             :                                 const char *restrict v = BUNtvar(bi, o - off);                  \
    1984             :                                 if ((TEST) || ((KEEP_NULLS) && strNil(v)))                              \
    1985             :                                         vals[cnt++] = o;                                                                        \
    1986             :                         }                                                                                                                       \
    1987             :                 }                                                                                                                               \
    1988             :         } while (0)
    1989             : /*
                     :  * Match expression that pcre_likeselect uses inside the TEST argument of
                     :  * pcrescanloop; v is the current value, re / ex the compiled pattern.
                     :  * With libpcre every negative pcre_exec result (no match as well as an
                     :  * error) counts as "no match".  Without libpcre every regexec result
                     :  * other than REG_NOMATCH counts as a match, so errors are treated
                     :  * differently in the two builds.
                     :  */
    1990             : #ifdef HAVE_LIBPCRE
    1991             : #define PCRE_LIKESELECT_BODY (pcre_exec(re, ex, v, (int) strlen(v), 0, PCRE_NO_UTF8_CHECK, NULL, 0) >= 0)
    1992             : #else
    1993             : #define PCRE_LIKESELECT_BODY (regexec(&re, v, (size_t) 0, NULL, 0) != REG_NOMATCH)
    1994             : #endif
    1995             : /*
                     :  * Select from b the oids whose string value matches the regular
                     :  * expression pat (or does not match it when anti is set), using the
                     :  * pattern compiled by pcre_like_build.
                     :  *
                     :  * The selected oids are written to the tail of bn starting at position
                     :  * 0 and their number is stored in *rcnt (also on the error path).  Nil
                     :  * values never satisfy the match test in either direction; they are
                     :  * selected only when keep_nulls is set.  s, ci, p and q describe the
                     :  * candidates and are consumed by pcrescanloop.  No bound on the number
                     :  * of oids written to bn is checked here.
                     :  *
                     :  * Returns MAL_SUCCEED or the exception from pcre_like_build; the BAT
                     :  * iterator and the compiled pattern are released on every path.
                     :  */
    1996             : static str
    1997         362 : pcre_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
    1998             :                                 BUN *rcnt, const char *pat, bool caseignore, bool anti,
    1999             :                                 bool keep_nulls)
    2000             : {
    2001             : #ifdef HAVE_LIBPCRE
    2002         362 :         pcre *re = NULL;
    2003         362 :         pcre_extra *ex = NULL;
    2004             : #else
    2005             :         regex_t re = (regex_t) { 0 };
    2006             :         void *ex = NULL;
    2007             : #endif
    2008         362 :         BATiter bi = bat_iterator(b);
    2009         362 :         BUN cnt = 0, ncands = ci->ncand;
    2010         362 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
    2011         362 :         str msg = MAL_SUCCEED;
    2012             : 
    2013         362 :         size_t counter = 0;
    2014         362 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2015             : 
    2016         362 :         if ((msg = pcre_like_build(&re, &ex, pat, caseignore, ci->ncand)) != MAL_SUCCEED)
    2017           0 :                 goto bailout;
    2018             : 
    2019         362 :         if (anti)
    2020           0 :                 pcrescanloop(!strNil(v) && !PCRE_LIKESELECT_BODY, keep_nulls);
    2021             :         else
    2022       37760 :                 pcrescanloop(!strNil(v) && PCRE_LIKESELECT_BODY, keep_nulls);
    2023             : 
    2024           4 :   bailout:
    2025         361 :         bat_iterator_end(&bi);
    2026         362 :         pcre_clean(&re, &ex);
    2027         362 :         *rcnt = cnt;
    2028         362 :         return msg;
    2029             : }
    2030             : /*
                     :  * Select from b the oids whose string value matches the LIKE pattern
                     :  * pat (or does not match it when anti is set) without using pcre.
                     :  *
                     :  * re_like_build prepares the pattern; the comparison is then chosen
                     :  * once, outside the scan loop:
                     :  *  - use_strcmp: whole-string comparison with istrcmp (caseignore and
                     :  *    ascii_pattern), mywstrcasecmp against wpat (caseignore and
                     :  *    !ascii_pattern) or strcmp (!caseignore);
                     :  *  - otherwise a match against re, with re_match_ignore only when
                     :  *    caseignore && !ascii_pattern and re_match_no_ignore in all other
                     :  *    cases.
                     :  * Nil values never satisfy the test in either direction; they are
                     :  * selected only when keep_nulls is set.
                     :  *
                     :  * The selected oids are written to the tail of bn starting at position
                     :  * 0 and their number is stored in *rcnt (also on the error path).  s,
                     :  * ci, p and q describe the candidates and are consumed by pcrescanloop.
                     :  * Returns MAL_SUCCEED or the exception from re_like_build; the BAT
                     :  * iterator, re and wpat are released on every path.
                     :  */
    2031             : static str
    2032        5433 : re_likeselect(BAT *bn, BAT *b, BAT *s, struct canditer *ci, BUN p, BUN q,
    2033             :                           BUN *rcnt, const char *pat, bool caseignore, bool anti,
    2034             :                           bool use_strcmp, uint32_t esc, bool keep_nulls,
    2035             :                           bool ascii_pattern)
    2036             : {
    2037        5433 :         BATiter bi = bat_iterator(b);
    2038        5433 :         BUN cnt = 0, ncands = ci->ncand;
    2039        5433 :         oid off = b->hseqbase, *restrict vals = Tloc(bn, 0);
    2040        5433 :         struct RE *re = NULL;
    2041        5433 :         uint32_t *wpat = NULL;
    2042        5433 :         str msg = MAL_SUCCEED;
    2043             : 
    2044        5433 :         size_t counter = 0;
    2045        5433 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2046             : 
    2047        5433 :         if ((msg = re_like_build(&re, &wpat, pat, caseignore, use_strcmp, ascii_pattern,
    2048             :                                                          esc)) != MAL_SUCCEED)
    2049           0 :                 goto bailout;
    2050             : 
    2051        5433 :         if (use_strcmp) {
    2052          90 :                 if (caseignore) {
    2053          30 :                         if (ascii_pattern) {
    2054          22 :                                 if (anti)
    2055          64 :                                         pcrescanloop(!strNil(v)
    2056             :                                                                  && istrcmp(v, pat) != 0, keep_nulls);
    2057             :                                 else
    2058         597 :                                         pcrescanloop(!strNil(v)
    2059             :                                                                  && istrcmp(v, pat) == 0, keep_nulls);
    2060             :                         } else {
    2061           8 :                                 if (anti)
    2062           0 :                                         pcrescanloop(!strNil(v)
    2063             :                                                                  && mywstrcasecmp(v, wpat) != 0, keep_nulls);
    2064             :                                 else
    2065          36 :                                         pcrescanloop(!strNil(v)
    2066             :                                                                  && mywstrcasecmp(v, wpat) == 0, keep_nulls);
    2067             :                         }
    2068             :                 } else {
    2069          60 :                         if (anti)
    2070          54 :                                 pcrescanloop(!strNil(v) && strcmp(v, pat) != 0, keep_nulls);
    2071             :                         else
    2072        9115 :                                 pcrescanloop(!strNil(v) && strcmp(v, pat) == 0, keep_nulls);
    2073             :                 }
    2074             :         } else {
    2075        5343 :                 if (caseignore) {
    2076             :                         /* ascii_pattern == true is encoded in re */
    2077          53 :                         if (anti) {
    2078           1 :                                 if (ascii_pattern)
    2079          42 :                                         pcrescanloop(!strNil(v)
    2080             :                                                                  && !re_match_no_ignore(v, re), keep_nulls);
    2081             :                                 else
    2082           0 :                                         pcrescanloop(!strNil(v)
    2083             :                                                                  && !re_match_ignore(v, re), keep_nulls);
    2084             :                         } else {
    2085          52 :                                 if (ascii_pattern)
    2086        6311 :                                         pcrescanloop(!strNil(v)
    2087             :                                                                  && re_match_no_ignore(v, re), keep_nulls);
    2088             :                                 else
    2089          72 :                                         pcrescanloop(!strNil(v)
    2090             :                                                                  && re_match_ignore(v, re), keep_nulls);
    2091             :                         }
    2092             :                 } else {
    2093        5290 :                         if (anti)
    2094       42604 :                                 pcrescanloop(!strNil(v)
    2095             :                                                          && !re_match_no_ignore(v, re), keep_nulls);
    2096             :                         else
    2097      132170 :                                 pcrescanloop(!strNil(v)
    2098             :                                                          && re_match_no_ignore(v, re), keep_nulls);
    2099             :                 }
    2100             :         }
    2101             : 
    2102          80 :   bailout:
    2103        5433 :         bat_iterator_end(&bi);
    2104        5432 :         re_like_clean(&re, &wpat);
    2105        5433 :         *rcnt = cnt;
    2106        5433 :         return msg;
    2107             : }
    2108             : /*
                     :  * PCRElikeselect -- LIKE / NOT LIKE selection over a string BAT; errors are
                     :  * reported under the name "algebra.likeselect".
                     :  *
                     :  * ret         out: cache id of the result BAT (type oid); only written on
                     :  *             success
                     :  * bid         id of the BAT to scan; its storage type must be TYPE_str
                     :  *             (asserted only)
                     :  * sid         optional candidate list id; a NULL pointer or a nil id means
                     :  *             "no candidate list"
                     :  * pat         LIKE pattern
                     :  * esc         escape string; it is handed to choose_like_path as a string
                     :  *             and its first byte is handed to re_likeselect
                     :  * caseignore  request case-insensitive matching
                     :  * anti        request NOT LIKE
                     :  *
                     :  * Returns MAL_SUCCEED or an exception string.
                     :  *
                     :  * Flow: fix the descriptors, let choose_like_path pick a strategy, optionally
                     :  * narrow the candidates with a strimp filter, scan with re_likeselect or
                     :  * pcre_likeselect, set the result properties, and (strimps + anti only)
                     :  * replace the result by its complement.  All exits go through "bailout",
                     :  * which releases b, s, old_s and ppat.
                     :  */
    2109             : static str
    2110        5795 : PCRElikeselect(bat *ret, const bat *bid, const bat *sid, const str *pat,
    2111             :                            const str *esc, const bit *caseignore, const bit *anti)
    2112             : {
    2113        5795 :         BAT *b, *s = NULL, *bn = NULL, *old_s = NULL; /* old_s: the caller's candidates once a strimp filter result has replaced s */
    2114        5795 :         str msg = MAL_SUCCEED;
    2115        5795 :         char *ppat = NULL; /* set through choose_like_path, released with GDKfree at bailout */
    2116        5795 :         bool use_re = false,
    2117        5795 :                 use_strcmp = false,
    2118        5795 :                 empty = false,
    2119        5795 :                 ascii_pattern = false;
    2120        5795 :         bool with_strimps = false; /* strimp filter applied to a LIKE query */
    2121        5795 :         bool with_strimps_anti = false; /* strimp filter applied to a NOT LIKE query */
    2122        5795 :         BUN p = 0, q = 0, rcnt = 0; /* p, q: oid range for the dense case; rcnt: number of results */
    2123        5795 :         struct canditer ci;
    2124             : 
    2125        5795 :         if ((b = BATdescriptor(*bid)) == NULL) {
    2126           0 :                 msg = createException(MAL, "algebra.likeselect",
    2127             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2128           0 :                 goto bailout;
    2129             :         }
    2130        5795 :         if (sid && !is_bat_nil(*sid) && (s = BATdescriptor(*sid)) == NULL) { /* the candidate list is optional; only a requested-but-missing one is an error */
    2131           0 :                 msg = createException(MAL, "algebra.likeselect",
    2132             :                                                           SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2133           0 :                 goto bailout;
    2134             :         }
    2135             : 
    2136        5794 :         assert(ATOMstorage(b->ttype) == TYPE_str); /* checked in debug builds only */
    2137             : 
    2138        5794 :         if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, &ascii_pattern, /* receives the addresses of ppat and of the four strategy flags read below */
    2139             :                                                                 *pat, *esc)) != MAL_SUCCEED)
    2140           0 :                 goto bailout;
    2141             : 
    2142        5795 :         if (empty) { /* choose_like_path set "empty": answer with an empty dense BAT, no scan */
    2143           0 :                 if (!(bn = BATdense(0, 0, 0)))
    2144           0 :                         msg = createException(MAL, "algebra.likeselect",
    2145             :                                                                   SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2146             : 
    2147           0 :                 goto bailout; /* bailout hands bn over, or returns msg when BATdense failed */
    2148             :         }
    2149             :         /* Since the strimp pre-filtering of a LIKE query produces a superset of the actual result the complement of that
    2150             :          * set will necessarily reject some of the matching entries in the NOT LIKE query.
    2151             :          *
    2152             :          * In this case we run the PCRElikeselect as a LIKE query with strimps and return the complement of the result,
    2153             :          * taking extra care to not return NULLs. This currently means that we do not run strimps for NOT LIKE queries if
    2154             :          * the BAT contains NULLs.
    2155             :          */
    2156        5795 :         if (BAThasstrimps(b)) {
    2157          24 :                 if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
    2158          24 :                         BAT *tmp_s = STRMPfilter(b, s, *pat, *anti);
    2159          24 :                         if (tmp_s) { /* a NULL filter result simply leaves s untouched */
    2160          24 :                                 old_s = s; /* kept for the complement below and released at bailout */
    2161          24 :                                 s = tmp_s;
    2162          24 :                                 if (!*anti)
    2163             :                                         with_strimps = true;
    2164             :                                 else
    2165           0 :                                         with_strimps_anti = true;
    2166             :                         }
    2167             :                 } else {                                /* If we cannot filter with the strimp just continue normally */
    2168           0 :                         GDKclrerr();
    2169             :                 }
    2170             :         }
    2171             : 
    2172             : 
    2173        5795 :         MT_thread_setalgorithm(use_strcmp /* one label per strategy (strcmp, RE, pcre) and strimp mode (none, strimps, strimps anti) */
    2174        5795 :                                                    ? (with_strimps ?
    2175             :                                                           "pcrelike: pattern matching using strcmp with strimps"
    2176             :                                                           : (with_strimps_anti ?
    2177             :                                                                  "pcrelike: pattern matching using strcmp with strimps anti"
    2178        5795 :                                                                  : "pcrelike: pattern matching using strcmp")) :
    2179        5705 :                                                    use_re ? (with_strimps ?
    2180             :                                                                          "pcrelike: pattern matching using RE with strimps"
    2181             :                                                                          : (with_strimps_anti ?
    2182             :                                                                                 "pcrelike: patterm matching using RE with strimps anti" /* NOTE(review): "patterm" looks like a typo for "pattern" */
    2183             :                                                                                 :
    2184             :                                                                                 "pcrelike: pattern matching using RE"))
    2185             :                                                    : (with_strimps ?
    2186             :                                                           "pcrelike: pattern matching using pcre with strimps"
    2187             :                                                           : (with_strimps_anti ?
    2188             :                                                                  "pcrelike: pattermatching using pcre with strimps anti" /* NOTE(review): "pattermatching" looks like a typo for "pattern matching" */
    2189             :                                                                  : "pcrelike: pattern matching using pcre")));
    2190             : 
    2191        5795 :         canditer_init(&ci, b, s);
    2192        5795 :         if (!(bn = COLnew(0, TYPE_oid, ci.ncand, TRANSIENT))) { /* capacity: one oid per candidate */
    2193           0 :                 msg = createException(MAL, "algebra.likeselect",
    2194             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2195           0 :                 goto bailout;
    2196             :         }
    2197             : 
    2198        5795 :         if (!s || BATtdense(s)) { /* only here are p and q computed; otherwise both stay 0 */
    2199        1469 :                 if (s) {
    2200        4242 :                         assert(BATtdense(s));
    2201        4242 :                         p = (BUN) s->tseqbase;
    2202        4242 :                         q = p + BATcount(s);
    2203        4242 :                         if ((oid) p < b->hseqbase) /* clamp the lower bound to b's first oid */
    2204             :                                 p = b->hseqbase;
    2205        4242 :                         if ((oid) q > b->hseqbase + BATcount(b)) /* clamp the upper bound to one past b's last oid */
    2206             :                                 q = b->hseqbase + BATcount(b);
    2207             :                 } else {
    2208        1469 :                         p = b->hseqbase; /* no candidates: all of b */
    2209        1469 :                         q = BATcount(b) + b->hseqbase;
    2210             :                 }
    2211             :         }
    2212             : 
    2213        5795 :         if (use_re) {
    2214        5433 :                 msg = re_likeselect(bn, b, s, &ci, p, q, &rcnt, *pat, *caseignore, *anti /* gets the raw pattern *pat, not ppat */
    2215         791 :                                                         && !with_strimps_anti, use_strcmp, /* strimps anti: scan as plain LIKE, the result is complemented below */
    2216        5433 :                                                         (unsigned char) **esc, with_strimps_anti, /* presumably with_strimps_anti is the callee's keep_nulls parameter -- TODO confirm */
    2217             :                                                         ascii_pattern);
    2218             :         } else {
    2219         362 :                 msg = pcre_likeselect(bn, b, s, &ci, p, q, &rcnt, ppat, *caseignore, /* gets the pattern prepared by choose_like_path */
    2220         362 :                                                           *anti && !with_strimps_anti, with_strimps_anti);
    2221             :         }
    2222             : 
    2223        5795 :         if (!msg) {                                     /* set some properties */
    2224        5795 :                 BATsetcount(bn, rcnt);
    2225        5795 :                 bn->tsorted = true; /* declares the oids ascending and unique; presumably guaranteed by the scan order -- confirm in the callees */
    2226        5795 :                 bn->trevsorted = bn->batCount <= 1;
    2227        5795 :                 bn->tkey = true;
    2228        5795 :                 bn->tnil = false;
    2229        5795 :                 bn->tnonil = true;
    2230        5795 :                 bn->tseqbase = rcnt == 0 ? 0 : rcnt == 1 ? *(const oid *) Tloc(bn, 0) : rcnt == b->batCount ? b->hseqbase : oid_nil; /* 0 if empty, the oid itself if single, b->hseqbase if every row of b was selected, else oid_nil */
    2231        5795 :                 if (with_strimps_anti) {
    2232             :                         /* Reverse the result taking into account the original candidate list. */
    2233             :                         // BAT *rev = BATdiffcand(BATdense(b->hseqbase, 0, b->batCount), bn);
    2234           0 :                         BAT *rev;
    2235           0 :                         if (old_s) {
    2236           0 :                                 rev = BATdiffcand(old_s, bn); /* NOTE(review): rev is not checked for NULL -- confirm BATdiffcand cannot fail here */
    2237             : #ifndef NDEBUG
    2238           0 :                                 BAT *is = BATintersectcand(old_s, bn); /* debug-only sanity check: every result must be one of the original candidates */
    2239           0 :                                 if (is) {
    2240           0 :                                         assert(is->batCount == bn->batCount);
    2241           0 :                                         BBPreclaim(is);
    2242             :                                 }
    2243           0 :                                 assert(rev->batCount == old_s->batCount - bn->batCount); /* NOTE(review): dereferences rev without a NULL check */
    2244             : #endif
    2245             :                         }
    2246             : 
    2247             :                         else
    2248           0 :                                 rev = BATnegcands(b->batCount, bn); /* NOTE(review): rev is not checked for NULL here either */
    2249             :                         /* BAT *rev = BATnegcands(b->batCount, bn); */
    2250           0 :                         BBPunfix(bn->batCacheid);
    2251           0 :                         bn = rev; /* NOTE(review): if rev were NULL, bailout would see bn == NULL with msg == NULL and return success without setting *ret */
    2252             :                 }
    2253             :         }
    2254             : 
    2255             : 
    2256        5795 :   bailout:
    2257        5795 :         BBPreclaim(b); /* reached with b == NULL when the first BATdescriptor failed, so BBPreclaim is presumably NULL-tolerant -- confirm */
    2258        5795 :         BBPreclaim(s);
    2259        5794 :         BBPreclaim(old_s);
    2260        5794 :         GDKfree(ppat);
    2261        5794 :         if (bn && !msg) { /* success: publish the result and hand the reference over */
    2262        5794 :                 *ret = bn->batCacheid;
    2263        5794 :                 BBPkeepref(bn);
    2264           0 :         } else if (bn) /* failure after bn was created: drop it */
    2265           0 :                 BBPreclaim(bn);
    2266        5794 :         return msg;
    2267             : }
    2268             : /*
                     :  * APPEND(b, o): write oid o into slot b->batCount of b->theap->base, viewed
                     :  * as an oid array, and post-increment b->batCount.  The macro does no
                     :  * capacity check, and b is expanded twice without parentheses, so pass a
                     :  * plain BAT pointer variable.
                     :  *
                     :  * VALUE(s, x): s is a name prefix, not an expression; the macro pastes it
                     :  * into s##vars, s##vals and s##i.  It yields s##vars plus the value
                     :  * VarHeapVal reads from s##vals for position x with width s##i.width --
                     :  * presumably the address of the string stored at position x.
                     :  */
    2269             : #define APPEND(b, o)    (((oid *) b->theap->base)[b->batCount++] = (o))
    2270             : #define VALUE(s, x)             (s##vars + VarHeapVal(s##vals, (x), s##i.width))
    2271             : /*
                     :  * PCRE_EXEC runs the compiled pattern against the string vl and stores the
                     :  * library's return code in retval; PCRE_EXEC_COND is true when that return
                     :  * code means "did not match".  Both rely on names from the expansion site:
                     :  * retval, vl, pcrere and (libpcre only) pcreex.
                     :  *
                     :  * With HAVE_LIBPCRE: pcre_exec is called with PCRE_NO_UTF8_CHECK and no
                     :  * output vector; any negative return code counts as "no match", so errors
                     :  * are treated like a failed match.
                     :  * Otherwise: regexec is called without match registers; REG_NOMATCH and
                     :  * REG_ENOSYS count as "no match".
                     :  */
    2272             : #ifdef HAVE_LIBPCRE
    2273             : #define PCRE_EXEC \
    2274             :         do { \
    2275             :                 retval = pcre_exec(pcrere, pcreex, vl, (int) strlen(vl), 0, PCRE_NO_UTF8_CHECK, NULL, 0); \
    2276             :         } while (0)
    2277             : #define PCRE_EXEC_COND (retval < 0)
    2278             : #else
    2279             : #define PCRE_EXEC \
    2280             :         do { \
    2281             :                 retval = regexec(&pcrere, vl, (size_t) 0, NULL, 0); \
    2282             :         } while (0)
    2283             : #define PCRE_EXEC_COND (retval == REG_NOMATCH || retval == REG_ENOSYS)
    2284             : #endif
    2285             : 
    2286             : /* nested loop implementation for PCRE join
                     :  *
                     :  * The macro expands inside a function that supplies every name used below:
                     :  * the candidate iterators rci and lci, the result BATs r1 and r2 (r2 may be
                     :  * NULL), ro, lo, rbase, lbase, vr, vl, nl, lastl, rskipped, newcap, the
                     :  * matcher state re, wpat, pcrere, pcreex and pcrepat, the flags use_re,
                     :  * use_strcmp, empty and ascii_pattern, esc, caseignore, msg, qry_ctx,
                     :  * counter, the variables behind VALUE(l, ..) and VALUE(r, ..), and a
                     :  * "bailout" label.
                     :  *
                     :  * The three arguments are SKIP conditions: when the one that applies
                     :  * evaluates to true the (lo, ro) pair is not emitted.
                     :  *   STRCMP     used when use_re and use_strcmp are both set
                     :  *   RE_MATCH   used when use_re is set and use_strcmp is not
                     :  *   PCRE_COND  used otherwise, evaluated after PCRE_EXEC has set retval
                     :  *
                     :  * For every candidate of r (outer loop):
                     :  *   - the query timeout is checked once;
                     :  *   - the right value vr is taken as the pattern and choose_like_path picks
                     :  *     the strategy for it;
                     :  *   - unless "empty" is set, the matcher is built (re_like_build or
                     :  *     pcre_like_build), lci is reset, and every candidate of l is tested
                     :  *     (inner loop); nil left values are skipped;
                     :  *   - each emitted pair is appended to r1 (lo) and, if present, to r2 (ro);
                     :  *     when r1 is full both BATs are extended to BATgrows(r1) first;
                     :  *   - tsorted, trevsorted, tkey and tseqbase of r1 and r2 are downgraded as
                     :  *     pairs are emitted, using lastl (previous left oid) and nl (pairs
                     :  *     emitted for the current right value);
                     :  *   - the matcher is released with re_like_clean and pcre_clean.
                     :  *
                     :  * Any failure sets msg and jumps to "bailout".
                     :  * NOTE(review): when pcre_like_build fails, pcrepat has not been freed yet;
                     :  * whether the bailout code frees it is outside this macro -- confirm.
                     :  */
    2287             : #define pcre_join_loop(STRCMP, RE_MATCH, PCRE_COND)                                             \
    2288             :         do {                                                                                                                            \
    2289             :                 for (BUN ridx = 0; ridx < rci.ncand; ridx++) {                                       \
    2290             :                         GDK_CHECK_TIMEOUT(qry_ctx, counter,                                                     \
    2291             :                                                           GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx)); \
    2292             :                         ro = canditer_next(&rci);                                                                   \
    2293             :                         vr = VALUE(r, ro - rbase);                                                                      \
    2294             :                         nl = 0;                                                                                                         \
    2295             :                         ascii_pattern = use_re = use_strcmp = empty = false;            \
    2296             :                         if ((msg = choose_like_path(&pcrepat, &use_re, &use_strcmp, &empty, &ascii_pattern, vr, esc))) \
    2297             :                                 goto bailout;                                                                                   \
    2298             :                         if (!empty) {                                                                                           \
    2299             :                                 if (use_re) {                                                                                   \
    2300             :                                         if ((msg = re_like_build(&re, &wpat, vr, caseignore, use_strcmp, ascii_pattern, (unsigned char) *esc)) != MAL_SUCCEED) \
    2301             :                                                 goto bailout;                                                                   \
    2302             :                                 } else if (pcrepat) {                                                                   \
    2303             :                                         if ((msg = pcre_like_build(&pcrere, &pcreex, pcrepat, caseignore, lci.ncand)) != MAL_SUCCEED) \
    2304             :                                                 goto bailout;                                                                   \
    2305             :                                         GDKfree(pcrepat);                                                                       \
    2306             :                                         pcrepat = NULL;                                                                         \
    2307             :                                 }                                                                                                               \
    2308             :                                 canditer_reset(&lci);                                                                       \
    2309             :                                 for (BUN lidx = 0; lidx < lci.ncand; lidx++) {                       \
    2310             :                                         lo = canditer_next(&lci);                                                   \
    2311             :                                         vl = VALUE(l, lo - lbase);                                                      \
    2312             :                                         if (strNil(vl)) {                                                                       \
    2313             :                                                 continue;                                                                               \
    2314             :                                         } else if (use_re) {                                                            \
    2315             :                                                 if (use_strcmp) {                                                               \
    2316             :                                                         if (STRCMP)                                                                     \
    2317             :                                                                 continue;                                                               \
    2318             :                                                 } else {                                                                                \
    2319             :                                                         assert(re);                                                                     \
    2320             :                                                         if (RE_MATCH)                                                           \
    2321             :                                                                 continue;                                                               \
    2322             :                                                 }                                                                                               \
    2323             :                                         } else {                                                                                        \
    2324             :                                                 int retval;                                                                             \
    2325             :                                                 PCRE_EXEC;                                                                              \
    2326             :                                                 if (PCRE_COND)                                                                  \
    2327             :                                                         continue;                                                                       \
    2328             :                                         }                                                                                                       \
    2329             :                                         if (BATcount(r1) == BATcapacity(r1)) {                          \
    2330             :                                                 newcap = BATgrows(r1);                                                  \
    2331             :                                                 BATsetcount(r1, BATcount(r1));                                  \
    2332             :                                                 if (r2)                                                                                 \
    2333             :                                                         BATsetcount(r2, BATcount(r2));                          \
    2334             :                                                 if (BATextend(r1, newcap) != GDK_SUCCEED || (r2 && BATextend(r2, newcap) != GDK_SUCCEED)) { \
    2335             :                                                         msg = createException(MAL, "pcre.join", SQLSTATE(HY013) MAL_MALLOC_FAIL); \
    2336             :                                                         goto bailout;                                                           \
    2337             :                                                 }                                                                                               \
    2338             :                                                 assert(!r2 || BATcapacity(r1) == BATcapacity(r2)); \
    2339             :                                         }                                                                                                       \
    2340             :                                         if (BATcount(r1) > 0) {                                                              \
    2341             :                                                 if (lastl + 1 != lo)                                                    \
    2342             :                                                         r1->tseqbase = oid_nil;                                              \
    2343             :                                                 if (nl == 0) {                                                                  \
    2344             :                                                         if (r2)                                                                         \
    2345             :                                                                 r2->trevsorted = false;                                      \
    2346             :                                                         if (lastl > lo) {                                                    \
    2347             :                                                                 r1->tsorted = false;                                 \
    2348             :                                                                 r1->tkey = false;                                            \
    2349             :                                                         } else if (lastl < lo) {                                     \
    2350             :                                                                 r1->trevsorted = false;                                      \
    2351             :                                                         } else {                                                                        \
    2352             :                                                                 r1->tkey = false;                                            \
    2353             :                                                         }                                                                                       \
    2354             :                                                 }                                                                                               \
    2355             :                                         }                                                                                                       \
    2356             :                                         APPEND(r1, lo);                                                                         \
    2357             :                                         if (r2)                                                                                         \
    2358             :                                                 APPEND(r2, ro);                                                                 \
    2359             :                                         lastl = lo;                                                                                     \
    2360             :                                         nl++;                                                                                           \
    2361             :                                 }                                                                                                               \
    2362             :                                 re_like_clean(&re, &wpat);                                                              \
    2363             :                                 pcre_clean(&pcrere, &pcreex);                                                   \
    2364             :                         }                                                                                                                       \
    2365             :                         if (r2) {                                                                                                       \
    2366             :                                 if (nl > 1) {                                                                                        \
    2367             :                                         r2->tkey = false;                                                                    \
    2368             :                                         r2->tseqbase = oid_nil;                                                              \
    2369             :                                         r1->trevsorted = false;                                                              \
    2370             :                                 } else if (nl == 0) {                                                                   \
    2371             :                                         rskipped = BATcount(r2) > 0;                                         \
    2372             :                                 } else if (rskipped) {                                                                  \
    2373             :                                         r2->tseqbase = oid_nil;                                                              \
    2374             :                                 }                                                                                                               \
    2375             :                         } else if (nl > 1) {                                                                         \
    2376             :                                 r1->trevsorted = false;                                                                      \
    2377             :                         }                                                                                                                       \
    2378             :                 }                                                                                                                               \
    2379             :         } while (0)
    2380             : 
    2381             : static char *
    2382          43 : pcrejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *esc,
    2383             :                  bit caseignore, bit anti)
    2384             : {
    2385          43 :         struct canditer lci, rci;
    2386          43 :         const char *lvals, *rvals, *lvars, *rvars, *vl, *vr;
    2387          43 :         int rskipped = 0;                       /* whether we skipped values in r */
    2388          43 :         oid lbase, rbase, lo, ro, lastl = 0;    /* last value inserted into r1 */
    2389          43 :         BUN nl, newcap;
    2390          43 :         char *pcrepat = NULL, *msg = MAL_SUCCEED;
    2391          43 :         struct RE *re = NULL;
    2392          43 :         bool use_re = false,
    2393          43 :                 use_strcmp = false,
    2394          43 :                 empty = false,
    2395          43 :                 ascii_pattern = false;
    2396          43 :         uint32_t *wpat = NULL;
    2397             : #ifdef HAVE_LIBPCRE
    2398          43 :         pcre *pcrere = NULL;
    2399          43 :         pcre_extra *pcreex = NULL;
    2400             : #else
    2401             :         regex_t pcrere = (regex_t) { 0 };
    2402             :         void *pcreex = NULL;
    2403             : #endif
    2404             : 
    2405          43 :         size_t counter = 0;
    2406          43 :         QryCtx *qry_ctx = MT_thread_get_qry_ctx();
    2407             : 
    2408          43 :         TRC_DEBUG(ALGO,
    2409             :                           "pcrejoin(l=%s#" BUNFMT "[%s]%s%s,"
    2410             :                           "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
    2411             :                           "sr=%s#" BUNFMT "%s%s)\n",
    2412             :                           BATgetId(l), BATcount(l), ATOMname(l->ttype),
    2413             :                           l->tsorted ? "-sorted" : "",
    2414             :                           l->trevsorted ? "-revsorted" : "",
    2415             :                           BATgetId(r), BATcount(r), ATOMname(r->ttype),
    2416             :                           r->tsorted ? "-sorted" : "",
    2417             :                           r->trevsorted ? "-revsorted" : "",
    2418             :                           sl ? BATgetId(sl) : "NULL", sl ? BATcount(sl) : 0,
    2419             :                           sl && sl->tsorted ? "-sorted" : "",
    2420             :                           sl && sl->trevsorted ? "-revsorted" : "",
    2421             :                           sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
    2422             :                           sr && sr->tsorted ? "-sorted" : "",
    2423             :                           sr && sr->trevsorted ? "-revsorted" : "");
    2424             : 
    2425         129 :         assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
    2426          43 :         assert(ATOMtype(l->ttype) == TYPE_str);
    2427             : 
    2428          43 :         canditer_init(&lci, l, sl);
    2429          43 :         canditer_init(&rci, r, sr);
    2430             : 
    2431          43 :         BATiter li = bat_iterator(l);
    2432          43 :         BATiter ri = bat_iterator(r);
    2433          43 :         lbase = l->hseqbase;
    2434          43 :         rbase = r->hseqbase;
    2435          43 :         lvals = (const char *) li.base;
    2436          43 :         rvals = (const char *) ri.base;
    2437          43 :         assert(ri.vh && r->ttype);
    2438          43 :         lvars = li.vh->base;
    2439          43 :         rvars = ri.vh->base;
    2440             : 
    2441          43 :         r1->tkey = true;
    2442          43 :         r1->tsorted = true;
    2443          43 :         r1->trevsorted = true;
    2444          43 :         r1->tnil = false;
    2445          43 :         r1->tnonil = true;
    2446          43 :         if (r2) {
    2447          26 :                 r2->tkey = true;
    2448          26 :                 r2->tsorted = true;
    2449          26 :                 r2->trevsorted = true;
    2450          26 :                 r2->tnil = false;
    2451          26 :                 r2->tnonil = true;
    2452             :         }
    2453             : 
    2454          43 :         if (anti) {
    2455          23 :                 if (caseignore) {
    2456         123 :                         pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) == 0 : mywstrcasecmp(vl, wpat) == 0,
    2457             :                                                    re_match_ignore(vl, re), !PCRE_EXEC_COND);
    2458             :                 } else {
    2459         328 :                         pcre_join_loop(strcmp(vl, vr) == 0, re_match_no_ignore(vl, re), !PCRE_EXEC_COND);
    2460             :                 }
    2461             :         } else {
    2462          20 :                 if (caseignore) {
    2463           5 :                         pcre_join_loop(ascii_pattern ? istrcmp(vl, vr) != 0 : mywstrcasecmp(vl, wpat) != 0,
    2464             :                                                    !re_match_ignore(vl, re), PCRE_EXEC_COND);
    2465             :                 } else {
    2466         381 :                         pcre_join_loop(strcmp(vl, vr) != 0, !re_match_no_ignore(vl, re), PCRE_EXEC_COND);
    2467             :                 }
    2468             :         }
    2469          43 :         bat_iterator_end(&li);
    2470          43 :         bat_iterator_end(&ri);
    2471             : 
    2472          43 :         assert(!r2 || BATcount(r1) == BATcount(r2));
    2473             :         /* also set other bits of heap to correct value to indicate size */
    2474          43 :         BATsetcount(r1, BATcount(r1));
    2475          43 :         if (r2)
    2476          26 :                 BATsetcount(r2, BATcount(r2));
    2477          43 :         if (BATcount(r1) > 0) {
    2478          30 :                 if (BATtdense(r1))
    2479           7 :                         r1->tseqbase = ((oid *) r1->theap->base)[0];
    2480          30 :                 if (r2 && BATtdense(r2))
    2481          14 :                         r2->tseqbase = ((oid *) r2->theap->base)[0];
    2482             :         } else {
    2483          13 :                 r1->tseqbase = 0;
    2484          13 :                 if (r2)
    2485           6 :                         r2->tseqbase = 0;
    2486             :         }
    2487          20 :         if (r2)
    2488          26 :                 TRC_DEBUG(ALGO,
    2489             :                                   "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s,%s#" BUNFMT "%s%s\n",
    2490             :                                   BATgetId(l), BATgetId(r),
    2491             :                                   BATgetId(r1), BATcount(r1),
    2492             :                                   r1->tsorted ? "-sorted" : "",
    2493             :                                   r1->trevsorted ? "-revsorted" : "",
    2494             :                                   BATgetId(r2), BATcount(r2),
    2495             :                                   r2->tsorted ? "-sorted" : "",
    2496             :                                   r2->trevsorted ? "-revsorted" : "");
    2497             :         else
    2498          17 :                 TRC_DEBUG(ALGO,
    2499             :                                   "pcrejoin(l=%s,r=%s)=(%s#" BUNFMT "%s%s\n",
    2500             :                                   BATgetId(l), BATgetId(r),
    2501             :                                   BATgetId(r1), BATcount(r1),
    2502             :                                   r1->tsorted ? "-sorted" : "",
    2503             :                                   r1->trevsorted ? "-revsorted" : "");
    2504             :         return MAL_SUCCEED;
    2505             : 
    2506           0 :   bailout:
    2507           0 :         bat_iterator_end(&li);
    2508           0 :         bat_iterator_end(&ri);
    2509           0 :         GDKfree(pcrepat);
    2510           0 :         re_like_clean(&re, &wpat);
    2511           0 :         pcre_clean(&pcrere, &pcreex);
    2512           0 :         assert(msg != MAL_SUCCEED);
    2513             :         return msg;
    2514             : }
    2515             : 
    2516             : static str
    2517          43 : PCREjoin(bat *r1, bat *r2, bat lid, bat rid, bat slid, bat srid, bat elid,
    2518             :                  bat ciid, bit anti)
    2519             : {
    2520          43 :         BAT *left = NULL, *right = NULL, *escape = NULL, *caseignore = NULL,
    2521          43 :                 *candleft = NULL, *candright = NULL;
    2522          43 :         BAT *result1 = NULL, *result2 = NULL;
    2523          43 :         char *msg = MAL_SUCCEED;
    2524          43 :         const char *esc = "";
    2525          43 :         bit ci;
    2526          43 :         BATiter bi;
    2527             : 
    2528          43 :         if ((left = BATdescriptor(lid)) == NULL)
    2529           0 :                 goto fail;
    2530          43 :         if ((right = BATdescriptor(rid)) == NULL)
    2531           0 :                 goto fail;
    2532          43 :         if ((escape = BATdescriptor(elid)) == NULL)
    2533           0 :                 goto fail;
    2534          43 :         if ((caseignore = BATdescriptor(ciid)) == NULL)
    2535           0 :                 goto fail;
    2536          43 :         if (!is_bat_nil(slid) && (candleft = BATdescriptor(slid)) == NULL)
    2537           0 :                 goto fail;
    2538          43 :         if (!is_bat_nil(srid) && (candright = BATdescriptor(srid)) == NULL)
    2539           0 :                 goto fail;
    2540          43 :         result1 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2541          43 :         if (r2)
    2542          26 :                 result2 = COLnew(0, TYPE_oid, BATcount(left), TRANSIENT);
    2543          43 :         if (!result1 || (r2 && !result2)) {
    2544           0 :                 msg = createException(MAL, "pcre.join",
    2545             :                                                           SQLSTATE(HY013) MAL_MALLOC_FAIL);
    2546           0 :                 goto fail;
    2547             :         }
    2548          43 :         result1->tnil = false;
    2549          43 :         result1->tnonil = true;
    2550          43 :         result1->tkey = true;
    2551          43 :         result1->tsorted = true;
    2552          43 :         result1->trevsorted = true;
    2553          43 :         result1->tseqbase = 0;
    2554          43 :         if (r2) {
    2555          26 :                 result2->tnil = false;
    2556          26 :                 result2->tnonil = true;
    2557          26 :                 result2->tkey = true;
    2558          26 :                 result2->tsorted = true;
    2559          26 :                 result2->trevsorted = true;
    2560          26 :                 result2->tseqbase = 0;
    2561             :         }
    2562          43 :         if (BATcount(escape) != 1) {
    2563           0 :                 msg = createException(MAL, "pcre.join",
    2564             :                                                           SQLSTATE(42000)
    2565             :                                                           "At the moment, only one value is allowed for the escape input at pcre join");
    2566           0 :                 goto fail;
    2567             :         }
    2568          43 :         if (BATcount(caseignore) != 1) {
    2569           0 :                 msg = createException(MAL, "pcre.join",
    2570             :                                                           SQLSTATE(42000)
    2571             :                                                           "At the moment, only one value is allowed for the case ignore input at pcre join");
    2572           0 :                 goto fail;
    2573             :         }
    2574          43 :         bi = bat_iterator(caseignore);
    2575          43 :         ci = *(bit *) BUNtloc(bi, 0);
    2576          43 :         bat_iterator_end(&bi);
    2577          43 :         bi = bat_iterator(escape);
    2578          43 :         esc = BUNtvar(bi, 0);
    2579          43 :         msg = pcrejoin(result1, result2, left, right, candleft, candright, esc, ci,
    2580             :                                    anti);
    2581          43 :         bat_iterator_end(&bi);
    2582          43 :         if (msg)
    2583           0 :                 goto fail;
    2584          43 :         *r1 = result1->batCacheid;
    2585          43 :         BBPkeepref(result1);
    2586          43 :         if (r2) {
    2587          26 :                 *r2 = result2->batCacheid;
    2588          26 :                 BBPkeepref(result2);
    2589             :         }
    2590          43 :         BBPunfix(left->batCacheid);
    2591          43 :         BBPunfix(right->batCacheid);
    2592          43 :         BBPreclaim(escape);
    2593          43 :         BBPreclaim(caseignore);
    2594          43 :         BBPreclaim(candleft);
    2595          43 :         BBPreclaim(candright);
    2596             :         return MAL_SUCCEED;
    2597             : 
    2598           0 :   fail:
    2599           0 :         BBPreclaim(left);
    2600           0 :         BBPreclaim(right);
    2601           0 :         BBPreclaim(escape);
    2602           0 :         BBPreclaim(caseignore);
    2603           0 :         BBPreclaim(candleft);
    2604           0 :         BBPreclaim(candright);
    2605           0 :         BBPreclaim(result1);
    2606           0 :         BBPreclaim(result2);
    2607           0 :         if (msg)
    2608             :                 return msg;
    2609           0 :         throw(MAL, "pcre.join", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
    2610             : }
    2611             : 
    2612             : static str
    2613          26 : LIKEjoin(bat *r1, bat *r2, const bat *lid, const bat *rid, const bat *elid,
    2614             :                  const bat *cid, const bat *slid, const bat *srid,
    2615             :                  const bit *nil_matches, const lng *estimate, const bit *anti)
    2616             : {
    2617          26 :         (void) nil_matches;
    2618          26 :         (void) estimate;
    2619          26 :         return PCREjoin(r1, r2, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
    2620          26 :                                         *elid, *cid, *anti);
    2621             : }
    2622             : 
    2623             : static str
    2624          17 : LIKEjoin1(bat *r1, const bat *lid, const bat *rid, const bat *elid,
    2625             :                   const bat *cid, const bat *slid, const bat *srid,
    2626             :                   const bit *nil_matches, const lng *estimate, const bit *anti)
    2627             : {
    2628          17 :         (void) nil_matches;
    2629          17 :         (void) estimate;
    2630          17 :         return PCREjoin(r1, NULL, *lid, *rid, slid ? *slid : 0, srid ? *srid : 0,
    2631          17 :                                         *elid, *cid, *anti);
    2632             : }
    2633             : 
    2634             : #include "mel.h"
    2635             : mel_atom pcre_init_atoms[] = {
    2636             :  { .name="pcre", },  { .cmp=NULL }
    2637             : };
    2638             : mel_func pcre_init_funcs[] = {
    2639             :  command("pcre", "index", PCREindex, false, "match a pattern, return matched position (or 0 when not found)", args(1,3, arg("",int),arg("pat",pcre),arg("s",str))),
    2640             :  command("pcre", "match", PCREmatch, false, "Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2641             :  command("pcre", "imatch", PCREimatch, false, "Caseless Perl Compatible Regular Expression pattern matching against a string", args(1,3, arg("",bit),arg("s",str),arg("pat",str))),
    2642             :  command("pcre", "patindex", PCREpatindex, false, "Location of the first POSIX pattern matching against a string", args(1,3, arg("",int),arg("pat",str),arg("s",str))),
    2643             :  command("pcre", "replace", PCREreplace_wrap, false, "Replace _all_ matches of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2644             :  command("pcre", "replace_first", PCREreplacefirst_wrap, false, "Replace _the first_ match of \"pattern\" in \"origin_str\" with \"replacement\".\nParameter \"flags\" accept these flags: 'i', 'm', 's', and 'x'.\n'e': if present, an empty string is considered to be a valid match\n'i': if present, the match operates in case-insensitive mode.\nOtherwise, in case-sensitive mode.\n'm': if present, the match operates in multi-line mode.\n's': if present, the match operates in \"dot-all\"\nThe specifications of the flags can be found in \"man pcreapi\"\nThe flag letters may be repeated.\nNo other letters than 'e', 'i', 'm', 's' and 'x' are allowed in \"flags\".\nReturns the replaced string, or if no matches found, the original string.", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2645             :  command("pcre", "pcre_quote", PCREquote, false, "Return a PCRE pattern string that matches the argument exactly.", args(1,2, arg("",str),arg("s",str))),
    2646             :  command("pcre", "sql2pcre", PCREsql2pcre, false, "Convert a SQL like pattern with the given escape character into a PCRE pattern.", args(1,3, arg("",str),arg("pat",str),arg("esc",str))),
    2647             :  command("str", "replace", PCREreplace_wrap, false, "", args(1,5, arg("",str),arg("origin",str),arg("pat",str),arg("repl",str),arg("flags",str))),
    2648             :  command("batpcre", "replace", PCREreplace_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2649             :  command("batpcre", "replace_first", PCREreplacefirst_bat_wrap, false, "", args(1,5, batarg("",str),batarg("orig",str),arg("pat",str),arg("repl",str),arg("flag",str))),
    2650             :  command("algebra", "like", PCRElike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2651             :  command("algebra", "not_like", PCREnotlike, false, "", args(1,5, arg("",bit),arg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2652             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2653             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2654             :  pattern("batalgebra", "like", BATPCRElike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2655             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),arg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2656             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),arg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2657             :  pattern("batalgebra", "not_like", BATPCREnotlike, false, "", args(1,5, batarg("",bit),batarg("s",str),batarg("pat",str),arg("esc",str),arg("caseignore",bit))),
    2658             :  command("algebra", "likeselect", PCRElikeselect, false, "Select all head values of the first input BAT for which the\ntail value is \"like\" the given (SQL-style) pattern and for\nwhich the head value occurs in the tail of the second input\nBAT.\nInput is a dense-headed BAT, output is a dense-headed BAT with in\nthe tail the head value of the input BAT for which the\nrelationship holds.  The output BAT is sorted on the tail value.", args(1,7, batarg("",oid),batarg("b",str),batarg("s",oid),arg("pat",str),arg("esc",str),arg("caseignore",bit),arg("anti",bit))),
    2659             :  command("algebra", "likejoin", LIKEjoin, false, "Join the string bat L with the pattern bat R\nwith optional candidate lists SL and SR using pattern escape string ESC\nand doing a case sensitive match.\nThe result is two aligned bats with oids of matching rows.", args(2,11, batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
    2660             :  command("algebra", "likejoin", LIKEjoin1, false, "The same as LIKEjoin_esc, but only produce one output", args(1,10,batarg("",oid),batarg("l",str),batarg("r",str),batarg("esc",str),batarg("caseignore",bit),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng), arg("anti",bit))),
    2661             :  { .imp=NULL }
    2662             : };
    2663             : #include "mal_import.h"
    2664             : #ifdef _MSC_VER
    2665             : #undef read
    2666             : #pragma section(".CRT$XCU",read)
    2667             : #endif
    2668         334 : LIB_STARTUP_FUNC(init_pcre_mal)
    2669         334 : { mal_module("pcre", pcre_init_atoms, pcre_init_funcs); }

Generated by: LCOV version 1.14