LCOV - code coverage report
Current view: top level - sql/server - sql_scan.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 1038 1096 94.7 %
Date: 2024-04-25 21:43:30 Functions: 26 26 100.0 %

          Line data    Source code
       1             : /*
       2             :  * SPDX-License-Identifier: MPL-2.0
       3             :  *
       4             :  * This Source Code Form is subject to the terms of the Mozilla Public
       5             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       6             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       7             :  *
       8             :  * Copyright 2024 MonetDB Foundation;
       9             :  * Copyright August 2008 - 2023 MonetDB B.V.;
      10             :  * Copyright 1997 - July 2008 CWI.
      11             :  */
      12             : 
      13             : #include "monetdb_config.h"
      14             : #include <wctype.h>
      15             : #include "sql_mem.h"
      16             : #include "sql_scan.h"
      17             : #include "sql_types.h"
      18             : #include "sql_symbol.h"
      19             : #include "sql_mvc.h"
      20             : #include "sql_parser.tab.h"
      21             : #include "sql_semantic.h"
      22             : #include "sql_parser.h"               /* for sql_error() */
      23             : 
      24             : #include "stream.h"
      25             : #include "mapi_prompt.h"
      26             : #include <unistd.h>
      27             : #include <string.h>
      28             : #include <ctype.h>
      29             : #include "sql_keyword.h"
      30             : 
      31             : /**
      32             :  * Returns a copy of query (allocated from sa; NULL if that fails) in which -- comments that start an output line and a leading multi-line block comment are removed, other comments are kept, and runs of blanks or newlines outside quotes and comments are collapsed.
      33             :  */
      34             : char *
      35      397869 : query_cleaned(allocator *sa, const char *query)
      36             : {
      37      397869 :         char *q, *r, *c = NULL; // r: start of output, q: write cursor, c: output position where the latest block comment began
      38      397869 :         int lines = 0; // newlines written to the output plus newlines seen inside block comments; zeroed when a leading comment is dropped
      39      397869 :         int quote = 0;          /* 0 when outside quotes, else the closing character of the open '..', "..", {..} region */
      40      397869 :         bool bs = false;                /* previous character in a quoted region was an unescaped backslash */
      41      397869 :         bool incomment1 = false;        /* inside traditional C style block comment */
      42      397869 :         bool incomment2 = false;        /* inside comment running from -- to the end of the line */
      43      397869 :         bool inline_comment = false; // the current -- comment follows emitted text on its line and is therefore copied
      44             : 
      45      397869 :         r = SA_NEW_ARRAY(sa, char, strlen(query) + 1); // each loop iteration below stores at most one byte, so input length + 1 is enough
      46      397869 :         if(!r)
      47             :                 return NULL;
      48             : 
      49    67529278 :         for (q = r; *query; query++) {
      50    67131409 :                 if (incomment1) {
      51       16186 :                         if (*query == '/' && query[-1] == '*') { // NOTE(review): query[-1] can be the star of the opening slash-star, so slash-star-slash already ends the comment - confirm this is intended
      52         234 :                                 incomment1 = false;
      53         234 :                                 if (c == r && lines > 0) { // comment opened before any output was written and a newline has been counted
      54         226 :                                         q = r; // rewind the write cursor: the comment copied so far is discarded
      55         226 :                                         lines = 0;
      56         226 :                                         continue; // also skips copying the closing slash
      57             :                                 }
      58             :                         }
      59       15960 :                         if (*query == '\n') lines++;
      60       15960 :                         *q++ = *query; // block comment text is copied; only the rewind above removes it again
      61    67115223 :                 } else if (incomment2) {
      62      831781 :                         if (*query == '\n') {
      63        2889 :                                 incomment2 = false;
      64        2889 :                                 inline_comment = false;
      65             :                                 /* add newline only if comment doesn't
      66             :                                  * occupy whole line */
      67        2889 :                                 if (q > r && q[-1] != '\n'){
      68         999 :                                         *q++ = '\n';
      69         999 :                                         lines++;
      70             :                                 }
      71      828892 :                         } else if (inline_comment){
      72       24710 :                                 *q++ = *query; // in-line comment text is kept verbatim
      73             :                         }
      74    66283442 :                 } else if (quote) {
      75    22198724 :                         if (bs) {
      76             :                                 bs = false; // this character was escaped, so it cannot close the quote
      77    22195457 :                         } else if (*query == '\\') {
      78             :                                 bs = true;
      79    22192190 :                         } else if (*query == quote) {
      80      668201 :                                 quote = 0;
      81             :                         }
      82    22198724 :                         *q++ = *query; // quoted text is copied unchanged
      83    44084718 :                 } else if (*query == '"' || *query == '\'') {
      84      667739 :                         quote = *query;
      85      667739 :                         *q++ = *query;
      86    43416979 :                 } else if (*query == '{') {
      87         507 :                         quote = '}'; // a brace-delimited region is handled like a quoted string
      88         507 :                         *q++ = *query;
      89    43416472 :                 } else if (*query == '-' && query[1] == '-') {
      90        2889 :                         if (q > r && q[-1] != '\n') { // anything already written on this output line, even one collapsed blank, makes the comment in-line
      91         999 :                                 inline_comment = true;
      92         999 :                                 *q++ = *query; // copy the first dash; the rest is copied by the incomment2 branch
      93             :                         }
      94             :                         incomment2 = true; // when not in-line, nothing of the comment is copied
      95    43413583 :                 } else if (*query == '/' && query[1] == '*') {
      96         234 :                         incomment1 = true;
      97         234 :                         c = q; // remember where this comment starts in the output
      98         234 :                         *q++ = *query;
      99    43413349 :                 } else if (*query == '\n') {
     100             :                         /* collapse newlines; none is written at the very start of the output */
     101      864343 :                         if (q > r && q[-1] != '\n') {
     102      822272 :                                 *q++ = '\n';
     103      822272 :                                 lines++;
     104             :                         }
     105    42549006 :                 } else if (*query == ' ' || *query == '\t') {
     106             :                         /* collapse white space; a tab is written as a blank */
     107     7012814 :                         if (q > r && q[-1] != ' ')
     108     5523765 :                                 *q++ = ' ';
     109             :                 } else {
     110    35536192 :                         *q++ = *query;
     111             :                 }
     112             :         }
     113      397869 :         *q = 0;
     114      397869 :         return r;
     115             : }
     116             : 
     117             : int
     118         341 : scanner_init_keywords(void)
     119             : {
     120         341 :         int failed = 0;
     121             : 
     122         341 :         failed += keywords_insert("false", BOOL_FALSE);
     123         341 :         failed += keywords_insert("true", BOOL_TRUE);
     124         341 :         failed += keywords_insert("bool", sqlBOOL);
     125             : 
     126         341 :         failed += keywords_insert("ALTER", ALTER);
     127         341 :         failed += keywords_insert("ADD", ADD);
     128         341 :         failed += keywords_insert("AND", AND);
     129             : 
     130         341 :         failed += keywords_insert("RANK", RANK);
     131         341 :         failed += keywords_insert("DENSE_RANK", RANK);
     132         341 :         failed += keywords_insert("PERCENT_RANK", RANK);
     133         341 :         failed += keywords_insert("CUME_DIST", RANK);
     134         341 :         failed += keywords_insert("ROW_NUMBER", RANK);
     135         341 :         failed += keywords_insert("NTILE", RANK);
     136         341 :         failed += keywords_insert("LAG", RANK);
     137         341 :         failed += keywords_insert("LEAD", RANK);
     138         341 :         failed += keywords_insert("FETCH", FETCH);
     139         341 :         failed += keywords_insert("FIRST_VALUE", RANK);
     140         341 :         failed += keywords_insert("LAST_VALUE", RANK);
     141         341 :         failed += keywords_insert("NTH_VALUE", RANK);
     142             : 
     143         341 :         failed += keywords_insert("BEST", BEST);
     144         341 :         failed += keywords_insert("EFFORT", EFFORT);
     145             : 
     146         341 :         failed += keywords_insert("AS", AS);
     147         341 :         failed += keywords_insert("ASC", ASC);
     148         341 :         failed += keywords_insert("AUTHORIZATION", AUTHORIZATION);
     149         341 :         failed += keywords_insert("BETWEEN", BETWEEN);
     150         341 :         failed += keywords_insert("SYMMETRIC", SYMMETRIC);
     151         341 :         failed += keywords_insert("ASYMMETRIC", ASYMMETRIC);
     152         341 :         failed += keywords_insert("BY", BY);
     153         341 :         failed += keywords_insert("CAST", CAST);
     154         341 :         failed += keywords_insert("CONVERT", CONVERT);
     155         341 :         failed += keywords_insert("CHARACTER", CHARACTER);
     156         341 :         failed += keywords_insert("CHAR", CHARACTER);
     157         341 :         failed += keywords_insert("VARYING", VARYING);
     158         341 :         failed += keywords_insert("VARCHAR", VARCHAR);
     159         341 :         failed += keywords_insert("BINARY", BINARY);
     160         341 :         failed += keywords_insert("LARGE", LARGE);
     161         341 :         failed += keywords_insert("OBJECT", OBJECT);
     162         341 :         failed += keywords_insert("CLOB", CLOB);
     163         341 :         failed += keywords_insert("BLOB", sqlBLOB);
     164         341 :         failed += keywords_insert("TEXT", sqlTEXT);
     165         341 :         failed += keywords_insert("TINYTEXT", sqlTEXT);
     166         341 :         failed += keywords_insert("STRING", CLOB);    /* ? */
     167         341 :         failed += keywords_insert("CHECK", CHECK);
     168         341 :         failed += keywords_insert("CLIENT", CLIENT);
     169         341 :         failed += keywords_insert("SERVER", SERVER);
     170         341 :         failed += keywords_insert("COMMENT", COMMENT);
     171         341 :         failed += keywords_insert("CONSTRAINT", CONSTRAINT);
     172         341 :         failed += keywords_insert("CREATE", CREATE);
     173         341 :         failed += keywords_insert("CROSS", CROSS);
     174         341 :         failed += keywords_insert("COPY", COPY);
     175         341 :         failed += keywords_insert("RECORDS", RECORDS);
     176         341 :         failed += keywords_insert("DELIMITERS", DELIMITERS);
     177         341 :         failed += keywords_insert("STDIN", STDIN);
     178         341 :         failed += keywords_insert("STDOUT", STDOUT);
     179             : 
     180         341 :         failed += keywords_insert("TINYINT", TINYINT);
     181         341 :         failed += keywords_insert("SMALLINT", SMALLINT);
     182         341 :         failed += keywords_insert("INTEGER", sqlINTEGER);
     183         341 :         failed += keywords_insert("INT", sqlINTEGER);
     184         341 :         failed += keywords_insert("MEDIUMINT", sqlINTEGER);
     185         341 :         failed += keywords_insert("BIGINT", BIGINT);
     186             : #ifdef HAVE_HGE
     187         341 :         failed += keywords_insert("HUGEINT", HUGEINT);
     188             : #endif
     189         341 :         failed += keywords_insert("DEC", sqlDECIMAL);
     190         341 :         failed += keywords_insert("DECIMAL", sqlDECIMAL);
     191         341 :         failed += keywords_insert("NUMERIC", sqlDECIMAL);
     192         341 :         failed += keywords_insert("DECLARE", DECLARE);
     193         341 :         failed += keywords_insert("DEFAULT", DEFAULT);
     194         341 :         failed += keywords_insert("DESC", DESC);
     195         341 :         failed += keywords_insert("DISTINCT", DISTINCT);
     196         341 :         failed += keywords_insert("DOUBLE", sqlDOUBLE);
     197         341 :         failed += keywords_insert("REAL", sqlREAL);
     198         341 :         failed += keywords_insert("DROP", DROP);
     199         341 :         failed += keywords_insert("ESCAPE", ESCAPE);
     200         341 :         failed += keywords_insert("EXISTS", EXISTS);
     201         341 :         failed += keywords_insert("UESCAPE", UESCAPE);
     202         341 :         failed += keywords_insert("EXTRACT", EXTRACT);
     203         341 :         failed += keywords_insert("FLOAT", sqlFLOAT);
     204         341 :         failed += keywords_insert("FOR", FOR);
     205         341 :         failed += keywords_insert("FOREIGN", FOREIGN);
     206         341 :         failed += keywords_insert("FROM", FROM);
     207         341 :         failed += keywords_insert("FWF", FWF);
     208             : 
     209         341 :         failed += keywords_insert("BIG", BIG);
     210         341 :         failed += keywords_insert("LITTLE", LITTLE);
     211         341 :         failed += keywords_insert("NATIVE", NATIVE);
     212         341 :         failed += keywords_insert("ENDIAN", ENDIAN);
     213             : 
     214         341 :         failed += keywords_insert("REFERENCES", REFERENCES);
     215             : 
     216         341 :         failed += keywords_insert("MATCH", MATCH);
     217         341 :         failed += keywords_insert("FULL", FULL);
     218         341 :         failed += keywords_insert("PARTIAL", PARTIAL);
     219         341 :         failed += keywords_insert("SIMPLE", SIMPLE);
     220             : 
     221         341 :         failed += keywords_insert("INSERT", INSERT);
     222         341 :         failed += keywords_insert("UPDATE", UPDATE);
     223         341 :         failed += keywords_insert("DELETE", sqlDELETE);
     224         341 :         failed += keywords_insert("TRUNCATE", TRUNCATE);
     225         341 :         failed += keywords_insert("MATCHED", MATCHED);
     226             : 
     227         341 :         failed += keywords_insert("ACTION", ACTION);
     228         341 :         failed += keywords_insert("CASCADE", CASCADE);
     229         341 :         failed += keywords_insert("RESTRICT", RESTRICT);
     230         341 :         failed += keywords_insert("FIRST", FIRST);
     231         341 :         failed += keywords_insert("GLOBAL", GLOBAL);
     232         341 :         failed += keywords_insert("GROUP", sqlGROUP);
     233         341 :         failed += keywords_insert("GROUPING", GROUPING);
     234         341 :         failed += keywords_insert("ROLLUP", ROLLUP);
     235         341 :         failed += keywords_insert("CUBE", CUBE);
     236         341 :         failed += keywords_insert("HAVING", HAVING);
     237         341 :         failed += keywords_insert("ILIKE", ILIKE);
     238         341 :         failed += keywords_insert("IMPRINTS", IMPRINTS);
     239         341 :         failed += keywords_insert("IN", sqlIN);
     240         341 :         failed += keywords_insert("INNER", INNER);
     241         341 :         failed += keywords_insert("INTO", INTO);
     242         341 :         failed += keywords_insert("IS", IS);
     243         341 :         failed += keywords_insert("JOIN", JOIN);
     244         341 :         failed += keywords_insert("KEY", KEY);
     245         341 :         failed += keywords_insert("LATERAL", LATERAL);
     246         341 :         failed += keywords_insert("LEFT", LEFT);
     247         341 :         failed += keywords_insert("LIKE", LIKE);
     248         341 :         failed += keywords_insert("LIMIT", LIMIT);
     249         341 :         failed += keywords_insert("SAMPLE", SAMPLE);
     250         341 :         failed += keywords_insert("SEED", SEED);
     251         341 :         failed += keywords_insert("LAST", LAST);
     252         341 :         failed += keywords_insert("LOCAL", LOCAL);
     253         341 :         failed += keywords_insert("NATURAL", NATURAL);
     254         341 :         failed += keywords_insert("NOT", NOT);
     255         341 :         failed += keywords_insert("NULL", sqlNULL);
     256         341 :         failed += keywords_insert("NULLS", NULLS);
     257         341 :         failed += keywords_insert("OFFSET", OFFSET);
     258         341 :         failed += keywords_insert("ON", ON);
     259         341 :         failed += keywords_insert("OPTIONS", OPTIONS);
     260         341 :         failed += keywords_insert("OPTION", OPTION);
     261         341 :         failed += keywords_insert("OR", OR);
     262         341 :         failed += keywords_insert("ORDER", ORDER);
     263         341 :         failed += keywords_insert("ORDERED", ORDERED);
     264         341 :         failed += keywords_insert("OUTER", OUTER);
     265         341 :         failed += keywords_insert("OVER", OVER);
     266         341 :         failed += keywords_insert("PARTITION", PARTITION);
     267         341 :         failed += keywords_insert("PATH", PATH);
     268         341 :         failed += keywords_insert("PRECISION", PRECISION);
     269         341 :         failed += keywords_insert("PRIMARY", PRIMARY);
     270             : 
     271         341 :         failed += keywords_insert("USER", USER);
     272         341 :         failed += keywords_insert("RENAME", RENAME);
     273         341 :         failed += keywords_insert("UNENCRYPTED", UNENCRYPTED);
     274         341 :         failed += keywords_insert("ENCRYPTED", ENCRYPTED);
     275         341 :         failed += keywords_insert("PASSWORD", PASSWORD);
     276         341 :         failed += keywords_insert("GRANT", GRANT);
     277         341 :         failed += keywords_insert("REVOKE", REVOKE);
     278         341 :         failed += keywords_insert("ROLE", ROLE);
     279         341 :         failed += keywords_insert("ADMIN", ADMIN);
     280         341 :         failed += keywords_insert("PRIVILEGES", PRIVILEGES);
     281         341 :         failed += keywords_insert("PUBLIC", PUBLIC);
     282         341 :         failed += keywords_insert("CURRENT_USER", CURRENT_USER);
     283         341 :         failed += keywords_insert("CURRENT_ROLE", CURRENT_ROLE);
     284         341 :         failed += keywords_insert("SESSION_USER", SESSION_USER);
     285         341 :         failed += keywords_insert("CURRENT_SCHEMA", CURRENT_SCHEMA);
     286         341 :         failed += keywords_insert("SESSION", sqlSESSION);
     287         341 :         failed += keywords_insert("MAX_MEMORY", MAX_MEMORY);
     288         341 :         failed += keywords_insert("MAX_WORKERS", MAX_WORKERS);
     289         341 :         failed += keywords_insert("OPTIMIZER", OPTIMIZER);
     290             : 
     291         341 :         failed += keywords_insert("RIGHT", RIGHT);
     292         341 :         failed += keywords_insert("SCHEMA", SCHEMA);
     293         341 :         failed += keywords_insert("SELECT", SELECT);
     294         341 :         failed += keywords_insert("SET", SET);
     295         341 :         failed += keywords_insert("SETS", SETS);
     296         341 :         failed += keywords_insert("AUTO_COMMIT", AUTO_COMMIT);
     297             : 
     298         341 :         failed += keywords_insert("ALL", ALL);
     299         341 :         failed += keywords_insert("ANY", ANY);
     300         341 :         failed += keywords_insert("SOME", SOME);
     301         341 :         failed += keywords_insert("EVERY", ANY);
     302             :         /*
     303             :            failed += keywords_insert("SQLCODE", SQLCODE );
     304             :          */
     305         341 :         failed += keywords_insert("COLUMN", COLUMN);
     306         341 :         failed += keywords_insert("TABLE", TABLE);
     307         341 :         failed += keywords_insert("TEMPORARY", TEMPORARY);
     308         341 :         failed += keywords_insert("TEMP", TEMP);
     309         341 :         failed += keywords_insert("REMOTE", REMOTE);
     310         341 :         failed += keywords_insert("MERGE", MERGE);
     311         341 :         failed += keywords_insert("REPLICA", REPLICA);
     312         341 :         failed += keywords_insert("UNLOGGED", UNLOGGED);
     313         341 :         failed += keywords_insert("TO", TO);
     314         341 :         failed += keywords_insert("UNION", UNION);
     315         341 :         failed += keywords_insert("EXCEPT", EXCEPT);
     316         341 :         failed += keywords_insert("INTERSECT", INTERSECT);
     317         341 :         failed += keywords_insert("CORRESPONDING", CORRESPONDING);
     318         341 :         failed += keywords_insert("UNIQUE", UNIQUE);
     319         341 :         failed += keywords_insert("USING", USING);
     320         341 :         failed += keywords_insert("VALUES", VALUES);
     321         341 :         failed += keywords_insert("VIEW", VIEW);
     322         341 :         failed += keywords_insert("WHERE", WHERE);
     323         341 :         failed += keywords_insert("WITH", WITH);
     324         341 :         failed += keywords_insert("WITHOUT", WITHOUT);
     325         341 :         failed += keywords_insert("DATA", DATA);
     326             : 
     327         341 :         failed += keywords_insert("DATE", sqlDATE);
     328         341 :         failed += keywords_insert("TIME", TIME);
     329         341 :         failed += keywords_insert("TIMESTAMP", TIMESTAMP);
     330         341 :         failed += keywords_insert("INTERVAL", INTERVAL);
     331         341 :         failed += keywords_insert("CURRENT_DATE", CURRENT_DATE);
     332         341 :         failed += keywords_insert("CURRENT_TIME", CURRENT_TIME);
     333         341 :         failed += keywords_insert("CURRENT_TIMESTAMP", CURRENT_TIMESTAMP);
     334         341 :         failed += keywords_insert("CURRENT_TIMEZONE", CURRENT_TIMEZONE);
     335         341 :         failed += keywords_insert("NOW", CURRENT_TIMESTAMP);
     336         341 :         failed += keywords_insert("LOCALTIME", LOCALTIME);
     337         341 :         failed += keywords_insert("LOCALTIMESTAMP", LOCALTIMESTAMP);
     338         341 :         failed += keywords_insert("ZONE", ZONE);
     339             : 
     340         341 :         failed += keywords_insert("CENTURY", CENTURY);
     341         341 :         failed += keywords_insert("DECADE", DECADE);
     342         341 :         failed += keywords_insert("YEAR", YEAR);
     343         341 :         failed += keywords_insert("QUARTER", QUARTER);
     344         341 :         failed += keywords_insert("MONTH", MONTH);
     345         341 :         failed += keywords_insert("WEEK", WEEK);
     346         341 :         failed += keywords_insert("DOW", DOW);
     347         341 :         failed += keywords_insert("DOY", DOY);
     348         341 :         failed += keywords_insert("DAY", DAY);
     349         341 :         failed += keywords_insert("HOUR", HOUR);
     350         341 :         failed += keywords_insert("MINUTE", MINUTE);
     351         341 :         failed += keywords_insert("SECOND", SECOND);
     352         341 :         failed += keywords_insert("EPOCH", EPOCH);
     353             : 
     354         341 :         failed += keywords_insert("POSITION", POSITION);
     355         341 :         failed += keywords_insert("SUBSTRING", SUBSTRING);
     356         341 :         failed += keywords_insert("SPLIT_PART", SPLIT_PART);
     357         341 :         failed += keywords_insert("TRIM", TRIM);
     358         341 :         failed += keywords_insert("LEADING", LEADING);
     359         341 :         failed += keywords_insert("TRAILING", TRAILING);
     360         341 :         failed += keywords_insert("BOTH", BOTH);
     361             : 
     362         341 :         failed += keywords_insert("CASE", CASE);
     363         341 :         failed += keywords_insert("WHEN", WHEN);
     364         341 :         failed += keywords_insert("THEN", THEN);
     365         341 :         failed += keywords_insert("ELSE", ELSE);
     366         341 :         failed += keywords_insert("END", END);
     367         341 :         failed += keywords_insert("NULLIF", NULLIF);
     368         341 :         failed += keywords_insert("COALESCE", COALESCE);
     369         341 :         failed += keywords_insert("ELSEIF", ELSEIF);
     370         341 :         failed += keywords_insert("IF", IF);
     371         341 :         failed += keywords_insert("WHILE", WHILE);
     372         341 :         failed += keywords_insert("DO", DO);
     373             : 
     374         341 :         failed += keywords_insert("COMMIT", COMMIT);
     375         341 :         failed += keywords_insert("ROLLBACK", ROLLBACK);
     376         341 :         failed += keywords_insert("SAVEPOINT", SAVEPOINT);
     377         341 :         failed += keywords_insert("RELEASE", RELEASE);
     378         341 :         failed += keywords_insert("WORK", WORK);
     379         341 :         failed += keywords_insert("CHAIN", CHAIN);
     380         341 :         failed += keywords_insert("PRESERVE", PRESERVE);
     381         341 :         failed += keywords_insert("ROWS", ROWS);
     382         341 :         failed += keywords_insert("NO", NO);
     383         341 :         failed += keywords_insert("START", START);
     384         341 :         failed += keywords_insert("TRANSACTION", TRANSACTION);
     385         341 :         failed += keywords_insert("READ", READ);
     386         341 :         failed += keywords_insert("WRITE", WRITE);
     387         341 :         failed += keywords_insert("ONLY", ONLY);
     388         341 :         failed += keywords_insert("ISOLATION", ISOLATION);
     389         341 :         failed += keywords_insert("LEVEL", LEVEL);
     390         341 :         failed += keywords_insert("UNCOMMITTED", UNCOMMITTED);
     391         341 :         failed += keywords_insert("COMMITTED", COMMITTED);
     392         341 :         failed += keywords_insert("REPEATABLE", sqlREPEATABLE);
     393         341 :         failed += keywords_insert("SNAPSHOT", SNAPSHOT);
     394         341 :         failed += keywords_insert("SERIALIZABLE", SERIALIZABLE);
     395         341 :         failed += keywords_insert("DIAGNOSTICS", DIAGNOSTICS);
     396         341 :         failed += keywords_insert("SIZE", sqlSIZE);
     397         341 :         failed += keywords_insert("STORAGE", STORAGE);
     398             : 
     399         341 :         failed += keywords_insert("TYPE", TYPE);
     400         341 :         failed += keywords_insert("PROCEDURE", PROCEDURE);
     401         341 :         failed += keywords_insert("FUNCTION", FUNCTION);
     402         341 :         failed += keywords_insert("LOADER", sqlLOADER);
     403         341 :         failed += keywords_insert("REPLACE", REPLACE);
     404             : 
     405         341 :         failed += keywords_insert("FIELD", FIELD);
     406         341 :         failed += keywords_insert("FILTER", FILTER);
     407         341 :         failed += keywords_insert("AGGREGATE", AGGREGATE);
     408         341 :         failed += keywords_insert("RETURNS", RETURNS);
     409         341 :         failed += keywords_insert("EXTERNAL", EXTERNAL);
     410         341 :         failed += keywords_insert("NAME", sqlNAME);
     411         341 :         failed += keywords_insert("RETURN", RETURN);
     412         341 :         failed += keywords_insert("CALL", CALL);
     413         341 :         failed += keywords_insert("LANGUAGE", LANGUAGE);
     414             : 
     415         341 :         failed += keywords_insert("ANALYZE", ANALYZE);
     416         341 :         failed += keywords_insert("MINMAX", MINMAX);
     417         341 :         failed += keywords_insert("EXPLAIN", SQL_EXPLAIN);
     418         341 :         failed += keywords_insert("PLAN", SQL_PLAN);
     419         341 :         failed += keywords_insert("TRACE", SQL_TRACE);
     420         341 :         failed += keywords_insert("PREPARE", PREPARE);
     421         341 :         failed += keywords_insert("PREP", PREP);
     422         341 :         failed += keywords_insert("EXECUTE", EXECUTE);
     423         341 :         failed += keywords_insert("EXEC", EXEC);
     424         341 :         failed += keywords_insert("DEALLOCATE", DEALLOCATE);
     425             : 
     426         341 :         failed += keywords_insert("INDEX", INDEX);
     427             : 
     428         341 :         failed += keywords_insert("SEQUENCE", SEQUENCE);
     429         341 :         failed += keywords_insert("RESTART", RESTART);
     430         341 :         failed += keywords_insert("INCREMENT", INCREMENT);
     431         341 :         failed += keywords_insert("MAXVALUE", MAXVALUE);
     432         341 :         failed += keywords_insert("MINVALUE", MINVALUE);
     433         341 :         failed += keywords_insert("CYCLE", CYCLE);
     434         341 :         failed += keywords_insert("CACHE", CACHE);
     435         341 :         failed += keywords_insert("NEXT", NEXT);
     436         341 :         failed += keywords_insert("VALUE", VALUE);
     437         341 :         failed += keywords_insert("GENERATED", GENERATED);
     438         341 :         failed += keywords_insert("ALWAYS", ALWAYS);
     439         341 :         failed += keywords_insert("IDENTITY", IDENTITY);
     440         341 :         failed += keywords_insert("SERIAL", SERIAL);
     441         341 :         failed += keywords_insert("BIGSERIAL", BIGSERIAL);
     442         341 :         failed += keywords_insert("AUTO_INCREMENT", AUTO_INCREMENT);
     443         341 :         failed += keywords_insert("CONTINUE", CONTINUE);
     444             : 
     445         341 :         failed += keywords_insert("TRIGGER", TRIGGER);
     446         341 :         failed += keywords_insert("ATOMIC", ATOMIC);
     447         341 :         failed += keywords_insert("BEGIN", BEGIN);
     448         341 :         failed += keywords_insert("OF", OF);
     449         341 :         failed += keywords_insert("BEFORE", BEFORE);
     450         341 :         failed += keywords_insert("AFTER", AFTER);
     451         341 :         failed += keywords_insert("ROW", ROW);
     452         341 :         failed += keywords_insert("STATEMENT", STATEMENT);
     453         341 :         failed += keywords_insert("NEW", sqlNEW);
     454         341 :         failed += keywords_insert("OLD", OLD);
     455         341 :         failed += keywords_insert("EACH", EACH);
     456         341 :         failed += keywords_insert("REFERENCING", REFERENCING);
     457             : 
     458         341 :         failed += keywords_insert("RANGE", RANGE);
     459         341 :         failed += keywords_insert("UNBOUNDED", UNBOUNDED);
     460         341 :         failed += keywords_insert("PRECEDING", PRECEDING);
     461         341 :         failed += keywords_insert("FOLLOWING", FOLLOWING);
     462         341 :         failed += keywords_insert("CURRENT", CURRENT);
     463         341 :         failed += keywords_insert("EXCLUDE", EXCLUDE);
     464         341 :         failed += keywords_insert("OTHERS", OTHERS);
     465         341 :         failed += keywords_insert("TIES", TIES);
     466         341 :         failed += keywords_insert("GROUPS", GROUPS);
     467         341 :         failed += keywords_insert("WINDOW", WINDOW);
     468             : 
     469             :         /* special SQL/XML keywords */
     470         341 :         failed += keywords_insert("XMLCOMMENT", XMLCOMMENT);
     471         341 :         failed += keywords_insert("XMLCONCAT", XMLCONCAT);
     472         341 :         failed += keywords_insert("XMLDOCUMENT", XMLDOCUMENT);
     473         341 :         failed += keywords_insert("XMLELEMENT", XMLELEMENT);
     474         341 :         failed += keywords_insert("XMLATTRIBUTES", XMLATTRIBUTES);
     475         341 :         failed += keywords_insert("XMLFOREST", XMLFOREST);
     476         341 :         failed += keywords_insert("XMLPARSE", XMLPARSE);
     477         341 :         failed += keywords_insert("STRIP", STRIP);
     478         341 :         failed += keywords_insert("WHITESPACE", WHITESPACE);
     479         341 :         failed += keywords_insert("XMLPI", XMLPI);
     480         341 :         failed += keywords_insert("XMLQUERY", XMLQUERY);
     481         341 :         failed += keywords_insert("PASSING", PASSING);
     482         341 :         failed += keywords_insert("XMLTEXT", XMLTEXT);
     483         341 :         failed += keywords_insert("NIL", NIL);
     484         341 :         failed += keywords_insert("REF", REF);
     485         341 :         failed += keywords_insert("ABSENT", ABSENT);
     486         341 :         failed += keywords_insert("DOCUMENT", DOCUMENT);
     487         341 :         failed += keywords_insert("ELEMENT", ELEMENT);
     488         341 :         failed += keywords_insert("CONTENT", CONTENT);
     489         341 :         failed += keywords_insert("XMLNAMESPACES", XMLNAMESPACES);
     490         341 :         failed += keywords_insert("NAMESPACE", NAMESPACE);
     491         341 :         failed += keywords_insert("XMLVALIDATE", XMLVALIDATE);
     492         341 :         failed += keywords_insert("RETURNING", RETURNING);
     493         341 :         failed += keywords_insert("LOCATION", LOCATION);
     494         341 :         failed += keywords_insert("ID", ID);
     495         341 :         failed += keywords_insert("ACCORDING", ACCORDING);
     496         341 :         failed += keywords_insert("XMLSCHEMA", XMLSCHEMA);
     497         341 :         failed += keywords_insert("URI", URI);
     498         341 :         failed += keywords_insert("XMLAGG", XMLAGG);
     499             : 
     500             :         /* keywords for opengis */
     501         341 :         failed += keywords_insert("GEOMETRY", GEOMETRY);
     502             : 
     503         341 :         failed += keywords_insert("POINT", GEOMETRYSUBTYPE);
     504         341 :         failed += keywords_insert("LINESTRING", GEOMETRYSUBTYPE);
     505         341 :         failed += keywords_insert("POLYGON", GEOMETRYSUBTYPE);
     506         341 :         failed += keywords_insert("MULTIPOINT", GEOMETRYSUBTYPE);
     507         341 :         failed += keywords_insert("MULTILINESTRING", GEOMETRYSUBTYPE);
     508         341 :         failed += keywords_insert("MULTIPOLYGON", GEOMETRYSUBTYPE);
     509         341 :         failed += keywords_insert("GEOMETRYCOLLECTION", GEOMETRYSUBTYPE);
     510             : 
     511         341 :         failed += keywords_insert("POINTZ", GEOMETRYSUBTYPE);
     512         341 :         failed += keywords_insert("LINESTRINGZ", GEOMETRYSUBTYPE);
     513         341 :         failed += keywords_insert("POLYGONZ", GEOMETRYSUBTYPE);
     514         341 :         failed += keywords_insert("MULTIPOINTZ", GEOMETRYSUBTYPE);
     515         341 :         failed += keywords_insert("MULTILINESTRINGZ", GEOMETRYSUBTYPE);
     516         341 :         failed += keywords_insert("MULTIPOLYGONZ", GEOMETRYSUBTYPE);
     517         341 :         failed += keywords_insert("GEOMETRYCOLLECTIONZ", GEOMETRYSUBTYPE);
     518             : 
     519         341 :         failed += keywords_insert("POINTM", GEOMETRYSUBTYPE);
     520         341 :         failed += keywords_insert("LINESTRINGM", GEOMETRYSUBTYPE);
     521         341 :         failed += keywords_insert("POLYGONM", GEOMETRYSUBTYPE);
     522         341 :         failed += keywords_insert("MULTIPOINTM", GEOMETRYSUBTYPE);
     523         341 :         failed += keywords_insert("MULTILINESTRINGM", GEOMETRYSUBTYPE);
     524         341 :         failed += keywords_insert("MULTIPOLYGONM", GEOMETRYSUBTYPE);
     525         341 :         failed += keywords_insert("GEOMETRYCOLLECTIONM", GEOMETRYSUBTYPE);
     526             : 
     527         341 :         failed += keywords_insert("POINTZM", GEOMETRYSUBTYPE);
     528         341 :         failed += keywords_insert("LINESTRINGZM", GEOMETRYSUBTYPE);
     529         341 :         failed += keywords_insert("POLYGONZM", GEOMETRYSUBTYPE);
     530         341 :         failed += keywords_insert("MULTIPOINTZM", GEOMETRYSUBTYPE);
     531         341 :         failed += keywords_insert("MULTILINESTRINGZM", GEOMETRYSUBTYPE);
     532         341 :         failed += keywords_insert("MULTIPOLYGONZM", GEOMETRYSUBTYPE);
     533         341 :         failed += keywords_insert("GEOMETRYCOLLECTIONZM", GEOMETRYSUBTYPE);
     534         341 :         failed += keywords_insert("LOGIN", LOGIN);
     535             :         // odbc keywords
     536         341 :         failed += keywords_insert("d", ODBC_DATE_ESCAPE_PREFIX);
     537         341 :         failed += keywords_insert("t", ODBC_TIME_ESCAPE_PREFIX);
     538         341 :         failed += keywords_insert("ts", ODBC_TIMESTAMP_ESCAPE_PREFIX);
     539         341 :         failed += keywords_insert("guid", ODBC_GUID_ESCAPE_PREFIX);
     540         341 :         failed += keywords_insert("fn", ODBC_FUNC_ESCAPE_PREFIX);
     541         341 :         failed += keywords_insert("oj", ODBC_OJ_ESCAPE_PREFIX);
     542         341 :         failed += keywords_insert("DAYNAME", DAYNAME);
     543         341 :         failed += keywords_insert("IFNULL", IFNULL);
     544         341 :         failed += keywords_insert("MONTHNAME", MONTHNAME);
     545         341 :         failed += keywords_insert("TIMESTAMPADD", TIMESTAMPADD);
     546         341 :         failed += keywords_insert("TIMESTAMPDIFF", TIMESTAMPDIFF);
     547         341 :         failed += keywords_insert("SQL_BIGINT", SQL_BIGINT);
     548         341 :         failed += keywords_insert("SQL_BINARY", SQL_BINARY);
     549         341 :         failed += keywords_insert("SQL_BIT", SQL_BIT);
     550         341 :         failed += keywords_insert("SQL_CHAR", SQL_CHAR);
     551         341 :         failed += keywords_insert("SQL_DATE", SQL_DATE);
     552         341 :         failed += keywords_insert("SQL_DECIMAL", SQL_DECIMAL);
     553         341 :         failed += keywords_insert("SQL_DOUBLE", SQL_DOUBLE);
     554         341 :         failed += keywords_insert("SQL_FLOAT", SQL_FLOAT);
     555         341 :         failed += keywords_insert("SQL_GUID", SQL_GUID);
     556         341 :         failed += keywords_insert("SQL_HUGEINT", SQL_HUGEINT);
     557         341 :         failed += keywords_insert("SQL_INTEGER", SQL_INTEGER);
     558         341 :         failed += keywords_insert("SQL_INTERVAL_DAY", SQL_INTERVAL_DAY);
     559         341 :         failed += keywords_insert("SQL_INTERVAL_DAY_TO_HOUR", SQL_INTERVAL_DAY_TO_HOUR);
     560         341 :         failed += keywords_insert("SQL_INTERVAL_DAY_TO_MINUTE", SQL_INTERVAL_DAY_TO_MINUTE);
     561         341 :         failed += keywords_insert("SQL_INTERVAL_DAY_TO_SECOND", SQL_INTERVAL_DAY_TO_SECOND);
     562         341 :         failed += keywords_insert("SQL_INTERVAL_HOUR", SQL_INTERVAL_HOUR);
     563         341 :         failed += keywords_insert("SQL_INTERVAL_HOUR_TO_MINUTE", SQL_INTERVAL_HOUR_TO_MINUTE);
     564         341 :         failed += keywords_insert("SQL_INTERVAL_HOUR_TO_SECOND", SQL_INTERVAL_HOUR_TO_SECOND);
     565         341 :         failed += keywords_insert("SQL_INTERVAL_MINUTE", SQL_INTERVAL_MINUTE);
     566         341 :         failed += keywords_insert("SQL_INTERVAL_MINUTE_TO_SECOND", SQL_INTERVAL_MINUTE_TO_SECOND);
     567         341 :         failed += keywords_insert("SQL_INTERVAL_MONTH", SQL_INTERVAL_MONTH);
     568         341 :         failed += keywords_insert("SQL_INTERVAL_SECOND", SQL_INTERVAL_SECOND);
     569         341 :         failed += keywords_insert("SQL_INTERVAL_YEAR", SQL_INTERVAL_YEAR);
     570         341 :         failed += keywords_insert("SQL_INTERVAL_YEAR_TO_MONTH", SQL_INTERVAL_YEAR_TO_MONTH);
     571         341 :         failed += keywords_insert("SQL_LONGVARBINARY", SQL_LONGVARBINARY);
     572         341 :         failed += keywords_insert("SQL_LONGVARCHAR", SQL_LONGVARCHAR);
     573         341 :         failed += keywords_insert("SQL_NUMERIC", SQL_NUMERIC);
     574         341 :         failed += keywords_insert("SQL_REAL", SQL_REAL);
     575         341 :         failed += keywords_insert("SQL_SMALLINT", SQL_SMALLINT);
     576         341 :         failed += keywords_insert("SQL_TIME", SQL_TIME);
     577         341 :         failed += keywords_insert("SQL_TIMESTAMP", SQL_TIMESTAMP);
     578         341 :         failed += keywords_insert("SQL_TINYINT", SQL_TINYINT);
     579         341 :         failed += keywords_insert("SQL_VARBINARY", SQL_VARBINARY);
     580         341 :         failed += keywords_insert("SQL_VARCHAR", SQL_VARCHAR);
     581         341 :         failed += keywords_insert("SQL_WCHAR", SQL_WCHAR);
     582         341 :         failed += keywords_insert("SQL_WLONGVARCHAR", SQL_WLONGVARCHAR);
     583         341 :         failed += keywords_insert("SQL_WVARCHAR", SQL_WVARCHAR);
     584         341 :         failed += keywords_insert("SQL_TSI_FRAC_SECOND", SQL_TSI_FRAC_SECOND);
     585         341 :         failed += keywords_insert("SQL_TSI_SECOND", SQL_TSI_SECOND);
     586         341 :         failed += keywords_insert("SQL_TSI_MINUTE", SQL_TSI_MINUTE);
     587         341 :         failed += keywords_insert("SQL_TSI_HOUR", SQL_TSI_HOUR);
     588         341 :         failed += keywords_insert("SQL_TSI_DAY", SQL_TSI_DAY);
     589         341 :         failed += keywords_insert("SQL_TSI_WEEK", SQL_TSI_WEEK);
     590         341 :         failed += keywords_insert("SQL_TSI_MONTH", SQL_TSI_MONTH);
     591         341 :         failed += keywords_insert("SQL_TSI_QUARTER", SQL_TSI_QUARTER);
     592         341 :         failed += keywords_insert("SQL_TSI_YEAR", SQL_TSI_YEAR);
     593             : 
     594         341 :         failed += keywords_insert("LEAST", MARGFUNC);
     595         341 :         failed += keywords_insert("GREATEST", MARGFUNC);
     596         341 :         return failed;
     597             : }
     598             : 
     599             : #define find_keyword_bs(lc, s) find_keyword(lc->rs->buf+lc->rs->pos+s) /* keyword lookup on the text that starts s bytes past the current position of lc's input buffer */
     600             : 
     601             : void
     602      246532 : scanner_init(struct scanner *s, bstream *rs, stream *ws)
     603             : {
                               /* Overwrite *s with a fresh scanner that reads from rs and writes
                                * to ws.  Members that are not named in the compound literal
                                * below are zero-initialized by the assignment. */
     604      493064 :         *s = (struct scanner) {
     605             :                 .rs = rs,
     606             :                 .ws = ws,
     607             :                 .mode = LINE_N,
     608      246532 :                 .raw_string_mode = GDKgetenv_istrue("raw_strings"), /* follows the "raw_strings" setting */
     609             :                 .aborted = false,
     610             :         };
     611      246532 : }
     612             : 
     613             : void
     614     1300345 : scanner_query_processed(struct scanner *s)
     615             : {
                               /* Finish the current query: put back the byte saved in yybak,
                                * advance the stream position past the scanned text and any
                                * white space that follows it, and reset the per-query scanner
                                * fields (yycur, started, as, schema). */
     616     1300345 :         int cur;
     617             : 
                               /* scanner_token() saves the byte at the scan position in yybak and
                                * overwrites it with NUL; restore that byte here.
                                * NOTE(review): s->rs is dereferenced in this branch before the
                                * NULL check below -- presumably yybak is only non-zero when rs
                                * is set; confirm. */
     618     1300345 :         if (s->yybak) {
     619      507049 :                 s->rs->buf[s->rs->pos + s->yycur] = s->yybak;
     620      507049 :                 s->yybak = 0;
     621             :         }
     622     1300345 :         if (s->rs) {
     623     1300345 :                 s->rs->pos += s->yycur;
     624             :                 /* completely eat the query including white space after the ; */
     625     2453098 :                 while (s->rs->pos < s->rs->len &&
     626     2124117 :                            (cur = s->rs->buf[s->rs->pos], iswspace(cur))) {
     627     1152753 :                         s->rs->pos++;
     628             :                 }
     629             :         }
     630             :         /*assert(s->rs->pos <= s->rs->len);*/
     631     1300345 :         s->yycur = 0;
     632     1300345 :         s->started = 0;
     633     1300345 :         s->as = 0;
     634     1300345 :         s->schema = NULL;
     635     1300345 : }
     636             : 
     637             : static int
     638          33 : scanner_error(mvc *lc, int cur)
     639             : {
                               /* Report the unexpected input cur through sql_error().
                                * Returns EOF when cur is EOF and LEX_ERROR for any other value. */
     640          33 :         switch (cur) {
     641           0 :         case EOF:
     642           0 :                 (void) sql_error(lc, 1, SQLSTATE(42000) "Unexpected end of input");
     643           0 :                 return EOF;
     644          33 :         default:
     645             :                 /* on Windows at least, iswcntrl returns TRUE for
     646             :                  * U+FEFF, but we just want consistent error
     647             :                  * messages */
     648          33 :                 (void) sql_error(lc, 1, SQLSTATE(42000) "Unexpected%s character (U+%04X)", iswcntrl(cur) && cur != 0xFEFF ? " control" : "", (unsigned) cur);
     649             :         }
     650          33 :         return LEX_ERROR;
     651             : }
     652             : 
     653             : 
     654             : /*
     655             :    UTF-8 encoding is as follows:
     656             : U-00000000 - U-0000007F: 0xxxxxxx
     657             : U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
     658             : U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
     659             : U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     660             : U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     661             : U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     662             : */
     663             : /* To be correctly coded UTF-8, the sequence should be the shortest
     664             :    possible encoding of the value being encoded.  This means that for
     665             :    an encoding of length n+1 (1 <= n <= 5), at least one of the bits in
     666             :    utf8chkmsk[n] should be non-zero (else the encoding could be
     667             :    shorter).
     668             : */
     669             : static const int utf8chkmsk[] = {
                               /* Indexed by the number of continuation bytes n.  scanner_getc()
                                * rejects n == 0 before it consults this table, so it only reads
                                * entries 1 through 5. */
     670             :         0x0000007f,
     671             :         0x00000780,
     672             :         0x0000f800,
     673             :         0x001f0000,
     674             :         0x03e00000,
     675             :         0x7c000000
     676             : };
     677             : 
     678             : static void
     679    31449518 : utf8_putchar(struct scanner *lc, int ch)
     680             : {
                               /* Push the code point ch back onto the input by moving yycur back
                                * by the length of its UTF-8 encoding.  The buffer contents are
                                * not modified.
                                * NOTE(review): at most 4 bytes are rewound, whereas
                                * scanner_getc() accepts sequences of up to 6 bytes --
                                * presumably such code points never reach this function;
                                * confirm. */
     681    31449518 :         if ((ch) < 0x80) {
     682    31449513 :                 lc->yycur--;
     683           5 :         } else if ((ch) < 0x800) {
     684           0 :                 lc->yycur -= 2;
     685           5 :         } else if ((ch) < 0x10000) {
     686           5 :                 lc->yycur -= 3;
     687             :         } else {
     688           0 :                 lc->yycur -= 4;
     689             :         }
     690    31449518 : }
     691             : 
     692             : static inline int
     693   136977415 : scanner_read_more(struct scanner *lc, size_t n)
     694             : {
                               /* Make sure at least n bytes are buffered past the current scan
                                * position (b->pos + lc->yycur), pulling more input from the
                                * stream when necessary.  Returns 1 once the bytes are available
                                * and EOF in every other case. */
     695   136977415 :         bstream *b = lc->rs;
     696   136977415 :         bool more = false; /* set once PROMPT2 has been sent to ask for more input */
     697             : 
     698             : 
                               /* once aborted, the scanner keeps reporting EOF */
     699   136977415 :         if (lc->aborted)
     700             :                 return EOF;
     701   136981650 :         while (b->len < b->pos + lc->yycur + n) {
     702             : 
                                       /* no further reading in LINE_1 mode or while lc->started is
                                        * not set */
     703      136220 :                 if (lc->mode == LINE_1 || !lc->started)
     704             :                         return EOF;
     705             : 
     706             :                 /* query is not finished ask for more */
     707        3649 :                 if (b->eof || !isa_block_stream(b->s)) {
                                               /* a non-zero result from bstream_getoob() aborts the
                                                * scanner for good */
     708        1528 :                         if (bstream_getoob(b)) {
     709           0 :                                 lc->aborted = true;
     710           0 :                                 return EOF;
     711             :                         }
                                               /* send the continuation prompt; flush only when the
                                                * write reported success */
     712        2121 :                         if (mnstr_write(lc->ws, PROMPT2, sizeof(PROMPT2) - 1, 1) == 1)
     713        2121 :                                 mnstr_flush(lc->ws, MNSTR_FLUSH_DATA);
     714        2121 :                         b->eof = false;
     715        2121 :                         more = true;
     716             :                 }
     717             :                 /* we need more query text */
     718        4242 :                 if (bstream_next(b) < 0 ||
     719             :                     /* we asked for more data but didn't get any */
     720        2121 :                     (more && b->eof && b->len < b->pos + lc->yycur + n))
     721             :                         return EOF;
                                       /* The unread input consists of exactly the two bytes '\200'
                                        * and '\n': drop them from the buffer and stop with
                                        * "Query aborted".
                                        * NOTE(review): presumably this pair is the abort marker
                                        * sent by the client -- confirm. */
     722        4235 :                 if (more && b->pos + lc->yycur + 2 == b->len && b->buf[b->pos + lc->yycur] == '\200' && b->buf[b->pos + lc->yycur + 1] == '\n') {
     723           0 :                         lc->errstr = "Query aborted";
     724           0 :                         b->len -= 2;
     725           0 :                         b->buf[b->len] = 0;
     726           0 :                         return EOF;
     727             :                 }
     728             :         }
     729             :         return 1;
     730             : }
     731             : 
     732             : static inline int
     733   135727952 : scanner_getc(struct scanner *lc)
     734             : {
                               /* Read the next UTF-8 encoded code point from the input and
                                * advance yycur past it.  Returns the decoded code point, or EOF
                                * when no input is available or the encoding is malformed; in the
                                * malformed case lc->errstr describes the problem. */
     735   135727952 :         bstream *b = lc->rs;
     736   135727952 :         unsigned char *s = NULL;
     737   135727952 :         int c, m, n, mask;
     738             : 
     739   135727952 :         if (scanner_read_more(lc, 1) == EOF) {
     740             :                 //lc->errstr = SQLSTATE(42000) "end of input stream";
     741             :                 return EOF;
     742             :         }
     743   135592932 :         lc->errstr = NULL;
     744             : 
     745   135592932 :         s = (unsigned char *) b->buf + b->pos + lc->yycur++;
     746   135592932 :         if (((c = *s) & 0x80) == 0) {
     747             :                 /* 7-bit char */
     748             :                 return c;
     749             :         }
                               /* count the 1 bits that follow the top bit of the lead byte */
     750       88230 :         for (n = 0, m = 0x40; c & m; n++, m >>= 1)
     751             :                 ;
     752             :         /* n now is number of 10xxxxxx bytes that should follow */
                               /* NOTE(review): the bounds test below compares b->pos + n with
                                * b->len without adding lc->yycur; the scanner_read_more() call
                                * further down performs the yycur-relative check -- confirm the
                                * pre-check is intended as written. */
     753       29435 :         if (n == 0 || n >= 6 || (b->pos + n) > b->len) {
     754             :                 /* incorrect UTF-8 sequence */
     755             :                 /* n==0: c == 10xxxxxx */
     756             :                 /* n>=6: c == 1111111x */
     757           0 :                 lc->errstr = SQLSTATE(42000) "invalid start of UTF-8 sequence";
     758           0 :                 goto error;
     759             :         }
     760             : 
     761       29435 :         if (scanner_read_more(lc, (size_t) n) == EOF)
     762             :                 return EOF;
                               /* recompute s from b->buf after the read */
     763       29435 :         s = (unsigned char *) b->buf + b->pos + lc->yycur;
     764             : 
     765       29435 :         mask = utf8chkmsk[n];
     766       29435 :         c &= ~(0xFFC0 >> n);  /* remove non-x bits */
                               /* fold in 6 payload bits from each continuation byte */
     767       88229 :         while (--n >= 0) {
     768       58795 :                 c <<= 6;
     769       58795 :                 lc->yycur++;
     770       58795 :                 if (((m = *s++) & 0xC0) != 0x80) {
     771             :                         /* incorrect UTF-8 sequence: byte is not 10xxxxxx */
     772             :                         /* this includes end-of-string (m == 0) */
     773           1 :                         lc->errstr = SQLSTATE(42000) "invalid continuation in UTF-8 sequence";
     774           1 :                         goto error;
     775             :                 }
     776       58794 :                 c |= m & 0x3F;
     777             :         }
     778       29434 :         if ((c & mask) == 0) {
     779             :                 /* incorrect UTF-8 sequence: not shortest possible */
     780           0 :                 lc->errstr = SQLSTATE(42000) "not shortest possible UTF-8 sequence";
     781           0 :                 goto error;
     782             :         }
     783             : 
     784             :         return c;
     785             : 
     786           1 : error:
     787           1 :         if (b->pos + lc->yycur < b->len)    /* skip bogus char */
     788           0 :                 lc->yycur++;
     789             :         return EOF;
     790             : }
     791             : 
     792             : static int
     793    28159699 : scanner_token(struct scanner *lc, int token)
     794             : {
                               /* Terminate the current token in place: remember the byte at the
                                * scan position in yybak, overwrite it with NUL, then store token
                                * in yyval and return it. */
     795    28159699 :         lc->yybak = lc->rs->buf[lc->rs->pos + lc->yycur];
     796    28159699 :         lc->rs->buf[lc->rs->pos + lc->yycur] = 0;
     797    28159699 :         lc->yyval = token;
     798    28159699 :         return lc->yyval;
     799             : }
     800             : 
     801             : static int
     802     2092264 : scanner_string(mvc *c, int quote, bool escapes)
     803             : {
                               /* Scan a token delimited by quote, starting at the current scan
                                * position.
                                * NOTE(review): the opening quote has presumably been consumed by
                                * the caller already -- confirm.
                                * When escapes is true, a backslash escapes the character that
                                * follows it, so an escaped quote does not end the token.
                                * Returns STRING (through scanner_token()) on success, LEX_ERROR
                                * for an over-long run or an embedded NUL byte, the result of
                                * scanner_error() for EOF or U+FEFF inside a double-quoted token,
                                * and EOF when the input ends before the closing quote. */
     804     2092264 :         struct scanner *lc = &c->scanner;
     805     2092264 :         bstream *rs = lc->rs;
     806     2092264 :         int cur = quote;
     807     2092264 :         bool escape = false;
                               /* 2^11 for double-quoted tokens, 2^30 otherwise.  pos restarts at 0
                                * on every pass of the outer loop, so the limit bounds a single
                                * uninterrupted run of the inner loop. */
     808     2092264 :         const size_t limit = quote == '"' ? 1 << 11 : 1 << 30;
     809             : 
     810     2092264 :         lc->started = 1;
     811     2130142 :         while (cur != EOF) {
     812     2130127 :                 size_t pos = 0;
     813     2130127 :                 const size_t yycur = rs->pos + lc->yycur;
     814             : 
                                       /* fast path: walk over plain 7-bit bytes until an unescaped
                                        * quote, a NUL byte, a byte with the high bit set, or the
                                        * limit is reached */
     815    35122684 :                 while (cur != EOF && (quote != '"' || cur != 0xFEFF) && pos < limit &&
     816    32992557 :                        (((cur = rs->buf[yycur + pos++]) & 0x80) == 0) &&
     817    65955668 :                        cur && (cur != quote || escape)) {
     818    30862431 :                         if (escapes && cur == '\\')
     819        6593 :                                 escape = !escape;
     820             :                         else
     821             :                                 escape = false;
     822             :                 }
     823     2130127 :                 if (pos == limit) {
     824           0 :                         (void) sql_error(c, 2, SQLSTATE(42000) "string too long");
     825           0 :                         return LEX_ERROR;
     826             :                 }
     827             :                 /* BOM character not allowed as an identifier */
     828     2130127 :                 if (cur == EOF || (quote == '"' && cur == 0xFEFF))
     829           1 :                         return scanner_error(c, cur);
     830     2130126 :                 lc->yycur += pos;
     831             :                 /* check for quote escaped quote: Obscure SQL Rule */
     832     2130126 :                 if (cur == quote && rs->buf[yycur + pos] == quote) {
                                               /* step over the second quote and keep scanning */
     833        8445 :                         lc->yycur++;
     834        8445 :                         continue;
     835             :                 }
     836     2121681 :                 assert(yycur + pos <= rs->len + 1);
     837     2121681 :                 if (cur == quote && !escape) {
     838     2092234 :                         return scanner_token(lc, STRING);
     839             :                 }
     840       29447 :                 lc->yycur--; /* go back to current (possibly invalid) char */
     841             :                 /* long utf8, if correct isn't the quote */
     842       29447 :                 if (!cur) {
                                               /* a NUL that lies within the buffered data is an
                                                * error; otherwise the buffer is exhausted and more
                                                * input is requested */
     843          30 :                         if (lc->rs->len >= lc->rs->pos + lc->yycur + 1) {
     844          14 :                                 (void) sql_error(c, 2, SQLSTATE(42000) "NULL byte in string");
     845          14 :                                 return LEX_ERROR;
     846             :                         }
                                               /* cur becomes 1 (keep looping) or EOF (leave loop) */
     847          16 :                         cur = scanner_read_more(lc, 1);
     848             :                 } else {
                                               /* decode the multi-byte character with scanner_getc() */
     849       29417 :                         cur = scanner_getc(lc);
     850             :                 }
     851             :         }
                               /* prefer the message left in errstr, if any */
     852          15 :         (void) sql_error(c, 2, "%s", lc->errstr ? lc->errstr : SQLSTATE(42000) "Unexpected end of input");
     853          15 :         return EOF;
     854             : }
     855             : 
     856             : /* Scan a brace-delimited structure {blah} into a single string token.  Only
     857             :  * matching {} are counted, unless escaped.  Braces that are embedded in string
     858             :  * literals are not handled specially yet. */
     859             : 
     860             : static int
     861         229 : scanner_body(mvc *c)
     862             : {
                               /* Scan up to the '}' that balances an already consumed '{' (see
                                * the assert below).  Returns X_BODY through scanner_token() on
                                * success, LEX_ERROR for an embedded NUL byte, and EOF when the
                                * input ends first. */
     863         229 :         struct scanner *lc = &c->scanner;
     864         229 :         bstream *rs = lc->rs;
     865         229 :         int cur = (int) 'x'; /* any value other than EOF, so that the loop is entered */
     866         229 :         int blk = 1; /* number of '{' that are still open */
     867         229 :         bool escape = false;
     868             : 
     869         229 :         lc->started = 1;
     870         229 :         assert(rs->buf[rs->pos + lc->yycur-1] == '{');
     871         285 :         while (cur != EOF) {
     872         285 :                 size_t pos = rs->pos + lc->yycur;
     873             : 
                                       /* walk over 7-bit bytes while a brace is still open.
                                        * NOTE(review): blk is updated for every '{' and '}' seen,
                                        * also for the character right after a backslash; escape
                                        * only takes part in the loop and exit conditions --
                                        * confirm this is intended. */
     874       31486 :                 while ((((cur = rs->buf[pos++]) & 0x80) == 0) && cur && (blk || escape)) {
     875       31201 :                         if (cur != '\\')
     876             :                                 escape = false;
     877             :                         else
     878          12 :                                 escape = !escape;
     879       31201 :                         blk += cur =='{';
     880       31201 :                         blk -= cur =='}';
     881             :                 }
     882         285 :                 lc->yycur = pos - rs->pos;
     883         285 :                 assert(pos <= rs->len + 1);
     884         285 :                 if (blk == 0 && !escape){
     885         229 :                         lc->yycur--; /* go back to current (possibly invalid) char */
     886         229 :                         return scanner_token(lc, X_BODY);
     887             :                 }
     888          56 :                 lc->yycur--; /* go back to current (possibly invalid) char */
     889          56 :                 if (!cur) {
                                               /* a NUL that lies within the buffered data is an
                                                * error; otherwise more input is requested */
     890          56 :                         if (lc->rs->len >= lc->rs->pos + lc->yycur + 1) {
     891           0 :                                 (void) sql_error(c, 2, SQLSTATE(42000) "NULL byte in string");
     892           0 :                                 return LEX_ERROR;
     893             :                         }
     894          56 :                         cur = scanner_read_more(lc, 1);
     895             :                 } else {
                                               /* byte with the high bit set: decode it */
     896           0 :                         cur = scanner_getc(lc);
     897             :                 }
     898             :         }
     899           0 :         (void) sql_error(c, 2, SQLSTATE(42000) "Unexpected end of input");
     900           0 :         return EOF;
     901             : }
     902             : 
     903             : static int
     904    13507106 : keyword_or_ident(mvc * c, int cur)
     905             : {
                               /* Scan an identifier or keyword whose first code point, cur, has
                                * already been read.  Returns the keyword's token when the text
                                * is found by find_keyword_bs(), IDENT otherwise, and EOF when
                                * the input ends while scanning. */
     906    13507106 :         struct scanner *lc = &c->scanner;
     907    13507106 :         keyword *k = NULL;
     908    13507106 :         size_t s;
     909             : 
     910    13507106 :         lc->started = 1;
                               /* push the first character back so that s marks the token start */
     911    13507106 :         utf8_putchar(lc, cur);
     912    13507094 :         s = lc->yycur;
     913    13507094 :         lc->yyval = IDENT;
     914    80788535 :         while ((cur = scanner_getc(lc)) != EOF) {
     915    80788426 :                 if (!iswalnum(cur) && cur != '_') {
                                               /* first character past the word: push it back,
                                                * NUL-terminate the word and look it up */
     916    13506985 :                         utf8_putchar(lc, cur);
     917    13506983 :                         (void)scanner_token(lc, IDENT);
     918    13506983 :                         if ((k = find_keyword_bs(lc,s)))
     919     8275345 :                                 lc->yyval = k->token;
     920    13507123 :                         return lc->yyval;
     921             :                 }
     922             :         }
                               /* The loop above is only left with cur == EOF, and EOF is negative,
                                * so this return is always taken.
                                * NOTE(review): the statements after it are unreachable. */
     923             :         if (cur < 0)
     924             :                 return cur;
     925             :         (void)scanner_token(lc, IDENT);
     926             :         if ((k = find_keyword_bs(lc,s)))
     927             :                 lc->yyval = k->token;
     928             :         return lc->yyval;
     929             : }
     930             : 
     931             : static int
     932    14128950 : skip_white_space(struct scanner * lc)
     933             : {
                               /* Consume characters for which iswspace() is true.  Returns the
                                * first other character, or EOF.  yysval is left at the offset
                                * from which the returned character was read. */
     934    17772966 :         int cur;
     935             : 
     936    17772966 :         do {
     937    17772966 :                 lc->yysval = lc->yycur;
     938    17772966 :         } while ((cur = scanner_getc(lc)) != EOF && iswspace(cur));
     939    14128945 :         return cur;
     940             : }
     941             : 
     942             : static int
     943       69007 : skip_c_comment(struct scanner * lc)
     944             : {
                               /* Skip the rest of a block comment, honouring nested comments.
                                * NOTE(review): depth starts at 1, so the opening delimiter has
                                * presumably been consumed by the caller -- confirm.
                                * Returns EOF when the input ends inside the comment and '\n'
                                * otherwise. */
     945       69007 :         int cur;
     946       69007 :         int prev = 0;
     947       69007 :         int started = lc->started; /* restored before returning */
     948       69007 :         int depth = 1;
     949             : 
                               /* scanner_read_more() returns EOF instead of reading further while
                                * started is 0, so set it for the duration of the comment */
     950       69007 :         lc->started = 1;
     951     1391778 :         while (depth > 0 && (cur = scanner_getc(lc)) != EOF) {
     952     1322771 :                 if (prev == '*' && cur == '/')
     953       69007 :                         depth--;
     954     1253764 :                 else if (prev == '/' && cur == '*') {
     955             :                         /* block comments can nest */
     956           0 :                         cur = 0; /* prevent slash-star-slash from matching */
     957           0 :                         depth++;
     958             :                 }
     959             :                 prev = cur;
     960             :         }
     961       69007 :         lc->yysval = lc->yycur;
     962       69007 :         lc->started = started;
     963             :         /* a comment is equivalent to a newline */
     964       69007 :         return cur == EOF ? cur : '\n';
     965             : }
     966             : 
     967             : static int
     968        3169 : skip_sql_comment(struct scanner * lc)
     969             : {
     970        3169 :         int cur;
     971        3169 :         int started = lc->started;
     972             : 
     973        3169 :         lc->started = 1;
     974      834518 :         while ((cur = scanner_getc(lc)) != EOF && (cur != '\n'))
     975             :                 ;
     976        3169 :         lc->yysval = lc->yycur;
     977        3169 :         lc->started = started;
     978             :         /* a comment is equivalent to a newline */
     979        3169 :         return cur;
     980             : }
     981             : 
     982             : static int tokenize(mvc * lc, int cur);
     983             : 
     984     5752824 : static inline bool is_valid_decimal_digit(int cur) { return (iswdigit(cur)); }
     985          13 : static inline bool is_valid_binary_digit(int cur) { return (iswdigit(cur) && cur < '2'); }
     986          10 : static inline bool is_valid_octal_digit(int cur) { return (iswdigit(cur) && cur < '8'); }
     987        3688 : static inline bool is_valid_hexadecimal_digit(int cur) { return iswxdigit(cur); }
     988             : 
     989     1908318 : static inline int check_validity_number(mvc* c, int pcur, bool initial_underscore_allowed, int *token, int type) {
     990     1908318 :         struct scanner *lc = &c->scanner;
     991     1908318 :         bool (*is_valid_n_ary_digit)(int);
     992             : 
     993     1908318 :         if (pcur == '_' && !initial_underscore_allowed)  /* ERROR: initial underscore not allowed */  {
     994           0 :                 *token = 0;
     995           0 :                 return '_';
     996             :         }
     997             : 
     998     1908318 :         switch (type) {
     999             :         case BINARYNUM:
    1000             :                 is_valid_n_ary_digit = &is_valid_binary_digit;
    1001             :                 break;
    1002           3 :         case OCTALNUM:
    1003           3 :                 is_valid_n_ary_digit = &is_valid_octal_digit;
    1004           3 :                 break;
    1005         280 :         case HEXADECIMALNUM:
    1006         280 :                 is_valid_n_ary_digit = &is_valid_hexadecimal_digit;
    1007         280 :                 break;
    1008     1908033 :         default:
    1009     1908033 :                 is_valid_n_ary_digit = &is_valid_decimal_digit;
    1010     1908033 :                 break;
    1011             :         }
    1012             : 
    1013     1908318 :         if ( !(pcur == '_' || is_valid_n_ary_digit(pcur)) ) /* ERROR: first digit is not valid */ {
    1014          17 :                 *token = 0;
    1015          17 :                 return pcur;
    1016             :         }
    1017             : 
    1018     1908256 :         int cur = scanner_getc(lc);
    1019     1908181 :         *token = type;
    1020     3849689 :         while (cur != EOF) {
    1021     3849689 :                 if (cur == '_') {
    1022          25 :                         if (pcur == '_') /* ERROR: multiple consecutive underscores */ {
    1023           2 :                                 *token = 0;
    1024           2 :                                 return '_';
    1025             :                         }
    1026             :                 }
    1027     3849664 :                 else if (!is_valid_n_ary_digit(cur))
    1028             :                         break;
    1029     1941352 :                 pcur = cur;
    1030     1941352 :                 cur = scanner_getc(lc);
    1031             :         }
    1032             : 
    1033     1907880 :         if (pcur == '_')  {
    1034           3 :                 *token = 0;
    1035           3 :                 if (iswalnum(cur))       /* ERROR: not a valid digit */
    1036             :                         return cur;
    1037             :                 else                            /* ERROR: number ends with underscore */
    1038             :                         return '_';
    1039             :         }
    1040             : 
    1041             :         return cur;
    1042             : }
    1043             : 
    1044             : static int
    1045     1894998 : number(mvc * c, int cur)
    1046             : {
    1047     1894998 :         struct scanner *lc = &c->scanner;
    1048     1894998 :         int token = sqlINT;
    1049             : 
    1050             :         /* a number has one of these forms (expressed in regular expressions):
    1051             :          * 0x[0-9A-Fa-f]+                   -- (hexadecimal) INTEGER
    1052             :          * \.[0-9]+                         -- DECIMAL
    1053             :          * [0-9]+\.[0-9]*                   -- DECIMAL
    1054             :          * [0-9]+@0                         -- OID
    1055             :          * [0-9]*\.[0-9]+[eE][-+]?[0-9]+    -- REAL
    1056             :          * [0-9]+(\.[0-9]*)?[eE][-+]?[0-9]+ -- REAL
    1057             :          * [0-9]+                           -- (decimal) INTEGER
    1058             :          */
    1059     1894998 :         lc->started = 1;
    1060     1894998 :         if (cur == '0') {
    1061      303730 :                 switch ((cur = scanner_getc(lc))) {
    1062           2 :                 case 'b':
    1063           2 :                         cur = scanner_getc(lc);
    1064           2 :                         if ((cur = check_validity_number(c, cur, true, &token, BINARYNUM)) == EOF) return cur;
    1065             :                         break;
    1066           3 :                 case 'o':
    1067           3 :                         cur = scanner_getc(lc);
    1068           3 :                         if ((cur = check_validity_number(c,  cur, true, &token, OCTALNUM)) == EOF) return cur;
    1069             :                         break;
    1070         280 :                 case 'x':
    1071         280 :                         cur = scanner_getc(lc);
    1072         280 :                         if ((cur = check_validity_number(c,  cur, true, &token, HEXADECIMALNUM)) == EOF) return cur;
    1073             :                         break;
    1074      303441 :                 default:
    1075      303441 :                         utf8_putchar(lc, cur);
    1076      303441 :                         cur = '0';
    1077             :                 }
    1078             :         }
    1079     1894994 :         if (token == sqlINT) {
    1080     1894705 :                 if ((cur = check_validity_number(c, cur, false, &token, sqlINT)) == EOF) return cur;
    1081     1894285 :                 if (cur == '@') {
    1082           0 :                         if (token == sqlINT) {
    1083           0 :                                 cur = scanner_getc(lc);
    1084           0 :                                 if (cur == EOF)
    1085             :                                         return cur;
    1086           0 :                                 if (cur == '0') {
    1087           0 :                                         cur = scanner_getc(lc);
    1088           0 :                                         if (cur == EOF)
    1089             :                                                 return cur;
    1090           0 :                                         token = OIDNUM;
    1091             :                                 } else {
    1092             :                                         /* number + '@' not followed by 0: show '@' as erroneous */
    1093           0 :                                         utf8_putchar(lc, cur);
    1094           0 :                                         cur = '@';
    1095           0 :                                         token = 0;
    1096             :                                 }
    1097             :                         }
    1098             :                 } else {
    1099     1894285 :                         if (cur == '.') {
    1100       11104 :                                 cur = scanner_getc(lc);
    1101       11104 :                                 if (iswalnum(cur)) /* early exit for numerical forms with final . e.g. 10. */
    1102       11098 :                                 if ((cur = check_validity_number(c, cur, false, &token, INTNUM)) == EOF) return cur;
    1103             :                         }
    1104     1894285 :                         if (token != 0)
    1105     1894283 :                         if (cur == 'e' || cur == 'E') {
    1106        2229 :                                 cur = scanner_getc(lc);
    1107        2229 :                                 if (cur == '+' || cur == '-')
    1108        2111 :                                         cur = scanner_getc(lc);
    1109        2229 :                                 if ((cur = check_validity_number(c, cur, false, &token, APPROXNUM)) == EOF) return cur;
    1110             :                         }
    1111             :                 }
    1112             :         }
    1113             : 
    1114     1892345 :         assert(cur != EOF);
    1115             : 
    1116     1894574 :         if (iswalnum(cur)) /* ERROR: not a valid digit */
    1117           6 :                 token = 0;
    1118             : 
    1119     1894574 :         utf8_putchar(lc, cur);
    1120             : 
    1121     1894508 :         if (token) {
    1122     1894498 :                 return scanner_token(lc, token);
    1123             :         } else {
    1124          10 :                 (void)sql_error( c, 2, SQLSTATE(42000) "Unexpected symbol %lc", (wint_t) cur);
    1125          10 :                 return LEX_ERROR;
    1126             :         }
    1127             : }
    1128             : 
    1129             : static
    1130    12823396 : int scanner_symbol(mvc * c, int cur)
    1131             : {
    1132    12823396 :         struct scanner *lc = &c->scanner;
    1133    12823396 :         int next = 0;
    1134    12823396 :         int started = lc->started;
    1135             : 
    1136    12823396 :         switch (cur) {
    1137       71650 :         case '/':
    1138       71650 :                 lc->started = 1;
    1139       71650 :                 next = scanner_getc(lc);
    1140       71650 :                 if (next < 0)
    1141             :                         return EOF;
    1142       71650 :                 if (next == '*') {
    1143       69007 :                         lc->started = started;
    1144       69007 :                         cur = skip_c_comment(lc);
    1145       69007 :                         if (cur < 0)
    1146             :                                 return EOF;
    1147       69007 :                         return tokenize(c, cur);
    1148             :                 } else {
    1149        2643 :                         utf8_putchar(lc, next);
    1150        2643 :                         return scanner_token(lc, cur);
    1151             :                 }
    1152           0 :         case '0':
    1153             :         case '1':
    1154             :         case '2':
    1155             :         case '3':
    1156             :         case '4':
    1157             :         case '5':
    1158             :         case '6':
    1159             :         case '7':
    1160             :         case '8':
    1161             :         case '9':
    1162           0 :                 return number(c, cur);
    1163           5 :         case '#':
    1164           5 :                 if ((cur = skip_sql_comment(lc)) == EOF)
    1165             :                         return cur;
    1166           5 :                 return tokenize(c, cur);
    1167      804943 :         case '\'':
    1168      804943 :                 if (lc->raw_string_mode || lc->next_string_is_raw)
    1169          59 :                         return scanner_string(c, cur, false);
    1170      804884 :                 return scanner_string(c, cur, true);
    1171     1280244 :         case '"':
    1172     1280244 :                 return scanner_string(c, cur, false);
    1173         495 :         case '{':
    1174             :                 // if previous tokens like LANGUAGE IDENT
    1175             :                 // TODO checking on IDENT only may not be enough
    1176         495 :                 if (lc->yylast == IDENT)
    1177         229 :                         return scanner_body(c);
    1178         266 :                 lc->started = 1;
    1179         266 :                 return scanner_token(lc, cur);
    1180         266 :         case '}':
    1181         266 :                 lc->started = 1;
    1182         266 :                 return scanner_token(lc, cur);
    1183       30214 :         case '-':
    1184       30214 :                 lc->started = 1;
    1185       30214 :                 next = scanner_getc(lc);
    1186       30214 :                 if (next < 0)
    1187             :                         return EOF;
    1188       30213 :                 if (next == '-') {
    1189        3164 :                         lc->started = started;
    1190        3164 :                         if ((cur = skip_sql_comment(lc)) == EOF)
    1191             :                                 return cur;
    1192        3164 :                         return tokenize(c, cur);
    1193             :                 }
    1194       27049 :                 lc->started = 1;
    1195       27049 :                 utf8_putchar(lc, next);
    1196       27049 :                 return scanner_token(lc, cur);
    1197          12 :         case '~': /* binary not */
    1198          12 :                 lc->started = 1;
    1199          12 :                 next = scanner_getc(lc);
    1200          12 :                 if (next < 0)
    1201             :                         return EOF;
    1202          12 :                 if (next == '=')
    1203           5 :                         return scanner_token(lc, GEOM_MBR_EQUAL);
    1204           7 :                 utf8_putchar(lc, next);
    1205           7 :                 return scanner_token(lc, cur);
    1206     7211004 :         case '^': /* binary xor */
    1207             :         case '*':
    1208             :         case '?':
    1209             :         case ':':
    1210             :         case '%':
    1211             :         case '+':
    1212             :         case '(':
    1213             :         case ')':
    1214             :         case ',':
    1215             :         case '=':
    1216             :         case '[':
    1217             :         case ']':
    1218     7211004 :                 lc->started = 1;
    1219     7211004 :                 return scanner_token(lc, cur);
    1220        6115 :         case '&':
    1221        6115 :                 lc->started = 1;
    1222        6115 :                 cur = scanner_getc(lc);
    1223        6115 :                 if (cur < 0)
    1224             :                         return EOF;
    1225        6115 :                 if (cur < 0)
    1226             :                         return EOF;
    1227        6115 :                 if(cur == '<') {
    1228           3 :                         next = scanner_getc(lc);
    1229           3 :                         if (next < 0)
    1230             :                                 return EOF;
    1231           3 :                         if(next == '|') {
    1232           0 :                                 return scanner_token(lc, GEOM_OVERLAP_OR_BELOW);
    1233             :                         } else {
    1234           3 :                                 utf8_putchar(lc, next); //put the char back
    1235           3 :                                 return scanner_token(lc, GEOM_OVERLAP_OR_LEFT);
    1236             :                         }
    1237        6112 :                 } else if(cur == '>')
    1238           3 :                         return scanner_token(lc, GEOM_OVERLAP_OR_RIGHT);
    1239        6109 :                 else if(cur == '&')
    1240           3 :                         return scanner_token(lc, GEOM_OVERLAP);
    1241             :                 else {/* binary and */
    1242        6106 :                         utf8_putchar(lc, cur); //put the char back
    1243        6106 :                         return scanner_token(lc, '&');
    1244             :                 }
    1245          19 :         case '@':
    1246          19 :                 lc->started = 1;
    1247          19 :                 return scanner_token(lc, AT);
    1248      987821 :         case ';':
    1249      987821 :                 lc->started = 0;
    1250      987821 :                 return scanner_token(lc, SCOLON);
    1251       51926 :         case '<':
    1252       51926 :                 lc->started = 1;
    1253       51926 :                 cur = scanner_getc(lc);
    1254       51926 :                 if (cur < 0)
    1255             :                         return EOF;
    1256       51926 :                 if (cur == '=') {
    1257        3143 :                         return scanner_token( lc, COMPARISON);
    1258       48783 :                 } else if (cur == '>') {
    1259       35231 :                         return scanner_token( lc, COMPARISON);
    1260       13552 :                 } else if (cur == '<') {
    1261          46 :                         next = scanner_getc(lc);
    1262          46 :                         if (next < 0)
    1263             :                                 return EOF;
    1264          46 :                         if (next == '=') {
    1265           4 :                                 return scanner_token( lc, LEFT_SHIFT_ASSIGN);
    1266          42 :                         } else if (next == '|') {
    1267           1 :                                 return scanner_token(lc, GEOM_BELOW);
    1268             :                         } else {
    1269          41 :                                 utf8_putchar(lc, next); //put the char back
    1270          41 :                                 return scanner_token( lc, LEFT_SHIFT);
    1271             :                         }
    1272       13506 :                 } else if(cur == '-') {
    1273          19 :                         next = scanner_getc(lc);
    1274          19 :                         if (next < 0)
    1275             :                                 return EOF;
    1276          19 :                         if(next == '>') {
    1277           7 :                                 return scanner_token(lc, GEOM_DIST);
    1278             :                         } else {
    1279             :                                 //put the characters back and fall in the next possible case
    1280          12 :                                 utf8_putchar(lc, next);
    1281          12 :                                 utf8_putchar(lc, cur);
    1282          12 :                                 return scanner_token( lc, COMPARISON);
    1283             :                         }
    1284             :                 } else {
    1285       13487 :                         utf8_putchar(lc, cur);
    1286       13487 :                         return scanner_token( lc, COMPARISON);
    1287             :                 }
    1288       47732 :         case '>':
    1289       47732 :                 lc->started = 1;
    1290       47732 :                 cur = scanner_getc(lc);
    1291       47732 :                 if (cur < 0)
    1292             :                         return EOF;
    1293       47732 :                 if (cur == '>') {
    1294        2579 :                         cur = scanner_getc(lc);
    1295        2579 :                         if (cur < 0)
    1296             :                                 return EOF;
    1297        2579 :                         if (cur == '=')
    1298           3 :                                 return scanner_token( lc, RIGHT_SHIFT_ASSIGN);
    1299        2576 :                         utf8_putchar(lc, cur);
    1300        2576 :                         return scanner_token( lc, RIGHT_SHIFT);
    1301       45153 :                 } else if (cur != '=') {
    1302       42912 :                         utf8_putchar(lc, cur);
    1303       42912 :                         return scanner_token( lc, COMPARISON);
    1304             :                 } else {
    1305        2241 :                         return scanner_token( lc, COMPARISON);
    1306             :                 }
    1307     2143934 :         case '.':
    1308     2143934 :                 lc->started = 1;
    1309     2143934 :                 cur = scanner_getc(lc);
    1310     2143934 :                 if (cur < 0)
    1311             :                         return EOF;
    1312     2143933 :                 if (!iswdigit(cur)) {
    1313     2143920 :                         utf8_putchar(lc, cur);
    1314     2143920 :                         return scanner_token( lc, '.');
    1315             :                 } else {
    1316          13 :                         utf8_putchar(lc, cur);
    1317          13 :                         cur = '.';
    1318          13 :                         return number(c, cur);
    1319             :                 }
    1320      186982 :         case '|': /* binary or or string concat */
    1321      186982 :                 lc->started = 1;
    1322      186982 :                 cur = scanner_getc(lc);
    1323      186982 :                 if (cur < 0)
    1324             :                         return EOF;
    1325      186982 :                 if (cur == '|') {
    1326      186953 :                         return scanner_token(lc, CONCATSTRING);
    1327          29 :                 } else if (cur == '&') {
    1328           0 :                         next = scanner_getc(lc);
    1329           0 :                         if (next < 0)
    1330             :                                 return EOF;
    1331           0 :                         if(next == '>') {
    1332           0 :                                 return scanner_token(lc, GEOM_OVERLAP_OR_ABOVE);
    1333             :                         } else {
    1334           0 :                                 utf8_putchar(lc, next); //put the char back
    1335           0 :                                 utf8_putchar(lc, cur); //put the char back
    1336           0 :                                 return scanner_token(lc, '|');
    1337             :                         }
    1338          29 :                 } else if (cur == '>') {
    1339           1 :                         next = scanner_getc(lc);
    1340           1 :                         if (next < 0)
    1341             :                                 return EOF;
    1342           1 :                         if(next == '>') {
    1343           1 :                                 return scanner_token(lc, GEOM_ABOVE);
    1344             :                         } else {
    1345           0 :                                 utf8_putchar(lc, next); //put the char back
    1346           0 :                                 utf8_putchar(lc, cur); //put the char back
    1347           0 :                                 return scanner_token(lc, '|');
    1348             :                         }
    1349             :                 } else {
    1350          28 :                         utf8_putchar(lc, cur);
    1351          28 :                         return scanner_token(lc, '|');
    1352             :                 }
    1353             :         }
    1354          34 :         (void)sql_error( c, 3, SQLSTATE(42000) "Unexpected symbol (%lc)", (wint_t) cur);
    1355          34 :         return LEX_ERROR;
    1356             : }
    1357             : 
    1358             : static int
    1359    28251161 : tokenize(mvc * c, int cur)
    1360             : {
    1361    28251161 :         struct scanner *lc = &c->scanner;
    1362    56466501 :         while (1) {
    1363    42358831 :                 if (cur == 0xFEFF) {
    1364             :                         /* on Linux at least, iswpunct returns TRUE
    1365             :                          * for U+FEFF, but we don't want that, we just
    1366             :                          * want to go to the scanner_error case
    1367             :                          * below */
    1368             :                         ;
    1369    42358814 :                 } else if (iswspace(cur)) {
    1370    14126370 :                         if ((cur = skip_white_space(lc)) == EOF)
    1371             :                                 return cur;
    1372    14107670 :                         continue;  /* try again */
    1373    28232444 :                 } else if (iswdigit(cur)) {
    1374     1894975 :                         return number(c, cur);
    1375    26337469 :                 } else if (iswalpha(cur) || cur == '_') {
    1376    13479259 :                         switch (cur) {
    1377      650111 :                         case 'e': /* string with escapes */
    1378             :                         case 'E':
    1379      650111 :                                 if (scanner_read_more(lc, 1) != EOF &&
    1380      650111 :                                     lc->rs->buf[lc->rs->pos + lc->yycur] == '\'') {
    1381        3792 :                                         return scanner_string(c, scanner_getc(lc), true);
    1382             :                                 }
    1383             :                                 break;
    1384      416876 :                         case 'x': /* blob */
    1385             :                         case 'X':
    1386             :                         case 'r': /* raw string */
    1387             :                         case 'R':
    1388      416876 :                                 if (scanner_read_more(lc, 1) != EOF &&
    1389      416876 :                                     lc->rs->buf[lc->rs->pos + lc->yycur] == '\'') {
    1390        3268 :                                         return scanner_string(c, scanner_getc(lc), false);
    1391             :                                 }
    1392             :                                 break;
    1393      153239 :                         case 'u': /* unicode string */
    1394             :                         case 'U':
    1395      153239 :                                 if (scanner_read_more(lc, 1) != EOF &&
    1396      153256 :                                     lc->rs->buf[lc->rs->pos + lc->yycur] == '&' &&
    1397          17 :                                     scanner_read_more(lc, 2) != EOF &&
    1398          17 :                                     (lc->rs->buf[lc->rs->pos + lc->yycur + 1] == '\'' ||
    1399             :                                      lc->rs->buf[lc->rs->pos + lc->yycur + 1] == '"')) {
    1400          17 :                                         cur = scanner_getc(lc); /* '&' */
    1401          17 :                                         return scanner_string(c, scanner_getc(lc), false);
    1402             :                                 }
    1403             :                                 break;
    1404             :                         default:
    1405             :                                 break;
    1406             :                         }
    1407    13507097 :                         return keyword_or_ident(c, cur);
    1408    12823295 :                 } else if (iswpunct(cur)) {
    1409    12823280 :                         return scanner_symbol(c, cur);
    1410             :                 }
    1411          32 :                 if (cur == EOF) {
    1412           0 :                         if (lc->mode == LINE_1 || !lc->started )
    1413             :                                 return cur;
    1414           0 :                         return scanner_error(c, cur);
    1415             :                 }
    1416             :                 /* none of the above: error */
    1417          32 :                 return scanner_error(c, cur);
    1418             :         }
    1419             : }
    1420             : 
    1421             : /* SQL 'quoted' idents consist of a set of any character of
    1422             :  * the source language character set other than a 'quote'
    1423             :  *
    1424             :  * MonetDB has 3 restrictions:
    1425             :  *      1 we disallow '%' as the first character.
    1426             :  *      2 the length is limited to 1024 characters
    1427             :  *      3 the identifier 'TID%' is not allowed
    1428             :  */
    1429             : static bool
    1430     1280233 : valid_ident(const char *restrict s, char *restrict dst)
    1431             : {
    1432     1280233 :         int p = 0;
    1433             : 
    1434     1280233 :         if (*s == '%')
    1435             :                 return false;
    1436             : 
    1437     9499169 :         while (*s) {
    1438     8218936 :                 if ((dst[p++] = *s++) == '"' && *s == '"')
    1439          68 :                         s++;
    1440     8218936 :                 if (p >= 1024)
    1441             :                         return false;
    1442             :         }
    1443     1280233 :         dst[p] = '\0';
    1444     1280233 :         if (strcmp(dst, TID + 1) == 0) /* an index named 'TID%' could interfere with '%TID%' */
    1445             :                 return false;
    1446             :         return true;
    1447             : }
    1448             : 
    1449             : static inline int
    1450    28352319 : sql_get_next_token(YYSTYPE *yylval, void *parm)
    1451             : {
    1452    28352319 :         mvc *c = (mvc*)parm;
    1453    28352319 :         struct scanner *lc = &c->scanner;
    1454    28352319 :         int token = 0, cur = 0;
    1455             : 
    1456    28352319 :         if (lc->rs->buf == NULL) /* malloc failure */
    1457             :                 return EOF;
    1458             : 
    1459    28352319 :         if (lc->yynext) {
    1460       61424 :                 int next = lc->yynext;
    1461             : 
    1462       61424 :                 lc->yynext = 0;
    1463       61424 :                 return(next);
    1464             :         }
    1465             : 
    1466    28290895 :         if (lc->yybak) {
    1467    27267211 :                 lc->rs->buf[lc->rs->pos + lc->yycur] = lc->yybak;
    1468    27267211 :                 lc->yybak = 0;
    1469             :         }
    1470             : 
    1471    28290895 :         lc->yysval = lc->yycur;
    1472    28290895 :         lc->yylast = lc->yyval;
    1473    28290895 :         cur = scanner_getc(lc);
    1474    28290761 :         if (cur < 0)
    1475             :                 return EOF;
    1476    28179466 :         token = tokenize(c, cur);
    1477             : 
    1478    28178776 :         yylval->sval = (lc->rs->buf + lc->rs->pos + lc->yysval);
    1479             : 
    1480    28178776 :         if (token == KW_TYPE)
    1481       49354 :                 token = aTYPE;
    1482             : 
    1483    28178776 :         if (token == IDENT || token == COMPARISON ||
    1484    22849973 :             token == RANK || token == aTYPE || token == MARGFUNC) {
    1485     5388684 :                 yylval->sval = sa_strndup(c->sa, yylval->sval, lc->yycur-lc->yysval);
    1486     5388700 :                 lc->next_string_is_raw = false;
    1487    22790092 :         } else if (token == STRING) {
    1488     2092234 :                 char quote = *yylval->sval;
    1489     2092234 :                 char *str = sa_alloc( c->sa, (lc->yycur-lc->yysval-2)*2 + 1 );
    1490     2092234 :                 char *dst;
    1491             : 
    1492     2092234 :                 assert(quote == '"' || quote == '\'' || quote == 'E' || quote == 'e' || quote == 'U' || quote == 'u' || quote == 'X' || quote == 'x' || quote == 'R' || quote == 'r');
    1493             : 
    1494     2092234 :                 lc->rs->buf[lc->rs->pos + lc->yycur - 1] = 0;
    1495     2092234 :                 switch (quote) {
    1496     1280233 :                 case '"':
    1497     1280233 :                         if (valid_ident(yylval->sval+1,str)) {
    1498             :                                 token = IDENT;
    1499             :                         } else {
    1500           0 :                                 sql_error(c, 1, SQLSTATE(42000) "Invalid identifier '%s'", yylval->sval+1);
    1501           0 :                                 return LEX_ERROR;
    1502             :                         }
    1503             :                         break;
    1504        3791 :                 case 'e':
    1505             :                 case 'E':
    1506        3791 :                         assert(yylval->sval[1] == '\'');
    1507        3791 :                         if (GDKstrFromStr((unsigned char *) str,
    1508             :                                                           (unsigned char *) yylval->sval + 2,
    1509        3791 :                                                           lc->yycur-lc->yysval - 2, '\'') < 0) {
    1510           1 :                                 char *err = GDKerrbuf;
    1511           1 :                                 if (strncmp(err, GDKERROR, strlen(GDKERROR)) == 0)
    1512           1 :                                         err += strlen(GDKERROR);
    1513           0 :                                 else if (*err == '!')
    1514           0 :                                         err++;
    1515           1 :                                 sql_error(c, 1, SQLSTATE(42000) "%s", err);
    1516           1 :                                 return LEX_ERROR;
    1517             :                         }
    1518             :                         quote = '\'';
    1519             :                         break;
    1520          17 :                 case 'u':
    1521             :                 case 'U':
    1522          17 :                         assert(yylval->sval[1] == '&');
    1523          17 :                         assert(yylval->sval[2] == '\'' || yylval->sval[2] == '"');
    1524          17 :                         strcpy(str, yylval->sval + 3);
    1525          17 :                         token = yylval->sval[2] == '\'' ? USTRING : UIDENT;
    1526          17 :                         quote = yylval->sval[2];
    1527          17 :                         lc->next_string_is_raw = true;
    1528          17 :                         break;
    1529           1 :                 case 'x':
    1530             :                 case 'X':
    1531           1 :                         assert(yylval->sval[1] == '\'');
    1532           1 :                         dst = str;
    1533           5 :                         for (char *src = yylval->sval + 2; *src; dst++)
    1534           4 :                                 if ((*dst = *src++) == '\'' && *src == '\'')
    1535           0 :                                         src++;
    1536           1 :                         *dst = 0;
    1537           1 :                         quote = '\'';
    1538           1 :                         token = XSTRING;
    1539           1 :                         lc->next_string_is_raw = true;
    1540           1 :                         break;
    1541        3260 :                 case 'r':
    1542             :                 case 'R':
    1543        3260 :                         assert(yylval->sval[1] == '\'');
    1544        3260 :                         dst = str;
    1545      449203 :                         for (char *src = yylval->sval + 2; *src; dst++)
    1546      445943 :                                 if ((*dst = *src++) == '\'' && *src == '\'')
    1547        2744 :                                         src++;
    1548        3260 :                         quote = '\'';
    1549        3260 :                         *dst = 0;
    1550        3260 :                         break;
    1551      804932 :                 default:
    1552      804932 :                         if (lc->raw_string_mode || lc->next_string_is_raw) {
    1553          59 :                                 dst = str;
    1554         553 :                                 for (char *src = yylval->sval + 1; *src; dst++)
    1555         494 :                                         if ((*dst = *src++) == '\'' && *src == '\'')
    1556           3 :                                                 src++;
    1557          59 :                                 *dst = 0;
    1558             :                         } else {
    1559      804873 :                                 if (GDKstrFromStr((unsigned char *)str,
    1560      804873 :                                                                   (unsigned char *)yylval->sval + 1,
    1561      804873 :                                                                   lc->yycur - lc->yysval - 1,
    1562             :                                                                   '\'') < 0) {
    1563           1 :                                         sql_error(c, 1, SQLSTATE(42000) "%s", GDKerrbuf);
    1564           1 :                                         return LEX_ERROR;
    1565             :                                 }
    1566             :                         }
    1567             :                         break;
    1568             :                 }
    1569     2092232 :                 yylval->sval = str;
    1570             : 
    1571             :                 /* reset original */
    1572     2092232 :                 lc->rs->buf[lc->rs->pos+lc->yycur- 1] = quote;
    1573             :         } else {
    1574    20697858 :                 lc->next_string_is_raw = false;
    1575             :         }
    1576             : 
    1577             :         return(token);
    1578             : }
    1579             : 
    1580             : static int scanner( YYSTYPE *yylval, void *m, bool log);
    1581             : 
    1582             : static int
    1583    28220343 : scanner(YYSTYPE * yylval, void *parm, bool log)
    1584             : {
    1585    28220343 :         int token;
    1586    28220343 :         mvc *c = (mvc *) parm;
    1587    28220343 :         struct scanner *lc = &c->scanner;
    1588    28220343 :         size_t pos;
    1589             : 
    1590             :         /* store position for when view's query ends */
    1591    28220343 :         pos = lc->rs->pos + lc->yycur;
    1592             : 
    1593    28220343 :         token = sql_get_next_token(yylval, parm);
    1594             : 
    1595    28219074 :         if (token == NOT) {
    1596       74112 :                 int next = scanner(yylval, parm, false);
    1597             : 
    1598       74112 :                 if (next == NOT) {
    1599           2 :                         return scanner(yylval, parm, false);
    1600             :                 } else if (next == EXISTS) {
    1601             :                         token = NOT_EXISTS;
    1602             :                 } else if (next == BETWEEN) {
    1603             :                         token = NOT_BETWEEN;
    1604             :                 } else if (next == sqlIN) {
    1605             :                         token = NOT_IN;
    1606             :                 } else if (next == LIKE) {
    1607             :                         token = NOT_LIKE;
    1608             :                 } else if (next == ILIKE) {
    1609             :                         token = NOT_ILIKE;
    1610             :                 } else {
    1611       61424 :                         lc->yynext = next;
    1612             :                 }
    1613    28144962 :         } else if (token == SCOLON) {
    1614             :                 /* ignore semi-colon(s) following a semi-colon */
    1615      987818 :                 if (lc->yylast == SCOLON) {
    1616      131985 :                         size_t prev = lc->yycur;
    1617      131986 :                         while ((token = sql_get_next_token(yylval, parm)) == SCOLON)
    1618           1 :                                 prev = lc->yycur;
    1619             : 
    1620             :                         /* skip the skipped stuff also in the buffer */
    1621      131984 :                         lc->rs->pos += prev;
    1622      131984 :                         lc->yycur -= prev;
    1623             :                 }
    1624             :         }
    1625             : 
    1626    28219071 :         if (lc->log && log)
    1627           0 :                 mnstr_write(lc->log, lc->rs->buf+pos, lc->rs->pos + lc->yycur - pos, 1);
    1628             : 
    1629    28219071 :         lc->started += (token != EOF);
    1630    28219071 :         return token;
    1631             : }
    1632             : 
    1633             : /* also see sql_parser.y */
    1634             : extern int sqllex(YYSTYPE * yylval, void *parm);
    1635             : 
    1636             : int
    1637    28146716 : sqllex(YYSTYPE * yylval, void *parm)
    1638             : {
    1639    28146716 :         return scanner(yylval, parm, true);
    1640             : }

Generated by: LCOV version 1.14