lexer.c

   1 /*
   2  * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io>
   3  *
   4  * Permission to use, copy, modify, and/or distribute this software for any
   5  * purpose with or without fee is hereby granted, provided that the above
   6  * copyright notice and this permission notice appear in all copies.
   7  *
   8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15  */
  16
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <regex.h>
  22
  23 #include "ast.h"
  24 #include "lexer.h"
  25 #include "parser.h"
  26
  27
  28 struct token {
  29         int type;
  30         const char *pat;
  31         int plen;
  32         int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
  33 };
  34
  35 #define dec(o) \
  36         ((o) - '0')
  37
  38 #define hex(x) \
  39         (((x) >= 'a') ? (10 + (x) - 'a') : \
  40                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
  41
  42 /*
  43  * Stores the given codepoint as a utf8 multibyte sequence into the given
  44  * output buffer and substracts the required amount of bytes from  the given
  45  * length pointer.
  46  *
  47  * Returns false if the multibyte sequence would not fit into the buffer,
  48  * otherwise true.
  49  */
  50
  51 static bool
  52 utf8enc(char **out, int *rem, int code)
  53 {
  54         if (code > 0 && code <= 0x7F)
  55         {
  56                 if (*rem < 1)
  57                         return false;
  58
  59                 *(*out)++ = code; (*rem)--;
  60                 return true;
  61         }
  62         else if (code > 0 && code <= 0x7FF)
  63         {
  64                 if (*rem < 2)
  65                         return false;
  66
  67                 *(*out)++ = ((code >>  6) & 0x1F) | 0xC0; (*rem)--;
  68                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  69                 return true;
  70         }
  71         else if (code > 0 && code <= 0xFFFF)
  72         {
  73                 if (*rem < 3)
  74                         return false;
  75
  76                 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
  77                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
  78                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  79                 return true;
  80         }
  81         else if (code > 0 && code <= 0x10FFFF)
  82         {
  83                 if (*rem < 4)
  84                         return false;
  85
  86                 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
  87                 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
  88                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
  89                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  90                 return true;
  91         }
  92
  93         return true;
  94 }
  95
  96
  97 /*
  98  * Parses a string literal from the given buffer.
  99  *
 100  * Returns a negative value on error, otherwise the amount of consumed
 101  * characters from the given buffer.
 102  *
 103  * Error values:
 104  *  -1  Unterminated string
 105  *  -2  Invalid escape sequence
 106  *  -3  String literal too long
 107  */
 108
 109 static int
 110 parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
 111 {
 112         char q = *(buf++);
 113         char str[128] = { 0 };
 114         char *out = str;
 115         const char *in = buf;
 116         bool esc = false;
 117         int rem = sizeof(str) - 1;
 118         int code;
 119
 120         while (*in)
 121         {
 122                 /* continuation of escape sequence */
 123                 if (esc)
 124                 {
 125                         /* \uFFFF */
 126                         if (in[0] == 'u')
 127                         {
 128                                 if (isxdigit(in[1]) && isxdigit(in[2]) &&
 129                                     isxdigit(in[3]) && isxdigit(in[4]))
 130                                 {
 131                                         if (!utf8enc(&out, &rem,
 132                                                      hex(in[1]) * 16 * 16 * 16 +
 133                                                      hex(in[2]) * 16 * 16 +
 134                                                      hex(in[3]) * 16 +
 135                                                      hex(in[4])))
 136                                         {
 137                                                 s->error_pos = s->off + (in - buf);
 138                                                 return -3;
 139                                         }
 140
 141                                         in += 5;
 142                                 }
 143                                 else
 144                                 {
 145                                         s->error_pos = s->off + (in - buf);
 146                                         return -2;
 147                                 }
 148                         }
 149
 150                         /* \xFF */
 151                         else if (in[0] == 'x')
 152                         {
 153                                 if (isxdigit(in[1]) && isxdigit(in[2]))
 154                                 {
 155                                         if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
 156                                         {
 157                                                 s->error_pos = s->off + (in - buf);
 158                                                 return -3;
 159                                         }
 160
 161                                         in += 3;
 162                                 }
 163                                 else
 164                                 {
 165                                         s->error_pos = s->off + (in - buf);
 166                                         return -2;
 167                                 }
 168                         }
 169
 170                         /* \377, \77 or \7 */
 171                         else if (in[0] >= '0' && in[0] <= '7')
 172                         {
 173                                 /* \377 */
 174                                 if (in[1] >= '0' && in[1] <= '7' &&
 175                                     in[2] >= '0' && in[2] <= '7')
 176                                 {
 177                                         code = dec(in[0]) * 8 * 8 +
 178                                                dec(in[1]) * 8 +
 179                                                dec(in[2]);
 180
 181                                         if (code > 255)
 182                                         {
 183                                                 s->error_pos = s->off + (in - buf);
 184                                                 return -2;
 185                                         }
 186
 187                                         if (!utf8enc(&out, &rem, code))
 188                                         {
 189                                                 s->error_pos = s->off + (in - buf);
 190                                                 return -3;
 191                                         }
 192
 193                                         in += 3;
 194                                 }
 195
 196                                 /* \77 */
 197                                 else if (in[1] >= '0' && in[1] <= '7')
 198                                 {
 199                                         if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
 200                                         {
 201                                                 s->error_pos = s->off + (in - buf);
 202                                                 return -3;
 203                                         }
 204
 205                                         in += 2;
 206                                 }
 207
 208                                 /* \7 */
 209                                 else
 210                                 {
 211                                         if (!utf8enc(&out, &rem, dec(in[0])))
 212                                         {
 213                                                 s->error_pos = s->off + (in - buf);
 214                                                 return -3;
 215                                         }
 216
 217                                         in += 1;
 218                                 }
 219                         }
 220
 221                         /* single character escape */
 222                         else
 223                         {
 224                                 if (rem-- < 1)
 225                                 {
 226                                         s->error_pos = s->off + (in - buf);
 227                                         return -3;
 228                                 }
 229
 230                                 switch (in[0])
 231                                 {
 232                                 case 'a': *out = '\a'; break;
 233                                 case 'b': *out = '\b'; break;
 234                                 case 'e': *out = '\e'; break;
 235                                 case 'f': *out = '\f'; break;
 236                                 case 'n': *out = '\n'; break;
 237                                 case 'r': *out = '\r'; break;
 238                                 case 't': *out = '\t'; break;
 239                                 case 'v': *out = '\v'; break;
 240                                 default:
 241                                         /* in regexp mode, retain backslash */
 242                                         if (q == '/')
 243                                         {
 244                                                 if (rem-- < 1)
 245                                                 {
 246                                                         s->error_pos = s->off + (in - buf);
 247                                                         return -3;
 248                                                 }
 249
 250                                                 *out++ = '\\';
 251                                         }
 252
 253                                         *out = *in;
 254                                         break;
 255                                 }
 256
 257                                 in++;
 258                                 out++;
 259                         }
 260
 261                         esc = false;
 262                 }
 263
 264                 /* begin of escape sequence */
 265                 else if (*in == '\\')
 266                 {
 267                         in++;
 268                         esc = true;
 269                 }
 270
 271                 /* terminating quote */
 272                 else if (*in == q)
 273                 {
 274                         op->str = strdup(str);
 275                         return (in - buf) + 2;
 276                 }
 277
 278                 /* ordinary char */
 279                 else
 280                 {
 281                         if (rem-- < 1)
 282                         {
 283                                 s->error_pos = s->off + (in - buf);
 284                                 return -3;
 285                         }
 286
 287                         *out++ = *in++;
 288                 }
 289         }
 290
 291         return -1;
 292 }
 293
 294
 295 /*
 296  * Parses a regexp literal from the given buffer.
 297  *
 298  * Returns a negative value on error, otherwise the amount of consumed
 299  * characters from the given buffer.
 300  *
 301  * Error values:
 302  *  -1  Unterminated regexp
 303  *  -2  Invalid escape sequence
 304  *  -3  Regexp literal too long
 305  */
 306
 307 static int
 308 parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s)
 309 {
 310         int len = parse_string(buf, op, s);
 311         const char *p;
 312
 313         if (len >= 2)
 314         {
 315                 op->num = REG_NOSUB | REG_NEWLINE;
 316
 317                 for (p = buf + len; p; p++)
 318                 {
 319                         switch (*p)
 320                         {
 321                         case 'e':
 322                                 op->num |= REG_EXTENDED;
 323                                 len++;
 324                                 break;
 325
 326                         case 'i':
 327                                 op->num |= REG_ICASE;
 328                                 len++;
 329                                 break;
 330
 331                         case 's':
 332                                 op->num &= ~REG_NEWLINE;
 333                                 len++;
 334                                 break;
 335
 336                         default:
 337                                 return len;
 338                         }
 339                 }
 340
 341         }
 342
 343         return len;
 344 }
 345
 346
 347 /*
 348  * Parses a label from the given buffer.
 349  *
 350  * Returns a negative value on error, otherwise the amount of consumed
 351  * characters from the given buffer.
 352  *
 353  * Error values:
 354  *  -3  Label too long
 355  */
 356
 357 static int
 358 parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
 359 {
 360         char str[128] = { 0 };
 361         char *out = str;
 362         const char *in = buf;
 363         int rem = sizeof(str) - 1;
 364
 365         while (*in == '_' || isalnum(*in))
 366         {
 367                 if (rem-- < 1)
 368                 {
 369                         s->error_pos = s->off + (in - buf);
 370                         return -3;
 371                 }
 372
 373                 *out++ = *in++;
 374         }
 375
 376         if (!strcmp(str, "true") || !strcmp(str, "false"))
 377         {
 378                 op->num = (str[0] == 't');
 379                 op->type = T_BOOL;
 380         }
 381         else
 382         {
 383                 op->str = strdup(str);
 384         }
 385
 386         return (in - buf);
 387 }
 388
 389
 390 /*
 391  * Parses a number literal from the given buffer.
 392  *
 393  * Returns a negative value on error, otherwise the amount of consumed
 394  * characters from the given buffer.
 395  *
 396  * Error values:
 397  *  -2  Invalid number character
 398  */
 399
 400 static int
 401 parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
 402 {
 403         char *e;
 404         int n = strtol(buf, &e, 10);
 405
 406         if (e == buf)
 407         {
 408                 s->error_pos = s->off;
 409                 return -2;
 410         }
 411
 412         op->num = n;
 413
 414         return (e - buf);
 415 }
 416
 417 static const struct token tokens[] = {
 418         { 0,                    " ",     1 },
 419         { 0,                    "\t",    1 },
 420         { 0,                    "\n",    1 },
 421         { T_LE,                 "<=",    2 },
 422         { T_GE,                 ">=",    2 },
 423         { T_NE,                 "!=",    2 },
 424         { T_AND,                "&&",    2 },
 425         { T_OR,                 "||",    2 },
 426         { T_DOT,                ".",     1 },
 427         { T_BROPEN,             "[",     1 },
 428         { T_BRCLOSE,    "]",     1 },
 429         { T_POPEN,              "(",     1 },
 430         { T_PCLOSE,             ")",     1 },
 431         { T_UNION,              ",",     1 },
 432         { T_ROOT,               "$",     1 },
 433         { T_THIS,               "@",     1 },
 434         { T_LT,                 "<",     1 },
 435         { T_GT,                 ">",     1 },
 436         { T_EQ,                 "=",     1 },
 437         { T_MATCH,              "~",     1 },
 438         { T_NOT,                "!",     1 },
 439         { T_WILDCARD,   "*",     1 },
 440         { T_REGEXP,             "/",     1, parse_regexp },
 441         { T_STRING,             "'",     1, parse_string },
 442         { T_STRING,             "\"",    1, parse_string },
 443         { T_LABEL,              "_",     1, parse_label  },
 444         { T_LABEL,              "az",    0, parse_label  },
 445         { T_LABEL,              "AZ",    0, parse_label  },
 446         { T_NUMBER,             "-",     1, parse_number },
 447         { T_NUMBER,             "09",    0, parse_number },
 448 };
 449
 450 const char *tokennames[25] = {
 451         [0]                             = "End of file",
 452         [T_AND]                 = "'&&'",
 453         [T_OR]                  = "'||'",
 454         [T_UNION]               = "','",
 455         [T_EQ]                  = "'='",
 456         [T_NE]                  = "'!='",
 457         [T_GT]                  = "'>'",
 458         [T_GE]                  = "'>='",
 459         [T_LT]                  = "'<'",
 460         [T_LE]                  = "'<='",
 461         [T_MATCH]       = "'~'",
 462         [T_NOT]                 = "'!'",
 463         [T_LABEL]               = "Label",
 464         [T_ROOT]                = "'$'",
 465         [T_THIS]                = "'@'",
 466         [T_DOT]                 = "'.'",
 467         [T_WILDCARD]    = "'*'",
 468         [T_REGEXP]      = "/.../",
 469         [T_BROPEN]              = "'['",
 470         [T_BRCLOSE]             = "']'",
 471         [T_BOOL]                = "Bool",
 472         [T_NUMBER]              = "Number",
 473         [T_STRING]              = "String",
 474         [T_POPEN]               = "'('",
 475         [T_PCLOSE]              = "')'",
 476 };
 477
 478
 479 static int
 480 match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
 481 {
 482         int i;
 483         const struct token *tok;
 484
 485         for (i = 0, tok = &tokens[0];
 486              i < sizeof(tokens) / sizeof(tokens[0]);
 487                  i++, tok = &tokens[i])
 488         {
 489                 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
 490                     (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
 491                 {
 492                         op->type = tok->type;
 493
 494                         if (tok->parse)
 495                                 return tok->parse(ptr, op, s);
 496
 497                         return tok->plen;
 498                 }
 499         }
 500
 501         s->error_pos = s->off;
 502         return -4;
 503 }
 504
 505 struct jp_opcode *
 506 jp_get_token(struct jp_state *s, const char *input, int *mlen)
 507 {
 508         struct jp_opcode op = { 0 };
 509
 510         *mlen = match_token(input, &op, s);
 511
 512         if (*mlen < 0)
 513         {
 514                 s->error_code = *mlen;
 515                 return NULL;
 516         }
 517         else if (op.type == 0)
 518         {
 519                 return NULL;
 520         }
 521
 522         return jp_alloc_op(s, op.type, op.num, op.str, NULL);
 523 }