lexer.c

   1 /*
   2  * Copyright (C) 2013-2014 Jo-Philipp Wich <jow@openwrt.org>
   3  *
   4  * Permission to use, copy, modify, and/or distribute this software for any
   5  * purpose with or without fee is hereby granted, provided that the above
   6  * copyright notice and this permission notice appear in all copies.
   7  *
   8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15  */
  16
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
  21
  22 #include "ast.h"
  23 #include "lexer.h"
  24 #include "parser.h"
  25
  26
  27 struct token {
  28         int type;
  29         const char *pat;
  30         int plen;
  31         int (*parse)(const char *buf, struct jp_opcode *op);
  32 };
  33
  34 #define dec(o) \
  35         ((o) - '0')
  36
  37 #define hex(x) \
  38         (((x) >= 'a') ? (10 + (x) - 'a') : \
  39                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
  40
  41 /*
  42  * Stores the given codepoint as a utf8 multibyte sequence into the given
  43  * output buffer and substracts the required amount of bytes from  the given
  44  * length pointer.
  45  *
  46  * Returns false if the multibyte sequence would not fit into the buffer,
  47  * otherwise true.
  48  */
  49
  50 static bool
  51 utf8enc(char **out, int *rem, int code)
  52 {
  53         if (code > 0 && code <= 0x7F)
  54         {
  55                 if (*rem < 1)
  56                         return false;
  57
  58                 *(*out++) = code; (*rem)--;
  59                 return true;
  60         }
  61         else if (code > 0 && code <= 0x7FF)
  62         {
  63                 if (*rem < 2)
  64                         return false;
  65
  66                 *(*out)++ = ((code >>  6) & 0x1F) | 0xC0; (*rem)--;
  67                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  68                 return true;
  69         }
  70         else if (code > 0 && code <= 0xFFFF)
  71         {
  72                 if (*rem < 3)
  73                         return false;
  74
  75                 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
  76                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
  77                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  78                 return true;
  79         }
  80         else if (code > 0 && code <= 0x10FFFF)
  81         {
  82                 if (*rem < 4)
  83                         return false;
  84
  85                 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
  86                 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
  87                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
  88                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
  89                 return true;
  90         }
  91
  92         return true;
  93 }
  94
  95
  96 /*
  97  * Parses a string literal from the given buffer.
  98  *
  99  * Returns a negative value on error, otherwise the amount of consumed
 100  * characters from the given buffer.
 101  *
 102  * Error values:
 103  *  -1  Unterminated string
 104  *  -2  Invalid escape sequence
 105  *  -3  String literal too long
 106  */
 107
 108 static int
 109 parse_string(const char *buf, struct jp_opcode *op)
 110 {
 111         char q = *(buf++);
 112         char str[128] = { 0 };
 113         char *out = str;
 114         const char *in = buf;
 115         bool esc = false;
 116         int rem = sizeof(str) - 1;
 117         int code;
 118
 119         while (*in)
 120         {
 121                 /* continuation of escape sequence */
 122                 if (esc)
 123                 {
 124                         /* \uFFFF */
 125                         if (in[0] == 'u')
 126                         {
 127                                 if (isxdigit(in[1]) && isxdigit(in[2]) &&
 128                                     isxdigit(in[3]) && isxdigit(in[4]))
 129                                 {
 130                                         if (!utf8enc(&out, &rem,
 131                                                      hex(in[1]) * 16 * 16 * 16 +
 132                                                      hex(in[2]) * 16 * 16 +
 133                                                      hex(in[3]) * 16 +
 134                                                      hex(in[4])))
 135                                                 return -3;
 136
 137                                         in += 5;
 138                                 }
 139                                 else
 140                                 {
 141                                         return -2;
 142                                 }
 143                         }
 144
 145                         /* \xFF */
 146                         else if (in[0] == 'x')
 147                         {
 148                                 if (isxdigit(in[1]) && isxdigit(in[2]))
 149                                 {
 150                                         if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
 151                                                 return -3;
 152
 153                                         in += 3;
 154                                 }
 155                                 else
 156                                 {
 157                                         return -2;
 158                                 }
 159                         }
 160
 161                         /* \377, \77 or \7 */
 162                         else if (in[0] >= '0' && in[0] <= '7')
 163                         {
 164                                 /* \377 */
 165                                 if (in[1] >= '0' && in[1] <= '7' &&
 166                                     in[2] >= '0' && in[2] <= '7')
 167                                 {
 168                                         code = dec(in[0]) * 8 * 8 +
 169                                                dec(in[1]) * 8 +
 170                                                dec(in[2]);
 171
 172                                         if (code > 255)
 173                                                 return -2;
 174
 175                                         if (!utf8enc(&out, &rem, code))
 176                                                 return -3;
 177
 178                                         in += 3;
 179                                 }
 180
 181                                 /* \77 */
 182                                 else if (in[1] >= '0' && in[1] <= '7')
 183                                 {
 184                                         if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
 185                                                 return -3;
 186
 187                                         in += 2;
 188                                 }
 189
 190                                 /* \7 */
 191                                 else
 192                                 {
 193                                         if (!utf8enc(&out, &rem, dec(in[0])))
 194                                                 return -3;
 195
 196                                         in += 1;
 197                                 }
 198                         }
 199
 200                         /* single character escape */
 201                         else
 202                         {
 203                                 if (rem-- < 1)
 204                                         return -3;
 205
 206                                 switch (in[0])
 207                                 {
 208                                 case 'a': *out = '\a'; break;
 209                                 case 'b': *out = '\b'; break;
 210                                 case 'e': *out = '\e'; break;
 211                                 case 'f': *out = '\f'; break;
 212                                 case 'n': *out = '\n'; break;
 213                                 case 'r': *out = '\r'; break;
 214                                 case 't': *out = '\t'; break;
 215                                 case 'v': *out = '\v'; break;
 216                                 default:  *out = *in; break;
 217                                 }
 218
 219                                 in++;
 220                                 out++;
 221                         }
 222
 223                         esc = false;
 224                 }
 225
 226                 /* begin of escape sequence */
 227                 else if (*in == '\\')
 228                 {
 229                         in++;
 230                         esc = true;
 231                 }
 232
 233                 /* terminating quote */
 234                 else if (*in == q)
 235                 {
 236                         op->str = strdup(str);
 237                         return (in - buf) + 2;
 238                 }
 239
 240                 /* ordinary char */
 241                 else
 242                 {
 243                         if (rem-- < 1)
 244                                 return -3;
 245
 246                         *out++ = *in++;
 247                 }
 248         }
 249
 250         return -1;
 251 }
 252
 253
 254 /*
 255  * Parses a label from the given buffer.
 256  *
 257  * Returns a negative value on error, otherwise the amount of consumed
 258  * characters from the given buffer.
 259  *
 260  * Error values:
 261  *  -3  Label too long
 262  */
 263
 264 static int
 265 parse_label(const char *buf, struct jp_opcode *op)
 266 {
 267         char str[128] = { 0 };
 268         char *out = str;
 269         const char *in = buf;
 270         int rem = sizeof(str) - 1;
 271
 272         while (*in == '_' || isalnum(*in))
 273         {
 274                 if (rem-- < 1)
 275                         return -3;
 276
 277                 *out++ = *in++;
 278         }
 279
 280         if (!strcmp(str, "true") || !strcmp(str, "false"))
 281         {
 282                 op->num = (str[0] == 't');
 283                 op->type = T_BOOL;
 284         }
 285         else
 286         {
 287                 op->str = strdup(str);
 288         }
 289
 290         return (in - buf);
 291 }
 292
 293
 294 /*
 295  * Parses a number literal from the given buffer.
 296  *
 297  * Returns a negative value on error, otherwise the amount of consumed
 298  * characters from the given buffer.
 299  *
 300  * Error values:
 301  *  -2  Invalid number character
 302  */
 303
 304 static int
 305 parse_number(const char *buf, struct jp_opcode *op)
 306 {
 307         char *e;
 308         int n = strtol(buf, &e, 10);
 309
 310         if (e == buf)
 311                 return -2;
 312
 313         op->num = n;
 314
 315         return (e - buf);
 316 }
 317
 318 static const struct token tokens[] = {
 319         { 0,                    " ",     1 },
 320         { 0,                    "\t",    1 },
 321         { 0,                    "\n",    1 },
 322         { T_LE,                 "<=",    2 },
 323         { T_GE,                 ">=",    2 },
 324         { T_NE,                 "!=",    2 },
 325         { T_AND,                "&&",    2 },
 326         { T_OR,                 "||",    2 },
 327         { T_DOT,                ".",     1 },
 328         { T_BROPEN,             "[",     1 },
 329         { T_BRCLOSE,    "]",     1 },
 330         { T_POPEN,              "(",     1 },
 331         { T_PCLOSE,             ")",     1 },
 332         { T_UNION,              ",",     1 },
 333         { T_ROOT,               "$",     1 },
 334         { T_THIS,               "@",     1 },
 335         { T_LT,                 "<",     1 },
 336         { T_GT,                 ">",     1 },
 337         { T_EQ,                 "=",     1 },
 338         { T_NOT,                "!",     1 },
 339         { T_WILDCARD,   "*",     1 },
 340         { T_STRING,             "'",     1, parse_string },
 341         { T_STRING,             "\"",    1, parse_string },
 342         { T_LABEL,              "_",     1, parse_label  },
 343         { T_LABEL,              "az",    0, parse_label  },
 344         { T_LABEL,              "AZ",    0, parse_label  },
 345         { T_NUMBER,             "-",     1, parse_number },
 346         { T_NUMBER,             "09",    0, parse_number },
 347 };
 348
 349 const char *tokennames[23] = {
 350         [0]                             = "End of file",
 351         [T_AND]                 = "'&&'",
 352         [T_OR]                  = "'||'",
 353         [T_UNION]               = "','",
 354         [T_EQ]                  = "'='",
 355         [T_NE]                  = "'!='",
 356         [T_GT]                  = "'>'",
 357         [T_GE]                  = "'>='",
 358         [T_LT]                  = "'<'",
 359         [T_LE]                  = "'<='",
 360         [T_NOT]                 = "'!'",
 361         [T_LABEL]               = "Label",
 362         [T_ROOT]                = "'$'",
 363         [T_THIS]                = "'@'",
 364         [T_DOT]                 = "'.'",
 365         [T_WILDCARD]    = "'*'",
 366         [T_BROPEN]              = "'['",
 367         [T_BRCLOSE]             = "']'",
 368         [T_BOOL]                = "Bool",
 369         [T_NUMBER]              = "Number",
 370         [T_STRING]              = "String",
 371         [T_POPEN]               = "'('",
 372         [T_PCLOSE]              = "')'",
 373 };
 374
 375
 376 static int
 377 match_token(const char *ptr, struct jp_opcode *op)
 378 {
 379         int i;
 380         const struct token *tok;
 381
 382         for (i = 0, tok = &tokens[0];
 383              i < sizeof(tokens) / sizeof(tokens[0]);
 384                  i++, tok = &tokens[i])
 385         {
 386                 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
 387                     (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
 388                 {
 389                         op->type = tok->type;
 390
 391                         if (tok->parse)
 392                                 return tok->parse(ptr, op);
 393
 394                         return tok->plen;
 395                 }
 396         }
 397
 398         return -1;
 399 }
 400
 401 struct jp_opcode *
 402 jp_get_token(struct jp_state *s, const char *input, int *mlen)
 403 {
 404         struct jp_opcode op = { 0 };
 405
 406         *mlen = match_token(input, &op);
 407
 408         if (*mlen < 0 || op.type == 0)
 409                 return NULL;
 410
 411         return jp_alloc_op(s, op.type, op.num, op.str, NULL);
 412 }