implement POSIX regexp support
[project/jsonpath.git] / lexer.c
1 /*
2 * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io>
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <stdbool.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <ctype.h>
21 #include <regex.h>
22
23 #include "ast.h"
24 #include "lexer.h"
25 #include "parser.h"
26
27
28 struct token {
29 int type;
30 const char *pat;
31 int plen;
32 int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
33 };
34
35 #define dec(o) \
36 ((o) - '0')
37
38 #define hex(x) \
39 (((x) >= 'a') ? (10 + (x) - 'a') : \
40 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
41
42 /*
43 * Stores the given codepoint as a utf8 multibyte sequence into the given
44 * output buffer and substracts the required amount of bytes from the given
45 * length pointer.
46 *
47 * Returns false if the multibyte sequence would not fit into the buffer,
48 * otherwise true.
49 */
50
51 static bool
52 utf8enc(char **out, int *rem, int code)
53 {
54 if (code > 0 && code <= 0x7F)
55 {
56 if (*rem < 1)
57 return false;
58
59 *(*out)++ = code; (*rem)--;
60 return true;
61 }
62 else if (code > 0 && code <= 0x7FF)
63 {
64 if (*rem < 2)
65 return false;
66
67 *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--;
68 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
69 return true;
70 }
71 else if (code > 0 && code <= 0xFFFF)
72 {
73 if (*rem < 3)
74 return false;
75
76 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
77 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--;
78 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
79 return true;
80 }
81 else if (code > 0 && code <= 0x10FFFF)
82 {
83 if (*rem < 4)
84 return false;
85
86 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
87 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
88 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--;
89 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--;
90 return true;
91 }
92
93 return true;
94 }
95
96
97 /*
98 * Parses a string literal from the given buffer.
99 *
100 * Returns a negative value on error, otherwise the amount of consumed
101 * characters from the given buffer.
102 *
103 * Error values:
104 * -1 Unterminated string
105 * -2 Invalid escape sequence
106 * -3 String literal too long
107 */
108
109 static int
110 parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
111 {
112 char q = *(buf++);
113 char str[128] = { 0 };
114 char *out = str;
115 const char *in = buf;
116 bool esc = false;
117 int rem = sizeof(str) - 1;
118 int code;
119
120 while (*in)
121 {
122 /* continuation of escape sequence */
123 if (esc)
124 {
125 /* \uFFFF */
126 if (in[0] == 'u')
127 {
128 if (isxdigit(in[1]) && isxdigit(in[2]) &&
129 isxdigit(in[3]) && isxdigit(in[4]))
130 {
131 if (!utf8enc(&out, &rem,
132 hex(in[1]) * 16 * 16 * 16 +
133 hex(in[2]) * 16 * 16 +
134 hex(in[3]) * 16 +
135 hex(in[4])))
136 {
137 s->error_pos = s->off + (in - buf);
138 return -3;
139 }
140
141 in += 5;
142 }
143 else
144 {
145 s->error_pos = s->off + (in - buf);
146 return -2;
147 }
148 }
149
150 /* \xFF */
151 else if (in[0] == 'x')
152 {
153 if (isxdigit(in[1]) && isxdigit(in[2]))
154 {
155 if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
156 {
157 s->error_pos = s->off + (in - buf);
158 return -3;
159 }
160
161 in += 3;
162 }
163 else
164 {
165 s->error_pos = s->off + (in - buf);
166 return -2;
167 }
168 }
169
170 /* \377, \77 or \7 */
171 else if (in[0] >= '0' && in[0] <= '7')
172 {
173 /* \377 */
174 if (in[1] >= '0' && in[1] <= '7' &&
175 in[2] >= '0' && in[2] <= '7')
176 {
177 code = dec(in[0]) * 8 * 8 +
178 dec(in[1]) * 8 +
179 dec(in[2]);
180
181 if (code > 255)
182 {
183 s->error_pos = s->off + (in - buf);
184 return -2;
185 }
186
187 if (!utf8enc(&out, &rem, code))
188 {
189 s->error_pos = s->off + (in - buf);
190 return -3;
191 }
192
193 in += 3;
194 }
195
196 /* \77 */
197 else if (in[1] >= '0' && in[1] <= '7')
198 {
199 if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
200 {
201 s->error_pos = s->off + (in - buf);
202 return -3;
203 }
204
205 in += 2;
206 }
207
208 /* \7 */
209 else
210 {
211 if (!utf8enc(&out, &rem, dec(in[0])))
212 {
213 s->error_pos = s->off + (in - buf);
214 return -3;
215 }
216
217 in += 1;
218 }
219 }
220
221 /* single character escape */
222 else
223 {
224 if (rem-- < 1)
225 {
226 s->error_pos = s->off + (in - buf);
227 return -3;
228 }
229
230 switch (in[0])
231 {
232 case 'a': *out = '\a'; break;
233 case 'b': *out = '\b'; break;
234 case 'e': *out = '\e'; break;
235 case 'f': *out = '\f'; break;
236 case 'n': *out = '\n'; break;
237 case 'r': *out = '\r'; break;
238 case 't': *out = '\t'; break;
239 case 'v': *out = '\v'; break;
240 default:
241 /* in regexp mode, retain backslash */
242 if (q == '/')
243 {
244 if (rem-- < 1)
245 {
246 s->error_pos = s->off + (in - buf);
247 return -3;
248 }
249
250 *out++ = '\\';
251 }
252
253 *out = *in;
254 break;
255 }
256
257 in++;
258 out++;
259 }
260
261 esc = false;
262 }
263
264 /* begin of escape sequence */
265 else if (*in == '\\')
266 {
267 in++;
268 esc = true;
269 }
270
271 /* terminating quote */
272 else if (*in == q)
273 {
274 op->str = strdup(str);
275 return (in - buf) + 2;
276 }
277
278 /* ordinary char */
279 else
280 {
281 if (rem-- < 1)
282 {
283 s->error_pos = s->off + (in - buf);
284 return -3;
285 }
286
287 *out++ = *in++;
288 }
289 }
290
291 return -1;
292 }
293
294
295 /*
296 * Parses a regexp literal from the given buffer.
297 *
298 * Returns a negative value on error, otherwise the amount of consumed
299 * characters from the given buffer.
300 *
301 * Error values:
302 * -1 Unterminated regexp
303 * -2 Invalid escape sequence
304 * -3 Regexp literal too long
305 */
306
307 static int
308 parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s)
309 {
310 int len = parse_string(buf, op, s);
311 const char *p;
312
313 if (len >= 2)
314 {
315 op->num = REG_NOSUB | REG_NEWLINE;
316
317 for (p = buf + len; p; p++)
318 {
319 switch (*p)
320 {
321 case 'e':
322 op->num |= REG_EXTENDED;
323 len++;
324 break;
325
326 case 'i':
327 op->num |= REG_ICASE;
328 len++;
329 break;
330
331 case 's':
332 op->num &= ~REG_NEWLINE;
333 len++;
334 break;
335
336 default:
337 return len;
338 }
339 }
340
341 }
342
343 return len;
344 }
345
346
347 /*
348 * Parses a label from the given buffer.
349 *
350 * Returns a negative value on error, otherwise the amount of consumed
351 * characters from the given buffer.
352 *
353 * Error values:
354 * -3 Label too long
355 */
356
357 static int
358 parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
359 {
360 char str[128] = { 0 };
361 char *out = str;
362 const char *in = buf;
363 int rem = sizeof(str) - 1;
364
365 while (*in == '_' || isalnum(*in))
366 {
367 if (rem-- < 1)
368 {
369 s->error_pos = s->off + (in - buf);
370 return -3;
371 }
372
373 *out++ = *in++;
374 }
375
376 if (!strcmp(str, "true") || !strcmp(str, "false"))
377 {
378 op->num = (str[0] == 't');
379 op->type = T_BOOL;
380 }
381 else
382 {
383 op->str = strdup(str);
384 }
385
386 return (in - buf);
387 }
388
389
390 /*
391 * Parses a number literal from the given buffer.
392 *
393 * Returns a negative value on error, otherwise the amount of consumed
394 * characters from the given buffer.
395 *
396 * Error values:
397 * -2 Invalid number character
398 */
399
400 static int
401 parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
402 {
403 char *e;
404 int n = strtol(buf, &e, 10);
405
406 if (e == buf)
407 {
408 s->error_pos = s->off;
409 return -2;
410 }
411
412 op->num = n;
413
414 return (e - buf);
415 }
416
417 static const struct token tokens[] = {
418 { 0, " ", 1 },
419 { 0, "\t", 1 },
420 { 0, "\n", 1 },
421 { T_LE, "<=", 2 },
422 { T_GE, ">=", 2 },
423 { T_NE, "!=", 2 },
424 { T_AND, "&&", 2 },
425 { T_OR, "||", 2 },
426 { T_DOT, ".", 1 },
427 { T_BROPEN, "[", 1 },
428 { T_BRCLOSE, "]", 1 },
429 { T_POPEN, "(", 1 },
430 { T_PCLOSE, ")", 1 },
431 { T_UNION, ",", 1 },
432 { T_ROOT, "$", 1 },
433 { T_THIS, "@", 1 },
434 { T_LT, "<", 1 },
435 { T_GT, ">", 1 },
436 { T_EQ, "=", 1 },
437 { T_MATCH, "~", 1 },
438 { T_NOT, "!", 1 },
439 { T_WILDCARD, "*", 1 },
440 { T_REGEXP, "/", 1, parse_regexp },
441 { T_STRING, "'", 1, parse_string },
442 { T_STRING, "\"", 1, parse_string },
443 { T_LABEL, "_", 1, parse_label },
444 { T_LABEL, "az", 0, parse_label },
445 { T_LABEL, "AZ", 0, parse_label },
446 { T_NUMBER, "-", 1, parse_number },
447 { T_NUMBER, "09", 0, parse_number },
448 };
449
450 const char *tokennames[25] = {
451 [0] = "End of file",
452 [T_AND] = "'&&'",
453 [T_OR] = "'||'",
454 [T_UNION] = "','",
455 [T_EQ] = "'='",
456 [T_NE] = "'!='",
457 [T_GT] = "'>'",
458 [T_GE] = "'>='",
459 [T_LT] = "'<'",
460 [T_LE] = "'<='",
461 [T_MATCH] = "'~'",
462 [T_NOT] = "'!'",
463 [T_LABEL] = "Label",
464 [T_ROOT] = "'$'",
465 [T_THIS] = "'@'",
466 [T_DOT] = "'.'",
467 [T_WILDCARD] = "'*'",
468 [T_REGEXP] = "/.../",
469 [T_BROPEN] = "'['",
470 [T_BRCLOSE] = "']'",
471 [T_BOOL] = "Bool",
472 [T_NUMBER] = "Number",
473 [T_STRING] = "String",
474 [T_POPEN] = "'('",
475 [T_PCLOSE] = "')'",
476 };
477
478
479 static int
480 match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
481 {
482 int i;
483 const struct token *tok;
484
485 for (i = 0, tok = &tokens[0];
486 i < sizeof(tokens) / sizeof(tokens[0]);
487 i++, tok = &tokens[i])
488 {
489 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
490 (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
491 {
492 op->type = tok->type;
493
494 if (tok->parse)
495 return tok->parse(ptr, op, s);
496
497 return tok->plen;
498 }
499 }
500
501 s->error_pos = s->off;
502 return -4;
503 }
504
505 struct jp_opcode *
506 jp_get_token(struct jp_state *s, const char *input, int *mlen)
507 {
508 struct jp_opcode op = { 0 };
509
510 *mlen = match_token(input, &op, s);
511
512 if (*mlen < 0)
513 {
514 s->error_code = *mlen;
515 return NULL;
516 }
517 else if (op.type == 0)
518 {
519 return NULL;
520 }
521
522 return jp_alloc_op(s, op.type, op.num, op.str, NULL);
523 }