2 * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io>
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
32 int (*parse
)(const char *buf
, struct jp_opcode
*op
, struct jp_state
*s
);
39 (((x) >= 'a') ? (10 + (x) - 'a') : \
40 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
43 * Stores the given codepoint as a utf8 multibyte sequence into the given
44 * output buffer and subtracts the required amount of bytes from the given
47 * Returns false if the multibyte sequence would not fit into the buffer,
52 utf8enc(char **out
, int *rem
, int code
)
54 if (code
> 0 && code
<= 0x7F)
59 *(*out
)++ = code
; (*rem
)--;
62 else if (code
> 0 && code
<= 0x7FF)
67 *(*out
)++ = ((code
>> 6) & 0x1F) | 0xC0; (*rem
)--;
68 *(*out
)++ = ( code
& 0x3F) | 0x80; (*rem
)--;
71 else if (code
> 0 && code
<= 0xFFFF)
76 *(*out
)++ = ((code
>> 12) & 0x0F) | 0xE0; (*rem
)--;
77 *(*out
)++ = ((code
>> 6) & 0x3F) | 0x80; (*rem
)--;
78 *(*out
)++ = ( code
& 0x3F) | 0x80; (*rem
)--;
81 else if (code
> 0 && code
<= 0x10FFFF)
86 *(*out
)++ = ((code
>> 18) & 0x07) | 0xF0; (*rem
)--;
87 *(*out
)++ = ((code
>> 12) & 0x3F) | 0x80; (*rem
)--;
88 *(*out
)++ = ((code
>> 6) & 0x3F) | 0x80; (*rem
)--;
89 *(*out
)++ = ( code
& 0x3F) | 0x80; (*rem
)--;
98 * Parses a string literal from the given buffer.
100 * Returns a negative value on error, otherwise the amount of consumed
101 * characters from the given buffer.
104 * -1 Unterminated string
105 * -2 Invalid escape sequence
106 * -3 String literal too long
110 parse_string(const char *buf
, struct jp_opcode
*op
, struct jp_state
*s
)
113 char str
[128] = { 0 };
115 const char *in
= buf
;
117 int rem
= sizeof(str
) - 1;
122 /* continuation of escape sequence */
128 if (isxdigit(in
[1]) && isxdigit(in
[2]) &&
129 isxdigit(in
[3]) && isxdigit(in
[4]))
131 if (!utf8enc(&out
, &rem
,
132 hex(in
[1]) * 16 * 16 * 16 +
133 hex(in
[2]) * 16 * 16 +
137 s
->error_pos
= s
->off
+ (in
- buf
);
145 s
->error_pos
= s
->off
+ (in
- buf
);
151 else if (in
[0] == 'x')
153 if (isxdigit(in
[1]) && isxdigit(in
[2]))
155 if (!utf8enc(&out
, &rem
, hex(in
[1]) * 16 + hex(in
[2])))
157 s
->error_pos
= s
->off
+ (in
- buf
);
165 s
->error_pos
= s
->off
+ (in
- buf
);
170 /* \377, \77 or \7 */
171 else if (in
[0] >= '0' && in
[0] <= '7')
174 if (in
[1] >= '0' && in
[1] <= '7' &&
175 in
[2] >= '0' && in
[2] <= '7')
177 code
= dec(in
[0]) * 8 * 8 +
183 s
->error_pos
= s
->off
+ (in
- buf
);
187 if (!utf8enc(&out
, &rem
, code
))
189 s
->error_pos
= s
->off
+ (in
- buf
);
197 else if (in
[1] >= '0' && in
[1] <= '7')
199 if (!utf8enc(&out
, &rem
, dec(in
[0]) * 8 + dec(in
[1])))
201 s
->error_pos
= s
->off
+ (in
- buf
);
211 if (!utf8enc(&out
, &rem
, dec(in
[0])))
213 s
->error_pos
= s
->off
+ (in
- buf
);
221 /* single character escape */
226 s
->error_pos
= s
->off
+ (in
- buf
);
232 case 'a': *out
= '\a'; break;
233 case 'b': *out
= '\b'; break;
234 case 'e': *out
= '\e'; break;
235 case 'f': *out
= '\f'; break;
236 case 'n': *out
= '\n'; break;
237 case 'r': *out
= '\r'; break;
238 case 't': *out
= '\t'; break;
239 case 'v': *out
= '\v'; break;
241 /* in regexp mode, retain backslash */
246 s
->error_pos
= s
->off
+ (in
- buf
);
264 /* begin of escape sequence */
265 else if (*in
== '\\')
271 /* terminating quote */
274 op
->str
= strdup(str
);
275 return (in
- buf
) + 2;
283 s
->error_pos
= s
->off
+ (in
- buf
);
296 * Parses a regexp literal from the given buffer.
298 * Returns a negative value on error, otherwise the amount of consumed
299 * characters from the given buffer.
302 * -1 Unterminated regexp
303 * -2 Invalid escape sequence
304 * -3 Regexp literal too long
308 parse_regexp(const char *buf
, struct jp_opcode
*op
, struct jp_state
*s
)
310 int len
= parse_string(buf
, op
, s
);
315 op
->num
= REG_NOSUB
| REG_NEWLINE
;
317 for (p
= buf
+ len
; p
; p
++)
322 op
->num
|= REG_EXTENDED
;
327 op
->num
|= REG_ICASE
;
332 op
->num
&= ~REG_NEWLINE
;
348 * Parses a label from the given buffer.
350 * Returns a negative value on error, otherwise the amount of consumed
351 * characters from the given buffer.
358 parse_label(const char *buf
, struct jp_opcode
*op
, struct jp_state
*s
)
360 char str
[128] = { 0 };
362 const char *in
= buf
;
363 int rem
= sizeof(str
) - 1;
365 while (*in
== '_' || isalnum(*in
))
369 s
->error_pos
= s
->off
+ (in
- buf
);
376 if (!strcmp(str
, "true") || !strcmp(str
, "false"))
378 op
->num
= (str
[0] == 't');
383 op
->str
= strdup(str
);
391 * Parses a number literal from the given buffer.
393 * Returns a negative value on error, otherwise the amount of consumed
394 * characters from the given buffer.
397 * -2 Invalid number character
401 parse_number(const char *buf
, struct jp_opcode
*op
, struct jp_state
*s
)
404 int n
= strtol(buf
, &e
, 10);
408 s
->error_pos
= s
->off
;
417 static const struct token tokens
[] = {
427 { T_BROPEN
, "[", 1 },
428 { T_BRCLOSE
, "]", 1 },
430 { T_PCLOSE
, ")", 1 },
439 { T_WILDCARD
, "*", 1 },
440 { T_REGEXP
, "/", 1, parse_regexp
},
441 { T_STRING
, "'", 1, parse_string
},
442 { T_STRING
, "\"", 1, parse_string
},
443 { T_LABEL
, "_", 1, parse_label
},
444 { T_LABEL
, "az", 0, parse_label
},
445 { T_LABEL
, "AZ", 0, parse_label
},
446 { T_NUMBER
, "-", 1, parse_number
},
447 { T_NUMBER
, "09", 0, parse_number
},
450 const char *tokennames
[25] = {
467 [T_WILDCARD
] = "'*'",
468 [T_REGEXP
] = "/.../",
472 [T_NUMBER
] = "Number",
473 [T_STRING
] = "String",
480 match_token(const char *ptr
, struct jp_opcode
*op
, struct jp_state
*s
)
483 const struct token
*tok
;
485 for (i
= 0, tok
= &tokens
[0];
486 i
< sizeof(tokens
) / sizeof(tokens
[0]);
487 i
++, tok
= &tokens
[i
])
489 if ((tok
->plen
> 0 && !strncmp(ptr
, tok
->pat
, tok
->plen
)) ||
490 (tok
->plen
== 0 && *ptr
>= tok
->pat
[0] && *ptr
<= tok
->pat
[1]))
492 op
->type
= tok
->type
;
495 return tok
->parse(ptr
, op
, s
);
501 s
->error_pos
= s
->off
;
506 jp_get_token(struct jp_state
*s
, const char *input
, int *mlen
)
508 struct jp_opcode op
= { 0 };
510 *mlen
= match_token(input
, &op
, s
);
514 s
->error_code
= *mlen
;
517 else if (op
.type
== 0)
522 return jp_alloc_op(s
, op
.type
, op
.num
, op
.str
, NULL
);