libs/web: fix filtering of overlong utf8 sequences
[project/luci.git] / libs / web / src / template_utils.c
1 /*
2 * LuCI Template - Utility functions
3 *
4 * Copyright (C) 2010 Jo-Philipp Wich <xm@subsignal.org>
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19 #include "template_utils.h"
20
21 /* initialize a buffer object */
22 static struct template_buffer * buf_init(void)
23 {
24 struct template_buffer *buf;
25
26 buf = (struct template_buffer *)malloc(sizeof(struct template_buffer));
27
28 if (buf != NULL)
29 {
30 buf->fill = 0;
31 buf->size = 1024;
32 buf->data = (unsigned char *)malloc(buf->size);
33
34 if (buf->data != NULL)
35 {
36 buf->dptr = buf->data;
37 buf->data[0] = 0;
38
39 return buf;
40 }
41
42 free(buf);
43 }
44
45 return NULL;
46 }
47
48 /* grow buffer */
49 static int buf_grow(struct template_buffer *buf)
50 {
51 unsigned int off = (buf->dptr - buf->data);
52 unsigned char *data =
53 (unsigned char *)realloc(buf->data, buf->size + 1024);
54
55 if (data != NULL)
56 {
57 buf->data = data;
58 buf->dptr = data + off;
59 buf->size += 1024;
60
61 return buf->size;
62 }
63
64 return 0;
65 }
66
67 /* put one char into buffer object */
68 static int buf_putchar(struct template_buffer *buf, unsigned char c)
69 {
70 if( ((buf->fill + 1) >= buf->size) && !buf_grow(buf) )
71 return 0;
72
73 *(buf->dptr++) = c;
74 *(buf->dptr) = 0;
75
76 buf->fill++;
77 return 1;
78 }
79
80 /* append data to buffer */
81 static int buf_append(struct template_buffer *buf, unsigned char *s, int len)
82 {
83 while ((buf->fill + len + 1) >= buf->size)
84 {
85 if (!buf_grow(buf))
86 return 0;
87 }
88
89 memcpy(buf->dptr, s, len);
90 buf->fill += len;
91 buf->dptr += len;
92
93 *(buf->dptr) = 0;
94
95 return len;
96 }
97
98 /* destroy buffer object and return pointer to data */
99 static char * buf_destroy(struct template_buffer *buf)
100 {
101 unsigned char *data = buf->data;
102
103 free(buf);
104 return (char *)data;
105 }
106
107
/* Derive the total length (lead byte included) of the multi byte sequence
 * introduced by the given lead byte. Bytes that cannot start a multi byte
 * sequence (ASCII, continuation bytes, 0xFE and 0xFF) yield 1. */
static inline int mb_num_chars(unsigned char c)
{
	if (c < 0xC0)   /* 0xxxxxxx and 10xxxxxx */
		return 1;

	if (c < 0xE0)   /* 110xxxxx */
		return 2;

	if (c < 0xF0)   /* 1110xxxx */
		return 3;

	if (c < 0xF8)   /* 11110xxx */
		return 4;

	if (c < 0xFC)   /* 111110xx */
		return 5;

	if (c < 0xFE)   /* 1111110x */
		return 6;

	return 1;       /* 0xFE, 0xFF */
}
124
/* Tell whether the given byte has the form 10xxxxxx, i.e. whether it is a
 * continuation byte (0x80..0xBF). Returns 1 if so, 0 otherwise. */
static inline int mb_is_cont(unsigned char c)
{
	return ((c & 0xC0) == 0x80);
}
130
/* Test whether the byte sequence at the given pointer with the given
 * length is the shortest possible representation of its code point.
 *
 * s  points at the lead byte, at least n bytes must be readable
 * n  sequence length as announced by the lead byte (2..6)
 *
 * Returns 0 if the sequence matches the overlong pattern for its length
 * (all payload bits that would require this length are zero), 1 otherwise.
 * Lengths outside 2..6 are reported as shortest. */
static inline int mb_is_shortest(unsigned char *s, int n)
{
	switch (n)
	{
		case 2:
			/* 1100000x (10xxxxxx) */
			return !(((s[0] >> 1) == 0x60) &&
			         ((s[1] >> 6) == 0x02));

		case 3:
			/* 11100000 100xxxxx (10xxxxxx) */
			return !((s[0] == 0xE0) &&
			         ((s[1] >> 5) == 0x04) &&
			         ((s[2] >> 6) == 0x02));

		case 4:
			/* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
			return !((s[0] == 0xF0) &&
			         ((s[1] >> 4) == 0x08) &&
			         ((s[2] >> 6) == 0x02) &&
			         ((s[3] >> 6) == 0x02));

		case 5:
			/* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) */
			return !((s[0] == 0xF8) &&
			         ((s[1] >> 3) == 0x10) &&
			         ((s[2] >> 6) == 0x02) &&
			         ((s[3] >> 6) == 0x02) &&
			         ((s[4] >> 6) == 0x02));

		case 6:
			/* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
			 * the lead byte of a six byte sequence is 0xFC, not 0xF8 */
			return !((s[0] == 0xFC) &&
			         ((s[1] >> 2) == 0x20) &&
			         ((s[2] >> 6) == 0x02) &&
			         ((s[3] >> 6) == 0x02) &&
			         ((s[4] >> 6) == 0x02) &&
			         ((s[5] >> 6) == 0x02));

		default:
			break;
	}

	return 1;
}
175
/* Tell whether the sequence of length n at s is a three byte sequence
 * whose first byte is 0xED and whose second byte lies in 0xA0..0xBF,
 * which is the encoded form of an UTF-16 surrogate.
 * Returns 1 if so, 0 otherwise. */
static inline int mb_is_surrogate(unsigned char *s, int n)
{
	/* only three byte sequences can carry a surrogate */
	if (n != 3)
		return 0;

	if (s[0] != 0xED)
		return 0;

	return ((s[1] >= 0xA0) && (s[1] <= 0xBF));
}
182
/* Tell whether the sequence of length n at s is one of the two three byte
 * sequences EF BF BE or EF BF BF, which are treated as illegal code points.
 * Returns 1 if so, 0 otherwise. */
static inline int mb_is_illegal(unsigned char *s, int n)
{
	if (n != 3)
		return 0;

	if ((s[0] != 0xEF) || (s[1] != 0xBF))
		return 0;

	return ((s[2] >= 0xBE) && (s[2] <= 0xBF));
}
190
191
/* Scan one unit of the given source string, validate it as UTF-8 and store
 * the result in the given buffer object.
 *
 * s    in/out: points at the next unread byte; advanced past the consumed
 *      bytes on success, left unchanged on failure
 * l    number of readable bytes at *s, must be at least 1
 * buf  destination buffer
 *
 * A non-NUL ASCII byte or a well-formed sequence of two to four bytes is
 * copied verbatim. Anything else (NUL, stray continuation byte, truncated,
 * overlong, surrogate, illegal, five or six byte sequence) is replaced by a
 * single '?'.
 *
 * Returns the number of source bytes consumed (>= 1), or 0 if writing to
 * the buffer failed. */
static int _validate_utf8(unsigned char **s, int l, struct template_buffer *buf)
{
	unsigned char *ptr = *s;
	unsigned int o = 0, v, n;

	/* ascii byte without null */
	if ((*ptr >= 0x01) && (*ptr <= 0x7F))
	{
		if (!buf_putchar(buf, *ptr++))
			return 0;

		o = 1;
	}

	/* multi byte sequence */
	else if ((n = mb_num_chars(*ptr)) > 1)
	{
		/* count the lead byte plus the continuation bytes following it;
		 * stop after the n-th byte of the sequence so that a byte which
		 * follows a complete sequence is never counted as part of it,
		 * and never read beyond the l available bytes */
		for (v = 1;
		     (v < n) && (v < (unsigned int)l) && mb_is_cont(*(ptr+v));
		     v++);

		/* copy the sequence if it is complete and legal; five and six
		 * byte sequences are always invalid */
		if ((n <= 4) && (v == n) && mb_is_shortest(ptr, n) &&
		    !mb_is_surrogate(ptr, n) && !mb_is_illegal(ptr, n))
		{
			if (!buf_append(buf, ptr, n))
				return 0;
		}

		/* the found sequence is illegal, replace it */
		else
		{
			if (!buf_putchar(buf, '?'))
				return 0;
		}

		/* advance beyond the last byte that belongs to the sequence */
		o = v;
		ptr += v;
	}

	/* invalid byte (0x00, stray continuation byte, 0xFE, 0xFF) */
	else
	{
		if (!buf_putchar(buf, '?')) /* or 0xEF, 0xBF, 0xBD */
			return 0;

		o = 1;
		ptr++;
	}

	*s = ptr;
	return o;
}
265
266 /* sanitize given string and replace all invalid UTF-8 sequences with "?" */
267 char * sanitize_utf8(const char *s, unsigned int l)
268 {
269 struct template_buffer *buf = buf_init();
270 unsigned char *ptr = (unsigned char *)s;
271 unsigned int v, o;
272
273 if (!buf)
274 return NULL;
275
276 for (o = 0; o < l; o++)
277 {
278 /* ascii char */
279 if ((*ptr >= 0x01) && (*ptr <= 0x7F))
280 {
281 if (!buf_putchar(buf, *ptr++))
282 break;
283 }
284
285 /* invalid byte or multi byte sequence */
286 else
287 {
288 if (!(v = _validate_utf8(&ptr, l - o, buf)))
289 break;
290
291 o += (v - 1);
292 }
293 }
294
295 return buf_destroy(buf);
296 }
297
/* Sanitize the first l bytes of the given string for use as XML PCDATA:
 *  - strip bytes 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F and 0x7F
 *  - replace & ' " < > with a decimal character reference (&#NN;)
 *  - copy the remaining ASCII bytes verbatim
 *  - validate multi byte UTF-8 sequences, replacing invalid ones with "?"
 *
 * Processing stops early if a buffer write fails; whatever was collected up
 * to that point is returned.
 * Returns the NUL-terminated result, or NULL if the buffer object could
 * not be allocated. */
char * sanitize_pcdata(const char *s, unsigned int l)
{
	struct template_buffer *buf = buf_init();
	unsigned char *ptr = (unsigned char *)s;
	unsigned int o, v;
	char esq[8];
	int esl;

	if (!buf)
		return NULL;

	for (o = 0; o < l; o++)
	{
		/* Invalid XML bytes */
		if ((*ptr <= 0x08) ||
		    ((*ptr >= 0x0B) && (*ptr <= 0x0C)) ||
		    ((*ptr >= 0x0E) && (*ptr <= 0x1F)) ||
		    (*ptr == 0x7F))
		{
			ptr++;
		}

		/* Escapes */
		else if ((*ptr == 0x26) ||
		         (*ptr == 0x27) ||
		         (*ptr == 0x22) ||
		         (*ptr == 0x3C) ||
		         (*ptr == 0x3E))
		{
			esl = snprintf(esq, sizeof(esq), "&#%i;", *ptr);

			if (!buf_append(buf, (unsigned char *)esq, esl))
				break;

			ptr++;
		}

		/* ascii char */
		else if (*ptr <= 0x7F)
		{
			/* stop on a failed write like every other branch instead of
			 * silently dropping the character */
			if (!buf_putchar(buf, *ptr++))
				break;
		}

		/* multi byte sequence */
		else
		{
			if (!(v = _validate_utf8(&ptr, l - o, buf)))
				break;

			/* the loop increment accounts for the first consumed byte */
			o += (v - 1);
		}
	}

	return buf_destroy(buf);
}