Line data Source code
1 : /* $OpenBSD: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $ */
2 :
3 : /*
4 : * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 : * Copyright (C) 1994-2015 Lua.org, PUC-Rio.
6 : *
7 : * Permission is hereby granted, free of charge, to any person obtaining
8 : * a copy of this software and associated documentation files (the
9 : * "Software"), to deal in the Software without restriction, including
10 : * without limitation the rights to use, copy, modify, merge, publish,
11 : * distribute, sublicense, and/or sell copies of the Software, and to
12 : * permit persons to whom the Software is furnished to do so, subject to
13 : * the following conditions:
14 : *
15 : * The above copyright notice and this permission notice shall be
16 : * included in all copies or substantial portions of the Software.
17 : *
18 : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 : * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 : * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 : * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 : * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 : */
26 :
27 : /*
28 : * Derived from Lua 5.3.1:
29 : * $Id: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $
30 : * Standard library for string operations and pattern-matching
31 : */
32 :
33 : #include <sys/types.h>
34 : #include <ctype.h>
35 : #include <errno.h>
36 : #include <stddef.h>
37 : #include <stdlib.h>
38 : #include <string.h>
39 :
40 : #include "patterns.h"
41 : #include "lwan.h"
42 :
43 : #define uchar(c) ((unsigned char)(c)) /* macro to 'unsign' a char */
44 : #define CAP_UNFINISHED (-1)
45 : #define CAP_POSITION (-2)
46 : #define L_ESC '%'
47 : #define SPECIALS "^$*+?.([%-"
48 :
49 : struct match_state {
50 : int matchdepth; /* control for recursive depth (to avoid C
51 : * stack overflow) */
52 : int repetitioncounter; /* control the repetition items */
53 : int maxcaptures; /* configured capture limit */
54 : const char *src_init; /* init of source string */
55 : const char *src_end; /* end ('\0') of source string */
56 : const char *p_end; /* end ('\0') of pattern */
57 : const char *error; /* should be NULL */
58 : int level; /* total number of captures (finished or
59 : * unfinished) */
60 : struct {
61 : const char *init;
62 : ptrdiff_t len;
63 : } capture[MAXCAPTURES];
64 : };
65 :
66 : /* recursive function */
67 : static const char *match(struct match_state *, const char *, const char *);
68 :
69 : static int
70 0 : match_error(struct match_state *ms, const char *error)
71 : {
72 0 : ms->error = ms->error == NULL ? error : ms->error;
73 0 : return (-1);
74 : }
75 :
76 : static int
77 0 : check_capture(struct match_state *ms, int l)
78 : {
79 0 : l -= '1';
80 0 : if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
81 0 : return match_error(ms, "invalid capture index");
82 0 : return (l);
83 : }
84 :
85 : static int
86 37 : capture_to_close(struct match_state *ms)
87 : {
88 37 : int level = ms->level;
89 37 : for (level--; level >= 0; level--)
90 37 : if (ms->capture[level].len == CAP_UNFINISHED)
91 37 : return (level);
92 0 : return match_error(ms, "invalid pattern capture");
93 : }
94 :
95 : static const char *
96 589 : classend(struct match_state *ms, const char *p)
97 : {
98 589 : switch (*p++) {
99 66 : case L_ESC:
100 66 : if (p == ms->p_end)
101 0 : match_error(ms,
102 : "malformed pattern (ends with '%')");
103 66 : return p + 1;
104 0 : case '[':
105 0 : if (*p == '^')
106 0 : p++;
107 : do {
108 : /* look for a ']' */
109 0 : if (p == ms->p_end) {
110 0 : match_error(ms,
111 : "malformed pattern (missing ']')");
112 0 : break;
113 : }
114 0 : if (*(p++) == L_ESC && p < ms->p_end) {
115 : /* skip escapes (e.g. '%]') */
116 0 : p++;
117 : }
118 0 : } while (*p != ']');
119 0 : return p + 1;
120 523 : default:
121 523 : return p;
122 : }
123 : }
124 :
125 : static int
126 99 : match_class(int c, int cl)
127 : {
128 : int res;
129 99 : switch (tolower(cl)) {
130 1 : case 'a':
131 1 : res = isalpha(c);
132 1 : break;
133 0 : case 'c':
134 0 : res = iscntrl(c);
135 0 : break;
136 98 : case 'd':
137 98 : res = isdigit(c);
138 98 : break;
139 0 : case 'g':
140 0 : res = isgraph(c);
141 0 : break;
142 0 : case 'l':
143 0 : res = islower(c);
144 0 : break;
145 0 : case 'p':
146 0 : res = ispunct(c);
147 0 : break;
148 0 : case 's':
149 0 : res = isspace(c);
150 0 : break;
151 0 : case 'u':
152 0 : res = isupper(c);
153 0 : break;
154 0 : case 'w':
155 0 : res = isalnum(c);
156 0 : break;
157 0 : case 'x':
158 0 : res = isxdigit(c);
159 0 : break;
160 0 : default:
161 0 : return (cl == c);
162 : }
163 99 : return (islower(cl) ? res : !res);
164 : }
165 :
166 : static int
167 0 : matchbracketclass(int c, const char *p, const char *ec)
168 : {
169 0 : int sig = 1;
170 0 : if (*(p + 1) == '^') {
171 0 : sig = 0;
172 : /* skip the '^' */
173 0 : p++;
174 : }
175 0 : while (++p < ec) {
176 0 : if (*p == L_ESC) {
177 0 : p++;
178 0 : if (match_class(c, uchar(*p)))
179 0 : return sig;
180 0 : } else if ((*(p + 1) == '-') && (p + 2 < ec)) {
181 0 : p += 2;
182 0 : if (uchar(*(p - 2)) <= c && c <= uchar(*p))
183 0 : return sig;
184 0 : } else if (uchar(*p) == c)
185 0 : return sig;
186 : }
187 0 : return !sig;
188 : }
189 :
190 : static int
191 635 : singlematch(struct match_state *ms, const char *s, const char *p,
192 : const char *ep)
193 : {
194 635 : if (s >= ms->src_end)
195 30 : return 0;
196 : else {
197 605 : int c = uchar(*s);
198 605 : switch (*p) {
199 37 : case '.':
200 : /* matches any char */
201 37 : return (1);
202 99 : case L_ESC:
203 99 : return match_class(c, uchar(*(p + 1)));
204 0 : case '[':
205 0 : return matchbracketclass(c, p, ep - 1);
206 469 : default:
207 469 : return (uchar(*p) == c);
208 : }
209 : }
210 : }
211 :
212 : static const char *
213 0 : matchbalance(struct match_state *ms, const char *s, const char *p)
214 : {
215 0 : if (p >= ms->p_end - 1) {
216 0 : match_error(ms,
217 : "malformed pattern (missing arguments to '%b')");
218 0 : return (NULL);
219 : }
220 0 : if (*s != *p)
221 0 : return (NULL);
222 : else {
223 0 : int b = *p;
224 0 : int e = *(p + 1);
225 0 : int cont = 1;
226 0 : while (++s < ms->src_end) {
227 0 : if (*s == e) {
228 0 : if (--cont == 0)
229 0 : return s + 1;
230 0 : } else if (*s == b)
231 0 : cont++;
232 : }
233 : }
234 :
235 : /* string ends out of balance */
236 0 : return (NULL);
237 : }
238 :
239 : static const char *
240 23 : max_expand(struct match_state *ms, const char *s, const char *p, const char *ep)
241 : {
242 23 : ptrdiff_t i = 0;
243 : /* counts maximum expand for item */
244 46 : while (singlematch(ms, s + i, p, ep))
245 23 : i++;
246 : /* keeps trying to match with the maximum repetitions */
247 50 : while (i >= 0) {
248 36 : const char *res = match(ms, (s + i), ep + 1);
249 36 : if (res)
250 9 : return res;
251 : /* else didn't match; reduce 1 repetition to try again */
252 27 : i--;
253 : }
254 14 : return NULL;
255 : }
256 :
257 : static const char *
258 0 : min_expand(struct match_state *ms, const char *s, const char *p, const char *ep)
259 : {
260 0 : for (;;) {
261 0 : const char *res = match(ms, s, ep + 1);
262 0 : if (res != NULL)
263 0 : return res;
264 0 : else if (singlematch(ms, s, p, ep))
265 0 : s++; /* try with one more repetition */
266 : else
267 0 : return NULL;
268 : }
269 : }
270 :
271 : static const char *
272 66 : start_capture(struct match_state *ms, const char *s, const char *p, int what)
273 : {
274 : const char *res;
275 :
276 66 : int level = ms->level;
277 66 : if (level >= ms->maxcaptures) {
278 0 : match_error(ms, "too many captures");
279 0 : return (NULL);
280 : }
281 66 : ms->capture[level].init = s;
282 66 : ms->capture[level].len = what;
283 66 : ms->level = level + 1;
284 : /* undo capture if match failed */
285 66 : if ((res = match(ms, s, p)) == NULL)
286 56 : ms->level--;
287 66 : return res;
288 : }
289 :
290 : static const char *
291 37 : end_capture(struct match_state *ms, const char *s, const char *p)
292 : {
293 37 : int l = capture_to_close(ms);
294 : const char *res;
295 37 : if (l == -1)
296 0 : return NULL;
297 : /* close capture */
298 37 : ms->capture[l].len = s - ms->capture[l].init;
299 : /* undo capture if match failed */
300 37 : if ((res = match(ms, s, p)) == NULL)
301 27 : ms->capture[l].len = CAP_UNFINISHED;
302 37 : return res;
303 : }
304 :
305 : static const char *
306 0 : match_capture(struct match_state *ms, const char *s, int l)
307 : {
308 : size_t len;
309 0 : l = check_capture(ms, l);
310 0 : if (l == -1)
311 0 : return NULL;
312 0 : len = (size_t)ms->capture[l].len;
313 0 : if ((size_t) (ms->src_end - s) >= len &&
314 0 : memcmp(ms->capture[l].init, s, len) == 0)
315 0 : return s + len;
316 : else
317 0 : return NULL;
318 : }
319 :
320 : static const char *
321 374 : match(struct match_state *ms, const char *s, const char *p)
322 : {
323 : const char *ep, *res;
324 : char previous;
325 :
326 374 : if (UNLIKELY(ms->matchdepth-- == 0)) {
327 0 : match_error(ms, "pattern too complex");
328 0 : return (NULL);
329 : }
330 :
331 : /* using goto's to optimize tail recursion */
332 735 : init:
333 : /* end of pattern? */
334 735 : if (p != ms->p_end) {
335 692 : switch (*p) {
336 66 : case '(':
337 : /* start capture */
338 66 : if (*(p + 1) == ')')
339 : /* position capture? */
340 0 : s = start_capture(ms, s, p + 2, CAP_POSITION);
341 : else
342 66 : s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
343 66 : break;
344 37 : case ')':
345 : /* end capture */
346 37 : s = end_capture(ms, s, p + 1);
347 37 : break;
348 0 : case '$':
349 : /* is the '$' the last char in pattern? */
350 0 : if ((p + 1) != ms->p_end) {
351 : /* no; go to default */
352 0 : goto dflt;
353 : }
354 : /* check end of string */
355 0 : s = (s == ms->src_end) ? s : NULL;
356 0 : break;
357 66 : case L_ESC:
358 : /* escaped sequences not in the format class[*+?-]? */
359 66 : switch (*(p + 1)) {
360 0 : case 'b':
361 : /* balanced string? */
362 0 : s = matchbalance(ms, s, p + 2);
363 0 : if (s != NULL) {
364 0 : p += 4;
365 : /* return match(ms, s, p + 4); */
366 0 : goto init;
367 : } /* else fail (s == NULL) */
368 0 : break;
369 0 : case 'f':
370 : /* frontier? */
371 0 : p += 2;
372 0 : if (*p != '[') {
373 0 : match_error(ms, "missing '['"
374 : " after '%f' in pattern");
375 0 : break;
376 : }
377 : /* points to what is next */
378 0 : ep = classend(ms, p);
379 0 : if (ms->error != NULL)
380 0 : break;
381 0 : previous =
382 0 : (s == ms->src_init) ? '\0' : *(s - 1);
383 0 : if (!matchbracketclass(uchar(previous),
384 0 : p, ep - 1) &&
385 0 : matchbracketclass(uchar(*s),
386 : p, ep - 1)) {
387 0 : p = ep;
388 : /* return match(ms, s, ep); */
389 0 : goto init;
390 : }
391 : /* match failed */
392 0 : s = NULL;
393 0 : break;
394 0 : case '0' ... '9':
395 : /* capture results (%0-%9)? */
396 0 : s = match_capture(ms, s, uchar(*(p + 1)));
397 0 : if (s != NULL) {
398 0 : p += 2;
399 : /* return match(ms, s, p + 2) */
400 0 : goto init;
401 : }
402 0 : break;
403 66 : default:
404 66 : goto dflt;
405 : }
406 0 : break;
407 : default:
408 :
409 : /* pattern class plus optional suffix */
410 589 : dflt:
411 : /* points to optional suffix */
412 589 : ep = classend(ms, p);
413 589 : if (ms->error != NULL)
414 0 : break;
415 :
416 : /* does not match at least once? */
417 589 : if (!singlematch(ms, s, p, ep)) {
418 205 : if (ms->repetitioncounter-- == 0) {
419 0 : match_error(ms, "max repetition items");
420 0 : s = NULL; /* fail */
421 : /* accept empty? */
422 : } else if
423 205 : (*ep == '*' || *ep == '?' || *ep == '-') {
424 0 : p = ep + 1;
425 : /* return match(ms, s, ep + 1); */
426 0 : goto init;
427 : } else {
428 : /* '+' or no suffix */
429 205 : s = NULL; /* fail */
430 : }
431 : } else {
432 : /* matched once */
433 : /* handle optional suffix */
434 384 : switch (*ep) {
435 0 : case '?':
436 : /* optional */
437 0 : if ((res =
438 0 : match(ms, s + 1, ep + 1)) != NULL)
439 0 : s = res;
440 : else {
441 : /*
442 : * else return
443 : * match(ms, s, ep + 1);
444 : */
445 0 : p = ep + 1;
446 0 : goto init;
447 : }
448 0 : break;
449 23 : case '+':
450 : /* 1 or more repetitions */
451 23 : s++; /* 1 match already done */
452 : /* FALLTHROUGH */
453 23 : case '*':
454 : /* 0 or more repetitions */
455 23 : s = max_expand(ms, s, p, ep);
456 23 : break;
457 0 : case '-':
458 : /* 0 or more repetitions (minimum) */
459 0 : s = min_expand(ms, s, p, ep);
460 0 : break;
461 361 : default:
462 : /* no suffix */
463 361 : s++;
464 361 : p = ep;
465 : /* return match(ms, s + 1, ep); */
466 361 : goto init;
467 : }
468 : }
469 228 : break;
470 : }
471 : }
472 374 : ms->matchdepth++;
473 374 : return s;
474 : }
475 :
476 : static const char *
477 1 : lmemfind(const char *s1, size_t l1, const char *s2, size_t l2)
478 : {
479 : const char *init;
480 :
481 1 : if (l2 == 0)
482 0 : return (s1); /* empty strings are everywhere */
483 1 : if (l2 > l1)
484 0 : return (NULL); /* avoids a negative 'l1' */
485 : /*
486 : * to search for a '*s2' inside 's1'
487 : * - 1st char will be checked by 'memchr'
488 : * - 's2' cannot be found after that
489 : */
490 1 : l2--;
491 1 : l1 = l1 - l2;
492 1 : while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
493 : /* 1st char is already checked */
494 1 : init++;
495 1 : if (memcmp(init, s2 + 1, l2) == 0)
496 1 : return init - 1;
497 :
498 : /* correct 'l1' and 's1' to try again */
499 0 : l1 -= (size_t)(init - s1);
500 0 : s1 = init;
501 : }
502 : /* not found */
503 0 : return (NULL);
504 : }
505 :
506 : static int
507 47 : push_onecapture(struct match_state *ms, int i, const char *s,
508 : const char *e, struct str_find *sm)
509 : {
510 47 : if (i >= ms->level) {
511 37 : if (i == 0 || ms->level == 0) {
512 : /* add whole match */
513 37 : sm->sm_so = (off_t)(s - ms->src_init);
514 37 : sm->sm_eo = (off_t)(e - s) + sm->sm_so;
515 : } else
516 0 : return match_error(ms, "invalid capture index");
517 : } else {
518 10 : ptrdiff_t l = ms->capture[i].len;
519 10 : if (l == CAP_UNFINISHED)
520 0 : return match_error(ms, "unfinished capture");
521 10 : sm->sm_so = ms->capture[i].init - ms->src_init;
522 10 : sm->sm_eo = sm->sm_so + l;
523 : }
524 47 : sm->sm_eo = sm->sm_eo < sm->sm_so ? sm->sm_so : sm->sm_eo;
525 47 : return (0);
526 : }
527 :
528 : static int
529 43 : push_captures(struct match_state *ms, const char *s, const char *e,
530 : struct str_find *sm, size_t nsm)
531 : {
532 : int i;
533 43 : int nlevels = (ms->level <= 0 && s) ? 1 : ms->level;
534 :
535 43 : if (nlevels > (int)nsm)
536 0 : nlevels = (int)nsm;
537 90 : for (i = 0; i < nlevels; i++)
538 47 : if (push_onecapture(ms, i, s, e, sm + i) == -1)
539 0 : break;
540 :
541 : /* number of strings pushed */
542 43 : return (nlevels);
543 : }
544 :
545 : /* check whether pattern has no special characters */
546 : static int
547 58 : nospecials(const char *p, size_t l)
548 : {
549 58 : size_t upto = 0;
550 :
551 : do {
552 58 : if (strpbrk(p + upto, SPECIALS)) {
553 : /* pattern has a special character */
554 57 : return 0;
555 : }
556 : /* may have more after \0 */
557 1 : upto += strlen(p + upto) + 1;
558 1 : } while (upto <= l);
559 :
560 : /* no special chars found */
561 1 : return (1);
562 : }
563 :
564 : static int
565 58 : str_find_aux(struct match_state *ms, const char *pattern, const char *string,
566 : struct str_find *sm, size_t nsm, off_t init)
567 : {
568 58 : size_t ls = strlen(string);
569 58 : size_t lp = strlen(pattern);
570 58 : const char *s = string;
571 58 : const char *p = pattern;
572 : const char *s1, *s2;
573 : int anchor, i;
574 :
575 58 : if (init < 0)
576 0 : init = 0;
577 58 : else if (init > (off_t)ls)
578 0 : return match_error(ms, "starting after string's end");
579 58 : s1 = s + init;
580 :
581 58 : if (nospecials(p, lp)) {
582 : /* do a plain search */
583 1 : s2 = lmemfind(s1, ls - (size_t)init, p, lp);
584 1 : if (s2 == NULL)
585 0 : return (0);
586 :
587 1 : i = 0;
588 1 : sm[i].sm_so = 0;
589 1 : sm[i].sm_eo = (off_t)ls;
590 1 : if (nsm > 1) {
591 1 : i++;
592 1 : sm[i].sm_so = s2 - s;
593 1 : sm[i].sm_eo = (off_t)((s2 - s) + (off_t)lp);
594 : }
595 1 : return (i + 1);
596 : }
597 :
598 57 : anchor = (*p == '^');
599 57 : if (anchor) {
600 0 : p++;
601 0 : lp--; /* skip anchor character */
602 : }
603 57 : ms->maxcaptures = (int)((nsm > MAXCAPTURES ? MAXCAPTURES : nsm) - 1);
604 57 : ms->matchdepth = MAXCCALLS;
605 57 : ms->repetitioncounter = MAXREPETITION;
606 57 : ms->src_init = s;
607 57 : ms->src_end = s + ls;
608 57 : ms->p_end = p + lp;
609 : do {
610 : const char *res;
611 235 : ms->level = 0;
612 235 : if ((res = match(ms, s1, p)) != NULL) {
613 43 : sm->sm_so = 0;
614 43 : sm->sm_eo = (off_t)ls;
615 43 : return push_captures(ms, s1, res, sm + 1, nsm - 1) + 1;
616 :
617 192 : } else if (ms->error != NULL) {
618 0 : return 0;
619 : }
620 192 : } while (s1++ < ms->src_end && !anchor);
621 :
622 14 : return 0;
623 : }
624 :
625 : int
626 58 : str_find(const char *string, const char *pattern, struct str_find *sm,
627 : size_t nsm, const char **errstr)
628 : {
629 : struct match_state ms;
630 : int ret;
631 :
632 58 : memset(&ms, 0, sizeof(ms));
633 58 : memset(sm, 0, nsm * sizeof(*sm));
634 :
635 58 : ret = str_find_aux(&ms, pattern, string, sm, nsm, 0);
636 58 : if (ms.error != NULL) {
637 : /* Return 0 on error and store the error string */
638 0 : *errstr = ms.error;
639 0 : ret = 0;
640 : } else
641 58 : *errstr = NULL;
642 :
643 58 : return (ret);
644 : }
645 :
646 : int
647 0 : str_match(const char *string, const char *pattern, struct str_match *m,
648 : const char **errstr)
649 : {
650 : struct str_find sm[MAXCAPTURES];
651 : struct match_state ms;
652 : int i, ret;
653 : size_t len, nsm;
654 :
655 0 : nsm = MAXCAPTURES;
656 0 : memset(&ms, 0, sizeof(ms));
657 0 : memset(sm, 0, sizeof(sm));
658 0 : memset(m, 0, sizeof(*m));
659 :
660 0 : ret = str_find_aux(&ms, pattern, string, sm, nsm, 0);
661 0 : if (ret <= 0 || ms.error != NULL) {
662 : /* Return -1 on error and store the error string */
663 0 : *errstr = ms.error;
664 0 : return (-1);
665 : }
666 :
667 0 : if ((m->sm_match = calloc((size_t)ret, sizeof(char *))) == NULL) {
668 0 : *errstr = strerror(errno);
669 0 : return (-1);
670 : }
671 0 : m->sm_nmatch = ret;
672 :
673 0 : for (i = 0; i < ret; i++) {
674 0 : if (sm[i].sm_so > sm[i].sm_eo)
675 0 : continue;
676 0 : len = (size_t)(sm[i].sm_eo - sm[i].sm_so);
677 0 : if ((m->sm_match[i] = strndup(string +
678 0 : sm[i].sm_so, len)) == NULL) {
679 0 : *errstr = strerror(errno);
680 0 : str_match_free(m);
681 0 : return (-1);
682 : }
683 : }
684 :
685 0 : *errstr = NULL;
686 0 : return (0);
687 : }
688 :
689 : void
690 0 : str_match_free(struct str_match *m)
691 : {
692 : int i;
693 :
694 0 : for (i = 0; i < m->sm_nmatch; i++)
695 0 : free(m->sm_match[i]);
696 0 : free(m->sm_match);
697 0 : m->sm_match = NULL;
698 0 : m->sm_nmatch = 0;
699 0 : }
|