1 /* $Id: uwildmat.c 6779 2004-05-17 07:25:28Z rra $
\r
3 ** wildmat pattern matching with Unicode UTF-8 extensions.
\r
5 ** Do shell-style pattern matching for ?, \, [], and * characters. Might not
\r
6 ** be robust in face of malformed patterns; e.g., "foo[a-" could cause a
\r
7 ** segmentation violation. It is 8-bit clean. (Robustness hopefully fixed
\r
8 ** July 2000; all malformed patterns should now just fail to match anything.)
\r
10 ** Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
\r
11 ** Rich $alz is now <rsalz@osf.org>.
\r
13 ** April, 1991: Replaced mutually-recursive calls with in-line code for the
\r
16 ** Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.
\r
17 ** This can greatly speed up failing wildcard patterns. For example:
\r
19 ** pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
\r
20 ** text 1: -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
\r
21 ** text 2: -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1
\r
23 ** Text 1 matches with 51 calls, while text 2 fails with 54 calls. Without
\r
24 ** the ABORT code, it takes 22310 calls to fail. Ugh. The following
\r
25 ** explanation is from Lars:
\r
27 ** The precondition that must be fulfilled is that DoMatch will consume at
\r
28 ** least one character in text. This is true if *p is neither '*' nor '\0'.)
\r
29 ** The last return has ABORT instead of false to avoid quadratic behaviour in
\r
30 ** cases like pattern "*a*b*c*d" with text "abcxxxxx". With false, each
\r
31 ** star-loop has to run to the end of the text; with ABORT only the last one
\r
34 ** Once the control of one instance of DoMatch enters the star-loop, that
\r
35 ** instance will return either true or ABORT, and any calling instance will
\r
36 ** therefore return immediately after (without calling recursively again).
\r
37 ** In effect, only one star-loop is ever active. It would be possible to
\r
38 ** modify the code to maintain this context explicitly, eliminating all
\r
39 ** recursive calls at the cost of some complication and loss of clarity (and
\r
40 ** the ABORT stuff seems to be unclear enough by itself). I think it would
\r
41 ** be unwise to try to get this into a released version unless you have a
\r
42 ** good test data base to try it out on.
\r
44 ** June, 1991: Robert Elz <kre@munnari.oz.au> added minus and close bracket
\r
45 ** handling for character sets.
\r
47 ** July, 2000: Largely rewritten by Russ Allbery <rra@stanford.edu> to add
\r
48 ** support for ',', '!', and optionally '@' to the core wildmat routine.
\r
49 ** Broke the character class matching into a separate function for clarity
\r
50 ** since it's infrequently used in practice, and added some simple lookahead
\r
51 ** to significantly decrease the recursive calls in the '*' matching code.
\r
52 ** Added support for UTF-8 as the default character set for any high-bit
\r
55 ** For more information on UTF-8, see RFC 2279.
\r
57 ** Please note that this file is intentionally written so that conditionally
\r
58 ** executed expressions are on separate lines from the condition to
\r
59 ** facilitate analysis of the coverage of the test suite using purecov.
\r
60 ** Please preserve this. As of March 11, 2001, purecov reports that the
\r
61 ** accompanying test suite achieves 100% coverage of this file.
\r
64 //#include "config.h"
\r
65 //#include "clibrary.h"
\r
66 //#include "libinn.h"
\r
67 #include "../../include/nntp/uwildmat.h"
\r
72 /* Whether or not an octet looks like the start of a UTF-8 character. */
\r
73 #define ISUTF8(c) (((c) & 0xc0) == 0xc0)
\r
77 ** Determine the length of a non-ASCII character in octets (for advancing
\r
78 ** pointers when skipping over characters). Takes a pointer to the start of
\r
79 ** the character and to the last octet of the string. If end is NULL, expect
\r
80 ** the string pointed to by start to be nul-terminated. If the character is
\r
81 ** malformed UTF-8, return 1 to treat it like an eight-bit local character.
\r
84 utf8_length(const unsigned char *start, const unsigned char *end)
\r
86 unsigned char mask = 0x80;
\r
87 const unsigned char *p;
\r
91 for (; mask > 0 && (*start & mask) == mask; mask >>= 1)
\r
93 if (length < 2 || length > 6)
\r
95 if (end != NULL && (end - start + 1) < length)
\r
99 for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)
\r
101 return (left == 0) ? length : 1;
\r
106 ** Convert a UTF-8 character to UCS-4. Takes a pointer to the start of the
\r
107 ** character and to the last octet of the string, and to a uint32_t into
\r
108 ** which to put the decoded UCS-4 value. If end is NULL, expect the string
\r
109 ** pointed to by start to be nul-terminated. Returns the number of octets in
\r
110 ** the UTF-8 encoding. If the UTF-8 character is malformed, set result to
\r
111 ** the decimal value of the first octet; this is wrong, but it will generally
\r
112 ** cause the rest of the wildmat matching to do the right thing for non-UTF-8
\r
116 utf8_decode(const unsigned char *start, const unsigned char *end,
\r
119 uint32_t value = 0;
\r
121 const unsigned char *p = start;
\r
122 unsigned char mask;
\r
124 length = utf8_length(start, end);
\r
129 mask = (1 << (7 - length)) - 1;
\r
132 for (i = length - 1; i > 0; i--) {
\r
133 value = (value << 6) | (*p & 0x3f);
\r
142 ** Match a character class against text, a UCS-4 character. start is a
\r
143 ** pointer to the first character of the character class, end a pointer to
\r
144 ** the last. Returns whether the class matches that character.
\r
147 match_class(uint32_t text, const unsigned char *start,
\r
148 const unsigned char *end)
\r
150 bool reversed, allowrange;
\r
151 const unsigned char *p = start;
\r
152 uint32_t first, last;
\r
154 /* Check for an inverted character class (starting with ^). If the
\r
155 character matches the character class, we return !reversed; that way,
\r
156 we return true if it's a regular character class and false if it's a
\r
157 reversed one. If the character doesn't match, we return reversed. */
\r
158 reversed = (*p == '^');
\r
162 /* Walk through the character class until we reach the end or find a
\r
163 match, handling character ranges as we go. Only permit a range to
\r
164 start when allowrange is true; this allows - to be treated like a
\r
165 normal character as the first character of the class and catches
\r
166 malformed ranges like a-e-n. We treat the character at the beginning
\r
167 of a range as both a regular member of the class and the beginning of
\r
168 the range; this is harmless (although it means that malformed ranges
\r
169 like m-a will match m and nothing else). */
\r
170 allowrange = false;
\r
172 if (allowrange && *p == '-' && p < end) {
\r
174 p += utf8_decode(p, end, &last);
\r
175 if (text >= first && text <= last)
\r
177 allowrange = false;
\r
179 p += utf8_decode(p, end, &first);
\r
190 ** Match the text against the pattern between start and end. This is a
\r
191 ** single pattern; a leading ! or @ must already be taken care of, and
\r
192 ** commas must be dealt with outside of this routine.
\r
195 match_pattern(const unsigned char *text, const unsigned char *start,
\r
196 const unsigned char *end)
\r
198 const unsigned char *q, *endclass;
\r
199 const unsigned char *p = start;
\r
201 int matched, width;
\r
204 for (; p <= end; p++) {
\r
205 if (!*text && *p != '*')
\r
212 /* Fall through. */
\r
220 text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;
\r
224 /* Consecutive stars are equivalent to one. Advance pattern to
\r
225 the character after the star. */
\r
226 for (++p; *p == '*'; p++)
\r
229 /* A trailing star will match anything. */
\r
233 /* Basic algorithm: Recurse at each point where the * could
\r
234 possibly match. If the match succeeds or aborts, return
\r
235 immediately; otherwise, try the next position.
\r
237 Optimization: If the character after the * in the pattern
\r
238 isn't a metacharacter (the common case), then the * has to
\r
239 consume characters at least up to the next occurance of that
\r
240 character in the text. Scan forward for those points rather
\r
241 than recursing at every possible point to save the extra
\r
242 function call overhead. */
\r
243 ismeta = (*p == '[' || *p == '?' || *p == '\\');
\r
245 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
\r
247 matched = match_pattern(text, p, end);
\r
250 while (*text && *text != *p) {
\r
252 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
\r
256 matched = match_pattern(++text, p + 1, end);
\r
258 if (matched != false)
\r
264 /* Find the end of the character class, making sure not to pick
\r
265 up a close bracket at the beginning of the class. */
\r
267 q = p + (*p == '^') + 1;
\r
270 endclass = (unsigned char *)memchr(q, ']', (size_t) (end - q + 1));
\r
274 /* Do the heavy lifting in another function for clarity, since
\r
275 character classes are an uncommon case. */
\r
276 text += utf8_decode(text, NULL, &c);
\r
277 if (!match_class(c, p, endclass - 1))
\r
284 return (*text == '\0');
\r
289 ** Takes text and a wildmat expression; a wildmat expression is a
\r
290 ** comma-separated list of wildmat patterns, optionally preceeded by ! to
\r
291 ** invert the sense of the expression. Returns WILDMAT_MATCH if that
\r
292 ** expression matches the text, WILDMAT_FAIL otherwise. If allowpoison is
\r
293 ** set, allow @ to introduce a poison expression (the same as !, but if it
\r
294 ** triggers the failed match the routine returns WILDMAT_POISON instead).
\r
296 static enum uwildmat
\r
297 match_expression(const unsigned char *text, const unsigned char *start,
\r
300 const unsigned char *end, *split;
\r
301 const unsigned char *p = start;
\r
302 bool reverse, escaped;
\r
303 bool match = false;
\r
304 bool poison = false;
\r
305 bool poisoned = false;
\r
307 /* Handle the empty expression separately, since otherwise end will be
\r
308 set to an invalid pointer. */
\r
310 return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
\r
311 end = start + strlen((const char *) start) - 1;
\r
313 /* Main match loop. Find each comma that separates patterns, and attempt
\r
314 to match the text with each pattern in order. The last matching
\r
315 pattern determines whether the whole expression matches. */
\r
316 for (; p <= end + 1; p = split + 1) {
\r
318 poison = (*p == '@');
\r
319 reverse = (*p == '!') || poison;
\r
323 /* Find the first unescaped comma, if any. If there is none, split
\r
324 will be one greater than end and point at the nul at the end of
\r
326 for (escaped = false, split = p; split <= end; split++) {
\r
327 if (*split == '[') {
\r
331 while (split <= end && *split != ']')
\r
334 if (*split == ',' && !escaped)
\r
336 escaped = (*split == '\\') ? !escaped : false;
\r
339 /* Optimization: If match == !reverse and poison == poisoned, this
\r
340 pattern can't change the result, so don't do any work. */
\r
341 if (match == !reverse && poison == poisoned)
\r
343 if (match_pattern(text, p, split - 1) == true) {
\r
349 return UWILDMAT_POISON;
\r
350 return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;
\r
355 ** User-level routine used for wildmats where @ should be treated as a
\r
356 ** regular character.
\r
359 uwildmat(const char *text, const char *pat)
\r
361 const unsigned char *utext = (const unsigned char *) text;
\r
362 const unsigned char *upat = (const unsigned char *) pat;
\r
364 if (upat[0] == '*' && upat[1] == '\0')
\r
367 return (match_expression(utext, upat, false) == UWILDMAT_MATCH);
\r
372 ** User-level routine used for wildmats that support poison matches.
\r
375 uwildmat_poison(const char *text, const char *pat)
\r
377 const unsigned char *utext = (const unsigned char *) text;
\r
378 const unsigned char *upat = (const unsigned char *) pat;
\r
380 if (upat[0] == '*' && upat[1] == '\0')
\r
381 return UWILDMAT_MATCH;
\r
383 return match_expression(utext, upat, true);
\r
388 ** User-level routine for simple expressions (neither , nor ! are special).
\r
391 uwildmat_simple(const char *text, const char *pat)
\r
393 const unsigned char *utext = (const unsigned char *) text;
\r
394 const unsigned char *upat = (const unsigned char *) pat;
\r
397 if (upat[0] == '*' && upat[1] == '\0')
\r
400 length = strlen(pat);
\r
401 return (match_pattern(utext, upat, upat + length - 1) == true);
\r