nntp/uwildmat.cpp

   1 /*  $Id: uwildmat.c 6779 2004-05-17 07:25:28Z rra $\r
   2 **\r
   3 **  wildmat pattern matching with Unicode UTF-8 extensions.\r
   4 **\r
   5 **  Do shell-style pattern matching for ?, \, [], and * characters.  Might not\r
   6 **  be robust in face of malformed patterns; e.g., "foo[a-" could cause a\r
   7 **  segmentation violation.  It is 8-bit clean.  (Robustness hopefully fixed\r
   8 **  July 2000; all malformed patterns should now just fail to match anything.)\r
   9 **\r
  10 **  Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.\r
  11 **  Rich $alz is now <rsalz@osf.org>.\r
  12 **\r
  13 **  April, 1991:  Replaced mutually-recursive calls with in-line code for the\r
  14 **  star character.\r
  15 **\r
  16 **  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.\r
  17 **  This can greatly speed up failing wildcard patterns.  For example:\r
  18 **\r
  19 **      pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*\r
  20 **      text 1:  -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1\r
  21 **      text 2:  -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1\r
  22 **\r
  23 **  Text 1 matches with 51 calls, while text 2 fails with 54 calls.  Without\r
  24 **  the ABORT code, it takes 22310 calls to fail.  Ugh.  The following\r
  25 **  explanation is from Lars:\r
  26 **\r
  27 **  The precondition that must be fulfilled is that DoMatch will consume at\r
  28 **  least one character in text.  This is true if *p is neither '*' nor '\0'.)\r
  29 **  The last return has ABORT instead of false to avoid quadratic behaviour in\r
  30 **  cases like pattern "*a*b*c*d" with text "abcxxxxx".  With false, each\r
  31 **  star-loop has to run to the end of the text; with ABORT only the last one\r
  32 **  does.\r
  33 **\r
  34 **  Once the control of one instance of DoMatch enters the star-loop, that\r
  35 **  instance will return either true or ABORT, and any calling instance will\r
  36 **  therefore return immediately after (without calling recursively again).\r
  37 **  In effect, only one star-loop is ever active.  It would be possible to\r
  38 **  modify the code to maintain this context explicitly, eliminating all\r
  39 **  recursive calls at the cost of some complication and loss of clarity (and\r
  40 **  the ABORT stuff seems to be unclear enough by itself).  I think it would\r
  41 **  be unwise to try to get this into a released version unless you have a\r
  42 **  good test data base to try it out on.\r
  43 **\r
  44 **  June, 1991:  Robert Elz <kre@munnari.oz.au> added minus and close bracket\r
  45 **  handling for character sets.\r
  46 **\r
  47 **  July, 2000:  Largely rewritten by Russ Allbery <rra@stanford.edu> to add\r
  48 **  support for ',', '!', and optionally '@' to the core wildmat routine.\r
  49 **  Broke the character class matching into a separate function for clarity\r
  50 **  since it's infrequently used in practice, and added some simple lookahead\r
  51 **  to significantly decrease the recursive calls in the '*' matching code.\r
  52 **  Added support for UTF-8 as the default character set for any high-bit\r
  53 **  characters.\r
  54 **\r
  55 **  For more information on UTF-8, see RFC 2279.\r
  56 **\r
  57 **  Please note that this file is intentionally written so that conditionally\r
  58 **  executed expressions are on separate lines from the condition to\r
  59 **  facilitate analysis of the coverage of the test suite using purecov.\r
  60 **  Please preserve this.  As of March 11, 2001, purecov reports that the\r
  61 **  accompanying test suite achieves 100% coverage of this file.\r
  62 */\r
  63 \r
  64 //#include "config.h"\r
  65 //#include "clibrary.h"\r
  66 //#include "libinn.h"\r
  67 #include "../../include/nntp/uwildmat.h"\r
  68 #include <string>\r
  69 #include <cstring>\r
  70 #ifndef _WIN32\r
  71 #include <stdint.h>\r
  72 #endif\r
  73 \r
  74 #define ABORT -1\r
  75 \r
  76 /* Whether or not an octet looks like the start of a UTF-8 character. */\r
  77 #define ISUTF8(c)       (((c) & 0xc0) == 0xc0)\r
  78 \r
  79 \r
  80 /*\r
  81 **  Determine the length of a non-ASCII character in octets (for advancing\r
  82 **  pointers when skipping over characters).  Takes a pointer to the start of\r
  83 **  the character and to the last octet of the string.  If end is NULL, expect\r
  84 **  the string pointed to by start to be nul-terminated.  If the character is\r
  85 **  malformed UTF-8, return 1 to treat it like an eight-bit local character.\r
  86 */\r
  87 static int\r
  88 utf8_length(const unsigned char *start, const unsigned char *end)\r
  89 {\r
  90     unsigned char mask = 0x80;\r
  91     const unsigned char *p;\r
  92     int length = 0;\r
  93     int left;\r
  94 \r
  95     for (; mask > 0 && (*start & mask) == mask; mask >>= 1)\r
  96         length++;\r
  97     if (length < 2 || length > 6)\r
  98         return 1;\r
  99     if (end != NULL && (end - start + 1) < length)\r
 100         return 1;\r
 101     left = length - 1;\r
 102     p = start + 1;\r
 103     for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)\r
 104         left--;\r
 105     return (left == 0) ? length : 1;\r
 106 }\r
 107 \r
 108 \r
 109 /*\r
 110 **  Convert a UTF-8 character to UCS-4.  Takes a pointer to the start of the\r
 111 **  character and to the last octet of the string, and to a uint32_t into\r
 112 **  which to put the decoded UCS-4 value.  If end is NULL, expect the string\r
 113 **  pointed to by start to be nul-terminated.  Returns the number of octets in\r
 114 **  the UTF-8 encoding.  If the UTF-8 character is malformed, set result to\r
 115 **  the decimal value of the first octet; this is wrong, but it will generally\r
 116 **  cause the rest of the wildmat matching to do the right thing for non-UTF-8\r
 117 **  input.\r
 118 */\r
 119 static int\r
 120 utf8_decode(const unsigned char *start, const unsigned char *end,\r
 121             uint32_t *result)\r
 122 {\r
 123     uint32_t value = 0;\r
 124     int length, i;\r
 125     const unsigned char *p = start;\r
 126     unsigned char mask;\r
 127 \r
 128     length = utf8_length(start, end);\r
 129     if (length < 2) {\r
 130         *result = *start;\r
 131         return 1;\r
 132     }\r
 133     mask = (1 << (7 - length)) - 1;\r
 134     value = *p & mask;\r
 135     p++;\r
 136     for (i = length - 1; i > 0; i--) {\r
 137         value = (value << 6) | (*p & 0x3f);\r
 138         p++;\r
 139     }\r
 140     *result = value;\r
 141     return length;\r
 142 }\r
 143 \r
 144 \r
 145 /*\r
 146 **  Match a character class against text, a UCS-4 character.  start is a\r
 147 **  pointer to the first character of the character class, end a pointer to\r
 148 **  the last.  Returns whether the class matches that character.\r
 149 */\r
 150 static bool\r
 151 match_class(uint32_t text, const unsigned char *start,\r
 152             const unsigned char *end)\r
 153 {\r
 154     bool reversed, allowrange;\r
 155     const unsigned char *p = start;\r
 156     uint32_t first, last;\r
 157 \r
 158     /* Check for an inverted character class (starting with ^).  If the\r
 159        character matches the character class, we return !reversed; that way,\r
 160        we return true if it's a regular character class and false if it's a\r
 161        reversed one.  If the character doesn't match, we return reversed. */\r
 162     reversed = (*p == '^');\r
 163     if (reversed)\r
 164         p++;\r
 165 \r
 166     /* Walk through the character class until we reach the end or find a\r
 167        match, handling character ranges as we go.  Only permit a range to\r
 168        start when allowrange is true; this allows - to be treated like a\r
 169        normal character as the first character of the class and catches\r
 170        malformed ranges like a-e-n.  We treat the character at the beginning\r
 171        of a range as both a regular member of the class and the beginning of\r
 172        the range; this is harmless (although it means that malformed ranges\r
 173        like m-a will match m and nothing else). */\r
 174     allowrange = false;\r
 175     while (p <= end) {\r
 176         if (allowrange && *p == '-' && p < end) {\r
 177             p++;\r
 178             p += utf8_decode(p, end, &last);\r
 179             if (text >= first && text <= last)\r
 180                 return !reversed;\r
 181             allowrange = false;\r
 182         } else {\r
 183             p += utf8_decode(p, end, &first);\r
 184             if (text == first)\r
 185                 return !reversed;\r
 186             allowrange = true;\r
 187         }\r
 188     }\r
 189     return reversed;\r
 190 }\r
 191 \r
 192 \r
 193 /*\r
 194 **  Match the text against the pattern between start and end.  This is a\r
 195 **  single pattern; a leading ! or @ must already be taken care of, and\r
 196 **  commas must be dealt with outside of this routine.\r
 197 */\r
 198 static int\r
 199 match_pattern(const unsigned char *text, const unsigned char *start,\r
 200               const unsigned char *end)\r
 201 {\r
 202     const unsigned char *q, *endclass;\r
 203     const unsigned char *p = start;\r
 204     bool ismeta;\r
 205     int matched, width;\r
 206     uint32_t c;\r
 207 \r
 208     for (; p <= end; p++) {\r
 209         if (!*text && *p != '*')\r
 210             return ABORT;\r
 211 \r
 212         switch (*p) {\r
 213         case '\\':\r
 214             if (!*++p)\r
 215                 return ABORT;\r
 216             /* Fall through. */\r
 217 \r
 218         default:\r
 219             if (*text++ != *p)\r
 220                 return false;\r
 221             break;\r
 222 \r
 223         case '?':\r
 224             text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;\r
 225             break;\r
 226 \r
 227         case '*':\r
 228             /* Consecutive stars are equivalent to one.  Advance pattern to\r
 229                the character after the star. */\r
 230             for (++p; *p == '*'; p++)\r
 231                 ;\r
 232 \r
 233             /* A trailing star will match anything. */\r
 234             if (p > end)\r
 235                 return true;\r
 236 \r
 237             /* Basic algorithm: Recurse at each point where the * could\r
 238                possibly match.  If the match succeeds or aborts, return\r
 239                immediately; otherwise, try the next position.\r
 240 \r
 241                Optimization: If the character after the * in the pattern\r
 242                isn't a metacharacter (the common case), then the * has to\r
 243                consume characters at least up to the next occurance of that\r
 244                character in the text.  Scan forward for those points rather\r
 245                than recursing at every possible point to save the extra\r
 246                function call overhead. */\r
 247             ismeta = (*p == '[' || *p == '?' || *p == '\\');\r
 248             while (*text) {\r
 249                 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;\r
 250                 if (ismeta) {\r
 251                     matched = match_pattern(text, p, end);\r
 252                     text += width;\r
 253                 } else {\r
 254                     while (*text && *text != *p) {\r
 255                         text += width;\r
 256                         width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;\r
 257                     }\r
 258                     if (!*text)\r
 259                         return ABORT;\r
 260                     matched = match_pattern(++text, p + 1, end);\r
 261                 }\r
 262                 if (matched != false)\r
 263                     return matched;\r
 264             }\r
 265             return ABORT;\r
 266 \r
 267         case '[':\r
 268             /* Find the end of the character class, making sure not to pick\r
 269                up a close bracket at the beginning of the class. */\r
 270             p++;\r
 271             q = p + (*p == '^') + 1;\r
 272             if (q > end)\r
 273                 return ABORT;\r
 274             endclass = (unsigned char *)memchr(q, ']', (size_t) (end - q + 1));\r
 275             if (!endclass)\r
 276                 return ABORT;\r
 277 \r
 278             /* Do the heavy lifting in another function for clarity, since\r
 279                character classes are an uncommon case. */\r
 280             text += utf8_decode(text, NULL, &c);\r
 281             if (!match_class(c, p, endclass - 1))\r
 282                 return false;\r
 283             p = endclass;\r
 284             break;\r
 285         }\r
 286     }\r
 287 \r
 288     return (*text == '\0');\r
 289 }\r
 290 \r
 291 \r
 292 /*\r
 293 **  Takes text and a wildmat expression; a wildmat expression is a\r
 294 **  comma-separated list of wildmat patterns, optionally preceeded by ! to\r
 295 **  invert the sense of the expression.  Returns WILDMAT_MATCH if that\r
 296 **  expression matches the text, WILDMAT_FAIL otherwise.  If allowpoison is\r
 297 **  set, allow @ to introduce a poison expression (the same as !, but if it\r
 298 **  triggers the failed match the routine returns WILDMAT_POISON instead).\r
 299 */\r
 300 static enum uwildmat\r
 301 match_expression(const unsigned char *text, const unsigned char *start,\r
 302                  bool allowpoison)\r
 303 {\r
 304     const unsigned char *end, *split;\r
 305     const unsigned char *p = start;\r
 306     bool reverse, escaped;\r
 307     bool match = false;\r
 308     bool poison = false;\r
 309     bool poisoned = false;\r
 310 \r
 311     /* Handle the empty expression separately, since otherwise end will be\r
 312        set to an invalid pointer. */\r
 313     if (!*p)\r
 314         return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;\r
 315     end = start + strlen((const char *) start) - 1;\r
 316 \r
 317     /* Main match loop.  Find each comma that separates patterns, and attempt \r
 318        to match the text with each pattern in order.  The last matching\r
 319        pattern determines whether the whole expression matches. */\r
 320     for (; p <= end + 1; p = split + 1) {\r
 321         if (allowpoison)\r
 322             poison = (*p == '@');\r
 323         reverse = (*p == '!') || poison;\r
 324         if (reverse)\r
 325             p++;\r
 326 \r
 327         /* Find the first unescaped comma, if any.  If there is none, split\r
 328            will be one greater than end and point at the nul at the end of\r
 329            the string. */\r
 330         for (escaped = false, split = p; split <= end; split++) {\r
 331             if (*split == '[') {\r
 332                 split++;\r
 333                 if (*split == ']')\r
 334                     split++;\r
 335                 while (split <= end && *split != ']')\r
 336                     split++;\r
 337             }\r
 338             if (*split == ',' && !escaped)\r
 339                 break;\r
 340             escaped = (*split == '\\') ? !escaped : false;\r
 341         }\r
 342 \r
 343         /* Optimization: If match == !reverse and poison == poisoned, this\r
 344            pattern can't change the result, so don't do any work. */\r
 345         if (match == !reverse && poison == poisoned)\r
 346             continue;\r
 347         if (match_pattern(text, p, split - 1) == true) {\r
 348             poisoned = poison;\r
 349             match = !reverse;\r
 350         }\r
 351     }\r
 352     if (poisoned)\r
 353         return UWILDMAT_POISON;\r
 354     return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;\r
 355 }\r
 356 \r
 357 \r
 358 /*\r
 359 **  User-level routine used for wildmats where @ should be treated as a\r
 360 **  regular character.\r
 361 */\r
 362 bool\r
 363 uwildmat(const char *text, const char *pat)\r
 364 {\r
 365     const unsigned char *utext = (const unsigned char *) text;\r
 366     const unsigned char *upat = (const unsigned char *) pat;\r
 367 \r
 368     if (upat[0] == '*' && upat[1] == '\0')\r
 369         return true;\r
 370     else\r
 371         return (match_expression(utext, upat, false) == UWILDMAT_MATCH);\r
 372 }\r
 373 \r
 374 \r
 375 /*\r
 376 **  User-level routine used for wildmats that support poison matches.\r
 377 */\r
 378 enum uwildmat\r
 379 uwildmat_poison(const char *text, const char *pat)\r
 380 {\r
 381     const unsigned char *utext = (const unsigned char *) text;\r
 382     const unsigned char *upat = (const unsigned char *) pat;\r
 383 \r
 384     if (upat[0] == '*' && upat[1] == '\0')\r
 385         return UWILDMAT_MATCH;\r
 386     else\r
 387         return match_expression(utext, upat, true);\r
 388 }\r
 389 \r
 390 \r
 391 /*\r
 392 **  User-level routine for simple expressions (neither , nor ! are special).\r
 393 */\r
 394 bool\r
 395 uwildmat_simple(const char *text, const char *pat)\r
 396 {\r
 397     const unsigned char *utext = (const unsigned char *) text;\r
 398     const unsigned char *upat = (const unsigned char *) pat;\r
 399     size_t length;\r
 400 \r
 401     if (upat[0] == '*' && upat[1] == '\0')\r
 402         return true;\r
 403     else {\r
 404         length = strlen(pat);\r
 405         return (match_pattern(utext, upat, upat + length - 1) == true);\r
 406     }\r
 407 }\r