+/* $Id: uwildmat.c 6779 2004-05-17 07:25:28Z rra $\r
+**\r
+** wildmat pattern matching with Unicode UTF-8 extensions.\r
+**\r
+** Do shell-style pattern matching for ?, \, [], and * characters. Might not\r
+** be robust in face of malformed patterns; e.g., "foo[a-" could cause a\r
+** segmentation violation. It is 8-bit clean. (Robustness hopefully fixed\r
+** July 2000; all malformed patterns should now just fail to match anything.)\r
+**\r
+** Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.\r
+** Rich $alz is now <rsalz@osf.org>.\r
+**\r
+** April, 1991: Replaced mutually-recursive calls with in-line code for the\r
+** star character.\r
+**\r
+** Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.\r
+** This can greatly speed up failing wildcard patterns. For example:\r
+**\r
+** pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*\r
+** text 1: -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1\r
+** text 2: -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1\r
+**\r
+** Text 1 matches with 51 calls, while text 2 fails with 54 calls. Without\r
+** the ABORT code, it takes 22310 calls to fail. Ugh. The following\r
+** explanation is from Lars:\r
+**\r
+** The precondition that must be fulfilled is that DoMatch will consume at\r
+** least one character in text. This is true if *p is neither '*' nor '\0'.)\r
+** The last return has ABORT instead of false to avoid quadratic behaviour in\r
+** cases like pattern "*a*b*c*d" with text "abcxxxxx". With false, each\r
+** star-loop has to run to the end of the text; with ABORT only the last one\r
+** does.\r
+**\r
+** Once the control of one instance of DoMatch enters the star-loop, that\r
+** instance will return either true or ABORT, and any calling instance will\r
+** therefore return immediately after (without calling recursively again).\r
+** In effect, only one star-loop is ever active. It would be possible to\r
+** modify the code to maintain this context explicitly, eliminating all\r
+** recursive calls at the cost of some complication and loss of clarity (and\r
+** the ABORT stuff seems to be unclear enough by itself). I think it would\r
+** be unwise to try to get this into a released version unless you have a\r
+** good test data base to try it out on.\r
+**\r
+** June, 1991: Robert Elz <kre@munnari.oz.au> added minus and close bracket\r
+** handling for character sets.\r
+**\r
+** July, 2000: Largely rewritten by Russ Allbery <rra@stanford.edu> to add\r
+** support for ',', '!', and optionally '@' to the core wildmat routine.\r
+** Broke the character class matching into a separate function for clarity\r
+** since it's infrequently used in practice, and added some simple lookahead\r
+** to significantly decrease the recursive calls in the '*' matching code.\r
+** Added support for UTF-8 as the default character set for any high-bit\r
+** characters.\r
+**\r
+** For more information on UTF-8, see RFC 2279.\r
+**\r
+** Please note that this file is intentionally written so that conditionally\r
+** executed expressions are on separate lines from the condition to\r
+** facilitate analysis of the coverage of the test suite using purecov.\r
+** Please preserve this. As of March 11, 2001, purecov reports that the\r
+** accompanying test suite achieves 100% coverage of this file.\r
+*/\r
+\r
+//#include "config.h"\r
+//#include "clibrary.h"\r
+//#include "libinn.h"\r
+#include "../../include/nntp/uwildmat.h"\r
+#include <string>\r
+\r
+#define ABORT -1\r
+\r
+/* Whether or not an octet looks like the start of a UTF-8 character. */\r
+#define ISUTF8(c) (((c) & 0xc0) == 0xc0)\r
+\r
+\r
+/*\r
+** Determine the length of a non-ASCII character in octets (for advancing\r
+** pointers when skipping over characters). Takes a pointer to the start of\r
+** the character and to the last octet of the string. If end is NULL, expect\r
+** the string pointed to by start to be nul-terminated. If the character is\r
+** malformed UTF-8, return 1 to treat it like an eight-bit local character.\r
+*/\r
+static int\r
+utf8_length(const unsigned char *start, const unsigned char *end)\r
+{\r
+ unsigned char mask = 0x80;\r
+ const unsigned char *p;\r
+ int length = 0;\r
+ int left;\r
+\r
+ for (; mask > 0 && (*start & mask) == mask; mask >>= 1)\r
+ length++;\r
+ if (length < 2 || length > 6)\r
+ return 1;\r
+ if (end != NULL && (end - start + 1) < length)\r
+ return 1;\r
+ left = length - 1;\r
+ p = start + 1;\r
+ for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)\r
+ left--;\r
+ return (left == 0) ? length : 1;\r
+}\r
+\r
+\r
+/*\r
+** Convert a UTF-8 character to UCS-4. Takes a pointer to the start of the\r
+** character and to the last octet of the string, and to a uint32_t into\r
+** which to put the decoded UCS-4 value. If end is NULL, expect the string\r
+** pointed to by start to be nul-terminated. Returns the number of octets in\r
+** the UTF-8 encoding. If the UTF-8 character is malformed, set result to\r
+** the decimal value of the first octet; this is wrong, but it will generally\r
+** cause the rest of the wildmat matching to do the right thing for non-UTF-8\r
+** input.\r
+*/\r
+static int\r
+utf8_decode(const unsigned char *start, const unsigned char *end,\r
+ uint32_t *result)\r
+{\r
+ uint32_t value = 0;\r
+ int length, i;\r
+ const unsigned char *p = start;\r
+ unsigned char mask;\r
+\r
+ length = utf8_length(start, end);\r
+ if (length < 2) {\r
+ *result = *start;\r
+ return 1;\r
+ }\r
+ mask = (1 << (7 - length)) - 1;\r
+ value = *p & mask;\r
+ p++;\r
+ for (i = length - 1; i > 0; i--) {\r
+ value = (value << 6) | (*p & 0x3f);\r
+ p++;\r
+ }\r
+ *result = value;\r
+ return length;\r
+}\r
+\r
+\r
+/*\r
+** Match a character class against text, a UCS-4 character. start is a\r
+** pointer to the first character of the character class, end a pointer to\r
+** the last. Returns whether the class matches that character.\r
+*/\r
+static bool\r
+match_class(uint32_t text, const unsigned char *start,\r
+ const unsigned char *end)\r
+{\r
+ bool reversed, allowrange;\r
+ const unsigned char *p = start;\r
+ uint32_t first, last;\r
+\r
+ /* Check for an inverted character class (starting with ^). If the\r
+ character matches the character class, we return !reversed; that way,\r
+ we return true if it's a regular character class and false if it's a\r
+ reversed one. If the character doesn't match, we return reversed. */\r
+ reversed = (*p == '^');\r
+ if (reversed)\r
+ p++;\r
+\r
+ /* Walk through the character class until we reach the end or find a\r
+ match, handling character ranges as we go. Only permit a range to\r
+ start when allowrange is true; this allows - to be treated like a\r
+ normal character as the first character of the class and catches\r
+ malformed ranges like a-e-n. We treat the character at the beginning\r
+ of a range as both a regular member of the class and the beginning of\r
+ the range; this is harmless (although it means that malformed ranges\r
+ like m-a will match m and nothing else). */\r
+ allowrange = false;\r
+ while (p <= end) {\r
+ if (allowrange && *p == '-' && p < end) {\r
+ p++;\r
+ p += utf8_decode(p, end, &last);\r
+ if (text >= first && text <= last)\r
+ return !reversed;\r
+ allowrange = false;\r
+ } else {\r
+ p += utf8_decode(p, end, &first);\r
+ if (text == first)\r
+ return !reversed;\r
+ allowrange = true;\r
+ }\r
+ }\r
+ return reversed;\r
+}\r
+\r
+\r
+/*\r
+** Match the text against the pattern between start and end. This is a\r
+** single pattern; a leading ! or @ must already be taken care of, and\r
+** commas must be dealt with outside of this routine.\r
+*/\r
+static int\r
+match_pattern(const unsigned char *text, const unsigned char *start,\r
+ const unsigned char *end)\r
+{\r
+ const unsigned char *q, *endclass;\r
+ const unsigned char *p = start;\r
+ bool ismeta;\r
+ int matched, width;\r
+ uint32_t c;\r
+\r
+ for (; p <= end; p++) {\r
+ if (!*text && *p != '*')\r
+ return ABORT;\r
+\r
+ switch (*p) {\r
+ case '\\':\r
+ if (!*++p)\r
+ return ABORT;\r
+ /* Fall through. */\r
+\r
+ default:\r
+ if (*text++ != *p)\r
+ return false;\r
+ break;\r
+\r
+ case '?':\r
+ text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;\r
+ break;\r
+\r
+ case '*':\r
+ /* Consecutive stars are equivalent to one. Advance pattern to\r
+ the character after the star. */\r
+ for (++p; *p == '*'; p++)\r
+ ;\r
+\r
+ /* A trailing star will match anything. */\r
+ if (p > end)\r
+ return true;\r
+\r
+ /* Basic algorithm: Recurse at each point where the * could\r
+ possibly match. If the match succeeds or aborts, return\r
+ immediately; otherwise, try the next position.\r
+\r
+ Optimization: If the character after the * in the pattern\r
+ isn't a metacharacter (the common case), then the * has to\r
+ consume characters at least up to the next occurance of that\r
+ character in the text. Scan forward for those points rather\r
+ than recursing at every possible point to save the extra\r
+ function call overhead. */\r
+ ismeta = (*p == '[' || *p == '?' || *p == '\\');\r
+ while (*text) {\r
+ width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;\r
+ if (ismeta) {\r
+ matched = match_pattern(text, p, end);\r
+ text += width;\r
+ } else {\r
+ while (*text && *text != *p) {\r
+ text += width;\r
+ width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;\r
+ }\r
+ if (!*text)\r
+ return ABORT;\r
+ matched = match_pattern(++text, p + 1, end);\r
+ }\r
+ if (matched != false)\r
+ return matched;\r
+ }\r
+ return ABORT;\r
+\r
+ case '[':\r
+ /* Find the end of the character class, making sure not to pick\r
+ up a close bracket at the beginning of the class. */\r
+ p++;\r
+ q = p + (*p == '^') + 1;\r
+ if (q > end)\r
+ return ABORT;\r
+ endclass = (unsigned char *)memchr(q, ']', (size_t) (end - q + 1));\r
+ if (!endclass)\r
+ return ABORT;\r
+\r
+ /* Do the heavy lifting in another function for clarity, since\r
+ character classes are an uncommon case. */\r
+ text += utf8_decode(text, NULL, &c);\r
+ if (!match_class(c, p, endclass - 1))\r
+ return false;\r
+ p = endclass;\r
+ break;\r
+ }\r
+ }\r
+\r
+ return (*text == '\0');\r
+}\r
+\r
+\r
+/*\r
+** Takes text and a wildmat expression; a wildmat expression is a\r
+** comma-separated list of wildmat patterns, optionally preceeded by ! to\r
+** invert the sense of the expression. Returns WILDMAT_MATCH if that\r
+** expression matches the text, WILDMAT_FAIL otherwise. If allowpoison is\r
+** set, allow @ to introduce a poison expression (the same as !, but if it\r
+** triggers the failed match the routine returns WILDMAT_POISON instead).\r
+*/\r
+static enum uwildmat\r
+match_expression(const unsigned char *text, const unsigned char *start,\r
+ bool allowpoison)\r
+{\r
+ const unsigned char *end, *split;\r
+ const unsigned char *p = start;\r
+ bool reverse, escaped;\r
+ bool match = false;\r
+ bool poison = false;\r
+ bool poisoned = false;\r
+\r
+ /* Handle the empty expression separately, since otherwise end will be\r
+ set to an invalid pointer. */\r
+ if (!*p)\r
+ return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;\r
+ end = start + strlen((const char *) start) - 1;\r
+\r
+ /* Main match loop. Find each comma that separates patterns, and attempt \r
+ to match the text with each pattern in order. The last matching\r
+ pattern determines whether the whole expression matches. */\r
+ for (; p <= end + 1; p = split + 1) {\r
+ if (allowpoison)\r
+ poison = (*p == '@');\r
+ reverse = (*p == '!') || poison;\r
+ if (reverse)\r
+ p++;\r
+\r
+ /* Find the first unescaped comma, if any. If there is none, split\r
+ will be one greater than end and point at the nul at the end of\r
+ the string. */\r
+ for (escaped = false, split = p; split <= end; split++) {\r
+ if (*split == '[') {\r
+ split++;\r
+ if (*split == ']')\r
+ split++;\r
+ while (split <= end && *split != ']')\r
+ split++;\r
+ }\r
+ if (*split == ',' && !escaped)\r
+ break;\r
+ escaped = (*split == '\\') ? !escaped : false;\r
+ }\r
+\r
+ /* Optimization: If match == !reverse and poison == poisoned, this\r
+ pattern can't change the result, so don't do any work. */\r
+ if (match == !reverse && poison == poisoned)\r
+ continue;\r
+ if (match_pattern(text, p, split - 1) == true) {\r
+ poisoned = poison;\r
+ match = !reverse;\r
+ }\r
+ }\r
+ if (poisoned)\r
+ return UWILDMAT_POISON;\r
+ return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;\r
+}\r
+\r
+\r
+/*\r
+** User-level routine used for wildmats where @ should be treated as a\r
+** regular character.\r
+*/\r
+bool\r
+uwildmat(const char *text, const char *pat)\r
+{\r
+ const unsigned char *utext = (const unsigned char *) text;\r
+ const unsigned char *upat = (const unsigned char *) pat;\r
+\r
+ if (upat[0] == '*' && upat[1] == '\0')\r
+ return true;\r
+ else\r
+ return (match_expression(utext, upat, false) == UWILDMAT_MATCH);\r
+}\r
+\r
+\r
+/*\r
+** User-level routine used for wildmats that support poison matches.\r
+*/\r
+enum uwildmat\r
+uwildmat_poison(const char *text, const char *pat)\r
+{\r
+ const unsigned char *utext = (const unsigned char *) text;\r
+ const unsigned char *upat = (const unsigned char *) pat;\r
+\r
+ if (upat[0] == '*' && upat[1] == '\0')\r
+ return UWILDMAT_MATCH;\r
+ else\r
+ return match_expression(utext, upat, true);\r
+}\r
+\r
+\r
+/*\r
+** User-level routine for simple expressions (neither , nor ! are special).\r
+*/\r
+bool\r
+uwildmat_simple(const char *text, const char *pat)\r
+{\r
+ const unsigned char *utext = (const unsigned char *) text;\r
+ const unsigned char *upat = (const unsigned char *) pat;\r
+ size_t length;\r
+\r
+ if (upat[0] == '*' && upat[1] == '\0')\r
+ return true;\r
+ else {\r
+ length = strlen(pat);\r
+ return (match_pattern(utext, upat, upat + length - 1) == true);\r
+ }\r
+}\r