From fc068cf4eb6814e848e4dc54e1c5cf4c30a79a6f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Wed, 19 Dec 2007 16:09:03 -0700 Subject: [PATCH] Fix memmem to avoid O(n^2) worst-case complexity. * lib/memmem.c (knuth_morris_pratt): New function. (memmem): Use it if first few naive iterations fail. * m4/memmem.m4 (gl_FUNC_MEMMEM): Detect cygwin bug. * modules/memcmp (License): Set to LGPLv2+, not LGPL. * modules/memchr (License): Likewise. * modules/memmem (Depends-on): Add memcmp, memchr, stdbool, and malloca. * tests/test-memmem.c: Rewrite, borrowing ideas from test-mbsstr1.c; the old version wouldn't even compile! * modules/memmem-tests: New file. * lib/string.in.h (rpl_memmem): Add declaration. * modules/string (Makefile.am): Substitute REPLACE_MEMMEM. * m4/string_h.m4 (gl_HEADER_STRING_H_DEFAULTS): Default for REPLACE_MEMMEM. Signed-off-by: Eric Blake --- ChangeLog | 18 +++++ lib/memmem.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++----- lib/string.in.h | 5 +- m4/memmem.m4 | 14 +++- m4/string_h.m4 | 3 + modules/memchr | 2 +- modules/memcmp | 2 +- modules/memmem | 4 ++ modules/memmem-tests | 12 ++++ modules/string | 1 + tests/test-memmem.c | 147 +++++++++++++++++++++++++++++++-------- 11 files changed, 351 insertions(+), 49 deletions(-) create mode 100644 modules/memmem-tests diff --git a/ChangeLog b/ChangeLog index d52195d0a..280c7ca09 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +2007-12-19 Eric Blake + + Fix memmem to avoid O(n^2) worst-case complexity. + * lib/memmem.c (knuth_morris_pratt): New function. + (memmem): Use it if first few naive iterations fail. + * m4/memmem.m4 (gl_FUNC_MEMMEM): Detect cygwin bug. + * modules/memcmp (License): Set to LGPLv2+, not LGPL. + * modules/memchr (License): Likewise. + * modules/memmem (Depends-on): Add memcmp, memchr, stdbool, and + malloca. + * tests/test-memmem.c: Rewrite, borrowing ideas from + test-mbsstr1.c; the old version wouldn't even compile! + * modules/memmem-tests: New file. + * lib/string.in.h (rpl_memmem): Add declaration. + * modules/string (Makefile.am): Substitute REPLACE_MEMMEM. + * m4/string_h.m4 (gl_HEADER_STRING_H_DEFAULTS): Default for + REPLACE_MEMMEM. + 2007-12-18 Paul Eggert Fix problem with _GL_JUST_INCLUDE_SYSTEM_INTTYPES_H on VMS. diff --git a/lib/memmem.c b/lib/memmem.c index e8822221c..69d2edc20 100644 --- a/lib/memmem.c +++ b/lib/memmem.c @@ -21,24 +21,114 @@ #include #include +#include + +#include "malloca.h" #ifndef _LIBC # define __builtin_expect(expr, val) (expr) #endif -#undef memmem +/* Knuth-Morris-Pratt algorithm. + See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm + Return a boolean indicating success. */ + +static bool +knuth_morris_pratt (const char *haystack, const char *last_haystack, + const char *needle, size_t m, + const char **resultp) +{ + /* Allocate the table. */ + size_t *table = (size_t *) malloca (m * sizeof (size_t)); + if (table == NULL) + return false; + /* Fill the table. + For 0 < i < m: + 0 < table[i] <= i is defined such that + rhaystack[0..i-1] == needle[0..i-1] and rhaystack[i] != needle[i] + implies + forall 0 <= x < table[i]: rhaystack[x..x+m-1] != needle[0..m-1], + and table[i] is as large as possible with this property. + table[0] remains uninitialized. */ + { + size_t i, j; + + table[1] = 1; + j = 0; + for (i = 2; i < m; i++) + { + unsigned char b = (unsigned char) needle[i - 1]; -/* Return the first occurrence of NEEDLE in HAYSTACK. */ + for (;;) + { + if (b == (unsigned char) needle[j]) + { + table[i] = i - ++j; + break; + } + if (j == 0) + { + table[i] = i; + break; + } + j = j - table[j]; + } + } + } + + /* Search, using the table to accelerate the processing. */ + { + size_t j; + const char *rhaystack; + const char *phaystack; + + *resultp = NULL; + j = 0; + rhaystack = haystack; + phaystack = haystack; + /* Invariant: phaystack = rhaystack + j. */ + while (phaystack != last_haystack) + if ((unsigned char) needle[j] == (unsigned char) *phaystack) + { + j++; + phaystack++; + if (j == m) + { + /* The entire needle has been found. */ + *resultp = rhaystack; + break; + } + } + else if (j > 0) + { + /* Found a match of needle[0..j-1], mismatch at needle[j]. */ + rhaystack += table[j]; + j -= table[j]; + } + else + { + /* Found a mismatch at needle[0] already. */ + rhaystack++; + phaystack++; + } + } + + freea (table); + return true; +} + +/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK + if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in + HAYSTACK. */ void * -memmem (haystack, haystack_len, needle, needle_len) - const void *haystack; - size_t haystack_len; - const void *needle; - size_t needle_len; +memmem (const void *haystack, size_t haystack_len, + const void *needle, size_t needle_len) { - const char *begin; - const char *const last_possible - = (const char *) haystack + haystack_len - needle_len; + /* Operating with void * is awkward. */ + const char *Haystack = (const char *) haystack; + const char *Needle = (const char *) needle; + const char *last_haystack = Haystack + haystack_len; + const char *last_needle = Needle + needle_len; if (needle_len == 0) /* The first occurrence of the empty string is deemed to occur at @@ -50,12 +140,82 @@ memmem (haystack, haystack_len, needle, needle_len) if (__builtin_expect (haystack_len < needle_len, 0)) return NULL; - for (begin = (const char *) haystack; begin <= last_possible; ++begin) - if (begin[0] == ((const char *) needle)[0] && - !memcmp ((const void *) &begin[1], - (const void *) ((const char *) needle + 1), - needle_len - 1)) - return (void *) begin; + /* Use optimizations in memchr when possible. */ + if (__builtin_expect (needle_len == 1, 0)) + return memchr (haystack, (unsigned char) *Needle, haystack_len); + + /* Minimizing the worst-case complexity: + Let n = haystack_len, m = needle_len. + The naïve algorithm is O(n*m) worst-case. + The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a + memory allocation. + To achieve linear complexity and yet amortize the cost of the + memory allocation, we activate the Knuth-Morris-Pratt algorithm + only once the naïve algorithm has already run for some time; more + precisely, when + - the outer loop count is >= 10, + - the average number of comparisons per outer loop is >= 5, + - the total number of comparisons is >= m. + But we try it only once. If the memory allocation attempt failed, + we don't retry it. */ + { + bool try_kmp = true; + size_t outer_loop_count = 0; + size_t comparison_count = 0; + + /* Speed up the following searches of needle by caching its first + character. */ + char b = *Needle++; + + for (;; Haystack++) + { + if (Haystack == last_haystack) + /* No match. */ + return NULL; + + /* See whether it's advisable to use an asymptotically faster + algorithm. */ + if (try_kmp + && outer_loop_count >= 10 + && comparison_count >= 5 * outer_loop_count) + { + /* See if needle + comparison_count now reaches the end of + needle. */ + if (comparison_count >= needle_len) + { + /* Try the Knuth-Morris-Pratt algorithm. */ + const char *result; + if (knuth_morris_pratt (Haystack, last_haystack, + Needle - 1, needle_len, &result)) + return (void *) result; + try_kmp = false; + } + } + + outer_loop_count++; + comparison_count++; + if (*Haystack == b) + /* The first character matches. */ + { + const char *rhaystack = Haystack + 1; + const char *rneedle = Needle; + + for (;; rhaystack++, rneedle++) + { + if (rneedle == last_needle) + /* Found a match. */ + return (void *) Haystack; + if (rhaystack == last_haystack) + /* No match. */ + return NULL; + comparison_count++; + if (*rhaystack != *rneedle) + /* Nothing in this round. */ + break; + } + } + } + } return NULL; } diff --git a/lib/string.in.h b/lib/string.in.h index c60e2f316..09205e7f0 100644 --- a/lib/string.in.h +++ b/lib/string.in.h @@ -35,7 +35,10 @@ extern "C" { /* Return the first occurrence of NEEDLE in HAYSTACK. */ #if @GNULIB_MEMMEM@ -# if ! @HAVE_DECL_MEMMEM@ +# if @REPLACE_MEMMEM@ +# define memmem rpl_memmem +# endif +# if ! @HAVE_DECL_MEMMEM@ || @REPLACE_MEMMEM@ extern void *memmem (void const *__haystack, size_t __haystack_len, void const *__needle, size_t __needle_len); # endif diff --git a/m4/memmem.m4 b/m4/memmem.m4 index e6a3da193..a529af3be 100644 --- a/m4/memmem.m4 +++ b/m4/memmem.m4 @@ -1,4 +1,4 @@ -# memmem.m4 serial 5 +# memmem.m4 serial 6 dnl Copyright (C) 2002, 2003, 2004, 2007 Free Software Foundation, Inc. dnl This file is free software; the Free Software Foundation dnl gives unlimited permission to copy and/or distribute it, @@ -14,6 +14,18 @@ AC_DEFUN([gl_FUNC_MEMMEM], AC_CHECK_DECLS_ONCE(memmem) if test $ac_cv_have_decl_memmem = no; then HAVE_DECL_MEMMEM=0 + else + AC_CACHE_CHECK([whether memmem works], [gl_cv_func_memmem_works], + [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include ], + [return !memmem ("a", 1, NULL, 0);])], + [gl_cv_func_memmem_works=yes], [gl_cv_func_memmem_works=no], + [dnl pessimistically assume the worst, since even glibc 2.6.1 + dnl has quadratic complexity in its memmem + gl_cv_func_memmem_works=no])]) + if test $gl_cv_func_memmem_works = no; then + REPLACE_MEMMEM=1 + AC_LIBOBJ([memmem]) + fi fi gl_PREREQ_MEMMEM ]) diff --git a/m4/string_h.m4 b/m4/string_h.m4 index 83711799a..99a0dabbc 100644 --- a/m4/string_h.m4 +++ b/m4/string_h.m4 @@ -5,6 +5,8 @@ # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. +# serial 2 + # Written by Paul Eggert. AC_DEFUN([gl_HEADER_STRING_H], @@ -75,4 +77,5 @@ AC_DEFUN([gl_HEADER_STRING_H_DEFAULTS], HAVE_DECL_STRTOK_R=1; AC_SUBST([HAVE_DECL_STRTOK_R]) HAVE_DECL_STRERROR=1; AC_SUBST([HAVE_DECL_STRERROR]) REPLACE_STRERROR=0; AC_SUBST([REPLACE_STRERROR]) + REPLACE_MEMMEM=0; AC_SUBST([REPLACE_MEMMEM]) ]) diff --git a/modules/memchr b/modules/memchr index 3b18a1c5f..5e908d2bf 100644 --- a/modules/memchr +++ b/modules/memchr @@ -16,7 +16,7 @@ Include: License: -LGPL +LGPLv2+ Maintainer: Jim Meyering, glibc diff --git a/modules/memcmp b/modules/memcmp index fc85d852f..e6b74e509 100644 --- a/modules/memcmp +++ b/modules/memcmp @@ -16,7 +16,7 @@ Include: License: -LGPL +LGPLv2+ Maintainer: Jim Meyering, glibc diff --git a/modules/memmem b/modules/memmem index c51ee3093..738cd6a29 100644 --- a/modules/memmem +++ b/modules/memmem @@ -8,6 +8,10 @@ m4/memmem.m4 Depends-on: extensions string +stdbool +malloca +memchr +memcmp configure.ac: gl_FUNC_MEMMEM diff --git a/modules/memmem-tests b/modules/memmem-tests new file mode 100644 index 000000000..3d79e477e --- /dev/null +++ b/modules/memmem-tests @@ -0,0 +1,12 @@ +Files: +tests/test-memmem.c + +Depends-on: + +configure.ac: + +Makefile.am: +TESTS += test-memmem +TESTS_ENVIRONMENT += EXEEXT='@EXEEXT@' +check_PROGRAMS += test-memmem + diff --git a/modules/string b/modules/string index 505ae6a53..6f3226d0f 100644 --- a/modules/string +++ b/modules/string @@ -66,6 +66,7 @@ string.h: string.in.h -e 's|@''HAVE_STRCASESTR''@|$(HAVE_STRCASESTR)|g' \ -e 's|@''HAVE_DECL_STRTOK_R''@|$(HAVE_DECL_STRTOK_R)|g' \ -e 's|@''HAVE_DECL_STRERROR''@|$(HAVE_DECL_STRERROR)|g' \ + -e 's|@''REPLACE_MEMMEM''@|$(REPLACE_MEMMEM)|g' \ -e 's|@''REPLACE_STRERROR''@|$(REPLACE_STRERROR)|g' \ -e '/definition of GL_LINK_WARNING/r $(LINK_WARNING_H)' \ < $(srcdir)/string.in.h; \ diff --git a/tests/test-memmem.c b/tests/test-memmem.c index 65980c26c..935edbbc6 100644 --- a/tests/test-memmem.c +++ b/tests/test-memmem.c @@ -19,39 +19,128 @@ #include #include +#include + +#define ASSERT(expr) \ + do \ + { \ + if (!(expr)) \ + { \ + fprintf (stderr, "%s:%d: assertion failed\n", __FILE__, __LINE__); \ + abort (); \ + } \ + } \ + while (0) int main (int argc, char *argv[]) { - const char *big = "foo-bar-baz"; - char *p; - - p = memmem (big, "foo", strlen (big)); - if (p != big) - { - fprintf (stderr, "memmem FAILURE 1\n"); - return 1; - } - - p = memmem (big, "baz", strlen (big)); - if (p != big + strlen (big) - 3) - { - fprintf (stderr, "memmem FAILURE 2\n"); - return 1; - - p = memmem (big, "-", strlen (big)); - if (p != big + 3) - { - fprintf (stderr, "memmem FAILURE 3\n"); - return 1; - } - - p = memmem (big, "baz", strlen (big) - 1); - if (p != NULL) - { - fprintf (stderr, "memmem FAILURE 4\n"); - return 1; - } + { + const char input[] = "foo"; + const char *result = memmem (input, strlen (input), "", 0); + ASSERT (result == input); + } + + { + const char input[] = "foo"; + const char *result = memmem (input, strlen (input), "o", 1); + ASSERT (result == input + 1); + } + + { + const char input[] = "ABC ABCDAB ABCDABCDABDE"; + const char *result = memmem (input, strlen (input), "ABCDABD", 7); + ASSERT (result == input + 15); + } + + { + const char input[] = "ABC ABCDAB ABCDABCDABDE"; + const char *result = memmem (input, strlen (input), "ABCDABE", 7); + ASSERT (result == NULL); + } + + /* Check that length 0 does not dereference NULL. */ + { + const char *result = memmem (NULL, 0, "foo", 3); + ASSERT (result == NULL); + } + + { + const char input[] = "foo"; + const char *result = memmem (input, strlen (input), NULL, 0); + ASSERT (result == input); + } + + /* Check that a very long haystack is handled quickly if the needle is + short and occurs near the beginning. */ + { + size_t repeat = 10000; + size_t m = 1000000; + char *needle = + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + size_t n = strlen (needle); + char *haystack = (char *) malloc (m + 1); + if (haystack != NULL) + { + memset (haystack, 'A', m); + haystack[0] = 'B'; + + for (; repeat > 0; repeat--) + { + ASSERT (memmem (haystack, m, needle, n) == haystack + 1); + } + + free (haystack); + } + } + + /* Check that a very long needle is discarded quickly if the haystack is + short. */ + { + size_t repeat = 10000; + size_t m = 1000000; + char *haystack = + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + "ABABABABABABABABABABABABABABABABABABABABABABABABABABABABABABABABABAB"; + size_t n = strlen (haystack); + char *needle = (char *) malloc (m + 1); + if (needle != NULL) + { + memset (needle, 'A', m); + + for (; repeat > 0; repeat--) + { + ASSERT (memmem (haystack, n, needle, m) == NULL); + } + + free (needle); + } + } + + /* Check that the asymptotic worst-case complexity is not quadratic. */ + { + size_t m = 1000000; + char *haystack = (char *) malloc (2 * m + 1); + char *needle = (char *) malloc (m + 1); + if (haystack != NULL && needle != NULL) + { + const char *result; + + memset (haystack, 'A', 2 * m); + haystack[2 * m] = 'B'; + + memset (needle, 'A', m); + needle[m] = 'B'; + + result = memmem (haystack, 2 * m + 1, needle, m + 1); + ASSERT (result == haystack + m); + } + if (needle != NULL) + free (needle); + if (haystack != NULL) + free (haystack); + } return 0; } -- 2.11.0