+2012-02-07 Paul Eggert <eggert@cs.ucla.edu>
+
+ regex: merge glibc changes
+
+ * lib/regcomp.c (init_dfa): Tighten overflow checks to test
+ for IDX_MAX too, since IDX_MAX can be much less than SIZE_MAX.
+ (init_word_char): Work even if bitset words are not exactly 32 or
+ 64 bits wide. Don't assume there are no padding bits.
+ * lib/regex.c [_LIBC]: Do not include <config.h>.
+ [!_LIBC]: Add pragmas to ignore -Wsuggest-attribute=pure
+ and -Wtype-limits.
+ * lib/regex.h (__USE_GNU): Renamed from __USE_GNU_REGEX, to avoid
+ needless disagreement with glibc. All uses changed. Define it to
+ 1 only if _GNU_SOURCE, to match glibc.
+ (_REG_RM_NAME): Remove; no longer needed, since the names in
+ question are now all protected by __USE_GNU.
+ (_REG_RE_NAME): Remove; replaced by glibc's __REPB_PREFIX.
+ (REG_TRANSLATE_TYPE): Remove; replaced by glibc's __RE_TRANSLATE_TYPE.
+ * lib/regex_internal.h (MIN): New macro.
+
+ 2012-01-03 Ulrich Drepper <drepper@gmail.com>
+ * lib/regcomp.c (init_word_char): Optimize regex a bit.
+
+ 2011-12-30 Jakub Jelinek <jakub@redhat.com>
+ * lib/regex_internal.c (re_string_fetch_byte_case):
+ Fix up regcomp/regexec. The problem is that parse_bracket_symbol
+ is miscompiled, and it turns out it is because of an incorrect
+ attribute on re_string_fetch_byte_case. Unlike
+ re_string_peek_byte_case, this one is really not pure, it modifies
+ memory (increments pstr->cur_idx), and with the pure attribute GCC
+ assumed it doesn't and it cached the presumed value of
+ regexp->cur_idx in a variable across the
+ for (;; ++i)
+ {
+ if (i >= BRACKET_NAME_BUF_SIZE)
+ return REG_EBRACK;
+ if (token->type == OP_OPEN_CHAR_CLASS)
+ ch = re_string_fetch_byte_case (regexp);
+ else
+ ch = re_string_fetch_byte (regexp);
+ if (re_string_eoi(regexp))
+ return REG_EBRACK;
+ if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
+ break;
+ elem->opr.name[i] = ch;
+ }
+
+ 2011-11-29 Andreas Schwab <schwab@redhat.com>
+ * lib/regcomp.c (build_equiv_class):
+ Fix access after end of search string in regex matcher.
+
+ 2011-11-12 Ulrich Drepper <drepper@redhat.com>
+ * lib/regex_internal.c, lib/regex_internal.h: Fix warnings in regex.
+
+ 2011-10-12 Ulrich Drepper <drepper@redhat.com>
+ * lib/regcomp.c (parse_branch): One more regex memory leak fixed.
+
+ 2011-10-11 Ulrich Drepper <drepper@redhat.com>
+ * lib/regcomp.c (parse_branch, parse_sub_exp):
+ More regex memory leak fixes and tests.
+ (parse_sub_exp, parse_bracket_exp):
+ Fix memory leak for some invalid regular expressions.
+
+ 2011-05-28 Ulrich Drepper <drepper@gmail.com>
+ * lib/regex_internal.c, lib/regexec.c:
+ Fix unnecessary overallocation due to incomplete character. When
+ incomplete characters are found at the end of a string the code
+ ran amok and allocated lots of memory. Stricter limits are now in
+ place.
+
+ 2011-05-20 Reuben Thomas <rrt@sc3d.org>
+ * lib/regex.h: Update documentation.
+
+ 2011-05-16 Aharon Robbins <arnold@skeeve.com>
+ * lib/regex.h: Update RE_SYNTAX*_AWK constants.
+
+ 2010-05-05 Andreas Schwab <schwab@redhat.com>
+ * lib/regexec.c (find_collation_sequence_value):
+ Fix lookup of collation sequence value during regexp matching.
+
+ 2010-01-22 Ulrich Drepper <drepper@redhat.com>
+ * lib/regex_internal.c (re_dfa_add_node): Extend overflow detection.
+
+ 2008-01-16 Ulrich Drepper <drepper@redhat.com>
+ * lib/regex.h: Cleanup namespace.
+
+ 2007-11-26 Ulrich Drepper <drepper@redhat.com>
+ * lib/regex.h (REG_ENOSYS): Define REG_ENOSYS also for __USE_XOPEN2K.
+
+ 2007-08-26 Ulrich Drepper <drepper@redhat.com>
+ * lib/regex_internal.h: Prevent some declarations and definitions
+ to be seen when used in tests.
+
+ 2005-05-06 Ulrich Drepper <drepper@redhat.com>
+ * lib/regex_internal.h: Include bits/libc-lock.h or define dummy
+ __libc_lock_* macros if not _LIBC.
+ (struct re_dfa_t): Add lock.
+
2012-02-07 Eric Blake <eblake@redhat.com>
maint.mk: also prohibit lower-case @var@
static const bitset_t utf8_sb_map =
{
/* Set the first 128 bits. */
-# if 4 * BITSET_WORD_BITS < ASCII_CHARS
-# error "bitset_word_t is narrower than 32 bits"
-# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
+# ifdef __GNUC__
+ [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
+# else
+# if 4 * BITSET_WORD_BITS < ASCII_CHARS
+# error "bitset_word_t is narrower than 32 bits"
+# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
-# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
+# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
BITSET_WORD_MAX, BITSET_WORD_MAX,
-# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
+# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
BITSET_WORD_MAX,
-# endif
+# endif
(BITSET_WORD_MAX
>> (SBC_MAX % BITSET_WORD_BITS == 0
? 0
: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
+# endif
};
#endif
calculation below, and for similar doubling calculations
elsewhere. And it's <= rather than <, because some of the
doubling calculations add 1 afterwards. */
- if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
+ if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
return REG_ESPACE;
dfa->nodes_alloc = pat_len + 1;
internal_function
init_word_char (re_dfa_t *dfa)
{
- int i, j, ch;
dfa->word_ops_used = 1;
- for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
+ int i = 0;
+ int j;
+ int ch = 0;
+ if (BE (dfa->map_notascii == 0, 1))
+ {
+ if (BITSET_WORD_BITS == 64)
+ {
+ dfa->word_char[0] = UINT64_C (0x03ff000000000000);
+ dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
+ i = 2;
+ }
+ else if (BITSET_WORD_BITS == 32)
+ {
+ dfa->word_char[0] = UINT32_C (0x00000000);
+ dfa->word_char[1] = UINT32_C (0x03ff0000);
+ dfa->word_char[2] = UINT32_C (0x87fffffe);
+ dfa->word_char[3] = UINT32_C (0x07fffffe);
+ i = 4;
+ }
+ else
+ goto general_case;
+ ch = 128;
+
+ if (BE (dfa->is_utf8, 1))
+ {
+ memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
+ return;
+ }
+ }
+
+ general_case:
+ for (; i < BITSET_WORDS; ++i)
for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
if (isalnum (ch) || ch == '_')
dfa->word_char[i] |= (bitset_word_t) 1 << j;
expr = parse_expression (regexp, preg, token, syntax, nest, err);
if (BE (*err != REG_NOERROR && expr == NULL, 0))
{
+ if (tree != NULL)
+ postorder (tree, free_tree, NULL);
return NULL;
}
if (tree != NULL && expr != NULL)
{
- tree = create_tree (dfa, tree, expr, CONCAT);
- if (tree == NULL)
+ bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
+ if (newtree == NULL)
{
+ postorder (expr, free_tree, NULL);
+ postorder (tree, free_tree, NULL);
*err = REG_ESPACE;
return NULL;
}
+ tree = newtree;
}
else if (tree == NULL)
tree = expr;
{
tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
- *err = REG_EPAREN;
+ {
+ if (tree != NULL)
+ postorder (tree, free_tree, NULL);
+ *err = REG_EPAREN;
+ }
if (BE (*err != REG_NOERROR, 0))
return NULL;
}
static reg_errcode_t
internal_function
-build_collating_symbol (bitset_t sbcset,
# ifdef RE_ENABLE_I18N
- re_charset_t *mbcset, Idx *coll_sym_alloc,
-# endif
- const unsigned char *name)
+build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
+ Idx *coll_sym_alloc, const unsigned char *name)
+# else /* not RE_ENABLE_I18N */
+build_collating_symbol (bitset_t sbcset, const unsigned char *name)
+# endif /* not RE_ENABLE_I18N */
{
size_t name_len = strlen ((const char *) name);
if (BE (name_len != 1, 0))
if (BE (sbcset == NULL, 0))
#endif /* RE_ENABLE_I18N */
{
+ re_free (sbcset);
+#ifdef RE_ENABLE_I18N
+ re_free (mbcset);
+#endif
*err = REG_ESPACE;
return NULL;
}
_NL_COLLATE_EXTRAMB);
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_INDIRECTMB);
- idx1 = findidx (&cp);
- if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
+ idx1 = findidx (&cp, -1);
+ if (BE (idx1 == 0 || *cp != '\0', 0))
/* This isn't a valid character. */
return REG_ECOLLATE;
/* Build single byte matching table for this equivalence class. */
- char_buf[1] = (unsigned char) '\0';
len = weights[idx1 & 0xffffff];
for (ch = 0; ch < SBC_MAX; ++ch)
{
char_buf[0] = ch;
cp = char_buf;
- idx2 = findidx (&cp);
+ idx2 = findidx (&cp, 1);
/*
idx2 = table[ch];
*/
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
-#include <config.h>
+#ifndef _LIBC
+# include <config.h>
+
+# if (__GNUC__ == 4 && 3 <= __GNUC_MINOR__) || 4 < __GNUC__
+# pragma GCC diagnostic ignored "-Wsuggest-attribute=pure"
+# pragma GCC diagnostic ignored "-Wtype-limits"
+# endif
+#endif
/* Make sure no one compiles this code with a C++ compiler. */
#if defined __cplusplus && defined _LIBC
GNU regex allows. Include it before <regex.h>, which correctly
#undefs RE_DUP_MAX and sets it to the right value. */
#include <limits.h>
-#include <strings.h>
#include <regex.h>
#include "regex_internal.h"
/* Definitions for data structures and routines for the regular
expression library.
- Copyright (C) 1985, 1989-1993, 1995-1998, 2000-2003, 2005-2006, 2009-2012
+ Copyright (C) 1985, 1989-1993, 1995-1998, 2000-2003, 2005-2012
Free Software Foundation, Inc.
This file is part of the GNU C Library.
extern "C" {
#endif
-/* Define __USE_GNU_REGEX to declare GNU extensions that violate the
+/* Define __USE_GNU to declare GNU extensions that violate the
POSIX name space rules. */
-#undef __USE_GNU_REGEX
-#if (defined _GNU_SOURCE \
- || (!defined _POSIX_C_SOURCE && !defined _POSIX_SOURCE \
- && !defined _XOPEN_SOURCE))
-# define __USE_GNU_REGEX 1
+#ifdef _GNU_SOURCE
+# define __USE_GNU 1
#endif
#ifdef _REGEX_LARGE_OFFSETS
supported within glibc itself, and glibc users should not define
_REGEX_LARGE_OFFSETS. */
-/* The type of the offset of a byte within a string.
- For historical reasons POSIX 1003.1-2004 requires that regoff_t be
- at least as wide as off_t. However, many common POSIX platforms set
- regoff_t to the more-sensible ssize_t and the Open Group has
- signalled its intention to change the requirement to be that
- regoff_t be at least as wide as ptrdiff_t and ssize_t; see XBD ERN
- 60 (2005-08-25). We don't know of any hosts where ssize_t or
- ptrdiff_t is wider than ssize_t, so ssize_t is safe. */
-typedef ssize_t regoff_t;
-
/* The type of nonnegative object indexes. Traditionally, GNU regex
uses 'int' for these. Code that uses __re_idx_t should work
regardless of whether the type is signed. */
#else
-/* Use types that are binary-compatible with the traditional GNU regex
- implementation, which mishandles strings longer than INT_MAX. */
-
-typedef int regoff_t;
+/* The traditional GNU regex implementation mishandles strings longer
+ than INT_MAX. */
typedef int __re_idx_t;
typedef unsigned int __re_size_t;
typedef unsigned long int __re_long_size_t;
add or remove a bit, only one other definition need change. */
typedef unsigned long int reg_syntax_t;
-#ifdef __USE_GNU_REGEX
-
+#ifdef __USE_GNU
/* If this bit is not set, then \ inside a bracket expression is literal.
If set, then such a \ quotes the following character. */
# define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1)
/* If this bit is set, then no_sub will be set to 1 during
re_compile_pattern. */
# define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
-
-#endif /* defined __USE_GNU_REGEX */
+#endif
/* This global variable defines the particular regexp syntax to use (for
some interfaces). When a regexp is compiled, the syntax used is
already-compiled regexps. */
extern reg_syntax_t re_syntax_options;
\f
-#ifdef __USE_GNU_REGEX
+#ifdef __USE_GNU
/* Define combinations of the above bits for the standard possibilities.
(The [[[ comments delimit what gets put into the Texinfo file, so
don't delete them!) */
| RE_NO_BK_PARENS | RE_NO_BK_REFS \
| RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \
| RE_DOT_NEWLINE | RE_CONTEXT_INDEP_ANCHORS \
+ | RE_CHAR_CLASSES \
| RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS)
# define RE_SYNTAX_GNU_AWK \
- ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG) \
- & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS \
- | RE_CONTEXT_INVALID_OPS ))
+ ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
+ | RE_INVALID_INTERVAL_ORD) \
+ & ~(RE_DOT_NOT_NULL | RE_CONTEXT_INDEP_OPS \
+ | RE_CONTEXT_INVALID_OPS ))
# define RE_SYNTAX_POSIX_AWK \
(RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS \
- | RE_INTERVALS | RE_NO_GNU_OPS)
+ | RE_INTERVALS | RE_NO_GNU_OPS \
+ | RE_INVALID_INTERVAL_ORD)
# define RE_SYNTAX_GREP \
(RE_BK_PLUS_QM | RE_CHAR_CLASSES \
| RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD)
/* [[[end syntaxes]]] */
-#endif /* defined __USE_GNU_REGEX */
-\f
-#ifdef __USE_GNU_REGEX
-
/* Maximum number of duplicates an interval can allow. POSIX-conforming
systems might define this in <limits.h>, but we want our
value, so remove any previous define. */
actually used a pattern like a\{214748363\}, so RE_DUP_MAX retains
its historical value. */
# define RE_DUP_MAX (0x7fff)
-
-#endif /* defined __USE_GNU_REGEX */
+#endif
/* POSIX 'cflags' bits (i.e., information for 'regcomp'). */
_REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */
} reg_errcode_t;
-#ifdef _XOPEN_SOURCE
+#if defined _XOPEN_SOURCE || defined __USE_XOPEN2K
# define REG_ENOSYS _REG_ENOSYS
#endif
#define REG_NOERROR _REG_NOERROR
#define REG_ESIZE _REG_ESIZE
#define REG_ERPAREN _REG_ERPAREN
\f
-/* struct re_pattern_buffer normally uses member names like 'buffer'
- that POSIX does not allow. In POSIX mode these members have names
- with leading 're_' (e.g., 're_buffer'). */
-#ifdef __USE_GNU_REGEX
-# define _REG_RE_NAME(id) id
-# define _REG_RM_NAME(id) id
-#else
-# define _REG_RE_NAME(id) re_##id
-# define _REG_RM_NAME(id) rm_##id
+/* This data structure represents a compiled pattern. Before calling
+ the pattern compiler, the fields 'buffer', 'allocated', 'fastmap',
+ and 'translate' can be set. After the pattern has been compiled,
+ the fields 're_nsub', 'not_bol' and 'not_eol' are available. All
+ other fields are private to the regex routines. */
+
+#ifndef RE_TRANSLATE_TYPE
+# define __RE_TRANSLATE_TYPE unsigned char *
+# ifdef __USE_GNU
+# define RE_TRANSLATE_TYPE __RE_TRANSLATE_TYPE
+# endif
#endif
-/* The user can specify the type of the re_translate member by
- defining the macro RE_TRANSLATE_TYPE, which defaults to unsigned
- char *. This pollutes the POSIX name space, so in POSIX mode just
- use unsigned char *. */
-#ifdef __USE_GNU_REGEX
-# ifndef RE_TRANSLATE_TYPE
-# define RE_TRANSLATE_TYPE unsigned char *
-# endif
-# define REG_TRANSLATE_TYPE RE_TRANSLATE_TYPE
+#ifdef __USE_GNU
+# define __REPB_PREFIX(name) name
#else
-# define REG_TRANSLATE_TYPE unsigned char *
+# define __REPB_PREFIX(name) __##name
#endif
-/* This data structure represents a compiled pattern. Before calling
- the pattern compiler, the fields 'buffer', 'allocated', 'fastmap',
- 'translate', and 'no_sub' can be set. After the pattern has been
- compiled, the 're_nsub' field is available. All other fields are
- private to the regex routines. */
-
struct re_pattern_buffer
{
/* Space that holds the compiled pattern. It is declared as
'unsigned char *' because its elements are sometimes used as
array indexes. */
- unsigned char *_REG_RE_NAME (buffer);
+ unsigned char *__REPB_PREFIX(buffer);
/* Number of bytes to which 'buffer' points. */
- __re_long_size_t _REG_RE_NAME (allocated);
+ __re_long_size_t __REPB_PREFIX(allocated);
/* Number of bytes actually used in 'buffer'. */
- __re_long_size_t _REG_RE_NAME (used);
+ __re_long_size_t __REPB_PREFIX(used);
/* Syntax setting with which the pattern was compiled. */
- reg_syntax_t _REG_RE_NAME (syntax);
+ reg_syntax_t __REPB_PREFIX(syntax);
/* Pointer to a fastmap, if any, otherwise zero. re_search uses the
fastmap, if there is one, to skip over impossible starting points
for matches. */
- char *_REG_RE_NAME (fastmap);
+ char *__REPB_PREFIX(fastmap);
/* Either a translate table to apply to all characters before
comparing them, or zero for no translation. The translation is
applied to a pattern when it is compiled and to a string when it
is matched. */
- REG_TRANSLATE_TYPE _REG_RE_NAME (translate);
+ __RE_TRANSLATE_TYPE __REPB_PREFIX(translate);
/* Number of subexpressions found by the compiler. */
size_t re_nsub;
Well, in truth it's used only in 're_search_2', to see whether or
not we should use the fastmap, so we don't set this absolutely
perfectly; see 're_compile_fastmap' (the "duplicate" case). */
- unsigned int _REG_RE_NAME (can_be_null) : 1;
+ unsigned __REPB_PREFIX(can_be_null) : 1;
/* If REGS_UNALLOCATED, allocate space in the 'regs' structure
for 'max (RE_NREGS, re_nsub + 1)' groups.
If REGS_REALLOCATE, reallocate space if necessary.
If REGS_FIXED, use what's there. */
-#ifdef __USE_GNU_REGEX
+#ifdef __USE_GNU
# define REGS_UNALLOCATED 0
# define REGS_REALLOCATE 1
# define REGS_FIXED 2
#endif
- unsigned int _REG_RE_NAME (regs_allocated) : 2;
+ unsigned __REPB_PREFIX(regs_allocated) : 2;
/* Set to zero when 're_compile_pattern' compiles a pattern; set to
one by 're_compile_fastmap' if it updates the fastmap. */
- unsigned int _REG_RE_NAME (fastmap_accurate) : 1;
+ unsigned __REPB_PREFIX(fastmap_accurate) : 1;
/* If set, 're_match_2' does not return information about
subexpressions. */
- unsigned int _REG_RE_NAME (no_sub) : 1;
+ unsigned __REPB_PREFIX(no_sub) : 1;
/* If set, a beginning-of-line anchor doesn't match at the beginning
of the string. */
- unsigned int _REG_RE_NAME (not_bol) : 1;
+ unsigned __REPB_PREFIX(not_bol) : 1;
/* Similarly for an end-of-line anchor. */
- unsigned int _REG_RE_NAME (not_eol) : 1;
+ unsigned __REPB_PREFIX(not_eol) : 1;
/* If true, an anchor at a newline matches. */
- unsigned int _REG_RE_NAME (newline_anchor) : 1;
-
-/* [[[end pattern_buffer]]] */
+ unsigned __REPB_PREFIX(newline_anchor) : 1;
};
typedef struct re_pattern_buffer regex_t;
\f
+/* Type for byte offsets within the string. POSIX mandates this. */
+#ifdef _REGEX_LARGE_OFFSETS
+/* POSIX 1003.1-2008 requires that regoff_t be at least as wide as
+ ptrdiff_t and ssize_t. We don't know of any hosts where ptrdiff_t
+ is wider than ssize_t, so ssize_t is safe. */
+typedef ssize_t regoff_t;
+#else
+/* The traditional GNU regex implementation mishandles strings longer
+ than INT_MAX. */
+typedef int regoff_t;
+#endif
+
+
+#ifdef __USE_GNU
/* This is the structure we store register match data in. See
regex.texinfo for a full description of what registers match. */
struct re_registers
{
- __re_size_t _REG_RM_NAME (num_regs);
- regoff_t *_REG_RM_NAME (start);
- regoff_t *_REG_RM_NAME (end);
+ __re_size_t num_regs;
+ regoff_t *start;
+ regoff_t *end;
};
/* If 'regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
're_match_2' returns information about at least this many registers
the first time a 'regs' structure is passed. */
-#if !defined RE_NREGS && defined __USE_GNU_REGEX
-# define RE_NREGS 30
+# ifndef RE_NREGS
+# define RE_NREGS 30
+# endif
#endif
\f
/* Declarations for routines. */
+#ifdef __USE_GNU
/* Sets the current default syntax to SYNTAX, and return the old syntax.
You can also simply assign to the 're_syntax_options' variable. */
extern reg_syntax_t re_set_syntax (reg_syntax_t __syntax);
/* Compile the regular expression PATTERN, with length LENGTH
and syntax given by the global 're_syntax_options', into the buffer
- BUFFER. Return NULL if successful, and an error string if not. */
+ BUFFER. Return NULL if successful, and an error string if not.
+
+ To free the allocated storage, you must call 'regfree' on BUFFER.
+ Note that the translate table must either have been initialised by
+ 'regcomp', with a malloc'ed value, or set to NULL before calling
+ 'regfree'. */
extern const char *re_compile_pattern (const char *__pattern, size_t __length,
struct re_pattern_buffer *__buffer);
register data.
Unless this function is called, the first search or match using
- BUFFER will allocate its own register data, without freeing the old
- data. */
+ BUFFER will allocate its own register data, without
+ freeing the old data. */
extern void re_set_registers (struct re_pattern_buffer *__buffer,
struct re_registers *__regs,
__re_size_t __num_regs,
regoff_t *__starts, regoff_t *__ends);
+#endif /* Use GNU */
-#if defined _REGEX_RE_COMP || defined _LIBC
+#if defined _REGEX_RE_COMP || (defined _LIBC && defined __USE_BSD)
# ifndef _CRAY
/* 4.2 bsd compatibility. */
extern char *re_comp (const char *);
{
wint_t *new_wcs;
- /* Avoid overflow. */
- size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
- if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
+ /* Avoid overflow in realloc. */
+ const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
+ if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
return REG_ESPACE;
new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
else
p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
- if (BE (mbclen == (size_t) -2, 0))
- {
- /* The buffer doesn't have enough space, finish to build. */
- pstr->cur_state = prev_st;
- break;
- }
- else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
+ if (BE (mbclen == (size_t) -1 || mbclen == 0
+ || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
{
/* We treat these cases as a singlebyte character. */
mbclen = 1;
wc = pstr->trans[wc];
pstr->cur_state = prev_st;
}
+ else if (BE (mbclen == (size_t) -2, 0))
+ {
+ /* The buffer doesn't have enough space, finish to build. */
+ pstr->cur_state = prev_st;
+ break;
+ }
/* Write wide character and padding. */
pstr->wcs[byte_idx++] = wc;
for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
pstr->wcs[byte_idx++] = WEOF;
}
- else if (mbclen == (size_t) -1 || mbclen == 0)
+ else if (mbclen == (size_t) -1 || mbclen == 0
+ || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
{
- /* It is an invalid character or '\0'. Just use the byte. */
+ /* It is an invalid character, an incomplete character
+ at the end of the string, or '\0'. Just use the byte. */
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
pstr->mbs[byte_idx] = ch;
/* And also cast it to wide char. */
for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
pstr->wcs[byte_idx++] = WEOF;
}
- else if (mbclen == (size_t) -1 || mbclen == 0)
+ else if (mbclen == (size_t) -1 || mbclen == 0
+ || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
{
/* It is an invalid character or '\0'. Just use the byte. */
int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
rawbuf_idx < new_raw_idx;)
{
wchar_t wc2;
- Idx remain_len;
- remain_len = pstr->len - rawbuf_idx;
+ Idx remain_len = pstr->len - rawbuf_idx;
prev_st = pstr->cur_state;
mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
remain_len, &pstr->cur_state);
mbstate_t cur_state;
wchar_t wc2;
Idx mlen = raw + pstr->len - p;
+ unsigned char buf[6];
size_t mbclen;
-#if 0 /* dead code: buf is set but never used */
- unsigned char buf[6];
+ const unsigned char *pp = p;
if (BE (pstr->trans != NULL, 0))
{
int i = mlen < 6 ? mlen : 6;
while (--i >= 0)
buf[i] = pstr->trans[p[i]];
+ pp = buf;
}
-#endif
/* XXX Don't use mbrtowc, we know which conversion
to use (UTF-8 -> UCS4). */
memset (&cur_state, 0, sizeof (cur_state));
- mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
+ mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
&cur_state);
if (raw + offset - p <= mbclen
&& mbclen < (size_t) -2)
}
static unsigned char
-internal_function __attribute ((pure))
+internal_function
re_string_fetch_byte_case (re_string_t *pstr)
{
if (BE (!pstr->mbs_allocated, 1))
Idx *new_nexts, *new_indices;
re_node_set *new_edests, *new_eclosures;
re_token_t *new_nodes;
- size_t max_object_size =
- MAX (sizeof (re_token_t),
- MAX (sizeof (re_node_set),
- sizeof (Idx)));
- /* Avoid overflows. */
- if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
+ /* Avoid overflows in realloc. */
+ const size_t max_object_size = MAX (sizeof (re_token_t),
+ MAX (sizeof (re_node_set),
+ sizeof (Idx)));
+ if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
return REG_MISSING;
new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
{
Idx elem = newstate->nodes.elems[i];
if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
- if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
+ if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
return REG_ESPACE;
}
#include <assert.h>
#include <ctype.h>
-#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
# include "localcharset.h"
#endif
#include <locale.h>
-
#include <wchar.h>
#include <wctype.h>
+#include <stdbool.h>
#include <stdint.h>
#if defined _LIBC
# include <bits/libc-lock.h>
#else
+# define __libc_lock_define(CLASS,NAME)
# define __libc_lock_init(NAME) do { } while (0)
# define __libc_lock_lock(NAME) do { } while (0)
# define __libc_lock_unlock(NAME) do { } while (0)
# define __wctype wctype
# define __iswctype iswctype
# define __btowc btowc
-# define __wcrtomb wcrtomb
# define __mbrtowc mbrtowc
+# define __wcrtomb wcrtomb
# define __regfree regfree
# define attribute_hidden
#endif /* not _LIBC */
#endif
typedef __re_idx_t Idx;
+#ifdef _REGEX_LARGE_OFFSETS
+# define IDX_MAX (SIZE_MAX - 2)
+#else
+# define IDX_MAX INT_MAX
+#endif
/* Special return value for failure to match. */
#define REG_MISSING ((Idx) -1)
# define internal_function
#endif
+#ifndef NOT_IN_libc
static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
Idx new_buf_len)
internal_function;
-#ifdef RE_ENABLE_I18N
+# ifdef RE_ENABLE_I18N
static void build_wcs_buffer (re_string_t *pstr) internal_function;
static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr)
- internal_function;
-#endif /* RE_ENABLE_I18N */
+ internal_function;
+# endif /* RE_ENABLE_I18N */
static void build_upper_buffer (re_string_t *pstr) internal_function;
static void re_string_translate_buffer (re_string_t *pstr) internal_function;
static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
int eflags)
internal_function __attribute ((pure));
+#endif
#define re_string_peek_byte(pstr, offset) \
((pstr)->mbs[(pstr)->cur_idx + offset])
#define re_string_fetch_byte(pstr) \
#ifndef MAX
# define MAX(a,b) ((a) < (b) ? (b) : (a))
#endif
+#ifndef MIN
+/* Yield the smaller of A and B.  Either argument may be evaluated twice.  */
+# define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t)))
#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t)))
#ifdef DEBUG
char* re_str;
#endif
-#ifdef _LIBC
__libc_lock_define (, lock)
-#endif
};
#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set))
return (wint_t) pstr->wcs[idx];
}
+# ifndef NOT_IN_libc
static int
internal_function __attribute ((pure))
re_string_elem_size_at (const re_string_t *pstr, Idx idx)
{
-# ifdef _LIBC
+# ifdef _LIBC
const unsigned char *p, *extra;
const int32_t *table, *indirect;
- int32_t tmp;
-# include <locale/weight.h>
+# include <locale/weight.h>
uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
if (nrules != 0)
indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
_NL_COLLATE_INDIRECTMB);
p = pstr->mbs + idx;
- tmp = findidx (&p);
+ findidx (&p, pstr->len - idx);
return p - pstr->mbs - idx;
}
else
-# endif /* _LIBC */
+# endif /* _LIBC */
return 1;
}
+# endif
#endif /* RE_ENABLE_I18N */
#ifndef __GNUC_PREREQ
regoff_t range, Idx stop,
struct re_registers *regs,
bool ret_len) internal_function;
-static unsigned int re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
- Idx nregs, int regs_allocated)
- internal_function;
+static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
+ Idx nregs, int regs_allocated) internal_function;
static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
internal_function;
static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
#endif
static regoff_t
-internal_function
re_search_2_stub (struct re_pattern_buffer *bufp,
const char *string1, Idx length1,
const char *string2, Idx length2,
otherwise the position of the match is returned. */
static regoff_t
-internal_function
re_search_stub (struct re_pattern_buffer *bufp,
const char *string, Idx length,
Idx start, regoff_t range, Idx stop, struct re_registers *regs,
return rval;
}
-static unsigned int
-internal_function
+static unsigned
re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
int regs_allocated)
{
(0 <= LAST_START && LAST_START <= LENGTH) */
static reg_errcode_t
-internal_function __attribute_warn_unused_result__
+__attribute_warn_unused_result__
re_search_internal (const regex_t *preg,
const char *string, Idx length,
Idx start, Idx last_start, Idx stop,
if (nmatch > 1 || dfa->has_mb_node)
{
/* Avoid overflow. */
- if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
+ if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
+ <= mctx.input.bufs_len), 0))
{
err = REG_ESPACE;
goto free_return;
}
static reg_errcode_t
-internal_function __attribute_warn_unused_result__
+__attribute_warn_unused_result__
prune_impossible_nodes (re_match_context_t *mctx)
{
const re_dfa_t *const dfa = mctx->dfa;
halt_node = mctx->last_node;
/* Avoid overflow. */
- if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
+ if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
return REG_ESPACE;
sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
re_dfastate_t *old_state = cur_state;
Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
- if (BE (next_char_idx >= mctx->input.bufs_len, 0)
+ if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
+ && mctx->input.bufs_len < mctx->input.len)
|| (BE (next_char_idx >= mctx->input.valid_len, 0)
&& mctx->input.valid_len < mctx->input.len))
{
{
Idx top = mctx->state_log_top;
- if (next_state_log_idx >= mctx->input.bufs_len
+ if ((next_state_log_idx >= mctx->input.bufs_len
+ && mctx->input.bufs_len < mctx->input.len)
|| (next_state_log_idx >= mctx->input.valid_len
&& mctx->input.valid_len < mctx->input.len))
{
{
re_dfastate_t **new_array;
Idx old_alloc = path->alloc;
- Idx new_alloc = old_alloc + last_str + mctx->max_mb_elem_len + 1;
- if (BE (new_alloc < old_alloc, 0)
- || BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
+ Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
+ Idx new_alloc;
+ if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
+ return REG_ESPACE;
+ new_alloc = old_alloc + incr_alloc;
+ if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
return REG_ESPACE;
new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
if (BE (new_array == NULL, 0))
{
if (dests_node_malloced)
free (dests_alloc);
+ /* Return false in case of an error, true otherwise. */
if (ndests == 0)
{
state->trtable = (re_dfastate_t **)
const int32_t *table, *indirect;
const unsigned char *weights, *extra;
const char *collseqwc;
- int32_t idx;
/* This #include defines a local function! */
# include <locale/weight.h>
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
indirect = (const int32_t *)
_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
- int32_t idx = findidx (&cp);
+ int32_t idx = findidx (&cp, elem_len);
if (idx > 0)
for (i = 0; i < cset->nequiv_classes; ++i)
{
/* Skip the collation sequence value. */
idx += sizeof (uint32_t);
/* Skip the wide char sequence of the collating element. */
- idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
+ idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
/* If we found the entry, return the sequence value. */
if (found)
return *(uint32_t *) (extra + idx);
re_string_t *pstr = &mctx->input;
/* Avoid overflow. */
- if (BE (SIZE_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
+ if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
+ <= pstr->bufs_len, 0))
return REG_ESPACE;
/* Double the lengthes of the buffers. */
- ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
+ ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2));
if (BE (ret != REG_NOERROR, 0))
return ret;
size_t max_object_size =
MAX (sizeof (struct re_backref_cache_entry),
sizeof (re_sub_match_top_t *));
- if (BE (SIZE_MAX / max_object_size < n, 0))
+ if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
return REG_ESPACE;
mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);