From: Bruno Haible <bruno@clisp.org>
Date: Sun, 8 Mar 2009 15:28:47 +0000 (+0100)
Subject: New module 'unicase/u8-casefold'.
X-Git-Tag: v0.1~6158
X-Git-Url: http://erislabs.org.uk/gitweb/?a=commitdiff_plain;h=3c44dc1b5b0882d7278226f79c5d4a2ac8a8f363;p=gnulib.git

New module 'unicase/u8-casefold'.
---
Revised relative to commit 3c44dc1b: the TEST driver in u8-casefold.c now
checks the u8_casefold and fwrite results and releases its buffers, read_file
counts bytes with size_t, and FUNC gained a header comment.  The blob ids in
the "index" lines below are those of the original commit.

diff --git a/ChangeLog b/ChangeLog
index 3455cb4a7..0d21ed8ae 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2009-03-08  Bruno Haible  <bruno@clisp.org>
 
+	New module 'unicase/u8-casefold'.
+	* lib/unicase/u8-casefold.c: New file.
+	* lib/unicase/u-casefold.h: New file.
+	* modules/unicase/u8-casefold: New file.
+
 	New module 'unicase/tocasefold'.
 	* lib/unicase/casefold.h: New file.
 	* lib/unicase/tocasefold.c: New file.
diff --git a/lib/unicase/u-casefold.h b/lib/unicase/u-casefold.h
new file mode 100644
index 000000000..01699fc7f
--- /dev/null
+++ b/lib/unicase/u-casefold.h
@@ -0,0 +1,111 @@
+/* Casefolding mapping for Unicode strings (locale dependent).
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Return the case folded form of the N units starting at S.
+   ISO639_LANGUAGE is handed unchanged to U_CASEMAP.
+   NF is the normalization form to apply to the result, or NULL for none.
+   RESULTBUF and LENGTHP are handed to the last U_CASEMAP or U_NORMALIZE
+   call.  Judging by the temporary buffers below, *LENGTHP carries the
+   buffer capacity in and the result length out, and a result that does not
+   fit is returned in freshly allocated memory instead.
+   Return NULL, with errno set, when a mapping or normalization step fails.  */
+UNIT *
+FUNC (const UNIT *s, size_t n, const char *iso639_language,
+      uninorm_t nf,
+      UNIT *resultbuf, size_t *lengthp)
+{
+  /* Implement the three definitions of caseless matching, as described in
+     Unicode 5.0, section "Default caseless matching":
+       - If no normalization is requested, simply apply the casefolding.
+           X -> toCasefold(X).
+       - If canonical normalization is requested, apply it, and apply an NFD
+         before.
+           X -> NFD(toCasefold(NFD(X))).
+       - If compatibility normalization is requested, apply the casefolding
+         twice, apply the normalization after each, and apply an NFD before:
+           X -> NFKD(toCasefold(NFKD(toCasefold(NFD(X))))).  */
+  if (nf == NULL)
+    /* X -> toCasefold(X) */
+    return U_CASEMAP (s, n, iso639_language,
+                      uc_tocasefold, offsetof (struct special_casing_rule, casefold[0]),
+                      NULL,
+                      resultbuf, lengthp);
+  else
+    {
+      uninorm_t nfd = uninorm_decomposing_form (nf);
+      /* X -> nf(toCasefold(NFD(X))) or
+         X -> nf(toCasefold(nfd(toCasefold(NFD(X))))) */
+      int repeat = (uninorm_is_compat_decomposing (nf) ? 2 : 1);
+      UNIT tmpbuf1[2048 / sizeof (UNIT)];
+      UNIT tmpbuf2[2048 / sizeof (UNIT)];
+      UNIT *tmp1;
+      size_t tmp1_length;
+      UNIT *tmp2;
+      size_t tmp2_length;
+
+      tmp1_length = sizeof (tmpbuf1) / sizeof (UNIT);
+      tmp1 = U_NORMALIZE (UNINORM_NFD, s, n, tmpbuf1, &tmp1_length);
+      if (tmp1 == NULL)
+        /* errno is set here.  */
+        return NULL;
+
+      do
+        {
+          tmp2_length = sizeof (tmpbuf2) / sizeof (UNIT);
+          tmp2 = U_CASEMAP (tmp1, tmp1_length, iso639_language,
+                            uc_tocasefold, offsetof (struct special_casing_rule, casefold[0]),
+                            NULL,
+                            tmpbuf2, &tmp2_length);
+          if (tmp2 == NULL)
+            {
+              int saved_errno = errno;
+              if (tmp1 != tmpbuf1)
+                free (tmp1);
+              errno = saved_errno;
+              return NULL;
+            }
+
+          if (tmp1 != tmpbuf1)
+            free (tmp1);
+
+          if (repeat > 1)
+            {
+              tmp1_length = sizeof (tmpbuf1) / sizeof (UNIT);
+              tmp1 = U_NORMALIZE (nfd, tmp2, tmp2_length,
+                                  tmpbuf1, &tmp1_length);
+            }
+          else
+            /* Last run through this loop.  */
+            tmp1 = U_NORMALIZE (nf, tmp2, tmp2_length,
+                                resultbuf, lengthp);
+          if (tmp1 == NULL)
+            {
+              int saved_errno = errno;
+              if (tmp2 != tmpbuf2)
+                free (tmp2);
+              errno = saved_errno;
+              return NULL;
+            }
+
+          if (tmp2 != tmpbuf2)
+            free (tmp2);
+        }
+      while (--repeat > 0);
+
+      return tmp1;
+    }
+}
diff --git a/lib/unicase/u8-casefold.c b/lib/unicase/u8-casefold.c
new file mode 100644
index 000000000..f0f71f208
--- /dev/null
+++ b/lib/unicase/u8-casefold.c
@@ -0,0 +1,137 @@
+/* Casefolding mapping for UTF-8 strings (locale dependent).
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "unicase.h"
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "unicasemap.h"
+#include "special-casing.h"
+#include "casefold.h"
+
+#define FUNC u8_casefold
+#define UNIT uint8_t
+#define U_CASEMAP u8_casemap
+#define U_NORMALIZE u8_normalize
+#include "u-casefold.h"
+
+
+#ifdef TEST
+
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Read the contents of an input stream, and return it, terminated with a NUL
+   byte.  Exit with status 1 on a read error or when memory runs out.  */
+char *
+read_file (FILE *stream)
+{
+#define BUFSIZE 4096
+  char *buf = NULL;
+  size_t alloc = 0;
+  size_t size = 0;
+  size_t count;
+
+  while (! feof (stream))
+    {
+      if (size + BUFSIZE > alloc)
+        {
+          /* Grow by half, but always by at least one full read chunk.  */
+          alloc = alloc + alloc / 2;
+          if (alloc < size + BUFSIZE)
+            alloc = size + BUFSIZE;
+          buf = realloc (buf, alloc);
+          if (buf == NULL)
+            {
+              fprintf (stderr, "out of memory\n");
+              exit (1);
+            }
+        }
+      count = fread (buf + size, 1, BUFSIZE, stream);
+      if (count == 0)
+        {
+          if (ferror (stream))
+            {
+              perror ("fread");
+              exit (1);
+            }
+        }
+      else
+        size += count;
+    }
+  /* Resize to hold exactly the data plus the terminating NUL.  */
+  buf = realloc (buf, size + 1);
+  if (buf == NULL)
+    {
+      fprintf (stderr, "out of memory\n");
+      exit (1);
+    }
+  buf[size] = '\0';
+  return buf;
+#undef BUFSIZE
+}
+
+/* Test driver: write the case folded form of standard input to standard
+   output.  Return 0 on success, 1 when command line arguments are given;
+   exit with status 1 when casefolding or writing fails.  */
+int
+main (int argc, char * argv[])
+{
+  setlocale (LC_ALL, "");
+  if (argc == 1)
+    {
+      /* Display the case folded input string.  */
+      char *input = read_file (stdin);
+      /* Embedded NUL bytes in the input end the string early.  */
+      size_t length = strlen (input);
+      size_t output_length;
+      uint8_t *output =
+        u8_casefold ((uint8_t *) input, length, uc_locale_language (),
+                     NULL,
+                     NULL, &output_length);
+
+      /* On failure output_length is not known to be set, so it must not be
+         handed to fwrite.  */
+      if (output == NULL)
+        {
+          perror ("u8_casefold");
+          exit (1);
+        }
+
+      if (fwrite (output, 1, output_length, stdout) != output_length)
+        {
+          perror ("fwrite");
+          exit (1);
+        }
+
+      free (output);
+      free (input);
+
+      return 0;
+    }
+  else
+    return 1;
+}
+
+#endif /* TEST */
diff --git a/modules/unicase/u8-casefold b/modules/unicase/u8-casefold
new file mode 100644
index 000000000..5996151ab
--- /dev/null
+++ b/modules/unicase/u8-casefold
@@ -0,0 +1,30 @@
+Description:
+Casefolding mapping for UTF-8 strings (locale dependent).
+
+Files:
+lib/unicase/u8-casefold.c
+lib/unicase/u-casefold.h
+
+Depends-on:
+unicase/base
+unicase/u8-casemap
+unicase/special-casing
+unicase/tocasefold
+uninorm/decomposing-form
+uninorm/u8-normalize
+uninorm/nfd
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += unicase/u8-casefold.c
+
+Include:
+"unicase.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+