From: Bruno Haible
Date: Sun, 8 Feb 2009 19:56:37 +0000 (+0100)
Subject: New module 'uniwbrk/u8-wordbreaks'.
X-Git-Tag: v0.1~6350
X-Git-Url: http://erislabs.org.uk/gitweb/?a=commitdiff_plain;h=5956ed4c1444f84c9cd620ea9e918f376f3c3ed4;p=gnulib.git

New module 'uniwbrk/u8-wordbreaks'.
---

diff --git a/ChangeLog b/ChangeLog
index ea08b916e..60be23ab0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2009-02-08  Bruno Haible
 
+	New module 'uniwbrk/u8-wordbreaks'.
+	* modules/uniwbrk/u8-wordbreaks: New file.
+	* lib/uniwbrk/u8-wordbreaks.c: New file.
+	* lib/uniwbrk/u-wordbreaks.h: New file.
+
 	New module 'uniwbrk/table'.
 	* modules/uniwbrk/table: New file.
 	* lib/uniwbrk/wbrktable.h: New file.
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
new file mode 100644
index 000000000..5ef4e8c1a
--- /dev/null
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -0,0 +1,127 @@
+/* Word breaks in UTF-8/UTF-16/UTF-32 strings.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Template for the word break functions.  The including file must define
+   FUNC (the function name), UNIT (the element type of the string) and
+   U_MBTOUC_UNSAFE (the character decoder) before including this file.
+
+   Determines the word boundaries in the string S of N units and stores the
+   result at P[0..N-1]:
+     P[i] = 1  if there is a word boundary immediately before the character
+               that starts at S[i],
+     P[i] = 0  otherwise.  This is always the case for the first character
+               of the string and for every unit that is not the first unit
+               of a character.
+   The value returned by U_MBTOUC_UNSAFE is used as the number of units
+   occupied by the character that was just decoded.
+   When N is 0, nothing is done; P is not accessed.  */
+void
+FUNC (const UNIT *s, size_t n, char *p)
+{
+  if (n > 0)
+    {
+      const UNIT *s_end = s + n;
+
+      /* Word break property of the last character.
+         -1 at the very beginning of the string.  */
+      int last_char_prop = -1;
+
+      /* Format and Extend characters are ignored; this means, the mostly used
+         unit is the complex character (= character with subsequent ignored
+         characters).
+         Word break property of the last complex character.
+         -1 at the very beginning of the string.  */
+      int last_compchar_prop = -1;
+      char *last_compchar_ptr = NULL;
+
+      /* For recognizing rules involving 3 complex characters:
+         Word break property of the second-to-last complex character.
+         -1 at the very beginning of the string.  */
+      int secondlast_compchar_prop = -1;
+
+      /* Don't break inside multibyte characters.  */
+      memset (p, 0, n);
+
+      while (s < s_end)
+        {
+          ucs4_t uc;
+          int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
+          int prop = uc_wordbreak_property (uc);
+
+          /* No break at the start of the string.  */
+          if (last_char_prop >= 0)
+            {
+              /* No break between CR and LF.  */
+              if (last_char_prop == WBP_CR && prop == WBP_LF)
+                /* *p = 0 */;
+              /* Break before and after newlines.  */
+              else if (last_char_prop >= WBP_NEWLINE
+                       /* same as:
+                          last_char_prop == WBP_CR
+                          || last_char_prop == WBP_LF
+                          || last_char_prop == WBP_NEWLINE */
+                       || prop >= WBP_NEWLINE
+                          /* same as:
+                             prop == WBP_CR
+                             || prop == WBP_LF
+                             || prop == WBP_NEWLINE */)
+                *p = 1;
+              /* Ignore Format and Extend characters.  */
+              else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
+                {
+                  /* No break in these situations (see UAX #29):
+
+                      secondlast          last             current
+
+                       ALetter   (MidLetter | MidNumLet) × ALetter      (WB7)
+                                 ALetter × (MidLetter | MidNumLet) ALetter (WB6)
+                       Numeric   (MidNum | MidNumLet)    × Numeric      (WB11)
+                                 Numeric × (MidNum | MidNumLet) Numeric (WB12)
+                                                 ALetter × ALetter      (WB5)
+                                                 ALetter × Numeric      (WB9)
+                                                 Numeric × ALetter      (WB10)
+                                                 Numeric × Numeric      (WB8)
+                                                Katakana × Katakana     (WB13)
+                          (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
+                                            ExtendNumLet × ExtendNumLet (WB13a)
+                         ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b)
+                   */
+                  /* No break across certain punctuation.  Also, disable word
+                     breaks that were recognized earlier (due to lookahead of
+                     only one complex character).  */
+                  if ((prop == WBP_ALETTER
+                       && (last_compchar_prop == WBP_MIDLETTER
+                           || last_compchar_prop == WBP_MIDNUMLET)
+                       && secondlast_compchar_prop == WBP_ALETTER)
+                      || (prop == WBP_NUMERIC
+                          && (last_compchar_prop == WBP_MIDNUM
+                              || last_compchar_prop == WBP_MIDNUMLET)
+                          && secondlast_compchar_prop == WBP_NUMERIC))
+                    {
+                      /* Retract the boundary that may have been stored in
+                         front of the Mid* character when it was processed.
+                         last_compchar_ptr has been assigned by now, because
+                         secondlast_compchar_prop only receives a value in
+                         the same place where last_compchar_ptr is set.  */
+                      *last_compchar_ptr = 0;
+                      /* *p = 0; */
+                    }
+                  else
+                    {
+                      /* Perform a single table lookup.  */
+                      if (uniwbrk_table[last_compchar_prop][prop])
+                        *p = 1;
+                      /* else *p = 0; */
+                    }
+                }
+            }
+
+          last_char_prop = prop;
+          /* Ignore Format and Extend characters, except at the start of the string.  */
+          if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
+            {
+              secondlast_compchar_prop = last_compchar_prop;
+              last_compchar_prop = prop;
+              /* Remember where this complex character starts, so that a
+                 boundary stored there can be retracted later.  */
+              last_compchar_ptr = p;
+            }
+
+          s += count;
+          p += count;
+        }
+    }
+}
diff --git a/lib/uniwbrk/u8-wordbreaks.c b/lib/uniwbrk/u8-wordbreaks.c
new file mode 100644
index 000000000..59d2076de
--- /dev/null
+++ b/lib/uniwbrk/u8-wordbreaks.c
@@ -0,0 +1,124 @@
+/* Word breaks in UTF-8 strings.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.
*/
+#include "uniwbrk.h"
+
+#include <string.h>
+
+#include "unistr.h"
+#include "uniwbrk/wbrktable.h"
+
+#define FUNC u8_wordbreaks
+#define UNIT uint8_t
+#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe
+#include "u-wordbreaks.h"
+
+
+#ifdef TEST
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Read the contents of an input stream, and return it, terminated with a NUL
+   byte.
+   The result is allocated with realloc; the caller is responsible for
+   freeing it.  On a read error or when memory is exhausted, a message is
+   printed to stderr and the program exits with status 1, so the return
+   value is never NULL.  */
+char *
+read_file (FILE *stream)
+{
+#define BUFSIZE 4096
+  char *buf = NULL;
+  /* Sizes are kept in size_t: this is what fread returns and what realloc
+     expects, and it cannot overflow the way an 'int' counter can on large
+     inputs.  */
+  size_t alloc = 0;
+  size_t size = 0;
+  size_t count;
+
+  while (! feof (stream))
+    {
+      /* Make sure there is room for one more full chunk; grow by 50%, but
+         at least far enough for that chunk.  */
+      if (size + BUFSIZE > alloc)
+        {
+          alloc = alloc + alloc / 2;
+          if (alloc < size + BUFSIZE)
+            alloc = size + BUFSIZE;
+          buf = realloc (buf, alloc);
+          if (buf == NULL)
+            {
+              fprintf (stderr, "out of memory\n");
+              exit (1);
+            }
+        }
+      count = fread (buf + size, 1, BUFSIZE, stream);
+      if (count == 0)
+        {
+          /* Nothing was read: either end of file (the loop condition ends
+             the loop) or a read error.  */
+          if (ferror (stream))
+            {
+              perror ("fread");
+              exit (1);
+            }
+        }
+      else
+        size += count;
+    }
+  /* Resize to the exact length plus the terminating NUL byte.  */
+  buf = realloc (buf, size + 1);
+  if (buf == NULL)
+    {
+      fprintf (stderr, "out of memory\n");
+      exit (1);
+    }
+  buf[size] = '\0';
+  return buf;
+#undef BUFSIZE
+}
+
+/* Test driver: without arguments, copy standard input to standard output
+   and insert U+2027 at every position where u8_wordbreaks reports a word
+   boundary.  Return 0 in this case, 1 when arguments are given.
+   The input length is taken with strlen, so input is only processed up to
+   its first NUL byte.  */
+int
+main (int argc, char * argv[])
+{
+  if (argc == 1)
+    {
+      /* Display all the word breaks in the input string.  */
+      char *input = read_file (stdin);
+      size_t length = strlen (input);
+      char *breaks = malloc (length);
+      size_t i;
+
+      /* malloc (0) may return NULL without this being an error; only a
+         failed allocation of a non-empty array is fatal.  */
+      if (breaks == NULL && length > 0)
+        {
+          fprintf (stderr, "out of memory\n");
+          exit (1);
+        }
+
+      u8_wordbreaks ((uint8_t *) input, length, breaks);
+
+      for (i = 0; i < length; i++)
+        {
+          switch (breaks[i])
+            {
+            case 1:
+              /* U+2027 in UTF-8 encoding */
+              putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
+              break;
+            case 0:
+              break;
+            default:
+              abort ();
+            }
+          putc (input[i], stdout);
+        }
+
+      free (breaks);
+      free (input);
+
+      return 0;
+    }
+  else
+    return 1;
+}
+
+#endif /* TEST */
diff --git a/modules/uniwbrk/u8-wordbreaks b/modules/uniwbrk/u8-wordbreaks
new file mode 100644
index 000000000..71aaf4523
--- /dev/null
+++ b/modules/uniwbrk/u8-wordbreaks
@@ -0,0 +1,26 @@
+Description:
+Word breaks in UTF-8 strings.
+ +Files: +lib/uniwbrk/u8-wordbreaks.c +lib/uniwbrk/u-wordbreaks.h + +Depends-on: +uniwbrk/base +uniwbrk/table +unistr/u8-mbtouc-unsafe + +configure.ac: + +Makefile.am: +lib_SOURCES += uniwbrk/u8-wordbreaks.c + +Include: +"uniwbrk.h" + +License: +LGPL + +Maintainer: +Bruno Haible +