From: Bruno Haible Date: Sat, 21 Feb 2009 11:01:44 +0000 (+0100) Subject: New module 'uninorm/base'. X-Git-Tag: v0.1~6311 X-Git-Url: http://erislabs.org.uk/gitweb/?a=commitdiff_plain;h=762e95706d4dec592fbdda953092e3570e54e499;p=gnulib.git New module 'uninorm/base'. --- diff --git a/ChangeLog b/ChangeLog index 44002ff34..73b797a2c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2009-02-21 Bruno Haible + + New module 'uninorm/base'. + * lib/uninorm.h: New file. + * lib/unictype.h: Update comment. + * modules/uninorm/base: New file. + 2009-02-21 David Lutterkort Tests for module 'safe-alloc'. diff --git a/lib/unictype.h b/lib/unictype.h index ab5477c57..46c05497a 100644 --- a/lib/unictype.h +++ b/lib/unictype.h @@ -1,5 +1,5 @@ /* Unicode character classification and properties. - Copyright (C) 2002, 2005-2008 Free Software Foundation, Inc. + Copyright (C) 2002, 2005-2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published @@ -301,7 +301,7 @@ extern bool /* ========================================================================= */ /* Field 5 of Unicode Character Database: Character decomposition mapping. - See "unicomp.h". */ + See "uninorm.h". */ /* ========================================================================= */ diff --git a/lib/uninorm.h b/lib/uninorm.h new file mode 100644 index 000000000..8750e1f37 --- /dev/null +++ b/lib/uninorm.h @@ -0,0 +1,153 @@ +/* Normalization forms (composition and decomposition) of Unicode strings. + Copyright (C) 2001-2002, 2009 Free Software Foundation, Inc. + Written by Bruno Haible , 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +#ifndef _UNINORM_H +#define _UNINORM_H + +/* Get size_t. */ +#include + +#include "unitypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Conventions: + + All functions prefixed with u8_ operate on UTF-8 encoded strings. + Their unit is an uint8_t (1 byte). + + All functions prefixed with u16_ operate on UTF-16 encoded strings. + Their unit is an uint16_t (a 2-byte word). + + All functions prefixed with u32_ operate on UCS-4 encoded strings. + Their unit is an uint32_t (a 4-byte word). + + All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly + n units. + + Functions returning a string result take a (resultbuf, lengthp) argument + pair. If resultbuf is not NULL and the result fits into *lengthp units, + it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly + allocated string is returned. In both cases, *lengthp is set to the + length (number of units) of the returned string. In case of error, + NULL is returned and errno is set. */ + + +enum +{ + UC_DECOMP_CANONICAL,/* Canonical decomposition. */ + UC_DECOMP_FONT, /* A font variant (e.g. a blackletter form). */ + UC_DECOMP_NOBREAK, /* A no-break version of a space or hyphen. */ + UC_DECOMP_INITIAL, /* An initial presentation form (Arabic). */ + UC_DECOMP_MEDIAL, /* A medial presentation form (Arabic). */ + UC_DECOMP_FINAL, /* A final presentation form (Arabic). */ + UC_DECOMP_ISOLATED,/* An isolated presentation form (Arabic). */ + UC_DECOMP_CIRCLE, /* An encircled form. */ + UC_DECOMP_SUPER, /* A superscript form. */ + UC_DECOMP_SUB, /* A subscript form. */ + UC_DECOMP_VERTICAL,/* A vertical layout presentation form. */ + UC_DECOMP_WIDE, /* A wide (or zenkaku) compatibility character. */ + UC_DECOMP_NARROW, /* A narrow (or hankaku) compatibility character. */ + UC_DECOMP_SMALL, /* A small variant form (CNS compatibility). */ + UC_DECOMP_SQUARE, /* A CJK squared font variant. */ + UC_DECOMP_FRACTION,/* A vulgar fraction form. */ + UC_DECOMP_COMPAT /* Otherwise unspecified compatibility character. */ +}; + +/* Maximum size of decomposition of a single Unicode character. */ +#define UC_DECOMPOSITION_MAX_LENGTH 32 + +/* Return the character decomposition mapping of a Unicode character. + DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH + ucs_t elements. + When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are + filled and N is returned. Otherwise -1 is returned. */ +extern int + uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition); + +/* Return the canonical character decomposition mapping of a Unicode character. + DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH + ucs_t elements. + When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is + returned. Otherwise -1 is returned. */ +extern int + uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition); + + +/* Attempt to combine the Unicode characters uc1, uc2. + uc1 is known to have canonical combining class 0. + Return the combination of uc1 and uc2, if it exists. + Return 0 otherwise. + Not all decompositions can be recombined using this function. See the + Unicode file CompositionExclusions.txt for details. */ +extern ucs4_t + uc_composition (ucs4_t uc1, ucs4_t uc2); + + +/* An object of type uninorm_t denotes a Unicode normalization form. */ +struct unicode_normalization_form; +typedef const struct unicode_normalization_form *uninorm_t; + +/* UNINORM_NFD: Normalization form D: canonical decomposition. */ +extern const struct unicode_normalization_form uninorm_nfd; +#define UNINORM_NFD (&uninorm_nfd) + +/* UNINORM_NFC: Normalization form C: canonical decomposition, then + canonical composition. */ +extern const struct unicode_normalization_form uninorm_nfc; +#define UNINORM_NFC (&uninorm_nfc) + +/* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */ +extern const struct unicode_normalization_form uninorm_nfkd; +#define UNINORM_NFKD (&uninorm_nfkd) + +/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then + canonical composition. */ +extern const struct unicode_normalization_form uninorm_nfkc; +#define UNINORM_NFKC (&uninorm_nfkc) + +/* Test whether a normalization form does compatibility decomposition. */ +#define uninorm_is_compat_decomposing(nf) \ + ((* (const unsigned int *) (nf) >> 0) & 1) + +/* Test whether a normalization form includes canonical composition. */ +#define uninorm_is_composing(nf) \ + ((* (const unsigned int *) (nf) >> 1) & 1) + + +/* Return the specified normalization form of a string. */ +extern uint8_t * + u8_normalize (uninorm_t nf, const uint8_t *s, size_t n, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_normalize (uninorm_t nf, const uint16_t *s, size_t n, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_normalize (uninorm_t nf, const uint32_t *s, size_t n, + uint32_t *resultbuf, size_t *lengthp); + + +#ifdef __cplusplus +} +#endif + + +#endif /* _UNINORM_H */ diff --git a/modules/uninorm/base b/modules/uninorm/base new file mode 100644 index 000000000..062a5bee3 --- /dev/null +++ b/modules/uninorm/base @@ -0,0 +1,22 @@ +Description: +Base layer for normalization forms of Unicode strings. + +Files: +lib/uninorm.h + +Depends-on: +unitypes + +configure.ac: + +Makefile.am: + +Include: +"uninorm.h" + +License: +LGPL + +Maintainer: +Bruno Haible +