diff options
-rw-r--r-- | .metadata.txt | 9 | ||||
-rw-r--r-- | CVTUTF.C | 331 | ||||
-rw-r--r-- | CVTUTF.H | 106 | ||||
-rw-r--r-- | CVTUTF7.C | 15 | ||||
-rw-r--r-- | CVTUTF7.H | 15 | ||||
-rw-r--r-- | ConvertUTF.c | 468 | ||||
-rw-r--r-- | ConvertUTF.h | 138 | ||||
-rw-r--r-- | harness.c | 395 | ||||
-rw-r--r-- | readme.txt | 28 |
9 files changed, 1044 insertions, 461 deletions
diff --git a/.metadata.txt b/.metadata.txt index 7cb857f..3c46226 100644 --- a/.metadata.txt +++ b/.metadata.txt @@ -1,4 +1,5 @@ -CVTUTF.C 2001-02-27 05:17 -CVTUTF.H 2001-02-27 05:17 -CVTUTF7.C 2001-02-27 05:17 -CVTUTF7.H 2001-02-27 05:17 +CVTUTF7.C 2001-08-23 23:56 +CVTUTF7.H 2001-08-23 23:56 +ConvertUTF.c 2001-08-23 23:56 +ConvertUTF.h 2001-08-23 23:56 +harness.c 2001-08-23 23:56 diff --git a/CVTUTF.C b/CVTUTF.C deleted file mode 100644 index 94898bc..0000000 --- a/CVTUTF.C +++ /dev/null @@ -1,331 +0,0 @@ -/* ================================================================ */ -/* -File: ConvertUTF.C -Author: Mark E. Davis -Copyright (C) 1994 Taligent, Inc. All rights reserved. - -This code is copyrighted. Under the copyright laws, this code may not -be copied, in whole or part, without prior written consent of Taligent. - -Taligent grants the right to use or reprint this code as long as this -ENTIRE copyright notice is reproduced in the code or reproduction. -The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, -EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN -NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, -WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS -INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY -LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN -IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. -BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF -LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE -LIMITATION MAY NOT APPLY TO YOU. - -RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the -government is subject to restrictions as set forth in subparagraph -(c)(l)(ii) of the Rights in Technical Data and Computer Software -clause at DFARS 252.227-7013 and FAR 52.227-19. - -This code may be protected by one or more U.S. 
and International -Patents. - -TRADEMARKS: Taligent and the Taligent Design Mark are registered -trademarks of Taligent, Inc. -*/ -/* ================================================================ */ - -#include "CVTUTF.H" - -/* ================================================================ */ - -const int halfShift = 10; -const UCS4 halfBase = 0x0010000UL; -const UCS4 halfMask = 0x3FFUL; -const UCS4 kSurrogateHighStart = 0xD800UL; -const UCS4 kSurrogateHighEnd = 0xDBFFUL; -const UCS4 kSurrogateLowStart = 0xDC00UL; -const UCS4 kSurrogateLowEnd = 0xDFFFUL; - -/* ================================================================ */ - -ConversionResult ConvertUCS4toUTF16 ( - UCS4** sourceStart, const UCS4* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd) { - ConversionResult result = ok; - register UCS4* source = *sourceStart; - register UTF16* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - }; - ch = *source++; - if (ch <= kMaximumUCS2) { - *target++ = ch; - } else if (ch > kMaximumUTF16) { - *target++ = kReplacementCharacter; - } else { - if (target + 1 >= targetEnd) { - result = targetExhausted; break; - }; - ch -= halfBase; - *target++ = (ch >> halfShift) + kSurrogateHighStart; - *target++ = (ch & halfMask) + kSurrogateLowStart; - }; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ - -ConversionResult ConvertUTF16toUCS4 ( - UTF16** sourceStart, UTF16* sourceEnd, - UCS4** targetStart, const UCS4* targetEnd) { - ConversionResult result = ok; - register UTF16* source = *sourceStart; - register UCS4* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch; - ch = *source++; - if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd && source < sourceEnd) { - register UCS4 ch2 = *source; - if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { - 
ch = ((ch - kSurrogateHighStart) << halfShift) - + (ch2 - kSurrogateLowStart) + halfBase; - ++source; - }; - }; - if (target >= targetEnd) { - result = targetExhausted; break; - }; - *target++ = ch; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ - -UCS4 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL}; -char bytesFromUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5}; - -UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; - -/* ================================================================ */ -/* This code is similar in effect to making successive calls on the -mbtowc and wctomb routines in FSS-UTF. However, it is considerably -different in code: -* it is adapted to be consistent with UTF16, -* the interface converts a whole buffer to avoid function-call overhead -* constants have been gathered. -* loops & conditionals have been removed as much as possible for -efficiency, in favor of drop-through switch statements. 
-*/ - -/* ================================================================ */ -ConversionResult ConvertUTF16toUTF8 ( - UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd) -{ - ConversionResult result = ok; - register UTF16* source = *sourceStart; - register UTF8* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch; - register unsigned short bytesToWrite = 0; - register const UCS4 byteMask = 0xBF; - register const UCS4 byteMark = 0x80; - ch = *source++; - if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd - && source < sourceEnd) { - register UCS4 ch2 = *source; - if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { - ch = ((ch - kSurrogateHighStart) << halfShift) - + (ch2 - kSurrogateLowStart) + halfBase; - ++source; - }; - }; - if (ch < 0x80) { bytesToWrite = 1; - } else if (ch < 0x800) { bytesToWrite = 2; - } else if (ch < 0x10000) { bytesToWrite = 3; - } else if (ch < 0x200000) { bytesToWrite = 4; - } else if (ch < 0x4000000) { bytesToWrite = 5; - } else if (ch <= kMaximumUCS4){ bytesToWrite = 6; - } else { bytesToWrite = 2; - ch = kReplacementCharacter; - }; /* I wish there were a smart way to avoid this conditional */ - - target += bytesToWrite; - if (target > targetEnd) { - target -= bytesToWrite; result = targetExhausted; break; - }; - switch (bytesToWrite) { /* note: code falls through cases! 
*/ - case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 1: *--target = ch | firstByteMark[bytesToWrite]; - }; - target += bytesToWrite; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ - -ConversionResult ConvertUTF8toUTF16 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd) -{ - ConversionResult result = ok; - register UTF8* source = *sourceStart; - register UTF16* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch = 0; - register unsigned short extraBytesToWrite = bytesFromUTF8[*source]; - if (source + extraBytesToWrite > sourceEnd) { - result = sourceExhausted; break; - }; - switch(extraBytesToWrite) { /* note: code falls through cases! 
*/ - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - }; - ch -= offsetsFromUTF8[extraBytesToWrite]; - - if (target >= targetEnd) { - result = targetExhausted; break; - }; - if (ch <= kMaximumUCS2) { - *target++ = ch; - } else if (ch > kMaximumUTF16) { - *target++ = kReplacementCharacter; - } else { - if (target + 1 >= targetEnd) { - result = targetExhausted; break; - }; - ch -= halfBase; - *target++ = (ch >> halfShift) + kSurrogateHighStart; - *target++ = (ch & halfMask) + kSurrogateLowStart; - }; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ -ConversionResult ConvertUCS4toUTF8 ( - UCS4** sourceStart, const UCS4* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd) -{ - ConversionResult result = ok; - register UCS4* source = *sourceStart; - register UTF8* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch; - register unsigned short bytesToWrite = 0; - register const UCS4 byteMask = 0xBF; - register const UCS4 byteMark = 0x80; - ch = *source++; - if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd - && source < sourceEnd) { - register UCS4 ch2 = *source; - if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { - ch = ((ch - kSurrogateHighStart) << halfShift) - + (ch2 - kSurrogateLowStart) + halfBase; - ++source; - }; - }; - if (ch < 0x80) { bytesToWrite = 1; - } else if (ch < 0x800) { bytesToWrite = 2; - } else if (ch < 0x10000) { bytesToWrite = 3; - } else if (ch < 0x200000) { bytesToWrite = 4; - } else if (ch < 0x4000000) { bytesToWrite = 5; - } else if (ch <= kMaximumUCS4){ bytesToWrite = 6; - } else { bytesToWrite = 2; - ch = kReplacementCharacter; - }; /* I wish there were a smart way to avoid this conditional */ - - target += bytesToWrite; - if (target > 
targetEnd) { - target -= bytesToWrite; result = targetExhausted; break; - }; - switch (bytesToWrite) { /* note: code falls through cases! */ - case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 1: *--target = ch | firstByteMark[bytesToWrite]; - }; - target += bytesToWrite; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; - -/* ================================================================ */ - -ConversionResult ConvertUTF8toUCS4 ( - UTF8** sourceStart, UTF8* sourceEnd, - UCS4** targetStart, const UCS4* targetEnd) -{ - ConversionResult result = ok; - register UTF8* source = *sourceStart; - register UCS4* target = *targetStart; - while (source < sourceEnd) { - register UCS4 ch = 0; - register unsigned short extraBytesToWrite = bytesFromUTF8[*source]; - if (source + extraBytesToWrite > sourceEnd) { - result = sourceExhausted; break; - }; - switch(extraBytesToWrite) { /* note: code falls through cases! 
*/ - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - }; - ch -= offsetsFromUTF8[extraBytesToWrite]; - - if (target >= targetEnd) { - result = targetExhausted; break; - }; - if (ch <= kMaximumUCS2) { - *target++ = ch; - } else if (ch > kMaximumUCS4) { - *target++ = kReplacementCharacter; - } else { - if (target + 1 >= targetEnd) { - result = targetExhausted; break; - }; - ch -= halfBase; - *target++ = (ch >> halfShift) + kSurrogateHighStart; - *target++ = (ch & halfMask) + kSurrogateLowStart; - }; - }; - *sourceStart = source; - *targetStart = target; - return result; -}; diff --git a/CVTUTF.H b/CVTUTF.H deleted file mode 100644 index 85fd8ef..0000000 --- a/CVTUTF.H +++ /dev/null @@ -1,106 +0,0 @@ -/* ================================================================ */ -/* -File: ConvertUTF.h -Author: Mark E. Davis -Copyright (C) 1994 Taligent, Inc. All rights reserved. - -This code is copyrighted. Under the copyright laws, this code may not -be copied, in whole or part, without prior written consent of Taligent. - -Taligent grants the right to use or reprint this code as long as this -ENTIRE copyright notice is reproduced in the code or reproduction. -The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, -EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN -NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, -WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS -INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY -LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN -IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
-BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF -LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE -LIMITATION MAY NOT APPLY TO YOU. - -RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the -government is subject to restrictions as set forth in subparagraph -(c)(l)(ii) of the Rights in Technical Data and Computer Software -clause at DFARS 252.227-7013 and FAR 52.227-19. - -This code may be protected by one or more U.S. and International -Patents. - -TRADEMARKS: Taligent and the Taligent Design Mark are registered -trademarks of Taligent, Inc. -*/ -/* ================================================================ */ - -#include <stdio.h> -#include <stdlib.h> -// #include <types.h> -#include <string.h> - -/* ================================================================ */ -/* The following 4 definitions are compiler-specific. - I would use wchar_t for UCS2/UTF16, except that the C standard - does not guarantee that it has at least 16 bits, so wchar_t is - no less portable than unsigned short! -*/ - -typedef unsigned long UCS4; -typedef unsigned short UCS2; -typedef unsigned short UTF16; -typedef unsigned char UTF8; - -typedef enum {false, true} Boolean; - - -const UCS4 kReplacementCharacter = 0x0000FFFDUL; -const UCS4 kMaximumUCS2 = 0x0000FFFFUL; -const UCS4 kMaximumUTF16 = 0x0010FFFFUL; -const UCS4 kMaximumUCS4 = 0x7FFFFFFFUL; - -/* ================================================================ */ -/* Each of these routines converts the text between *sourceStart and -sourceEnd, putting the result into the buffer between *targetStart and -targetEnd. Note: the end pointers are *after* the last item: e.g. -*(sourceEnd - 1) is the last item. - - The return result indicates whether the conversion was successful, -and if not, whether the problem was in the source or target buffers. 
- - After the conversion, *sourceStart and *targetStart are both -updated to point to the end of last text successfully converted in -the respective buffers. -*/ - -typedef enum { - ok, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted /* insuff. room in target for conversion */ -} ConversionResult; - -ConversionResult ConvertUCS4toUTF16 ( - UCS4** sourceStart, const UCS4* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd); - -ConversionResult ConvertUTF16toUCS4 ( - UTF16** sourceStart, UTF16* sourceEnd, - UCS4** targetStart, const UCS4* targetEnd); - -ConversionResult ConvertUTF16toUTF8 ( - UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd); - -ConversionResult ConvertUTF8toUTF16 ( - UTF8** sourceStart, UTF8* sourceEnd, - UTF16** targetStart, const UTF16* targetEnd); - -ConversionResult ConvertUCS4toUTF8 ( - UCS4** sourceStart, const UCS4* sourceEnd, - UTF8** targetStart, const UTF8* targetEnd); - -ConversionResult ConvertUTF8toUCS4 ( - UTF8** sourceStart, UTF8* sourceEnd, - UCS4** targetStart, const UCS4* targetEnd); - -/* ================================================================ */ @@ -2,21 +2,22 @@ /* File: ConvertUTF7.c Author: David B. Goldsmith -Copyright (C) 1994, 1996 Taligent, Inc. All rights reserved. +Copyright (C) 1994, 1996 IBM Corporation All rights reserved. +Revisions: Header update only July, 2001. This code is copyrighted. Under the copyright laws, this code may not -be copied, in whole or part, without prior written consent of Taligent. +be copied, in whole or part, without prior written consent of IBM Corporation. -Taligent grants the right to use this code as long as this ENTIRE +IBM Corporation grants the right to use this code as long as this ENTIRE copyright notice is reproduced in the code. 
The code is provided -AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR +AS-IS, AND IBM CORPORATION DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT -WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, +WILL IBM CORPORATION BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN -IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +IF IBM CORPORATION HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY NOT APPLY TO YOU. @@ -29,8 +30,6 @@ clause at DFARS 252.227-7013 and FAR 52.227-19. This code may be protected by one or more U.S. and International Patents. -TRADEMARKS: Taligent and the Taligent Design Mark are registered -trademarks of Taligent, Inc. */ #include "CVTUTF7.H" @@ -2,21 +2,22 @@ /* File: ConvertUTF7.h Author: David B. Goldsmith -Copyright (C) 1994 Taligent, Inc. All rights reserved. +Copyright (C) 1994 IBM Corporation All rights reserved. +Revisions: Header update only July, 2001. This code is copyrighted. Under the copyright laws, this code may not -be copied, in whole or part, without prior written consent of Taligent. +be copied, in whole or part, without prior written consent of IBM Corporation. -Taligent grants the right to use this code as long as this ENTIRE +IBM Corporation grants the right to use this code as long as this ENTIRE copyright notice is reproduced in the code. 
The code is provided -AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR +AS-IS, AND IBM CORPORATION DISCLAIMS ALL WARRANTIES, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT -WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, +WILL IBM CORPORATION BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN -IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. +IF IBM CORPORATION HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY NOT APPLY TO YOU. @@ -29,8 +30,6 @@ clause at DFARS 252.227-7013 and FAR 52.227-19. This code may be protected by one or more U.S. and International Patents. -TRADEMARKS: Taligent and the Taligent Design Mark are registered -trademarks of Taligent, Inc. */ /* ================================================================ */ diff --git a/ConvertUTF.c b/ConvertUTF.c new file mode 100644 index 0000000..491fa14 --- /dev/null +++ b/ConvertUTF.c @@ -0,0 +1,468 @@ +/* + * Copyright 2001 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. 
hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Source code file. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + + See the header file "ConvertUTF.h" for complete documentation. + +------------------------------------------------------------------------ */ + + +#include "ConvertUTF.h" +#ifdef CVTUTF_DEBUG +#include <stdio.h> +#endif + +static const int halfShift = 10; /* used for shifting by 10 bits */ + +static const UTF32 halfBase = 0x0010000UL; +static const UTF32 halfMask = 0x3FFUL; + +#define UNI_SUR_HIGH_START (UTF32)0xD800 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF +#define UNI_SUR_LOW_START (UTF32)0xDC00 +#define UNI_SUR_LOW_END (UTF32)0xDFFF +#define false 0 +#define true 1 + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF16 ( + UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) { + ConversionResult result = conversionOK; + UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = 
UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; + *target++ = (ch & halfMask) + UNI_SUR_LOW_START; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF32 ( + UTF16** sourceStart, UTF16* sourceEnd, + UTF32** targetStart, const UTF32* targetEnd, const ConversionFlags flags) { + ConversionResult result = conversionOK; + UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + ch = *source++; + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { + ch2 = *source; + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) { + /* an unpaired low surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + if (target >= targetEnd) { + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; +#ifdef CVTUTF_DEBUG +if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); +} +#endif + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. 
+ */ +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. + */ +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +/* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... six byte sequence.) + */ +static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +/* --------------------------------------------------------------------- */ + +/* The interface converts a whole buffer to avoid function-call overhead. + * Constants have been gathered. Loops & conditionals have been removed as + * much as possible for efficiency, in favor of drop-through switches. + * (See "Note A" at the bottom of the file for equivalent code.) + * If your compiler supports it, the "isLegalUTF8" call can be turned + * into an inline function. 
+ */ + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF8 ( + UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { + UTF32 ch2 = *source; + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; + } else { bytesToWrite = 2; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. 
*/ + case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; + case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; + case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; + case 1: *--target = ch | firstByteMark[bytesToWrite]; + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * If not calling this from ConvertUTF8to*, then the length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns false. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. + */ + +static Boolean isLegalUTF8(UTF8 *source, int length) { + UTF8 a; + UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + if (*source > 0xF4) return false; + } + return true; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. 
+ */ +Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source]+1; + if (source+length > sourceEnd) { + return false; + } + return isLegalUTF8(source, length); +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF16 ( + UTF8** sourceStart, UTF8* sourceEnd, + UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) { + ConversionResult result = conversionOK; + UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= extraBytesToRead; /* return to the start */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. 
*/ + if (target + 1 >= targetEnd) { + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; + *target++ = (ch & halfMask) + UNI_SUR_LOW_START; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF8 ( + UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + /* surrogates of any stripe are not legal UTF32 characters */ + if (flags == strictConversion ) { + if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; + } else { bytesToWrite = 2; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. 
*/ + case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; + case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; + case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; + case 1: *--target = ch | firstByteMark[bytesToWrite]; + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF32 ( + UTF8** sourceStart, UTF8* sourceEnd, + UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + result = targetExhausted; break; + } + if (ch <= UNI_MAX_UTF32) { + *target++ = ch; + } else if (ch > UNI_MAX_UTF32) { + *target++ = UNI_REPLACEMENT_CHAR; + } else { + if (target + 1 >= targetEnd) { + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; + *target++ = (ch & halfMask) + UNI_SUR_LOW_START; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- + + Note A. + The fall-through switches in UTF-8 reading code save a + temp variable, some decrements & conditionals. 
The switches + are equivalent to the following loop: + { + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); + } + In UTF-8 writing code, the switches on "bytesToWrite" are + similarly unrolled loops. + + --------------------------------------------------------------------- */ + + diff --git a/ConvertUTF.h b/ConvertUTF.h new file mode 100644 index 0000000..6798183 --- /dev/null +++ b/ConvertUTF.h @@ -0,0 +1,138 @@ +/* + * Copyright 2001 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Header file. + + Several functions are included here, forming a complete set of + conversions between the three formats. UTF-7 is not included + here, but is handled in a separate source file. + + Each of these routines takes pointers to input buffers and output + buffers. The input buffers are const. + + Each routine converts the text between *sourceStart and sourceEnd, + putting the result into the buffer between *targetStart and + targetEnd.
Note: the end pointers are *after* the last item: e.g. + *(sourceEnd - 1) is the last item. + + The return result indicates whether the conversion was successful, + and if not, whether the problem was in the source or target buffers. + (Only the first encountered problem is indicated.) + + After the conversion, *sourceStart and *targetStart are both + updated to point to the end of last text successfully converted in + the respective buffers. + + Input parameters: + sourceStart - pointer to a pointer to the source buffer. + The contents of this are modified on return so that + it points at the next thing to be converted. + targetStart - similarly, pointer to pointer to the target buffer. + sourceEnd, targetEnd - respectively pointers to the ends of the + two buffers, for overflow checking only. + + These conversion functions take a ConversionFlags argument. When this + flag is set to strict, both irregular sequences and isolated surrogates + will cause an error. When the flag is set to lenient, both irregular + sequences and isolated surrogates are converted. + + Whether the flag is strict or lenient, all illegal sequences will cause + an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, + or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code + must check for illegal sequences. + + When the flag is set to lenient, characters over 0x10FFFF are converted + to the replacement character; otherwise (when the flag is set to strict) + they constitute an error. + + Output parameters: + The value "sourceIllegal" is returned from some routines if the input + sequence is malformed. When "sourceIllegal" is returned, the source + value will point to the illegal value that caused the problem. E.g., + in UTF-8 when a sequence is malformed, it points to the start of the + malformed sequence. + + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. 
+ +------------------------------------------------------------------------ */ + +/* --------------------------------------------------------------------- + The following 4 definitions are compiler-specific. + The C standard does not guarantee that wchar_t has at least + 16 bits, so wchar_t is no less portable than unsigned short! + All should be unsigned values to avoid sign extension during + bit mask & shift operations. +------------------------------------------------------------------------ */ + +typedef unsigned long UTF32; /* at least 32 bits */ +typedef unsigned short UTF16; /* at least 16 bits */ +typedef unsigned char UTF8; /* typically 8 bits */ +typedef unsigned char Boolean; /* 0 or 1 */ + +/* Some fundamental constants */ +#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD +#define UNI_MAX_BMP (UTF32)0x0000FFFF +#define UNI_MAX_UTF16 (UTF32)0x0010FFFF +#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF + +typedef enum { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. 
room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +} ConversionResult; + +typedef enum { + strictConversion = 0, + lenientConversion +} ConversionFlags; + +ConversionResult ConvertUTF32toUTF16 ( + UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF32 ( + UTF16** sourceStart, UTF16* sourceEnd, + UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF8 ( + UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF16 ( + UTF8** sourceStart, UTF8* sourceEnd, + UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF8 ( + UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF32 ( + UTF8** sourceStart, UTF8* sourceEnd, + UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags); + +Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd); + +/* --------------------------------------------------------------------- */ diff --git a/harness.c b/harness.c new file mode 100644 index 0000000..a07792b --- /dev/null +++ b/harness.c @@ -0,0 +1,395 @@ +/* + * Copyright 2001 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. 
hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + * + * harness.c + * + * This is a test harness for "ConvertUTF.c". Compile this + * and run without arguments. It will exhaustively test + * the conversion routines, and print a few lines of diagnostic + * output. You don't need to compile ConvertUTF.c itself, + * since it gets #included here along with the header. + * Example of a compile line: + * + * $ gcc -g harness.c -o harness + * + * Rev History: Rick McGowan, new file April 2001. + * + */ + +#define CVTUTF_DEBUG 1 + +#include <stdio.h> +#include "ConvertUTF.c" + +/* --------------------------------------------------------------------- + test01 - Spot check a few legal & illegal UTF-8 values only. + This is not an exhaustive test, just a brief one that was + used to develop the "isLegalUTF8" routine. + + Legal UTF-8 sequences are: + + 1st---- 2nd---- 3rd---- 4th---- Codepoints--- + + 00-7F 0000- 007F + C2-DF 80-BF 0080- 07FF + E0 A0-BF 80-BF 0800- 0FFF + E1-EF 80-BF 80-BF 1000- FFFF + F0 90-BF 80-BF 80-BF 10000- 3FFFF + F1-F3 80-BF 80-BF 80-BF 40000- FFFFF + F4 80-8F 80-BF 80-BF 100000-10FFFF + + --------------------------------------------------------------------- */ + + +struct utf8_test { + Boolean utf8_legal; /* is legal sequence? 
*/ + int utf8_len; /* length of sequence */ + unsigned char utf8_seq[5]; /* the sequence */ +}; + +struct utf8_test utf8_testData[] = { + { 1, 1, { 0x7A, 0x00, 0x00, 0x00, 0x00 }}, /* 0 */ + { 1, 2, { 0xC2, 0xAC, 0x00, 0x00, 0x00 }}, /* 1 */ + { 1, 2, { 0xDF, 0xB2, 0x00, 0x00, 0x00 }}, /* 2 */ + { 1, 3, { 0xE0, 0xA1, 0x81, 0x00, 0x00 }}, /* 3 */ + { 1, 3, { 0xE1, 0xAC, 0x90, 0x00, 0x00 }}, /* 4 */ + { 1, 3, { 0xF0, 0x93, 0xB2, 0xA1, 0x00 }}, /* 5 */ + { 1, 4, { 0xF1, 0x87, 0x9A, 0xB0, 0x00 }}, /* 6 */ + { 1, 4, { 0xF3, 0x88, 0x9B, 0xAD, 0x00 }}, /* 7 */ + { 1, 4, { 0xF4, 0x82, 0x89, 0x8F, 0x00 }}, /* 8 */ + + { 0, 3, { 0x82, 0x00, 0x00, 0x00, 0x00 }}, /* 9 */ + { 0, 2, { 0xF8, 0xAC, 0x00, 0x00, 0x00 }}, /* 10 */ + { 0, 2, { 0xE1, 0xFC, 0xFF, 0x00, 0x00 }}, /* 11 */ + { 0, 3, { 0xC2, 0xFC, 0x00, 0x00, 0x00 }}, /* 12 */ + { 0, 3, { 0xE1, 0xC2, 0x81, 0x00, 0x00 }}, /* 13 */ + { 0, 2, { 0xC2, 0xC1, 0x00, 0x00, 0x00 }}, /* 14 */ + { 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */ + { 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */ + { 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */ +/* for all > 17 use "short" buffer lengths to detect over-run */ + { 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */ + { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }} +}; + +int test01() { + int i; + int rval, wantVal1, wantVal2, gotVal1, gotVal2, len2; + + printf("Begin Test01\n"); fflush(stdout); + + rval = 0; + for (i = 0; utf8_testData[i].utf8_len; i++) { + wantVal1 = wantVal2 = utf8_testData[i].utf8_legal; + gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len); + /* use truncated length for tests over 17 */ + if (i <= 17) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } + gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2); + if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) { + printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) 
& len %d\n", + i, gotVal1, gotVal2, wantVal1, wantVal2, utf8_testData[i].utf8_seq[0], + utf8_testData[i].utf8_seq[1], utf8_testData[i].utf8_seq[2], + utf8_testData[i].utf8_seq[3], utf8_testData[i].utf8_seq[4], + utf8_testData[i].utf8_len); + ++rval; + } + } + + return (rval ? 0 : 1); +} + + +/* --------------------------------------------------------------------- + test02 - Test round trip UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32 + + This is an exhaustive test of values 0 through 0x10FFFF. It + takes each integer value and converts from UTC4 through the + other encoding forms, and back to UTR32, checking the results + along the way. + + It does not check the un-paired low surrogates, except for + the first low surrogate. It intends to get that one illegal + result, prints a message, and continues with tests. + + --------------------------------------------------------------------- */ + +int test02() { + int i, n; + ConversionResult result; + UTF32 utf32_buf[2], utf32_result[2]; + UTF16 utf16_buf[3], utf16_result[3]; + UTF8 utf8_buf[8]; + UTF32 *utf32SourceStart, *utf32TargetStart; + UTF16 *utf16SourceStart, *utf16TargetStart; + UTF8 *utf8SourceStart, *utf8TargetStart; + + printf("Begin Test02\n"); fflush(stdout); + + for (i = 0; i <= 0x10FFFF; i++) { + utf32_buf[0] = i; utf32_buf[1] = 0; + utf32_result[0] = utf32_result[1] = 0; + utf16_buf[0] = utf16_buf[1] = utf16_buf[2] = 0; + utf16_result[0] = utf16_result[1] = utf16_result[2] = 0; + for (n = 0; n < 8; n++) utf8_buf[n] = 0; + + utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; + utf16TargetStart = utf16SourceStart = utf16_buf; + utf8TargetStart = utf8SourceStart = utf8_buf; + + /* + * Test UTF32 -> UTF16 + */ + result = ConvertUTF32toUTF16(&utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); + if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) { + /* skip result checking for all but 0000d800, which we know to be illegal */ + switch (result) { + default: 
fprintf(stderr, "Test02A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + } + if (result != conversionOK) { + if (i <= UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) { + printf("Test02A for %d, input %08x, output %04x,%04x, result %d\n", + i, utf32_buf[0], utf16_buf[0], utf16_buf[1], result); + if ((i != UNI_SUR_HIGH_START) || (result != sourceIllegal)) { + return 0; + } else { + printf("!!! Test02A: note expected illegal result for 0x0000D800\n"); + } + } + } + if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue; + + /* + * Test UTF16 -> UTF8, with legality check on. We check for everything except + * for unpaired low surrogates. We do make one check that the lowest low + * surrogate, when unpaired, is illegal. + */ + result = ConvertUTF16toUTF8(&utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion); + switch (result) { + default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + + printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n", + i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result); + if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) { + return 0; + } else { + /* Note: This illegal result only happens if we remove the surrogate + check in Test02A. So it shouldn't be seen unless that check and + the "continue" are removed in the test above. + */ + if (i == UNI_SUR_LOW_START) + printf("!!! 
Test02B: note expected illegal result for 0xDC00,0000\n"); + else if (i == UNI_SUR_HIGH_START) + printf("!!! Test02B: note expected illegal result for 0xD800,0000\n"); + } + } + if ((i == UNI_SUR_LOW_START) && result != sourceIllegal) { + printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n", + i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result); + printf("Test02B: expected illegal result for 0xDC00,0000 was not flagged illegal.\n"); + return 0; + } + + if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue; + + /* + * Reset some result buffer pointers for the trip back. + */ + utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; + utf16TargetStart = utf16SourceStart = utf16_result; + utf8TargetStart = utf8SourceStart = utf8_buf; + + /* + * Test UTF8 -> UTF16, with legality check on. + */ + result = ConvertUTF8toUTF16(&utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); + switch (result) { + default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test02C for %d (0x%x), input %s; output %04x,%04x; result %d\n", + i, utf32_buf[0], utf8_buf, utf16_buf[0], utf16_buf[1], result); + return 0; + } + for (n = 0; n < 3; n++) { /* check that the utf16 result is the same as what went in. */ + if (utf16_buf[n] != utf16_result[n]) { + printf("Test02C error: input = 0x%08x; utf16_buf = 0x%04x,0x%04x; utf16_result = 0x%04x,0x%04x\n", + utf32_buf[0], utf16_buf[0], utf16_buf[1], utf16_result[0], utf16_result[1]); + return 0; + } + } + + /* + * Test UTF16 -> UTF32, with legality check on. 
If the result of our previous + * conversion gave us a "surrogate pair", then we need to convert 2 entities + * back to UTF32. + */ + if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) { + result = ConvertUTF16toUTF32(&utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + } else { + result = ConvertUTF16toUTF32(&utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + } + switch (result) { + default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test02D for %d (0x%x), input %04x,%04x; output %08x; result %d\n", + i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf32_result[0], result); + return 0; + } + + /* + * Now, check the final round-trip value. + */ + if (utf32_buf[0] != utf32_result[0]) { + printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n", i, utf32_buf[0], utf32_result[0], utf16_buf[0], utf16_buf[1]); + return 0; + } + + + } + return 1; +} + +/* --------------------------------------------------------------------- + test03 - Test round trip UTF32 -> UTF8 -> UTF32 + + This tests the functions that were not tested by test02 above. + For each UTF32 value 0 through 0x10FFFF, it tests the conversion + to UTF-8 and back. The test is exhaustive. 
+ + --------------------------------------------------------------------- */ + +int test03() { + int i, n; + ConversionResult result; + UTF32 utf32_buf[2], utf32_result[2]; + UTF8 utf8_buf[8]; + UTF32 *utf32SourceStart, *utf32TargetStart; + UTF8 *utf8SourceStart, *utf8TargetStart; + + printf("Begin Test03\n"); fflush(stdout); + + for (i = 0; i <= 0x10FFFF; i++) { + /* Skip all surrogates except UNI_SUR_HIGH_START, which we test for illegality. */ + if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue; + + utf32_buf[0] = i; utf32_buf[1] = 0; + utf32_result[0] = utf32_result[1] = 0; + for (n = 0; n < 8; n++) utf8_buf[n] = 0; + + utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; + utf8TargetStart = utf8SourceStart = utf8_buf; + + /* + * Test UTF32 -> UTF8, with legality check on. + */ + result = ConvertUTF32toUTF8(&utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); + switch (result) { + default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test03A for %d (0x%x); output %s; result %d\n", + i, utf32_buf[0], utf8_buf, result); + if (i != UNI_SUR_HIGH_START) { + return 0; + } else { + printf("!!! Test03A: note expected illegal result for 0x0000D800\n"); + } + } + if ((i == UNI_SUR_HIGH_START) && result != sourceIllegal) { + printf("Test03A for %d (0x%x); output %s; result %d\n", + i, utf32_buf[0], utf8_buf, result); + printf("Test03A: expected illegal result for 0x0000D800 was not flagged illegal.\n"); + return 0; + } + + if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue; + + /* + * Reset some result buffer pointers for the trip back. 
+ */ + utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result; + utf8TargetStart = utf8SourceStart = utf8_buf; + + /* + * Test UTF8 -> UTF32, with legality check on. + */ + result = ConvertUTF8toUTF32(&utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion); + switch (result) { + default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + case conversionOK: break; + case sourceExhausted: printf("sourceExhausted\t"); break; + case targetExhausted: printf("targetExhausted\t"); break; + case sourceIllegal: printf("sourceIllegal\t"); break; + } + if (result != conversionOK) { + printf("Test03B for %d (0x%x), input %s; output 0x%08x; result %d\n", + i, utf32_buf[0], utf8_buf, utf32_result[0], result); + return 0; + } + + /* + * Now, check the final round-trip value. + */ + if (utf32_buf[0] != utf32_result[0]) { + printf("Test03C for %d: utf32 input %08x; utf8 buf %s; trip output %08x\n", i, utf32_buf[0], utf8_buf, utf32_result[0]); + return 0; + } + } + return 1; +} + +/* --------------------------------------------------------------------- */ + +main() { + printf("Three tests of round-trip conversions will be performed.\n"); + printf("Two illegal result messages are expected; one in test 02A; one in test 03A .\n\n"); + fflush(stdout); + if (test01()) { printf("******** Test01 succeeded without error. ********\n\n"); } + else { printf("-------- Test01 failed. --------\n\n"); } + if (test02()) { printf("******** Test02 succeeded without error. ********\n\n"); } + else { printf("-------- Test02 failed. --------\n\n"); } + if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); } + else { printf("-------- Test03 failed. --------\n\n"); } +} @@ -1,5 +1,25 @@ -The accompanying files provide for conversion between Unicode/10646 and two -different proposed transformation formats, UTF-7, UTF-8 and UTF-16. 
-Please address any comments on UTF-8 or UTF-16 to mark_davis@taligent.com, -and any comments on UTF-7 to david_goldsmith@taligent.com +The accompanying C source code file "ConvertUTF.c" and the associated header +file "ConvertUTF.h" provide for conversion between various transformation +formats of Unicode characters. The following conversions are supported: + + UCS4 to UTF16 + UCS4 to UTF8 + UTF16 to UCS4 + UTF16 to UTF8 + UTF8 to UTF16 + UTF8 to UCS4 + + +The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes +only. They have not been updated to Unicode 3.0 and should be considered +obsolescent. "CVTUTF7.C" contains two functions that can convert between +UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are not supported, +the code has not been tested, and should be considered unsuitable for general +purpose use. + +Please address any bug reports about these programs to info@unicode.org. + +Last update: July 12, 2001 + + |