From 08a7547e191867cdd03423009624af7b45630a69 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 11 Feb 2003 01:16:09 -0500 Subject: http://web.archive.org/web/20030211011609/http:/www.unicode.org:80/Public/ALPHA/CVTUTF-1-1/ --- .metadata.txt | 12 +- ConvertUTF.c | 685 ++++++++++++++++++++++++++++------------------------- ExpectedOutput.txt | 28 +++ harness.c | 11 +- readme.txt | 30 +++ 5 files changed, 436 insertions(+), 330 deletions(-) create mode 100644 ExpectedOutput.txt create mode 100644 readme.txt diff --git a/.metadata.txt b/.metadata.txt index 04a8f5d..4ff67c8 100644 --- a/.metadata.txt +++ b/.metadata.txt @@ -1,5 +1,7 @@ -CVTUTF7.C 2001-08-23 23:56 -CVTUTF7.H 2001-08-23 23:56 -ConvertUTF.c 2001-09-26 17:39 -ConvertUTF.h 2001-09-26 17:39 -harness.c 2001-09-26 17:39 +CVTUTF7.C 2002-09-20 15:27 +CVTUTF7.H 2002-09-20 15:27 +ConvertUTF.c 2002-09-20 15:27 +ConvertUTF.h 2002-09-20 15:27 +ExpectedOutput.txt 2002-09-20 15:27 +harness.c 2002-09-20 15:27 +readme.txt 2002-09-20 15:27 diff --git a/ConvertUTF.c b/ConvertUTF.c index 23834c4..0a18518 100644 --- a/ConvertUTF.c +++ b/ConvertUTF.c @@ -1,5 +1,5 @@ /* - * Copyright 2001 Unicode, Inc. + * Copyright 2001-2003 Unicode, Inc. * * Disclaimer * @@ -23,10 +23,14 @@ /* --------------------------------------------------------------------- Conversions between UTF32, UTF-16, and UTF-8. Source code file. - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Sept 2001: fixed const & error conditions per - mods suggested by S. Parent & A. Lillich. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. + June 2002: Tim Dodd added detection and handling of incomplete + source sequences, enhanced error detection, added casts + to eliminate compiler warnings. + July 2003: slight mods to back out aggressive FFFE detection. See the header file "ConvertUTF.h" for complete documentation. @@ -38,106 +42,122 @@ #include #endif -static const int halfShift = 10; /* used for shifting by 10 bits */ +static const int halfShift = 10; /* used for shifting by 10 bits */ -static const UTF32 halfBase = 0x0010000UL; -static const UTF32 halfMask = 0x3FFUL; +static const UTF32 halfBase = 0x0010000UL; +static const UTF32 halfMask = 0x3FFUL; -#define UNI_SUR_HIGH_START (UTF32)0xD800 -#define UNI_SUR_HIGH_END (UTF32)0xDBFF -#define UNI_SUR_LOW_START (UTF32)0xDC00 -#define UNI_SUR_LOW_END (UTF32)0xDFFF -#define false 0 -#define true 1 +#define UNI_SUR_HIGH_START (UTF32)0xD800 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF +#define UNI_SUR_LOW_START (UTF32)0xDC00 +#define UNI_SUR_LOW_END (UTF32)0xDFFF +#define false 0 +#define true 1 /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - } - ch = *source++; - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; - *target++ = (ch & halfMask) + UNI_SUR_LOW_START; + *target++ = UNI_REPLACEMENT_CHAR; } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); } - *sourceStart = source; - *targetStart = target; - return result; + } + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - while (source < sourceEnd) { - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { - ch2 = *source; - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) { - /* an unpaired low surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - if (target >= targetEnd) { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; } - *target++ = ch; + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ + result = targetExhausted; break; } - *sourceStart = source; - *targetStart = target; + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; #ifdef CVTUTF_DEBUG if (result == sourceIllegal) { fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); fflush(stderr); } #endif - return result; + return result; } /* --------------------------------------------------------------------- */ @@ -147,14 +167,14 @@ if (result == sourceIllegal) { * get the number of trailing bytes that are supposed to follow it. */ static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; /* @@ -163,7 +183,7 @@ static const char trailingBytesForUTF8[256] = { * in a UTF-8 sequence. */ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; /* * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed @@ -186,60 +206,71 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) { - UTF32 ch2 = *source; - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; - } else { bytesToWrite = 2; - ch = UNI_REPLACEMENT_CHAR; + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; + } else { bytesToWrite = 2; + ch = UNI_REPLACEMENT_CHAR; + } - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 1: *--target = ch | firstByteMark[bytesToWrite]; - } - target += bytesToWrite; + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); } - *sourceStart = source; - *targetStart = target; - return result; + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- */ @@ -248,7 +279,7 @@ ConversionResult ConvertUTF16toUTF8 ( * Utility routine to tell whether a sequence of bytes is legal UTF-8. * This must be called with the length pre-determined by the first byte. * If not calling this from ConvertUTF8to*, then the length can be set by: - * length = trailingBytesForUTF8[*source]+1; + * length = trailingBytesForUTF8[*source]+1; * and the sequence is illegal right away if there aren't that many bytes * available. * If presented with a length > 4, this returns false. The Unicode @@ -256,25 +287,25 @@ ConversionResult ConvertUTF16toUTF8 ( */ static Boolean isLegalUTF8(const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return false; - /* Everything else falls through when "true"... */ - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - if (*source > 0xF4) return false; + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; } - return true; + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + if (*source > 0xF4) return false; + } + return true; } /* --------------------------------------------------------------------- */ @@ -284,190 +315,204 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { * This is not used here; it's just exported. */ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { - return false; - } - return isLegalUTF8(source, length); + int length = trailingBytesForUTF8[*source]+1; + if (source+length > sourceEnd) { + return false; + } + return isLegalUTF8(source, length); } /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - source -= (extraBytesToRead+1); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START; - *target++ = (ch & halfMask) + UNI_SUR_LOW_START; + *target++ = UNI_REPLACEMENT_CHAR; } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); } - *sourceStart = source; - *targetStart = target; - return result; + } + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - /* surrogates of any stripe are not legal UTF32 characters */ - if (flags == strictConversion ) { - if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; - } else { bytesToWrite = 2; - ch = UNI_REPLACEMENT_CHAR; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6; - case 1: *--target = ch | firstByteMark[bytesToWrite]; - } - target += bytesToWrite; + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; + } else { bytesToWrite = 2; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; } - *sourceStart = source; - *targetStart = target; - return result; + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- */ ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_UTF32) { - *target++ = ch; - } else { /* i.e., ch > UNI_MAX_UTF32 */ - *target++ = UNI_REPLACEMENT_CHAR; + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_UTF32) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_UTF32 */ + *target++ = UNI_REPLACEMENT_CHAR; } - *sourceStart = source; - *targetStart = target; - return result; + } + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- - Note A. - The fall-through switches in UTF-8 reading code save a - temp variable, some decrements & conditionals. The switches - are equivalent to the following loop: - { - int tmpBytesToRead = extraBytesToRead+1; - do { - ch += *source++; - --tmpBytesToRead; - if (tmpBytesToRead) ch <<= 6; - } while (tmpBytesToRead > 0); - } - In UTF-8 writing code, the switches on "bytesToWrite" are - similarly unrolled loops. + Note A. + The fall-through switches in UTF-8 reading code save a + temp variable, some decrements & conditionals. The switches + are equivalent to the following loop: + { + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); + } + In UTF-8 writing code, the switches on "bytesToWrite" are + similarly unrolled loops. --------------------------------------------------------------------- */ - - diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt new file mode 100644 index 0000000..a1b7739 --- /dev/null +++ b/ExpectedOutput.txt @@ -0,0 +1,28 @@ +Three tests of round-trip conversions will be performed. +Notes: + Six illegal result messages are expected, three in test 02A and + three in test 03A. These are for tests of Surrogate conversion and + some non-characters. Three lines indicating success without error + should also be printed, one for each of the three tests. + +Begin Test01 +******** Test01 succeeded without error. ******** + +Begin Test02 +Test02A for 55296, input 0000d800, output 0000,0000, result 3 +!!! Test02A: note expected illegal result for 0x0000D800 +sourceIllegal Test02A for 65534, input 0000fffe, output 0000,0000, result 3 +!!! Test02A: note expected illegal result for 0x0000FFFE +sourceIllegal Test02A for 65535, input 0000ffff, output 0000,0000, result 3 +!!! Test02A: note expected illegal result for 0x0000FFFF +******** Test02 succeeded without error. ******** + +Begin Test03 +sourceIllegal Test03A for 55296 (0xd800); output ; result 3 +!!! Test03A: note expected illegal result for 0x0000D800 +sourceIllegal Test03A for 65534 (0xfffe); output ; result 3 +!!! Test03A: note expected illegal result for 0x0000FFFE +sourceIllegal Test03A for 65535 (0xffff); output ; result 3 +!!! Test03A: note expected illegal result for 0x0000FFFF +******** Test03 succeeded without error. ******** + diff --git a/harness.c b/harness.c index b3dd500..1e3dfb9 100644 --- a/harness.c +++ b/harness.c @@ -31,6 +31,9 @@ * $ gcc -g harness.c -o harness * * Rev History: Rick McGowan, new file April 2001. + * Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2] + * per report from Iain Murray. + * July 3, 2003: Updated printout message. * */ @@ -195,7 +198,6 @@ int test02() { case sourceIllegal: printf("sourceIllegal\t"); break; } if (result != conversionOK) { - printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n", i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result); if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) { @@ -230,7 +232,7 @@ int test02() { /* * Test UTF8 -> UTF16, with legality check on. */ - result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion); + result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[2]), strictConversion); switch (result) { default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); case conversionOK: break; @@ -281,8 +283,6 @@ int test02() { printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n", i, utf32_buf[0], utf32_result[0], utf16_buf[0], utf16_buf[1]); return 0; } - - } return 1; } @@ -384,7 +384,8 @@ int test03() { main() { printf("Three tests of round-trip conversions will be performed.\n"); - printf("Two illegal result messages are expected; one in test 02A; one in test 03A .\n\n"); + printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n"); + printf("These are for tests of Surrogate conversion.\n\n"); fflush(stdout); if (test01()) { printf("******** Test01 succeeded without error. ********\n\n"); } else { printf("-------- Test01 failed. --------\n\n"); } diff --git a/readme.txt b/readme.txt new file mode 100644 index 0000000..4a09ecb --- /dev/null +++ b/readme.txt @@ -0,0 +1,30 @@ + +The accompanying C source code file "ConvertUTF.c" and the associated header +file "ConvertUTF.h" provide for conversion between various transformation +formats of Unicode characters. The following conversions are supported: + + UCS4 to UTF16 + UCS4 to UTF8 + UTF16 to UCS4 + UTF16 to UTF8 + UTF8 to UTF16 + UTF8 to UCS4 + +In addition, there is a test harness which runs various tests. + +The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes +only. They have not been updated to Unicode 3.0 and should be considered +obsolescent. "CVTUTF7.C" contains two functions that can convert between +UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are not supported, +the code has not been tested, and should be considered unsuitable for general +purpose use. + +Please submit any bug reports about these programs here: + + http://www.unicode.org/unicode/reporting.html + +Version 1.0: initial version. +Version 1.1: corrected some minor problems; added stricter checks. + +Last update: September 19, 2002 + -- cgit v1.2.3-2-g168b From 69598662f36c6738ff9774d0c57271b8bf069b2c Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Tue, 29 Jul 2003 15:00:00 -0500 Subject: Synthesized listing: 20030729150099 ALPHA/CVTUTF-1-1 --- .metadata.txt | 7 ------- ExpectedOutput.txt | 15 ++------------- readme.txt | 2 +- 3 files changed, 3 insertions(+), 21 deletions(-) delete mode 100644 .metadata.txt diff --git a/.metadata.txt b/.metadata.txt deleted file mode 100644 index 4ff67c8..0000000 --- a/.metadata.txt +++ /dev/null @@ -1,7 +0,0 @@ -CVTUTF7.C 2002-09-20 15:27 -CVTUTF7.H 2002-09-20 15:27 -ConvertUTF.c 2002-09-20 15:27 -ConvertUTF.h 2002-09-20 15:27 -ExpectedOutput.txt 2002-09-20 15:27 -harness.c 2002-09-20 15:27 -readme.txt 2002-09-20 15:27 diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt index a1b7739..cf13a6a 100644 --- a/ExpectedOutput.txt +++ b/ExpectedOutput.txt @@ -1,9 +1,6 @@ Three tests of round-trip conversions will be performed. -Notes: - Six illegal result messages are expected, three in test 02A and - three in test 03A. These are for tests of Surrogate conversion and - some non-characters. Three lines indicating success without error - should also be printed, one for each of the three tests. +Two illegal result messages are expected; one in test 02A; one in test 03A. +These are for tests of Surrogate conversion. Begin Test01 ******** Test01 succeeded without error. ******** @@ -11,18 +8,10 @@ Begin Test01 Begin Test02 Test02A for 55296, input 0000d800, output 0000,0000, result 3 !!! Test02A: note expected illegal result for 0x0000D800 -sourceIllegal Test02A for 65534, input 0000fffe, output 0000,0000, result 3 -!!! Test02A: note expected illegal result for 0x0000FFFE -sourceIllegal Test02A for 65535, input 0000ffff, output 0000,0000, result 3 -!!! Test02A: note expected illegal result for 0x0000FFFF ******** Test02 succeeded without error. ******** Begin Test03 sourceIllegal Test03A for 55296 (0xd800); output ; result 3 !!! Test03A: note expected illegal result for 0x0000D800 -sourceIllegal Test03A for 65534 (0xfffe); output ; result 3 -!!! Test03A: note expected illegal result for 0x0000FFFE -sourceIllegal Test03A for 65535 (0xffff); output ; result 3 -!!! Test03A: note expected illegal result for 0x0000FFFF ******** Test03 succeeded without error. ******** diff --git a/readme.txt b/readme.txt index 4a09ecb..7be1443 100644 --- a/readme.txt +++ b/readme.txt @@ -26,5 +26,5 @@ Please submit any bug reports about these programs here: Version 1.0: initial version. Version 1.1: corrected some minor problems; added stricter checks. -Last update: September 19, 2002 +Last update: July 3, 2003 -- cgit v1.2.3-2-g168b From 766942acf8f0c0d9ef6c16ffbdedefdfda0af4b2 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Sun, 4 Apr 2004 06:02:31 -0500 Subject: http://web.archive.org/web/20040404060231/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-2/ --- .metadata.txt | 7 +++++++ ConvertUTF.c | 7 ++++++- readme.txt | 21 +++++++++++++++------ 3 files changed, 28 insertions(+), 7 deletions(-) create mode 100644 .metadata.txt diff --git a/.metadata.txt b/.metadata.txt new file mode 100644 index 0000000..4a86b4a --- /dev/null +++ b/.metadata.txt @@ -0,0 +1,7 @@ +CVTUTF7.C 2004-01-06 17:42 +CVTUTF7.H 2004-01-06 17:42 +ConvertUTF.c 2004-01-06 17:42 +ConvertUTF.h 2004-01-06 17:42 +ExpectedOutput.txt 2004-01-06 17:42 +harness.c 2004-01-06 17:42 +readme.txt 2004-01-06 17:42 diff --git a/ConvertUTF.c b/ConvertUTF.c index 0a18518..649fbc8 100644 --- a/ConvertUTF.c +++ b/ConvertUTF.c @@ -1,5 +1,5 @@ /* - * Copyright 2001-2003 Unicode, Inc. + * Copyright 2001-2004 Unicode, Inc. * * Disclaimer * @@ -31,6 +31,7 @@ source sequences, enhanced error detection, added casts to eliminate compiler warnings. July 2003: slight mods to back out aggressive FFFE detection. + Jan 2004: updated switches in from-UTF8 conversions. See the header file "ConvertUTF.h" for complete documentation. @@ -345,6 +346,8 @@ ConversionResult ConvertUTF8toUTF16 ( * The cases all fall through. See "Note A" below. */ switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; @@ -465,6 +468,8 @@ ConversionResult ConvertUTF8toUTF32 ( * The cases all fall through. See "Note A" below. */ switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; diff --git a/readme.txt b/readme.txt index 7be1443..722c6f4 100644 --- a/readme.txt +++ b/readme.txt @@ -13,18 +13,27 @@ formats of Unicode characters. The following conversions are supported: In addition, there is a test harness which runs various tests. The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes -only. They have not been updated to Unicode 3.0 and should be considered -obsolescent. "CVTUTF7.C" contains two functions that can convert between -UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are not supported, -the code has not been tested, and should be considered unsuitable for general -purpose use. +only. They have not been updated to Unicode 3.0 or later and should be +considered obsolescent. "CVTUTF7.C" contains two functions that can convert +between UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are +not supported, the code has not been tested, and should be considered +unsuitable for general purpose use. Please submit any bug reports about these programs here: http://www.unicode.org/unicode/reporting.html Version 1.0: initial version. + Version 1.1: corrected some minor problems; added stricter checks. -Last update: July 3, 2003 +Version 1.2: corrected switch statements associated with "extraBytesToRead" + in 4 & 5 byte cases, in functions for conversion from UTF8. + Note: formally, the 4 & 5 byte cases are illegal in the latest + UTF8, but the table and this code has always catered for those, + cases since at one time they were legal. + + +Last update: January 6, 2004 + -- cgit v1.2.3-2-g168b From 7d347a05ce025a9aef28bcf72089e1388dd48d13 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 22 Oct 2004 05:57:51 -0500 Subject: http://web.archive.org/web/20041022055751/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-3/ --- .metadata.txt | 14 ++++++------- ConvertUTF.c | 46 ++++++++++++++++++++++++++-------------- ConvertUTF.h | 3 ++- ExpectedOutput.txt | 4 ++++ harness.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++------ readme.txt | 18 +++++++++------- 6 files changed, 110 insertions(+), 36 deletions(-) diff --git a/.metadata.txt b/.metadata.txt index 4a86b4a..08cfecf 100644 --- a/.metadata.txt +++ b/.metadata.txt @@ -1,7 +1,7 @@ -CVTUTF7.C 2004-01-06 17:42 -CVTUTF7.H 2004-01-06 17:42 -ConvertUTF.c 2004-01-06 17:42 -ConvertUTF.h 2004-01-06 17:42 -ExpectedOutput.txt 2004-01-06 17:42 -harness.c 2004-01-06 17:42 -readme.txt 2004-01-06 17:42 +CVTUTF7.C 2004-10-19 16:05 +CVTUTF7.H 2004-10-19 16:05 +ConvertUTF.c 2004-10-19 16:05 +ConvertUTF.h 2004-10-19 16:05 +ExpectedOutput.txt 2004-10-19 16:05 +harness.c 2004-10-19 16:05 +readme.txt 2004-10-19 16:08 diff --git a/ConvertUTF.c b/ConvertUTF.c index 649fbc8..9b3deeb 100644 --- a/ConvertUTF.c +++ b/ConvertUTF.c @@ -32,6 +32,7 @@ to eliminate compiler warnings. July 2003: slight mods to back out aggressive FFFE detection. Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. See the header file "ConvertUTF.h" for complete documentation. @@ -82,7 +83,7 @@ ConversionResult ConvertUTF32toUTF16 ( } else { *target++ = (UTF16)ch; /* normal case */ } - } else if (ch > UNI_MAX_UTF16) { + } else if (ch > UNI_MAX_LEGAL_UTF32) { if (flags == strictConversion) { result = sourceIllegal; } else { @@ -166,6 +167,9 @@ if (result == sourceIllegal) { /* * Index into the table below with the first byte of a UTF-8 sequence to * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. */ static const char trailingBytesForUTF8[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -190,7 +194,8 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed * into the first byte, depending on how many bytes follow. There are * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... six byte sequence.) + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs + * for *legal* UTF-8 will be 4 or fewer bytes total. */ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; @@ -251,8 +256,8 @@ ConversionResult ConvertUTF16toUTF8 ( if (ch < (UTF32)0x80) { bytesToWrite = 1; } else if (ch < (UTF32)0x800) { bytesToWrite = 2; } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; - } else { bytesToWrite = 2; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; ch = UNI_REPLACEMENT_CHAR; } @@ -296,16 +301,19 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; case 2: if ((a = (*--srcptr)) > 0xBF) return false; + switch (*source) { /* no fall-through in this inner switch */ case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; case 0xF0: if (a < 0x90) return false; break; case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; + default: if (a < 0x80) return false; } - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - if (*source > 0xF4) return false; + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; } + if (*source > 0xF4) return false; return true; } @@ -346,8 +354,8 @@ ConversionResult ConvertUTF8toUTF16 ( * The cases all fall through. See "Note A" below. */ switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; @@ -418,13 +426,17 @@ ConversionResult ConvertUTF32toUTF8 ( break; } } - /* Figure out how many bytes the result will require */ + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ if (ch < (UTF32)0x80) { bytesToWrite = 1; } else if (ch < (UTF32)0x800) { bytesToWrite = 2; } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; - } else { bytesToWrite = 2; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; } target += bytesToWrite; @@ -481,8 +493,11 @@ ConversionResult ConvertUTF8toUTF32 ( source -= (extraBytesToRead+1); /* Back up the source pointer! */ result = targetExhausted; break; } - if (ch <= UNI_MAX_UTF32) { - /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { if (flags == strictConversion) { source -= (extraBytesToRead+1); /* return to the illegal value itself */ @@ -494,7 +509,8 @@ ConversionResult ConvertUTF8toUTF32 ( } else { *target++ = ch; } - } else { /* i.e., ch > UNI_MAX_UTF32 */ + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; *target++ = UNI_REPLACEMENT_CHAR; } } diff --git a/ConvertUTF.h b/ConvertUTF.h index 429ab40..e264915 100644 --- a/ConvertUTF.h +++ b/ConvertUTF.h @@ -1,5 +1,5 @@ /* - * Copyright 2001 Unicode, Inc. + * Copyright 2001-2004 Unicode, Inc. * * Disclaimer * @@ -97,6 +97,7 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_MAX_BMP (UTF32)0x0000FFFF #define UNI_MAX_UTF16 (UTF32)0x0010FFFF #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF +#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF typedef enum { conversionOK, /* conversion successful */ diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt index cf13a6a..e09d844 100644 --- a/ExpectedOutput.txt +++ b/ExpectedOutput.txt @@ -1,4 +1,5 @@ Three tests of round-trip conversions will be performed. +One test of illegal UTF-32 will be peroformed. Two illegal result messages are expected; one in test 02A; one in test 03A. These are for tests of Surrogate conversion. @@ -15,3 +16,6 @@ sourceIllegal Test03A for 55296 (0xd800); output ; result 3 !!! Test03A: note expected illegal result for 0x0000D800 ******** Test03 succeeded without error. ******** +Begin Test04 +******** Test04 succeeded without error. ******** + diff --git a/harness.c b/harness.c index 1e3dfb9..25b3e9e 100644 --- a/harness.c +++ b/harness.c @@ -1,5 +1,5 @@ /* - * Copyright 2001 Unicode, Inc. + * Copyright 2001-2004 Unicode, Inc. * * Disclaimer * @@ -34,6 +34,8 @@ * Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2] * per report from Iain Murray. * July 3, 2003: Updated printout message. + * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch + * illegal surrogate use in UTF-8, per report from Frank Tang. * */ @@ -54,7 +56,9 @@ 00-7F 0000- 007F C2-DF 80-BF 0080- 07FF E0 A0-BF 80-BF 0800- 0FFF - E1-EF 80-BF 80-BF 1000- FFFF + E1-EC 80-BF 80-BF 1000- CFFF + ED 80-9F 80-BF D000- D7FF + EE-EF 80-BF 80-BF E000- FFFF F0 90-BF 80-BF 80-BF 10000- 3FFFF F1-F3 80-BF 80-BF 80-BF 40000- FFFFF F4 80-8F 80-BF 80-BF 100000-10FFFF @@ -88,9 +92,16 @@ struct utf8_test utf8_testData[] = { { 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */ { 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */ { 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */ -/* for all > 17 use "short" buffer lengths to detect over-run */ + + { 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* 18 */ + { 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */ + { 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */ + { 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */ + +/* for all > 21 use "short" buffer lengths to detect over-run */ { 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */ - { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }} + { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }}, + }; int test01() { @@ -103,8 +114,8 @@ int test01() { for (i = 0; utf8_testData[i].utf8_len; i++) { wantVal1 = wantVal2 = utf8_testData[i].utf8_legal; gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len); - /* use truncated length for tests over 17 */ - if (i <= 17) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } + /* use truncated length for tests over 21 */ + if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2); if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) { printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n", @@ -380,10 +391,46 @@ int test03() { return 1; } +/* --------------------------------------------------------------------- + test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8. + Expect it will be turned into UNI_REPLACEMENT_CHAR. + + --------------------------------------------------------------------- */ + +int test04() { + int i, n; + ConversionResult result; + UTF32 utf32_buf[2]; + UTF8 utf8_buf[8]; + UTF32 *utf32SourceStart, *utf32TargetStart; + UTF8 *utf8SourceStart, *utf8TargetStart; + + printf("Begin Test04\n"); fflush(stdout); + + i = 0x10FFFF + 21; /* an arbitrary value > legal */ + + utf32_buf[0] = i; utf32_buf[1] = 0; + for (n = 0; n < 8; n++) utf8_buf[n] = 0; + + utf32SourceStart = utf32_buf; + utf8TargetStart = utf8_buf; + + /* + * Test UTF32 -> UTF8, with legality check on. + */ + result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); + if (result != sourceIllegal) { + fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + } + + return 1; +} + /* --------------------------------------------------------------------- */ main() { printf("Three tests of round-trip conversions will be performed.\n"); + printf("One test of illegal UTF-32 will be peroformed.\n"); printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n"); printf("These are for tests of Surrogate conversion.\n\n"); fflush(stdout); @@ -393,4 +440,6 @@ main() { else { printf("-------- Test02 failed. --------\n\n"); } if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); } else { printf("-------- Test03 failed. --------\n\n"); } + if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); } + else { printf("-------- Test04 failed. --------\n\n"); } } diff --git a/readme.txt b/readme.txt index 722c6f4..b9f17fb 100644 --- a/readme.txt +++ b/readme.txt @@ -3,12 +3,12 @@ The accompanying C source code file "ConvertUTF.c" and the associated header file "ConvertUTF.h" provide for conversion between various transformation formats of Unicode characters. The following conversions are supported: - UCS4 to UTF16 - UCS4 to UTF8 - UTF16 to UCS4 - UTF16 to UTF8 - UTF8 to UTF16 - UTF8 to UCS4 + UTF-32 to UTF-16 + UTF-32 to UTF-8 + UTF-16 to UTF-32 + UTF-16 to UTF-8 + UTF-8 to UTF-16 + UTF-8 to UTF-32 In addition, there is a test harness which runs various tests. @@ -33,7 +33,11 @@ Version 1.2: corrected switch statements associated with "extraBytesToRead" UTF8, but the table and this code has always catered for those, cases since at one time they were legal. +Version 1.3: Updated UTF-8 legality check; + updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions + Updated UTF-8 legality tests in harness.c + -Last update: January 6, 2004 +Last update: October 19, 2004 -- cgit v1.2.3-2-g168b