From 7d347a05ce025a9aef28bcf72089e1388dd48d13 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 22 Oct 2004 05:57:51 -0500 Subject: http://web.archive.org/web/20041022055751/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-3/ --- .metadata.txt | 14 ++++++------- ConvertUTF.c | 46 ++++++++++++++++++++++++++-------------- ConvertUTF.h | 3 ++- ExpectedOutput.txt | 4 ++++ harness.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++------ readme.txt | 18 +++++++++------- 6 files changed, 110 insertions(+), 36 deletions(-) diff --git a/.metadata.txt b/.metadata.txt index 4a86b4a..08cfecf 100644 --- a/.metadata.txt +++ b/.metadata.txt @@ -1,7 +1,7 @@ -CVTUTF7.C 2004-01-06 17:42 -CVTUTF7.H 2004-01-06 17:42 -ConvertUTF.c 2004-01-06 17:42 -ConvertUTF.h 2004-01-06 17:42 -ExpectedOutput.txt 2004-01-06 17:42 -harness.c 2004-01-06 17:42 -readme.txt 2004-01-06 17:42 +CVTUTF7.C 2004-10-19 16:05 +CVTUTF7.H 2004-10-19 16:05 +ConvertUTF.c 2004-10-19 16:05 +ConvertUTF.h 2004-10-19 16:05 +ExpectedOutput.txt 2004-10-19 16:05 +harness.c 2004-10-19 16:05 +readme.txt 2004-10-19 16:08 diff --git a/ConvertUTF.c b/ConvertUTF.c index 649fbc8..9b3deeb 100644 --- a/ConvertUTF.c +++ b/ConvertUTF.c @@ -32,6 +32,7 @@ to eliminate compiler warnings. July 2003: slight mods to back out aggressive FFFE detection. Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. See the header file "ConvertUTF.h" for complete documentation. @@ -82,7 +83,7 @@ ConversionResult ConvertUTF32toUTF16 ( } else { *target++ = (UTF16)ch; /* normal case */ } - } else if (ch > UNI_MAX_UTF16) { + } else if (ch > UNI_MAX_LEGAL_UTF32) { if (flags == strictConversion) { result = sourceIllegal; } else { @@ -166,6 +167,9 @@ if (result == sourceIllegal) { /* * Index into the table below with the first byte of a UTF-8 sequence to * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. 
The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. */ static const char trailingBytesForUTF8[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -190,7 +194,8 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed * into the first byte, depending on how many bytes follow. There are * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... six byte sequence.) + * (I.e., one byte sequence, two byte... etc.). Remember that sequences + * for *legal* UTF-8 will be 4 or fewer bytes total. */ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; @@ -251,8 +256,8 @@ ConversionResult ConvertUTF16toUTF8 ( if (ch < (UTF32)0x80) { bytesToWrite = 1; } else if (ch < (UTF32)0x800) { bytesToWrite = 2; } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; - } else { bytesToWrite = 2; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; ch = UNI_REPLACEMENT_CHAR; } @@ -296,16 +301,19 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; case 2: if ((a = (*--srcptr)) > 0xBF) return false; + switch (*source) { /* no fall-through in this inner switch */ case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; case 0xF0: if (a < 0x90) return false; break; case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; + default: if (a < 0x80) return false; } - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - if (*source > 0xF4) return false; + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; } + if (*source > 0xF4) 
return false; return true; } @@ -346,8 +354,8 @@ ConversionResult ConvertUTF8toUTF16 ( * The cases all fall through. See "Note A" below. */ switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; @@ -418,13 +426,17 @@ ConversionResult ConvertUTF32toUTF8 ( break; } } - /* Figure out how many bytes the result will require */ + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ if (ch < (UTF32)0x80) { bytesToWrite = 1; } else if (ch < (UTF32)0x800) { bytesToWrite = 2; } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x200000) { bytesToWrite = 4; - } else { bytesToWrite = 2; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; } target += bytesToWrite; @@ -481,8 +493,11 @@ ConversionResult ConvertUTF8toUTF32 ( source -= (extraBytesToRead+1); /* Back up the source pointer! */ result = targetExhausted; break; } - if (ch <= UNI_MAX_UTF32) { - /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. 
+ */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { if (flags == strictConversion) { source -= (extraBytesToRead+1); /* return to the illegal value itself */ @@ -494,7 +509,8 @@ ConversionResult ConvertUTF8toUTF32 ( } else { *target++ = ch; } - } else { /* i.e., ch > UNI_MAX_UTF32 */ + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; *target++ = UNI_REPLACEMENT_CHAR; } } diff --git a/ConvertUTF.h b/ConvertUTF.h index 429ab40..e264915 100644 --- a/ConvertUTF.h +++ b/ConvertUTF.h @@ -1,5 +1,5 @@ /* - * Copyright 2001 Unicode, Inc. + * Copyright 2001-2004 Unicode, Inc. * * Disclaimer * @@ -97,6 +97,7 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_MAX_BMP (UTF32)0x0000FFFF #define UNI_MAX_UTF16 (UTF32)0x0010FFFF #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF +#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF typedef enum { conversionOK, /* conversion successful */ diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt index cf13a6a..e09d844 100644 --- a/ExpectedOutput.txt +++ b/ExpectedOutput.txt @@ -1,4 +1,5 @@ Three tests of round-trip conversions will be performed. +One test of illegal UTF-32 will be peroformed. Two illegal result messages are expected; one in test 02A; one in test 03A. These are for tests of Surrogate conversion. @@ -15,3 +16,6 @@ sourceIllegal Test03A for 55296 (0xd800); output ; result 3 !!! Test03A: note expected illegal result for 0x0000D800 ******** Test03 succeeded without error. ******** +Begin Test04 +******** Test04 succeeded without error. ******** + diff --git a/harness.c b/harness.c index 1e3dfb9..25b3e9e 100644 --- a/harness.c +++ b/harness.c @@ -1,5 +1,5 @@ /* - * Copyright 2001 Unicode, Inc. + * Copyright 2001-2004 Unicode, Inc. * * Disclaimer * @@ -34,6 +34,8 @@ * Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2] * per report from Iain Murray. * July 3, 2003: Updated printout message. 
+ * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch + * illegal surrogate use in UTF-8, per report from Frank Tang. * */ @@ -54,7 +56,9 @@ 00-7F 0000- 007F C2-DF 80-BF 0080- 07FF E0 A0-BF 80-BF 0800- 0FFF - E1-EF 80-BF 80-BF 1000- FFFF + E1-EC 80-BF 80-BF 1000- CFFF + ED 80-9F 80-BF D000- D7FF + EE-EF 80-BF 80-BF E000- FFFF F0 90-BF 80-BF 80-BF 10000- 3FFFF F1-F3 80-BF 80-BF 80-BF 40000- FFFFF F4 80-8F 80-BF 80-BF 100000-10FFFF @@ -88,9 +92,16 @@ struct utf8_test utf8_testData[] = { { 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */ { 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */ { 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */ -/* for all > 17 use "short" buffer lengths to detect over-run */ + + { 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* 18 */ + { 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */ + { 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */ + { 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */ + +/* for all > 21 use "short" buffer lengths to detect over-run */ { 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */ - { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }} + { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }}, + }; int test01() { @@ -103,8 +114,8 @@ int test01() { for (i = 0; utf8_testData[i].utf8_len; i++) { wantVal1 = wantVal2 = utf8_testData[i].utf8_legal; gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len); - /* use truncated length for tests over 17 */ - if (i <= 17) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } + /* use truncated length for tests over 21 */ + if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2); if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) { printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n", @@ -380,10 +391,46 @@ int test03() { return 1; 
} +/* --------------------------------------------------------------------- + test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8. + Expect it will be turned into UNI_REPLACEMENT_CHAR. + + --------------------------------------------------------------------- */ + +int test04() { + int i, n; + ConversionResult result; + UTF32 utf32_buf[2]; + UTF8 utf8_buf[8]; + UTF32 *utf32SourceStart, *utf32TargetStart; + UTF8 *utf8SourceStart, *utf8TargetStart; + + printf("Begin Test04\n"); fflush(stdout); + + i = 0x10FFFF + 21; /* an arbitrary value > legal */ + + utf32_buf[0] = i; utf32_buf[1] = 0; + for (n = 0; n < 8; n++) utf8_buf[n] = 0; + + utf32SourceStart = utf32_buf; + utf8TargetStart = utf8_buf; + + /* + * Test UTF32 -> UTF8, with legality check on. + */ + result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion); + if (result != sourceIllegal) { + fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1); + } + + return 1; +} + /* --------------------------------------------------------------------- */ main() { printf("Three tests of round-trip conversions will be performed.\n"); + printf("One test of illegal UTF-32 will be peroformed.\n"); printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n"); printf("These are for tests of Surrogate conversion.\n\n"); fflush(stdout); @@ -393,4 +440,6 @@ main() { else { printf("-------- Test02 failed. --------\n\n"); } if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); } else { printf("-------- Test03 failed. --------\n\n"); } + if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); } + else { printf("-------- Test04 failed. 
--------\n\n"); } } diff --git a/readme.txt b/readme.txt index 722c6f4..b9f17fb 100644 --- a/readme.txt +++ b/readme.txt @@ -3,12 +3,12 @@ The accompanying C source code file "ConvertUTF.c" and the associated header file "ConvertUTF.h" provide for conversion between various transformation formats of Unicode characters. The following conversions are supported: - UCS4 to UTF16 - UCS4 to UTF8 - UTF16 to UCS4 - UTF16 to UTF8 - UTF8 to UTF16 - UTF8 to UCS4 + UTF-32 to UTF-16 + UTF-32 to UTF-8 + UTF-16 to UTF-32 + UTF-16 to UTF-8 + UTF-8 to UTF-16 + UTF-8 to UTF-32 In addition, there is a test harness which runs various tests. @@ -33,7 +33,11 @@ Version 1.2: corrected switch statements associated with "extraBytesToRead" UTF8, but the table and this code has always catered for those, cases since at one time they were legal. +Version 1.3: Updated UTF-8 legality check; + updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions + Updated UTF-8 legality tests in harness.c + -Last update: January 6, 2004 +Last update: October 19, 2004 -- cgit v1.1-4-g5e80