From eb4c53216a3fac23bdca417f6d899c164fcef61a Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Thu, 20 Jul 2006 23:02:04 -0400 Subject: http://web.archive.org/web/20060720230204/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-4/ --- .metadata.txt | 14 +++++++------- ConvertUTF.c | 24 +++++++++++++++++++----- ExpectedOutput.txt | 2 +- harness.c | 10 ++++++---- readme.txt | 7 +++++-- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/.metadata.txt b/.metadata.txt index 65edd80..b351678 100644 --- a/.metadata.txt +++ b/.metadata.txt @@ -1,7 +1,7 @@ -CVTUTF7.C 2004-11-02 15:08 -CVTUTF7.H 2004-11-02 15:08 -ConvertUTF.c 2004-11-02 15:08 -ConvertUTF.h 2004-11-02 15:08 -ExpectedOutput.txt 2004-11-02 15:08 -harness.c 2004-11-02 15:08 -readme.txt 2004-11-02 15:08 +CVTUTF7.C 2006-05-10 10:41 +CVTUTF7.H 2006-05-10 10:41 +ConvertUTF.c 2006-05-10 10:41 +ConvertUTF.h 2006-05-10 10:41 +ExpectedOutput.txt 2006-05-10 10:41 +harness.c 2006-05-10 10:41 +readme.txt 2006-05-10 10:41 diff --git a/ConvertUTF.c b/ConvertUTF.c index 9b3deeb..67ab49f 100644 --- a/ConvertUTF.c +++ b/ConvertUTF.c @@ -33,6 +33,7 @@ July 2003: slight mods to back out aggressive FFFE detection. Jan 2004: updated switches in from-UTF8 conversions. Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + May 2006: updated isLegalUTF8Sequence. See the header file "ConvertUTF.h" for complete documentation. @@ -305,7 +306,7 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { switch (*source) { /* no fall-through in this inner switch */ case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; + case 0xED: if ((a < 0x80) || (a > 0x9F)) return false; break; case 0xF0: if (a < 0x90) return false; break; case 0xF4: if (a > 0x8F) return false; break; default: if (a < 0x80) return false; @@ -323,12 +324,25 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { * Exported function to return whether a UTF-8 sequence is legal or not. * This is not used here; it's just exported. */ + Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { - return false; + int length; + if (source == sourceEnd) { + return true; + } + while (true) { + length = trailingBytesForUTF8[*source]+1; + if (source+length > sourceEnd) { + return false; + } + if (!isLegalUTF8(source, length)) { + return false; + } + source += length; + if (source >= sourceEnd) { + return true; + } } - return isLegalUTF8(source, length); } /* --------------------------------------------------------------------- */ diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt index e09d844..792bdc8 100644 --- a/ExpectedOutput.txt +++ b/ExpectedOutput.txt @@ -1,5 +1,5 @@ Three tests of round-trip conversions will be performed. -One test of illegal UTF-32 will be peroformed. +One test of illegal UTF-32 will be performed. Two illegal result messages are expected; one in test 02A; one in test 03A. These are for tests of Surrogate conversion. diff --git a/harness.c b/harness.c index 25b3e9e..fa4e191 100644 --- a/harness.c +++ b/harness.c @@ -36,6 +36,7 @@ * July 3, 2003: Updated printout message. * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch * illegal surrogate use in UTF-8, per report from Frank Tang. + * May 8, 2006: added ED 60 80 to isLegalUTF8 test. * */ @@ -97,8 +98,9 @@ struct utf8_test utf8_testData[] = { { 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */ { 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */ { 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */ + { 0, 3, { 0xED, 0x60, 0x80, 0x00, 0x00 }}, /* 22 */ -/* for all > 21 use "short" buffer lengths to detect over-run */ +/* for all > 22 use "short" buffer lengths to detect over-run */ { 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */ { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }}, @@ -114,8 +116,8 @@ int test01() { for (i = 0; utf8_testData[i].utf8_len; i++) { wantVal1 = wantVal2 = utf8_testData[i].utf8_legal; gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len); - /* use truncated length for tests over 21 */ - if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } + /* use truncated length for tests over 22 */ + if (i <= 22) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; } gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2); if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) { printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n", @@ -430,7 +432,7 @@ int test04() { main() { printf("Three tests of round-trip conversions will be performed.\n"); - printf("One test of illegal UTF-32 will be peroformed.\n"); + printf("One test of illegal UTF-32 will be performed.\n"); printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n"); printf("These are for tests of Surrogate conversion.\n\n"); fflush(stdout); diff --git a/readme.txt b/readme.txt index b9f17fb..f01bd73 100644 --- a/readme.txt +++ b/readme.txt @@ -37,7 +37,10 @@ Version 1.3: Updated UTF-8 legality check; updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions Updated UTF-8 legality tests in harness.c - -Last update: October 19, 2004 +Version 1.4: Updated UTF-8 legality check; + Added test 22 for UTF-8 legality in harness.c + Updated isLegalUTF8Sequence to scan multiple sequences, per + suggestion from a user. +Last update: May 9, 2006 -- cgit v1.1-4-g5e80