From eb4c53216a3fac23bdca417f6d899c164fcef61a Mon Sep 17 00:00:00 2001
From: Luke Shumaker <lukeshu@lukeshu.com>
Date: Thu, 20 Jul 2006 23:02:04 -0400
Subject: 
 http://web.archive.org/web/20060720230204/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-4/

---
 .metadata.txt      | 14 +++++++-------
 ConvertUTF.c       | 24 +++++++++++++++++++-----
 ExpectedOutput.txt |  2 +-
 harness.c          | 10 ++++++----
 readme.txt         |  7 +++++--
 5 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/.metadata.txt b/.metadata.txt
index 65edd80..b351678 100644
--- a/.metadata.txt
+++ b/.metadata.txt
@@ -1,7 +1,7 @@
-CVTUTF7.C               2004-11-02 15:08
-CVTUTF7.H               2004-11-02 15:08
-ConvertUTF.c            2004-11-02 15:08
-ConvertUTF.h            2004-11-02 15:08
-ExpectedOutput.txt      2004-11-02 15:08
-harness.c               2004-11-02 15:08
-readme.txt              2004-11-02 15:08
+CVTUTF7.C               2006-05-10 10:41
+CVTUTF7.H               2006-05-10 10:41
+ConvertUTF.c            2006-05-10 10:41
+ConvertUTF.h            2006-05-10 10:41
+ExpectedOutput.txt      2006-05-10 10:41
+harness.c               2006-05-10 10:41
+readme.txt              2006-05-10 10:41
diff --git a/ConvertUTF.c b/ConvertUTF.c
index 9b3deeb..67ab49f 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -33,6 +33,7 @@
     July 2003: slight mods to back out aggressive FFFE detection.
     Jan 2004: updated switches in from-UTF8 conversions.
     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
+    May 2006: updated isLegalUTF8Sequence to check every sequence up to sourceEnd.
 
     See the header file "ConvertUTF.h" for complete documentation.
 
@@ -305,7 +306,7 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
 	switch (*source) {
 	    /* no fall-through in this inner switch */
 	    case 0xE0: if (a < 0xA0) return false; break;
-	    case 0xED: if (a > 0x9F) return false; break;
+	    case 0xED: if ((a < 0x80) || (a > 0x9F)) return false; break;
 	    case 0xF0: if (a < 0x90) return false; break;
 	    case 0xF4: if (a > 0x8F) return false; break;
 	    default:   if (a < 0x80) return false;
@@ -323,12 +324,25 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
  * Exported function to return whether a UTF-8 sequence is legal or not.
  * This is not used here; it's just exported.
  */
+
 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
-    int length = trailingBytesForUTF8[*source]+1;
-    if (source+length > sourceEnd) {
-	return false;
+    int length;
+    if (source == sourceEnd) {
+        return true;
+    }
+    while (true) {
+        length = trailingBytesForUTF8[*source]+1;
+        if (source+length > sourceEnd) {
+            return false;
+        }
+        if (!isLegalUTF8(source, length)) {
+            return false;
+        }
+        source += length;
+        if (source >= sourceEnd) {
+            return true;
+        }
     }
-    return isLegalUTF8(source, length);
 }
 
 /* --------------------------------------------------------------------- */
diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt
index e09d844..792bdc8 100644
--- a/ExpectedOutput.txt
+++ b/ExpectedOutput.txt
@@ -1,5 +1,5 @@
 Three tests of round-trip conversions will be performed.
-One test of illegal UTF-32 will be peroformed.
+One test of illegal UTF-32 will be performed.
 Two illegal result messages are expected; one in test 02A; one in test 03A.
 These are for tests of Surrogate conversion.
 
diff --git a/harness.c b/harness.c
index 25b3e9e..fa4e191 100644
--- a/harness.c
+++ b/harness.c
@@ -36,6 +36,7 @@
  * July 3, 2003: Updated printout message.
  * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
  *	illegal surrogate use in UTF-8, per report from Frank Tang.
+ * May 8, 2006: Added illegal sequence ED 60 80 (test 22) to isLegalUTF8 test data.
  *
  */
 
@@ -97,8 +98,9 @@ struct utf8_test utf8_testData[] = {
     { 1,	3,	{ 0xEE, 0x80, 0x80, 0x00, 0x00 }},	/* 19 */
     { 0,	3,	{ 0xED, 0xA0, 0x80, 0x00, 0x00 }},	/* 20 */
     { 0,	3,	{ 0xED, 0xBF, 0xBF, 0x00, 0x00 }},	/* 21 */
+    { 0,	3,	{ 0xED, 0x60, 0x80, 0x00, 0x00 }},	/* 22 */
 
-/* for all > 21 use "short" buffer lengths to detect over-run */
+/* for all > 22 use "short" buffer lengths to detect over-run */
     { 0,	4,	{ 0xF0, 0x93, 0xB2, 0xC3, 0x00 }},	/* 18 use short buflen */
     { 0,	0,	{ 0x00, 0x00, 0x00, 0x00, 0x00 }},
 
@@ -114,8 +116,8 @@ int test01() {
 	for (i = 0; utf8_testData[i].utf8_len; i++) {
 		wantVal1 = wantVal2 = utf8_testData[i].utf8_legal;
 		gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len);
-		/* use truncated length for tests over 21 */
-		if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
+		/* use truncated length for tests over 22 */
+		if (i <= 22) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
 		gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2);
 		if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) {
 			printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
@@ -430,7 +432,7 @@ int test04() {
 
 main() {
 	printf("Three tests of round-trip conversions will be performed.\n");
-	printf("One test of illegal UTF-32 will be peroformed.\n");
+	printf("One test of illegal UTF-32 will be performed.\n");
 	printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
 	printf("These are for tests of Surrogate conversion.\n\n");
 	fflush(stdout);
diff --git a/readme.txt b/readme.txt
index b9f17fb..f01bd73 100644
--- a/readme.txt
+++ b/readme.txt
@@ -37,7 +37,10 @@ Version 1.3: Updated UTF-8 legality check;
 	updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions
 	Updated UTF-8 legality tests in harness.c
  
-
-Last update: October 19, 2004
+Version 1.4: Updated UTF-8 legality check to reject ED followed by a byte below 0x80;
+	Added test 22 for UTF-8 legality in harness.c
+	Updated isLegalUTF8Sequence to scan multiple sequences, per
+		suggestion from a user.
 
 
+Last update: May 9, 2006
-- 
cgit v1.2.3-2-g168b