summary refs log tree commit diff
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@lukeshu.com> 2006-07-20 23:02:04 -0400
committer Luke Shumaker <lukeshu@lukeshu.com> 2006-07-20 23:02:04 -0400
commit eb4c53216a3fac23bdca417f6d899c164fcef61a (patch)
tree 7b5c1e9bf8254b0eda1e148387cd79ad77235a24
parent e49ab49fe9202b6f875cf9cba6bad2f320e24fe4 (diff)
http://web.archive.org/web/20060720230204/http:/www.unicode.org:80/Public/BETA/CVTUTF-1-4/BETA/CVTUTF-1-4
-rw-r--r-- .metadata.txt 14
-rw-r--r-- ConvertUTF.c 24
-rw-r--r-- ExpectedOutput.txt 2
-rw-r--r-- harness.c 10
-rw-r--r-- readme.txt 7
5 files changed, 38 insertions, 19 deletions
diff --git a/.metadata.txt b/.metadata.txt
index 65edd80..b351678 100644
--- a/.metadata.txt
+++ b/.metadata.txt
@@ -1,7 +1,7 @@
-CVTUTF7.C 2004-11-02 15:08
-CVTUTF7.H 2004-11-02 15:08
-ConvertUTF.c 2004-11-02 15:08
-ConvertUTF.h 2004-11-02 15:08
-ExpectedOutput.txt 2004-11-02 15:08
-harness.c 2004-11-02 15:08
-readme.txt 2004-11-02 15:08
+CVTUTF7.C 2006-05-10 10:41
+CVTUTF7.H 2006-05-10 10:41
+ConvertUTF.c 2006-05-10 10:41
+ConvertUTF.h 2006-05-10 10:41
+ExpectedOutput.txt 2006-05-10 10:41
+harness.c 2006-05-10 10:41
+readme.txt 2006-05-10 10:41
diff --git a/ConvertUTF.c b/ConvertUTF.c
index 9b3deeb..67ab49f 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -33,6 +33,7 @@
July 2003: slight mods to back out aggressive FFFE detection.
Jan 2004: updated switches in from-UTF8 conversions.
Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
+ May 2006: updated isLegalUTF8Sequence.
See the header file "ConvertUTF.h" for complete documentation.
@@ -305,7 +306,7 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
switch (*source) {
/* no fall-through in this inner switch */
case 0xE0: if (a < 0xA0) return false; break;
- case 0xED: if (a > 0x9F) return false; break;
+ case 0xED: if ((a < 0x80) || (a > 0x9F)) return false; break;
case 0xF0: if (a < 0x90) return false; break;
case 0xF4: if (a > 0x8F) return false; break;
default: if (a < 0x80) return false;
@@ -323,12 +324,25 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
* Exported function to return whether a UTF-8 sequence is legal or not.
* This is not used here; it's just exported.
*/
+
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
- int length = trailingBytesForUTF8[*source]+1;
- if (source+length > sourceEnd) {
- return false;
+ int length;
+ if (source == sourceEnd) {
+ return true;
+ }
+ while (true) {
+ length = trailingBytesForUTF8[*source]+1;
+ if (source+length > sourceEnd) {
+ return false;
+ }
+ if (!isLegalUTF8(source, length)) {
+ return false;
+ }
+ source += length;
+ if (source >= sourceEnd) {
+ return true;
+ }
}
- return isLegalUTF8(source, length);
}
/* --------------------------------------------------------------------- */
diff --git a/ExpectedOutput.txt b/ExpectedOutput.txt
index e09d844..792bdc8 100644
--- a/ExpectedOutput.txt
+++ b/ExpectedOutput.txt
@@ -1,5 +1,5 @@
Three tests of round-trip conversions will be performed.
-One test of illegal UTF-32 will be peroformed.
+One test of illegal UTF-32 will be performed.
Two illegal result messages are expected; one in test 02A; one in test 03A.
These are for tests of Surrogate conversion.
diff --git a/harness.c b/harness.c
index 25b3e9e..fa4e191 100644
--- a/harness.c
+++ b/harness.c
@@ -36,6 +36,7 @@
* July 3, 2003: Updated printout message.
* Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
* illegal surrogate use in UTF-8, per report from Frank Tang.
+ * May 8, 2006: added ED 60 80 to isLegalUTF8 test.
*
*/
@@ -97,8 +98,9 @@ struct utf8_test utf8_testData[] = {
{ 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */
{ 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */
{ 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */
+ { 0, 3, { 0xED, 0x60, 0x80, 0x00, 0x00 }}, /* 22 */
-/* for all > 21 use "short" buffer lengths to detect over-run */
+/* for all > 22 use "short" buffer lengths to detect over-run */
{ 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */
{ 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }},
@@ -114,8 +116,8 @@ int test01() {
for (i = 0; utf8_testData[i].utf8_len; i++) {
wantVal1 = wantVal2 = utf8_testData[i].utf8_legal;
gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len);
- /* use truncated length for tests over 21 */
- if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
+ /* use truncated length for tests over 22 */
+ if (i <= 22) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2);
if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) {
printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
@@ -430,7 +432,7 @@ int test04() {
main() {
printf("Three tests of round-trip conversions will be performed.\n");
- printf("One test of illegal UTF-32 will be peroformed.\n");
+ printf("One test of illegal UTF-32 will be performed.\n");
printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
printf("These are for tests of Surrogate conversion.\n\n");
fflush(stdout);
diff --git a/readme.txt b/readme.txt
index b9f17fb..f01bd73 100644
--- a/readme.txt
+++ b/readme.txt
@@ -37,7 +37,10 @@ Version 1.3: Updated UTF-8 legality check;
updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions
Updated UTF-8 legality tests in harness.c
-
-Last update: October 19, 2004
+Version 1.4: Updated UTF-8 legality check;
+ Added test 22 for UTF-8 legality in harness.c
+ Updated isLegalUTF8Sequence to scan multiple sequences, per
+ suggestion from a user.
+Last update: May 9, 2006