http://web.archive.org/web/20020821102204/http://www.unicode.org:80/Public/PROGRAMS/CVTUTF/

author: Luke Shumaker <lukeshu@lukeshu.com> 2002-08-21 10:22:04 -0500
committer: Luke Shumaker <lukeshu@lukeshu.com> 2002-08-21 10:22:04 -0500
commit: 040c6f479435a2b4f2a7cb9ef4bd65fca3ec2fcc (patch)
tree: 8383fc77670817fb5481faffa33790ca15768a67
parent: bd13aea5c28a1366edd0752877051bc720c33875 (diff)
5 files changed, 73 insertions, 84 deletions
diff --git a/.metadata.txt b/.metadata.txt
index 3c46226..04a8f5d 100644
--- a/.metadata.txt
+++ b/.metadata.txt
@@ -1,5 +1,5 @@
 CVTUTF7.C               2001-08-23 23:56
 CVTUTF7.H               2001-08-23 23:56
-ConvertUTF.c            2001-08-23 23:56
-ConvertUTF.h            2001-08-23 23:56
-harness.c               2001-08-23 23:56
+ConvertUTF.c            2001-09-26 17:39
+ConvertUTF.h            2001-09-26 17:39
+harness.c               2001-09-26 17:39
diff --git a/ConvertUTF.c b/ConvertUTF.c
index 491fa14..23834c4 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -25,6 +25,8 @@
     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
 	Author: Mark E. Davis, 1994.
 	Rev History: Rick McGowan, fixes & updates May 2001.
+	Sept 2001: fixed const & error conditions per
+		mods suggested by S. Parent & A. Lillich.
 
     See the header file "ConvertUTF.h" for complete documentation.
 
@@ -51,10 +53,10 @@ static const UTF32 halfMask	= 0x3FFUL;
 /* --------------------------------------------------------------------- */
 
 ConversionResult ConvertUTF32toUTF16 (
-		UTF32** sourceStart, const UTF32* sourceEnd, 
-		UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) {
+		const UTF32** sourceStart, const UTF32* sourceEnd, 
+		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
 	ConversionResult result = conversionOK;
-	UTF32* source = *sourceStart;
+	const UTF32* source = *sourceStart;
 	UTF16* target = *targetStart;
 	while (source < sourceEnd) {
 		UTF32 ch;
@@ -79,6 +81,7 @@ ConversionResult ConvertUTF32toUTF16 (
 		} else {
 			/* target is a character in range 0xFFFF - 0x10FFFF. */
 			if (target + 1 >= targetEnd) {
+				--source; /* Back up source pointer! */
 				result = targetExhausted; break;
 			}
 			ch -= halfBase;
@@ -94,13 +97,14 @@ ConversionResult ConvertUTF32toUTF16 (
 /* --------------------------------------------------------------------- */
 
 ConversionResult ConvertUTF16toUTF32 (
-		UTF16** sourceStart, UTF16* sourceEnd, 
-		UTF32** targetStart, const UTF32* targetEnd, const ConversionFlags flags) {
+		const UTF16** sourceStart, const UTF16* sourceEnd, 
+		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
 	ConversionResult result = conversionOK;
-	UTF16* source = *sourceStart;
+	const UTF16* source = *sourceStart;
 	UTF32* target = *targetStart;
 	UTF32 ch, ch2;
 	while (source < sourceEnd) {
+		const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
 		ch = *source++;
 		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
 			ch2 = *source;
@@ -120,6 +124,7 @@ ConversionResult ConvertUTF16toUTF32 (
 			break;
 		}
 		if (target >= targetEnd) {
+			source = oldSource; /* Back up source pointer! */
 			result = targetExhausted; break;
 		}
 		*target++ = ch;
@@ -181,16 +186,17 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
 /* --------------------------------------------------------------------- */
 
 ConversionResult ConvertUTF16toUTF8 (
-		UTF16** sourceStart, const UTF16* sourceEnd, 
-		UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) {
+		const UTF16** sourceStart, const UTF16* sourceEnd, 
+		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
 	ConversionResult result = conversionOK;
-	UTF16* source = *sourceStart;
+	const UTF16* source = *sourceStart;
 	UTF8* target = *targetStart;
 	while (source < sourceEnd) {
 		UTF32 ch;
 		unsigned short bytesToWrite = 0;
 		const UTF32 byteMask = 0xBF;
 		const UTF32 byteMark = 0x80; 
+		const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
 		ch = *source++;
 		/* If we have a surrogate pair, convert to UTF32 first. */
 		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
@@ -220,6 +226,7 @@ ConversionResult ConvertUTF16toUTF8 (
 
 		target += bytesToWrite;
 		if (target > targetEnd) {
+			source = oldSource; /* Back up source pointer! */
 			target -= bytesToWrite; result = targetExhausted; break;
 		}
 		switch (bytesToWrite) {	/* note: everything falls through. */
@@ -248,9 +255,9 @@ ConversionResult ConvertUTF16toUTF8 (
  * definition of UTF-8 goes up to 4-byte sequences.
  */
 
-static Boolean isLegalUTF8(UTF8 *source, int length) {
+static Boolean isLegalUTF8(const UTF8 *source, int length) {
 	UTF8 a;
-	UTF8 *srcptr = source+length;
+	const UTF8 *srcptr = source+length;
 	switch (length) {
 	default: return false;
 		/* Everything else falls through when "true"... */
@@ -276,7 +283,7 @@ static Boolean isLegalUTF8(UTF8 *source, int length) {
  * Exported function to return whether a UTF-8 sequence is legal or not.
  * This is not used here; it's just exported.
  */
-Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) {
+Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
 	int length = trailingBytesForUTF8[*source]+1;
 	if (source+length > sourceEnd) {
 	    return false;
@@ -287,10 +294,10 @@ Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) {
 /* --------------------------------------------------------------------- */
 
 ConversionResult ConvertUTF8toUTF16 (
-		UTF8** sourceStart, UTF8* sourceEnd, 
-		UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) {
+		const UTF8** sourceStart, const UTF8* sourceEnd, 
+		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
 	ConversionResult result = conversionOK;
-	UTF8* source = *sourceStart;
+	const UTF8* source = *sourceStart;
 	UTF16* target = *targetStart;
 	while (source < sourceEnd) {
 		UTF32 ch = 0;
@@ -315,11 +322,12 @@ ConversionResult ConvertUTF8toUTF16 (
 		ch -= offsetsFromUTF8[extraBytesToRead];
 
 		if (target >= targetEnd) {
+			source -= (extraBytesToRead+1);	/* Back up source pointer! */
 			result = targetExhausted; break;
 		}
 		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
 			if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
-				--source; /* return to the illegal value itself */
+				source -= (extraBytesToRead+1); /* return to the illegal value itself */
 				result = sourceIllegal;
 				break;
 			} else {
@@ -328,13 +336,15 @@ ConversionResult ConvertUTF8toUTF16 (
 		} else if (ch > UNI_MAX_UTF16) {
 			if (flags == strictConversion) {
 				result = sourceIllegal;
-				source -= extraBytesToRead; /* return to the start */
+				source -= (extraBytesToRead+1); /* return to the start */
+				break; /* Bail out; shouldn't continue */
 			} else {
 				*target++ = UNI_REPLACEMENT_CHAR;
 			}
 		} else {
 			/* target is a character in range 0xFFFF - 0x10FFFF. */
 			if (target + 1 >= targetEnd) {
+				source -= (extraBytesToRead+1);	/* Back up source pointer! */
 				result = targetExhausted; break;
 			}
 			ch -= halfBase;
@@ -350,10 +360,10 @@ ConversionResult ConvertUTF8toUTF16 (
 /* --------------------------------------------------------------------- */
 
 ConversionResult ConvertUTF32toUTF8 (
-		UTF32** sourceStart, const UTF32* sourceEnd, 
-		UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) {
+		const UTF32** sourceStart, const UTF32* sourceEnd, 
+		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
 	ConversionResult result = conversionOK;
-	UTF32* source = *sourceStart;
+	const UTF32* source = *sourceStart;
 	UTF8* target = *targetStart;
 	while (source < sourceEnd) {
 		UTF32 ch;
@@ -380,6 +390,7 @@ ConversionResult ConvertUTF32toUTF8 (
 		
 		target += bytesToWrite;
 		if (target > targetEnd) {
+			--source; /* Back up source pointer! */
 			target -= bytesToWrite; result = targetExhausted; break;
 		}
 		switch (bytesToWrite) {	/* note: everything falls through. */
@@ -398,10 +409,10 @@ ConversionResult ConvertUTF32toUTF8 (
 /* --------------------------------------------------------------------- */
 
 ConversionResult ConvertUTF8toUTF32 (
-		UTF8** sourceStart, UTF8* sourceEnd, 
-		UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags) {
+		const UTF8** sourceStart, const UTF8* sourceEnd, 
+		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
 	ConversionResult result = conversionOK;
-	UTF8* source = *sourceStart;
+	const UTF8* source = *sourceStart;
 	UTF32* target = *targetStart;
 	while (source < sourceEnd) {
 		UTF32 ch = 0;
@@ -426,19 +437,13 @@ ConversionResult ConvertUTF8toUTF32 (
 		ch -= offsetsFromUTF8[extraBytesToRead];
 
 		if (target >= targetEnd) {
+			source -= (extraBytesToRead+1);	/* Back up the source pointer! */
 			result = targetExhausted; break;
 		}
 		if (ch <= UNI_MAX_UTF32) {
 			*target++ = ch;
-		} else if (ch > UNI_MAX_UTF32) {
+		} else { /* i.e., ch > UNI_MAX_UTF32 */
 			*target++ = UNI_REPLACEMENT_CHAR;
-		} else {
-			if (target + 1 >= targetEnd) {
-				result = targetExhausted; break;
-			}
-			ch -= halfBase;
-			*target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
-			*target++ = (ch & halfMask) + UNI_SUR_LOW_START;
 		}
 	}
 	*sourceStart = source;
diff --git a/ConvertUTF.h b/ConvertUTF.h
index 6798183..429ab40 100644
--- a/ConvertUTF.h
+++ b/ConvertUTF.h
@@ -75,6 +75,7 @@
 
     Author: Mark E. Davis, 1994.
     Rev History: Rick McGowan, fixes & updates May 2001.
+		 Fixes & updates, Sept 2001.
 
 ------------------------------------------------------------------------ */
 
@@ -109,30 +110,39 @@ typedef enum {
 	lenientConversion
 } ConversionFlags;
 
-ConversionResult ConvertUTF32toUTF16 (
-		UTF32** sourceStart, const UTF32* sourceEnd, 
-		UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags);
+/* This is for C++ and does no harm in C */
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-ConversionResult ConvertUTF16toUTF32 (
-		UTF16** sourceStart, UTF16* sourceEnd, 
-		UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags);
+ConversionResult ConvertUTF8toUTF16 (
+		const UTF8** sourceStart, const UTF8* sourceEnd, 
+		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
 
 ConversionResult ConvertUTF16toUTF8 (
-		UTF16** sourceStart, const UTF16* sourceEnd, 
-		UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags);
+		const UTF16** sourceStart, const UTF16* sourceEnd, 
+		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
 		
-ConversionResult ConvertUTF8toUTF16 (
-		UTF8** sourceStart, UTF8* sourceEnd, 
-		UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags);
+ConversionResult ConvertUTF8toUTF32 (
+		const UTF8** sourceStart, const UTF8* sourceEnd, 
+		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
 
 ConversionResult ConvertUTF32toUTF8 (
-		UTF32** sourceStart, const UTF32* sourceEnd, 
-		UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags);
+		const UTF32** sourceStart, const UTF32* sourceEnd, 
+		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
 		
-ConversionResult ConvertUTF8toUTF32 (
-		UTF8** sourceStart, UTF8* sourceEnd, 
-		UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags);
+ConversionResult ConvertUTF16toUTF32 (
+		const UTF16** sourceStart, const UTF16* sourceEnd, 
+		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
+
+ConversionResult ConvertUTF32toUTF16 (
+		const UTF32** sourceStart, const UTF32* sourceEnd, 
+		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
+
+Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
 
-Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd);
+#ifdef __cplusplus
+}
+#endif
 
 /* --------------------------------------------------------------------- */
diff --git a/harness.c b/harness.c
index a07792b..b3dd500 100644
--- a/harness.c
+++ b/harness.c
@@ -157,7 +157,7 @@ int test02() {
 		/*
 		 * Test UTF32 -> UTF16
 		 */
-		result = ConvertUTF32toUTF16(&utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
+		result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
 		if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
 			/* skip result checking for all but 0000d800, which we know to be illegal */
 			switch (result) {
@@ -186,7 +186,7 @@ int test02() {
 		 * for unpaired low surrogates.  We do make one check that the lowest low
 		 * surrogate, when unpaired, is illegal.
 		 */
-		result = ConvertUTF16toUTF8(&utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
+		result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
 		switch (result) {
 		default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
 		case conversionOK: break;
@@ -230,7 +230,7 @@ int test02() {
 		/*
 		 * Test UTF8 -> UTF16, with legality check on.
 		 */
-		result = ConvertUTF8toUTF16(&utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
+		result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
 		switch (result) {
 		default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
 		case conversionOK: break;
@@ -257,9 +257,9 @@ int test02() {
 		 * back to UTF32.
 		 */
 		if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) {
-			result = ConvertUTF16toUTF32(&utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+			result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
 		} else {
-			result = ConvertUTF16toUTF32(&utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+			result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
 		}
 		switch (result) {
 		default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
@@ -320,7 +320,7 @@ int test03() {
 		/*
 		 * Test UTF32 -> UTF8, with legality check on.
 		 */
-		result = ConvertUTF32toUTF8(&utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
+		result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
 		switch (result) {
 		default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
 		case conversionOK: break;
@@ -355,7 +355,7 @@ int test03() {
 		/*
 		 * Test UTF8 -> UTF32, with legality check on.
 		 */
-		result = ConvertUTF8toUTF32(&utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+		result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
 		switch (result) {
 		default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
 		case conversionOK: break;
diff --git a/readme.txt b/readme.txt
deleted file mode 100644
index 9bb6a00..0000000
--- a/readme.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-
-The accompanying C source code file "ConvertUTF.c" and the associated header
-file "ConvertUTF.h" provide for conversion between various transformation
-formats of Unicode characters.  The following conversions are supported:
-
-	UCS4 to UTF16
-	UCS4 to UTF8
-	UTF16 to UCS4
-	UTF16 to UTF8
-	UTF8 to UTF16
-	UTF8 to UCS4
-
-
-The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes
-only.  They have not been updated to Unicode 3.0 and should be considered
-obsolescent.  "CVTUTF7.C" contains two functions that can convert between
-UCS2 (i.e., the BMP characters only) and UTF-7.  Surrogates are not supported,
-the code has not been tested, and should be considered unsuitable for general
-purpose use.
-
-Please address any bug reports about these programs to:
-	http://www.unicode.org/unicode/reporting.html
-
-Last update: July 12, 2001
-
-
author	Luke Shumaker <lukeshu@lukeshu.com>	2002-08-21 10:22:04 -0500
committer	Luke Shumaker <lukeshu@lukeshu.com>	2002-08-21 10:22:04 -0500
commit	040c6f479435a2b4f2a7cb9ef4bd65fca3ec2fcc (patch)
tree	8383fc77670817fb5481faffa33790ca15768a67
parent	bd13aea5c28a1366edd0752877051bc720c33875 (diff)