summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@lukeshu.com> 2002-08-21 10:22:04 -0500
committer Luke Shumaker <lukeshu@lukeshu.com> 2002-08-21 10:22:04 -0500
commit 040c6f479435a2b4f2a7cb9ef4bd65fca3ec2fcc (patch)
tree 8383fc77670817fb5481faffa33790ca15768a67
parent bd13aea5c28a1366edd0752877051bc720c33875 (diff)
http://web.archive.org/web/20020821102204/http:/www.unicode.org:80/Public/PROGRAMS/CVTUTF/
-rw-r--r-- .metadata.txt 6
-rw-r--r-- ConvertUTF.c 67
-rw-r--r-- ConvertUTF.h 44
-rw-r--r-- harness.c 14
-rw-r--r-- readme.txt 26
5 files changed, 73 insertions, 84 deletions
diff --git a/.metadata.txt b/.metadata.txt
index 3c46226..04a8f5d 100644
--- a/.metadata.txt
+++ b/.metadata.txt
@@ -1,5 +1,5 @@
CVTUTF7.C 2001-08-23 23:56
CVTUTF7.H 2001-08-23 23:56
-ConvertUTF.c 2001-08-23 23:56
-ConvertUTF.h 2001-08-23 23:56
-harness.c 2001-08-23 23:56
+ConvertUTF.c 2001-09-26 17:39
+ConvertUTF.h 2001-09-26 17:39
+harness.c 2001-09-26 17:39
diff --git a/ConvertUTF.c b/ConvertUTF.c
index 491fa14..23834c4 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -25,6 +25,8 @@
Conversions between UTF32, UTF-16, and UTF-8. Source code file.
Author: Mark E. Davis, 1994.
Rev History: Rick McGowan, fixes & updates May 2001.
+ Sept 2001: fixed const & error conditions per
+ mods suggested by S. Parent & A. Lillich.
See the header file "ConvertUTF.h" for complete documentation.
@@ -51,10 +53,10 @@ static const UTF32 halfMask = 0x3FFUL;
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF32toUTF16 (
- UTF32** sourceStart, const UTF32* sourceEnd,
- UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) {
+ const UTF32** sourceStart, const UTF32* sourceEnd,
+ UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
- UTF32* source = *sourceStart;
+ const UTF32* source = *sourceStart;
UTF16* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch;
@@ -79,6 +81,7 @@ ConversionResult ConvertUTF32toUTF16 (
} else {
/* target is a character in range 0xFFFF - 0x10FFFF. */
if (target + 1 >= targetEnd) {
+ --source; /* Back up source pointer! */
result = targetExhausted; break;
}
ch -= halfBase;
@@ -94,13 +97,14 @@ ConversionResult ConvertUTF32toUTF16 (
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF16toUTF32 (
- UTF16** sourceStart, UTF16* sourceEnd,
- UTF32** targetStart, const UTF32* targetEnd, const ConversionFlags flags) {
+ const UTF16** sourceStart, const UTF16* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
- UTF16* source = *sourceStart;
+ const UTF16* source = *sourceStart;
UTF32* target = *targetStart;
UTF32 ch, ch2;
while (source < sourceEnd) {
+ const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
ch = *source++;
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
ch2 = *source;
@@ -120,6 +124,7 @@ ConversionResult ConvertUTF16toUTF32 (
break;
}
if (target >= targetEnd) {
+ source = oldSource; /* Back up source pointer! */
result = targetExhausted; break;
}
*target++ = ch;
@@ -181,16 +186,17 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF16toUTF8 (
- UTF16** sourceStart, const UTF16* sourceEnd,
- UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) {
+ const UTF16** sourceStart, const UTF16* sourceEnd,
+ UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
- UTF16* source = *sourceStart;
+ const UTF16* source = *sourceStart;
UTF8* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch;
unsigned short bytesToWrite = 0;
const UTF32 byteMask = 0xBF;
const UTF32 byteMark = 0x80;
+ const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
ch = *source++;
/* If we have a surrogate pair, convert to UTF32 first. */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
@@ -220,6 +226,7 @@ ConversionResult ConvertUTF16toUTF8 (
target += bytesToWrite;
if (target > targetEnd) {
+ source = oldSource; /* Back up source pointer! */
target -= bytesToWrite; result = targetExhausted; break;
}
switch (bytesToWrite) { /* note: everything falls through. */
@@ -248,9 +255,9 @@ ConversionResult ConvertUTF16toUTF8 (
* definition of UTF-8 goes up to 4-byte sequences.
*/
-static Boolean isLegalUTF8(UTF8 *source, int length) {
+static Boolean isLegalUTF8(const UTF8 *source, int length) {
UTF8 a;
- UTF8 *srcptr = source+length;
+ const UTF8 *srcptr = source+length;
switch (length) {
default: return false;
/* Everything else falls through when "true"... */
@@ -276,7 +283,7 @@ static Boolean isLegalUTF8(UTF8 *source, int length) {
* Exported function to return whether a UTF-8 sequence is legal or not.
* This is not used here; it's just exported.
*/
-Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) {
+Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
int length = trailingBytesForUTF8[*source]+1;
if (source+length > sourceEnd) {
return false;
@@ -287,10 +294,10 @@ Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd) {
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF8toUTF16 (
- UTF8** sourceStart, UTF8* sourceEnd,
- UTF16** targetStart, const UTF16* targetEnd, const ConversionFlags flags) {
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
- UTF8* source = *sourceStart;
+ const UTF8* source = *sourceStart;
UTF16* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch = 0;
@@ -315,11 +322,12 @@ ConversionResult ConvertUTF8toUTF16 (
ch -= offsetsFromUTF8[extraBytesToRead];
if (target >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up source pointer! */
result = targetExhausted; break;
}
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
- --source; /* return to the illegal value itself */
+ source -= (extraBytesToRead+1); /* return to the illegal value itself */
result = sourceIllegal;
break;
} else {
@@ -328,13 +336,15 @@ ConversionResult ConvertUTF8toUTF16 (
} else if (ch > UNI_MAX_UTF16) {
if (flags == strictConversion) {
result = sourceIllegal;
- source -= extraBytesToRead; /* return to the start */
+ source -= (extraBytesToRead+1); /* return to the start */
+ break; /* Bail out; shouldn't continue */
} else {
*target++ = UNI_REPLACEMENT_CHAR;
}
} else {
/* target is a character in range 0xFFFF - 0x10FFFF. */
if (target + 1 >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up source pointer! */
result = targetExhausted; break;
}
ch -= halfBase;
@@ -350,10 +360,10 @@ ConversionResult ConvertUTF8toUTF16 (
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF32toUTF8 (
- UTF32** sourceStart, const UTF32* sourceEnd,
- UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags) {
+ const UTF32** sourceStart, const UTF32* sourceEnd,
+ UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
- UTF32* source = *sourceStart;
+ const UTF32* source = *sourceStart;
UTF8* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch;
@@ -380,6 +390,7 @@ ConversionResult ConvertUTF32toUTF8 (
target += bytesToWrite;
if (target > targetEnd) {
+ --source; /* Back up source pointer! */
target -= bytesToWrite; result = targetExhausted; break;
}
switch (bytesToWrite) { /* note: everything falls through. */
@@ -398,10 +409,10 @@ ConversionResult ConvertUTF32toUTF8 (
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF8toUTF32 (
- UTF8** sourceStart, UTF8* sourceEnd,
- UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags) {
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
ConversionResult result = conversionOK;
- UTF8* source = *sourceStart;
+ const UTF8* source = *sourceStart;
UTF32* target = *targetStart;
while (source < sourceEnd) {
UTF32 ch = 0;
@@ -426,19 +437,13 @@ ConversionResult ConvertUTF8toUTF32 (
ch -= offsetsFromUTF8[extraBytesToRead];
if (target >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up the source pointer! */
result = targetExhausted; break;
}
if (ch <= UNI_MAX_UTF32) {
*target++ = ch;
- } else if (ch > UNI_MAX_UTF32) {
+ } else { /* i.e., ch > UNI_MAX_UTF32 */
*target++ = UNI_REPLACEMENT_CHAR;
- } else {
- if (target + 1 >= targetEnd) {
- result = targetExhausted; break;
- }
- ch -= halfBase;
- *target++ = (ch >> halfShift) + UNI_SUR_HIGH_START;
- *target++ = (ch & halfMask) + UNI_SUR_LOW_START;
}
}
*sourceStart = source;
diff --git a/ConvertUTF.h b/ConvertUTF.h
index 6798183..429ab40 100644
--- a/ConvertUTF.h
+++ b/ConvertUTF.h
@@ -75,6 +75,7 @@
Author: Mark E. Davis, 1994.
Rev History: Rick McGowan, fixes & updates May 2001.
+ Fixes & updates, Sept 2001.
------------------------------------------------------------------------ */
@@ -109,30 +110,39 @@ typedef enum {
lenientConversion
} ConversionFlags;
-ConversionResult ConvertUTF32toUTF16 (
- UTF32** sourceStart, const UTF32* sourceEnd,
- UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags);
+/* This is for C++ and does no harm in C */
+#ifdef __cplusplus
+extern "C" {
+#endif
-ConversionResult ConvertUTF16toUTF32 (
- UTF16** sourceStart, UTF16* sourceEnd,
- UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags);
+ConversionResult ConvertUTF8toUTF16 (
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
ConversionResult ConvertUTF16toUTF8 (
- UTF16** sourceStart, const UTF16* sourceEnd,
- UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags);
+ const UTF16** sourceStart, const UTF16* sourceEnd,
+ UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
-ConversionResult ConvertUTF8toUTF16 (
- UTF8** sourceStart, UTF8* sourceEnd,
- UTF16** targetStart, const UTF16* targetEnd, ConversionFlags flags);
+ConversionResult ConvertUTF8toUTF32 (
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
ConversionResult ConvertUTF32toUTF8 (
- UTF32** sourceStart, const UTF32* sourceEnd,
- UTF8** targetStart, const UTF8* targetEnd, ConversionFlags flags);
+ const UTF32** sourceStart, const UTF32* sourceEnd,
+ UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
-ConversionResult ConvertUTF8toUTF32 (
- UTF8** sourceStart, UTF8* sourceEnd,
- UTF32** targetStart, const UTF32* targetEnd, ConversionFlags flags);
+ConversionResult ConvertUTF16toUTF32 (
+ const UTF16** sourceStart, const UTF16* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
+
+ConversionResult ConvertUTF32toUTF16 (
+ const UTF32** sourceStart, const UTF32* sourceEnd,
+ UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
+
+Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
-Boolean isLegalUTF8Sequence(UTF8 *source, UTF8 *sourceEnd);
+#ifdef __cplusplus
+}
+#endif
/* --------------------------------------------------------------------- */
diff --git a/harness.c b/harness.c
index a07792b..b3dd500 100644
--- a/harness.c
+++ b/harness.c
@@ -157,7 +157,7 @@ int test02() {
/*
* Test UTF32 -> UTF16
*/
- result = ConvertUTF32toUTF16(&utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
+ result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
/* skip result checking for all but 0000d800, which we know to be illegal */
switch (result) {
@@ -186,7 +186,7 @@ int test02() {
* for unpaired low surrogates. We do make one check that the lowest low
* surrogate, when unpaired, is illegal.
*/
- result = ConvertUTF16toUTF8(&utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
+ result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
switch (result) {
default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
case conversionOK: break;
@@ -230,7 +230,7 @@ int test02() {
/*
* Test UTF8 -> UTF16, with legality check on.
*/
- result = ConvertUTF8toUTF16(&utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
+ result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
switch (result) {
default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
case conversionOK: break;
@@ -257,9 +257,9 @@ int test02() {
* back to UTF32.
*/
if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) {
- result = ConvertUTF16toUTF32(&utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+ result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
} else {
- result = ConvertUTF16toUTF32(&utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+ result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
}
switch (result) {
default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
@@ -320,7 +320,7 @@ int test03() {
/*
* Test UTF32 -> UTF8, with legality check on.
*/
- result = ConvertUTF32toUTF8(&utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
+ result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
switch (result) {
default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
case conversionOK: break;
@@ -355,7 +355,7 @@ int test03() {
/*
* Test UTF8 -> UTF32, with legality check on.
*/
- result = ConvertUTF8toUTF32(&utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
+ result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
switch (result) {
default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
case conversionOK: break;
diff --git a/readme.txt b/readme.txt
deleted file mode 100644
index 9bb6a00..0000000
--- a/readme.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-
-The accompanying C source code file "ConvertUTF.c" and the associated header
-file "ConvertUTF.h" provide for conversion between various transformation
-formats of Unicode characters. The following conversions are supported:
-
- UCS4 to UTF16
- UCS4 to UTF8
- UTF16 to UCS4
- UTF16 to UTF8
- UTF8 to UTF16
- UTF8 to UCS4
-
-
-The files "CVTUTF7.C" and "CVTUTF7.H" are for archival and historical purposes
-only. They have not been updated to Unicode 3.0 and should be considered
-obsolescent. "CVTUTF7.C" contains two functions that can convert between
-UCS2 (i.e., the BMP characters only) and UTF-7. Surrogates are not supported,
-the code has not been tested, and should be considered unsuitable for general
-purpose use.
-
-Please address any bug reports about these programs to:
- http://www.unicode.org/unicode/reporting.html
-
-Last update: July 12, 2001
-
-