summaryrefslogtreecommitdiff
path: root/ConvertUTF.c
diff options
context:
space:
mode:
Diffstat (limited to 'ConvertUTF.c')
-rw-r--r--ConvertUTF.c46
1 files changed, 31 insertions, 15 deletions
diff --git a/ConvertUTF.c b/ConvertUTF.c
index 649fbc8..9b3deeb 100644
--- a/ConvertUTF.c
+++ b/ConvertUTF.c
@@ -32,6 +32,7 @@
to eliminate compiler warnings.
July 2003: slight mods to back out aggressive FFFE detection.
Jan 2004: updated switches in from-UTF8 conversions.
+ Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
See the header file "ConvertUTF.h" for complete documentation.
@@ -82,7 +83,7 @@ ConversionResult ConvertUTF32toUTF16 (
} else {
*target++ = (UTF16)ch; /* normal case */
}
- } else if (ch > UNI_MAX_UTF16) {
+ } else if (ch > UNI_MAX_LEGAL_UTF32) {
if (flags == strictConversion) {
result = sourceIllegal;
} else {
@@ -166,6 +167,9 @@ if (result == sourceIllegal) {
/*
* Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is
+ * left as-is for anyone who may want to do such conversion, which was
+ * allowed in earlier algorithms.
*/
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -190,7 +194,8 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
* into the first byte, depending on how many bytes follow. There are
* as many entries in this table as there are UTF-8 sequence types.
- * (I.e., one byte sequence, two byte... six byte sequence.)
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequences
+ * for *legal* UTF-8 will be 4 or fewer bytes total.
*/
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
@@ -251,8 +256,8 @@ ConversionResult ConvertUTF16toUTF8 (
if (ch < (UTF32)0x80) { bytesToWrite = 1;
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
- } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
- } else { bytesToWrite = 2;
+ } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
+ } else { bytesToWrite = 3;
ch = UNI_REPLACEMENT_CHAR;
}
@@ -296,16 +301,19 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+
switch (*source) {
/* no fall-through in this inner switch */
case 0xE0: if (a < 0xA0) return false; break;
+ case 0xED: if (a > 0x9F) return false; break;
case 0xF0: if (a < 0x90) return false; break;
case 0xF4: if (a > 0x8F) return false; break;
- default: if (a < 0x80) return false;
+ default: if (a < 0x80) return false;
}
- case 1: if (*source >= 0x80 && *source < 0xC2) return false;
- if (*source > 0xF4) return false;
+
+ case 1: if (*source >= 0x80 && *source < 0xC2) return false;
}
+ if (*source > 0xF4) return false;
return true;
}
@@ -346,8 +354,8 @@ ConversionResult ConvertUTF8toUTF16 (
* The cases all fall through. See "Note A" below.
*/
switch (extraBytesToRead) {
- case 5: ch += *source++; ch <<= 6;
- case 4: ch += *source++; ch <<= 6;
+ case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+ case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
case 3: ch += *source++; ch <<= 6;
case 2: ch += *source++; ch <<= 6;
case 1: ch += *source++; ch <<= 6;
@@ -418,13 +426,17 @@ ConversionResult ConvertUTF32toUTF8 (
break;
}
}
- /* Figure out how many bytes the result will require */
+ /*
+ * Figure out how many bytes the result will require. Turn any
+ * illegally large UTF32 things (> 0x10FFFF, past plane 16) into replacement chars.
+ */
if (ch < (UTF32)0x80) { bytesToWrite = 1;
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
- } else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
- } else { bytesToWrite = 2;
+ } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
+ } else { bytesToWrite = 3;
ch = UNI_REPLACEMENT_CHAR;
+ result = sourceIllegal;
}
target += bytesToWrite;
@@ -481,8 +493,11 @@ ConversionResult ConvertUTF8toUTF32 (
source -= (extraBytesToRead+1); /* Back up the source pointer! */
result = targetExhausted; break;
}
- if (ch <= UNI_MAX_UTF32) {
- /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch <= UNI_MAX_LEGAL_UTF32) {
+ /*
+ * UTF-16 surrogate values are illegal in UTF-32, and anything
+ * past plane 16, the last Unicode plane (> 0x10FFFF), is illegal.
+ */
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
if (flags == strictConversion) {
source -= (extraBytesToRead+1); /* return to the illegal value itself */
@@ -494,7 +509,8 @@ ConversionResult ConvertUTF8toUTF32 (
} else {
*target++ = ch;
}
- } else { /* i.e., ch > UNI_MAX_UTF32 */
+ } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
+ result = sourceIllegal;
*target++ = UNI_REPLACEMENT_CHAR;
}
}