From 035301443ee6084c504756ded6d4e104443fcd6a Mon Sep 17 00:00:00 2001 From: Brian Carrier <carrier@sleuthkit.org> Date: Thu, 22 Sep 2011 19:58:49 +0000 Subject: [PATCH] Replace invalid UTF-16 with ^ when converting - patch by Anthony Lawrence --- tsk3/base/tsk_unicode.c | 52 +++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/tsk3/base/tsk_unicode.c b/tsk3/base/tsk_unicode.c index 4c04a1662..391078ed4 100644 --- a/tsk3/base/tsk_unicode.c +++ b/tsk3/base/tsk_unicode.c @@ -170,10 +170,16 @@ tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart, + (ch2 - UNI_SUR_LOW_START) + halfBase; ++source; } - else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = TSKsourceIllegal; - break; + else { + if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = TSKsourceIllegal; + break; + } + // replace with another character + else { + ch = '^'; + } } } else { /* We don't have the 16 bits following the high surrogate. */ @@ -182,14 +188,19 @@ tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart, break; } } - else if (flags == TSKstrictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + if (flags == TSKstrictConversion) { --source; /* return to the illegal value itself */ result = TSKsourceIllegal; break; } + // replace with another character + else { + ch = '^'; + } } + /* Figure out how many bytes the result will require */ if (ch < (UTF32) 0x80) { bytesToWrite = 1; @@ -273,10 +284,16 @@ tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart, + (ch2 - UNI_SUR_LOW_START) + halfBase; ++source; } - else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = TSKsourceIllegal; - break; + else { + if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = TSKsourceIllegal; + break; + } + // replace with another character + else { + ch = '^'; + } } } else { /* We don't have the 16 bits following the high surrogate. */ @@ -285,15 +302,20 @@ tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart, break; } } - else if (flags == TSKstrictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + if (flags == TSKstrictConversion) { --source; /* return to the illegal value itself */ result = TSKsourceIllegal; break; } + // replace with another character + else { + ch = '^'; + } } -/* Figure out how many bytes the result will require */ + + /* Figure out how many bytes the result will require */ if (ch < (UTF32) 0x80) { bytesToWrite = 1; } -- GitLab