From da9101dd0f5604bddab57738ea24fad068106654 Mon Sep 17 00:00:00 2001 From: Brian Carrier <carrier@sleuthkit.org> Date: Thu, 22 Sep 2011 20:51:38 +0000 Subject: [PATCH] minor update on unicode cleanup --- NEWS.txt | 1 + tsk3/base/tsk_unicode.c | 35 +++++++++++++++-------------------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/NEWS.txt b/NEWS.txt index a11a1a77b..b973f59e8 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -17,6 +17,7 @@ New Features: - new TskAuto::findFilesInFs(TSK_FS_INFO *) method - Need to only specify first E01 file and the rest are found - Changed docs license to non-commercial +- Unicode conversion routines fix invalid UTF-16 text during conversion Bug Fixes: diff --git a/tsk3/base/tsk_unicode.c b/tsk3/base/tsk_unicode.c index 391078ed4..8f3831b39 100644 --- a/tsk3/base/tsk_unicode.c +++ b/tsk3/base/tsk_unicode.c @@ -164,22 +164,20 @@ tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart, /* If the 16 bits following the high surrogate are in the source buffer... */ if (source < sourceEnd) { UTF32 ch2 = tsk_getu16(endian, (uint8_t *) source); + ++source; + /* If it's a low surrogate, convert to UTF32. */ if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; } + else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ + result = TSKsourceIllegal; + break; + } + // replace with another character else { - if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = TSKsourceIllegal; - break; - } - // replace with another character - else { - ch = '^'; - } + ch = '^'; } } else { /* We don't have the 16 bits following the high surrogate. */ @@ -278,22 +276,19 @@ tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart, /* If the 16 bits following the high surrogate are in the source buffer... */ if (source < sourceEnd) { UTF32 ch2 = *source; + source++; /* If it's a low surrogate, convert to UTF32. */ if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; } + else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ + result = TSKsourceIllegal; + break; + } + // replace with another character else { - if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = TSKsourceIllegal; - break; - } - // replace with another character - else { - ch = '^'; - } + ch = '^'; } } else { /* We don't have the 16 bits following the high surrogate. */ -- GitLab