From da9101dd0f5604bddab57738ea24fad068106654 Mon Sep 17 00:00:00 2001
From: Brian Carrier <carrier@sleuthkit.org>
Date: Thu, 22 Sep 2011 20:51:38 +0000
Subject: [PATCH] minor update on unicode cleanup

---
 NEWS.txt                |  1 +
 tsk3/base/tsk_unicode.c | 35 +++++++++++++++--------------------
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/NEWS.txt b/NEWS.txt
index a11a1a77b..b973f59e8 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -17,6 +17,7 @@ New Features:
 - new TskAuto::findFilesInFs(TSK_FS_INFO *) method
 - Need to only specify first E01 file and the rest are found
 - Changed docs license to non-commercial
+- Unicode conversion routines now replace invalid UTF-16 sequences (e.g. unpaired surrogates) during conversion
 
 
 Bug Fixes:
diff --git a/tsk3/base/tsk_unicode.c b/tsk3/base/tsk_unicode.c
index 391078ed4..8f3831b39 100644
--- a/tsk3/base/tsk_unicode.c
+++ b/tsk3/base/tsk_unicode.c
@@ -164,22 +164,20 @@ tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart,
             /* If the 16 bits following the high surrogate are in the source buffer... */
             if (source < sourceEnd) {
                 UTF32 ch2 = tsk_getu16(endian, (uint8_t *) source);
+                ++source;
+
                 /* If it's a low surrogate, convert to UTF32. */
                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
-                    ++source;
                 }
+                else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
+                    result = TSKsourceIllegal;
+                    break;
+                }
+                // replace with another character
                 else {
-                    if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
-                        --source;       /* return to the illegal value itself */
-                        result = TSKsourceIllegal;
-                        break;
-                    }
-                    // replace with another character
-                    else {
-                        ch = '^';
-                    }
+                    ch = '^';
                 }
             }
             else {              /* We don't have the 16 bits following the high surrogate. */
@@ -278,22 +276,19 @@ tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart,
             /* If the 16 bits following the high surrogate are in the source buffer... */
             if (source < sourceEnd) {
                 UTF32 ch2 = *source;
+                source++;
                 /* If it's a low surrogate, convert to UTF32. */
                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
-                    ++source;
                 }
+                else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
+                    result = TSKsourceIllegal;
+                    break;
+                }
+                // replace with another character
                 else {
-                    if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
-                        --source;       /* return to the illegal value itself */
-                        result = TSKsourceIllegal;
-                        break;
-                    }
-                    // replace with another character
-                    else {
-                        ch = '^';
-                    }
+                    ch = '^';
                 }
             }
             else {              /* We don't have the 16 bits following the high surrogate. */
-- 
GitLab