From 035301443ee6084c504756ded6d4e104443fcd6a Mon Sep 17 00:00:00 2001
From: Brian Carrier <carrier@sleuthkit.org>
Date: Thu, 22 Sep 2011 19:58:49 +0000
Subject: [PATCH] Replace invalid UTF-16 with ^ when converting - patch by
 Anthony Lawrence

---
 tsk3/base/tsk_unicode.c | 52 +++++++++++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/tsk3/base/tsk_unicode.c b/tsk3/base/tsk_unicode.c
index 4c04a1662..391078ed4 100644
--- a/tsk3/base/tsk_unicode.c
+++ b/tsk3/base/tsk_unicode.c
@@ -170,10 +170,16 @@ tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart,
                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
                     ++source;
                 }
-                else if (flags == TSKstrictConversion) {        /* it's an unpaired high surrogate */
-                    --source;   /* return to the illegal value itself */
-                    result = TSKsourceIllegal;
-                    break;
+                else {
+                    if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
+                        --source;       /* return to the illegal value itself */
+                        result = TSKsourceIllegal;
+                        break;
+                    }
+                    // replace with another character
+                    else {
+                        ch = '^';
+                    }
                 }
             }
             else {              /* We don't have the 16 bits following the high surrogate. */
@@ -182,14 +188,19 @@ tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart,
                 break;
             }
         }
-        else if (flags == TSKstrictConversion) {
-            /* UTF-16 surrogate values are illegal in UTF-32 */
-            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+        /* UTF-16 surrogate values are illegal in UTF-32 */
+        else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+            if (flags == TSKstrictConversion) {
                 --source;       /* return to the illegal value itself */
                 result = TSKsourceIllegal;
                 break;
             }
+            // replace with another character
+            else {
+                ch = '^';
+            }
         }
+
         /* Figure out how many bytes the result will require */
         if (ch < (UTF32) 0x80) {
             bytesToWrite = 1;
@@ -273,10 +284,16 @@ tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart,
                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
                     ++source;
                 }
-                else if (flags == TSKstrictConversion) {        /* it's an unpaired high surrogate */
-                    --source;   /* return to the illegal value itself */
-                    result = TSKsourceIllegal;
-                    break;
+                else {
+                    if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */
+                        --source;       /* return to the illegal value itself */
+                        result = TSKsourceIllegal;
+                        break;
+                    }
+                    // replace with another character
+                    else {
+                        ch = '^';
+                    }
                 }
             }
             else {              /* We don't have the 16 bits following the high surrogate. */
@@ -285,15 +302,20 @@ tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart,
                 break;
             }
         }
-        else if (flags == TSKstrictConversion) {
-            /* UTF-16 surrogate values are illegal in UTF-32 */
-            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+        /* UTF-16 surrogate values are illegal in UTF-32 */
+        else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+            if (flags == TSKstrictConversion) {
                 --source;       /* return to the illegal value itself */
                 result = TSKsourceIllegal;
                 break;
             }
+            // replace with another character
+            else {
+                ch = '^';
+            }
         }
-/* Figure out how many bytes the result will require */
+
+        /* Figure out how many bytes the result will require */
         if (ch < (UTF32) 0x80) {
             bytesToWrite = 1;
         }
-- 
GitLab