diff --git a/CHANGES.txt b/CHANGES.txt index 2b3bbd8abaf4fd313f3244917f4cf34c1e5febf3..41909027b49b23cf4dc56589f028c9dd2a407295 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -62,6 +62,12 @@ about the ext2 and ext3 types (it knew about ext and the older forms). Also added support for ufs1, ufs2, iso, and hfs. reported by Vinogratzky. +4/11/09: Bug Fix: Fixed issue 2725799 regarding ifind not converting +UTF16 names properly on Windows because it was using endian ordering +of file system and not local system. Created new conversion function +and changed a couple of other spots that had similar bug. Reported +by Rob Joyce. + ---------------- VERSION 3.0.0 -------------- 0/00/00: Update: Many, many, many API changes. diff --git a/tsk3/base/tsk_base_i.h b/tsk3/base/tsk_base_i.h index e93b94704702a8a8eb2ad095f6dad91f4693ed7a..24772b63934fa690fce6547b833bbfa356c51d01 100644 --- a/tsk3/base/tsk_base_i.h +++ b/tsk3/base/tsk_base_i.h @@ -39,14 +39,15 @@ extern "C" { (roundup((x),(y)) - (y))) #endif -extern void *tsk_malloc(size_t); -extern void *tsk_realloc(void *, size_t); + extern void *tsk_malloc(size_t); + extern void *tsk_realloc(void *, size_t); // getopt for windows #ifdef TSK_WIN32 extern int tsk_optind; extern TSK_TCHAR *tsk_optarg; - extern int tsk_getopt(int argc, TSK_TCHAR * const argv[], const TSK_TCHAR * optstring); + extern int tsk_getopt(int argc, TSK_TCHAR * const argv[], + const TSK_TCHAR * optstring); #endif @@ -55,9 +56,9 @@ extern void *tsk_realloc(void *, size_t); #define TSK_ERRSTR_L 512 #define TSK_ERRSTR_PR_L ((TSK_ERRSTR_L << 2) + 64) -extern char tsk_errstr[TSK_ERRSTR_L]; -extern char tsk_errstr2[TSK_ERRSTR_L]; -extern char tsk_errstr_print[TSK_ERRSTR_PR_L]; + extern char tsk_errstr[TSK_ERRSTR_L]; + extern char tsk_errstr2[TSK_ERRSTR_L]; + extern char tsk_errstr_print[TSK_ERRSTR_PR_L]; @@ -65,8 +66,10 @@ extern char tsk_errstr_print[TSK_ERRSTR_PR_L]; /* macros to read in multi-byte fields * file system is an array of 8-bit values, not 32-bit values 
*/ -extern uint8_t tsk_guess_end_u16(TSK_ENDIAN_ENUM *, uint8_t *, uint16_t); -extern uint8_t tsk_guess_end_u32(TSK_ENDIAN_ENUM *, uint8_t *, uint32_t); + extern uint8_t tsk_guess_end_u16(TSK_ENDIAN_ENUM *, uint8_t *, + uint16_t); + extern uint8_t tsk_guess_end_u32(TSK_ENDIAN_ENUM *, uint8_t *, + uint32_t); /** \internal * Read a 16-bit unsigned value. @@ -280,33 +283,39 @@ extern uint8_t tsk_guess_end_u32(TSK_ENDIAN_ENUM *, uint8_t *, uint32_t); ------------------------------------------------------------------------ */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ + typedef unsigned short UTF16; /* at least 16 bits */ + typedef unsigned char UTF8; /* typically 8 bits */ + typedef unsigned char Boolean; /* 0 or 1 */ -typedef enum { - TSKconversionOK, ///< conversion successful - TSKsourceExhausted, ///< partial character in source, but hit end - TSKtargetExhausted, ///< insuff. room in target for conversion - TSKsourceIllegal ///< source sequence is illegal/malformed -} TSKConversionResult; + typedef enum { + TSKconversionOK, ///< conversion successful + TSKsourceExhausted, ///< partial character in source, but hit end + TSKtargetExhausted, ///< insuff. 
room in target for conversion + TSKsourceIllegal ///< source sequence is illegal/malformed + } TSKConversionResult; -typedef enum { - TSKstrictConversion = 0, ///< Error if invalid surrogate pairs are found - TSKlenientConversion ///< Ignore invalid surrogate pairs -} TSKConversionFlags; + typedef enum { + TSKstrictConversion = 0, ///< Error if invalid surrogate pairs are found + TSKlenientConversion ///< Ignore invalid surrogate pairs + } TSKConversionFlags; -TSKConversionResult tsk_UTF8toUTF16(const UTF8 ** sourceStart, - const UTF8 * sourceEnd, - UTF16 ** targetStart, UTF16 * targetEnd, TSKConversionFlags flags); + extern TSKConversionResult tsk_UTF8toUTF16(const UTF8 ** sourceStart, + const UTF8 * sourceEnd, + UTF16 ** targetStart, UTF16 * targetEnd, TSKConversionFlags flags); -TSKConversionResult tsk_UTF16toUTF8(TSK_ENDIAN_ENUM, - const UTF16 ** sourceStart, const UTF16 * sourceEnd, - UTF8 ** targetStart, UTF8 * targetEnd, TSKConversionFlags flags); + extern TSKConversionResult tsk_UTF16toUTF8(TSK_ENDIAN_ENUM, + const UTF16 ** sourceStart, const UTF16 * sourceEnd, + UTF8 ** targetStart, UTF8 * targetEnd, TSKConversionFlags flags); -Boolean tsk_isLegalUTF8Sequence(const UTF8 * source, - const UTF8 * sourceEnd); + extern TSKConversionResult + tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart, + const UTF16 * sourceEnd, UTF8 ** targetStart, + UTF8 * targetEnd, TSKConversionFlags flags); + + + extern Boolean tsk_isLegalUTF8Sequence(const UTF8 * source, + const UTF8 * sourceEnd); #endif //@} diff --git a/tsk3/base/tsk_unicode.c b/tsk3/base/tsk_unicode.c index eccd6f85d9a093525d1d0a7971ee4bc28937eebc..4c04a1662240fb3a9d30bbcb0a110c733044068e 100644 --- a/tsk3/base/tsk_unicode.c +++ b/tsk3/base/tsk_unicode.c @@ -235,6 +235,109 @@ tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart, return result; } + +/** +* \ingroup baselib +* Convert a UTF-16 string in local endian ordering to UTF-8. 
+* @param sourceStart Pointer to pointer to start of UTF-16 string. Will be updated to last char processed. +* @param sourceEnd Pointer to one entry past end of UTF-16 string +* @param targetStart Pointer to pointer to place where UTF-8 string should be written. Will be updated to next place to write to. +* @param targetEnd Pointer to end of UTF-8 buffer +* @param flags Flags used during conversion +* @returns error code +*/ +TSKConversionResult +tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart, + const UTF16 * sourceEnd, UTF8 ** targetStart, + UTF8 * targetEnd, TSKConversionFlags flags) +{ + TSKConversionResult result = TSKconversionOK; + const UTF16 *source = *sourceStart; + UTF8 *target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } + else if (flags == TSKstrictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = TSKsourceIllegal; + break; + } + } + else { /* We don't have the 16 bits following the high surrogate. 
*/ + --source; /* return to the high surrogate */ + result = TSKsourceExhausted; + break; + } + } + else if (flags == TSKstrictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = TSKsourceIllegal; + break; + } + } +/* Figure out how many bytes the result will require */ + if (ch < (UTF32) 0x80) { + bytesToWrite = 1; + } + else if (ch < (UTF32) 0x800) { + bytesToWrite = 2; + } + else if (ch < (UTF32) 0x10000) { + bytesToWrite = 3; + } + else if (ch < (UTF32) 0x110000) { + bytesToWrite = 4; + } + else { + bytesToWrite = 3; + ch = TSK_UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; + result = TSKtargetExhausted; + break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: + *--target = (UTF8) ((ch | byteMark) & byteMask); + ch >>= 6; + case 3: + *--target = (UTF8) ((ch | byteMark) & byteMask); + ch >>= 6; + case 2: + *--target = (UTF8) ((ch | byteMark) & byteMask); + ch >>= 6; + case 1: + *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + /* --------------------------------------------------------------------- */ /* diff --git a/tsk3/fs/fls_lib.c b/tsk3/fs/fls_lib.c index 3d7619d02f160e7393a9294f37b4389313b77c0d..8b5b0ece5e8ea876cf8b589f990f5a2989d6a3ee 100644 --- a/tsk3/fs/fls_lib.c +++ b/tsk3/fs/fls_lib.c @@ -220,8 +220,7 @@ tsk_fs_fls(TSK_FS_INFO * fs, TSK_FS_FLS_FLAG_ENUM lclflags, ptr16 = (UTF16 *) tpre; retval = - tsk_UTF16toUTF8(fs->endian, - (const UTF16 **) &ptr16, (UTF16 *) + tsk_UTF16toUTF8_lclorder((const UTF16 **) &ptr16, (UTF16 *) & ptr16[TSTRLEN(tpre) + 1], &ptr8, (UTF8 *) ((uintptr_t) ptr8 + clen), TSKlenientConversion); if (retval != TSKconversionOK) { diff --git 
a/tsk3/fs/ifind_lib.c b/tsk3/fs/ifind_lib.c index 54311d99e1fac2d049d33dd779699c69524be627..ac3164d779358ab05ab28b991d9947bfce7ffba7 100644 --- a/tsk3/fs/ifind_lib.c +++ b/tsk3/fs/ifind_lib.c @@ -458,7 +458,7 @@ tsk_fs_ifind_path(TSK_FS_INFO * fs, TSK_TCHAR * tpath, TSK_INUM_T * result) ptr16 = (UTF16 *) tpath; retval = - tsk_UTF16toUTF8(fs->endian, (const UTF16 **) &ptr16, (UTF16 *) + tsk_UTF16toUTF8_lclorder((const UTF16 **) &ptr16, (UTF16 *) & ptr16[TSTRLEN(tpath) + 1], &ptr8, (UTF8 *) ((uintptr_t) ptr8 + clen), TSKlenientConversion); if (retval != TSKconversionOK) { diff --git a/tsk3/img/img_open.c b/tsk3/img/img_open.c index 502c6061c81c3fda1d84a3a49115ed1eb6a12266..a17bdc7b8e4867341472360fe66a971228b5513c 100644 --- a/tsk3/img/img_open.c +++ b/tsk3/img/img_open.c @@ -314,8 +314,8 @@ tsk_img_open_utf8(int num_img, const char **images, TSK_IMG_TYPE_ENUM type) // we allocate the buffer with the same number of chars as the UTF-8 length ilen = strlen(images[i]); if ((images16[i] = - (wchar_t *) tsk_malloc((ilen+1) * sizeof(wchar_t))) == - NULL) { + (wchar_t *) tsk_malloc((ilen + + 1) * sizeof(wchar_t))) == NULL) { goto tsk_utf8_cleanup; } @@ -324,8 +324,7 @@ tsk_img_open_utf8(int num_img, const char **images, TSK_IMG_TYPE_ENUM type) retval2 = tsk_UTF8toUTF16((const UTF8 **) &utf8, &utf8[ilen], - &utf16, &utf16[ilen], - TSKlenientConversion); + &utf16, &utf16[ilen], TSKlenientConversion); if (retval2 != TSKconversionOK) { tsk_errno = TSK_ERR_IMG_CONVERT; snprintf(tsk_errstr, TSK_ERRSTR_L, @@ -411,7 +410,7 @@ tsk_img_open_utf16(int num_img, utf8 = (UTF8 *) images8[i]; retval2 = - tsk_UTF16toUTF8(endian, (const UTF16 **) &utf16, + tsk_UTF16toUTF8_lclorder((const UTF16 **) &utf16, &utf16[wcslen(images[i]) + 1], &utf8, &utf8[ilen + 1], TSKlenientConversion); if (retval2 != TSKconversionOK) {