/* * All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or * its licensors. * * For complete copyright and license terms please see the LICENSE at the root of this * distribution (the "License"). All use of this software is governed by the License, * or, if provided, by the license below or the license accompanying this file. Do not * remove or modify any license notices. This file is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * */ // Original file Copyright Crytek GMBH or its affiliates, used under license. // Description : Generic Unicode encoding helpers. // // Defines encoding and decoding functions used by the higher-level functions. // These are used by the various conversion functions in UnicodeFunctions.h and UnicodeIterator.h. // Note: You can use these functions manually for low-level functionality, but we don't recommend that. // In that case, you probably want to check inside the nested Detail namespace for the elementary bits. #pragma once #include "BaseTypes.h" // For uint8, uint16, uint32 #include "CompileTimeAssert.h" // For COMPILE_TIME_ASSERT macro namespace Unicode { // Supported encoding/conversion types. enum EEncoding { // UTF-8 encoding, see http://www.unicode.org/resources/utf8.html. // Input and output are supported. // Note: This format maps the entire UCS, where each code-point can take [1, 4] 8-bit code-units. // Note: This is a strict super-set of Latin1/ISO-885901 as well as ASCII. eEncoding_UTF8, // UTF-16 encoding, see http://tools.ietf.org/html/rfc2781. // Input and output are supported. // Note: This format maps the entire UCS, where each code-point can take [1, 2] 16-bit code-units. eEncoding_UTF16, // UTF-32 encoding, see http://www.unicode.org/reports/tr17/. // Input and output are supported. // Note: This format maps the entire UCS, each code-point is stored in a single 32-bit code-unit. 
eEncoding_UTF32, // ASCII encoding, see http://en.wikipedia.org/wiki/ASCII. // Input and output are supported (any output UCS values out of supported range are mapped to question mark). // Note: Only values [U+0000, U+007F] can be mapped. eEncoding_ASCII, // Latin1, aka ISO-8859-1 encoding, see http://en.wikipedia.org/wiki/ISO/IEC_8859-1. // Only input is supported. // Note: This is a strict super-set of ASCII, it additionally maps [U+00A0, U+00FF]. eEncoding_Latin1, // Windows ANSI codepage 1252, see http://en.wikipedia.org/wiki/Windows-1252. // Only input is supported. // Note: This is a strict super-set of ASCII and Latin1/ISO-8859-1, it maps some code-units from [0x80, 0x9F]. eEncoding_Win1252, }; // Methods of recovery from invalid encoded sequences. enum EErrorRecovery { // No attempt to detect invalid encoding is performed, the input is assumed to be valid. // If the input is not valid, the output is undefined (in debug, this condition will cause an assert to trigger). eErrorRecovery_None, // When an invalidly encoded sequence is detected, the sequence is discarded (will not be part of the output). // Typically used for logic/hashing purposes when the input is almost certainly valid. eErrorRecovery_Discard, // When an invalidly encoded sequence is detected, the sequence is replaced with the replacement-character (U+FFFD). // Typically used when the output sequence is used for UI display purposes. eErrorRecovery_Replace, // When an invalidly encoded sequence is detected, the sequence is replaced with the eEncoding_Latin1 equivalent. // If the sequence is also not valid Latin1 encoded, the sequence is discarded. // Typically used when reading generic text files with 1-byte code-units. // Note: This recovery method can only be used when decoding UTF-8. eErrorRecovery_FallbackLatin1ThenDiscard, // When an invalidly encoded sequence is detected, the sequence is replaced with the eEncoding_Win1252 equivalent. 
// If the sequence is also not valid codepage 1252 encoded, the sequence is discarded. // Typically used when reading text files generated on Windows with 1-byte code-units. // Note: This recovery method can only be used when decoding UTF-8. eErrorRecovery_FallbackWin1252ThenDiscard, // When an invalidly encoded sequence is detected, the sequence is replaced with the eEncoding_Latin1 equivalent. // If the sequence is also not valid Latin1 encoded, it is replaced with the replacement-character (U+FFFD). // Typically used when reading generic text files with 1-byte code-units. // Note: This recovery method can only be used when decoding UTF-8. eErrorRecovery_FallbackLatin1ThenReplace, // When an invalidly encoded sequence is detected, the sequence is replaced with the eEncoding_Win1252 equivalent. // If the sequence is also not valid codepage 1252 encoded, it is replaced with the replacement-character (U+FFFD). // Typically used when reading text files generated on Windows with 1-byte code-units. // Note: This recovery method can only be used when decoding UTF-8. eErrorRecovery_FallbackWin1252ThenReplace, }; namespace Detail { // Decode(state, unit): Decodes a single code-unit of an encoding into an UCS code-point. // When Safe flag is set, encoding errors are detected so a fall-back encoding or other recovery method can be used. // Interpret return value as follows: // < 0x001FFFFF: Decoded codepoint (== return value), call again with next code-unit and clear state. // < 0x80000000: Intermediate state returned, call again with next code-unit and the returned state. // >= 0x80000000: Bad encoding detected, up to 16 bits (UTF-16) or 24 bits (UTF-8, last in lower bits) // contain previous consumed values (does not happen if Safe == false). template inline uint32 Decode(uint32 state, uint32 unit); // Some constant values used when encoding/decoding. enum { cDecodeShiftRemaining = 26, // Where to store the remaining count in the state. 
cDecodeOneRemaining = 1 << cDecodeShiftRemaining, // Remaining value of one. cDecodeMaskRemaining = 3 << cDecodeShiftRemaining, // All possible remaining bits that can be used. cDecodeLeadBit = 1 << 22, // All bits up to and including this one are reserved. cDecodeErrorBit = 1 << 31, // Set if an error occurs during decoding. cDecodeOverlongBit = 1 << 30, // Set if overlong sequence was used. cDecodeSurrogateBit = 1 << 29, // Set if surrogate code-point decoded in UTF-8. cDecodeInvalidBit = 1 << 28, // Set if invalid code-point decoded (U+FFFE/FFFF). cDecodeSuccess = 0, // Placeholder to indicate no error occurred. cCodepointMax = 0x10FFFF, // The maximum value of an UCS code-point. cLeadSurrogateFirst = 0xD800, // The first valid UTF-16 lead-surrogate value. cLeadSurrogateLast = 0xDBFF, // The last valid UTF-16 lead-surrogate value. cTrailSurrogateFirst = 0xDC00, // The first valid UTF-16 trail-surrogate value. cTrailSurrogateLast = 0xDFFF, // The last valid UTF-16 trail-surrogate value. cReplacementCharacter = 0xFFFD, // The default replacement character. }; // Validate the UTF-8 state of a multi-byte sequence. // The safe decoder of UTF-8 will call this function when a full potential code-point has been decoded. // This function is (at most) called for 50% of the decoded UTF-8 code-units, but likely at much lower frequency. inline uint32 DecodeValidate8(uint32 state) { uint32 errorbits = (state >> 8) | cDecodeErrorBit; state ^= (state & 0x400000) >> 1; // For 3-byte sequences, bit 5 of the lead byte needs to be cleared. const uint32 cp = (state & 0x3F) | ((state & 0x3F00) >> 2) | ((state & 0x3F0000) >> 4) | ((state & 0x07000000) >> 6); if (cp <= cCodepointMax) { if (cp >= cLeadSurrogateFirst && cp <= cTrailSurrogateLast) { errorbits += cDecodeSurrogateBit; // CESU-8 encoding might have been used. } else { uint32 minval = 0x80; minval += (0x00400000 & state) ? 0x800 - 0x80 : 0; minval += (0x40000000 & state) ? 
0x10000 - 0x80 : 0; if (cp >= minval) { if ((cp & 0xFFFFFFFEU) != 0xFFFEU) { return cp; // Valid code-point. } errorbits += cDecodeInvalidBit; // Invalid character used. } errorbits += cDecodeOverlongBit; // Overlong encoding used. } } return errorbits; } // Decode UTF-8, unsafe. template<> inline uint32 Decode(uint32 state, uint32 unit) { if (state == 0) // First byte. { unit = unit & 0xFF; if (unit < 0xC0) { return unit; // Single-unit (ASCII). } uint32 remaining = (unit >> 4) - 0xC; remaining += (remaining == 0); return (unit & 0x1F) + (remaining << cDecodeShiftRemaining); // Lead byte of multi-byte. } state = (state << 6) + (unit & 0x3F) + (state & cDecodeMaskRemaining) - cDecodeOneRemaining; // Apply c-byte. return state & ~cDecodeLeadBit; // Mask off the lead bits of a 4-byte sequence. } // Decode UTF-8, safe template<> inline uint32 Decode(uint32 state, uint32 unit) { if (unit <= 0xF4) // Discard out-of-range values immediately. { if (state == 0) // First byte. { if (unit < 0x80) { return unit; // Single-byte. } if (unit < 0xC2) { return cDecodeErrorBit; // Invalid continuation byte (or illegal 0xC0/0xC1). } uint32 remaining = (unit >> 4) - 0xC; remaining += (remaining == 0); return unit + (remaining << cDecodeShiftRemaining); // Multi-byte. } if ((unit & 0xC0) == 0x80) { const uint32 remaining = (state & cDecodeMaskRemaining) - cDecodeOneRemaining; state = (state << 8) + unit; if (remaining != 0) { return state | remaining; // Intermediate byte of a multi-byte sequence. } return DecodeValidate8(state); // Final byte of a multi-byte sequence. } } return cDecodeErrorBit | state; } // Decode UTF-16, unsafe. template<> inline uint32 Decode(uint32 state, uint32 unit) { const bool bLead = (unit >= cLeadSurrogateFirst) && (unit <= cLeadSurrogateLast); const uint32 initial = unit + (bLead << cDecodeShiftRemaining); const uint32 pair = 0x10000 + ((state & 0x3FF) << 10) + (unit & 0x3FF); return state == 0 ? initial : pair; } // Decode UTF-16, safe. 
// Rejects a lead surrogate that is not followed by a trail surrogate, a trail surrogate that
// is not preceded by a lead surrogate, and the invalid code-points U+FFFE/U+FFFF.
// NOTE(review): the explicit template arguments of the specializations below were reconstructed
// from the "Decode <encoding>, safe/unsafe" comments; confirm against the repository copy.
template<>
inline uint32 Decode<eEncoding_UTF16, true>(uint32 state, uint32 unit)
{
    const bool bTrail = (unit >= cTrailSurrogateFirst) && (unit <= cTrailSurrogateLast);
    const bool bExpectTrail = (state != 0);
    if (bExpectTrail != bTrail)
    {
        // Either a lead surrogate without trail surrogate (the lead is reported in the low 16 bits),
        // or a trail surrogate without a preceding lead surrogate (nothing was consumed, low bits are zero).
        return cDecodeErrorBit + (state & 0xFFFF);
    }
    const uint32 result = Decode<eEncoding_UTF16, false>(state, unit);
    const bool bValid = (result & 0xFFFFFFFEU) != 0xFFFEU;
    return bValid ? result : result + cDecodeErrorBit + cDecodeInvalidBit;
}

// Decode UTF-32, unsafe.
template<>
inline uint32 Decode<eEncoding_UTF32, false>(uint32 state, uint32 unit)
{
    return unit;
}

// Decode UTF-32, safe.
// NOTE(review): this rejects every code-point whose low 16 bits are 0xFFFE/0xFFFF, whereas the
// UTF-8 and UTF-16 safe decoders only reject U+FFFE/U+FFFF; confirm which behavior is intended.
template<>
inline uint32 Decode<eEncoding_UTF32, true>(uint32 state, uint32 unit)
{
    if (unit > cCodepointMax)
    {
        return cDecodeErrorBit;
    }
    if (unit >= cLeadSurrogateFirst && unit <= cTrailSurrogateLast)
    {
        return cDecodeErrorBit | cDecodeSurrogateBit;
    }
    if ((unit & 0xFFFEU) == 0xFFFEU)
    {
        return cDecodeErrorBit | cDecodeInvalidBit;
    }
    return unit;
}

// Decode ASCII, unsafe.
template<>
inline uint32 Decode<eEncoding_ASCII, false>(uint32 state, uint32 unit)
{
    return unit;
}

// Decode ASCII, safe.
template<>
inline uint32 Decode<eEncoding_ASCII, true>(uint32 state, uint32 unit)
{
    if (unit > 0x7F)
    {
        return cDecodeErrorBit;
    }
    return unit;
}

// Decode Latin1, unsafe.
template<>
inline uint32 Decode<eEncoding_Latin1, false>(uint32 state, uint32 unit)
{
    return unit;
}

// Decode Latin1, safe.
// Values in [0x80, 0x9F] and values above 0xFF are reported as errors.
template<>
inline uint32 Decode<eEncoding_Latin1, true>(uint32 state, uint32 unit)
{
    if ((unit >= 0x80 && unit <= 0x9F) || (unit > 0xFF))
    {
        return cDecodeErrorBit;
    }
    return unit;
}

// Decode Windows CP-1252, unsafe.
// Code-units in [0x80, 0x9F] are translated through a table, everything else maps to itself.
// Table entries equal to their own index (0x81, 0x8D, 0x8F, 0x90, 0x9D) pass through unchanged.
template<>
inline uint32 Decode<eEncoding_Win1252, false>(uint32 state, uint32 unit)
{
    static const uint16 cp1252[] =
    {
        0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
        0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
    };
    return (unit < 0x80 || unit > 0x9F) ? unit : cp1252[unit - 0x80];
}

// Decode Windows CP-1252, safe.
template<> inline uint32 Decode(uint32 state, uint32 unit) { if (unit > 0xFF) { return cDecodeErrorBit; } uint32 result = Decode(state, unit); if (!(unit < 0x80 || unit > 0x9F) && (result == unit)) { return cDecodeErrorBit; // Not defined in codepage 1252. } return result; } // SBase: // Utility to apply empty-base-optimization on type T. // Will fall back to a member if T is a reference type. template struct SBase : T { SBase(T base) : T(base) {} T& GetBase() { return *this; } const T& GetBase() const { return *this; } }; template struct SBase { T& base; SBase(T& b) : base(b) {} T& GetBase() { return base; } const T& GetBase() const { return base; } }; // SDecoder: // Functor to decode UCS code-points from an input range. // Recovery functor will be invoked as a fall-back if decoding fails. // This allows ensuring all the output is valid (even if the input isn't). // Note: The destructor will automatically flush any remaining (erroneous) state, you can also call Finalize(). template struct SDecoder : SBase , SBase { uint32 state; SDecoder(Sink sink, Recovery recovery = Recovery()) : SBase(sink) , SBase(recovery) , state(0) {} SDecoder() { Finalize(); } Recovery& recovery() { return SBase::GetBase(); } Sink& sink() { return SBase::GetBase(); } void operator()(uint32 unit) { state = Detail::Decode(state, unit); if (state <= 0x1FFFFF) { sink()(state); state = 0; } else if (state & Detail::cDecodeErrorBit) { recovery()(sink(), state, unit); state = 0; } } void Finalize() { if (state) { recovery()(sink(), state, 0); state = 0; } } }; // SDecoder: // Functor to decode to UCS code-points from an input range. // No attempt to discover or recover from encoding errors is made, can only safely be used with known-valid input. 
template struct SDecoder : SBase { uint32 state; SDecoder(Sink sink) : SBase(sink) , state(0) {} Sink& sink() { return SBase::GetBase(); } void operator()(uint32 unit) { state = Detail::Decode(state, unit); if (state <= 0x1FFFFF) { sink()(state); state = 0; } } void Finalize() {} }; // SEncoder: // Generic Unicode encoder functor. // Encoding must be one an encoding type for which output is supported. // The Sink type must have HintSequence member for UTF-8 and UTF-16 (although it may be a no-op). // In general, you feed operator() with UCS code-points and it will emit code-units. template struct SEncoder { static const bool value = false; }; // SEncoder: // Specialization of ASCII encoder functor. // Note: Any out-of-range character is mapped to question mark. template struct SEncoder : SBase { static const bool value = true; typedef uint8 value_type; SEncoder(Sink sink) : SBase(sink) {} void operator()(uint32 cp) { cp = cp < 0x80 ? cp : (uint32)'?'; SBase::GetBase()(value_type(cp)); } }; // SEncoder: // Specialization of UTF-8 encoder functor. template struct SEncoder : SBase { static const bool value = true; typedef uint8 value_type; SEncoder(Sink sink) : SBase(sink) {} Sink& sink() { return SBase::GetBase(); } void operator()(uint32 cp) { if (cp < 0x80) { // Single byte sequence. sink()(value_type(cp)); } else { // Expand 21-bit value to 32-bit. uint32 bits = (cp & 0x00003F) + ((cp & 0x000FC0) << 2) + ((cp & 0x03F000) << 4) + ((cp & 0x1C0000) << 6); // Type of sequence. const bool bSeq4 = (cp >= 0x10000); const bool bSeq3 = (cp >= 0x800); // Mask lead-bytes and continuation-bytes. uint32 mask = 0xEFE0C080; mask ^= (bSeq3 << 14); mask += (bSeq4 ? 0xA00000 : 0); bits |= mask; // Length of the sequence. const uint32 length = (uint32)bSeq4 + (uint32)bSeq3 + 1; sink().HintSequence(length); // Sink the multi-byte sequence. 
if (bSeq4) { sink()(value_type(bits >> 24)); } if (bSeq3) { sink()(value_type(bits >> 16)); } sink()(value_type(bits >> 8)); sink()(value_type(bits)); } } }; // SEncoder: // Specialization of UTF-16 encoder functor. template struct SEncoder : SBase { static const bool value = true; typedef uint16 value_type; SEncoder(Sink sink) : SBase(sink) {} Sink& sink() { return SBase::GetBase(); } void operator()(uint32 cp) { if (cp < 0x10000) { // Single unit sink()(value_type(cp)); } else { // We will generate two-element sequence sink().HintSequence(2); // Surrogate pair cp -= 0x10000; uint32 lead = ((cp >> 10) & 0x3FF) + Detail::cLeadSurrogateFirst; uint32 trail = (cp & 0x3FF) + Detail::cTrailSurrogateFirst; sink()(value_type(lead)); sink()(value_type(trail)); } } }; // SEncoder: // Specialization of UTF-32 encoder functor. // Note: This is a no-op, but we want to be able to express UTF-32 just like the other encodings. template struct SEncoder : SBase { static const bool value = true; typedef uint32 value_type; SEncoder(Sink sink) : SBase(sink) {} void operator()(uint32 cp) { SBase::GetBase()(value_type(cp)); } }; // SDecoder, void>: // Specialization for unsafe no-op trans-coding. // Since the conversion is a no-op, no need to keep any state or do any computation. // Note: For a decoding with a fallback, this is not possible since we can't guarantee the input is valid. template struct SDecoder, void> { Sink sink; SDecoder(Sink s) : sink(s) {} void operator()(uint32 unit) { sink(unit); } void Finalize() {} }; // SRecoveryDiscard: // Recovery handler that, on encoding error, discards the offending sequence. template struct SRecoveryDiscard { SRecoveryDiscard() {} void operator()(Sink& sink, uint32 error, uint32 unit) {} }; // SRecoveryReplace: // Recovery handler that, on encoding error, replaces the sequence with replacement-character (U+FFFD). // Note: This implementation matches a whole invalid sequence, it could be changed to emit for every code-unit. 
template struct SRecoveryReplace { SRecoveryReplace() {} void operator()(Sink& sink, uint32 error, uint32 unit) { sink(cReplacementCharacter); } }; // SRecoveryFallback: // Recovery handler that, on encoding error, falls back to another encoding. // The fallback encoding must be stateless (ie: ASCII, Latin1 or Win1252). // This type assumes an 8-bit primary encoding since the only viable fallback encodings are 8-bit. template struct SRecoveryFallback : NextFallback { SRecoveryFallback() : NextFallback() {} void operator()(Sink& sink, uint32 error, uint32 unit) { SDecoder fallback(sink, *static_cast(this)); uint8 byte1(error >> 16); uint8 byte2(error >> 8); uint8 byte3(error); uint8 byte4(unit); if (byte1) { fallback(byte1); } if (byte1 | byte2) { fallback(byte2); } if (byte1 | byte2 | byte3) { fallback(byte3); } fallback(byte4); } }; // SRecoveryFallbackHelper: // Helper to pick a SRecoveryFallback instantiation based on RecoveryMethod. template struct SRecoveryFallbackHelper { // A compilation error here means RecoveryMethod value was unexpected here COMPILE_TIME_ASSERT( RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard || RecoveryMethod == eErrorRecovery_FallbackLatin1ThenReplace || RecoveryMethod == eErrorRecovery_FallbackWin1252ThenDiscard || RecoveryMethod == eErrorRecovery_FallbackWin1252ThenReplace); typedef SEncoder SinkType; static const EEncoding FallbackEncoding = RecoveryMethod == eErrorRecovery_FallbackLatin1ThenDiscard || RecoveryMethod == eErrorRecovery_FallbackLatin1ThenReplace ? eEncoding_Latin1 : eEncoding_Win1252; template struct Pick { typedef SRecoveryDiscard type; }; template struct Pick { typedef SRecoveryReplace type; }; typedef typename Pick::type NextFallback; typedef SRecoveryFallback RecoveryType; typedef SDecoder FullType; }; // STranscoderSelect: // Derives a chained decoder/encoder pair that performs code-unit -> code-unit transform. // The RecoveryMethod template parameter determines the behavior during encoding. 
// This is the basic way to perform trans-coding, and is the type instantiated by the higher-level functions. template struct STranscoderSelect; template struct STranscoderSelect : SDecoder, void> { typedef SDecoder, void> TranscoderType; STranscoderSelect(Sink sink) : TranscoderType(sink) {} }; template struct STranscoderSelect : SDecoder, SRecoveryDiscard > > { typedef SRecoveryDiscard > RecoveryType; typedef SDecoder, RecoveryType> TranscoderType; STranscoderSelect(Sink sink) : TranscoderType(sink) {} }; template struct STranscoderSelect : SDecoder, SRecoveryReplace > > { typedef SRecoveryReplace > RecoveryType; typedef SDecoder, RecoveryType> TranscoderType; STranscoderSelect(Sink sink) : TranscoderType(sink) {} }; template struct STranscoderSelect : SRecoveryFallbackHelper::FullType { static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackLatin1ThenDiscard; typedef typename SRecoveryFallbackHelper::RecoveryType RecoveryType; typedef typename SRecoveryFallbackHelper::FullType TranscoderType; STranscoderSelect(Sink sink) : TranscoderType(sink) {} }; template struct STranscoderSelect : SRecoveryFallbackHelper::FullType { static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackLatin1ThenReplace; typedef typename SRecoveryFallbackHelper::RecoveryType RecoveryType; typedef typename SRecoveryFallbackHelper::FullType TranscoderType; STranscoderSelect(Sink sink) : TranscoderType(sink) {} }; template struct STranscoderSelect : SRecoveryFallbackHelper::FullType { static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackWin1252ThenDiscard; typedef typename SRecoveryFallbackHelper::RecoveryType RecoveryType; typedef typename SRecoveryFallbackHelper::FullType TranscoderType; STranscoderSelect(Sink sink) : TranscoderType(sink) {} }; template struct STranscoderSelect : SRecoveryFallbackHelper::FullType { static const EErrorRecovery RecoveryMethod = eErrorRecovery_FallbackWin1252ThenReplace; typedef typename 
SRecoveryFallbackHelper::RecoveryType RecoveryType; typedef typename SRecoveryFallbackHelper::FullType TranscoderType; STranscoderSelect(Sink sink) : TranscoderType(sink) {} }; // SIsSafeEncoding: // Check if the given recovery mode is safe. // This is used for SFINAE checks in higher-level functions. template struct SIsSafeEncoding { static const bool value = R == eErrorRecovery_Discard || R == eErrorRecovery_Replace || R == eErrorRecovery_FallbackLatin1ThenDiscard || R == eErrorRecovery_FallbackLatin1ThenReplace || R == eErrorRecovery_FallbackWin1252ThenDiscard || R == eErrorRecovery_FallbackWin1252ThenReplace; }; // SIsCopyableEncoding: // Check if data in one encoding can be copied directly to another encoding. // This is the basis for block-copy and string-assign optimizations in un-safe conversion functions. // Note: There are more valid combinations, they are left out since those can't occur with the output encodings supported. // Note: Only used for un-safe functions since it doesn't account for potential invalid sequences (they would be copied over). template struct SIsCopyableEncoding { static const bool value = InputEncoding == eEncoding_ASCII || // ASCII and Latin1 values don't change in any encoding. (InputEncoding == eEncoding_Latin1 && OutputEncoding != eEncoding_ASCII); // Except Latin1 -> ASCII is lossy. }; template struct SIsCopyableEncoding { static const bool value = true; // If the input and output encodings are the same, then it's copyable. }; } }