/* * All or portions of this file Copyright (c) Amazon.com, Inc. or its affiliates or * its licensors. * * For complete copyright and license terms please see the LICENSE at the root of this * distribution (the "License"). All use of this software is governed by the License, * or, if provided, by the license below or the license accompanying this file. Do not * remove or modify any license notices. This file is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * */ // Original file Copyright Crytek GMBH or its affiliates, used under license. // Description : Encoded Unicode sequence iteration. // // For lower level accessing of encoded text, an STL compatible iterator wrapper is provided. // This iterator will decode the underlying sequence, abstracting it to a sequence of UCS code-points. // Using the iterator wrapper, you can find where in an encoded string code-points (or encoding errors) are located. // Note: The iterator is an input-only iterator, you cannot write to the underlying sequence. #pragma once #include "UnicodeBinding.h" namespace Unicode { namespace Detail { // MoveNext(it, checker, tag): // Moves the iterator to the next UCS code-point in the encoded sequence. // Non-specialized version (for 1:1 code-unit to code-point). template inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, const integral_constant) { COMPILE_TIME_ASSERT( Encoding == eEncoding_ASCII || Encoding == eEncoding_UTF32 || Encoding == eEncoding_Latin1 || Encoding == eEncoding_Win1252); assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence"); // All of these encodings use a single code-unit for each code-point. ++it; } // MoveNext(it, checker, tag): // Moves the iterator to the next UCS code-point in the encoded sequence. // Specialized for UTF-8. template inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant) { assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence"); // UTF-8: just need to skip up to 3 continuation bytes. for (int i = 0; i < 4; ++i) { ++it; if (checker.IsEnd(it)) // :WARN: always returns false if "safe" bool is false! { break; } uint32 val = static_cast(*it); if ((val & 0xC0) != 0x80) { break; } } } // MoveNext(it, checker, tag): // Moves the iterator to the next UCS code-point in the encoded sequence. // Specialized for UTF-16. template inline void MoveNext(BaseIterator& it, const BoundsChecker& checker, integral_constant) { assert(!checker.IsEnd(it) && "Attempt to iterate past the end of the sequence"); // UTF-16: just need to skip one lead surrogate. ++it; uint32 val = static_cast(*it); if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast) { if (!checker.IsEnd(it)) { ++it; } } } // MovePrev(it, checker, tag): // Moves the iterator to the previous UCS code-point in the encoded sequence. // Non-specialized version (for 1:1 code-unit to code-point). template inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, const integral_constant) { COMPILE_TIME_ASSERT( Encoding == eEncoding_ASCII || Encoding == eEncoding_UTF32 || Encoding == eEncoding_Latin1 || Encoding == eEncoding_Win1252); assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence"); // All of these encodings use a single code-unit for each code-point. --it; } // MovePrev(it, checker, tag): // Moves the iterator to the previous UCS code-point in the encoded sequence. // Specialized for UTF-8. template inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant) { assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence"); // UTF-8: just need to skip up to 3 continuation bytes. for (int i = 0; i < 4; ++i) { --it; if (checker.IsBegin(it)) { break; } uint32 val = static_cast(*it); if ((val & 0xC0) != 0x80) { break; } } } // MovePrev(it, checker, tag): // Moves the iterator to the previous UCS code-point in the encoded sequence. // Specialized for UTF-16. template inline void MovePrev(BaseIterator& it, const BoundsChecker& checker, integral_constant) { assert(!checker.IsBegin(it) && "Attempt to iterate past the beginning of the sequence"); // UTF-16: just need to skip one lead surrogate. --it; uint32 val = static_cast(*it); if (val >= cLeadSurrogateFirst && val <= cLeadSurrogateLast) { if (!checker.IsBegin(it)) { --it; } } } // SBaseIterators: // Utility to access base iterators properties from CIterator. // This is the bounds-checked specialization, the range information is kept to defend against malformed sequences. template struct SBaseIterators { typedef BaseIterator type; type begin, end; type it; SBaseIterators(const BaseIterator& _begin, const BaseIterator& _end) : begin(_begin) , end(_end) , it(_begin) {} SBaseIterators(const SBaseIterators& other) : begin(other.begin) , end(other.end) , it(other.it) {} SBaseIterators& operator =(const SBaseIterators& other) { begin = other.begin; end = other.end; it = other.it; return *this; } bool IsBegin(const BaseIterator& _it) const { return begin == _it; } bool IsEnd(const BaseIterator& _it) const { return end == _it; } bool IsEqual(const SBaseIterators& other) const { return it == other.it && begin == other.begin && end == other.end; } // Note: Only called inside assert. // O(N) version; works with any forward-iterator (or better) bool IsInRange(const BaseIterator& _it, std::forward_iterator_tag) const { for (BaseIterator i = begin; i != end; ++i) { if (_it == i) { return true; } } return false; } // Note: Only called inside assert. // O(1) version; requires random-access-iterator. bool IsInRange(const BaseIterator& _it, std::random_access_iterator_tag) const { return (begin <= _it && _it < end); } // Note: Only called inside assert. // Dispatches to the O(1) version if a random-access iterator is used (common case). bool IsInRange(const BaseIterator& _it) const { return IsInRange(_it, typename std::iterator_traits::iterator_category()); } }; // SBaseIterators: // Utility to access base iterators properties from CIterator. // This is the un-checked specialization for known-safe sequences. template struct SBaseIterators { typedef BaseIterator type; type it; explicit SBaseIterators(const BaseIterator& begin) : it(begin) {} SBaseIterators(const BaseIterator& begin, const BaseIterator& end) : it(begin) {} SBaseIterators(const SBaseIterators& other) : it(other.it) {} SBaseIterators& operator =(const SBaseIterators& other) { it = other.it; return *this; } bool IsBegin(const BaseIterator&) const { return false; } bool IsEnd(const BaseIterator&) const { return false; } bool IsEqual(const SBaseIterators& other) const { return it == other.it; } bool IsInRange(const BaseIterator&) const { return true; } }; // SIteratorSink: // Helper to store the last code-point and error bit that was decoded. // This is the safe specialization for potentially malformed sequences. template struct SIteratorSink { static const uint32 cEmpty = 0xFFFFFFFFU; uint32 value; bool error; void Clear() { value = cEmpty; error = false; } bool IsEmpty() const { return value == cEmpty; } bool IsError() const { return error; } const uint32& GetValue() const { return value; } void MarkDecodingError() { value = cReplacementCharacter; error = true; } template void Decode(const SBaseIterators& its, integral_constant) { typedef SDecoder DecoderType; DecoderType decoder(*this, *this); Clear(); for (BaseIterator it = its.it; IsEmpty(); ++it) { uint32 val = static_cast(*it); decoder(val); if (its.IsEnd(it)) { break; } } if (IsEmpty()) { // If we still have neither a new value or an error flag, just treat as error. // This can happen if we reached the end of the sequence, but it ends in an incomplete code-sequence. MarkDecodingError(); } } template void DecodeIfEmpty(const SBaseIterators& its, integral_constant tag) { if (IsEmpty()) { Decode(its, tag); } } void operator()(uint32 unit) { value = unit; } void operator()(SIteratorSink&, uint32, uint32) { MarkDecodingError(); } }; // SIteratorSink: // Helper to store the last code-point that was decoded. // This is the un-safe specialization for known-valid sequences. // Note: No error-state is tracked since we won't handle that regardless for un-safe CIterator. template<> struct SIteratorSink { static const uint32 cEmpty = 0xFFFFFFFFU; uint32 value; void Clear() { value = cEmpty; } bool IsEmpty() const { return value == cEmpty; } bool IsError() const { return false; } const uint32& GetValue() const { return value; } template void Decode(const SBaseIterators& its, integral_constant) { typedef SDecoder DecoderType; DecoderType decoder(*this); for (BaseIterator it = its.it; IsEmpty(); ++it) { uint32 val = static_cast(*it); decoder(val); } } template void DecodeIfEmpty(const SBaseIterators& its, integral_constant tag) { if (IsEmpty()) { Decode(its, tag); } } void operator()(uint32 unit) { value = unit; } }; } // CIterator: // Helper class that can iterate over an encoded text sequence and read the underlying UCS code-points. // If the Safe flag is set, bounds checking is performed inside multi-unit sequences to guard against decoding errors. // This requires the user to know where the sequence ends (use the constructor taking two parameters). // Note: The BaseIterator must be forward-iterator or better when Safe flag is set. // If the Safe flag is not set, you must guarantee the sequence is validly encoded, and allows the use of the single argument constructor. // In the case of unsafe iterator used for C-style string pointer, look for a U+0000 dereferenced value to end the iteration. // Regardless of the Safe flag, the user must ensure that the iterator is never moved past the beginning or end of the range (just like any other STL iterator). // Example of typical usage: // string utf8 = "foo"; // UTF-8 // for (Unicode::CIterator it(utf8.begin(), utf8.end()); it != utf8.end(); ++it) // { // uint32 codepoint = *it; // 32-bit UCS code-point // } // Example unsafe usage: (for known-valid encoded C-style strings): // const char *pValid = "foo"; // UTF-8 // for (Unicode::CIterator it = pValid; *it != 0; ++it) // { // uint32 codepoint = *it; // 32-bit UCS code-point // } template::value> class CIterator { // The iterator value in the encoded sequence. // Optionally provides bounds-checking. Detail::SBaseIterators its; // The cached UCS code-point at the current position. // Mutable because dereferencing is conceptually const, but does cache some state in this case. mutable Detail::SIteratorSink sink; public: // Types for compatibility with STL bidirectional iterator requirements. typedef const uint32 value_type; typedef const uint32& reference; typedef const uint32* pointer; typedef const ptrdiff_t difference_type; typedef std::bidirectional_iterator_tag iterator_category; // Construct an iterator for the given range. // The initial position of the iterator as at the beginning of the range. CIterator(const BaseIterator& begin, const BaseIterator& end) : its(begin, end) { sink.Clear(); } // Construct an iterator from a single iterator (typically C-style string pointer). // This can only be used for unsafe iterators. template CIterator(const IteratorType& it, typename Detail::SRequire::value, IteratorType>::type* = 0) : its(static_cast(it)) { sink.Clear(); } // Copy-construct an iterator. CIterator(const CIterator& other) : its(other.its) , sink(other.sink) {} // Copy-assign an iterator. CIterator& operator =(const CIterator& other) { its = other.its; sink = other.sink; return *this; } // Test if the iterator points at an encoding error in the underlying encoded sequence. // If so, the function returns false. // When using an un-safe iterator, this function always returns true, if a sequence can contain encoding errors, you must use the safe variant. // Note: This requires the underlying iterator to be dereferenced, so you cannot use it only while the iterator is inside the valid range. bool IsAtValidCodepoint() const { assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator"); Detail::integral_constant tag; sink.DecodeIfEmpty(its, tag); return !sink.IsError(); } // Gets the current position in the underlying encoded sequence. // If the iterator points to an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant. // In that case the returned position is approximated; to work around this: move all iterators of which the position is compared in the same direction. const BaseIterator& GetPosition() const { return its.it; } // Sets the current position in the underlying encoded sequence. // You may not set the position outside the range for which this iterator was constructed. void SetPosition(const BaseIterator& it) { assert(its.IsInRange(it) && "Attempt to set the underlying iterator outside of the supported range"); its.it = it; } // Test if this iterator is equal to another iterator instance. // Note: In the presence of an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant. // To work around this, you can either: // 1) Move all iterators that will be compared in the same direction; or // 2) Compare the dereferenced iterator value(s) instead (if applicable). bool operator ==(const CIterator& other) const { return its.IsEqual(other.its); } // Test if this iterator is equal to another base iterator. // Note: If the provided iterator does not point to the the first code-unit of an UCS code-point, the behavior is undefined. bool operator ==(const BaseIterator& other) const { return its.it == other; } // Test if this iterator is equal to another iterator instance. // Note: In the presence of an invalidly encoded sequence (ie, IsError() returns true), the direction of iteration is significant. // To work around this, you can either: // 1) Move all iterators that will be compared in the same direction; or // 2) Compare the dereferenced iterator value(s) instead (if applicable). bool operator !=(const CIterator& other) const { return !its.IsEqual(other.its); } // Test if this iterator is equal to another base iterator. // Note: If the provided iterator does not point to the the first code-unit of an UCS code-point, the behavior is undefined. bool operator !=(const BaseIterator& other) const { return its.it != other; } // Get the decoded UCS code-point at the current position in the sequence. // If the iterator points to an invalidly encoded sequence (ie, IsError() returns true) the function returns U+FFFD (replacement character). reference operator *() const { assert(!its.IsEnd(its.it) && "Attempt to dereference the past-the-end iterator"); Detail::integral_constant tag; sink.DecodeIfEmpty(its, tag); return sink.GetValue(); } // Advance the iterator to the next UCS code-point. // Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode. // However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors. CIterator& operator ++() { Detail::integral_constant tag; Detail::MoveNext(its.it, its, tag); sink.Clear(); return *this; } // Go back to the previous UCS code-point. // Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode. // However, in Safe mode, the iterators will never move past the beginning of the sequence in the presence of encoding errors. CIterator& operator --() { Detail::integral_constant tag; Detail::MovePrev(its.it, its, tag); sink.Clear(); return *this; } // Advance the iterator to the next UCS code-point, return a copy of the iterator position before advancing. // Note: You must make sure the iterator is not at the end of the sequence, even in Safe mode. // However, in Safe mode, the iterator will never move past the end of the sequence in the presence of encoding errors. CIterator operator ++(int) { CIterator result = *this; ++*this; return result; } // Go back to the previous UCS code-point, return a copy of the iterator position before going back. // Note: You must make sure the iterator is not at the beginning of the sequence, even in Safe mode. // However, in Safe mode, the iterators will never move past the beginning of the sequence in the presence of encoding errors. CIterator operator --(int) { CIterator result = *this; --*this; return result; } }; namespace Detail { // SIteratorSpecializer: // Specializes the CIterator template to use for a given string type. // Note: The reason we use this is because MSVC doesn't want to deduce this on the MakeIterator declaration. template struct SIteratorSpecializer { typedef CIterator type; }; } // MakeIterator(const StringType &str): // Helper function to make an UCS code-point iterator given an Unicode string. // Example usage: // string utf8 = "foo"; // UTF-8 // auto it = Unicode::MakeIterator(utf8); // while (it != utf8.end()) // { // uint32 codepoint = *it; // 32-bit UCS code-point // } // Or, in a for-loop: // for (auto it = Unicode::MakeIterator(utf8); it != utf8.end(); ++it) {} template inline typename Detail::SIteratorSpecializer::type MakeIterator(const StringType& str) { return typename Detail::SIteratorSpecializer::type(str.begin(), str.end()); } }