From 5df728867874534e97798a580b4cbe72e5bd2686 Mon Sep 17 00:00:00 2001 From: Dick Hollenbeck Date: Sun, 8 Dec 2013 00:48:25 -0600 Subject: [PATCH] complete class UTF8.cpp --- common/gal/stroke_font.cpp | 3 +- tools/UTF8.cpp | 189 +++++++++++++++++++++++++++++++++---- tools/make-UTF8.sh | 6 +- 3 files changed, 176 insertions(+), 22 deletions(-) diff --git a/common/gal/stroke_font.cpp b/common/gal/stroke_font.cpp index 1566d1820d..ab83eba22e 100644 --- a/common/gal/stroke_font.cpp +++ b/common/gal/stroke_font.cpp @@ -249,7 +249,8 @@ void STROKE_FONT::drawSingleLineText( const wxString& aText ) // (textSize.x) xOffset = textSize.x; glyphSize.x = -m_glyphSize.x; - } else + } + else { xOffset = 0.0; } diff --git a/tools/UTF8.cpp b/tools/UTF8.cpp index 0fd5fb65d5..c9d31dea5d 100644 --- a/tools/UTF8.cpp +++ b/tools/UTF8.cpp @@ -10,6 +10,15 @@ * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special * conversion support to and from wxString, and has iteration over unicode characters. * + *

I've been careful to supply only conversion facilities and not try + * and duplicate wxString() with many member functions. In the end it is + * to be a std::string. There are multiple ways to create text into a std::string + * without the need of member functions, e.g. std::ostringstream. + * + *

Because this class uses no virtuals, it should be possible to cast any + * std::string into a UTF8 using this kind of cast: (UTF8 &) without construction + * or copying being the effect of the cast. + * * @author Dick Hollenbeck */ class UTF8 : public std::string @@ -25,6 +34,9 @@ public: { } + /// For use with _() function on wx 2.8: + UTF8( const wchar_t* txt ); + explicit UTF8( const std::string& o ) : std::string( o ) { @@ -54,25 +66,20 @@ public: /** * Function uni_forward - * advances over a UTF8 encoded multibyte character, capturing the unicode - * character as it goes, and returning the number of bytes consumed. + * advances over a single UTF8 encoded multibyte character, capturing the + * unicode character as it goes, and returning the number of bytes consumed. * - * @param aSequence is the UTF8 byte sequence. - * @param aResult is where to put the unicode character. + * @param aSequence is the UTF8 byte sequence, must be aligned on start of character. + * @param aResult is where to put the unicode character, and may be NULL if no interest. + * @return int - the count of bytes consumed. */ - static int uni_forward( unsigned char* aSequence, unsigned* aResult ) - { - // @todo: have this read UTF8 characters into result, not bytes. - // What's here now is scaffolding, reading single byte characters only. - *aResult = *aSequence; - return 1; - } + static int uni_forward( unsigned char* aSequence, unsigned* aResult = NULL ); /** * class uni_iter * is a non-mutable iterator that walks through code points in the UTF8 encoded * string. The normal ++(), ++(int), ->(), and *() operators are all supported and - * they return a unsigned holding the unicode character appropriate for respective + * they return an unsigned holding the unicode character appropriate for respective * operation. */ class uni_iter @@ -81,10 +88,11 @@ public: unsigned char* it; + // private constructor. 
uni_iter( const char* start ) : it( (unsigned char*) start ) { - assert( sizeof(unsigned) >= 4 ); + // for the human: assert( sizeof(unsigned) >= 4 ); } public: @@ -94,10 +102,10 @@ public: { unsigned result; - // advance, and toss the result - it += uni_forward( it, &result ); + // advance over current, and toss the unicode result + it += uni_forward( it ); - // get the next result, but do not advance: + // get the next unicode result, but do not advance: uni_forward( it, &result ); return result; } @@ -173,15 +181,21 @@ wxString wxFunctionTaking_wxString( const wxString& wx ) int main() { std::string str = "input"; + + UTF8 u0 = L"wide string"; UTF8 u1 = "initial"; wxString wx = wxT( "input2" ); + printf( "u0:'%s'\n", u0.c_str() ); printf( "u1:'%s'\n", u1.c_str() ); u1 = str; wxString wx2 = u1; + // force a std::string into a UTF8, then into a wxString, then copy construct: + wxString wx3 = (UTF8&) u1; + UTF8 u2 = wx2; u2 += 'X'; @@ -196,7 +210,7 @@ int main() printf( "result:'%s'\n", result.c_str() ); // test the unicode iterator: - for( UTF8::uni_iter it = u2.ubegin(); it != u2.uend(); ) + for( UTF8::uni_iter it = u2.ubegin(); it < u2.uend(); ) { // test post-increment: printf( " _%c_", it++ ); @@ -211,8 +225,13 @@ int main() } -// These to go into a library *.cpp, they are not inlined so that code space -// is saved creating the intermediate objects and referencing wxConvUTF8. +/* + + These to go into a library *.cpp, they are not inlined so that significant + code space is saved by encapsulating the creation of intermediate objects + and referencing wxConvUTF8. + +*/ UTF8::UTF8( const wxString& o ) : @@ -232,3 +251,135 @@ UTF8& UTF8::operator=( const wxString& o ) std::string::operator=( (const char*) o.utf8_str() ); return *this; } + + +static const unsigned char utf8_len[256] = { + // Map encoded prefix byte to sequence length. Zero means + // illegal prefix. 
See RFC 3629 for details + /* + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F + */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF +}; + + +#ifndef THROW_IO_ERROR + #define THROW_IO_ERROR(x) // nothing +#endif + +// There is no wxWidgets function that does this, because wchar_t is 16 bits +// on windows and wx wants to encode the output in UTF16 for such. 
+ +int UTF8::uni_forward( unsigned char* aSequence, unsigned* aResult ) +{ + unsigned ch = *aSequence; + + if( ch < 0x80 ) + { + if( aResult ) + *aResult = ch; + return 1; + } + + unsigned char* s = aSequence; + + int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ]; + + switch( len ) + { + default: + case 0: + THROW_IO_ERROR( "invalid start byte" ); + break; + + case 2: + if( ( s[1] & 0xc0 ) != 0x80 ) + { + THROW_IO_ERROR( "invalid continuation byte" ); + } + + ch = ((s[0] & 0x1f) << 6) + + ((s[1] & 0x3f) << 0); + + assert( ch > 0x007F && ch <= 0x07FF ); + break; + + case 3: + if( (s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[0] == 0xE0 && s[1] < 0xA0) + // || (s[0] == 0xED && s[1] > 0x9F) + ) + { + THROW_IO_ERROR( "invalid continuation byte" ); + } + + ch = ((s[0] & 0x0f) << 12) + + ((s[1] & 0x3f) << 6 ) + + ((s[2] & 0x3f) << 0 ); + + assert( ch > 0x07FF && ch <= 0xFFFF ); + break; + + case 4: + if( (s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + (s[0] == 0xF0 && s[1] < 0x90) || + (s[0] == 0xF4 && s[1] > 0x8F) ) + { + THROW_IO_ERROR( "invalid continuation byte" ); + } + + ch = ((s[0] & 0x7) << 18) + + ((s[1] & 0x3f) << 12) + + ((s[2] & 0x3f) << 6 ) + + ((s[3] & 0x3f) << 0 ); + + assert( ch > 0xFFFF && ch <= 0x10ffff ); + break; + } + + if( aResult ) + { + *aResult = ch; + } + + return len; +} + + +UTF8::UTF8( const wchar_t* txt ) : + // size initial string safely large enough, then shrink to known size later. + std::string( wcslen( txt ) * 4, 0 ) +{ + /* + + "this" string was sized to hold the worst case UTF8 encoded byte + sequence, and was initialized with all nul bytes. Overwrite some of + those nuls, then resize, shrinking down to actual size. + + Use the wx 2.8 function, not new FromWChar(). It knows about wchar_t + possibly being 16 bits wide on Windows and holding UTF16 input. 
+ + */ + + int sz = wxConvUTF8.WC2MB( (char*) data(), txt, size() ); + + resize( sz ); +} + diff --git a/tools/make-UTF8.sh b/tools/make-UTF8.sh index 2e7e7510fc..8e57cf1ae2 100755 --- a/tools/make-UTF8.sh +++ b/tools/make-UTF8.sh @@ -1,5 +1,7 @@ + + WXCONFIG=wx-config -INCLUDE=/usr/include/wx-2.8 +#WXCONFIG=/opt/wx2.9/bin/wx-config -g++ -I $INCLUDE $($WXCONFIG --cppflags) UTF8.cpp -o test $($WXCONFIG --libs) +g++ -g $($WXCONFIG --cppflags) UTF8.cpp -o test $($WXCONFIG --libs)