complete class UTF8.cpp

2013-12-08 00:48:25 -06:00 · 2013-12-08 00:48:25 -06:00 · 5df7288678
parent 2f327f068d
commit 5df7288678
3 changed files with 176 additions and 22 deletions
--- a/common/gal/stroke_font.cpp
+++ b/common/gal/stroke_font.cpp
@ -249,7 +249,8 @@ void STROKE_FONT::drawSingleLineText( const wxString& aText )
        // (textSize.x)
        xOffset = textSize.x;
        glyphSize.x = -m_glyphSize.x;
-    } else
+    }
    else
    {
        xOffset = 0.0;
    }
--- a/tools/UTF8.cpp
+++ b/tools/UTF8.cpp
@ -10,6 +10,15 @@
 * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
 * conversion support to and from wxString, and has iteration over unicode characters.
 *
 * <p>I've been careful to supply only conversion facilities and not try
 * to duplicate wxString() with many member functions.  In the end it is
 * to be a std::string.  There are multiple ways to create text into a std::string
 * without the need of member functions, for example std::ostringstream.
 *
 * <p>Because this class uses no virtuals, it should be possible to cast any
 * std::string into a UTF8 using this kind of cast: (UTF8 &), without the cast
 * causing any construction or copying.
 *
 * @author Dick Hollenbeck
 */
 class UTF8 : public std::string
@ -25,6 +34,9 @@ public:
    {
    }
    /// For use with _() function on wx 2.8:
    UTF8( const wchar_t* txt );
    explicit UTF8( const std::string& o ) :
        std::string( o )
    {
@ -54,25 +66,20 @@ public:
    /**
     * Function uni_forward
-     * advances over a UTF8 encoded multibyte character, capturing the unicode
+     * advances over a single UTF8 encoded multibyte character, capturing the
-     * character as it goes, and returning the number of bytes consumed.
+     * unicode character as it goes, and returning the number of bytes consumed.
     *
-     * @param aSequence is the UTF8 byte sequence.
+     * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
-     * @param aResult is where to put the unicode character.
+     * @param aResult is where to put the unicode character, and may be NULL if no interest.
     * @return int - the count of bytes consumed.
     */
-    static int uni_forward( unsigned char* aSequence, unsigned* aResult )
+    static int uni_forward( unsigned char* aSequence, unsigned* aResult = NULL );
    {
        // @todo: have this read UTF8 characters into result, not bytes.
        // What's here now is scaffolding, reading single byte characters only.
        *aResult = *aSequence;
        return 1;
    }
    /**
     * class uni_iter
     * is a non-mutable iterator that walks through code points in the UTF8 encoded
     * string.  The normal ++(), ++(int), ->(), and *() operators are all supported and
-     * they return a unsigned holding the unicode character appropriate for respective
+     * they return an unsigned holding the unicode character appropriate for respective
     * operation.
     */
    class uni_iter
@ -81,10 +88,11 @@ public:
        unsigned char* it;
        // private constructor.
        uni_iter( const char* start ) :
            it( (unsigned char*) start )
        {
-            assert( sizeof(unsigned) >= 4 );
+            // for the human: assert( sizeof(unsigned) >= 4 );
        }
    public:
@ -94,10 +102,10 @@ public:
        {
            unsigned    result;
-            // advance, and toss the result
+            // advance over current, and toss the unicode result
-            it += uni_forward( it, &result );
+            it += uni_forward( it );
-            // get the next result, but do not advance:
+            // get the next unicode result, but do not advance:
            uni_forward( it, &result );
            return result;
        }
@ -173,15 +181,21 @@ wxString wxFunctionTaking_wxString( const wxString& wx )
 int main()
 {
    std::string str = "input";
    UTF8        u0 = L"wide string";
    UTF8        u1 = "initial";
    wxString    wx = wxT( "input2" );
    printf( "u0:'%s'\n", u0.c_str() );
    printf( "u1:'%s'\n", u1.c_str() );
    u1 = str;
    wxString    wx2 = u1;
    // force a std::string into a UTF8, then into a wxString, then copy construct:
    wxString    wx3 = (UTF8&) u1;
    UTF8        u2 = wx2;
    u2 += 'X';
@ -196,7 +210,7 @@ int main()
    printf( "result:'%s'\n", result.c_str() );
    // test the unicode iterator:
-    for( UTF8::uni_iter it = u2.ubegin();  it != u2.uend();  )
+    for( UTF8::uni_iter it = u2.ubegin();  it < u2.uend();  )
    {
        // test post-increment:
        printf( " _%c_", it++ );
@ -211,8 +225,13 @@ int main()
 }
-// These to go into a library *.cpp, they are not inlined so that code space
+/*
-// is saved creating the intermediate objects and referencing wxConvUTF8.
+
    These to go into a library *.cpp, they are not inlined so that significant
    code space is saved by encapsulating the creation of intermediate objects
    and referencing wxConvUTF8.
 */
 UTF8::UTF8( const wxString& o ) :
@ -232,3 +251,135 @@ UTF8& UTF8::operator=( const wxString& o )
    std::string::operator=( (const char*) o.utf8_str() );
    return *this;
 }
 /**
  * Maps a UTF8 prefix (lead) byte to the length in bytes of the sequence it
  * starts.  Zero means the byte is not a legal prefix.  See RFC 3629 for details.
  *
  * Only the upper half of the byte range (0x80-0xFF) is stored, so the table
  * must be indexed with (byte - 0x80).  Bytes 0x00-0x7F are single byte
  * characters of length 1 and are expected to be handled before any lookup.
  */
 static const unsigned char utf8_len[128] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F  continuation bytes
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90-9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0-AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 (illegal) + C2-CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F0-F4 + F5-FF (illegal)
 };
 // Fallback used only when THROW_IO_ERROR has not already been defined: the
 // macro then expands to nothing, so the error sites in this file that invoke
 // it do not throw and execution continues past them.
 #ifndef THROW_IO_ERROR
 #define THROW_IO_ERROR(x)      // nothing
 #endif
 // There is no wxWidgets function that does this, because wchar_t is 16 bits
 // on windows and wx wants to encode the output in UTF16 for such.
 /**
  * Function uni_forward
  * decodes the single UTF8 encoded character that starts at aSequence.
  *
  * Malformed input (an illegal start byte, a bad continuation byte, or an
  * overlong / out of range lead + continuation pair) is reported through
  * THROW_IO_ERROR.  When that macro is compiled out and therefore does not
  * throw, the function consumes exactly one byte and reports that raw byte
  * as the result, so that callers always make forward progress and no byte
  * beyond the first offending one is ever read.
  *
  * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
  * @param aResult is where to put the unicode character, and may be NULL if no interest.
  * @return int - the count of bytes consumed, always at least 1.
  */
 int UTF8::uni_forward( unsigned char* aSequence, unsigned* aResult )
 {
    unsigned ch = *aSequence;

    // Single byte (ASCII) characters need no table lookup.
    if( ch < 0x80 )
    {
        if( aResult )
            *aResult = ch;

        return 1;
    }

    const unsigned char* s = aSequence;

    int  len   = utf8_len[ *s - 0x80  /* top half of table is missing */ ];
    bool valid = false;

    switch( len )
    {
    default:
    case 0:
        THROW_IO_ERROR( "invalid start byte" );
        break;

    case 2:
        if( ( s[1] & 0xc0 ) != 0x80 )
        {
            THROW_IO_ERROR( "invalid continuation byte" );
            break;      // reached only if THROW_IO_ERROR does not throw
        }

        ch =    ((s[0] & 0x1f) << 6) +
                ((s[1] & 0x3f) << 0);

        assert( ch > 0x007F && ch <= 0x07FF );
        valid = true;
        break;

    case 3:
        // The || chain short-circuits, so s[2] is only examined once s[1]
        // is known to be a continuation byte (and hence not the terminator).
        if( (s[1] & 0xc0) != 0x80 ||
            (s[2] & 0xc0) != 0x80 ||
            (s[0] == 0xE0 && s[1] < 0xA0)       // overlong encoding
            // || (s[0] == 0xED && s[1] > 0x9F)
        )
        {
            THROW_IO_ERROR( "invalid continuation byte" );
            break;      // reached only if THROW_IO_ERROR does not throw
        }

        ch =    ((s[0] & 0x0f) << 12) +
                ((s[1] & 0x3f) << 6 ) +
                ((s[2] & 0x3f) << 0 );

        assert( ch > 0x07FF && ch <= 0xFFFF );
        valid = true;
        break;

    case 4:
        if( (s[1] & 0xc0) != 0x80 ||
            (s[2] & 0xc0) != 0x80 ||
            (s[3] & 0xc0) != 0x80 ||
            (s[0] == 0xF0 && s[1] < 0x90) ||    // overlong encoding
            (s[0] == 0xF4 && s[1] > 0x8F) )     // beyond U+10FFFF
        {
            THROW_IO_ERROR( "invalid continuation byte" );
            break;      // reached only if THROW_IO_ERROR does not throw
        }

        ch =    ((s[0] & 0x7)  << 18) +
                ((s[1] & 0x3f) << 12) +
                ((s[2] & 0x3f) << 6 ) +
                ((s[3] & 0x3f) << 0 );

        assert( ch > 0xFFFF && ch <= 0x10ffff );
        valid = true;
        break;
    }

    if( !valid )
    {
        // ch still holds the raw lead byte.  Step over just that byte: a
        // return of zero would stall any caller doing "it += uni_forward( it )",
        // and a return of the nominal length could skip past the terminator.
        len = 1;
    }

    if( aResult )
    {
        *aResult = ch;
    }

    return len;
 }
 /**
  * Constructor UTF8( const wchar_t* )
  * builds the UTF8 encoded string from a wide character string.
  *
  * @param txt is the nul terminated wide string to convert.  A NULL pointer
  *  or an empty string yields an empty UTF8.  If the conversion fails the
  *  result is also empty.
  */
 UTF8::UTF8( const wchar_t* txt ) :
    // size initial string safely large enough, then shrink to known size later.
    std::string( txt ? wcslen( txt ) * 4 : 0, '\0' )
 {
    /*
        "this" string was sized to hold the worst case UTF8 encoded byte
        sequence, and was initialized with all nul bytes. Overwrite some of
        those nuls, then resize, shrinking down to actual size.

        Use the wx 2.8 function, not new FromWChar(). It knows about wchar_t
        possibly being 16 bits wide on Windows and holding UTF16 input.
    */

    // Nothing to convert; also avoids taking the address of element 0 of an
    // empty string.
    if( empty() )
        return;

    // Write through the mutable element access rather than casting away the
    // constness of data().
    size_t sz = wxConvUTF8.WC2MB( &(*this)[0], txt, size() );

    // WC2MB() reports failure as (size_t) -1.  Passing that on to resize()
    // would request an impossibly large string, so fall back to empty.
    if( sz == static_cast<size_t>( -1 ) )
        sz = 0;

    resize( sz );
 }
--- a/tools/make-UTF8.sh
+++ b/tools/make-UTF8.sh
@ -1,5 +1,7 @@
 WXCONFIG=wx-config
-INCLUDE=/usr/include/wx-2.8
+#WXCONFIG=/opt/wx2.9/bin/wx-config
-g++ -I $INCLUDE $($WXCONFIG --cppflags) UTF8.cpp -o test  $($WXCONFIG --libs)
+g++ -g $($WXCONFIG --cppflags) UTF8.cpp -o test  $($WXCONFIG --libs)