From 5df728867874534e97798a580b4cbe72e5bd2686 Mon Sep 17 00:00:00 2001
From: Dick Hollenbeck <dick@softplc.com>
Date: Sun, 8 Dec 2013 00:48:25 -0600
Subject: [PATCH] complete class UTF8.cpp

---
 common/gal/stroke_font.cpp |   3 +-
 tools/UTF8.cpp             | 189 +++++++++++++++++++++++++++++++++----
 tools/make-UTF8.sh         |   6 +-
 3 files changed, 176 insertions(+), 22 deletions(-)
diff --git a/common/gal/stroke_font.cpp b/common/gal/stroke_font.cpp
index 1566d1820d..ab83eba22e 100644
--- a/common/gal/stroke_font.cpp
+++ b/common/gal/stroke_font.cpp
@@ -249,7 +249,8 @@ void STROKE_FONT::drawSingleLineText( const wxString& aText )
         // (textSize.x)
         xOffset = textSize.x;
         glyphSize.x = -m_glyphSize.x;
-    } else
+    }
+    else
     {
         xOffset = 0.0;
     }
diff --git a/tools/UTF8.cpp b/tools/UTF8.cpp
index 0fd5fb65d5..c9d31dea5d 100644
--- a/tools/UTF8.cpp
+++ b/tools/UTF8.cpp
@@ -10,6 +10,15 @@
  * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
  * conversion support to and from wxString, and has iteration over unicode characters.
  *
+ * <p>I've been careful to supply only conversion facilities and not try
+ * to duplicate wxString() with many member functions.  In the end it is
+ * to be a std::string.  There are multiple ways to build text into a std::string
+ * without the need of member functions, e.g. std::ostringstream.
+ *
+ * <p>Because this class uses no virtuals, it should be possible to cast any
+ * std::string into a UTF8 using this kind of cast: (UTF8 &) without construction
+ * or copying being the effect of the cast.
+ *
  * @author Dick Hollenbeck
  */
 class UTF8 : public std::string
@@ -25,6 +34,9 @@ public:
     {
     }
 
+    /// For use with _() function on wx 2.8:
+    UTF8( const wchar_t* txt );
+
     explicit UTF8( const std::string& o ) :
         std::string( o )
     {
@@ -54,25 +66,20 @@ public:
 
     /**
      * Function uni_forward
-     * advances over a UTF8 encoded multibyte character, capturing the unicode
-     * character as it goes, and returning the number of bytes consumed.
+     * advances over a single UTF8 encoded multibyte character, capturing the
+     * unicode character as it goes, and returning the number of bytes consumed.
      *
-     * @param aSequence is the UTF8 byte sequence.
-     * @param aResult is where to put the unicode character.
+     * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
+     * @param aResult is where to put the unicode character, and may be NULL if no interest.
+     * @return int - the count of bytes consumed.
      */
-    static int uni_forward( unsigned char* aSequence, unsigned* aResult )
-    {
-        // @todo: have this read UTF8 characters into result, not bytes.
-        // What's here now is scaffolding, reading single byte characters only.
-        *aResult = *aSequence;
-        return 1;
-    }
+    static int uni_forward( unsigned char* aSequence, unsigned* aResult = NULL );
 
     /**
      * class uni_iter
      * is a non-mutable iterator that walks through code points in the UTF8 encoded
      * string.  The normal ++(), ++(int), ->(), and *() operators are all supported and
-     * they return a unsigned holding the unicode character appropriate for respective
+     * they return an unsigned holding the unicode character appropriate for respective
      * operation.
      */
     class uni_iter
@@ -81,10 +88,11 @@ public:
 
         unsigned char* it;
 
+        // private constructor.
         uni_iter( const char* start ) :
             it( (unsigned char*) start )
         {
-            assert( sizeof(unsigned) >= 4 );
+            // for the human: assert( sizeof(unsigned) >= 4 );
         }
 
     public:
@@ -94,10 +102,10 @@ public:
         {
             unsigned    result;
 
-            // advance, and toss the result
-            it += uni_forward( it, &result );
+            // advance over current, and toss the unicode result
+            it += uni_forward( it );
 
-            // get the next result, but do not advance:
+            // get the next unicode result, but do not advance:
             uni_forward( it, &result );
             return result;
         }
@@ -173,15 +181,21 @@ wxString wxFunctionTaking_wxString( const wxString& wx )
 int main()
 {
     std::string str = "input";
+
+    UTF8        u0 = L"wide string";
     UTF8        u1 = "initial";
     wxString    wx = wxT( "input2" );
 
+    printf( "u0:'%s'\n", u0.c_str() );
     printf( "u1:'%s'\n", u1.c_str() );
 
     u1 = str;
 
     wxString    wx2 = u1;
 
+    // force a std::string into a UTF8, then into a wxString, then copy construct:
+    wxString    wx3 = (UTF8&) u1;
+
     UTF8        u2 = wx2;
 
     u2 += 'X';
@@ -196,7 +210,7 @@ int main()
     printf( "result:'%s'\n", result.c_str() );
 
     // test the unicode iterator:
-    for( UTF8::uni_iter it = u2.ubegin();  it != u2.uend();  )
+    for( UTF8::uni_iter it = u2.ubegin();  it < u2.uend();  )
     {
         // test post-increment:
         printf( " _%c_", it++ );
@@ -211,8 +225,13 @@ int main()
 }
 
 
-// These to go into a library *.cpp, they are not inlined so that code space
-// is saved creating the intermediate objects and referencing wxConvUTF8.
+/*
+
+    These are to go into a library *.cpp; they are not inlined so that significant
+    code space is saved by encapsulating the creation of intermediate objects
+    and referencing wxConvUTF8.
+
+*/
 
 
 UTF8::UTF8( const wxString& o ) :
@@ -232,3 +251,135 @@ UTF8& UTF8::operator=( const wxString& o )
     std::string::operator=( (const char*) o.utf8_str() );
     return *this;
 }
+
+
+static const unsigned char utf8_len[128] = {
+    // Map UTF8 lead byte to sequence length, indexed by (byte - 0x80).  Zero
+    // means illegal lead byte.  See RFC 3629 for details
+    /*
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
+    */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
+    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F0-F4 + F5-FF
+};
+
+
+#ifndef THROW_IO_ERROR
+ #define THROW_IO_ERROR(x)      // fallback: expands to nothing if not already defined
+#endif
+
+// There is no wxWidgets function that does this, because wchar_t is 16 bits
+// on windows and wx wants to encode the output in UTF16 for such.
+
+// Decodes one UTF8 sequence at aSequence; stores the code point in *aResult
+// (when non-NULL) and returns the byte count consumed, which is always >= 1.
+int UTF8::uni_forward( unsigned char* aSequence, unsigned* aResult )
+{
+    unsigned ch = *aSequence;
+
+    if( ch < 0x80 )
+    {
+        if( aResult )
+            *aResult = ch;
+        return 1;
+    }
+
+    unsigned char* s = aSequence;
+
+    int len = utf8_len[ *s - 0x80  /* top half of table is missing */ ];
+
+    switch( len )
+    {
+    default:    // len == 0 marks an illegal lead byte
+        THROW_IO_ERROR( "invalid start byte" );
+        len = 1;    // no-throw builds: skip the bad byte so callers still advance
+        break;
+
+    case 2:
+        if( ( s[1] & 0xc0 ) != 0x80 )
+        {
+            THROW_IO_ERROR( "invalid continuation byte" );
+        }
+
+        ch =    ((s[0] & 0x1f) << 6) +
+                ((s[1] & 0x3f) << 0);
+
+        assert( ch > 0x007F && ch <= 0x07FF );
+        break;
+
+    case 3:
+        if( (s[1] & 0xc0) != 0x80 ||
+            (s[2] & 0xc0) != 0x80 ||
+            (s[0] == 0xE0 && s[1] < 0xA0)
+            // || (s[0] == 0xED && s[1] > 0x9F)
+        )
+        {
+            THROW_IO_ERROR( "invalid continuation byte" );
+        }
+
+        ch =    ((s[0] & 0x0f) << 12) +
+                ((s[1] & 0x3f) << 6 ) +
+                ((s[2] & 0x3f) << 0 );
+
+        assert( ch > 0x07FF && ch <= 0xFFFF );
+        break;
+
+    case 4:
+        if( (s[1] & 0xc0) != 0x80 ||
+            (s[2] & 0xc0) != 0x80 ||
+            (s[3] & 0xc0) != 0x80 ||
+            (s[0] == 0xF0 && s[1] < 0x90) ||
+            (s[0] == 0xF4 && s[1] > 0x8F) )
+        {
+            THROW_IO_ERROR( "invalid continuation byte" );
+        }
+
+        ch =    ((s[0] & 0x7)  << 18) +
+                ((s[1] & 0x3f) << 12) +
+                ((s[2] & 0x3f) << 6 ) +
+                ((s[3] & 0x3f) << 0 );
+
+        assert( ch > 0xFFFF && ch <= 0x10ffff );
+        break;
+    }
+
+    if( aResult )
+        *aResult = ch;
+
+    return len;
+}
+
+
+UTF8::UTF8( const wchar_t* txt ) :
+    // size initial string safely large enough, then shrink to known size later.
+    // Worst case is 4 UTF8 bytes per wchar_t, plus 1 for the trailing nul.
+    std::string( wcslen( txt ) * 4 + 1, 0 )
+{
+    /*
+        "this" string was sized to hold the worst case UTF8 encoded byte
+        sequence, and was initialized with all nul bytes. Overwrite some of
+        those nuls, then resize, shrinking down to actual size.
+
+        Use the wx 2.8 function, not new FromWChar(). It knows about wchar_t
+        possibly being 16 bits wide on Windows and holding UTF16 input.
+    */
+
+    // WC2MB() returns (size_t) -1 on failure; never feed that to resize().
+    size_t sz = wxConvUTF8.WC2MB( &(*this)[0], txt, size() );
+
+    resize( sz == (size_t) -1 ? 0 : sz );
+}
+
diff --git a/tools/make-UTF8.sh b/tools/make-UTF8.sh
index 2e7e7510fc..8e57cf1ae2 100755
--- a/tools/make-UTF8.sh
+++ b/tools/make-UTF8.sh
@@ -1,5 +1,7 @@
+
+
 WXCONFIG=wx-config
-INCLUDE=/usr/include/wx-2.8
+#WXCONFIG=/opt/wx2.9/bin/wx-config
 
-g++ -I $INCLUDE $($WXCONFIG --cppflags) UTF8.cpp -o test  $($WXCONFIG --libs)
+g++ -g $($WXCONFIG --cppflags) UTF8.cpp -o test  $($WXCONFIG --libs)