complete class UTF8.cpp

This commit is contained in:
Dick Hollenbeck 2013-12-08 00:48:25 -06:00
parent 2f327f068d
commit 5df7288678
3 changed files with 176 additions and 22 deletions

View File

@ -249,7 +249,8 @@ void STROKE_FONT::drawSingleLineText( const wxString& aText )
// (textSize.x) // (textSize.x)
xOffset = textSize.x; xOffset = textSize.x;
glyphSize.x = -m_glyphSize.x; glyphSize.x = -m_glyphSize.x;
} else }
else
{ {
xOffset = 0.0; xOffset = 0.0;
} }

View File

@ -10,6 +10,15 @@
* is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
* conversion support to and from wxString, and has iteration over unicode characters. * conversion support to and from wxString, and has iteration over unicode characters.
* *
* <p>I've been careful to supply only conversion facilities and not to try
* to duplicate wxString() with many member functions. In the end it is
* meant to be a std::string. There are multiple ways to put text into a
* std::string without the need of member functions, e.g. std::ostringstream.
*
* <p>Because this class uses no virtuals, it should be possible to cast any
* std::string into a UTF8 using this kind of cast: (UTF8 &), without construction
* or copying being the effect of the cast.
*
* @author Dick Hollenbeck * @author Dick Hollenbeck
*/ */
class UTF8 : public std::string class UTF8 : public std::string
@ -25,6 +34,9 @@ public:
{ {
} }
/// For use with _() function on wx 2.8:
UTF8( const wchar_t* txt );
explicit UTF8( const std::string& o ) : explicit UTF8( const std::string& o ) :
std::string( o ) std::string( o )
{ {
@ -54,25 +66,20 @@ public:
/** /**
* Function uni_forward * Function uni_forward
* advances over a UTF8 encoded multibyte character, capturing the unicode * advances over a single UTF8 encoded multibyte character, capturing the
* character as it goes, and returning the number of bytes consumed. * unicode character as it goes, and returning the number of bytes consumed.
* *
* @param aSequence is the UTF8 byte sequence. * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
* @param aResult is where to put the unicode character. * @param aResult is where to put the unicode character, and may be NULL if no interest.
* @return int - the count of bytes consumed.
*/ */
static int uni_forward( unsigned char* aSequence, unsigned* aResult ) static int uni_forward( unsigned char* aSequence, unsigned* aResult = NULL );
{
// @todo: have this read UTF8 characters into result, not bytes.
// What's here now is scaffolding, reading single byte characters only.
*aResult = *aSequence;
return 1;
}
/** /**
* class uni_iter * class uni_iter
* is a non-mutable iterator that walks through code points in the UTF8 encoded * is a non-mutable iterator that walks through code points in the UTF8 encoded
* string. The normal ++(), ++(int), ->(), and *() operators are all supported and * string. The normal ++(), ++(int), ->(), and *() operators are all supported and
* they return a unsigned holding the unicode character appropriate for respective * they return an unsigned holding the unicode character appropriate for respective
* operation. * operation.
*/ */
class uni_iter class uni_iter
@ -81,10 +88,11 @@ public:
unsigned char* it; unsigned char* it;
// private constructor.
uni_iter( const char* start ) : uni_iter( const char* start ) :
it( (unsigned char*) start ) it( (unsigned char*) start )
{ {
assert( sizeof(unsigned) >= 4 ); // for the human: assert( sizeof(unsigned) >= 4 );
} }
public: public:
@ -94,10 +102,10 @@ public:
{ {
unsigned result; unsigned result;
// advance, and toss the result // advance over current, and toss the unicode result
it += uni_forward( it, &result ); it += uni_forward( it );
// get the next result, but do not advance: // get the next unicode result, but do not advance:
uni_forward( it, &result ); uni_forward( it, &result );
return result; return result;
} }
@ -173,15 +181,21 @@ wxString wxFunctionTaking_wxString( const wxString& wx )
int main() int main()
{ {
std::string str = "input"; std::string str = "input";
UTF8 u0 = L"wide string";
UTF8 u1 = "initial"; UTF8 u1 = "initial";
wxString wx = wxT( "input2" ); wxString wx = wxT( "input2" );
printf( "u0:'%s'\n", u0.c_str() );
printf( "u1:'%s'\n", u1.c_str() ); printf( "u1:'%s'\n", u1.c_str() );
u1 = str; u1 = str;
wxString wx2 = u1; wxString wx2 = u1;
// force a std::string into a UTF8, then into a wxString, then copy construct:
wxString wx3 = (UTF8&) u1;
UTF8 u2 = wx2; UTF8 u2 = wx2;
u2 += 'X'; u2 += 'X';
@ -196,7 +210,7 @@ int main()
printf( "result:'%s'\n", result.c_str() ); printf( "result:'%s'\n", result.c_str() );
// test the unicode iterator: // test the unicode iterator:
for( UTF8::uni_iter it = u2.ubegin(); it != u2.uend(); ) for( UTF8::uni_iter it = u2.ubegin(); it < u2.uend(); )
{ {
// test post-increment: // test post-increment:
printf( " _%c_", it++ ); printf( " _%c_", it++ );
@ -211,8 +225,13 @@ int main()
} }
// These to go into a library *.cpp, they are not inlined so that code space /*
// is saved creating the intermediate objects and referencing wxConvUTF8.
These to go into a library *.cpp, they are not inlined so that significant
code space is saved by encapsulating the creation of intermediate objects
and referencing wxConvUTF8.
*/
UTF8::UTF8( const wxString& o ) : UTF8::UTF8( const wxString& o ) :
@ -232,3 +251,135 @@ UTF8& UTF8::operator=( const wxString& o )
std::string::operator=( (const char*) o.utf8_str() ); std::string::operator=( (const char*) o.utf8_str() );
return *this; return *this;
} }
// Map a UTF8 lead byte in the range 0x80-0xFF to the length in bytes of the
// sequence it starts.  Zero means illegal prefix.  See RFC 3629 for details.
//
// Only the top half of the byte range is stored, so the table holds 128
// entries and is indexed with (byte - 0x80).  The bottom half, shown commented
// out below for reference, would be all ones: bytes 0x00-0x7F are single byte
// characters.
static const unsigned char utf8_len[128] = {
/*
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     // 00-0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     // 70-7F
*/
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // 80-8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     // B0-BF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     // C0-C1 + C2-CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     // D0-DF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     // E0-EF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0      // F0-F4 + F5-FF
};
// Fallback used only when nothing earlier has defined THROW_IO_ERROR(): the
// macro then expands to nothing, so the error being reported is ignored.
#if !defined( THROW_IO_ERROR )
#define THROW_IO_ERROR( msg )
#endif
// There is no wxWidgets function that does this, because wchar_t is 16 bits
// on windows and wx wants to encode the output in UTF16 for such.

/*
    Decode the single UTF8 encoded character starting at aSequence.

    aSequence must point at the first byte of a character.  aResult, when not
    NULL, receives the decoded unicode character.  The return value is the
    count of bytes consumed.

    A malformed sequence is reported through THROW_IO_ERROR().  Should that
    macro not throw (this file defines it to expand to nothing when nobody
    else defined it), only the offending lead byte is consumed and U+FFFD,
    the unicode replacement character, is reported.  The function therefore
    never returns zero, so a caller advancing by the returned count always
    makes progress, and it never steps over a byte that failed validation,
    such as the nul terminating a truncated sequence.
*/
int UTF8::uni_forward( unsigned char* aSequence, unsigned* aResult )
{
    unsigned ch = *aSequence;

    if( ch < 0x80 )     // single byte character, nothing to decode
    {
        if( aResult )
            *aResult = ch;

        return 1;
    }

    unsigned char* s = aSequence;

    int  len = utf8_len[ *s - 0x80  /* top half of table is missing */ ];

    // Only ever acted upon when THROW_IO_ERROR() returns instead of throwing.
    bool malformed = false;

    switch( len )
    {
    default:
    case 0:
        THROW_IO_ERROR( "invalid start byte" );
        malformed = true;
        break;

    case 2:
        if( ( s[1] & 0xc0 ) != 0x80 )
        {
            THROW_IO_ERROR( "invalid continuation byte" );
            malformed = true;
            break;
        }

        ch =    ((s[0] & 0x1f) << 6) +
                ((s[1] & 0x3f) << 0);

        assert( ch > 0x007F && ch <= 0x07FF );
        break;

    case 3:
        // The tests run left to right and stop at the first failure, so a
        // byte is only examined once all bytes before it proved to be
        // continuation bytes.  The last test rejects overlong encodings.
        if( (s[1] & 0xc0) != 0x80 ||
            (s[2] & 0xc0) != 0x80 ||
            (s[0] == 0xE0 && s[1] < 0xA0)
            // || (s[0] == 0xED && s[1] > 0x9F)
            // NOTE(review): the test above would reject UTF16 surrogates; it
            // was already disabled in the original code and is left that way.
            )
        {
            THROW_IO_ERROR( "invalid continuation byte" );
            malformed = true;
            break;
        }

        ch =    ((s[0] & 0x0f) << 12) +
                ((s[1] & 0x3f) << 6 ) +
                ((s[2] & 0x3f) << 0 );

        assert( ch > 0x07FF && ch <= 0xFFFF );
        break;

    case 4:
        // Same left to right discipline as above; the last two tests reject
        // overlong encodings and values above 0x10FFFF respectively.
        if( (s[1] & 0xc0) != 0x80 ||
            (s[2] & 0xc0) != 0x80 ||
            (s[3] & 0xc0) != 0x80 ||
            (s[0] == 0xF0 && s[1] < 0x90) ||
            (s[0] == 0xF4 && s[1] > 0x8F) )
        {
            THROW_IO_ERROR( "invalid continuation byte" );
            malformed = true;
            break;
        }

        ch =    ((s[0] & 0x7)  << 18) +
                ((s[1] & 0x3f) << 12) +
                ((s[2] & 0x3f) << 6 ) +
                ((s[3] & 0x3f) << 0 );

        assert( ch > 0xFFFF && ch <= 0x10ffff );
        break;
    }

    if( malformed )
    {
        // Resynchronize on the very next byte and flag the damage.
        ch  = 0xFFFD;
        len = 1;
    }

    if( aResult )
        *aResult = ch;

    return len;
}
UTF8::UTF8( const wchar_t* txt ) :
    // Size the initial string safely large enough (4 UTF8 bytes per wchar_t),
    // then shrink to known size later.  A NULL txt gives an empty string.
    std::string( txt ? wcslen( txt ) * 4 : 0, 0 )
{
    /*
        "this" string was sized to hold the worst case UTF8 encoded byte
        sequence, and was initialized with all nul bytes.  Overwrite some of
        those nuls, then resize, shrinking down to actual size.

        Use the wx 2.8 function, not new FromWChar().  It knows about wchar_t
        possibly being 16 bits wide on Windows and holding UTF16 input.
    */

    if( empty() )
        return;     // NULL or empty input: nothing to convert

    size_t sz = wxConvUTF8.WC2MB( (char*) data(), txt, size() );

    // WC2MB() reports failure as (size_t) -1.  Handing that to resize() would
    // request an absurd length, so a failed conversion gives an empty string.
    if( sz == (size_t) -1 )
        sz = 0;

    resize( sz );
}

View File

@ -1,5 +1,7 @@
WXCONFIG=wx-config WXCONFIG=wx-config
INCLUDE=/usr/include/wx-2.8 #WXCONFIG=/opt/wx2.9/bin/wx-config
g++ -I $INCLUDE $($WXCONFIG --cppflags) UTF8.cpp -o test $($WXCONFIG --libs) g++ -g $($WXCONFIG --cppflags) UTF8.cpp -o test $($WXCONFIG --libs)