Move "code costly" functions in experimental class UTF8 to be not "inlined", preferring compactness.

This commit is contained in:
Dick Hollenbeck 2013-12-06 14:22:10 -06:00
parent 4374e25219
commit 03a4f5c4ea
1 changed files with 85 additions and 45 deletions

View File

@ -2,68 +2,69 @@
#include <stdio.h> #include <stdio.h>
#include <string> #include <string>
#include <wx/string.h> #include <wx/string.h>
#include <stdint.h> #include <assert.h>
/** /**
* Class UTF8 * Class UTF8
* is an 8 bit std::string assuredly encoded in UTF8 that supplies special * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
* conversion support to and from wxString, and has iteration over * conversion support to and from wxString, and has iteration over unicode characters.
* UTF8 code points. *
* @author Dick Hollenbeck
*/ */
class UTF8 : public std::string class UTF8 : public std::string
{ {
public: public:
UTF8( const wxString& o ) : UTF8( const wxString& o );
std::string( (const char*) o.utf8_str() )
{
// @todo: should not be inline.
}
/// This is the only constructor for which you could end up with
/// non-UTF8 encoding, but that would be your fault.
UTF8( const char* txt ) : UTF8( const char* txt ) :
std::string( txt ) std::string( txt )
{ {
// ok inline
} }
explicit UTF8( const std::string& o ) : explicit UTF8( const std::string& o ) :
std::string( o ) std::string( o )
{ {
// ok inline
} }
UTF8() : UTF8() :
std::string() std::string()
{ {
// ok inline
} }
UTF8& operator = ( const wxString& o ) UTF8& operator=( const wxString& o );
UTF8& operator=( const std::string& o )
{ {
// @todo: should not be inline. std::string::operator=( o );
std::string::operator=( (const char*) o.utf8_str() );
return *this; return *this;
} }
UTF8& operator = ( const std::string& o ) operator wxString () const;
/// This one is not in std::string, and one wonders why... might be a solid
/// enough reason to remove it still.
operator char* () const
{ {
std::string::operator = ( o ); return (char*) c_str();
return *this;
} }
operator wxString () const /**
{ * Function uni_forward
// @todo: should not be inline. * advances over a UTF8 encoded multibyte character, capturing the unicode
return wxString( c_str(), wxConvUTF8 ); * character as it goes, and returning the number of bytes consumed.
} *
* @param aSequence is the UTF8 byte sequence.
static int uni_forward( unsigned char* it, uint32_t* result ) * @param aResult is where to put the unicode character.
*/
static int uni_forward( unsigned char* aSequence, unsigned* aResult )
{ {
// @todo: have this read UTF8 characters into result, not bytes. // @todo: have this read UTF8 characters into result, not bytes.
// What's here now is scaffolding, reading single byte characters only. // What's here now is scaffolding, reading single byte characters only.
*result = *it; *aResult = *aSequence;
return 1; return 1;
} }
@ -71,37 +72,40 @@ public:
* class uni_iter * class uni_iter
* is a non-mutable iterator that walks through code points in the UTF8 encoded * is a non-mutable iterator that walks through code points in the UTF8 encoded
* string. The normal ++(), ++(int), ->(), and *() operators are all supported and * string. The normal ++(), ++(int), ->(), and *() operators are all supported and
* they return a uint32_t holding the unicode character appropriate for respective * they return a unsigned holding the unicode character appropriate for respective
* operation. * operation.
*/ */
class uni_iter class uni_iter
{ {
friend class UTF8;
unsigned char* it; unsigned char* it;
public:
uni_iter( const char* start ) : uni_iter( const char* start ) :
it( (unsigned char*) start ) it( (unsigned char*) start )
{ {
assert( sizeof(unsigned) >= 4 );
} }
public:
/// pre-increment and return unicode at new position /// pre-increment and return unicode at new position
uint32_t operator++() unsigned operator++()
{ {
uint32_t result; unsigned result;
// advance, and toss the result // advance, and toss the result
it += uni_forward( it, &result ); it += uni_forward( it, &result );
// get the next result, but do not advance: // get the next result, but do not advance:
uni_forward( it, &result ); uni_forward( it, &result );
return result; return result;
} }
/// post-increment and return unicode at initial position /// post-increment and return unicode at initial position
uint32_t operator++( int ) unsigned operator++( int )
{ {
uint32_t result; unsigned result;
// grab the result and advance. // grab the result and advance.
it += uni_forward( it, &result ); it += uni_forward( it, &result );
@ -109,9 +113,9 @@ public:
} }
/// return unicode at current position /// return unicode at current position
uint32_t operator->() const unsigned operator->() const
{ {
uint32_t result; unsigned result;
// grab the result, do not advance // grab the result, do not advance
uni_forward( it, &result ); uni_forward( it, &result );
@ -119,9 +123,9 @@ public:
} }
/// return unicode at current position /// return unicode at current position
uint32_t operator*() const unsigned operator*() const
{ {
uint32_t result; unsigned result;
// grab the result, do not advance // grab the result, do not advance
uni_forward( it, &result ); uni_forward( it, &result );
@ -136,11 +140,19 @@ public:
bool operator>=( const uni_iter& other ) const { return it >= other.it; } bool operator>=( const uni_iter& other ) const { return it >= other.it; }
}; };
/**
* Function ubegin
* returns a @a uni_iter initialized to the start of this UTF8 byte sequence.
*/
uni_iter ubegin() const uni_iter ubegin() const
{ {
return uni_iter( data() ); return uni_iter( data() );
} }
/**
* Function uend
* returns a @a uni_iter initialized to the end of this UTF8 byte sequence.
*/
uni_iter uend() const uni_iter uend() const
{ {
return uni_iter( data() + size() ); return uni_iter( data() + size() );
@ -148,9 +160,11 @@ public:
}; };
wxString aFunctionTaking_wxString( const wxString& wx ) wxString wxFunctionTaking_wxString( const wxString& wx )
{ {
printf( "%s: '%s'\n", __func__, UTF8( wx ).c_str() ); printf( "%s:'%s'\n", __func__, (char*) UTF8( wx ) );
printf( "%s:'%s'\n", __func__, (const char*) UTF8( wx ) );
printf( "%s:'%s'\n", __func__, UTF8( wx ).c_str() );
return wx; return wx;
} }
@ -158,9 +172,11 @@ wxString aFunctionTaking_wxString( const wxString& wx )
int main() int main()
{ {
UTF8 u1 = "output";
std::string str = "input"; std::string str = "input";
wxString wx = wxT( "input" ); UTF8 u1 = "initial";
wxString wx = wxT( "input2" );
printf( "u1:'%s'\n", u1.c_str() );
u1 = str; u1 = str;
@ -170,25 +186,49 @@ int main()
u2 += 'X'; u2 += 'X';
printf( "utf2:'%s'\n", u2.c_str() ); printf( "u2:'%s'\n", u2.c_str() );
// key accomplishments here: // key accomplishments here:
// 1) passing a UTF8 to a function which normally takes a wxString. // 1) passing a UTF8 to a function which normally takes a wxString.
// 2) return a wxString back into a UTF8. // 2) return a wxString back into a UTF8.
UTF8 result = aFunctionTaking_wxString( u2 ); UTF8 result = wxFunctionTaking_wxString( u2 );
printf( "result:'%s'\n", result.c_str() ); printf( "result:'%s'\n", result.c_str() );
// test the unicode iterator: // test the unicode iterator:
for( UTF8::uni_iter it = u2.ubegin(); it != u2.uend(); ) for( UTF8::uni_iter it = u2.ubegin(); it != u2.uend(); )
{ {
// test post-increment:
printf( " _%c_", it++ ); printf( " _%c_", it++ );
// after UTF7::uni_forward() is implemented, it++ %c is no longer useable. // after UTF8::uni_forward() is implemented, %c is no longer useable.
// printf( " _%02x_", it++ ); // printf( " _%02x_", it++ );
} }
printf( "\n" ); printf( "\n" );
return 0; return 0;
} }
// These are to go into a library *.cpp; they are not inlined, which saves the
// code space otherwise spent creating the intermediate objects and referencing wxConvUTF8.
/// Constructs a UTF8 from a wxString by initializing the std::string base
/// from the char sequence obtained via o.utf8_str().
/// Defined out of line (not in the class body).
/// @param o is the wxString whose contents are copied in.
/// NOTE(review): utf8_str() presumably returns a temporary buffer object; the
/// cast and the std::string copy are kept in one expression so no pointer is
/// held past that temporary's lifetime -- confirm against the wxString docs.
UTF8::UTF8( const wxString& o ) :
std::string( (const char*) o.utf8_str() )
{
// nothing else to do, the base class initializer holds the copied bytes.
}
/// Conversion operator to wxString.
/// Builds and returns a new wxString from this object's c_str(), passing
/// wxConvUTF8 as the converter argument.
/// Defined out of line (not in the class body).
UTF8::operator wxString () const
{
return wxString( c_str(), wxConvUTF8 );
}
/// Assignment from a wxString.
/// Replaces this object's contents with the char sequence obtained via
/// o.utf8_str(), using the std::string base class assignment.
/// Defined out of line (not in the class body).
/// @param o is the wxString to assign from.
/// @return a reference to this object.
UTF8& UTF8::operator=( const wxString& o )
{
// NOTE(review): utf8_str() presumably yields a temporary buffer; it is
// consumed within this single statement -- confirm before splitting it up.
std::string::operator=( (const char*) o.utf8_str() );
return *this;
}