2013-12-05 20:36:18 +00:00
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string>
|
|
|
|
#include <wx/string.h>
|
2013-12-06 20:22:10 +00:00
|
|
|
#include <assert.h>
|
2013-12-05 20:36:18 +00:00
|
|
|
|
|
|
|
|
|
|
|
/**
 * Class UTF8
 * is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
 * conversion support to and from wxString, and has iteration over unicode characters.
 *
 * Iteration is done with the nested uni_iter class, obtained from ubegin() and
 * uend(), which yields one unicode code point per step rather than one byte.
 *
 * @author Dick Hollenbeck
 */
class UTF8 : public std::string
{
public:

    /// Construct from a wxString; defined out of line.
    UTF8( const wxString& o );

    /// This is the only constructor for which you could end up with
    /// non-UTF8 encoding, but that would be your fault.
    UTF8( const char* txt ) :
        std::string( txt )
    {
    }

    /// Construct from a std::string whose bytes are taken as they are.
    explicit UTF8( const std::string& o ) :
        std::string( o )
    {
    }

    /// Construct an empty string.
    UTF8() :
        std::string()
    {
    }

    /// Assign from a wxString; defined out of line.
    UTF8& operator=( const wxString& o );

    /// Assign from a std::string whose bytes are taken as they are.
    UTF8& operator=( const std::string& o )
    {
        std::string::operator=( o );
        return *this;
    }

    /// Convert to a wxString; defined out of line.
    operator wxString () const;

    /// This one is not in std::string, and one wonders why... might be a solid
    /// enough reason to remove it still.
    /// The returned pointer is the c_str() buffer with its const qualifier removed.
    operator char* () const
    {
        return const_cast<char*>( c_str() );
    }

    /**
     * Function uni_forward
     * advances over a UTF8 encoded multibyte character, capturing the unicode
     * character as it goes, and returning the number of bytes consumed.
     *
     * Sequences of one to four bytes are decoded.  A malformed sequence (a stray
     * continuation byte, an invalid lead byte, a truncated sequence, an overlong
     * encoding, a surrogate, or a value above U+10FFFF) produces U+FFFD and
     * consumes exactly one byte, so a caller that keeps adding the return value
     * always makes progress.
     *
     * A byte after the lead byte is only read once every byte before it has been
     * seen to be non zero, so on a nul terminated buffer neither the reads nor the
     * returned length ever go past the terminator.
     *
     * @param aSequence is the UTF8 byte sequence, which must be nul terminated.
     * @param aResult is where to put the unicode character, and may be NULL if
     *  only the length is of interest.
     * @return int - the number of bytes consumed, always in the range 1 to 4.
     */
    static int uni_forward( const unsigned char* aSequence, unsigned* aResult )
    {
        const unsigned REPLACEMENT = 0xFFFD;

        const unsigned lead = aSequence[0];

        int         len      = 1;       // bytes in this sequence
        unsigned    ch       = lead;    // code point being assembled
        unsigned    smallest = 0;       // least value that legitimately needs 'len' bytes
        bool        valid    = true;

        if( lead >= 0x80 )
        {
            if( ( lead & 0xE0 ) == 0xC0 )           // 110xxxxx
            {
                len      = 2;
                ch       = lead & 0x1F;
                smallest = 0x80;
            }
            else if( ( lead & 0xF0 ) == 0xE0 )      // 1110xxxx
            {
                len      = 3;
                ch       = lead & 0x0F;
                smallest = 0x800;
            }
            else if( ( lead & 0xF8 ) == 0xF0 )      // 11110xxx
            {
                len      = 4;
                ch       = lead & 0x07;
                smallest = 0x10000;
            }
            else
            {
                // a continuation byte in lead position, or 0xF8..0xFF
                valid = false;
            }

            for( int i = 1;  valid && i < len;  ++i )
            {
                const unsigned next = aSequence[i];

                if( ( next & 0xC0 ) == 0x80 )       // 10xxxxxx
                    ch = ( ch << 6 ) | ( next & 0x3F );
                else
                    valid = false;                  // truncated, includes hitting the nul
            }

            // reject overlong forms, surrogates, and anything beyond unicode's range
            if( valid && ( ch < smallest || ch > 0x10FFFF || ( ch >= 0xD800 && ch <= 0xDFFF ) ) )
                valid = false;

            if( !valid )
            {
                ch  = REPLACEMENT;
                len = 1;
            }
        }

        if( aResult )
            *aResult = ch;

        return len;
    }

    /**
     * class uni_iter
     * is a non-mutable iterator that walks through code points in the UTF8 encoded
     * string.  The normal ++(), ++(int), ->(), and *() operators are all supported and
     * they return an unsigned holding the unicode character appropriate for respective
     * operation.
     *
     * Instances can only be made by UTF8, see ubegin() and uend().
     */
    class uni_iter
    {
        friend class UTF8;

        const unsigned char* it;    ///< first byte of the current character

        uni_iter( const char* start ) :
            it( reinterpret_cast<const unsigned char*>( start ) )
        {
            // an unsigned has to be able to hold any unicode code point
            assert( sizeof(unsigned) >= 4 );
        }

        /// decode the character at the current position without advancing
        unsigned current() const
        {
            unsigned result;

            uni_forward( it, &result );
            return result;
        }

    public:

        /// pre-increment and return unicode at new position
        unsigned operator++()
        {
            // advance, there is no interest in the character being left behind
            it += uni_forward( it, NULL );

            // get the next result, but do not advance:
            return current();
        }

        /// post-increment and return unicode at initial position
        unsigned operator++( int )
        {
            unsigned result;

            // grab the result and advance.
            it += uni_forward( it, &result );
            return result;
        }

        /// return unicode at current position
        unsigned operator->() const     { return current(); }

        /// return unicode at current position
        unsigned operator*() const      { return current(); }

        // Comparisons are on byte position within the string.
        bool operator==( const uni_iter& other ) const  { return it == other.it; }
        bool operator!=( const uni_iter& other ) const  { return it != other.it; }
        bool operator< ( const uni_iter& other ) const  { return it <  other.it; }
        bool operator<=( const uni_iter& other ) const  { return it <= other.it; }
        bool operator> ( const uni_iter& other ) const  { return it >  other.it; }
        bool operator>=( const uni_iter& other ) const  { return it >= other.it; }
    };

    /**
     * Function ubegin
     * returns a @a uni_iter initialized to the start of this UTF8 byte sequence.
     */
    uni_iter ubegin() const
    {
        // c_str() rather than data(): uni_forward() relies on the nul terminator
        return uni_iter( c_str() );
    }

    /**
     * Function uend
     * returns a @a uni_iter initialized to the end of this UTF8 byte sequence.
     */
    uni_iter uend() const
    {
        return uni_iter( c_str() + size() );
    }
};
|
|
|
|
|
|
|
|
|
2013-12-06 20:22:10 +00:00
|
|
|
/**
 * Function wxFunctionTaking_wxString
 * prints its argument as UTF8 three times, reaching the character data through
 * the char* conversion, then a const char* conversion, then c_str(), and hands
 * the argument back to the caller.
 *
 * @param wx is the text to print.
 * @return wxString - a copy of @a wx.
 */
wxString wxFunctionTaking_wxString( const wxString& wx )
{
    // convert once, then pull the bytes out by each of the three routes
    const UTF8 utf8( wx );

    printf( "%s:'%s'\n", __func__, static_cast<char*>( utf8 ) );
    printf( "%s:'%s'\n", __func__, static_cast<const char*>( utf8 ) );
    printf( "%s:'%s'\n", __func__, utf8.c_str() );

    return wx;
}
|
|
|
|
|
|
|
|
|
|
|
|
int main()
|
|
|
|
{
|
|
|
|
std::string str = "input";
|
2013-12-06 20:22:10 +00:00
|
|
|
UTF8 u1 = "initial";
|
|
|
|
wxString wx = wxT( "input2" );
|
|
|
|
|
|
|
|
printf( "u1:'%s'\n", u1.c_str() );
|
2013-12-05 20:36:18 +00:00
|
|
|
|
2013-12-06 12:51:39 +00:00
|
|
|
u1 = str;
|
|
|
|
|
|
|
|
wxString wx2 = u1;
|
|
|
|
|
|
|
|
UTF8 u2 = wx2;
|
2013-12-05 20:36:18 +00:00
|
|
|
|
2013-12-06 12:51:39 +00:00
|
|
|
u2 += 'X';
|
2013-12-05 20:36:18 +00:00
|
|
|
|
2013-12-06 20:22:10 +00:00
|
|
|
printf( "u2:'%s'\n", u2.c_str() );
|
2013-12-05 20:36:18 +00:00
|
|
|
|
2013-12-06 12:51:39 +00:00
|
|
|
// key accomplishments here:
|
|
|
|
// 1) passing a UTF8 to a function which normally takes a wxString.
|
|
|
|
// 2) return a wxString back into a UTF8.
|
2013-12-06 20:22:10 +00:00
|
|
|
UTF8 result = wxFunctionTaking_wxString( u2 );
|
2013-12-05 20:36:18 +00:00
|
|
|
|
2013-12-06 12:51:39 +00:00
|
|
|
printf( "result:'%s'\n", result.c_str() );
|
|
|
|
|
|
|
|
// test the unicode iterator:
|
|
|
|
for( UTF8::uni_iter it = u2.ubegin(); it != u2.uend(); )
|
|
|
|
{
|
2013-12-06 20:22:10 +00:00
|
|
|
// test post-increment:
|
2013-12-06 12:51:39 +00:00
|
|
|
printf( " _%c_", it++ );
|
|
|
|
|
2013-12-06 20:22:10 +00:00
|
|
|
// after UTF8::uni_forward() is implemented, %c is no longer useable.
|
2013-12-06 12:51:39 +00:00
|
|
|
// printf( " _%02x_", it++ );
|
|
|
|
}
|
2013-12-06 20:22:10 +00:00
|
|
|
|
2013-12-06 12:51:39 +00:00
|
|
|
printf( "\n" );
|
2013-12-05 20:36:18 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2013-12-06 12:51:39 +00:00
|
|
|
|
2013-12-06 20:22:10 +00:00
|
|
|
|
|
|
|
// These are to go into a library *.cpp file.  They are deliberately not inlined,
// which saves the code space otherwise spent at each call site on creating the
// intermediate objects and referencing wxConvUTF8.
|
|
|
|
|
|
|
|
|
|
|
|
/// Construct from a wxString by copying the bytes of its utf8_str() result,
/// read as a nul terminated C string.
UTF8::UTF8( const wxString& o ) :
    std::string( static_cast<const char*>( o.utf8_str() ) )
{
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Convert to a wxString by handing the nul terminated bytes and wxConvUTF8
/// to the wxString constructor.
UTF8::operator wxString () const
{
    wxString converted( c_str(), wxConvUTF8 );

    return converted;
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Replace the contents with the bytes of the wxString's utf8_str() result,
/// read as a nul terminated C string.
UTF8& UTF8::operator=( const wxString& o )
{
    assign( static_cast<const char*>( o.utf8_str() ) );

    return *this;
}
|