fixes, a couple more unit tests.
This commit is contained in:
parent
8316477638
commit
7717aa9279
120
tools/UTF8.cpp
120
tools/UTF8.cpp
|
@ -10,14 +10,20 @@
|
||||||
* is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
|
* is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special
|
||||||
* conversion support to and from wxString, and has iteration over unicode characters.
|
* conversion support to and from wxString, and has iteration over unicode characters.
|
||||||
*
|
*
|
||||||
* <p>I've been careful to supply only conversion facillities and not try
|
* <p>I've been careful to supply only conversion facilities and not try
|
||||||
* and duplicate wxString() with many member functions. In the end it is
|
* and duplicate wxString() with many member functions. In the end it is
|
||||||
* to be a std::string. There are multiple ways to create text into a std::string
|
* to be a std::string. There are multiple ways to create text into a std::string
|
||||||
* without the need of member functions. std::ostringstream.
|
* without the need of too many member functions:
|
||||||
|
*
|
||||||
|
* <ul>
|
||||||
|
* <li>richio.h's StrPrintf()</li>
|
||||||
|
* <li>std::ostringstream.</li>
|
||||||
|
* </ul>
|
||||||
*
|
*
|
||||||
* <p>Because this class uses no virtuals, it should be possible to cast any
|
* <p>Because this class uses no virtuals, it should be possible to cast any
|
||||||
* std::string into a UTF8 using this kind of cast: (UTF8 &) without construction
|
* std::string into a UTF8 using this kind of cast: (UTF8 &) without construction
|
||||||
* or copying being the effect of the cast.
|
* or copying being the effect of the cast. Be sure the source std::string holds
|
||||||
|
* UTF8 encoded text before you do that.
|
||||||
*
|
*
|
||||||
* @author Dick Hollenbeck
|
* @author Dick Hollenbeck
|
||||||
*/
|
*/
|
||||||
|
@ -73,53 +79,54 @@ public:
|
||||||
* @param aResult is where to put the unicode character, and may be NULL if no interest.
|
* @param aResult is where to put the unicode character, and may be NULL if no interest.
|
||||||
* @return int - the count of bytes consumed.
|
* @return int - the count of bytes consumed.
|
||||||
*/
|
*/
|
||||||
static int uni_forward( unsigned char* aSequence, unsigned* aResult = NULL );
|
static int uni_forward( const unsigned char* aSequence, unsigned* aResult = NULL );
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* class uni_iter
|
* class uni_iter
|
||||||
* is a non-mutable iterator that walks through code points in the UTF8 encoded
|
* is a non-mutating iterator that walks through unicode code points in the UTF8 encoded
|
||||||
* string. The normal ++(), ++(int), ->(), and *() operators are all supported and
|
* string. The normal ++(), ++(int), ->(), and *() operators are all supported
|
||||||
* they return an unsigned holding the unicode character appropriate for respective
|
* for read only access and they return an unsigned holding the unicode character
|
||||||
* operation.
|
* appropriate for the respective operator.
|
||||||
*/
|
*/
|
||||||
class uni_iter
|
class uni_iter
|
||||||
{
|
{
|
||||||
friend class UTF8;
|
friend class UTF8;
|
||||||
|
|
||||||
unsigned char* it;
|
const unsigned char* it;
|
||||||
|
|
||||||
// private constructor.
|
// private constructor.
|
||||||
uni_iter( const char* start ) :
|
uni_iter( const char* start ) :
|
||||||
it( (unsigned char*) start )
|
it( (const unsigned char*) start )
|
||||||
{
|
{
|
||||||
// for the human: assert( sizeof(unsigned) >= 4 );
|
// for the human: assert( sizeof(unsigned) >= 4 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/// pre-increment and return unicode at new position
|
uni_iter( const uni_iter& o )
|
||||||
unsigned operator++()
|
|
||||||
{
|
{
|
||||||
unsigned result;
|
it = o.it;
|
||||||
|
}
|
||||||
|
|
||||||
// advance over current, and toss the unicode result
|
/// pre-increment and return uni_iter at new position
|
||||||
|
const uni_iter& operator++()
|
||||||
|
{
|
||||||
it += uni_forward( it );
|
it += uni_forward( it );
|
||||||
|
|
||||||
// get the next unicode result, but do not advance:
|
return *this;
|
||||||
uni_forward( it, &result );
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// post-increment and return unicode at initial position
|
/// post-increment and return uni_iter at initial position
|
||||||
unsigned operator++( int )
|
uni_iter operator++( int )
|
||||||
{
|
{
|
||||||
unsigned result;
|
uni_iter ret = *this;
|
||||||
|
|
||||||
// grab the result and advance.
|
it += uni_forward( it );
|
||||||
it += uni_forward( it, &result );
|
return ret;
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
/// return unicode at current position
|
/// return unicode at current position
|
||||||
unsigned operator->() const
|
unsigned operator->() const
|
||||||
{
|
{
|
||||||
|
@ -129,6 +136,7 @@ public:
|
||||||
uni_forward( it, &result );
|
uni_forward( it, &result );
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
/// return unicode at current position
|
/// return unicode at current position
|
||||||
unsigned operator*() const
|
unsigned operator*() const
|
||||||
|
@ -142,6 +150,9 @@ public:
|
||||||
|
|
||||||
bool operator==( const uni_iter& other ) const { return it == other.it; }
|
bool operator==( const uni_iter& other ) const { return it == other.it; }
|
||||||
bool operator!=( const uni_iter& other ) const { return it != other.it; }
|
bool operator!=( const uni_iter& other ) const { return it != other.it; }
|
||||||
|
|
||||||
|
/// Since the ++ operators advance more than one byte, this is your best
|
||||||
|
/// loop termination test, < end(), not == end().
|
||||||
bool operator< ( const uni_iter& other ) const { return it < other.it; }
|
bool operator< ( const uni_iter& other ) const { return it < other.it; }
|
||||||
bool operator<=( const uni_iter& other ) const { return it <= other.it; }
|
bool operator<=( const uni_iter& other ) const { return it <= other.it; }
|
||||||
bool operator> ( const uni_iter& other ) const { return it > other.it; }
|
bool operator> ( const uni_iter& other ) const { return it > other.it; }
|
||||||
|
@ -150,7 +161,7 @@ public:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function ubegin
|
* Function ubegin
|
||||||
* returns a @a uni_iter initialized to the start of this UTF8 byte sequence.
|
* returns a @a uni_iter initialized to the start of "this" UTF8 byte sequence.
|
||||||
*/
|
*/
|
||||||
uni_iter ubegin() const
|
uni_iter ubegin() const
|
||||||
{
|
{
|
||||||
|
@ -159,7 +170,7 @@ public:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Function uend
|
* Function uend
|
||||||
* returns a @a uni_iter initialized to the end of this UTF8 byte sequence.
|
* returns a @a uni_iter initialized to the end of "this" UTF8 byte sequence.
|
||||||
*/
|
*/
|
||||||
uni_iter uend() const
|
uni_iter uend() const
|
||||||
{
|
{
|
||||||
|
@ -213,14 +224,26 @@ int main()
|
||||||
for( UTF8::uni_iter it = u2.ubegin(); it < u2.uend(); )
|
for( UTF8::uni_iter it = u2.ubegin(); it < u2.uend(); )
|
||||||
{
|
{
|
||||||
// test post-increment:
|
// test post-increment:
|
||||||
printf( " _%c_", it++ );
|
printf( " _%c_", *it++ );
|
||||||
|
|
||||||
// after UTF8::uni_forward() is implemented, %c is no longer useable.
|
// after UTF8::uni_forward() is implemented, %c is no longer useable.
|
||||||
// printf( " _%02x_", it++ );
|
// printf( " _%02x_", *it++ );
|
||||||
}
|
}
|
||||||
|
|
||||||
printf( "\n" );
|
printf( "\n" );
|
||||||
|
|
||||||
|
UTF8::uni_iter it = u2.ubegin();
|
||||||
|
|
||||||
|
UTF8::uni_iter it2 = it++;
|
||||||
|
|
||||||
|
printf( "post_inc:'%c' should be 'i'\n", *it2 );
|
||||||
|
|
||||||
|
it2 = ++it;
|
||||||
|
|
||||||
|
printf( "pre_inc:'%c' should be 'p'\n", *it2 );
|
||||||
|
|
||||||
|
printf( "u[1]:'%c' should be 'n'\n", u2[1] );
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -253,7 +276,27 @@ UTF8& UTF8::operator=( const wxString& o )
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static const unsigned char utf8_len[256] = {
|
#ifndef THROW_IO_ERROR
|
||||||
|
#define THROW_IO_ERROR(x) // nothing
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// There is no wxWidgets function that does this, because wchar_t is 16 bits
|
||||||
|
// on windows and wx wants to encode the output in UTF16 for such.
|
||||||
|
|
||||||
|
int UTF8::uni_forward( const unsigned char* aSequence, unsigned* aResult )
|
||||||
|
{
|
||||||
|
unsigned ch = *aSequence;
|
||||||
|
|
||||||
|
if( ch < 0x80 )
|
||||||
|
{
|
||||||
|
if( aResult )
|
||||||
|
*aResult = ch;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const unsigned char* s = aSequence;
|
||||||
|
|
||||||
|
static const unsigned char utf8_len[] = {
|
||||||
// Map encoded prefix byte to sequence length. Zero means
|
// Map encoded prefix byte to sequence length. Zero means
|
||||||
// illegal prefix. See RFC 3629 for details
|
// illegal prefix. See RFC 3629 for details
|
||||||
/*
|
/*
|
||||||
|
@ -276,27 +319,6 @@ static const unsigned char utf8_len[256] = {
|
||||||
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
|
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#ifndef THROW_IO_ERROR
|
|
||||||
#define THROW_IO_ERROR(x) // nothing
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// There is no wxWidgets function that does this, because wchar_t is 16 bits
|
|
||||||
// on windows and wx wants to encode the output in UTF16 for such.
|
|
||||||
|
|
||||||
int UTF8::uni_forward( unsigned char* aSequence, unsigned* aResult )
|
|
||||||
{
|
|
||||||
unsigned ch = *aSequence;
|
|
||||||
|
|
||||||
if( ch < 0x80 )
|
|
||||||
{
|
|
||||||
if( aResult )
|
|
||||||
*aResult = ch;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned char* s = aSequence;
|
|
||||||
|
|
||||||
int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
|
int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
|
||||||
|
|
||||||
switch( len )
|
switch( len )
|
||||||
|
|
Loading…
Reference in New Issue