2013-12-09 18:09:58 +00:00
|
|
|
/*
|
|
|
|
* This program source code file is part of KiCad, a free EDA CAD application.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
|
2020-12-21 15:17:52 +00:00
|
|
|
* Copyright (C) 2013-2020 KiCad Developers, see AUTHORS.txt for contributors.
|
|
|
|
*
|
|
|
|
* @author Dick Hollenbeck
|
2013-12-09 18:09:58 +00:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version 2
|
|
|
|
* of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, you may find one here:
|
|
|
|
* http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
|
|
|
|
* or you may search the http://www.gnu.org website for the version 2 license,
|
|
|
|
* or you may write to the Free Software Foundation, Inc.,
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
|
|
|
|
*/
|
|
|
|
|
2016-01-12 16:33:33 +00:00
|
|
|
#ifndef UTF8_H_
|
|
|
|
#define UTF8_H_
|
|
|
|
|
2013-12-09 18:09:58 +00:00
|
|
|
#include <string>
|
|
|
|
#include <wx/string.h>
|
|
|
|
|
2017-07-24 19:02:59 +00:00
|
|
|
#if defined(DEBUG)
|
|
|
|
#define UTF8_VERIFY // Might someday be a hidden cmake config option
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2020-12-21 15:17:52 +00:00
|
|
|
* Test a C string to see if it is UTF8 encoded.
|
|
|
|
*
|
|
|
|
* An ASCII string is a valid UTF8 string.
|
2017-07-24 19:02:59 +00:00
|
|
|
*/
|
|
|
|
bool IsUTF8( const char* aString );
|
|
|
|
|
|
|
|
|
|
|
|
// MAYBE_VERIFY_UTF8( x ) asserts, through wxASSERT, that IsUTF8( x ) holds when
// UTF8_VERIFY is defined; when it is not defined the macro expands to nothing.
#ifdef UTF8_VERIFY
    #define MAYBE_VERIFY_UTF8( x )      wxASSERT( IsUTF8( x ) )
#else
    #define MAYBE_VERIFY_UTF8( x )      // nothing
#endif
|
|
|
|
|
|
|
|
|
2013-12-09 18:09:58 +00:00
|
|
|
/**
|
2020-12-21 15:17:52 +00:00
|
|
|
* An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion
|
|
|
|
* support to and from wxString, to and from std::string, and has non-mutating iteration
|
|
|
|
* over Unicode characters.
|
2013-12-09 18:09:58 +00:00
|
|
|
*
|
2020-12-21 15:17:52 +00:00
|
|
|
* I've been careful to supply only conversion facilities and not try and duplicate
|
|
|
|
* wxString() with many member functions. There are multiple ways to create text into
|
|
|
|
* a std::string without the need of too many member functions:
|
2013-12-09 18:09:58 +00:00
|
|
|
*
|
2020-12-21 15:17:52 +00:00
|
|
|
* - richio.h's StrPrintf().
|
|
|
|
* - std::ostringstream.
|
2013-12-09 18:09:58 +00:00
|
|
|
*
|
2020-12-21 15:17:52 +00:00
|
|
|
* Because this class uses no virtuals, it should be possible to cast any std::string
|
|
|
|
* into a UTF8 using this kind of cast: (UTF8 &) without construction or copying being
|
|
|
|
* the effect of the cast. Be sure the source std::string holds UTF8 encoded text before
|
|
|
|
* you do that.
|
2013-12-09 18:09:58 +00:00
|
|
|
*/
|
2017-07-25 19:14:31 +00:00
|
|
|
class UTF8
|
2013-12-09 18:09:58 +00:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
UTF8( const wxString& o );
|
|
|
|
|
2014-02-03 15:10:37 +00:00
|
|
|
/// This is a constructor for which you could end up with
|
2013-12-09 18:09:58 +00:00
|
|
|
/// non-UTF8 encoding, but that would be your fault.
|
|
|
|
UTF8( const char* txt ) :
|
2017-07-25 19:14:31 +00:00
|
|
|
m_s( txt )
|
2013-12-09 18:09:58 +00:00
|
|
|
{
|
2017-07-24 19:02:59 +00:00
|
|
|
MAYBE_VERIFY_UTF8( c_str() );
|
2013-12-09 18:09:58 +00:00
|
|
|
}
|
|
|
|
|
2014-02-03 15:10:37 +00:00
|
|
|
/// For use with _() function on wx 2.8.
|
|
|
|
/// BTW _() on wx >= 2.9 returns wxString, not wchar_t* like on 2.8.
|
2013-12-09 18:09:58 +00:00
|
|
|
UTF8( const wchar_t* txt );
|
|
|
|
|
2013-12-24 19:09:41 +00:00
|
|
|
UTF8( const std::string& o ) :
|
2017-07-25 19:14:31 +00:00
|
|
|
m_s( o )
|
2013-12-09 18:09:58 +00:00
|
|
|
{
|
2017-07-24 19:02:59 +00:00
|
|
|
MAYBE_VERIFY_UTF8( c_str() );
|
2013-12-09 18:09:58 +00:00
|
|
|
}
|
|
|
|
|
2017-07-25 19:14:31 +00:00
|
|
|
UTF8()
|
2013-12-09 18:09:58 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2014-10-18 08:18:14 +00:00
|
|
|
~UTF8() // Needed mainly to build python wrapper
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2017-07-25 19:14:31 +00:00
|
|
|
// expose some std::string functions publicly, since base class must be private.
|
|
|
|
const char* c_str() const { return m_s.c_str(); }
|
|
|
|
bool empty() const { return m_s.empty(); }
|
|
|
|
|
|
|
|
std::string::size_type find( char c ) const { return m_s.find( c ); }
|
2021-02-01 13:17:59 +00:00
|
|
|
std::string::size_type find( char c, size_t s ) const { return m_s.find( c, s ); }
|
2017-07-25 19:14:31 +00:00
|
|
|
|
|
|
|
void clear() { m_s.clear(); }
|
|
|
|
std::string::size_type length() const { return m_s.length(); }
|
|
|
|
std::string::size_type size() const { return m_s.size(); }
|
|
|
|
int compare( const std::string& s ) const { return m_s.compare( s ); }
|
|
|
|
|
|
|
|
bool operator==( const UTF8& rhs ) const { return m_s == rhs.m_s; }
|
|
|
|
bool operator==( const std::string& rhs ) const { return m_s == rhs; }
|
|
|
|
bool operator==( const char* s ) const { return m_s == s; }
|
|
|
|
|
2020-12-21 15:17:52 +00:00
|
|
|
std::string::size_type find_first_of( const std::string& str,
|
|
|
|
std::string::size_type pos = 0 ) const
|
2017-07-25 19:14:31 +00:00
|
|
|
{
|
|
|
|
return m_s.find_first_of( str, pos );
|
|
|
|
}
|
|
|
|
|
|
|
|
UTF8& operator+=( const UTF8& str )
|
|
|
|
{
|
|
|
|
m_s += str.m_s;
|
|
|
|
MAYBE_VERIFY_UTF8( c_str() );
|
2020-01-08 01:49:11 +00:00
|
|
|
return *this;
|
2017-07-25 19:14:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
UTF8& operator+=( char ch )
|
|
|
|
{
|
|
|
|
m_s.operator+=( ch );
|
|
|
|
MAYBE_VERIFY_UTF8( c_str() );
|
2020-01-08 01:49:11 +00:00
|
|
|
return *this;
|
2017-07-25 19:14:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
UTF8& operator+=( const char* s )
|
|
|
|
{
|
|
|
|
m_s.operator+=( s );
|
|
|
|
MAYBE_VERIFY_UTF8( c_str() );
|
2020-01-08 01:49:11 +00:00
|
|
|
return *this;
|
2017-07-25 19:14:31 +00:00
|
|
|
}
|
|
|
|
|
2017-12-08 13:31:31 +00:00
|
|
|
/// Append a wide (unicode) char to the UTF8 string.
|
2017-12-08 16:57:53 +00:00
|
|
|
/// if this wide char is not a ASCII7 char, it will be added as a UTF8 multibyte seqence
|
|
|
|
/// @param w_ch is a UTF-16 value (can be a UTF-32 on Linux)
|
|
|
|
UTF8& operator+=( unsigned w_ch );
|
2017-12-08 13:31:31 +00:00
|
|
|
|
2017-12-06 02:42:36 +00:00
|
|
|
// std::string::npos is not constexpr, so we can't use it in an
|
|
|
|
// initializer.
|
|
|
|
static constexpr std::string::size_type npos = -1;
|
2017-07-25 19:14:31 +00:00
|
|
|
|
2013-12-09 18:09:58 +00:00
|
|
|
UTF8& operator=( const wxString& o );
|
|
|
|
|
|
|
|
UTF8& operator=( const std::string& o )
|
|
|
|
{
|
2017-07-25 19:14:31 +00:00
|
|
|
m_s = o;
|
2017-07-24 19:02:59 +00:00
|
|
|
MAYBE_VERIFY_UTF8( c_str() );
|
2013-12-09 18:09:58 +00:00
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
2014-01-02 02:17:07 +00:00
|
|
|
UTF8& operator=( const char* s )
|
|
|
|
{
|
2017-07-25 19:14:31 +00:00
|
|
|
m_s = s;
|
2017-07-24 19:02:59 +00:00
|
|
|
MAYBE_VERIFY_UTF8( c_str() );
|
2014-01-02 02:17:07 +00:00
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
UTF8& operator=( char c )
|
|
|
|
{
|
2017-07-25 19:14:31 +00:00
|
|
|
m_s = c;
|
2017-07-24 19:02:59 +00:00
|
|
|
MAYBE_VERIFY_UTF8( c_str() );
|
2014-01-02 02:17:07 +00:00
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
2017-07-25 19:14:31 +00:00
|
|
|
// a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
|
|
|
|
// was split, so return std::string not UTF8
|
|
|
|
std::string substr( size_t pos = 0, size_t len = npos ) const
|
2014-01-02 02:17:07 +00:00
|
|
|
{
|
2017-07-25 19:14:31 +00:00
|
|
|
return m_s.substr( pos, len );
|
2014-01-02 02:17:07 +00:00
|
|
|
}
|
|
|
|
|
2017-07-25 19:14:31 +00:00
|
|
|
operator const std::string& () const { return m_s; }
|
|
|
|
//operator std::string& () { return m_s; }
|
|
|
|
//operator std::string () const { return m_s; }
|
2013-12-09 18:09:58 +00:00
|
|
|
|
2017-07-25 19:14:31 +00:00
|
|
|
wxString wx_str() const;
|
|
|
|
operator wxString () const;
|
2013-12-09 18:09:58 +00:00
|
|
|
|
2017-07-25 19:14:31 +00:00
|
|
|
// "Read only" iterating over bytes is done with these, use the uni_iter to iterate
|
|
|
|
// over UTF8 (multi-byte) characters
|
|
|
|
std::string::const_iterator begin() const { return m_s.begin(); }
|
|
|
|
std::string::const_iterator end() const { return m_s.end(); }
|
2013-12-09 18:09:58 +00:00
|
|
|
|
Pcbnew: major swig fix.
* Switched hashtables.h over to std::undordered_map from boost version.
* Added new macros DECL_VEC_FOR_SWIG() and DECL_MAP_FOR_SWIG() in macros.h.
These along with future DECL_HASH_FOR_SWIG() unify the declaration to swig
and C++ so that the resultant type name is common in both languages, and
the types AGREE.
* Fixed swigging of NETINFO_ITEM and NETINFO_LIST via magic.
* Newly exposed (python wrapped) are: D_PADS, TRACKS (was TRACK_PTRS),
NETNAME_MAP, NETCODE_MAP, wxString (without constructor purposely, read
comment in wx.i), MARKERS, ZONE_CONTAINERS, NETCLASSPTR, KICAD_T types.
* std::vector<SOMETHING*> tends to end up named SOMETHINGS in C++ and python.
Having the name consistent between like types is helpful, and between
languages. std::map<> ends up as SOMETHING_MAP.
* NETINFO_LIST::m_netNames and NETINFO_LIST::m_netCodes are now std::map
instead of hashtables, because swig does not yet support std::unordered_map.
* You can now get to any netclass or net info. NETNAMES_MAP and NETCODES_MAP
are traversable basically the same as a python dictionary using a python
string (not wsString) as the key! The wxString typemap converts python
string to wxString before the lookup happens. Iteration also works.
2016-07-18 17:23:09 +00:00
|
|
|
#ifndef SWIG
|
2013-12-09 18:09:58 +00:00
|
|
|
/**
|
2020-01-10 14:31:00 +00:00
|
|
|
* uni_iter
|
2017-07-25 19:14:31 +00:00
|
|
|
* is a non-mutating iterator that walks through unicode code points in the UTF8 encoded
|
2013-12-09 18:09:58 +00:00
|
|
|
* string. The normal ++(), ++(int), ->(), and *() operators are all supported
|
2014-02-03 15:10:37 +00:00
|
|
|
* for read only access and some return an unsigned holding the unicode character
|
2013-12-09 18:09:58 +00:00
|
|
|
* appropriate for the respective operator.
|
|
|
|
*/
|
|
|
|
class uni_iter
|
|
|
|
{
|
|
|
|
public:
|
2014-10-18 08:18:14 +00:00
|
|
|
uni_iter() // Needed only to build python wrapper, not used outside the wrapper
|
|
|
|
{
|
|
|
|
it = NULL;
|
|
|
|
}
|
|
|
|
|
2013-12-09 18:09:58 +00:00
|
|
|
uni_iter( const uni_iter& o )
|
|
|
|
{
|
|
|
|
it = o.it;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// pre-increment and return uni_iter at new position
|
|
|
|
const uni_iter& operator++()
|
|
|
|
{
|
|
|
|
it += uni_forward( it );
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// post-increment and return uni_iter at initial position
|
|
|
|
uni_iter operator++( int )
|
|
|
|
{
|
|
|
|
uni_iter ret = *this;
|
|
|
|
|
|
|
|
it += uni_forward( it );
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// return unicode at current position
|
|
|
|
unsigned operator->() const
|
|
|
|
{
|
|
|
|
unsigned result;
|
|
|
|
|
|
|
|
// grab the result, do not advance
|
|
|
|
uni_forward( it, &result );
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// return unicode at current position
|
|
|
|
unsigned operator*() const
|
|
|
|
{
|
|
|
|
unsigned result;
|
|
|
|
|
|
|
|
// grab the result, do not advance
|
|
|
|
uni_forward( it, &result );
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2017-07-25 19:14:31 +00:00
|
|
|
uni_iter operator-( int aVal ) const { return uni_iter( (char*) it - aVal ); }
|
|
|
|
|
2013-12-09 18:09:58 +00:00
|
|
|
bool operator==( const uni_iter& other ) const { return it == other.it; }
|
|
|
|
bool operator!=( const uni_iter& other ) const { return it != other.it; }
|
|
|
|
|
|
|
|
/// Since the ++ operators advance more than one byte, this is your best
|
|
|
|
/// loop termination test, < end(), not == end().
|
|
|
|
bool operator< ( const uni_iter& other ) const { return it < other.it; }
|
|
|
|
bool operator<=( const uni_iter& other ) const { return it <= other.it; }
|
|
|
|
bool operator> ( const uni_iter& other ) const { return it > other.it; }
|
|
|
|
bool operator>=( const uni_iter& other ) const { return it >= other.it; }
|
2020-12-21 15:17:52 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
friend class UTF8;
|
|
|
|
|
|
|
|
const unsigned char* it;
|
|
|
|
|
|
|
|
// private constructor
|
|
|
|
uni_iter( const char* start ) :
|
|
|
|
it( (const unsigned char*) start )
|
|
|
|
{
|
|
|
|
}
|
2013-12-09 18:09:58 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
2020-12-21 15:17:52 +00:00
|
|
|
* Returns a @a uni_iter initialized to the start of "this" UTF8 byte sequence.
|
2013-12-09 18:09:58 +00:00
|
|
|
*/
|
|
|
|
uni_iter ubegin() const
|
|
|
|
{
|
2017-07-25 19:14:31 +00:00
|
|
|
return uni_iter( m_s.data() );
|
2013-12-09 18:09:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2020-12-21 15:17:52 +00:00
|
|
|
* Return a @a uni_iter initialized to the end of "this" UTF8 byte sequence.
|
2013-12-09 18:09:58 +00:00
|
|
|
*/
|
|
|
|
uni_iter uend() const
|
|
|
|
{
|
2017-07-25 19:14:31 +00:00
|
|
|
return uni_iter( m_s.data() + m_s.size() );
|
2013-12-09 18:09:58 +00:00
|
|
|
}
|
2017-07-25 19:14:31 +00:00
|
|
|
|
|
|
|
/**
|
2020-12-21 15:17:52 +00:00
|
|
|
* Advance over a single UTF8 encoded multibyte character, capturing the Unicode character
|
|
|
|
* as it goes, and returning the number of bytes consumed.
|
2017-07-25 19:14:31 +00:00
|
|
|
*
|
|
|
|
* @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
|
|
|
|
* @param aResult is where to put the unicode character, and may be NULL if no interest.
|
2020-12-21 15:17:52 +00:00
|
|
|
* @return the count of bytes consumed.
|
2017-07-25 19:14:31 +00:00
|
|
|
*/
|
|
|
|
static int uni_forward( const unsigned char* aSequence, unsigned* aResult = NULL );
|
Pcbnew: major swig fix.
* Switched hashtables.h over to std::undordered_map from boost version.
* Added new macros DECL_VEC_FOR_SWIG() and DECL_MAP_FOR_SWIG() in macros.h.
These along with future DECL_HASH_FOR_SWIG() unify the declaration to swig
and C++ so that the resultant type name is common in both languages, and
the types AGREE.
* Fixed swigging of NETINFO_ITEM and NETINFO_LIST via magic.
* Newly exposed (python wrapped) are: D_PADS, TRACKS (was TRACK_PTRS),
NETNAME_MAP, NETCODE_MAP, wxString (without constructor purposely, read
comment in wx.i), MARKERS, ZONE_CONTAINERS, NETCLASSPTR, KICAD_T types.
* std::vector<SOMETHING*> tends to end up named SOMETHINGS in C++ and python.
Having the name consistent between like types is helpful, and between
languages. std::map<> ends up as SOMETHING_MAP.
* NETINFO_LIST::m_netNames and NETINFO_LIST::m_netCodes are now std::map
instead of hashtables, because swig does not yet support std::unordered_map.
* You can now get to any netclass or net info. NETNAMES_MAP and NETCODES_MAP
are traversable basically the same as a python dictionary using a python
string (not wsString) as the key! The wxString typemap converts python
string to wxString before the lookup happens. Iteration also works.
2016-07-18 17:23:09 +00:00
|
|
|
#endif // SWIG
|
2017-07-25 19:14:31 +00:00
|
|
|
|
|
|
|
protected:
|
|
|
|
std::string m_s;
|
2013-12-09 18:09:58 +00:00
|
|
|
};
|
|
|
|
|
2017-07-25 19:14:31 +00:00
|
|
|
|
2014-02-03 15:10:37 +00:00
|
|
|
#endif // UTF8_H_
|