kicad/include/utf8.h

/*
 * This program source code file is part of KiCad, a free EDA CAD application.
 *
 * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
 * Copyright (C) 2013 KiCad Developers, see CHANGELOG.TXT for contributors.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you may find one here:
 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 * or you may search the http://www.gnu.org website for the version 2 license,
 * or you may write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */

#ifndef UTF8_H_
#define UTF8_H_

#include <string>
#include <wx/string.h>

// Whenever DEBUG is defined, also define UTF8_VERIFY, which is the switch
// tested further down to decide what MAYBE_VERIFY_UTF8() expands to.
#if defined(DEBUG)
 #define UTF8_VERIFY    // Might someday be a hidden cmake config option
#endif


/**
 * Function IsUTF8
 * tests a c-string to see if it is UTF8 encoded.  BTW an ASCII string is a valid
 * UTF8 string.
 *
 * @param aString is the c-string to test.  NOTE(review): how a NULL pointer is
 *  handled cannot be seen from this declaration - confirm in the implementation.
 * @return bool - true if aString is UTF8 encoded, else false.
 */
bool IsUTF8( const char* aString );


// MAYBE_VERIFY_UTF8(x): when UTF8_VERIFY is defined this expands to a wxASSERT
// that IsUTF8(x) holds; otherwise it expands to nothing, so the argument
// expression is not evaluated at all.
#if defined(UTF8_VERIFY)
 #define MAYBE_VERIFY_UTF8(x)       wxASSERT( IsUTF8(x) )
#else
 #define MAYBE_VERIFY_UTF8(x)       // nothing
#endif


/**
 * Class UTF8
 * is an 8 bit string that is assuredly encoded in UTF8, and supplies special
 * conversion support to and from wxString, to and from std::string, and has
 * non-mutating iteration over unicode characters.
 *
 * <p>I've been careful to supply only conversion facilities and not try
 * and duplicate wxString() with many member functions. There are multiple ways
 * to create text into a std::string without the need of too many member functions:
 *
 * <ul>
 *  <li>richio.h's StrPrintf()</li>
 *  <li>std::ostringstream.</li>
 * </ul>
 *
 * <p>Because this class uses no virtuals, it should be possible to cast any
 * std::string into a UTF8 using this kind of cast: (UTF8 &) without construction
 * or copying being the effect of the cast.  Be sure the source std::string holds
 * UTF8 encoded text before you do that.
 *
 * <p>The inline constructors, assignments and appends below run
 * MAYBE_VERIFY_UTF8() on the resulting text, which asserts valid UTF8 only
 * when UTF8_VERIFY is defined and is a no-op otherwise.
 *
 * @author Dick Hollenbeck
 */
class UTF8
{
public:

    /// Construct from a wxString (implemented out of line).
    UTF8( const wxString& o );

    /// This is a constructor for which you could end up with
    /// non-UTF8 encoding, but that would be your fault.
    /// @param txt is copied into the internal std::string.
    UTF8( const char* txt ) :
        m_s( txt )
    {
        MAYBE_VERIFY_UTF8( c_str() );
    }

    /// For use with _() function on wx 2.8.
    /// BTW _() on wx >= 2.9 returns wxString, not wchar_t* like on 2.8.
    UTF8( const wchar_t* txt );

    /// Construct from a std::string, which the caller asserts holds UTF8 text.
    UTF8( const std::string& o ) :
        m_s( o )
    {
        MAYBE_VERIFY_UTF8( c_str() );
    }

    /// Construct an empty string.
    UTF8()
    {
    }

    ~UTF8()     // Needed mainly to build python wrapper
    {
    }

    // Expose some std::string functions publicly, since the underlying
    // std::string (m_s) is not itself public.

    const char* c_str()                         const   { return m_s.c_str(); }
    bool empty()                                const   { return m_s.empty(); }

    /// @return the byte offset of the first occurrence of @a c, or npos.
    std::string::size_type find( char c )       const   { return m_s.find( c ); }

    /// Search for @a c starting at byte offset @a s.
    /// The offset is taken by value so that literals, temporaries and const
    /// objects can be passed; it is never modified.
    /// @return the byte offset of the match, or npos.
    std::string::size_type find( char c, size_t s )     const   { return m_s.find( c, s ); }

    void clear()                                        { m_s.clear(); }

    /// length() and size() both count bytes, not unicode characters.
    std::string::size_type length()             const   { return m_s.length(); }
    std::string::size_type size()               const   { return m_s.size(); }
    int compare( const std::string& s )         const   { return m_s.compare( s ); }

    bool operator==( const UTF8& rhs )          const   { return m_s == rhs.m_s; }
    bool operator==( const std::string& rhs )   const   { return m_s == rhs; }
    bool operator==( const char* s )            const   { return m_s == s; }

    /// @return the byte offset, at or after @a pos, of the first byte that
    ///  matches any byte in @a str, or npos.
    std::string::size_type find_first_of( const std::string& str,
                                          std::string::size_type pos = 0 ) const
    {
        return m_s.find_first_of( str, pos );
    }

    /// Append another UTF8 string.
    UTF8& operator+=( const UTF8& str )
    {
        m_s += str.m_s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append a single byte.
    UTF8& operator+=( char ch )
    {
        m_s += ch;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append a c-string.
    UTF8& operator+=( const char* s )
    {
        m_s += s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Append a wide (unicode) char to the UTF8 string.
    /// if this wide char is not a ASCII7 char, it will be added as a UTF8 multibyte sequence
    /// @param w_ch is a UTF-16 value (can be a UTF-32 on Linux)
    UTF8& operator+=( unsigned w_ch );

    // std::string::npos is not constexpr, so we can't use it in an
    // initializer.  The value below is the largest size_type, i.e. the same
    // value std::string::npos is specified to have.
    static constexpr std::string::size_type npos = static_cast<std::string::size_type>( -1 );

    /// Assign from a wxString (implemented out of line).
    UTF8& operator=( const wxString& o );

    /// Assign from a std::string, which the caller asserts holds UTF8 text.
    UTF8& operator=( const std::string& o )
    {
        m_s = o;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Assign from a c-string.
    UTF8& operator=( const char* s )
    {
        m_s = s;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    /// Assign a single byte.
    UTF8& operator=( char c )
    {
        m_s = c;
        MAYBE_VERIFY_UTF8( c_str() );
        return *this;
    }

    // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
    // was split, so return std::string not UTF8
    std::string substr( size_t pos = 0, size_t len = npos ) const
    {
        return m_s.substr( pos, len );
    }

    /// Read only access to the underlying std::string.
    operator const std::string& () const    { return m_s; }
    //operator std::string& ()                { return m_s; }
    //operator std::string () const           { return m_s; }

    wxString wx_str() const;
    operator wxString () const;

    // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
    // over UTF8 (multi-byte) characters
    std::string::const_iterator begin()         const   { return m_s.begin(); }
    std::string::const_iterator end()           const   { return m_s.end(); }

#ifndef SWIG
    /**
     * class uni_iter
     * is a non-mutating iterator that walks through unicode code points in the UTF8 encoded
     * string.  The normal ++(), ++(int), ->(), and *() operators are all supported
     * for read only access and some return an unsigned holding the unicode character
     * appropriate for the respective operator.
     */
    class uni_iter
    {
        friend class UTF8;

        /// Points at the first byte of the current character.
        const unsigned char* it;

        // private constructor, only UTF8 hands out positioned iterators
        uni_iter( const char* start ) :
            it( reinterpret_cast<const unsigned char*>( start ) )
        {
        }


    public:

        uni_iter() :    // Needed only to build python wrapper, not used outside the wrapper
            it( nullptr )
        {
        }

        uni_iter( const uni_iter& o ) :
            it( o.it )
        {
        }

        /// Declared explicitly because relying on the implicit copy assignment
        /// is deprecated once a copy constructor is user-provided.
        uni_iter& operator=( const uni_iter& o ) = default;

        /// pre-increment and return uni_iter at new position
        const uni_iter& operator++()
        {
            it += uni_forward( it );
            return *this;
        }

        /// post-increment and return uni_iter at initial position
        uni_iter operator++( int )
        {
            uni_iter ret = *this;

            it += uni_forward( it );
            return ret;
        }

        /// return unicode at current position
        unsigned operator->() const
        {
            unsigned    result;

            // grab the result, do not advance
            uni_forward( it, &result );
            return result;
        }

        /// return unicode at current position
        unsigned operator*() const
        {
            unsigned    result;

            // grab the result, do not advance
            uni_forward( it, &result );
            return result;
        }

        /// Step back by @a aVal BYTES (not characters) and return the new iterator.
        uni_iter operator-( int aVal ) const
        {
            return uni_iter( reinterpret_cast<const char*>( it ) - aVal );
        }

        bool operator==( const uni_iter& other ) const  { return it == other.it; }
        bool operator!=( const uni_iter& other ) const  { return it != other.it; }

        /// Since the ++ operators advance more than one byte, this is your best
        /// loop termination test, < end(), not == end().
        bool operator< ( const uni_iter& other ) const  { return it <  other.it; }
        bool operator<=( const uni_iter& other ) const  { return it <= other.it; }
        bool operator> ( const uni_iter& other ) const  { return it >  other.it; }
        bool operator>=( const uni_iter& other ) const  { return it >= other.it; }
    };

    /**
     * Function ubegin
     * returns a @a uni_iter initialized to the start of "this" UTF8 byte sequence.
     */
    uni_iter ubegin() const
    {
        return uni_iter( m_s.data() );
    }

    /**
     * Function uend
     * returns a @a uni_iter initialized to the end of "this" UTF8 byte sequence.
     */
    uni_iter uend() const
    {
        return uni_iter( m_s.data() + m_s.size() );
    }

    /**
     * Function uni_forward
     * advances over a single UTF8 encoded multibyte character, capturing the
     * unicode character as it goes, and returning the number of bytes consumed.
     *
     * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
     * @param aResult is where to put the unicode character, and may be NULL if no interest.
     * @return int - the count of bytes consumed.
     */
    static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );
#endif  // SWIG

protected:
    std::string m_s;    ///< The UTF8 encoded bytes.
};


#endif // UTF8_H_