175 lines
5.4 KiB
C++
175 lines
5.4 KiB
C++
// Copyright (C) 2008 Keita Mochizuki, Davis E. King (davis@dlib.net)
|
|
// License: Boost Software License See LICENSE.txt for the full license.
|
|
#ifndef DLIB_UNICODe_CPp_
|
|
#define DLIB_UNICODe_CPp_
|
|
#include "unicode.h"
|
|
#include <cwchar>
|
|
#include "../string.h"
|
|
#include <vector>
|
|
|
|
namespace dlib
|
|
{
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Parameters of the UTF-16 surrogate encoding used by the helpers in this file.

// Prefix values OR-ed into the first and second unit of a surrogate pair.
static const unichar SURROGATE_FIRST_TOP     = 0xD800;
static const unichar SURROGATE_SECOND_TOP    = 0xDC00;

// Mask that keeps the low 10 bits of a value (the payload of one surrogate unit).
static const unichar SURROGATE_CLEARING_MASK = 0x03FF;

// Half-open range [SURROGATE_TOP, SURROGATE_END) tested by is_surrogate().
static const unichar SURROGATE_TOP           = SURROGATE_FIRST_TOP;
static const unichar SURROGATE_END           = 0xE000;

// Values at or above this threshold are written as two wchar_t units when
// wchar_t is 2 bytes wide; smaller values are written as one unit.
static const unichar SMP_TOP                 = 0x10000;

// Number of payload bits carried by each surrogate unit (the shift distance).
static const int     VALID_BITS              = 10;
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
template <typename T> bool is_surrogate(T ch)
|
|
{
|
|
return (zero_extend_cast<unichar>(ch) >= SURROGATE_TOP &&
|
|
zero_extend_cast<unichar>(ch) < SURROGATE_END);
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Combines the two units of a UTF-16 surrogate pair into a single code point.
//
//   first  - the first unit of the pair; only its low 10 bits are used
//   second - the second unit of the pair; only its low 10 bits are used
//
// Returns SMP_TOP + (first's 10 bits << VALID_BITS | second's 10 bits).
//
// The offset SMP_TOP has to be ADDED to the assembled 20-bit value.  OR-ing it
// in (or adding it to the low half only and OR-ing the result) drops the carry
// whenever bit 16 of the shifted high half is already set, which would decode
// e.g. the pair D840 DC00 as U+10000 instead of U+20000.
template <typename T> unichar surrogate_pair_to_unichar(T first, T second)
{
    const unichar high_bits = (first & SURROGATE_CLEARING_MASK) << VALID_BITS;
    const unichar low_bits = second & SURROGATE_CLEARING_MASK;
    return (high_bits | low_bits) + SMP_TOP;
}
|
|
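// Bit patterns of the two surrogate prefixes (6 prefix bits + 10 payload bits):
//   SURROGATE_FIRST_TOP  = 0xD800 = 110110 0000000000
//   SURROGATE_SECOND_TOP = 0xDC00 = 110111 0000000000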
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Splits the code point `input` into the two units of a surrogate pair.
//
//   first  - receives SURROGATE_FIRST_TOP  | ((input - SMP_TOP) >> VALID_BITS)
//   second - receives SURROGATE_SECOND_TOP | (input & SURROGATE_CLEARING_MASK)
//
// No range check is performed on `input`; the visible caller only passes
// values that are >= SMP_TOP.
void unichar_to_surrogate_pair(unichar input, unichar &first, unichar &second)
{
    const unichar offset = input - SMP_TOP;
    const unichar upper_payload = offset >> VALID_BITS;
    const unichar lower_payload = input & SURROGATE_CLEARING_MASK;

    first = upper_payload | SURROGATE_FIRST_TOP;
    second = lower_payload | SURROGATE_SECOND_TOP;
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Converts the wide-character buffer `src` into `dest`.  `src_len` is the number
// of wchar_t units in `src`.  N is the byte width of wchar_t: the caller
// instantiates this with sizeof(wchar_t), and only the explicit specializations
// for N == 4 and N == 2 are defined in the visible source.
template <int N> void wstr2ustring_t(const wchar_t *src, size_t src_len, ustring &dest);
|
|
|
|
// Specialization for platforms where wchar_t is 4 bytes wide: each wchar_t is
// taken as one unichar, so the buffer is reinterpreted and copied as is.
//
//   src     - the wide-character buffer to copy
//   src_len - number of wchar_t units to copy from src
//   dest    - receives the copied units
//
// The explicit length is passed to assign() so that input containing embedded
// L'\0' characters is copied in full instead of being cut at the first NUL.
template <> void wstr2ustring_t<4>(const wchar_t *src, size_t src_len, ustring &dest)
{
    dest.assign(reinterpret_cast<const unichar *>(src), src_len);
}
|
|
|
|
// Specialization for platforms where wchar_t is 2 bytes wide: surrogate pairs
// in src are merged into single unichar values.
//
//   src     - the wide-character buffer to convert
//   src_len - number of wchar_t units in src
//   dest    - resized to the number of resulting code points and filled in
//
// Any unit for which is_surrogate() is true is treated as the first half of a
// pair and consumes the unit after it; no validation of pair ordering is done.
template <> void wstr2ustring_t<2>(const wchar_t *src, size_t src_len, ustring &dest)
{
    // First pass: count the code points so dest can be sized once.
    size_t wlen = 0;
    for (size_t i = 0; i < src_len; ++i)
    {
        if (is_surrogate(src[i]))
            ++i;    // the following unit belongs to the same pair
        ++wlen;
    }
    dest.resize(wlen);

    // Second pass: i indexes dest, ii indexes src.
    for (size_t i = 0, ii = 0; ii < src_len; ++i)
    {
        if (is_surrogate(src[ii]))
        {
            // A surrogate in the very last position has no partner.  Never read
            // past src_len; use L'\0' instead, which is the value a
            // null-terminated buffer would have supplied at that index.
            const wchar_t second = (ii + 1 < src_len) ? src[ii + 1] : L'\0';
            dest[i] = surrogate_pair_to_unichar(src[ii], second);
            ii += 2;
        }
        else
        {
            dest[i] = zero_extend_cast<unichar>(src[ii]);
            ++ii;
        }
    }
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Converts a std::wstring into a ustring by dispatching to the
// wstr2ustring_t specialization that matches this platform's sizeof(wchar_t).
const ustring convert_wstring_to_utf32(const std::wstring &src)
{
    const wchar_t *const units = src.c_str();
    const size_t unit_count = src.size();

    ustring result;
    wstr2ustring_t<sizeof(wchar_t)>(units, unit_count, result);
    return result;
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Primary template: has no members.  The working converters are the explicit
// specializations for N == 2 and N == 4, where N is sizeof(wchar_t).
template <int N> struct ustring2wstr {};
|
|
|
|
// for the environment of sizeof(wchar_t) == 2 (i.e. Win32)
|
|
template <> struct ustring2wstr<2>
|
|
{
|
|
wchar_t *wstr;
|
|
size_t wlen;
|
|
ustring2wstr(const ustring &src){
|
|
wlen = 0;
|
|
for (size_t i = 0; i < src.length(); ++i)
|
|
{
|
|
if (src[i] < SMP_TOP) wlen++;
|
|
else wlen += 2;
|
|
}
|
|
wstr = new wchar_t[wlen+1];
|
|
wstr[wlen] = L'\0';
|
|
|
|
size_t wi = 0;
|
|
for (size_t i = 0; i < src.length(); ++i)
|
|
{
|
|
if (src[i] < SMP_TOP)
|
|
{
|
|
wstr[wi++] = (wchar_t)src[i];
|
|
}else
|
|
{
|
|
unichar high, low;
|
|
unichar_to_surrogate_pair(src[i], high, low);
|
|
wstr[wi++] = (wchar_t)high;
|
|
wstr[wi++] = (wchar_t)low;
|
|
}
|
|
}
|
|
}
|
|
~ustring2wstr()
|
|
{
|
|
delete[] wstr;
|
|
}
|
|
};
|
|
|
|
// for the environment of sizeof(wchar_t) == 4 (i.e. Unix gcc)
|
|
template <> struct ustring2wstr<4>
|
|
{
|
|
const wchar_t *wstr;
|
|
size_t wlen;
|
|
ustring2wstr(const ustring &src){
|
|
wstr = (const wchar_t *)(src.c_str());
|
|
wlen = src.size();
|
|
}
|
|
};
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
const std::wstring convert_utf32_to_wstring(const ustring &src)
|
|
{
|
|
ustring2wstr<sizeof(wchar_t)> conv(src);
|
|
std::wstring dest(conv.wstr);
|
|
return dest;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Converts a multibyte string to a wide string with std::mbstowcs (which uses
// the current C locale).
//
// The scratch buffer is zero-initialised and holds src.length() + 5 wide
// characters, while mbstowcs is allowed to write at most src.length() + 1, so
// the buffer always ends in L'\0'.  The return value of mbstowcs is not
// inspected; the result is whatever precedes the first L'\0' in the buffer.
const std::wstring convert_mbstring_to_wstring(const std::string &src)
{
    const size_t byte_count = src.length();

    std::vector<wchar_t> buffer(byte_count + 5);
    wchar_t *const out = &buffer[0];

    std::mbstowcs(out, src.c_str(), byte_count + 1);
    return std::wstring(out);
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
// Converts a wide string to a multibyte string with wcstombs (which uses the
// current C locale).
//
// The scratch buffer is zero-filled and sized for (src.length() + 1)
// characters of MB_CUR_MAX bytes each.  The return value of wcstombs is not
// inspected; the result is whatever precedes the first '\0' in the buffer.
const std::string convert_wstring_to_mbstring(const std::wstring &src)
{
    // Kept so that wcstombs / MB_CUR_MAX resolve whether the C library
    // declares them in the global namespace or only in std.
    using namespace std;

    const size_t capacity = (src.length() + 1) * MB_CUR_MAX;
    std::string buffer(capacity, '\0');
    char *const out = &buffer[0];

    wcstombs(out, src.c_str(), buffer.size());
    return std::string(out);
}
|
|
|
|
// ----------------------------------------------------------------------------------------
|
|
|
|
}
|
|
|
|
#endif // DLIB_UNICODe_CPp_
|
|
|