Lightweight 0.1.0
Loading...
Searching...
No Matches
UnicodeConverter.hpp
1// SPDX-License-Identifier: Apache-2.0
2
3#pragma once
4
5#include "../Api.hpp"
6
7#include <concepts>
8#include <iterator>
9#include <optional>
10#include <string>
11#include <string_view>
12
13namespace detail
14{
15
16template <typename>
17struct UnicodeConverter;
18
19template <>
20struct LIGHTWEIGHT_API UnicodeConverter<char8_t>
21{
22 // Converts a UTF-32 code point to one to four UTF-8 code units.
23 template <typename OutputIterator>
24 static constexpr OutputIterator Convert(char32_t input, OutputIterator output) noexcept
25 {
26 if (input <= 0x7F)
27 {
28 *output++ = static_cast<char8_t>(input & 0b0111'1111);
29 }
30 else if (input <= 0x07FF)
31 {
32 *output++ = static_cast<char8_t>(((input >> 6) & 0b0001'1111) | 0b1100'0000);
33 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
34 }
35 else if (input <= 0xFFFF)
36 {
37 *output++ = static_cast<char8_t>(((input >> 12) & 0b0000'1111) | 0b1110'0000);
38 *output++ = static_cast<char8_t>(((input >> 6) & 0b0011'1111) | 0b1000'0000);
39 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
40 }
41 else
42 {
43 *output++ = static_cast<char8_t>(((input >> 18) & 0b0000'0111) | 0b1111'0000);
44 *output++ = static_cast<char8_t>(((input >> 12) & 0b0011'1111) | 0b1000'0000);
45 *output++ = static_cast<char8_t>(((input >> 6) & 0b0011'1111) | 0b1000'0000);
46 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
47 }
48 return output;
49 }
50};
51
52template <>
53struct LIGHTWEIGHT_API UnicodeConverter<char16_t>
54{
55 // Converts a UTF-32 code point to one or two UTF-16 code units.
56 template <typename OutputIterator>
57 static constexpr OutputIterator Convert(char32_t input, OutputIterator output) noexcept
58 {
59 if (input < 0xD800) // [0x0000 .. 0xD7FF]
60 {
61 *output++ = char16_t(input);
62 return output;
63 }
64 else if (input < 0x10000)
65 {
66 if (input < 0xE000)
67 return output; // The UTF-16 code point can not be in surrogate range.
68
69 // [0xE000 .. 0xFFFF]
70 *output++ = char16_t(input);
71 return output;
72 }
73 else if (input < 0x110000) // [0xD800 .. 0xDBFF] [0xDC00 .. 0xDFFF]
74 {
75 *output++ = char16_t(0xD7C0 + (input >> 10));
76 *output++ = char16_t(0xDC00 + (input & 0x3FF));
77 return output;
78 }
79 else
80 return output; // Too large UTF-16 code point.
81 }
82};
83
84struct Utf32Converter
85{
86 char32_t codePoint = 0;
87 int codeUnits = 0;
88
89 static constexpr auto InvalidCodePoint = char32_t { 0xFFFD };
90
91 constexpr std::optional<char32_t> Process(char8_t c8) noexcept
92 {
93 if ((c8 & 0b1100'0000) == 0b1000'0000)
94 {
95 if (codeUnits == 0)
96 return InvalidCodePoint;
97 codePoint <<= 6;
98 codePoint |= c8 & 0b0011'1111;
99 if (--codeUnits == 0)
100 {
101 auto result = codePoint;
102 codePoint = 0;
103 return result;
104 }
105 return std::nullopt;
106 }
107 if (codeUnits == 0)
108 {
109 if ((c8 & 0b1000'0000) == 0)
110 return c8;
111 if ((c8 & 0b1110'0000) == 0b1100'0000)
112 {
113 codePoint = c8 & 0b0001'1111;
114 codeUnits = 1;
115 return std::nullopt;
116 }
117 if ((c8 & 0b1111'0000) == 0b1110'0000)
118 {
119 codePoint = c8 & 0b0000'1111;
120 codeUnits = 2;
121 return std::nullopt;
122 }
123 if ((c8 & 0b1111'1000) == 0b1111'0000)
124 {
125 codePoint = c8 & 0b0000'0111;
126 codeUnits = 3;
127 return std::nullopt;
128 }
129 return InvalidCodePoint;
130 }
131 return InvalidCodePoint;
132 }
133};
134
135struct [[nodiscard]] Utf32Iterator
136{
137 std::u8string_view u8InputString;
138
139 struct [[nodiscard]] iterator
140 {
141 std::u8string_view::iterator current {};
142 std::u8string_view::iterator end {};
143 char32_t codePoint = Utf32Converter::InvalidCodePoint;
144
145 constexpr explicit iterator(std::u8string_view::iterator current, std::u8string_view::iterator end) noexcept:
146 current { current },
147 end { end }
148 {
149 if (current != end)
150 operator++();
151 }
152
153 constexpr char32_t operator*() const noexcept
154 {
155 return codePoint;
156 }
157
158 constexpr iterator& operator++() noexcept
159 {
160 auto converter = Utf32Converter {};
161 codePoint = Utf32Converter::InvalidCodePoint;
162 while (current != end)
163 {
164 if (auto const result = converter.Process(*current++); result.has_value())
165 {
166 codePoint = *result;
167 break;
168 }
169 }
170 return *this;
171 }
172
173 constexpr iterator& operator++(int) noexcept
174 {
175 return ++*this;
176 }
177
178 constexpr bool operator==(iterator const& other) const noexcept
179 {
180 return current == other.current && codePoint == other.codePoint;
181 }
182
183 constexpr bool operator!=(iterator const& other) const noexcept
184 {
185 return !(*this == other);
186 }
187 };
188
189 iterator begin() const noexcept
190 {
191 return iterator { u8InputString.begin(), u8InputString.end() };
192 }
193
194 iterator end() const noexcept
195 {
196 return iterator { u8InputString.end(), u8InputString.end() };
197 }
198};
199
200} // namespace detail
201
202/// @defgroup Unicode Unicode conversion functions
203///
204/// @brief Functions for converting between different (Unicode) encodings.
205
206/// Converts from UTF-32 to UTF-8.
207///
208/// @ingroup Unicode
209LIGHTWEIGHT_API std::u8string ToUtf8(std::u32string_view u32InputString);
210
211/// Converts from UTF-16 to UTF-8.
212///
213/// @ingroup Unicode
214LIGHTWEIGHT_API std::u8string ToUtf8(std::u16string_view u16InputString);
215
216/// Converts from UTF-16 (as wchar_t) to UTF-8.
217///
218/// @ingroup Unicode
219template <typename T>
220 requires(std::same_as<T, wchar_t> && sizeof(wchar_t) == 2)
221inline LIGHTWEIGHT_FORCE_INLINE std::u8string ToUtf8(std::basic_string_view<T> u16InputString)
222{
223 return ToUtf8(std::u16string_view(reinterpret_cast<const char16_t*>(u16InputString.data()), u16InputString.size()));
224}
225
226/// Converts a wchar_t-based wide string view to UTF-8.
227///
228/// @ingroup Unicode
229template <typename T>
230 requires(std::same_as<T, wchar_t> && sizeof(wchar_t) == 4)
231inline LIGHTWEIGHT_FORCE_INLINE std::u8string ToUtf8(std::basic_string_view<T> u32InputString)
232{
233 return ToUtf8(std::u32string_view(reinterpret_cast<const char32_t*>(u32InputString.data()), u32InputString.size()));
234}
235
236/// Converts from UTF-32 to UTF-16.
237///
238/// @ingroup Unicode
239template <typename T>
240 requires std::same_as<T, char32_t> || (std::same_as<T, wchar_t> && sizeof(wchar_t) == 4)
241std::u16string ToUtf16(const std::basic_string_view<T> u32InputString)
242{
243 std::u16string u16OutputString;
244 u16OutputString.reserve(u32InputString.size());
245 detail::UnicodeConverter<char16_t> converter;
246 for (auto const c: u32InputString)
247 converter.Convert(c, std::back_inserter(u16OutputString));
248 return u16OutputString;
249}
250
251/// Converts from UTF-8 to UTF-16.
252///
253/// @ingroup Unicode
254LIGHTWEIGHT_API std::u16string ToUtf16(std::u8string_view u8InputString);
255
256/// Converts from local 8-bit string to UTF-16.
257///
258/// @ingroup Unicode
259LIGHTWEIGHT_API std::u16string ToUtf16(std::string const& localeInputString);
260
261/// Converts from UTF-8 to UTF-32.
262///
263/// @ingroup Unicode
264template <typename T = std::u32string>
265T ToUtf32(std::u8string_view u8InputString)
266{
267 auto result = T {};
268 for (char32_t const c32: detail::Utf32Iterator { u8InputString })
269 result.push_back(c32);
270 return result;
271}
272
273
/// Converts from UTF-16 to UTF-32.
///
/// A high surrogate that is immediately followed by a low surrogate is combined into a single
/// supplementary code point. Any surrogate that is not part of such a pair is replaced by
/// U+FFFD (REPLACEMENT CHARACTER). All other code units are copied through unchanged.
///
/// @tparam T Type of the returned container; defaults to std::u32string.
///
/// @ingroup Unicode
template <typename T = std::u32string>
T ToUtf32(std::u16string_view u16InputString)
{
    constexpr auto ReplacementCharacter = char32_t { 0xFFFD };
    auto const isHighSurrogate = [](char16_t c) { return c >= 0xD800 && c <= 0xDBFF; };
    auto const isLowSurrogate = [](char16_t c) { return c >= 0xDC00 && c <= 0xDFFF; };

    auto result = T {};

    auto const last = u16InputString.end();
    for (auto it = u16InputString.begin(); it != last; ++it)
    {
        char16_t const unit = *it;

        if (!isHighSurrogate(unit) && !isLowSurrogate(unit))
        {
            // Basic Multilingual Plane: the code unit is the code point.
            result.push_back(static_cast<char32_t>(unit));
            continue;
        }

        auto const next = it + 1;
        if (isHighSurrogate(unit) && next != last && isLowSurrogate(*next))
        {
            // The high surrogate carries the upper 10 bits, the low surrogate the lower 10 bits.
            char16_t const low = *next;
            result.push_back(static_cast<char32_t>(0x10000 + (((unit & 0x3FF) << 10) | (low & 0x3FF))));
            it = next; // The low surrogate has been consumed as well.
            continue;
        }

        // Low surrogate without a preceding high one, or high surrogate without a following low one.
        result.push_back(ReplacementCharacter);
    }

    return result;
}
292
/// Converts a UTF-8 string to a wchar_t-based wide string.
///
/// @ingroup Unicode
296LIGHTWEIGHT_API std::wstring ToStdWideString(std::u8string_view u8InputString);
297
/// Converts a local 8-bit string to a wchar_t-based wide string.
///
/// @ingroup Unicode
301LIGHTWEIGHT_API std::wstring ToStdWideString(std::string const& localeInputString);
T ToUtf32(std::u8string_view u8InputString)
std::u16string ToUtf16(const std::basic_string_view< T > u32InputString)
LIGHTWEIGHT_API std::u8string ToUtf8(std::u32string_view u32InputString)