Lightweight 0.20250904.0
Loading...
Searching...
No Matches
UnicodeConverter.hpp
1// SPDX-License-Identifier: Apache-2.0
2
3#pragma once
4
5#include "../Api.hpp"
6
7#include <concepts>
8#include <iterator>
9#include <optional>
10#include <string>
11#include <string_view>
12
13namespace Lightweight
14{
15
16namespace detail
17{
18
19 template <typename>
20 struct UnicodeConverter;
21
22 template <>
23 struct LIGHTWEIGHT_API UnicodeConverter<char8_t>
24 {
25 // Converts a UTF-32 code point to one to four UTF-8 code units.
26 template <typename OutputIterator>
27 static constexpr OutputIterator Convert(char32_t input, OutputIterator output) noexcept
28 {
29 if (input <= 0x7F)
30 {
31 *output++ = static_cast<char8_t>(input & 0b0111'1111);
32 }
33 else if (input <= 0x07FF)
34 {
35 *output++ = static_cast<char8_t>(((input >> 6) & 0b0001'1111) | 0b1100'0000);
36 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
37 }
38 else if (input <= 0xFFFF)
39 {
40 *output++ = static_cast<char8_t>(((input >> 12) & 0b0000'1111) | 0b1110'0000);
41 *output++ = static_cast<char8_t>(((input >> 6) & 0b0011'1111) | 0b1000'0000);
42 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
43 }
44 else
45 {
46 *output++ = static_cast<char8_t>(((input >> 18) & 0b0000'0111) | 0b1111'0000);
47 *output++ = static_cast<char8_t>(((input >> 12) & 0b0011'1111) | 0b1000'0000);
48 *output++ = static_cast<char8_t>(((input >> 6) & 0b0011'1111) | 0b1000'0000);
49 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
50 }
51 return output;
52 }
53 };
54
55 template <>
56 struct LIGHTWEIGHT_API UnicodeConverter<char16_t>
57 {
58 // Converts a UTF-32 code point to one or two UTF-16 code units.
59 template <typename OutputIterator>
60 static constexpr OutputIterator Convert(char32_t input, OutputIterator output) noexcept
61 {
62 if (input < 0xD800) // [0x0000 .. 0xD7FF]
63 {
64 *output++ = char16_t(input);
65 return output;
66 }
67 else if (input < 0x10000)
68 {
69 if (input < 0xE000)
70 return output; // The UTF-16 code point can not be in surrogate range.
71
72 // [0xE000 .. 0xFFFF]
73 *output++ = char16_t(input);
74 return output;
75 }
76 else if (input < 0x110000) // [0xD800 .. 0xDBFF] [0xDC00 .. 0xDFFF]
77 {
78 *output++ = char16_t(0xD7C0 + (input >> 10));
79 *output++ = char16_t(0xDC00 + (input & 0x3FF));
80 return output;
81 }
82 else
83 return output; // Too large UTF-16 code point.
84 }
85 };
86
/// Incremental UTF-8 to UTF-32 decoder that is fed one code unit at a time.
///
/// No check is made for overlong encodings, encoded surrogates, or values above U+10FFFF;
/// only the lead/continuation byte structure is validated.
struct Utf32Converter
{
    /// Bits of the code point accumulated so far for the sequence in progress.
    char32_t codePoint = 0;
    /// Number of continuation bytes still expected; 0 means no sequence is in progress.
    int codeUnits = 0;

    /// Value reported for every structurally invalid byte (U+FFFD REPLACEMENT CHARACTER).
    static constexpr auto InvalidCodePoint = char32_t { 0xFFFD };

    /// Consumes one UTF-8 code unit.
    ///
    /// @param c8 The next code unit of the input.
    ///
    /// @return The decoded code point once a sequence is complete, InvalidCodePoint if @p c8 is not
    ///         valid at this position, or std::nullopt if more code units are needed.
    ///
    /// A non-continuation byte that interrupts a multi-byte sequence is consumed and reported as
    /// InvalidCodePoint; the partial sequence is discarded so later input is decoded from a clean state.
    constexpr std::optional<char32_t> Process(char8_t c8) noexcept
    {
        auto const isContinuationByte = (c8 & 0b1100'0000) == 0b1000'0000;
        if (isContinuationByte)
        {
            // A continuation byte with no sequence in progress is a stray byte.
            if (codeUnits == 0)
                return InvalidCodePoint;

            codePoint <<= 6;
            codePoint |= c8 & 0b0011'1111;
            if (--codeUnits != 0)
                return std::nullopt;

            auto const result = codePoint;
            codePoint = 0;
            return result;
        }

        if (codeUnits != 0)
        {
            // The sequence in progress was cut short. Drop the partial state; otherwise a later
            // continuation byte would be merged into the stale bits and yield a bogus code point.
            codePoint = 0;
            codeUnits = 0;
            return InvalidCodePoint;
        }

        // ASCII: 0xxxxxxx
        if ((c8 & 0b1000'0000) == 0)
            return c8;

        // Lead byte of a 2-byte sequence: 110xxxxx
        if ((c8 & 0b1110'0000) == 0b1100'0000)
        {
            codePoint = c8 & 0b0001'1111;
            codeUnits = 1;
            return std::nullopt;
        }

        // Lead byte of a 3-byte sequence: 1110xxxx
        if ((c8 & 0b1111'0000) == 0b1110'0000)
        {
            codePoint = c8 & 0b0000'1111;
            codeUnits = 2;
            return std::nullopt;
        }

        // Lead byte of a 4-byte sequence: 11110xxx
        if ((c8 & 0b1111'1000) == 0b1111'0000)
        {
            codePoint = c8 & 0b0000'0111;
            codeUnits = 3;
            return std::nullopt;
        }

        // 11111xxx is never a valid UTF-8 byte.
        return InvalidCodePoint;
    }
};
137
138 struct [[nodiscard]] Utf32Iterator
139 {
140 std::u8string_view u8InputString;
141
142 struct [[nodiscard]] iterator
143 {
144 std::u8string_view::iterator current {};
145 std::u8string_view::iterator end {};
146 char32_t codePoint = Utf32Converter::InvalidCodePoint;
147
148 constexpr explicit iterator(std::u8string_view::iterator current, std::u8string_view::iterator end) noexcept:
149 current { current },
150 end { end }
151 {
152 if (current != end)
153 operator++();
154 }
155
156 constexpr char32_t operator*() const noexcept
157 {
158 return codePoint;
159 }
160
161 constexpr iterator& operator++() noexcept
162 {
163 auto converter = Utf32Converter {};
164 codePoint = Utf32Converter::InvalidCodePoint;
165 while (current != end)
166 {
167 if (auto const result = converter.Process(*current++); result.has_value())
168 {
169 codePoint = *result;
170 break;
171 }
172 }
173 return *this;
174 }
175
176 constexpr iterator& operator++(int) noexcept
177 {
178 return ++*this;
179 }
180
181 constexpr bool operator==(iterator const& other) const noexcept
182 {
183 return current == other.current && codePoint == other.codePoint;
184 }
185
186 constexpr bool operator!=(iterator const& other) const noexcept
187 {
188 return !(*this == other);
189 }
190 };
191
192 iterator begin() const noexcept
193 {
194 return iterator { u8InputString.begin(), u8InputString.end() };
195 }
196
197 iterator end() const noexcept
198 {
199 return iterator { u8InputString.end(), u8InputString.end() };
200 }
201 };
202
203} // namespace detail
204
205/// @defgroup Unicode Unicode conversion functions
206///
207/// @brief Functions for converting between different (Unicode) encodings.
208
209/// Converts from UTF-32 to UTF-8.
210///
211/// @ingroup Unicode
212LIGHTWEIGHT_API std::u8string ToUtf8(std::u32string_view u32InputString);
213
214/// Converts from UTF-16 to UTF-8.
215///
216/// @ingroup Unicode
217LIGHTWEIGHT_API std::u8string ToUtf8(std::u16string_view u16InputString);
218
219/// Converts from UTF-16 (as wchar_t) to UTF-8.
220///
221/// @ingroup Unicode
222template <typename T>
223 requires(std::same_as<T, wchar_t> && sizeof(wchar_t) == 2)
224inline LIGHTWEIGHT_FORCE_INLINE std::u8string ToUtf8(std::basic_string_view<T> u16InputString)
225{
226 return ToUtf8(std::u16string_view(reinterpret_cast<char16_t const*>(u16InputString.data()), u16InputString.size()));
227}
228
229/// Converts a wchar_t-based wide string view to UTF-8.
230///
231/// @ingroup Unicode
232template <typename T>
233 requires(std::same_as<T, wchar_t> && sizeof(wchar_t) == 4)
234inline LIGHTWEIGHT_FORCE_INLINE std::u8string ToUtf8(std::basic_string_view<T> u32InputString)
235{
236 return ToUtf8(std::u32string_view(reinterpret_cast<char32_t const*>(u32InputString.data()), u32InputString.size()));
237}
238
239/// Converts from UTF-32 to UTF-16.
240///
241/// @ingroup Unicode
242template <typename T>
243 requires std::same_as<T, char32_t> || (std::same_as<T, wchar_t> && sizeof(wchar_t) == 4)
244std::u16string ToUtf16(std::basic_string_view<T> const u32InputString)
245{
246 std::u16string u16OutputString;
247 u16OutputString.reserve(u32InputString.size());
248 detail::UnicodeConverter<char16_t> converter;
249 for (auto const c: u32InputString)
250 converter.Convert(c, std::back_inserter(u16OutputString));
251 return u16OutputString;
252}
253
254/// Converts from UTF-8 to UTF-16.
255///
256/// @ingroup Unicode
257LIGHTWEIGHT_API std::u16string ToUtf16(std::u8string_view u8InputString);
258
259/// Converts from local 8-bit string to UTF-16.
260///
261/// @ingroup Unicode
262LIGHTWEIGHT_API std::u16string ToUtf16(std::string const& localeInputString);
263
264/// Converts from UTF-8 to UTF-32.
265///
266/// @ingroup Unicode
267template <typename T = std::u32string>
268T ToUtf32(std::u8string_view u8InputString)
269{
270 auto result = T {};
271 for (char32_t const c32: detail::Utf32Iterator { u8InputString })
272 result.push_back(c32);
273 return result;
274}
275
/// Converts a UTF-16 string to UTF-32.
///
/// A high surrogate immediately followed by a low surrogate is combined into a single
/// supplementary-plane code point. A surrogate without its matching partner is replaced by
/// U+FFFD REPLACEMENT CHARACTER.
///
/// @tparam T Result container; it must be default-constructible and provide push_back(char32_t).
///           Defaults to std::u32string.
///
/// @ingroup Unicode
template <typename T = std::u32string>
T ToUtf32(std::u16string_view u16InputString)
{
    constexpr auto ReplacementCharacter = char32_t { 0xFFFD };
    auto const isHighSurrogate = [](char16_t unit) { return unit >= 0xD800 && unit < 0xDC00; };
    auto const isLowSurrogate = [](char16_t unit) { return unit >= 0xDC00 && unit < 0xE000; };

    auto result = T {};

    auto const last = u16InputString.end();
    for (auto it = u16InputString.begin(); it != last; ++it)
    {
        char16_t const unit = *it;

        if (!isHighSurrogate(unit) && !isLowSurrogate(unit))
        {
            // Basic Multilingual Plane: the code unit is the code point.
            result.push_back(static_cast<char32_t>(unit));
            continue;
        }

        auto const next = it + 1;
        if (isHighSurrogate(unit) && next != last && isLowSurrogate(*next))
        {
            // Each surrogate carries 10 payload bits; the pair encodes (code point - 0x10000).
            auto const highBits = static_cast<char32_t>(unit & 0x3FF) << 10;
            auto const lowBits = static_cast<char32_t>(*next & 0x3FF);
            result.push_back(static_cast<char32_t>(0x10000 + (highBits | lowBits)));
            it = next; // the low surrogate has been consumed as well
            continue;
        }

        // Lone low surrogate, or high surrogate not followed by a low surrogate.
        result.push_back(ReplacementCharacter);
    }

    return result;
}
294
/// Converts a UTF-8 string to a wchar_t-based wide string.
296///
297/// @ingroup Unicode
298LIGHTWEIGHT_API std::wstring ToStdWideString(std::u8string_view u8InputString);
299
/// Converts a local 8-bit string to a wchar_t-based wide string.
301///
302/// @ingroup Unicode
303LIGHTWEIGHT_API std::wstring ToStdWideString(std::string const& localeInputString);
304
305} // namespace Lightweight
T ToUtf32(std::u8string_view u8InputString)
LIGHTWEIGHT_API std::u8string ToUtf8(std::u32string_view u32InputString)
std::u16string ToUtf16(std::basic_string_view< T > const u32InputString)