Lightweight 0.20260617.0
Loading...
Searching...
No Matches
UnicodeConverter.hpp
1// SPDX-License-Identifier: Apache-2.0
2
3#pragma once
4
5#include "../Api.hpp"
6
7#include <concepts>
8#include <iterator>
9#include <optional>
10#include <string>
11#include <string_view>
12
13namespace Lightweight
14{
15
16namespace detail
17{
18
19 template <typename>
20 struct UnicodeConverter;
21
22 template <>
23 struct LIGHTWEIGHT_API UnicodeConverter<char8_t>
24 {
25 // Converts a UTF-32 code point to one to four UTF-8 code units.
26 template <typename OutputIterator>
27 static constexpr OutputIterator Convert(char32_t input, OutputIterator output) noexcept
28 {
29 if (input <= 0x7F)
30 {
31 *output++ = static_cast<char8_t>(input & 0b0111'1111);
32 }
33 else if (input <= 0x07FF)
34 {
35 *output++ = static_cast<char8_t>(((input >> 6) & 0b0001'1111) | 0b1100'0000);
36 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
37 }
38 else if (input <= 0xFFFF)
39 {
40 *output++ = static_cast<char8_t>(((input >> 12) & 0b0000'1111) | 0b1110'0000);
41 *output++ = static_cast<char8_t>(((input >> 6) & 0b0011'1111) | 0b1000'0000);
42 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
43 }
44 else
45 {
46 *output++ = static_cast<char8_t>(((input >> 18) & 0b0000'0111) | 0b1111'0000);
47 *output++ = static_cast<char8_t>(((input >> 12) & 0b0011'1111) | 0b1000'0000);
48 *output++ = static_cast<char8_t>(((input >> 6) & 0b0011'1111) | 0b1000'0000);
49 *output++ = static_cast<char8_t>(((input >> 0) & 0b0011'1111) | 0b1000'0000);
50 }
51 return output;
52 }
53 };
54
55 template <>
56 struct LIGHTWEIGHT_API UnicodeConverter<char16_t>
57 {
58 // Converts a UTF-32 code point to one or two UTF-16 code units.
59 template <typename OutputIterator>
60 static constexpr OutputIterator Convert(char32_t input, OutputIterator output) noexcept
61 {
62 if (input < 0xD800) // [0x0000 .. 0xD7FF]
63 {
64 *output++ = char16_t(input);
65 return output;
66 }
67 else if (input < 0x10000)
68 {
69 if (input < 0xE000)
70 return output; // The UTF-16 code point can not be in surrogate range.
71
72 // [0xE000 .. 0xFFFF]
73 *output++ = char16_t(input);
74 return output;
75 }
76 else if (input < 0x110000) // [0xD800 .. 0xDBFF] [0xDC00 .. 0xDFFF]
77 {
78 *output++ = char16_t(0xD7C0 + (input >> 10));
79 *output++ = char16_t(0xDC00 + (input & 0x3FF));
80 return output;
81 }
82 else
83 return output; // Too large UTF-16 code point.
84 }
85 };
86
87 struct Utf32Converter
88 {
89 char32_t codePoint = 0;
90 int codeUnits = 0;
91
92 static constexpr auto InvalidCodePoint = char32_t { 0xFFFD };
93
94 constexpr std::optional<char32_t> Process(char8_t c8) noexcept
95 {
96 if ((c8 & 0b1100'0000) == 0b1000'0000)
97 {
98 if (codeUnits == 0)
99 return InvalidCodePoint;
100 codePoint <<= 6;
101 codePoint |= c8 & 0b0011'1111;
102 if (--codeUnits == 0)
103 {
104 auto result = codePoint;
105 codePoint = 0;
106 return result;
107 }
108 return std::nullopt;
109 }
110 if (codeUnits == 0)
111 {
112 if ((c8 & 0b1000'0000) == 0)
113 return c8;
114 if ((c8 & 0b1110'0000) == 0b1100'0000)
115 {
116 codePoint = c8 & 0b0001'1111;
117 codeUnits = 1;
118 return std::nullopt;
119 }
120 if ((c8 & 0b1111'0000) == 0b1110'0000)
121 {
122 codePoint = c8 & 0b0000'1111;
123 codeUnits = 2;
124 return std::nullopt;
125 }
126 if ((c8 & 0b1111'1000) == 0b1111'0000)
127 {
128 codePoint = c8 & 0b0000'0111;
129 codeUnits = 3;
130 return std::nullopt;
131 }
132 return InvalidCodePoint;
133 }
134 return InvalidCodePoint;
135 }
136 };
137
138 struct [[nodiscard]] Utf32Iterator
139 {
140 std::u8string_view u8InputString;
141
142 struct [[nodiscard]] iterator
143 {
144 std::u8string_view::iterator current {};
145 std::u8string_view::iterator end {};
146 char32_t codePoint = Utf32Converter::InvalidCodePoint;
147
148 constexpr explicit iterator(std::u8string_view::iterator current, std::u8string_view::iterator end) noexcept:
149 current { current },
150 end { end }
151 {
152 if (current != end)
153 operator++();
154 }
155
156 constexpr char32_t operator*() const noexcept
157 {
158 return codePoint;
159 }
160
161 constexpr iterator& operator++() noexcept
162 {
163 auto converter = Utf32Converter {};
164 codePoint = Utf32Converter::InvalidCodePoint;
165 while (current != end)
166 {
167 if (auto const result = converter.Process(*current++); result.has_value())
168 {
169 codePoint = *result;
170 break;
171 }
172 }
173 return *this;
174 }
175
176 constexpr iterator& operator++(int) noexcept
177 {
178 return ++*this;
179 }
180
181 constexpr bool operator==(iterator const& other) const noexcept
182 {
183 return current == other.current && codePoint == other.codePoint;
184 }
185
186 constexpr bool operator!=(iterator const& other) const noexcept
187 {
188 return !(*this == other);
189 }
190 };
191
192 iterator begin() const noexcept
193 {
194 return iterator { u8InputString.begin(), u8InputString.end() };
195 }
196
197 iterator end() const noexcept
198 {
199 return iterator { u8InputString.end(), u8InputString.end() };
200 }
201 };
202
203} // namespace detail
204
205/// @defgroup Unicode Unicode conversion functions
206///
207/// @brief Functions for converting between different (Unicode) encodings.
208
209/// Converts from UTF-32 to UTF-8.
210///
211/// @ingroup Unicode
212LIGHTWEIGHT_API std::u8string ToUtf8(std::u32string_view u32InputString);
213
214/// Converts from UTF-16 to UTF-8.
215///
216/// @ingroup Unicode
217LIGHTWEIGHT_API std::u8string ToUtf8(std::u16string_view u16InputString);
218
219/// Converts from UTF-16 (as wchar_t) to UTF-8.
220///
221/// @ingroup Unicode
222template <typename T>
223 requires(std::same_as<T, wchar_t> && sizeof(wchar_t) == 2)
224inline LIGHTWEIGHT_FORCE_INLINE std::u8string ToUtf8(std::basic_string_view<T> u16InputString)
225{
226 return ToUtf8(std::u16string_view(reinterpret_cast<char16_t const*>(u16InputString.data()), u16InputString.size()));
227}
228
229/// Converts a wchar_t-based wide string view to UTF-8.
230///
231/// @ingroup Unicode
232template <typename T>
233 requires(std::same_as<T, wchar_t> && sizeof(wchar_t) == 4)
234inline LIGHTWEIGHT_FORCE_INLINE std::u8string ToUtf8(std::basic_string_view<T> u32InputString)
235{
236 return ToUtf8(std::u32string_view(reinterpret_cast<char32_t const*>(u32InputString.data()), u32InputString.size()));
237}
238
239/// Converts from UTF-32 to UTF-16.
240///
241/// @ingroup Unicode
242template <typename T>
243 requires std::same_as<T, char32_t> || (std::same_as<T, wchar_t> && sizeof(wchar_t) == 4)
244std::u16string ToUtf16(std::basic_string_view<T> const u32InputString)
245{
246 std::u16string u16OutputString;
247 u16OutputString.reserve(u32InputString.size());
248 for (auto const c: u32InputString)
249 detail::UnicodeConverter<char16_t>::Convert(static_cast<char32_t>(c), std::back_inserter(u16OutputString));
250 return u16OutputString;
251}
252
253/// Converts a wchar_t-based wide string view to UTF-16.
254///
255/// When `sizeof(wchar_t) == 2` (typically Windows), the wide string is already UTF-16,
256/// so this is a reinterpreting copy.
257///
258/// @ingroup Unicode
259template <typename T>
260 requires(std::same_as<T, wchar_t> && sizeof(wchar_t) == 2)
261inline LIGHTWEIGHT_FORCE_INLINE std::u16string ToUtf16(std::basic_string_view<T> u16InputString)
262{
263 return { reinterpret_cast<char16_t const*>(u16InputString.data()), u16InputString.size() };
264}
265
266/// Converts from UTF-8 to UTF-16.
267///
268/// @ingroup Unicode
269LIGHTWEIGHT_API std::u16string ToUtf16(std::u8string_view u8InputString);
270
271/// Converts from local 8-bit string to UTF-16.
272///
273/// @ingroup Unicode
274LIGHTWEIGHT_API std::u16string ToUtf16(std::string const& localeInputString);
275
276/// Converts from UTF-8 to UTF-32.
277///
278/// @ingroup Unicode
279template <typename T = std::u32string>
280T ToUtf32(std::u8string_view u8InputString)
281{
282 auto result = T {};
283 for (char32_t const c32: detail::Utf32Iterator { u8InputString })
284 result.push_back(c32);
285 return result;
286}
287
288/// Converts from UTF-16 to UTF-32.
289///
290/// @ingroup Unicode
291template <typename T = std::u32string>
292T ToUtf32(std::u16string_view u16InputString)
293{
294 auto result = T {};
295 result.reserve(u16InputString.size());
296
297 for (size_t i = 0; i < u16InputString.size(); ++i)
298 {
299 auto const c16 = u16InputString[i];
300 if (c16 >= 0xD800 && c16 <= 0xDBFF && i + 1 < u16InputString.size() && u16InputString[i + 1] >= 0xDC00
301 && u16InputString[i + 1] <= 0xDFFF)
302 {
303 auto const high = static_cast<char32_t>(c16 - 0xD800);
304 auto const low = static_cast<char32_t>(u16InputString[++i] - 0xDC00);
305 result.push_back(static_cast<char32_t>(0x10000 + (high << 10) + low));
306 }
307 else if (c16 >= 0xD800 && c16 <= 0xDFFF)
308 {
309 // Orphan surrogate (high without low, or stray low) — emit replacement.
310 result.push_back(detail::Utf32Converter::InvalidCodePoint);
311 }
312 else
313 {
314 result.push_back(static_cast<char32_t>(c16));
315 }
316 }
317
318 return result;
319}
320
321/// Converts a UTF-8 string to wchar_t-based wide string.
322///
323/// @ingroup Unicode
324LIGHTWEIGHT_API std::wstring ToStdWideString(std::u8string_view u8InputString);
325
326/// Converts a local 8-bit string to wchar_t-based wide string.
327///
328/// @ingroup Unicode
329LIGHTWEIGHT_API std::wstring ToStdWideString(std::string const& localeInputString);
330
331/// Converts from Windows-1252 encoding to UTF-8.
332///
333/// @param input The Windows-1252 encoded string
334/// @return UTF-8 encoded string
335///
336/// @ingroup Unicode
337LIGHTWEIGHT_API std::u8string ConvertWindows1252ToUtf8(std::string_view input);
338
339} // namespace Lightweight
LIGHTWEIGHT_API std::u8string ConvertWindows1252ToUtf8(std::string_view input)
T ToUtf32(std::u8string_view u8InputString)
LIGHTWEIGHT_API std::u8string ToUtf8(std::u32string_view u32InputString)
std::u16string ToUtf16(std::basic_string_view< T > const u32InputString)
LIGHTWEIGHT_API std::wstring ToStdWideString(std::u8string_view u8InputString)