Utilities#
Character and String Types#
-
using icubaby::char8 = char8_t#
The type of a UTF-8 code unit. Defined as
char8_t
when the native type is available and char
otherwise.
Code Point Constants#
-
constexpr auto icubaby::replacement_char = char32_t{0xFFFD}#
A constant for the U+FFFD REPLACEMENT CHARACTER code point.
-
constexpr auto icubaby::zero_width_no_break_space = char32_t{0xFEFF}#
A constant for the U+FEFF ZERO WIDTH NO-BREAK SPACE (BYTE ORDER MARK) code point.
-
constexpr auto icubaby::byte_order_mark = zero_width_no_break_space#
A constant for the U+FEFF ZERO WIDTH NO-BREAK SPACE (BYTE ORDER MARK) code point. This is an alias for icubaby::zero_width_no_break_space.
-
constexpr auto icubaby::code_point_bits = 21U#
The number of bits required to represent a code point.
Starting with Unicode 2.0, characters are encoded in the range U+0000..U+10FFFF, which amounts to a 21-bit code space.
-
constexpr auto icubaby::first_high_surrogate = char32_t{0xD800}#
The code point of the first UTF-16 high surrogate.
-
constexpr auto icubaby::last_high_surrogate = char32_t{0xDBFF}#
The code point of the last UTF-16 high surrogate.
-
constexpr auto icubaby::first_low_surrogate = char32_t{0xDC00}#
The code point of the first UTF-16 low surrogate.
-
constexpr auto icubaby::last_low_surrogate = char32_t{0xDFFF}#
The code point of the last UTF-16 low surrogate.
-
constexpr auto icubaby::max_code_point = char32_t{0x10FFFF}#
The value of the last (largest) code point, U+10FFFF.
Unicode Char Types#
-
using icubaby::character_types = details::make_t<char8, char16_t, char32_t>#
A list of the character types used for UTF-8, UTF-16, and UTF-32 encoded text.
-
template<typename T>
struct is_unicode_char_type : public std::bool_constant<details::contains_v<character_types, T>># Checks whether the argument is one of the unicode character types.
Provides the boolean constant
value
which is true if T is one of the unicode character types as defined by icubaby::character_types and false otherwise.
- Template Parameters:
T – The type to be checked.
-
template<typename T>
constexpr bool icubaby::is_unicode_char_type_v = is_unicode_char_type<T>::value# A helper variable template to simplify use of icubaby::is_unicode_char_type.
-
template<typename T>
struct is_unicode_input_type : public std::bool_constant<is_unicode_char_type_v<T> || std::is_same_v<T, std::byte>># Checks whether the argument is one of the unicode data source types.
Provides the constant
value
which is equal to true if T is one of the types which may contain unicode data; otherwise, value is equal to false. The unicode data types are the types allowed by icubaby::is_unicode_char_type_v plus std::byte.
- Template Parameters:
T – The type to be checked.
-
template<typename T>
constexpr bool icubaby::is_unicode_input_v = is_unicode_input_type<T>::value# A helper variable template to simplify use of icubaby::is_unicode_input_type.
Longest Sequence#
-
template<unicode_char_type Encoding>
struct longest_sequence# The number of code-units in the longest legal representation of a code-point.
Provides the constant
value
which is of type std::size_t.
- Template Parameters:
Encoding – The encoding to be used.
-
template<unicode_char_type Encoding>
constexpr auto icubaby::longest_sequence_v = longest_sequence<Encoding>::value# A helper variable template to simplify use of icubaby::longest_sequence<>.
Index#
Returns an iterator to the beginning of the pos’th code point in a range of code units.
The functions documented here assume toolchain support for C++20 ranges. If ranges are not available, an implementation whose signature accepts a conventional [begin, end) iterator pair is supplied instead.
-
template<std::input_iterator I, std::sentinel_for<I> S, typename Proj = std::identity>
constexpr I icubaby::index(I first, S last, std::size_t pos, Proj proj = {})# Returns an iterator to the beginning of the pos’th code point in the code unit sequence [first, last).
- Parameters:
first – The start of the range of code units to examine.
last – The end of the range of code units to examine.
pos – The number of code points to move.
proj – Projection to apply to the elements.
- Returns:
An iterator that is ‘pos’ code points after the start of the range or ‘last’ if the end of the range was encountered.
-
template<std::ranges::input_range Range, typename Proj = std::identity>
constexpr std::ranges::borrowed_iterator_t<Range> icubaby::index(Range &&range, std::size_t pos, Proj proj = {})# Returns an iterator to the beginning of the pos’th code point in the range of code-units given by
range.
- Template Parameters:
Range – An input range.
Proj – The type of the projection applied to elements.
- Parameters:
range – The range of code units to examine.
pos – The number of code points to move.
proj – Projection to apply to the elements.
- Returns:
An iterator to the start of the selected code point, or an iterator equal to the end of the range if no such code point is found.
Length#
Returns the number of code points in a sequence of code units.
The functions documented here assume toolchain support for C++20 ranges. If ranges are not available, an implementation whose signature accepts a conventional [begin, end) iterator pair is supplied instead.
-
template<std::input_iterator I, std::sentinel_for<I> S, typename Proj = std::identity>
constexpr std::iter_difference_t<I> icubaby::length(I first, S last, Proj proj = {})# Returns the number of code points in a sequence.
Note
The input sequence must be well formed for the result to be accurate.
- Parameters:
first – The start of the range of code units to examine.
last – The end of the range of code units to examine.
proj – Projection to apply to the elements.
- Returns:
The number of code points.
-
template<std::ranges::input_range Range, typename Proj = std::identity>
constexpr std::ranges::range_difference_t<Range> icubaby::length(Range &&range, Proj proj = {})# Returns the number of code points in a sequence.
Note
The input sequence must be well formed for the result to be accurate.
- Template Parameters:
Range – An input range.
Proj – Type of the projection applied to elements.
- Parameters:
range – The range of the elements to examine.
proj – Projection to apply to the elements.
- Returns:
The number of code points.
Surrogates#
Functions that determine whether a particular code point is one of the high or low surrogates.
-
constexpr bool icubaby::is_high_surrogate(char32_t code_point) noexcept#
Returns true if the code point
code_point
represents a UTF-16 high surrogate.
- Parameters:
code_point – The code point to be tested.
- Returns:
true if the code point
code_point
represents a UTF-16 high surrogate.
-
constexpr bool icubaby::is_low_surrogate(char32_t code_point) noexcept#
Returns true if the code point
code_point
represents a UTF-16 low surrogate.
- Parameters:
code_point – The code point to be tested.
- Returns:
true if the code point
code_point
represents a UTF-16 low surrogate.
-
constexpr bool icubaby::is_surrogate(char32_t code_point) noexcept#
Returns true if the code point
code_point
represents a UTF-16 low or high surrogate.
- Parameters:
code_point – The code point to be tested.
- Returns:
true if the code point
code_point
represents a UTF-16 high or low surrogate.
Code Point Start#
An overloaded function that can be used to determine whether a code unit represents the start of a code point.
-
constexpr bool icubaby::is_code_point_start(char8 code_unit) noexcept#
Returns true if
code_unit
represents the start of a multi-byte UTF-8 sequence.
- Parameters:
code_unit – The UTF-8 code unit to be tested.
- Returns:
true if
code_unit
represents the start of a multi-byte UTF-8 sequence.
-
constexpr bool icubaby::is_code_point_start(char16_t code_unit) noexcept#
Returns true if
code_unit
represents the start of a UTF-16 high/low surrogate pair.
- Parameters:
code_unit – The UTF-16 code unit to be tested.
- Returns:
true if
code_unit
represents the start of a UTF-16 high/low surrogate pair.
-
constexpr bool icubaby::is_code_point_start(char32_t code_unit) noexcept#
Returns true if
code_unit
represents a valid UTF-32 code point.
- Parameters:
code_unit – The UTF-32 code unit to be tested.
- Returns:
true if
code_unit
represents a valid UTF-32 code point.