Transcoder Internals#

Byte Transcoder#

template<unicode_char_type ToEncoding>
class transcoder<std::byte, ToEncoding>#

The “byte transcoder” takes a sequence of bytes, determines their encoding and converts to a specified encoding.

This transcoder is used when the input encoding is not known at compile-time. If present, a leading byte-order-mark is interpreted to select the source encoding; if not present, UTF-8 encoding is assumed.

The byte transcoder is implemented as a finite state machine. The following diagram shows the state transitions that occur as input bytes are received. Each vertex rectangle represents a state (the upper half has the state name and the lower briefly describes the meaning of that state). Each edge describes the condition for that transition to be made.

  • An edge with description of the form x=y (where y is a hexadecimal constant) is taken if the input value x is equal to the constant y.

  • An edge with the description “otherwise” is taken if no other edges with the same origin are matched.

  • An edge without a description is unconditionally taken for the next byte.

digraph StateTransitionDiagram {
    compound=true;
    rankdir="TB";
    fontsize="9pt"
    #graph [splines=curved ]
    graph [ranksep="0.3" nodesep="0.15"]
    node [colorscheme="rdylbu3"  fontsize="9pt"]
    #node [shape="circle"fixedsize=true height=1.5 width=1.5]
    node [shape=record]
    edge [fontsize="9pt"]

    init [label="" shape=none];
    start [fontsize="12pt"];
    init -> start;

    utf16_be_bom_byte1[color=1 label="{utf16_be_bom_byte1 | UTF-16BE, BOM, Byte 1}"]
    utf32_or_16_le_bom_byte2[color=1 label="{utf32_or_16_le_bom_byte2 | UTF-32/16LE, BOM, Byte 2}"]
    utf8_bom_byte1[color=1 label="{utf8_bom_byte1 | UTF-8, BOM, Byte 1}"]
    utf8_bom_byte2[color=1 label="{utf8_bom_byte2 | UTF-8, BOM, Byte 2}"]
    utf32_or_16_le_bom_byte1[color=1 label="{utf32_or_16_le_bom_byte1 | UTF-32/16LE, BOM, Byte 1}"]
    utf32_or_16_be_bom_byte1[color=1 label="{utf32_or_16_be_bom_byte1 | UTF-32/16BE, BOM, Byte 1}"]
    utf32_be_bom_byte2[color=1 label="{utf32_be_bom_byte2 | UTF-32BE, BOM, Byte 2}"]
    utf32_le_bom_byte3[color=1 label="{utf32_le_bom_byte3 | UTF-32LE, BOM, Byte 3}"]
    utf32_be_bom_byte3[color=1 label="{utf32_be_bom_byte3 | UTF-32BE, BOM, Byte 3}"]

    run8[color=3 label="{run8 | UTF-8, Run}"]
    run_16be_byte0[color=3 label="{run_16be_byte0 | UTF-16BE,\ Run,\ Byte 0}"]
    run_16be_byte1[color=3 label="{run_16be_byte1 | UTF-16BE,\ Run,\ Byte 1}"]
    run_16le_byte0[color=3 label="{run_16le_byte0 | UTF-16LE,\ Run,\ Byte\ 0}"]
    run_16le_byte1[color=3 label="{run_16le_byte1 | UTF-16LE,\ Run,\ Byte\ 1}"]
    run_32be_byte0[color=3 label="{run_32be_byte0 | UTF-32BE,\ Run,\ Byte\ 0}"]
    run_32be_byte1[color=3 label="{run_32be_byte1 | UTF-32BE,\ Run,\ Byte\ 1}"]
    run_32be_byte2[color=3 label="{run_32be_byte2 | UTF-32BE,\ Run,\ Byte\ 2}"]
    run_32be_byte3[color=3 label="{run_32be_byte3 | UTF-32BE,\ Run,\ Byte\ 3}"]
    run_32le_byte0[color=3 label="{run_32le_byte0 | UTF-32LE,\ Run,\ Byte\ 0}"]
    run_32le_byte1[color=3 label="{run_32le_byte1 | UTF-32LE,\ Run,\ Byte\ 1}"]
    run_32le_byte2[color=3 label="{run_32le_byte2 | UTF-32LE,\ Run,\ Byte\ 2}"]
    run_32le_byte3[color=3 label="{run_32le_byte3 | UTF-32LE,\ Run,\ Byte\ 3}"]

    start -> utf8_bom_byte1[xlabel="x=0xEF"];
    start -> utf16_be_bom_byte1[xlabel="x=0xFE"];
    start -> utf32_or_16_le_bom_byte1[xlabel="x=0xFF"];
    start -> utf32_or_16_be_bom_byte1[xlabel="x=0x00"];
    start -> run8[taillabel="otherwise"];

    subgraph cluster_be16 {
        peripheries=0;
        utf16_be_bom_byte1; run_16be_byte0; run_16be_byte1;
    }
    subgraph cluster_le {
        peripheries=0;
        utf32_or_16_le_bom_byte1; utf32_or_16_le_bom_byte2; utf32_le_bom_byte3;
        run_32le_byte0; run_32le_byte1; run_32le_byte2; run_32le_byte3;
        run_16le_byte0; run_16le_byte1;
    };
    subgraph cluster_be {
        peripheries=0;
        utf32_or_16_be_bom_byte1; utf32_be_bom_byte2; utf32_be_bom_byte3;
        run_32be_byte0; run_32be_byte1; run_32be_byte2; run_32be_byte3;
    };
    subgraph cluster_c {
        peripheries=0;
        utf8_bom_byte1;
        utf8_bom_byte2;
    };

    utf8_bom_byte1 -> utf8_bom_byte2[xlabel="x=0xBB"];
    utf8_bom_byte1 -> run8[taillabel="otherwise"];
    utf8_bom_byte2 -> run8[xlabel="x=0xBF"];
    utf8_bom_byte2 -> run8[taillabel="otherwise"];

    utf16_be_bom_byte1 -> run_16be_byte0[xlabel="x=0xFF"];
    utf16_be_bom_byte1 -> run8[taillabel="otherwise"];

    utf32_or_16_le_bom_byte1 -> utf32_or_16_le_bom_byte2[xlabel="x=0xFE"];
    utf32_or_16_le_bom_byte1 -> run8[taillabel="otherwise"];

    utf32_or_16_le_bom_byte2 -> run_16le_byte0[xlabel="x=0x00"];
    utf32_or_16_le_bom_byte2 -> utf32_le_bom_byte3[taillabel="otherwise"];

    utf32_le_bom_byte3 -> run_32le_byte0[xlabel="x=0x00"];
    utf32_le_bom_byte3 -> run8[taillabel="otherwise"];

    utf32_or_16_be_bom_byte1 -> utf32_be_bom_byte2[xlabel="x=0x00"];
    utf32_or_16_be_bom_byte1 -> run8[taillabel="otherwise"];

    utf32_be_bom_byte2 -> utf32_be_bom_byte3[xlabel="value=0xFE"];
    utf32_be_bom_byte2 -> run8[taillabel="otherwise"];
    utf32_be_bom_byte3 -> run_32be_byte0[label="value=0xFF"];
    utf32_be_bom_byte3 -> run8[taillabel="otherwise"];

    run8 -> run8;

    run_16be_byte0 -> run_16be_byte1 -> run_16be_byte0;
    run_16le_byte0 -> run_16le_byte1;
    run_16le_byte1 -> run_16le_byte0;

    run_32be_byte0 -> run_32be_byte1;
    run_32be_byte1 -> run_32be_byte2;
    run_32be_byte2 -> run_32be_byte3 -> run_32be_byte0;

    run_32le_byte0 -> run_32le_byte1;
    run_32le_byte1 -> run_32le_byte2;
    run_32le_byte2 -> run_32le_byte3 -> run_32le_byte0;
}

Public Types

using input_type = std::byte#

The type of the values consumed by this transcoder.

using output_type = ToEncoding#

The type of the code units produced by this transcoder.

Public Functions

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator operator()(input_type value, OutputIterator dest) noexcept#

Accepts a byte for decoding. Output is written to a supplied output iterator.

As output code units are generated, they are written to the output iterator dest.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:
  • value – A byte of input.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<ToEncoding> OutputIterator>
inline OutputIterator end_cp(OutputIterator dest) noexcept#

Call once the entire input sequence has been fed to operator().

This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<ToEncoding> OutputIterator>
inline constexpr iterator<transcoder, OutputIterator> end_cp(iterator<transcoder, OutputIterator> dest)#

Call once the entire input sequence has been fed to operator().

This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

constexpr bool well_formed() const noexcept#

Returns true if the input represents well formed Unicode.

Returns:

True if the input represents well formed Unicode.

constexpr bool partial() const noexcept#

Returns true if a partial code-point has been passed to operator().

Returns:

True if a partial code-point has been passed to operator() and false otherwise.

constexpr encoding selected_encoding() const noexcept#

The detected encoding of the input stream.

Returns:

The encoding of the input stream as detected by consuming an optional leading byte order mark. Initially encoding::unknown.

Private Types

enum class states : std::uint_least8_t#

The states that the finite state machine can occupy.

The value for each state is made from a collection of bits that not only uniquely identifies the state, but which can also be used to share a great deal of code between similar states. For example, the code for handling all but the final byte of a UTF-16 BE, UTF-16 LE, UTF-32 BE and UTF-32 LE code unit is shared. We can extract the encoding, endianness, and byte number from the state codes.

Bit(s) | Interpretation

4–5 | 2 bits to describe the encoding being processed: UTF-16 (encoding_utf16), UTF-32 (encoding_utf32), UTF-8 (encoding_utf8), unknown (encoding_unknown). Note that the values are combined with the value of bit 3 (the endianness) to produce an index to the first dimension of the details::boms array.

3 | 1 bit to identify the input as big (big_endian) or little (little_endian) endian.

2 | 1 bit to identify when the state machine is in “Run” or “BOM” mode. In BOM mode (bom_mode), we are in the process of identifying the input encoding. In run mode (run_mode), we are consuming and emitting code-units.

0–1 | 2 bits provide the index of the byte within the BOM that we are processing. This value corresponds to an index into the second dimension of the details::boms array.

6 | Unused. Always 0.

7 | Unused. Always 0.

There are constants which may be bit-wise ORed together to create the appropriate value for each of the FSM’s states.

Values:

enumerator start#

The FSM’s initial state.

enumerator utf8_bom_byte1#

The state if the second byte of a UTF-8 BOM was identified.

enumerator utf8_bom_byte2#

The state if the third byte of a UTF-8 BOM was identified.

enumerator utf16_be_bom_byte1#

The state if the second byte of a UTF-16 BE BOM was identified.

enumerator utf32_be_bom_byte2#

The state if the third byte of a UTF-32 BE BOM was identified.

enumerator utf32_be_bom_byte3#

The state if the fourth byte of a UTF-32 BE BOM was identified.

enumerator utf32_or_16_be_bom_byte1#

The state if the second byte of a UTF-32 BE or UTF-16 BE BOM was identified.

enumerator utf32_or_16_le_bom_byte1#

The state if the second byte of a UTF-32 LE or UTF-16 LE BOM was identified.

enumerator utf32_or_16_le_bom_byte2#

The state when the state machine is checking for the third byte of a UTF-32 LE BOM or the start of a UTF-16 LE run.

enumerator utf32_le_bom_byte3#

The state when the state machine is checking for the fourth byte of a UTF-32 LE BOM or the start of a UTF-16 LE run.

enumerator run_8#
enumerator run_16be_byte0#

The state when the state machine is handling the first byte of a UTF-16 BE code-unit.

enumerator run_16be_byte1#

The state when the state machine is handling the second and final byte of a UTF-16 BE code-unit.

enumerator run_16le_byte0#

The state when the state machine is handling the first byte of a UTF-16 LE code-unit.

enumerator run_16le_byte1#

The state when the state machine is handling the second and final byte of a UTF-16 LE code-unit.

enumerator run_32be_byte0#

The state when the state machine is handling the first byte of a UTF-32 BE code-unit.

enumerator run_32be_byte1#

The state when the state machine is handling the second byte of a UTF-32 BE code-unit.

enumerator run_32be_byte2#

The state when the state machine is handling the third byte of a UTF-32 BE code-unit.

enumerator run_32be_byte3#

The state when the state machine is handling the fourth and final byte of a UTF-32 BE code-unit.

enumerator run_32le_byte0#

The state when the state machine is handling the first byte of a UTF-32 LE code-unit.

enumerator run_32le_byte1#

The state when the state machine is handling the second byte of a UTF-32 LE code-unit.

enumerator run_32le_byte2#

The state when the state machine is handling the third byte of a UTF-32 LE code-unit.

enumerator run_32le_byte3#

The state when the state machine is handling the fourth and final byte of a UTF-32 LE code-unit.

using t8_type = transcoder<icubaby::char8, ToEncoding>#

A short name for the transcoder used when UTF-8 input has been detected.

using t16_type = transcoder<char16_t, ToEncoding>#

A short name for the transcoder used when UTF-16 input has been detected.

using t32_type = transcoder<char32_t, ToEncoding>#

A short name for the transcoder used when UTF-32 input has been detected.

Private Functions

inline constexpr bool is_run_mode() const noexcept#

Returns true if the current state is one in which the FSM is consuming and producing code-units.

Returns:

True if the current state is one in which the FSM is consuming and producing code-units.

inline constexpr bool is_little_endian() const noexcept#

Returns true if the current state is one in which the FSM is consuming little endian code units.

Returns:

True if the transcoder is consuming little-endian values, false otherwise.

inline constexpr std::uint_least8_t get_byte_no() const noexcept#
inline constexpr states next_byte() const noexcept#

Returns a state which references the next byte number.

Returns:

A state referencing the next byte number.

inline constexpr std::byte bom_value() const noexcept#
template<std::output_iterator<ToEncoding> OutputIterator>
inline OutputIterator start_state(input_type const value, OutputIterator dest) noexcept#

Handles the initial state of the FSM. Checks the initial input byte against the collection of potential byte order mark initial bytes and decides on the next action.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:
  • value – The initial input byte.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<ToEncoding> OutputIterator>
inline OutputIterator run8_start(bool const copy_buffer, OutputIterator dest) noexcept#

Switches to the run state in which the input has been determined to be UTF-8 encoded.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:
  • copy_buffer – True if the contents of buffer_ should be copied immediately to the output.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<ToEncoding> OutputIterator>
inline OutputIterator run16_start(OutputIterator dest) noexcept#

Switches to the run state in which the input has been determined to be UTF-16 encoded.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<ToEncoding> OutputIterator>
inline OutputIterator run16(input_type const value, OutputIterator dest) noexcept#

Handler for the states::run_16be_byte1 and states::run_16le_byte1 states.

This function is called once we have received the second byte of a UTF-16 code unit. We build the native-endian version of the 16-bit value and pass it to the transcoder which will be expecting UTF-16. The FSM is then reset to expect byte 0 of the next code unit.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:
  • value – The final byte of the 16-bit code unit.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<ToEncoding> OutputIterator>
inline OutputIterator run32(input_type const value, OutputIterator dest) noexcept#

Handler for the states::run_32be_byte3 and states::run_32le_byte3 states.

This function is called once we have received all four bytes of a UTF-32 code unit. We build the native-endian version of the 32-bit value and pass it to the transcoder which will be expecting UTF-32. The FSM is then reset to expect byte 0 of the next code unit.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:
  • value – The final byte of the 32-bit code unit.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

inline constexpr char16_t char16_from_big_endian_buffer(input_type const value) const noexcept#

Produces a native-endian 16-bit value from big endian encoded input by combining the first entry in the buffer_ array with value.

Parameters:

value – An input byte

Returns:

A native-endian 16 bit value.

inline constexpr char16_t char16_from_little_endian_buffer(input_type const value) const noexcept#

Produces a native-endian 16-bit value from little endian encoded input by combining the first entry in the buffer_ array with value.

Parameters:

value – An input byte

Returns:

A native-endian 16 bit value.

inline constexpr char32_t char32_from_big_endian_buffer(input_type const value) const noexcept#

Produces a native-endian 32-bit value from big endian encoded input by combining the entries in the buffer_ array with value.

Parameters:

value – An input byte

Returns:

A native-endian 32 bit value.

inline constexpr char32_t char32_from_little_endian_buffer(input_type const value) const noexcept#

Produces a native-endian 32-bit value from little endian encoded input by combining the entries in the buffer_ array with value.

Parameters:

value – An input byte

Returns:

A native-endian 32 bit value.

Private Members

states state_ = states::start#

The current state of the FSM.

std::array<std::byte, 4> buffer_ = {}#

A buffer into which input bytes are gathered as a complete code unit is being assembled by the state machine.

std::variant<std::monostate, t8_type, t16_type, t32_type> transcoder_variant_#

Holds the transcoder used to convert input code units. Holds monostate until the input encoding has been selected.

Private Static Functions

static inline constexpr std::byte byte_no(std::uint_least8_t index) noexcept#

Parameters:

index – The byte number within the BOM encoding. Must be in the range [0,3].

Returns:

A value which can be bitwise ORed to represent a particular byte within a BOM encoding.

static inline constexpr std::uint_least8_t get_byte_no(states const state) noexcept#

Extracts the byte number referenced by the argument.

Each of the valid FSM states has an embedded byte number in the range [0..4). This is the current byte of the current code unit as it is being assembled by the FSM.

Parameters:

state – A valid state machine state.

Returns:

The byte number referenced by state.

static inline constexpr states set_byte(states const state, std::uint_least8_t const byte_number) noexcept#

Returns a state which references a specific byte number.

Parameters:
  • state – A valid state machine state. The returned state will be the same as this argument but with the byte number set to byte_number.

  • byte_number – The byte to be referenced. Must be in the range [0..4).

Returns:

A state referencing the supplied byte number.

static inline constexpr states next_byte(states const state) noexcept#

Returns a state which references the next byte number.

Parameters:

state – A valid state machine state. The returned state will be the same as this argument but with the byte number incremented.

Returns:

A state referencing the next byte number.

static inline constexpr states set_run_mode(states const state) noexcept#

Adjusts a state so that run mode is selected.

Parameters:

state – A valid state machine state.

Returns:

The modified state.

static inline constexpr std::byte bom_value(std::byte const state_byte, std::uint_least8_t const byte_number) noexcept#

Private Static Attributes

static constexpr auto encoding_shift = 4U#

The number of places to left shift when constructing encoding values for the FSM state enumeration.

static constexpr auto endian_shift = 3U#

The number of places to left shift when constructing endian values for the FSM state enumeration.

static constexpr auto run_shift = 2U#

The number of places to left shift when constructing mode values for the FSM state enumeration.

static constexpr auto encoding_mask = std::byte{0b11 << encoding_shift}#

One of unknown or UTF-8/16/32.

static constexpr auto endian_mask = std::byte{1U << endian_shift}#

One of big_endian or little_endian.

static constexpr auto run_mask = std::byte{1U << run_shift}#

Run or bom mode.

static constexpr auto byte_no_mask = std::byte{0b11}#

Values from 0-3.

static constexpr auto encoding_utf16 = std::byte{0b00 << encoding_shift}#

UTF-16 BE or UTF-16 LE encoding. Bitwise-or this value to create a state representing UTF-16 (BE or LE) encoding.

static constexpr auto encoding_utf32 = std::byte{0b01 << encoding_shift}#

UTF-32 BE or UTF-32 LE encoding. Bitwise-or this value to create a state representing UTF-32 (BE or LE) encoding.

static constexpr auto encoding_utf8 = std::byte{0b10 << encoding_shift}#

UTF-8 encoding. Bitwise-or this value to create a state representing UTF-8 encoding.

static constexpr auto encoding_unknown = std::byte{0b11 << encoding_shift}#

The input encoding is not yet known.

static constexpr auto bom_mode = std::byte{0}#

Bitwise-or this value to create a state representing the FSM identifying the BOM.

static constexpr auto run_mode = run_mask#

Bitwise-or this value to create a state representing the FSM consuming runs of bytes and emitting code-units.

static constexpr auto big_endian = std::byte{0}#

Bitwise-or this value to create a state consuming big-endian values.

static constexpr auto little_endian = endian_mask#

Bitwise-or this value to create a state consuming little-endian values.

template<typename T>
struct is_nothrowable : public std::bool_constant<std::is_nothrow_constructible_v<T> && std::is_nothrow_copy_constructible_v<T> && std::is_nothrow_move_constructible_v<T> && std::is_nothrow_copy_assignable_v<T> && std::is_nothrow_move_assignable_v<T>>#

Inheritance diagram for icubaby::transcoder< std::byte, ToEncoding >::is_nothrowable:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< std::byte, ToEncoding >::is_nothrowable< T >" tooltip="icubaby::transcoder< std::byte, ToEncoding >::is_nothrowable< T >" fillcolor="#BFBFBF"]
    "2" [label="std::bool_constant< std::is_nothrow_constructible_v< T > &&std::is_nothrow_copy_constructible_v< T > &&std::is_nothrow_move_constructible_v< T > &&std::is_nothrow_copy_assignable_v< T > &&std::is_nothrow_move_assignable_v< T > >" tooltip="std::bool_constant< std::is_nothrow_constructible_v< T > &&std::is_nothrow_copy_constructible_v< T > &&std::is_nothrow_move_constructible_v< T > &&std::is_nothrow_copy_assignable_v< T > &&std::is_nothrow_move_assignable_v< T > >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
}

Collaboration diagram for icubaby::transcoder< std::byte, ToEncoding >::is_nothrowable:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< std::byte, ToEncoding >::is_nothrowable< T >" tooltip="icubaby::transcoder< std::byte, ToEncoding >::is_nothrowable< T >" fillcolor="#BFBFBF"]
    "2" [label="std::bool_constant< std::is_nothrow_constructible_v< T > &&std::is_nothrow_copy_constructible_v< T > &&std::is_nothrow_move_constructible_v< T > &&std::is_nothrow_copy_assignable_v< T > &&std::is_nothrow_move_assignable_v< T > >" tooltip="std::bool_constant< std::is_nothrow_constructible_v< T > &&std::is_nothrow_copy_constructible_v< T > &&std::is_nothrow_move_constructible_v< T > &&std::is_nothrow_copy_assignable_v< T > &&std::is_nothrow_move_assignable_v< T > >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
}

A helper for ensuring that a type will not cause transcoder_variant_ to become valueless by exception.

We must ensure that the transcoder_variant_ member cannot be in the valueless by exception state. This means that construction and assignment to the variant must never throw. This type is a helper to verify that a type cannot throw during default, copy, or move construction as well as copy or move assignment.

UTF-8 to UTF-32 Transcoder#

template<>
class transcoder<char8, char32_t>#

Takes a sequence of UTF-8 code units and converts them to UTF-32.

Public Types

using input_type = char8#

The type of the code units consumed by this transcoder.

using output_type = char32_t#

The type of the code units produced by this transcoder.

Public Functions

inline constexpr transcoder() noexcept#
inline explicit constexpr transcoder(bool well_formed) noexcept#

Initializes a transcoder instance with an initial value for its “well formed” state. This can be useful if converting a stream of data which may be using different encodings.

Parameters:

well_formed – The initial value for the transcoder’s “well formed” state.

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator operator()(input_type code_unit, OutputIterator dest)#

Accepts a code unit in the UTF-8 source encoding. As UTF-32 output code units are generated, they are written to the output iterator dest.

Template Parameters:

OutputIterator – An output iterator type to which values of output_type can be written.

Parameters:
  • code_unit – A UTF-8 code unit.

  • dest – Iterator to which the output should be written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr OutputIterator end_cp(OutputIterator dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr iterator<transcoder, OutputIterator> end_cp(iterator<transcoder, OutputIterator> dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

inline constexpr bool well_formed() const noexcept#
Returns:

True if the input represented well formed UTF-8.

inline constexpr bool partial() const noexcept#
Returns:

True if a partial code-point has been passed to operator() and false otherwise.

Private Types

Values:

enumerator accept#
enumerator reject#

Private Members

uint_least32_t code_point_#

The code point value being assembled from input code units.

uint_least32_t well_formed_#

True if the input consumed is well formed, false otherwise.

uint_least32_t pad_#

Pad bits intended to put the next value to a byte boundary.

uint_least32_t state_#

The state of the converter.

Private Static Attributes

static std::array<uint8_t, 364> const utf8d_#

The utf8d_ table consists of two parts. The first part maps bytes to character classes, the second part encodes a deterministic finite automaton using these character classes as transitions.

UTF-16 to UTF-32 Transcoder#

template<>
class transcoder<char16_t, char32_t>#

Takes a sequence of UTF-16 code units and converts them to UTF-32.

Public Types

using input_type = char16_t#

The type of the code units consumed by this transcoder.

using output_type = char32_t#

The type of the code units produced by this transcoder.

Public Functions

inline constexpr transcoder() noexcept#
inline explicit constexpr transcoder(bool well_formed) noexcept#

Initializes a transcoder instance with an initial value for its “well formed” state. This can be useful if converting a stream of data which may be using different encodings.

Parameters:

well_formed – The initial value for the transcoder’s “well formed” state.

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator operator()(input_type code_unit, OutputIterator dest) noexcept#

Accepts a code unit in the UTF-16 source encoding. As UTF-32 output code units are generated, they are written to the output iterator dest.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:
  • code_unit – A code unit in the source encoding.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator end_cp(OutputIterator dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

The output iterator.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr iterator<transcoder, OutputIterator> end_cp(iterator<transcoder, OutputIterator> dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

inline constexpr bool well_formed() const noexcept#
Returns:

True if the input represented well formed UTF-16.

inline constexpr bool partial() const noexcept#
Returns:

True if a partial code-point has been passed to operator() and false otherwise.

Private Members

uint_least16_t high_#

The previous high surrogate that was passed to operator(). Valid if has_high_ is true.

uint_least16_t has_high_#

true if the previous code unit passed to operator() was a high surrogate, false otherwise.

uint_least16_t well_formed_#

true if the code units passed to operator() represent well formed UTF-16 input, false otherwise.

Private Static Functions

static inline std::uint_least16_t adjusted_high(std::uint_least16_t code_unit) noexcept#

This function returns a high surrogate value that can be stored in the high_ field.

The high surrogate value is stored after the first_high_surrogate value has been subtracted. This reduces the number of bits that we need to remember.

Parameters:

code_unit – A UTF-16 code unit for which icubaby::is_high_surrogate() returns true.

Returns:

A high surrogate value that can be stored in the class’s high_ field.

Private Static Attributes

static constexpr auto high_bits = 10U#

The number of bits required to represent a high surrogate value.

UTF-32 to UTF-16 Transcoder#

template<>
class transcoder<char32_t, char16_t>#

Takes a sequence of UTF-32 code units and converts them to UTF-16.

Public Types

using input_type = char32_t#

The type of the code units consumed by this transcoder.

using output_type = char16_t#

The type of the code units produced by this transcoder.

Public Functions

constexpr transcoder() noexcept = default#
inline explicit constexpr transcoder(bool well_formed) noexcept#

Initializes a transcoder instance with an initial value for its “well formed” state. This can be useful if converting a stream of data which may be using different encodings.

Parameters:

well_formed – The initial value for the transcoder’s “well formed” state.

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator operator()(input_type code_unit, OutputIterator dest) noexcept#

Accepts a code unit in the UTF-32 source encoding. As UTF-16 output code units are generated, they are written to the output iterator dest.

Template Parameters:

OutputIterator – An output iterator type to which values of output_type can be written.

Parameters:
  • code_unit – A UTF-32 code unit.

  • dest – Iterator to which the output should be written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr OutputIterator end_cp(OutputIterator dest) noexcept#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

The output iterator.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr iterator<transcoder, OutputIterator> end_cp(iterator<transcoder, OutputIterator> dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

inline constexpr bool well_formed() const noexcept#
Returns:

True if the input represented well formed UTF-32.

inline constexpr bool partial() const noexcept#
Returns:

True if a partial code-point has been passed to operator() and false otherwise.

Private Members

bool well_formed_ = true#

True if the input consumed is well formed, false otherwise.

UTF-32 to UTF-8 Transcoder#

template<>
class transcoder<char32_t, char8>#

Takes a sequence of UTF-32 code units and converts them to UTF-8.

Public Types

using input_type = char32_t#

The type of the code units consumed by this transcoder.

using output_type = char8#

The type of the code units produced by this transcoder.

Public Functions

constexpr transcoder() noexcept = default#
inline explicit constexpr transcoder(bool well_formed) noexcept#

Initializes a transcoder instance with an initial value for its “well formed” state. This can be useful if converting a stream of data which may be using different encodings.

Parameters:

well_formed – The initial value for the transcoder’s “well formed” state.

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator operator()(input_type code_unit, OutputIterator dest) noexcept#

Accepts a code unit in the UTF-32 source encoding. As UTF-8 output code units are generated, they are written to the output iterator dest.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:
  • code_unit – A code unit in the source encoding.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr OutputIterator end_cp(OutputIterator dest) const#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr iterator<transcoder, OutputIterator> end_cp(iterator<transcoder, OutputIterator> dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

inline constexpr bool well_formed() const noexcept#
Returns:

True if the input represented well formed UTF-32.

inline constexpr bool partial() const noexcept#
Returns:

True if a partial code-point has been passed to operator() and false otherwise.

Private Functions

template<typename OutputIterator>
inline OutputIterator not_well_formed(OutputIterator dest)#

Writes U+FFFD REPLACEMENT CHARACTER to the output and records the input as not well formed.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

Private Members

bool well_formed_ = true#

True if the input consumed is well formed, false otherwise.

Private Static Functions

template<typename OutputIterator>
static inline OutputIterator write2(input_type code_unit, OutputIterator dest)#

Writes a two code-unit (CU) value to the output.

Parameters:
  • code_unit – The code unit to be written.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<typename OutputIterator>
static inline OutputIterator write3(input_type code_unit, OutputIterator dest)#

Writes a three code-unit (CU) value to the output.

Parameters:
  • code_unit – The code unit to be written.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<typename OutputIterator>
static inline OutputIterator write4(input_type code_unit, OutputIterator dest)#

Writes a four code-unit (CU) value to the output.

Parameters:
  • code_unit – The code unit to be written.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

UTF-32 to UTF-32 Transcoder#

template<>
class transcoder<char32_t, char32_t>#

Takes a sequence of UTF-32 code units and converts them to UTF-32.

Public Types

using input_type = char32_t#

The type of the code units consumed by this transcoder.

using output_type = char32_t#

The type of the code units produced by this transcoder.

Public Functions

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator operator()(input_type code_unit, OutputIterator dest)#

Accepts a code unit in the UTF-32 source encoding. As UTF-32 output code units are generated, they are written to the output iterator dest.

Template Parameters:

OutputIterator – An output iterator type to which values of output_type can be written.

Parameters:
  • code_unit – A UTF-32 code unit.

  • dest – Iterator to which the output should be written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr OutputIterator end_cp(OutputIterator dest) const#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

The output iterator.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr iterator<transcoder, OutputIterator> end_cp(iterator<transcoder, OutputIterator> dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

inline constexpr bool well_formed() const noexcept#
Returns:

True if the input represented well formed UTF-32.

inline constexpr bool partial() const noexcept#
Returns:

True if a partial code-point has been passed to operator() and false otherwise.

Private Members

bool well_formed_ = true#

True if the input consumed is well formed, false otherwise.

Triangulator#

template<unicode_char_type FromEncoding, unicode_char_type ToEncoding>
class triangulator#

Collaboration diagram for icubaby::details::triangulator:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::details::triangulator< FromEncoding, ToEncoding >" tooltip="icubaby::details::triangulator< FromEncoding, ToEncoding >" fillcolor="#BFBFBF"]
    "3" [label="icubaby::transcoder< char32_t, output_type >" tooltip="icubaby::transcoder< char32_t, output_type >"]
    "2" [label="icubaby::transcoder< input_type, char32_t >" tooltip="icubaby::transcoder< input_type, char32_t >"]
    "1" -> "2" [dir=forward tooltip="usage"]
    "1" -> "3" [dir=forward tooltip="usage"]
}

A “triangulator” converts from the FromEncoding encoding to the ToEncoding encoding via an intermediate UTF-32 encoding.

Template Parameters:
  • FromEncoding – The source encoding.

  • ToEncoding – The destination encoding.

Public Types

using input_type = FromEncoding#

The type of the code units consumed by this transcoder.

using output_type = ToEncoding#

The type of the code units produced by this transcoder.

Public Functions

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator operator()(input_type code_unit, OutputIterator dest)#

Accepts a code unit in the source encoding (as given by triangulator::input_type). Each code unit is first converted to UTF-32 and then to the output encoding (triangulator::output_type). As output code units are generated, they are written to the output iterator dest.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:
  • code_unit – A code unit in the source encoding.

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline OutputIterator end_cp(OutputIterator dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

template<std::output_iterator<output_type> OutputIterator>
inline constexpr iterator<transcoder<FromEncoding, ToEncoding>, OutputIterator> end_cp(iterator<transcoder<FromEncoding, ToEncoding>, OutputIterator> dest)#

Call once the entire input sequence has been fed to operator(). This function ensures that the sequence did not end with a partial code point.

Template Parameters:

OutputIterator – An output iterator type to which values of type output_type can be written.

Parameters:

dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

inline constexpr bool well_formed() const noexcept#
Returns:

True if the input passed to operator() was valid.

inline constexpr bool partial() const noexcept#
Returns:

True if a partial code-point has been passed to operator() and false otherwise.

Private Functions

template<typename InputIterator, typename OutputIterator>
inline OutputIterator copy(InputIterator first, InputIterator last, OutputIterator dest)#

Copies the range [first, last) to the output iterator dest via the output_ transcoder.

Parameters:
  • first – The start of the range to copy.

  • last – The end of the range to copy (one past the final element).

  • dest – An output iterator to which the output sequence is written.

Returns:

Iterator one past the last element assigned.

Private Members

transcoder<input_type, char32_t> intermediate_#

We use the intermediate_ transcoder to convert from the input encoding to UTF-32.

transcoder<char32_t, output_type> output_#

The output_ transcoder converts from the intermediate (UTF-32) encoding to the selected output encoding.

UTF-8 to UTF-16 Transcoder#

template<>
class transcoder<char8, char16_t> : public icubaby::details::triangulator<char8, char16_t>#

Inheritance diagram for icubaby::transcoder< char8, char16_t >:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< char8, char16_t >" tooltip="icubaby::transcoder< char8, char16_t >" fillcolor="#BFBFBF"]
    "2" [label="icubaby::details::triangulator< char8, char16_t >" tooltip="icubaby::details::triangulator< char8, char16_t >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
}

Collaboration diagram for icubaby::transcoder< char8, char16_t >:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< char8, char16_t >" tooltip="icubaby::transcoder< char8, char16_t >" fillcolor="#BFBFBF"]
    "4" [label="icubaby::transcoder< char32_t, output_type >" tooltip="icubaby::transcoder< char32_t, output_type >"]
    "3" [label="icubaby::transcoder< input_type, char32_t >" tooltip="icubaby::transcoder< input_type, char32_t >"]
    "2" [label="icubaby::details::triangulator< char8, char16_t >" tooltip="icubaby::details::triangulator< char8, char16_t >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
    "2" -> "3" [dir=forward tooltip="usage"]
    "2" -> "4" [dir=forward tooltip="usage"]
}

Takes a sequence of UTF-8 code units and converts them to UTF-16.

UTF-16 to UTF-8 Transcoder#

template<>
class transcoder<char16_t, char8> : public icubaby::details::triangulator<char16_t, char8>#

Inheritance diagram for icubaby::transcoder< char16_t, char8 >:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< char16_t, char8 >" tooltip="icubaby::transcoder< char16_t, char8 >" fillcolor="#BFBFBF"]
    "2" [label="icubaby::details::triangulator< char16_t, char8 >" tooltip="icubaby::details::triangulator< char16_t, char8 >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
}

Collaboration diagram for icubaby::transcoder< char16_t, char8 >:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< char16_t, char8 >" tooltip="icubaby::transcoder< char16_t, char8 >" fillcolor="#BFBFBF"]
    "4" [label="icubaby::transcoder< char32_t, output_type >" tooltip="icubaby::transcoder< char32_t, output_type >"]
    "3" [label="icubaby::transcoder< input_type, char32_t >" tooltip="icubaby::transcoder< input_type, char32_t >"]
    "2" [label="icubaby::details::triangulator< char16_t, char8 >" tooltip="icubaby::details::triangulator< char16_t, char8 >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
    "2" -> "3" [dir=forward tooltip="usage"]
    "2" -> "4" [dir=forward tooltip="usage"]
}

Takes a sequence of UTF-16 code units and converts them to UTF-8.

UTF-8 to UTF-8 Transcoder#

template<>
class transcoder<char8, char8> : public icubaby::details::triangulator<char8, char8>#

Inheritance diagram for icubaby::transcoder< char8, char8 >:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< char8, char8 >" tooltip="icubaby::transcoder< char8, char8 >" fillcolor="#BFBFBF"]
    "2" [label="icubaby::details::triangulator< char8, char8 >" tooltip="icubaby::details::triangulator< char8, char8 >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
}

Collaboration diagram for icubaby::transcoder< char8, char8 >:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< char8, char8 >" tooltip="icubaby::transcoder< char8, char8 >" fillcolor="#BFBFBF"]
    "4" [label="icubaby::transcoder< char32_t, output_type >" tooltip="icubaby::transcoder< char32_t, output_type >"]
    "3" [label="icubaby::transcoder< input_type, char32_t >" tooltip="icubaby::transcoder< input_type, char32_t >"]
    "2" [label="icubaby::details::triangulator< char8, char8 >" tooltip="icubaby::details::triangulator< char8, char8 >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
    "2" -> "3" [dir=forward tooltip="usage"]
    "2" -> "4" [dir=forward tooltip="usage"]
}

Takes a sequence of UTF-8 code units and converts them to UTF-8.

UTF-16 to UTF-16 Transcoder#

template<>
class transcoder<char16_t, char16_t> : public icubaby::details::triangulator<char16_t, char16_t>#

Inheritance diagram for icubaby::transcoder< char16_t, char16_t >:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< char16_t, char16_t >" tooltip="icubaby::transcoder< char16_t, char16_t >" fillcolor="#BFBFBF"]
    "2" [label="icubaby::details::triangulator< char16_t, char16_t >" tooltip="icubaby::details::triangulator< char16_t, char16_t >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
}

Collaboration diagram for icubaby::transcoder< char16_t, char16_t >:

digraph {
    graph [bgcolor="#00000000"]
    node [shape=rectangle style=filled fillcolor="#FFFFFF" font=Helvetica padding=2]
    edge [color="#1414CE"]
    "1" [label="icubaby::transcoder< char16_t, char16_t >" tooltip="icubaby::transcoder< char16_t, char16_t >" fillcolor="#BFBFBF"]
    "4" [label="icubaby::transcoder< char32_t, output_type >" tooltip="icubaby::transcoder< char32_t, output_type >"]
    "3" [label="icubaby::transcoder< input_type, char32_t >" tooltip="icubaby::transcoder< input_type, char32_t >"]
    "2" [label="icubaby::details::triangulator< char16_t, char16_t >" tooltip="icubaby::details::triangulator< char16_t, char16_t >"]
    "1" -> "2" [dir=forward tooltip="public-inheritance"]
    "2" -> "3" [dir=forward tooltip="usage"]
    "2" -> "4" [dir=forward tooltip="usage"]
}

Takes a sequence of UTF-16 code units and converts them to UTF-16.