//----------------------------------------------------------------------------
//
// TSDuck - The MPEG Transport Stream Toolkit
// Copyright (c) 2005-2026, Thierry Lelegard
// BSD-2-Clause license, see LICENSE.txt file or https://tsduck.io/license
//
//----------------------------------------------------------------------------
//!
//!  @file
//!  Declaration of class ARIBCharset.
//!
//----------------------------------------------------------------------------

#pragma once
#include "tsCharset.h"

namespace ts {
    //!
    //! Definition of the ARIB STD-B24 character set (ISDB Japan).
    //!
    //! Limitations:
    //! - The decoding operation produces a Unicode string (UString).
    //!   All non-Unicode concepts in ARIB STD-B24 are dropped (colors,
    //!   placement, Mosaic and DRCS character sets).
    //! - User-defined macros are ignored, only predefined macros are
    //!   decoded.
    //! - The encoding operation is character-based and not optimized.
    //!   The resulting data could be shorter with a prior global analysis
    //!   of the input string.
    //!
    //! @see ARIB STD-B24, chapter 7
    //! @see ARIB STD-B62, fascicle 1, part 2, chapter 5
    //! @see ISO/IEC 2022
    //! @see https://en.wikipedia.org/wiki/ARIB_STD_B24_character_set
    //! @see https://en.wikipedia.org/wiki/ISO/IEC_2022
    //! @ingroup libtsduck mpeg
    //!
    class TSDUCKDLL ARIBCharset: public Charset
    {
        TS_NOCOPY(ARIBCharset);
    public:
        //!
        //! Only one predefined character set for ARIB STD-B24.
        //! The constructor of this class is private: this static object is
        //! the way to access the ARIB STD-B24 character set.
        //!
        static const ARIBCharset B24;

        // Inherited methods.
        // These three declarations override virtual methods of the superclass Charset,
        // see class Charset for their contract.
        virtual bool decode(UString& str, const uint8_t* data, size_t size) const override;
        virtual bool canEncode(const UString& str, size_t start = 0, size_t count = NPOS) const override;
        virtual size_t encode(uint8_t*& buffer, size_t& size, const UString& str, size_t start = 0, size_t count = NPOS) const override;

    private:
        // Private constructor since only one instance is available.
        // NOTE(review): 'names' is presumably the list of names under which the
        // character set is known - confirm in the implementation file.
        ARIBCharset(std::initializer_list<const UChar*> names);

        // The decoding tables are manually crafted from the ARIB STD-B24 standard.
        // The encoding tables are generated by a tool named aribb24 (see src/utils/aribb24.cpp).
        // Give it access to the decoding tables.
        friend class ARIBCharsetCodeGenerator;

        // Control codes. See ARIB STD-B24, part 2, table 7-14.

        // Codes in range 0x00-0x1F (C0 area).
        static constexpr uint8_t NUL   = 0x00;
        static constexpr uint8_t BEL   = 0x07;
        static constexpr uint8_t APB   = 0x08;
        static constexpr uint8_t APF   = 0x09;
        static constexpr uint8_t APD   = 0x0A;
        static constexpr uint8_t APU   = 0x0B;
        static constexpr uint8_t CS    = 0x0C;
        static constexpr uint8_t APR   = 0x0D;
        static constexpr uint8_t LS1   = 0x0E;
        static constexpr uint8_t LS0   = 0x0F;
        static constexpr uint8_t PAPF  = 0x16;
        static constexpr uint8_t CAN   = 0x18;
        static constexpr uint8_t SS2   = 0x19;
        static constexpr uint8_t ESC   = 0x1B;
        static constexpr uint8_t APS   = 0x1C;
        static constexpr uint8_t SS3   = 0x1D;
        static constexpr uint8_t RS    = 0x1E;
        static constexpr uint8_t US    = 0x1F;

        // The two codes which surround the GL range (0x21-0x7E).
        static constexpr uint8_t SP    = 0x20;
        static constexpr uint8_t DEL   = 0x7F;

        // Codes in range 0x80-0x9F (C1 area).
        static constexpr uint8_t BKF   = 0x80;
        static constexpr uint8_t RDF   = 0x81;
        static constexpr uint8_t GRF   = 0x82;
        static constexpr uint8_t YLF   = 0x83;
        static constexpr uint8_t BLF   = 0x84;
        static constexpr uint8_t MGF   = 0x85;
        static constexpr uint8_t CNF   = 0x86;
        static constexpr uint8_t WHF   = 0x87;
        static constexpr uint8_t SSZ   = 0x88;
        static constexpr uint8_t MSZ   = 0x89;
        static constexpr uint8_t NSZ   = 0x8A;
        static constexpr uint8_t SZX   = 0x8B;
        static constexpr uint8_t COL   = 0x90;
        static constexpr uint8_t FLC   = 0x91;
        static constexpr uint8_t CDC   = 0x92;
        static constexpr uint8_t POL   = 0x93;
        static constexpr uint8_t WMM   = 0x94;
        static constexpr uint8_t MACRO = 0x95;
        static constexpr uint8_t HLC   = 0x97;
        static constexpr uint8_t RPC   = 0x98;
        static constexpr uint8_t SPL   = 0x99;
        static constexpr uint8_t STL   = 0x9A;
        static constexpr uint8_t CSI   = 0x9B;
        static constexpr uint8_t TIME  = 0x9D;

        // Characters are grouped in rows of 94 characters which are mapped in
        // ranges 0x21-0x7E (GL) or 0xA1-0xFE (GR). We store Unicode code points
        // as 32-bit values because a small portion of the mapped character sets
        // uses 17 bits. When stored in UString, they will use surrogate pairs.

        static constexpr uint8_t GL_FIRST = 0x21;
        static constexpr uint8_t GL_LAST  = 0x7E;
        static constexpr uint8_t GR_FIRST = 0xA1;
        static constexpr uint8_t GR_LAST  = 0xFE;
        static constexpr size_t  CHAR_ROW_SIZE = 94;

        // A character row is indexed by the offset of a byte inside the GL or GR range.
        // Enforce at compile time that both ranges contain exactly one row of characters.
        static_assert(static_cast<size_t>(GL_LAST - GL_FIRST) + 1 == CHAR_ROW_SIZE, "GL range must span exactly one character row");
        static_assert(static_cast<size_t>(GR_LAST - GR_FIRST) + 1 == CHAR_ROW_SIZE, "GR range must span exactly one character row");

        // One row of characters: the Unicode code points of 94 consecutive encoded values.
        using CharRow = char32_t[CHAR_ROW_SIZE];

        // Several contiguous rows are described in a structure.
        // This is a plain aggregate, without constructor: the order of the
        // fields must not change without updating the initializers of the tables.
        struct CharRows
        {
            size_t         first;  // First row (starting at 0).
            size_t         count;  // Number of rows. Each row is a CharRow, i.e. 94 code points (not 94 bytes).
            const CharRow* rows;   // Address of first (or only) character row.
        };

        // Max number of CharRows in a character map.
        // This is the fixed size of the array CharMap::rows.
        static constexpr size_t MAX_ROWS = 4;

        // Description of a character mapping.
        // See ARIB STD-B24, part 2, chapter 7, table 7-3, for the list of selector bytes.
        // This is a plain aggregate, without constructor: the order of the
        // fields must not change without updating the initializers of the maps.
        // NOTE(review): a map may describe less than MAX_ROWS groups of rows; how unused
        // entries of 'rows' are marked is not visible here - confirm in the map definitions.
        struct CharMap
        {
            bool     byte2;           // True: 2-byte mapping, false: 1-byte mapping.
            bool     macro;           // True: this is the macro character set, not a table-based one.
            uint8_t  selector1;       // Selector byte (escape sequence final F). Preferred one for encoding.
            uint8_t  selector2;       // Alternate selector byte (escape sequence final F).
            CharRows rows[MAX_ROWS];  // A list of contiguous rows.
        };

        // All supported character maps are placed in a static array.
        // This table is used by the ARIB STD-B24 decoder.
        // When a Unicode point is shared by several character sets
        // (base rows of Kanji map for instance), the first one is used.
        // The last address in the array is a null pointer.
        // Note: the macro map is not table-based and not stored here.
        // The array is declared without size: iterate until the null pointer.
        static const CharMap* const ALL_MAPS[];

        // Definition of known character maps.
        // They are only declared here, their content is defined out of line.
        static const CharMap UNSUPPORTED_1BYTE;  // empty map for unsupported 1-byte character sets
        static const CharMap UNSUPPORTED_2BYTE;  // empty map for unsupported 2-byte character sets
        static const CharMap MACRO_MAP;          // dummy map for the macro character set
        static const CharMap ALPHANUMERIC_MAP;
        static const CharMap HIRAGANA_MAP;
        static const CharMap KATAKANA_MAP;
        static const CharMap JIS_X0201_KATAKANA_MAP;
        static const CharMap KANJI_STANDARD_MAP;
        static const CharMap KANJI_ADDITIONAL_MAP;

        // Tables of code points, one CharRow (94 code points) per row.
        // The first four tables are made of one single row, the Kanji tables contain several rows.
        // NOTE(review): presumably these rows are the ones referenced by CharRows::rows
        // in the maps above - confirm in the map definitions.
        static const CharRow ALPHANUMERIC_ROW;
        static const CharRow HIRAGANA_ROW;
        static const CharRow KATAKANA_ROW;
        static const CharRow JIS_X0201_KATAKANA_ROW;
        static const CharRow KANJI_BASE_ROWS[86];
        static const CharRow KANJI_STANDARD_ROWS[5];
        static const CharRow KANJI_ADDITIONAL_ROWS[5];

        // Predefined macro sequences.
        // NOTE(review): PREDEF_MACRO_BASE is presumably the character code of the first
        // predefined macro, the following ones using consecutive codes - confirm in the decoder.
        static constexpr uint8_t PREDEF_MACRO_BASE = 0x60;
        static constexpr size_t PREDEF_MACRO_COUNT = 16;
        // Content of one predefined macro, stored in a fixed-size buffer.
        // Only the first 'size' bytes of 'content' are meaningful.
        struct PredefMacro
        {
            size_t  size;         // Number of bytes in content.
            uint8_t content[19];  // Macro content (19 is the size of longest predefined macro).
        };
        // Table of all predefined macros, PREDEF_MACRO_COUNT entries, defined out of line.
        static const PredefMacro PREDEF_MACROS[PREDEF_MACRO_COUNT];

        // Definition of an entry in the encoding table.
        // This table is used by the ARIB STD-B24 encoder.
        // There is one entry per slice of contiguous code points.
        // The 32-bit encoded entry contains 5 fields, from most to least significant bits:
        // - 1 bit: 2-byte charset (bool)
        // - 7 bits: escape sequence final F for the charset
        // - 8 bits: row number (0x21-0x7F) for 2-byte charset
        // - 8 bits: character encoding (0x21-0x7F)
        // - 8 bits: number of characters in the slice (1-94)
        // The accessors for the row number and the character encoding
        // return the 7 low-order bits of their respective 8-bit field.
        struct EncoderEntry {
            char32_t code_point;  // Base code point of a slice.
            uint32_t entry;       // 32-bit encoded entry.

            // True when the slice belongs to a 2-byte character set (bit 31 of the entry).
            constexpr bool byte2() const { return (entry & 0x80000000) != 0; }

            // Escape sequence final F for the character set of the slice.
            constexpr uint8_t selectorF() const { return static_cast<uint8_t>((entry >> 24) & 0x7F); }

            // Row number of the slice (meaningful for 2-byte character sets).
            constexpr uint8_t row() const { return static_cast<uint8_t>((entry >> 16) & 0x7F); }

            // Encoding of the first character of the slice, the one for code_point.
            constexpr uint8_t index() const { return static_cast<uint8_t>((entry >> 8) & 0x7F); }

            // Number of contiguous code points in the slice.
            constexpr uint32_t count() const { return entry & 0x000000FF; }

            // Check if a code point is part of the slice [code_point, code_point + count()).
            // The test is made on the offset from the base: it is evaluated only when
            // cp >= code_point and, unlike "code_point + count()", it cannot wrap around
            // when the base is close to the maximum 32-bit value.
            constexpr bool contains(char32_t cp) const { return cp >= code_point && cp - code_point < count(); }
        };

        // Encoding table.
        // The table is declared without size and defined out of line.
        // NOTE(review): ENCODING_COUNT is presumably the number of entries in
        // ENCODING_TABLE - confirm in the generated definition.
        static const size_t ENCODING_COUNT;
        static const EncoderEntry ENCODING_TABLE[];

        // Find the encoding entry for a Unicode point.
        // Use a hint for where to start the search (typically from a previous search).
        // Return NPOS when none is found.
        // NOTE(review): the returned value and the hint are presumably indexes
        // in ENCODING_TABLE - confirm in the implementation.
        static size_t FindEncoderEntry(char32_t code_point, size_t hint = NPOS);

        // An internal decoder class. Using ARIB STD-B24 notation.
        // An instance is a one-shot object: all the work is done by the constructor
        // and the only thing which can be done afterwards is getting the status.
        class Decoder
        {
            TS_NOBUILD_NOCOPY(Decoder);
        public:
            // The decoding is done in the constructor.
            // The decoded characters are appended in str.
            // The encoded input is the memory area of 'size' bytes at address 'data'.
            Decoder(UString& str, const uint8_t* data, size_t size);

            // Get the decoding status from the constructor.
            bool success() const { return _success; }

        private:
            bool           _success;  // Decoding status, as returned by success().
            UString&       _str;      // Where to append the decoded characters (reference to caller's string).
            const uint8_t* _data;     // Encoded data; presumably the next byte to decode - confirm in implementation.
            size_t         _size;     // Encoded data size; presumably the remaining bytes at _data - confirm in implementation.
            const CharMap* _G[4];     // G0-G3 character sets.
            uint8_t        _GL;       // 0-3 index in _G[], current left character set
            uint8_t        _GR;       // 0-3 index in _G[], current right character set
            uint8_t        _lockedGL; // 0-3 index in _G[], locked left character set

            // Nested decoding for macros: use the current mappings of another decoder.
            // The memory area to decode is the content of the macro.
            Decoder(const Decoder& other, const uint8_t* data, size_t size);

            // Decode all characters in specified memory area.
            void decodeAll(const uint8_t* data, size_t size);

            // Decode one character and append to str. Update data and size.
            // The character is interpreted using the character map 'gset'.
            bool decodeOneChar(const CharMap* gset);

            // Process an escape sequence starting at current byte (after ESC).
            bool escape();

            // Process a character in C0 or C1 areas.
            bool processControl();

            // Check if next character matches c. If yes, update data and size.
            bool match(uint8_t c);

            // Get a character set from an ESC sequence "final byte" F.
            // NOTE(review): gset_not_drcs presumably selects between a G set and a DRCS
            // interpretation of the final byte - confirm in the implementation.
            const CharMap* finalToCharMap(uint8_t f, bool gset_not_drcs) const;
        };

        // An internal encoder class.
        // An instance is a one-shot object: all the work is done by the constructor.
        class Encoder
        {
            TS_NOBUILD_NOCOPY(Encoder);
        public:
            // The encoding is done in the constructor.
            // All parameters are passed by reference.
            // NOTE(review): they are presumably updated to point after the produced
            // output and the consumed input - confirm in the implementation.
            Encoder(uint8_t*& out, size_t& out_size, const UChar*& in, size_t&in_count);

        private:
            uint8_t  _G[4];       // G0-G3 escape sequence final selector F for the character set.
            bool     _byte2[4];   // G0-G3 is 2-byte encoding (vs. 1-byte).
            uint8_t  _GL;         // 0-3 index in _G[], current left character set
            uint8_t  _GR;         // 0-3 index in _G[], current right character set
            bool     _GL_last;    // true if GL was used last (ie. not GR)
            uint16_t _Gn_history; // 4 nibbles with values 0,1,2,3, MSB=oldest, LSB=last-used

            // Check if Gn (n=0-3) is alphanumeric.
            // The parameter 'index' is the value of n, an index in _G[].
            bool isAlphaNumeric(uint8_t index) const;

            // Encode a space, alphanumeric or ideographic.
            // Return false if there is not enough room in the output buffer.
            bool encodeSpace(uint8_t*& out, size_t& out_size, bool ideographic);

            // Switch to a given character set (from selector F).
            // If a switch needs to be made, insert the switch sequence in the
            // output buffer and make sure there is room for at least one character.
            // Return false if there is not enough room in the output buffer.
            bool selectCharSet(uint8_t*& out, size_t& out_size, uint8_t selectorF, bool byte2);

            // Select GL/GR from G0-3 for a given selector F. Return escape sequence size.
            // Escape sequence buffer must be at least 2 characters long.
            // The escape sequence is returned in the caller-provided buffer 'seq'.
            size_t selectGLR(uint8_t* seq, uint8_t F);

            // Set G0-3 to a given selector F. Return escape sequence size.
            // Escape sequence buffer must be at least 5 characters long.
            // The escape sequence is returned in the caller-provided buffer 'seq'.
            size_t selectG0123(uint8_t* seq, uint8_t F, bool byte2);
        };
    };
}
