Cosmos/source/Cosmos.System2/Encoding/CosmosUTF8Encoding.cs

#define COSMOSDEBUG
using Cosmos.System;
using System;
using System.Collections.Generic;
using System.Text;

namespace Cosmos.System2.Encoding
{
    /// <summary>
    /// UTF-8 implementation of <see cref="CosmosEncoding"/>. Converts between UTF-16
    /// <c>char</c> arrays and UTF-8 byte arrays using the table-driven routines defined
    /// in this class.
    /// </summary>
    public class CosmosUTF8Encoding : CosmosEncoding
    {
        private const uint UNI_REPLACEMENT_CHAR = 0x0000FFFD;
        private const uint UNI_SUR_HIGH_START = 0xD800;
        private const uint UNI_SUR_HIGH_END = 0xDBFF;
        private const uint UNI_SUR_LOW_START = 0xDC00;
        private const uint UNI_SUR_LOW_END = 0xDFFF;
        private const uint UNI_MAX_BMP = 0x0000FFFF;
        private const uint UNI_MAX_UTF16 = 0x0010FFFF;
        private const int  halfShift = 10;
        private const int  halfBase = 0x0010000;
        private const uint halfMask = 0x3FF;

        /*
         * Index into the table below with the first byte of a UTF-8 sequence to
         * get the number of trailing bytes that are supposed to follow it.
         * Note that *legal* UTF-8 sequences cannot have 4 or 5 trailing bytes. The
         * table is left as-is so the decoder knows how many bytes such an (illegal)
         * sequence occupies and can skip it as a whole.
         */
        private static readonly int[] trailingBytesForUTF8 = new int[] {
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
                2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
         };

        /*
         * Magic values subtracted from the accumulated value during UTF-8 decoding.
         * Entry N removes the lead-byte and continuation-byte marker bits of a
         * sequence with N trailing bytes. The table contains as many values as
         * there might be trailing bytes in a UTF-8 sequence.
         */
        private static readonly uint[] offsetsFromUTF8 = new uint[] { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080,
                                                                      0x82082080 };

        /// <summary>
        /// Encodes one UTF-32 code point as UTF-8 into <paramref name="bytes"/> at offset
        /// <paramref name="byteIndex"/> + <paramref name="bytePos"/>.
        /// </summary>
        /// <param name="ch">Code point to encode. Values of 0x110000 and above are encoded as U+FFFD.</param>
        /// <param name="bytes">Destination buffer.</param>
        /// <param name="byteIndex">Start of the output region inside <paramref name="bytes"/>.</param>
        /// <param name="bytePos">Number of bytes already written to the output region.</param>
        /// <returns>
        /// The number of bytes written (1 to 4), or 0 when the character is filtered out
        /// (U+FEFF, U+FFFE and U+FFFF are skipped).
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="bytes"/> has not enough room left.</exception>
        private static int GetCharBytes(uint ch, byte[] bytes, int byteIndex, int bytePos)
        {
            // Filter out byte order marks and invalid character 0xFFFF.
            // Nothing is written, so the caller's running offset must not advance:
            // report 0 bytes (returning bytePos here would double the caller's offset).
            if ((ch == 0xFEFF) || (ch == 0xFFFE) || (ch == 0xFFFF))
            {
                return 0;
            }

            /* Invalid Unicode value: encode it as UNI_REPLACEMENT_CHAR */
            if (ch >= 0x110000)
            {
                ch = UNI_REPLACEMENT_CHAR;
            }

            /* Figure out how many bytes the result will require */
            int bytesToWrite;
            if (ch < 0x80) /* 0XXX XXXX one byte */
            {
                bytesToWrite = 1;
            }
            else if (ch < 0x800) /* 110X XXXX two bytes */
            {
                bytesToWrite = 2;
            }
            else if (ch < 0x10000) /* 1110 XXXX three bytes */
            {
                bytesToWrite = 3;
            }
            else /* 1111 0XXX four bytes */
            {
                bytesToWrite = 4;
            }

            /* Check that bytes has sufficient space before writing to it */
            int target = byteIndex + bytePos;
            if (bytes.Length - target < bytesToWrite)
            {
                throw new ArgumentException("bytes has no sufficient space");
            }

            switch (bytesToWrite)
            {
                case 1:
                    bytes[target + 0] = (byte)ch;
                    break;

                case 2:
                    bytes[target + 0] = (byte)(0xC0 | (ch >> 6));
                    bytes[target + 1] = (byte)(0x80 | (ch & 0x3F));
                    break;

                case 3:
                    bytes[target + 0] = (byte)(0xE0 | (ch >> 12));
                    bytes[target + 1] = (byte)(0x80 | ((ch >> 6) & 0x3F));
                    bytes[target + 2] = (byte)(0x80 | (ch & 0x3F));
                    break;

                case 4:
                    bytes[target + 0] = (byte)(0xF0 | (ch >> 18));
                    bytes[target + 1] = (byte)(0x80 | ((ch >> 12) & 0x3F));
                    bytes[target + 2] = (byte)(0x80 | ((ch >> 6) & 0x3F));
                    bytes[target + 3] = (byte)(0x80 | (ch & 0x3F));
                    break;
            }

            return bytesToWrite;
        }

        /// <summary>
        /// Combines a UTF-16 high surrogate and the code unit following it into a UTF-32 code point.
        /// </summary>
        /// <param name="SurrFirst">High surrogate (the caller has already checked its range).</param>
        /// <param name="SurrSecond">Code unit that follows the high surrogate.</param>
        /// <returns>The combined code point (0x10000 and above).</returns>
        /// <exception cref="ArgumentException"><paramref name="SurrSecond"/> is not a low surrogate.</exception>
        private static uint HandleSurrogatePairs(uint SurrFirst, uint SurrSecond)
        {
            if (SurrSecond < UNI_SUR_LOW_START || SurrSecond > UNI_SUR_LOW_END)
            {
                /* it's an unpaired high surrogate */
                throw new ArgumentException("Source contains unpaired surrogate");
            }

            return ((SurrFirst - UNI_SUR_HIGH_START) << halfShift)
                  + (SurrSecond - UNI_SUR_LOW_START) + halfBase;
        }

        /// <summary>
        /// Encodes <paramref name="charCount"/> characters of <paramref name="chars"/>, starting at
        /// <paramref name="charIndex"/>, as UTF-8 into <paramref name="bytes"/> starting at
        /// <paramref name="byteIndex"/>.
        /// </summary>
        /// <param name="chars">Source characters. When null, nothing is encoded and 0 is returned.</param>
        /// <param name="charIndex">Index of the first character to encode.</param>
        /// <param name="charCount">Number of characters to encode.</param>
        /// <param name="bytes">Destination buffer.</param>
        /// <param name="byteIndex">Index in <paramref name="bytes"/> at which writing starts.</param>
        /// <returns>The number of bytes written.</returns>
        /// <exception cref="ArgumentException">
        /// The range contains a high surrogate that is not followed (inside the range) by a low
        /// surrogate, or <paramref name="bytes"/> is too small.
        /// </exception>
        public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
        {
            if (chars == null)
            {
                Global.mFileSystemDebugger.SendInternal($"chars is null returning 0");
                return 0;
            }

            if (charIndex == 0 && charCount == 0)
            {
                Global.mFileSystemDebugger.SendInternal($"charIndex and charCount both 0 returning 0");
                return 0;
            }

            int bytePos = 0;

            // charCount is a length, not an end index: the range is [charIndex, charIndex + charCount).
            int charEnd = charIndex + charCount;

            for (int i = charIndex; i < charEnd; i++)
            {
                uint ch = chars[i];

                /* If we have a surrogate pair, convert to UTF32 first. */
                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
                {
                    /* Is the second half of the pair inside the requested range? */
                    if (i + 1 >= charEnd)
                    {
                        throw new ArgumentException("Source contains unpaired surrogate");
                    }

                    i++;
                    ch = HandleSurrogatePairs(ch, chars[i]);
                }

                bytePos += GetCharBytes(ch, bytes, byteIndex, bytePos);
            }

            return bytePos;
        }

        /// <summary>
        /// Returns an upper bound for the size of an encoded buffer: 4 times the argument.
        /// </summary>
        /// <param name="ByteCount">
        /// Count to scale. NOTE(review): despite its name this is presumably the number of
        /// characters to encode - confirm against the base class declaration.
        /// </param>
        /// <returns><paramref name="ByteCount"/> multiplied by 4.</returns>
        /* Some UTF-8 "characters" can occupy 4 bytes */
        public override int GetMaxByteCount(int ByteCount) => 4 * ByteCount;

        /// <summary>
        /// Decodes the UTF-8 sequence that starts at <paramref name="bytePos"/>.
        /// </summary>
        /// <param name="bytes">Source buffer.</param>
        /// <param name="bytesConsumed">Number of source bytes used by this call (at least 1).</param>
        /// <param name="bytePos">Index of the lead byte of the sequence.</param>
        /// <param name="bytesAvailable">
        /// Number of bytes, starting at <paramref name="bytePos"/>, that may be read. Must be at least 1.
        /// </param>
        /// <returns>
        /// For code points up to 0xFFFF, the code point itself. For code points above 0xFFFF, the
        /// UTF-16 surrogate pair packed as <c>(high &lt;&lt; 16) | low</c>, which is always greater
        /// than 0xFFFF. U+FFFD is returned for truncated sequences, for 5/6-byte sequences, for
        /// encoded surrogates and for values above 0x10FFFF.
        /// </returns>
        private static uint GetCharFromUFT8(byte[] bytes, out int bytesConsumed, int bytePos, int bytesAvailable)
        {
            int trailingBytes = trailingBytesForUTF8[bytes[bytePos]];
            int sequenceLength = trailingBytes + 1;

            /* Truncated sequence: never read past the bytes the caller gave us */
            if (sequenceLength > bytesAvailable)
            {
                bytesConsumed = bytesAvailable;
                return UNI_REPLACEMENT_CHAR;
            }

            bytesConsumed = sequenceLength;

            /* 5 and 6 byte sequences are not legal UTF-8: skip them as a whole */
            if (trailingBytes > 3)
            {
                return UNI_REPLACEMENT_CHAR;
            }

            /*
             * Accumulate the raw bytes, 6 bits apart, then remove all the marker bits
             * at once by subtracting the magic offset for this sequence length.
             */
            uint ch = 0;
            for (int i = 0; i < sequenceLength; i++)
            {
                if (i != 0)
                {
                    ch <<= 6;
                }

                ch += bytes[bytePos + i];
            }
            ch -= offsetsFromUTF8[trailingBytes];

            /* Target is a character <= 0xFFFF */
            if (ch <= UNI_MAX_BMP)
            {
                /* Surrogate values must not appear in UTF-8 */
                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
                {
                    return UNI_REPLACEMENT_CHAR;
                }

                /* normal case */
                return ch;
            }

            if (ch > UNI_MAX_UTF16)
            {
                return UNI_REPLACEMENT_CHAR;
            }

            /*
             * Surrogate pair. We pack the two halves of the pair in one uint and the caller
             * unpacks them; the alternative was to return an array of characters that would
             * only really be needed in this case.
             */
            ch -= halfBase;
            ushort hi = (ushort)((ch >> halfShift) + UNI_SUR_HIGH_START);
            ushort lo = (ushort)((ch & halfMask) + UNI_SUR_LOW_START);

            return ((uint)hi << 16) | (uint)lo;
        }

        /// <summary>
        /// Decodes <paramref name="byteCount"/> UTF-8 bytes of <paramref name="bytes"/>, starting at
        /// <paramref name="byteIndex"/>, into <paramref name="chars"/> starting at
        /// <paramref name="charIndex"/>.
        /// </summary>
        /// <param name="bytes">Source UTF-8 bytes.</param>
        /// <param name="byteIndex">Index of the first byte to decode.</param>
        /// <param name="byteCount">Number of bytes to decode.</param>
        /// <param name="chars">Destination buffer.</param>
        /// <param name="charIndex">Index in <paramref name="chars"/> at which writing starts.</param>
        /// <returns>The number of UTF-16 chars written (a surrogate pair counts as 2).</returns>
        /// <exception cref="ArgumentException"><paramref name="chars"/> is too small.</exception>
        public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
        {
            int numChar = 0;
            int bytePos = byteIndex;
            int bytesLeft = byteCount;

            while (bytesLeft > 0)
            {
                int bytesConsumed;
                uint ch = GetCharFromUFT8(bytes, out bytesConsumed, bytePos, bytesLeft);

                /*
                 * Anything up to 0xFFFF is a single char (this includes U+E000..U+FFFF and the
                 * replacement character); only larger values are packed surrogate pairs.
                 */
                int charsNeeded = (ch <= UNI_MAX_BMP) ? 1 : 2;

                /* check that chars has sufficient space */
                if (chars.Length - (charIndex + numChar) < charsNeeded)
                {
                    throw new ArgumentException("chars has no sufficient space");
                }

                if (charsNeeded == 1)
                {
                    chars[charIndex + numChar] = (char)ch;
                }
                else
                {
                    /* Unpack the uint into the two paired surrogates */
                    chars[charIndex + numChar] = (char)(ch >> 16);
                    chars[charIndex + numChar + 1] = (char)(ch & 0xFFFF);
                }

                /* skip the part of 'bytes' we have already consumed */
                bytesLeft -= bytesConsumed;
                bytePos += bytesConsumed;
                numChar += charsNeeded;
            }

            return numChar;
        }
    }
}