A custom string to handle UTF8 natively in C#

Pascal Ganaye

5.00/5 (2 votes)

Dec 15, 2011

CPOL

14587

A custom C# string implementation that stores its data in a UTF8 byte array.

I am doing a lot of UTF8 processing, and I found that most of the processor time is wasted encoding and decoding data back and forth between byte arrays and strings. I am experimenting with the idea of not converting the entire data to a string, which should run faster and take less space in memory. The idea is NOT to try to encode faster than the framework; it is to load the data into a byte array and convert only what is required to and from UTF8, when it is required. I thought it might be useful.

    /// <summary>
    /// An immutable string value whose content is kept as a UTF-8 byte array,
    /// so that data which already is UTF-8 does not have to be converted to a
    /// UTF-16 <see cref="string"/> as a whole.
    /// </summary>
    /// <remarks>
    /// Character positions are counted with a lead-byte length table that only
    /// knows 1-, 2- and 3-byte sequences. Every other byte value (including the
    /// lead byte and continuation bytes of a 4-byte sequence) is counted as one
    /// unit of its own and is decoded on its own by the enumerator.
    /// </remarks>
    public struct Utf8String : IEnumerable<char>, IEquatable<Utf8String>
    {
        // Shared stand-in for the null array of default(Utf8String).
        private static readonly Byte[] EmptyBytes = new Byte[0];

        // Maps a lead byte to the number of bytes its sequence occupies.
        private static readonly Byte[] mCharLength;

        private readonly Byte[] m_bytes;

        /// <summary>The empty string (zero bytes).</summary>
        public static readonly Utf8String Empty = new Utf8String(EmptyBytes);

        static Utf8String()
        {
            mCharLength = new Byte[256];

            // Default: ASCII (0x00-0x7F), stray continuation bytes (0x80-0xBF),
            // 4/5/6-byte lead bytes (0xF0-0xFD) and 0xFE/0xFF all advance by one byte.
            for (int i = 0; i < mCharLength.Length; i++) mCharLength[i] = 1;

            for (int i = 0xC0; i <= 0xDF; i++) mCharLength[i] = 2; // 2-byte lead
            for (int i = 0xE0; i <= 0xEF; i++) mCharLength[i] = 3; // 3-byte lead
        }

        /// <summary>Creates a UTF-8 string from a regular string.</summary>
        /// <param name="value">The text to encode as UTF-8.</param>
        /// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
        public Utf8String(string value)
        {
            if (value == null) throw new ArgumentNullException("value");
            m_bytes = Encoding.UTF8.GetBytes(value);
        }

        /// <summary>
        /// Wraps an existing array without copying it. This stays private:
        /// a public version would hand callers a reference to the internal
        /// array and let them change the content afterwards.
        /// </summary>
        /// <param name="value">The UTF-8 bytes to wrap.</param>
        private Utf8String(Byte[] value)
        {
            m_bytes = value;
        }

        /// <summary>
        /// The backing bytes, never null. default(Utf8String) has a null field,
        /// which is treated as the empty string.
        /// </summary>
        private Byte[] Bytes
        {
            get { return m_bytes ?? EmptyBytes; }
        }

        /// <summary>Encodes a regular string as a <see cref="Utf8String"/>.</summary>
        public static implicit operator Utf8String(string value)
        {
            return new Utf8String(value);
        }

        /// <summary>Concatenates two strings by joining their byte arrays.</summary>
        public static Utf8String operator +(Utf8String a, Utf8String b)
        {
            Byte[] left = a.Bytes;
            Byte[] right = b.Bytes;
            var newBytes = new Byte[left.Length + right.Length];
            left.CopyTo(newBytes, 0);
            right.CopyTo(newBytes, left.Length);
            return new Utf8String(newBytes);
        }

        /// <summary>
        /// Decodes the whole content. This is not efficient, which is why
        /// " (n bytes)" is appended: to discourage using it as a conversion.
        /// </summary>
        public override string ToString()
        {
            Byte[] bytes = Bytes;
            return string.Format("{0} ({1} bytes)", System.Text.Encoding.UTF8.GetString(bytes), bytes.Length);
        }

        /// <summary>Byte-wise equality.</summary>
        public static bool operator ==(Utf8String a, Utf8String b)
        {
            return CompareArrays(a.Bytes, b.Bytes);
        }

        /// <summary>Byte-wise inequality.</summary>
        public static bool operator !=(Utf8String a, Utf8String b)
        {
            return !CompareArrays(a.Bytes, b.Bytes);
        }

        /// <summary>Byte-wise equality, consistent with the == operator.</summary>
        public bool Equals(Utf8String other)
        {
            return CompareArrays(Bytes, other.Bytes);
        }

        /// <summary>Byte-wise equality with another boxed <see cref="Utf8String"/>.</summary>
        public override bool Equals(object obj)
        {
            return obj is Utf8String && Equals((Utf8String)obj);
        }

        /// <summary>Hash of the content bytes (FNV-1a), consistent with Equals.</summary>
        public override int GetHashCode()
        {
            unchecked
            {
                int hash = (int)2166136261;
                foreach (Byte b in Bytes)
                {
                    hash = (hash ^ b) * 16777619;
                }
                return hash;
            }
        }

        /// <summary>Returns true when both arrays hold the same bytes.</summary>
        private static bool CompareArrays(byte[] a, byte[] b)
        {
            if (ReferenceEquals(a, b)) return true;
            if (a.Length != b.Length) return false;

            for (int i = 0; i < a.Length; i++)
            {
                if (a[i] != b[i]) return false;
            }
            return true;
        }

        /// <summary>
        /// Number of characters, found by walking the bytes with the
        /// lead-byte table. This is a linear scan on every call.
        /// </summary>
        public int length
        {
            get
            {
                Byte[] bytes = Bytes;
                int result = 0;
                int mbytesIndex = 0;
                while (mbytesIndex < bytes.Length)
                {
                    mbytesIndex += mCharLength[bytes[mbytesIndex]];
                    result++;
                }
                return result;
            }
        }

        /// <summary>Returns <paramref name="length"/> characters starting at character <paramref name="startIndex"/>.</summary>
        /// <param name="startIndex">Zero-based character index of the first character.</param>
        /// <param name="length">Number of characters to take.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Either argument is negative or reaches past the end of the string.
        /// </exception>
        internal Utf8String SubString(int startIndex, int length)
        {
            if (startIndex < 0) throw new ArgumentOutOfRangeException("startIndex");
            var startmBytesIndex = GetmBytesIndex(startIndex);
            if (startmBytesIndex < 0) throw new ArgumentOutOfRangeException("startIndex");

            if (length == 0) return Utf8String.Empty;
            if (length < 0) throw new ArgumentOutOfRangeException("length");

            var endmBytesIndex = GetmBytesIndex(length, startmBytesIndex);
            if (endmBytesIndex < 0) throw new ArgumentOutOfRangeException("length");

            Byte[] bytes = Bytes;

            // The request covers the whole string: compare byte offsets, because
            // a character count only equals the byte count for pure ASCII content.
            if (startmBytesIndex == 0 && endmBytesIndex == bytes.Length) return this;

            var newBytes = new Byte[endmBytesIndex - startmBytesIndex];
            Array.Copy(bytes, startmBytesIndex, newBytes, 0, newBytes.Length);
            return new Utf8String(newBytes);
        }

        /// <summary>Returns everything from character <paramref name="startIndex"/> to the end.</summary>
        /// <param name="startIndex">Zero-based character index of the first character.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="startIndex"/> is negative or past the end of the string.
        /// </exception>
        internal Utf8String SubString(int startIndex)
        {
            if (startIndex == 0) return this;

            if (startIndex < 0) throw new ArgumentOutOfRangeException("startIndex");
            var startmBytesIndex = GetmBytesIndex(startIndex);
            if (startmBytesIndex < 0) throw new ArgumentOutOfRangeException("startIndex");

            Byte[] bytes = Bytes;
            var newBytes = new Byte[bytes.Length - startmBytesIndex];
            Array.Copy(bytes, startmBytesIndex, newBytes, 0, newBytes.Length);
            return new Utf8String(newBytes);
        }

        /// <summary>
        /// Advances <paramref name="charCount"/> characters from byte offset
        /// <paramref name="mbytesIndex"/> and returns the resulting byte offset,
        /// or -1 when the string ends before that many characters were skipped.
        /// </summary>
        private int GetmBytesIndex(int charCount, int mbytesIndex = 0)
        {
            if (charCount == 0) return mbytesIndex;

            Byte[] bytes = Bytes;
            while (mbytesIndex < bytes.Length)
            {
                mbytesIndex += mCharLength[bytes[mbytesIndex]];
                charCount--;
                if (charCount == 0) return mbytesIndex;
            }
            return -1;
        }

        /// <summary>Enumerates the characters, decoding one sequence at a time.</summary>
        public IEnumerator<char> GetEnumerator()
        {
            return new Utf8StringEnumerator(this);
        }

        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
        {
            return new Utf8StringEnumerator(this);
        }

        /// <summary>
        /// Walks the byte array and decodes only the sequence under the cursor.
        /// </summary>
        private sealed class Utf8StringEnumerator : IEnumerator<char>
        {
            private readonly Byte[] m_bytes;
            private readonly int mLength;
            private int mBytesIndex;
            private char mCurChar;

            public char Current
            {
                get { return mCurChar; }
            }

            object System.Collections.IEnumerator.Current
            {
                get { return mCurChar; }
            }

            internal Utf8StringEnumerator(Utf8String source)
            {
                m_bytes = source.Bytes;
                mLength = m_bytes.Length;
                Reset();
            }

            public bool MoveNext()
            {
                if (mBytesIndex >= mLength) return false;

                int curCharLength = Utf8String.mCharLength[m_bytes[mBytesIndex]];

                // "<=" and not "<": a sequence that ends exactly on the last
                // byte is complete and must still be returned.
                if (mBytesIndex + curCharLength > mLength) return false;

                mCurChar = System.Text.Encoding.UTF8.GetString(m_bytes, mBytesIndex, curCharLength)[0];
                mBytesIndex += curCharLength;
                return true;
            }

            public void Reset()
            {
                this.mCurChar = '\0';
                this.mBytesIndex = 0;
            }

            public void Dispose()
            {
            }
        }
    }

Here is an attempt at doing the UTF8/Char conversions in C#:

/// <summary>
/// Decodes a single UTF-8 sequence of one, two or three bytes into a char.
/// </summary>
/// <param name="bytes">The bytes of exactly one encoded character.</param>
/// <returns>
/// The decoded character, or '\0' when the array length is not 1, 2 or 3.
/// </returns>
public static char Utf8ToChar(byte[] bytes)
{
    switch (bytes.Length)
    {
        case 1:
            // Single byte: the value is the code point itself.
            return (char)bytes[0];

        case 2:
        {
            // Strip the 0xC0 lead marker and the 0x80 continuation marker.
            int high = bytes[0] - 0xC0;
            int low = bytes[1] - 0x80;
            return (char)(high * 0x40 + low);
        }

        case 3:
        {
            // Strip the 0xE0 lead marker and both 0x80 continuation markers.
            int high = bytes[0] - 0xE0;
            int middle = bytes[1] - 0x80;
            int low = bytes[2] - 0x80;
            return (char)(high * 0x1000 + middle * 0x40 + low);
        }

        default:
            return (char)0;
    }
}

/// <summary>
/// Encodes a single UTF-16 code unit as UTF-8.
/// </summary>
/// <param name="c">The character to encode.</param>
/// <returns>
/// One byte for values below 0x80, two bytes for values below 0x800, and
/// three bytes for the rest. A surrogate code unit (0xD800-0xDFFF) is
/// returned as the bytes EF BF BD.
/// </returns>
public static byte[] CharToUtf8(char c)
{
    if (c < 0x80)
    {
        return new byte[] { (byte)c };
    }

    // Split the value into 6-bit groups, lowest group first.
    byte c0 = (byte)(c & 0x3f);
    byte c1 = (byte)((c >> 6) & 0x3f);
    byte c2 = (byte)((c >> 12) & 0x3f);

    if (c < 0x800)
    {
        return new byte[] { (byte)(0xC0 + c1), (byte)(0x80 + c0) };
    }

    if (c < 0xd800 || c >= 0xe000)
    {
        return new byte[] { (byte)(0xE0 + c2), (byte)(0x80 + c1), (byte)(0x80 + c0) };
    }

    // A lone surrogate is not a complete character on its own.
    return new byte[] { 0xEF, 0xBF, 0xBD };
}