Click here to Skip to main content
14,981,456 members
Articles / Desktop Programming / WTL
Article
Posted 4 Mar 2004

Stats

36.9K views
16 bookmarked

PreParse XML using CString

Rate me:
Please Sign up or sign in to vote.
4.29/5 (3 votes)
4 Mar 2004
An article on XML parsing using CString.

Introduction

Sometimes an XML-like document cannot be loaded by the XML DOM unless it is pre-processed first, because it contains duplicate attribute-value pairs or because the author accidentally omitted the space between two attributes.

Background

I have a lot of XML docs which were composed and uploaded by clients, but there are always some errors in them.

Using the code

Read the XML text into a CString, say strxml, and call PreFormatXML(strxml). After that, you can create an instance of the XML DOM and call LoadXML with the cleaned-up string.

The following are the functions involved:

// Normalises a loosely written XML string in place so that an XML DOM can
// load it afterwards.
//
// Steps, in order:
//   1. Collapse the text onto one line (drop '\r', turn '\n' and '\t' into ' ').
//   2. Replace every "<!-- ... -->" comment with a single space.
//   3. Lower-case everything outside double-quoted attribute values, insert
//      "\r\n" after every '>' and insert a missing space between a closing
//      quote and the attribute name that directly follows it
//      (<a b="x"c="y"> becomes <a b="x" c="y">).
//   4. Rewrite each <...> tag: tag names found in m_ArrPreDefTag are replaced
//      by the entry's oritag (plus its preattrs on tags that carry attributes)
//      and duplicate attributes are removed via RemoveDuplicate().
//
// strxml : [in,out] the XML text; rewritten in place.
//
// Reads m_ArrPreDefTag (entries expose tag, oritag and preattrs) and calls
// RemoveDuplicate(). Only double-quoted attribute values are recognised.
void PreFormatXML(CString& strxml)
{
    // Declared up front: the comment-stripping loop below already needs strtemp.
    CString strtemp,strfake,strleft,strright;

    // it is possible that the tooltip include a \r\n
    strxml.Remove(_T('\r'));
    strxml.Replace(_T('\n'),_T(' '));
    strxml.Replace(_T('\t'),_T(' '));

    ////////// now just one line; "\r\n" is re-inserted after each tag below
    int iStart = 0;
    int iEnd1 = 0;
    int iEnd2 = 0;

    // Strip comments. Replace() removes every identical copy of the comment,
    // so the search simply restarts from the beginning each time.
    for(;;)
    {
        iEnd1 = strxml.Find(_T("<!--"));
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T("-->"),iEnd1);
        if(iEnd2<0)
            break;              // unterminated comment: leave it alone

        strtemp = strxml.Mid(iEnd1,iEnd2-iEnd1+3);
        strxml.Replace(strtemp,_T(" "));
    }

    // Build a shadow copy in which every quoted region (quotes included) is
    // overwritten with a sentinel. A control character is used rather than a
    // printable one so that literal text in the document can never be
    // mistaken for a masked region.
    const TCHAR chMask = _T('\x01');
    CString strmask = strxml;
    iStart = 0;
    for(;;)
    {
        iEnd1 = strxml.Find(_T('\"'),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T('\"'),iEnd1+1);
        if(iEnd2<0)
            break;              // unbalanced quote: stop masking
        for(int i=iEnd1;i<=iEnd2;i++)
            strmask.SetAt(i,chMask);
        iStart = iEnd2+1;
    }

    strmask.MakeLower();

    // minimize all tag and split concat attr
    // i1 walks the (fixed-length) mask, i2 walks strxml, which grows as
    // characters are inserted.
    int i1 = 0;
    int i2 = 0;
    int nlen = strmask.GetLength();
    while(i1<nlen)
    {
        TCHAR ch1 = strmask.GetAt(i1);
        if(ch1!=chMask)
            strxml.SetAt(i2,ch1);       // copy the lower-cased character

        if(ch1==_T('>'))
        {
            // put every tag on its own line
            strxml.Insert(i2+1,_T('\n'));
            strxml.Insert(i2+1,_T('\r'));
            i2 += 2;
        }

        if(ch1==chMask && i1<nlen-1)
        {
            // A closing quote directly followed by a letter means the space
            // between two attributes was omitted; add it back.
            TCHAR ch2 = strmask.GetAt(i1+1);
            if(ch2>=_T('a')&&ch2<=_T('z'))
            {
                strxml.Insert(i2+1,_T(' '));
                i2++;
            }
        }
        i1++;
        i2++;
    }

    ////// the following remove duplicate tag, and makelower
    //     of everything except attribute value.
    strxml += _T("\r\n");

    ///////////////////////////////////////////////////////////////////
    int size = m_ArrPreDefTag.GetSize();
    preTag pa;
    CString strnodename;

    iStart = 0;
    for(;;)
    {
        nlen  = strxml.GetLength();
        iEnd1 = strxml.Find(_T('<'),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T(">\r\n"),iEnd1);
        if(iEnd2<0)
            break;

        //// keep left (up to and including '<') and right (from '>' onwards)
        strleft = strxml.Left(iEnd1+1);
        strright= strxml.Right(nlen-iEnd2);
        // pick out <> and process it
        strtemp = strxml.Mid(iEnd1+1,iEnd2-iEnd1-1);
        strfake = strtemp;
        strfake.TrimLeft();
        strfake.TrimRight();

        int lensub = strfake.GetLength();
        if(lensub>0)
        {
            TCHAR chFirst = strfake.GetAt(0);
            if(chFirst==_T('?')||chFirst==_T('!'))
            {
                // Processing instructions / declarations (<?xml ...?>,
                // <!DOCTYPE ...>) are not elements; rebuilding their
                // "attributes" would drop the trailing '?'. Leave untouched.
                iStart = iEnd2+3;
                continue;
            }

            bool bselfClosed    = _T('/')==strfake.GetAt(lensub-1);
            if(bselfClosed)
                strfake = strfake.Left(lensub-1);

            strfake.TrimLeft();
            strfake.TrimRight();
            // strfake can be empty here (e.g. "< / >"), so guard GetAt(0).
            bool breversetag    = !strfake.IsEmpty() && _T('/')==strfake.GetAt(0);
            if(breversetag)
            {
                // drop the leading '/' and anything after the tag name
                strfake = strfake.Mid(1);
                strfake.TrimLeft();
                int n1 = strfake.Find(_T(' '));

                // just truncate it. reverse tag has no attr-value pair
                if(n1>0)
                    strfake = strfake.Left(n1);

                for(int isize=0;isize<size;isize++)
                {
                    pa = m_ArrPreDefTag[isize];
                    if(strfake.CompareNoCase(pa.tag)==0)
                    {
                        strfake = pa.oritag;
                        break;
                    }
                }
                strfake.MakeLower();
            }
            else
            {
                /// replace tag with predefined tag
                int n0 = strfake.GetLength();
                int n1 = strfake.Find(_T(' '));
                if(n1>0) /// yeah, it include serveral fields
                {
                    strnodename = strfake.Left(n1);
                    strnodename.MakeLower();
                    for(int isize=0;isize<size;isize++)
                    {
                        pa = m_ArrPreDefTag[isize];
                        if(strnodename.CompareNoCase(pa.tag)==0)
                        {
                            strnodename = pa.oritag + _T(" ") + pa.preattrs;
                            break;
                        }
                    }
                    // reconcat: node name followed by the original
                    // attribute text (which still starts with the space)
                    ATLASSERT(n0>n1);
                    strfake = strnodename+strfake.Right(n0-n1);
                    RemoveDuplicate(strfake);
                }
                /// only one tag, and no attr-value pair: kept as it is.
                // NOTE(review): a bare opening tag is not mapped through
                // m_ArrPreDefTag although its closing tag is - confirm
                // that this asymmetry is intended.
            }

            // A closing tag always gets its '/' back; a trailing '/' is only
            // re-emitted for a real self-closing element.
            strtemp = (breversetag?_T("/"):_T("")) +
                strfake + ((bselfClosed&&!breversetag)?_T("/"):_T(""));
            strxml = strleft + strtemp + strright;
            // index of '>' in the rebuilt string
            iEnd2 = iEnd1 + 1 + strtemp.GetLength();
        }
        else
            ATLASSERT(0);        // there must be no < > things
        iStart = iEnd2+3;        // continue after ">\r\n"
    }
    return;
}

// Rebuilds the inside of one tag ("name attr="value" ...", without the angle
// brackets) so that every attribute name occurs only once.
//
// str : [in,out] tag text. It is trimmed; if it contains at least one space
//       it is rewritten as  name attr1="value1" attr2="value2" ...  with the
//       node name and attribute names lower-cased and values trimmed.
//       When an attribute name is repeated, the first value is kept and the
//       later ones are dropped. Text without a space is left as it is.
//
// Only double-quoted values are recognised. Anything after the last complete
// name="value" pair (including an unterminated quote) is discarded.
void RemoveDuplicate(CString& str)
{
    str.TrimLeft();
    str.TrimRight();

    int n0 = str.GetLength();
    int n1 = str.Find(_T(' '));
    if(n1<=0)
        return;                 // no attributes: nothing to rebuild

    CString strnodename = str.Left(n1);
    strnodename.MakeLower();

    // everything after the node name
    CString strfake = str.Right(n0-n1);
    strfake.TrimLeft();
    strfake.TrimRight();

    CSimpleMap<CString,CString> attributes;
    CString strattr,strvalue;

    /// find attr-value pairs in strfake
    int m0 = 0;                 // start of the current attribute name
    for(;;)
    {
        int m1 = strfake.Find(_T('\"'),m0);     // opening quote
        if(m1<0)
            break;

        int m2 = strfake.Find(_T('\"'),m1+1);   // closing quote
        if(m2<0)
            break;              // unterminated value: stop instead of looping forever

        // Name is the text before the opening quote, minus '=' and blanks.
        strattr  = strfake.Mid(m0,m1-m0);
        strattr.Remove(_T('='));
        strattr.MakeLower();
        strattr.TrimLeft();
        strattr.TrimRight();

        strvalue = strfake.Mid(m1+1,m2-m1-1);
        strvalue.TrimLeft();
        strvalue.TrimRight();

        // Skip quoted text that has no name in front of it, and keep only
        // the first value seen for each name.
        if(!strattr.IsEmpty() && attributes.FindKey(strattr)<0)
            attributes.Add(strattr,strvalue);

        m0 = m2+1;
    }

    // re-emit the tag from the de-duplicated map, in first-seen order
    str = strnodename;
    int size = attributes.GetSize();
    for(int i=0;i<size;i++)
    {
        str += _T(" ");
        str += attributes.GetKeyAt(i);
        str += _T("=\"");
        str += attributes.GetValueAt(i);
        str += _T("\"");
    }
    return;
}

As you can see above, I rebuild all of the attribute-value pairs using CSimpleMap. When a duplicate attribute is encountered, only its first occurrence is kept and the later ones are dropped. Another thing: in strings such as <a b="blah"c="interesting value">, a space is inserted between "blah" and c; otherwise the document cannot be loaded successfully.

This code has been tested under Windows XP with VS.NET 2002, WTL 7.1 and ATL 7.0. Any comments are appreciated.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here

Share

About the Author

Chau Johnthan
United States United States
No Biography provided

Comments and Discussions

 
Questionerror ! Pin
bob zheng14-Aug-07 19:18
Memberbob zheng14-Aug-07 19:18 
GeneralSsiballoma.. Pin
ukizima16-Jun-05 0:56
Memberukizima16-Jun-05 0:56 

General General    News News    Suggestion Suggestion    Question Question    Bug Bug    Answer Answer    Joke Joke    Praise Praise    Rant Rant    Admin Admin   

Use Ctrl+Left/Right to switch messages, Ctrl+Up/Down to switch threads, Ctrl+Shift+Left/Right to switch pages.