Click here to Skip to main content
15,881,803 members
Articles / Desktop Programming / WTL
Article

PreParse XML using CString

Rate me:
Please Sign up or sign in to vote.
4.29/5 (3 votes)
4 Mar 2004 38.7K   16   2
An article on XML parsing using CString.

Introduction

An XML-like document sometimes cannot be loaded by the XML DOM without pre-parsing, because it contains duplicate attribute-value pairs or because the author accidentally omitted some spaces between attributes.

Background

I have a lot of XML docs which were composed and uploaded by clients, and there are always some errors in them.

Using the code

Read the XML text into a CString, say strxml, then call PreFormatXML(strxml). After that, you can create an XML DOM instance and call LoadXML.

The following are the functions involved:

// Normalizes a loosely written XML string in place so that it can be
// handed to an XML DOM parser afterwards.
//
// Steps, in order:
//   1. Collapse the text to a single line: '\r' is removed, '\n' and '\t'
//      are replaced by a space.
//   2. Replace every <!-- ... --> comment by a single space.
//   3. Lower-case every character outside double-quoted spans, append
//      "\r\n" after every '>' found outside quotes, and insert a space
//      when a closing quote is directly followed by a letter a-z
//      (e.g. b="x"c="y").
//   4. Rewrite every <...> tag: a closing tag is cut at its first space;
//      an opening tag that contains a space has its name looked up in
//      m_ArrPreDefTag and its attributes rebuilt by RemoveDuplicate().
//      Opening tags without a space are left as step 3 produced them.
//
// strxml - [in,out] the XML text; modified in place.
//
// Uses m_ArrPreDefTag (elements of type preTag with the members tag,
// oritag and preattrs), which is declared outside this function.
void PreFormatXML(CString& strxml)
{
    // Declared up front: the comment-stripping loop below already needs it.
    CString strtemp;

    // it is possible that the tooltip include a \r\n
    strxml.Remove(_T('\r'));
    strxml.Replace(_T('\n'),_T(' '));
    strxml.Replace(_T('\t'),_T(' '));

    //////////////////////// in the following, you should reassign \r\n
    ////////// now just one line

    // Step 2: strip comments. Replace() substitutes every occurrence of the
    // comment text, so the search restarts from the beginning each time.
    int iStart = 0;
    int iEnd1,iEnd2;
    while(iStart>-1)
    {
        iEnd1 = strxml.Find(_T("<!--"),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T("-->"),iEnd1);
        if(iEnd2<0)
            break;

        if(iEnd2>iEnd1) {
            strtemp = strxml.Mid(iEnd1,iEnd2-iEnd1+3);
            strxml.Replace(strtemp,_T(" "));
            iStart = 0;
            continue;
        }
        iStart = iEnd2+1;
    }

    // Step 3a: build a mask of strxml in which every double-quoted span
    // (quotes included) is overwritten with '*'.
    iStart  = 0;
    iEnd1   = 0;
    iEnd2   = 0;
    CString strmask = strxml;
    while(iEnd2>-1)
    {
        iEnd1 = strxml.Find(_T('\"'),iStart);
        iEnd2 = strxml.Find(_T('\"'),iEnd1+1);
        if(iEnd1>-1&&iEnd2>-1) {
            for(int i=iEnd1;i<iEnd2+1;i++)
                strmask.SetAt(i,_T('*'));
        }
        else
            break;
        iStart = iEnd2+1;
    }

    strmask.MakeLower();

    // Step 3b: minimize all tag and split concat attr.
    // i1 walks the mask; i2 walks strxml and runs ahead of i1 by the number
    // of characters inserted so far.
    int i1 = 0;
    int i2 = 0;
    int nlen = strmask.GetLength();
    while(i1<nlen)
    {
        TCHAR ch1 = strmask.GetAt(i1);
        if(ch1!=_T('*'))
            strxml.SetAt(i2,ch1);       // copy back the lower-cased character

        if(ch1==_T('>'))
        {
            // two inserts at the same index leave "\r\n" right after '>'
            strxml.Insert(i2+1,_T('\n'));
            strxml.Insert(i2+1,_T('\r'));
            i2 += 2;
        }

        if(ch1==_T('*') && i1<nlen-1)
        {
            // a masked character directly followed by a letter: separate
            // the two with a space
            TCHAR ch2 = strmask.GetAt(i1+1);
            if(ch2>=_T('a')&&ch2<=_T('z'))
            {
                strxml.Insert(i2+1,_T(' '));
                i2++;
            }
        }
        i1++;
        i2++;
    }

    ////// the following remove duplicate tag, and makelower
    //     of everything except attribute value.
    nlen = strxml.GetLength();
    strxml.Insert(nlen,_T("\r\n"));

    CString strfake,strleft,strright;

    ///////////////////////////////////////////////////////////////////
    int size = m_ArrPreDefTag.GetSize();
    preTag pa;
    CString strnodename;

    // Step 4: process each "<" ... ">\r\n" span in turn.
    iStart    = iEnd1    = iEnd2    = 0;
    while(iEnd1>-1)
    {
        nlen  = strxml.GetLength();
        iEnd1 = strxml.Find(_T('<'),iStart);
        if(iEnd1<0)
            break;
        iEnd2 = strxml.Find(_T(">\r\n"),iEnd1);
        if(iEnd2<0)
            break;

        //// keep left and right
        strleft  = strxml.Left(iEnd1+1);        // up to and including '<'
        strright = strxml.Right(nlen-iEnd2);    // from '>' to the end
        // pick out <> and process it
        strtemp = strxml.Mid(iEnd1+1,iEnd2-iEnd1-1);
        strfake = strtemp;
        strfake.TrimLeft();
        strfake.TrimRight();

        int lensub = strfake.GetLength();
        if(lensub>0)
        {
            bool bselfClosed    = _T('/')==strfake.GetAt(lensub-1);
            if(bselfClosed)
                strfake = strfake.Left(lensub-1);

            strfake.TrimLeft();
            strfake.TrimRight();
            bool breversetag    =_T('/')==strfake.GetAt(0);
            if(breversetag)
            {
                // Drop the leading '/'. Mid(1) is used rather than a count
                // derived from lensub, because strfake may already have been
                // shortened above.
                strfake = strfake.Mid(1);
                strfake.TrimLeft();

                // just truncate it. reverse tag has no attr-value pair
                int n1 = strfake.Find(_T(' '));
                if(n1>0)
                    strfake = strfake.Left(n1);  // keep the complete tag name

                for(int isize=0;isize<size;isize++)
                {
                    pa = m_ArrPreDefTag[isize];
                    if(strfake.CompareNoCase(pa.tag)==0)
                    {
                        strfake = pa.oritag;
                        break;
                    }
                }
                strfake.MakeLower();
            }
            else
            {
                /// replace tag with predefined tag
                int n0 = strfake.GetLength();
                int n1 = strfake.Find(_T(' '));
                if(n1>0) /// yeah, it include serveral fields
                {
                    strnodename = strfake.Left(n1);
                    strnodename.MakeLower();
                    for(int isize=0;isize<size;isize++)
                    {
                        pa = m_ArrPreDefTag[isize];
                        if(strnodename.CompareNoCase(pa.tag)==0)
                        {
                            strnodename = pa.oritag + _T(" ") + pa.preattrs;
                            break;
                        }
                    }
                    // reconcat
                    // find out nodename, attr-pair;
                    ATLASSERT(n0>n1);
                    strfake = strnodename+strfake.Right(n0-n1);
                    // breplaced is possible to change in this tag.
                    RemoveDuplicate(strfake);
                }
                ///only one tag, and no attr-value pair

            }
            strtemp = ((breversetag&&!bselfClosed)?_T("/"):_T("")) +
                strfake +     ((bselfClosed&&!breversetag)?_T("/"):_T("")) ;
            strxml = strleft + strtemp    +     strright;
            iEnd2    = strtemp.GetLength()+iEnd1;
        }
        else
            ATLASSERT(0);        // there must be no < > things
        iStart = iEnd2+3;
    }
    return;
}

// Rebuilds the text of one tag ("name attr="value" ...", without the
// surrounding '<' and '>') in place.
//
// The tag name and all attribute names are lower-cased, attribute values
// are trimmed, and when an attribute name occurs more than once only its
// first occurrence is kept. The result has the form
//     name attr1="value1" attr2="value2" ...
//
// str - [in,out] the tag text. It is always trimmed; if it contains no
//       space after trimming (no attributes) nothing else is changed.
//
// Only double-quoted values are recognised. Scanning stops at the first
// quote that has no matching closing quote; anything from there on is
// dropped from the rebuilt tag.
void RemoveDuplicate(CString& str)
{
    str.TrimLeft();
    str.TrimRight();

    int n0 = str.GetLength();
    int n1 = str.Find(_T(' '));
    if(n1<=0)
        return;     // a bare tag name: nothing to rebuild

    CString strnodename = str.Left(n1);
    strnodename.MakeLower();

    // everything after the tag name
    CString strfake = str.Right(n0-n1);
    strfake.TrimLeft();
    strfake.TrimRight();

    CSimpleMap<CString,CString> attributes;
    CString strattr,strvalue;

    /// find attr-value in strfake
    int m0 = 0;     // start of the text belonging to the current attribute
    for(;;)
    {
        int m1 = strfake.Find(_T('\"'),m0);         // opening quote
        if(m1<0)
            break;

        int m2 = strfake.Find(_T('\"'),m1+1);       // closing quote
        if(m2<0)
            break;  // unmatched quote: stop instead of rescanning from 0 forever

        // Take everything before the opening quote; the '=' and surrounding
        // blanks are stripped below, so no character has to be cut off here.
        strattr  = strfake.Mid(m0,m1-m0);
        strattr.Remove(_T('='));
        strattr.MakeLower();
        strattr.TrimLeft();
        strattr.TrimRight();

        strvalue = strfake.Mid(m1+1,m2-m1-1);
        strvalue.TrimLeft();
        strvalue.TrimRight();

        // the first occurrence of an attribute wins
        if(attributes.FindKey(strattr)<0)
            attributes.Add(strattr,strvalue);

        m0 = m2+1;
    }

    // recompose the tag from the collected pairs
    str = strnodename;
    int size = attributes.GetSize();
    for(int i=0;i<size;i++)
    {
        str += _T(" ");
        str += attributes.GetKeyAt(i);
        str += _T("=\"");
        str += attributes.GetValueAt(i);
        str += _T("\"");
    }
}

As you can see above, all of the attribute-value pairs are recomposed using CSimpleMap. When a duplicate attribute is encountered, only its first occurrence is kept. Also, in strings such as <a b="blah"c="interesting value">, a space is inserted between "blah" and c; otherwise the document cannot be loaded successfully.

This code has been tested under XP, VS.NET 2002, WTL 71, ATL70. Any comment is appreciated.

License

This article has no explicit license attached to it but may contain usage terms in the article text or the download files themselves. If in doubt please contact the author via the discussion board below.

A list of licenses authors might use can be found here


Written By
United States United States
This member has not yet provided a Biography. Assume it's interesting and varied, and probably something to do with programming.

Comments and Discussions

 
Questionerror ! Pin
bob zheng14-Aug-07 19:18
bob zheng14-Aug-07 19:18 
GeneralSsiballoma.. Pin
ukizima16-Jun-05 0:56
ukizima16-Jun-05 0:56 
Mad | :mad:

General General    News News    Suggestion Suggestion    Question Question    Bug Bug    Answer Answer    Joke Joke    Praise Praise    Rant Rant    Admin Admin   

Use Ctrl+Left/Right to switch messages, Ctrl+Up/Down to switch threads, Ctrl+Shift+Left/Right to switch pages.