XML Sitemap

Ștefan-Mihai MOGA

4.91/5 (10 votes)

Aug 6, 2014

GPL3

2 min read

19814

358

An alternative Windows version to XML Sitemap online generators

Download demo project with source code (from GitHub)

XML Sitemap

Introduction

Sitemaps provide a way for you to tell search engines about the pages on your website that might not otherwise be found, as well as provide additional supporting information about the pages.

They are essentially a structured list of all the pages in your website.

In addition to listing the pages, you can tell search engines when each page was last updated, how often it is updated, and the relative priority of each page within your website.

Why Get a Sitemap?

Sitemaps help you improve your search engine coverage and ranking. If you verify ownership of your website with search engines such as Google and Bing, you can access a wealth of information about how Google sees your website and how visitors are finding you.

Maximise your search engine coverage and ranking
Improve the speed with which your pages get listed
Take advantage of Google and Bing webmaster tools
Learn how Google crawls, indexes and ranks your site
Analyze search data to see how many people find you

XML Sitemaps

XML sitemaps are the preferred option for detailing your website and contain more detail to inform the search engines about the structure of your website.

The core elements of an XML Sitemap are:

Web page address
Last modified date
Update frequency
Relative priority within your site

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>http://www.example.com/</loc>
        <lastmod>2005-01-01</lastmod>
        <changefreq>monthly</changefreq>
        <priority>0.8</priority>
    </url>
</urlset>

XML Sitemaps can also be extended to include details about media content in your pages such as images.

RSS Sitemaps

RSS feeds are typically used to inform search engines, directories and your users about new and updated content, although they can also be used to list all the pages in your website.

The good thing about RSS feeds is that you can list all your new and updated webpages and submit them to the search engines and directories to improve the inclusion of new content.

<channel>
    <title>Website name</title>
    <link>http://www.xmlwitemapgenerator.com</link>
    <description>Website description here</description>

    <item>
        <title>A Sitemap entry</title>
        <link>http://www.xmlwitemapgenerator.com/example</link>
        <description>Sitemap entry description</description>
    </item>

    <item>
        <title>Another item</title>
        <link>http://www.xmlwitemapgenerator.com/xml</link>
        <description>Another entry description</description>
    </item>
</channel>

HTML Sitemaps

HTML sitemaps are traditional web pages that provide a structured list of webpages in your website.

They are more accessible to people, but they also help search engines find pages in your website.

Unlike RSS and XML sitemaps, HTML sitemaps do not contain the same metadata about your pages.

Using the Code

The main implementation is done in the XMLSitemapExt.h and XMLSitemapExt.cpp files, as follows:

BOOL IsLocalURL(LPCTSTR lpszAbsoluteURL, LPCTSTR lpszBaseURL)
{
    // Check to see if <lpszAbsoluteURL> has the same domain as <lpszBaseURL>
    TCHAR lpszFooBarURL[MAX_URL_LENGTH] = { 0 };
    TCHAR lpszDomainURL[MAX_URL_LENGTH] = { 0 };
    ASSERT(IsValidURL(lpszAbsoluteURL, FALSE));
    ASSERT(IsValidURL(lpszBaseURL, FALSE));
    if ((lpszAbsoluteURL != NULL) && (_tcslen(lpszAbsoluteURL) > 0))
    {
        if ((lpszBaseURL != NULL) && (_tcslen(lpszBaseURL) > 0))
        {
            _tcscpy_s(lpszDomainURL, MAX_URL_LENGTH, lpszBaseURL);
            LPCTSTR lpszStart = _tcsstr(lpszDomainURL, _T("//"));
            if (lpszStart != NULL)
            {
                lpszStart++;
                lpszStart++;
                ASSERT(lpszStart != NULL);
                LPTSTR lpszEnd = (LPTSTR) _tcschr(lpszStart, _T('/'));
                if (lpszEnd != NULL)
                    lpszEnd[0] = _T('\0'); // modify lpszDomainURL

                _tcscpy_s(lpszFooBarURL, MAX_URL_LENGTH, lpszAbsoluteURL);
                LPCTSTR lpszBegin = _tcsstr(lpszFooBarURL, _T("//"));
                if (lpszBegin != NULL)
                {
                    lpszBegin++;
                    lpszBegin++;
                    ASSERT(lpszBegin != NULL);
                    lpszEnd = (LPTSTR) _tcschr(lpszBegin, _T('/'));
                    if (lpszEnd != NULL)
                        lpszEnd[0] = _T('\0'); // modify lpszFooBarURL

                    if (_tcsstr(lpszBegin, lpszStart) != NULL)
                        return TRUE;
                }
            }
        }
    }
    return FALSE;
}

// Converts a relative URL (e.g. "../../somedir") to an absolute URL by
// combining it with <lpszBaseURL> through CoInternetCombineUrl.
//
// Parameters:
//   lpszRelativeURL - the relative URL to resolve.
//   lpszBaseURL     - the absolute URL it is relative to.
//
// Returns the combined URL, or an empty string when either argument is
// NULL/empty or when CoInternetCombineUrl does not return S_OK.
CString ConvertURL(LPCTSTR lpszRelativeURL, LPCTSTR lpszBaseURL)
{
    TCHAR lpszAbsoluteURL[MAX_URL_LENGTH] = { 0 };
    ASSERT(IsValidURL(lpszRelativeURL, TRUE));
    ASSERT(IsValidURL(lpszBaseURL, FALSE));

    if ((lpszRelativeURL == NULL) || (_tcslen(lpszRelativeURL) == 0))
        return CString();
    if ((lpszBaseURL == NULL) || (_tcslen(lpszBaseURL) == 0))
        return CString();

    DWORD dwLength = MAX_URL_LENGTH;
    if (CoInternetCombineUrl(lpszBaseURL, lpszRelativeURL, 0,
        lpszAbsoluteURL, MAX_URL_LENGTH, &dwLength, 0) != S_OK)
    {
        OutputDebugString(_T("ERROR: CoInternetCombineUrl has failed\n"));
        return CString();
    }

    // dwLength comes back from the API; clamp it so the terminator can never
    // be written past the end of the local buffer.
    const DWORD dwLastIndex = static_cast<DWORD>(MAX_URL_LENGTH) - 1;
    lpszAbsoluteURL[(dwLength < dwLastIndex) ? dwLength : dwLastIndex] = _T('\0');
    return CString(lpszAbsoluteURL);
}

BOOL ProcessHTML(CXMLSitemapDlg* dlgXMLSitemap, CString strFileName, CString strBaseURL)
{
    CString strURL;
    CString strMessage;
    CString strFileLine;
    BOOL bRetVal = TRUE;
    CStringArray arrURL;
    TCHAR lpszTempPath[MAX_STR_LENGTH] = { 0 };
    TCHAR lpszTempFile[MAX_STR_LENGTH] = { 0 };

    if (dlgXMLSitemap != NULL)
    {
        if (!strFileName.IsEmpty())
        {
            try
            {
                CStdioFile pInputFile(strFileName, CFile::modeRead | CFile::typeText);
                while (pInputFile.ReadString(strFileLine))
                {
                    int nIndex = strFileLine.Find(_T("href="), 0);
                    while (nIndex >= 0)
                    {
                        const int nFirst = strFileLine.Find(_T('\"'), nIndex);
                        if (nFirst >= 0)
                        {
                            const int nLast = strFileLine.Find(_T('\"'), nFirst + 1);
                            if (nLast >= 0)
                            {
                                strURL = strFileLine.Mid(nFirst + 1, nLast - nFirst - 1);
                                strMessage.Format(_T("URL found - %s\n"), strURL);
                                OutputDebugString(strMessage);
                                arrURL.Add(strURL);
                            }
                        }
                        nIndex = (nFirst == -1) ? -1 : strFileLine.Find(_T("href="), nFirst + 1);
                    }
                }
                pInputFile.Close();
            }
            catch (CFileException * pFileException)
            {
                TCHAR lpszError[MAX_STR_LENGTH] = { 0 };
                pFileException->GetErrorMessage(lpszError, MAX_STR_LENGTH);
                pFileException->Delete();
                OutputDebugString(lpszError);
                bRetVal = FALSE;
            }

            const int nSize = arrURL.GetSize();
            for (int nIndex = 0; ((nIndex < nSize) 
            && (dlgXMLSitemap->m_bGenerate)); nIndex++)
            {
                CString strURL = arrURL.GetAt(nIndex);
                if (IsHtmlPage(strURL))
                {
                    if (!IsValidURL(strURL, FALSE))
                    {
                        strURL = ConvertURL(strURL, strBaseURL);
                    }

                    if (IsValidURL(strURL, FALSE))
                    {
                        if (IsLocalURL(strURL, strBaseURL))
                        {
                            const int nFind = strURL.ReverseFind(_T('#'));
                            if (nFind >= 0)
                            {
                                const int nSlash = strURL.Find(_T('/'), nFind);
                                if (nSlash == -1)
                                {
                                    strURL = strURL.Left(nFind);
                                }
                            }

                            if (strURL.GetAt(strURL.GetLength() - 1) == _T('/'))
                                strURL = strURL.Left(strURL.GetLength() - 1);

                            int nSearch = 0;
                            const int nArraySize = g_arrURL.GetSize();
                            for (; nSearch < nArraySize; nSearch++)
                            {
                                if (strURL.CompareNoCase(g_arrURL.GetAt(nSearch)) == 0)
                                    break;
                            }
                            if (nSearch == nArraySize)
                            {
                                const DWORD dwTempPath = 
                                GetTempPath(MAX_STR_LENGTH, lpszTempPath);
                                lpszTempPath[dwTempPath] = '\0';
                                strMessage.Format
                                (_T("lpszTempPath = %s\n"), lpszTempPath);
                                OutputDebugString(strMessage);

                                if (GetTempFileName(lpszTempPath, 
                                _T("map"), 0, lpszTempFile) != 0)
                                {
                                    strMessage.Format
                                    (_T("lpszTempFile = %s\n"), lpszTempFile);
                                    OutputDebugString(strMessage);

                                    if (URLDownloadToFile
                                    (NULL, strURL, lpszTempFile, 0, NULL) == S_OK)
                                    {
                                        strMessage.Format(_T("URL added - %s\n"), strURL);
                                        OutputDebugString(strMessage);
                                        g_arrURL.Add(strURL); // add URL to XML sitemap
                                        dlgXMLSitemap->m_ctrlStatus.SetWindowText(strURL);
                                        dlgXMLSitemap->m_ctrlStatus.UpdateWindow();

                                        if (!ProcessHTML(dlgXMLSitemap, lpszTempFile, strURL))
                                        {
                                            strMessage.Format
                                            (_T("ProcessHTML(%s) has failed\n"), lpszTempFile);
                                            OutputDebugString(strMessage);
                                        }
                                    }
                                    else
                                    {
                                        strMessage.Format
                                        (_T("Broken link %s found in page %s\n"), strURL, strBaseURL);
                                        OutputDebugString(strMessage);
                                        g_arrBrokenLink.Add(strURL);
                                        g_arrBrokenPage.Add(strBaseURL);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        else
        {
            OutputDebugString(_T("ERROR: strFileName is empty\n"));
            bRetVal = FALSE;
        }
    }
    else
    {
        OutputDebugString(_T("ERROR: dlgXMLSitemap = NULL\n"));
        bRetVal = FALSE;
    }
    VERIFY(DeleteFile(strFileName));
    return bRetVal;
}

// Thread entry point that builds the sitemap for the dialog's domain.
//
// Clears g_arrURL, g_arrBrokenLink and g_arrBrokenPage, downloads the start
// page named by m_strDomainName into a temporary file, records it in g_arrURL
// and hands it to ProcessHTML, which follows the links from there.
//
// Parameters:
//   lpParam - pointer to the CXMLSitemapDlg that started the thread.
//
// Always returns 0; failures are reported through OutputDebugString only.
DWORD WINAPI XMLSitemap_ThreadProc(LPVOID lpParam)
{
    CXMLSitemapDlg* dlgXMLSitemap = static_cast<CXMLSitemapDlg*>(lpParam);
    if (dlgXMLSitemap == NULL)
    {
        OutputDebugString(_T("ERROR: dlgXMLSitemap = NULL\n"));
        return 0;
    }

    g_arrURL.RemoveAll();
    g_arrBrokenLink.RemoveAll();
    g_arrBrokenPage.RemoveAll();

    CString strURL = dlgXMLSitemap->m_strDomainName;
    if (strURL.IsEmpty())
    {
        OutputDebugString
        (_T("ERROR: dlgXMLSitemap->m_strDomainName is empty!\n"));
        return 0;
    }

    if (strURL.GetAt(strURL.GetLength() - 1) == _T('/'))
        strURL = strURL.Left(strURL.GetLength() - 1);

    CString strMessage;
    TCHAR lpszTempPath[MAX_STR_LENGTH] = { 0 };
    TCHAR lpszTempFile[MAX_STR_LENGTH] = { 0 };

    // GetTempPath returns 0 on failure and the REQUIRED size when the buffer
    // is too small, so the result must not be used as an index unchecked.
    const DWORD dwTempPath = GetTempPath(MAX_STR_LENGTH, lpszTempPath);
    if ((dwTempPath == 0) || (dwTempPath >= static_cast<DWORD>(MAX_STR_LENGTH)))
    {
        OutputDebugString(_T("GetTempPath has failed\n"));
        return 0;
    }
    strMessage.Format(_T("lpszTempPath = %s\n"), lpszTempPath);
    OutputDebugString(strMessage);

    if (GetTempFileName(lpszTempPath, _T("map"), 0, lpszTempFile) == 0)
    {
        OutputDebugString(_T("GetTempFileName has failed\n"));
        return 0;
    }
    strMessage.Format(_T("lpszTempFile = %s\n"), lpszTempFile);
    OutputDebugString(strMessage);

    strMessage.Format(_T("URLDownloadToFile(%s)...\n"), static_cast<LPCTSTR>(strURL));
    OutputDebugString(strMessage);
    if (URLDownloadToFile(NULL, strURL, lpszTempFile, 0, NULL) != S_OK)
    {
        OutputDebugString(_T("URLDownloadToFile has failed\n"));
        // GetTempFileName already created the file; nothing else will remove
        // it on this path.
        DeleteFile(lpszTempFile);
        return 0;
    }

    strMessage.Format(_T("URL added - %s\n"), static_cast<LPCTSTR>(strURL));
    OutputDebugString(strMessage);
    g_arrURL.Add(strURL); // add URL to XML sitemap
    dlgXMLSitemap->m_ctrlStatus.SetWindowText(strURL);
    dlgXMLSitemap->m_ctrlStatus.UpdateWindow();

    if (!ProcessHTML(dlgXMLSitemap, lpszTempFile, strURL))
    {
        strMessage.Format
        (_T("ProcessHTML(%s) has failed\n"), lpszTempFile);
        OutputDebugString(strMessage);
    }
    return 0;
}

Points of Interest

When the website is completely processed, this MFC application also generates a broken links report.

BOOL ExportXMLSitemap(HWND hWnd, CString strFileName, 
CString strFrequency, CString strPriority, CString strDateTime)
{
    CString strFileLine;
    BOOL bRetVal = TRUE;
    try
    {
        CStdioFile pOutputFile(strFileName, 
        CFile::modeCreate | CFile::modeWrite | CFile::typeText);
        pOutputFile.WriteString(_T
        ("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"));
        pOutputFile.WriteString(_T
        ("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"));
        const int nSize = g_arrURL.GetSize();
        for (int nIndex = 0; nIndex < nSize; nIndex++)
        {
            pOutputFile.WriteString(_T("<url>\n"));

            const CString strURL = g_arrURL.GetAt(nIndex);
            strFileLine.Format(_T("<loc>%s</loc>\n"), strURL);
            pOutputFile.WriteString(strFileLine);

            strFileLine.Format(_T("<lastmod>%s</lastmod>\n"), strDateTime);
            pOutputFile.WriteString(strFileLine);

            strFileLine.Format(_T("<changefreq>%s</changefreq>\n"), strFrequency);
            pOutputFile.WriteString(strFileLine);

            strFileLine.Format(_T("<priority>%s</priority>\n"), strPriority);
            pOutputFile.WriteString(strFileLine);

            pOutputFile.WriteString(_T("</url>\n"));
        }
        pOutputFile.WriteString(_T("</urlset>\n"));
        pOutputFile.Close();

        ::ShellExecute(hWnd, _T("open"), strFileName, NULL, NULL, SW_NORMAL);
    }
    catch (CFileException * pFileException)
    {
        TCHAR lpszError[MAX_STR_LENGTH] = { 0 };
        pFileException->GetErrorMessage(lpszError, MAX_STR_LENGTH);
        pFileException->Delete();
        OutputDebugString(lpszError);
        bRetVal = FALSE;
    }
    return bRetVal;
}

BOOL ExportBrokenLink(HWND hWnd, CString strFileName, CString strDomainName)
{
    CString strFileLine;
    BOOL bRetVal = TRUE;
    try
    {
        CStdioFile pOutputFile(strFileName, CFile::modeCreate | CFile::modeWrite | CFile::typeText);
        pOutputFile.WriteString(_T("<!DOCTYPE html PUBLIC \"-
        //W3C//DTD XHTML 1.0 Strict//EN\" 
        \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"));
        pOutputFile.WriteString(_T
        ("<html xmlns=\"http://www.w3.org/1999/xhtml\" >\n"));
        pOutputFile.WriteString(_T("<head>\n"));
        pOutputFile.WriteString(_T("<meta http-equiv=\"Content-Type\" 
        content=\"text/html; charset=UTF-8\" />\n"));
        strFileLine.Format(_T
        ("<title>Broken link report for %s</title>\n"), strDomainName);
        pOutputFile.WriteString(strFileLine);
        pOutputFile.WriteString(_T("</head>\n"));
        pOutputFile.WriteString(_T("<body>\n"));
        const int nSize = g_arrBrokenLink.GetSize();
        for (int nIndex = 0; nIndex < nSize; nIndex++)
        {
            const CString strBrokenLink = g_arrBrokenLink.GetAt(nIndex);
            const CString strBrokenPage = g_arrBrokenPage.GetAt(nIndex);
            strFileLine.Format(_T
            ("Broken link %s found on page %s.<br />\n"), strBrokenLink, strBrokenPage);
            pOutputFile.WriteString(strFileLine);
        }
        pOutputFile.WriteString(_T("</body>\n"));
        pOutputFile.WriteString(_T("</html>\n"));
        pOutputFile.Close();

        ::ShellExecute(hWnd, _T("open"), strFileName, NULL, NULL, SW_NORMAL);
    }
    catch (CFileException * pFileException)
    {
        TCHAR lpszError[MAX_STR_LENGTH] = { 0 };
        pFileException->GetErrorMessage(lpszError, MAX_STR_LENGTH);
        pFileException->Delete();
        OutputDebugString(lpszError);
        bRetVal = FALSE;
    }
    return bRetVal;
}

Final Words

The XML Sitemap application uses some components that have been published on The Code Project. Many thanks to:

Ben Hanson for his CFilterEdit class
PJ Naughter for his CVersionInfo class

Further plans: I would like to add support for RSS Sitemap and HTML Sitemap as soon as possible.

History

Version 1.03 (August 3rd, 2014) - Initial release;
Version 1.04 (August 9th, 2015) - Fix critical bug for large websites;
Version 1.05 (March 14th, 2024) - Moved source code from CodeProject to GitHub.