Abstract
This article describes how to parse input content and tokenize custom, HTML-like tags embedded in it. The content is an input string that contains our own HTML-like tags.
Introduction
Very often, we need to write code that reads through a piece of content and parses it for self-defined tags. This article explains one approach to doing that. The code is written in C# and uses regular expressions to parse the content.
Source code
ITag
: An interface for defining your own tag. The sample MyLinkTag
class implements it and describes the tag structure that the TagParser
uses when parsing the input content for your tags.
/// <summary>
/// Describes a custom tag that <c>TagParser</c> can search for in content.
/// </summary>
public interface ITag
{
    /// <summary>
    /// Tag name as it appears directly after the opening '&lt;' in the content
    /// (the sample tag uses "link").
    /// </summary>
    string Prefix { get; }

    /// <summary>
    /// When false, <c>TagParser</c> matches the tag with case-insensitive comparison.
    /// </summary>
    bool IsCaseSensitive { get; }

    /// <summary>
    /// A label describing the kind of tag. <c>TagParser</c> does not read it;
    /// it is available to subscribers of the parsed-tag event.
    /// </summary>
    string Type { get; }

    /// <summary>
    /// The attributes the parser looks for inside the tag. The parser writes
    /// the values it finds back into these attribute objects.
    /// </summary>
    AttributeCollection Attributes { get; }
}
TagParser
: The class which does the parsing. It has a single public method named Parse
which takes in the input content to be parsed for your tag. This class raises an event each time a matching tag is parsed, so that a subscriber can inspect the details of the current tag and make decisions based on the tag type and its attribute values.
/// <summary>
/// Finds occurrences of a caller-defined, HTML-like tag in a string. For every
/// occurrence it copies the attribute values found in the markup into the tag's
/// <c>Attributes</c> and then raises <see cref="OnTagParsed"/>.
/// </summary>
public class TagParser
{
    // Matches one HTML-like start or self-closing tag: '<', a name, zero or more
    // attributes written as name="value" or name='value' (no '<' inside the
    // value), optional whitespace, an optional '/', and the closing '>'.
    private const string HTML_TAG_REGEX_PATTERN =
        @"<([A-Za-z_:]|[^\x00-\x7F])([A-Za-z0-9_:.-]|" +
        @"[^\x00-\x7F])*([ \n\t\r]+([A-Za-z_:]|[^\x00-\x7F])" +
        @"([A-Za-z0-9_:.-]|[^\x00-\x7F])*([ \n\t\r]+)?=([ \n\t\r]+)" +
        @"?(""[^<""]*""|'[^<']*'))*([ \n\t\r]+)?/?>";

    // {0} = escaped tag prefix, {1} = '|'-separated attribute patterns.
    private const string CONTENT_TAG_REGEX_PATTERN =
        @"<{0}\b(?>\s+(?:{1})|[^\s>]+|\s+)*>";

    // {0} = escaped attribute name. The single capturing group holds the text
    // between the double quotes.
    private const string CONTENT_TAG_ATTRIBUTE_REGEX_PATTERN =
        @"{0}=""([^""]*)""";

    // Built once by the static initializer, which the runtime runs exactly once,
    // so no hand-written lazy initialization is needed.
    private static readonly System.Text.RegularExpressions.Regex tagRegEx =
        new System.Text.RegularExpressions.Regex(
            HTML_TAG_REGEX_PATTERN,
            System.Text.RegularExpressions.RegexOptions.Singleline
            | System.Text.RegularExpressions.RegexOptions.IgnoreCase
            | System.Text.RegularExpressions.RegexOptions.CultureInvariant
            | System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace);

    // State of the Parse call currently in progress.
    private System.Text.RegularExpressions.Regex contentTagRegex = null;
    private ITag contentTag = null;

    /// <summary>Signature of the handler invoked for each parsed tag.</summary>
    /// <param name="sender">The parser raising the event.</param>
    /// <param name="iTag">The tag passed to <see cref="Parse"/>, with attribute values filled in.</param>
    public delegate void TagParsed(object sender, ITag iTag);

    /// <summary>Raised once for every occurrence of the tag found in the content.</summary>
    public event TagParsed OnTagParsed;

    /// <summary>
    /// Scans <paramref name="content"/> for HTML-like tags and raises
    /// <see cref="OnTagParsed"/> for each one that matches <paramref name="tag"/>.
    /// </summary>
    /// <param name="content">The text to scan.</param>
    /// <param name="tag">Definition of the tag to look for; its attribute values are overwritten per match.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="content"/> or <paramref name="tag"/> is null.
    /// </exception>
    public void Parse(string content, ITag tag)
    {
        if (null == content)
        {
            throw new System.ArgumentNullException("content");
        }
        if (null == tag)
        {
            throw new System.ArgumentNullException("tag");
        }

        contentTag = tag;
        ConstructHTMLTagRegex();

        System.Text.RegularExpressions.MatchCollection mc = tagRegEx.Matches(content);
        foreach (System.Text.RegularExpressions.Match m in mc)
        {
            ReadTag(m);
        }
    }

    // Builds the regex that recognises the current tag and captures one group
    // per declared attribute, in the order the attributes are enumerated.
    private void ConstructHTMLTagRegex()
    {
        System.Text.StringBuilder sbAttributeRegex = new System.Text.StringBuilder();
        foreach (Attribute attribute in contentTag.Attributes)
        {
            // Separator goes between alternatives only; a trailing '|' would
            // add an empty alternative to the group.
            if (sbAttributeRegex.Length > 0)
            {
                sbAttributeRegex.Append('|');
            }

            // Escape the name so regex metacharacters, whitespace and '#'
            // (significant under IgnorePatternWhitespace) are matched literally.
            sbAttributeRegex.AppendFormat(
                CONTENT_TAG_ATTRIBUTE_REGEX_PATTERN,
                System.Text.RegularExpressions.Regex.Escape(attribute.Name ?? string.Empty));
        }

        string tagRegexPattern = string.Format(
            CONTENT_TAG_REGEX_PATTERN,
            System.Text.RegularExpressions.Regex.Escape(contentTag.Prefix ?? string.Empty),
            sbAttributeRegex.ToString());

        System.Text.RegularExpressions.RegexOptions caseOption =
            contentTag.IsCaseSensitive
                ? System.Text.RegularExpressions.RegexOptions.None
                : System.Text.RegularExpressions.RegexOptions.IgnoreCase;

        contentTagRegex = new System.Text.RegularExpressions.Regex(
            tagRegexPattern,
            System.Text.RegularExpressions.RegexOptions.Singleline
            | caseOption
            | System.Text.RegularExpressions.RegexOptions.CultureInvariant
            | System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace);
    }

    // Checks whether one HTML-like tag is an occurrence of the current tag and,
    // if so, reads its attributes.
    private void ReadTag(System.Text.RegularExpressions.Match match)
    {
        if (null == match)
        {
            return;
        }

        System.Text.RegularExpressions.Match matchTag = contentTagRegex.Match(match.Value);
        if (matchTag.Success)
        {
            ReadAttributes(matchTag);
        }
    }

    // Copies captured values into the tag's attributes, then notifies subscribers.
    private void ReadAttributes(System.Text.RegularExpressions.Match match)
    {
        foreach (Attribute attr in contentTag.Attributes)
        {
            // Ordinal is used as the regex group number of this attribute.
            attr.Value = GetGroupCollectionValue(match.Groups, attr.Ordinal);
        }

        // Copy the delegate first so an unsubscribe between the null check and
        // the call cannot cause a NullReferenceException.
        TagParsed handler = OnTagParsed;
        if (null != handler)
        {
            handler(this, contentTag);
        }
    }

    // Returns the first capture of the given group, or an empty string when the
    // group does not exist or did not take part in the match.
    private string GetGroupCollectionValue(
        System.Text.RegularExpressions.GroupCollection gc, int gcIndex)
    {
        // The indexer returns an unsuccessful group for unknown or unmatched
        // indexes, so no exception handling is needed.
        System.Text.RegularExpressions.Group group = gc[gcIndex];
        if (!group.Success)
        {
            return string.Empty;
        }
        return group.Captures[0].Value;
    }
}
Attribute
: The class to define the attributes of your tag.
/// <summary>
/// One attribute of a custom tag: its name, the value read from the markup,
/// and an ordinal.
/// </summary>
public class Attribute
{
    private string attributeName = string.Empty;
    private string attributeValue = string.Empty;
    private ushort attributeOrdinal = 1;

    /// <summary>Attribute name; an empty string until set.</summary>
    public string Name
    {
        get
        {
            return this.attributeName;
        }
        set
        {
            this.attributeName = value;
        }
    }

    /// <summary>Attribute value; an empty string until set.</summary>
    public string Value
    {
        get
        {
            return this.attributeValue;
        }
        set
        {
            this.attributeValue = value;
        }
    }

    /// <summary>Ordinal of the attribute; defaults to 1.</summary>
    public ushort Ordinal
    {
        get
        {
            return this.attributeOrdinal;
        }
        set
        {
            this.attributeOrdinal = value;
        }
    }
}
AttributeCollection
: The class to represent a collection of attributes of your tag.
/// <summary>
/// A list of tag attributes that stamps every attribute passed to
/// <see cref="Add"/> with a running ordinal, starting at 1.
/// </summary>
public class AttributeCollection : System.Collections.Generic.List<Attribute>
{
    // Ordinal given to the next attribute passed to Add.
    private System.UInt16 ordinal = 1;

    /// <summary>
    /// Assigns the next ordinal to <paramref name="attr"/> and appends it to the list.
    /// </summary>
    /// <remarks>
    /// This method hides the base list's Add rather than overriding it, so the
    /// ordinal is assigned only when Add is called through an
    /// <c>AttributeCollection</c>-typed reference.
    /// </remarks>
    /// <param name="attr">The attribute to add.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="attr"/> is null.</exception>
    public new void Add(Attribute attr)
    {
        if (null == attr)
        {
            throw new System.ArgumentNullException("attr");
        }

        attr.Ordinal = ordinal;
        ordinal += 1;
        base.Add(attr);
    }
}
MyLinkTag
: A sample class showing how you can define your own tag to be parsed.
/// <summary>
/// Sample tag definition for markup such as
/// <c>&lt;link href="..." text="..." target="..."/&gt;</c>.
/// </summary>
public sealed class MyLinkTag : ITag
{
    // One collection per instance. The parser writes attribute values into
    // these objects, so sharing a static collection would let separate tag
    // instances overwrite each other's values.
    private readonly AttributeCollection attrCollection = CreateAttributes();

    #region ITag Members

    /// <summary>The tag name: "link".</summary>
    public string Prefix
    {
        get { return "link"; }
    }

    /// <summary>Always false: the tag is matched case-insensitively.</summary>
    public bool IsCaseSensitive
    {
        get { return false; }
    }

    /// <summary>The tag type label: "anchor".</summary>
    public string Type
    {
        get { return "anchor"; }
    }

    /// <summary>The href, text and target attributes, in that order.</summary>
    public AttributeCollection Attributes
    {
        get { return attrCollection; }
    }

    #endregion

    // Builds the attribute list; the order of the Add calls determines each
    // attribute's ordinal.
    private static AttributeCollection CreateAttributes()
    {
        AttributeCollection attributes = new AttributeCollection();

        Attribute attrHref = new Attribute();
        attrHref.Name = "href";
        attributes.Add(attrHref);

        Attribute attrText = new Attribute();
        attrText.Name = "text";
        attributes.Add(attrText);

        Attribute attrTarget = new Attribute();
        attrTarget.Name = "target";
        attributes.Add(attrTarget);

        return attributes;
    }
}
Using the code
/// <summary>
/// Demo entry point: subscribes to the parser's event and parses a sample
/// string containing one link tag.
/// </summary>
public static void Main()
{
    TagParser parser = new TagParser();
    parser.OnTagParsed += tp_OnTagParsed;

    // Verbatim string: the line break is part of the sample content.
    string sampleContent = @"Sample Content <link
href=""http://google.com"" text=""Click to Google""/>";

    parser.Parse(sampleContent, new MyLinkTag());
}
/// <summary>
/// Event handler that prints the tag type and every attribute name/value pair
/// to the console, then waits for a line of input.
/// </summary>
/// <param name="o">The object that raised the event (not used).</param>
/// <param name="itag">The parsed tag whose type and attributes are printed.</param>
private static void tp_OnTagParsed(object o, ITag itag)
{
    System.Text.StringBuilder report = new System.Text.StringBuilder();
    report.Append("Tag Type is :");
    report.Append(itag.Type);
    report.Append("\nAttributes for the tag are;\n");

    foreach (Attribute attr in itag.Attributes)
    {
        report.AppendFormat(
            "\tAttribute Name: {0}\tAttribute Value: {1}\n",
            attr.Name,
            attr.Value);
    }

    Console.WriteLine(report.ToString());
    Console.ReadLine();
}
Prashant Dhavale has around 6 years of experience in the IT industry, of which 5.6 years are on the .NET platform, including ASP.NET, C#, VB.NET, ADO.NET, MS-SQL 2000/2005 and XML/XSLT.