.NET Framework
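The following method removes all HTML markup from a string. Optionally, it can keep line breaks, which are returned as <br /> tags in the result.
It uses regular expressions, so it requires clean HTML (for example, no unescaped < or > characters inside text or attribute values).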
/// <summary>
/// Removes all HTML tags from a string, optionally keeping line breaks as
/// <c>&lt;br /&gt;</c> tags in the result.
/// </summary>
/// <param name="html">The markup to strip. May be null or empty.</param>
/// <param name="keepLineBreaks">
/// When true, every <c>&lt;br&gt;</c> variant and every closing <c>&lt;/p&gt;</c> tag
/// (any casing, optional whitespace/attributes) is emitted as <c>&lt;br /&gt;</c>.
/// Any <see cref="Environment.NewLine"/> sequence already present in the text is
/// emitted as <c>&lt;br /&gt;</c> as well.
/// </param>
/// <returns>
/// The input with all tags removed; the input itself when it is null or empty.
/// HTML entities are left untouched (no decoding is performed).
/// </returns>
public static string RemoveHtml(string html, bool keepLineBreaks = false)
{
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    if (keepLineBreaks)
    {
        // Swap line-breaking tags for a newline placeholder so they survive the
        // generic tag removal below. Matching is case-insensitive and tolerates
        // whitespace/attributes, so <BR>, <br  />, <br clear="all"> and </P> are
        // kept instead of being silently stripped.
        html = System.Text.RegularExpressions.Regex.Replace(
            html,
            @"<br(?:\s[^>]*)?/?>|</p\s*>",
            Environment.NewLine,
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    }

    // Strip every remaining tag. The static Regex.Replace overload uses the
    // framework's regex cache, so the pattern is not re-parsed on every call.
    html = System.Text.RegularExpressions.Regex.Replace(html, "<[^>]*>", "");

    if (keepLineBreaks)
    {
        // Turn the newline placeholders back into markup line breaks.
        html = html.Replace(Environment.NewLine, "<br />");
    }

    return html;
}