[12074] | 1 | /*******************************************************************************
|
---|
| 2 | * You may amend and distribute as you like, but don't remove this header!
|
---|
| 3 | *
|
---|
| 4 | * EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
|
---|
| 5 | * See http://www.codeplex.com/EPPlus for details.
|
---|
| 6 | *
|
---|
| 7 | * Copyright (C) 2011 Jan Källman
|
---|
| 8 | *
|
---|
| 9 | * This library is free software; you can redistribute it and/or
|
---|
| 10 | * modify it under the terms of the GNU Lesser General Public
|
---|
| 11 | * License as published by the Free Software Foundation; either
|
---|
| 12 | * version 2.1 of the License, or (at your option) any later version.
|
---|
| 13 |
|
---|
| 14 | * This library is distributed in the hope that it will be useful,
|
---|
| 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
---|
| 17 | * See the GNU Lesser General Public License for more details.
|
---|
| 18 | *
|
---|
| 19 | * The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
|
---|
| 20 | * If you are unfamiliar with this license or have questions about it, here is an FAQ: http://www.gnu.org/licenses/gpl-faq.html
|
---|
| 21 | *
|
---|
| 22 | * All code and executables are provided "as is" with no warranty either express or implied.
|
---|
| 23 | * The author accepts no liability for any damage or loss of business that this product may cause.
|
---|
| 24 | *
|
---|
| 25 | * Code change notes:
|
---|
| 26 | *
|
---|
| 27 | * Author Change Date
|
---|
| 28 | * ******************************************************************************
|
---|
| 29 | * Richard Tallent Initial Release 2012-08-13
|
---|
| 30 | *******************************************************************************/
|
---|
| 31 | using System;
|
---|
| 32 | using System.Collections.Generic;
|
---|
| 33 | using System.Text;
|
---|
| 34 | using System.Text.RegularExpressions;
|
---|
| 35 |
|
---|
| 36 | namespace OfficeOpenXml.Style
|
---|
| 37 | {
|
---|
public class ExcelRichTextHtmlUtility
{
    /// <summary>
    /// Matches a BR tag, with or without attributes or a self-closing slash. The word boundary
    /// keeps tags whose names merely start with "br" from being treated as line breaks.
    /// </summary>
    private static readonly Regex LineBreakTagRegex =
        new Regex(@"<br\b[^>]*>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Matches one start or end tag followed by the text up to the NEXT tag or the end of the string.
    /// Group 1 is the tag name (including a leading slash for end tags); group 2 is the text, which
    /// may be empty.
    /// </summary>
    private static readonly Regex TagWithTextRegex =
        new Regex(@"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=</?[a-z]+[^<>]*>|$)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks.
    /// HTML support is limited, and does not include font colors, sizes, or typefaces at this time,
    /// and also does not support CSS style attributes. It does support line breaks using the BR tag.
    ///
    /// This routine parses the HTML into RegEx pairings of an HTML tag and the text until the NEXT
    /// tag (if any). The tag is parsed to determine the setting change to be applied to the last set
    /// of settings, and if the text is not blank, a new block is added to rich text.
    /// </summary>
    /// <param name="range">The range whose value is reset and then replaced with the parsed content.</param>
    /// <param name="html">The HTML to parse into RichText. Null or empty leaves the range blank and not rich text.</param>
    /// <param name="defaultFontName">Font name assigned to the first rich text block.</param>
    /// <param name="defaultFontSize">Font size assigned to the first rich text block.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="range"/> is null.</exception>
    public static void SetRichTextFromHtml(ExcelRange range, string html, string defaultFontName, short defaultFontSize)
    {
        if (range == null)
        {
            throw new ArgumentNullException("range");
        }

        // Reset the cell value, just in case there is an existing RichText value.
        range.Value = "";

        // Nothing to parse: leave the cell blank and make sure it is not flagged as rich text.
        if (String.IsNullOrEmpty(html))
        {
            range.IsRichText = false;
            return;
        }

        // Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/
        // Cells with line breaks aren't necessarily considered rich text, so this is performed
        // before parsing the HTML tags.
        html = LineBreakTagRegex.Replace(html, "\r\n");

        // The rich text block currently being filled; stays null when the HTML contains no tags.
        ExcelRichText currentBlock = null;

        // Walk all pairs of a tag and the text that follows it. This loop only executes if there
        // is at least one start or end tag.
        foreach (Match match in TagWithTextRegex.Matches(html))
        {
            if (currentBlock == null)
            {
                // On the very first match, set up the initial rich text block with the defaults,
                // holding the text that appears BEFORE the first tag (may be 0-length).
                range.IsRichText = true;
                currentBlock = range.RichText.Add(CleanText(html.Substring(0, match.Index)));
                currentBlock.Size = defaultFontSize;
                currentBlock.FontName = defaultFontName;
            }

            // The tag, and the decoded block of text until the NEXT tag or end of string.
            string tag = match.Groups[1].Value;
            string text = CleanText(match.Groups[2].Value);

            if (currentBlock.Text == "")
            {
                // The most recent block wasn't *actually* used last time around, so reuse it.
                // This happens with the first block if the HTML starts with a tag, and later
                // whenever tags come one right after the other.
                currentBlock.Text = text;
            }
            else
            {
                // The current block already has text, so start a new one. Per the original author's
                // note, RichText.Add() carries over the previous block's settings other than
                // vertical alignment.
                currentBlock = range.RichText.Add(text);
            }

            // Override the settings based on the current tag, keep all other settings.
            SetStyleFromTag(tag, currentBlock);
        }

        if (currentBlock == null)
        {
            // No HTML tags were found, so treat this as a normal text value.
            range.IsRichText = false;
            range.Value = CleanText(html);
        }
        else if (String.IsNullOrEmpty(currentBlock.Text))
        {
            // Rich text was found, but the last block contains no text, so remove it. This can
            // happen if, say, the end of the string is an end tag or unsupported tag (common).
            range.RichText.Remove(currentBlock);

            // Failsafe -- the HTML may be just tags with no text, in which case no rich text
            // blocks remain. Turn off rich text and treat this like a blank cell value.
            if (range.RichText.Count == 0)
            {
                range.IsRichText = false;
                range.Value = "";
            }
        }
    }

    /// <summary>
    /// Applies the style change implied by a single HTML tag to a rich text block. Start tags turn a
    /// setting on, end tags (leading slash) turn it off, and unsupported tags change nothing.
    /// </summary>
    /// <param name="tag">The tag name without angle brackets, e.g. "b" or "/strong". Case-insensitive.</param>
    /// <param name="settings">The rich text block whose style properties are updated.</param>
    private static void SetStyleFromTag(string tag, ExcelRichText settings)
    {
        // Tag names are ASCII identifiers, so lower-case them culture-invariantly. A culture-sensitive
        // ToLower() maps "I" to a dotless i under Turkish/Azeri cultures and would miss "I" and "STRIKE".
        switch (tag.ToLowerInvariant())
        {
            case "b":
            case "strong":
                settings.Bold = true;
                break;
            case "i":
            case "em":
                settings.Italic = true;
                break;
            case "u":
                settings.UnderLine = true;
                break;
            case "s":
            case "strike":
                settings.Strike = true;
                break;
            case "sup":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript;
                break;
            case "sub":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript;
                break;
            case "/b":
            case "/strong":
                settings.Bold = false;
                break;
            case "/i":
            case "/em":
                settings.Italic = false;
                break;
            case "/u":
                settings.UnderLine = false;
                break;
            case "/s":
            case "/strike":
                settings.Strike = false;
                break;
            case "/sup":
            case "/sub":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.None;
                break;
            default:
                // unsupported HTML, no style change
                break;
        }
    }

    /// <summary>
    /// Decodes HTML entities in a text fragment and replaces non-breaking spaces with normal spaces.
    /// </summary>
    /// <param name="s">The raw text taken from between HTML tags.</param>
    /// <returns>The decoded text; null or empty input is returned unchanged.</returns>
    private static string CleanText(string s)
    {
        if (String.IsNullOrEmpty(s))
        {
            return s;
        }

        // Need to convert HTML entities (named or numbered) into actual Unicode characters
        s = System.Web.HttpUtility.HtmlDecode(s);
        // Replace any non-breaking spaces with normal spaces; the original author notes they break Excel
        s = s.Replace("\u00A0", " ");
        return s;
    }
}
|
---|
| 200 | }
|
---|