1 | /*******************************************************************************
|
---|
2 | * You may amend and distribute as you like, but don't remove this header!
|
---|
3 | *
|
---|
4 | * EPPlus provides server-side generation of Excel 2007/2010 spreadsheets.
|
---|
5 | * See http://www.codeplex.com/EPPlus for details.
|
---|
6 | *
|
---|
7 | * Copyright (C) 2011 Jan Källman
|
---|
8 | *
|
---|
9 | * This library is free software; you can redistribute it and/or
|
---|
10 | * modify it under the terms of the GNU Lesser General Public
|
---|
11 | * License as published by the Free Software Foundation; either
|
---|
12 | * version 2.1 of the License, or (at your option) any later version.
|
---|
13 |
|
---|
14 | * This library is distributed in the hope that it will be useful,
|
---|
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
---|
17 | * See the GNU Lesser General Public License for more details.
|
---|
18 | *
|
---|
19 | * The GNU Lesser General Public License can be viewed at http://www.opensource.org/licenses/lgpl-license.php
|
---|
20 | * If you unfamiliar with this license or have questions about it, here is an http://www.gnu.org/licenses/gpl-faq.html
|
---|
21 | *
|
---|
22 | * All code and executables are provided "as is" with no warranty either express or implied.
|
---|
23 | * The author accepts no liability for any damage or loss of business that this product may cause.
|
---|
24 | *
|
---|
25 | * Code change notes:
|
---|
26 | *
|
---|
27 | * Author Change Date
|
---|
28 | * ******************************************************************************
|
---|
29 | * Richard Tallent Initial Release 2012-08-13
|
---|
30 | *******************************************************************************/
|
---|
31 | using System;
|
---|
32 | using System.Collections.Generic;
|
---|
33 | using System.Text;
|
---|
34 | using System.Text.RegularExpressions;
|
---|
35 |
|
---|
36 | namespace OfficeOpenXml.Style
|
---|
37 | {
|
---|
public class ExcelRichTextHtmlUtility
{
    // Matches any BR tag (with or without attributes / self-closing slash). Built once and
    // reused; CultureInvariant keeps the case-insensitive match independent of the thread's
    // current culture.
    private static readonly Regex BreakTagPattern = new Regex(
        @"<br[^>]*>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    // Group 1: the tag name, including a leading '/' for end tags.
    // Group 2: the text that follows the tag, up to (not including) the next tag or end of string.
    // CultureInvariant matters here: with culture-sensitive case folding (e.g. Turkish), an
    // upper-case 'I' would not be matched by [a-z], so <I> would not be recognized as a tag.
    private static readonly Regex TagAndTextPattern = new Regex(
        @"<(/?[a-z]+)[^<>]*>([\s\S]*?)(?=</?[a-z]+[^<>]*>|$)",
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    /// <summary>
    /// Provides basic HTML support by converting well-behaved HTML into appropriate RichText blocks.
    /// HTML support is limited, and does not include font colors, sizes, or typefaces at this time,
    /// and also does not support CSS style attributes. It does support line breaks using the BR tag.
    ///
    /// This routine parses the HTML into RegEx pairings of an HTML tag and the text until the NEXT
    /// tag (if any). The tag is parsed to determine the setting change to be applied to the last set
    /// of settings, and if the text is not blank, a new block is added to rich text.
    ///
    /// Supported tags: B/STRONG, I/EM, U, S/STRIKE, SUP, SUB (and their end tags). Any other tag
    /// is stripped from the output and causes no style change.
    /// </summary>
    /// <param name="range">The range whose value is replaced. Its existing value is always cleared first.</param>
    /// <param name="html">The HTML to parse into RichText. Null or empty leaves the range blank and non-rich.</param>
    /// <param name="defaultFontName">Font name assigned to the first rich text block.</param>
    /// <param name="defaultFontSize">Font size assigned to the first rich text block.</param>
    /// <exception cref="ArgumentNullException"><paramref name="range"/> is null.</exception>
    public static void SetRichTextFromHtml(ExcelRange range, string html, string defaultFontName, short defaultFontSize)
    {
        if (range == null)
        {
            throw new ArgumentNullException("range");
        }

        // Reset the cell value, just in case there is an existing RichText value.
        range.Value = "";

        // Sanity check for blank values; nothing to parse.
        if (String.IsNullOrEmpty(html))
        {
            range.IsRichText = false;
            return;
        }

        // Change all BR tags to line breaks. http://epplus.codeplex.com/discussions/238692/
        // Cells with line breaks aren't necessarily considered rich text, so this is performed
        // before parsing the HTML tags.
        html = BreakTagPattern.Replace(html, "\r\n");

        ExcelRichText current = null;

        // Get all pairs of legitimate tags and the text between them. This loop will
        // only execute if there is at least one start or end tag.
        foreach (Match m in TagAndTextPattern.Matches(html))
        {
            if (current == null)
            {
                // On the very first match, set up the initial rich text object with
                // the defaults for the text BEFORE the match (may be 0-length).
                range.IsRichText = true;
                current = range.RichText.Add(CleanText(html.Substring(0, m.Index)));
                current.Size = defaultFontSize;
                current.FontName = defaultFontName;
            }

            // The tag name and the (entity-decoded) block of text until the NEXT tag or end of string.
            string tag = m.Groups[1].Value;
            string text = CleanText(m.Groups[2].Value);

            if (String.IsNullOrEmpty(current.Text))
            {
                // The most recent rich text block wasn't *actually* used last time around, so update
                // the text and keep it as the "current" block. This happens with the first block if
                // the HTML starts with a tag, and may happen later if tags come one right after the other.
                current.Text = text;
            }
            else
            {
                // The current block already has text, so start a new one. Per the original author's
                // note, RichText.Add() carries over the previous block's settings other than
                // vertical alignment.
                current = range.RichText.Add(text);
            }

            // Override the settings based on the current tag, keep all other settings.
            SetStyleFromTag(tag, current);
        }

        if (current == null)
        {
            // No HTML tags were found, so treat this as a normal text value.
            range.IsRichText = false;
            range.Value = CleanText(html);
        }
        else if (String.IsNullOrEmpty(current.Text))
        {
            // Rich text was found, but the last node contains no text, so remove it. This can happen if,
            // say, the end of the string is an end tag or unsupported tag (common).
            range.RichText.Remove(current);

            // Failsafe -- the HTML may be just tags, no text, in which case no rich text blocks
            // remain. If so, turn off rich text and treat this like a blank cell value.
            if (range.RichText.Count == 0)
            {
                range.IsRichText = false;
                range.Value = "";
            }
        }
    }

    /// <summary>
    /// Applies the style change implied by a single HTML tag to a rich text block.
    /// Start tags switch a setting on, end tags (leading '/') switch it off; unknown tags are ignored.
    /// </summary>
    /// <param name="tag">Tag name without angle brackets, e.g. "b" or "/strong"; matched case-insensitively.</param>
    /// <param name="settings">The rich text block to modify.</param>
    private static void SetStyleFromTag(string tag, ExcelRichText settings)
    {
        // Invariant lower-casing: culture-sensitive ToLower() maps "I" to a dotless i under
        // Turkish/Azeri cultures, which would make <I> and <STRIKE> fall through to default.
        switch (tag.ToLowerInvariant())
        {
            case "b":
            case "strong":
                settings.Bold = true;
                break;
            case "i":
            case "em":
                settings.Italic = true;
                break;
            case "u":
                settings.UnderLine = true;
                break;
            case "s":
            case "strike":
                settings.Strike = true;
                break;
            case "sup":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.Superscript;
                break;
            case "sub":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.Subscript;
                break;
            case "/b":
            case "/strong":
                settings.Bold = false;
                break;
            case "/i":
            case "/em":
                settings.Italic = false;
                break;
            case "/u":
                settings.UnderLine = false;
                break;
            case "/s":
            case "/strike":
                settings.Strike = false;
                break;
            case "/sup":
            case "/sub":
                settings.VerticalAlign = ExcelVerticalAlignmentFont.None;
                break;
            default:
                // unsupported HTML, no style change
                break;
        }
    }

    /// <summary>
    /// Decodes HTML entities (named or numbered) into actual characters and replaces
    /// non-breaking spaces (U+00A0) with ordinary spaces.
    /// </summary>
    /// <param name="s">The raw text fragment taken from the HTML; callers in this class never pass null.</param>
    /// <returns>The decoded text.</returns>
    private static string CleanText(string s)
    {
        // Need to convert HTML entities (named or numbered) into actual Unicode characters
        s = System.Web.HttpUtility.HtmlDecode(s);
        // Replace any non-breaking spaces; the original author notes they break Excel
        s = s.Replace("\u00A0", " ");
        return s;
    }

}
|
---|
200 | }
|
---|