1 | // Copyright (c) 2014 AlphaSierraPapa for the SharpDevelop Team |
---|
2 | // |
---|
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this |
---|
4 | // software and associated documentation files (the "Software"), to deal in the Software |
---|
5 | // without restriction, including without limitation the rights to use, copy, modify, merge, |
---|
6 | // publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons |
---|
7 | // to whom the Software is furnished to do so, subject to the following conditions: |
---|
8 | // |
---|
9 | // The above copyright notice and this permission notice shall be included in all copies or |
---|
10 | // substantial portions of the Software. |
---|
11 | // |
---|
12 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, |
---|
13 | // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR |
---|
14 | // PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE |
---|
15 | // FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
---|
16 | // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
17 | // DEALINGS IN THE SOFTWARE. |
---|
18 | |
---|
19 | using System; |
---|
20 | using System.IO; |
---|
21 | using System.Text; |
---|
22 | |
---|
23 | namespace ICSharpCode.AvalonEdit.Utils |
---|
24 | { |
---|
/// <summary>
/// Class that can open text files with auto-detection of the encoding.
/// </summary>
/// <remarks>
/// Streams starting with a byte order mark (or with two zero bytes) are handed to
/// <see cref="StreamReader"/> and its built-in detection. All other streams are scanned
/// (up to <c>MaxDetectionBytes</c> bytes) to classify them as pure ASCII, valid UTF-8,
/// or neither; in the last case the caller-supplied default encoding is used.
/// </remarks>
public static class FileReader
{
    /// <summary>
    /// Maximum number of bytes inspected by the UTF-8 auto-detection (about 500 KB).
    /// </summary>
    const int MaxDetectionBytes = 500000;

    /// <summary>
    /// UTF-8 encoding that does not emit a byte order mark.
    /// </summary>
    static readonly Encoding UTF8NoBOM = new UTF8Encoding(false);

    /// <summary>
    /// Gets if the given encoding is a Unicode encoding (UTF).
    /// </summary>
    /// <remarks>
    /// Returns true for UTF-7, UTF-8, UTF-16 LE, UTF-16 BE, UTF-32 LE and UTF-32 BE.
    /// Returns false for all other encodings.
    /// </remarks>
    /// <param name="encoding">The encoding to test.</param>
    /// <returns>True if the code page of <paramref name="encoding"/> is one of the UTF code pages.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="encoding"/> is null.</exception>
    public static bool IsUnicode(Encoding encoding)
    {
        if (encoding == null)
            throw new ArgumentNullException("encoding");
        switch (encoding.CodePage) {
            case 65000: // UTF-7
            case 65001: // UTF-8
            case 1200: // UTF-16 LE
            case 1201: // UTF-16 BE
            case 12000: // UTF-32 LE
            case 12001: // UTF-32 BE
                return true;
            default:
                return false;
        }
    }

    /// <summary>
    /// Tests whether the encoding maps the ASCII letters 'A' and 'z' to their single-byte
    /// ASCII values. This is used as a cheap probe for "ASCII text is encoded unchanged".
    /// </summary>
    static bool IsASCIICompatible(Encoding encoding)
    {
        byte[] bytes = encoding.GetBytes("Az");
        return bytes.Length == 2 && bytes[0] == 'A' && bytes[1] == 'z';
    }

    /// <summary>
    /// Replaces a UTF-8 encoding (code page 65001) by <see cref="UTF8NoBOM"/>;
    /// every other encoding is returned unchanged.
    /// </summary>
    static Encoding RemoveBOM(Encoding encoding)
    {
        switch (encoding.CodePage) {
            case 65001: // UTF-8
                return UTF8NoBOM;
            default:
                return encoding;
        }
    }

    /// <summary>
    /// Reads the content of the given stream.
    /// </summary>
    /// <param name="stream">The stream to read.
    /// The stream must support seeking and must be positioned at its beginning.</param>
    /// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
    /// <returns>The file content as string.</returns>
    /// <remarks>The stream is closed when this method returns, because the
    /// <see cref="StreamReader"/> wrapping it is disposed.</remarks>
    public static string ReadFileContent(Stream stream, Encoding defaultEncoding)
    {
        using (StreamReader reader = OpenStream(stream, defaultEncoding)) {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Reads the content of the file.
    /// </summary>
    /// <param name="fileName">The file name.</param>
    /// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
    /// <returns>The file content as string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null.</exception>
    public static string ReadFileContent(string fileName, Encoding defaultEncoding)
    {
        // Same validation as OpenFile, so both entry points report the same parameter name.
        if (fileName == null)
            throw new ArgumentNullException("fileName");
        using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read)) {
            return ReadFileContent(fs, defaultEncoding);
        }
    }

    /// <summary>
    /// Opens the specified file for reading.
    /// </summary>
    /// <param name="fileName">The file to open.</param>
    /// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
    /// <returns>Returns a StreamReader that reads from the stream. Use
    /// <see cref="StreamReader.CurrentEncoding"/> to get the encoding that was used.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null.</exception>
    public static StreamReader OpenFile(string fileName, Encoding defaultEncoding)
    {
        if (fileName == null)
            throw new ArgumentNullException("fileName");
        FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read);
        try {
            return OpenStream(fs, defaultEncoding);
            // don't use finally: the stream must be kept open until the StreamReader closes it
        } catch {
            fs.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Opens the specified stream for reading.
    /// </summary>
    /// <param name="stream">The stream to open.
    /// It must support seeking and must be positioned at its beginning.</param>
    /// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
    /// <returns>Returns a StreamReader that reads from the stream. Use
    /// <see cref="StreamReader.CurrentEncoding"/> to get the encoding that was used.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> or
    /// <paramref name="defaultEncoding"/> is null.</exception>
    /// <exception cref="ArgumentException">The stream is not positioned at its beginning.</exception>
    public static StreamReader OpenStream(Stream stream, Encoding defaultEncoding)
    {
        if (stream == null)
            throw new ArgumentNullException("stream");
        if (stream.Position != 0)
            throw new ArgumentException("stream is not positioned at beginning.", "stream");
        if (defaultEncoding == null)
            throw new ArgumentNullException("defaultEncoding");

        if (stream.Length < 2) {
            // Too short to inspect; defaultEncoding was already checked for null above.
            return new StreamReader(stream, defaultEncoding);
        }

        // the autodetection of StreamReader is not capable of detecting the difference
        // between ISO-8859-1 and UTF-8 without BOM.
        int firstByte = stream.ReadByte();
        int secondByte = stream.ReadByte();
        switch ((firstByte << 8) | secondByte) {
            case 0x0000: // either UTF-32 Big Endian or a binary file; use StreamReader
            case 0xfffe: // Unicode BOM (UTF-16 LE or UTF-32 LE)
            case 0xfeff: // UTF-16 BE BOM
            case 0xefbb: // start of UTF-8 BOM
                // StreamReader autodetection works
                stream.Position = 0;
                return new StreamReader(stream);
            default:
                return AutoDetect(stream, (byte)firstByte, (byte)secondByte, defaultEncoding);
        }
    }

    /// <summary>
    /// Scans the beginning of a BOM-less stream and picks an encoding for it.
    /// </summary>
    /// <param name="fs">The stream; the first two bytes have already been consumed by the caller.</param>
    /// <param name="firstByte">The first byte of the stream.</param>
    /// <param name="secondByte">The second byte of the stream.</param>
    /// <param name="defaultEncoding">The encoding to fall back to when the content is not UTF-8.</param>
    /// <returns>A StreamReader positioned at the beginning of the stream.</returns>
    /// <remarks>
    /// The scan is a small state machine over at most <see cref="MaxDetectionBytes"/> bytes:
    /// only bytes below 0x80 seen (ASCII), at least one complete multi-byte sequence seen (UTF8),
    /// inside a multi-byte sequence (UTF8Sequence), or a byte that is invalid at its position (Error).
    /// </remarks>
    static StreamReader AutoDetect(Stream fs, byte firstByte, byte secondByte, Encoding defaultEncoding)
    {
        int max = (int)Math.Min(fs.Length, MaxDetectionBytes); // look at max. 500 KB
        const int ASCII = 0;
        const int Error = 1;
        const int UTF8 = 2;
        const int UTF8Sequence = 3;
        int state = ASCII;
        int sequenceLength = 0; // continuation bytes still expected in the current sequence
        byte b;
        for (int i = 0; i < max; i++) {
            if (i == 0) {
                b = firstByte;
            } else if (i == 1) {
                b = secondByte;
            } else {
                // ReadByte returns -1 at end of stream; in the default unchecked context the
                // cast turns that into 0xff, which is rejected as invalid below.
                b = (byte)fs.ReadByte();
            }
            if (b < 0x80) {
                // normal ASCII character
                if (state == UTF8Sequence) {
                    state = Error;
                    break;
                }
            } else if (b < 0xc0) {
                // 10xxxxxx : continues UTF8 byte sequence
                if (state == UTF8Sequence) {
                    --sequenceLength;
                    if (sequenceLength < 0) {
                        state = Error;
                        break;
                    } else if (sequenceLength == 0) {
                        state = UTF8;
                    }
                } else {
                    state = Error;
                    break;
                }
            } else if (b >= 0xc2 && b < 0xf5) {
                // beginning of byte sequence
                if (state == UTF8 || state == ASCII) {
                    state = UTF8Sequence;
                    if (b < 0xe0) {
                        sequenceLength = 1; // one more byte following
                    } else if (b < 0xf0) {
                        sequenceLength = 2; // two more bytes following
                    } else {
                        sequenceLength = 3; // three more bytes following
                    }
                } else {
                    state = Error;
                    break;
                }
            } else {
                // 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
                state = Error;
                break;
            }
        }
        if (state == UTF8Sequence && max == fs.Length) {
            // The whole stream was scanned and it ends inside a multi-byte sequence, so it is
            // not valid UTF-8 (typical for single-byte encoded text ending in a character
            // >= 0xc2). When the scan merely stopped at the size limit we cannot tell, and
            // the stream keeps being treated as UTF-8.
            state = Error;
        }
        fs.Position = 0;
        switch (state) {
            case ASCII:
                // Only ASCII bytes were seen: keep the default encoding if it encodes ASCII
                // text unchanged, otherwise read as plain ASCII.
                return new StreamReader(fs, IsASCIICompatible(defaultEncoding) ? RemoveBOM(defaultEncoding) : Encoding.ASCII);
            case Error:
                // When the file seems to be non-UTF8,
                // we read it using the user-specified encoding so it is saved again
                // using that encoding.
                if (IsUnicode(defaultEncoding)) {
                    // the file is not Unicode, so don't read it using Unicode even if the
                    // user has chosen Unicode as the default encoding.

                    defaultEncoding = Encoding.Default; // use system encoding instead
                }
                return new StreamReader(fs, RemoveBOM(defaultEncoding));
            default:
                return new StreamReader(fs, UTF8NoBOM);
        }
    }
}
---|
240 | } |
---|