Free cookie consent management tool by TermsFeed Policy Generator

source: branches/Async/HeuristicLab.ExtLibs/HeuristicLab.AvalonEdit/5.0.1/AvalonEdit-5.0.1/Utils/FileReader.cs @ 13401

Last change on this file since 13401 was 11700, checked in by jkarder, 10 years ago

#2077: created branch and added first version

File size: 8.1 KB
Line 
// Copyright (c) 2014 AlphaSierraPapa for the SharpDevelop Team
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
// to whom the Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
// FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
18
19using System;
20using System.IO;
21using System.Text;
22
23namespace ICSharpCode.AvalonEdit.Utils
24{
25  /// <summary>
26  /// Class that can open text files with auto-detection of the encoding.
27  /// </summary>
28  public static class FileReader
29  {
30    /// <summary>
31    /// Gets if the given encoding is a Unicode encoding (UTF).
32    /// </summary>
33    /// <remarks>
34    /// Returns true for UTF-7, UTF-8, UTF-16 LE, UTF-16 BE, UTF-32 LE and UTF-32 BE.
35    /// Returns false for all other encodings.
36    /// </remarks>
37    public static bool IsUnicode(Encoding encoding)
38    {
39      if (encoding == null)
40        throw new ArgumentNullException("encoding");
41      switch (encoding.CodePage) {
42        case 65000: // UTF-7
43        case 65001: // UTF-8
44        case 1200: // UTF-16 LE
45        case 1201: // UTF-16 BE
46        case 12000: // UTF-32 LE
47        case 12001: // UTF-32 BE
48          return true;
49        default:
50          return false;
51      }
52    }
53   
54    static bool IsASCIICompatible(Encoding encoding)
55    {
56      byte[] bytes = encoding.GetBytes("Az");
57      return bytes.Length == 2 && bytes[0] == 'A' && bytes[1] == 'z';
58    }
59   
60    static Encoding RemoveBOM(Encoding encoding)
61    {
62      switch (encoding.CodePage) {
63        case 65001: // UTF-8
64          return UTF8NoBOM;
65        default:
66          return encoding;
67      }
68    }
69   
70    /// <summary>
71    /// Reads the content of the given stream.
72    /// </summary>
73    /// <param name="stream">The stream to read.
74    /// The stream must support seeking and must be positioned at its beginning.</param>
75    /// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
76    /// <returns>The file content as string.</returns>
77    public static string ReadFileContent(Stream stream, Encoding defaultEncoding)
78    {
79      using (StreamReader reader = OpenStream(stream, defaultEncoding)) {
80        return reader.ReadToEnd();
81      }
82    }
83   
84    /// <summary>
85    /// Reads the content of the file.
86    /// </summary>
87    /// <param name="fileName">The file name.</param>
88    /// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
89    /// <returns>The file content as string.</returns>
90    public static string ReadFileContent(string fileName, Encoding defaultEncoding)
91    {
92      using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read)) {
93        return ReadFileContent(fs, defaultEncoding);
94      }
95    }
96   
97    /// <summary>
98    /// Opens the specified file for reading.
99    /// </summary>
100    /// <param name="fileName">The file to open.</param>
101    /// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
102    /// <returns>Returns a StreamReader that reads from the stream. Use
103    /// <see cref="StreamReader.CurrentEncoding"/> to get the encoding that was used.</returns>
104    public static StreamReader OpenFile(string fileName, Encoding defaultEncoding)
105    {
106      if (fileName == null)
107        throw new ArgumentNullException("fileName");
108      FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read);
109      try {
110        return OpenStream(fs, defaultEncoding);
111        // don't use finally: the stream must be kept open until the StreamReader closes it
112      } catch {
113        fs.Dispose();
114        throw;
115      }
116    }
117   
118    /// <summary>
119    /// Opens the specified stream for reading.
120    /// </summary>
121    /// <param name="stream">The stream to open.</param>
122    /// <param name="defaultEncoding">The encoding to use if the encoding cannot be auto-detected.</param>
123    /// <returns>Returns a StreamReader that reads from the stream. Use
124    /// <see cref="StreamReader.CurrentEncoding"/> to get the encoding that was used.</returns>
125    public static StreamReader OpenStream(Stream stream, Encoding defaultEncoding)
126    {
127      if (stream == null)
128        throw new ArgumentNullException("stream");
129      if (stream.Position != 0)
130        throw new ArgumentException("stream is not positioned at beginning.", "stream");
131      if (defaultEncoding == null)
132        throw new ArgumentNullException("defaultEncoding");
133     
134      if (stream.Length >= 2) {
135        // the autodetection of StreamReader is not capable of detecting the difference
136        // between ISO-8859-1 and UTF-8 without BOM.
137        int firstByte = stream.ReadByte();
138        int secondByte = stream.ReadByte();
139        switch ((firstByte << 8) | secondByte) {
140          case 0x0000: // either UTF-32 Big Endian or a binary file; use StreamReader
141          case 0xfffe: // Unicode BOM (UTF-16 LE or UTF-32 LE)
142          case 0xfeff: // UTF-16 BE BOM
143          case 0xefbb: // start of UTF-8 BOM
144            // StreamReader autodetection works
145            stream.Position = 0;
146            return new StreamReader(stream);
147          default:
148            return AutoDetect(stream, (byte)firstByte, (byte)secondByte, defaultEncoding);
149        }
150      } else {
151        if (defaultEncoding != null) {
152          return new StreamReader(stream, defaultEncoding);
153        } else {
154          return new StreamReader(stream);
155        }
156      }
157    }
158   
159    static readonly Encoding UTF8NoBOM = new UTF8Encoding(false);
160   
161    static StreamReader AutoDetect(Stream fs, byte firstByte, byte secondByte, Encoding defaultEncoding)
162    {
163      int max = (int)Math.Min(fs.Length, 500000); // look at max. 500 KB
164      const int ASCII = 0;
165      const int Error = 1;
166      const int UTF8  = 2;
167      const int UTF8Sequence = 3;
168      int state = ASCII;
169      int sequenceLength = 0;
170      byte b;
171      for (int i = 0; i < max; i++) {
172        if (i == 0) {
173          b = firstByte;
174        } else if (i == 1) {
175          b = secondByte;
176        } else {
177          b = (byte)fs.ReadByte();
178        }
179        if (b < 0x80) {
180          // normal ASCII character
181          if (state == UTF8Sequence) {
182            state = Error;
183            break;
184          }
185        } else if (b < 0xc0) {
186          // 10xxxxxx : continues UTF8 byte sequence
187          if (state == UTF8Sequence) {
188            --sequenceLength;
189            if (sequenceLength < 0) {
190              state = Error;
191              break;
192            } else if (sequenceLength == 0) {
193              state = UTF8;
194            }
195          } else {
196            state = Error;
197            break;
198          }
199        } else if (b >= 0xc2 && b < 0xf5) {
200          // beginning of byte sequence
201          if (state == UTF8 || state == ASCII) {
202            state = UTF8Sequence;
203            if (b < 0xe0) {
204              sequenceLength = 1; // one more byte following
205            } else if (b < 0xf0) {
206              sequenceLength = 2; // two more bytes following
207            } else {
208              sequenceLength = 3; // three more bytes following
209            }
210          } else {
211            state = Error;
212            break;
213          }
214        } else {
215          // 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
216          state = Error;
217          break;
218        }
219      }
220      fs.Position = 0;
221      switch (state) {
222        case ASCII:
223          return new StreamReader(fs, IsASCIICompatible(defaultEncoding) ? RemoveBOM(defaultEncoding) : Encoding.ASCII);
224        case Error:
225          // When the file seems to be non-UTF8,
226          // we read it using the user-specified encoding so it is saved again
227          // using that encoding.
228          if (IsUnicode(defaultEncoding)) {
229            // the file is not Unicode, so don't read it using Unicode even if the
230            // user has choosen Unicode as the default encoding.
231           
232            defaultEncoding = Encoding.Default; // use system encoding instead
233          }
234          return new StreamReader(fs, RemoveBOM(defaultEncoding));
235        default:
236          return new StreamReader(fs, UTF8NoBOM);
237      }
238    }
239  }
240}
Note: See TracBrowser for help on using the repository browser.