// Copyright (c) 2014 AlphaSierraPapa for the SharpDevelop Team
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this
// software and associated documentation files (the "Software"), to deal in the Software
// without restriction, including without limitation the rights to use, copy, modify, merge,
// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
// to whom the Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or
// substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
// PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
// FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
using System;
using System.IO;
using System.Text;
namespace ICSharpCode.AvalonEdit.Utils
{
///
/// Class that can open text files with auto-detection of the encoding.
///
public static class FileReader
{
///
/// Gets if the given encoding is a Unicode encoding (UTF).
///
///
/// Returns true for UTF-7, UTF-8, UTF-16 LE, UTF-16 BE, UTF-32 LE and UTF-32 BE.
/// Returns false for all other encodings.
///
public static bool IsUnicode(Encoding encoding)
{
if (encoding == null)
throw new ArgumentNullException("encoding");
switch (encoding.CodePage) {
case 65000: // UTF-7
case 65001: // UTF-8
case 1200: // UTF-16 LE
case 1201: // UTF-16 BE
case 12000: // UTF-32 LE
case 12001: // UTF-32 BE
return true;
default:
return false;
}
}
static bool IsASCIICompatible(Encoding encoding)
{
byte[] bytes = encoding.GetBytes("Az");
return bytes.Length == 2 && bytes[0] == 'A' && bytes[1] == 'z';
}
static Encoding RemoveBOM(Encoding encoding)
{
switch (encoding.CodePage) {
case 65001: // UTF-8
return UTF8NoBOM;
default:
return encoding;
}
}
///
/// Reads the content of the given stream.
///
/// The stream to read.
/// The stream must support seeking and must be positioned at its beginning.
/// The encoding to use if the encoding cannot be auto-detected.
/// The file content as string.
public static string ReadFileContent(Stream stream, Encoding defaultEncoding)
{
using (StreamReader reader = OpenStream(stream, defaultEncoding)) {
return reader.ReadToEnd();
}
}
///
/// Reads the content of the file.
///
/// The file name.
/// The encoding to use if the encoding cannot be auto-detected.
/// The file content as string.
public static string ReadFileContent(string fileName, Encoding defaultEncoding)
{
using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read)) {
return ReadFileContent(fs, defaultEncoding);
}
}
///
/// Opens the specified file for reading.
///
/// The file to open.
/// The encoding to use if the encoding cannot be auto-detected.
/// Returns a StreamReader that reads from the stream. Use
/// to get the encoding that was used.
public static StreamReader OpenFile(string fileName, Encoding defaultEncoding)
{
if (fileName == null)
throw new ArgumentNullException("fileName");
FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read);
try {
return OpenStream(fs, defaultEncoding);
// don't use finally: the stream must be kept open until the StreamReader closes it
} catch {
fs.Dispose();
throw;
}
}
///
/// Opens the specified stream for reading.
///
/// The stream to open.
/// The encoding to use if the encoding cannot be auto-detected.
/// Returns a StreamReader that reads from the stream. Use
/// to get the encoding that was used.
public static StreamReader OpenStream(Stream stream, Encoding defaultEncoding)
{
if (stream == null)
throw new ArgumentNullException("stream");
if (stream.Position != 0)
throw new ArgumentException("stream is not positioned at beginning.", "stream");
if (defaultEncoding == null)
throw new ArgumentNullException("defaultEncoding");
if (stream.Length >= 2) {
// the autodetection of StreamReader is not capable of detecting the difference
// between ISO-8859-1 and UTF-8 without BOM.
int firstByte = stream.ReadByte();
int secondByte = stream.ReadByte();
switch ((firstByte << 8) | secondByte) {
case 0x0000: // either UTF-32 Big Endian or a binary file; use StreamReader
case 0xfffe: // Unicode BOM (UTF-16 LE or UTF-32 LE)
case 0xfeff: // UTF-16 BE BOM
case 0xefbb: // start of UTF-8 BOM
// StreamReader autodetection works
stream.Position = 0;
return new StreamReader(stream);
default:
return AutoDetect(stream, (byte)firstByte, (byte)secondByte, defaultEncoding);
}
} else {
if (defaultEncoding != null) {
return new StreamReader(stream, defaultEncoding);
} else {
return new StreamReader(stream);
}
}
}
static readonly Encoding UTF8NoBOM = new UTF8Encoding(false);
static StreamReader AutoDetect(Stream fs, byte firstByte, byte secondByte, Encoding defaultEncoding)
{
int max = (int)Math.Min(fs.Length, 500000); // look at max. 500 KB
const int ASCII = 0;
const int Error = 1;
const int UTF8 = 2;
const int UTF8Sequence = 3;
int state = ASCII;
int sequenceLength = 0;
byte b;
for (int i = 0; i < max; i++) {
if (i == 0) {
b = firstByte;
} else if (i == 1) {
b = secondByte;
} else {
b = (byte)fs.ReadByte();
}
if (b < 0x80) {
// normal ASCII character
if (state == UTF8Sequence) {
state = Error;
break;
}
} else if (b < 0xc0) {
// 10xxxxxx : continues UTF8 byte sequence
if (state == UTF8Sequence) {
--sequenceLength;
if (sequenceLength < 0) {
state = Error;
break;
} else if (sequenceLength == 0) {
state = UTF8;
}
} else {
state = Error;
break;
}
} else if (b >= 0xc2 && b < 0xf5) {
// beginning of byte sequence
if (state == UTF8 || state == ASCII) {
state = UTF8Sequence;
if (b < 0xe0) {
sequenceLength = 1; // one more byte following
} else if (b < 0xf0) {
sequenceLength = 2; // two more bytes following
} else {
sequenceLength = 3; // three more bytes following
}
} else {
state = Error;
break;
}
} else {
// 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
state = Error;
break;
}
}
fs.Position = 0;
switch (state) {
case ASCII:
return new StreamReader(fs, IsASCIICompatible(defaultEncoding) ? RemoveBOM(defaultEncoding) : Encoding.ASCII);
case Error:
// When the file seems to be non-UTF8,
// we read it using the user-specified encoding so it is saved again
// using that encoding.
if (IsUnicode(defaultEncoding)) {
// the file is not Unicode, so don't read it using Unicode even if the
// user has choosen Unicode as the default encoding.
defaultEncoding = Encoding.Default; // use system encoding instead
}
return new StreamReader(fs, RemoveBOM(defaultEncoding));
default:
return new StreamReader(fs, UTF8NoBOM);
}
}
}
}