#region License Information /* HeuristicLab * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections; using System.Collections.Generic; using System.Diagnostics.Contracts; using System.Globalization; using System.IO; using System.Linq; using System.Text; using HeuristicLab.Problems.DataAnalysis; using DoubleVector = MathNet.Numerics.LinearAlgebra.Vector; namespace HeuristicLab.Problems.Instances.DataAnalysis { public class TableFileParser : Progress { // reports the number of bytes read private const int BUFFER_SIZE = 65536; // char used to symbolize whitespaces (no missing values can be handled with whitespaces) private const char WHITESPACECHAR = (char)0; private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR }; private Tokenizer tokenizer; private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file private Encoding encoding = Encoding.Default; public Encoding Encoding { get { return encoding; } set { if (value == null) throw new ArgumentNullException("Encoding"); encoding = value; } } private int rows; public int Rows { get { return rows; } set { rows = value; } } private int columns; public int Columns { get { return columns; } set { columns = value; } } private List values; public List Values { get { return values; } } private List variableNames; public IEnumerable VariableNames { get { if (variableNames.Count > 0) return variableNames; else { string[] names = new string[columns]; for (int i = 0; i < names.Length; i++) { names[i] = "X" + i.ToString("000"); } return names; } } } public TableFileParser() { variableNames = new List(); } public bool AreColumnNamesInFirstLine(string fileName) { var formatOptions = DetermineFileFormat(fileName); using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { return AreColumnNamesInFirstLine(stream, formatOptions); } } public bool AreColumnNamesInFirstLine(Stream stream) { var formatOptions = new TableFileFormatOptions { NumberFormat = NumberFormatInfo.InvariantInfo, DateTimeFormat = DateTimeFormatInfo.InvariantInfo, ColumnSeparator = ',' }; return AreColumnNamesInFirstLine(stream, formatOptions); } public bool AreColumnNamesInFirstLine(string fileName, TableFileFormatOptions formatOptions) { using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { return AreColumnNamesInFirstLine(stream, formatOptions); } } public bool AreColumnNamesInFirstLine(Stream stream, TableFileFormatOptions formatOptions) { using (StreamReader reader = new StreamReader(stream, Encoding)) { tokenizer = new Tokenizer(reader, formatOptions); return (tokenizer.PeekType() != TokenTypeEnum.Double); } } /// /// Parses a file and determines the format first /// /// file which is parsed /// public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) { var formatOptions = DetermineFileFormat(fileName); EstimateNumberOfLines(fileName); Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), formatOptions, columnNamesInFirstLine, lineLimit); } /// /// Parses a file with the given formats /// /// file which is parsed /// Format of numbers /// Format of datetime /// defines the separator /// public void Parse(string fileName, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) { EstimateNumberOfLines(fileName); using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit); } } // determines the number of newline characters in the first 64KB to guess the number of rows for a file private void EstimateNumberOfLines(string fileName) { var len = new System.IO.FileInfo(fileName).Length; var buf = new char[1024 * 1024]; using (var reader = new StreamReader(fileName, Encoding)) { reader.ReadBlock(buf, 0, buf.Length); } int numNewLine = 0; int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative foreach (var ch in buf) { charsInCurrentLine++; if (ch == '\n') { if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line charsInCurrentLine = 0; numNewLine++; } } if (numNewLine <= 1) { // fail -> keep the default setting return; } else { double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1); double estimatedLines = len / charsPerLineFactor; estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough } } /// /// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ',' /// /// stream which is parsed /// public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) { var formatOptions = new TableFileFormatOptions { NumberFormat = NumberFormatInfo.InvariantInfo, DateTimeFormat = DateTimeFormatInfo.InvariantInfo, ColumnSeparator = ',' }; Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit); } /// /// Parses a stream with the given formats. /// /// Stream which is parsed /// Format of numbers /// Format of datetime /// defines the separator /// public void Parse(Stream stream, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) { if (lineLimit > 0) estimatedNumberOfLines = lineLimit; using (var reader = new StreamReader(stream)) { tokenizer = new Tokenizer(reader, formatOptions); var strValues = new List>(); values = new List(); Prepare(columnNamesInFirstLine, strValues); int nLinesParsed = 0; int colIdx = 0; while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { tokenizer.Skip(); // all rows have to have the same number of values // the first row defines how many elements are needed if (colIdx > 0 && values.Count != colIdx) { // read at least one value in the row (support for skipping empty lines) Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine + "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "", tokenizer.CurrentLineNumber); } OnReport(tokenizer.BytesRead); nLinesParsed++; colIdx = 0; } else { // read one value TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); if (colIdx == values.Count) { Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine + "Line " + tokenizer.CurrentLineNumber + " has more columns.", "", tokenizer.CurrentLineNumber); } if (!IsColumnTypeCompatible(values[colIdx], type)) { values[colIdx] = strValues[colIdx]; } // add the value to the column AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); if (!(values[colIdx] is List)) { // optimization: don't store the string values in another list if the column is list strValues[colIdx].Add(strVal); } colIdx++; } } } if (!values.Any() || values.First().Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format " + "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); this.rows = values.First().Count; this.columns = values.Count; // see if any string column can be converted to vectors if (formatOptions.VectorSeparator != null) { for (int i = 0; i < values.Count; i++) { if (!(values[i] is List stringList)) continue; var strings = new string[stringList.Count][]; var doubles = new double[strings.Length][]; bool allDoubles = true; for (int j = 0; j < strings.Length && allDoubles; j++) { strings[j] = stringList[j].Split(formatOptions.VectorSeparator.Value); doubles[j] = new double[strings[j].Length]; for (int k = 0; k < doubles[j].Length && allDoubles; k++) { allDoubles = double.TryParse(strings[j][k], NumberStyles.Float, formatOptions.NumberFormat, out doubles[j][k]); } } if (allDoubles) { var vectorList = new List(stringList.Count); for (int j = 0; j < doubles.Length; j++) { vectorList.Add(DoubleVector.Build.Dense(doubles[j])); } values[i] = vectorList; } } } // replace lists with undefined type (object) with double-lists for (int i = 0; i < values.Count; i++) { if (values[i] is List) { values[i] = Enumerable.Repeat(double.NaN, rows).ToList(); } } // after everything has been parsed make sure the lists are as compact as possible foreach (var l in values) { var dblList = l as List; var byteList = l as List; var dateList = l as List; var stringList = l as List; var objList = l as List; var vecList = l as List; if (dblList != null) dblList.TrimExcess(); if (byteList != null) byteList.TrimExcess(); if (dateList != null) dateList.TrimExcess(); if (stringList != null) stringList.TrimExcess(); if (objList != null) objList.TrimExcess(); if (vecList != null) vecList.TrimExcess(); } // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction) GC.Collect(2, GCCollectionMode.Forced); } private void Prepare(bool columnNamesInFirstLine, List> strValues) { if (columnNamesInFirstLine) { ParseVariableNames(); if (!tokenizer.HasNext()) Error( "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); } // read first line to determine types and allocate specific lists // read values... start in first row int colIdx = 0; while (tokenizer.PeekType() != TokenTypeEnum.NewLine) { // read one value TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); // initialize column values.Add(CreateList(type, estimatedNumberOfLines)); if (type == TokenTypeEnum.String) strValues.Add(new List(0)); // optimization: don't store the string values in another list if the column is list else strValues.Add(new List(estimatedNumberOfLines)); AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); if (type != TokenTypeEnum.String) strValues[colIdx].Add(strVal); colIdx++; } tokenizer.Skip(); // skip newline } #region type-dependent dispatch private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) { return (list is List) || // unknown lists are compatible to everything (potential conversion) (list is List) || // all tokens can be added to a string list (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns (tokenType == TokenTypeEnum.Double && list is List) || (tokenType == TokenTypeEnum.DateTime && list is List); } // all columns are converted to string columns when we find an non-empty value that has incorrect type private IList ConvertToStringColumn(IList list) { var dblL = list as List; if (dblL != null) { var l = new List(dblL.Capacity); l.AddRange(dblL.Select(dbl => dbl.ToString())); return l; } var dtL = list as List; if (dtL != null) { var l = new List(dtL.Capacity); l.AddRange(dtL.Select(dbl => dbl.ToString())); return l; } if (list is List) return list; throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType())); } private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) { // Add value if list has a defined type var dblList = list as List; if (dblList != null) { AddValue(type, dblList, dblVal); return; } var strList = list as List; if (strList != null) { AddValue(type, strList, strVal); return; } var dtList = list as List; if (dtList != null) { AddValue(type, dtList, dateTimeVal); return; } // Undefined list-type if (type == TokenTypeEnum.Missing) { // add null to track number of missing values list.Add(null); } else { // first non-missing value for undefined list-type var newList = ConvertList(type, list, estimatedNumberOfLines); // replace list var idx = values.IndexOf(list); values[idx] = newList; // recursively call AddValue AddValue(type, newList, strVal, dblVal, dateTimeVal); } } private static void AddValue(TokenTypeEnum type, List list, double dblVal) { Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double); list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal); } private static void AddValue(TokenTypeEnum type, List list, string strVal) { // assumes that strVal is always set to the original token read from the input file list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal); } private static void AddValue(TokenTypeEnum type, List list, DateTime dtVal) { Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime); list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal); } private static IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) { switch (type) { case TokenTypeEnum.String: return new List(estimatedNumberOfLines); case TokenTypeEnum.Double: return new List(estimatedNumberOfLines); case TokenTypeEnum.DateTime: return new List(estimatedNumberOfLines); case TokenTypeEnum.Missing: // List represent list of unknown type return new List(estimatedNumberOfLines); default: throw new InvalidOperationException(); } } private static IList ConvertList(TokenTypeEnum type, IList list, int estimatedNumberOfLines) { var newList = CreateList(type, estimatedNumberOfLines); object missingValue = GetMissingValue(type); for (int i = 0; i < list.Count; i++) newList.Add(missingValue); return newList; } private static object GetMissingValue(TokenTypeEnum type) { switch (type) { case TokenTypeEnum.String: return string.Empty; case TokenTypeEnum.Double: return double.NaN; case TokenTypeEnum.DateTime: return DateTime.MinValue; default: throw new ArgumentOutOfRangeException("type", type, "No missing value defined"); } } #endregion public static TableFileFormatOptions DetermineFileFormat(string path) { return DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)); } public static TableFileFormatOptions DetermineFileFormat(Stream stream) { using (StreamReader reader = new StreamReader(stream)) { // skip first line reader.ReadLine(); // read a block char[] buffer = new char[BUFFER_SIZE]; int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE); // count frequency of special characters Dictionary charCounts = buffer.Take(charsRead) .GroupBy(c => c) .ToDictionary(g => g.Key, g => g.Count()); // depending on the characters occuring in the block // we distinghish a number of different cases based on the the following rules: // many points => it must be English number format, the other frequently occuring char is the separator // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator // => check the line in more detail: // English: 0, 0, 0, 0 // German: 0,0 0,0 0,0 ... // => if commas are followed by space => English format // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators) if (OccurrencesOf(charCounts, '.') > 10) { return new TableFileFormatOptions { NumberFormat = NumberFormatInfo.InvariantInfo, DateTimeFormat = DateTimeFormatInfo.InvariantInfo, ColumnSeparator = POSSIBLE_SEPARATORS .Where(c => OccurrencesOf(charCounts, c) > 10) .OrderBy(c => -OccurrencesOf(charCounts, c)) .DefaultIfEmpty(' ') .First() }; } else if (OccurrencesOf(charCounts, ',') > 10) { // no points and many commas // count the number of tokens (chains of only digits and commas) that contain multiple comma characters int tokensWithMultipleCommas = 0; for (int i = 0; i < charsRead; i++) { int nCommas = 0; while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) { if (buffer[i] == ',') nCommas++; i++; } if (nCommas > 2) tokensWithMultipleCommas++; } if (tokensWithMultipleCommas > 1) { // English format (only integer values) with ',' as separator return new TableFileFormatOptions { NumberFormat = NumberFormatInfo.InvariantInfo, DateTimeFormat = DateTimeFormatInfo.InvariantInfo, ColumnSeparator = ',' }; } else { char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail // German format (real values) return new TableFileFormatOptions { NumberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")), DateTimeFormat = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")), ColumnSeparator = POSSIBLE_SEPARATORS .Except(disallowedSeparators) .Where(c => OccurrencesOf(charCounts, c) > 10) .OrderBy(c => -OccurrencesOf(charCounts, c)) .DefaultIfEmpty(' ') .First() }; } } else { // no points and no commas => English format return new TableFileFormatOptions { NumberFormat = NumberFormatInfo.InvariantInfo, DateTimeFormat = DateTimeFormatInfo.InvariantInfo, ColumnSeparator = POSSIBLE_SEPARATORS .Where(c => OccurrencesOf(charCounts, c) > 10) .OrderBy(c => -OccurrencesOf(charCounts, c)) .DefaultIfEmpty(' ') .First() }; } } } private static int OccurrencesOf(Dictionary charCounts, char c) { return charCounts.ContainsKey(c) ? charCounts[c] : 0; } #region tokenizer // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character internal enum TokenTypeEnum { NewLine, String, Double, DateTime, Missing } internal class Tokenizer { private StreamReader reader; // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary) private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024]; private string[] stringVals = new string[1024]; private double[] doubleVals = new double[1024]; private DateTime[] dateTimeVals = new DateTime[1024]; private int tokenPos; private int numTokens; private NumberFormatInfo numberFormatInfo; private DateTimeFormatInfo dateTimeFormatInfo; private char separator; // arrays for string.Split() private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default private readonly char[] separators; private int currentLineNumber = 0; public int CurrentLineNumber { get { return currentLineNumber; } private set { currentLineNumber = value; } } private string currentLine; public string CurrentLine { get { return currentLine; } private set { currentLine = value; } } public long BytesRead { get; private set; } public Tokenizer(StreamReader reader, TableFileFormatOptions formatOptions) { this.reader = reader; this.numberFormatInfo = formatOptions.NumberFormat; this.dateTimeFormatInfo = formatOptions.DateTimeFormat; this.separator = formatOptions.ColumnSeparator; this.separators = new char[] { separator }; ReadNextTokens(); } public bool HasNext() { return numTokens > tokenPos || !reader.EndOfStream; } public TokenTypeEnum PeekType() { return tokenTypes[tokenPos]; } public void Skip() { // simply skips one token without returning the result values tokenPos++; if (numTokens == tokenPos) { ReadNextTokens(); } } public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) { type = tokenTypes[tokenPos]; strVal = stringVals[tokenPos]; dblVal = doubleVals[tokenPos]; dateTimeVal = dateTimeVals[tokenPos]; Skip(); } private void ReadNextTokens() { if (!reader.EndOfStream) { CurrentLine = reader.ReadLine(); CurrentLineNumber++; if (reader.BaseStream.CanSeek) { BytesRead = reader.BaseStream.Position; } else { BytesRead += CurrentLine.Length + 2; // guess } int i = 0; if (!string.IsNullOrWhiteSpace(CurrentLine)) { foreach (var tok in Split(CurrentLine)) { TokenTypeEnum type; double doubleVal; DateTime dateTimeValue; type = TokenTypeEnum.String; // default stringVals[i] = tok.Trim(); if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) { type = TokenTypeEnum.Double; doubleVals[i] = doubleVal; } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue) && (dateTimeValue.Year > 1 || dateTimeValue.Month > 1 || dateTimeValue.Day > 1)// if no date is given it is returned as 1.1.0001 -> don't allow this ) { type = TokenTypeEnum.DateTime; dateTimeVals[i] = dateTimeValue; } else if (string.IsNullOrWhiteSpace(tok)) { type = TokenTypeEnum.Missing; } // couldn't parse the token as an int or float number or datetime value so return a string token tokenTypes[i] = type; i++; if (i >= tokenTypes.Length) { // increase buffer size if necessary IncreaseCapacity(ref tokenTypes); IncreaseCapacity(ref doubleVals); IncreaseCapacity(ref stringVals); IncreaseCapacity(ref dateTimeVals); } } } tokenTypes[i] = TokenTypeEnum.NewLine; numTokens = i + 1; tokenPos = 0; } } private IEnumerable Split(string line) { return separator == WHITESPACECHAR ? line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) : line.Split(separators); } private static void IncreaseCapacity(ref T[] arr) { int n = (int)Math.Floor(arr.Length * 1.7); // guess T[] arr2 = new T[n]; Array.Copy(arr, arr2, arr.Length); arr = arr2; } } #endregion #region parsing private void ParseVariableNames() { // the first line must contain variable names List varNames = new List(); TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); // the first token must be a variable name if (type != TokenTypeEnum.String) throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type); varNames.Add(strVal); while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) { tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); varNames.Add(strVal); } ExpectType(TokenTypeEnum.NewLine); variableNames = varNames; } private void ExpectType(TokenTypeEnum expectedToken) { if (tokenizer.PeekType() != expectedToken) throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType()); tokenizer.Skip(); } private void Error(string message, string token, int lineNumber) { throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber)); } #endregion } }