#region License Information
/* HeuristicLab
 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see .
 */
#endregion

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics.Contracts;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using HeuristicLab.Problems.DataAnalysis;
using DoubleVector = MathNet.Numerics.LinearAlgebra.Vector;

namespace HeuristicLab.Problems.Instances.DataAnalysis {
  public class TableFileParser : Progress { // reports the number of bytes read
    private const int BUFFER_SIZE = 65536;

    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
    private const char WHITESPACECHAR = (char)0;
    private static readonly char[] POSSIBLE_SEPARATORS = new[] { ',', ';', '\t', WHITESPACECHAR };

    private Tokenizer tokenizer;

    // initial capacity for columns, will be set automatically when data is read from a file
    private int estimatedNumberOfLines = 200;

    private Encoding encoding = Encoding.Default;

    // Text encoding used when opening readers; assigning null throws ArgumentNullException.
    public Encoding Encoding {
      get => encoding;
      set => encoding = value ?? throw new ArgumentNullException("Encoding");
    }

    private int rows;
    public int Rows {
      get => rows;
      set => rows = value;
    }

    private int columns;
    public int Columns {
      get => columns;
      set => columns = value;
    }

    // NOTE(review): `List` (like `Progress` and `Vector` above) appears here without type
    // arguments - presumably they were lost before this text reached review; confirm
    // against the compiled source.
    private List values;
    public List Values => values;

    private List variableNames;

    // Returns the stored variable names, or generated names ("X" followed by the
    // zero-padded column index) when no names are stored.
    public IEnumerable VariableNames {
      get {
        if (variableNames.Count > 0) return variableNames;

        var generated = new string[columns];
        for (int idx = 0; idx < generated.Length; idx++) {
          generated[idx] = "X" + idx.ToString("000");
        }
        return generated;
      }
    }

    public TableFileParser() {
      variableNames = new List();
    }

    // Determines the file format first, then checks the first token of the file.
    public bool AreColumnNamesInFirstLine(string fileName) {
      return AreColumnNamesInFirstLine(fileName, DetermineFileFormat(fileName));
    }

    // Checks the first token of the stream using invariant number/date formats and ',' as column separator.
    public bool AreColumnNamesInFirstLine(Stream stream) {
      return AreColumnNamesInFirstLine(stream, new TableFileFormatOptions {
        NumberFormat = NumberFormatInfo.InvariantInfo,
        DateTimeFormat = DateTimeFormatInfo.InvariantInfo,
        ColumnSeparator = ','
      });
    }

    // Opens the file read-only (other processes may still read and write it) and checks its first token.
    public bool AreColumnNamesInFirstLine(string fileName, TableFileFormatOptions formatOptions) {
      using (var file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
        return AreColumnNamesInFirstLine(file, formatOptions);
      }
    }

    // Returns true when the first token is not a Double. Replaces the tokenizer field.
    // The reader is disposed on return, which also closes the passed stream.
    public bool AreColumnNamesInFirstLine(Stream stream, TableFileFormatOptions formatOptions) {
      using (var textReader = new StreamReader(stream, Encoding)) {
        tokenizer = new Tokenizer(textReader, formatOptions);
        var firstTokenType = tokenizer.PeekType();
        return firstTokenType != TokenTypeEnum.Double;
      }
    }

    ///

/// <summary>
/// Parses a file after first determining its format.
/// </summary>

/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">forwarded unchanged to the stream-based Parse overload</param>
/// <param name="lineLimit">maximum number of lines to parse; a negative value (the default) means no limit</param>
public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
  // Delegate to the overload that takes explicit format options: it estimates the
  // line count and opens the file inside a using block, so the file handle is
  // released on every path instead of relying on the callee to dispose the stream.
  // The format is still determined before the line estimate, as before.
  var formatOptions = DetermineFileFormat(fileName);
  Parse(fileName, formatOptions, columnNamesInFirstLine, lineLimit);
}

///

/// <summary>
/// Parses a file with the given formats.
/// </summary>

/// <param name="fileName">file which is parsed</param>
/// <param name="formatOptions">number format, date-time format and separators used while parsing</param>
/// <param name="columnNamesInFirstLine">forwarded unchanged to the stream-based Parse overload</param>
/// <param name="lineLimit">maximum number of lines to parse; a negative value (the default) means no limit</param>
public void Parse(string fileName, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) {
  EstimateNumberOfLines(fileName);
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit);
  }
}

// Guesses the number of rows of a file from the newline density in its first
// 1024 * 1024 characters and stores the guess in estimatedNumberOfLines.
// Keeps the current value when fewer than two newlines are found.
private void EstimateNumberOfLines(string fileName) {
  var len = new System.IO.FileInfo(fileName).Length;
  var buf = new char[1024 * 1024];
  int charsRead;
  using (var reader = new StreamReader(fileName, Encoding)) {
    // ReadBlock returns fewer chars than requested only when the file ends first
    charsRead = reader.ReadBlock(buf, 0, buf.Length);
  }

  int numNewLine = 0;
  // the first line (names) and the last line (incomplete) are not representative
  int charsInCurrentLine = 0, charsInFirstLine = 0;
  // only look at chars that were actually read, not at the unused tail of the buffer
  for (int i = 0; i < charsRead; i++) {
    charsInCurrentLine++;
    if (buf[i] == '\n') {
      if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
      charsInCurrentLine = 0;
      numNewLine++;
    }
  }

  // fail -> keep the default setting
  if (numNewLine <= 1) return;

  // average length of the complete lines after the first one
  double charsPerLineFactor = (charsRead - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
  // NOTE: len is the file size in bytes while the sample is counted in chars,
  // so the estimate runs high for multi-byte encodings
  double estimatedLines = len / charsPerLineFactor;
  // pessimistic allocation of 110% to make sure that the list is very likely large enough
  estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1);
}

///

/// <summary>
/// Takes a Stream and parses it with the default format: NumberFormatInfo.InvariantInfo,
/// DateTimeFormatInfo.InvariantInfo and separator = ','.
/// </summary>

/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">forwarded unchanged to the overload taking format options</param>
/// <param name="lineLimit">maximum number of lines to parse; a negative value (the default) means no limit</param>
public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
  // defaults: invariant number and date-time formats, comma-separated columns
  Parse(
    stream,
    new TableFileFormatOptions {
      NumberFormat = NumberFormatInfo.InvariantInfo,
      DateTimeFormat = DateTimeFormatInfo.InvariantInfo,
      ColumnSeparator = ','
    },
    columnNamesInFirstLine,
    lineLimit);
}

///

/// <summary>
/// Parses a stream with the given formats.
/// </summary>

/// Stream which is parsed /// Format of numbers /// Format of datetime /// defines the separator /// public void Parse(Stream stream, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) { if (lineLimit > 0) estimatedNumberOfLines = lineLimit; using (var reader = new StreamReader(stream)) { tokenizer = new Tokenizer(reader, formatOptions); var strValues = new List>(); values = new List(); Prepare(columnNamesInFirstLine, strValues); int nLinesParsed = 0; int colIdx = 0; while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { tokenizer.Skip(); // all rows have to have the same number of values // the first row defines how many elements are needed if (colIdx > 0 && values.Count != colIdx) { // read at least one value in the row (support for skipping empty lines) Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine + "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "", tokenizer.CurrentLineNumber); } OnReport(tokenizer.BytesRead); nLinesParsed++; colIdx = 0; } else { // read one value TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); if (colIdx == values.Count) { Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine + "Line " + tokenizer.CurrentLineNumber + " has more columns.", "", tokenizer.CurrentLineNumber); } if (!IsColumnTypeCompatible(values[colIdx], type)) { values[colIdx] = strValues[colIdx]; } // add the value to the column AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); if (!(values[colIdx] is List)) { // optimization: don't store the string values in another list if the column is list strValues[colIdx].Add(strVal); } colIdx++; } } } if (!values.Any() || values.First().Count == 0) Error("Couldn't parse data values. 
Probably because of incorrect number format " + "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); this.rows = values.First().Count; this.columns = values.Count; // see if any string column can be converted to vectors if (formatOptions.VectorSeparator != null) { for (int i = 0; i < values.Count; i++) { if (!(values[i] is List stringList)) continue; var strings = new string[stringList.Count][]; var doubles = new double[strings.Length][]; bool allDoubles = true; for (int j = 0; j < strings.Length && allDoubles; j++) { strings[j] = stringList[j].Split(formatOptions.VectorSeparator.Value); doubles[j] = new double[strings[j].Length]; for (int k = 0; k < doubles[j].Length && allDoubles; k++) { allDoubles = double.TryParse(strings[j][k], NumberStyles.Float, formatOptions.NumberFormat, out doubles[j][k]); } } if (allDoubles) { var vectorList = new List(stringList.Count); for (int j = 0; j < doubles.Length; j++) { vectorList.Add(DoubleVector.Build.Dense(doubles[j])); } values[i] = vectorList; } } } // replace lists with undefined type (object) with double-lists for (int i = 0; i < values.Count; i++) { if (values[i] is List