#region License Information
/* HeuristicLab
* Copyright (C) 2002-2014 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Runtime.Serialization;
namespace HeuristicLab.Problems.Instances.DataAnalysis {
public class TableFileParser {
// Buffer size in bytes/chars (64 * 1024); where it is applied is not visible in this part of the file.
private const int BUFFER_SIZE = 65536;
// Placeholder char (NUL) used to symbolize "whitespace" as a separator.
// Note: with whitespace separation, missing values cannot be represented.
private const char WHITESPACECHAR = (char)0;
// Separator candidates: comma, semicolon, tab and the whitespace placeholder.
// NOTE(review): presumably consulted when the file format is auto-detected - confirm in DetermineFileFormat.
private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
// Tokenizer over the current input; a new instance is assigned by the stream-based
// Parse and AreColumnNamesInFirstLine overloads.
private Tokenizer tokenizer;
// Row-wise parse result; Parse derives the row and column counts from this list.
private List> rowValues;
// backing field; Parse assigns the number of parsed rows to it
private int rows;
/// <summary>
/// Gets or sets the number of rows.
/// </summary>
public int Rows {
  get {
    return rows;
  }
  set {
    rows = value;
  }
}
// backing field; Parse assigns the element count of the first parsed row to it
private int columns;
/// <summary>
/// Gets or sets the number of columns.
/// </summary>
public int Columns {
  get {
    return columns;
  }
  set {
    columns = value;
  }
}
// Column-wise storage of the parsed data: Parse creates it and adds one list per column.
// It is not initialized by the constructor.
private List values;
/// <summary>
/// Gets the column-wise parsed data (the list that Parse builds, one entry per column).
/// </summary>
public List Values {
get {
return values;
}
}
// column names stored by the parser; empty when none are available
private List variableNames;
/// <summary>
/// Gets the column names. When no names are stored, placeholder names
/// "X000", "X001", ... are generated, one for each column.
/// </summary>
public IEnumerable VariableNames {
  get {
    if (variableNames.Count > 0) {
      return variableNames;
    }
    // no stored names: fall back to zero-padded, index-based names
    string[] generated = new string[columns];
    int index = 0;
    while (index < generated.Length) {
      generated[index] = "X" + index.ToString("000");
      index++;
    }
    return generated;
  }
}
/// <summary>
/// Creates a parser whose row list and variable-name list start out empty.
/// </summary>
public TableFileParser() {
rowValues = new List>();
variableNames = new List();
}
/// <summary>
/// Checks whether the given file starts with column names. The file format
/// (number format, date/time format and separator) is determined first.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <returns>true if the first token of the file is not a double value</returns>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);
  // the format-aware file overload opens the file (read access, shared read/write) and inspects it
  return AreColumnNamesInFirstLine(fileName, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
}
/// <summary>
/// Checks whether the given stream starts with column names, using the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <returns>true if the first token of the stream is not a double value</returns>
public bool AreColumnNamesInFirstLine(Stream stream) {
  return AreColumnNamesInFirstLine(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',');
}
/// <summary>
/// Checks whether the given file starts with column names, using the given formats.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token of the file is not a double value</returns>
public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  // read-only access; other processes may keep reading and writing the file
  using (FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    bool firstLineHoldsNames = AreColumnNamesInFirstLine(input, numberFormat, dateTimeFormatInfo, separator);
    return firstLineHoldsNames;
  }
}
/// <summary>
/// Checks whether the given stream starts with column names, using the given formats.
/// The first token is peeked; anything other than a double counts as a column name.
/// The stream is wrapped in a StreamReader that is disposed before returning.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token is not a double value</returns>
public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (var textReader = new StreamReader(stream)) {
    // the tokenizer field is replaced as a side effect of this check
    tokenizer = new Tokenizer(textReader, numberFormat, dateTimeFormatInfo, separator);
    var firstTokenType = tokenizer.Peek().type;
    return firstTokenType != TokenTypeEnum.Double;
  }
}
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">forwarded unchanged to the stream-based overload;
/// presumably tells the parser that the first line holds column names</param>
public void Parse(string fileName, bool columnNamesInFirstLine) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  // This method opens the stream, so it also disposes it (same pattern as the
  // format-aware file overload) instead of relying on the callee to close it.
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
  }
}
/// <summary>
/// Parses a file with the given formats
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">forwarded unchanged to the stream-based overload</param>
public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
  // read-only access; other processes may keep reading and writing the file
  FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  try {
    Parse(input, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
  }
  finally {
    // release the file handle whether or not parsing succeeded
    input.Dispose();
  }
}
/// <summary>
/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">forwarded unchanged to the format-aware overload</param>
public void Parse(Stream stream, bool columnNamesInFirstLine) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine);
}
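/// <summary>
/// Parses a stream with the given formats.
/// </summary>
/// <param name="stream">Stream which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">passed on to the internal parse step</param>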
public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
using (StreamReader reader = new StreamReader(stream)) {
tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
// parse the file
Parse(columnNamesInFirstLine);
}
// translate the list of samples into a DoubleMatrixData item
rows = rowValues.Count;
columns = rowValues[0].Count;
values = new List();
//create columns
for (int col = 0; col < columns; col++) {
var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType());
if (!types.Any()) {
values.Add(new List());
continue;
}
var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
if (columnType == typeof(double)) values.Add(new List());
else if (columnType == typeof(DateTime)) values.Add(new List());
else if (columnType == typeof(string)) values.Add(new List());
else throw new InvalidOperationException();
}
//fill with values
foreach (List