Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
01/31/20 12:42:47 (4 years ago)
Author:
pfleck
Message:

#3040 Started adding UCI time series regression benchmarks.
Adapted parser (extracted format options & added parsing for double vectors).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r17180 r17414  
    2020#endregion
    2121
    22 
    2322using System;
    2423using System.Collections;
     
    2928using System.Linq;
    3029using System.Text;
     30using HeuristicLab.Problems.DataAnalysis;
    3131
    3232namespace HeuristicLab.Problems.Instances.DataAnalysis {
     
    8989
    9090    public bool AreColumnNamesInFirstLine(string fileName) {
    91       NumberFormatInfo numberFormat;
    92       DateTimeFormatInfo dateTimeFormatInfo;
    93       char separator;
    94       DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
     91      var formatOptions = DetermineFileFormat(fileName);
    9592      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    96         return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
     93        return AreColumnNamesInFirstLine(stream, formatOptions);
    9794      }
    9895    }
    9996
    10097    public bool AreColumnNamesInFirstLine(Stream stream) {
    101       NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
    102       DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    103       char separator = ',';
    104       return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
    105     }
    106 
    107     public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
    108                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
     98      var formatOptions = new TableFileFormatOptions {
     99        NumberFormat = NumberFormatInfo.InvariantInfo,
     100        DateTimeFormat = DateTimeFormatInfo.InvariantInfo,
     101        ColumnSeparator = ','
     102      };
     103      return AreColumnNamesInFirstLine(stream, formatOptions);
     104    }
     105
     106    public bool AreColumnNamesInFirstLine(string fileName, TableFileFormatOptions formatOptions) {
    109107      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    110         return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
    111       }
    112     }
    113 
    114     public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
    115                                           DateTimeFormatInfo dateTimeFormatInfo, char separator) {
     108        return AreColumnNamesInFirstLine(stream, formatOptions);
     109      }
     110    }
     111
     112    public bool AreColumnNamesInFirstLine(Stream stream, TableFileFormatOptions formatOptions) {
    116113      using (StreamReader reader = new StreamReader(stream, Encoding)) {
    117         tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
     114        tokenizer = new Tokenizer(reader, formatOptions);
    118115        return (tokenizer.PeekType() != TokenTypeEnum.Double);
    119116      }
     
    126123    /// <param name="columnNamesInFirstLine"></param>
    127124    public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
    128       NumberFormatInfo numberFormat;
    129       DateTimeFormatInfo dateTimeFormatInfo;
    130       char separator;
    131       DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
     125      var formatOptions = DetermineFileFormat(fileName);
    132126      EstimateNumberOfLines(fileName);
    133       Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
     127      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), formatOptions, columnNamesInFirstLine, lineLimit);
    134128    }
    135129
     
    142136    /// <param name="separator">defines the separator</param>
    143137    /// <param name="columnNamesInFirstLine"></param>
    144     public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
     138    public void Parse(string fileName, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) {
    145139      EstimateNumberOfLines(fileName);
    146140      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    147         Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
     141        Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit);
    148142      }
    149143    }
     
    182176    /// <param name="columnNamesInFirstLine"></param>
    183177    public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
    184       NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
    185       DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    186       char separator = ',';
    187       Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
     178      var formatOptions = new TableFileFormatOptions {
     179        NumberFormat = NumberFormatInfo.InvariantInfo,
     180        DateTimeFormat = DateTimeFormatInfo.InvariantInfo,
     181        ColumnSeparator = ','
     182      };
     183      Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit);
    188184    }
    189185
     
    196192    /// <param name="separator">defines the separator</param>
    197193    /// <param name="columnNamesInFirstLine"></param>
    198     public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
     194    public void Parse(Stream stream, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) {
    199195      if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
    200196
    201197      using (var reader = new StreamReader(stream)) {
    202         tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
     198        tokenizer = new Tokenizer(reader, formatOptions);
    203199        var strValues = new List<List<string>>();
    204200        values = new List<IList>();
     
    257253      this.columns = values.Count;
    258254
     255      // see if any string column can be converted to vectors
     256      if (formatOptions.VectorSeparator != null) {
     257        for (int i = 0; i < values.Count; i++) {
     258          if (!(values[i] is List<string> stringList)) continue;
     259
     260          var strings = new string[stringList.Count][];
     261          var doubles = new double[strings.Length][];
     262          bool allDoubles = true;
     263          for (int j = 0; j < strings.Length && allDoubles; j++) {
     264            strings[j] = stringList[j].Split(formatOptions.VectorSeparator.Value);
     265            doubles[j] = new double[strings[j].Length];
     266            for (int k = 0; k < doubles[j].Length && allDoubles; k++) {
     267              allDoubles = double.TryParse(strings[j][k], NumberStyles.Float, formatOptions.NumberFormat, out doubles[j][k]);
     268            }
     269          }
     270
     271          if (allDoubles) {
     272            var vectorList = new List<DoubleVector>(stringList.Count);
     273            for (int j = 0; j < doubles.Length; j++) {
     274              vectorList.Add(new DoubleVector(doubles[j]));
     275            }
     276
     277            values[i] = vectorList;
     278          }
     279        }
     280      }
     281
    259282      // replace lists with undefined type (object) with double-lists
    260283      for (int i = 0; i < values.Count; i++) {
     
    271294        var stringList = l as List<string>;
    272295        var objList = l as List<object>;
     296        var vecList = l as List<DoubleVector>;
    273297        if (dblList != null) dblList.TrimExcess();
    274298        if (byteList != null) byteList.TrimExcess();
     
    276300        if (stringList != null) stringList.TrimExcess();
    277301        if (objList != null) objList.TrimExcess();
     302        if (vecList != null) vecList.TrimExcess();
    278303      }
    279304
     
    422447    #endregion
    423448
    424     public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
    425       DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
    426     }
    427 
    428     public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
     449    public static TableFileFormatOptions DetermineFileFormat(string path) {
     450      return DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));
     451    }
     452
     453    public static TableFileFormatOptions DetermineFileFormat(Stream stream) {
    429454      using (StreamReader reader = new StreamReader(stream)) {
    430455        // skip first line
     
    449474        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur in addition to separators)
    450475        if (OccurrencesOf(charCounts, '.') > 10) {
    451           numberFormat = NumberFormatInfo.InvariantInfo;
    452           dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    453           separator = POSSIBLE_SEPARATORS
    454             .Where(c => OccurrencesOf(charCounts, c) > 10)
    455             .OrderBy(c => -OccurrencesOf(charCounts, c))
    456             .DefaultIfEmpty(' ')
    457             .First();
     476          return new TableFileFormatOptions {
     477            NumberFormat = NumberFormatInfo.InvariantInfo,
     478            DateTimeFormat = DateTimeFormatInfo.InvariantInfo,
     479            ColumnSeparator = POSSIBLE_SEPARATORS
     480              .Where(c => OccurrencesOf(charCounts, c) > 10)
     481              .OrderBy(c => -OccurrencesOf(charCounts, c))
     482                .DefaultIfEmpty(' ')
     483              .First()
     484          };
    458485        } else if (OccurrencesOf(charCounts, ',') > 10) {
    459486          // no points and many commas
     
    470497          if (tokensWithMultipleCommas > 1) {
    471498            // English format (only integer values) with ',' as separator
    472             numberFormat = NumberFormatInfo.InvariantInfo;
    473             dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    474             separator = ',';
     499            return new TableFileFormatOptions {
     500              NumberFormat = NumberFormatInfo.InvariantInfo,
     501              DateTimeFormat = DateTimeFormatInfo.InvariantInfo,
     502              ColumnSeparator = ','
     503            };
    475504          } else {
    476505            char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed too; however, existing unit tests would fail
    477506            // German format (real values)
    478             numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
    479             dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
    480             separator = POSSIBLE_SEPARATORS
    481               .Except(disallowedSeparators)
     507            return new TableFileFormatOptions {
     508              NumberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")),
     509              DateTimeFormat = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")),
     510              ColumnSeparator = POSSIBLE_SEPARATORS
     511                .Except(disallowedSeparators)
     512                .Where(c => OccurrencesOf(charCounts, c) > 10)
     513                .OrderBy(c => -OccurrencesOf(charCounts, c))
     514                .DefaultIfEmpty(' ')
     515                .First()
     516            };
     517          }
     518        } else {
     519          // no points and no commas => English format
     520          return new TableFileFormatOptions {
     521            NumberFormat = NumberFormatInfo.InvariantInfo,
     522            DateTimeFormat = DateTimeFormatInfo.InvariantInfo,
     523            ColumnSeparator = POSSIBLE_SEPARATORS
    482524              .Where(c => OccurrencesOf(charCounts, c) > 10)
    483525              .OrderBy(c => -OccurrencesOf(charCounts, c))
    484526              .DefaultIfEmpty(' ')
    485               .First();
    486           }
    487         } else {
    488           // no points and no commas => English format
    489           numberFormat = NumberFormatInfo.InvariantInfo;
    490           dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    491           separator = POSSIBLE_SEPARATORS
    492             .Where(c => OccurrencesOf(charCounts, c) > 10)
    493             .OrderBy(c => -OccurrencesOf(charCounts, c))
    494             .DefaultIfEmpty(' ')
    495             .First();
     527              .First()
     528          };
    496529        }
    497530      }
     
    540573      }
    541574
    542       public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
     575      public Tokenizer(StreamReader reader, TableFileFormatOptions formatOptions) {
    543576        this.reader = reader;
    544         this.numberFormatInfo = numberFormatInfo;
    545         this.dateTimeFormatInfo = dateTimeFormatInfo;
    546         this.separator = separator;
     577        this.numberFormatInfo = formatOptions.NumberFormat;
     578        this.dateTimeFormatInfo = formatOptions.DateTimeFormat;
     579        this.separator = formatOptions.ColumnSeparator;
    547580        this.separators = new char[] { separator };
    548581        ReadNextTokens();
Note: See TracChangeset for help on using the changeset viewer.