Changeset 13440


Ignore:
Timestamp:
12/07/15 17:25:31 (5 years ago)
Author:
gkronber
Message:

#2071: Improved memory efficiency in TableFileParser by removing the duplicate storage of all columns, and added a heuristic to estimate the necessary capacity of the columns.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r13414 r13440  
    2828using System.Linq;
    2929using System.Runtime.Serialization;
     30using System.Text;
    3031
    3132namespace HeuristicLab.Problems.Instances.DataAnalysis {
     
    3637    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
    3738    private Tokenizer tokenizer;
    38     private List<List<object>> rowValues;
     39    private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
    3940
    4041    private int rows;
     
    7273
    7374    public TableFileParser() {
    74       rowValues = new List<List<object>>();
    7575      variableNames = new List<string>();
    7676    }
     
    104104      using (StreamReader reader = new StreamReader(stream)) {
    105105        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    106         return tokenizer.PeekType() != TokenTypeEnum.Double;
     106        return (tokenizer.PeekType() != TokenTypeEnum.Double);
    107107      }
    108108    }
     
    118118      char separator;
    119119      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
     120      EstimateNumberOfLines(fileName);
    120121      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
    121122    }
     
    130131    /// <param name="columnNamesInFirstLine"></param>
    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
      // sample the file first so that estimatedNumberOfLines is set before the columns are allocated
      EstimateNumberOfLines(fileName);

      FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
      try {
        // delegate the actual work to the stream-based overload
        Parse(input, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
      }
      finally {
        // always release the file handle, even if parsing throws
        input.Dispose();
      }
    }
     138
     139    // determines the number of newline characters in the first 64KB to guess the number of rows for a file
     140    private void EstimateNumberOfLines(string fileName) {
     141      var len = new System.IO.FileInfo(fileName).Length;
     142      var buf = new char[64 * 1024];
     143      var reader = new StreamReader(File.OpenRead(fileName));
     144      reader.ReadBlock(buf, 0, buf.Length);
     145      int numNewLine = 0;
     146      foreach (var ch in buf) if (ch == '\n') numNewLine++;
     147      if (numNewLine == 0) {
     148        // fail -> keep the default setting
     149        return;
     150      } else {
     151        double charsPerLineFactor = buf.Length / (double)numNewLine;
     152        double estimatedLines = len / charsPerLineFactor;
     153        estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
    134154      }
    135155    }
     
    158178      using (StreamReader reader = new StreamReader(stream)) {
    159179        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    160         // parse the file
    161         Parse(columnNamesInFirstLine, lineLimit);
    162       }
    163 
    164       // translate the list of samples into a DoubleMatrixData item
    165       rows = rowValues.Count;
    166       columns = rowValues[0].Count;
    167       values = new List<IList>();
    168 
    169       //create columns
    170       for (int col = 0; col < columns; col++) {
    171         var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType());
    172         if (!types.Any()) {
    173           values.Add(new List<string>());
    174           continue;
    175         }
    176 
    177         var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
    178         if (columnType == typeof(double)) values.Add(new List<double>());
    179         else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
    180         else if (columnType == typeof(string)) values.Add(new List<string>());
    181         else throw new InvalidOperationException();
    182       }
    183 
    184 
    185 
    186       //fill with values
    187       foreach (List<object> row in rowValues) {
    188         int columnIndex = 0;
    189         foreach (object element in row) {
    190           if (values[columnIndex] is List<double> && !(element is double))
    191             values[columnIndex].Add(double.NaN);
    192           else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
    193             values[columnIndex].Add(DateTime.MinValue);
    194           else if (values[columnIndex] is List<string> && !(element is string))
    195             values[columnIndex].Add(element.ToString());
    196           else
    197             values[columnIndex].Add(element);
    198           columnIndex++;
    199         }
     180        // parse the file line by line
     181        values = new List<IList>();
     182        if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
     183        foreach (var row in Parse(columnNamesInFirstLine, lineLimit)) {
     184          columns = row.Count;
     185          // on the first row we create our lists for column-oriented storage
     186          if (!values.Any()) {
     187            foreach (var obj in row) {
     188              // create a list type matching the object type and add first element
     189              if (obj == null) {
     190                var l = new List<object>(estimatedNumberOfLines);
     191                values.Add(l);
     192                l.Add(obj);
     193              } else if (obj is double) {
     194                var l = new List<double>(estimatedNumberOfLines);
     195                values.Add(l);
     196                l.Add((double)obj);
     197              } else if (obj is DateTime) {
     198                var l = new List<DateTime>(estimatedNumberOfLines);
     199                values.Add(l);
     200                l.Add((DateTime)obj);
     201              } else if (obj is string) {
     202                var l = new List<string>(estimatedNumberOfLines);
     203                values.Add(l);
     204                l.Add((string)obj);
     205              } else throw new InvalidOperationException();
     206            }
     207            // fill with initial value
     208          } else {
     209            // the columns are already there -> try to add values
     210            int columnIndex = 0;
     211            foreach (object element in row) {
     212              if (values[columnIndex] is List<double> && !(element is double))
     213                values[columnIndex].Add(double.NaN);
     214              else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
     215                values[columnIndex].Add(DateTime.MinValue);
     216              else if (values[columnIndex] is List<string> && !(element is string))
     217                values[columnIndex].Add(element.ToString());
     218              else
     219                values[columnIndex].Add(element);
     220              columnIndex++;
     221            }
     222          }
     223        }
     224
     225        if (!values.Any() || values.First().Count == 0)
     226          Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
     227      }
     228
     229      // after everything has been parsed make sure the lists are as compact as possible
     230      foreach (var l in values) {
     231        var dblList = l as List<double>;
     232        var byteList = l as List<byte>;
     233        var dateList = l as List<DateTime>;
     234        var stringList = l as List<string>;
     235        var objList = l as List<object>;
     236        if (dblList != null) dblList.TrimExcess();
     237        if (byteList != null) byteList.TrimExcess();
     238        if (dateList != null) dateList.TrimExcess();
     239        if (stringList != null) stringList.TrimExcess();
     240        if (objList != null) objList.TrimExcess();
    200241      }
    201242    }
     
    315356      }
    316357
    317 
    318358      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    319359        this.reader = reader;
     
    329369          try {
    330370            BytesRead = reader.BaseStream.Position;
    331           } catch (IOException) {
     371          }
     372          catch (IOException) {
    332373            BytesRead += CurrentLine.Length + 2; // guess
    333           } catch (NotSupportedException) {
     374          }
     375          catch (NotSupportedException) {
    334376            BytesRead += CurrentLine.Length + 2;
    335377          }
     
    413455        dblVal = doubleVals[tokenPos];
    414456        dateTimeVal = dateTimeVals[tokenPos];
    415 
    416457        Skip();
    417458      }
     
    424465
    425466    #region parsing
    426     private void Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit
     467    private IEnumerable<List<object>> Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit
    427468      if (columnNamesInFirstLine) {
    428469        ParseVariableNames();
     
    432473            "", tokenizer.CurrentLineNumber);
    433474      }
    434       ParseValues(lineLimit);
    435       if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    436     }
    437 
    438     private void ParseValues(int lineLimit = -1) {
     475      return ParseValues(lineLimit);
     476    }
     477
     478    private IEnumerable<List<object>> ParseValues(int lineLimit = -1) {
    439479      int nLinesParsed = 0;
     480      int numValuesInFirstRow = -1;
    440481      while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
    441482        if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
     
    454495          // all rows have to have the same number of values           
    455496          // the first row defines how many samples are needed
    456           if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
    457             Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
     497          if (numValuesInFirstRow < 0) numValuesInFirstRow = row.Count;
     498          else if (numValuesInFirstRow != row.Count) {
     499            Error("The first row of the dataset has " + numValuesInFirstRow + " columns." +
    458500                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
    459501                  tokenizer.CurrentLineNumber);
    460502          }
    461           rowValues.Add(row);
     503          yield return row;
    462504        }
    463505
Note: See TracChangeset for help on using the changeset viewer.