Changeset 13447


Ignore:
Timestamp:
12/10/15 17:12:45 (4 years ago)
Author:
gkronber
Message:

#2072 more refactoring of TableFileParser to remove boxing of double values and to remove production of separator tokens

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r13445 r13447  
    142142      var len = new System.IO.FileInfo(fileName).Length;
    143143      var buf = new char[64 * 1024];
    144       using(var reader = new StreamReader(fileName)) {
     144      using (var reader = new StreamReader(fileName)) {
    145145        reader.ReadBlock(buf, 0, buf.Length);
    146146      }
     
    188188      using (StreamReader reader = new StreamReader(stream)) {
    189189        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    190         // parse the file line by line
    191190        values = new List<IList>();
    192191        if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
    193         foreach (var row in Parse(columnNamesInFirstLine, lineLimit)) {
    194           columns = row.Count;
    195           // on the first row we create our lists for column-oriented storage
    196           if (!values.Any()) {
    197             foreach (var obj in row) {
    198               // create a list type matching the object type and add first element
    199               if (obj == null) {
    200                 var l = new List<object>(estimatedNumberOfLines);
    201                 values.Add(l);
    202                 l.Add(obj);
    203               } else if (obj is double) {
    204                 var l = new List<double>(estimatedNumberOfLines);
    205                 values.Add(l);
    206                 l.Add((double)obj);
    207               } else if (obj is DateTime) {
    208                 var l = new List<DateTime>(estimatedNumberOfLines);
    209                 values.Add(l);
    210                 l.Add((DateTime)obj);
    211               } else if (obj is string) {
    212                 var l = new List<string>(estimatedNumberOfLines);
    213                 values.Add(l);
    214                 l.Add((string)obj);
    215               } else throw new InvalidOperationException();
     192
     193        if (columnNamesInFirstLine) {
     194          ParseVariableNames();
     195          if (!tokenizer.HasNext())
     196            Error(
     197              "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
     198              "", tokenizer.CurrentLineNumber);
     199        }
     200
     201
     202        // read values... start in first row
     203        int nLinesParsed = 0;
     204        int colIdx = 0;
     205        int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or initialize based on first row of values (-1)
     206        while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
     207          if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
     208            tokenizer.Skip();
     209
     210            // all rows have to have the same number of values
     211            // the first row defines how many samples are needed
     212            if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of columns in the first row
     213            else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
     214              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
     215                    "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
     216                    tokenizer.CurrentLineNumber);
    216217            }
    217             // fill with initial value
     218            OnReport(tokenizer.BytesRead);
     219
     220            nLinesParsed++;
     221            colIdx = 0;
    218222          } else {
    219             // the columns are already there -> try to add values
    220             int columnIndex = 0;
    221             foreach (object element in row) {
    222               if (values[columnIndex] is List<double> && !(element is double))
    223                 values[columnIndex].Add(double.NaN);
    224               else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
    225                 values[columnIndex].Add(DateTime.MinValue);
    226               else if (values[columnIndex] is List<string> && !(element is string))
    227                 values[columnIndex].Add(element.ToString());
    228               else
    229                 values[columnIndex].Add(element);
    230               columnIndex++;
     223            // read one value
     224            TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
     225            tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
     226
     227            // initialize columns on the first row (fixing data types as presented in the first row...)
     228            if (nLinesParsed == 0) {
     229              values.Add(CreateList(type, estimatedNumberOfLines));
     230            } else if (colIdx == values.Count) {
     231              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
     232                    "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
     233                tokenizer.CurrentLineNumber);
    231234            }
     235            // add the value to the column
     236            AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
    232237          }
    233238        }
    234239
    235240        if (!values.Any() || values.First().Count == 0)
    236           Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
     241          Error("Couldn't parse data values. Probably because of incorrect number format " +
     242                "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    237243      }
    238244
    239245      this.rows = values.First().Count;
     246      this.columns = values.Count;
    240247
    241248      // after everything has been parsed make sure the lists are as compact as possible
     
    256263      GC.Collect(2, GCCollectionMode.Forced);
    257264    }
     265
     266    #region type-dependent dispatch
     267    private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
     268      switch (type) {
     269        case TokenTypeEnum.Double:
     270          AddDoubleToList(list, dblVal);
     271          break;
     272        case TokenTypeEnum.String:
     273          AddStringToList(list, strVal);
     274          break;
     275        case TokenTypeEnum.DateTime:
     276          AddDateTimeToList(list, dateTimeVal);
     277          break;
     278        default:
     279          throw new InvalidOperationException();
     280      }
     281    }
     282
     283    private void AddDoubleToList(IList list, double dblVal) {
     284      var dblList = list as List<double>;
     285      if (dblList != null) dblList.Add(dblVal);
     286      else {
     287        var strList = list as List<string>;
     288        if (strList != null) strList.Add(dblVal.ToString());
     289        else list.Add(null);
     290      }
     291    }
     292
     293    private void AddStringToList(IList list, string strVal) {
     294      var strList = list as List<string>;
     295      if (strList != null) strList.Add(strVal);
     296      else {
     297        var dblList = list as List<double>;
     298        if (dblList != null) dblList.Add(double.NaN);
     299        else list.Add(null);
     300      }
     301    }
     302
     303    private void AddDateTimeToList(IList list, DateTime dateTimeVal) {
     304      var dateTimeList = list as List<DateTime>;
     305      if (dateTimeList != null) dateTimeList.Add(dateTimeVal);
     306      else {
     307        var dblList = list as List<double>;
     308        if (dblList != null) dblList.Add(double.NaN);
     309        else {
     310          var strList = list as List<string>;
     311          if (strList != null) strList.Add(dateTimeVal.ToString());
     312          else list.Add(null);
     313        }
     314      }
     315    }
     316
     317    private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
     318      switch (type) {
     319        case TokenTypeEnum.String:
     320          return new List<string>(estimatedNumberOfLines);
     321        case TokenTypeEnum.Double:
     322          return new List<double>(estimatedNumberOfLines);
     323        case TokenTypeEnum.DateTime:
     324          return new List<DateTime>(estimatedNumberOfLines);
     325        default:
     326          throw new InvalidOperationException();
     327      }
     328    }
     329    #endregion
    258330
    259331    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
     
    338410
    339411    #region tokenizer
     412    // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
    340413    internal enum TokenTypeEnum {
    341       NewLine, Separator, String, Double, DateTime
     414      NewLine, String, Double, DateTime
    342415    }
    343416
     
    354427      private DateTimeFormatInfo dateTimeFormatInfo;
    355428      private char separator;
    356       private const string INTERNAL_SEPARATOR = "#";
     429
     430      // arrays for string.Split()
     431      private readonly char[] whiteSpaceSeparators = new char[0]; // an empty separator array makes string.Split() split on white-space characters
     432      private readonly char[] separators;
    357433
    358434      private int currentLineNumber = 0;
     
    376452        this.dateTimeFormatInfo = dateTimeFormatInfo;
    377453        this.separator = separator;
     454        this.separators = new char[] { separator };
    378455        ReadNextTokens();
     456      }
     457
     458      public bool HasNext() {
     459        return numTokens > tokenPos || !reader.EndOfStream;
     460      }
     461
     462      public TokenTypeEnum PeekType() {
     463        return tokenTypes[tokenPos];
     464      }
     465
     466      public void Skip() {
     467        // simply skips one token without returning the result values
     468        tokenPos++;
     469        if (numTokens == tokenPos) {
     470          ReadNextTokens();
     471        }
     472      }
     473
     474      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
     475        type = tokenTypes[tokenPos];
     476        strVal = stringVals[tokenPos];
     477        dblVal = doubleVals[tokenPos];
     478        dateTimeVal = dateTimeVals[tokenPos];
     479        Skip();
    379480      }
    380481
     
    382483        if (!reader.EndOfStream) {
    383484          CurrentLine = reader.ReadLine();
     485          CurrentLineNumber++;
    384486          try {
    385487            BytesRead = reader.BaseStream.Position;
     
    390492          }
    391493          int i = 0;
    392           foreach (var tok in Split(CurrentLine)) {
    393             var trimmedStr = tok.Trim();
    394             if (!string.IsNullOrEmpty(trimmedStr)) {
    395               TokenTypeEnum type = TokenTypeEnum.String; // default
    396               stringVals[i] = trimmedStr;
     494          if (!string.IsNullOrWhiteSpace(CurrentLine)) {
     495            foreach (var tok in Split(CurrentLine)) {
     496              TokenTypeEnum type;
    397497              double doubleVal;
    398498              DateTime dateTimeValue;
    399               if (trimmedStr.Equals(INTERNAL_SEPARATOR)) {
    400                 type = TokenTypeEnum.Separator;
    401               } else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
     499              type = TokenTypeEnum.String; // default
     500              stringVals[i] = tok.Trim();
     501              if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
    402502                type = TokenTypeEnum.Double;
    403503                doubleVals[i] = doubleVal;
    404               } else if (DateTime.TryParse(trimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
     504              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
    405505                type = TokenTypeEnum.DateTime;
    406506                dateTimeVals[i] = dateTimeValue;
    407507              }
    408508
    409               // couldn't parse the token as an int or float number  or datetime value so return a string token
     509              // couldn't parse the token as an int or float number or datetime value so return a string token
    410510
    411511              tokenTypes[i] = type;
     
    427527      }
    428528
     529      private IEnumerable<string> Split(string line) {
     530        return separator == WHITESPACECHAR ?
     531          line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
     532          line.Split(separators);
     533      }
     534
    429535      private static void IncreaseCapacity<T>(ref T[] arr) {
    430536        int n = (int)Math.Floor(arr.Length * 1.7); // guess
     
    433539        arr = arr2;
    434540      }
    435 
    436       private IEnumerable<string> Split(string line) {
    437         string[] splitString;
    438         if (separator == WHITESPACECHAR) {
    439           //separate whitespaces
    440           splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
    441         } else {
    442           splitString = line.Split(separator);
    443         }
    444 
    445         for (int i = 0; i < splitString.Length - 1; i++) {
    446           yield return splitString[i];
    447           yield return INTERNAL_SEPARATOR;
    448         }
    449         // do not return the INTERNAL_SEPARATOR after the last string
    450         yield return splitString[splitString.Length - 1];
    451       }
    452 
    453       public TokenTypeEnum PeekType() {
    454         return tokenTypes[tokenPos];
    455       }
    456 
    457       public void Skip() {
    458         // simply skips one token without returning the result values
    459         tokenPos++;
    460         if (numTokens == tokenPos) {
    461           ReadNextTokens();
    462         }
    463       }
    464 
    465       public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
    466         type = tokenTypes[tokenPos];
    467         strVal = stringVals[tokenPos];
    468         dblVal = doubleVals[tokenPos];
    469         dateTimeVal = dateTimeVals[tokenPos];
    470         Skip();
    471       }
    472 
    473       public bool HasNext() {
    474         return numTokens > tokenPos || !reader.EndOfStream;
    475       }
    476541    }
    477542    #endregion
    478543
    479544    #region parsing
    480     private IEnumerable<List<object>> Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit
    481       if (columnNamesInFirstLine) {
    482         ParseVariableNames();
    483         if (!tokenizer.HasNext())
    484           Error(
    485             "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
    486             "", tokenizer.CurrentLineNumber);
    487       }
    488       return ParseValues(lineLimit);
    489     }
    490 
    491     private IEnumerable<List<object>> ParseValues(int lineLimit = -1) {
    492       int nLinesParsed = 0;
    493       int numValuesInFirstRow = -1;
    494       while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
    495         if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
    496           tokenizer.Skip();
    497           nLinesParsed++;
    498         } else {
    499           List<object> row = new List<object>();
    500           object value = NextValue(tokenizer);
    501           row.Add(value);
    502           while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
    503             ExpectType(TokenTypeEnum.Separator);
    504             row.Add(NextValue(tokenizer));
    505           }
    506           ExpectType(TokenTypeEnum.NewLine);
    507           nLinesParsed++;
    508           // all rows have to have the same number of values           
    509           // the first row defines how many samples are needed
    510           if (numValuesInFirstRow < 0) numValuesInFirstRow = row.Count;
    511           else if (numValuesInFirstRow != row.Count) {
    512             Error("The first row of the dataset has " + numValuesInFirstRow + " columns." +
    513                   "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
    514                   tokenizer.CurrentLineNumber);
    515           }
    516           yield return row;
    517         }
    518 
    519         OnReport(tokenizer.BytesRead);
    520       }
    521     }
    522 
    523     private object NextValue(Tokenizer tokenizer) {
    524       if (tokenizer.PeekType() == TokenTypeEnum.Separator || tokenizer.PeekType() == TokenTypeEnum.NewLine) return string.Empty;
     545
     546    private void ParseVariableNames() {
     547      // the first line must contain variable names
     548      List<string> varNames = new List<string>();
     549
    525550      TokenTypeEnum type;
    526551      string strVal;
     
    529554
    530555      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
    531       switch (type) {
    532         case TokenTypeEnum.Separator: return double.NaN;
    533         case TokenTypeEnum.String: return strVal;
    534         case TokenTypeEnum.Double: return dblVal;
    535         case TokenTypeEnum.DateTime: return dateTimeVal;
    536       }
    537       // found an unexpected token => throw error
    538       Error("Unexpected token.", strVal, tokenizer.CurrentLineNumber);
    539       // this line is never executed because Error() throws an exception
    540       throw new InvalidOperationException();
    541     }
    542 
    543     private void ParseVariableNames() {
    544       // the first line must contain variable names
    545       List<string> varNames = new List<string>();
    546 
    547       TokenTypeEnum type;
    548       string strVal;
    549       double dblVal;
    550       DateTime dateTimeVal;
    551 
    552       tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
    553556
    554557      // the first token must be a variable name
     
    557560      varNames.Add(strVal);
    558561
    559       while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
    560         ExpectType(TokenTypeEnum.Separator);
     562      while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
    561563        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
    562564        varNames.Add(strVal);
Note: See TracChangeset for help on using the changeset viewer.