Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
07/07/17 16:44:47 (7 years ago)
Author:
gkronber
Message:

#2661: merged r14284:14286,r14288,r14296,r14298,r14408 from trunk to stable

Location:
stable
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • stable

  • stable/HeuristicLab.Problems.Instances.DataAnalysis

  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r14186 r15170  
    2828using System.IO;
    2929using System.Linq;
    30 using System.Runtime.Serialization;
    3130using System.Text;
    3231
     
    198197    /// <param name="columnNamesInFirstLine"></param>
    199198    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
    200       using (StreamReader reader = new StreamReader(stream, Encoding)) {
     199      if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
     200
     201      using (var reader = new StreamReader(stream)) {
    201202        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
     203        var strValues = new List<List<string>>();
    202204        values = new List<IList>();
    203         if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
    204 
    205         if (columnNamesInFirstLine) {
    206           ParseVariableNames();
    207           if (!tokenizer.HasNext())
    208             Error(
    209               "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
    210               "", tokenizer.CurrentLineNumber);
    211         }
    212 
    213 
    214         // read values... start in first row
     205        Prepare(columnNamesInFirstLine, strValues);
     206
    215207        int nLinesParsed = 0;
    216208        int colIdx = 0;
    217         int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)
    218209        while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
    219210          if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
     
    221212
    222213            // all rows have to have the same number of values
    223             // the first row defines how many samples are needed
    224             if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row
    225             else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
    226               Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
     214            // the first row defines how many elements are needed
     215            if (colIdx > 0 && values.Count != colIdx) {
     216              // read at least one value in the row (support for skipping empty lines)
     217              Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
    227218                    "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
    228                     tokenizer.CurrentLineNumber);
     219                tokenizer.CurrentLineNumber);
    229220            }
    230221            OnReport(tokenizer.BytesRead);
     
    234225          } else {
    235226            // read one value
    236             TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
     227            TokenTypeEnum type;
     228            string strVal;
     229            double dblVal;
     230            DateTime dateTimeVal;
    237231            tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
    238232
    239             // initialize columns on the first row (fixing data types as presented in the first row...)
    240             if (nLinesParsed == 0) {
    241               values.Add(CreateList(type, estimatedNumberOfLines));
    242             } else if (colIdx == values.Count) {
    243               Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
     233            if (colIdx == values.Count) {
     234              Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
    244235                    "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
    245236                tokenizer.CurrentLineNumber);
    246237            }
    247238            if (!IsColumnTypeCompatible(values[colIdx], type)) {
    248               values[colIdx] = ConvertToStringColumn(values[colIdx]);
     239              values[colIdx] = strValues[colIdx];
    249240            }
     241
    250242            // add the value to the column
    251             AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
     243            AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
     244            if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string>
     245              strValues[colIdx].Add(strVal);
     246            }
     247            colIdx++;
    252248          }
    253249        }
    254 
    255         if (!values.Any() || values.First().Count == 0)
    256           Error("Couldn't parse data values. Probably because of incorrect number format " +
    257                 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    258       }
     250      }
     251
     252      if (!values.Any() || values.First().Count == 0)
     253        Error("Couldn't parse data values. Probably because of incorrect number format " +
     254              "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    259255
    260256      this.rows = values.First().Count;
     
    277273      // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
    278274      GC.Collect(2, GCCollectionMode.Forced);
     275    }
     276
     277    private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) {
     278      if (columnNamesInFirstLine) {
     279        ParseVariableNames();
     280        if (!tokenizer.HasNext())
     281          Error(
     282            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
     283            "", tokenizer.CurrentLineNumber);
     284      }
     285      // read first line to determine types and allocate specific lists
     286      // read values... start in first row
     287      int colIdx = 0;
     288      while (tokenizer.PeekType() != TokenTypeEnum.NewLine) {
     289        // read one value
     290        TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
     291        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
     292
     293        // initialize column
     294        values.Add(CreateList(type, estimatedNumberOfLines));
     295        if (type == TokenTypeEnum.String)
     296          strValues.Add(new List<string>(0)); // optimization: don't store the string values in another list if the column is list<string>
     297        else
     298          strValues.Add(new List<string>(estimatedNumberOfLines));
     299
     300        AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
     301        if (type != TokenTypeEnum.String)
     302          strValues[colIdx].Add(strVal);
     303        colIdx++;
     304      }
     305      tokenizer.Skip(); // skip newline
    279306    }
    280307
     
    530557                type = TokenTypeEnum.Double;
    531558                doubleVals[i] = doubleVal;
    532               } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
     559              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue)
     560                && (dateTimeValue.Year > 1 || dateTimeValue.Month > 1 || dateTimeValue.Day > 1)// if no date is given it is returned as 1.1.0001 -> don't allow this
     561                ) {
    533562                type = TokenTypeEnum.DateTime;
    534563                dateTimeVals[i] = dateTimeValue;
     
    606635
    607636    private void Error(string message, string token, int lineNumber) {
    608       throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
     637      throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber));
    609638    }
    610639    #endregion
    611 
    612     [Serializable]
    613     public class DataFormatException : Exception {
    614       private int line;
    615       public int Line {
    616         get { return line; }
    617       }
    618       private string token;
    619       public string Token {
    620         get { return token; }
    621       }
    622       public DataFormatException(string message, string token, int line)
    623         : base(message + "\nToken: " + token + " (line: " + line + ")") {
    624         this.token = token;
    625         this.line = line;
    626       }
    627 
    628       public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
    629     }
    630640  }
    631641}
Note: See TracChangeset for help on using the changeset viewer.