Changeset 13526


Ignore:
Timestamp:
01/16/16 15:22:52 (5 years ago)
Author:
gkronber
Message:

#2071 added code for type conversion of columns to the table file parser and made some other minor changes

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r13447 r13526  
    2424using System.Collections;
    2525using System.Collections.Generic;
     26using System.Diagnostics.Contracts;
    2627using System.Globalization;
    2728using System.IO;
     
    141142    private void EstimateNumberOfLines(string fileName) {
    142143      var len = new System.IO.FileInfo(fileName).Length;
    143       var buf = new char[64 * 1024];
     144      var buf = new char[1024 * 1024];
    144145      using (var reader = new StreamReader(fileName)) {
    145146        reader.ReadBlock(buf, 0, buf.Length);
     
    233234                tokenizer.CurrentLineNumber);
    234235            }
     236            if (!IsColumnTypeCompatible(values[colIdx], type)) {
     237              values[colIdx] = ConvertToStringColumn(values[colIdx]);
     238            }
    235239            // add the value to the column
    236240            AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
     
    265269
    266270    #region type-dependent dispatch
     271    private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) { // true if a token of type tokenType can be added to the column list without converting the column
     272      return (list is List<string>) || // all tokens can be added to a string list
     273             (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns
     274             (tokenType == TokenTypeEnum.Double && list is List<double>) || // double tokens fit into double columns
     275             (tokenType == TokenTypeEnum.DateTime && list is List<DateTime>); // date/time tokens fit into date/time columns
     276    }
     277
     278    // all columns are converted to string columns when we find a non-empty value that has an incorrect type
     279    private IList ConvertToStringColumn(IList list) { // returns a List<string> with the string form of every element of list (or list itself if it already is a List<string>)
     280      var dblL = list as List<double>;
     281      if (dblL != null) {
     282        var l = new List<string>(dblL.Capacity); // keep the capacity of the original column
     283        l.AddRange(dblL.Select(dbl => dbl.ToString())); // NOTE(review): ToString() without a format provider uses the current culture, which may differ from the format of the original tokens - confirm
     284        return l;
     285      }
     286
     287      var dtL = list as List<DateTime>;
     288      if (dtL != null) {
     289        var l = new List<string>(dtL.Capacity); // keep the capacity of the original column
     290        l.AddRange(dtL.Select(dbl => dbl.ToString())); // the lambda parameter is a DateTime here, despite its name
     291        return l;
     292      }
     293
     294      if (list is List<string>) return list; // already a string column, returned unchanged
     295
     296      throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType())); // any other list type is not handled
     297    }
     298
    267299    private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
    268       switch (type) {
    269         case TokenTypeEnum.Double:
    270           AddDoubleToList(list, dblVal);
    271           break;
    272         case TokenTypeEnum.String:
    273           AddStringToList(list, strVal);
    274           break;
    275         case TokenTypeEnum.DateTime:
    276           AddDateTimeToList(list, dateTimeVal);
    277           break;
    278         default:
    279           throw new InvalidOperationException();
    280       }
    281     }
    282 
    283     private void AddDoubleToList(IList list, double dblVal) {
    284300      var dblList = list as List<double>;
    285       if (dblList != null) dblList.Add(dblVal);
    286       else {
    287         var strList = list as List<string>;
    288         if (strList != null) strList.Add(dblVal.ToString());
    289         else list.Add(null);
    290       }
    291     }
    292 
    293     private void AddStringToList(IList list, string strVal) {
     301      if (dblList != null) {
     302        AddValue(type, dblList, dblVal);
     303        return;
     304      }
     305
    294306      var strList = list as List<string>;
    295       if (strList != null) strList.Add(strVal);
    296       else {
    297         var dblList = list as List<double>;
    298         if (dblList != null) dblList.Add(double.NaN);
    299         else list.Add(null);
    300       }
    301     }
    302 
    303     private void AddDateTimeToList(IList list, DateTime dateTimeVal) {
    304       var dateTimeList = list as List<DateTime>;
    305       if (dateTimeList != null) dateTimeList.Add(dateTimeVal);
    306       else {
    307         var dblList = list as List<double>;
    308         if (dblList != null) dblList.Add(double.NaN);
    309         else {
    310           var strList = list as List<string>;
    311           if (strList != null) strList.Add(dateTimeVal.ToString());
    312           else list.Add(null);
    313         }
    314       }
     307      if (strList != null) {
     308        AddValue(type, strList, strVal);
     309        return;
     310      }
     311      var dtList = list as List<DateTime>;
     312      if (dtList != null) {
     313        AddValue(type, dtList, dateTimeVal);
     314        return;
     315      }
     316
     317      list.Add(strVal); // assumes List<object>
     318    }
     319
     320    private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) { // appends one entry to a double column
     321      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double); // only missing or double tokens are expected for a double column
     322      list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal); // missing entries are stored as NaN
     323    }
     324
     325    private void AddValue(TokenTypeEnum type, List<string> list, string strVal) { // appends one entry to a string column; no assertion on type, tokens of any type are accepted
     326      // assumes that strVal is always set to the original token read from the input file
     327      list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal); // missing entries are stored as the empty string
     328    }
     329
     330    private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) { // appends one entry to a date/time column
     331      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime); // only missing or date/time tokens are expected for a date/time column
     332      list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal); // missing entries are stored as DateTime.MinValue
    315333    }
    316334
     
    320338          return new List<string>(estimatedNumberOfLines);
    321339        case TokenTypeEnum.Double:
     340        case TokenTypeEnum.Missing: // assume double columns
    322341          return new List<double>(estimatedNumberOfLines);
    323342        case TokenTypeEnum.DateTime:
     
    381400            separator = ',';
    382401          } else {
    383             char[] disallowedSeparators = new char[] { ',' };
     402            char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed too, however existing unit tests would fail
    384403            // German format (real values)
    385404            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
     
    389408              .Where(c => OccurrencesOf(charCounts, c) > 10)
    390409              .OrderBy(c => -OccurrencesOf(charCounts, c))
    391               .DefaultIfEmpty(' ')
     410              .DefaultIfEmpty(' ') 
    392411              .First();
    393412          }
     
    412431    // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
    413432    internal enum TokenTypeEnum {
    414       NewLine, String, Double, DateTime
     433      NewLine, String, Double, DateTime, Missing
    415434    }
    416435
     
    505524                type = TokenTypeEnum.DateTime;
    506525                dateTimeVals[i] = dateTimeValue;
     526              } else if (string.IsNullOrWhiteSpace(tok)) {
     527                type = TokenTypeEnum.Missing;
    507528              }
    508529
Note: See TracChangeset for help on using the changeset viewer.