Changeset 13974


Ignore:
Timestamp:
07/02/16 08:15:07 (3 years ago)
Author:
gkronber
Message:

#2071: merged r13411,r13413,r13414,r13415,r13419,r13440,r13441,r13442,r13445,r13447,r13525,r13526,r13529,r13584,r13901,r13925 from trunk to stable

Location:
stable
Files:
16 edited

Legend:

Unmodified
Added
Removed
  • stable

  • stable/HeuristicLab.Problems.DataAnalysis

  • stable/HeuristicLab.Problems.DataAnalysis/3.4/Dataset.cs

    r13949 r13974  
    5252    }
    5353
     54    /// <summary>
     55    /// Creates a new dataset. The variableValues are not cloned.
     56    /// </summary>
     57    /// <param name="variableNames">The names of the variables in the dataset</param>
     58    /// <param name="variableValues">The values for the variables (column-oriented storage). Values are not cloned!</param>
    5459    public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues)
    5560      : base() {
     
    7580      for (int i = 0; i < this.variableNames.Count; i++) {
    7681        var values = variableValues.ElementAt(i);
    77         IList clonedValues = null;
    78         if (values is IList<double>)
    79           clonedValues = new List<double>(values.Cast<double>());
    80         else if (values is IList<string>)
    81           clonedValues = new List<string>(values.Cast<string>());
    82         else if (values is IList<DateTime>)
    83           clonedValues = new List<DateTime>(values.Cast<DateTime>());
    84         else {
    85           this.variableNames = new List<string>();
    86           this.variableValues = new Dictionary<string, IList>();
    87           throw new ArgumentException("The variable values must be of type IList<double>, IList<string> or IList<DateTime>");
    88         }
    89         this.variableValues.Add(this.variableNames[i], clonedValues);
     82        this.variableValues.Add(this.variableNames[i], values);
    9083      }
    9184    }
  • stable/HeuristicLab.Problems.Instances.DataAnalysis

  • stable/HeuristicLab.Problems.Instances.DataAnalysis.Views

  • stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/ClassificationImportTypeDialog.Designer.cs

    r12009 r13974  
    8787      this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableComboBox);
    8888      this.ProblemDataSettingsGroupBox.Controls.Add(this.UniformDistributionOfClassesCheckbox);
    89       this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 312);
     89      this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 285);
    9090      this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.UniformDistributionOfClassesCheckbox, 0);
    9191      this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.TargetVariableComboBox, 0);
     
    110110      //
    111111      this.PreviewDatasetMatrix.Location = new System.Drawing.Point(6, 134);
    112       this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 172);
     112      this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 145);
    113113      //
    114114      // PreviewLabel
     
    118118      // TargetVariableComboBox
    119119      //
    120       this.TargetVariableComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 
     120      this.TargetVariableComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left)
    121121            | System.Windows.Forms.AnchorStyles.Right)));
    122122      this.TargetVariableComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
  • stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/DataAnalysisImportTypeDialog.Designer.cs

    r12009 r13974  
    6363      this.SeparatorComboBox = new System.Windows.Forms.ComboBox();
    6464      this.CSVSettingsGroupBox = new System.Windows.Forms.GroupBox();
     65      this.EncodingInfoLabel = new System.Windows.Forms.Label();
     66      this.EncodingLabel = new System.Windows.Forms.Label();
     67      this.EncodingComboBox = new System.Windows.Forms.ComboBox();
     68      this.CheckboxColumnNames = new System.Windows.Forms.CheckBox();
    6569      this.DateTimeFormatInfoLabel = new System.Windows.Forms.Label();
    6670      this.DecimalSeparatorInfoLabel = new System.Windows.Forms.Label();
     
    7276      this.PreviewDatasetMatrix = new HeuristicLab.Data.Views.StringConvertibleMatrixView();
    7377      this.ToolTip = new System.Windows.Forms.ToolTip(this.components);
    74       this.CheckboxColumnNames = new System.Windows.Forms.CheckBox();
    7578      ((System.ComponentModel.ISupportInitialize)(this.TrainingTestTrackBar)).BeginInit();
    7679      this.CSVSettingsGroupBox.SuspendLayout();
     
    249252      this.CSVSettingsGroupBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left)
    250253            | System.Windows.Forms.AnchorStyles.Right)));
     254      this.CSVSettingsGroupBox.Controls.Add(this.EncodingInfoLabel);
     255      this.CSVSettingsGroupBox.Controls.Add(this.EncodingLabel);
     256      this.CSVSettingsGroupBox.Controls.Add(this.EncodingComboBox);
    251257      this.CSVSettingsGroupBox.Controls.Add(this.CheckboxColumnNames);
    252258      this.CSVSettingsGroupBox.Controls.Add(this.DateTimeFormatInfoLabel);
     
    261267      this.CSVSettingsGroupBox.Location = new System.Drawing.Point(12, 32);
    262268      this.CSVSettingsGroupBox.Name = "CSVSettingsGroupBox";
    263       this.CSVSettingsGroupBox.Size = new System.Drawing.Size(447, 126);
     269      this.CSVSettingsGroupBox.Size = new System.Drawing.Size(447, 153);
    264270      this.CSVSettingsGroupBox.TabIndex = 16;
    265271      this.CSVSettingsGroupBox.TabStop = false;
    266272      this.CSVSettingsGroupBox.Text = "CSV Settings";
     273      //
     274      // EncodingInfoLabel
     275      //
     276      this.EncodingInfoLabel.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
     277      this.EncodingInfoLabel.Image = HeuristicLab.Common.Resources.VSImageLibrary.Information;
     278      this.EncodingInfoLabel.Location = new System.Drawing.Point(421, 102);
     279      this.EncodingInfoLabel.Name = "EncodingInfoLabel";
     280      this.EncodingInfoLabel.Size = new System.Drawing.Size(16, 16);
     281      this.EncodingInfoLabel.TabIndex = 27;
     282      this.EncodingInfoLabel.Tag = "Select the encoding the file was saved with.";
     283      this.ToolTip.SetToolTip(this.EncodingInfoLabel, "Select the encoding the file was saved with.");
     284      this.EncodingInfoLabel.DoubleClick += new System.EventHandler(this.ControlToolTip_DoubleClick);
     285      //
     286      // EncodingLabel
     287      //
     288      this.EncodingLabel.AutoSize = true;
     289      this.EncodingLabel.Location = new System.Drawing.Point(6, 103);
     290      this.EncodingLabel.Name = "EncodingLabel";
     291      this.EncodingLabel.Size = new System.Drawing.Size(52, 13);
     292      this.EncodingLabel.TabIndex = 26;
     293      this.EncodingLabel.Text = "Encoding";
     294      //
     295      // EncodingComboBox
     296      //
     297      this.EncodingComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left)
     298            | System.Windows.Forms.AnchorStyles.Right)));
     299      this.EncodingComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
     300      this.EncodingComboBox.Enabled = false;
     301      this.EncodingComboBox.FormattingEnabled = true;
     302      this.EncodingComboBox.Location = new System.Drawing.Point(111, 100);
     303      this.EncodingComboBox.Name = "EncodingComboBox";
     304      this.EncodingComboBox.Size = new System.Drawing.Size(300, 21);
     305      this.EncodingComboBox.TabIndex = 25;
     306      this.EncodingComboBox.SelectionChangeCommitted += new System.EventHandler(this.CSVFormatComboBoxSelectionChangeCommitted);
     307      //
     308      // CheckboxColumnNames
     309      //
     310      this.CheckboxColumnNames.AutoSize = true;
     311      this.CheckboxColumnNames.Location = new System.Drawing.Point(9, 127);
     312      this.CheckboxColumnNames.Name = "CheckboxColumnNames";
     313      this.CheckboxColumnNames.Size = new System.Drawing.Size(144, 17);
     314      this.CheckboxColumnNames.TabIndex = 24;
     315      this.CheckboxColumnNames.Text = "Column names in first line";
     316      this.CheckboxColumnNames.UseVisualStyleBackColor = true;
     317      this.CheckboxColumnNames.CheckedChanged += new System.EventHandler(this.CheckboxColumnNames_CheckedChanged);
    267318      //
    268319      // DateTimeFormatInfoLabel
     
    315366      this.ProblemDataSettingsGroupBox.Controls.Add(this.TrainingTestTrackBar);
    316367      this.ProblemDataSettingsGroupBox.Controls.Add(this.ShuffleDataCheckbox);
    317       this.ProblemDataSettingsGroupBox.Location = new System.Drawing.Point(12, 164);
     368      this.ProblemDataSettingsGroupBox.Location = new System.Drawing.Point(12, 191);
    318369      this.ProblemDataSettingsGroupBox.Name = "ProblemDataSettingsGroupBox";
    319       this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 252);
     370      this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 225);
    320371      this.ProblemDataSettingsGroupBox.TabIndex = 17;
    321372      this.ProblemDataSettingsGroupBox.TabStop = false;
     
    365416      this.PreviewDatasetMatrix.ShowRowsAndColumnsTextBox = false;
    366417      this.PreviewDatasetMatrix.ShowStatisticalInformation = false;
    367       this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 138);
     418      this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 111);
    368419      this.PreviewDatasetMatrix.TabIndex = 0;
    369       //
    370       // CheckboxColumnNames
    371       //
    372       this.CheckboxColumnNames.AutoSize = true;
    373       this.CheckboxColumnNames.Location = new System.Drawing.Point(9, 103);
    374       this.CheckboxColumnNames.Name = "CheckboxColumnNames";
    375       this.CheckboxColumnNames.Size = new System.Drawing.Size(144, 17);
    376       this.CheckboxColumnNames.TabIndex = 24;
    377       this.CheckboxColumnNames.Text = "Column names in first line";
    378       this.CheckboxColumnNames.UseVisualStyleBackColor = true;
    379       this.CheckboxColumnNames.CheckedChanged += new System.EventHandler(this.CheckboxColumnNames_CheckedChanged);
    380420      //
    381421      // DataAnalysisImportTypeDialog
     
    437477    protected System.Windows.Forms.ToolTip ToolTip;
    438478    private System.Windows.Forms.CheckBox CheckboxColumnNames;
     479    protected System.Windows.Forms.Label EncodingInfoLabel;
     480    protected System.Windows.Forms.Label EncodingLabel;
     481    protected System.Windows.Forms.ComboBox EncodingComboBox;
    439482  }
    440483}
  • stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/DataAnalysisImportTypeDialog.cs

    r12009 r13974  
    2525using System.IO;
    2626using System.Linq;
     27using System.Text;
    2728using System.Windows.Forms;
    2829using HeuristicLab.Core.Views;
     
    3233  public partial class DataAnalysisImportTypeDialog : Form {
    3334
    34     public static readonly List<KeyValuePair<DateTimeFormatInfo, string>> dateTimeFormats = new List<KeyValuePair<DateTimeFormatInfo, string>>{
    35       new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")), "dd/mm/yyyy hh:MM:ss" ),
    36       new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "mm/dd/yyyy hh:MM:ss" ),
    37       new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "yyyy/mm/dd hh:MM:ss" ),
    38       new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "mm/yyyy/dd hh:MM:ss" )
     35    private static readonly List<KeyValuePair<DateTimeFormatInfo, string>> dateTimeFormats =
     36      new List<KeyValuePair<DateTimeFormatInfo, string>>{
     37        new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")), "dd/mm/yyyy hh:MM:ss" ),
     38        new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "mm/dd/yyyy hh:MM:ss" ),
     39        new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "yyyy/mm/dd hh:MM:ss" ),
     40        new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "mm/yyyy/dd hh:MM:ss" )
    3941    };
    4042
    41     public static readonly List<KeyValuePair<char, string>> POSSIBLE_SEPARATORS = new List<KeyValuePair<char, string>>{ 
    42       new KeyValuePair<char, string>(';', "; (Semicolon)" ),
    43       new KeyValuePair<char, string>(',', ", (Comma)" ),   
    44       new KeyValuePair<char, string>('\t', "\\t (Tab)"),
    45       new KeyValuePair<char, string>((char)0, "all whitespaces (including tabs and spaces)")
     43    private static readonly List<KeyValuePair<char, string>> POSSIBLE_SEPARATORS =
     44      new List<KeyValuePair<char, string>>{ 
     45        new KeyValuePair<char, string>(';', "; (Semicolon)" ),
     46        new KeyValuePair<char, string>(',', ", (Comma)" ),   
     47        new KeyValuePair<char, string>('\t', "\\t (Tab)"),
     48        new KeyValuePair<char, string>((char)0, "all whitespaces (including tabs and spaces)")
    4649    };
    4750
    48     public static readonly List<KeyValuePair<NumberFormatInfo, string>> POSSIBLE_DECIMAL_SEPARATORS = new List<KeyValuePair<NumberFormatInfo, string>>{
    49       new KeyValuePair<NumberFormatInfo, string>(NumberFormatInfo.GetInstance(new CultureInfo("de-DE")), ", (Comma)"),
    50       new KeyValuePair<NumberFormatInfo, string>(NumberFormatInfo.InvariantInfo, ". (Period)" )   
     51    private static readonly List<KeyValuePair<NumberFormatInfo, string>> POSSIBLE_DECIMAL_SEPARATORS =
     52      new List<KeyValuePair<NumberFormatInfo, string>>{
     53        new KeyValuePair<NumberFormatInfo, string>(NumberFormatInfo.GetInstance(new CultureInfo("de-DE")), ", (Comma)"),
     54        new KeyValuePair<NumberFormatInfo, string>(NumberFormatInfo.InvariantInfo, ". (Period)" )   
    5155    };
     56
     57    private static readonly List<KeyValuePair<Encoding, string>> POSSIBLE_ENCODINGS =
     58      new List<KeyValuePair<Encoding, string>> {
     59        new KeyValuePair<Encoding, string>(Encoding.Default, "Default"),
     60        new KeyValuePair<Encoding, string>(Encoding.ASCII, "ASCII"),
     61        new KeyValuePair<Encoding, string>(Encoding.Unicode, "Unicode"),   
     62        new KeyValuePair<Encoding, string>(Encoding.UTF8, "UTF8")       
     63      };
    5264
    5365    public string Path {
     
    8799      DateTimeFormatComboBox.ValueMember = "Key";
    88100      DateTimeFormatComboBox.DisplayMember = "Value";
     101      EncodingComboBox.DataSource = POSSIBLE_ENCODINGS;
     102      EncodingComboBox.ValueMember = "Key";
     103      EncodingComboBox.DisplayMember = "Value";
     104
    89105    }
    90106
     
    100116      DecimalSeparatorComboBox.Enabled = true;
    101117      DateTimeFormatComboBox.Enabled = true;
     118      EncodingComboBox.Enabled = true;
    102119      ProblemTextBox.Text = openFileDialog.FileName;
    103120      TableFileParser csvParser = new TableFileParser();
     
    125142      try {
    126143        TableFileParser csvParser = new TableFileParser();
     144        csvParser.Encoding = (Encoding)EncodingComboBox.SelectedValue;
    127145        csvParser.Parse(ProblemTextBox.Text,
    128146                        (NumberFormatInfo)DecimalSeparatorComboBox.SelectedValue,
    129147                        (DateTimeFormatInfo)DateTimeFormatComboBox.SelectedValue,
    130148                        (char)SeparatorComboBox.SelectedValue,
    131                         CheckboxColumnNames.Checked);
     149                        CheckboxColumnNames.Checked, lineLimit: 500);
    132150        IEnumerable<string> variableNamesWithType = GetVariableNamesWithType(csvParser);
    133151        PreviewDatasetMatrix.Content = new Dataset(variableNamesWithType, csvParser.Values);
  • stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/Plugin.cs.frame

    r13316 r13974  
    2525  [Plugin("HeuristicLab.Problems.Instances.DataAnalysis.Views", "3.3.13.$WCREV$")]
    2626  [PluginFile("HeuristicLab.Problems.Instances.DataAnalysis.Views-3.3.dll", PluginFileType.Assembly)]
     27  [PluginDependency("HeuristicLab.Common", "3.3")]
    2728  [PluginDependency("HeuristicLab.Common.Resources", "3.3")]
    2829  [PluginDependency("HeuristicLab.Core.Views", "3.3")]
  • stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/RegressionImportTypeDialog.Designer.cs

    r12009 r13974  
    6666      this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableLabel);
    6767      this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableComboBox);
    68       this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 251);
    6968      this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.PreviewLabel, 0);
    7069      this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.ShuffelInfoLabel, 0);
  • stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/RegressionInstanceProviderView.cs

    r12009 r13974  
    2222using System;
    2323using System.IO;
     24using System.Threading.Tasks;
    2425using System.Windows.Forms;
    2526using HeuristicLab.MainForm;
     
    4445      if (importTypeDialog.ShowDialog() == DialogResult.OK) {
    4546        IRegressionProblemData instance = null;
    46         try {
    47           instance = Content.ImportData(importTypeDialog.Path, importTypeDialog.ImportType, importTypeDialog.CSVFormat);
    48         } catch (IOException ex) {
    49           ErrorWhileParsing(ex);
    50           return;
    51         }
    52         try {
    53           GenericConsumer.Load(instance);
    54           instancesComboBox.SelectedIndex = -1;
    55         } catch (IOException ex) {
    56           ErrorWhileLoading(ex, importTypeDialog.Path);
    57         }
     47
     48        Task.Factory.StartNew(() => {
     49          var mainForm = (MainForm.WindowsForms.MainForm)MainFormManager.MainForm;
     50          // lock active view and show progress bar
     51          IContentView activeView = (IContentView)MainFormManager.MainForm.ActiveView;
     52
     53          try {
     54            var progress = mainForm.AddOperationProgressToContent(activeView.Content, "Loading problem instance.");
     55
     56            Content.ProgressChanged += (o, args) => { progress.ProgressValue = args.ProgressPercentage / 100.0; };
     57
     58            instance = Content.ImportData(importTypeDialog.Path, importTypeDialog.ImportType, importTypeDialog.CSVFormat);
     59          } catch (IOException ex) {
     60            ErrorWhileParsing(ex);
     61            mainForm.RemoveOperationProgressFromContent(activeView.Content);
     62            return;
     63          }
     64          try {
     65            GenericConsumer.Load(instance);
     66          } catch (IOException ex) {
     67            ErrorWhileLoading(ex, importTypeDialog.Path);
     68          } finally {
     69            Invoke((Action)(() => instancesComboBox.SelectedIndex = -1));
     70            mainForm.RemoveOperationProgressFromContent(activeView.Content);
     71          }
     72        });
    5873      }
    5974    }
  • stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/TimeSeriesPrognosisImportTypeDialog.Designer.cs

    r12009 r13974  
    5858      this.ShuffleDataCheckbox.Enabled = false;
    5959      this.ShuffleDataCheckbox.Visible = false;
    60       //
    61       // OkButton
    62       //
    63       this.OkButton.Location = new System.Drawing.Point(303, 407);
    64       //
    65       // CancelationButton
    66       //
    67       this.CancelationButton.Location = new System.Drawing.Point(384, 407);
     60
    6861      //
    6962      // ProblemDataSettingsGroupBox
     
    7265      this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableComboBox);
    7366      this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableLabel);
    74       this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 237);
    7567      this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.ShuffleDataCheckbox, 0);
    7668      this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.TargetVariableLabel, 0);
     
    8981      this.ErrorTextBox.Location = new System.Drawing.Point(6, 19);
    9082      this.ErrorTextBox.Size = new System.Drawing.Size(435, 69);
    91       //
    92       // PreviewDatasetMatrix
    93       //
    94       this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 123);
    9583      //
    9684      // SeparatorInfoLabel
     
    114102      // TargetVariableComboBox
    115103      //
    116       this.TargetVariableComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 
     104      this.TargetVariableComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left)
    117105            | System.Windows.Forms.AnchorStyles.Right)));
    118106      this.TargetVariableComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
     
    148136      this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
    149137      this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
    150       this.ClientSize = new System.Drawing.Size(471, 442);
     138      this.ClientSize = new System.Drawing.Size(471, 457);
    151139      this.Name = "TimeSeriesPrognosisImportTypeDialog";
    152140      this.Text = "TimeSeries Prognosis CSV Import";
  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/DataAnalysisInstanceProvider.cs

    r12009 r13974  
    2323using System.Collections;
    2424using System.Collections.Generic;
     25using System.ComponentModel;
    2526using System.Globalization;
    2627using System.IO;
     
    3536    where ImportType : DataAnalysisImportType {
    3637
     38    public event ProgressChangedEventHandler ProgressChanged;
    3739
    3840    public TData ImportData(string path, ImportType type, DataAnalysisCSVFormat csvFormat) {
    3941      TableFileParser csvFileParser = new TableFileParser();
     42      long fileSize = new FileInfo(path).Length;
     43      csvFileParser.ProgressChanged += (sender, e) => {
     44        OnProgressChanged(e / (double)fileSize);
     45      };
    4046      csvFileParser.Parse(path, csvFormat.NumberFormatInfo, csvFormat.DateTimeFormatInfo, csvFormat.Separator, csvFormat.VariableNamesAvailable);
    4147      return ImportData(path, type, csvFileParser);
     48    }
     49
     50    protected virtual void OnProgressChanged(double d) {
     51      var handler = ProgressChanged;
     52      if (handler != null)
     53        handler(this, new ProgressChangedEventArgs((int)(100 * d), null));
    4254    }
    4355
     
    89101        strBuilder.AppendLine();
    90102      }
    91 
    92       using (var writer = new StreamWriter(path)) {
    93         writer.Write(strBuilder);
     103      using (var fileStream = new FileStream(path, FileMode.Create)) {
     104        Encoding encoding = Encoding.GetEncoding(Encoding.Default.CodePage,
     105          new EncoderReplacementFallback("*"),
     106          new DecoderReplacementFallback("*"));
     107        using (var writer = new StreamWriter(fileStream, encoding)) {
     108          writer.Write(strBuilder);
     109        }
    94110      }
    95111    }
  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r12009 r13974  
    2424using System.Collections;
    2525using System.Collections.Generic;
     26using System.Diagnostics.Contracts;
    2627using System.Globalization;
    2728using System.IO;
    2829using System.Linq;
    2930using System.Runtime.Serialization;
     31using System.Text;
    3032
    3133namespace HeuristicLab.Problems.Instances.DataAnalysis {
    32   public class TableFileParser {
     34  public class TableFileParser : Progress<long> { // reports the number of bytes read
    3335    private const int BUFFER_SIZE = 65536;
    3436    // char used to symbolize whitespaces (no missing values can be handled with whitespaces)
     
    3638    private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
    3739    private Tokenizer tokenizer;
    38     private List<List<object>> rowValues;
     40    private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
     41
     42
     43    private Encoding encoding = Encoding.Default;
     44
     45    public Encoding Encoding {
     46      get { return encoding; }
     47      set {
     48        if (value == null) throw new ArgumentNullException("Encoding");
     49        encoding = value;
     50      }
     51    }
     52
    3953
    4054    private int rows;
     
    7286
    7387    public TableFileParser() {
    74       rowValues = new List<List<object>>();
    7588      variableNames = new List<string>();
    7689    }
     
    102115    public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
    103116                                          DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    104       using (StreamReader reader = new StreamReader(stream)) {
     117      using (StreamReader reader = new StreamReader(stream, Encoding)) {
    105118        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    106         return tokenizer.Peek().type != TokenTypeEnum.Double;
     119        return (tokenizer.PeekType() != TokenTypeEnum.Double);
    107120      }
    108121    }
     
    113126    /// <param name="fileName">file which is parsed</param>
    114127    /// <param name="columnNamesInFirstLine"></param>
    115     public void Parse(string fileName, bool columnNamesInFirstLine) {
     128    public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
    116129      NumberFormatInfo numberFormat;
    117130      DateTimeFormatInfo dateTimeFormatInfo;
    118131      char separator;
    119132      DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
    120       Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
     133      EstimateNumberOfLines(fileName);
     134      Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
    121135    }
    122136
     
    129143    /// <param name="separator">defines the separator</param>
    130144    /// <param name="columnNamesInFirstLine"></param>
    131     public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
     145    public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
     146      EstimateNumberOfLines(fileName);
    132147      using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    133         Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
     148        Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
     149      }
     150    }
     151
     152    // determines the number of newline characters in the first 1024 * 1024 characters (the size of the read buffer) to guess the number of rows for a file
     153    private void EstimateNumberOfLines(string fileName) {
     154      var len = new System.IO.FileInfo(fileName).Length;
     155      var buf = new char[1024 * 1024];
     156      using (var reader = new StreamReader(fileName, Encoding)) {
     157        reader.ReadBlock(buf, 0, buf.Length);
     158      }
     159      int numNewLine = 0;
     160      int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
     161      foreach (var ch in buf) {
     162        charsInCurrentLine++;
     163        if (ch == '\n') {
     164          if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
     165          charsInCurrentLine = 0;
     166          numNewLine++;
     167        }
     168      }
     169      if (numNewLine <= 1) {
     170        // fail -> keep the default setting
     171        return;
     172      } else {
     173        double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
     174        double estimatedLines = len / charsPerLineFactor;
     175        estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
    134176      }
    135177    }
     
    140182    /// <param name="stream">stream which is parsed</param>
    141183    /// <param name="columnNamesInFirstLine"></param>
    142     public void Parse(Stream stream, bool columnNamesInFirstLine) {
     184    public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
    143185      NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
    144186      DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    145187      char separator = ',';
    146       Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
     188      Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
    147189    }
    148190
     
    155197    /// <param name="separator">defines the separator</param>
    156198    /// <param name="columnNamesInFirstLine"></param>
    157     public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
    158       using (StreamReader reader = new StreamReader(stream)) {
     199    public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
     200      using (StreamReader reader = new StreamReader(stream, Encoding)) {
    159201        tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    160         // parse the file
    161         Parse(columnNamesInFirstLine);
    162       }
    163 
    164       // translate the list of samples into a DoubleMatrixData item
    165       rows = rowValues.Count;
    166       columns = rowValues[0].Count;
    167       values = new List<IList>();
    168 
    169       //create columns
    170       for (int col = 0; col < columns; col++) {
    171         var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType());
    172         if (!types.Any()) {
    173           values.Add(new List<string>());
    174           continue;
     202        values = new List<IList>();
     203        if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
     204
     205        if (columnNamesInFirstLine) {
     206          ParseVariableNames();
     207          if (!tokenizer.HasNext())
     208            Error(
     209              "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
     210              "", tokenizer.CurrentLineNumber);
    175211        }
    176212
    177         var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
    178         if (columnType == typeof(double)) values.Add(new List<double>());
    179         else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
    180         else if (columnType == typeof(string)) values.Add(new List<string>());
    181         else throw new InvalidOperationException();
    182       }
    183 
    184 
    185 
    186       //fill with values
    187       foreach (List<object> row in rowValues) {
    188         int columnIndex = 0;
    189         foreach (object element in row) {
    190           if (values[columnIndex] is List<double> && !(element is double))
    191             values[columnIndex].Add(double.NaN);
    192           else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
    193             values[columnIndex].Add(DateTime.MinValue);
    194           else if (values[columnIndex] is List<string> && !(element is string))
    195             values[columnIndex].Add(element.ToString());
    196           else
    197             values[columnIndex].Add(element);
    198           columnIndex++;
     213
     214        // read values... start in first row
     215        int nLinesParsed = 0;
     216        int colIdx = 0;
     217        int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables, or -1 to initialize it from the first row of values
     218        while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
     219          if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
     220            tokenizer.Skip();
     221
     222            // all rows have to have the same number of values
     223            // the first row defines how many samples are needed
     224            if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of columns in the first row
     225            else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
     226              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
     227                    "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
     228                    tokenizer.CurrentLineNumber);
     229            }
     230            OnReport(tokenizer.BytesRead);
     231
     232            nLinesParsed++;
     233            colIdx = 0;
     234          } else {
     235            // read one value
     236            TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
     237            tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
     238
     239            // initialize columns on the first row (fixing data types as presented in the first row...)
     240            if (nLinesParsed == 0) {
     241              values.Add(CreateList(type, estimatedNumberOfLines));
     242            } else if (colIdx == values.Count) {
     243              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
     244                    "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
     245                tokenizer.CurrentLineNumber);
     246            }
     247            if (!IsColumnTypeCompatible(values[colIdx], type)) {
     248              values[colIdx] = ConvertToStringColumn(values[colIdx]);
     249            }
     250            // add the value to the column
     251            AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
     252          }
    199253        }
    200       }
    201     }
     254
     255        if (!values.Any() || values.First().Count == 0)
     256          Error("Couldn't parse data values. Probably because of incorrect number format " +
     257                "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
     258      }
     259
     260      this.rows = values.First().Count;
     261      this.columns = values.Count;
     262
     263      // after everything has been parsed make sure the lists are as compact as possible
     264      foreach (var l in values) {
     265        var dblList = l as List<double>;
     266        var byteList = l as List<byte>;
     267        var dateList = l as List<DateTime>;
     268        var stringList = l as List<string>;
     269        var objList = l as List<object>;
     270        if (dblList != null) dblList.TrimExcess();
     271        if (byteList != null) byteList.TrimExcess();
     272        if (dateList != null) dateList.TrimExcess();
     273        if (stringList != null) stringList.TrimExcess();
     274        if (objList != null) objList.TrimExcess();
     275      }
     276
     277      // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is only rarely called, in response to user interaction)
     278      GC.Collect(2, GCCollectionMode.Forced);
     279    }
     280
     281    #region type-dependent dispatch
     282    private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
     283      return (list is List<string>) || // all tokens can be added to a string list
     284             (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns
     285             (tokenType == TokenTypeEnum.Double && list is List<double>) ||
     286             (tokenType == TokenTypeEnum.DateTime && list is List<DateTime>);
     287    }
     288
     289    // a column is converted to a string column when we find a non-empty value that has an incompatible type
     290    private IList ConvertToStringColumn(IList list) {
     291      var dblL = list as List<double>;
     292      if (dblL != null) {
     293        var l = new List<string>(dblL.Capacity);
     294        l.AddRange(dblL.Select(dbl => dbl.ToString()));
     295        return l;
     296      }
     297
     298      var dtL = list as List<DateTime>;
     299      if (dtL != null) {
     300        var l = new List<string>(dtL.Capacity);
     301        l.AddRange(dtL.Select(dbl => dbl.ToString()));
     302        return l;
     303      }
     304
     305      if (list is List<string>) return list;
     306
     307      throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
     308    }
     309
     310    private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
     311      var dblList = list as List<double>;
     312      if (dblList != null) {
     313        AddValue(type, dblList, dblVal);
     314        return;
     315      }
     316
     317      var strList = list as List<string>;
     318      if (strList != null) {
     319        AddValue(type, strList, strVal);
     320        return;
     321      }
     322      var dtList = list as List<DateTime>;
     323      if (dtList != null) {
     324        AddValue(type, dtList, dateTimeVal);
     325        return;
     326      }
     327
     328      list.Add(strVal); // assumes List<object>
     329    }
     330
     331    private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
     332      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double);
     333      list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal);
     334    }
     335
     336    private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
     337      // assumes that strVal is always set to the original token read from the input file
     338      list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal);
     339    }
     340
     341    private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
     342      Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime);
     343      list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal);
     344    }
     345
     346    private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
     347      switch (type) {
     348        case TokenTypeEnum.String:
     349          return new List<string>(estimatedNumberOfLines);
     350        case TokenTypeEnum.Double:
     351        case TokenTypeEnum.Missing: // assume double columns
     352          return new List<double>(estimatedNumberOfLines);
     353        case TokenTypeEnum.DateTime:
     354          return new List<DateTime>(estimatedNumberOfLines);
     355        default:
     356          throw new InvalidOperationException();
     357      }
     358    }
     359    #endregion
    202360
    203361    public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
     
    253411            separator = ',';
    254412          } else {
    255             char[] disallowedSeparators = new char[] { ',' };
     413            char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed too, however existing unit tests would fail
    256414            // German format (real values)
    257415            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
     
    282440
    283441    #region tokenizer
     442    // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
    284443    internal enum TokenTypeEnum {
    285       NewLine, Separator, String, Double, DateTime
    286     }
    287 
    288     internal class Token {
    289       public TokenTypeEnum type;
    290       public string stringValue;
    291       public double doubleValue;
    292       public DateTime dateTimeValue;
    293 
    294       public Token(TokenTypeEnum type, string value) {
    295         this.type = type;
    296         stringValue = value;
    297         dateTimeValue = DateTime.MinValue;
    298         doubleValue = 0.0;
    299       }
    300 
    301       public override string ToString() {
    302         return stringValue;
    303       }
    304     }
    305 
     444      NewLine, String, Double, DateTime, Missing
     445    }
    306446
    307447    internal class Tokenizer {
    308448      private StreamReader reader;
    309       private List<Token> tokens;
     449      // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
     450      private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
     451      private string[] stringVals = new string[1024];
     452      private double[] doubleVals = new double[1024];
     453      private DateTime[] dateTimeVals = new DateTime[1024];
     454      private int tokenPos;
     455      private int numTokens;
    310456      private NumberFormatInfo numberFormatInfo;
    311457      private DateTimeFormatInfo dateTimeFormatInfo;
    312458      private char separator;
    313       private const string INTERNAL_SEPARATOR = "#";
     459
     460      // arrays for string.Split()
     461      private readonly char[] whiteSpaceSeparators = new char[0]; // string.Split treats an empty separator array as "split on white-space characters"
     462      private readonly char[] separators;
    314463
    315464      private int currentLineNumber = 0;
     
    323472        private set { currentLine = value; }
    324473      }
    325 
    326       private Token newlineToken;
    327       public Token NewlineToken {
    328         get { return newlineToken; }
    329         private set { newlineToken = value; }
    330       }
    331       private Token separatorToken;
    332       public Token SeparatorToken {
    333         get { return separatorToken; }
    334         private set { separatorToken = value; }
     474      public long BytesRead {
     475        get;
     476        private set;
    335477      }
    336478
     
    340482        this.dateTimeFormatInfo = dateTimeFormatInfo;
    341483        this.separator = separator;
    342         separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    343         newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    344         tokens = new List<Token>();
     484        this.separators = new char[] { separator };
    345485        ReadNextTokens();
     486      }
     487
     488      public bool HasNext() {
     489        return numTokens > tokenPos || !reader.EndOfStream;
     490      }
     491
     492      public TokenTypeEnum PeekType() {
     493        return tokenTypes[tokenPos];
     494      }
     495
     496      public void Skip() {
     497        // simply skips one token without returning the result values
     498        tokenPos++;
     499        if (numTokens == tokenPos) {
     500          ReadNextTokens();
     501        }
     502      }
     503
     504      public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
     505        type = tokenTypes[tokenPos];
     506        strVal = stringVals[tokenPos];
     507        dblVal = doubleVals[tokenPos];
     508        dateTimeVal = dateTimeVals[tokenPos];
     509        Skip();
    346510      }
    347511
     
    349513        if (!reader.EndOfStream) {
    350514          CurrentLine = reader.ReadLine();
    351           var newTokens = from str in Split(CurrentLine)
    352                           let trimmedStr = str.Trim()
    353                           where !string.IsNullOrEmpty(trimmedStr)
    354                           select MakeToken(trimmedStr);
    355 
    356           tokens.AddRange(newTokens);
    357           tokens.Add(NewlineToken);
    358515          CurrentLineNumber++;
     516          if (reader.BaseStream.CanSeek) {
     517            BytesRead = reader.BaseStream.Position;
     518          } else {
     519            BytesRead += CurrentLine.Length + 2; // guess
     520          }
     521          int i = 0;
     522          if (!string.IsNullOrWhiteSpace(CurrentLine)) {
     523            foreach (var tok in Split(CurrentLine)) {
     524              TokenTypeEnum type;
     525              double doubleVal;
     526              DateTime dateTimeValue;
     527              type = TokenTypeEnum.String; // default
     528              stringVals[i] = tok.Trim();
     529              if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
     530                type = TokenTypeEnum.Double;
     531                doubleVals[i] = doubleVal;
     532              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
     533                type = TokenTypeEnum.DateTime;
     534                dateTimeVals[i] = dateTimeValue;
     535              } else if (string.IsNullOrWhiteSpace(tok)) {
     536                type = TokenTypeEnum.Missing;
     537              }
     538
     539              // couldn't parse the token as an int or float number or datetime value so return a string token
     540
     541              tokenTypes[i] = type;
     542              i++;
     543
     544              if (i >= tokenTypes.Length) {
     545                // increase buffer size if necessary
     546                IncreaseCapacity(ref tokenTypes);
     547                IncreaseCapacity(ref doubleVals);
     548                IncreaseCapacity(ref stringVals);
     549                IncreaseCapacity(ref dateTimeVals);
     550              }
     551            }
     552          }
     553          tokenTypes[i] = TokenTypeEnum.NewLine;
     554          numTokens = i + 1;
     555          tokenPos = 0;
    359556        }
    360557      }
    361558
    362559      private IEnumerable<string> Split(string line) {
    363         IEnumerable<string> splitString;
    364         if (separator == WHITESPACECHAR) {
    365           //separate whitespaces
    366           splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
    367         } else {
    368           splitString = line.Split(separator);
    369         }
    370         int cur = splitString.Count();
    371         foreach (var str in splitString) {
    372           yield return str;
    373           cur--;
    374           // do not return the INTERNAL_SEPARATOR after the last string
    375           if (cur != 0) {
    376             yield return INTERNAL_SEPARATOR;
    377           }
    378         }
    379       }
    380 
    381       private Token MakeToken(string strToken) {
    382         Token token = new Token(TokenTypeEnum.String, strToken);
    383         if (strToken.Equals(INTERNAL_SEPARATOR)) {
    384           return SeparatorToken;
    385         } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
    386           token.type = TokenTypeEnum.Double;
    387           return token;
    388         } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
    389           token.type = TokenTypeEnum.DateTime;
    390           return token;
    391         }
    392 
    393         // couldn't parse the token as an int or float number  or datetime value so return a string token
    394         return token;
    395       }
    396 
    397       public Token Peek() {
    398         return tokens[0];
    399       }
    400 
    401       public Token Next() {
    402         Token next = tokens[0];
    403         tokens.RemoveAt(0);
    404         if (tokens.Count == 0) {
    405           ReadNextTokens();
    406         }
    407         return next;
    408       }
    409 
    410       public bool HasNext() {
    411         return tokens.Count > 0 || !reader.EndOfStream;
     560        return separator == WHITESPACECHAR ?
     561          line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
     562          line.Split(separators);
     563      }
     564
     565      private static void IncreaseCapacity<T>(ref T[] arr) {
     566        int n = (int)Math.Floor(arr.Length * 1.7); // guess
     567        T[] arr2 = new T[n];
     568        Array.Copy(arr, arr2, arr.Length);
     569        arr = arr2;
    412570      }
    413571    }
     
    415573
    416574    #region parsing
    417     private void Parse(bool columnNamesInFirstLine) {
    418       if (columnNamesInFirstLine) {
    419         ParseVariableNames();
    420         if (!tokenizer.HasNext())
    421           Error(
    422             "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
    423             "", tokenizer.CurrentLineNumber);
    424       }
    425       ParseValues();
    426       if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    427     }
    428 
    429     private void ParseValues() {
    430       while (tokenizer.HasNext()) {
    431         if (tokenizer.Peek() == tokenizer.NewlineToken) {
    432           tokenizer.Next();
    433         } else {
    434           List<object> row = new List<object>();
    435           object value = NextValue(tokenizer);
    436           row.Add(value);
    437           while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    438             Expect(tokenizer.SeparatorToken);
    439             row.Add(NextValue(tokenizer));
    440           }
    441           Expect(tokenizer.NewlineToken);
    442           // all rows have to have the same number of values           
    443           // the first row defines how many samples are needed
    444           if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
    445             Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
    446                   "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
    447                   tokenizer.CurrentLineNumber);
    448           }
    449           rowValues.Add(row);
    450         }
    451       }
    452     }
    453 
    454     private object NextValue(Tokenizer tokenizer) {
    455       if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
    456       Token current = tokenizer.Next();
    457       if (current.type == TokenTypeEnum.Separator) {
    458         return double.NaN;
    459       } else if (current.type == TokenTypeEnum.String) {
    460         return current.stringValue;
    461       } else if (current.type == TokenTypeEnum.Double) {
    462         return current.doubleValue;
    463       } else if (current.type == TokenTypeEnum.DateTime) {
    464         return current.dateTimeValue;
    465       }
    466       // found an unexpected token => throw error
    467       Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
    468       // this line is never executed because Error() throws an exception
    469       throw new InvalidOperationException();
    470     }
    471575
    472576    private void ParseVariableNames() {
    473577      // the first line must contain variable names
    474       List<Token> tokens = new List<Token>();
    475       Token valueToken;
    476       valueToken = tokenizer.Next();
    477       tokens.Add(valueToken);
    478       while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    479         Expect(tokenizer.SeparatorToken);
    480         valueToken = tokenizer.Next();
    481         if (valueToken != tokenizer.NewlineToken) {
    482           tokens.Add(valueToken);
    483         }
    484       }
    485       if (valueToken != tokenizer.NewlineToken) {
    486         Expect(tokenizer.NewlineToken);
    487       }
    488       variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
    489     }
    490 
    491     private void Expect(Token expectedToken) {
    492       Token actualToken = tokenizer.Next();
    493       if (actualToken != expectedToken) {
    494         Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
    495       }
     578      List<string> varNames = new List<string>();
     579
     580      TokenTypeEnum type;
     581      string strVal;
     582      double dblVal;
     583      DateTime dateTimeVal;
     584
     585      tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
     586
     587      // the first token must be a variable name
     588      if (type != TokenTypeEnum.String)
     589        throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
     590      varNames.Add(strVal);
     591
     592      while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
     593        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
     594        varNames.Add(strVal);
     595      }
     596      ExpectType(TokenTypeEnum.NewLine);
     597
     598      variableNames = varNames;
     599    }
     600
     601    private void ExpectType(TokenTypeEnum expectedToken) {
     602      if (tokenizer.PeekType() != expectedToken)
     603        throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
     604      tokenizer.Skip();
    496605    }
    497606
  • stable/HeuristicLab.Tests

  • stable/HeuristicLab.Tests/HeuristicLab.Problems.Instances.DataAnalysis-3.3/TableFileParserTest.cs

    r12009 r13974  
    2121
    2222using System;
     23using System.Collections.Generic;
     24using System.Globalization;
    2325using System.IO;
    2426using Microsoft.VisualStudio.TestTools.UnitTesting;
     
    589591    }
    590592
     593
     594    [TestMethod]
     595    [TestCategory("Problems.Instances")]
     596    [TestProperty("Time", "short")]
     597    public void ParseWithColumnTypeConversionDE() {
     598      // If first entry of a column can be parsed as a double we assume all values are doubles.
     599      // However, if any of the following entries cannot be parsed as a double we convert the whole column to a string column.
     600      // Special care needs to be taken with missing values, NaN (n.def.) and infinity values.
     601      // We only support DE-DE and InvariantCulture number formats
     602      string tempFileName = Path.GetTempFileName();
     603      var deCultureInfo = CultureInfo.GetCultureInfo("DE-DE");
     604      WriteToFile(tempFileName,
     605      "str\tdbl\tdbl\tdbl" + Environment.NewLine +
     606      "1,3\t1,3\t0\t3" + Environment.NewLine +
     607      "1,3\t\t0\t0" + Environment.NewLine +
     608      "s\t" + double.NaN.ToString(deCultureInfo) + "\t0\t0" + Environment.NewLine + // double.NaN might have a different string representation on different systems (even when using the same CultureInfo)
     609      "s\t" + double.PositiveInfinity.ToString(deCultureInfo) + "\t0\t0" + Environment.NewLine +
     610      "s\t" + double.NegativeInfinity.ToString(deCultureInfo) + "\t0\t0" + Environment.NewLine +
     611      "s\t0\t0\t0");
     612      TableFileParser parser = new TableFileParser();
     613      try {
     614        parser.Parse(tempFileName,
     615          deCultureInfo.NumberFormat,
     616          deCultureInfo.DateTimeFormat,
     617          '\t',
     618          true);
     619        Assert.AreEqual(6, parser.Rows);
     620        Assert.AreEqual(4, parser.Columns);
     621        Assert.IsTrue(parser.Values[0] is List<string>);
     622        Assert.IsTrue(parser.Values[1] is List<double>);
     623        Assert.IsTrue(parser.Values[2] is List<double>);
     624        Assert.IsTrue(parser.Values[3] is List<double>);
     625        Assert.IsTrue(double.IsNaN((double)parser.Values[1][1])); // missing value
     626        Assert.IsTrue(double.IsNaN((double)parser.Values[1][2]));
     627        Assert.IsTrue(double.IsPositiveInfinity((double)parser.Values[1][3])); // NOTE: in DE-DE NumberFormat just "unendlich" is not allowed (compare with InvariantCulture)
     628        Assert.IsTrue(double.IsNegativeInfinity((double)parser.Values[1][4]));
     629      } finally {
     630        File.Delete(tempFileName);
     631      }
     632    }
     633
     634    [TestMethod]
     635    [TestCategory("Problems.Instances")]
     636    [TestProperty("Time", "short")]
     637    public void ParseWithColumnTypeConversionInvariant() {
     638      // see ParseWithColumnTypeConversionDE above
     639      // same routine only using invariant culture
     640      string tempFileName = Path.GetTempFileName();
     641      WriteToFile(tempFileName,
     642      @"str,dbl,dbl,dbl
     6431.3,1.3,0,3
     6441.3,,0,0
     645s,NaN,0,0
     646s,Infinity,0,0
     647s,-Infinity,0,0
     648s,0,0,0");
     649      TableFileParser parser = new TableFileParser();
     650      try {
     651        parser.Parse(tempFileName,
     652          CultureInfo.InvariantCulture.NumberFormat,
     653          CultureInfo.InvariantCulture.DateTimeFormat,
     654          ',',
     655          parser.AreColumnNamesInFirstLine(tempFileName));
     656        Assert.AreEqual(6, parser.Rows);
     657        Assert.AreEqual(4, parser.Columns);
     658        Assert.IsTrue(parser.Values[0] is List<string>);
     659        Assert.IsTrue(parser.Values[1] is List<double>);
     660        Assert.IsTrue(parser.Values[2] is List<double>);
     661        Assert.IsTrue(parser.Values[3] is List<double>);
     662        Assert.IsTrue(double.IsNaN((double)parser.Values[1][1])); // missing value
     663        Assert.IsTrue(double.IsNaN((double)parser.Values[1][2]));
     664        Assert.IsTrue(double.IsPositiveInfinity((double)parser.Values[1][3])); // NOTE: in InvariantCulture +Infinity is not allowed (compare with DE-DE)
     665        Assert.IsTrue(double.IsNegativeInfinity((double)parser.Values[1][4]));
     666      } finally {
     667        File.Delete(tempFileName);
     668      }
     669    }
     670
    591671    private void WriteToFile(string fileName, string content) {
    592672      using (StreamWriter writer = new StreamWriter(fileName)) {
Note: See TracChangeset for help on using the changeset viewer.