Changeset 13974
- Timestamp:
- 07/02/16 08:15:07 (8 years ago)
- Location:
- stable
- Files:
-
- 16 edited
Legend:
- Unmodified
- Added
- Removed
-
stable
- Property svn:mergeinfo changed
/trunk/sources merged: 13411,13413-13415,13419,13440-13442,13445,13447,13525-13526,13529,13584,13901,13925
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Problems.DataAnalysis
- Property svn:mergeinfo changed
/trunk/sources/HeuristicLab.Problems.DataAnalysis merged: 13419
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Problems.DataAnalysis/3.4/Dataset.cs
r13949 r13974 52 52 } 53 53 54 /// <summary> 55 /// Creates a new dataset. The variableValues are not cloned. 56 /// </summary> 57 /// <param name="variableNames">The names of the variables in the dataset</param> 58 /// <param name="variableValues">The values for the variables (column-oriented storage). Values are not cloned!</param> 54 59 public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) 55 60 : base() { … … 75 80 for (int i = 0; i < this.variableNames.Count; i++) { 76 81 var values = variableValues.ElementAt(i); 77 IList clonedValues = null; 78 if (values is IList<double>) 79 clonedValues = new List<double>(values.Cast<double>()); 80 else if (values is IList<string>) 81 clonedValues = new List<string>(values.Cast<string>()); 82 else if (values is IList<DateTime>) 83 clonedValues = new List<DateTime>(values.Cast<DateTime>()); 84 else { 85 this.variableNames = new List<string>(); 86 this.variableValues = new Dictionary<string, IList>(); 87 throw new ArgumentException("The variable values must be of type IList<double>, IList<string> or IList<DateTime>"); 88 } 89 this.variableValues.Add(this.variableNames[i], clonedValues); 82 this.variableValues.Add(this.variableNames[i], values); 90 83 } 91 84 } -
stable/HeuristicLab.Problems.Instances.DataAnalysis
- Property svn:mergeinfo changed
/trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis merged: 13411,13413-13414,13440-13442,13445,13447,13526,13584,13901,13925
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Problems.Instances.DataAnalysis.Views
- Property svn:mergeinfo changed
/trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis.Views merged: 13413-13415,13441,13584
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/ClassificationImportTypeDialog.Designer.cs
r12009 r13974 87 87 this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableComboBox); 88 88 this.ProblemDataSettingsGroupBox.Controls.Add(this.UniformDistributionOfClassesCheckbox); 89 this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 312);89 this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 285); 90 90 this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.UniformDistributionOfClassesCheckbox, 0); 91 91 this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.TargetVariableComboBox, 0); … … 110 110 // 111 111 this.PreviewDatasetMatrix.Location = new System.Drawing.Point(6, 134); 112 this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 1 72);112 this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 145); 113 113 // 114 114 // PreviewLabel … … 118 118 // TargetVariableComboBox 119 119 // 120 this.TargetVariableComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 120 this.TargetVariableComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 121 121 | System.Windows.Forms.AnchorStyles.Right))); 122 122 this.TargetVariableComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList; -
stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/DataAnalysisImportTypeDialog.Designer.cs
r12009 r13974 63 63 this.SeparatorComboBox = new System.Windows.Forms.ComboBox(); 64 64 this.CSVSettingsGroupBox = new System.Windows.Forms.GroupBox(); 65 this.EncodingInfoLabel = new System.Windows.Forms.Label(); 66 this.EncodingLabel = new System.Windows.Forms.Label(); 67 this.EncodingComboBox = new System.Windows.Forms.ComboBox(); 68 this.CheckboxColumnNames = new System.Windows.Forms.CheckBox(); 65 69 this.DateTimeFormatInfoLabel = new System.Windows.Forms.Label(); 66 70 this.DecimalSeparatorInfoLabel = new System.Windows.Forms.Label(); … … 72 76 this.PreviewDatasetMatrix = new HeuristicLab.Data.Views.StringConvertibleMatrixView(); 73 77 this.ToolTip = new System.Windows.Forms.ToolTip(this.components); 74 this.CheckboxColumnNames = new System.Windows.Forms.CheckBox();75 78 ((System.ComponentModel.ISupportInitialize)(this.TrainingTestTrackBar)).BeginInit(); 76 79 this.CSVSettingsGroupBox.SuspendLayout(); … … 249 252 this.CSVSettingsGroupBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 250 253 | System.Windows.Forms.AnchorStyles.Right))); 254 this.CSVSettingsGroupBox.Controls.Add(this.EncodingInfoLabel); 255 this.CSVSettingsGroupBox.Controls.Add(this.EncodingLabel); 256 this.CSVSettingsGroupBox.Controls.Add(this.EncodingComboBox); 251 257 this.CSVSettingsGroupBox.Controls.Add(this.CheckboxColumnNames); 252 258 this.CSVSettingsGroupBox.Controls.Add(this.DateTimeFormatInfoLabel); … … 261 267 this.CSVSettingsGroupBox.Location = new System.Drawing.Point(12, 32); 262 268 this.CSVSettingsGroupBox.Name = "CSVSettingsGroupBox"; 263 this.CSVSettingsGroupBox.Size = new System.Drawing.Size(447, 1 26);269 this.CSVSettingsGroupBox.Size = new System.Drawing.Size(447, 153); 264 270 this.CSVSettingsGroupBox.TabIndex = 16; 265 271 this.CSVSettingsGroupBox.TabStop = false; 266 272 this.CSVSettingsGroupBox.Text = "CSV Settings"; 273 // 274 // EncodingInfoLabel 275 // 276 this.EncodingInfoLabel.Anchor 
= ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right))); 277 this.EncodingInfoLabel.Image = HeuristicLab.Common.Resources.VSImageLibrary.Information; 278 this.EncodingInfoLabel.Location = new System.Drawing.Point(421, 102); 279 this.EncodingInfoLabel.Name = "EncodingInfoLabel"; 280 this.EncodingInfoLabel.Size = new System.Drawing.Size(16, 16); 281 this.EncodingInfoLabel.TabIndex = 27; 282 this.EncodingInfoLabel.Tag = "Select the encoding the file was saved with."; 283 this.ToolTip.SetToolTip(this.EncodingInfoLabel, "Select the encoding the file was saved with."); 284 this.EncodingInfoLabel.DoubleClick += new System.EventHandler(this.ControlToolTip_DoubleClick); 285 // 286 // EncodingLabel 287 // 288 this.EncodingLabel.AutoSize = true; 289 this.EncodingLabel.Location = new System.Drawing.Point(6, 103); 290 this.EncodingLabel.Name = "EncodingLabel"; 291 this.EncodingLabel.Size = new System.Drawing.Size(52, 13); 292 this.EncodingLabel.TabIndex = 26; 293 this.EncodingLabel.Text = "Encoding"; 294 // 295 // EncodingComboBox 296 // 297 this.EncodingComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 298 | System.Windows.Forms.AnchorStyles.Right))); 299 this.EncodingComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList; 300 this.EncodingComboBox.Enabled = false; 301 this.EncodingComboBox.FormattingEnabled = true; 302 this.EncodingComboBox.Location = new System.Drawing.Point(111, 100); 303 this.EncodingComboBox.Name = "EncodingComboBox"; 304 this.EncodingComboBox.Size = new System.Drawing.Size(300, 21); 305 this.EncodingComboBox.TabIndex = 25; 306 this.EncodingComboBox.SelectionChangeCommitted += new System.EventHandler(this.CSVFormatComboBoxSelectionChangeCommitted); 307 // 308 // CheckboxColumnNames 309 // 310 this.CheckboxColumnNames.AutoSize = true; 311 this.CheckboxColumnNames.Location = new 
System.Drawing.Point(9, 127); 312 this.CheckboxColumnNames.Name = "CheckboxColumnNames"; 313 this.CheckboxColumnNames.Size = new System.Drawing.Size(144, 17); 314 this.CheckboxColumnNames.TabIndex = 24; 315 this.CheckboxColumnNames.Text = "Column names in first line"; 316 this.CheckboxColumnNames.UseVisualStyleBackColor = true; 317 this.CheckboxColumnNames.CheckedChanged += new System.EventHandler(this.CheckboxColumnNames_CheckedChanged); 267 318 // 268 319 // DateTimeFormatInfoLabel … … 315 366 this.ProblemDataSettingsGroupBox.Controls.Add(this.TrainingTestTrackBar); 316 367 this.ProblemDataSettingsGroupBox.Controls.Add(this.ShuffleDataCheckbox); 317 this.ProblemDataSettingsGroupBox.Location = new System.Drawing.Point(12, 1 64);368 this.ProblemDataSettingsGroupBox.Location = new System.Drawing.Point(12, 191); 318 369 this.ProblemDataSettingsGroupBox.Name = "ProblemDataSettingsGroupBox"; 319 this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 2 52);370 this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 225); 320 371 this.ProblemDataSettingsGroupBox.TabIndex = 17; 321 372 this.ProblemDataSettingsGroupBox.TabStop = false; … … 365 416 this.PreviewDatasetMatrix.ShowRowsAndColumnsTextBox = false; 366 417 this.PreviewDatasetMatrix.ShowStatisticalInformation = false; 367 this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 1 38);418 this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 111); 368 419 this.PreviewDatasetMatrix.TabIndex = 0; 369 //370 // CheckboxColumnNames371 //372 this.CheckboxColumnNames.AutoSize = true;373 this.CheckboxColumnNames.Location = new System.Drawing.Point(9, 103);374 this.CheckboxColumnNames.Name = "CheckboxColumnNames";375 this.CheckboxColumnNames.Size = new System.Drawing.Size(144, 17);376 this.CheckboxColumnNames.TabIndex = 24;377 this.CheckboxColumnNames.Text = "Column names in first line";378 this.CheckboxColumnNames.UseVisualStyleBackColor = true;379 
this.CheckboxColumnNames.CheckedChanged += new System.EventHandler(this.CheckboxColumnNames_CheckedChanged);380 420 // 381 421 // DataAnalysisImportTypeDialog … … 437 477 protected System.Windows.Forms.ToolTip ToolTip; 438 478 private System.Windows.Forms.CheckBox CheckboxColumnNames; 479 protected System.Windows.Forms.Label EncodingInfoLabel; 480 protected System.Windows.Forms.Label EncodingLabel; 481 protected System.Windows.Forms.ComboBox EncodingComboBox; 439 482 } 440 483 } -
stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/DataAnalysisImportTypeDialog.cs
r12009 r13974 25 25 using System.IO; 26 26 using System.Linq; 27 using System.Text; 27 28 using System.Windows.Forms; 28 29 using HeuristicLab.Core.Views; … … 32 33 public partial class DataAnalysisImportTypeDialog : Form { 33 34 34 public static readonly List<KeyValuePair<DateTimeFormatInfo, string>> dateTimeFormats = new List<KeyValuePair<DateTimeFormatInfo, string>>{ 35 new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")), "dd/mm/yyyy hh:MM:ss" ), 36 new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "mm/dd/yyyy hh:MM:ss" ), 37 new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "yyyy/mm/dd hh:MM:ss" ), 38 new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "mm/yyyy/dd hh:MM:ss" ) 35 private static readonly List<KeyValuePair<DateTimeFormatInfo, string>> dateTimeFormats = 36 new List<KeyValuePair<DateTimeFormatInfo, string>>{ 37 new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")), "dd/mm/yyyy hh:MM:ss" ), 38 new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "mm/dd/yyyy hh:MM:ss" ), 39 new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "yyyy/mm/dd hh:MM:ss" ), 40 new KeyValuePair<DateTimeFormatInfo, string>(DateTimeFormatInfo.InvariantInfo, "mm/yyyy/dd hh:MM:ss" ) 39 41 }; 40 42 41 public static readonly List<KeyValuePair<char, string>> POSSIBLE_SEPARATORS = new List<KeyValuePair<char, string>>{ 42 new KeyValuePair<char, string>(';', "; (Semicolon)" ), 43 new KeyValuePair<char, string>(',', ", (Comma)" ), 44 new KeyValuePair<char, string>('\t', "\\t (Tab)"), 45 new KeyValuePair<char, string>((char)0, "all whitespaces (including tabs and spaces)") 43 private static readonly List<KeyValuePair<char, string>> POSSIBLE_SEPARATORS = 44 new List<KeyValuePair<char, string>>{ 45 new KeyValuePair<char, string>(';', "; (Semicolon)" ), 46 new 
KeyValuePair<char, string>(',', ", (Comma)" ), 47 new KeyValuePair<char, string>('\t', "\\t (Tab)"), 48 new KeyValuePair<char, string>((char)0, "all whitespaces (including tabs and spaces)") 46 49 }; 47 50 48 public static readonly List<KeyValuePair<NumberFormatInfo, string>> POSSIBLE_DECIMAL_SEPARATORS = new List<KeyValuePair<NumberFormatInfo, string>>{ 49 new KeyValuePair<NumberFormatInfo, string>(NumberFormatInfo.GetInstance(new CultureInfo("de-DE")), ", (Comma)"), 50 new KeyValuePair<NumberFormatInfo, string>(NumberFormatInfo.InvariantInfo, ". (Period)" ) 51 private static readonly List<KeyValuePair<NumberFormatInfo, string>> POSSIBLE_DECIMAL_SEPARATORS = 52 new List<KeyValuePair<NumberFormatInfo, string>>{ 53 new KeyValuePair<NumberFormatInfo, string>(NumberFormatInfo.GetInstance(new CultureInfo("de-DE")), ", (Comma)"), 54 new KeyValuePair<NumberFormatInfo, string>(NumberFormatInfo.InvariantInfo, ". (Period)" ) 51 55 }; 56 57 private static readonly List<KeyValuePair<Encoding, string>> POSSIBLE_ENCODINGS = 58 new List<KeyValuePair<Encoding, string>> { 59 new KeyValuePair<Encoding, string>(Encoding.Default, "Default"), 60 new KeyValuePair<Encoding, string>(Encoding.ASCII, "ASCII"), 61 new KeyValuePair<Encoding, string>(Encoding.Unicode, "Unicode"), 62 new KeyValuePair<Encoding, string>(Encoding.UTF8, "UTF8") 63 }; 52 64 53 65 public string Path { … … 87 99 DateTimeFormatComboBox.ValueMember = "Key"; 88 100 DateTimeFormatComboBox.DisplayMember = "Value"; 101 EncodingComboBox.DataSource = POSSIBLE_ENCODINGS; 102 EncodingComboBox.ValueMember = "Key"; 103 EncodingComboBox.DisplayMember = "Value"; 104 89 105 } 90 106 … … 100 116 DecimalSeparatorComboBox.Enabled = true; 101 117 DateTimeFormatComboBox.Enabled = true; 118 EncodingComboBox.Enabled = true; 102 119 ProblemTextBox.Text = openFileDialog.FileName; 103 120 TableFileParser csvParser = new TableFileParser(); … … 125 142 try { 126 143 TableFileParser csvParser = new TableFileParser(); 144 csvParser.Encoding = 
(Encoding)EncodingComboBox.SelectedValue; 127 145 csvParser.Parse(ProblemTextBox.Text, 128 146 (NumberFormatInfo)DecimalSeparatorComboBox.SelectedValue, 129 147 (DateTimeFormatInfo)DateTimeFormatComboBox.SelectedValue, 130 148 (char)SeparatorComboBox.SelectedValue, 131 CheckboxColumnNames.Checked );149 CheckboxColumnNames.Checked, lineLimit: 500); 132 150 IEnumerable<string> variableNamesWithType = GetVariableNamesWithType(csvParser); 133 151 PreviewDatasetMatrix.Content = new Dataset(variableNamesWithType, csvParser.Values); -
stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/Plugin.cs.frame
r13316 r13974 25 25 [Plugin("HeuristicLab.Problems.Instances.DataAnalysis.Views", "3.3.13.$WCREV$")] 26 26 [PluginFile("HeuristicLab.Problems.Instances.DataAnalysis.Views-3.3.dll", PluginFileType.Assembly)] 27 [PluginDependency("HeuristicLab.Common", "3.3")] 27 28 [PluginDependency("HeuristicLab.Common.Resources", "3.3")] 28 29 [PluginDependency("HeuristicLab.Core.Views", "3.3")] -
stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/RegressionImportTypeDialog.Designer.cs
r12009 r13974 66 66 this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableLabel); 67 67 this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableComboBox); 68 this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 251);69 68 this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.PreviewLabel, 0); 70 69 this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.ShuffelInfoLabel, 0); -
stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/RegressionInstanceProviderView.cs
r12009 r13974 22 22 using System; 23 23 using System.IO; 24 using System.Threading.Tasks; 24 25 using System.Windows.Forms; 25 26 using HeuristicLab.MainForm; … … 44 45 if (importTypeDialog.ShowDialog() == DialogResult.OK) { 45 46 IRegressionProblemData instance = null; 46 try { 47 instance = Content.ImportData(importTypeDialog.Path, importTypeDialog.ImportType, importTypeDialog.CSVFormat); 48 } catch (IOException ex) { 49 ErrorWhileParsing(ex); 50 return; 51 } 52 try { 53 GenericConsumer.Load(instance); 54 instancesComboBox.SelectedIndex = -1; 55 } catch (IOException ex) { 56 ErrorWhileLoading(ex, importTypeDialog.Path); 57 } 47 48 Task.Factory.StartNew(() => { 49 var mainForm = (MainForm.WindowsForms.MainForm)MainFormManager.MainForm; 50 // lock active view and show progress bar 51 IContentView activeView = (IContentView)MainFormManager.MainForm.ActiveView; 52 53 try { 54 var progress = mainForm.AddOperationProgressToContent(activeView.Content, "Loading problem instance."); 55 56 Content.ProgressChanged += (o, args) => { progress.ProgressValue = args.ProgressPercentage / 100.0; }; 57 58 instance = Content.ImportData(importTypeDialog.Path, importTypeDialog.ImportType, importTypeDialog.CSVFormat); 59 } catch (IOException ex) { 60 ErrorWhileParsing(ex); 61 mainForm.RemoveOperationProgressFromContent(activeView.Content); 62 return; 63 } 64 try { 65 GenericConsumer.Load(instance); 66 } catch (IOException ex) { 67 ErrorWhileLoading(ex, importTypeDialog.Path); 68 } finally { 69 Invoke((Action)(() => instancesComboBox.SelectedIndex = -1)); 70 mainForm.RemoveOperationProgressFromContent(activeView.Content); 71 } 72 }); 58 73 } 59 74 } -
stable/HeuristicLab.Problems.Instances.DataAnalysis.Views/3.3/TimeSeriesPrognosisImportTypeDialog.Designer.cs
r12009 r13974 58 58 this.ShuffleDataCheckbox.Enabled = false; 59 59 this.ShuffleDataCheckbox.Visible = false; 60 // 61 // OkButton 62 // 63 this.OkButton.Location = new System.Drawing.Point(303, 407); 64 // 65 // CancelationButton 66 // 67 this.CancelationButton.Location = new System.Drawing.Point(384, 407); 60 68 61 // 69 62 // ProblemDataSettingsGroupBox … … 72 65 this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableComboBox); 73 66 this.ProblemDataSettingsGroupBox.Controls.Add(this.TargetVariableLabel); 74 this.ProblemDataSettingsGroupBox.Size = new System.Drawing.Size(447, 237);75 67 this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.ShuffleDataCheckbox, 0); 76 68 this.ProblemDataSettingsGroupBox.Controls.SetChildIndex(this.TargetVariableLabel, 0); … … 89 81 this.ErrorTextBox.Location = new System.Drawing.Point(6, 19); 90 82 this.ErrorTextBox.Size = new System.Drawing.Size(435, 69); 91 //92 // PreviewDatasetMatrix93 //94 this.PreviewDatasetMatrix.Size = new System.Drawing.Size(435, 123);95 83 // 96 84 // SeparatorInfoLabel … … 114 102 // TargetVariableComboBox 115 103 // 116 this.TargetVariableComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 104 this.TargetVariableComboBox.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) 117 105 | System.Windows.Forms.AnchorStyles.Right))); 118 106 this.TargetVariableComboBox.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList; … … 148 136 this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F); 149 137 this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font; 150 this.ClientSize = new System.Drawing.Size(471, 4 42);138 this.ClientSize = new System.Drawing.Size(471, 457); 151 139 this.Name = "TimeSeriesPrognosisImportTypeDialog"; 152 140 this.Text = "TimeSeries Prognosis CSV Import"; -
stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/DataAnalysisInstanceProvider.cs
r12009 r13974 23 23 using System.Collections; 24 24 using System.Collections.Generic; 25 using System.ComponentModel; 25 26 using System.Globalization; 26 27 using System.IO; … … 35 36 where ImportType : DataAnalysisImportType { 36 37 38 public event ProgressChangedEventHandler ProgressChanged; 37 39 38 40 public TData ImportData(string path, ImportType type, DataAnalysisCSVFormat csvFormat) { 39 41 TableFileParser csvFileParser = new TableFileParser(); 42 long fileSize = new FileInfo(path).Length; 43 csvFileParser.ProgressChanged += (sender, e) => { 44 OnProgressChanged(e / (double)fileSize); 45 }; 40 46 csvFileParser.Parse(path, csvFormat.NumberFormatInfo, csvFormat.DateTimeFormatInfo, csvFormat.Separator, csvFormat.VariableNamesAvailable); 41 47 return ImportData(path, type, csvFileParser); 48 } 49 50 protected virtual void OnProgressChanged(double d) { 51 var handler = ProgressChanged; 52 if (handler != null) 53 handler(this, new ProgressChangedEventArgs((int)(100 * d), null)); 42 54 } 43 55 … … 89 101 strBuilder.AppendLine(); 90 102 } 91 92 using (var writer = new StreamWriter(path)) { 93 writer.Write(strBuilder); 103 using (var fileStream = new FileStream(path, FileMode.Create)) { 104 Encoding encoding = Encoding.GetEncoding(Encoding.Default.CodePage, 105 new EncoderReplacementFallback("*"), 106 new DecoderReplacementFallback("*")); 107 using (var writer = new StreamWriter(fileStream, encoding)) { 108 writer.Write(strBuilder); 109 } 94 110 } 95 111 } -
stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r12009 r13974 24 24 using System.Collections; 25 25 using System.Collections.Generic; 26 using System.Diagnostics.Contracts; 26 27 using System.Globalization; 27 28 using System.IO; 28 29 using System.Linq; 29 30 using System.Runtime.Serialization; 31 using System.Text; 30 32 31 33 namespace HeuristicLab.Problems.Instances.DataAnalysis { 32 public class TableFileParser {34 public class TableFileParser : Progress<long> { // reports the number of bytes read 33 35 private const int BUFFER_SIZE = 65536; 34 36 // char used to symbolize whitespaces (no missing values can be handled with whitespaces) … … 36 38 private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR }; 37 39 private Tokenizer tokenizer; 38 private List<List<object>> rowValues; 40 private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file 41 42 43 private Encoding encoding = Encoding.Default; 44 45 public Encoding Encoding { 46 get { return encoding; } 47 set { 48 if (value == null) throw new ArgumentNullException("Encoding"); 49 encoding = value; 50 } 51 } 52 39 53 40 54 private int rows; … … 72 86 73 87 public TableFileParser() { 74 rowValues = new List<List<object>>();75 88 variableNames = new List<string>(); 76 89 } … … 102 115 public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat, 103 116 DateTimeFormatInfo dateTimeFormatInfo, char separator) { 104 using (StreamReader reader = new StreamReader(stream )) {117 using (StreamReader reader = new StreamReader(stream, Encoding)) { 105 118 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 106 return tokenizer.Peek().type != TokenTypeEnum.Double;119 return (tokenizer.PeekType() != TokenTypeEnum.Double); 107 120 } 108 121 } … … 113 126 /// <param name="fileName">file which is parsed</param> 114 127 /// <param name="columnNamesInFirstLine"></param> 115 public void Parse(string fileName, bool 
columnNamesInFirstLine ) {128 public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) { 116 129 NumberFormatInfo numberFormat; 117 130 DateTimeFormatInfo dateTimeFormatInfo; 118 131 char separator; 119 132 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 120 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine); 133 EstimateNumberOfLines(fileName); 134 Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 121 135 } 122 136 … … 129 143 /// <param name="separator">defines the separator</param> 130 144 /// <param name="columnNamesInFirstLine"></param> 131 public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) { 145 public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) { 146 EstimateNumberOfLines(fileName); 132 147 using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) { 133 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine); 148 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 149 } 150 } 151 152 // determines the number of newline characters in the first 64KB to guess the number of rows for a file 153 private void EstimateNumberOfLines(string fileName) { 154 var len = new System.IO.FileInfo(fileName).Length; 155 var buf = new char[1024 * 1024]; 156 using (var reader = new StreamReader(fileName, Encoding)) { 157 reader.ReadBlock(buf, 0, buf.Length); 158 } 159 int numNewLine = 0; 160 int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the 
last line (incomplete) are not representative 161 foreach (var ch in buf) { 162 charsInCurrentLine++; 163 if (ch == '\n') { 164 if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line 165 charsInCurrentLine = 0; 166 numNewLine++; 167 } 168 } 169 if (numNewLine <= 1) { 170 // fail -> keep the default setting 171 return; 172 } else { 173 double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1); 174 double estimatedLines = len / charsPerLineFactor; 175 estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough 134 176 } 135 177 } … … 140 182 /// <param name="stream">stream which is parsed</param> 141 183 /// <param name="columnNamesInFirstLine"></param> 142 public void Parse(Stream stream, bool columnNamesInFirstLine ) {184 public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) { 143 185 NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo; 144 186 DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 145 187 char separator = ','; 146 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine );188 Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit); 147 189 } 148 190 … … 155 197 /// <param name="separator">defines the separator</param> 156 198 /// <param name="columnNamesInFirstLine"></param> 157 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine ) {158 using (StreamReader reader = new StreamReader(stream )) {199 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) { 200 using (StreamReader reader = new StreamReader(stream, Encoding)) { 159 201 
tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 160 // parse the file 161 Parse(columnNamesInFirstLine); 162 } 163 164 // translate the list of samples into a DoubleMatrixData item 165 rows = rowValues.Count; 166 columns = rowValues[0].Count; 167 values = new List<IList>(); 168 169 //create columns 170 for (int col = 0; col < columns; col++) { 171 var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(100).Select(v => v.GetType()); 172 if (!types.Any()) { 173 values.Add(new List<string>()); 174 continue; 202 values = new List<IList>(); 203 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 204 205 if (columnNamesInFirstLine) { 206 ParseVariableNames(); 207 if (!tokenizer.HasNext()) 208 Error( 209 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 210 "", tokenizer.CurrentLineNumber); 175 211 } 176 212 177 var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key; 178 if (columnType == typeof(double)) values.Add(new List<double>()); 179 else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>()); 180 else if (columnType == typeof(string)) values.Add(new List<string>()); 181 else throw new InvalidOperationException(); 182 } 183 184 185 186 //fill with values 187 foreach (List<object> row in rowValues) { 188 int columnIndex = 0; 189 foreach (object element in row) { 190 if (values[columnIndex] is List<double> && !(element is double)) 191 values[columnIndex].Add(double.NaN); 192 else if (values[columnIndex] is List<DateTime> && !(element is DateTime)) 193 values[columnIndex].Add(DateTime.MinValue); 194 else if (values[columnIndex] is List<string> && !(element is string)) 195 values[columnIndex].Add(element.ToString()); 196 else 197 values[columnIndex].Add(element); 198 columnIndex++; 213 214 // read values... 
start in first row 215 int nLinesParsed = 0; 216 int colIdx = 0; 217 int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or initialize based on first row of values (-1) 218 while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { 219 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { 220 tokenizer.Skip(); 221 222 // all rows have to have the same number of values 223 // the first row defines how many samples are needed 224 if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of columns in the first row 225 else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines) 226 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine + 227 "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "", 228 tokenizer.CurrentLineNumber); 229 } 230 OnReport(tokenizer.BytesRead); 231 232 nLinesParsed++; 233 colIdx = 0; 234 } else { 235 // read one value 236 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 237 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 238 239 // initialize columns on the first row (fixing data types as presented in the first row...) 240 if (nLinesParsed == 0) { 241 values.Add(CreateList(type, estimatedNumberOfLines)); 242 } else if (colIdx == values.Count) { 243 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." 
+ Environment.NewLine + 244 "Line " + tokenizer.CurrentLineNumber + " has more columns.", "", 245 tokenizer.CurrentLineNumber); 246 } 247 if (!IsColumnTypeCompatible(values[colIdx], type)) { 248 values[colIdx] = ConvertToStringColumn(values[colIdx]); 249 } 250 // add the value to the column 251 AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal); 252 } 199 253 } 200 } 201 } 254 255 if (!values.Any() || values.First().Count == 0) 256 Error("Couldn't parse data values. Probably because of incorrect number format " + 257 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 258 } 259 260 this.rows = values.First().Count; 261 this.columns = values.Count; 262 263 // after everything has been parsed make sure the lists are as compact as possible 264 foreach (var l in values) { 265 var dblList = l as List<double>; 266 var byteList = l as List<byte>; 267 var dateList = l as List<DateTime>; 268 var stringList = l as List<string>; 269 var objList = l as List<object>; 270 if (dblList != null) dblList.TrimExcess(); 271 if (byteList != null) byteList.TrimExcess(); 272 if (dateList != null) dateList.TrimExcess(); 273 if (stringList != null) stringList.TrimExcess(); 274 if (objList != null) objList.TrimExcess(); 275 } 276 277 // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction) 278 GC.Collect(2, GCCollectionMode.Forced); 279 } 280 281 #region type-dependent dispatch 282 private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) { 283 return (list is List<string>) || // all tokens can be added to a string list 284 (tokenType == TokenTypeEnum.Missing) || // empty entries are allowed in all columns 285 (tokenType == TokenTypeEnum.Double && list is List<double>) || 286 (tokenType == TokenTypeEnum.DateTime && list is List<DateTime>); 287 } 288 289 // all columns are converted to string columns when we find an 
non-empty value that has incorrect type 290 private IList ConvertToStringColumn(IList list) { 291 var dblL = list as List<double>; 292 if (dblL != null) { 293 var l = new List<string>(dblL.Capacity); 294 l.AddRange(dblL.Select(dbl => dbl.ToString())); 295 return l; 296 } 297 298 var dtL = list as List<DateTime>; 299 if (dtL != null) { 300 var l = new List<string>(dtL.Capacity); 301 l.AddRange(dtL.Select(dbl => dbl.ToString())); 302 return l; 303 } 304 305 if (list is List<string>) return list; 306 307 throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType())); 308 } 309 310 private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) { 311 var dblList = list as List<double>; 312 if (dblList != null) { 313 AddValue(type, dblList, dblVal); 314 return; 315 } 316 317 var strList = list as List<string>; 318 if (strList != null) { 319 AddValue(type, strList, strVal); 320 return; 321 } 322 var dtList = list as List<DateTime>; 323 if (dtList != null) { 324 AddValue(type, dtList, dateTimeVal); 325 return; 326 } 327 328 list.Add(strVal); // assumes List<object> 329 } 330 331 private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) { 332 Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double); 333 list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal); 334 } 335 336 private void AddValue(TokenTypeEnum type, List<string> list, string strVal) { 337 // assumes that strVal is always set to the original token read from the input file 338 list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal); 339 } 340 341 private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) { 342 Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime); 343 list.Add(type == TokenTypeEnum.Missing ? 
DateTime.MinValue : dtVal); 344 } 345 346 private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) { 347 switch (type) { 348 case TokenTypeEnum.String: 349 return new List<string>(estimatedNumberOfLines); 350 case TokenTypeEnum.Double: 351 case TokenTypeEnum.Missing: // assume double columns 352 return new List<double>(estimatedNumberOfLines); 353 case TokenTypeEnum.DateTime: 354 return new List<DateTime>(estimatedNumberOfLines); 355 default: 356 throw new InvalidOperationException(); 357 } 358 } 359 #endregion 202 360 203 361 public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) { … … 253 411 separator = ','; 254 412 } else { 255 char[] disallowedSeparators = new char[] { ',' }; 413 char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed too; however, existing unit tests would fail 256 414 // German format (real values) 257 415 numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")); … … 282 440 283 441 #region tokenizer 442 // the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character 284 443 internal enum TokenTypeEnum { 285 NewLine, Separator, String, Double, DateTime 286 } 287 288 internal class Token { 289 public TokenTypeEnum type; 290 public string stringValue; 291 public double doubleValue; 292 public DateTime dateTimeValue; 293 294 public Token(TokenTypeEnum type, string value) { 295 this.type = type; 296 stringValue = value; 297 dateTimeValue = DateTime.MinValue; 298 doubleValue = 0.0; 299 } 300 301 public override string ToString() { 302 return stringValue; 303 } 304 } 305 444 NewLine, String, Double, DateTime, Missing 445 } 306 446 307 447 internal class Tokenizer { 308 448 private StreamReader reader; 309 private List<Token> tokens; 449 // we assume that a buffer of 1024 tokens for a line is sufficient most of the time 
(the buffer is increased below if necessary) 450 private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024]; 451 private string[] stringVals = new string[1024]; 452 private double[] doubleVals = new double[1024]; 453 private DateTime[] dateTimeVals = new DateTime[1024]; 454 private int tokenPos; 455 private int numTokens; 310 456 private NumberFormatInfo numberFormatInfo; 311 457 private DateTimeFormatInfo dateTimeFormatInfo; 312 458 private char separator; 313 private const string INTERNAL_SEPARATOR = "#"; 459 460 // arrays for string.Split() 461 private readonly char[] whiteSpaceSeparators = new char[0]; // string.Split treats white-space characters as separators when given an empty array 462 private readonly char[] separators; 314 463 315 464 private int currentLineNumber = 0; … … 323 472 private set { currentLine = value; } 324 473 } 325 326 private Token newlineToken; 327 public Token NewlineToken { 328 get { return newlineToken; } 329 private set { newlineToken = value; } 330 } 331 private Token separatorToken; 332 public Token SeparatorToken { 333 get { return separatorToken; } 334 private set { separatorToken = value; } 474 public long BytesRead { 475 get; 476 private set; 335 477 } 336 478 … … 340 482 this.dateTimeFormatInfo = dateTimeFormatInfo; 341 483 this.separator = separator; 342 separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR); 343 newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine); 344 tokens = new List<Token>(); 484 this.separators = new char[] { separator }; 345 485 ReadNextTokens(); 486 } 487 488 public bool HasNext() { 489 return numTokens > tokenPos || !reader.EndOfStream; 490 } 491 492 public TokenTypeEnum PeekType() { 493 return tokenTypes[tokenPos]; 494 } 495 496 public void Skip() { 497 // simply skips one token without returning the result values 498 tokenPos++; 499 if (numTokens == tokenPos) { 500 ReadNextTokens(); 501 } 502 } 503 504 public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) 
{ 505 type = tokenTypes[tokenPos]; 506 strVal = stringVals[tokenPos]; 507 dblVal = doubleVals[tokenPos]; 508 dateTimeVal = dateTimeVals[tokenPos]; 509 Skip(); 346 510 } 347 511 … … 349 513 if (!reader.EndOfStream) { 350 514 CurrentLine = reader.ReadLine(); 351 var newTokens = from str in Split(CurrentLine)352 let trimmedStr = str.Trim()353 where !string.IsNullOrEmpty(trimmedStr)354 select MakeToken(trimmedStr);355 356 tokens.AddRange(newTokens);357 tokens.Add(NewlineToken);358 515 CurrentLineNumber++; 516 if (reader.BaseStream.CanSeek) { 517 BytesRead = reader.BaseStream.Position; 518 } else { 519 BytesRead += CurrentLine.Length + 2; // guess 520 } 521 int i = 0; 522 if (!string.IsNullOrWhiteSpace(CurrentLine)) { 523 foreach (var tok in Split(CurrentLine)) { 524 TokenTypeEnum type; 525 double doubleVal; 526 DateTime dateTimeValue; 527 type = TokenTypeEnum.String; // default 528 stringVals[i] = tok.Trim(); 529 if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) { 530 type = TokenTypeEnum.Double; 531 doubleVals[i] = doubleVal; 532 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) { 533 type = TokenTypeEnum.DateTime; 534 dateTimeVals[i] = dateTimeValue; 535 } else if (string.IsNullOrWhiteSpace(tok)) { 536 type = TokenTypeEnum.Missing; 537 } 538 539 // couldn't parse the token as an int or float number or datetime value so return a string token 540 541 tokenTypes[i] = type; 542 i++; 543 544 if (i >= tokenTypes.Length) { 545 // increase buffer size if necessary 546 IncreaseCapacity(ref tokenTypes); 547 IncreaseCapacity(ref doubleVals); 548 IncreaseCapacity(ref stringVals); 549 IncreaseCapacity(ref dateTimeVals); 550 } 551 } 552 } 553 tokenTypes[i] = TokenTypeEnum.NewLine; 554 numTokens = i + 1; 555 tokenPos = 0; 359 556 } 360 557 } 361 558 362 559 private IEnumerable<string> Split(string line) { 363 IEnumerable<string> splitString; 364 if (separator == WHITESPACECHAR) { 365 //separate whitespaces 
366 splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries); 367 } else { 368 splitString = line.Split(separator); 369 } 370 int cur = splitString.Count(); 371 foreach (var str in splitString) { 372 yield return str; 373 cur--; 374 // do not return the INTERNAL_SEPARATOR after the last string 375 if (cur != 0) { 376 yield return INTERNAL_SEPARATOR; 377 } 378 } 379 } 380 381 private Token MakeToken(string strToken) { 382 Token token = new Token(TokenTypeEnum.String, strToken); 383 if (strToken.Equals(INTERNAL_SEPARATOR)) { 384 return SeparatorToken; 385 } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) { 386 token.type = TokenTypeEnum.Double; 387 return token; 388 } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) { 389 token.type = TokenTypeEnum.DateTime; 390 return token; 391 } 392 393 // couldn't parse the token as an int or float number or datetime value so return a string token 394 return token; 395 } 396 397 public Token Peek() { 398 return tokens[0]; 399 } 400 401 public Token Next() { 402 Token next = tokens[0]; 403 tokens.RemoveAt(0); 404 if (tokens.Count == 0) { 405 ReadNextTokens(); 406 } 407 return next; 408 } 409 410 public bool HasNext() { 411 return tokens.Count > 0 || !reader.EndOfStream; 560 return separator == WHITESPACECHAR ? 561 line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) : 562 line.Split(separators); 563 } 564 565 private static void IncreaseCapacity<T>(ref T[] arr) { 566 int n = (int)Math.Floor(arr.Length * 1.7); // guess 567 T[] arr2 = new T[n]; 568 Array.Copy(arr, arr2, arr.Length); 569 arr = arr2; 412 570 } 413 571 } … … 415 573 416 574 #region parsing 417 private void Parse(bool columnNamesInFirstLine) {418 if (columnNamesInFirstLine) {419 ParseVariableNames();420 if (!tokenizer.HasNext())421 Error(422 "Couldn't parse data values. 
Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",423 "", tokenizer.CurrentLineNumber);424 }425 ParseValues();426 if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);427 }428 429 private void ParseValues() {430 while (tokenizer.HasNext()) {431 if (tokenizer.Peek() == tokenizer.NewlineToken) {432 tokenizer.Next();433 } else {434 List<object> row = new List<object>();435 object value = NextValue(tokenizer);436 row.Add(value);437 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {438 Expect(tokenizer.SeparatorToken);439 row.Add(NextValue(tokenizer));440 }441 Expect(tokenizer.NewlineToken);442 // all rows have to have the same number of values443 // the first row defines how many samples are needed444 if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {445 Error("The first row of the dataset has " + rowValues[0].Count + " columns." 
+446 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",447 tokenizer.CurrentLineNumber);448 }449 rowValues.Add(row);450 }451 }452 }453 454 private object NextValue(Tokenizer tokenizer) {455 if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;456 Token current = tokenizer.Next();457 if (current.type == TokenTypeEnum.Separator) {458 return double.NaN;459 } else if (current.type == TokenTypeEnum.String) {460 return current.stringValue;461 } else if (current.type == TokenTypeEnum.Double) {462 return current.doubleValue;463 } else if (current.type == TokenTypeEnum.DateTime) {464 return current.dateTimeValue;465 }466 // found an unexpected token => throw error467 Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);468 // this line is never executed because Error() throws an exception469 throw new InvalidOperationException();470 }471 575 472 576 private void ParseVariableNames() { 473 577 // the first line must contain variable names 474 List<Token> tokens = new List<Token>(); 475 Token valueToken; 476 valueToken = tokenizer.Next(); 477 tokens.Add(valueToken); 478 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 479 Expect(tokenizer.SeparatorToken); 480 valueToken = tokenizer.Next(); 481 if (valueToken != tokenizer.NewlineToken) { 482 tokens.Add(valueToken); 483 } 484 } 485 if (valueToken != tokenizer.NewlineToken) { 486 Expect(tokenizer.NewlineToken); 487 } 488 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 489 } 490 491 private void Expect(Token expectedToken) { 492 Token actualToken = tokenizer.Next(); 493 if (actualToken != expectedToken) { 494 Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber); 495 } 578 List<string> varNames = new List<string>(); 579 580 TokenTypeEnum type; 581 string strVal; 582 double dblVal; 583 DateTime dateTimeVal; 584 585 tokenizer.Next(out 
type, out strVal, out dblVal, out dateTimeVal); 586 587 // the first token must be a variable name 588 if (type != TokenTypeEnum.String) 589 throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type); 590 varNames.Add(strVal); 591 592 while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) { 593 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 594 varNames.Add(strVal); 595 } 596 ExpectType(TokenTypeEnum.NewLine); 597 598 variableNames = varNames; 599 } 600 601 private void ExpectType(TokenTypeEnum expectedToken) { 602 if (tokenizer.PeekType() != expectedToken) 603 throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType()); 604 tokenizer.Skip(); 496 605 } 497 606 -
stable/HeuristicLab.Tests
- Property svn:mergeinfo changed
/trunk/sources/HeuristicLab.Tests merged: 13525,13529
- Property svn:mergeinfo changed
-
stable/HeuristicLab.Tests/HeuristicLab.Problems.Instances.DataAnalysis-3.3/TableFileParserTest.cs
r12009 r13974 21 21 22 22 using System; 23 using System.Collections.Generic; 24 using System.Globalization; 23 25 using System.IO; 24 26 using Microsoft.VisualStudio.TestTools.UnitTesting; … … 589 591 } 590 592 593 594 [TestMethod] 595 [TestCategory("Problems.Instances")] 596 [TestProperty("Time", "short")] 597 public void ParseWithColumnTypeConversionDE() { 598 // If first entry of a column can be parsed as a double we assume all values are doubles. 599 // However, if any of the following entries cannot be parsed as a double we convert the whole column to a string column. 600 // Special care needs to be taken with missing values, NaN (n.def.) and infinity values. 601 // We only support DE-DE and InvariantCulture number formats 602 string tempFileName = Path.GetTempFileName(); 603 var deCultureInfo = CultureInfo.GetCultureInfo("DE-DE"); 604 WriteToFile(tempFileName, 605 "str\tdbl\tdbl\tdbl" + Environment.NewLine + 606 "1,3\t1,3\t0\t3" + Environment.NewLine + 607 "1,3\t\t0\t0" + Environment.NewLine + 608 "s\t" + double.NaN.ToString(deCultureInfo) + "\t0\t0" + Environment.NewLine + // double.NaN might have a different string representation on different systems (even when using the same CultureInfo) 609 "s\t" + double.PositiveInfinity.ToString(deCultureInfo) + "\t0\t0" + Environment.NewLine + 610 "s\t" + double.NegativeInfinity.ToString(deCultureInfo) + "\t0\t0" + Environment.NewLine + 611 "s\t0\t0\t0"); 612 TableFileParser parser = new TableFileParser(); 613 try { 614 parser.Parse(tempFileName, 615 deCultureInfo.NumberFormat, 616 deCultureInfo.DateTimeFormat, 617 '\t', 618 true); 619 Assert.AreEqual(6, parser.Rows); 620 Assert.AreEqual(4, parser.Columns); 621 Assert.IsTrue(parser.Values[0] is List<string>); 622 Assert.IsTrue(parser.Values[1] is List<double>); 623 Assert.IsTrue(parser.Values[2] is List<double>); 624 Assert.IsTrue(parser.Values[3] is List<double>); 625 Assert.IsTrue(double.IsNaN((double)parser.Values[1][1])); // missing value 626 
Assert.IsTrue(double.IsNaN((double)parser.Values[1][2])); 627 Assert.IsTrue(double.IsPositiveInfinity((double)parser.Values[1][3])); // NOTE: in DE-DE NumberFormat just "unendlich" is not allowed (compare with InvariantCulture) 628 Assert.IsTrue(double.IsNegativeInfinity((double)parser.Values[1][4])); 629 } finally { 630 File.Delete(tempFileName); 631 } 632 } 633 634 [TestMethod] 635 [TestCategory("Problems.Instances")] 636 [TestProperty("Time", "short")] 637 public void ParseWithColumnTypeConversionInvariant() { 638 // see ParseWithColumnTypeConversionDE above 639 // same routine only using invariant culture 640 string tempFileName = Path.GetTempFileName(); 641 WriteToFile(tempFileName, 642 @"str,dbl,dbl,dbl 643 1.3,1.3,0,3 644 1.3,,0,0 645 s,NaN,0,0 646 s,Infinity,0,0 647 s,-Infinity,0,0 648 s,0,0,0"); 649 TableFileParser parser = new TableFileParser(); 650 try { 651 parser.Parse(tempFileName, 652 CultureInfo.InvariantCulture.NumberFormat, 653 CultureInfo.InvariantCulture.DateTimeFormat, 654 ',', 655 parser.AreColumnNamesInFirstLine(tempFileName)); 656 Assert.AreEqual(6, parser.Rows); 657 Assert.AreEqual(4, parser.Columns); 658 Assert.IsTrue(parser.Values[0] is List<string>); 659 Assert.IsTrue(parser.Values[1] is List<double>); 660 Assert.IsTrue(parser.Values[2] is List<double>); 661 Assert.IsTrue(parser.Values[3] is List<double>); 662 Assert.IsTrue(double.IsNaN((double)parser.Values[1][1])); // missing value 663 Assert.IsTrue(double.IsNaN((double)parser.Values[1][2])); 664 Assert.IsTrue(double.IsPositiveInfinity((double)parser.Values[1][3])); // NOTE: in InvariantCulture +Infinity is not allowed (compare with DE-DE) 665 Assert.IsTrue(double.IsNegativeInfinity((double)parser.Values[1][4])); 666 } finally { 667 File.Delete(tempFileName); 668 } 669 } 670 591 671 private void WriteToFile(string fileName, string content) { 592 672 using (StreamWriter writer = new StreamWriter(fileName)) {
Note: See TracChangeset
for help on using the changeset viewer.