Changeset 6740 for trunk/sources/HeuristicLab.Problems.DataAnalysis
- Timestamp: 09/12/11 13:48:31 (13 years ago)
- Location: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4
- Files: 11 edited
Legend:
- Unmodified
- Added
- Removed
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Dataset.cs
r5847 r6740 21 21 22 22 using System; 23 using System.Collections; 23 24 using System.Collections.Generic; 25 using System.Collections.ObjectModel; 24 26 using System.Linq; 25 27 using HeuristicLab.Common; … … 36 38 private Dataset(Dataset original, Cloner cloner) 37 39 : base(original, cloner) { 38 variableNameToVariableIndexMapping = original.variableNameToVariableIndexMapping; 39 data = original.data; 40 } 41 public override IDeepCloneable Clone(Cloner cloner) { 42 return new Dataset(this, cloner); 43 } 40 variableValues = new Dictionary<string, IList>(original.variableValues); 41 variableNames = new List<string>(original.variableNames); 42 rows = original.rows; 43 } 44 public override IDeepCloneable Clone(Cloner cloner) { return new Dataset(this, cloner); } 44 45 45 46 public Dataset() … … 47 48 Name = "-"; 48 49 VariableNames = Enumerable.Empty<string>(); 49 data = new double[0, 0]; 50 } 51 52 public Dataset(IEnumerable<string> variableNames, double[,] data) 50 variableValues = new Dictionary<string, IList>(); 51 rows = 0; 52 } 53 54 public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) 53 55 : base() { 54 56 Name = "-"; 55 if (variableNames.Count() != data.GetLength(1)) { 56 throw new ArgumentException("Number of variable names doesn't match the number of columns of data"); 57 } 58 this.data = (double[,])data.Clone(); 59 VariableNames = variableNames; 60 } 61 62 63 private Dictionary<string, int> variableNameToVariableIndexMapping; 64 private Dictionary<int, string> variableIndexToVariableNameMapping; 57 if (!variableNames.Any()) { 58 this.variableNames = Enumerable.Range(0, variableValues.Count()).Select(x => "Column " + x).ToList(); 59 } else if (variableNames.Count() != variableValues.Count()) { 60 throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues"); 61 } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) { 62 throw new 
ArgumentException("The number of values must be equal for every variable"); 63 } else if (variableNames.Distinct().Count() != variableNames.Count()) { 64 var duplicateVariableNames = 65 variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList(); 66 string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine; 67 foreach (var duplicateVariableName in duplicateVariableNames) 68 message += duplicateVariableName + Environment.NewLine; 69 throw new ArgumentException(message); 70 } 71 72 rows = variableValues.First().Count; 73 this.variableNames = new List<string>(variableNames); 74 this.variableValues = new Dictionary<string, IList>(); 75 for (int i = 0; i < this.variableNames.Count; i++) { 76 var values = variableValues.ElementAt(i); 77 IList clonedValues = null; 78 if (values is List<double>) 79 clonedValues = new List<double>(values.Cast<double>()); 80 else if (values is List<string>) 81 clonedValues = new List<string>(values.Cast<string>()); 82 else if (values is List<DateTime>) 83 clonedValues = new List<DateTime>(values.Cast<DateTime>()); 84 else { 85 this.variableNames = new List<string>(); 86 this.variableValues = new Dictionary<string, IList>(); 87 throw new ArgumentException("The variable values must be of type List<double>, List<string> or List<DateTime>"); 88 } 89 this.variableValues.Add(this.variableNames[i], clonedValues); 90 } 91 } 92 93 public Dataset(IEnumerable<string> variableNames, double[,] variableValues) { 94 Name = "-"; 95 if (variableNames.Count() != variableValues.GetLength(1)) { 96 throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues"); 97 } 98 if (variableNames.Distinct().Count() != variableNames.Count()) { 99 var duplicateVariableNames = variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList(); 100 string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine; 101 
foreach (var duplicateVariableName in duplicateVariableNames) 102 message += duplicateVariableName + Environment.NewLine; 103 throw new ArgumentException(message); 104 } 105 106 rows = variableValues.GetLength(0); 107 this.variableNames = new List<string>(variableNames); 108 109 this.variableValues = new Dictionary<string, IList>(); 110 for (int col = 0; col < variableValues.GetLength(1); col++) { 111 string columName = this.variableNames[col]; 112 var values = new List<double>(); 113 for (int row = 0; row < variableValues.GetLength(0); row++) { 114 values.Add(variableValues[row, col]); 115 } 116 this.variableValues.Add(columName, values); 117 } 118 } 119 120 #region Backwards compatible code, remove with 3.5 121 private double[,] storableData; 122 //name alias used to suppport backwards compatibility 123 [Storable(Name = "data", AllowOneWay = true)] 124 private double[,] StorableData { set { storableData = value; } } 125 126 [StorableHook(HookType.AfterDeserialization)] 127 private void AfterDeserialization() { 128 if (variableValues == null) { 129 rows = storableData.GetLength(0); 130 variableValues = new Dictionary<string, IList>(); 131 for (int col = 0; col < storableData.GetLength(1); col++) { 132 string columName = variableNames[col]; 133 var values = new List<double>(); 134 for (int row = 0; row < storableData.GetLength(0); row++) { 135 values.Add(storableData[row, col]); 136 } 137 variableValues.Add(columName, values); 138 } 139 storableData = null; 140 } 141 } 142 #endregion 143 144 private Dictionary<string, IList> variableValues; 145 private List<string> variableNames; 65 146 [Storable] 66 147 public IEnumerable<string> VariableNames { 67 get { 68 // convert KeyCollection to an array first for persistence 69 return variableNameToVariableIndexMapping.Keys.ToArray(); 70 } 148 get { return variableNames; } 71 149 private set { 72 if (variableNameToVariableIndexMapping != null) throw new InvalidOperationException("VariableNames can only be set once."); 73 
this.variableNameToVariableIndexMapping = new Dictionary<string, int>(); 74 this.variableIndexToVariableNameMapping = new Dictionary<int, string>(); 75 int i = 0; 76 foreach (string variableName in value) { 77 this.variableNameToVariableIndexMapping.Add(variableName, i); 78 this.variableIndexToVariableNameMapping.Add(i, variableName); 79 i++; 80 } 81 } 82 } 83 150 if (variableNames != null) throw new InvalidOperationException(); 151 variableNames = new List<string>(value); 152 } 153 } 154 155 public IEnumerable<string> DoubleVariables { 156 get { return variableValues.Where(p => p.Value is List<double>).Select(p => p.Key); } 157 } 158 159 public IEnumerable<double> GetDoubleValues(string variableName) { 160 IList list; 161 if (!variableValues.TryGetValue(variableName, out list)) 162 throw new ArgumentException("The variable " + variableName + " does not exist in the dataset."); 163 List<double> values = list as List<double>; 164 if (values == null) throw new ArgumentException("The variable " + variableName + " is not a double variable."); 165 166 //mkommend yield return used to enable lazy evaluation 167 foreach (double value in values) 168 yield return value; 169 } 170 public ReadOnlyCollection<double> GetReadOnlyDoubleValues(string variableName) { 171 IList list; 172 if (!variableValues.TryGetValue(variableName, out list)) 173 throw new ArgumentException("The variable " + variableName + " does not exist in the dataset."); 174 List<double> values = list as List<double>; 175 if (values == null) throw new ArgumentException("The variable " + variableName + " is not a double variable."); 176 return values.AsReadOnly(); 177 } 178 public double GetDoubleValue(string variableName, int row) { 179 IList list; 180 if (!variableValues.TryGetValue(variableName, out list)) 181 throw new ArgumentException("The variable " + variableName + " does not exist in the dataset."); 182 List<double> values = list as List<double>; 183 if (values == null) throw new ArgumentException("The 
variable " + variableName + " is not a double variable."); 184 return values[row]; 185 } 186 public IEnumerable<double> GetDoubleValues(string variableName, IEnumerable<int> rows) { 187 IList list; 188 if (!variableValues.TryGetValue(variableName, out list)) 189 throw new ArgumentException("The variable " + variableName + " does not exist in the dataset."); 190 List<double> values = list as List<double>; 191 if (values == null) throw new ArgumentException("The varialbe " + variableName + " is not a double variable."); 192 193 foreach (int index in rows) 194 yield return values[index]; 195 } 196 197 #region IStringConvertibleMatrix Members 84 198 [Storable] 85 private double[,] data; 86 private double[,] Data { 87 get { return data; } 88 } 89 90 // elementwise access 91 public double this[int rowIndex, int columnIndex] { 92 get { return data[rowIndex, columnIndex]; } 93 } 94 public double this[string variableName, int rowIndex] { 95 get { 96 int columnIndex = GetVariableIndex(variableName); 97 return data[rowIndex, columnIndex]; 98 } 99 } 100 101 public double[] GetVariableValues(int variableIndex) { 102 return GetVariableValues(variableIndex, 0, Rows); 103 } 104 public double[] GetVariableValues(string variableName) { 105 return GetVariableValues(GetVariableIndex(variableName), 0, Rows); 106 } 107 public double[] GetVariableValues(int variableIndex, int start, int end) { 108 return GetEnumeratedVariableValues(variableIndex, start, end).ToArray(); 109 } 110 public double[] GetVariableValues(string variableName, int start, int end) { 111 return GetVariableValues(GetVariableIndex(variableName), start, end); 112 } 113 114 public IEnumerable<double> GetEnumeratedVariableValues(int variableIndex) { 115 return GetEnumeratedVariableValues(variableIndex, 0, Rows); 116 } 117 public IEnumerable<double> GetEnumeratedVariableValues(int variableIndex, int start, int end) { 118 if (start < 0 || !(start <= end)) 119 throw new ArgumentException("Start must be between 0 and end (" + 
end + ")."); 120 if (end > Rows || end < start) 121 throw new ArgumentException("End must be between start (" + start + ") and dataset rows (" + Rows + ")."); 122 123 for (int i = start; i < end; i++) 124 yield return data[i, variableIndex]; 125 } 126 public IEnumerable<double> GetEnumeratedVariableValues(int variableIndex, IEnumerable<int> rows) { 127 foreach (int row in rows) 128 yield return data[row, variableIndex]; 129 } 130 131 public IEnumerable<double> GetEnumeratedVariableValues(string variableName) { 132 return GetEnumeratedVariableValues(GetVariableIndex(variableName), 0, Rows); 133 } 134 public IEnumerable<double> GetEnumeratedVariableValues(string variableName, int start, int end) { 135 return GetEnumeratedVariableValues(GetVariableIndex(variableName), start, end); 136 } 137 public IEnumerable<double> GetEnumeratedVariableValues(string variableName, IEnumerable<int> rows) { 138 return GetEnumeratedVariableValues(GetVariableIndex(variableName), rows); 139 } 140 141 public string GetVariableName(int variableIndex) { 142 try { 143 return variableIndexToVariableNameMapping[variableIndex]; 144 } 145 catch (KeyNotFoundException ex) { 146 throw new ArgumentException("The variable index " + variableIndex + " was not found.", ex); 147 } 148 } 149 public int GetVariableIndex(string variableName) { 150 try { 151 return variableNameToVariableIndexMapping[variableName]; 152 } 153 catch (KeyNotFoundException ex) { 154 throw new ArgumentException("The variable name " + variableName + " was not found.", ex); 155 } 156 } 157 158 #region IStringConvertibleMatrix Members 199 private int rows; 159 200 public int Rows { 160 get { return data.GetLength(0); }201 get { return rows; } 161 202 set { throw new NotSupportedException(); } 162 203 } 163 204 public int Columns { 164 get { return data.GetLength(1); }205 get { return variableNames.Count; } 165 206 set { throw new NotSupportedException(); } 166 207 } … … 184 225 185 226 public string GetValue(int rowIndex, int 
columnIndex) { 186 return data[rowIndex, columnIndex].ToString();227 return variableValues[variableNames[columnIndex]][rowIndex].ToString(); 187 228 } 188 229 public bool SetValue(string value, int rowIndex, int columnIndex) { -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationProblemData.cs
r6672 r6740 226 226 get { 227 227 if (classValues == null) { 228 classValues = Dataset.GetEnumeratedVariableValues(TargetVariableParameter.Value.Value).Distinct().ToList(); 228 classValues = Dataset.GetDoubleValues(TargetVariableParameter.Value.Value).Distinct().ToList(); 229 229 classValues.Sort(); 230 230 } … … 291 291 private static IEnumerable<string> CheckVariablesForPossibleTargetVariables(Dataset dataset) { 292 292 int maxSamples = Math.Min(InspectedRowsToDetermineTargets, dataset.Rows); 293 var validTargetVariables = (from v in dataset.VariableNames 294 let distinctValues = dataset.GetEnumeratedVariableValues(v) 293 var validTargetVariables = (from v in dataset.DoubleVariables 294 let distinctValues = dataset.GetDoubleValues(v) 295 295 .Take(maxSamples) 296 296 .Distinct() … … 410 410 dataset.Name = Path.GetFileName(fileName); 411 411 412 ClassificationProblemData problemData = new ClassificationProblemData(dataset, dataset.VariableNames.Skip(1), dataset.VariableNames.First()); 412 ClassificationProblemData problemData = new ClassificationProblemData(dataset, dataset.DoubleVariables.Skip(1), dataset.DoubleVariables.First()); 413 413 problemData.Name = "Data imported from " + Path.GetFileName(fileName); 414 414 return problemData; -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationSolutionBase.cs
r6653 r6740 67 67 protected void CalculateResults() { 68 68 double[] estimatedTrainingClassValues = EstimatedTrainingClassValues.ToArray(); // cache values 69 double[] originalTrainingClassValues = ProblemData.Dataset.GetEnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes).ToArray(); 69 double[] originalTrainingClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes).ToArray(); 70 70 double[] estimatedTestClassValues = EstimatedTestClassValues.ToArray(); // cache values 71 double[] originalTestClassValues = ProblemData.Dataset.GetEnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TestIndizes).ToArray(); 71 double[] originalTestClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TestIndizes).ToArray(); 72 72 73 73 OnlineCalculatorError errorState; -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/DiscriminantFunctionClassificationSolutionBase.cs
r6606 r6740 103 103 protected void CalculateRegressionResults() { 104 104 double[] estimatedTrainingValues = EstimatedTrainingValues.ToArray(); // cache values 105 double[] originalTrainingValues = ProblemData.Dataset.GetEnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes).ToArray(); 105 double[] originalTrainingValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes).ToArray(); 106 106 double[] estimatedTestValues = EstimatedTestValues.ToArray(); // cache values 107 double[] originalTestValues = ProblemData.Dataset.GetEnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TestIndizes).ToArray(); 107 double[] originalTestValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TestIndizes).ToArray(); 108 108 109 109 OnlineCalculatorError errorState; … … 132 132 double[] classValues; 133 133 double[] thresholds; 134 var targetClassValues = ProblemData.Dataset.GetEnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes); 134 var targetClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes); 135 135 AccuracyMaximizationThresholdCalculator.CalculateThresholds(ProblemData, EstimatedTrainingValues, targetClassValues, out classValues, out thresholds); 136 136 … … 141 141 double[] classValues; 142 142 double[] thresholds; 143 var targetClassValues = ProblemData.Dataset.GetEnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes); 143 var targetClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes); 144 144 NormalDistributionCutPointsThresholdCalculator.CalculateThresholds(ProblemData, EstimatedTrainingValues, targetClassValues, out classValues, out thresholds); 145 145 -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Clustering/ClusteringProblemData.cs
r5809 r6740 20 20 #endregion 21 21 22 using System; 23 22 using System.Collections.Generic; 24 23 using System.IO; 25 using System.Linq; 26 24 using HeuristicLab.Common; 27 25 using HeuristicLab.Core; 28 using HeuristicLab.Data; 29 using HeuristicLab.Parameters; 30 26 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable; 31 27 … … 103 99 dataset.Name = Path.GetFileName(fileName); 104 100 105 ClusteringProblemData problemData = new ClusteringProblemData(dataset, dataset.VariableNames); 101 ClusteringProblemData problemData = new ClusteringProblemData(dataset, dataset.DoubleVariables); 106 102 problemData.Name = "Data imported from " + Path.GetFileName(fileName); 107 103 return problemData; -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/DataAnalysisProblemData.cs
r6672 r6740 116 116 if (allowedInputVariables == null) throw new ArgumentNullException("The allowedInputVariables must not be null."); 117 117 118 if (allowedInputVariables.Except(dataset.VariableNames).Any()) 119 throw new ArgumentException("All allowed input variables must be present in the dataset."); 118 if (allowedInputVariables.Except(dataset.DoubleVariables).Any()) 119 throw new ArgumentException("All allowed input variables must be present in the dataset and of type double."); 120 120 121 var inputVariables = new CheckedItemList<StringValue>(dataset.VariableNames.Select(x => new StringValue(x))); 121 var inputVariables = new CheckedItemList<StringValue>(dataset.DoubleVariables.Select(x => new StringValue(x))); 122 122 foreach (StringValue x in inputVariables) 123 123 inputVariables.SetItemCheckedState(x, allowedInputVariables.Contains(x.Value)); -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionProblemData.cs
r6672 r6740 144 144 dataset.Name = Path.GetFileName(fileName); 145 145 146 RegressionProblemData problemData = new RegressionProblemData(dataset, dataset.VariableNames.Skip(1), dataset.VariableNames.First()); 146 RegressionProblemData problemData = new RegressionProblemData(dataset, dataset.DoubleVariables.Skip(1), dataset.DoubleVariables.First()); 147 147 problemData.Name = "Data imported from " + Path.GetFileName(fileName); 148 148 return problemData; -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionBase.cs
r6661 r6740 127 127 OnlineCalculatorError errorState; 128 128 Add(new Result(TrainingMeanAbsoluteErrorResultName, "Mean of absolute errors of the model on the training partition", new DoubleValue())); 129 double trainingMAE = OnlineMeanAbsoluteErrorCalculator.Calculate(EstimatedTrainingValues, ProblemData.Dataset.Get EnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes), out errorState);129 double trainingMAE = OnlineMeanAbsoluteErrorCalculator.Calculate(EstimatedTrainingValues, ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes), out errorState); 130 130 TrainingMeanAbsoluteError = errorState == OnlineCalculatorError.None ? trainingMAE : double.NaN; 131 131 } … … 134 134 OnlineCalculatorError errorState; 135 135 Add(new Result(TestMeanAbsoluteErrorResultName, "Mean of absolute errors of the model on the test partition", new DoubleValue())); 136 double testMAE = OnlineMeanAbsoluteErrorCalculator.Calculate(EstimatedTestValues, ProblemData.Dataset.Get EnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TestIndizes), out errorState);136 double testMAE = OnlineMeanAbsoluteErrorCalculator.Calculate(EstimatedTestValues, ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TestIndizes), out errorState); 137 137 TestMeanAbsoluteError = errorState == OnlineCalculatorError.None ? 
testMAE : double.NaN; 138 138 } … … 142 142 protected void CalculateResults() { 143 143 double[] estimatedTrainingValues = EstimatedTrainingValues.ToArray(); // cache values 144 double[] originalTrainingValues = ProblemData.Dataset.Get EnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes).ToArray();144 double[] originalTrainingValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes).ToArray(); 145 145 double[] estimatedTestValues = EstimatedTestValues.ToArray(); // cache values 146 double[] originalTestValues = ProblemData.Dataset.Get EnumeratedVariableValues(ProblemData.TargetVariable, ProblemData.TestIndizes).ToArray();146 double[] originalTestValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TestIndizes).ToArray(); 147 147 148 148 OnlineCalculatorError errorState; -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs
r5809 r6740 21 21 22 22 using System; 23 using System.Collections; 23 24 using System.Collections.Generic; 24 25 using System.Globalization; … … 33 34 private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' }; 34 35 private Tokenizer tokenizer; 35 private List<List< double>> rowValues;36 private List<List<object>> rowValues; 36 37 37 38 private int rows; … … 47 48 } 48 49 49 private double[,]values;50 public double[,]Values {50 private List<IList> values; 51 public List<IList> Values { 51 52 get { 52 53 return values; … … 69 70 70 71 public TableFileParser() { 71 rowValues = new List<List< double>>();72 rowValues = new List<List<object>>(); 72 73 variableNames = new List<string>(); 73 74 } … … 75 76 public void Parse(string fileName) { 76 77 NumberFormatInfo numberFormat; 78 DateTimeFormatInfo dateTimeFormatInfo; 77 79 char separator; 78 DetermineFileFormat(fileName, out numberFormat, out separator);80 DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator); 79 81 using (StreamReader reader = new StreamReader(fileName)) { 80 tokenizer = new Tokenizer(reader, numberFormat, separator);82 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 81 83 // parse the file 82 84 Parse(); … … 86 88 rows = rowValues.Count; 87 89 columns = rowValues[0].Count; 88 values = new double[rows, columns]; 89 90 int rowIndex = 0; 91 int columnIndex = 0; 92 foreach (List<double> row in rowValues) { 93 columnIndex = 0; 94 foreach (double element in row) { 95 values[rowIndex, columnIndex++] = element; 96 } 97 rowIndex++; 98 } 99 } 100 101 private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) { 90 values = new List<IList>(); 91 92 //create columns 93 for (int col = 0; col < columns; col++) { 94 var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType()); 95 if (!types.Any()) { 96 values.Add(new 
List<string>()); 97 continue; 98 } 99 100 var columnType = types.GroupBy(v => v).OrderBy(v => v).Last().Key; 101 if (columnType == typeof(double)) values.Add(new List<double>()); 102 else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>()); 103 else if (columnType == typeof(string)) values.Add(new List<string>()); 104 else throw new InvalidOperationException(); 105 } 106 107 108 109 //fill with values 110 foreach (List<object> row in rowValues) { 111 int columnIndex = 0; 112 foreach (object element in row) { 113 //handle missing values with default values 114 if (element as string == string.Empty) { 115 if (values[columnIndex] is List<double>) values[columnIndex].Add(double.NaN); 116 else if (values[columnIndex] is List<DateTime>) values[columnIndex].Add(DateTime.MinValue); 117 else if (values[columnIndex] is List<string>) values[columnIndex].Add(string.Empty); 118 else throw new InvalidOperationException(); 119 } else values[columnIndex].Add(element); 120 columnIndex++; 121 } 122 } 123 } 124 125 private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) { 102 126 using (StreamReader reader = new StreamReader(fileName)) { 103 127 // skip first line … … 123 147 if (OccurrencesOf(charCounts, '.') > 10) { 124 148 numberFormat = NumberFormatInfo.InvariantInfo; 149 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 125 150 separator = POSSIBLE_SEPARATORS 126 151 .Where(c => OccurrencesOf(charCounts, c) > 10) … … 139 164 // English format (only integer values) with ',' as separator 140 165 numberFormat = NumberFormatInfo.InvariantInfo; 166 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 141 167 separator = ','; 142 168 } else { … … 144 170 // German format (real values) 145 171 numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE")); 172 dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE")); 146 173 separator = 
POSSIBLE_SEPARATORS 147 174 .Except(disallowedSeparators) … … 154 181 // no points and no commas => English format 155 182 numberFormat = NumberFormatInfo.InvariantInfo; 183 dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo; 156 184 separator = POSSIBLE_SEPARATORS 157 185 .Where(c => OccurrencesOf(charCounts, c) > 10) … … 169 197 #region tokenizer 170 198 internal enum TokenTypeEnum { 171 NewLine, Separator, String, Double 199 NewLine, Separator, String, Double, DateTime 172 200 } 173 201 … … 176 204 public string stringValue; 177 205 public double doubleValue; 206 public DateTime dateTimeValue; 178 207 179 208 public Token(TokenTypeEnum type, string value) { 180 209 this.type = type; 181 210 stringValue = value; 211 dateTimeValue = DateTime.MinValue; 182 212 doubleValue = 0.0; 183 213 } … … 193 223 private List<Token> tokens; 194 224 private NumberFormatInfo numberFormatInfo; 225 private DateTimeFormatInfo dateTimeFormatInfo; 195 226 private char separator; 196 227 private const string INTERNAL_SEPARATOR = "#"; … … 218 249 } 219 250 220 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {251 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) { 221 252 this.reader = reader; 222 253 this.numberFormatInfo = numberFormatInfo; 254 this.dateTimeFormatInfo = dateTimeFormatInfo; 223 255 this.separator = separator; 224 256 separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR); … … 264 296 token.type = TokenTypeEnum.Double; 265 297 return token; 266 } 267 268 // couldn't parse the token as an int or float number so return a string token 298 } else if (DateTime.TryParse(strToken, out token.dateTimeValue)) { 299 token.type = TokenTypeEnum.DateTime; 300 return token; 301 } 302 303 // couldn't parse the token as an int or float number or datetime value so return a string token 269 304 return token; 270 305 } … … 299 334 private void 
ParseValues() { 300 335 while (tokenizer.HasNext()) { 301 List<double> row = new List<double>(); 302 row.Add(NextValue(tokenizer)); 336 List<object> row = new List<object>(); 337 object value = NextValue(tokenizer); 338 if (value == null) { tokenizer.Next(); continue; } 339 row.Add(value); 303 340 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 304 341 Expect(tokenizer.SeparatorToken); … … 312 349 "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); 313 350 } 314 // add the current row to the collection of rows and start a new row315 351 rowValues.Add(row); 316 row = new List<double>(); 317 } 318 } 319 320 private double NextValue(Tokenizer tokenizer) { 321 if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN; 352 row = new List<object>(); 353 } 354 } 355 356 private object NextValue(Tokenizer tokenizer) { 357 if (tokenizer.Peek() == tokenizer.SeparatorToken) return string.Empty; 358 if (tokenizer.Peek() == tokenizer.NewlineToken) return null; 322 359 Token current = tokenizer.Next(); 323 if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {360 if (current.type == TokenTypeEnum.Separator) { 324 361 return double.NaN; 362 } else if (current.type == TokenTypeEnum.String) { 363 return current.stringValue; 325 364 } else if (current.type == TokenTypeEnum.Double) { 326 // just take the value327 365 return current.doubleValue; 366 } else if (current.type == TokenTypeEnum.DateTime) { 367 return current.dateTimeValue; 328 368 } 329 369 // found an unexpected token => throw error … … 334 374 335 375 private void ParseVariableNames() { 336 // if the first line doesn't start with a double value then we assume that the 337 // first line contains variable names 338 if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) { 339 340 List<Token> tokens = new List<Token>(); 341 Token 
valueToken; 376 //if first token is double no variables names are given 377 if (tokenizer.Peek().type == TokenTypeEnum.Double) return; 378 379 // the first line must contain variable names 380 List<Token> tokens = new List<Token>(); 381 Token valueToken; 382 valueToken = tokenizer.Next(); 383 tokens.Add(valueToken); 384 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) { 385 Expect(tokenizer.SeparatorToken); 342 386 valueToken = tokenizer.Next(); 343 tokens.Add(valueToken);344 while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {345 Expect(tokenizer.SeparatorToken);346 valueToken = tokenizer.Next();347 if (valueToken != tokenizer.NewlineToken) {348 tokens.Add(valueToken);349 }350 }351 387 if (valueToken != tokenizer.NewlineToken) { 352 Expect(tokenizer.NewlineToken); 353 } 354 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 355 } 388 tokens.Add(valueToken); 389 } 390 } 391 if (valueToken != tokenizer.NewlineToken) { 392 Expect(tokenizer.NewlineToken); 393 } 394 variableNames = tokens.Select(x => x.stringValue.Trim()).ToList(); 356 395 } 357 396 -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Tests/OnlineCalculatorPerformanceTest.cs
r5963 r6740 80 80 watch.Start(); 81 81 for (int i = 0; i < Repetitions; i++) { 82 double value = calculateFunc(dataset.GetEnumeratedVariableValues(0), dataset.GetEnumeratedVariableValues(1), out errorState); 82 double value = calculateFunc(dataset.GetDoubleValues("y"), dataset.GetDoubleValues("x0"), out errorState); 83 83 } 84 84 Assert.AreEqual(errorState, OnlineCalculatorError.None); -
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Tests/TableFileParserTest.cs
r5809 r6740 21 21 22 22 using System; 23 using System.Collections.Generic;24 using System.Linq;25 using Microsoft.VisualStudio.TestTools.UnitTesting;26 23 using System.IO; 27 24 using HeuristicLab.Problems.DataAnalysis; 25 using Microsoft.VisualStudio.TestTools.UnitTesting; 28 26 namespace HeuristicLab.Problems.DataAnalysis_3_4.Tests { 29 27 … … 46 44 Assert.AreEqual(6, parser.Rows); 47 45 Assert.AreEqual(4, parser.Columns); 48 Assert.AreEqual(parser.Values[ 0, 3], 3.14);46 Assert.AreEqual(parser.Values[3][0], 3.14); 49 47 } 50 48 finally { … … 68 66 Assert.AreEqual(6, parser.Rows); 69 67 Assert.AreEqual(4, parser.Columns); 70 Assert.AreEqual(parser.Values[ 0, 3], 3.14);68 Assert.AreEqual(parser.Values[3][0], 3.14); 71 69 } 72 70 finally { … … 90 88 Assert.AreEqual(6, parser.Rows); 91 89 Assert.AreEqual(4, parser.Columns); 92 Assert.AreEqual(parser.Values[ 0, 3], 3.14);90 Assert.AreEqual(parser.Values[3][0], 3.14); 93 91 } 94 92 finally { … … 113 111 Assert.AreEqual(6, parser.Rows); 114 112 Assert.AreEqual(4, parser.Columns); 115 Assert.AreEqual(parser.Values[ 0, 3], 3.14);113 Assert.AreEqual(parser.Values[3][0], 3.14); 116 114 } 117 115 finally { … … 135 133 Assert.AreEqual(6, parser.Rows); 136 134 Assert.AreEqual(4, parser.Columns); 137 Assert.AreEqual( parser.Values[0, 3], 3);135 Assert.AreEqual((double)parser.Values[3][0], 3); 138 136 } 139 137 finally { … … 157 155 Assert.AreEqual(6, parser.Rows); 158 156 Assert.AreEqual(4, parser.Columns); 159 Assert.AreEqual( parser.Values[0, 3], 3);157 Assert.AreEqual((double)parser.Values[3][0], 3); 160 158 } 161 159 finally { … … 179 177 Assert.AreEqual(6, parser.Rows); 180 178 Assert.AreEqual(4, parser.Columns); 181 Assert.AreEqual( parser.Values[0, 3], 3);179 Assert.AreEqual((double)parser.Values[3][0], 3); 182 180 } 183 181 finally { … … 202 200 Assert.AreEqual(6, parser.Rows); 203 201 Assert.AreEqual(4, parser.Columns); 204 Assert.AreEqual( parser.Values[0, 3], 3);202 Assert.AreEqual((double)parser.Values[3][0], 3); 
205 203 } 206 204 finally { … … 225 223 Assert.AreEqual(6, parser.Rows); 226 224 Assert.AreEqual(4, parser.Columns); 227 Assert.AreEqual( parser.Values[0, 3], 3.14);225 Assert.AreEqual((double)parser.Values[3][0], 3.14); 228 226 } 229 227 finally { … … 248 246 Assert.AreEqual(6, parser.Rows); 249 247 Assert.AreEqual(4, parser.Columns); 250 Assert.AreEqual( parser.Values[0, 3], 3.14);248 Assert.AreEqual((double)parser.Values[3][0], 3.14); 251 249 } 252 250 finally { … … 270 268 Assert.AreEqual(6, parser.Rows); 271 269 Assert.AreEqual(4, parser.Columns); 272 Assert.AreEqual( parser.Values[0, 3], 3.14);270 Assert.AreEqual((double)parser.Values[3][0], 3.14); 273 271 } 274 272 finally { … … 292 290 Assert.AreEqual(6, parser.Rows); 293 291 Assert.AreEqual(4, parser.Columns); 294 Assert.AreEqual( parser.Values[0, 3], 3.14);292 Assert.AreEqual((double)parser.Values[3][0], 3.14); 295 293 } 296 294 finally { … … 314 312 Assert.AreEqual(6, parser.Rows); 315 313 Assert.AreEqual(4, parser.Columns); 316 Assert.AreEqual( parser.Values[0, 3], 3);314 Assert.AreEqual((double)parser.Values[3][0], 3); 317 315 } 318 316 finally { … … 336 334 Assert.AreEqual(6, parser.Rows); 337 335 Assert.AreEqual(4, parser.Columns); 338 Assert.AreEqual(parser.Values[0, 3], 3); 336 Assert.AreEqual((double)parser.Values[3][0], 3); 337 } 338 finally { 339 File.Delete(tempFileName); 340 } 341 } 342 343 [TestMethod] 344 public void ParseWithEmtpyLines() { 345 string tempFileName = Path.GetTempFileName(); 346 WriteToFile(tempFileName, 347 "x01\t x02\t x03\t x04" + Environment.NewLine + 348 "0\t 0\t 0\t 3" + Environment.NewLine + 349 Environment.NewLine + 350 "0\t 0\t 0\t 0" + Environment.NewLine + 351 " " + Environment.NewLine + 352 "0\t 0\t 0\t 0" + Environment.NewLine + 353 "0\t 0\t 0\t 0" + Environment.NewLine + Environment.NewLine); 354 TableFileParser parser = new TableFileParser(); 355 try { 356 parser.Parse(tempFileName); 357 Assert.AreEqual(4, parser.Rows); 358 Assert.AreEqual(4, parser.Columns); 
339 359 } 340 360 finally { … … 358 378 Assert.AreEqual(6, parser.Rows); 359 379 Assert.AreEqual(4, parser.Columns); 360 Assert.AreEqual( parser.Values[0, 3], 3.14);380 Assert.AreEqual((double)parser.Values[3][0], 3.14); 361 381 } 362 382 finally { … … 380 400 Assert.AreEqual(6, parser.Rows); 381 401 Assert.AreEqual(4, parser.Columns); 382 Assert.AreEqual( parser.Values[0, 3], 3.14);402 Assert.AreEqual((double)parser.Values[3][0], 3.14); 383 403 } 384 404 finally { … … 402 422 Assert.AreEqual(6, parser.Rows); 403 423 Assert.AreEqual(4, parser.Columns); 404 Assert.AreEqual( parser.Values[0, 3], 3.14);424 Assert.AreEqual((double)parser.Values[3][0], 3.14); 405 425 } 406 426 finally { … … 424 444 Assert.AreEqual(6, parser.Rows); 425 445 Assert.AreEqual(4, parser.Columns); 426 Assert.AreEqual( parser.Values[0, 3], 3.14);446 Assert.AreEqual((double)parser.Values[3][0], 3.14); 427 447 } 428 448 finally { … … 446 466 Assert.AreEqual(6, parser.Rows); 447 467 Assert.AreEqual(4, parser.Columns); 448 Assert.AreEqual( parser.Values[0, 3], 3);468 Assert.AreEqual((double)parser.Values[3][0], 3); 449 469 } 450 470 finally { … … 468 488 Assert.AreEqual(6, parser.Rows); 469 489 Assert.AreEqual(4, parser.Columns); 470 Assert.AreEqual( parser.Values[0, 3], 3);490 Assert.AreEqual((double)parser.Values[3][0], 3); 471 491 } 472 492 finally {
Note: See TracChangeset for help on using the changeset viewer.