- Timestamp:
- 10/13/16 19:47:41 (8 years ago)
- Location:
- branches/symbreg-factors-2650
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/symbreg-factors-2650
- Property svn:mergeinfo changed
/trunk/sources merged: 14282,14284-14300,14307,14314-14316,14319,14322
- Property svn:mergeinfo changed
-
branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis
- Property svn:mergeinfo changed
/trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis (added) merged: 14285,14291,14296
- Property svn:mergeinfo changed
-
branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/VariableNetworks/VariableNetwork.cs
r14277 r14330 26 26 using HeuristicLab.Common; 27 27 using HeuristicLab.Core; 28 using HeuristicLab.Problems.DataAnalysis; 28 29 using HeuristicLab.Random; 29 30 … … 195 196 int nl = xs.Length; 196 197 int nRows = xs.First().Count; 197 double[,] K = new double[nRows, nRows]; 198 199 // sample length-scales 198 199 // sample u iid ~ N(0, 1) 200 var u = Enumerable.Range(0, nRows).Select(_ => NormalDistributedRandom.NextDouble(random, 0, 1)).ToArray(); 201 202 // sample actual length-scales 200 203 var l = Enumerable.Range(0, nl) 201 204 .Select(_ => random.NextDouble() * 2 + 0.5) 202 205 .ToArray(); 203 // calculate covariance matrix 206 207 double[,] K = CalculateCovariance(xs, l); 208 209 // decompose 210 alglib.trfac.spdmatrixcholesky(ref K, nRows, false); 211 212 213 // calc y = Lu 214 var y = new double[u.Length]; 215 alglib.ablas.rmatrixmv(nRows, nRows, K, 0, 0, 0, u, 0, ref y, 0); 216 217 // calculate relevance by removing dimensions 218 relevance = CalculateRelevance(y, u, xs, l); 219 220 221 // calculate variable relevance 222 // as per Rasmussen and Williams "Gaussian Processes for Machine Learning" page 106: 223 // ,,For the squared exponential covariance function [...] the l1, ..., lD hyperparameters 224 // play the role of characteristic length scales [...]. Such a covariance function implements 225 // automatic relevance determination (ARD) [Neal, 1996], since the inverse of the length-scale 226 // determines how relevant an input is: if the length-scale has a very large value, the covariance 227 // will become almost independent of that input, effectively removing it from inference.'' 228 // relevance = l.Select(li => 1.0 / li).ToArray(); 229 230 return y; 231 } 232 233 // calculate variable relevance based on removal of variables 234 // 1) to remove a variable we set it's length scale to infinity (no relation of the variable value to the target) 235 // 2) calculate MSE of the original target values (y) to the updated targes y' (after variable removal) 236 // 3) relevance is larger if MSE(y,y') is large 237 // 4) scale impacts so that the most important variable has impact = 1 238 private double[] CalculateRelevance(double[] y, double[] u, List<double>[] xs, double[] l) { 239 int nRows = xs.First().Count; 240 var changedL = new double[l.Length]; 241 var relevance = new double[l.Length]; 242 for (int i = 0; i < l.Length; i++) { 243 Array.Copy(l, changedL, changedL.Length); 244 changedL[i] = double.MaxValue; 245 var changedK = CalculateCovariance(xs, changedL); 246 247 var yChanged = new double[u.Length]; 248 alglib.ablas.rmatrixmv(nRows, nRows, changedK, 0, 0, 0, u, 0, ref yChanged, 0); 249 250 OnlineCalculatorError error; 251 var mse = OnlineMeanSquaredErrorCalculator.Calculate(y, yChanged, out error); 252 if (error != OnlineCalculatorError.None) mse = double.MaxValue; 253 relevance[i] = mse; 254 } 255 // scale so that max relevance is 1.0 256 var maxRel = relevance.Max(); 257 for (int i = 0; i < relevance.Length; i++) relevance[i] /= maxRel; 258 return relevance; 259 } 260 261 private double[,] CalculateCovariance(List<double>[] xs, double[] l) { 262 int nRows = xs.First().Count; 263 double[,] K = new double[nRows, nRows]; 204 264 for (int r = 0; r < nRows; r++) { 205 265 double[] xi = xs.Select(x => x[r]).ToArray(); … … 213 273 } 214 274 } 215 216 275 // add a small diagonal matrix for numeric stability 217 276 for (int i = 0; i < nRows; i++) { … … 219 278 } 220 279 221 // decompose 222 alglib.trfac.spdmatrixcholesky(ref K, nRows, false); 223 224 // sample u iid ~ N(0, 1) 225 var u = Enumerable.Range(0, nRows).Select(_ => NormalDistributedRandom.NextDouble(random, 0, 1)).ToArray(); 226 227 // calc y = Lu 228 var y = new double[u.Length]; 229 alglib.ablas.rmatrixmv(nRows, nRows, K, 0, 0, 0, u, 0, ref y, 0); 230 231 // calculate variable relevance 232 // as per Rasmussen and Williams "Gaussian Processes for Machine Learning" page 106: 233 // ,,For the squared exponential covariance function [...] the l1, ..., lD hyperparameters 234 // play the role of characteristic length scales [...]. Such a covariance function implements 235 // automatic relevance determination (ARD) [Neal, 1996], since the inverse of the length-scale 236 // determines how relevant an input is: if the length-scale has a very large value, the covariance 237 // will become almost independent of that input, effectively removing it from inference.'' 238 relevance = l.Select(li => 1.0 / li).ToArray(); 239 240 return y; 280 return K; 241 281 } 242 282 } -
branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs
r14185 r14330 198 198 /// <param name="columnNamesInFirstLine"></param> 199 199 public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) { 200 using (StreamReader reader = new StreamReader(stream, Encoding)) { 200 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 201 202 using (var reader = new StreamReader(stream)) { 201 203 tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator); 204 var strValues = new List<List<string>>(); 202 205 values = new List<IList>(); 203 if (lineLimit > 0) estimatedNumberOfLines = lineLimit; 204 205 if (columnNamesInFirstLine) { 206 ParseVariableNames(); 207 if (!tokenizer.HasNext()) 208 Error( 209 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 210 "", tokenizer.CurrentLineNumber); 211 } 212 213 214 // read values... start in first row 206 Prepare(columnNamesInFirstLine, strValues); 207 215 208 int nLinesParsed = 0; 216 209 int colIdx = 0; 217 int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)218 210 while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) { 219 211 if (tokenizer.PeekType() == TokenTypeEnum.NewLine) { … … 221 213 222 214 // all rows have to have the same number of values 223 // the first row defines how many samples are needed224 if ( numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row225 else if (colIdx > 0 && numValuesInFirstRow != colIdx) {// read at least one value in the row (support for skipping empty lines)226 Error("The first row of the dataset has " + numValuesInFirstRow+ " columns." + Environment.NewLine +215 // the first row defines how many elements are needed 216 if (colIdx > 0 && values.Count != colIdx) { 217 // read at least one value in the row (support for skipping empty lines) 218 Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine + 227 219 "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "", 228 220 tokenizer.CurrentLineNumber); 229 221 } 230 222 OnReport(tokenizer.BytesRead); … … 234 226 } else { 235 227 // read one value 236 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 228 TokenTypeEnum type; 229 string strVal; 230 double dblVal; 231 DateTime dateTimeVal; 237 232 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 238 233 239 // initialize columns on the first row (fixing data types as presented in the first row...) 240 if (nLinesParsed == 0) { 241 values.Add(CreateList(type, estimatedNumberOfLines)); 242 } else if (colIdx == values.Count) { 243 Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine + 234 if (colIdx == values.Count) { 235 Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine + 244 236 "Line " + tokenizer.CurrentLineNumber + " has more columns.", "", 245 237 tokenizer.CurrentLineNumber); 246 238 } 247 239 if (!IsColumnTypeCompatible(values[colIdx], type)) { 248 values[colIdx] = ConvertToStringColumn(values[colIdx]);240 values[colIdx] = strValues[colIdx]; 249 241 } 242 250 243 // add the value to the column 251 AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal); 244 AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); 245 if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string> 246 strValues[colIdx].Add(strVal); 247 } 248 colIdx++; 252 249 } 253 250 } 254 255 if (!values.Any() || values.First().Count == 0) 256 Error("Couldn't parse data values. Probably because of incorrect number format " +257 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);258 }251 } 252 253 if (!values.Any() || values.First().Count == 0) 254 Error("Couldn't parse data values. Probably because of incorrect number format " + 255 "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); 259 256 260 257 this.rows = values.First().Count; … … 277 274 // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction) 278 275 GC.Collect(2, GCCollectionMode.Forced); 276 } 277 278 private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) { 279 if (columnNamesInFirstLine) { 280 ParseVariableNames(); 281 if (!tokenizer.HasNext()) 282 Error( 283 "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", 284 "", tokenizer.CurrentLineNumber); 285 } 286 // read first line to determine types and allocate specific lists 287 // read values... start in first row 288 int colIdx = 0; 289 while (tokenizer.PeekType() != TokenTypeEnum.NewLine) { 290 // read one value 291 TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal; 292 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal); 293 294 // initialize column 295 values.Add(CreateList(type, estimatedNumberOfLines)); 296 if (type == TokenTypeEnum.String) 297 strValues.Add(new List<string>(0)); // optimization: don't store the string values in another list if the column is list<string> 298 else 299 strValues.Add(new List<string>(estimatedNumberOfLines)); 300 301 AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal); 302 if (type != TokenTypeEnum.String) 303 strValues[colIdx].Add(strVal); 304 colIdx++; 305 } 306 tokenizer.Skip(); // skip newline 279 307 } 280 308 … … 530 558 type = TokenTypeEnum.Double; 531 559 doubleVals[i] = doubleVal; 532 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) { 560 } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue) 561 && dateTimeValue.Year > 1 && dateTimeValue.Month > 1 && dateTimeValue.Day > 1 // if no date is given it is returned as 1.1.0001 -> don't allow this 562 ) { 533 563 type = TokenTypeEnum.DateTime; 534 564 dateTimeVals[i] = dateTimeValue; … … 606 636 607 637 private void Error(string message, string token, int lineNumber) { 608 throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);638 throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber)); 609 639 } 610 640 #endregion 611 612 [Serializable]613 public class DataFormatException : Exception {614 private int line;615 public int Line {616 get { return line; }617 }618 private string token;619 public string Token {620 get { return token; }621 }622 public DataFormatException(string message, string token, int line)623 : base(message + "\nToken: " + token + " (line: " + line + ")") {624 this.token = token;625 this.line = line;626 }627 628 public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }629 }630 641 } 631 642 }
Note: See TracChangeset
for help on using the changeset viewer.