Changeset 18003 for branches/3127MRGPVarProExploration/HeuristicLab.Problems.VarProMRGP/3.3/Problem.cs
 Timestamp:
 07/09/21 19:15:12 (3 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

branches/3127MRGPVarProExploration/HeuristicLab.Problems.VarProMRGP/3.3/Problem.cs
r17988 r18003 38 38 using System.Runtime.InteropServices; 39 39 using HeuristicLab.Random; 40 using HeuristicLab.Analysis.Statistics;41 40 42 41 namespace HeuristicLab.Problems.VarProMRGP { … … 91 90 92 91 #region not cloned or stored 93 private int[] featureIdx = null; // code[featureIdx[i]] is the last operation of feature i 94 [ExcludeFromObjectGraphTraversal] 95 private NativeInterpreter.NativeInstruction[] code = null; 96 private Dictionary<string, GCHandle> cachedData = null; 97 private LinearTransformation[] transformations; 92 ISymbolicExpressionTree[] features; 93 private List<TreeToAutoDiffTermConverter.ParametricFunctionGradient> featCode; // AutoDiff delegates for the features 94 private List<double[]> featParam; // parameters for the features 95 private List<double[][]> featVariables; 98 96 #endregion 99 97 … … 108 106 public Problem() { 109 107 var g = new VarProGrammar(); 108 109 // TODO optionally: scale dataset 110 110 111 111 Parameters.Add(new ValueParameter<IRegressionProblemData>("ProblemData", "", new RegressionProblemData())); … … 140 140 // ProblemData 141 141 //  142 // ++ 143 //   144 // cachedData Grammar MaxSize MaxDepth MaxInteractions 145 // cachedDataSet     146 // ++++ 147 //  148 // Features 149 // Code 150 //  151 // Encoding (Length) 152 //  153 // ++ 154 //   155 // BestKnownSolution Operators (Parameter) 156 // BestKnownQuality 142 // Grammar MaxSize MaxDepth MaxInteractions 143 //     144 // ++++ 145 //  146 // Features 147 // Code 148 //  149 // Encoding (Length) 150 //  151 // ++ 152 //   153 // BestKnownSolution Operators (Parameter) 154 // BestKnownQuality 157 155 158 156 private void RegisterEventHandlers() { … … 187 185 188 186 private void RegressionProblemData_Changed(object sender, EventArgs e) { 189 UpdateDataCache();190 187 Grammar.ConfigureVariableSymbols(RegressionProblemData); 191 188 } … … 193 190 private void RegressionProblemDataParameter_ValueChanged(object sender, EventArgs e) { 194 191 RegressionProblemData.Changed += RegressionProblemData_Changed; 195 UpdateDataCache();196 192 Grammar.ConfigureVariableSymbols(RegressionProblemData); 197 193 } … … 209 205 } 210 206 211 private void UpdateDataCache() {212 if (cachedData != null) {213 foreach (var gch in cachedData.Values) {214 gch.Free();215 }216 cachedData = null;217 }218 219 var dataset = RegressionProblemData.Dataset;220 221 // free handles to old data222 if (cachedData != null) {223 foreach (var gch in cachedData.Values) {224 gch.Free();225 }226 cachedData = null;227 }228 229 // cache new data230 cachedData = new Dictionary<string, GCHandle>();231 transformations = new LinearTransformation[dataset.DoubleVariables.Count()];232 int varIdx = 0;233 foreach (var v in dataset.DoubleVariables) {234 var values = dataset.GetDoubleValues(v).ToArray();235 if (v == RegressionProblemData.TargetVariable) {236 // do not scale target237 var linTransform = new LinearTransformation(dataset.DoubleVariables);238 linTransform.Addend = 0;239 linTransform.Multiplier = 1.0;240 transformations[varIdx++] = linTransform;241 } else {242 // Scale to 0 .. 1243 var max = values.Max();244 var min = values.Min();245 var range = max  min;246 var linTransform = new LinearTransformation(dataset.DoubleVariables);247 transformations[varIdx++] = linTransform;248 linTransform.Addend = 1 * min / range  0.0;249 linTransform.Multiplier = 1.0 / range;250 for (int i = 0; i < values.Length; i++) {251 values[i] = values[i] * linTransform.Multiplier + linTransform.Addend;252 }253 }254 var gch = GCHandle.Alloc(values, GCHandleType.Pinned);255 cachedData[v] = gch;256 }257 }258 259 207 private void UpdateFeaturesAndCode() { 260 varfeatures = GenerateFeaturesSystematic(10000, new MersenneTwister(31415), Grammar, MaxSize, MaxDepth, maxVariables: 3);261 GenerateCode(features, RegressionProblemData .Dataset);208 features = GenerateFeaturesSystematic(10000, new MersenneTwister(31415), Grammar, MaxSize, MaxDepth, maxVariables: 3); 209 GenerateCode(features, RegressionProblemData); 262 210 var formatter = new InfixExpressionFormatter(); 263 211 Features = new ItemArray<StringValue>(features.Select(fi => new StringValue(formatter.Format(fi, System.Globalization.NumberFormatInfo.InvariantInfo, formatString: "0.0")).AsReadOnly())).AsReadOnly(); … … 268 216 269 217 public override double Evaluate(Individual individual, IRandom random) { 270 if (cachedData == null  code == null) { 271 // after loading from file or cloning the data cache and code must be restored 272 UpdateDataCache(); 218 if (featCode == null) { 273 219 UpdateFeaturesAndCode(); 274 220 } 275 221 var b = individual.BinaryVector(Encoding.Name); 276 var fittingOptions = new NativeInterpreter.SolverOptions();277 fittingOptions.Iterations = 10;278 fittingOptions.Algorithm = NativeInterpreter.Algorithm.Krogh;279 fittingOptions.LinearSolver = NativeInterpreter.CeresTypes.LinearSolverType.DENSE_QR;280 fittingOptions.Minimizer = NativeInterpreter.CeresTypes.MinimizerType.TRUST_REGION;281 fittingOptions.TrustRegionStrategy = NativeInterpreter.CeresTypes.TrustRegionStrategyType.LEVENBERG_MARQUARDT;282 283 var evalOptions = new NativeInterpreter.SolverOptions();284 evalOptions.Iterations = 0;285 evalOptions.Algorithm = NativeInterpreter.Algorithm.Krogh;286 evalOptions.LinearSolver = NativeInterpreter.CeresTypes.LinearSolverType.DENSE_QR;287 evalOptions.Minimizer = NativeInterpreter.CeresTypes.MinimizerType.TRUST_REGION;288 evalOptions.TrustRegionStrategy = NativeInterpreter.CeresTypes.TrustRegionStrategyType.LEVENBERG_MARQUARDT;289 222 290 223 var rows = RegressionProblemData.TrainingIndices.ToArray(); … … 295 228 for (int i = 0; i < b.Length; i++) { 296 229 if (b[i] == true) { 297 termIndexList.Add(featureIdx[i]); 298 } 299 } 300 301 var coefficients = new double[termIndexList.Count + 1]; // coefficients for all terms + offset 302 // var avgCoefficients = new double[Features.Length]; 303 304 // 5fold CV 305 var nFolds = 5; 306 var nFoldRows = nRows / nFolds; 307 var fittingRows = new int[(nFolds  1) * nFoldRows]; 308 var fittingResult = new double[fittingRows.Length]; 309 310 var testRows = new int[nFoldRows]; 311 var testResult = new double[nFoldRows]; 312 var cvMSE = new List<double>(); 313 var trainMSE = new List<double>(); 314 for (int testFold = 0; testFold < nFolds; testFold++) { 315 // TODO: THREAD SAFETY FOR PARALLEL EVALUATION 316 // CODE IS MANIPULATED 317 lock (code) { 318 var oldParameterValues = ExtractParameters(code, termIndexList); 319 320 #region fit to training folds 321 // copy rows from the four training folds 322 var nextFoldIdx = 0; 323 for (int fold = 0; fold < nFolds; fold++) { 324 if (fold == testFold) { 325 Array.Copy(allRows, fold * nFoldRows, testRows, 0, nFoldRows); 326 } else { 327 Array.Copy(allRows, fold * nFoldRows, fittingRows, nextFoldIdx, nFoldRows); 328 nextFoldIdx += nFoldRows; 230 termIndexList.Add(i); 231 } 232 } 233 234 var oldParameterValues = ExtractParameters(termIndexList); 235 var alpha = (double[])oldParameterValues.Clone(); 236 237 var target = RegressionProblemData.TargetVariableTrainingValues.ToArray(); 238 239 // local function for feature evaluation 240 void Phi(double[] a, ref double[,] phi) { 241 if (phi == null) { 242 phi = new double[nRows, termIndexList.Count + 1]; // + offset term 243 // last term is constant = 1 244 for (int i = 0; i < nRows; i++) 245 phi[i, termIndexList.Count] = 1.0; 246 } 247 var offset = 0; 248 // for each term 249 for (int i = 0; i < termIndexList.Count; i++) { 250 var termIdx = termIndexList[i]; 251 var numFeatParam = this.featParam[termIdx].Length; 252 var variableValues = new double[featVariables[termIdx].Length]; 253 var featParam = new double[numFeatParam]; 254 Array.Copy(a, offset, featParam, 0, featParam.Length); 255 // for each row 256 for (int j = 0; j < nRows; j++) { 257 // copy row values 258 for (int k = 0; k < variableValues.Length; k++) { 259 variableValues[k] = featVariables[termIdx][k][j]; // featVariables is columnorder 329 260 } 330 } 331 332 var fittingTarget = RegressionProblemData.Dataset.GetDoubleValues(RegressionProblemData.TargetVariable, fittingRows).ToArray(); 333 334 Array.Clear(fittingResult, 0, fittingResult.Length); 335 Array.Clear(coefficients, 0, coefficients.Length); 336 337 338 NativeInterpreter.NativeWrapper.GetValuesVarPro(code, code.Length, termIndexList.ToArray(), termIndexList.Count, fittingRows, fittingRows.Length, 339 coefficients, fittingOptions, fittingResult, fittingTarget, out var optSummary); 340 #endregion 341 342 if (optSummary.InitialCost < 0  optSummary.FinalCost < 0) throw new InvalidProgramException(); 343 trainMSE.Add(optSummary.FinalCost * 2.0 / fittingRows.Length); // ceres cost is 0.5 * sum of squared residuals 344 // average of all coefficients 345 // TODO return a statistic for the relevance of the term instead of the coefficient 346 // for (int k = 0; k < termIndexList.Count; k++) { 347 // avgCoefficients[Array.IndexOf(featureIdx, termIndexList[k])] += coefficients[k] / nFolds; // TODO perf 348 // } 349 350 #region calculate output on test fold to determine CV MSE 351 352 // evaluate all terms for test rows 353 var sumOutput = new double[testResult.Length]; 354 for (int r = 0; r < sumOutput.Length; r++) { 355 sumOutput[r] = coefficients[coefficients.Length  1]; // offset 356 } 357 358 for (int k = 0; k < termIndexList.Count; k++) { 359 var termIdx = termIndexList[k]; 360 // copy relevant part of the code 361 var termCode = new NativeInterpreter.NativeInstruction[code[termIdx].length]; 362 Array.Copy(code, termIdx  termCode.Length + 1, termCode, 0, termCode.Length); 363 NativeInterpreter.NativeWrapper.GetValues(termCode, termCode.Length, testRows, testRows.Length, evalOptions, testResult, null, out var evalSummary); 364 365 for (int r = 0; r < sumOutput.Length; r++) { 366 sumOutput[r] += coefficients[k] * testResult[r]; 261 var tup = featCode[termIdx].Invoke(featParam, variableValues); // TODO for phi we do not actually need g 262 phi[j, i] = tup.Item2; 263 } 264 offset += numFeatParam; 265 } 266 } 267 268 // local function for Jacobian evaluation 269 void Jac(double[] a, ref double[,] J, ref int[,] ind) { 270 if (J == null) { 271 J = new double[nRows, featParam.Sum(fp => fp.Length)]; // all parameters 272 ind = new int[2, featParam.Sum(fp => fp.Length)]; 273 } 274 var offset = 0; 275 // for each term 276 for (int i = 0; i < termIndexList.Count; i++) { 277 var termIdx = termIndexList[i]; 278 var numFeatParam = this.featParam[termIdx].Length; 279 var variableValues = new double[featVariables[termIdx].Length]; 280 var featParam = new double[numFeatParam]; 281 Array.Copy(a, offset, featParam, 0, featParam.Length); 282 283 // for each parameter 284 for (int k = 0; k < featParam.Length; k++) { 285 ind[0, offset + k] = i; // column idx in phi 286 ind[1, offset + k] = offset + k; // parameter idx (no parameter is used twice) 287 } 288 289 // for each row 290 for (int j = 0; j < nRows; j++) { 291 // copy row values 292 for (int k = 0; k < variableValues.Length; k++) { 293 variableValues[k] = featVariables[termIdx][k][j]; // featVariables is columnorder 367 294 } 368 } 369 370 var evalTarget = RegressionProblemData.Dataset.GetDoubleValues(RegressionProblemData.TargetVariable, testRows); 371 if (sumOutput.Any(d => double.IsNaN(d))) cvMSE.Add(evalTarget.VariancePop()); // variance of target is MSE of constant model 372 else cvMSE.Add(sumOutput.Zip(evalTarget, (ri, ti) => { var res = ti  ri; return res * res; }).Average()); 373 374 #endregion 375 376 #region prepare for next varpro call 377 var newParameterValues = ExtractParameters(code, termIndexList); 378 // keep the updated parameter values only when the coefficient is significantly <> 0 379 for (int i = 0; i < termIndexList.Count; i++) { 380 // TODO would actually need to take variance of term into account 381 if (Math.Abs(coefficients[i]) > 1e5) { 382 oldParameterValues[i] = newParameterValues[i]; 295 var tup = featCode[termIdx].Invoke(featParam, variableValues); 296 // for each parameter 297 for (int k = 0; k < featParam.Length; k++) { 298 J[j, offset + k] = tup.Item1[k]; 383 299 } 384 300 } 385 UpdateParameter(code, termIndexList, oldParameterValues); 386 } 387 #endregion 388 } 389 390 391 individual["Train MSE (avg)"] = new DoubleValue(trainMSE.Average()); 392 individual["Parameter"] = new DoubleArray(ExtractParameters(code, termIndexList).SelectMany(arr => arr).ToArray()); // store the parameter which worked for this individual for solution creation 393 // individual["Coefficients"] = new DoubleArray(avgCoefficients); // for allele frequency analysis 394 // return new double[] { avgCost, coefficients.Count(ci => Math.Abs(ci) > 1e9) }; 395 // average of averages 396 individual["CV MSE (avg)"] = new DoubleValue(cvMSE.Average()); 397 individual["CV MSE (stdev)"] = new DoubleValue(cvMSE.StandardDeviation()); 398 return cvMSE.Average() + cvMSE.StandardDeviation(); 399 // return new double[] { cvMSE.Average() + cvMSE.StandardDeviation(), termIndexList.Count }; 301 offset += numFeatParam; 302 } 303 } 304 305 try { 306 HEAL.VarPro.VariableProjection.Fit(Phi, Jac, target, alpha, out var coeff, out var report); 307 308 309 if (report.residNorm < 0) throw new InvalidProgramException(); 310 UpdateParameter(termIndexList, alpha); 311 312 313 individual["Parameter"] = new DoubleArray(alpha); // store the parameter which worked for this individual for solution creation 314 individual["Coeff"] = new DoubleArray(coeff); 315 316 return report.residNormSqr / nRows; 317 } catch (Exception _) { 318 individual["Parameter"] = new DoubleArray(alpha); // store the parameter which worked for this individual for solution creation 319 individual["Coeff"] = new DoubleArray(termIndexList.Count + 1); 320 return double.MaxValue; 321 } 400 322 } 401 323 … … 415 337 if (double.IsNaN(curBestQuality)  bestQ < curBestQuality) { 416 338 var bestVector = bestIndividual.BinaryVector(Encoding.Name); 417 418 var options = new NativeInterpreter.SolverOptions(); 419 options.Iterations = 10; // optimize on full dataset 420 options.Algorithm = NativeInterpreter.Algorithm.Krogh; 421 options.LinearSolver = NativeInterpreter.CeresTypes.LinearSolverType.DENSE_QR; 422 options.Minimizer = NativeInterpreter.CeresTypes.MinimizerType.TRUST_REGION; 423 options.TrustRegionStrategy = NativeInterpreter.CeresTypes.TrustRegionStrategyType.LEVENBERG_MARQUARDT; 424 425 var rows = RegressionProblemData.TrainingIndices.ToArray(); 426 var target = RegressionProblemData.TargetVariableTrainingValues.ToArray(); 427 428 var rowsArray = rows.ToArray(); 429 var nRows = rowsArray.Length; 430 var result = new double[nRows]; 339 var bestParams = ((DoubleArray)bestIndividual["Parameter"]).ToArray(); 340 var bestCoeff = ((DoubleArray)bestIndividual["Coeff"]).ToArray(); 341 // var rows = RegressionProblemData.TrainingIndices.ToArray(); 342 // var target = RegressionProblemData.TargetVariableTrainingValues.ToArray(); 343 // 344 // var rowsArray = rows.ToArray(); 345 // var nRows = rowsArray.Length; 346 // var result = new double[nRows]; 431 347 var termIndexList = new List<int>(); 432 var predictorNames = new List<string>();348 // var predictorNames = new List<string>(); 433 349 for (int i = 0; i < bestVector.Length; i++) { 434 350 if (bestVector[i] == true) { 435 termIndexList.Add(featureIdx[i]); 436 predictorNames.Add(Features[i].Value); 437 } 438 } 439 var coefficients = new double[termIndexList.Count + 1]; // coefficients for all terms + offset 440 // TODO: threadsafety 441 lock (code) { 442 UpdateParameter(code, termIndexList, ((DoubleArray)bestIndividual["Parameter"]).ToArray()); 443 444 NativeInterpreter.NativeWrapper.GetValuesVarPro(code, code.Length, termIndexList.ToArray(), termIndexList.Count, rows, nRows, coefficients, options, result, target, out var optSummary); 445 } 446 447 #region linear model statistics 448 var xy = new double[nRows, termIndexList.Count + 1]; 449 for(int j=0;j<termIndexList.Count;j++) { 450 var t = termIndexList[j]; 451 var termCode = new NativeInterpreter.NativeInstruction[code[t].length]; 452 Array.Copy(code, t  code[t].length + 1, termCode, 0, code[t].length); 453 options.Iterations = 0; 454 NativeInterpreter.NativeWrapper.GetValues(termCode, termCode.Length, rows, nRows, options, result, null, out _); 455 for(int i=0;i<nRows;i++) { xy[i, j] = result[i]; } // copy to matrix 456 } 457 // copy target to matrix 458 for (int i = 0; i < nRows; i++) { xy[i, termIndexList.Count] = target[i]; } // copy to matrix 459 predictorNames.Add("<const>"); // last coefficient is offset 460 // var stats = Statistics.CalculateLinearModelStatistics(xy, coefficients); 461 // results.AddOrUpdateResult("Statistics", stats.AsResultCollection(predictorNames)); 462 #endregion 463 464 results.AddOrUpdateResult("Solution", CreateRegressionSolution(code, termIndexList, coefficients, RegressionProblemData)); 351 termIndexList.Add(i); 352 } 353 } 354 355 results.AddOrUpdateResult("Solution", CreateRegressionSolution(termIndexList.ToArray(), bestParams, bestCoeff, RegressionProblemData)); 465 356 results.AddOrUpdateResult("BestQuality", new DoubleValue(bestQ)); 466 results.AddOrUpdateResult("CV MSE (avg)", bestIndividual["CV MSE (avg)"]);467 results.AddOrUpdateResult("CV MSE (stdev)", bestIndividual["CV MSE (stdev)"]);468 357 } 469 358 } 470 359 471 360 #region retrieval / update of nonlinear parameters 472 private double[] [] ExtractParameters(NativeInterpreter.NativeInstruction[] code,List<int> termIndexList) {473 var p = new double[termIndexList.Count][];361 private double[] ExtractParameters(List<int> termIndexList) { 362 var p = new List<double>(); 474 363 for (int i = 0; i < termIndexList.Count; i++) { 475 var termIdx = termIndexList[i]; 476 var pList = new List<double>(); 477 var start = termIdx  code[termIdx].length + 1; 478 for (int codeIdx = start; codeIdx <= termIdx; codeIdx++) { 479 if (code[codeIdx].optimize) 480 pList.Add(code[codeIdx].value); 481 } 482 p[i] = pList.ToArray(); 483 } 484 return p; 485 } 486 487 // parameters are given as array for each term 488 private void UpdateParameter(NativeInterpreter.NativeInstruction[] code, List<int> termIndexList, double[][] p) { 364 p.AddRange(featParam[termIndexList[i]]); 365 } 366 return p.ToArray(); 367 } 368 369 370 // parameters are given as a flat array 371 private void UpdateParameter(List<int> termIndexList, double[] p) { 372 var offset = 0; 489 373 for (int i = 0; i < termIndexList.Count; i++) { 490 if (p[i].Length == 0) continue; // nothing to update 491 var termIdx = termIndexList[i]; 492 var pIdx = 0; 493 var start = termIdx  code[termIdx].length + 1; 494 for (int codeIdx = start; codeIdx <= termIdx; codeIdx++) { 495 if (code[codeIdx].optimize) 496 code[codeIdx].value = p[i][pIdx++]; 497 } 498 } 499 } 500 501 // parameters are given as a flat array 502 private void UpdateParameter(NativeInterpreter.NativeInstruction[] code, List<int> termIndexList, double[] p) { 503 var pIdx = 0; 504 for (int i = 0; i < termIndexList.Count; i++) { 505 var termIdx = termIndexList[i]; 506 var start = termIdx  code[termIdx].length + 1; 507 for (int codeIdx = start; codeIdx <= termIdx; codeIdx++) { 508 if (code[codeIdx].optimize) 509 code[codeIdx].value = p[pIdx++]; 510 } 374 var numFeatParam = featParam[termIndexList[i]].Length; 375 Array.Copy(p, offset, featParam[termIndexList[i]], 0, numFeatParam); 376 offset += numFeatParam; 511 377 } 512 378 } 513 379 #endregion 514 380 515 516 517 381 #region feature generation 382 /* 518 383 private static ISymbolicExpressionTree[] GenerateFeatures(int n, IRandom random, ISymbolicDataAnalysisGrammar grammar, int maxSize, int maxDepth) { 519 384 var features = new ISymbolicExpressionTree[n]; … … 531 396 return features; 532 397 } 398 */ 533 399 private static ISymbolicExpressionTree[] GenerateFeaturesSystematic(int n, IRandom random, ISymbolicDataAnalysisGrammar grammar, int maxSize, int maxDepth, int maxVariables) { 534 400 var hashes = new HashSet<ulong>(); … … 607 473 608 474 609 private void GenerateCode(ISymbolicExpressionTree[] features, IDataset dataset) { 610 byte mapSupportedSymbols(ISymbolicExpressionTreeNode node) { 611 var opCode = OpCodes.MapSymbolToOpCode(node); 612 if (supportedOpCodes.Contains(opCode)) return opCode; 613 else throw new NotSupportedException($"VarPro does not support {node.Symbol.Name}"); 614 }; 615 616 var i = 0; 617 featureIdx = new int[features.Length]; 618 619 var code = new List<NativeInterpreter.NativeInstruction>(capacity: (int)(features.Sum(fi => fi.Length  2) * 1.2)); 475 private void GenerateCode(ISymbolicExpressionTree[] features, IRegressionProblemData problemData) { 476 this.featCode = new List<TreeToAutoDiffTermConverter.ParametricFunctionGradient>(); 477 this.featParam = new List<double[]>(); 478 this.featVariables = new List<double[][]>(); 620 479 foreach (var f in features) { 621 var featureCode = Compile(f, mapSupportedSymbols);622 623 code.AddRange(featureCode);624 feat ureIdx[i++] = code.Count  1;625 }626 this.code = code.ToArray();480 var featureCode = Compile(f, problemData, out var initialParamValues, out var variableValues); 481 482 featCode.Add(featureCode); 483 featParam.Add(initialParamValues); 484 featVariables.Add(variableValues); 485 } 627 486 } 628 487 … … 650 509 (byte)OpCode.AnalyticQuotient 651 510 }; 652 private NativeInterpreter.NativeInstruction[] Compile(ISymbolicExpressionTree tree, Func<ISymbolicExpressionTreeNode, byte> opCodeMapper) { 653 var root = tree.Root.GetSubtree(0).GetSubtree(0); 654 var code = new List<NativeInterpreter.NativeInstruction>(); 655 foreach (var n in root.IterateNodesPrefix()) { 656 var instr = new NativeInterpreter.NativeInstruction { narg = (ushort)n.SubtreeCount, opcode = opCodeMapper(n), length = 1 }; // length is updated in a second pass below 657 if (n is VariableTreeNode variable) { 658 instr.value = variable.Weight; 659 instr.optimize = false; 660 instr.data = cachedData[variable.VariableName].AddrOfPinnedObject(); 661 } else if (n is ConstantTreeNode constant) { 662 instr.value = constant.Value; 663 if (n.Symbol.Name != "<1.0>") // HACK TODO this depends on the name given in the grammar! 664 instr.optimize = true; // the VarPro grammar is specifically designed to make sure we have all necessary and only necessary nonlinear parameters 665 } 666 if (n.Symbol is Logarithm) { 667 // for log(f(x)) generate code log ( sqrt(f(x)²)) to ensure argument to log is positive 668 code.Add(instr); 669 code.Add(new NativeInterpreter.NativeInstruction { narg = 1, opcode = (byte)OpCode.SquareRoot, length = 1 }); // length is updated in a second pass below 670 code.Add(new NativeInterpreter.NativeInstruction { narg = 1, opcode = (byte)OpCode.Square, length = 1 }); 671 } else { 672 // all other operations are added verbatim 673 code.Add(instr); 674 } 675 } 676 677 code.Reverse(); 678 var codeArr = code.ToArray(); 679 680 // second pass to calculate lengths 681 for (int i = 0; i < codeArr.Length; i++) { 682 var c = i  1; 683 for (int j = 0; j < codeArr[i].narg; ++j) { 684 codeArr[i].length += codeArr[c].length; 685 c = codeArr[c].length; 686 } 687 } 688 689 return codeArr; 511 private TreeToAutoDiffTermConverter.ParametricFunctionGradient Compile(ISymbolicExpressionTree tree, IRegressionProblemData problemData, 512 out double[] initialParameterValues, out double[][] variableValues) { 513 TreeToAutoDiffTermConverter.TryConvertToAutoDiff(tree, makeVariableWeightsVariable: false, addLinearScalingTerms: false, 514 out var parameters, out initialParameterValues, out var func, out var func_grad); 515 variableValues = new double[parameters.Count][]; 516 for (int i = 0; i < parameters.Count; i++) { 517 variableValues[i] = problemData.Dataset.GetDoubleValues(parameters[i].variableName, problemData.TrainingIndices).ToArray(); // TODO: we could reuse the arrays 518 } 519 return func_grad; 690 520 } 691 521 #endregion 692 522 693 523 #region solution creation 694 private IRegressionSolution CreateRegressionSolution(NativeInterpreter.NativeInstruction[] code, IList<int> termIndexList, double[] coefficients, IRegressionProblemData problemData) { 695 // parse back solution from code (required because we need to used optimized parameter values) 696 var addSy = symbols[(byte)OpCode.Add]; 697 var mulSy = symbols[(byte)OpCode.Mul]; 698 var sum = addSy.CreateTreeNode(); 699 for (int i = 0; i < termIndexList.Count; i++) { 700 if (Math.Abs(coefficients[i]) < 1e8) continue; 701 var termIdx = termIndexList[i]; 702 var prod = mulSy.CreateTreeNode(); 703 var constNode = (ConstantTreeNode)constSy.CreateTreeNode(); 704 constNode.Value = coefficients[i]; 705 var term = CreateTree(code, termIdx); 706 prod.AddSubtree(constNode); 707 prod.AddSubtree(term); 708 sum.AddSubtree(prod); 709 } 710 { 711 var constNode = (ConstantTreeNode)constSy.CreateTreeNode(); 712 constNode.Value = coefficients.Last(); 713 sum.AddSubtree(constNode); 714 } 524 private IRegressionSolution CreateRegressionSolution(int[] featIdx, double[] parameters, double[] coefficients, IRegressionProblemData problemData) { 715 525 var root = (new ProgramRootSymbol()).CreateTreeNode(); 716 526 var start = (new StartSymbol()).CreateTreeNode(); 527 var add = (new Addition()).CreateTreeNode(); 717 528 root.AddSubtree(start); 718 start.AddSubtree(sum); 529 start.AddSubtree(add); 530 var offset = 0; 531 for (int i = 0; i < featIdx.Length; i++) { 532 var term = (ISymbolicExpressionTreeNode)features[featIdx[i]].Root.GetSubtree(0).GetSubtree(0).Clone(); 533 534 var termParameters = new double[featParam[featIdx[i]].Length]; 535 Array.Copy(parameters, offset, termParameters, 0, termParameters.Length); 536 ReplaceParameters(term, termParameters); 537 offset += termParameters.Length; 538 539 var mul = (new Multiplication()).CreateTreeNode(); 540 mul.AddSubtree(term); 541 mul.AddSubtree(CreateConstant(coefficients[i])); 542 add.AddSubtree(mul); 543 } 544 // last coeff is offset 545 add.AddSubtree(CreateConstant(coefficients[coefficients.Length  1])); 546 719 547 var tree = new SymbolicExpressionTree(root); 720 548 var ds = problemData.Dataset; 721 var scaledDataset = new Dataset(ds.DoubleVariables, ds.ToArray(ds.DoubleVariables, transformations,Enumerable.Range(0, ds.Rows)));722 var scaledProblemData = new RegressionProblemData(scaledDataset, problemData.AllowedInputVariables, problemData.TargetVariable , transformations);549 var scaledDataset = new Dataset(ds.DoubleVariables, ds.ToArray(ds.DoubleVariables, Enumerable.Range(0, ds.Rows))); 550 var scaledProblemData = new RegressionProblemData(scaledDataset, problemData.AllowedInputVariables, problemData.TargetVariable); 723 551 scaledProblemData.TrainingPartition.Start = problemData.TrainingPartition.Start; 724 552 scaledProblemData.TrainingPartition.End = problemData.TrainingPartition.End; … … 727 555 return new SymbolicRegressionSolution( 728 556 new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeNativeInterpreter()), scaledProblemData); 557 } 558 559 private void ReplaceParameters(ISymbolicExpressionTreeNode term, double[] termParameters) { 560 // Autodiff converter extracts parameters using a preorder tree traversal. 561 // Therefore, we must use a preorder tree traversal here as well. 562 // Only ConstantTreeNode values are optimized. 563 var paramIdx = 0; 564 foreach (var node in term.IterateNodesPrefix().OfType<ConstantTreeNode>()) { 565 node.Value = termParameters[paramIdx++]; 566 } 567 if (paramIdx != termParameters.Length) throw new InvalidProgramException(); 568 } 569 570 private ISymbolicExpressionTreeNode CreateConstant(double coeff) { 571 var constNode = (ConstantTreeNode)(new Constant()).CreateTreeNode(); 572 constNode.Value = coeff; 573 return constNode; 729 574 } 730 575 … … 753 598 754 599 755 private ISymbolicExpressionTreeNode CreateTree(NativeInterpreter.NativeInstruction[] code, int i) {756 switch (code[i].opcode) {757 case (byte)OpCode.Add:758 case (byte)OpCode.Sub:759 case (byte)OpCode.Mul:760 case (byte)OpCode.Div:761 case (byte)OpCode.Exp:762 case (byte)OpCode.Log:763 case (byte)OpCode.Sin:764 case (byte)OpCode.Cos:765 case (byte)OpCode.Tan:766 case (byte)OpCode.Tanh:767 case (byte)OpCode.Square:768 case (byte)OpCode.SquareRoot:769 case (byte)OpCode.Cube:770 case (byte)OpCode.CubeRoot:771 case (byte)OpCode.Absolute:772 case (byte)OpCode.AnalyticQuotient: {773 var node = symbols[code[i].opcode].CreateTreeNode();774 var c = i  1;775 for (int childIdx = 0; childIdx < code[i].narg; childIdx++) {776 node.AddSubtree(CreateTree(code, c));777 c = c  code[c].length;778 }779 return node;780 }781 case (byte)OpCode.Constant: {782 var node = (ConstantTreeNode)constSy.CreateTreeNode();783 node.Value = code[i].value;784 return node;785 }786 case (byte)OpCode.Variable: {787 var node = (VariableTreeNode)varSy.CreateTreeNode();788 node.Weight = code[i].value;789 // TODO perf790 node.VariableName = string.Empty;791 foreach (var tup in this.cachedData) {792 if (tup.Value.AddrOfPinnedObject() == code[i].data) {793 node.VariableName = tup.Key;794 break;795 }796 }797 return node;798 }799 default: throw new NotSupportedException("unknown opcode");800 }801 }802 600 #endregion 803 601 … … 809 607 Operators.Add(new AlleleFrequencyAnalyzer()); 810 608 811 var cvMSEAnalyzer = new BestAverageWorstQualityAnalyzer();812 cvMSEAnalyzer.Name = "CVMSE Analzer";813 ParameterizeAnalyzer(cvMSEAnalyzer, "CV MSE (avg)");814 Operators.Add(cvMSEAnalyzer);815 816 var trainingMSEAnalyzer = new BestAverageWorstQualityAnalyzer();817 trainingMSEAnalyzer.Name = "Training MSE Analzer";818 ParameterizeAnalyzer(trainingMSEAnalyzer, "Train MSE (avg)");819 Operators.Add(trainingMSEAnalyzer);609 // var cvMSEAnalyzer = new BestAverageWorstQualityAnalyzer(); 610 // cvMSEAnalyzer.Name = "CVMSE Analzer"; 611 // ParameterizeAnalyzer(cvMSEAnalyzer, "CV MSE (avg)"); 612 // Operators.Add(cvMSEAnalyzer); 613 // 614 // var trainingMSEAnalyzer = new BestAverageWorstQualityAnalyzer(); 615 // trainingMSEAnalyzer.Name = "Training MSE Analzer"; 616 // ParameterizeAnalyzer(trainingMSEAnalyzer, "Train MSE (avg)"); 617 // Operators.Add(trainingMSEAnalyzer); 820 618 821 619 ParameterizeOperators();
Note: See TracChangeset
for help on using the changeset viewer.