Context Navigation

← Previous Changeset
Next Changeset →

Changeset 11747

Timestamp:

01/12/15 21:23:01 (10 years ago)

Author:

gkronber

Message:

#2283: implemented test problems for MCTS

Location:

branches/HeuristicLab.Problems.GrammaticalOptimization

Files:

: 3 added
: 21 edited

HeuristicLab.Algorithms.Bandits/BanditPolicies/ActiveLearningPolicy.cs (added)
HeuristicLab.Algorithms.Bandits/BanditPolicies/BernoulliPolicyActionInfo.cs (modified) (2 diffs)
HeuristicLab.Algorithms.Bandits/BanditPolicies/BoltzmannExplorationPolicy.cs (modified) (2 diffs)
HeuristicLab.Algorithms.Bandits/BanditPolicies/DefaultPolicyActionInfo.cs (modified) (3 diffs)
HeuristicLab.Algorithms.Bandits/BanditPolicies/MeanAndVariancePolicyActionInfo.cs (modified) (2 diffs)
HeuristicLab.Algorithms.Bandits/BanditPolicies/ModelPolicyActionInfo.cs (modified) (2 diffs)
HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs (modified) (3 diffs)
HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1Policy.cs (modified) (2 diffs)
HeuristicLab.Algorithms.Bandits/BanditPolicies/UCTPolicy.cs (modified) (2 diffs)
HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj (modified) (1 diff)
HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs (modified) (1 diff)
HeuristicLab.Algorithms.Bandits/Models/GaussianMixtureModel.cs (modified) (4 diffs)
HeuristicLab.Algorithms.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization.csproj (modified) (1 diff)
HeuristicLab.Algorithms.GrammaticalOptimization/MctsContextualSampler.cs (modified) (16 diffs)
HeuristicLab.Algorithms.GrammaticalOptimization/MctsSampler.cs (modified) (9 diffs)
HeuristicLab.Algorithms.GrammaticalOptimization/TemporalDifferenceTreeSearchSampler.cs (modified) (6 diffs)
HeuristicLab.Problems.GrammaticalOptimization.SymbReg/SymbolicRegressionProblem.cs (modified) (1 diff)
HeuristicLab.Problems.GrammaticalOptimization.Test/TestInstances.cs (modified) (1 diff)
HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization.csproj (modified) (1 diff)
HeuristicLab.Problems.GrammaticalOptimization/RoyalPhraseSequenceProblem.cs (added)
HeuristicLab.Problems.GrammaticalOptimization/RoyalSequenceProblem.cs (added)
HeuristicLab.Problems.GrammaticalOptimization/SantaFeAntProblem.cs (modified) (1 diff)
HeuristicLab.Problems.GrammaticalOptimization/SymbolicRegressionPoly10Problem.cs (modified) (1 diff)
Main/Program.cs (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BernoulliPolicyActionInfo.cs

-                      r11742
+                      r11747
 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
   public class BernoulliPolicyActionInfo : IBanditPolicyActionInfo {
+    private double knownValue;
     public bool Disabled { get { return NumSuccess == -1; } }
     public int NumSuccess { get; private set; }
     public int NumFailure { get; private set; }
     public int Tries { get { return NumSuccess + NumFailure; } }
+    public double Value { get { return NumSuccess / (double)(Tries); } }
+    public double Value {
+      get {
+        if (Disabled) return knownValue;
+        else
+          return NumSuccess / (double)(Tries);
+      }
+    }
     public void UpdateReward(double reward) {
       Debug.Assert(!Disabled);
 …
       else NumFailure++;
+    }
     public void Disable() {
+    public void Disable(double reward) {
       this.NumSuccess = -1;
       this.NumFailure = -1;
+      this.knownValue = reward;
+    }
     public void Reset() {
       NumSuccess = 0;
       NumFailure = 0;
+      knownValue = 0.0;
+    }
     public void PrintStats() {

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BoltzmannExplorationPolicy.cs

-                      r11742
+                      r11747
     private readonly Func<DefaultPolicyActionInfo, double> valueFunction;
     public BoltzmannExplorationPolicy(double eps) : this(eps, DefaultPolicyActionInfo.AverageReward) { }
+    public BoltzmannExplorationPolicy(double beta) : this(beta, DefaultPolicyActionInfo.AverageReward) { }
     public BoltzmannExplorationPolicy(double beta, Func<DefaultPolicyActionInfo, double> valueFunction) {
 …
       // select best
       var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
+      Debug.Assert(myActionInfos.Any(a => !a.Disabled));
+      // try any of the untried actions randomly
+      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
+      //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
+      //  return myActionInfos
+      //  .Select((aInfo, idx) => new { aInfo, idx })
+      //  .Where(p => !p.aInfo.Disabled)
+      //  .Where(p => p.aInfo.Tries == 0)
+      //  .SelectRandom(random).idx;
+      //}
       var w = from aInfo in myActionInfos

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/DefaultPolicyActionInfo.cs

-                      r11742
+                      r11747
   // stores information that is relevant for most of the policies
   public class DefaultPolicyActionInfo : IBanditPolicyActionInfo {
+    private double knownValue;
     public bool Disabled { get { return Tries == -1; } }
     public double SumReward { get; private set; }
     public int Tries { get; private set; }
     public double MaxReward { get; private set; }
+    public double Value { get { return SumReward / Tries; } }
+    public double Value {
+      get {
+        if (Disabled) return knownValue;
+        else
+          return Tries > 0 ? SumReward / Tries : 0.0;
+      }
+    }
     public DefaultPolicyActionInfo() {
       MaxReward = double.MinValue;
 …
       MaxReward = Math.Max(MaxReward, reward);
+    }
     public void Disable() {
+    public void Disable(double reward) {
       this.Tries = -1;
       this.SumReward = 0.0;
+      this.knownValue = reward;
+    }
     public void Reset() {
 …
       Tries = 0;
       MaxReward = 0.0;
+      knownValue = 0.0;
+    }
     public void PrintStats() {

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/MeanAndVariancePolicyActionInfo.cs

-                      r11742
+                      r11747
     public bool Disabled { get { return disabled; } }
     private OnlineMeanAndVarianceEstimator estimator = new OnlineMeanAndVarianceEstimator();
+    private double knownValue;
     public int Tries { get { return estimator.N; } }
     public double SumReward { get { return estimator.Sum; } }
     public double AvgReward { get { return estimator.Avg; } }
     public double RewardVariance { get { return estimator.Variance; } }
+    public double Value { get { return AvgReward; } }
+    public double Value {
+      get {
+        if (disabled) return knownValue;
+        else
+          return AvgReward;
+      }
+    }
     public void UpdateReward(double reward) {
 …
+    }
     public void Disable() {
+    public void Disable(double reward) {
       disabled = true;
+      this.knownValue = reward;
+    }
     public void Reset() {
       disabled = false;
+      knownValue = 0.0;
       estimator.Reset();
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ModelPolicyActionInfo.cs

-                      r11744
+                      r11747
   public class ModelPolicyActionInfo : IBanditPolicyActionInfo {
     private readonly IModel model;
+    private double knownValue;
     public bool Disabled { get { return Tries == -1; } }
+    public double Value { get { return model.SampleExpectedReward(new Random()); } }
+    public double Value {
+      get {
+        if (Disabled) return knownValue;
+        else
+          return model.SampleExpectedReward(new Random());
+      }
+    }
     public int Tries { get; private set; }
 …
+    }
     public void Disable() {
+    public void Disable(double reward) {
       this.Tries = -1;
+      this.knownValue = reward;
+    }
     public void Reset() {
       Tries = 0;
+      knownValue = 0.0;
       model.Reset();
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs

-                      r11744
+                      r11747
       public int Tries { get; private set; }
       public int thresholdBin = 1;
+      public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } }
+      private double knownValue;
+      public double Value {
+        get {
+          if (Disabled) return knownValue;
+          if(Tries == 0.0) return 0.0;
+          return rewardHistogram[thresholdBin] / (double)Tries;
+        }
+      }
       public bool Disabled { get { return Tries == -1; } }
 …
+      }
+      public void Disable() {
+      public void Disable(double reward) {
+        this.knownValue = reward;
         Tries = -1;
+      }
 …
         Tries = 0;
         thresholdBin = 1;
+        this.knownValue = 0.0;
         Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
+      }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1Policy.cs

-                      r11745
+                      r11747
 using System.Text;
 using System.Threading.Tasks;
+using HeuristicLab.Common;
 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
 …
     public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
       var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
-      int bestAction = -1;
       double bestQ = double.NegativeInfinity;
       int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
+      var bestActions = new List<int>();
       int aIdx = -1;
       foreach (var aInfo in myActionInfos) {
         aIdx++;
         if (aInfo.Disabled) continue;
+        if (aInfo.Tries == 0) return aIdx;
+        var q = aInfo.SumReward / aInfo.Tries + Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
+        double q;
+        if (aInfo.Tries == 0) {
+          q = double.PositiveInfinity;
+        } else {
+          q = aInfo.SumReward / aInfo.Tries + 0.5 * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
+        }
         if (q > bestQ) {
           bestQ = q;
+          bestAction = aIdx;
+          bestActions.Clear();
+          bestActions.Add(aIdx);
+        } else if (q == bestQ) {
+          bestActions.Add(aIdx);
+        }
+      }
       Debug.Assert(bestAction > -1);
       return bestAction;
+      Debug.Assert(bestActions.Any());
+      return bestActions.SelectRandom(random);
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCTPolicy.cs

-                      r11742
+                      r11747
 using System.Text;
 using System.Threading.Tasks;
+using HeuristicLab.Common;
 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
   /* Kocsis et al. Bandit based Monte-Carlo Planning */
 …
       int aIdx = -1;
+      var bestActions = new List<int>();
       foreach (var aInfo in myActionInfos) {
         aIdx++;
         if (aInfo.Disabled) continue;
+        if (aInfo.Tries == 0) return aIdx;
+        var q = aInfo.SumReward / aInfo.Tries + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / aInfo.Tries);
+        double q;
+        if (aInfo.Tries == 0) {
+          q = double.PositiveInfinity;
+        } else {
+          q = aInfo.SumReward / aInfo.Tries + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / aInfo.Tries);
+        }
         if (q > bestQ) {
+          bestActions.Clear();
           bestQ = q;
           bestAction = aIdx;
+          bestActions.Add(aIdx);
+        }
+        if (q == bestQ) {
+          bestActions.Add(aIdx);
+        }
+      }
       Debug.Assert(bestAction > -1);
       return bestAction;
+      Debug.Assert(bestActions.Any());
+      return bestActions.SelectRandom(random);
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj

r11744	r11747
48	48	<Compile Include="BanditPolicies\BoltzmannExplorationPolicy.cs" />
49	49	<Compile Include="BanditPolicies\ChernoffIntervalEstimationPolicy.cs" />
	50	<Compile Include="BanditPolicies\ActiveLearningPolicy.cs" />
50	51	<Compile Include="BanditPolicies\DefaultPolicyActionInfo.cs" />
51	52	<Compile Include="BanditPolicies\EpsGreedyPolicy.cs" />

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs

r11742	r11747
11	11	int Tries { get; }
12	12	void UpdateReward(double reward);
13		void Disable();
	13	void Disable(double reward);
14	14	// reset causes the state of the action to be reinitialized (as after constructor-call)
15	15	void Reset();

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/GaussianMixtureModel.cs

-                      r11744
+                      r11747
 namespace HeuristicLab.Algorithms.Bandits.Models {
   public class GaussianMixtureModel : IModel {
+    private readonly double[] componentMeans;
+    private readonly double[] componentVars;
+    private readonly double[] componentProbs;
+    private double[] componentMeans;
+    private double[] componentVars;
+    private double[] componentProbs;
+    private readonly List<double> allRewards = new List<double>();
     private int numComponents;
 …
     public GaussianMixtureModel(int nComponents = 5) {
       this.numComponents = nComponents;
+      this.componentProbs = new double[nComponents];
+      this.componentMeans = new double[nComponents];
+      this.componentVars = new double[nComponents];
+      Reset();
+    }
 …
     public void Update(double reward) {
+      // see http://www.cs.toronto.edu/~mackay/itprnn/ps/302.320.pdf Algorithm 22.2 soft k-means
+      throw new NotImplementedException();
+      allRewards.Add(reward);
+      throw new NotSupportedException("this does not yet work");
+      if (allRewards.Count < 1000 && allRewards.Count % 10 == 0) {
+        // see http://www.cs.toronto.edu/~mackay/itprnn/ps/302.320.pdf Algorithm 22.2 soft k-means
+        Reset();
+        for (int i = 0; i < 20; i++) {
+          var responsibilities = allRewards.Select(r => CalcResponsibility(r)).ToArray();
+          var sumWeightedRewards = new double[numComponents];
+          var sumResponsibilities = new double[numComponents];
+          foreach (var p in allRewards.Zip(responsibilities, Tuple.Create)) {
+            for (int k = 0; k < numComponents; k++) {
+              sumWeightedRewards[k] += p.Item2[k] * p.Item1;
+              sumResponsibilities[k] += p.Item2[k];
+            }
+          }
+          for (int k = 0; k < numComponents; k++) {
+            componentMeans[k] = sumWeightedRewards[k] / sumResponsibilities[k];
+          }
+          sumWeightedRewards = new double[numComponents];
+          foreach (var p in allRewards.Zip(responsibilities, Tuple.Create)) {
+            for (int k = 0; k < numComponents; k++) {
+              sumWeightedRewards[k] += p.Item2[k] * Math.Pow(p.Item1 - componentMeans[k], 2);
+            }
+          }
+          for (int k = 0; k < numComponents; k++) {
+            componentVars[k] = sumWeightedRewards[k] / sumResponsibilities[k];
+            componentProbs[k] = sumResponsibilities[k] / sumResponsibilities.Sum();
+          }
+        }
+      }
+    }
+    private double[] CalcResponsibility(double r) {
+      var res = new double[numComponents];
+      for (int k = 0; k < numComponents; k++) {
+        componentVars[k] = Math.Max(componentVars[k], 0.001);
+        res[k] = componentProbs[k] * alglib.normaldistribution((r - componentMeans[k]) / Math.Sqrt(componentVars[k]));
+        res[k] = Math.Max(res[k], 0.0001);
+      }
+      var sum = res.Sum();
+      for (int k = 0; k < numComponents; k++) {
+        res[k] /= sum;
+      }
+      return res;
+    }
 …
     public void Reset() {
+      Array.Clear(componentMeans, 0, numComponents);
+      Array.Clear(componentVars, 0, numComponents);
+      Array.Clear(componentProbs, 0, numComponents);
+      var rand = new Random();
+      this.componentProbs = Enumerable.Range(0, numComponents).Select((_) => rand.NextDouble()).ToArray();
+      var sum = componentProbs.Sum();
+      for (int i = 0; i < componentProbs.Length; i++) componentProbs[i] /= sum;
+      this.componentMeans = Enumerable.Range(0, numComponents).Select((_) => Rand.RandNormal(rand)).ToArray();
+      this.componentVars = Enumerable.Range(0, numComponents).Select((_) => 0.01).ToArray();
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization.csproj

r11744	r11747
45	45	<Compile Include="AlternativesSampler.cs" />
46	46	<Compile Include="AlternativesContextSampler.cs" />
	47	<Compile Include="MctsQLearningSampler.cs" />
47	48	<Compile Include="TemporalDifferenceTreeSearchSampler.cs" />
48	49	<Compile Include="ExhaustiveRandomFirstSearch.cs" />

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/MctsContextualSampler.cs

-                      r11745
+                      r11747
       public int randomTries;
       public int tries;
+      public List<TreeNode> parents;
       public TreeNode[] children;
       public bool done = false;
 …
         this.ident = id;
         this.alt = alt;
+        this.parents = new List<TreeNode>();
+      }
 …
+    }
+    private Dictionary<string, TreeNode> treeNodes;
+    private TreeNode GetTreeNode(string id, ReadonlySequence alt) {
+      TreeNode n;
+      var canonicalId = problem.CanonicalRepresentation(id);
+      if (!treeNodes.TryGetValue(canonicalId, out n)) {
+        n = new TreeNode(canonicalId, alt);
+        tries.TryGetValue(canonicalId, out n.tries);
+        treeNodes[canonicalId] = n;
+      }
+      return n;
+    }
     public event Action<string, double> FoundNewBestSolution;
 …
       this.v = new Dictionary<string, double>(1000000);
       this.tries = new Dictionary<string, int>(1000000);
+      treeNodes = new Dictionary<string, TreeNode>();
+    }
 …
       InitPolicies(problem.Grammar);
       for (int i = 0; !rootNode.done && i < maxIterations; i++) {
+        var sentence = SampleSentence(problem.Grammar).ToString();
+        var quality = problem.Evaluate(sentence) / problem.BestKnownQuality(maxLen);
+        Debug.Assert(quality >= 0 && quality <= 1.0);
+        DistributeReward(quality);
+        RaiseSolutionEvaluated(sentence, quality);
+        if (quality > bestQuality) {
+          bestQuality = quality;
+          RaiseFoundNewBestSolution(sentence, quality);
+        bool success;
+        var sentence = SampleSentence(problem.Grammar, out success).ToString();
+        if (success) {
+          var quality = problem.Evaluate(sentence) / problem.BestKnownQuality(maxLen);
+          Debug.Assert(quality >= 0 && quality <= 1.0);
+          DistributeReward(quality);
+          RaiseSolutionEvaluated(sentence, quality);
+          if (quality > bestQuality) {
+            bestQuality = quality;
+            RaiseFoundNewBestSolution(sentence, quality);
+          }
+        }
+      }
 …
       Console.WriteLine("depth: {0,5} size: {1,10} root tries {2,10}, rootQ {3:F3}, bestQ {4:F3}", treeDepth, treeSize, n.tries, V(n), bestQuality);
       while (n.children != null) {
         Console.WriteLine("{0}", n.ident);
         double maxVForRow = n.children.Select(ch => V(ch)).Max();
+        Console.WriteLine("{0,-30}", n.ident);
+        double maxVForRow = n.children.Select(ch => Math.Min(1.0, Math.Max(0.0, V(ch)))).Max();
         if (maxVForRow == 0) maxVForRow = 1.0;
         for (int i = 0; i < n.children.Length; i++) {
           var ch = n.children[i];
           Console.ForegroundColor = ConsoleEx.ColorForValue(V(ch) / maxVForRow);
+          Console.ForegroundColor = ConsoleEx.ColorForValue(Math.Min(1.0, V(ch)) / maxVForRow);
           Console.Write("{0,5}", ch.alt);
+        }
 …
         for (int i = 0; i < n.children.Length; i++) {
           var ch = n.children[i];
           Console.ForegroundColor = ConsoleEx.ColorForValue(V(ch) / maxVForRow);
           Console.Write("{0,5:F2}", V(ch) * 10);
+          Console.ForegroundColor = ConsoleEx.ColorForValue(Math.Min(1.0, V(ch)) / maxVForRow);
+          Console.Write("{0,5:F2}", Math.Min(1.0, V(ch)) * 10);
+        }
         Console.WriteLine();
         for (int i = 0; i < n.children.Length; i++) {
           var ch = n.children[i];
           Console.ForegroundColor = ConsoleEx.ColorForValue(V(ch) / maxVForRow);
+          Console.ForegroundColor = ConsoleEx.ColorForValue(Math.Min(1.0, V(ch)) / maxVForRow);
           Console.Write("{0,5}", ch.done ? "X" : ch.tries.ToString());
+        }
 …
         Console.WriteLine();
         //n.policy.PrintStats();
         n = n.children.Where(ch => !ch.done).OrderByDescending(c => V(c)).First();
+        n = n.children.Where(ch => !ch.done).OrderByDescending(c => c.tries).First();
+      }
+    }
 …
       this.tries.Clear();
       rootNode = new TreeNode(grammar.SentenceSymbol.ToString(), new ReadonlySequence("$"));
+      rootNode = GetTreeNode(grammar.SentenceSymbol.ToString(), new ReadonlySequence("$"));
       treeDepth = 0;
       treeSize = 0;
+    }
     private Sequence SampleSentence(IGrammar grammar) {
+    private Sequence SampleSentence(IGrammar grammar, out bool success) {
       updateChain.Clear();
       //var startPhrase = new Sequence("a*b+c*d+e*f+E");
       var startPhrase = new Sequence(grammar.SentenceSymbol);
       return CompleteSentence(grammar, startPhrase);
+    }
     private Sequence CompleteSentence(IGrammar g, Sequence phrase) {
+      return CompleteSentence(grammar, startPhrase, out success);
+    }
+    private Sequence CompleteSentence(IGrammar g, Sequence phrase, out bool success) {
       if (phrase.Length > maxLen) throw new ArgumentException();
       if (g.MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
 …
           n.randomTries++;
           treeDepth = Math.Max(treeDepth, curDepth);
+          success = true;
           return g.CompleteSentenceRandomly(random, phrase, maxLen);
         } else {
 …
               newPhrase.ReplaceAt(newPhrase.FirstNonTerminalIndex, 1, alt);
               if (!newPhrase.IsTerminal) newPhrase = newPhrase.Subsequence(0, newPhrase.FirstNonTerminalIndex + 1);
+              n.children[i++] = new TreeNode(newPhrase.ToString(), new ReadonlySequence(alt));
+              var treeNode = GetTreeNode(newPhrase.ToString(), new ReadonlySequence(alt));
+              treeNode.parents.Add(n);
+              n.children[i++] = treeNode;
+            }
             treeSize += n.children.Length;
+            UpdateDone(n);
+            // it could happen that we already finished all variations starting from the branch
+            // stop
+            if (n.done) {
+              success = false;
+              return phrase;
+            }
+          }
+          //int selectedAltIdx = SelectRandom(random, n.children);
           // => select using eps-greedy
           int selectedAltIdx = SelectEpsGreedy(random, n.children);
 …
           curDepth++;
           // prepare for next iteration
           parent = n;
           n = n.children[selectedAltIdx];
+          //UpdateTD(parent, n, 0.0);
+        }
       } // while
 …
       treeDepth = Math.Max(treeDepth, curDepth);
+      success = true;
       return phrase;
+    }
+    //private void UpdateTD(TreeNode parent, TreeNode child, double reward) {
+    //  double alpha = 1.0;
+    //  var vParent = V(parent);
+    //  var vChild = V(child);
+    //  if (double.IsInfinity(vParent)) vParent = 0.0;
+    //  if (double.IsInfinity(vChild)) vChild = 0.0;
+    //  UpdateV(parent, (alpha * (reward + vChild - vParent)));
+    //}
     private void DistributeReward(double reward) {
       // iterate in reverse order (bottom up)
+      updateChain.Reverse();
+      //updateChain.Reverse();
+      UpdateDone(updateChain.Last().Item1);
+      //UpdateTD(updateChain.Last().Item2, updateChain.Last().Item1, reward);
+      //return;
+      BackPropReward(updateChain.Last().Item1, reward);
+      /*
       foreach (var e in updateChain) {
         var node = e.Item1;
         var parent = e.Item2;
+        //var parent = e.Item2;
         node.tries++;
         if (node.children != null && node.children.All(c => c.done)) {
           node.done = true;
+        }
+        //if (node.children != null && node.children.All(c => c.done)) {
+        //  node.done = true;
+        //}
         UpdateV(node, reward);
         // the reward for the parent is either the just received reward or the value of the best action so far
         double value = 0.0;
         if (parent != null) {
           var doneChilds = parent.children.Where(ch => ch.done);
           if (doneChilds.Any()) value = doneChilds.Select(ch => V(ch)).Max();
+        }
+        //double value = 0.0;
+        //if (parent != null) {
+        //  var doneChilds = parent.children.Where(ch => ch.done);
+        //  if (doneChilds.Any()) value = doneChilds.Select(ch => V(ch)).Max();
+        //}
         //if (value > reward) reward = value;
+      }
+    }
+      }*/
+    }
+    private void BackPropReward(TreeNode n, double reward) {
+      n.tries++;
+      UpdateV(n, reward);
+      foreach (var p in n.parents) BackPropReward(p, reward);
+    }
+    private void UpdateDone(TreeNode n) {
+      if (!n.done && n.children != null && n.children.All(c => c.done)) n.done = true;
+      if (n.done) foreach (var p in n.parents) UpdateDone(p);
+    }
     private Dictionary<string, double> v;
 …
         tries.Add(canonicalStr, 1);
       } else {
         v[canonicalStr] = stateV + 0.005 * (reward - stateV);
         //v[canonicalStr] = stateV + (1.0 / tries[canonicalStr]) * (reward - stateV);
+        //v[canonicalStr] = stateV + 0.005 * (reward - stateV);
+        v[canonicalStr] = stateV + (1.0 / tries[canonicalStr]) * (reward - stateV);
         tries[canonicalStr]++;
+      }
 …
       //var canonicalStr = n.ident;
       double stateV;
+      if (!tries.ContainsKey(canonicalStr)) return double.PositiveInfinity;
       if (!v.TryGetValue(canonicalStr, out  stateV)) {
         return 0.0;
 …
+    }
+    private int SelectRandom(Random random, TreeNode[] children) {
+      return children.Select((ch, i) => Tuple.Create(ch, i)).Where(p => !p.Item1.done).SelectRandom(random).Item2;
+    }
     private int SelectEpsGreedy(Random random, TreeNode[] children) {
       if (random.NextDouble() < 0.2) {
+        return children.Select((ch, i) => Tuple.Create(ch, i)).Where(p => !p.Item1.done).SelectRandom(random).Item2;
+        return SelectRandom(random, children);
       } else {
         var bestQ = double.NegativeInfinity;

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/MctsSampler.cs

-                      r11745
+                      r11747
 using System.Text;
 using HeuristicLab.Algorithms.Bandits;
+using HeuristicLab.Common;
 using HeuristicLab.Problems.GrammaticalOptimization;
 …
       public int randomTries;
       public IBanditPolicyActionInfo actionInfo;
+      public TreeNode parent;
       public TreeNode[] children;
       public bool done = false;
       public TreeNode(string id) {
+      public TreeNode(string id, TreeNode parent) {
         this.ident = id;
+        this.parent = parent;
+      }
 …
     private readonly IBanditPolicy policy;
     private List<TreeNode> updateChain;
+    private TreeNode lastNode; // the bottom node in one episode
     private TreeNode rootNode;
 …
       Console.WriteLine("depth: {0,5} size: {1,10} root tries {2,10}, rootQ {3:F3}, bestQ {4:F3}", treeDepth, treeSize, n.actionInfo.Tries, n.actionInfo.Value, bestQuality);
       while (n.children != null) {
+        Console.WriteLine("{0,-30}", n.ident);
+        double maxVForRow = n.children.Select(ch => ch.actionInfo.Value).Max();
+        if (maxVForRow == 0) maxVForRow = 1.0;
+        for (int i = 0; i < n.children.Length; i++) {
+          var ch = n.children[i];
+          SetColorForChild(ch, maxVForRow);
+          Console.Write("{0,5}", ch.ident);
+        }
         Console.WriteLine();
+        Console.WriteLine("{0,5}->{1,-50}", n.ident, string.Join(" ", n.children.Select(ch => string.Format("{0,4}", ch.ident))));
+        Console.WriteLine("{0,5}  {1,-50}", string.Empty, string.Join(" ", n.children.Select(ch => string.Format("{0,4:F2}", ch.actionInfo.Value * 10))));
+        Console.WriteLine("{0,5}  {1,-50}", string.Empty, string.Join(" ", n.children.Select(ch => string.Format("{0,4}", ch.done ? "X" : ch.actionInfo.Tries.ToString()))));
+        for (int i = 0; i < n.children.Length; i++) {
+          var ch = n.children[i];
+          SetColorForChild(ch, maxVForRow);
+          Console.Write("{0,5:F2}", ch.actionInfo.Value * 10);
+        }
+        Console.WriteLine();
+        for (int i = 0; i < n.children.Length; i++) {
+          var ch = n.children[i];
+          SetColorForChild(ch, maxVForRow);
+          Console.Write("{0,5}", ch.done ? "X" : ch.actionInfo.Tries.ToString());
+        }
+        Console.ForegroundColor = ConsoleColor.White;
+        Console.WriteLine();
         //n.policy.PrintStats();
+        n = n.children.Where(ch => !ch.done).OrderByDescending(c => c.actionInfo.Value).First();
+      }
+        //n = n.children.Where(ch => !ch.done).OrderByDescending(c => c.actionInfo.Value).First();
+        n = n.children.Where(ch=>!ch.done).OrderByDescending(c => c.actionInfo.Value).First();
+      }
+      Console.WriteLine("-----------------------");
+    }
+    private void SetColorForChild(TreeNode ch, double maxVForRow) {
+      //if (ch.done) Console.ForegroundColor = ConsoleColor.White;
+      //else
+      Console.ForegroundColor = ConsoleEx.ColorForValue(ch.actionInfo.Value / maxVForRow);
+    }
     private void InitPolicies(IGrammar grammar) {
+      this.updateChain = new List<TreeNode>();
       rootNode = new TreeNode(grammar.SentenceSymbol.ToString());
+      rootNode = new TreeNode(grammar.SentenceSymbol.ToString(), null);
       rootNode.actionInfo = policy.CreateActionInfo();
       treeDepth = 0;
 …
     private Sequence SampleSentence(IGrammar grammar) {
       updateChain.Clear();
+      lastNode = null;
       var startPhrase = new Sequence(grammar.SentenceSymbol);
+      //var startPhrase = new Sequence("a*b+c*d+e*f+E");
       return CompleteSentence(grammar, startPhrase);
+    }
 …
       var curDepth = 0;
       while (!phrase.IsTerminal) {
-        updateChain.Add(n);
         if (n.randomTries < randomTries) {
           n.randomTries++;
           treeDepth = Math.Max(treeDepth, curDepth);
+          lastNode = n;
           return g.CompleteSentenceRandomly(random, phrase, maxLen);
         } else {
 …
           if (n.randomTries == randomTries && n.children == null) {
             n.children = alts.Select(alt => new TreeNode(alt.ToString())).ToArray(); // create a new node for each alternative
+            n.children = alts.Select(alt => new TreeNode(alt.ToString(), n)).ToArray(); // create a new node for each alternative
             foreach (var ch in n.children) ch.actionInfo = policy.CreateActionInfo();
             treeSize += n.children.Length;
 …
       } // while
       updateChain.Add(n);
+      lastNode = n;
 …
     private void DistributeReward(double reward) {
       // iterate in reverse order (bottom up)
+      updateChain.Reverse();
+      foreach (var e in updateChain) {
+        var node = e;
+        if (node.done) node.actionInfo.Disable();
+      var node = lastNode;
+      while (node != null) {
+        if (node.done) node.actionInfo.Disable(reward);
         if (node.children != null && node.children.All(c => c.done)) {
           node.done = true;
+          node.actionInfo.Disable();
+          var bestActionValue = node.children.Select(c => c.actionInfo.Value).Max();
+          node.actionInfo.Disable(bestActionValue);
+        }
         if (!node.done) {
           node.actionInfo.UpdateReward(reward);
+        }
+        node = node.parent;
+      }
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/TemporalDifferenceTreeSearchSampler.cs

-                      r11744
+                      r11747
     private readonly Random random;
     private readonly int randomTries;
-    private readonly IBanditPolicy policy;
     private List<TreeNode> updateChain;
 …
     public TemporalDifferenceTreeSearchSampler(IProblem problem, int maxLen, Random random, int randomTries, IBanditPolicy policy) {
+    public TemporalDifferenceTreeSearchSampler(IProblem problem, int maxLen, Random random, int randomTries) {
       this.maxLen = maxLen;
       this.problem = problem;
       this.random = random;
       this.randomTries = randomTries;
-      this.policy = policy;
+    }
 …
       Console.WriteLine("depth: {0,5} size: {1,10} root tries {2,10}, rootQ {3:F3}, bestQ {4:F3}", treeDepth, treeSize, n.tries, n.q, bestQuality);
       while (n.children != null) {
+        Console.WriteLine("{0,-30}", n.ident);
+        double maxVForRow = n.children.Select(ch => ch.q).Max();
+        if (maxVForRow == 0) maxVForRow = 1.0;
+        for (int i = 0; i < n.children.Length; i++) {
+          var ch = n.children[i];
+          Console.ForegroundColor = ConsoleEx.ColorForValue(ch.q / maxVForRow);
+          Console.Write("{0,5}", ch.ident);
+        }
         Console.WriteLine();
+        Console.WriteLine("{0,5}->{1,-50}", n.ident, string.Join(" ", n.children.Select(ch => string.Format("{0,4}", ch.ident))));
+        Console.WriteLine("{0,5}  {1,-50}", string.Empty, string.Join(" ", n.children.Select(ch => string.Format("{0,4:F2}", ch.q * 10))));
+        Console.WriteLine("{0,5}  {1,-50}", string.Empty, string.Join(" ", n.children.Select(ch => string.Format("{0,4}", ch.done ? "X" : ch.tries.ToString()))));
+        for (int i = 0; i < n.children.Length; i++) {
+          var ch = n.children[i];
+          Console.ForegroundColor = ConsoleEx.ColorForValue(ch.q / maxVForRow);
+          Console.Write("{0,5:F2}", ch.q * 10);
+        }
+        Console.WriteLine();
+        for (int i = 0; i < n.children.Length; i++) {
+          var ch = n.children[i];
+          Console.ForegroundColor = ConsoleEx.ColorForValue(ch.q / maxVForRow);
+          Console.Write("{0,5}", ch.done ? "X" : ch.tries.ToString());
+        }
+        Console.ForegroundColor = ConsoleColor.White;
+        Console.WriteLine();
         //n.policy.PrintStats();
         n = n.children.Where(ch => !ch.done).OrderByDescending(c => c.q).First();
+      }
-      //Console.ReadLine();
+    }
 …
+          }
           // => select using bandit policy
           int selectedAltIdx = SelectAction(random, n.children);
+          int selectedAltIdx = SelectEpsGreedy(random, n.children);
           Sequence selectedAlt = alts.ElementAt(selectedAltIdx);
 …
     // eps-greedy
     private int SelectAction(Random random, TreeNode[] children) {
+    private int SelectEpsGreedy(Random random, TreeNode[] children) {
       if (random.NextDouble() < 0.1) {
 …
       } else {
         var bestQ = double.NegativeInfinity;
         var bestChildIdx = -1;
+        var bestChildIdx = new List<int>();
         for (int i = 0; i < children.Length; i++) {
           if (children[i].done) continue;
+          if (children[i].tries == 0) return i;
+          if (children[i].q > bestQ) {
+            bestQ = children[i].q;
+            bestChildIdx = i;
+          // if (children[i].tries == 0) return i;
+          var q = children[i].q;
+          if (q > bestQ) {
+            bestQ = q;
+            bestChildIdx.Clear();
+            bestChildIdx.Add(i);
+          } else if (q == bestQ) {
+            bestChildIdx.Add(i);
+          }
+        }
         Debug.Assert(bestChildIdx > -1);
         return bestChildIdx;
+        Debug.Assert(bestChildIdx.Any());
+        return bestChildIdx.SelectRandom(random);
+      }
+    }
     private void DistributeReward(double reward) {
-      const double alpha = 0.1;
-      const double gamma = 1;
-      // iterate in reverse order (bottom up)
       updateChain.Reverse();
+      var nextQ = 0.0;
+      foreach (var e in updateChain) {
+        var node = e;
+        node.tries++;
+      foreach (var node in updateChain) {
         if (node.children != null && node.children.All(c => c.done)) {
           node.done = true;
+        }
+        // reward is received only for the last action
+        if (e == updateChain.First()) {
+          node.q = node.q + alpha * (reward + gamma * nextQ - node.q);
+          nextQ = node.q;
+        } else {
+          node.q = node.q + alpha * (0 + gamma * nextQ - node.q);
+          nextQ = node.q;
+        }
+      }
+      }
+      updateChain.Reverse();
+      //const double alpha = 0.1;
+      const double gamma = 1;
+      double alpha;
+      foreach (var p in updateChain.Zip(updateChain.Skip(1), Tuple.Create)) {
+        var parent = p.Item1;
+        var child = p.Item2;
+        parent.tries++;
+        alpha = 1.0 / parent.tries;
+        //alpha = 0.01;
+        parent.q = parent.q + alpha * (0 + gamma * child.q - parent.q);
+      }
+      // reward is received only for the last action
+      var n = updateChain.Last();
+      n.tries++;
+      alpha = 1.0 / n.tries;
+      //alpha = 0.1;
+      n.q = n.q + alpha * reward;
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization.SymbReg/SymbolicRegressionProblem.cs

-                      r11742
+                      r11747
     // right now only + and * are supported
     public string CanonicalRepresentation(string terminalPhrase) {
       return terminalPhrase;
       //var terms = terminalPhrase.Split('+');
       //return string.Join("+", terms.Select(term => string.Join("", term.Replace("*", "").OrderBy(ch => ch)))
       //  .OrderBy(term => term));
+      //return terminalPhrase;
+      var terms = terminalPhrase.Split('+');
+      return string.Join("+", terms.Select(term => string.Join("", term.Replace("*", "").OrderBy(ch => ch)))
+        .OrderBy(term => term));
+    }
+  }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization.Test/TestInstances.cs

r11730	r11747
256	256	Assert.AreEqual(0.116199534934045, p.Evaluate("cfj"), 1.0E-7);
257	257
	258	Assert.AreEqual(0.824522210419616, p.Evaluate("ab+cd+e*f"), 1E-7);
	259
258	260
259	261	Assert.AreEqual(1.0, p.Evaluate("ab+cd+ef+agi+cf*j"), 1.0E-7);

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization.csproj

r11732	r11747
43	43	</ItemGroup>
44	44	<ItemGroup>
	45	<Compile Include="RoyalPhraseSequenceProblem.cs" />
	46	<Compile Include="RoyalSequenceProblem.cs" />
45	47	<Compile Include="ExpressionInterpreter.cs" />
46	48	<Compile Include="Grammar.cs" />

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/SantaFeAntProblem.cs

-                      r11742
+                      r11747
     public string CanonicalRepresentation(string terminalPhrase) {
+      return terminalPhrase.Replace("rl", "").Replace("lr", "");
+      //return terminalPhrase;
+      string oldPhrase;
+      do {
+        oldPhrase = terminalPhrase;
+        terminalPhrase.Replace("ll", "rr").Replace("rl", "lr");
+      } while (terminalPhrase != oldPhrase);
+      return terminalPhrase;
+    }
+  }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/SymbolicRegressionPoly10Problem.cs

r11745	r11747
16	16	private const string grammarString = @"
17	17	G(E):
18		E -> a \| b \| c \| d \| e \| f \| g \| h \| i \| j \| a+E \| b+E \| c+E \| d+E \| e+E \| f+E \| g+E \| h+E \| i+E \| j+E \| aE \| bE \| cE \| dE \| eE \| fE \| gE \| hE \| iE \| jE
	18	E -> a \| b \| c \| d \| e \| f \| g \| h \| i \| j \| a+E \| b+E \| c+E \| d+E \| e+E \| f+E \| g+E \| h+E \| i+E \| j+E \| aE \| bE \| cE \| dE \| eE \| fE \| gE \| hE \| iE \| jE
19	19	";
20	20

branches/HeuristicLab.Problems.GrammaticalOptimization/Main/Program.cs

-                      r11745
+                      r11747
           Tuple.Create((IProblem)new SymbolicRegressionPoly10Problem(), 23),
         })
         foreach (var randomTries in new int[] { 1, 10, /* 5, 100 /*, 500, 1000 */}) {
+        foreach (var randomTries in new int[] { 0, 1, 10, /* 5, 100 /*, 500, 1000 */}) {
           foreach (var policy in policies) {
             var myRandomTries = randomTries;
 …
     private static void RunDemo() {
+      // TODO: unify MCTS, TD and ContextMCTS Solvers (stateInfos)
       // TODO: test with eps-greedy using max instead of average as value (seems to work well for symb-reg! explore further!)
       // TODO: separate value function from policy
 …
       var random = new Random();
+      var problem = new SymbolicRegressionPoly10Problem();   // good results e.g. 10 randomtries and EpsGreedyPolicy(0.2, (aInfo)=>aInfo.MaxReward)
+      var phraseLen = 1;
+      var sentenceLen = 25;
+      var numPhrases = sentenceLen / phraseLen;
+      var problem = new RoyalPhraseSequenceProblem(random, 10, numPhrases, phraseLen: 1, k: 1, correctReward: 1, incorrectReward: 0);
+      //var problem = new SymbolicRegressionPoly10Problem();   // good results e.g. 10 randomtries and EpsGreedyPolicy(0.2, (aInfo)=>aInfo.MaxReward)
       // Ant
       // good results e.g. with       var alg = new MctsSampler(problem, 17, random, 1, (rand, numActions) => new ThresholdAscentPolicy(numActions, 500, 0.01));
 …
       //var problem = new RoyalPairProblem();
       //var problem = new EvenParityProblem();
+      //var alg = new MctsSampler(problem, 23, random, 0, new GaussianThompsonSamplingPolicy(true));
+      var alg = new MctsContextualSampler(problem, 23, random, 0);
+      //var alg = new TemporalDifferenceTreeSearchSampler(problem, 17, random, 10, new EpsGreedyPolicy(0.1));
+      //var alg = new ExhaustiveBreadthFirstSearch(problem, 17);
+      // symbreg length = 11 q = 0.824522210419616
+      var alg = new MctsSampler(problem, sentenceLen, random, 0, new BoltzmannExplorationPolicy(200));
+      //var alg = new MctsQLearningSampler(problem, sentenceLen, random, 0, null);
+      //var alg = new MctsQLearningSampler(problem, 30, random, 0, new EpsGreedyPolicy(0.2));
+      //var alg = new MctsContextualSampler(problem, 23, random, 0); // must visit each canonical solution only once
+      //var alg = new TemporalDifferenceTreeSearchSampler(problem, 30, random, 1);
+      //var alg = new ExhaustiveBreadthFirstSearch(problem, 7);
       //var alg = new AlternativesContextSampler(problem, random, 17, 4, (rand, numActions) => new RandomPolicy(rand, numActions));
       //var alg = new ExhaustiveDepthFirstSearch(problem, 17);
       // var alg = new AlternativesSampler(problem, 17);
       // var alg = new RandomSearch(problem, random, 17);
       // var alg = new ExhaustiveRandomFirstSearch(problem, random, 17);
+      //var alg = new ExhaustiveRandomFirstSearch(problem, random, 17);
       alg.FoundNewBestSolution += (sentence, quality) => {
 …
           alg.PrintStats();
+        }
+        //Console.WriteLine(sentence);
         if (iterations % 10000 == 0) {
-          //Console.WriteLine("{0,10} {1,10:F5} {2,10:F5} {3}", iterations, bestQuality, quality, sentence);
-          //Console.WriteLine("{0,4} {1,7} {2}", alg.treeDepth, alg.treeSize, globalStatistics);
           //Console.WriteLine("{0,4} {1,7} {2}", alg.treeDepth, alg.treeSize, globalStatistics);
+        }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 11747

Legend:

Download in other formats: