Context Navigation

← Previous Changeset
Next Changeset →

Changeset 12294

Timestamp:

04/08/15 10:09:47 (10 years ago)

Author:

gkronber

Message:

#2283: worked on Q-Learning for poly10

Location:

branches/HeuristicLab.Problems.GrammaticalOptimization-gkr

Files:

: 4 edited

HeuristicLab.Algorithms.GrammaticalOptimization/SequentialDecisionPolicies/GenericPolicy.cs (modified) (5 diffs)
HeuristicLab.Problems.GrammaticalOptimization/Grammar.cs (modified) (1 diff)
HeuristicLab.Problems.GrammaticalOptimization/Problems/SymbolicRegressionPoly10Problem.cs (modified) (1 diff)
Main/Program.cs (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.GrammaticalOptimization/SequentialDecisionPolicies/GenericPolicy.cs

-                      r12291
+                      r12294
   // resampling is not prevented
   public sealed class GenericPolicy : IGrammarPolicy {
+    private Dictionary<string, IBanditPolicyActionInfo> stateInfo; // stores the necessary information for bandit policies for each state
+    private Dictionary<string, double> Q; // current Q-value estimate per state, keyed by the CalcState(...) result
+    private Dictionary<string, int> T; // number of tries (updates) per state, keyed like Q
+    private Dictionary<string, List<string>> followStates; // after-states recorded the first time a state is seen when selecting an action
     private readonly IProblem problem;
-    private readonly IBanditPolicy banditPolicy;
     private readonly HashSet<string> done; // contains all visited chains
     public GenericPolicy(IProblem problem, IBanditPolicy banditPolicy) {
+    public GenericPolicy(IProblem problem) {
       this.problem = problem;
+      this.banditPolicy = banditPolicy;
+      this.stateInfo = new Dictionary<string, IBanditPolicyActionInfo>();
+      this.Q = new Dictionary<string, double>();
+      this.T = new Dictionary<string, int>();
+      this.followStates = new Dictionary<string, List<string>>();
       this.done = new HashSet<string>();
+    }
     private IBanditPolicyActionInfo[] activeAfterStates; // don't allocate each time
+    private double[] activeAfterStates; // don't allocate each time
     private int[] actionIndexMap; // don't allocate each time
 …
       if (activeAfterStates == null || activeAfterStates.Length < afterStates.Count()) {
         activeAfterStates = new IBanditPolicyActionInfo[afterStates.Count()];
+        activeAfterStates = new double[afterStates.Count()];
         actionIndexMap = new int[afterStates.Count()];
+      }
+      if (!followStates.ContainsKey(curState)) {
+        followStates[curState] = new List<string>(afterStates);
+      }
       var idx = 0; int originalIdx = 0;
       foreach (var afterState in afterStates) {
         if (!Done(afterState)) {
           activeAfterStates[idx] = GetStateInfo(afterState);
+          activeAfterStates[idx] = GetValue(afterState);
           actionIndexMap[idx] = originalIdx;
           idx++;
 …
+      }
+      selectedStateIdx = actionIndexMap[banditPolicy.SelectAction(random, activeAfterStates.Take(idx))];
+      //var eps = Math.Max(500.0 / (GetTries(curState) + 1), 0.01);
+      //var eps = 10.0 / Math.Sqrt(GetTries(curState) + 1);
+      var eps = 0.2;
+      selectedStateIdx = actionIndexMap[SelectEpsGreedy(random, activeAfterStates.Take(idx), eps)];
       return true;
+    }
+    private int SelectBoltzmann(Random random, IEnumerable<double> qs, double beta = 10) {
+      // select best
+      // try any of the untried actions randomly
+      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
+      //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
+      //  return myActionInfos
+      //  .Select((aInfo, idx) => new { aInfo, idx })
+      //  .Where(p => !p.aInfo.Disabled)
+      //  .Where(p => p.aInfo.Tries == 0)
+      //  .SelectRandom(random).idx;
+      //}
+    private IBanditPolicyActionInfo GetStateInfo(string state) {
+      var s = CalcState(state);
+      IBanditPolicyActionInfo info;
+      if (!stateInfo.TryGetValue(s, out info)) {
+        info = banditPolicy.CreateActionInfo();
+        stateInfo[s] = info;
+      }
+      return info;
+      var w = from q in qs
+              select Math.Exp(beta * q);
+      var bestAction = Enumerable.Range(0, qs.Count()).SampleProportional(random, w);
+      Debug.Assert(bestAction >= 0);
+      return bestAction;
+    }
+    public void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {
+      foreach (var state in stateTrajectory.Reverse()) {
+        GetStateInfo(state).UpdateReward(reward);
+    private int SelectEpsGreedy(Random random, IEnumerable<double> qs, double eps = 0.2) { // returns an index into qs: a best-valued one when the draw is >= eps, otherwise a random one
+      if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
+        // select best
+        var bestActions = new List<int>();
+        double bestQ = double.NegativeInfinity;
+        // actually only the last state can be terminal
+        if (problem.Grammar.IsTerminal(state)) { // NOTE(review): `state` is not defined in this method — this line and the next look like leftovers of the removed UpdateReward body in the diff view
+          MarkAsDone(state); // NOTE(review): see previous line; confirm against the real r12294 file
+        int aIdx = -1;
+        foreach (var q in qs) {
+          aIdx++;
+          if (q > bestQ) { // strictly better value: restart the list of candidates
+            bestActions.Clear();
+            bestActions.Add(aIdx);
+            bestQ = q;
+          } else if (q.IsAlmost(bestQ)) { // tie with the current best: keep it as another candidate
+            bestActions.Add(aIdx);
+          }
+        }
+        Debug.Assert(bestActions.Any());
+        return bestActions.SelectRandom(random); // ties are broken by a random pick among the candidates
+      } else {
+        // select random
+        return SelectRandom(random, qs);
+      }
+    }
+    private int SelectRandom(Random random, IEnumerable<double> qs) { // returns the index of a randomly chosen entry of qs; the values themselves are ignored
+      return qs
+         .Select((aInfo, idx) => Tuple.Create(aInfo, idx)) // keep each value's position so it survives the random pick
+         .SelectRandom(random).Item2; // SelectRandom is an extension not shown here — presumably a uniform draw, TODO confirm
+    }
+    public void UpdateReward(IEnumerable<string> chainTrajectory, double reward) { // updates T and Q for every chain of the trajectory, walking from the last chain back to the first
+      const double gamma = 0.95; // factor applied to the best follow-state value in the non-terminal update below
+      const double minAlpha = 0.01; // lower bound for the step size alpha
+      var reverseChains = chainTrajectory.Reverse();
+      var terminalChain = reverseChains.First(); // last chain of the trajectory; NOTE(review): First() throws on an empty trajectory — confirm callers never pass one
+      var terminalState = CalcState(terminalChain);
+      T[terminalState] = GetTries(terminalChain) + 1; // count the try first so the division below sees the updated count
+      double alpha = Math.Max(1.0 / GetTries(terminalChain), minAlpha); // step size 1/tries, never below minAlpha
+      Q[terminalState] = (1 - alpha) * GetValue(terminalChain) + alpha * reward; // only the last chain receives the reward directly
+      foreach (var chain in reverseChains.Skip(1)) { // remaining chains, still in reverse order
+        var maxNextQ = followStates[chain] // NOTE(review): followStates is filled under key curState elsewhere; a missing key would throw here — confirm both use the same key
+          //.Where(s=>!Done(s))
+          .Select(GetValue).Max(); // best current value among the recorded after-states
+        T[CalcState(chain)] = GetTries(chain) + 1;
+        alpha = Math.Max(1.0 / GetTries(chain), minAlpha);
+        Q[CalcState(chain)] = (1 - alpha) * GetValue(chain) + gamma * alpha * maxNextQ; // direct contribution is zero
+      }
+      if (problem.Grammar.IsTerminal(terminalChain)) MarkAsDone(terminalChain); // a terminal last chain is marked as done
+    }
     public void Reset() {
       stateInfo.Clear();
+      Q.Clear();
       done.Clear();
+      followStates.Clear();
+    }
 …
     public int GetTries(string state) {
       var s = CalcState(state);
       if (stateInfo.ContainsKey(s)) return stateInfo[s].Tries;
+      if (T.ContainsKey(s)) return T[s];
       else return 0;
+    }
     public double GetValue(string state) {
       var s = CalcState(state);
       if (stateInfo.ContainsKey(s)) return stateInfo[s].Value;
+    public double GetValue(string chain) {
+      var s = CalcState(chain);
+      if (Q.ContainsKey(s)) return Q[s];
       else return 0.0; // TODO: check alternatives
+    }
 …
       return f.First().Id;
+    }
+    public void PrintStats() { // writes the best Q-value and two top-50 rankings (by tries, by Q) to the console
+      Console.WriteLine(Q.Values.Max()); // NOTE(review): Max() throws while Q is still empty — confirm this is only called after an update
+      var topTries = Q.Keys.OrderByDescending(key => T[key]).Take(50); // most-tried states; relies on every Q key also being present in T
+      var topQs = Q.Keys.Where(key=>key.Contains(",")).OrderByDescending(key => Q[key]).Take(50); // highest-valued states whose key contains a comma
+      foreach (var t in topTries.Zip(topQs, Tuple.Create)) { // pairs the two rankings row by row; Zip stops at the shorter one
+        var id1 = t.Item1;
+        var id2 = t.Item2;
+        Console.WriteLine("{0,30} {1,6} {2:N4} {3,30} {4,6} {5:N4}", id1, T[id1], Q[id1], id2, T[id2], Q[id2]); // per ranking: key, tries, Q
+      }
+    }
+  }
+}

branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Problems.GrammaticalOptimization/Grammar.cs

r12014	r12294
184	184	Debug.Assert(maxLenOfReplacement > 0);
185	185
186		var alts = GetAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
	186	var alts = GetTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
187	187	Debug.Assert(alts.Any());
188	188

branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Problems.GrammaticalOptimization/Problems/SymbolicRegressionPoly10Problem.cs

-                      r12290
+                      r12294
       //   .Concat(new Feature[] { new Feature(CanonicalRepresentation(phrase), 1.0) });
+      var partialInterpreter = new PartialExpressionInterpreter();
+      var vars = new double[] { 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, };
+      var s = partialInterpreter.Interpret(phrase, vars);
+      //if (s.Any())
+      //  return new Feature[] { new Feature(s.Pop().ToString(), 1.0), };
+      //else
+      //  return new Feature[] { new Feature("$", 1.0), };
+      return new Feature[] { new Feature(string.Join(",", s), 1.0) };
+      //return new Feature[] { new Feature(phrase, 1.0), };
+       var partialInterpreter = new PartialExpressionInterpreter();
+       var vars = new double[] { 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, };
+       var s = partialInterpreter.Interpret(phrase, vars);
+       //if (s.Any())
+       //  return new Feature[] { new Feature(s.Pop().ToString(), 1.0), };
+       //else
+       //  return new Feature[] { new Feature("$", 1.0), };
+       return new Feature[] { new Feature(string.Join(",", s), 1.0) };
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/Main/Program.cs

-                      r12290
+                      r12294
         // var alg = new SequentialSearch(problem, 25, random, 0,
         //   new HeuristicLab.Algorithms.Bandits.GrammarPolicies.GenericGrammarPolicy(problem, new UCB1TunedPolicy()));
+        var alg = new SequentialSearch(problem, 25, random, 0,
+          new GenericPolicy(problem, new HeuristicLab.Algorithms.Bandits.BanditPolicies.EpsGreedyPolicy(0.1)));
+        var policy = new GenericPolicy(problem);
+        var alg = new SequentialSearch(problem, 23, random, 1,
+          policy);
         //var alg = new MonteCarloTreeSearch(problem, 23, random, new UCB1Policy(), new RandomSimulation(problem, random, 30));
 …
             if (iterations % 1000 == 0) Console.Clear();
             Console.SetCursorPosition(0, 0);
+            alg.PrintStats();
+            // alg.PrintStats();
+            policy.PrintStats();
+          }

Note: See TracChangeset for help on using the changeset viewer.