Context Navigation

← Previous Change
Next Change →

HeuristicLab.Algorithms.GrammaticalOptimization

Timestamp:

04/08/15 10:09:47 (9 years ago)

Author:

gkronber

Message:

#2283: worked on Q-Learning for poly10

File:

: 1 edited

branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.GrammaticalOptimization/SequentialDecisionPolicies/GenericPolicy.cs (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.GrammaticalOptimization/SequentialDecisionPolicies/GenericPolicy.cs

-                      r12291
+                      r12294
   // resampling is not prevented
   public sealed class GenericPolicy : IGrammarPolicy {
+    private Dictionary<string, IBanditPolicyActionInfo> stateInfo; // stores the necessary information for bandit policies for each state
+    private Dictionary<string, double> Q; // current Q-value estimate for each state (keyed by CalcState(chain)); read by GetValue, written by UpdateReward
+    private Dictionary<string, int> T; // number of tries (updates) recorded for each state; read by GetTries
+    private Dictionary<string, List<string>> followStates; // after-states of a state, recorded the first time that state is seen during action selection
     private readonly IProblem problem;
-    private readonly IBanditPolicy banditPolicy;
     private readonly HashSet<string> done; // contains all visited chains
     public GenericPolicy(IProblem problem, IBanditPolicy banditPolicy) {
+    public GenericPolicy(IProblem problem) {
       this.problem = problem;
+      this.banditPolicy = banditPolicy;
+      this.stateInfo = new Dictionary<string, IBanditPolicyActionInfo>();
+      this.Q = new Dictionary<string, double>();
+      this.T = new Dictionary<string, int>();
+      this.followStates = new Dictionary<string, List<string>>();
       this.done = new HashSet<string>();
+    }
     private IBanditPolicyActionInfo[] activeAfterStates; // don't allocate each time
+    private double[] activeAfterStates; // don't allocate each time
     private int[] actionIndexMap; // don't allocate each time
 …
       if (activeAfterStates == null || activeAfterStates.Length < afterStates.Count()) {
         activeAfterStates = new IBanditPolicyActionInfo[afterStates.Count()];
+        activeAfterStates = new double[afterStates.Count()];
         actionIndexMap = new int[afterStates.Count()];
+      }
+      if (!followStates.ContainsKey(curState)) {
+        followStates[curState] = new List<string>(afterStates);
+      }
       var idx = 0; int originalIdx = 0;
       foreach (var afterState in afterStates) {
         if (!Done(afterState)) {
           activeAfterStates[idx] = GetStateInfo(afterState);
+          activeAfterStates[idx] = GetValue(afterState);
           actionIndexMap[idx] = originalIdx;
           idx++;
 …
+      }
+      selectedStateIdx = actionIndexMap[banditPolicy.SelectAction(random, activeAfterStates.Take(idx))];
+      //var eps = Math.Max(500.0 / (GetTries(curState) + 1), 0.01);
+      //var eps = 10.0 / Math.Sqrt(GetTries(curState) + 1);
+      var eps = 0.2;
+      selectedStateIdx = actionIndexMap[SelectEpsGreedy(random, activeAfterStates.Take(idx), eps)];
       return true;
+    }
+    private int SelectBoltzmann(Random random, IEnumerable<double> qs, double beta = 10) {
+      // select best
+      // try any of the untried actions randomly
+      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
+      //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
+      //  return myActionInfos
+      //  .Select((aInfo, idx) => new { aInfo, idx })
+      //  .Where(p => !p.aInfo.Disabled)
+      //  .Where(p => p.aInfo.Tries == 0)
+      //  .SelectRandom(random).idx;
+      //}
+    private IBanditPolicyActionInfo GetStateInfo(string state) {
+      var s = CalcState(state);
+      IBanditPolicyActionInfo info;
+      if (!stateInfo.TryGetValue(s, out info)) {
+        info = banditPolicy.CreateActionInfo();
+        stateInfo[s] = info;
+      }
+      return info;
+      var w = from q in qs
+              select Math.Exp(beta * q);
+      var bestAction = Enumerable.Range(0, qs.Count()).SampleProportional(random, w);
+      Debug.Assert(bestAction >= 0);
+      return bestAction;
+    }
+    public void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {
+      foreach (var state in stateTrajectory.Reverse()) {
+        GetStateInfo(state).UpdateReward(reward);
+    private int SelectEpsGreedy(Random random, IEnumerable<double> qs, double eps = 0.2) {
+      if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
+        // select best
+        var bestActions = new List<int>();
+        double bestQ = double.NegativeInfinity;
+        // actually only the last state can be terminal
+        if (problem.Grammar.IsTerminal(state)) {
+          MarkAsDone(state);
+        int aIdx = -1;
+        foreach (var q in qs) {
+          aIdx++;
+          if (q > bestQ) {
+            bestActions.Clear();
+            bestActions.Add(aIdx);
+            bestQ = q;
+          } else if (q.IsAlmost(bestQ)) {
+            bestActions.Add(aIdx);
+          }
+        }
+        Debug.Assert(bestActions.Any());
+        return bestActions.SelectRandom(random);
+      } else {
+        // select random
+        return SelectRandom(random, qs);
+      }
+    }
+    private int SelectRandom(Random random, IEnumerable<double> qs) { // returns the position of a randomly chosen entry of qs
+      return qs
+         .Select((aInfo, idx) => Tuple.Create(aInfo, idx)) // pair every value with its position in qs
+         .SelectRandom(random).Item2; // SelectRandom is an extension method defined elsewhere (presumably a uniform pick -- confirm); only the position (Item2) is returned
+    }
+    public void UpdateReward(IEnumerable<string> chainTrajectory, double reward) { // backs the reward up along the trajectory, last chain first
+      const double gamma = 0.95; // factor applied to the best follow-state value when updating the earlier chains
+      const double minAlpha = 0.01; // lower bound for the step size alpha
+      var reverseChains = chainTrajectory.Reverse();
+      var terminalChain = reverseChains.First(); // last element of the trajectory; First() throws if the trajectory is empty
+      var terminalState = CalcState(terminalChain);
+      T[terminalState] = GetTries(terminalChain) + 1;
+      double alpha = Math.Max(1.0 / GetTries(terminalChain), minAlpha); // step size 1/tries (tries was incremented on the previous line), never below minAlpha
+      Q[terminalState] = (1 - alpha) * GetValue(terminalChain) + alpha * reward; // only the last chain receives the reward directly
+      foreach (var chain in reverseChains.Skip(1)) {
+        var maxNextQ = followStates[chain] // NOTE(review): the indexer throws KeyNotFoundException when chain has no followStates entry -- confirm every earlier chain was registered
+          //.Where(s=>!Done(s))
+          .Select(GetValue).Max();
+        T[CalcState(chain)] = GetTries(chain) + 1;
+        alpha = Math.Max(1.0 / GetTries(chain), minAlpha);
+        Q[CalcState(chain)] = (1 - alpha) * GetValue(chain) + gamma * alpha * maxNextQ; // direct contribution is zero
+      }
+      if (problem.Grammar.IsTerminal(terminalChain)) MarkAsDone(terminalChain);
+    }
     public void Reset() {
       stateInfo.Clear();
+      Q.Clear(); // NOTE(review): the tries dictionary T is not cleared in this method, so GetTries keeps pre-reset counts -- confirm whether that is intended
       done.Clear();
+      followStates.Clear();
+    }
 …
     public int GetTries(string state) {
       var s = CalcState(state);
       if (stateInfo.ContainsKey(s)) return stateInfo[s].Tries;
+      if (T.ContainsKey(s)) return T[s];
       else return 0;
+    }
     public double GetValue(string state) {
       var s = CalcState(state);
       if (stateInfo.ContainsKey(s)) return stateInfo[s].Value;
+    public double GetValue(string chain) {
+      var s = CalcState(chain);
+      if (Q.ContainsKey(s)) return Q[s];
       else return 0.0; // TODO: check alternatives
+    }
 …
       return f.First().Id;
+    }
+    public void PrintStats() { // writes the largest Q value, then a side-by-side table of the most-tried states and the highest-valued states
+      Console.WriteLine(Q.Values.Max()); // Max() throws InvalidOperationException when Q is empty
+      var topTries = Q.Keys.OrderByDescending(key => T[key]).Take(50); // up to 50 states with the most tries
+      var topQs = Q.Keys.Where(key=>key.Contains(",")).OrderByDescending(key => Q[key]).Take(50); // up to 50 highest-valued states among keys containing ","
+      foreach (var t in topTries.Zip(topQs, Tuple.Create)) { // Zip stops at the shorter sequence, so fewer rows are printed when few keys contain ","
+        var id1 = t.Item1;
+        var id2 = t.Item2;
+        Console.WriteLine("{0,30} {1,6} {2:N4} {3,30} {4,6} {5:N4}", id1, T[id1], Q[id1], id2, T[id2], Q[id2]);
+      }
+    }
+  }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 12294 for branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.GrammaticalOptimization

Legend:

branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.GrammaticalOptimization/SequentialDecisionPolicies/GenericPolicy.cs

Download in other formats: