Context Navigation

← Previous Change
Next Change →

HeuristicLab.Algorithms.Bandits

Timestamp:

01/18/15 18:24:58 (10 years ago)

Author:

gkronber

Message:

#2283 fixed compile errors and refactoring

Location:

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits

Files:

6 deleted
8 edited

BanditPolicies/EmptyPolicyActionInfo.cs (deleted)
BanditPolicies/EpsGreedyPolicy.cs (modified) (1 diff)
GrammarPolicies/BoltzmanExplorationPolicy.cs (deleted)
GrammarPolicies/EpsGreedyPolicy.cs (deleted)
GrammarPolicies/GenericGrammarPolicy.cs (modified) (7 diffs)
GrammarPolicies/GrammarPolicy.cs (modified) (3 diffs)
GrammarPolicies/GreedyPolicy.cs (deleted)
GrammarPolicies/IGrammarPolicy.cs (modified) (1 diff)
GrammarPolicies/RandomNoResamplingPolicy.cs (deleted)
GrammarPolicies/RandomPolicy.cs (modified) (1 diff)
GrammarPolicies/TDPolicy.cs (modified) (7 diffs)
GrammarPolicies/UCTPolicy.cs (deleted)
HeuristicLab.Algorithms.Bandits.csproj (modified) (1 diff)
IPolicy.cs (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/EpsGreedyPolicy.cs

r11742	r11793
26	26	public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
27	27	Debug.Assert(actionInfos.Any());
28		if (random.NextDouble() > ~~eps) {~~
	28	if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
29	29	// select best
30	30	var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GenericGrammarPolicy.cs

-                      r11792
+                      r11793
     private readonly IProblem problem;
     private readonly IBanditPolicy banditPolicy;
     private readonly HashSet<string> done;
+    //private readonly HashSet<string> done;
     public GenericGrammarPolicy(IProblem problem, IBanditPolicy banditPolicy, bool useCanonicalState = false) {
 …
       this.banditPolicy = banditPolicy;
       this.stateInfo = new Dictionary<string, IBanditPolicyActionInfo>();
       this.done = new HashSet<string>();
+      //this.done = new HashSet<string>();
+    }
+    public bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates,
+      out ReadonlySequence selectedState) {
+      // only select states that are not yet done
+      afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a))).ToArray();
+      if (!afterStates.Any()) {
+    public bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) {
+      // fail if all states are done (corresponding state infos are disabled)
+      if (afterStates.All(s => GetStateInfo(s).Disabled)) {
         // fail because all follow states have already been visited => also disable the current state (if we can be sure that it has been fully explored)
         done.Add(CanonicalState(curState));
         selectedState = null;
+        GetStateInfo(curState).Disable(0.0); // should the value be max of afterstate values instead of 0.0?
+        selectedStateIdx = -1;
         return false;
+      }
+      selectedStateIdx = banditPolicy.SelectAction(random, afterStates.Select(s => GetStateInfo(s)));
-      var selectedIdx = banditPolicy.SelectAction(random, afterStates.Select(s => GetStateInfo(s)));
-      selectedState = afterStates.ElementAt(selectedIdx);
       return true;
+    }
     private IBanditPolicyActionInfo GetStateInfo(ReadonlySequence state) {
+    private IBanditPolicyActionInfo GetStateInfo(string state) {
       var s = CanonicalState(state);
       IBanditPolicyActionInfo info;
 …
+    }
     public virtual void UpdateReward(IEnumerable<ReadonlySequence> stateTrajectory, double reward) {
+    public virtual void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {
       // the last state could be terminal
       var lastState = stateTrajectory.Last();
+      if (lastState.IsTerminal) done.Add(CanonicalState(lastState));
+      if (problem.Grammar.IsTerminal(lastState)) {
+        GetStateInfo(lastState).Disable(reward);
+      }
+      foreach (var state in stateTrajectory) {
+      // update remaining states
+      foreach (var state in stateTrajectory.Reverse().Skip(1)) {
         GetStateInfo(state).UpdateReward(reward);
+      }
 …
     public virtual void Reset() {
       stateInfo.Clear();
       done.Clear();
+      //done.Clear();
+    }
     public int GetTries(ReadonlySequence state) {
+    public int GetTries(string state) {
       var s = CanonicalState(state);
       if (stateInfo.ContainsKey(s)) return stateInfo[s].Tries;
 …
+    }
     public double GetValue(ReadonlySequence state) {
+    public double GetValue(string state) {
       var s = CanonicalState(state);
       if (stateInfo.ContainsKey(s)) return stateInfo[s].Value;
 …
+    }
     protected string CanonicalState(ReadonlySequence state) {
+    protected string CanonicalState(string state) {
       if (useCanonicalState) {
         if (state.IsTerminal)
           return problem.CanonicalRepresentation(state.ToString());
+        if (problem.Grammar.IsTerminal(state))
+          return problem.CanonicalRepresentation(state);
         else {
           // for non-terminal phrases make sure we don't disable canonical states that have not yet been fully explored
 …
           // then we are not allowed to disable rS (canonical of lllS) because rS might not have been fully explored
           // solution: we disable the state rS4
           return problem.CanonicalRepresentation(state.ToString()) + state.Length;
+          return problem.CanonicalRepresentation(state) + state.Length;
+        }
       } else
         return state.ToString();
+        return state;
+    }
+  }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GrammarPolicy.cs

-                      r11770
+                      r11793
 namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies {
   // stores: tries, avg reward and max reward for each state
+  // stores: tries, avg reward and max reward for each state (base class for RandomPolicy and TDPolicy)
   public abstract class GrammarPolicy : IGrammarPolicy {
     protected Dictionary<string, double> avgReward;
     protected Dictionary<string, int> tries;
     protected Dictionary<string, double> maxReward;
     private readonly bool useCanonicalState;
     private readonly IProblem problem;
+    protected readonly bool useCanonicalState;
+    protected readonly IProblem problem;
     public GrammarPolicy(IProblem problem, bool useCanonicalState = false) {
+    protected GrammarPolicy(IProblem problem, bool useCanonicalState = false) {
       this.useCanonicalState = useCanonicalState;
       this.problem = problem;
 …
+    }
     public abstract bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState);
+    public abstract bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx);
     public virtual void UpdateReward(IEnumerable<ReadonlySequence> stateTrajectory, double reward) {
+    public virtual void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {
       foreach (var state in stateTrajectory) {
         var s = CanonicalState(state.ToString());
+        var s = CanonicalState(state);
         if (!tries.ContainsKey(s)) tries.Add(s, 0);
 …
+    }
     public double AvgReward(ReadonlySequence state) {
       var s = CanonicalState(state.ToString());
+    public double AvgReward(string state) {
+      var s = CanonicalState(state);
       if (avgReward.ContainsKey(s)) return avgReward[s];
       else return 0.0;
+    }
     public double MaxReward(ReadonlySequence state) {
       var s = CanonicalState(state.ToString());
+    public double MaxReward(string state) {
+      var s = CanonicalState(state);
       if (maxReward.ContainsKey(s)) return maxReward[s];
       else return 0.0;
+    }
     public virtual int GetTries(ReadonlySequence state) {
       var s = CanonicalState(state.ToString());
+    public virtual int GetTries(string state) {
+      var s = CanonicalState(state);
       if (tries.ContainsKey(s)) return tries[s];
       else return 0;
+    }
     public virtual double GetValue(ReadonlySequence state) {
+    public virtual double GetValue(string state) {
       return AvgReward(state);
+    }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/IGrammarPolicy.cs

r11770	r11793
8	8
9	9	namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies {
10		public interface IGrammarPolicy : IPolicy<~~ReadonlySequence~~> {
	10	public interface IGrammarPolicy : IPolicy<string> {
11	11	}
12	12	}

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/RandomPolicy.cs

r11770	r11793
13	13	}
14	14
15		public override bool TrySelect(Random random, ~~ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState~~) {
	15	public override bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) {
16	16	// never fail => allows re-visits of terminal states
17		selectedState ~~= afterStates.SelectRandom(random~~);
	17	selectedStateIdx = random.Next(afterStates.Count());
18	18	return true;
19	19	}

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/TDPolicy.cs

-                      r11770
+                      r11793
 using System.Collections.Generic;
 using System.Configuration;
+using System.Diagnostics;
 using System.Linq;
 using System.Security.Policy;
 …
 using System.Threading;
 using System.Threading.Tasks;
+using HeuristicLab.Algorithms.Bandits.BanditPolicies;
 using HeuristicLab.Common;
 using HeuristicLab.Problems.GrammaticalOptimization;
 …
     private readonly HashSet<string> done;
     private readonly Dictionary<string, double> v;
     private EpsGreedyPolicy epsGreedy;
+    private IGrammarPolicy epsGreedy;
     public TDPolicy(IProblem problem, bool useCanonicalRepresentation = false)
 …
       this.done = new HashSet<string>();
       this.v = new Dictionary<string, double>();
       this.epsGreedy = new EpsGreedyPolicy(problem, useCanonicalRepresentation, 0.1);
+      this.epsGreedy = new GenericGrammarPolicy(problem, new EpsGreedyPolicy(0.1), useCanonicalRepresentation);
+    }
     public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {
+    public override bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) {
       // only select states that are not yet done
       afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a.ToString()))).ToArray();
       if (!afterStates.Any()) {
         // fail because all follow states have already been visited => also disable the current state
         done.Add(CanonicalState(curState.ToString()));
         selectedState = null;
+        done.Add(CanonicalState(curState));
+        selectedStateIdx = -1;
         return false;
+      }
+      throw new NotImplementedException(); // TODO: remap indices of reduced action enumerable to indices of original enumerable
       //return epsGreedy.TrySelect(random, curState, afterStates, out selectedState);
       var bestQ = double.NegativeInfinity;
+      selectedState = null;
+      int idx = -1;
+      selectedStateIdx = -1;
       foreach (var state in afterStates) {
+        idx++;
         // try each state at least once
         if (GetTries(state) == 0) {
           selectedState = state;
+          selectedStateIdx = idx;
           return true;
+        }
 …
         if (q > bestQ) {
           bestQ = q;
           selectedState = state;
+          selectedStateIdx = idx;
+        }
+      }
+      Debug.Assert(selectedStateIdx > -1);
       return true;
+    }
     private double V(ReadonlySequence state) {
       var s = CanonicalState(state.ToString());
+    private double V(string state) {
+      var s = CanonicalState(state);
       if (v.ContainsKey(s)) return v[s];
       else return 0.0;
+    }
     public override void UpdateReward(IEnumerable<ReadonlySequence> stateTrajectory, double reward) {
+    public override void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {
       base.UpdateReward(stateTrajectory, reward);
       epsGreedy.UpdateReward(stateTrajectory, reward);
       // the last state could be terminal
       var lastState = stateTrajectory.Last();
       if (lastState.IsTerminal) done.Add(CanonicalState(lastState.ToString()));
+      if (problem.Grammar.IsTerminal(lastState)) done.Add(CanonicalState(lastState));
       v[CanonicalState(lastState.ToString())] = V(lastState) + 1.0 / GetTries(lastState) * (reward - V(lastState));
+      v[CanonicalState(lastState)] = V(lastState) + 1.0 / GetTries(lastState) * (reward - V(lastState));
       foreach (var p in stateTrajectory.Zip(stateTrajectory.Skip(1), Tuple.Create).Reverse()) {
 …
         var next = p.Item2;
         v[CanonicalState(cur.ToString())] = V(cur) + 1.0 / GetTries(cur) * (V(next) - V(cur));
+        v[CanonicalState(cur)] = V(cur) + 1.0 / GetTries(cur) * (V(next) - V(cur));
         //v[CanonicalState(cur.ToString())] = V(cur) + 0.1 * (V(next) - V(cur));
+      }
 …
+    }
     public override double GetValue(ReadonlySequence state) {
+    public override double GetValue(string state) {
       return V(state);
+    }
     public void Reset() {
+    public override void Reset() {
       base.Reset();
       epsGreedy.Reset();

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj

-                      r11770
+                      r11793
     <Compile Include="Bandits\IBandit.cs" />
     <Compile Include="Bandits\TruncatedNormalBandit.cs" />
-    <Compile Include="GrammarPolicies\BoltzmanExplorationPolicy.cs" />
     <Compile Include="GrammarPolicies\GenericGrammarPolicy.cs">
       <SubType>Code</SubType>
     </Compile>
+    <Compile Include="GrammarPolicies\RandomPolicy.cs">
+      <SubType>Code</SubType>
+    </Compile>
     <Compile Include="GrammarPolicies\TDPolicy.cs" />
-    <Compile Include="GrammarPolicies\UCTPolicy.cs" />
     <Compile Include="GrammarPolicies\GrammarPolicy.cs" />
-    <Compile Include="GrammarPolicies\EpsGreedyPolicy.cs" />
-    <Compile Include="GrammarPolicies\GreedyPolicy.cs" />
     <Compile Include="GrammarPolicies\IGrammarPolicy.cs" />
-    <Compile Include="GrammarPolicies\RandomNoResamplingPolicy.cs" />
-    <Compile Include="GrammarPolicies\RandomPolicy.cs" />
     <Compile Include="IPolicy.cs" />
     <Compile Include="IBanditPolicy.cs" />

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IPolicy.cs

-                      r11770
+                      r11793
   // here we assume that a reward is only received at the end of the episode and the update is done only after an episode is complete
   // we also assume that the policy can fail to select one of the followStates
   public interface IPolicy<TState> {
     bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out TState selectedState); // selectedState \in afterStates
+  public interface IPolicy<in TState> {
+    bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out int selectedStateIdx); // selectedStateIdx is an index into afterStates
     // the state trajectory contains the states of the episode; the reward is received at the end (only for the terminal state)

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 11793 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits

Legend:

Download in other formats: