Changeset 11793 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Timestamp:
- 01/18/15 18:24:58 (10 years ago)
- Location:
- branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Files:
-
- 6 deleted
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/EpsGreedyPolicy.cs
r11742 r11793 26 26 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 27 27 Debug.Assert(actionInfos.Any()); 28 if (random.NextDouble() > eps) {28 if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration 29 29 // select best 30 30 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GenericGrammarPolicy.cs
r11792 r11793 14 14 private readonly IProblem problem; 15 15 private readonly IBanditPolicy banditPolicy; 16 private readonly HashSet<string> done;16 //private readonly HashSet<string> done; 17 17 18 18 public GenericGrammarPolicy(IProblem problem, IBanditPolicy banditPolicy, bool useCanonicalState = false) { … … 21 21 this.banditPolicy = banditPolicy; 22 22 this.stateInfo = new Dictionary<string, IBanditPolicyActionInfo>(); 23 this.done = new HashSet<string>();23 //this.done = new HashSet<string>(); 24 24 } 25 25 26 public bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, 27 out ReadonlySequence selectedState) { 28 // only select states that are not yet done 29 afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a))).ToArray(); 30 if (!afterStates.Any()) { 26 public bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) { 27 // fail if all states are done (corresponding state infos are disabled) 28 if (afterStates.All(s => GetStateInfo(s).Disabled)) { 31 29 // fail because all follow states have already been visited => also disable the current state (if we can be sure that it has been fully explored) 32 30 33 done.Add(CanonicalState(curState));34 selectedState = null;31 GetStateInfo(curState).Disable(0.0); // should the value be max of afterstate values instead of 0.0? 32 selectedStateIdx = -1; 35 33 return false; 36 34 } 37 35 36 selectedStateIdx = banditPolicy.SelectAction(random, afterStates.Select(s => GetStateInfo(s))); 38 37 39 var selectedIdx = banditPolicy.SelectAction(random, afterStates.Select(s => GetStateInfo(s)));40 selectedState = afterStates.ElementAt(selectedIdx);41 38 return true; 42 39 } 43 40 44 private IBanditPolicyActionInfo GetStateInfo( ReadonlySequencestate) {41 private IBanditPolicyActionInfo GetStateInfo(string state) { 45 42 var s = CanonicalState(state); 46 43 IBanditPolicyActionInfo info; … … 52 49 } 53 50 54 public virtual void UpdateReward(IEnumerable< ReadonlySequence> stateTrajectory, double reward) {51 public virtual void UpdateReward(IEnumerable<string> stateTrajectory, double reward) { 55 52 // the last state could be terminal 56 53 var lastState = stateTrajectory.Last(); 57 if (lastState.IsTerminal) done.Add(CanonicalState(lastState)); 54 if (problem.Grammar.IsTerminal(lastState)) { 55 GetStateInfo(lastState).Disable(reward); 56 } 58 57 59 foreach (var state in stateTrajectory) { 58 // update remaining states 59 foreach (var state in stateTrajectory.Reverse().Skip(1)) { 60 60 GetStateInfo(state).UpdateReward(reward); 61 61 } … … 64 64 public virtual void Reset() { 65 65 stateInfo.Clear(); 66 done.Clear();66 //done.Clear(); 67 67 } 68 68 69 public int GetTries( ReadonlySequencestate) {69 public int GetTries(string state) { 70 70 var s = CanonicalState(state); 71 71 if (stateInfo.ContainsKey(s)) return stateInfo[s].Tries; … … 73 73 } 74 74 75 public double GetValue( ReadonlySequencestate) {75 public double GetValue(string state) { 76 76 var s = CanonicalState(state); 77 77 if (stateInfo.ContainsKey(s)) return stateInfo[s].Value; … … 79 79 } 80 80 81 protected string CanonicalState( ReadonlySequencestate) {81 protected string CanonicalState(string state) { 82 82 if (useCanonicalState) { 83 if ( state.IsTerminal)84 return problem.CanonicalRepresentation(state .ToString());83 if (problem.Grammar.IsTerminal(state)) 84 return problem.CanonicalRepresentation(state); 85 85 else { 86 86 // for non-terminal phrases make sure we don't disable canonical states that have not yet been fully explored … … 88 88 // then we are not allowed to disable rS (canonical of lllS) because rS might not have been fully explored 89 89 // solution: we disable the state rS4 90 return problem.CanonicalRepresentation(state .ToString()) + state.Length;90 return problem.CanonicalRepresentation(state) + state.Length; 91 91 } 92 92 } else 93 return state .ToString();93 return state; 94 94 } 95 95 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GrammarPolicy.cs
r11770 r11793 8 8 9 9 namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies { 10 // stores: tries, avg reward and max reward for each state 10 // stores: tries, avg reward and max reward for each state (base class for RandomPolicy and TDPolicy 11 11 public abstract class GrammarPolicy : IGrammarPolicy { 12 12 protected Dictionary<string, double> avgReward; 13 13 protected Dictionary<string, int> tries; 14 14 protected Dictionary<string, double> maxReward; 15 pr ivatereadonly bool useCanonicalState;16 pr ivatereadonly IProblem problem;15 protected readonly bool useCanonicalState; 16 protected readonly IProblem problem; 17 17 18 p ublicGrammarPolicy(IProblem problem, bool useCanonicalState = false) {18 protected GrammarPolicy(IProblem problem, bool useCanonicalState = false) { 19 19 this.useCanonicalState = useCanonicalState; 20 20 this.problem = problem; … … 24 24 } 25 25 26 public abstract bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState);26 public abstract bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx); 27 27 28 public virtual void UpdateReward(IEnumerable< ReadonlySequence> stateTrajectory, double reward) {28 public virtual void UpdateReward(IEnumerable<string> stateTrajectory, double reward) { 29 29 foreach (var state in stateTrajectory) { 30 var s = CanonicalState(state .ToString());30 var s = CanonicalState(state); 31 31 32 32 if (!tries.ContainsKey(s)) tries.Add(s, 0); … … 47 47 } 48 48 49 public double AvgReward( ReadonlySequencestate) {50 var s = CanonicalState(state .ToString());49 public double AvgReward(string state) { 50 var s = CanonicalState(state); 51 51 if (avgReward.ContainsKey(s)) return avgReward[s]; 52 52 else return 0.0; 53 53 } 54 54 55 public double MaxReward( ReadonlySequencestate) {56 var s = CanonicalState(state .ToString());55 public double MaxReward(string state) { 56 var s = CanonicalState(state); 57 57 if (maxReward.ContainsKey(s)) return maxReward[s]; 58 58 else return 0.0; 59 59 } 60 60 61 public virtual int GetTries( ReadonlySequencestate) {62 var s = CanonicalState(state .ToString());61 public virtual int GetTries(string state) { 62 var s = CanonicalState(state); 63 63 if (tries.ContainsKey(s)) return tries[s]; 64 64 else return 0; 65 65 } 66 66 67 public virtual double GetValue( ReadonlySequencestate) {67 public virtual double GetValue(string state) { 68 68 return AvgReward(state); 69 69 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/IGrammarPolicy.cs
r11770 r11793 8 8 9 9 namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies { 10 public interface IGrammarPolicy : IPolicy< ReadonlySequence> {10 public interface IGrammarPolicy : IPolicy<string> { 11 11 } 12 12 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/RandomPolicy.cs
r11770 r11793 13 13 } 14 14 15 public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {15 public override bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) { 16 16 // never fail => allows re-visits of terminal states 17 selectedState = afterStates.SelectRandom(random);17 selectedStateIdx = random.Next(afterStates.Count()); 18 18 return true; 19 19 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/TDPolicy.cs
r11770 r11793 2 2 using System.Collections.Generic; 3 3 using System.Configuration; 4 using System.Diagnostics; 4 5 using System.Linq; 5 6 using System.Security.Policy; … … 7 8 using System.Threading; 8 9 using System.Threading.Tasks; 10 using HeuristicLab.Algorithms.Bandits.BanditPolicies; 9 11 using HeuristicLab.Common; 10 12 using HeuristicLab.Problems.GrammaticalOptimization; … … 15 17 private readonly HashSet<string> done; 16 18 private readonly Dictionary<string, double> v; 17 private EpsGreedyPolicy epsGreedy;19 private IGrammarPolicy epsGreedy; 18 20 19 21 public TDPolicy(IProblem problem, bool useCanonicalRepresentation = false) … … 21 23 this.done = new HashSet<string>(); 22 24 this.v = new Dictionary<string, double>(); 23 this.epsGreedy = new EpsGreedyPolicy(problem, useCanonicalRepresentation, 0.1);25 this.epsGreedy = new GenericGrammarPolicy(problem, new EpsGreedyPolicy(0.1), useCanonicalRepresentation); 24 26 } 25 27 26 public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {28 public override bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) { 27 29 // only select states that are not yet done 28 30 afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a.ToString()))).ToArray(); 29 31 if (!afterStates.Any()) { 30 32 // fail because all follow states have already been visited => also disable the current state 31 done.Add(CanonicalState(curState .ToString()));32 selectedState = null;33 done.Add(CanonicalState(curState)); 34 selectedStateIdx = -1; 33 35 return false; 34 36 } 37 throw new NotImplementedException(); // TODO: remap indices of reduced action enumerable to indices of original enumerable 35 38 36 39 //return epsGreedy.TrySelect(random, curState, afterStates, out selectedState); 37 40 38 41 var bestQ = double.NegativeInfinity; 39 selectedState = null; 42 int idx = -1; 43 selectedStateIdx = -1; 40 44 foreach (var state in afterStates) { 45 idx++; 41 46 // try each state at least once 42 47 if (GetTries(state) == 0) { 43 selectedState = state;48 selectedStateIdx = idx; 44 49 return true; 45 50 } … … 47 52 if (q > bestQ) { 48 53 bestQ = q; 49 selectedState = state;54 selectedStateIdx = idx; 50 55 } 51 56 } 52 57 58 Debug.Assert(selectedStateIdx > -1); 53 59 return true; 54 60 } 55 61 56 private double V( ReadonlySequencestate) {57 var s = CanonicalState(state .ToString());62 private double V(string state) { 63 var s = CanonicalState(state); 58 64 if (v.ContainsKey(s)) return v[s]; 59 65 else return 0.0; 60 66 } 61 67 62 public override void UpdateReward(IEnumerable< ReadonlySequence> stateTrajectory, double reward) {68 public override void UpdateReward(IEnumerable<string> stateTrajectory, double reward) { 63 69 base.UpdateReward(stateTrajectory, reward); 64 70 epsGreedy.UpdateReward(stateTrajectory, reward); 65 71 // the last state could be terminal 66 72 var lastState = stateTrajectory.Last(); 67 if ( lastState.IsTerminal) done.Add(CanonicalState(lastState.ToString()));73 if (problem.Grammar.IsTerminal(lastState)) done.Add(CanonicalState(lastState)); 68 74 69 v[CanonicalState(lastState .ToString())] = V(lastState) + 1.0 / GetTries(lastState) * (reward - V(lastState));75 v[CanonicalState(lastState)] = V(lastState) + 1.0 / GetTries(lastState) * (reward - V(lastState)); 70 76 71 77 foreach (var p in stateTrajectory.Zip(stateTrajectory.Skip(1), Tuple.Create).Reverse()) { … … 73 79 var next = p.Item2; 74 80 75 v[CanonicalState(cur .ToString())] = V(cur) + 1.0 / GetTries(cur) * (V(next) - V(cur));81 v[CanonicalState(cur)] = V(cur) + 1.0 / GetTries(cur) * (V(next) - V(cur)); 76 82 //v[CanonicalState(cur.ToString())] = V(cur) + 0.1 * (V(next) - V(cur)); 77 83 } … … 79 85 } 80 86 81 public override double GetValue( ReadonlySequencestate) {87 public override double GetValue(string state) { 82 88 return V(state); 83 89 } 84 90 85 public void Reset() {91 public override void Reset() { 86 92 base.Reset(); 87 93 epsGreedy.Reset(); -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj
r11770 r11793 66 66 <Compile Include="Bandits\IBandit.cs" /> 67 67 <Compile Include="Bandits\TruncatedNormalBandit.cs" /> 68 <Compile Include="GrammarPolicies\BoltzmanExplorationPolicy.cs" />69 68 <Compile Include="GrammarPolicies\GenericGrammarPolicy.cs"> 70 69 <SubType>Code</SubType> 71 70 </Compile> 71 <Compile Include="GrammarPolicies\RandomPolicy.cs"> 72 <SubType>Code</SubType> 73 </Compile> 72 74 <Compile Include="GrammarPolicies\TDPolicy.cs" /> 73 <Compile Include="GrammarPolicies\UCTPolicy.cs" />74 75 <Compile Include="GrammarPolicies\GrammarPolicy.cs" /> 75 <Compile Include="GrammarPolicies\EpsGreedyPolicy.cs" />76 <Compile Include="GrammarPolicies\GreedyPolicy.cs" />77 76 <Compile Include="GrammarPolicies\IGrammarPolicy.cs" /> 78 <Compile Include="GrammarPolicies\RandomNoResamplingPolicy.cs" />79 <Compile Include="GrammarPolicies\RandomPolicy.cs" />80 77 <Compile Include="IPolicy.cs" /> 81 78 <Compile Include="IBanditPolicy.cs" /> -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IPolicy.cs
r11770 r11793 11 11 // here we assume that a reward is only recieved at the end of the episode and the update is done only after an episode is complete 12 12 // we also assume that the policy can fail to select one of the followStates 13 public interface IPolicy< TState> {14 bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out TState selectedState); // selectedState \in afterStates13 public interface IPolicy<in TState> { 14 bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out int selectedStateIdx); // selectedState \in afterStates 15 15 16 16 // state-trajectory are the states of the episode, at the end we recieved the reward (only for the terminal state)
Note: See TracChangeset
for help on using the changeset viewer.