Changeset 11806 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Timestamp:
- 01/20/15 20:25:00 (10 years ago)
- Location:
- branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Files:
-
- 2 added
- 17 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ActiveLearningPolicy.cs
r11792 r11806 11 11 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 12 12 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 13 int totalTries = myActionInfos. Where(a => !a.Disabled).Sum(a => a.Tries);13 int totalTries = myActionInfos.Sum(a => a.Tries); 14 14 const double delta = 0.1; 15 int k = myActionInfos. Where(a => !a.Disabled).Count();15 int k = myActionInfos.Count(); 16 16 var bestActions = new List<int>(); 17 17 var us = new List<double>(); … … 20 20 foreach (var aInfo in myActionInfos) { 21 21 aIdx++; 22 if (aInfo.Disabled) continue;23 22 double q; 24 23 double u; -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/BoltzmannExplorationPolicy.cs
r11799 r11806 37 37 38 38 var w = from aInfo in myActionInfos 39 select aInfo.Disabled 40 ? 0.0 41 : Math.Exp(beta * valueFunction(aInfo)); 39 select Math.Exp(beta * valueFunction(aInfo)); 42 40 43 41 var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w); -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs
r11792 r11806 21 21 // select best 22 22 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 23 int k = myActionInfos.Count( a => !a.Disabled);24 int totalTries = myActionInfos. Where(a => !a.Disabled).Sum(a => a.Tries);23 int k = myActionInfos.Count(); 24 int totalTries = myActionInfos.Sum(a => a.Tries); 25 25 double bestQ = double.NegativeInfinity; 26 26 var bestActions = new List<int>(); … … 28 28 foreach (var aInfo in myActionInfos) { 29 29 aIdx++; 30 if (aInfo.Disabled) continue;31 30 double q; 32 31 if (aInfo.Tries == 0) { … … 46 45 bestActions.Clear(); 47 46 bestActions.Add(aIdx); 48 } else if (q == bestQ) {47 } else if (q.IsAlmost(bestQ)) { 49 48 bestActions.Add(aIdx); 50 49 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/DefaultPolicyActionInfo.cs
r11747 r11806 9 9 // stores information that is relevant for most of the policies 10 10 public class DefaultPolicyActionInfo : IBanditPolicyActionInfo { 11 private double knownValue;12 public bool Disabled { get { return Tries == -1; } }13 11 public double SumReward { get; private set; } 14 12 public int Tries { get; private set; } … … 16 14 public double Value { 17 15 get { 18 if (Disabled) return knownValue;19 else20 16 return Tries > 0 ? SumReward / Tries : 0.0; 21 17 } … … 26 22 27 23 public void UpdateReward(double reward) { 28 Debug.Assert(!Disabled);29 30 24 Tries++; 31 25 SumReward += reward; 32 26 MaxReward = Math.Max(MaxReward, reward); 33 27 } 34 public void Disable(double reward) { 35 this.Tries = -1; 36 this.SumReward = 0.0; 37 this.knownValue = reward; 38 } 28 39 29 public void Reset() { 40 30 SumReward = 0.0; 41 31 Tries = 0; 42 32 MaxReward = 0.0; 43 knownValue = 0.0;44 }45 public void PrintStats() {46 Console.WriteLine("avg reward {0,5:F2} disabled {1}", SumReward / Tries, Disabled);47 33 } 48 34 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/EpsGreedyPolicy.cs
r11793 r11806 35 35 foreach (var aInfo in myActionInfos) { 36 36 aIdx++; 37 if (aInfo.Disabled) continue;38 37 39 38 var q = valueFunction(aInfo); -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/GenericThompsonSamplingPolicy.cs
r11799 r11806 22 22 foreach (var aInfo in myActionInfos) { 23 23 aIdx++; 24 if (aInfo.Disabled) continue;25 //if (aInfo.Tries == 0) return aIdx;26 24 var q = aInfo.SampleExpectedReward(random); 27 25 if (q > bestQ) { -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/MeanAndVariancePolicyActionInfo.cs
r11747 r11806 39 39 estimator.Reset(); 40 40 } 41 42 public void PrintStats() {43 Console.WriteLine("avg reward {0,5:F2} disabled {1}", AvgReward, Disabled);44 }45 41 } 46 42 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ModelPolicyActionInfo.cs
r11747 r11806 10 10 public class ModelPolicyActionInfo : IBanditPolicyActionInfo { 11 11 private readonly IModel model; 12 private double knownValue;13 public bool Disabled { get { return Tries == -1; } }14 12 public double Value { 15 13 get { 16 if (Disabled) return knownValue; 17 else 18 return model.SampleExpectedReward(new Random()); 14 return model.SampleExpectedReward(new Random()); 19 15 } 20 16 } … … 26 22 27 23 public void UpdateReward(double reward) { 28 Debug.Assert(!Disabled);29 24 Tries++; 30 25 model.Update(reward); … … 35 30 } 36 31 37 public void Disable(double reward) {38 this.Tries = -1;39 this.knownValue = reward;40 }41 42 32 public void Reset() { 43 33 Tries = 0; 44 knownValue = 0.0;45 34 model.Reset(); 46 35 } 47 36 48 public void PrintStats() {49 model.PrintStats();50 }51 52 37 public override string ToString() { 53 return string.Format(" disabled {0} model {1}", Disabled, model);38 return string.Format("model {1}", model); 54 39 } 55 40 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/RandomPolicy.cs
r11742 r11806 17 17 return actionInfos 18 18 .Select((aInfo, idx) => Tuple.Create(aInfo, idx)) 19 .Where(p => !p.Item1.Disabled)20 19 .SelectRandom(random).Item2; 21 20 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs
r11792 r11806 28 28 public int Tries { get; private set; } 29 29 public int thresholdBin = 1; 30 private double knownValue;31 30 32 31 public double Value { 33 32 get { 34 if (Disabled) return knownValue;35 33 if (Tries == 0.0) return 0.0; 36 34 return rewardHistogram[thresholdBin] / (double)Tries; 37 35 } 38 36 } 39 40 public bool Disabled { get { return Tries == -1; } }41 37 42 38 public void UpdateReward(double reward) { … … 46 42 } 47 43 48 public void Disable(double reward) {49 this.knownValue = reward;50 Tries = -1;51 }52 53 44 public void Reset() { 54 45 Tries = 0; 55 46 thresholdBin = 1; 56 this.knownValue = 0.0;57 47 Array.Clear(rewardHistogram, 0, rewardHistogram.Length); 58 }59 60 public void PrintStats() {61 if (Tries >= 0) {62 Console.Write("{0,6}", Tries);63 } else {64 Console.Write("{0,6}", "");65 }66 48 } 67 49 … … 101 83 var bestActions = new List<int>(); 102 84 double bestQ = double.NegativeInfinity; 103 int k = myActionInfos.Count( a => !a.Disabled);104 var totalTries = myActionInfos. Where(a => !a.Disabled).Sum(a => a.Tries);85 int k = myActionInfos.Count(); 86 var totalTries = myActionInfos.Sum(a => a.Tries); 105 87 int aIdx = -1; 106 88 foreach (var aInfo in myActionInfos) { 107 89 aIdx++; 108 if (aInfo.Disabled) continue;109 90 double q; 110 91 if (aInfo.Tries == 0) { … … 118 99 bestActions.Clear(); 119 100 bestActions.Add(aIdx); 120 } else if (q == bestQ) {101 } else if (q.IsAlmost(bestQ)) { 121 102 bestActions.Add(aIdx); 122 103 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1Policy.cs
r11747 r11806 13 13 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 14 14 double bestQ = double.NegativeInfinity; 15 int totalTries = myActionInfos. Where(a => !a.Disabled).Sum(a => a.Tries);15 int totalTries = myActionInfos.Sum(a => a.Tries); 16 16 17 17 var bestActions = new List<int>(); … … 19 19 foreach (var aInfo in myActionInfos) { 20 20 aIdx++; 21 if (aInfo.Disabled) continue;22 21 double q; 23 22 if (aInfo.Tries == 0) { … … 31 30 bestActions.Clear(); 32 31 bestActions.Add(aIdx); 33 } else if (q == bestQ) {32 } else if (q.IsAlmost(bestQ)) { 34 33 bestActions.Add(aIdx); 35 34 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1TunedPolicy.cs
r11792 r11806 37 37 bestActions.Clear(); 38 38 bestActions.Add(aIdx); 39 } else if (q == bestQ) {39 } else if (q.IsAlmost(bestQ)) { 40 40 bestActions.Add(aIdx); 41 41 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCBNormalPolicy.cs
r11792 r11806 33 33 bestActions.Clear(); 34 34 bestActions.Add(aIdx); 35 } else if (q == bestQ) {35 } else if (q.IsAlmost(bestQ)) { 36 36 bestActions.Add(aIdx); 37 37 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCTPolicy.cs
r11747 r11806 21 21 int bestAction = -1; 22 22 double bestQ = double.NegativeInfinity; 23 int totalTries = myActionInfos. Where(a => !a.Disabled).Sum(a => a.Tries);23 int totalTries = myActionInfos.Sum(a => a.Tries); 24 24 25 25 int aIdx = -1; … … 27 27 foreach (var aInfo in myActionInfos) { 28 28 aIdx++; 29 if (aInfo.Disabled) continue;30 29 double q; 31 30 if (aInfo.Tries == 0) { … … 38 37 bestQ = q; 39 38 bestActions.Add(aIdx); 40 } 41 if (q == bestQ) { 39 } else if (q.IsAlmost(bestQ)) { 42 40 bestActions.Add(aIdx); 43 41 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GenericGrammarPolicy.cs
r11799 r11806 1 1 using System; 2 2 using System.Collections.Generic; 3 using System.Diagnostics; 3 4 using System.Linq; 4 5 using System.Text; … … 9 10 namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies { 10 11 // this represents grammar policies that use one of the available bandit policies for state selection 11 public class GenericGrammarPolicy : IGrammarPolicy { 12 protected Dictionary<string, IBanditPolicyActionInfo> stateInfo; // stores the necessary information for bandit policies for each state 13 private readonly bool useCanonicalState; 12 // any bandit policy can be used to select actions for states 13 // a separate datastructure is used to store visited states and to prevent revisiting of states 14 public sealed class GenericGrammarPolicy : IGrammarPolicy { 15 private Dictionary<string, IBanditPolicyActionInfo> stateInfo; // stores the necessary information for bandit policies for each state (=canonical phrase) 16 private HashSet<string> done; 17 private readonly bool useCanonicalPhrases; 14 18 private readonly IProblem problem; 15 19 private readonly IBanditPolicy banditPolicy; 16 20 17 public GenericGrammarPolicy(IProblem problem, IBanditPolicy banditPolicy, bool useCanonical State= false) {18 this.useCanonical State = useCanonicalState;21 public GenericGrammarPolicy(IProblem problem, IBanditPolicy banditPolicy, bool useCanonicalPhrases = false) { 22 this.useCanonicalPhrases = useCanonicalPhrases; 19 23 this.problem = problem; 20 24 this.banditPolicy = banditPolicy; 21 25 this.stateInfo = new Dictionary<string, IBanditPolicyActionInfo>(); 26 this.done = new HashSet<string>(); 22 27 } 28 29 private IBanditPolicyActionInfo[] activeAfterStates; // don't allocate each time 30 private int[] actionIndexMap; // don't allocate each time 23 31 24 32 public bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) { 25 33 // fail if all states are done (corresponding state infos are disabled) 26 if (afterStates.All(s => GetStateInfo(s).Disabled)) {34 if (afterStates.All(s => Done(s))) { 27 35 // fail because all follow states have already been visited => also disable the current state (if we can be sure that it has been fully explored) 36 MarkAsDone(curState); 28 37 29 GetStateInfo(curState).Disable(afterStates.Select(afterState => GetStateInfo(afterState).Value).Max());30 38 selectedStateIdx = -1; 31 39 return false; 32 40 } 33 41 34 selectedStateIdx = banditPolicy.SelectAction(random, afterStates.Select(s => GetStateInfo(s))); 42 // determine active actions (not done yet) and create an array to map the selected index back to original actions 43 if (activeAfterStates == null || activeAfterStates.Length < afterStates.Count()) { 44 activeAfterStates = new IBanditPolicyActionInfo[afterStates.Count()]; 45 actionIndexMap = new int[afterStates.Count()]; 46 } 47 var idx = 0; int originalIdx = 0; 48 foreach (var afterState in afterStates) { 49 if (!Done(afterState)) { 50 activeAfterStates[idx] = GetStateInfo(afterState); 51 actionIndexMap[idx] = originalIdx; 52 idx++; 53 } 54 originalIdx++; 55 } 56 57 selectedStateIdx = actionIndexMap[banditPolicy.SelectAction(random, activeAfterStates.Take(idx))]; 35 58 36 59 return true; 37 60 } 61 62 38 63 39 64 private IBanditPolicyActionInfo GetStateInfo(string state) { … … 47 72 } 48 73 49 public v irtual void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {74 public void UpdateReward(IEnumerable<string> stateTrajectory, double reward) { 50 75 foreach (var state in stateTrajectory) { 51 76 GetStateInfo(state).UpdateReward(reward); … … 53 78 // only the last state can be terminal 54 79 if (problem.Grammar.IsTerminal(state)) { 55 GetStateInfo(state).Disable(reward);80 MarkAsDone(state); 56 81 } 57 82 } 58 83 } 59 84 60 public virtual void Reset() { 85 86 public void Reset() { 61 87 stateInfo.Clear(); 88 done.Clear(); 62 89 } 63 90 … … 74 101 } 75 102 76 protected string CanonicalState(string state) { 77 if (useCanonicalState) { 103 // the canonical states for the value function (banditInfos) and the done set must be distinguished 104 // sequences of different length could have the same canonical representation and can have the same value (banditInfo) 105 // however, if the canonical representation of a state is shorter than we must not mark the canonical state as done when all possible derivations from the initial state have been explored 106 // eg. in the ant problem the canonical representation for ...lllA is ...rA 107 // even though all possible derivations (of limited length) of lllA have been visited we must not mark the state rA as done 108 private void MarkAsDone(string state) { 109 var s = CanonicalState(state); 110 // when the lengths of the canonical string and the original string are the same we also disable the actions 111 // always disable terminals 112 Debug.Assert(s.Length <= state.Length); 113 if (s.Length == state.Length || problem.Grammar.IsTerminal(state)) { 114 Debug.Assert(!done.Contains(s)); 115 done.Add(s); 116 } else { 117 // for non-terminals where the canonical string is shorter than the original string we can only disable the canonical representation for all states in the same level 118 Debug.Assert(!done.Contains(s + state.Length)); 119 done.Add(s + state.Length); // encode the original length of the state, states in the same level of the tree are treated as equivalent 120 } 121 } 122 123 // symmetric to MarkDone 124 private bool Done(string state) { 125 var s = CanonicalState(state); 126 if (s.Length == state.Length || problem.Grammar.IsTerminal(state)) { 127 return done.Contains(s); 128 } else { 129 // it is not necessary to visit states if the canonical representation has already been fully explored 130 if (done.Contains(s)) return true; 131 if (done.Contains(s + state.Length)) return true; 132 for (int i = 1; i < state.Length; i++) { 133 if (done.Contains(s + i)) return true; 134 } 135 return false; 136 } 137 } 138 139 private string CanonicalState(string state) { 140 if (useCanonicalPhrases) { 78 141 return problem.CanonicalRepresentation(state); 79 142 } else -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj
r11793 r11806 49 49 <Compile Include="BanditPolicies\ChernoffIntervalEstimationPolicy.cs" /> 50 50 <Compile Include="BanditPolicies\ActiveLearningPolicy.cs" /> 51 <Compile Include="BanditPolicies\ModifiedUCTPolicy.cs" /> 51 52 <Compile Include="BanditPolicies\DefaultPolicyActionInfo.cs" /> 52 53 <Compile Include="BanditPolicies\EpsGreedyPolicy.cs" /> … … 66 67 <Compile Include="Bandits\IBandit.cs" /> 67 68 <Compile Include="Bandits\TruncatedNormalBandit.cs" /> 69 <Compile Include="GrammarPolicies\GenericTDPolicy.cs" /> 68 70 <Compile Include="GrammarPolicies\GenericGrammarPolicy.cs"> 69 71 <SubType>Code</SubType> … … 72 74 <SubType>Code</SubType> 73 75 </Compile> 74 <Compile Include="GrammarPolicies\TDPolicy.cs" />75 76 <Compile Include="GrammarPolicies\GrammarPolicy.cs" /> 76 77 <Compile Include="GrammarPolicies\IGrammarPolicy.cs" /> -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs
r11770 r11806 1 1 namespace HeuristicLab.Algorithms.Bandits { 2 2 public interface IBanditPolicyActionInfo { 3 bool Disabled { get; }3 //bool Disabled { get; } 4 4 double Value { get; } 5 5 int Tries { get; } 6 6 void UpdateReward(double reward); 7 void Disable(double reward);7 //void Disable(double reward); 8 8 // reset causes the state of the action to be reinitialized (as after constructor-call) 9 9 void Reset(); 10 void PrintStats();11 10 } 12 11 }
Note: See TracChangeset
for help on using the changeset viewer.