Changeset 11792 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Timestamp:
- 01/16/15 18:26:35 (9 years ago)
- Location:
- branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ActiveLearningPolicy.cs
r11747 r11792 11 11 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 12 12 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>(); 13 double bestQ = double.NegativeInfinity;14 13 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 15 14 const double delta = 0.1; … … 26 25 double l; 27 26 if (aInfo.Tries == 0) { 28 u = 1.0;29 l = 0.0;27 u = double.PositiveInfinity; 28 l = double.NegativeInfinity; 30 29 } else { 31 30 q = aInfo.SumReward / aInfo.Tries; -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs
r11742 r11792 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 7 8 8 9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { … … 22 23 int k = myActionInfos.Count(a => !a.Disabled); 23 24 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 24 int bestAction = -1;25 25 double bestQ = double.NegativeInfinity; 26 var bestActions = new List<int>(); 26 27 var aIdx = -1; 27 28 foreach (var aInfo in myActionInfos) { 28 29 aIdx++; 29 30 if (aInfo.Disabled) continue; 30 if (aInfo.Tries == 0) return aIdx; 31 double q; 32 if (aInfo.Tries == 0) { 33 q = double.PositiveInfinity; 34 } else { 31 35 32 var avgReward = aInfo.SumReward / aInfo.Tries;36 var avgReward = aInfo.SumReward / aInfo.Tries; 33 37 34 // page 5 of "A simple distribution-free approach to the max k-armed bandit problem" 35 // var alpha = Math.Log(2 * totalTries * k / delta); 36 double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper 37 var q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries; 38 // page 5 of "A simple distribution-free approach to the max k-armed bandit problem" 39 // var alpha = Math.Log(2 * totalTries * k / delta); 40 double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); 41 // total tries is max tries in the original paper 42 q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries; 43 } 38 44 if (q > bestQ) { 39 45 bestQ = q; 40 bestAction = aIdx; 46 bestActions.Clear(); 47 bestActions.Add(aIdx); 48 } else if (q == bestQ) { 49 bestActions.Add(aIdx); 41 50 } 42 51 } 43 Debug.Assert(bestAction >= 0);44 return bestAction ;52 Debug.Assert(bestActions.Any()); 53 return bestActions.SelectRandom(random); 45 54 } 46 55 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs
r11747 r11792 33 33 get { 34 34 if (Disabled) return knownValue; 35 if (Tries == 0.0) return 0.0;35 if (Tries == 0.0) return 0.0; 36 36 return rewardHistogram[thresholdBin] / (double)Tries; 37 37 } … … 99 99 UpdateThreshold(myActionInfos); 100 100 101 int bestAction = -1;101 var bestActions = new List<int>(); 102 102 double bestQ = double.NegativeInfinity; 103 103 int k = myActionInfos.Count(a => !a.Disabled); … … 107 107 aIdx++; 108 108 if (aInfo.Disabled) continue; 109 if (aInfo.Tries == 0) return aIdx; 110 double mu = aInfo.Value; // probability of rewards > T 111 double q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper 109 double q; 110 if (aInfo.Tries == 0) { 111 q = double.PositiveInfinity; 112 } else { 113 double mu = aInfo.Value; // probability of rewards > T 114 q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper 115 } 112 116 if (q > bestQ) { 113 117 bestQ = q; 114 bestAction = aIdx; 118 bestActions.Clear(); 119 bestActions.Add(aIdx); 120 } else if (q == bestQ) { 121 bestActions.Add(aIdx); 115 122 } 116 123 } 117 Debug.Assert(bestAction > -1);118 return bestAction ;124 Debug.Assert(bestActions.Any()); 125 return bestActions.SelectRandom(random); 119 126 } 120 127 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1TunedPolicy.cs
r11742 r11792 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 7 8 8 9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { … … 12 13 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 13 14 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>(); 14 int bestAction = -1; 15 double bestQ = double.NegativeInfinity; 15 16 16 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 17 17 18 18 int aIdx = -1; 19 double bestQ = double.NegativeInfinity; 20 var bestActions = new List<int>(); 19 21 foreach (var aInfo in myActionInfos) { 20 22 aIdx++; 21 23 if (aInfo.Disabled) continue; 22 if (aInfo.Tries == 0) return aIdx; 24 double q; 25 if (aInfo.Tries == 0) { 26 q = double.PositiveInfinity; 27 } else { 28 var sumReward = aInfo.SumReward; 29 var tries = aInfo.Tries; 23 30 24 var sumReward = aInfo.SumReward; 25 var tries = aInfo.Tries; 26 27 var avgReward = sumReward / tries; 28 var q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries))); // 1/4 is upper bound of bernoulli distributed variable 31 var avgReward = sumReward / tries; 32 q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries))); 33 // 1/4 is upper bound of bernoulli distributed variable 34 } 29 35 if (q > bestQ) { 30 36 bestQ = q; 31 bestAction = aIdx; 37 bestActions.Clear(); 38 bestActions.Add(aIdx); 39 } else if (q == bestQ) { 40 bestActions.Add(aIdx); 32 41 } 33 42 } 34 Debug.Assert(bestAction > -1); 35 return bestAction; 43 Debug.Assert(bestActions.Any()); 44 45 return bestActions.SelectRandom(random); 36 46 } 37 47 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCBNormalPolicy.cs
r11742 r11792 5 5 using System.Text; 6 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 7 8 8 9 namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { … … 11 12 public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) { 12 13 var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>(); 13 int bestAction = -1;14 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 14 15 double bestQ = double.NegativeInfinity; 15 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);16 16 int aIdx = -1; 17 var bestActions = new List<int>(); 17 18 foreach (var aInfo in myActionInfos) { 18 19 aIdx++; 19 20 if (aInfo.Disabled) continue; 20 if (totalTries <= 1 || aInfo.Tries <= 1 || aInfo.Tries <= Math.Ceiling(8 * Math.Log(totalTries))) return aIdx; 21 22 var tries = aInfo.Tries; 23 var avgReward = aInfo.AvgReward; 24 var rewardVariance = aInfo.RewardVariance; 25 var estVariance = 16.0 * rewardVariance * (Math.Log(totalTries - 1) / tries); 26 var q = avgReward + Math.Sqrt(estVariance); 21 double q; 22 if (totalTries <= 1 || aInfo.Tries <= 1 || aInfo.Tries <= Math.Ceiling(8 * Math.Log(totalTries))) { 23 q = double.PositiveInfinity; 24 } else { 25 var tries = aInfo.Tries; 26 var avgReward = aInfo.AvgReward; 27 var rewardVariance = aInfo.RewardVariance; 28 var estVariance = 16.0 * rewardVariance * (Math.Log(totalTries - 1) / tries); 29 q = avgReward + Math.Sqrt(estVariance); 30 } 27 31 if (q > bestQ) { 28 32 bestQ = q; 29 bestAction = aIdx; 33 bestActions.Clear(); 34 bestActions.Add(aIdx); 35 } else if (q == bestQ) { 36 bestActions.Add(aIdx); 30 37 } 31 38 } 32 Debug.Assert(bestAction > -1);33 return bestAction ;39 Debug.Assert(bestActions.Any()); 40 return bestActions.SelectRandom(random); 34 41 } 35 42 -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GenericGrammarPolicy.cs
r11770 r11792 27 27 out ReadonlySequence selectedState) { 28 28 // only select states that are not yet done 29 afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a .ToString()))).ToArray();29 afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a))).ToArray(); 30 30 if (!afterStates.Any()) { 31 31 // fail because all follow states have already been visited => also disable the current state (if we can be sure that it has been fully explored) 32 throw new NotImplementedException(); 33 //var curStateCanonical = CanonicalState(curState.ToString()); 34 //if (curState.ToString().Length == curStateCanonical.Length) 35 done.Add(CanonicalState(curState.ToString())); 32 33 done.Add(CanonicalState(curState)); 36 34 selectedState = null; 37 35 return false; … … 45 43 46 44 private IBanditPolicyActionInfo GetStateInfo(ReadonlySequence state) { 47 var s = CanonicalState(state .ToString());45 var s = CanonicalState(state); 48 46 IBanditPolicyActionInfo info; 49 47 if (!stateInfo.TryGetValue(s, out info)) { … … 57 55 // the last state could be terminal 58 56 var lastState = stateTrajectory.Last(); 59 if (lastState.IsTerminal) done.Add(CanonicalState(lastState .ToString()));57 if (lastState.IsTerminal) done.Add(CanonicalState(lastState)); 60 58 61 59 foreach (var state in stateTrajectory) { … … 70 68 71 69 public int GetTries(ReadonlySequence state) { 72 var s = CanonicalState(state .ToString());70 var s = CanonicalState(state); 73 71 if (stateInfo.ContainsKey(s)) return stateInfo[s].Tries; 74 72 else return 0; … … 76 74 77 75 public double GetValue(ReadonlySequence state) { 78 var s = CanonicalState(state .ToString());76 var s = CanonicalState(state); 79 77 if (stateInfo.ContainsKey(s)) return stateInfo[s].Value; 80 78 else return 0.0; // TODO: check alternatives 81 79 } 82 80 83 protected string CanonicalState(string state) { 84 if (useCanonicalState) return problem.CanonicalRepresentation(state); 85 else return state; 81 protected string 
CanonicalState(ReadonlySequence state) { 82 if (useCanonicalState) { 83 if (state.IsTerminal) 84 return problem.CanonicalRepresentation(state.ToString()); 85 else { 86 // for non-terminal phrases make sure we don't disable canonical states that have not yet been fully explored 87 // e.g. if for the ant problem we have the phrase lllS (and we are limited to 4 symbols) and lllr as well as llll are explored 88 // then we are not allowed to disable rS (canonical of lllS) because rS might not have been fully explored 89 // solution: we disable the state rS4 90 return problem.CanonicalRepresentation(state.ToString()) + state.Length; 91 } 92 } else 93 return state.ToString(); 86 94 } 87 95 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/BernoulliModel.cs
r11742 r11792 28 28 29 29 public void Update(double reward) { 30 Debug.Assert(reward.IsAlmost(1.0) || reward.IsAlmost(0.0));31 if (reward .IsAlmost(1.0)) {30 // Debug.Assert(reward.IsAlmost(1.0) || reward.IsAlmost(0.0)); 31 if (reward > 0) { 32 32 success++; 33 33 } else {
Note: See TracChangeset
for help on using the changeset viewer.