Changeset 11732 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs
- Timestamp:
- 01/07/15 09:21:46 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs
r11730 r11732 10 10 International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */ 11 11 12 public class ChernoffIntervalEstimationPolicy : BanditPolicy { 13 private readonly int[] tries; 14 private readonly double[] sumReward; 15 private int totalTries = 0; 12 public class ChernoffIntervalEstimationPolicy : IPolicy { 16 13 private readonly double delta; 17 14 18 public ChernoffIntervalEstimationPolicy(int numActions, double delta = 0.01) 19 : base(numActions) { 15 public ChernoffIntervalEstimationPolicy(double delta = 0.01) { 20 16 this.delta = delta; 21 this.tries = new int[numActions];22 this.sumReward = new double[numActions];23 17 } 24 25 public override int SelectAction() { 18 public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) { 19 Debug.Assert(actionInfos.Any()); 20 // select best 21 var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance 22 int k = myActionInfos.Length; 23 int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries); 26 24 int bestAction = -1; 27 25 double bestQ = double.NegativeInfinity; 28 double k = Actions.Count(); 29 Debug.Assert(k > 0); 30 foreach (var a in Actions) { 31 if (tries[a] == 0) return a; 26 for (int a = 0; a < myActionInfos.Length; a++) { 27 if (myActionInfos[a].Disabled) continue; 28 if (myActionInfos[a].Tries == 0) return a; 29 30 var sumReward = myActionInfos[a].SumReward; 31 var tries = myActionInfos[a].Tries; 32 33 var avgReward = sumReward / tries; 34 32 35 // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem" 33 36 // var alpha = Math.Log(2 * totalTries * k / delta); 34 37 double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper 35 double mu = sumReward[a] / tries[a]; 36 var q = mu + (alpha + Math.Sqrt(2 * tries[a] * mu * alpha + alpha * alpha)) / tries[a]; 38 var q = avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries; 37 39 if (q > bestQ) { 38 40 bestQ = q; … … 40 42 } 41 43 } 44 Debug.Assert(bestAction >= 0); 42 45 return bestAction; 43 46 } 44 public override void UpdateReward(int action, double reward) { 45 Debug.Assert(Actions.Contains(action)); 46 totalTries++; 47 tries[action]++; 48 sumReward[action] += reward; 47 48 public IPolicyActionInfo CreateActionInfo() { 49 return new DefaultPolicyActionInfo(); 49 50 } 50 51 51 public override void DisableAction(int action) {52 base.DisableAction(action);53 totalTries -= tries[action];54 tries[action] = -1;55 sumReward[action] = 0;56 }57 58 public override void Reset() {59 base.Reset();60 totalTries = 0;61 Array.Clear(tries, 0, tries.Length);62 Array.Clear(sumReward, 0, sumReward.Length);63 }64 65 public override void PrintStats() {66 for (int i = 0; i < sumReward.Length; i++) {67 if (tries[i] >= 0) {68 Console.Write("{0,5:F2}", sumReward[i] / tries[i]);69 } else {70 Console.Write("{0,5}", "");71 }72 }73 Console.WriteLine();74 }75 52 public override string ToString() { 76 53 return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
Note: See TracChangeset
for help on using the changeset viewer.