Changeset 12893 for branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies

- Timestamp: 08/24/15 13:56:27
- Location: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies
- Files: 2 added, 8 edited
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ActiveLearningPolicy.cs
(r12876 → r12893)

        l = double.NegativeInfinity;
      } else {
  -     q = aInfo.SumReward / aInfo.Tries;
  +     q = aInfo.MaxReward;
        var b = Math.Sqrt(Math.Log(2.0 * k * totalTries / delta) / (2.0 * aInfo.Tries));
        u = q + MaxReward * b;
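This changeset consistently moves the policies from average-reward to best-reward estimates, in line with the max k-armed bandit setting: ActiveLearningPolicy keeps the Hoeffding-style interval width b but centers it on the best reward observed so far. A minimal self-contained sketch of the resulting bound computation; the ActionStats type and the symmetric lower bound are illustrative assumptions, not the branch's actual code:

```csharp
using System;

// Hypothetical stand-in for DefaultPolicyActionInfo.
class ActionStats {
  public double MaxReward; // best reward observed for this arm
  public int Tries;        // number of pulls of this arm
}

static class ActiveLearningBounds {
  // Interval around the per-arm estimate as in r12893:
  // width b = sqrt(ln(2*k*totalTries/delta) / (2*tries)), scaled by the reward range.
  public static (double lower, double upper) Bounds(
      ActionStats a, int k, int totalTries, double delta, double rewardScale) {
    double q = a.MaxReward; // r12893 change: best observed reward instead of SumReward/Tries
    double b = Math.Sqrt(Math.Log(2.0 * k * totalTries / delta) / (2.0 * a.Tries));
    return (q - rewardScale * b, q + rewardScale * b); // lower bound assumed symmetric
  }
}
```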
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/BoltzmannExplorationPolicy.cs
(r12290 → r12893)

      private readonly double beta;

      public BoltzmannExplorationPolicy(double beta) {   [whitespace-only change]
        if (beta < 0) throw new ArgumentException();
        this.beta = beta;
  ...
        Debug.Assert(actionInfos.Any());

  -     // select best
        var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();

        // try any of the untries actions randomly
  -     // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
  -     //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
  -     //  return myActionInfos
  -     //    .Select((aInfo, idx) => new { aInfo, idx })
  -     //    .Where(p => !p.aInfo.Disabled)
  -     //    .Where(p => p.aInfo.Tries == 0)
  -     //    .SelectRandom(random).idx;
  -     //}
  +     if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
  +       return myActionInfos
  +         .Select((aInfo, idx) => new { aInfo, idx })
  +         .Where(p => p.aInfo.Tries == 0)
  +         .SelectRandom(random).idx;
  +     }
  +
  +     // using ranks
  +     //var qualities = actionInfos.Select(i => i.MaxReward).ToArray(); // largest reward should have largest rank
  +     //var ranks = Enumerable.Range(0, myActionInfos.Count()).ToArray();
  +     //Array.Sort(qualities, ranks);
  +     //
  +     //// set same rank for same quality
  +     ////for (int i = 0; i < ranks.Length - 1; i++) {
  +     ////  if (qualities[i] == qualities[i + 1]) ranks[i + 1] = ranks[i];
  +     ////}
  +     ////
  +     //
  +     //var rankForAction = new int[myActionInfos.Count()];
  +     //for (int i = 0; i < rankForAction.Length; i++) {
  +     //  rankForAction[ranks[i]] = i;
  +     //}
  +     //
  +     //var w = from idx in Enumerable.Range(0, myActionInfos.Count())
  +     //        select Math.Exp(beta * rankForAction[idx]);

  -     var w = from aInfo in myActionInfos
  -             select Math.Exp(beta * aInfo.Value);
  +     // windowing
  +     var max = actionInfos.Select(i => i.MaxReward).Max();
  +     var min = actionInfos.Select(i => i.MaxReward).Min();
  +     double range = max - min;
  +     var w = from aInfo in actionInfos
  +             select Math.Exp(beta * (aInfo.MaxReward - min) / range);

        var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
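Two behavioral changes are visible here: untried arms are now always sampled first, and the softmax weights are computed over min-max normalized best rewards ("windowing") instead of raw mean values, which makes beta independent of the reward scale. Below is a short sketch of the windowing step, assuming at least one pull per arm (which the early return for untried arms guarantees); note that the committed code divides by range without guarding against all arms having equal MaxReward:

```csharp
using System;
using System.Linq;

static class BoltzmannWindowing {
  // Windowed Boltzmann weights as introduced in r12893: best-observed rewards
  // are rescaled to [0, 1] before exponentiation so beta is scale-independent.
  // maxRewards[i] is the best reward observed for arm i.
  public static double[] Weights(double[] maxRewards, double beta) {
    double min = maxRewards.Min();
    double range = maxRewards.Max() - min;
    if (range == 0.0)                               // guard added in this sketch;
      return maxRewards.Select(_ => 1.0).ToArray(); // the diff would produce NaN weights here
    return maxRewards.Select(m => Math.Exp(beta * (m - min) / range)).ToArray();
  }
}
```

The resulting weights then feed into SampleProportional exactly as before.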
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs
(r12876 → r12893)

      } else {

        var avgReward = aInfo.SumReward / aInfo.Tries;   [whitespace-only change]

        // page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
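This hunk is whitespace-only, but the cited reference is the source of the policy's index: page 5 of Streeter and Smith, "A simple distribution-free approach to the max k-armed bandit problem" (2006). From memory, and to be treated as an assumption rather than a quote, the Chernoff upper bound used there has the form

\[ U_i = \hat{\mu}_i + \frac{\alpha + \sqrt{2\, t_i\, \hat{\mu}_i\, \alpha + \alpha^2}}{t_i}, \qquad \alpha = \ln\frac{2nk}{\delta}, \]

where \( \hat{\mu}_i \) is exactly the avgReward computed above.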
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/EpsGreedyPolicy.cs
(r12290 → r12893)

      public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
        Debug.Assert(actionInfos.Any());
  +     var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
  +     int totalTries = myActionInfos.Select(i => i.Tries).Sum();
  +
  +     //var eps = Math.Exp(Math.Exp(-totalTries/200.0)) - 1;
  +
        if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
          // select best
  -       var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
          var bestActions = new List<int>();
          double bestQ = double.NegativeInfinity;
  ...
          aIdx++;

  -       var q = aInfo.Value;
  +       var q = aInfo.MaxReward;

          if (q > bestQ) {
  ...
        Debug.Assert(bestActions.Any());
  -     return bestActions.SelectRandom(random);
  +     //return bestActions.SelectRandom(random);
  +     return bestActions.First();
      } else {
        // select random
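Besides switching the greedy criterion from aInfo.Value to aInfo.MaxReward, this change replaces random tie-breaking with bestActions.First(), so among equally good arms the lowest-indexed one always wins; the new totalTries sum only serves the commented-out adaptive-eps experiment. A condensed sketch of the resulting behavior, operating on a plain reward array rather than the branch's action-info types:

```csharp
using System;
using System.Collections.Generic;

static class EpsGreedyMax {
  // Eps-greedy over best-observed rewards, mirroring r12893.
  // maxRewards[i] is the best reward seen so far for arm i.
  public static int SelectAction(Random random, IReadOnlyList<double> maxRewards, double eps) {
    // eps == 0 is pure exploitation, eps == 1 pure exploration (as in the source comment)
    if (random.NextDouble() >= eps) {
      int best = 0; // deterministic tie-breaking: the first best arm wins
      for (int i = 1; i < maxRewards.Count; i++)
        if (maxRewards[i] > maxRewards[best]) best = i;
      return best;
    }
    return random.Next(maxRewards.Count); // explore: uniform random arm
  }
}
```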
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ExtremeHunterPolicy.cs
(r12876 → r12893)

      public double delta { get; set; }
      public double b { get; set; }
  +   public double n { get; set; }
  +   public int minPulls { get; set; }

  -   public ExtremeHunterPolicy(double E = 1.0E-3, double D = 1.0E-2, double b = 1.0) {
  +   public ExtremeHunterPolicy(double E = 1.0E-3, double D = 1.0E-2, double b = 1.0, double n = 1.0E4, int minPulls = 100) {
        this.E = E; // parameter TODO
        this.D = D; // parameter TODO
  -     this.b = b; // parameter TODO
  +     this.b = b; // parameter: set to 1 in original paper "to consider a wide class of distributions"
        // private communication with Alexandra Carpentier:
        // For instance, on our synthetic experiments, we calibrated the constants by
  ...
        // out that taking E = 1e-3 is acceptable. For all the datasets
        // (exact Pareto, approximate Pareto, and network data), we kept this same constant
  +
  +     // minPulls seems to be set to 100 in the experiments in extreme bandit paper
  +     this.minPulls = minPulls; // parameter: TODO (there are conditions for N given in the paper)
  +     this.n = n;
      }
  ...
        var myActionInfos = actionInfos.OfType<ExtremeHunterActionInfo>();
        double bestQ = double.NegativeInfinity;
  -     int totalTries = myActionInfos.Sum(a => a.Tries);
  +     // int totalTries = myActionInfos.Sum(a => a.Tries);
        int K = myActionInfos.Count();
  -     double n = 1.0E2; // total tries parameter
  -     double minPulls = 100; // parameter: TODO (there are conditions for N given in the paper)

        this.delta = Math.Exp(-Math.Log(Math.Log(n))) / (2.0 * n * K); // TODO
  ...
        double t = aInfo.Tries;
        double h = aInfo.Value;
  -     var thres = Math.Pow(t, h / (2 * b + 1));
  -     double c = Math.Pow(t, 1.0 / (2 * b + 1)) * ((1.0 / t) * aInfo.Rewards.Count(r => r >= thres));
  -     q = Math.Pow((c + B2(t)) * n, h + B1(t)) * Gamma(h, B1(t)); // eqn (5)
  -     Debug.Assert(q > 0);
  +     if (double.IsInfinity(h)) q = 0;
  +     else {
  +       var thres = Math.Pow(t, h / (2 * b + 1));
  +       double c = Math.Pow(t, 1.0 / (2 * b + 1)) * ((1.0 / t) * aInfo.Rewards.Count(r => r >= thres));
  +       q = Math.Pow((c + B2(t)) * n, h + B1(t)) * Gamma(h, B1(t)); // eqn (5)
  +       Debug.Assert(q > 0);
  +     }
        if (q > bestQ) {
  ...
      public override string ToString() {
  -     return "ExtremeHunter";
  +     return string.Format("ExtremeHunter(E={0:F2},D={1:F2},b={2:F2},n={3:F0},minPulls={4:F0})", E, D, b, n, minPulls);
      }
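The horizon n and the minimum pull count are promoted from local constants to constructor parameters; note that the old local n = 1.0E2 contradicted its own "total tries" comment, while the new default is 1.0E4. The index from eqn (5) is also guarded against an infinite tail-index estimate h, and the confidence level becomes delta = exp(-log log n) / (2nK). Constructing the policy with the new defaults is now a one-liner:

```csharp
// Defaults from r12893; E = 1e-3 and minPulls = 100 match the calibration
// notes quoted in the constructor comments.
var policy = new ExtremeHunterPolicy(E: 1.0E-3, D: 1.0E-2, b: 1.0, n: 1.0E4, minPulls: 100);
```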
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs
(r11976 → r12893)

      public int Tries { get; private set; }
      public int thresholdBin = 1;
  +   //public double MaxReward { get { return Value; }}
  +   public double MaxReward { get; private set; }
      public double Value {
        get {
  ...
      public void UpdateReward(double reward) {
  +     MaxReward = Math.Max(MaxReward, reward);
        Tries++;
        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
  ...
      public void Reset() {
  +     MaxReward = double.NegativeInfinity;
        Tries = 0;
        thresholdBin = 1;
  ...
        double bestQ = double.NegativeInfinity;
        int k = myActionInfos.Count();
  -     var totalTries = myActionInfos.Sum(a => a.Tries);
  +     //var totalTries = myActionInfos.Sum(a => a.Tries);
  +     var totalTries = 100000;
        int aIdx = -1;
        foreach (var aInfo in myActionInfos) {
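ThresholdAscentPolicy's per-action statistics now also track the best reward seen (so the MaxReward-based policies above can read it), and the exploration term uses a fixed totalTries = 100000 instead of the actual pull count, effectively freezing the horizon. A minimal sketch of the new bookkeeping; the property initializer is an addition in this sketch, since the diff relies on Reset() being called before the first update:

```csharp
using System;

// Condensed view of the r12893 additions to ThresholdAscentActionInfo.
class MaxRewardTracker {
  public int Tries { get; private set; }
  public double MaxReward { get; private set; } = double.NegativeInfinity;

  public void UpdateReward(double reward) {
    MaxReward = Math.Max(MaxReward, reward); // keep the running best, as in the diff
    Tries++;
  }

  public void Reset() {
    MaxReward = double.NegativeInfinity;
    Tries = 0;
  }
}
```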
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/UCB1Policy.cs
(r12876 → r12893)

      } else {
  -     q = aInfo.SumReward / aInfo.Tries + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
  +     //q = aInfo.SumReward / aInfo.Tries + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
  +     q = aInfo.MaxReward + MaxReward * Math.Sqrt((2 * Math.Log(totalTries)) / aInfo.Tries);
      }
      if (q > bestQ) {
  ...
      public override string ToString() {
  -     return "UCB1Policy";
  +     return string.Format("UCB1Policy({0})", MaxReward);
      }
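UCB1's index is thus changed from the classical mean-based form to a max-reward form. In the notation of the code, with R the policy-level MaxReward scale, N the total number of tries, and \( t_a \) the pulls of arm a:

\[ q_a = \operatorname{maxReward}_a + R\,\sqrt{\frac{2 \ln N}{t_a}} \]

Only the first term changed; the exploration bonus is untouched, and ToString now reports the scale R.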
branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs
(r12876 → r12893)

        var tries = aInfo.Tries;

  +     //var avgReward = aInfo.MaxReward;
        var avgReward = sumReward / tries;
        q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries)));
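Here only a commented-out alternative estimate was added; the live index is still standard UCB1-Tuned. For reference (from Auer, Cesa-Bianchi and Fischer, "Finite-time Analysis of the Multiarmed Bandit Problem", 2002, and presumably what V(aInfo, totalTries) computes), the variance bound and index are

\[ V_j(s) = \left(\frac{1}{s}\sum_{\tau=1}^{s} x_{j,\tau}^2\right) - \bar{x}_{j,s}^2 + \sqrt{\frac{2\ln t}{s}}, \qquad q_j = \bar{x}_{j,s} + \sqrt{\frac{\ln t}{s}\,\min\!\left(\tfrac14,\, V_j(s)\right)} \]

which matches the Math.Min(1.0 / 4, V(aInfo, totalTries)) term in the code.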