Context Navigation

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs @ 12448

Visit:

Last change on this file since 12448 was 11806, checked in by gkronber, 10 years ago
#2283: separated value-states from done-states in GenericGrammarPolicy and removed disabling of actions from bandit policies
File size: 2.3 KB

Line
1	using System;
2	using System.Collections.Generic;
3	using System.Diagnostics;
4	using System.Linq;
5	using System.Text;
6	using System.Threading.Tasks;
7	using HeuristicLab.Common;
8
9	namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10	/* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
11	International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
13	public class ChernoffIntervalEstimationPolicy : IBanditPolicy {
14	private readonly double delta;
15
16	public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
17	this.delta = delta;
18	}
19	public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
20	Debug.Assert(actionInfos.Any());
21	// select best
22	var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
23	int k = myActionInfos.Count();
24	int totalTries = myActionInfos.Sum(a => a.Tries);
25	double bestQ = double.NegativeInfinity;
26	var bestActions = new List<int>();
27	var aIdx = -1;
28	foreach (var aInfo in myActionInfos) {
29	aIdx++;
30	double q;
31	if (aInfo.Tries == 0) {
32	q = double.PositiveInfinity;
33	} else {
34
35	var avgReward = aInfo.SumReward / aInfo.Tries;
36
37	// page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem"
38	// var alpha = Math.Log(2 * totalTries * k / delta);
39	double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
40	// total tries is max tries in the original paper
41	q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
42	}
43	if (q > bestQ) {
44	bestQ = q;
45	bestActions.Clear();
46	bestActions.Add(aIdx);
47	} else if (q.IsAlmost(bestQ)) {
48	bestActions.Add(aIdx);
49	}
50	}
51	Debug.Assert(bestActions.Any());
52	return bestActions.SelectRandom(random);
53	}
54
55	public IBanditPolicyActionInfo CreateActionInfo() {
56	return new DefaultPolicyActionInfo();
57	}
58
59	public override string ToString() {
60	return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
61	}
62	}
63	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences