Context Navigation

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs @ 11744

Visit:

Last change on this file since 11744 was 11744, checked in by gkronber, 10 years ago
#2283 worked on TD, and models for MCTS
File size: 4.5 KB

Line
1	using System;
2	using System.Collections.Generic;
3	using System.Diagnostics;
4	using System.Linq;
5	using System.Text;
6	using System.Threading.Tasks;
7	using HeuristicLab.Common;
8
9	namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10	/* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
11	International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
13	public class ThresholdAscentPolicy : IBanditPolicy {
14	public const int numBins = 101;
15	public const double binSize = 1.0 / (numBins - 1);
16
17	private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
18
19	// for each arm store the number of observed rewards for each bin of size delta
20	// for delta = 0.01 we have 101 bins
21	// the first bin is freq of rewards >= 0 // all
22	// the second bin is freq of rewards > 0
23	// the third bin is freq of rewards > 0.01
24	// the last bin is for rewards > 0.99
25	//
26	// (also see RewardBin function)
27	public int[] rewardHistogram = new int[numBins]; // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
28	public int Tries { get; private set; }
29	public int thresholdBin = 1;
30	public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } }
31
32	public bool Disabled { get { return Tries == -1; } }
33
34	public void UpdateReward(double reward) {
35	Tries++;
36	for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
37	rewardHistogram[idx]++;
38	}
39
40	public void Disable() {
41	Tries = -1;
42	}
43
44	public void Reset() {
45	Tries = 0;
46	thresholdBin = 1;
47	Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
48	}
49
50	public void PrintStats() {
51	if (Tries >= 0) {
52	Console.Write("{0,6}", Tries);
53	} else {
54	Console.Write("{0,6}", "");
55	}
56	}
57
58	// maps a reward value to it's bin
59	private static int RewardBin(double reward) {
60	Debug.Assert(reward >= 0 && reward <= 1.0);
61	// reward = 0 => 0
62	// ]0.00 .. 0.01] => 1
63	// ]0.01 .. 0.02] => 2
64	// ...
65	// ]0.99 .. 1.00] => 100
66	if (reward <= 0) return 0;
67	return (int)Math.Ceiling((reward / binSize));
68	}
69	}
70
71	private readonly int s;
72	private readonly double delta;
73
74	public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
75	this.s = s;
76	this.delta = delta;
77	}
78
79	private double U(double mu, double totalTries, int n, int k) {
80	//var alpha = Math.Log(2.0 * totalTries * k / delta);
81	double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
82	return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
83	}
84
85
86	public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
87	Debug.Assert(actionInfos.Any());
88	var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
89	UpdateThreshold(myActionInfos);
90
91	int bestAction = -1;
92	double bestQ = double.NegativeInfinity;
93	int k = myActionInfos.Count(a => !a.Disabled);
94	var totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
95	int aIdx = -1;
96	foreach (var aInfo in myActionInfos) {
97	aIdx++;
98	if (aInfo.Disabled) continue;
99	if (aInfo.Tries == 0) return aIdx;
100	double mu = aInfo.Value; // probability of rewards > T
101	double q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
102	if (q > bestQ) {
103	bestQ = q;
104	bestAction = aIdx;
105	}
106	}
107	Debug.Assert(bestAction > -1);
108	return bestAction;
109	}
110
111
112	private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
113	var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
114	while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
115	thresholdBin++;
116	// Console.WriteLine("New threshold {0:F2}", T);
117	}
118	foreach (var aInfo in actionInfos) {
119	aInfo.thresholdBin = thresholdBin;
120	}
121	}
122
123
124	public IBanditPolicyActionInfo CreateActionInfo() {
125	return new ThresholdAscentActionInfo();
126	}
127
128	public override string ToString() {
129	return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
130	}
131
132	}
133	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences