Context Navigation

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs @ 11792

Visit:

Last change on this file since 11792 was 11792, checked in by gkronber, 9 years ago
#2283 work-in-progress commit (does not compile)
File size: 5.0 KB

Line
1	using System;
2	using System.Collections.Generic;
3	using System.Diagnostics;
4	using System.Linq;
5	using System.Text;
6	using System.Threading.Tasks;
7	using HeuristicLab.Common;
8
9	namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10	/* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
11	International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
12
13	public class ThresholdAscentPolicy : IBanditPolicy {
14	public const int numBins = 101;
15	public const double binSize = 1.0 / (numBins - 1);
16
17	private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
18
19	// for each arm store the number of observed rewards for each bin of size delta
20	// for delta = 0.01 we have 101 bins
21	// the first bin is freq of rewards >= 0 // all
22	// the second bin is freq of rewards > 0
23	// the third bin is freq of rewards > 0.01
24	// the last bin is for rewards > 0.99
25	//
26	// (also see RewardBin function)
27	public int[] rewardHistogram = new int[numBins]; // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
28	public int Tries { get; private set; }
29	public int thresholdBin = 1;
30	private double knownValue;
31
32	public double Value {
33	get {
34	if (Disabled) return knownValue;
35	if (Tries == 0.0) return 0.0;
36	return rewardHistogram[thresholdBin] / (double)Tries;
37	}
38	}
39
40	public bool Disabled { get { return Tries == -1; } }
41
42	public void UpdateReward(double reward) {
43	Tries++;
44	for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
45	rewardHistogram[idx]++;
46	}
47
48	public void Disable(double reward) {
49	this.knownValue = reward;
50	Tries = -1;
51	}
52
53	public void Reset() {
54	Tries = 0;
55	thresholdBin = 1;
56	this.knownValue = 0.0;
57	Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
58	}
59
60	public void PrintStats() {
61	if (Tries >= 0) {
62	Console.Write("{0,6}", Tries);
63	} else {
64	Console.Write("{0,6}", "");
65	}
66	}
67
68	// maps a reward value to it's bin
69	private static int RewardBin(double reward) {
70	Debug.Assert(reward >= 0 && reward <= 1.0);
71	// reward = 0 => 0
72	// ]0.00 .. 0.01] => 1
73	// ]0.01 .. 0.02] => 2
74	// ...
75	// ]0.99 .. 1.00] => 100
76	if (reward <= 0) return 0;
77	return (int)Math.Ceiling((reward / binSize));
78	}
79	}
80
81	private readonly int s;
82	private readonly double delta;
83
84	public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
85	this.s = s;
86	this.delta = delta;
87	}
88
89	private double U(double mu, double totalTries, int n, int k) {
90	//var alpha = Math.Log(2.0 * totalTries * k / delta);
91	double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
92	return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
93	}
94
95
96	public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
97	Debug.Assert(actionInfos.Any());
98	var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
99	UpdateThreshold(myActionInfos);
100
101	var bestActions = new List<int>();
102	double bestQ = double.NegativeInfinity;
103	int k = myActionInfos.Count(a => !a.Disabled);
104	var totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
105	int aIdx = -1;
106	foreach (var aInfo in myActionInfos) {
107	aIdx++;
108	if (aInfo.Disabled) continue;
109	double q;
110	if (aInfo.Tries == 0) {
111	q = double.PositiveInfinity;
112	} else {
113	double mu = aInfo.Value; // probability of rewards > T
114	q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
115	}
116	if (q > bestQ) {
117	bestQ = q;
118	bestActions.Clear();
119	bestActions.Add(aIdx);
120	} else if (q == bestQ) {
121	bestActions.Add(aIdx);
122	}
123	}
124	Debug.Assert(bestActions.Any());
125	return bestActions.SelectRandom(random);
126	}
127
128
129	private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
130	var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
131	while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
132	thresholdBin++;
133	// Console.WriteLine("New threshold {0:F2}", T);
134	}
135	foreach (var aInfo in actionInfos) {
136	aInfo.thresholdBin = thresholdBin;
137	}
138	}
139
140
141	public IBanditPolicyActionInfo CreateActionInfo() {
142	return new ThresholdAscentActionInfo();
143	}
144
145	public override string ToString() {
146	return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
147	}
148
149	}
150	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences