Context Navigation

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ThresholdAscentPolicy.cs @ 13042

Visit:

Last change on this file since 13042 was 12893, checked in by gkronber, 9 years ago
#2283: experiments on grammatical optimization algorithms (maxreward instead of avg reward, ...)
File size: 4.7 KB

Rev	Line
[11730]	1	using System;
	2	using System.Collections.Generic;
	3	using System.Diagnostics;
	4	using System.Linq;
	5	using System.Text;
	6	using System.Threading.Tasks;
[11742]	7	using HeuristicLab.Common;
[11730]	8
[11742]	9	namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
[11730]	10	/* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
	11	International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
	12
[11742]	13	public class ThresholdAscentPolicy : IBanditPolicy {
	14	public const int numBins = 101;
	15	public const double binSize = 1.0 / (numBins - 1);
[11730]	16
[11742]	17	private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {
[11730]	18
[11742]	19	// for each arm store the number of observed rewards for each bin of size delta
	20	// for delta = 0.01 we have 101 bins
	21	// the first bin is freq of rewards >= 0 // all
	22	// the second bin is freq of rewards > 0
	23	// the third bin is freq of rewards > 0.01
	24	// the last bin is for rewards > 0.99
	25	//
	26	// (also see RewardBin function)
	27	public int[] rewardHistogram = new int[numBins]; // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
	28	public int Tries { get; private set; }
	29	public int thresholdBin = 1;
[12893]	30	//public double MaxReward { get { return Value; }}
	31	public double MaxReward { get; private set; }
[11747]	32	public double Value {
	33	get {
[11976]	34	if (Tries == 0.0) return double.PositiveInfinity;
[11747]	35	return rewardHistogram[thresholdBin] / (double)Tries;
	36	}
	37	}
	38
[11742]	39	public void UpdateReward(double reward) {
[12893]	40	MaxReward = Math.Max(MaxReward, reward);
[11742]	41	Tries++;
	42	for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
	43	rewardHistogram[idx]++;
	44	}
	45
	46	public void Reset() {
[12893]	47	MaxReward = double.NegativeInfinity;
[11742]	48	Tries = 0;
	49	thresholdBin = 1;
	50	Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
	51	}
	52
	53	// maps a reward value to it's bin
	54	private static int RewardBin(double reward) {
	55	Debug.Assert(reward >= 0 && reward <= 1.0);
	56	// reward = 0 => 0
	57	// ]0.00 .. 0.01] => 1
	58	// ]0.01 .. 0.02] => 2
	59	// ...
	60	// ]0.99 .. 1.00] => 100
	61	if (reward <= 0) return 0;
	62	return (int)Math.Ceiling((reward / binSize));
	63	}
	64	}
	65
[11730]	66	private readonly int s;
	67	private readonly double delta;
	68
[11742]	69	public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
[11730]	70	this.s = s;
	71	this.delta = delta;
	72	}
	73
[11744]	74	private double U(double mu, double totalTries, int n, int k) {
[11730]	75	//var alpha = Math.Log(2.0 * totalTries * k / delta);
[11742]	76	double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
[11730]	77	return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
	78	}
	79
	80
[11742]	81	public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
	82	Debug.Assert(actionInfos.Any());
	83	var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
	84	UpdateThreshold(myActionInfos);
	85
[11792]	86	var bestActions = new List<int>();
[11730]	87	double bestQ = double.NegativeInfinity;
[11806]	88	int k = myActionInfos.Count();
[12893]	89	//var totalTries = myActionInfos.Sum(a => a.Tries);
	90	var totalTries = 100000;
[11742]	91	int aIdx = -1;
	92	foreach (var aInfo in myActionInfos) {
	93	aIdx++;
[11792]	94	double q;
	95	if (aInfo.Tries == 0) {
	96	q = double.PositiveInfinity;
	97	} else {
	98	double mu = aInfo.Value; // probability of rewards > T
	99	q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
	100	}
[11730]	101	if (q > bestQ) {
	102	bestQ = q;
[11792]	103	bestActions.Clear();
	104	bestActions.Add(aIdx);
[11806]	105	} else if (q.IsAlmost(bestQ)) {
[11792]	106	bestActions.Add(aIdx);
[11730]	107	}
	108	}
[11792]	109	Debug.Assert(bestActions.Any());
	110	return bestActions.SelectRandom(random);
[11730]	111	}
	112
[11742]	113
	114	private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
	115	var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
	116	while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
[11730]	117	thresholdBin++;
	118	// Console.WriteLine("New threshold {0:F2}", T);
	119	}
[11742]	120	foreach (var aInfo in actionInfos) {
	121	aInfo.thresholdBin = thresholdBin;
	122	}
[11730]	123	}
	124
	125
[11742]	126	public IBanditPolicyActionInfo CreateActionInfo() {
	127	return new ThresholdAscentActionInfo();
[11730]	128	}
	129
	130	public override string ToString() {
	131	return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
	132	}
	133
	134	}
	135	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences