Context Navigation

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs @ 11742

Visit:

Last change on this file since 11742 was 11742, checked in by gkronber, 9 years ago
#2283 refactoring
File size: 2.1 KB

Rev	Line
[11730]	1	using System;
	2	using System.Collections.Generic;
	3	using System.Diagnostics;
	4	using System.Linq;
	5	using System.Text;
	6	using System.Threading.Tasks;
	7
[11742]	8	namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
[11730]	9	/* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
	10	International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
	11
[11742]	12	public class ChernoffIntervalEstimationPolicy : IBanditPolicy {
[11730]	13	private readonly double delta;
	14
[11732]	15	public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
[11730]	16	this.delta = delta;
	17	}
[11742]	18	public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
[11732]	19	Debug.Assert(actionInfos.Any());
	20	// select best
[11742]	21	var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
	22	int k = myActionInfos.Count(a => !a.Disabled);
[11732]	23	int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
[11730]	24	int bestAction = -1;
	25	double bestQ = double.NegativeInfinity;
[11742]	26	var aIdx = -1;
	27	foreach (var aInfo in myActionInfos) {
	28	aIdx++;
	29	if (aInfo.Disabled) continue;
	30	if (aInfo.Tries == 0) return aIdx;
[11732]	31
[11742]	32	var avgReward = aInfo.SumReward / aInfo.Tries;
[11732]	33
[11730]	34	// page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem"
	35	// var alpha = Math.Log(2 * totalTries * k / delta);
[11742]	36	double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
	37	var q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
[11730]	38	if (q > bestQ) {
	39	bestQ = q;
[11742]	40	bestAction = aIdx;
[11730]	41	}
	42	}
[11732]	43	Debug.Assert(bestAction >= 0);
[11730]	44	return bestAction;
	45	}
	46
[11742]	47	public IBanditPolicyActionInfo CreateActionInfo() {
[11732]	48	return new DefaultPolicyActionInfo();
[11730]	49	}
	50
	51	public override string ToString() {
	52	return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
	53	}
	54	}
	55	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences