Context Navigation

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs @ 13042

Visit:

Last change on this file since 13042 was 12893, checked in by gkronber, 9 years ago
#2283: experiments on grammatical optimization algorithms (maxreward instead of avg reward, ...)
File size: 2.3 KB

Rev	Line
[11730]	1	using System;
	2	using System.Collections.Generic;
	3	using System.Diagnostics;
	4	using System.Linq;
	5	using System.Text;
	6	using System.Threading.Tasks;
[11792]	7	using HeuristicLab.Common;
[11730]	8
[11742]	9	namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
[11730]	10	/* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings of the 12th
	11	International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
	12
[11742]	13	public class ChernoffIntervalEstimationPolicy : IBanditPolicy {
[11730]	14	private readonly double delta;
	15
[11732]	16	public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
[11730]	17	this.delta = delta;
	18	}
[11742]	19	public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
[11732]	20	Debug.Assert(actionInfos.Any());
	21	// select best
[11742]	22	var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
[11806]	23	int k = myActionInfos.Count();
	24	int totalTries = myActionInfos.Sum(a => a.Tries);
[11730]	25	double bestQ = double.NegativeInfinity;
[11792]	26	var bestActions = new List<int>();
[11742]	27	var aIdx = -1;
	28	foreach (var aInfo in myActionInfos) {
	29	aIdx++;
[11792]	30	double q;
	31	if (aInfo.Tries == 0) {
	32	q = double.PositiveInfinity;
	33	} else {
[11732]	34
[12893]	35	var avgReward = aInfo.SumReward / aInfo.Tries;
[11732]	36
[12876]	37	// page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
[11792]	38	// var alpha = Math.Log(2 * totalTries * k / delta);
	39	double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
	40	// total tries is max tries in the original paper
	41	q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
	42	}
[11730]	43	if (q > bestQ) {
	44	bestQ = q;
[11792]	45	bestActions.Clear();
	46	bestActions.Add(aIdx);
[11806]	47	} else if (q.IsAlmost(bestQ)) {
[11792]	48	bestActions.Add(aIdx);
[11730]	49	}
	50	}
[11792]	51	Debug.Assert(bestActions.Any());
	52	return bestActions.SelectRandom(random);
[11730]	53	}
	54
[11742]	55	public IBanditPolicyActionInfo CreateActionInfo() {
[11732]	56	return new DefaultPolicyActionInfo();
[11730]	57	}
	58
	59	public override string ToString() {
	60	return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
	61	}
	62	}
	63	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences