using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  // Reference: Cicirello and Smith, The Max K-armed Bandit: A New Model of Exploration Applied to Search Heuristic Selection, AAAI 2005
  // uses an exponentially decreasing cooling schedule
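  // With temperature T = exp(-totalTries / 2000), T drops from 1.0 at the start
  // to about 0.37 after 2000 tries and about 0.0067 after 10000 tries, at which
  // point SelectAction below switches to purely greedy arm selection.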
  public class BoltzmannExplorationWithCoolingPolicy : IBanditPolicy {
    private readonly double beta;

    public BoltzmannExplorationWithCoolingPolicy(double beta) {
      if (beta < 0) throw new ArgumentException("beta must be non-negative");
      this.beta = beta;
    }

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());

      // materialize to avoid enumerating the sequence multiple times below
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray();

      // first try any of the untried actions, chosen uniformly at random
      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
      if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
        return myActionInfos
          .Select((aInfo, idx) => new { aInfo, idx })
          .Where(p => p.aInfo.Tries == 0)
          .SelectRandom(random).idx;
      }

      var totalTries = myActionInfos.Sum(i => i.Tries);
      if (totalTries > 10000) {
        // the temperature is effectively zero by now: greedily take the arm with the best observed reward
        return myActionInfos
          .Select((aInfo, idx) => new { aInfo, idx })
          .OrderByDescending(t => t.aInfo.MaxReward)
          .First().idx;
      }
      // Boltzmann weights with temperature exp(-totalTries / 2000)
      var w = from aInfo in myActionInfos
              let q = aInfo.MaxReward // this should be an estimator for the expected maximum of the reward distribution
              select Math.Exp(beta * q / Math.Exp(-totalTries / 2000.0));
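      // Example with illustrative numbers: for beta = 1 and totalTries = 2000 the
      // temperature is exp(-1) ≈ 0.37, so arms with q = 0.5 and q = 0.6 get weights
      // exp(0.5 / 0.37) ≈ 3.9 and exp(0.6 / 0.37) ≈ 5.1: the better arm is favored
      // but still not chosen deterministically.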

      var bestAction = Enumerable.Range(0, myActionInfos.Length).SampleProportional(random, w);
      Debug.Assert(bestAction >= 0);
      return bestAction;
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return string.Format("BoltzmannExplorationWithCoolingPolicy({0:F2})", beta);
    }
  }
}
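
// A minimal standalone sketch (an illustration, not part of HeuristicLab): it
// reproduces the Boltzmann-with-cooling selection above on plain arrays so the
// weighting can be tested in isolation. The class and method names are assumptions.
public static class BoltzmannCoolingSketch {
  // maxRewards[i] is the best reward observed for arm i so far
  public static int Select(Random random, double[] maxRewards, int totalTries, double beta) {
    // temperature decays exponentially with the total number of pulls
    var temperature = Math.Exp(-totalTries / 2000.0);
    var weights = maxRewards.Select(q => Math.Exp(beta * q / temperature)).ToArray();
    // roulette-wheel sampling proportional to the weights
    var r = random.NextDouble() * weights.Sum();
    for (int i = 0; i < weights.Length; i++) {
      r -= weights[i];
      if (r <= 0.0) return i;
    }
    return weights.Length - 1; // guard against floating-point round-off
  }
}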