using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  // Reference: Cicirello and Smith, The Max K-armed Bandit: A New Model of Exploration Applied to Search Heuristic Selection, AAAI 2005
  // uses an exponentially decreasing cooling schedule
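  // Note: in the max k-armed bandit model of the reference above, the goal is to maximize
  // the single best reward observed over all pulls rather than the cumulative reward,
  // which is why MaxReward serves as the arm-quality estimate in SelectAction below.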
  public class BoltzmannExplorationWithCoolingPolicy : IBanditPolicy {
    private readonly double beta;

    public BoltzmannExplorationWithCoolingPolicy(double beta) {
      if (beta < 0) throw new ArgumentException("beta must be non-negative", "beta");
      this.beta = beta;
    }

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());

      // this policy only tracks statistics in DefaultPolicyActionInfo instances
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();

      // try any of the untried actions randomly
      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
      if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
        return myActionInfos
          .Select((aInfo, idx) => new { aInfo, idx })
          .Where(p => p.aInfo.Tries == 0)
          .SelectRandom(random).idx;
      }

      var totalTries = myActionInfos.Sum(i => i.Tries);
      if (totalTries > 10000) {
        // take best arm
        return myActionInfos
          .Select((aInfo, idx) => new { aInfo, idx })
          .OrderByDescending(t => t.aInfo.MaxReward)
          .First().idx;
      }
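      // Annealing detail (reading of the formula below): the weights are exp(beta * q / T)
      // with temperature T = exp(-totalTries / 2000), so T decays exponentially from 1
      // towards 0 and the policy shifts from exploration towards exploitation.
      // At totalTries = 10000, T = exp(-5) ~ 0.0067; assuming rewards roughly in [0, 1]
      // and beta near 1, Math.Exp would overflow to +Infinity around totalTries ~ 13000,
      // which the greedy cutoff above preempts.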
      var w = from aInfo in myActionInfos
              let q = aInfo.MaxReward // this should be an estimator for the expected maximum of the distribution
              select Math.Exp(beta * q / Math.Exp(-totalTries / 2000.0));

      var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
      Debug.Assert(bestAction >= 0);
      return bestAction;
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return string.Format("BoltzmannExplorationWithCoolingPolicy({0:F2})", beta);
    }
  }
}
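
// Minimal usage sketch (an assumption, not part of this file): the caller keeps one
// action info per arm, asks the policy for an arm index, plays that arm, and records
// the observed reward in the corresponding action info. The reward-update call on
// DefaultPolicyActionInfo is assumed and not shown in this file:
//
//   var policy = new BoltzmannExplorationWithCoolingPolicy(1.0);
//   var infos = Enumerable.Range(0, numArms).Select(_ => policy.CreateActionInfo()).ToList();
//   var random = new Random(31415);
//   for (int step = 0; step < 1000; step++) {
//     int arm = policy.SelectAction(random, infos);
//     double reward = PullArm(arm);          // hypothetical environment call
//     // infos[arm].UpdateReward(reward);    // assumed update API on the action info
//   }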