Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/BoltzmannExplorationPolicy.cs @ 12533

Last change on this file since 12533 was 11806, checked in by gkronber, 10 years ago

#2283: separated value-states from done-states in GenericGrammarPolicy and removed disabling of actions from bandit policies

File size: 2.0 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7using HeuristicLab.Common;
8
9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10  // also called softmax policy
11  public class BoltzmannExplorationPolicy : IBanditPolicy {
12    private readonly double beta;
13    private readonly Func<DefaultPolicyActionInfo, double> valueFunction;
14
15    public BoltzmannExplorationPolicy(double beta) : this(beta, DefaultPolicyActionInfo.AverageReward) { }
16
17    public BoltzmannExplorationPolicy(double beta, Func<DefaultPolicyActionInfo, double> valueFunction) {
18      if (beta < 0) throw new ArgumentException();
19      this.beta = beta;
20      this.valueFunction = valueFunction;
21    }
22    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
23      Debug.Assert(actionInfos.Any());
24
25      // select best
26      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
27
28      // try any of the untries actions randomly
29      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
30      //if (myActionInfos.Any(aInfo => !aInfo.Disabled && aInfo.Tries == 0)) {
31      //  return myActionInfos
32      //  .Select((aInfo, idx) => new { aInfo, idx })
33      //  .Where(p => !p.aInfo.Disabled)
34      //  .Where(p => p.aInfo.Tries == 0)
35      //  .SelectRandom(random).idx;
36      //}
37
38      var w = from aInfo in myActionInfos
39              select Math.Exp(beta * valueFunction(aInfo));
40
41      var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
42      Debug.Assert(bestAction >= 0);
43      return bestAction;
44    }
45
46    public IBanditPolicyActionInfo CreateActionInfo() {
47      return new DefaultPolicyActionInfo();
48    }
49
50    public override string ToString() {
51      return string.Format("BoltzmannExplorationPolicy({0:F2})", beta);
52    }
53  }
54}
Note: See TracBrowser for help on using the repository browser.