Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/BoltzmannExplorationPolicy.cs @ 13620

Last change on this file since 13620 was 12893, checked in by gkronber, 9 years ago

#2283: experiments on grammatical optimization algorithms (maxreward instead of avg reward, ...)

File size: 2.4 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7using HeuristicLab.Common;
8
9namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
10  // also called softmax policy
11  public class BoltzmannExplorationPolicy : IBanditPolicy {
12    private readonly double beta;
13
14    public BoltzmannExplorationPolicy(double beta) {
15      if (beta < 0) throw new ArgumentException();
16      this.beta = beta;
17    }
18    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
19      Debug.Assert(actionInfos.Any());
20
21      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
22
23      // try any of the untries actions randomly
24      if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
25        return myActionInfos
26        .Select((aInfo, idx) => new { aInfo, idx })
27        .Where(p => p.aInfo.Tries == 0)
28        .SelectRandom(random).idx;
29      }
30
31      // using ranks
32      //var qualities = actionInfos.Select(i => i.MaxReward).ToArray(); // largest reward should have largest rank
33      //var ranks = Enumerable.Range(0, myActionInfos.Count()).ToArray();
34      //Array.Sort(qualities, ranks);
35      //
36      //// set same rank for same quality
37      ////for (int i = 0; i < ranks.Length - 1; i++) {
38      ////  if (qualities[i] == qualities[i + 1]) ranks[i + 1] = ranks[i];
39      ////}
40      ////
41      //
42      //var rankForAction = new int[myActionInfos.Count()];
43      //for (int i = 0; i < rankForAction.Length; i++) {
44      //  rankForAction[ranks[i]] = i;
45      //}
46      //
47      //var w = from idx in Enumerable.Range(0, myActionInfos.Count())
48      //        select Math.Exp(beta * rankForAction[idx]);
49
50
51      // windowing
52      var max = actionInfos.Select(i => i.MaxReward).Max();
53      var min = actionInfos.Select(i => i.MaxReward).Min();
54      double range = max - min;
55      var w = from aInfo in actionInfos
56              select Math.Exp(beta * (aInfo.MaxReward - min) / range);
57
58      var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
59      Debug.Assert(bestAction >= 0);
60      return bestAction;
61    }
62
63    public IBanditPolicyActionInfo CreateActionInfo() {
64      return new DefaultPolicyActionInfo();
65    }
66
67    public override string ToString() {
68      return string.Format("BoltzmannExplorationPolicy({0:F2})", beta);
69    }
70  }
71}
Note: See TracBrowser for help on using the repository browser.