Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Algorithms.Bandits/Policies/BoltzmannExplorationWithCoolingPolicy.cs @ 13264

Last change on this file since 13264 was 12893, checked in by gkronber, 9 years ago

#2283: experiments on grammatical optimization algorithms (maxreward instead of avg reward, ...)

File size: 2.3 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.ComponentModel;
4using System.Diagnostics;
5using System.Linq;
6using System.Text;
7using System.Threading.Tasks;
8using HeuristicLab.Common;
9
10namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
11  // Reference: Cicirello and Smith, The Max K-armed Bandit: A New Model of Exploration Applied to Search Heuristic Selection, AAAI 2005
12  // uses exponentially decreasing cooling schedule
13  public class BoltzmannExplorationWithCoolingPolicy : IBanditPolicy {
14    private readonly double beta;
15
16    public BoltzmannExplorationWithCoolingPolicy(double beta) {
17      if (beta < 0) throw new ArgumentException();
18      this.beta = beta;
19    }
20    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
21      Debug.Assert(actionInfos.Any());
22
23      // select best
24      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
25
26      // try any of the untries actions randomly
27      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
28      if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
29        return myActionInfos
30        .Select((aInfo, idx) => new { aInfo, idx })
31        .Where(p => p.aInfo.Tries == 0)
32        .SelectRandom(random).idx;
33      }
34
35      var totalTries = myActionInfos.Sum(i => i.Tries);
36      if (totalTries > 10000) {
37        // take best arm
38        return myActionInfos
39                  .Select((aInfo, idx) => new { aInfo, idx })
40                  .OrderByDescending(t => t.aInfo.MaxReward)
41                  .First().idx;
42      }
43      var w = from aInfo in myActionInfos
44              let q = aInfo.MaxReward // this should be an estimator for the expected maximum of the distribution
45              select Math.Exp(beta * q / Math.Exp(-totalTries / 2000.0));
46
47      var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
48      Debug.Assert(bestAction >= 0);
49      return bestAction;
50    }
51
52    public IBanditPolicyActionInfo CreateActionInfo() {
53      return new DefaultPolicyActionInfo();
54    }
55
56    public override string ToString() {
57      return string.Format("BoltzmannExplorationWithCoolingPolicy({0:F2})", beta);
58    }
59  }
60}
Note: See TracBrowser for help on using the repository browser.