source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCTPolicy.cs @ 11732

Last change on this file since 11732 was 11732, checked in by gkronber, 5 years ago

#2283: refactoring and bug fixes

File size: 1.4 KB
RevLine 
[11730]1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7
namespace HeuristicLab.Algorithms.Bandits {
  /* Kocsis et al. Bandit based Monte-Carlo Planning.
   * Scores each action with avgReward + 2*c*sqrt(ln(totalTries)/tries)
   * and selects the maximizer; untried actions are explored first. */
  public class UCTPolicy : IPolicy {
    // Exploration weight (Cp in the UCT term 2*Cp*sqrt(ln N / n_a)).
    private readonly double c;

    public UCTPolicy(double c = 1.0) {
      this.c = c;
    }

    // Returns the index of the selected action within actionInfos.
    // - Untried (Tries == 0) enabled actions are returned immediately.
    // - Ties on the UCT score are broken uniformly at random via reservoir
    //   sampling (previously the unused 'random' parameter meant ties always
    //   went to the lowest index, biasing selection).
    // Precondition: at least one enabled action must be present (asserted).
    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      int nBest = 0; // number of actions currently tied at bestQ
      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);

      for (int a = 0; a < myActionInfos.Length; a++) {
        if (myActionInfos[a].Disabled) continue;
        if (myActionInfos[a].Tries == 0) return a; // always explore untried actions first
        var q = myActionInfos[a].SumReward / myActionInfos[a].Tries + 2 * c * Math.Sqrt(Math.Log(totalTries) / myActionInfos[a].Tries);
        if (q > bestQ) {
          bestQ = q;
          bestAction = a;
          nBest = 1;
        } else if (q == bestQ) {
          // reservoir sampling: each of the k tied actions wins with probability 1/k
          nBest++;
          if (random.Next(nBest) == 0) bestAction = a;
        }
      }
      Debug.Assert(bestAction > -1); // violated only if every action is disabled
      return bestAction;
    }

    public IPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return string.Format("UCTPolicy({0:F2})", c);
    }
  }
}
Note: See TracBrowser for help on using the repository browser.