Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCBNormalPolicy.cs @ 11727

Last change on this file since 11727 was 11727, checked in by gkronber, 9 years ago

#2283: worked on grammatical optimization problem solvers (simple MCTS done)

File size: 1.9 KB
RevLine 
[11710]1using System;
2using System.Collections.Generic;
[11727]3using System.Diagnostics;
[11710]4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7
namespace HeuristicLab.Algorithms.Bandits {
  /// <summary>
  /// Bandit policy that ranks actions by their mean observed reward plus an exploration
  /// bonus based on the sample variance of the rewards. The index has the form of the
  /// UCB1-Normal rule (Auer, Cesa-Bianchi, Fischer 2002).
  /// </summary>
  public class UCBNormalPolicy : BanditPolicy {
    // Per-action number of observed rewards; set to -1 by DisableAction.
    private readonly int[] tries;
    // Per-action sum of observed rewards.
    private readonly double[] sumReward;
    // Per-action sum of squared observed rewards (used for the sample variance).
    private readonly double[] sumSqrReward;
    // Total number of observed rewards, reduced by the count of an action when it is disabled.
    private int totalTries = 0;

    /// <summary>Creates a policy with zeroed statistics for <paramref name="numActions"/> actions.</summary>
    /// <param name="numActions">Number of actions; used as the length of the statistics arrays.</param>
    public UCBNormalPolicy(int numActions)
      : base(numActions) {
      this.tries = new int[numActions];
      this.sumReward = new double[numActions];
      this.sumSqrReward = new double[numActions];
    }

    /// <summary>
    /// Selects the next action to try.
    /// </summary>
    /// <returns>
    /// The first action (in the order of <c>Actions</c>) that has been tried fewer than
    /// ceil(8 * ln(totalTries)) times, if any; otherwise the action with the largest index value.
    /// </returns>
    public override int SelectAction() {
      Debug.Assert(Actions.Any());
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      foreach (var a in Actions) {
        int n = tries[a];
        // forced exploration: under-sampled actions are returned immediately
        if (totalTries == 0 || n == 0 || n < Math.Ceiling(8 * Math.Log(totalTries))) return a;

        double q;
        if (n < 2 || totalTries < 2) {
          // Reached when totalTries == 1 (the threshold above is ceil(8 * ln 1) = 0).
          // The sample variance (division by n - 1) and ln(totalTries - 1) are undefined here;
          // evaluating the formula would give NaN, which never compares greater than bestQ,
          // so a lone action would end up as -1. Treat the action as maximally uncertain instead.
          q = double.PositiveInfinity;
        } else {
          var avgReward = sumReward[a] / n;
          // Clamp at zero: rounding can make the numerator slightly negative for (almost)
          // constant rewards, and Math.Sqrt of a negative value is NaN.
          var variance = Math.Max(0.0, (sumSqrReward[a] - n * avgReward * avgReward) / (n - 1));
          q = avgReward + Math.Sqrt(16 * variance * (Math.Log(totalTries - 1) / n));
        }

        if (q > bestQ) {
          bestQ = q;
          bestAction = a;
        }
      }
      return bestAction;
    }

    /// <summary>Records one observed reward for the given action.</summary>
    /// <param name="action">Index of the action that produced the reward; asserted to be in <c>Actions</c>.</param>
    /// <param name="reward">Observed reward value.</param>
    public override void UpdateReward(int action, double reward) {
      Debug.Assert(Actions.Contains(action));
      totalTries++;
      tries[action]++;
      sumReward[action] += reward;
      sumSqrReward[action] += reward * reward;
    }

    /// <summary>
    /// Disables the action in the base policy and removes its statistics from this policy.
    /// </summary>
    /// <param name="action">Index of the action to disable.</param>
    public override void DisableAction(int action) {
      base.DisableAction(action);
      // Only subtract real counts: an already disabled action holds -1, and subtracting
      // that would wrongly increase totalTries.
      if (tries[action] > 0) totalTries -= tries[action];
      tries[action] = -1;
      sumReward[action] = 0;
      sumSqrReward[action] = 0;
    }

    /// <summary>Resets the base policy and clears all reward statistics.</summary>
    public override void Reset() {
      base.Reset();
      totalTries = 0;
      Array.Clear(tries, 0, tries.Length);
      Array.Clear(sumReward, 0, sumReward.Length);
      Array.Clear(sumSqrReward, 0, sumSqrReward.Length);
    }
  }
}
Note: See TracBrowser for help on using the repository browser.