using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits {
  /* Kocsis et al. Bandit based Monte-Carlo Planning */

  /// <summary>
  /// UCT action-selection policy (Kocsis &amp; Szepesvári, "Bandit based
  /// Monte-Carlo Planning", ECML 2006). Picks the action maximizing
  /// avgReward + 2*c*sqrt(ln(totalTries) / tries[a]); each untried action
  /// is played once before the bound is applied.
  /// </summary>
  public class UCTPolicy : BanditPolicy {
    private readonly int[] tries;        // per-action pull count; -1 is the sentinel for a disabled action
    private readonly double[] sumReward; // per-action cumulative reward
    private int totalTries = 0;          // total pulls over the currently enabled actions
    private readonly double c;           // exploration weight (Cp in the paper)

    /// <param name="numActions">number of arms/actions of the bandit</param>
    /// <param name="c">exploration constant; larger values explore more</param>
    public UCTPolicy(int numActions, double c = 1.0)
      : base(numActions) {
      this.tries = new int[numActions];
      this.sumReward = new double[numActions];
      this.c = c;
    }

    /// <summary>Returns the enabled action with the highest UCT value.</summary>
    public override int SelectAction() {
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      foreach (var a in Actions) {
        // Play every untried action once first; this also keeps
        // Math.Log(totalTries) from being evaluated while totalTries == 0.
        if (tries[a] == 0) return a;
        var q = sumReward[a] / tries[a]
              + 2 * c * Math.Sqrt(Math.Log(totalTries) / tries[a]);
        if (q > bestQ) {
          bestQ = q;
          bestAction = a;
        }
      }
      return bestAction;
    }

    /// <summary>Records the observed reward for <paramref name="action"/>.</summary>
    public override void UpdateReward(int action, double reward) {
      Debug.Assert(Actions.Contains(action));
      totalTries++;
      tries[action]++;
      sumReward[action] += reward;
    }

    /// <summary>
    /// Removes an action from consideration; its pulls are subtracted from
    /// <c>totalTries</c> so the confidence bound of the remaining actions
    /// reflects only enabled pulls.
    /// </summary>
    public override void DisableAction(int action) {
      base.DisableAction(action);
      totalTries -= tries[action];
      tries[action] = -1; // sentinel: never selected again, skipped by PrintStats
      sumReward[action] = 0;
    }

    /// <summary>Re-enables all actions and clears all statistics.</summary>
    public override void Reset() {
      base.Reset();
      totalTries = 0;
      Array.Clear(tries, 0, tries.Length);
      Array.Clear(sumReward, 0, sumReward.Length);
    }

    /// <summary>Writes the average reward of each tried action to the console.</summary>
    public override void PrintStats() {
      for (int i = 0; i < sumReward.Length; i++) {
        // BUGFIX: the original condition was tries[i] >= 0, which printed
        // NaN (0.0 / 0) for enabled-but-untried actions; only actions with
        // at least one pull have a meaningful average, the rest stay blank.
        if (tries[i] > 0) {
          Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
        } else {
          Console.Write("{0,5}", "");
        }
      }
      Console.WriteLine();
    }

    public override string ToString() {
      return string.Format("UCTPolicy({0:F2})", c);
    }
  }
}