using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /* Kocsis & Szepesvári, "Bandit based Monte-Carlo Planning" (ECML 2006):
     UCB1-style action selection with exploration bonus 2 * c * sqrt(ln N / n_a). */
  public class UCTPolicy : IBanditPolicy {
    private readonly double c;

    public UCTPolicy(double c = 1.0) {
      this.c = c;
    }

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
      double bestQ = double.NegativeInfinity;
      // N in the UCT formula: total number of pulls over all enabled actions.
      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
      int aIdx = -1;
      var bestActions = new List<int>();
      foreach (var aInfo in myActionInfos) {
        aIdx++;
        if (aInfo.Disabled) continue;
        double q;
        if (aInfo.Tries == 0) {
          // Untried actions are selected before any tried action.
          q = double.PositiveInfinity;
        } else {
          // UCT value: empirical mean reward plus exploration bonus.
          q = aInfo.SumReward / aInfo.Tries
            + 2.0 * c * Math.Sqrt(Math.Log(totalTries) / aInfo.Tries);
        }
        if (q > bestQ) {
          // Strictly better value: restart the list of tied candidates.
          bestActions.Clear();
          bestQ = q;
        }
        if (q == bestQ) {
          bestActions.Add(aIdx);
        }
      }
      Debug.Assert(bestActions.Any());
      // Break ties among equally-valued actions uniformly at random.
      return bestActions.SelectRandom(random);
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return string.Format("UCTPolicy({0:F2})", c);
    }
  }
}
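
/*
  Usage sketch (illustrative, not part of the original file): a minimal bandit
  loop driven by this policy. PullArm is a hypothetical environment callback,
  and the UpdateReward(double) call on DefaultPolicyActionInfo is an assumption
  inferred from the Tries/SumReward statistics the policy reads above; verify
  both against the actual IBanditPolicyActionInfo API before use.

    var policy = new UCTPolicy(c: 1.0);
    var random = new Random(31415);
    var infos = Enumerable.Range(0, 10)
                          .Select(_ => policy.CreateActionInfo())
                          .ToArray();
    for (int step = 0; step < 1000; step++) {
      int a = policy.SelectAction(random, infos);
      double reward = PullArm(a); // hypothetical, environment-specific reward
      ((DefaultPolicyActionInfo)infos[a]).UpdateReward(reward); // assumed API
    }
*/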