using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits {
  /// <summary>
  /// Epsilon-greedy bandit policy. On each selection a value is drawn from the
  /// supplied random number generator; if it is greater than <c>eps</c> the action
  /// with the highest observed average reward is chosen, otherwise an action is
  /// drawn uniformly via <c>random.Next(NumActions)</c>.
  /// </summary>
  /// <remarks>
  /// <c>NumActions</c> and the overridden members come from <c>BanditPolicy</c>,
  /// which is declared outside this file.
  /// </remarks>
  public class EpsGreedyPolicy : BanditPolicy {
    private readonly Random random;
    private readonly double eps;
    // Per-action statistics, indexed by action number.
    private readonly int[] tries;
    private readonly double[] sumReward;

    /// <summary>
    /// Creates a policy with zeroed statistics for every action.
    /// </summary>
    /// <param name="random">Random number generator used for all draws; must not be null.</param>
    /// <param name="numActions">Number of actions; forwarded to the base class constructor.</param>
    /// <param name="eps">
    /// Exploration threshold compared against <c>random.NextDouble()</c>; larger values
    /// make random selection more frequent. The value is not range-checked.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="random"/> is null.</exception>
    public EpsGreedyPolicy(Random random, int numActions, double eps)
      : base(numActions) {
      // Fail fast here instead of with a NullReferenceException on the first SelectAction call.
      if (random == null) throw new ArgumentNullException("random");
      this.random = random;
      this.eps = eps;
      this.tries = new int[NumActions];
      this.sumReward = new double[NumActions];
    }

    /// <summary>
    /// Selects the next action to play.
    /// </summary>
    /// <returns>
    /// In the greedy branch: the lowest-indexed action that has never been tried if the scan
    /// reaches one; otherwise the action with the highest average reward (the lowest index
    /// wins ties). Returns -1 in the greedy branch when no action qualifies, i.e. when there
    /// are no actions or every average is NaN or negative infinity.
    /// In the random branch: the result of <c>random.Next(NumActions)</c>.
    /// </returns>
    public override int SelectAction() {
      if (random.NextDouble() > eps) {
        // Greedy branch: pick the action with the best average reward so far.
        var maxReward = double.NegativeInfinity;
        int bestAction = -1;
        for (int i = 0; i < NumActions; i++) {
          // An untried action has no average yet (division by zero); play it first.
          if (tries[i] == 0) return i;
          var avgReward = sumReward[i] / tries[i];
          // Strict comparison keeps the first of several equally good actions.
          if (maxReward < avgReward) {
            maxReward = avgReward;
            bestAction = i;
          }
        }
        return bestAction;
      } else {
        // Exploration branch: pick any action uniformly at random.
        return random.Next(NumActions);
      }
    }

    /// <summary>
    /// Records the reward observed for an action: increments its try count and adds the
    /// reward to its running sum.
    /// </summary>
    /// <param name="action">Index of the action that was played.</param>
    /// <param name="reward">Reward obtained for that action.</param>
    public override void UpdateReward(int action, double reward) {
      tries[action]++;
      sumReward[action] += reward;
    }

    /// <summary>
    /// Clears all per-action try counts and reward sums back to zero.
    /// </summary>
    public override void Reset() {
      Array.Clear(tries, 0, tries.Length);
      Array.Clear(sumReward, 0, sumReward.Length);
    }
  }
}