using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits {
  /// <summary>
  /// Bandit policy that scores each action by its average observed reward plus an
  /// exploration bonus of <c>sqrt(2 * ln(totalTries) / tries[action])</c> (the UCB1 rule)
  /// and picks the action with the highest score.
  /// </summary>
  public class UCB1Policy : BanditPolicy {
    // Number of recorded rewards per action.
    private readonly int[] tries;
    // Accumulated reward per action.
    private readonly double[] sumReward;
    // Number of recorded rewards over all actions.
    private int totalTries = 0;

    /// <summary>
    /// Creates the policy and allocates zeroed per-action statistics.
    /// </summary>
    /// <param name="numActions">Number of actions; forwarded to the base constructor.</param>
    public UCB1Policy(int numActions)
      : base(numActions) {
      // NumActions is not declared in this class; presumably it is supplied by BanditPolicy.
      tries = new int[NumActions];
      sumReward = new double[NumActions];
    }

    /// <summary>
    /// Chooses the next action to play.
    /// </summary>
    /// <returns>
    /// The lowest-indexed action without any recorded reward if one exists; otherwise the
    /// action with the largest score (the lowest index wins ties). Returns -1 when no
    /// action produces a score greater than negative infinity, e.g. when there are no actions.
    /// </returns>
    public override int SelectAction() {
      // Untried actions take precedence; this also guarantees tries[arm] > 0 below.
      for (int arm = 0; arm < NumActions; arm++) {
        if (tries[arm] == 0) return arm;
      }

      // The numerator of the exploration bonus is the same for every action.
      double bonusNumerator = 2 * Math.Log(totalTries);

      int winner = -1;
      double winnerScore = double.NegativeInfinity;
      for (int arm = 0; arm < NumActions; arm++) {
        double averageReward = sumReward[arm] / tries[arm];
        double explorationBonus = Math.Sqrt(bonusNumerator / tries[arm]);
        double score = averageReward + explorationBonus;
        // Strict comparison keeps the earliest action on equal scores.
        if (score > winnerScore) {
          winner = arm;
          winnerScore = score;
        }
      }
      return winner;
    }

    /// <summary>
    /// Records a reward observed for the given action.
    /// </summary>
    /// <param name="action">Index of the action that was played.</param>
    /// <param name="reward">Reward obtained for that action.</param>
    public override void UpdateReward(int action, double reward) {
      // The global counter is advanced before the per-action statistics are touched.
      totalTries += 1;
      tries[action] += 1;
      sumReward[action] = sumReward[action] + reward;
    }

    /// <summary>
    /// Discards all recorded statistics so the policy behaves like a freshly constructed one.
    /// </summary>
    public override void Reset() {
      Array.Clear(sumReward, 0, sumReward.Length);
      Array.Clear(tries, 0, tries.Length);
      totalTries = 0;
    }
  }
}