using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace HeuristicLab.Algorithms.Bandits {
  /// <summary>
  /// UCB1-Tuned policy (Auer, Cesa-Bianchi &amp; Fischer, 2002): a UCB1 variant whose
  /// exploration term is scaled by an upper confidence bound on each arm's reward
  /// variance, capped at 1/4 (the maximum variance of a [0,1]-bounded variable).
  /// Assumes rewards lie in [0,1] — TODO confirm against callers.
  /// </summary>
  public class UCB1TunedPolicy : BanditPolicy {
    private readonly int[] tries;           // pull count per arm; -1 marks a disabled arm
    private readonly double[] sumReward;    // sum of observed rewards per arm
    private readonly double[] sumSqrReward; // sum of squared rewards per arm (for variance)
    private int totalTries = 0;             // pulls across all currently enabled arms

    public UCB1TunedPolicy(int numActions)
      : base(numActions) {
      this.tries = new int[numActions];
      this.sumReward = new double[numActions];
      this.sumSqrReward = new double[numActions];
    }

    /// <summary>
    /// Variance upper-confidence bound V_j(s) for <paramref name="arm"/>:
    /// empirical variance plus the exploration bonus sqrt(2 ln(t) / s).
    /// Only called for arms with tries[arm] &gt; 0 (SelectAction returns early otherwise).
    /// </summary>
    private double V(int arm) {
      var s = tries[arm];
      var avg = sumReward[arm] / s;
      // avg * avg instead of Math.Pow(avg, 2): same value, no transcendental call.
      return sumSqrReward[arm] / s - avg * avg + Math.Sqrt(2 * Math.Log(totalTries) / s);
    }

    /// <summary>
    /// Returns the enabled arm with the highest UCB1-Tuned index; any arm that has
    /// never been tried is returned immediately (each arm is played once first).
    /// </summary>
    public override int SelectAction() {
      Debug.Assert(Actions.Any());
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      foreach (var a in Actions) {
        if (tries[a] == 0) return a; // play every arm once before applying the index
        // 1/4 is the maximum variance of a Bernoulli (more generally, [0,1]-bounded)
        // variable. Also clamp at 0: the empirical variance inside V(a) can come out
        // as a tiny negative number through floating-point cancellation (e.g. constant
        // rewards), which would make Math.Sqrt return NaN and silently exclude the arm.
        var varBound = Math.Max(0.0, Math.Min(1.0 / 4, V(a)));
        var q = sumReward[a] / tries[a]
              + Math.Sqrt((Math.Log(totalTries) / tries[a]) * varBound);
        if (q > bestQ) {
          bestQ = q;
          bestAction = a;
        }
      }
      return bestAction;
    }

    /// <summary>Records one pull of <paramref name="action"/> with the observed reward.</summary>
    public override void UpdateReward(int action, double reward) {
      Debug.Assert(Actions.Contains(action));
      totalTries++;
      tries[action]++;
      sumReward[action] += reward;
      sumSqrReward[action] += reward * reward;
    }

    /// <summary>Disables an arm and removes its statistics from the totals.</summary>
    public override void DisableAction(int action) {
      base.DisableAction(action);
      totalTries -= tries[action]; // disabled pulls no longer count toward ln(t)
      tries[action] = -1;          // sentinel: arm is disabled
      sumReward[action] = 0;
      sumSqrReward[action] = 0;
    }

    /// <summary>Clears all statistics so the policy starts from scratch.</summary>
    public override void Reset() {
      base.Reset();
      totalTries = 0;
      Array.Clear(tries, 0, tries.Length);
      Array.Clear(sumReward, 0, sumReward.Length);
      Array.Clear(sumSqrReward, 0, sumSqrReward.Length);
    }

    /// <summary>Writes each arm's average reward (blank for untried or disabled arms).</summary>
    public override void PrintStats() {
      for (int i = 0; i < sumReward.Length; i++) {
        // Original condition was tries[i] >= 0, which printed "NaN" (0/0) for an
        // enabled-but-untried arm; > 0 prints a blank column for both untried and
        // disabled arms instead.
        if (tries[i] > 0) {
          Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
        } else {
          Console.Write("{0,5}", "");
        }
      }
      Console.WriteLine();
    }

    public override string ToString() {
      return "UCB1TunedPolicy";
    }
  }
}