using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  /* see: Streeter and Smith: A simple distribution-free approach to the max k-armed bandit problem, Proceedings  of the 12th
 International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */

  public class ThresholdAscentPolicy : IBanditPolicy {
    public const int numBins = 101;
    public const double binSize = 1.0 / (numBins - 1);

    private class ThresholdAscentActionInfo : IBanditPolicyActionInfo {

      // for each arm store the number of observed rewards for each bin of size delta
      // for delta = 0.01 we have 101 bins
      // the first bin is freq of rewards  >= 0 // all
      // the second bin is freq of rewards > 0
      // the third bin is freq of rewards > 0.01
      // the last bin is for rewards > 0.99
      // 
      // (also see RewardBin function)
      public int[] rewardHistogram = new int[numBins];    // for performance reasons we store cumulative counts (freq of rewards > lower threshold)
      public int Tries { get; private set; }
      public int thresholdBin = 1;
      public double Value { get { return rewardHistogram[thresholdBin] / (double)Tries; } }

      public bool Disabled { get { return Tries == -1; } }

      public void UpdateReward(double reward) {
        Tries++;
        for (var idx = thresholdBin; idx <= RewardBin(reward); idx++)
          rewardHistogram[idx]++;
      }

      public void Disable() {
        Tries = -1;
      }

      public void Reset() {
        Tries = 0;
        thresholdBin = 1;
        Array.Clear(rewardHistogram, 0, rewardHistogram.Length);
      }

      public void PrintStats() {
        if (Tries >= 0) {
          Console.Write("{0,6}", Tries);
        } else {
          Console.Write("{0,6}", "");
        }
      }

      // maps a reward value to it's bin
      private static int RewardBin(double reward) {
        Debug.Assert(reward >= 0 && reward <= 1.0);
        // reward = 0 => 0
        // ]0.00 .. 0.01] => 1
        // ]0.01 .. 0.02] => 2
        // ...
        // ]0.99 .. 1.00] => 100
        if (reward <= 0) return 0;
        return (int)Math.Ceiling((reward / binSize));
      }
    }

    private readonly int s;
    private readonly double delta;

    public ThresholdAscentPolicy(int s = 100, double delta = 0.05) {
      this.s = s;
      this.delta = delta;
    }

    private double U(double mu, int totalTries, int n, int k) {
      //var alpha = Math.Log(2.0 * totalTries * k / delta);
      double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
      return mu + (alpha + Math.Sqrt(2 * n * mu * alpha + alpha * alpha)) / n;
    }


    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());
      var myActionInfos = actionInfos.OfType<ThresholdAscentActionInfo>();
      UpdateThreshold(myActionInfos);

      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      int k = myActionInfos.Count(a => !a.Disabled);
      var totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
      int aIdx = -1;
      foreach (var aInfo in myActionInfos) {
        aIdx++;
        if (aInfo.Disabled) continue;
        if (aInfo.Tries == 0) return aIdx;
        double mu = aInfo.Value; // probability of rewards > T 
        double q = U(mu, totalTries, aInfo.Tries, k);          // totalTries is max iterations in original paper
        if (q > bestQ) {
          bestQ = q;
          bestAction = aIdx;
        }
      }
      Debug.Assert(bestAction > -1);
      return bestAction;
    }


    private void UpdateThreshold(IEnumerable<ThresholdAscentActionInfo> actionInfos) {
      var thresholdBin = 1; // first bin to check is bin idx 1 == freq of rewards > 0
      while (thresholdBin < (numBins - 1) && actionInfos.Sum(a => a.rewardHistogram[thresholdBin]) >= s) {
        thresholdBin++;
        // Console.WriteLine("New threshold {0:F2}", T);
      }
      foreach (var aInfo in actionInfos) {
        aInfo.thresholdBin = thresholdBin;
      }
    }


    public IBanditPolicyActionInfo CreateActionInfo() {
      return new ThresholdAscentActionInfo();
    }

    public override string ToString() {
      return string.Format("ThresholdAscentPolicy({0},{1:F2})", s, delta);
    }

  }
}