using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  // Reference: Cicirello and Smith, The Max K-armed Bandit: A New Model of Exploration Applied to Search Heuristic Selection, AAAI 2005
  // uses an exponentially decreasing cooling schedule
  public class BoltzmannExplorationWithCoolingPolicy : IBanditPolicy {
    private readonly double beta;

    public BoltzmannExplorationWithCoolingPolicy(double beta) {
      if (beta < 0) throw new ArgumentException("beta must be non-negative", "beta");
      this.beta = beta;
    }

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      Debug.Assert(actionInfos.Any());

      // only the default action statistics carry the Tries / MaxReward values needed below
      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();

      // try any of the untried actions randomly
      // for RoyalSequence it is much better to select the actions in the order of occurrence (all terminal alternatives first)
      if (myActionInfos.Any(aInfo => aInfo.Tries == 0)) {
        return myActionInfos
          .Select((aInfo, idx) => new { aInfo, idx })
          .Where(p => p.aInfo.Tries == 0)
          .SelectRandom(random).idx;
      }

      var totalTries = myActionInfos.Sum(i => i.Tries);
      if (totalTries > 10000) {
        // the schedule is essentially frozen: exploit the arm with the best observed maximum reward
        return myActionInfos
          .Select((aInfo, idx) => new { aInfo, idx })
          .OrderByDescending(t => t.aInfo.MaxReward)
          .First().idx;
      }

      // Boltzmann weights with temperature T = exp(-totalTries / 2000): T decreases towards 0,
      // so the selection gradually shifts from exploration to exploitation of the best arm
      var w = from aInfo in myActionInfos
              let q = aInfo.MaxReward // this should be an estimator for the expected maximum of the distribution
              select Math.Exp(beta * q / Math.Exp(-totalTries / 2000.0));

      var bestAction = Enumerable.Range(0, myActionInfos.Count()).SampleProportional(random, w);
      Debug.Assert(bestAction >= 0);
      return bestAction;
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new DefaultPolicyActionInfo();
    }

    public override string ToString() {
      return string.Format("BoltzmannExplorationWithCoolingPolicy({0:F2})", beta);
    }
  }
}
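
// Usage sketch (illustrative only): a minimal pull loop, assuming a consumer creates one
// IBanditPolicyActionInfo per arm via CreateActionInfo() and records observed rewards on it.
// SampleReward and the UpdateReward call below are assumptions for illustration, not part of this file.
//
//   var policy = new BoltzmannExplorationWithCoolingPolicy(1.0);
//   var rnd = new Random(31415);
//   var arms = Enumerable.Range(0, 10).Select(_ => policy.CreateActionInfo()).ToArray();
//   for (int step = 0; step < 1000; step++) {
//     int arm = policy.SelectAction(rnd, arms);
//     double reward = SampleReward(arm);   // hypothetical reward source for the chosen arm
//     arms[arm].UpdateReward(reward);      // assuming IBanditPolicyActionInfo exposes UpdateReward(double)
//   }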