using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  [Obsolete("Replaced by GenericThompsonSamplingPolicy(GaussianModel(0.5, 1.0, 0.1))")]
  public class GaussianThompsonSamplingPolicy : IBanditPolicy {
    private readonly bool compatibility;

    // assumes a Gaussian reward distribution with different means but the same variance for each action;
    // the prior for the mean is also Gaussian with the following parameters
    private readonly double rewardVariance = 0.1; // we assume a known reward variance
    private readonly double priorMean = 0.5;
    private readonly double priorVariance = 1;

    public GaussianThompsonSamplingPolicy(bool compatibility = false) {
      this.compatibility = compatibility;
    }

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      int aIdx = -1;
      foreach (var aInfo in myActionInfos) {
        aIdx++;
        var tries = aInfo.Tries;
        var sampleMean = aInfo.AvgReward;
        var sampleVariance = aInfo.RewardVariance;

        double theta;
        if (compatibility) {
          // old code used for old experiments (preserved because it performed very well)
          if (tries < 2) return aIdx;
          var mu = sampleMean;
          var variance = sampleVariance;
          var stdDev = Math.Sqrt(variance);
          theta = Rand.RandNormal(random) * stdDev + mu;
        } else {
          // calculate posterior mean and variance (for the mean reward)
          // see Murphy 2007: Conjugate Bayesian analysis of the Gaussian distribution (http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf)
          var posteriorVariance = 1.0 / (tries / rewardVariance + 1.0 / priorVariance);
          var posteriorMean = posteriorVariance * (priorMean / priorVariance + tries * sampleMean / rewardVariance);

          // sample a mean from the posterior
          theta = Rand.RandNormal(random) * Math.Sqrt(posteriorVariance) + posteriorMean;
          // theta already represents the expected reward value => nothing else to do
        }

        // ties are very unlikely to occur (and we don't care)
        if (theta > bestQ) {
          bestQ = theta;
          bestAction = aIdx;
        }
      }
      Debug.Assert(bestAction > -1);
      return bestAction;
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new MeanAndVariancePolicyActionInfo();
    }

    //public override void UpdateReward(int action, double reward) {
    //  Debug.Assert(Actions.Contains(action));
    //  tries[action]++;
    //  var delta = reward - sampleMean[action];
    //  sampleMean[action] += delta / tries[action];
    //  sampleM2[action] += sampleM2[action] + delta * (reward - sampleMean[action]);
    //}

    public override string ToString() {
      return "GaussianThompsonSamplingPolicy";
    }
  }
}
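
// Usage sketch (illustrative only): a minimal bandit loop driving this policy with three actions.
// It assumes that MeanAndVariancePolicyActionInfo exposes an UpdateReward(double) method for
// recording observed rewards (suggested by the commented-out update logic above) and that
// Rand.RandNormal from HeuristicLab.Common returns a standard-normal sample; both are assumptions
// rather than a verified API, and the reward-generating line is a hypothetical stand-in environment.
//
// var policy = new GaussianThompsonSamplingPolicy();
// var actionInfos = Enumerable.Range(0, 3)
//   .Select(_ => policy.CreateActionInfo())
//   .ToArray();
// var random = new Random(31415);
// for (int step = 0; step < 1000; step++) {
//   int a = policy.SelectAction(random, actionInfos);
//   // hypothetical environment: Gaussian rewards with a per-action mean and known variance
//   double reward = 0.1 * a + Math.Sqrt(0.1) * Rand.RandNormal(random);
//   ((MeanAndVariancePolicyActionInfo)actionInfos[a]).UpdateReward(reward);
// }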