using System;
using System.Linq;
using System.Collections.Generic;
using System.Globalization;
using HeuristicLab.Algorithms.Bandits;
using HeuristicLab.Algorithms.Bandits.BanditPolicies;
using HeuristicLab.Algorithms.Bandits.Models;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace HeuristicLab.Problems.GrammaticalOptimization.Test {
  [TestClass]
  public class TestBanditPolicies {

    [TestMethod]
    public void ComparePoliciesForGaussianUnknownVarianceBandit() {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
      var randSeed = 31415;
      var nArms = 20;
      // some of the policies are specific to rewards in [0..1], e.g. Threshold Ascent or UCB1
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new ExtremeHunterPolicy());
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new IntervalEstimationPolicy());
      //TestPolicyGaussianUnknownVariance(randSeed, nArms, new UCBPolicy(10));
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new UCBNormalPolicy());
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new UCB1TunedPolicy());
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new UCB1Policy(10));
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new ActiveLearningPolicy(10));
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new ChernoffIntervalEstimationPolicy());
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new BoltzmannExplorationPolicy(100));
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new EpsGreedyPolicy(0.1));
      TestPolicyGaussianUnknownVariance(randSeed, nArms, new RandomPolicy());
    }

    [TestMethod]
    // test case I as described in Extreme Bandits paper
    public void ComparePoliciesExtremeBandits1() {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
      var randSeed = 31415;
      TestPolicyExtremeBandit1(randSeed, new RandomPolicy());
      TestPolicyExtremeBandit1(randSeed, new SingleArmPolicy(1));
      TestPolicyExtremeBandit1(randSeed, new ExtremeHunterPolicy());
      TestPolicyExtremeBandit1(randSeed, new UCB1Policy(10000));
      TestPolicyExtremeBandit1(randSeed, new UCB1Policy(1000));
      TestPolicyExtremeBandit1(randSeed, new UCB1Policy(100));
      TestPolicyExtremeBandit1(randSeed, new UCB1Policy(10));
      TestPolicyExtremeBandit1(randSeed, new UCB1Policy(2));
      TestPolicyExtremeBandit1(randSeed, new UCB1Policy(1));
      TestPolicyExtremeBandit1(randSeed, new UCB1Policy(0.5));
      TestPolicyExtremeBandit1(randSeed, new UCB1Policy(0.1));
      TestPolicyExtremeBandit1(randSeed, new EpsGreedyPolicy(0.1));
      TestPolicyExtremeBandit1(randSeed, new EpsGreedyPolicy(0.05));
      TestPolicyExtremeBandit1(randSeed, new EpsGreedyPolicy(0.01));
    }

    [TestMethod]
    // test case II as described in Extreme Bandits paper
    public void ComparePoliciesExtremeBandits2() {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
      var randSeed = 31415;
      //TestPolicyExtremeBandit2(randSeed, new RandomPolicy());
      //TestPolicyExtremeBandit2(randSeed, new SingleArmPolicy(0));
      //TestPolicyExtremeBandit2(randSeed, new SingleArmPolicy(1));
      //TestPolicyExtremeBandit2(randSeed, new SingleArmPolicy(2));
      // TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy());
      // sweep the ExtremeHunter parameter D with minPulls: 30
      TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy(D: 1, minPulls: 30));
      TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy(D: 2, minPulls: 30));
      TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy(D: 0.5, minPulls: 30));
      TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy(D: 5, minPulls: 30));
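      // same sweep of D with minPulls raised to 100 (minPulls presumably sets the minimum number
      // of initial pulls per arm before the ExtremeHunter index is used)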
      TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy(D: 1, minPulls: 100));
      TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy(D: 2, minPulls: 100));
      TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy(D: 0.5, minPulls: 100));
      TestPolicyExtremeBandit2(randSeed, new ExtremeHunterPolicy(D: 5, minPulls: 100));
      // TestPolicyExtremeBandit2(randSeed, new UCB1Policy(10000));
      //TestPolicyExtremeBandit2(randSeed, new UCB1Policy(1000));
      //TestPolicyExtremeBandit2(randSeed, new UCB1Policy(100));
      //TestPolicyExtremeBandit2(randSeed, new UCB1Policy(10));
      //TestPolicyExtremeBandit2(randSeed, new UCB1Policy(2));
      //TestPolicyExtremeBandit2(randSeed, new UCB1Policy(1));
      //TestPolicyExtremeBandit2(randSeed, new UCB1Policy(0.5));
      //TestPolicyExtremeBandit2(randSeed, new UCB1Policy(0.1));
      //TestPolicyExtremeBandit2(randSeed, new EpsGreedyPolicy(0.1));
      //TestPolicyExtremeBandit2(randSeed, new EpsGreedyPolicy(0.05));
      //TestPolicyExtremeBandit2(randSeed, new EpsGreedyPolicy(0.01));
      //TestPolicyExtremeBandit2(randSeed, new ThresholdAscentPolicy());
    }

    [TestMethod]
    // my own test case for ExtremeHunter
    // using truncated normal distributions
    public void ComparePoliciesExtremeBandits3() {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
      var randSeed = 31415;
      TestPolicyExtremeBandit3(randSeed, new RandomPolicy());
      TestPolicyExtremeBandit3(randSeed, new SingleArmPolicy(0));
      TestPolicyExtremeBandit3(randSeed, new SingleArmPolicy(1));
      TestPolicyExtremeBandit3(randSeed, new SingleArmPolicy(2));
      TestPolicyExtremeBandit3(randSeed, new ExtremeHunterPolicy());
      TestPolicyExtremeBandit3(randSeed, new UCB1Policy(3));
      TestPolicyExtremeBandit3(randSeed, new EpsGreedyPolicy(0.1));
    }

    [TestMethod]
    // a unit test to experiment with bandit policies for completing a GP sentence
    public void ComparePoliciesSentenceCompletionProblem() {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
      var randSeed = 31415;
      // factory: Random -> bandit (IBandit is assumed here as the common interface of the bandit classes)
      Func<Random, IBandit> sentenceCompletionBanditFactory = (banditRandom) => {
        var problem = new SymbolicRegressionPoly10Problem();
        return new SentenceBandit(banditRandom, problem, "a*b+c*d+e*f+E", 23);
      }; // ignore number of arms

      // var b = sentenceCompletionBanditFactory(new Random());
      // all reference policies (always pulling one arm)
      // for (int i = 0; i < b.NumArms; i++) {
      //   TestPolicy(randSeed, new SingleArmPolicy(i), sentenceCompletionBanditFactory);
      // }

      // for the completion of a*b+c*d+e*f+a*g*i+E the arms 12, 15, and 19 are optimal
      TestPolicy(randSeed, new SingleArmPolicy(12), sentenceCompletionBanditFactory);
      TestPolicy(randSeed, new RandomPolicy(), sentenceCompletionBanditFactory);
      TestPolicy(randSeed, new ExtremeHunterPolicy(), sentenceCompletionBanditFactory);
      TestPolicy(randSeed, new ExtremeHunterPolicy(D: 0.5), sentenceCompletionBanditFactory);
      TestPolicy(randSeed, new UCB1Policy(3), sentenceCompletionBanditFactory);
      TestPolicy(randSeed, new UCB1Policy(1), sentenceCompletionBanditFactory);
      TestPolicy(randSeed, new UCB1Policy(0.5), sentenceCompletionBanditFactory);
      TestPolicy(randSeed, new ThresholdAscentPolicy(), sentenceCompletionBanditFactory);
      TestPolicy(randSeed, new EpsGreedyPolicy(0.1), sentenceCompletionBanditFactory);
    }

    [TestMethod]
    public void ComparePoliciesForBernoulliBandit() {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
      var randSeed = 31415;
      var nArms = 20;
      //Console.WriteLine("Exp3 (gamma=0.01)");
      //TestPolicyBernoulli(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 1));
      //Console.WriteLine("Exp3 (gamma=0.05)");
      //TestPolicyBernoulli(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 1));
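      // every call below delegates to TestPolicy, which prints one semicolon-separated result line
      // per policy every 500 pulls (see TestPolicy at the end of this class)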
      Console.WriteLine("Thompson (Bernoulli)"); TestPolicyBernoulli(randSeed, nArms, new BernoulliThompsonSamplingPolicy());
      Console.WriteLine("Generic Thompson (Bernoulli)"); TestPolicyBernoulli(randSeed, nArms, new GenericThompsonSamplingPolicy(new BernoulliModel()));
      Console.WriteLine("Random"); TestPolicyBernoulli(randSeed, nArms, new RandomPolicy());
      Console.WriteLine("UCB1"); TestPolicyBernoulli(randSeed, nArms, new UCB1Policy());
      Console.WriteLine("UCB1Tuned"); TestPolicyBernoulli(randSeed, nArms, new UCB1TunedPolicy());
      Console.WriteLine("UCB1Normal"); TestPolicyBernoulli(randSeed, nArms, new UCBNormalPolicy());
      Console.WriteLine("Eps(0.01)"); TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.01));
      Console.WriteLine("Eps(0.05)"); TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.05));
      //Console.WriteLine("Eps(0.1)"); //TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.1));
      //Console.WriteLine("Eps(0.2)"); //TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.2));
      //Console.WriteLine("Eps(0.5)"); //TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.5));
      Console.WriteLine("UCT(0.1)"); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(0.1));
      Console.WriteLine("UCT(0.5)"); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(0.5));
      Console.WriteLine("UCT(1) "); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(1));
      Console.WriteLine("UCT(2) "); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(2));
      Console.WriteLine("UCT(5) "); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(5));
      Console.WriteLine("BoltzmannExploration(0.1)"); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(0.1));
      Console.WriteLine("BoltzmannExploration(0.5)"); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(0.5));
      Console.WriteLine("BoltzmannExploration(1) "); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(1));
      Console.WriteLine("BoltzmannExploration(10) "); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(10));
      Console.WriteLine("BoltzmannExploration(100)"); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(100));
      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.01)"); TestPolicyBernoulli(randSeed, nArms, new ChernoffIntervalEstimationPolicy(0.01));
      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.05)"); TestPolicyBernoulli(randSeed, nArms, new ChernoffIntervalEstimationPolicy(0.05));
      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.1) "); TestPolicyBernoulli(randSeed, nArms, new ChernoffIntervalEstimationPolicy(0.1));

      // not applicable to bernoulli rewards
      //Console.WriteLine("ThresholdAscent(10, 0.01) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01));
      //Console.WriteLine("ThresholdAscent(10, 0.05) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.05));
      //Console.WriteLine("ThresholdAscent(10, 0.1) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.1));
      //Console.WriteLine("ThresholdAscent(100, 0.01) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01));
      //Console.WriteLine("ThresholdAscent(100, 0.05) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.05));
      //Console.WriteLine("ThresholdAscent(100, 0.1) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.1));
      //Console.WriteLine("ThresholdAscent(1000, 0.01)"); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01));
      //Console.WriteLine("ThresholdAscent(1000, 0.05)"); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.05));
      //Console.WriteLine("ThresholdAscent(1000, 0.1) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.1));
    }

    [TestMethod]
    public void ComparePoliciesForGaussianBandit() {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
      var randSeed = 31415;
      var nArms = 20;
      Console.WriteLine("Threshold Ascent (20)"); TestPolicyGaussian(randSeed, nArms, new ThresholdAscentPolicy(20, 0.01));
      Console.WriteLine("Threshold Ascent (100)"); TestPolicyGaussian(randSeed, nArms, new ThresholdAscentPolicy(100, 0.01));
      Console.WriteLine("Threshold Ascent (500)"); TestPolicyGaussian(randSeed, nArms, new ThresholdAscentPolicy(500, 0.01));
      Console.WriteLine("Threshold Ascent (1000)"); TestPolicyGaussian(randSeed, nArms, new ThresholdAscentPolicy(1000, 0.01));
      Console.WriteLine("Generic Thompson (Gaussian fixed var)"); TestPolicyGaussian(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 1)));
      Console.WriteLine("Generic Thompson (Gaussian unknown var)"); TestPolicyGaussian(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 1, 1, 1)));
      Console.WriteLine("Thompson (Gaussian orig)"); TestPolicyGaussian(randSeed, nArms, new GaussianThompsonSamplingPolicy(true));
      Console.WriteLine("Thompson (Gaussian new)"); TestPolicyGaussian(randSeed, nArms, new GaussianThompsonSamplingPolicy());
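      // the comparisons below are disabled; they still use an older policy API (policies received the
      // number of arms and, for randomized policies, their own Random instance in the constructor)
      // and call a TestPolicyNormal helper that is no longer defined in this class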
      /*
      Console.WriteLine("Random"); TestPolicyNormal(randSeed, nArms, new RandomPolicy(new Random(seedForPolicy), nArms));
      Console.WriteLine("UCB1"); TestPolicyNormal(randSeed, nArms, new UCB1Policy(nArms));
      Console.WriteLine("UCB1Tuned"); TestPolicyNormal(randSeed, nArms, new UCB1TunedPolicy(nArms));
      Console.WriteLine("UCB1Normal"); TestPolicyNormal(randSeed, nArms, new UCBNormalPolicy(nArms));
      //Console.WriteLine("Exp3 (gamma=0.01)"); //TestPolicyNormal(randSeed, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 0.01));
      //Console.WriteLine("Exp3 (gamma=0.05)"); //TestPolicyNormal(randSeed, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 0.05));
      Console.WriteLine("Eps(0.01)"); TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01));
      Console.WriteLine("Eps(0.05)"); TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05));
      //Console.WriteLine("Eps(0.1)"); //TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.1));
      //Console.WriteLine("Eps(0.2)"); //TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.2));
      //Console.WriteLine("Eps(0.5)"); //TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.5));
      Console.WriteLine("UCT(0.1)"); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 0.1));
      Console.WriteLine("UCT(0.5)"); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 0.5));
      Console.WriteLine("UCT(1) "); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 1));
      Console.WriteLine("UCT(2) "); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 2));
      Console.WriteLine("UCT(5) "); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 5));
      Console.WriteLine("BoltzmannExploration(0.1)"); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.1));
      Console.WriteLine("BoltzmannExploration(0.5)"); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.5));
      Console.WriteLine("BoltzmannExploration(1) "); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1));
      Console.WriteLine("BoltzmannExploration(10) "); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10));
      Console.WriteLine("BoltzmannExploration(100)"); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100));
      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.01)"); TestPolicyNormal(randSeed, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.01));
      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.05)"); TestPolicyNormal(randSeed, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.05));
      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.1) "); TestPolicyNormal(randSeed, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.1));
      Console.WriteLine("ThresholdAscent(10,0.01) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01));
      Console.WriteLine("ThresholdAscent(10,0.05) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10, 0.05));
      Console.WriteLine("ThresholdAscent(10,0.1) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10, 0.1));
      Console.WriteLine("ThresholdAscent(100,0.01) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01));
      Console.WriteLine("ThresholdAscent(100,0.05) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 100, 0.05));
      Console.WriteLine("ThresholdAscent(100,0.1) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 100, 0.1));
      Console.WriteLine("ThresholdAscent(1000,0.01)"); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01));
      Console.WriteLine("ThresholdAscent(1000,0.05)"); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.05));
      Console.WriteLine("ThresholdAscent(1000,0.1) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.1));
      */
    }

    [TestMethod]
    public void ComparePoliciesForGaussianMixtureBandit() {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
      var randSeed = 31415;
      var nArms = 20;
      Console.WriteLine("Generic Thompson (Gaussian Mixture)"); TestPolicyGaussianMixture(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianMixtureModel()));
      // Console.WriteLine("Threshold Ascent (20)"); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(20, 0.01));
      // Console.WriteLine("Threshold Ascent (100)"); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(100, 0.01));
      // Console.WriteLine("Threshold Ascent (500)"); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(500, 0.01));
      // Console.WriteLine("Threshold Ascent (1000)"); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(1000, 0.01));
      // Console.WriteLine("Thompson (Gaussian orig)"); TestPolicyGaussianMixture(randSeed, nArms, new GaussianThompsonSamplingPolicy(true));
      // Console.WriteLine("Thompson (Gaussian new)"); TestPolicyGaussianMixture(randSeed, nArms, new GaussianThompsonSamplingPolicy());
      // Console.WriteLine("Generic Thompson (Gaussian fixed variance)"); TestPolicyGaussianMixture(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 1, 0.1)));
      // Console.WriteLine("Generic Thompson (Gaussian unknown variance)"); TestPolicyGaussianMixture(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 1, 1, 1)));
      /*
      Console.WriteLine("Random"); TestPolicyGaussianMixture(randSeed, nArms, new RandomPolicy(new Random(seedForPolicy), nArms));
      Console.WriteLine("UCB1"); TestPolicyGaussianMixture(randSeed, nArms, new UCB1Policy(nArms));
      Console.WriteLine("UCB1Tuned "); TestPolicyGaussianMixture(randSeed, nArms, new UCB1TunedPolicy(nArms));
      Console.WriteLine("UCB1Normal"); TestPolicyGaussianMixture(randSeed, nArms, new UCBNormalPolicy(nArms));
      Console.WriteLine("Eps(0.01) "); TestPolicyGaussianMixture(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01));
      Console.WriteLine("Eps(0.05) "); TestPolicyGaussianMixture(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05));
      Console.WriteLine("UCT(1) "); TestPolicyGaussianMixture(randSeed, nArms, new UCTPolicy(nArms, 1));
      Console.WriteLine("UCT(2) "); TestPolicyGaussianMixture(randSeed, nArms, new UCTPolicy(nArms, 2));
      Console.WriteLine("UCT(5) "); TestPolicyGaussianMixture(randSeed, nArms, new UCTPolicy(nArms, 5));
      Console.WriteLine("BoltzmannExploration(1) "); TestPolicyGaussianMixture(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1));
      Console.WriteLine("BoltzmannExploration(10) "); TestPolicyGaussianMixture(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10));
      Console.WriteLine("BoltzmannExploration(100)"); TestPolicyGaussianMixture(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100));
      Console.WriteLine("ThresholdAscent(10,0.01) "); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01));
      Console.WriteLine("ThresholdAscent(100,0.01) "); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01));
      Console.WriteLine("ThresholdAscent(1000,0.01)"); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01));
      Console.WriteLine("ThresholdAscent(10000,0.01)"); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10000, 0.01));
      */
    }

    private void TestPolicyBernoulli(int randSeed, int nArms, IBanditPolicy policy) {
      TestPolicy(randSeed, policy, (banditRandom) => new BernoulliBandit(banditRandom, nArms));
    }
    private void TestPolicyGaussian(int randSeed, int nArms, IBanditPolicy policy) {
      TestPolicy(randSeed, policy, (banditRandom) => new TruncatedNormalBandit(banditRandom, nArms));
    }
    private void TestPolicyGaussianMixture(int randSeed, int nArms, IBanditPolicy policy) {
      TestPolicy(randSeed, policy, (banditRandom) => new GaussianMixtureBandit(banditRandom, nArms));
    }
    private void TestPolicyGaussianUnknownVariance(int randSeed, int nArms, IBanditPolicy policy) {
      TestPolicy(randSeed, policy, (banditRandom) => new GaussianBandit(banditRandom, nArms, 0, 10));
    }
    private void TestPolicyExtremeBandit1(int randSeed, IBanditPolicy policy) {
      TestPolicy(randSeed, policy, (banditRandom) => new ParetoBandit(banditRandom, new double[] { 5, 1.1, 2 }));
    }
    private void TestPolicyExtremeBandit2(int randSeed, IBanditPolicy policy) {
      TestPolicy(randSeed, policy, (banditRandom) => new ParetoBandit(banditRandom, new double[] { 1.5, 1.1, 3 }, new double[] { 0.0, 0.8, 0.0 }, 0, 1));
    }
    private void TestPolicyExtremeBandit3(int randSeed, IBanditPolicy policy) {
      // the exp/var comments match a (shape, scale) parameterization: mean = shape*scale, variance = shape*scale^2;
      // the trailing arguments (1, 2) presumably mark arm 1 as optimal in expectation and arm 2 (largest scale,
      // hence heaviest upper tail) as optimal for maximal rewards
      TestPolicy(randSeed, policy, (banditRandom) => new Bandit(banditRandom, new IModel[] {
        new GammaModel(10, 1), // exp=10, var=10
        new GammaModel(6, 2),  // exp=12, var=24
        new GammaModel(3, 3),  // exp= 9, var=27
      }, 1, 2));
    }
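    // Driver used by all tests above: runs 'reps' (30) independent repetitions in which the given
    // policy pulls the bandit produced by banditFactory for roughly 'maxIt' (1E5) iterations.
    // The bandit and the policy draw from separate Random instances derived from randSeed, so every
    // policy is evaluated against the same reward-generating process. Every 500 pulls one
    // semicolon-separated line is printed: policy; iteration; total reward; pulls of the arm with the
    // best expected reward; pulls of the arm with the best maximal reward; best reward seen so far;
    // average reward; and the fractions of pulls spent on the two optimal arms.
    // (IBandit is assumed here as the common interface of the bandit classes used above.)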
    private void TestPolicy(int randSeed, IBanditPolicy policy, Func<Random, IBandit> banditFactory) {
      var maxIt = 1E5;
      var reps = 30; // independent runs
      //var regretForIteration = new Dictionary<int, List<double>>();
      //var numberOfPullsOfSuboptimalArmsForExp = new Dictionary<int, double>();
      //var numberOfPullsOfSuboptimalArmsForMax = new Dictionary<int, double>();
      //var bestRewardForIteration = new Dictionary<int, List<double>>();
      var globalRandom = new Random(randSeed);
      var banditRandom = new Random(globalRandom.Next()); // bandits must produce the same rewards for each test
      var policyRandom = new Random(globalRandom.Next());

      // calculate statistics
      for (int r = 0; r < reps; r++) {
        var nextLogStep = 1;
        var b = banditFactory(banditRandom);
        var totalReward = 0.0;
        int totalPullsOfOptimalArmExp = 0;
        int totalPullsOfOptimalArmMax = 0;
        var maxReward = double.NegativeInfinity;
        var actionInfos = Enumerable.Range(0, b.NumArms).Select(_ => policy.CreateActionInfo()).ToArray();
        for (int i = 0; i <= maxIt + 1; i++) {
          var selectedAction = policy.SelectAction(policyRandom, actionInfos);
          var reward = b.Pull(selectedAction);
          actionInfos[selectedAction].UpdateReward(reward);

          // collect stats
          if (selectedAction == b.OptimalExpectedRewardArm) totalPullsOfOptimalArmExp++;
          if (selectedAction == b.OptimalMaximalRewardArm) totalPullsOfOptimalArmMax++;
          totalReward += reward;
          maxReward = Math.Max(maxReward, reward);

          if (i == nextLogStep) {
            nextLogStep += 500;
            //if (!regretForIteration.ContainsKey(i)) {
            //  regretForIteration.Add(i, new List<double>());
            //}
            //regretForIteration[i].Add(totalRegret / i);
            //
            //if (!numberOfPullsOfSuboptimalArmsForExp.ContainsKey(i)) {
            //  numberOfPullsOfSuboptimalArmsForExp.Add(i, 0.0);
            //}
            //numberOfPullsOfSuboptimalArmsForExp[i] += totalPullsOfSuboptimalArmsExp;
            //
            //if (!numberOfPullsOfSuboptimalArmsForMax.ContainsKey(i)) {
            //  numberOfPullsOfSuboptimalArmsForMax.Add(i, 0.0);
            //}
            //numberOfPullsOfSuboptimalArmsForMax[i] += totalPullsOfSuboptimalArmsMax;
            //
            //if (!bestRewardForIteration.ContainsKey(i)) {
            //  bestRewardForIteration.Add(i, new List<double>());
            //}
            //bestRewardForIteration[i].Add(bestReward);

            Console.WriteLine("{0};{1,8};{2,7:F5};{3,7:F2};{4,7:F2};{5:F2};{6:F2};{7:F2};{8:F2}",
              policy, i, totalReward, totalPullsOfOptimalArmExp, totalPullsOfOptimalArmMax, maxReward,
              totalReward / i, totalPullsOfOptimalArmExp / (double)i, totalPullsOfOptimalArmMax / (double)i);
          }
        }
      }

      // print
      //foreach (var p in regretForIteration.Keys.OrderBy(k => k)) {
      //  Console.WriteLine("iter {0,8} regret avg {1,7:F5} min {2,7:F5} max {3,7:F5} suboptimal pulls (exp) {4,7:F2} suboptimal pulls (max) {5,7:F2} max rewards: {6}",
      //    p,
      //    regretForIteration[p].Average(),
      //    regretForIteration[p].Min(),
      //    regretForIteration[p].Max(),
      //    numberOfPullsOfSuboptimalArmsForExp[p] / (double)reps,
      //    numberOfPullsOfSuboptimalArmsForMax[p] / (double)reps,
      //    string.Join(" ", bestRewardForIteration[p])
      //  );
      //}
    }
  }
}