using System;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Algorithms.Bandits;
using HeuristicLab.Algorithms.Bandits.BanditPolicies;
using HeuristicLab.Algorithms.Bandits.GrammarPolicies;
using HeuristicLab.Algorithms.Bandits.Models;
using HeuristicLab.Algorithms.GrammaticalOptimization;
using HeuristicLab.Problems.GrammaticalOptimization;
using HeuristicLab.Problems.GrammaticalOptimization.SymbReg;
using BoltzmannExplorationPolicy = HeuristicLab.Algorithms.Bandits.BanditPolicies.BoltzmannExplorationPolicy;
using EpsGreedyPolicy = HeuristicLab.Algorithms.Bandits.BanditPolicies.EpsGreedyPolicy;
using RandomPolicy = HeuristicLab.Algorithms.Bandits.BanditPolicies.RandomPolicy;
using UCTPolicy = HeuristicLab.Algorithms.Bandits.BanditPolicies.UCTPolicy;

namespace Main {
  class Program {
    static void Main(string[] args) {
      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;

      //RunDemo();
      RunGridTest();
    }

    private static void RunGridTest() {
      int maxIterations = 70000; // for poly-10 with 50000 evaluations no successful run with HL yet
      //var globalRandom = new Random(31415);
      var localRandSeed = 31415;
      var reps = 30;

      // assumption: IBanditPolicy is the common interface of the policies constructed below
      var policyFactories = new Func<IBanditPolicy>[]
        {
          () => new RandomPolicy(),
          () => new ActiveLearningPolicy(),
          () => new EpsGreedyPolicy(0.01, (aInfo) => aInfo.MaxReward, "max"),
          () => new EpsGreedyPolicy(0.05, (aInfo) => aInfo.MaxReward, "max"),
          () => new EpsGreedyPolicy(0.1, (aInfo) => aInfo.MaxReward, "max"),
          () => new EpsGreedyPolicy(0.2, (aInfo) => aInfo.MaxReward, "max"),
          //() => new GaussianThompsonSamplingPolicy(),
          () => new GaussianThompsonSamplingPolicy(true),
          () => new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 10, 1)),
          () => new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 10, 1, 1)),
          //() => new BernoulliThompsonSamplingPolicy(),
          () => new GenericThompsonSamplingPolicy(new BernoulliModel(1, 1)),
          () => new EpsGreedyPolicy(0.01),
          () => new EpsGreedyPolicy(0.05),
          () => new EpsGreedyPolicy(0.1),
          () => new EpsGreedyPolicy(0.2),
          () => new EpsGreedyPolicy(0.5),
          () => new UCTPolicy(0.01),
          () => new UCTPolicy(0.05),
          () => new UCTPolicy(0.1),
          () => new UCTPolicy(0.5),
          () => new UCTPolicy(1),
          () => new UCTPolicy(2),
          () => new UCTPolicy(5),
          () => new UCTPolicy(10),
          () => new ModifiedUCTPolicy(0.01),
          () => new ModifiedUCTPolicy(0.05),
          () => new ModifiedUCTPolicy(0.1),
          () => new ModifiedUCTPolicy(0.5),
          () => new ModifiedUCTPolicy(1),
          () => new ModifiedUCTPolicy(2),
          () => new ModifiedUCTPolicy(5),
          () => new ModifiedUCTPolicy(10),
          () => new UCB1Policy(),
          () => new UCB1TunedPolicy(),
          () => new UCBNormalPolicy(),
          () => new BoltzmannExplorationPolicy(1),
          () => new BoltzmannExplorationPolicy(10),
          () => new BoltzmannExplorationPolicy(20),
          () => new BoltzmannExplorationPolicy(100),
          () => new BoltzmannExplorationPolicy(200),
          () => new BoltzmannExplorationPolicy(500),
          () => new ChernoffIntervalEstimationPolicy(0.01),
          () => new ChernoffIntervalEstimationPolicy(0.05),
          () => new ChernoffIntervalEstimationPolicy(0.1),
          () => new ChernoffIntervalEstimationPolicy(0.2),
          () => new ThresholdAscentPolicy(5, 0.01),
          () => new ThresholdAscentPolicy(5, 0.05),
          () => new ThresholdAscentPolicy(5, 0.1),
          () => new ThresholdAscentPolicy(5, 0.2),
          () => new ThresholdAscentPolicy(10, 0.01),
          () => new ThresholdAscentPolicy(10, 0.05),
          () => new ThresholdAscentPolicy(10, 0.1),
          () => new ThresholdAscentPolicy(10, 0.2),
          () => new ThresholdAscentPolicy(50, 0.01),
          () => new ThresholdAscentPolicy(50, 0.05),
          () => new ThresholdAscentPolicy(50, 0.1),
          () => new ThresholdAscentPolicy(50, 0.2),
          () => new ThresholdAscentPolicy(100, 0.01),
          () => new ThresholdAscentPolicy(100, 0.05),
          () => new ThresholdAscentPolicy(100, 0.1),
          () => new ThresholdAscentPolicy(100, 0.2),
          () => new ThresholdAscentPolicy(500, 0.01),
          () => new ThresholdAscentPolicy(500, 0.05),
          () => new ThresholdAscentPolicy(500, 0.1),
          () => new ThresholdAscentPolicy(500, 0.2),
          //() => new ThresholdAscentPolicy(5000, 0.01),
          //() => new ThresholdAscentPolicy(10000, 0.01),
        };

      // each factory maps a Random to a (problem, maximum sentence length) pair
      var instanceFactories = new Func<Random, Tuple<IProblem, int>>[]
        {
          //(rand) => Tuple.Create((IProblem)new SantaFeAntProblem(), 17),
          //(rand) => Tuple.Create((IProblem)new FindPhrasesProblem(rand, 10, numPhrases: 5, phraseLen: 3, numOptimalPhrases: 5, numDecoyPhrases: 0, correctReward: 1, decoyReward: 0, phrasesAsSets: false), 15),
          //(rand) => Tuple.Create((IProblem)new FindPhrasesProblem(rand, 10, numPhrases: 5, phraseLen: 3, numOptimalPhrases: 5, numDecoyPhrases: 0, correctReward: 1, decoyReward: 0, phrasesAsSets: true), 15),
          //(rand) => Tuple.Create((IProblem)new FindPhrasesProblem(rand, 10, numPhrases: 5, phraseLen: 3, numOptimalPhrases: 5, numDecoyPhrases: 200, correctReward: 1, decoyReward: 0.5, phrasesAsSets: false), 15),
          //(rand) => Tuple.Create((IProblem)new FindPhrasesProblem(rand, 10, numPhrases: 5, phraseLen: 3, numOptimalPhrases: 5, numDecoyPhrases: 200, correctReward: 1, decoyReward: 0.5, phrasesAsSets: true), 15),
          (rand) => Tuple.Create((IProblem)new SymbolicRegressionPoly10Problem(), 23)
        };

      foreach (var instanceFactory in instanceFactories) {
        foreach (var useCanonical in new bool[] { true /*, false */ }) {
          foreach (var randomTries in new int[] { 0 /*, 1, 5, 10, 100, 500, 1000 */ }) {
            foreach (var policyFactory in policyFactories) {
              var myRandomTries = randomTries;
              var localRand = new Random(localRandSeed);
              var options = new ParallelOptions();
              options.MaxDegreeOfParallelism = 4;
              Parallel.For(0, reps, options, (i) => {
                // derive an independent RNG per repetition from the shared root RNG
                Random myLocalRand;
                lock (localRand)
                  myLocalRand = new Random(localRand.Next());

                int iterations = 0;
                var globalStatistics = new SentenceSetStatistics();

                // var problem = new SymbolicRegressionPoly10Problem();
                // var problem = new SantaFeAntProblem();
                //var problem = new PalindromeProblem();
                //var problem = new HardPalindromeProblem();
                //var problem = new RoyalPairProblem();
                //var problem = new EvenParityProblem();
                // var alg = new MctsSampler(problem.Item1, problem.Item2, myLocalRand, myRandomTries, policy());
                var instance = instanceFactory(myLocalRand);
                var problem = instance.Item1;
                var maxLen = instance.Item2;
                //var alg = new SequentialSearch(problem, maxLen, myLocalRand, myRandomTries,
                //  new GenericGrammarPolicy(problem, policyFactory(), useCanonical));
                var alg = new SequentialSearch(problem, maxLen, myLocalRand, myRandomTries,
                  new GenericFunctionApproximationGrammarPolicy(problem, useCanonical));
                //var alg = new ExhaustiveBreadthFirstSearch(problem, 25);
                //var alg = new AlternativesContextSampler(problem, 25);

                alg.SolutionEvaluated += (sentence, quality) => {
                  iterations++;
                  globalStatistics.AddSentence(sentence, quality);
                  if (iterations % 1000 == 0) {
                    Console.WriteLine("{0,3} {1,5} \"{2,25}\" {3} {4} {5}", i, myRandomTries, policyFactory(), useCanonical, problem.ToString(), globalStatistics);
                  }
                };
                alg.FoundNewBestSolution += (sentence, quality) => {
                  //Console.WriteLine("{0,5} {1,25} {2} {3}",
                  //  myRandomTries, policyFactory(), useCanonical,
                  //  globalStatistics);
                };

                alg.Run(maxIterations);
              });
            }
          }
        }
      }
    }
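
    // The grid test above derives one Random per repetition from a single root seed so that
    // repetitions stay reproducible while running in parallel. The helper below is only an
    // illustrative sketch of that seeding pattern and is not called anywhere; the method name
    // and parameters are assumptions, the pattern itself mirrors the Parallel.For body above.
    private static void RunReplicationsInParallel(int reps, int rootSeed, int maxDegreeOfParallelism, Action<Random, int> replication) {
      var rootRand = new Random(rootSeed);
      // draw all per-repetition seeds on a single thread, because System.Random is not thread-safe
      var seeds = Enumerable.Range(0, reps).Select(_ => rootRand.Next()).ToArray();
      var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };
      Parallel.For(0, reps, options, i => {
        var repRand = new Random(seeds[i]); // independent RNG per repetition
        replication(repRand, i);
      });
    }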

    private static void RunDemo() {
      // TODO: implement bridge to HL-GP
      // TODO: unify MCTS, TD and ContextMCTS solvers (stateInfos)
      // TODO: test with eps-greedy using max instead of average as value (seems to work well for symb-reg! explore further!)
      // TODO: separate value function from policy
      // TODO: in contextual MCTS store a bandit info for each node in the _graph_ and also update all bandit infos of all parents
      // TODO: exhaustive search with priority list
      // TODO: why does the old implementation of GaussianThompson work better for SantaFe than the new one? See comparison: old vs. new implementation of GaussianThompsonSampling
      // TODO: why does GaussianThompsonSampling work so well with MCTS for the artificial ant problem?
      // TODO: research Thompson sampling for the max bandit?
      // TODO: thorough test of strategies for the numCorrectPhrases-armed max bandit
      // TODO: verify TA implementation using the example from the original paper
      // TODO: separate policy from MCTS tree data structure to allow sharing of information over disconnected parts of the tree (semantic equivalence)
      // TODO: implement Thompson sampling for Gaussian mixture models
      // TODO: implement inspection for MCTS (possibly an interactive command line to show statistics from the tree)
      // TODO: implement ACO-style bandit policy
      // TODO: simultaneously model transformed target variables (y, 1/y, log(y), exp(y), sqrt(y), ...)
      // TODO: compare generating the shortest possible sentences when sampling completely randomly vs. simply picking alternatives at random
      // TODO: reward discounting (for reward distributions that change over time); create a dedicated unit test for this
      // TODO: constant optimization

      int maxIterations = 1000000;
      int iterations = 0;
      var sw = new Stopwatch();
      var globalStatistics = new SentenceSetStatistics();
      var random = new Random();

      //var problem = new RoyalSequenceProblem(random, 10, 30, 2, 1, 0);
      // var phraseLen = 3;
      // var numPhrases = 5;
      // var problem = new RoyalPhraseSequenceProblem(random, 10, numPhrases, phraseLen: phraseLen, numCorrectPhrases: 1, correctReward: 1, incorrectReward: 0.0, phrasesAsSets: false);
      //var phraseLen = 3;
      //var numPhrases = 5;
      //var problem = new FindPhrasesProblem(random, 10, numPhrases, phraseLen, numOptimalPhrases: numPhrases, numDecoyPhrases: 0, correctReward: 1.0, decoyReward: 0, phrasesAsSets: false);

      // good results for symb-reg
      // prev results: e.g. 10 random tries and EpsGreedyPolicy(0.2, (aInfo) => aInfo.MaxReward)
      // 2015 01 19: grid test with canonical states:
      // - EpsGreedyPolicy(0.20, max)
      // - GenericThompsonSamplingPolicy("")
      // - UCTPolicy(0.10) (5 of 5 runs, 35000 iters avg.); 10 successful runs of 10 with rand-tries 0; with 40000 iters 9/10; with 30000 iters 1/10
      // 2015 01 22: symb-reg: grid test on the find-phrases problem showed good results for UCB1TunedPolicy and SequentialSearch with canonical states
      // - symb-reg: consistent results with UCB1Tuned; finds the optimal solution in ~50k iters (new GenericGrammarPolicy(problem, new UCB1TunedPolicy(), true))
      // 2015 01 23: grid test with canonical states:
      // - UCTPolicy(0.10) and UCBNormalPolicy: 10/10 optimal solutions within max. 50k iters; slightly worse: generic Thompson with variable sigma and BoltzmannExploration(100)

      // good results for artificial ant:
      // prev results:
      // - var alg = new MctsSampler(problem, 17, random, 1, (rand, numActions) => new ThresholdAscentPolicy(numActions, 500, 0.01));
      // - GaussianModelWithUnknownVariance (and Q = 0.99 quantile) also works well for the ant
      // 2015 01 19: grid test with canonical states (non-canonical slightly worse)
      // - ant: ThresholdAscent (best: 100, 0.01; all variants relatively good)
      // - ant: policies where the variance has a large weight compared to the mean? (Gaussian (compatible), Gaussian with fixed variance, UCT with large c, all TA variants)
      // - ant: UCB1Tuned with canonical states also works very well for the artificial ant! consistent solutions in less than 10k iters

      var problem = new SymbolicRegressionPoly10Problem();
      //var problem = new SantaFeAntProblem();
      //var problem = new SymbolicRegressionProblem(random, "Tower");
      //var problem = new PalindromeProblem();
      //var problem = new HardPalindromeProblem();
      //var problem = new RoyalPairProblem();
      //var problem = new EvenParityProblem();
      // symbreg length = 11 q = 0.824522210419616

      //var alg = new MctsSampler(problem, 23, random, 0, new BoltzmannExplorationPolicy(100));
      //var alg = new MctsSampler(problem, 23, random, 0, new EpsGreedyPolicy(0.1));
      //var alg = new SequentialSearch(problem, 23, random, 0,
      //  new HeuristicLab.Algorithms.Bandits.GrammarPolicies.QLearningGrammarPolicy(problem, new BoltzmannExplorationPolicy(10),
      //    1, 1, true));
      //var alg = new SequentialSearch(problem, 23, random, 0,
      //  new HeuristicLab.Algorithms.Bandits.GrammarPolicies.GenericContextualGrammarPolicy(problem, new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 10, 1, 1)), true));
      var alg = new SequentialSearch(problem, 23, random, 0,
        new HeuristicLab.Algorithms.Bandits.GrammarPolicies.GenericFunctionApproximationGrammarPolicy(problem, true));
      //var alg = new MctsQLearningSampler(problem, sentenceLen, random, 0, null);
      //var alg = new MctsQLearningSampler(problem, 30, random, 0, new EpsGreedyPolicy(0.2));
      //var alg = new MctsContextualSampler(problem, 23, random, 0); // must visit each canonical solution only once
      //var alg = new TemporalDifferenceTreeSearchSampler(problem, 30, random, 1);
      //var alg = new ExhaustiveBreadthFirstSearch(problem, 7);
      //var alg = new AlternativesContextSampler(problem, random, 17, 4, (rand, numActions) => new RandomPolicy(rand, numActions));
      //var alg = new ExhaustiveDepthFirstSearch(problem, 17);
      // var alg = new AlternativesSampler(problem, 17);
      // var alg = new RandomSearch(problem, random, 17);
      //var alg = new ExhaustiveRandomFirstSearch(problem, random, 17);

      alg.FoundNewBestSolution += (sentence, quality) => {
        //Console.WriteLine("{0}", globalStatistics);
        //Console.ReadLine();
      };
      alg.SolutionEvaluated += (sentence, quality) => {
        iterations++;
        globalStatistics.AddSentence(sentence, quality);
        if (iterations % 1000 == 0) {
          if (iterations % 10000 == 0) Console.Clear();
          Console.SetCursorPosition(0, 0);
          alg.PrintStats();
        }
        //Console.WriteLine(sentence);
        //if (iterations % 10000 == 0) {
        //  Console.WriteLine("{0}", globalStatistics);
        //}
      };

      sw.Start();
      alg.Run(maxIterations);
      sw.Stop();

      Console.Clear();
      alg.PrintStats();
      Console.WriteLine(globalStatistics);
      // elapsed ms * 1000 / iterations = microseconds per evaluated solution
      Console.WriteLine("{0:F2} sec {1,10:F1} sols/sec {2,10:F1} µs/sol",
        sw.Elapsed.TotalSeconds,
        maxIterations / (double)sw.Elapsed.TotalSeconds,
        (double)sw.ElapsedMilliseconds * 1000 / maxIterations);
    }
  }
}