Changeset 11732


Ignore:
Timestamp:
01/07/15 09:21:46 (5 years ago)
Author:
gkronber
Message:

#2283: refactoring and bug fixes

Location:
branches/HeuristicLab.Problems.GrammaticalOptimization
Files:
16 added
45 edited

Legend:

Unmodified
Added
Removed
  • branches/HeuristicLab.Problems.GrammaticalOptimization/GrammaticalOptimization.sln

    r11727 r11732  
    1313EndProject
    1414Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeuristicLab.Common", "HeuristicLab.Common\HeuristicLab.Common.csproj", "{3A2FBBCB-F9DF-4970-87F3-F13337D941AD}"
     15EndProject
     16Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HeuristicLab.Problems.GrammaticalOptimization.SymbReg", "HeuristicLab.Problems.GrammaticalOptimization.SymbReg\HeuristicLab.Problems.GrammaticalOptimization.SymbReg.csproj", "{17A7A380-86CE-482D-8D22-CBD70CC97F0D}"
    1517EndProject
    1618Global
     
    4446    {3A2FBBCB-F9DF-4970-87F3-F13337D941AD}.Release|Any CPU.ActiveCfg = Release|Any CPU
    4547    {3A2FBBCB-F9DF-4970-87F3-F13337D941AD}.Release|Any CPU.Build.0 = Release|Any CPU
     48    {17A7A380-86CE-482D-8D22-CBD70CC97F0D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
     49    {17A7A380-86CE-482D-8D22-CBD70CC97F0D}.Debug|Any CPU.Build.0 = Debug|Any CPU
     50    {17A7A380-86CE-482D-8D22-CBD70CC97F0D}.Release|Any CPU.ActiveCfg = Release|Any CPU
     51    {17A7A380-86CE-482D-8D22-CBD70CC97F0D}.Release|Any CPU.Build.0 = Release|Any CPU
    4652  EndGlobalSection
    4753  GlobalSection(SolutionProperties) = preSolution
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj

    r11730 r11732  
    3131  </PropertyGroup>
    3232  <ItemGroup>
     33    <Reference Include="ALGLIB-3.7.0">
     34      <HintPath>..\..\..\trunk\sources\bin\ALGLIB-3.7.0.dll</HintPath>
     35    </Reference>
    3336    <Reference Include="System" />
    3437    <Reference Include="System.Core" />
     
    4245    <Compile Include="BanditHelper.cs" />
    4346    <Compile Include="Bandits\BernoulliBandit.cs" />
     47    <Compile Include="Bandits\GaussianBandit.cs" />
    4448    <Compile Include="Bandits\GaussianMixtureBandit.cs" />
    4549    <Compile Include="Bandits\IBandit.cs" />
    4650    <Compile Include="Bandits\TruncatedNormalBandit.cs" />
     51    <Compile Include="OnlineMeanAndVarianceEstimator.cs" />
     52    <Compile Include="IPolicyActionInfo.cs" />
    4753    <Compile Include="Models\BernoulliModel.cs" />
    4854    <Compile Include="Models\GaussianModel.cs" />
    49     <Compile Include="Models\GaussianMixtureModel.cs" />
    5055    <Compile Include="Models\IModel.cs" />
    51     <Compile Include="Policies\BanditPolicy.cs" />
    52     <Compile Include="Policies\BernoulliThompsonSamplingPolicy.cs" />
    53     <Compile Include="Policies\BoltzmannExplorationPolicy.cs" />
    54     <Compile Include="Policies\ChernoffIntervalEstimationPolicy.cs" />
    55     <Compile Include="Policies\GenericThompsonSamplingPolicy.cs" />
    56     <Compile Include="Policies\ThresholdAscentPolicy.cs" />
    57     <Compile Include="Policies\UCTPolicy.cs" />
    58     <Compile Include="Policies\GaussianThompsonSamplingPolicy.cs" />
    59     <Compile Include="Policies\Exp3Policy.cs" />
    60     <Compile Include="Policies\EpsGreedyPolicy.cs" />
     56    <Compile Include="Policies\BernoulliThompsonSamplingPolicy.cs">
     57      <SubType>Code</SubType>
     58    </Compile>
     59    <Compile Include="Policies\BoltzmannExplorationPolicy.cs">
     60      <SubType>Code</SubType>
     61    </Compile>
     62    <Compile Include="Policies\ChernoffIntervalEstimationPolicy.cs">
     63      <SubType>Code</SubType>
     64    </Compile>
     65    <Compile Include="Policies\BernoulliPolicyActionInfo.cs" />
     66    <Compile Include="Policies\ModelPolicyActionInfo.cs" />
     67    <Compile Include="Policies\EpsGreedyPolicy.cs">
     68      <SubType>Code</SubType>
     69    </Compile>
     70    <Compile Include="Policies\GaussianThompsonSamplingPolicy.cs">
     71      <SubType>Code</SubType>
     72    </Compile>
     73    <Compile Include="Policies\GenericThompsonSamplingPolicy.cs">
     74      <SubType>Code</SubType>
     75    </Compile>
     76    <Compile Include="Policies\MeanAndVariancePolicyActionInfo.cs" />
     77    <Compile Include="Policies\DefaultPolicyActionInfo.cs" />
     78    <Compile Include="Policies\EmptyPolicyActionInfo.cs" />
    6179    <Compile Include="Policies\RandomPolicy.cs" />
    6280    <Compile Include="Policies\UCB1Policy.cs" />
    63     <Compile Include="Policies\UCB1TunedPolicy.cs" />
    64     <Compile Include="Policies\UCBNormalPolicy.cs" />
    6581    <Compile Include="IPolicy.cs" />
     82    <Compile Include="Policies\UCB1TunedPolicy.cs">
     83      <SubType>Code</SubType>
     84    </Compile>
     85    <Compile Include="Policies\UCBNormalPolicy.cs">
     86      <SubType>Code</SubType>
     87    </Compile>
     88    <Compile Include="Policies\UCTPolicy.cs">
     89      <SubType>Code</SubType>
     90    </Compile>
    6691    <Compile Include="Properties\AssemblyInfo.cs" />
    6792  </ItemGroup>
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IPolicy.cs

    r11730 r11732  
    88  // this interface represents a policy for reinforcement learning
    99  public interface IPolicy {
    10     IEnumerable<int> Actions { get; }
    11     int SelectAction(); // action selection ...
    12     void UpdateReward(int action, double reward); // ... and reward update are defined as usual
    13 
    14     // policies must also support disabling of potential actions
    15     // for instance if we know that an action in a state has a deterministic
    16     // reward we need to sample it only once
    17     // it is necessary to sample an action only once
    18     void DisableAction(int action);
    19 
    20     // reset causes the policy to be reinitialized to it's initial state (as after constructor-call)
    21     void Reset();
    22 
    23     void PrintStats();
     10    int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos);
     11    IPolicyActionInfo CreateActionInfo();
    2412  }
    2513}
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/BernoulliModel.cs

    r11730 r11732  
    99namespace HeuristicLab.Algorithms.Bandits.Models {
    1010  public class BernoulliModel : IModel {
    11     private readonly int numActions;
    12     private readonly int[] success;
    13     private readonly int[] failure;
     11    private int success;
     12    private int failure;
    1413
    1514    // parameters of beta prior distribution
     
    1716    private readonly double beta;
    1817
    19     public BernoulliModel(int numActions, double alpha = 1.0, double beta = 1.0) {
    20       this.numActions = numActions;
    21       this.success = new int[numActions];
    22       this.failure = new int[numActions];
     18    public BernoulliModel(double alpha = 1.0, double beta = 1.0) {
    2319      this.alpha = alpha;
    2420      this.beta = beta;
    2521    }
    2622
    27 
    28     public double[] SampleExpectedRewards(Random random) {
     23    public double SampleExpectedReward(Random random) {
    2924      // sample bernoulli mean from beta prior
    30       var theta = new double[numActions];
    31       for (int a = 0; a < numActions; a++) {
    32         if (success[a] == -1)
    33           theta[a] = 0.0;
    34         else {
    35           theta[a] = Rand.BetaRand(random, success[a] + alpha, failure[a] + beta);
    36         }
    37       }
    38 
    39       // no need to sample we know the exact expected value
    40       // the expected value of a bernoulli variable is just theta
    41       return theta.Select(t => t).ToArray();
     25      return Rand.BetaRand(random, success + alpha, failure + beta);
    4226    }
    4327
    44     public void Update(int action, double reward) {
    45       const double EPSILON = 1E-6;
    46       Debug.Assert(Math.Abs(reward - 0.0) < EPSILON || Math.Abs(reward - 1.0) < EPSILON);
    47       if (Math.Abs(reward - 1.0) < EPSILON) {
    48         success[action]++;
     28    public void Update(double reward) {
     29      Debug.Assert(reward.IsAlmost(1.0) || reward.IsAlmost(0.0));
     30      if (reward.IsAlmost(1.0)) {
     31        success++;
    4932      } else {
    50         failure[action]++;
     33        failure++;
    5134      }
    5235    }
    5336
    54     public void Disable(int action) {
    55       success[action] = -1;
    56     }
    57 
    5837    public void Reset() {
    59       Array.Clear(success, 0, numActions);
    60       Array.Clear(failure, 0, numActions);
     38      success = 0;
     39      failure = 0;
    6140    }
    6241
    6342    public void PrintStats() {
    64       for (int i = 0; i < numActions; i++) {
    65         Console.Write("{0:F2} ", success[i] / (double)failure[i]);
    66       }
     43      Console.Write("{0:F2} ", success / (double)failure);
     44    }
     45
     46    public object Clone() {
     47      return new BernoulliModel() { failure = this.failure, success = this.success };
    6748    }
    6849  }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/GaussianModel.cs

    r11730 r11732  
    11using System;
    2 using System.Collections.Generic;
    3 using System.Diagnostics;
    4 using System.Linq;
    5 using System.Text;
    6 using System.Threading.Tasks;
    72using HeuristicLab.Common;
    83
    94namespace HeuristicLab.Algorithms.Bandits.Models {
    10   // bayesian estimation of a Gaussian with unknown mean and known variance
     5  // bayesian estimation of a Gaussian with
     6  // 1) unknown mean and known variance
     7  // 2) unknown mean and unknown variance
    118  public class GaussianModel : IModel {
    12     private readonly int numActions;
    13     private readonly int[] tries;
    14     private readonly double[] sumRewards;
    15 
     9    private OnlineMeanAndVarianceEstimator estimator = new OnlineMeanAndVarianceEstimator();
    1610
    1711    // parameters of Gaussian prior for mean
     
    1913    private readonly double meanPriorVariance;
    2014
     15    private readonly bool knownVariance;
    2116    private readonly double rewardVariance = 0.1; // assumed know reward variance
    2217
    23     public GaussianModel(int numActions, double meanPriorMu, double meanPriorVariance) {
    24       this.numActions = numActions;
    25       this.tries = new int[numActions];
    26       this.sumRewards = new double[numActions];
     18    // parameters of Gamma prior for precision (= inverse variance)
     19    private readonly int precisionPriorAlpha;
     20    private readonly double precisionPriorBeta;
     21
     22    // non-informative prior
     23    private const double priorK = 1.0;
     24
     25    // this constructor assumes the variance is known
     26    public GaussianModel(double meanPriorMu, double meanPriorVariance, double rewardVariance = 0.1) {
    2727      this.meanPriorMu = meanPriorMu;
    2828      this.meanPriorVariance = meanPriorVariance;
     29
     30      this.knownVariance = true;
     31      this.rewardVariance = rewardVariance;
     32    }
     33
     34    // this constructor assumes the variance is also unknown
     35    // uses Murphy 2007: Conjugate Bayesian analysis of the Gaussian distribution equation 85 - 89
     36    public GaussianModel(double meanPriorMu, double meanPriorVariance, int precisionPriorAlpha, double precisionPriorBeta) {
     37      this.meanPriorMu = meanPriorMu;
     38      this.meanPriorVariance = meanPriorVariance;
     39
     40      this.knownVariance = false;
     41      this.precisionPriorAlpha = precisionPriorAlpha;
     42      this.precisionPriorBeta = precisionPriorBeta;
    2943    }
    3044
    3145
    32     public double[] SampleExpectedRewards(Random random) {
     46    public double SampleExpectedReward(Random random) {
     47      if (knownVariance) {
     48        return SampleExpectedRewardKnownVariance(random);
     49      } else {
     50        return SampleExpectedRewardUnknownVariance(random);
     51      }
     52    }
     53
     54    private double SampleExpectedRewardKnownVariance(Random random) {
    3355      // expected values for reward
    34       var theta = new double[numActions];
     56      // calculate posterior mean and variance (for mean reward)
    3557
    36       for (int a = 0; a < numActions; a++) {
    37         if (tries[a] == -1) {
    38           theta[a] = double.NegativeInfinity; // disabled action
    39         } else {
    40           // calculate posterior mean and variance (for mean reward)
     58      // see Murphy 2007: Conjugate Bayesian analysis of the Gaussian distribution (http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf)
     59      var posteriorMeanVariance = 1.0 / (estimator.N / rewardVariance + 1.0 / meanPriorVariance);
     60      var posteriorMeanMean = posteriorMeanVariance * (meanPriorMu / meanPriorVariance + estimator.Sum / rewardVariance);
    4161
    42           // see Murphy 2007: Conjugate Bayesian analysis of the Gaussian distribution (http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf)
    43           var posteriorVariance = 1.0 / (tries[a] / rewardVariance + 1.0 / meanPriorVariance);
    44           var posteriorMean = posteriorVariance * (meanPriorMu / meanPriorVariance + sumRewards[a] / rewardVariance);
     62      // sample a mean from the posterior
     63      var posteriorMeanSample = Rand.RandNormal(random) * Math.Sqrt(posteriorMeanVariance) + posteriorMeanMean;
     64      // theta already represents the expected reward value => nothing else to do
     65      return posteriorMeanSample;
    4566
    46           // sample a mean from the posterior
    47           theta[a] = Rand.RandNormal(random) * Math.Sqrt(posteriorVariance) + posteriorMean;
    48           // theta already represents the expected reward value => nothing else to do
    49         }
     67      // return 0.99-quantile value
     68      //return alglib.invnormaldistribution(0.99) * Math.Sqrt(rewardVariance + posteriorMeanVariance) + posteriorMeanMean;
     69    }
     70
     71    // see Murphy 2007: Conjugate Bayesian analysis of the Gaussian distribution page 6 onwards (http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf)
     72    private double SampleExpectedRewardUnknownVariance(Random random) {
     73
     74      var posteriorMean = (priorK * meanPriorMu + estimator.Sum) / (priorK + estimator.N);
     75      var posteriorK = priorK + estimator.N;
     76      var posteriorAlpha = precisionPriorAlpha + estimator.N / 2.0;
     77      double posteriorBeta;
     78      if (estimator.N > 0) {
     79        posteriorBeta = precisionPriorBeta + 0.5 * estimator.N * estimator.Variance + priorK * estimator.N * Math.Pow(estimator.Avg - meanPriorMu, 2) / (2.0 * (priorK + estimator.N));
     80      } else {
     81        posteriorBeta = precisionPriorBeta;
    5082      }
    5183
     84      // sample from the posterior marginal for mu (expected value) equ. 91
     85      // p(µ|D) = T2αn (µ| µn, βn/(αnκn))
     86
     87      // sample from Tk distribution : http://stats.stackexchange.com/a/70270
     88      var t2alpha = alglib.invstudenttdistribution((int)(2 * posteriorAlpha), random.NextDouble());
     89
     90      var theta = t2alpha * posteriorBeta / (posteriorAlpha * posteriorK) + posteriorMean;
    5291      return theta;
     92
     93      //return alglib.invnormaldistribution(random.NextDouble()) * + theta;
     94      //return alglib.invstudenttdistribution((int)(2 * posteriorAlpha), 0.99) * (posteriorBeta*posteriorK + posteriorBeta) / (posteriorAlpha*posteriorK) + posteriorMean;
    5395    }
    5496
    55     public void Update(int action, double reward) {
    56       sumRewards[action] += reward;
    57       tries[action]++;
    58     }
    5997
    60     public void Disable(int action) {
    61       tries[action] = -1;
    62       sumRewards[action] = 0.0;
     98    public void Update(double reward) {
     99      estimator.UpdateReward(reward);
    63100    }
    64101
    65102    public void Reset() {
    66       Array.Clear(tries, 0, numActions);
    67       Array.Clear(sumRewards, 0, numActions);
     103      estimator.Reset();
    68104    }
    69105
    70106    public void PrintStats() {
    71       for (int i = 0; i < numActions; i++) {
    72         Console.Write("{0:F2} ", sumRewards[i] / (double)tries[i]);
    73       }
     107      Console.Write("{0:F2} ", estimator.Avg);
     108    }
     109
     110    public object Clone() {
     111      if (knownVariance)
     112        return new GaussianModel(meanPriorMu, meanPriorVariance, rewardVariance);
     113      else
     114        return new GaussianModel(meanPriorMu, meanPriorVariance, precisionPriorAlpha, precisionPriorBeta);
    74115    }
    75116  }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/IModel.cs

    r11730 r11732  
    66
    77namespace HeuristicLab.Algorithms.Bandits {
    8   public interface IModel {
    9     double[] SampleExpectedRewards(Random random);
    10     void Update(int action, double reward);
    11     void Disable(int action);
     8  // represents a model for the reward distribution (of an action given a state)
     9  public interface IModel : ICloneable {
     10    double SampleExpectedReward(Random random);
     11    void Update(double reward);
    1212    void Reset();
    1313    void PrintStats();
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/BanditPolicy.cs

    r11730 r11732  
    77
    88namespace HeuristicLab.Algorithms.Bandits {
    9   public abstract class BanditPolicy : IPolicy {
     9  public abstract class BanditPolicy<TPolicyActionInfo> : IPolicy<TPolicyActionInfo> where TPolicyActionInfo : IPolicyActionInfo {
    1010    public IEnumerable<int> Actions { get; private set; }
    1111    private readonly int numInitialActions;
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/BernoulliThompsonSamplingPolicy.cs

    r11730 r11732  
    88
    99namespace HeuristicLab.Algorithms.Bandits {
    10   public class BernoulliThompsonSamplingPolicy : BanditPolicy {
    11     private readonly Random random;
    12     private readonly int[] success;
    13     private readonly int[] failure;
    14 
     10  public class BernoulliThompsonSamplingPolicy : IPolicy {
    1511    // parameters of beta prior distribution
    1612    private readonly double alpha = 1.0;
    1713    private readonly double beta = 1.0;
    1814
    19     public BernoulliThompsonSamplingPolicy(Random random, int numActions)
    20       : base(numActions) {
    21       this.random = random;
    22       this.success = new int[numActions];
    23       this.failure = new int[numActions];
    24     }
     15    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     16      var myActionInfos = actionInfos.OfType<BernoulliPolicyActionInfo>(); // TODO: performance
     17      int bestAction = -1;
     18      double maxTheta = double.NegativeInfinity;
     19      var aIdx = -1;
    2520
    26     public override int SelectAction() {
    27       Debug.Assert(Actions.Any());
    28       var maxTheta = double.NegativeInfinity;
    29       int bestAction = -1;
    30       foreach (var a in Actions) {
    31         var theta = Rand.BetaRand(random, success[a] + alpha, failure[a] + beta);
     21      foreach (var aInfo in myActionInfos) {
     22        aIdx++;
     23        if (aInfo.Disabled) continue;
     24        var theta = Rand.BetaRand(random, aInfo.NumSuccess + alpha, aInfo.NumFailure + beta);
    3225        if (theta > maxTheta) {
    3326          maxTheta = theta;
    34           bestAction = a;
     27          bestAction = aIdx;
    3528        }
    3629      }
     30      Debug.Assert(bestAction > -1);
    3731      return bestAction;
    3832    }
    3933
    40     public override void UpdateReward(int action, double reward) {
    41       Debug.Assert(Actions.Contains(action));
    42 
    43       if (reward > 0) success[action]++;
    44       else failure[action]++;
     34    public IPolicyActionInfo CreateActionInfo() {
     35      return new BernoulliPolicyActionInfo();
    4536    }
    4637
    47     public override void DisableAction(int action) {
    48       base.DisableAction(action);
    49       success[action] = -1;
    50     }
    51 
    52     public override void Reset() {
    53       base.Reset();
    54       Array.Clear(success, 0, success.Length);
    55       Array.Clear(failure, 0, failure.Length);
    56     }
    57 
    58     public override void PrintStats() {
    59       for (int i = 0; i < success.Length; i++) {
    60         if (success[i] >= 0) {
    61           Console.Write("{0,5:F2}", success[i] / failure[i]);
    62         } else {
    63           Console.Write("{0,5}", "");
    64         }
    65       }
    66       Console.WriteLine();
    67     }
    6838
    6939    public override string ToString() {
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/BoltzmannExplorationPolicy.cs

    r11730 r11732  
    55using System.Text;
    66using System.Threading.Tasks;
     7using HeuristicLab.Common;
    78
    89namespace HeuristicLab.Algorithms.Bandits {
    910  // also called softmax policy
    10   public class BoltzmannExplorationPolicy : BanditPolicy {
    11     private readonly Random random;
    12     private readonly double eps;
    13     private readonly int[] tries;
    14     private readonly double[] sumReward;
     11  public class BoltzmannExplorationPolicy : IPolicy {
    1512    private readonly double beta;
    1613
    17     public BoltzmannExplorationPolicy(Random random, int numActions, double beta)
    18       : base(numActions) {
     14    public BoltzmannExplorationPolicy(double beta) {
    1915      if (beta < 0) throw new ArgumentException();
    20       this.random = random;
    2116      this.beta = beta;
    22       this.tries = new int[numActions];
    23       this.sumReward = new double[numActions];
     17    }
     18    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     19      Debug.Assert(actionInfos.Any());
     20
     21      // select best
     22      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
     23      Debug.Assert(myActionInfos.Any(a => !a.Disabled));
     24      double[] w = new double[myActionInfos.Length];
     25
     26      for (int a = 0; a < myActionInfos.Length; a++) {
     27        if (myActionInfos[a].Disabled) {
     28          w[a] = 0; continue;
     29        }
     30        if (myActionInfos[a].Tries == 0) return a;
     31        var sumReward = myActionInfos[a].SumReward;
     32        var tries = myActionInfos[a].Tries;
     33        var avgReward = sumReward / tries;
     34        w[a] = Math.Exp(beta * avgReward);
     35      }
     36
     37
     38      var bestAction = Enumerable.Range(0, w.Length).SampleProportional(random, w).First();
     39      Debug.Assert(bestAction >= 0);
     40      Debug.Assert(bestAction < w.Length);
     41      Debug.Assert(!myActionInfos[bestAction].Disabled);
     42      return bestAction;
    2443    }
    2544
    26     public override int SelectAction() {
    27       Debug.Assert(Actions.Any());
    28       // select best
    29       var maxReward = double.NegativeInfinity;
    30       int bestAction = -1;
    31       if (Actions.Any(a => tries[a] == 0))
    32         return Actions.First(a => tries[a] == 0);
    33 
    34       var ts = Actions.Select(a => Math.Exp(beta * sumReward[a] / tries[a]));
    35       var r = random.NextDouble() * ts.Sum();
    36 
    37       var agg = 0.0;
    38       foreach (var p in Actions.Zip(ts, Tuple.Create)) {
    39         agg += p.Item2;
    40         if (agg >= r) return p.Item1;
    41       }
    42       throw new InvalidProgramException();
    43     }
    44     public override void UpdateReward(int action, double reward) {
    45       Debug.Assert(Actions.Contains(action));
    46 
    47       tries[action]++;
    48       sumReward[action] += reward;
    49     }
    50 
    51     public override void DisableAction(int action) {
    52       base.DisableAction(action);
    53       sumReward[action] = 0;
    54       tries[action] = -1;
    55     }
    56 
    57     public override void Reset() {
    58       base.Reset();
    59       Array.Clear(tries, 0, tries.Length);
    60       Array.Clear(sumReward, 0, sumReward.Length);
    61     }
    62 
    63     public override void PrintStats() {
    64       for (int i = 0; i < sumReward.Length; i++) {
    65         if (tries[i] >= 0) {
    66           Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
    67         } else {
    68           Console.Write("{0,5}", "");
    69         }
    70       }
    71       Console.WriteLine();
     45    public IPolicyActionInfo CreateActionInfo() {
     46      return new DefaultPolicyActionInfo();
    7247    }
    7348
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/ChernoffIntervalEstimationPolicy.cs

    r11730 r11732  
    1010International Conference, CP 2006, Nantes, France, September 25-29, 2006. pp 560-574 */
    1111
    12   public class ChernoffIntervalEstimationPolicy : BanditPolicy {
    13     private readonly int[] tries;
    14     private readonly double[] sumReward;
    15     private int totalTries = 0;
     12  public class ChernoffIntervalEstimationPolicy : IPolicy {
    1613    private readonly double delta;
    1714
    18     public ChernoffIntervalEstimationPolicy(int numActions, double delta = 0.01)
    19       : base(numActions) {
     15    public ChernoffIntervalEstimationPolicy(double delta = 0.01) {
    2016      this.delta = delta;
    21       this.tries = new int[numActions];
    22       this.sumReward = new double[numActions];
    2317    }
    24 
    25     public override int SelectAction() {
     18    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     19      Debug.Assert(actionInfos.Any());
     20      // select best
     21      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
     22      int k = myActionInfos.Length;
     23      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
    2624      int bestAction = -1;
    2725      double bestQ = double.NegativeInfinity;
    28       double k = Actions.Count();
    29       Debug.Assert(k > 0);
    30       foreach (var a in Actions) {
    31         if (tries[a] == 0) return a;
     26      for (int a = 0; a < myActionInfos.Length; a++) {
     27        if (myActionInfos[a].Disabled) continue;
     28        if (myActionInfos[a].Tries == 0) return a;
     29
     30        var sumReward = myActionInfos[a].SumReward;
     31        var tries = myActionInfos[a].Tries;
     32
     33        var avgReward = sumReward / tries;
     34
    3235        // page 5 of "A simple distribution-free appraoch to the max k-armed bandit problem"
    3336        // var alpha = Math.Log(2 * totalTries * k / delta);
    3437        double alpha = Math.Log(2) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
    35         double mu = sumReward[a] / tries[a];
    36         var q = mu + (alpha + Math.Sqrt(2 * tries[a] * mu * alpha + alpha * alpha)) / tries[a];
     38        var q = avgReward + (alpha + Math.Sqrt(2 * tries * avgReward * alpha + alpha * alpha)) / tries;
    3739        if (q > bestQ) {
    3840          bestQ = q;
     
    4042        }
    4143      }
     44      Debug.Assert(bestAction >= 0);
    4245      return bestAction;
    4346    }
    44     public override void UpdateReward(int action, double reward) {
    45       Debug.Assert(Actions.Contains(action));
    46       totalTries++;
    47       tries[action]++;
    48       sumReward[action] += reward;
     47
     48    public IPolicyActionInfo CreateActionInfo() {
     49      return new DefaultPolicyActionInfo();
    4950    }
    5051
    51     public override void DisableAction(int action) {
    52       base.DisableAction(action);
    53       totalTries -= tries[action];
    54       tries[action] = -1;
    55       sumReward[action] = 0;
    56     }
    57 
    58     public override void Reset() {
    59       base.Reset();
    60       totalTries = 0;
    61       Array.Clear(tries, 0, tries.Length);
    62       Array.Clear(sumReward, 0, sumReward.Length);
    63     }
    64 
    65     public override void PrintStats() {
    66       for (int i = 0; i < sumReward.Length; i++) {
    67         if (tries[i] >= 0) {
    68           Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
    69         } else {
    70           Console.Write("{0,5}", "");
    71         }
    72       }
    73       Console.WriteLine();
    74     }
    7552    public override string ToString() {
    7653      return string.Format("ChernoffIntervalEstimationPolicy({0:F2})", delta);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/EpsGreedyPolicy.cs

    r11730 r11732  
    77
    88namespace HeuristicLab.Algorithms.Bandits {
    9   public class EpsGreedyPolicy : BanditPolicy {
    10     private readonly Random random;
     9  public class EpsGreedyPolicy : IPolicy {
    1110    private readonly double eps;
    12     private readonly int[] tries;
    13     private readonly double[] sumReward;
    1411    private readonly RandomPolicy randomPolicy;
    1512
    16     public EpsGreedyPolicy(Random random, int numActions, double eps)
    17       : base(numActions) {
    18       this.random = random;
     13    public EpsGreedyPolicy(double eps) {
    1914      this.eps = eps;
    20       this.randomPolicy = new RandomPolicy(random, numActions);
    21       this.tries = new int[numActions];
    22       this.sumReward = new double[numActions];
     15      this.randomPolicy = new RandomPolicy();
    2316    }
    24 
    25     public override int SelectAction() {
    26       Debug.Assert(Actions.Any());
     17    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     18      Debug.Assert(actionInfos.Any());
    2719      if (random.NextDouble() > eps) {
    2820        // select best
    29         var bestQ = double.NegativeInfinity;
     21        var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
    3022        int bestAction = -1;
    31         foreach (var a in Actions) {
    32           if (tries[a] == 0) return a;
    33           var q = sumReward[a] / tries[a];
    34           if (bestQ < q) {
     23        double bestQ = double.NegativeInfinity;
     24        int aIdx = -1;
     25        foreach (var aInfo in myActionInfos) {
     26
     27          aIdx++;
     28          if (aInfo.Disabled) continue;
     29          if (aInfo.Tries == 0) return aIdx;
     30
     31
     32          var avgReward = aInfo.SumReward / aInfo.Tries;         
     33          //var q = avgReward;
     34          var q = aInfo.MaxReward;
     35          if (q > bestQ) {
    3536            bestQ = q;
    36             bestAction = a;
     37            bestAction = aIdx;
    3738          }
    3839        }
     
    4142      } else {
    4243        // select random
    43         return randomPolicy.SelectAction();
     44        return randomPolicy.SelectAction(random, actionInfos);
    4445      }
    4546    }
    46     public override void UpdateReward(int action, double reward) {
    47       Debug.Assert(Actions.Contains(action));
    4847
    49       randomPolicy.UpdateReward(action, reward); // does nothing
    50       tries[action]++;
    51       sumReward[action] += reward;
     48    public IPolicyActionInfo CreateActionInfo() {
     49      return new DefaultPolicyActionInfo();
    5250    }
    5351
    54     public override void DisableAction(int action) {
    55       base.DisableAction(action);
    56       randomPolicy.DisableAction(action);
    57       sumReward[action] = 0;
    58       tries[action] = -1;
    59     }
    6052
    61     public override void Reset() {
    62       base.Reset();
    63       randomPolicy.Reset();
    64       Array.Clear(tries, 0, tries.Length);
    65       Array.Clear(sumReward, 0, sumReward.Length);
    66     }
    67     public override void PrintStats() {
    68       for (int i = 0; i < sumReward.Length; i++) {
    69         if (tries[i] >= 0) {
    70           Console.Write(" {0,5:F2} {1}", sumReward[i] / tries[i], tries[i]);
    71         } else {
    72           Console.Write("-", "");
    73         }
    74       }
    75       Console.WriteLine();
    76     }
    7753    public override string ToString() {
    7854      return string.Format("EpsGreedyPolicy({0:F2})", eps);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/GaussianThompsonSamplingPolicy.cs

    r11730 r11732  
    11using System;
     2using System.Collections.Generic;
    23using System.Diagnostics;
    34using System.Linq;
     
    56
    67namespace HeuristicLab.Algorithms.Bandits {
    7  
    8   public class GaussianThompsonSamplingPolicy : BanditPolicy {
    9     private readonly Random random;
    10     private readonly double[] sampleMean;
    11     private readonly double[] sampleM2;
    12     private readonly int[] tries;
     8
     9  public class GaussianThompsonSamplingPolicy : IPolicy {
    1310    private bool compatibility;
    1411
     
    2118
    2219
    23     public GaussianThompsonSamplingPolicy(Random random, int numActions, bool compatibility = false)
    24       : base(numActions) {
    25       this.random = random;
    26       this.sampleMean = new double[numActions];
    27       this.sampleM2 = new double[numActions];
    28       this.tries = new int[numActions];
     20    public GaussianThompsonSamplingPolicy(bool compatibility = false) {
    2921      this.compatibility = compatibility;
    3022    }
    3123
     24    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     25      var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();
     26      int bestAction = -1;
     27      double bestQ = double.NegativeInfinity;
    3228
    33     public override int SelectAction() {
    34       Debug.Assert(Actions.Any());
    35       var maxTheta = double.NegativeInfinity;
    36       int bestAction = -1;
    37       foreach (var a in Actions) {
    38         if(tries[a] == -1) continue; // skip disabled actions
     29      int aIdx = -1;
     30      foreach (var aInfo in myActionInfos) {
     31        aIdx++;
     32        if (aInfo.Disabled) continue;
     33
     34        var tries = aInfo.Tries;
     35        var sampleMean = aInfo.AvgReward;
     36        var sampleVariance = aInfo.RewardVariance;
     37
    3938        double theta;
    4039        if (compatibility) {
    41           if (tries[a] < 2) return a;
    42           var mu = sampleMean[a];
    43           var variance = sampleM2[a] / tries[a];
     40          if (tries < 2) return aIdx;
     41          var mu = sampleMean;
     42          var variance = sampleVariance;
    4443          var stdDev = Math.Sqrt(variance);
    4544          theta = Rand.RandNormal(random) * stdDev + mu;
     
    4847
    4948          // see Murphy 2007: Conjugate Bayesian analysis of the Gaussian distribution (http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf)
    50           var posteriorVariance = 1.0 / (tries[a] / rewardVariance + 1.0 / priorVariance);
    51           var posteriorMean = posteriorVariance * (priorMean / priorVariance + tries[a] * sampleMean[a] / rewardVariance);
     49          var posteriorVariance = 1.0 / (tries / rewardVariance + 1.0 / priorVariance);
     50          var posteriorMean = posteriorVariance * (priorMean / priorVariance + tries * sampleMean / rewardVariance);
    5251
    5352          // sample a mean from the posterior
     
    5655          // theta already represents the expected reward value => nothing else to do
    5756        }
    58         if (theta > maxTheta) {
    59           maxTheta = theta;
    60           bestAction = a;
     57
     58        if (theta > bestQ) {
     59          bestQ = theta;
     60          bestAction = aIdx;
    6161        }
    6262      }
    63       Debug.Assert(Actions.Contains(bestAction));
     63      Debug.Assert(bestAction > -1);
    6464      return bestAction;
    6565    }
    6666
    67     public override void UpdateReward(int action, double reward) {
    68       Debug.Assert(Actions.Contains(action));
    69       tries[action]++;
    70       var delta = reward - sampleMean[action];
    71       sampleMean[action] += delta / tries[action];
    72       sampleM2[action] += sampleM2[action] + delta * (reward - sampleMean[action]);
     67    public IPolicyActionInfo CreateActionInfo() {
     68      return new MeanAndVariancePolicyActionInfo();
    7369    }
    7470
    75     public override void DisableAction(int action) {
    76       base.DisableAction(action);
    77       sampleMean[action] = 0;
    78       sampleM2[action] = 0;
    79       tries[action] = -1;
    80     }
    8171
    82     public override void Reset() {
    83       base.Reset();
    84       Array.Clear(sampleMean, 0, sampleMean.Length);
    85       Array.Clear(sampleM2, 0, sampleM2.Length);
    86       Array.Clear(tries, 0, tries.Length);
    87     }
     72    //public override void UpdateReward(int action, double reward) {
     73    //  Debug.Assert(Actions.Contains(action));
     74    //  tries[action]++;
     75    //  var delta = reward - sampleMean[action];
     76    //  sampleMean[action] += delta / tries[action];
     77    //  sampleM2[action] += sampleM2[action] + delta * (reward - sampleMean[action]);
     78    //}
    8879
    89     public override void PrintStats() {
    90       for (int i = 0; i < sampleMean.Length; i++) {
    91         if (tries[i] >= 0) {
    92           Console.Write(" {0,5:F2} {1}", sampleMean[i] / tries[i], tries[i]);
    93         } else {
    94           Console.Write("{0,5}", "");
    95         }
    96       }
    97       Console.WriteLine();
    98     }
    9980    public override string ToString() {
    10081      return "GaussianThompsonSamplingPolicy";
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/GenericThompsonSamplingPolicy.cs

    r11730 r11732  
    88
    99namespace HeuristicLab.Algorithms.Bandits {
    10   public class GenericThompsonSamplingPolicy : BanditPolicy {
    11     private readonly Random random;
     10  public class GenericThompsonSamplingPolicy : IPolicy {
    1211    private readonly IModel model;
    1312
    14     public GenericThompsonSamplingPolicy(Random random, int numActions, IModel model)
    15       : base(numActions) {
    16       this.random = random;
     13    public GenericThompsonSamplingPolicy(IModel model) {
    1714      this.model = model;
    1815    }
    1916
    20     public override int SelectAction() {
    21       Debug.Assert(Actions.Any());
    22       var maxR = double.NegativeInfinity;
     17    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     18      var myActionInfos = actionInfos.OfType<ModelPolicyActionInfo>();
    2319      int bestAction = -1;
    24       var expRewards = model.SampleExpectedRewards(random);
    25       foreach (var a in Actions) {
    26         var r = expRewards[a];
    27         if (r > maxR) {
    28           maxR = r;
    29           bestAction = a;
     20      double bestQ = double.NegativeInfinity;
     21      var aIdx = -1;
     22      foreach (var aInfo in myActionInfos) {
     23        aIdx++;
     24        if (aInfo.Disabled) continue;
     25        //if (aInfo.Tries == 0) return aIdx;
     26        var q = aInfo.SampleExpectedReward(random);
     27        if (q > bestQ) {
     28          bestQ = q;
     29          bestAction = aIdx;
    3030        }
    3131      }
     32      Debug.Assert(bestAction > -1);
    3233      return bestAction;
    3334    }
    3435
    35     public override void UpdateReward(int action, double reward) {
    36       Debug.Assert(Actions.Contains(action));
    37 
    38       model.Update(action, reward);
    39     }
    40 
    41     public override void DisableAction(int action) {
    42       base.DisableAction(action);
    43       model.Disable(action);
    44     }
    45 
    46     public override void Reset() {
    47       base.Reset();
    48       model.Reset();
    49     }
    50 
    51     public override void PrintStats() {
    52       model.PrintStats();
     36    public IPolicyActionInfo CreateActionInfo() {
     37      return new ModelPolicyActionInfo((IModel)model.Clone());
    5338    }
    5439
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/RandomPolicy.cs

    r11730 r11732  
    88
    99namespace HeuristicLab.Algorithms.Bandits {
    10   public class RandomPolicy : BanditPolicy {
    11     private readonly Random random;
     10  public class RandomPolicy : IPolicy {
    1211
    13     public RandomPolicy(Random random, int numActions)
    14       : base(numActions) {
    15       this.random = random;
    16     }
    17 
    18     public override int SelectAction() {
    19       Debug.Assert(Actions.Any());
    20       return Actions.SelectRandom(random);
    21     }
    22     public override void UpdateReward(int action, double reward) {
    23       // do nothing
    24     }
    25     public override void PrintStats() {
    26       Console.WriteLine("Random");
    27     }
    2812    public override string ToString() {
    2913      return "RandomPolicy";
    3014    }
     15
     16    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     17      return actionInfos
     18        .Select((a, i) => Tuple.Create(a, i))
     19        .Where(p => !p.Item1.Disabled)
     20        .SelectRandom(random).Item2;
     21    }
     22
     23    public IPolicyActionInfo CreateActionInfo() {
     24      return new EmptyPolicyActionInfo();
     25    }
    3126  }
    3227}
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1Policy.cs

    r11730 r11732  
    77
    88namespace HeuristicLab.Algorithms.Bandits {
    9   public class UCB1Policy : BanditPolicy {
    10     private readonly int[] tries;
    11     private readonly double[] sumReward;
    12     private int totalTries = 0;
    13     public UCB1Policy(int numActions)
    14       : base(numActions) {
    15       this.tries = new int[numActions];
    16       this.sumReward = new double[numActions];
    17     }
    18 
    19     public override int SelectAction() {
     9  public class UCB1Policy : IPolicy {
     10    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     11      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
    2012      int bestAction = -1;
    2113      double bestQ = double.NegativeInfinity;
    22       foreach (var a in Actions) {
    23         if (tries[a] == 0) return a;
    24         var q = sumReward[a] / tries[a] + Math.Sqrt((2 * Math.Log(totalTries)) / tries[a]);
     14      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
     15
     16      for (int a = 0; a < myActionInfos.Length; a++) {
     17        if (myActionInfos[a].Disabled) continue;
     18        if (myActionInfos[a].Tries == 0) return a;
     19        var q = myActionInfos[a].SumReward / myActionInfos[a].Tries + Math.Sqrt((2 * Math.Log(totalTries)) / myActionInfos[a].Tries);
    2520        if (q > bestQ) {
    2621          bestQ = q;
     
    2823        }
    2924      }
     25      Debug.Assert(bestAction > -1);
    3026      return bestAction;
    3127    }
    32     public override void UpdateReward(int action, double reward) {
    33       Debug.Assert(Actions.Contains(action));
    34       totalTries++;
    35       tries[action]++;
    36       sumReward[action] += reward;
    37     }
    3828
    39     public override void DisableAction(int action) {
    40       base.DisableAction(action);
    41       totalTries -= tries[action];
    42       tries[action] = -1;
    43       sumReward[action] = 0;
    44     }
    45 
    46     public override void Reset() {
    47       base.Reset();
    48       totalTries = 0;
    49       Array.Clear(tries, 0, tries.Length);
    50       Array.Clear(sumReward, 0, sumReward.Length);
    51     }
    52     public override void PrintStats() {
    53       for (int i = 0; i < sumReward.Length; i++) {
    54         if (tries[i] >= 0) {
    55           Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
    56         } else {
    57           Console.Write("{0,5}", "");
    58         }
    59       }
    60       Console.WriteLine();
     29    public IPolicyActionInfo CreateActionInfo() {
     30      return new DefaultPolicyActionInfo();
    6131    }
    6232    public override string ToString() {
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs

    r11730 r11732  
    77
    88namespace HeuristicLab.Algorithms.Bandits {
    9   public class UCB1TunedPolicy : BanditPolicy {
    10     private readonly int[] tries;
    11     private readonly double[] sumReward;
    12     private readonly double[] sumSqrReward;
    13     private int totalTries = 0;
    14     public UCB1TunedPolicy(int numActions)
    15       : base(numActions) {
    16       this.tries = new int[numActions];
    17       this.sumReward = new double[numActions];
    18       this.sumSqrReward = new double[numActions];
    19     }
     9  public class UCB1TunedPolicy : IPolicy {
    2010
    21     private double V(int arm) {
    22       var s = tries[arm];
    23       return sumSqrReward[arm] / s - Math.Pow(sumReward[arm] / s, 2) + Math.Sqrt(2 * Math.Log(totalTries) / s);
    24     }
    25 
    26 
    27     public override int SelectAction() {
    28       Debug.Assert(Actions.Any());
     11    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     12      var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>().ToArray(); // TODO: performance
    2913      int bestAction = -1;
    3014      double bestQ = double.NegativeInfinity;
    31       foreach (var a in Actions) {
    32         if (tries[a] == 0) return a;
    33         var q = sumReward[a] / tries[a] + Math.Sqrt((Math.Log(totalTries) / tries[a]) * Math.Min(1.0 / 4, V(a))); // 1/4 is upper bound of bernoulli distributed variable
     15      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
     16
     17      for (int a = 0; a < myActionInfos.Length; a++) {
     18        if (myActionInfos[a].Disabled) continue;
     19        if (myActionInfos[a].Tries == 0) return a;
     20
     21        var sumReward = myActionInfos[a].SumReward;
     22        var tries = myActionInfos[a].Tries;
     23
     24        var avgReward = sumReward / tries;
     25        var q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(myActionInfos[a], totalTries))); // 1/4 is upper bound of bernoulli distributed variable
    3426        if (q > bestQ) {
    3527          bestQ = q;
     
    3729        }
    3830      }
     31      Debug.Assert(bestAction > -1);
    3932      return bestAction;
    4033    }
    41     public override void UpdateReward(int action, double reward) {
    42       Debug.Assert(Actions.Contains(action));
    43       totalTries++;
    44       tries[action]++;
    45       sumReward[action] += reward;
    46       sumSqrReward[action] += reward * reward;
     34
     35    public IPolicyActionInfo CreateActionInfo() {
     36      return new MeanAndVariancePolicyActionInfo();
    4737    }
    4838
    49     public override void DisableAction(int action) {
    50       base.DisableAction(action);
    51       totalTries -= tries[action];
    52       tries[action] = -1;
    53       sumReward[action] = 0;
    54       sumSqrReward[action] = 0;
     39    private double V(MeanAndVariancePolicyActionInfo actionInfo, int totalTries) {
     40      var s = actionInfo.Tries;
     41      return actionInfo.RewardVariance + Math.Sqrt(2 * Math.Log(totalTries) / s);
    5542    }
    5643
    57     public override void Reset() {
    58       base.Reset();
    59       totalTries = 0;
    60       Array.Clear(tries, 0, tries.Length);
    61       Array.Clear(sumReward, 0, sumReward.Length);
    62       Array.Clear(sumSqrReward, 0, sumSqrReward.Length);
    63     }
    64     public override void PrintStats() {
    65       for (int i = 0; i < sumReward.Length; i++) {
    66         if (tries[i] >= 0) {
    67           Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
    68         } else {
    69           Console.Write("{0,5}", "");
    70         }
    71       }
    72       Console.WriteLine();
    73     }
    7444    public override string ToString() {
    7545      return "UCB1TunedPolicy";
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCBNormalPolicy.cs

    r11730 r11732  
    77
    88namespace HeuristicLab.Algorithms.Bandits {
    9   public class UCBNormalPolicy : BanditPolicy {
    10     private readonly int[] tries;
    11     private readonly double[] sumReward;
    12     private readonly double[] sumSqrReward;
    13     private int totalTries = 0;
    14     public UCBNormalPolicy(int numActions)
    15       : base(numActions) {
    16       this.tries = new int[numActions];
    17       this.sumReward = new double[numActions];
    18       this.sumSqrReward = new double[numActions];
    19     }
     9  public class UCBNormalPolicy : IPolicy {
    2010
    21     public override int SelectAction() {
    22       Debug.Assert(Actions.Any());
     11    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     12      var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>().ToArray(); // TODO: performance
    2313      int bestAction = -1;
    2414      double bestQ = double.NegativeInfinity;
    25       foreach (var a in Actions) {
    26         if (totalTries <= 1 || tries[a] <= 1 || tries[a] <= Math.Ceiling(8 * Math.Log(totalTries))) return a;
    27         var avgReward = sumReward[a] / tries[a];
    28         var estVariance = 16 * ((sumSqrReward[a] - tries[a] * Math.Pow(avgReward, 2)) / (tries[a] - 1)) * (Math.Log(totalTries - 1) / tries[a]);
     15      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
     16
     17      for (int a = 0; a < myActionInfos.Length; a++) {
     18        if (myActionInfos[a].Disabled) continue;
     19        if (totalTries <= 1 || myActionInfos[a].Tries <= 1 || myActionInfos[a].Tries <= Math.Ceiling(8 * Math.Log(totalTries))) return a;
     20
     21        var tries = myActionInfos[a].Tries;
     22        var avgReward = myActionInfos[a].AvgReward;
     23        var rewardVariance = myActionInfos[a].RewardVariance;
     24        var estVariance = 16 * rewardVariance * (Math.Log(totalTries - 1) / tries);
    2925        if (estVariance < 0) estVariance = 0; // numerical problems
    3026        var q = avgReward
     
    3531        }
    3632      }
    37       Debug.Assert(Actions.Contains(bestAction));
     33      Debug.Assert(bestAction > -1);
    3834      return bestAction;
    3935    }
    40     public override void UpdateReward(int action, double reward) {
    41       Debug.Assert(Actions.Contains(action));
    42       totalTries++;
    43       tries[action]++;
    44       sumReward[action] += reward;
    45       sumSqrReward[action] += reward * reward;
     36
     37    public IPolicyActionInfo CreateActionInfo() {
     38      return new MeanAndVariancePolicyActionInfo();
    4639    }
    4740
    48     public override void DisableAction(int action) {
    49       base.DisableAction(action);
    50       totalTries -= tries[action];
    51       tries[action] = -1;
    52       sumReward[action] = 0;
    53       sumSqrReward[action] = 0;
    54     }
    55 
    56     public override void Reset() {
    57       base.Reset();
    58       totalTries = 0;
    59       Array.Clear(tries, 0, tries.Length);
    60       Array.Clear(sumReward, 0, sumReward.Length);
    61       Array.Clear(sumSqrReward, 0, sumSqrReward.Length);
    62     }
    63     public override void PrintStats() {
    64       for (int i = 0; i < sumReward.Length; i++) {
    65         if (tries[i] >= 0) {
    66           Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
    67         } else {
    68           Console.Write("{0,5}", "");
    69         }
    70       }
    71       Console.WriteLine();
    72     }
    7341    public override string ToString() {
    7442      return "UCBNormalPolicy";
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCTPolicy.cs

    r11730 r11732  
    88namespace HeuristicLab.Algorithms.Bandits {
    99  /* Kocsis et al. Bandit based Monte-Carlo Planning */
    10   public class UCTPolicy : BanditPolicy {
    11     private readonly int[] tries;
    12     private readonly double[] sumReward;
    13     private int totalTries = 0;
     10  public class UCTPolicy : IPolicy {
    1411    private readonly double c;
    1512
    16     public UCTPolicy(int numActions, double c = 1.0)
    17       : base(numActions) {
    18       this.tries = new int[numActions];
    19       this.sumReward = new double[numActions];
     13    public UCTPolicy(double c = 1.0) {
    2014      this.c = c;
    2115    }
    2216
    23     public override int SelectAction() {
     17
     18    public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
     19      var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>().ToArray(); // TODO: performance
    2420      int bestAction = -1;
    2521      double bestQ = double.NegativeInfinity;
    26       foreach (var a in Actions) {
    27         if (tries[a] == 0) return a;
    28         var q = sumReward[a] / tries[a] + 2 * c * Math.Sqrt(Math.Log(totalTries) / tries[a]);
     22      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
     23
     24      for (int a = 0; a < myActionInfos.Length; a++) {
     25        if (myActionInfos[a].Disabled) continue;
     26        if (myActionInfos[a].Tries == 0) return a;
     27        var q = myActionInfos[a].SumReward / myActionInfos[a].Tries + 2 * c * Math.Sqrt(Math.Log(totalTries) / myActionInfos[a].Tries);
    2928        if (q > bestQ) {
    3029          bestQ = q;
     
    3231        }
    3332      }
     33      Debug.Assert(bestAction > -1);
    3434      return bestAction;
    3535    }
    36     public override void UpdateReward(int action, double reward) {
    37       Debug.Assert(Actions.Contains(action));
    38       totalTries++;
    39       tries[action]++;
    40       sumReward[action] += reward;
     36
     37    public IPolicyActionInfo CreateActionInfo() {
     38      return new DefaultPolicyActionInfo();
    4139    }
    4240
    43     public override void DisableAction(int action) {
    44       base.DisableAction(action);
    45       totalTries -= tries[action];
    46       tries[action] = -1;
    47       sumReward[action] = 0;
    48     }
    49 
    50     public override void Reset() {
    51       base.Reset();
    52       totalTries = 0;
    53       Array.Clear(tries, 0, tries.Length);
    54       Array.Clear(sumReward, 0, sumReward.Length);
    55     }
    56     public override void PrintStats() {
    57       for (int i = 0; i < sumReward.Length; i++) {
    58         if (tries[i] >= 0) {
    59           Console.Write("{0,5:F2}", sumReward[i] / tries[i]);
    60         } else {
    61           Console.Write("{0,5}", "");
    62         }
    63       }
    64       Console.WriteLine();
    65     }
    6641    public override string ToString() {
    6742      return string.Format("UCTPolicy({0:F2})", c);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/AlternativesContextSampler.cs

    r11730 r11732  
    1717    private readonly Random random;
    1818    private readonly int contextLen;
    19     private readonly Func<Random, int, IPolicy> policyFactory;
     19    private readonly IPolicy policy;
    2020
    21     public AlternativesContextSampler(IProblem problem, Random random, int maxLen, int contextLen, Func<Random, int, IPolicy> policyFactory) {
     21    public AlternativesContextSampler(IProblem problem, Random random, int maxLen, int contextLen, IPolicy policy) {
    2222      this.maxLen = maxLen;
    2323      this.problem = problem;
    2424      this.random = random;
    2525      this.contextLen = contextLen;
    26       this.policyFactory = policyFactory;
     26      this.policy = policy;
    2727    }
    2828
     
    3232      for (int i = 0; i < maxIterations; i++) {
    3333        var sentence = SampleSentence(problem.Grammar).ToString();
    34         var quality = problem.Evaluate(sentence) / problem.GetBestKnownQuality(maxLen);
     34        var quality = problem.Evaluate(sentence) / problem.BestKnownQuality(maxLen);
    3535        DistributeReward(quality);
    3636
     
    4545
    4646
    47     private Dictionary<string, IPolicy> ntPolicy;
     47    private Dictionary<string, IPolicyActionInfo[]> contextActionInfos;
    4848    private List<Tuple<string, int>> updateChain;
    4949
    5050    private void InitPolicies(IGrammar grammar) {
    51       this.ntPolicy = new Dictionary<string, IPolicy>();
     51      this.contextActionInfos = new Dictionary<string, IPolicyActionInfo[]>();
    5252      this.updateChain = new List<Tuple<string, int>>();
    5353    }
     
    8383          var lft = phrase.Subsequence(startIdx, endIdx - startIdx + 1).ToString();
    8484          lft = problem.Hash(lft);
    85           if (!ntPolicy.ContainsKey(lft)) {
    86             ntPolicy.Add(lft, policyFactory(random, g.GetAlternatives(nt).Count()));
     85          if (!contextActionInfos.ContainsKey(lft)) {
     86            contextActionInfos.Add(lft, g.GetAlternatives(nt).Select(_ => policy.CreateActionInfo()).ToArray());
    8787          }
    88           var selectedAltIdx = ntPolicy[lft].SelectAction();
     88          var selectedAltIdx = policy.SelectAction(random, contextActionInfos[lft]);
    8989          selectedAlt = alts.ElementAt(selectedAltIdx);
    9090          updateChain.Add(Tuple.Create(lft, selectedAltIdx));
     
    103103        var lft = e.Item1;
    104104        var action = e.Item2;
    105         ntPolicy[lft].UpdateReward(action, reward);
     105        contextActionInfos[lft][action].UpdateReward(reward);
    106106      }
    107107    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/AlternativesSampler.cs

    r11730 r11732  
    1616    private readonly Random random;
    1717    private readonly IProblem problem;
     18    private readonly IPolicy policy;
    1819
    19     public AlternativesSampler(IProblem problem, int maxLen) {
     20    public AlternativesSampler(IProblem problem, IPolicy policy, int maxLen) {
    2021      this.problem = problem;
    2122      this.maxLen = maxLen;
    22       this.random = new Random(31415);
     23      this.random = new Random();
     24      this.policy = policy;
    2325    }
    2426
     
    2830      for (int i = 0; i < maxIterations; i++) {
    2931        var sentence = SampleSentence(problem.Grammar).ToString();
    30         var quality = problem.Evaluate(sentence) / problem.GetBestKnownQuality(maxLen);
     32        var quality = problem.Evaluate(sentence) / problem.BestKnownQuality(maxLen);
    3133        DistributeReward(quality);
    3234
     
    4143
    4244
    43     private Dictionary<char, IPolicy> ntPolicy;
     45    private Dictionary<char, IPolicyActionInfo[]> ntActionInfos;
    4446    private List<Tuple<char, int>> updateChain;
    4547
    4648    private void InitPolicies(IGrammar grammar) {
    47       this.ntPolicy = new Dictionary<char, IPolicy>();
     49      this.ntActionInfos = new Dictionary<char, IPolicyActionInfo[]>();
    4850      this.updateChain = new List<Tuple<char, int>>();
    4951      foreach (var nt in grammar.NonTerminalSymbols) {
    50         ntPolicy.Add(nt, new EpsGreedyPolicy(random, grammar.GetAlternatives(nt).Count(), 0.1));
     52        ntActionInfos.Add(nt, grammar.GetAlternatives(nt).Select(_ => policy.CreateActionInfo()).ToArray());
    5153      }
    5254    }
     
    7779        } else {
    7880          // all alts are allowed => select using bandit policy
    79           var selectedAltIdx = ntPolicy[nt].SelectAction();
     81          var selectedAltIdx = policy.SelectAction(random, ntActionInfos[nt]);
    8082          selectedAlt = alts.ElementAt(selectedAltIdx);
    8183          updateChain.Add(Tuple.Create(nt, selectedAltIdx));
     
    9597        var nt = e.Item1;
    9698        var action = e.Item2;
    97         ntPolicy[nt].UpdateReward(action, reward);
     99        ntActionInfos[nt][action].UpdateReward(reward);
    98100      }
    99101    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/ExhaustiveBreadthFirstSearch.cs

    r11730 r11732  
    2626      for (int i = 0; sentenceEnumerator.MoveNext() && i < maxIterations; i++) {
    2727        var sentence = sentenceEnumerator.Current.ToString();
    28         var quality = problem.Evaluate(sentence) / problem.GetBestKnownQuality(maxLen);
     28        var quality = problem.Evaluate(sentence) / problem.BestKnownQuality(maxLen);
    2929        RaiseSolutionEvaluated(sentence, quality);
    3030
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/ExhaustiveDepthFirstSearch.cs

    r11730 r11732  
    2626      for (int i = 0; sentenceEnumerator.MoveNext() && i < maxIterations; i++) {
    2727        var sentence = sentenceEnumerator.Current.ToString();
    28         var quality = problem.Evaluate(sentence) / problem.GetBestKnownQuality(maxLen);
     28        var quality = problem.Evaluate(sentence) / problem.BestKnownQuality(maxLen);
    2929        RaiseSolutionEvaluated(sentence, quality);
    3030
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization.csproj

    r11727 r11732  
    4545    <Compile Include="AlternativesSampler.cs" />
    4646    <Compile Include="AlternativesContextSampler.cs" />
     47    <Compile Include="ExhaustiveRandomFirstSearch.cs" />
    4748    <Compile Include="MctsSampler.cs" />
    4849    <Compile Include="ExhaustiveDepthFirstSearch.cs" />
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/MctsSampler.cs

    r11730 r11732  
    1313      public int randomTries;
    1414      public int policyTries;
    15       public IPolicy policy;
     15      public IPolicyActionInfo actionInfo;
    1616      public TreeNode[] children;
    1717      public bool done = false;
     
    2222
    2323      public override string ToString() {
    24         return string.Format("Node({0} tries: {1}, done: {2}, policy: {3})", ident, randomTries + policyTries, done, policy);
     24        return string.Format("Node({0} tries: {1}, done: {2}, policy: {3})", ident, randomTries + policyTries, done, actionInfo);
    2525      }
    2626    }
     
    3434    private readonly Random random;
    3535    private readonly int randomTries;
    36     private readonly Func<Random, int, IPolicy> policyFactory;
     36    private readonly IPolicy policy;
    3737
    38     private List<Tuple<TreeNode, int>> updateChain;
     38    private List<TreeNode> updateChain;
    3939    private TreeNode rootNode;
    4040
     
    4242    public int treeSize;
    4343
    44     public MctsSampler(IProblem problem, int maxLen, Random random) :
    45       this(problem, maxLen, random, 10, (rand, numActions) => new EpsGreedyPolicy(rand, numActions, 0.1)) {
     44    // public MctsSampler(IProblem problem, int maxLen, Random random) :
     45    //   this(problem, maxLen, random, 10, (rand, numActions) => new EpsGreedyPolicy(rand, numActions, 0.1)) {
     46    //
     47    // }
    4648
    47     }
    48 
    49     public MctsSampler(IProblem problem, int maxLen, Random random, int randomTries, Func<Random, int, IPolicy> policyFactory) {
     49    public MctsSampler(IProblem problem, int maxLen, Random random, int randomTries, IPolicy policy) {
    5050      this.maxLen = maxLen;
    5151      this.problem = problem;
    5252      this.random = random;
    5353      this.randomTries = randomTries;
    54       this.policyFactory = policyFactory;
     54      this.policy = policy;
    5555    }
    5656
     
    6060      for (int i = 0; !rootNode.done && i < maxIterations; i++) {
    6161        var sentence = SampleSentence(problem.Grammar).ToString();
    62         var quality = problem.Evaluate(sentence) / problem.GetBestKnownQuality(maxLen);
     62        var quality = problem.Evaluate(sentence) / problem.BestKnownQuality(maxLen);
    6363        Debug.Assert(quality >= 0 && quality <= 1.0);
    6464        DistributeReward(quality);
     
    7979      var n = rootNode;
    8080      Console.WriteLine("depth: {0,5} size: {1,10} root tries {2,10}", treeDepth, treeSize, rootNode.policyTries + rootNode.randomTries);
    81       while (n.policy != null) {
     81      while (n.children != null) {
    8282        Console.WriteLine();
    8383        Console.WriteLine("{0,5}->{1,-50}", n.ident, string.Join(" ", n.children.Select(ch => string.Format("{0,4}", ch.ident))));
     
    9090
    9191    private void InitPolicies(IGrammar grammar) {
    92       this.updateChain = new List<Tuple<TreeNode, int>>();
     92      this.updateChain = new List<TreeNode>();
    9393
    9494      rootNode = new TreeNode(grammar.SentenceSymbol.ToString());
     95      rootNode.actionInfo = policy.CreateActionInfo();
    9596      treeDepth = 0;
    9697      treeSize = 0;
     
    108109      TreeNode n = rootNode;
    109110      bool done = phrase.IsTerminal;
    110       int selectedAltIdx = -1;
    111111      var curDepth = 0;
    112112      while (!done) {
    113         char nt = phrase.FirstNonTerminal;
    114 
    115         int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
    116         Debug.Assert(maxLenOfReplacement > 0);
    117 
    118         var alts = g.GetAlternatives(nt).Where(alt => g.MinPhraseLength(alt) <= maxLenOfReplacement);
     113        updateChain.Add(n);
    119114
    120115        if (n.randomTries < randomTries) {
    121116          n.randomTries++;
     117          treeDepth = Math.Max(treeDepth, curDepth);
     118          return g.CompleteSentenceRandomly(random, phrase, maxLen);
     119        } else {
     120          char nt = phrase.FirstNonTerminal;
    122121
    123           treeDepth = Math.Max(treeDepth, curDepth);
     122          int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
     123          Debug.Assert(maxLenOfReplacement > 0);
    124124
    125           return g.CompleteSentenceRandomly(random, phrase, maxLen);
    126         } else if (n.randomTries == randomTries && n.policy == null) {
    127           n.policy = policyFactory(random, alts.Count());
    128           //n.children = alts.Select(alt => new TreeNode(alt.ToString())).ToArray(); // create a new node for each alternative
    129           n.children = alts.Select(alt => new TreeNode(string.Empty)).ToArray(); // create a new node for each alternative
     125          var alts = g.GetAlternatives(nt).Where(alt => g.MinPhraseLength(alt) <= maxLenOfReplacement);
    130126
    131           treeSize += n.children.Length;
    132         }
    133         n.policyTries++;
    134         // => select using bandit policy
    135         selectedAltIdx = n.policy.SelectAction();
    136         Sequence selectedAlt = alts.ElementAt(selectedAltIdx);
     127          if (n.randomTries == randomTries && n.children == null) {
     128            n.children = alts.Select(alt => new TreeNode(alt.ToString())).ToArray(); // create a new node for each alternative
     129            //n.children = alts.Select(alt => new TreeNode(string.Empty)).ToArray(); // create a new node for each alternative
     130            foreach (var ch in n.children) ch.actionInfo = policy.CreateActionInfo();
     131            treeSize += n.children.Length;
     132          }
     133          n.policyTries++;
     134          // => select using bandit policy
     135          int selectedAltIdx = policy.SelectAction(random, n.children.Select(c => c.actionInfo));
     136          Sequence selectedAlt = alts.ElementAt(selectedAltIdx);
    137137
    138         // replace nt with alt
    139         phrase.ReplaceAt(phrase.FirstNonTerminalIndex, 1, selectedAlt);
     138          // replace nt with alt
     139          phrase.ReplaceAt(phrase.FirstNonTerminalIndex, 1, selectedAlt);
    140140
    141         updateChain.Add(Tuple.Create(n, selectedAltIdx));
     141          curDepth++;
    142142
    143         curDepth++;
     143          done = phrase.IsTerminal;
    144144
    145         done = phrase.IsTerminal;
    146         if (!done) {
    147145          // prepare for next iteration
    148146          n = n.children[selectedAltIdx];
    149           Debug.Assert(!n.done);
    150147        }
    151148      } // while
    152149
     150      updateChain.Add(n);
     151
     152
    153153      // the last node is a leaf node (sentence is done), so we never need to visit this node again
    154       n.children[selectedAltIdx].done = true;
     154      n.done = true;
     155      n.actionInfo.Disable();
    155156
    156157      treeDepth = Math.Max(treeDepth, curDepth);
     
    163164
    164165      foreach (var e in updateChain) {
    165         var node = e.Item1;
    166         var policy = node.policy;
    167         var action = e.Item2;
    168         //policy.UpdateReward(action, reward / updateChain.Count);
    169         policy.UpdateReward(action, reward);
    170 
    171         if (node.children[action].done) node.policy.DisableAction(action);
    172         if (node.children.All(c => c.done)) node.done = true;
     166        var node = e;
     167        if (node.children != null && node.children.All(c => c.done)) {
     168          node.done = true;
     169          node.actionInfo.Disable();
     170        }
     171        if (!node.done) {
     172          node.actionInfo.UpdateReward(reward);
     173          //policy.UpdateReward(action, reward / updateChain.Count);
     174        }
    173175      }
    174176    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.GrammaticalOptimization/RandomSearch.cs

    r11730 r11732  
    2525      for (int i = 0; i < maxIterations; i++) {
    2626        var sentence = CreateSentence(problem.Grammar).ToString();
    27         var quality = problem.Evaluate(sentence) / problem.GetBestKnownQuality(maxLen);
     27        var quality = problem.Evaluate(sentence) / problem.BestKnownQuality(maxLen);
    2828        RaiseSolutionEvaluated(sentence, quality);
    2929
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Common/Extensions.cs

    r11730 r11732  
    2929      }
    3030    }
     31
     32    public static double RSq(IEnumerable<double> xs, IEnumerable<double> ys) {
     33      // two pass implementation, but we don't care
     34      var meanX = xs.Average();
     35      var meanY = ys.Average();
     36
     37      var s = 0.0;
     38      var ssX = 0.0;
     39      var ssY = 0.0;
     40      foreach (var p in xs.Zip(ys, (x, y) => new { x, y })) {
     41        s += (p.x - meanX) * (p.y - meanY);
     42        ssX += Math.Pow(p.x - meanX, 2);
     43        ssY += Math.Pow(p.y - meanY, 2);
     44      }
     45
     46      if (s.IsAlmost(0)) return 0;
     47      if (ssX.IsAlmost(0) || ssY.IsAlmost(0)) return 0;
     48      return s * s / (ssX * ssY);
     49    }
     50
     51
    3152  }
    3253}
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Common/Rand.cs

    r11727 r11732  
    3232     * For Gamma(a,b), scale the result by b.
    3333     */
    34     private static double GammaRand(Random random, double a) {
     34    public static double GammaRand(Random random, double a) {
    3535      /* Algorithm:
    3636       * G. Marsaglia and W.W. Tsang, A simple method for generating gamma
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization.Test/HeuristicLab.Problems.GrammaticalOptimization.Test.csproj

    r11730 r11732  
    3939  </PropertyGroup>
    4040  <ItemGroup>
     41    <Reference Include="HeuristicLab.Problems.Instances-3.3">
     42      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Problems.Instances-3.3.dll</HintPath>
     43    </Reference>
     44    <Reference Include="HeuristicLab.Problems.Instances.DataAnalysis-3.3">
     45      <HintPath>..\..\..\trunk\sources\bin\HeuristicLab.Problems.Instances.DataAnalysis-3.3.dll</HintPath>
     46    </Reference>
    4147    <Reference Include="System" />
    4248    <Reference Include="System.Core">
     
    5763  </Choose>
    5864  <ItemGroup>
     65    <Compile Include="TestSymbRegInstances.cs" />
    5966    <Compile Include="TestSequence.cs" />
    6067    <Compile Include="TestBanditPolicies.cs" />
     
    7178      <Project>{eea07488-1a51-412a-a52c-53b754a628b3}</Project>
    7279      <Name>HeuristicLab.Algorithms.GrammaticalOptimization</Name>
     80    </ProjectReference>
     81    <ProjectReference Include="..\HeuristicLab.Problems.GrammaticalOptimization.SymbReg\HeuristicLab.Problems.GrammaticalOptimization.SymbReg.csproj">
     82      <Project>{17a7a380-86ce-482d-8d22-cbd70cc97f0d}</Project>
     83      <Name>HeuristicLab.Problems.GrammaticalOptimization.SymbReg</Name>
    7384    </ProjectReference>
    7485    <ProjectReference Include="..\HeuristicLab.Problems.GrammaticalOptimization\HeuristicLab.Problems.GrammaticalOptimization.csproj">
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization.Test/TestBanditPolicies.cs

    r11730 r11732  
    1010  [TestClass]
    1111  public class TestBanditPolicies {
     12    [TestMethod]
     13    public void ComparePoliciesForGaussianUnknownVarianceBandit() {
     14      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
     15      var randSeed = 31415;
     16      var nArms = 20;
     17
     18      // Console.WriteLine("Threshold Ascent (20)"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new ThresholdAscent(20, 0.01));
     19      // Console.WriteLine("Threshold Ascent (100)"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new ThresholdAscent(100, 0.01));
     20      // Console.WriteLine("Threshold Ascent (500)"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new ThresholdAscent(500, 0.01));
     21      // Console.WriteLine("Threshold Ascent (1000)"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new ThresholdAscent(1000, 0.01));
     22      Console.WriteLine("Thompson (Gaussian fixed variance)"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianModel(0, 1, 1)));
     23      Console.WriteLine("Thompson (Gaussian est variance)"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianModel(0, 1, 1, 0.1)));
     24      Console.WriteLine("GaussianThompson (compat)"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new GaussianThompsonSamplingPolicy(true));
     25      Console.WriteLine("GaussianThompson"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new GaussianThompsonSamplingPolicy());
     26      Console.WriteLine("UCBNormal"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new UCBNormalPolicy());
     27      Console.WriteLine("Random"); TestPolicyGaussianUnknownVariance(randSeed, nArms, new RandomPolicy());
     28
     29    }
    1230
    1331
     
    1533    public void ComparePoliciesForBernoulliBandit() {
    1634      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
    17 
    18       var globalRand = new Random(31415);
    19       var seedForPolicy = globalRand.Next();
     35      var randSeed = 31415;
    2036      var nArms = 20;
    2137      //Console.WriteLine("Exp3 (gamma=0.01)");
     
    2339      //Console.WriteLine("Exp3 (gamma=0.05)");
    2440      //estPolicyBernoulli(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 1));
    25       Console.WriteLine("Thompson (Bernoulli)"); TestPolicyBernoulli(globalRand, nArms, new BernoulliThompsonSamplingPolicy(new Random(seedForPolicy), nArms));
    26       Console.WriteLine("Generic Thompson (Bernoulli)"); TestPolicyBernoulli(globalRand, nArms, new GenericThompsonSamplingPolicy(new Random(seedForPolicy), nArms, new BernoulliModel(nArms)));
     41      Console.WriteLine("Thompson (Bernoulli)"); TestPolicyBernoulli(randSeed, nArms, new BernoulliThompsonSamplingPolicy());
     42      Console.WriteLine("Generic Thompson (Bernoulli)"); TestPolicyBernoulli(randSeed, nArms, new GenericThompsonSamplingPolicy(new BernoulliModel()));
    2743      Console.WriteLine("Random");
    28       TestPolicyBernoulli(globalRand, nArms, new RandomPolicy(new Random(seedForPolicy), nArms));
     44      TestPolicyBernoulli(randSeed, nArms, new RandomPolicy());
    2945      Console.WriteLine("UCB1");
    30       TestPolicyBernoulli(globalRand, nArms, new UCB1Policy(nArms));
     46      TestPolicyBernoulli(randSeed, nArms, new UCB1Policy());
    3147      Console.WriteLine("UCB1Tuned");
    32       TestPolicyBernoulli(globalRand, nArms, new UCB1TunedPolicy(nArms));
     48      TestPolicyBernoulli(randSeed, nArms, new UCB1TunedPolicy());
    3349      Console.WriteLine("UCB1Normal");
    34       TestPolicyBernoulli(globalRand, nArms, new UCBNormalPolicy(nArms));
     50      TestPolicyBernoulli(randSeed, nArms, new UCBNormalPolicy());
    3551      Console.WriteLine("Eps(0.01)");
    36       TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01));
     52      TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.01));
    3753      Console.WriteLine("Eps(0.05)");
    38       TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05));
     54      TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.05));
    3955      //Console.WriteLine("Eps(0.1)");
    40       //TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.1));
     56      //TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.1));
    4157      //Console.WriteLine("Eps(0.2)");
    42       //TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.2));
     58      //TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.2));
    4359      //Console.WriteLine("Eps(0.5)");
    44       //TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.5));
    45       Console.WriteLine("UCT(0.1)"); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 0.1));
    46       Console.WriteLine("UCT(0.5)"); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 0.5));
    47       Console.WriteLine("UCT(1)  "); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 1));
    48       Console.WriteLine("UCT(2)  "); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 2));
    49       Console.WriteLine("UCT(5)  "); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 5));
    50       Console.WriteLine("BoltzmannExploration(0.1)"); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.1));
    51       Console.WriteLine("BoltzmannExploration(0.5)"); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.5));
    52       Console.WriteLine("BoltzmannExploration(1)  "); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1));
    53       Console.WriteLine("BoltzmannExploration(10) "); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10));
    54       Console.WriteLine("BoltzmannExploration(100)"); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100));
    55       Console.WriteLine("ChernoffIntervalEstimationPolicy(0.01)"); TestPolicyBernoulli(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.01));
    56       Console.WriteLine("ChernoffIntervalEstimationPolicy(0.05)"); TestPolicyBernoulli(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.05));
    57       Console.WriteLine("ChernoffIntervalEstimationPolicy(0.1) "); TestPolicyBernoulli(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.1));
     60      //TestPolicyBernoulli(randSeed, nArms, new EpsGreedyPolicy(0.5));
     61      Console.WriteLine("UCT(0.1)"); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(0.1));
     62      Console.WriteLine("UCT(0.5)"); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(0.5));
     63      Console.WriteLine("UCT(1)  "); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(1));
     64      Console.WriteLine("UCT(2)  "); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(2));
     65      Console.WriteLine("UCT(5)  "); TestPolicyBernoulli(randSeed, nArms, new UCTPolicy(5));
     66      Console.WriteLine("BoltzmannExploration(0.1)"); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(0.1));
     67      Console.WriteLine("BoltzmannExploration(0.5)"); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(0.5));
     68      Console.WriteLine("BoltzmannExploration(1)  "); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(1));
     69      Console.WriteLine("BoltzmannExploration(10) "); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(10));
     70      Console.WriteLine("BoltzmannExploration(100)"); TestPolicyBernoulli(randSeed, nArms, new BoltzmannExplorationPolicy(100));
     71      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.01)"); TestPolicyBernoulli(randSeed, nArms, new ChernoffIntervalEstimationPolicy(0.01));
     72      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.05)"); TestPolicyBernoulli(randSeed, nArms, new ChernoffIntervalEstimationPolicy(0.05));
     73      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.1) "); TestPolicyBernoulli(randSeed, nArms, new ChernoffIntervalEstimationPolicy(0.1));
    5874
    5975      // not applicable to bernoulli rewards
     
    7086
    7187    [TestMethod]
    72     public void ComparePoliciesForNormalBandit() {
    73       CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
    74 
    75       var globalRand = new Random(31415);
    76       var seedForPolicy = globalRand.Next();
    77       var nArms = 20;
    78       Console.WriteLine("Thompson (Gaussian orig)"); TestPolicyNormal(globalRand, nArms, new GaussianThompsonSamplingPolicy(new Random(seedForPolicy), nArms, true));
    79       Console.WriteLine("Thompson (Gaussian new)"); TestPolicyNormal(globalRand, nArms, new GaussianThompsonSamplingPolicy(new Random(seedForPolicy), nArms));
    80       Console.WriteLine("Generic Thompson (Gaussian)"); TestPolicyNormal(globalRand, nArms, new GenericThompsonSamplingPolicy(new Random(seedForPolicy), nArms, new GaussianModel(nArms, 0.5, 1)));
     88    public void ComparePoliciesForGaussianBandit() {
     89      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
     90
     91      var randSeed = 31415;
     92      var nArms = 20;
     93      Console.WriteLine("Thompson (Gaussian orig)"); TestPolicyGaussian(randSeed, nArms, new GaussianThompsonSamplingPolicy(true));
     94      Console.WriteLine("Thompson (Gaussian new)"); TestPolicyGaussian(randSeed, nArms, new GaussianThompsonSamplingPolicy());
     95      Console.WriteLine("Generic Thompson (Gaussian)"); TestPolicyGaussian(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 1)));
    8196      /*
    82       Console.WriteLine("Random"); TestPolicyNormal(globalRand, nArms, new RandomPolicy(new Random(seedForPolicy), nArms));
    83       Console.WriteLine("UCB1"); TestPolicyNormal(globalRand, nArms, new UCB1Policy(nArms));
    84       Console.WriteLine("UCB1Tuned"); TestPolicyNormal(globalRand, nArms, new UCB1TunedPolicy(nArms));
    85       Console.WriteLine("UCB1Normal"); TestPolicyNormal(globalRand, nArms, new UCBNormalPolicy(nArms));
     97      Console.WriteLine("Random"); TestPolicyNormal(randSeed, nArms, new RandomPolicy(new Random(seedForPolicy), nArms));
     98      Console.WriteLine("UCB1"); TestPolicyNormal(randSeed, nArms, new UCB1Policy(nArms));
     99      Console.WriteLine("UCB1Tuned"); TestPolicyNormal(randSeed, nArms, new UCB1TunedPolicy(nArms));
     100      Console.WriteLine("UCB1Normal"); TestPolicyNormal(randSeed, nArms, new UCBNormalPolicy(nArms));
    86101      //Console.WriteLine("Exp3 (gamma=0.01)");
    87       //TestPolicyNormal(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 0.01));
     102      //TestPolicyNormal(randSeed, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 0.01));
    88103      //Console.WriteLine("Exp3 (gamma=0.05)");
    89       //TestPolicyNormal(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 0.05));
    90       Console.WriteLine("Eps(0.01)"); TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01));
    91       Console.WriteLine("Eps(0.05)"); TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05));
     104      //TestPolicyNormal(randSeed, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 0.05));
     105      Console.WriteLine("Eps(0.01)"); TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01));
     106      Console.WriteLine("Eps(0.05)"); TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05));
    92107      //Console.WriteLine("Eps(0.1)");
    93       //TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.1));
     108      //TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.1));
    94109      //Console.WriteLine("Eps(0.2)");
    95       //TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.2));
     110      //TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.2));
    96111      //Console.WriteLine("Eps(0.5)");
    97       //TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.5));
    98       Console.WriteLine("UCT(0.1)"); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 0.1));
    99       Console.WriteLine("UCT(0.5)"); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 0.5));
    100       Console.WriteLine("UCT(1)  "); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 1));
    101       Console.WriteLine("UCT(2)  "); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 2));
    102       Console.WriteLine("UCT(5)  "); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 5));
    103       Console.WriteLine("BoltzmannExploration(0.1)"); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.1));
    104       Console.WriteLine("BoltzmannExploration(0.5)"); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.5));
    105       Console.WriteLine("BoltzmannExploration(1)  "); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1));
    106       Console.WriteLine("BoltzmannExploration(10) "); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10));
    107       Console.WriteLine("BoltzmannExploration(100)"); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100));
    108       Console.WriteLine("ChernoffIntervalEstimationPolicy(0.01)"); TestPolicyNormal(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.01));
    109       Console.WriteLine("ChernoffIntervalEstimationPolicy(0.05)"); TestPolicyNormal(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.05));
    110       Console.WriteLine("ChernoffIntervalEstimationPolicy(0.1) "); TestPolicyNormal(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.1));
    111       Console.WriteLine("ThresholdAscent(10,0.01)  "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01));
    112       Console.WriteLine("ThresholdAscent(10,0.05)  "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.05));
    113       Console.WriteLine("ThresholdAscent(10,0.1)   "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.1));
    114       Console.WriteLine("ThresholdAscent(100,0.01) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01));
    115       Console.WriteLine("ThresholdAscent(100,0.05) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.05));
    116       Console.WriteLine("ThresholdAscent(100,0.1)  "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.1));
    117       Console.WriteLine("ThresholdAscent(1000,0.01)"); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01));
    118       Console.WriteLine("ThresholdAscent(1000,0.05)"); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.05));
    119       Console.WriteLine("ThresholdAscent(1000,0.1) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.1));
     112      //TestPolicyNormal(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.5));
     113      Console.WriteLine("UCT(0.1)"); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 0.1));
     114      Console.WriteLine("UCT(0.5)"); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 0.5));
     115      Console.WriteLine("UCT(1)  "); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 1));
     116      Console.WriteLine("UCT(2)  "); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 2));
     117      Console.WriteLine("UCT(5)  "); TestPolicyNormal(randSeed, nArms, new UCTPolicy(nArms, 5));
     118      Console.WriteLine("BoltzmannExploration(0.1)"); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.1));
     119      Console.WriteLine("BoltzmannExploration(0.5)"); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.5));
     120      Console.WriteLine("BoltzmannExploration(1)  "); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1));
     121      Console.WriteLine("BoltzmannExploration(10) "); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10));
     122      Console.WriteLine("BoltzmannExploration(100)"); TestPolicyNormal(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100));
     123      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.01)"); TestPolicyNormal(randSeed, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.01));
     124      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.05)"); TestPolicyNormal(randSeed, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.05));
     125      Console.WriteLine("ChernoffIntervalEstimationPolicy(0.1) "); TestPolicyNormal(randSeed, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.1));
     126      Console.WriteLine("ThresholdAscent(10,0.01)  "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01));
     127      Console.WriteLine("ThresholdAscent(10,0.05)  "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10, 0.05));
     128      Console.WriteLine("ThresholdAscent(10,0.1)   "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10, 0.1));
     129      Console.WriteLine("ThresholdAscent(100,0.01) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01));
     130      Console.WriteLine("ThresholdAscent(100,0.05) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 100, 0.05));
     131      Console.WriteLine("ThresholdAscent(100,0.1)  "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 100, 0.1));
     132      Console.WriteLine("ThresholdAscent(1000,0.01)"); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01));
     133      Console.WriteLine("ThresholdAscent(1000,0.05)"); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.05));
     134      Console.WriteLine("ThresholdAscent(1000,0.1) "); TestPolicyNormal(randSeed, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.1));
    120135       */
    121136    }
     
    124139    public void ComparePoliciesForGaussianMixtureBandit() {
    125140      CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture;
    126 
    127       var globalRand = new Random(31415);
    128       var seedForPolicy = globalRand.Next();
    129       var nArms = 20;
    130       Console.WriteLine("Thompson (Gaussian orig)"); TestPolicyGaussianMixture(globalRand, nArms, new GaussianThompsonSamplingPolicy(new Random(seedForPolicy), nArms, true));
    131       Console.WriteLine("Thompson (Gaussian new)"); TestPolicyGaussianMixture(globalRand, nArms, new GaussianThompsonSamplingPolicy(new Random(seedForPolicy), nArms));
    132       Console.WriteLine("Generic Thompson (Gaussian)"); TestPolicyGaussianMixture(globalRand, nArms, new GenericThompsonSamplingPolicy(new Random(seedForPolicy), nArms, new GaussianModel(nArms, 0.5, 1)));
     141      var randSeed = 31415;
     142      var nArms = 20;
     143      Console.WriteLine("Thompson (Gaussian orig)"); TestPolicyGaussianMixture(randSeed, nArms, new GaussianThompsonSamplingPolicy(true));
     144      Console.WriteLine("Thompson (Gaussian new)"); TestPolicyGaussianMixture(randSeed, nArms, new GaussianThompsonSamplingPolicy());
     145      Console.WriteLine("Generic Thompson (Gaussian)"); TestPolicyGaussianMixture(randSeed, nArms, new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 1)));
    133146
    134147      /*
    135       Console.WriteLine("Random"); TestPolicyGaussianMixture(globalRand, nArms, new RandomPolicy(new Random(seedForPolicy), nArms));
    136       Console.WriteLine("UCB1"); TestPolicyGaussianMixture(globalRand, nArms, new UCB1Policy(nArms));
    137       Console.WriteLine("UCB1Tuned "); TestPolicyGaussianMixture(globalRand, nArms, new UCB1TunedPolicy(nArms));
    138       Console.WriteLine("UCB1Normal"); TestPolicyGaussianMixture(globalRand, nArms, new UCBNormalPolicy(nArms));
    139       Console.WriteLine("Eps(0.01) "); TestPolicyGaussianMixture(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01));
    140       Console.WriteLine("Eps(0.05) "); TestPolicyGaussianMixture(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05));
    141       Console.WriteLine("UCT(1)  "); TestPolicyGaussianMixture(globalRand, nArms, new UCTPolicy(nArms, 1));
    142       Console.WriteLine("UCT(2)  "); TestPolicyGaussianMixture(globalRand, nArms, new UCTPolicy(nArms, 2));
    143       Console.WriteLine("UCT(5)  "); TestPolicyGaussianMixture(globalRand, nArms, new UCTPolicy(nArms, 5));
    144       Console.WriteLine("BoltzmannExploration(1)  "); TestPolicyGaussianMixture(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1));
    145       Console.WriteLine("BoltzmannExploration(10) "); TestPolicyGaussianMixture(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10));
    146       Console.WriteLine("BoltzmannExploration(100)"); TestPolicyGaussianMixture(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100));
    147 
    148       Console.WriteLine("ThresholdAscent(10,0.01)  "); TestPolicyGaussianMixture(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01));
    149       Console.WriteLine("ThresholdAscent(100,0.01) "); TestPolicyGaussianMixture(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01));
    150       Console.WriteLine("ThresholdAscent(1000,0.01)"); TestPolicyGaussianMixture(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01));
    151       Console.WriteLine("ThresholdAscent(10000,0.01)"); TestPolicyGaussianMixture(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10000, 0.01));
     148      Console.WriteLine("Random"); TestPolicyGaussianMixture(randSeed, nArms, new RandomPolicy(new Random(seedForPolicy), nArms));
     149      Console.WriteLine("UCB1"); TestPolicyGaussianMixture(randSeed, nArms, new UCB1Policy(nArms));
     150      Console.WriteLine("UCB1Tuned "); TestPolicyGaussianMixture(randSeed, nArms, new UCB1TunedPolicy(nArms));
     151      Console.WriteLine("UCB1Normal"); TestPolicyGaussianMixture(randSeed, nArms, new UCBNormalPolicy(nArms));
     152      Console.WriteLine("Eps(0.01) "); TestPolicyGaussianMixture(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01));
     153      Console.WriteLine("Eps(0.05) "); TestPolicyGaussianMixture(randSeed, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05));
     154      Console.WriteLine("UCT(1)  "); TestPolicyGaussianMixture(randSeed, nArms, new UCTPolicy(nArms, 1));
     155      Console.WriteLine("UCT(2)  "); TestPolicyGaussianMixture(randSeed, nArms, new UCTPolicy(nArms, 2));
     156      Console.WriteLine("UCT(5)  "); TestPolicyGaussianMixture(randSeed, nArms, new UCTPolicy(nArms, 5));
     157      Console.WriteLine("BoltzmannExploration(1)  "); TestPolicyGaussianMixture(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1));
     158      Console.WriteLine("BoltzmannExploration(10) "); TestPolicyGaussianMixture(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10));
     159      Console.WriteLine("BoltzmannExploration(100)"); TestPolicyGaussianMixture(randSeed, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100));
     160
     161      Console.WriteLine("ThresholdAscent(10,0.01)  "); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01));
     162      Console.WriteLine("ThresholdAscent(100,0.01) "); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01));
     163      Console.WriteLine("ThresholdAscent(1000,0.01)"); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01));
     164      Console.WriteLine("ThresholdAscent(10000,0.01)"); TestPolicyGaussianMixture(randSeed, nArms, new ThresholdAscentPolicy(nArms, 10000, 0.01));
    152165       */
    153166    }
    154167
    155168
    156     private void TestPolicyBernoulli(Random globalRand, int nArms, IPolicy policy) {
    157       TestPolicy(globalRand, nArms, policy, (banditRandom, nActions) => new BernoulliBandit(banditRandom, nActions));
    158     }
    159     private void TestPolicyNormal(Random globalRand, int nArms, IPolicy policy) {
    160       TestPolicy(globalRand, nArms, policy, (banditRandom, nActions) => new TruncatedNormalBandit(banditRandom, nActions));
    161     }
    162     private void TestPolicyGaussianMixture(Random globalRand, int nArms, IPolicy policy) {
    163       TestPolicy(globalRand, nArms, policy, (banditRandom, nActions) => new GaussianMixtureBandit(banditRandom, nActions));
    164     }
    165 
    166 
    167     private void TestPolicy(Random globalRand, int nArms, IPolicy policy, Func<Random, int, IBandit> banditFactory) {
     169    private void TestPolicyBernoulli(int randSeed, int nArms, IPolicy policy) {
     170      TestPolicy(randSeed, nArms, policy, (banditRandom, nActions) => new BernoulliBandit(banditRandom, nActions));
     171    }
     172    private void TestPolicyGaussian(int randSeed, int nArms, IPolicy policy) {
     173      TestPolicy(randSeed, nArms, policy, (banditRandom, nActions) => new TruncatedNormalBandit(banditRandom, nActions));
     174    }
     175    private void TestPolicyGaussianMixture(int randSeed, int nArms, IPolicy policy) {
     176      TestPolicy(randSeed, nArms, policy, (banditRandom, nActions) => new GaussianMixtureBandit(banditRandom, nActions));
     177    }
     178    private void TestPolicyGaussianUnknownVariance(int randSeed, int nArms, IPolicy policy) {
     179      TestPolicy(randSeed, nArms, policy, (banditRandom, nActions) => new GaussianBandit(banditRandom, nActions));
     180    }
     181
     182
     183    private void TestPolicy(int randSeed, int nArms, IPolicy policy, Func<Random, int, IBandit> banditFactory) {
    168184      var maxIt = 1E5;
    169       var reps = 30; // independent runs
     185      var reps = 10; // independent runs
    170186      var regretForIteration = new Dictionary<int, List<double>>();
    171187      var numberOfPullsOfSuboptimalArmsForExp = new Dictionary<int, double>();
    172188      var numberOfPullsOfSuboptimalArmsForMax = new Dictionary<int, double>();
     189      var globalRandom = new Random(randSeed);
     190      var banditRandom = new Random(globalRandom.Next()); // bandits must produce the same rewards for each test
     191      var policyRandom = new Random(globalRandom.Next());
     192
    173193      // calculate statistics
    174194      for (int r = 0; r < reps; r++) {
    175195        var nextLogStep = 1;
    176         var b = banditFactory(new Random(globalRand.Next()), nArms);
    177         policy.Reset();
     196        var b = banditFactory(banditRandom, nArms);
    178197        var totalRegret = 0.0;
    179198        var totalPullsOfSuboptimalArmsExp = 0.0;
    180199        var totalPullsOfSuboptimalArmsMax = 0.0;
     200        var actionInfos = Enumerable.Range(0, nArms).Select(_ => policy.CreateActionInfo()).ToArray();
    181201        for (int i = 0; i <= maxIt; i++) {
    182           var selectedAction = policy.SelectAction();
     202          var selectedAction = policy.SelectAction(policyRandom, actionInfos);
    183203          var reward = b.Pull(selectedAction);
    184           policy.UpdateReward(selectedAction, reward);
     204          actionInfos[selectedAction].UpdateReward(reward);
    185205
    186206          // collect stats
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/EvenParityProblem.cs

    r11727 r11732  
    2020    private readonly ExpressionInterpreter interpreter = new ExpressionInterpreter();
    2121    public EvenParityProblem() {
    22       this.grammar = new Grammar(grammarString);
     22      this.grammar = new Grammar (grammarString);
    2323    }
    2424
    25     public double GetBestKnownQuality(int maxLen) {
     25    public double BestKnownQuality(int maxLen) {
    2626      // for now only an upper bound is returned, ideally all fitness cases are predicted correctly
    2727      return Math.Pow(2, 4);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/ExpressionInterpreter.cs

    r11727 r11732  
    5454      var r = 0.0;
    5555      r = Term(d);
    56       while (CurSy() == '+' || CurSy() == '-' || CurSy() == '^') {
    57         if (CurSy() == '+') {
     56      var curSy = CurSy();
     57      while (curSy == '+' || curSy == '-' || curSy == '^') {
     58        if (curSy == '+') {
    5859          NewSy();
    5960          r += Expr(d);
    60         } else if (CurSy() == '-') {
     61        } else if (curSy == '-') {
    6162          NewSy();
    6263          r -= Expr(d);
    63         } else if (CurSy() == '^') {
     64        } else {
    6465          NewSy();
    6566          var e = Expr(d);
    6667          r = Not(r) * e + r * Not(e); // xor = (!x AND y) OR (x AND !y)
    6768        }
     69        curSy = CurSy();
    6870      }
    6971      return r;
     
    7375      var r = 0.0;
    7476      r = Fact(d);
    75       while (CurSy() == '*' || CurSy() == '/') {
    76         if (CurSy() == '*') {
     77      var curSy = CurSy();
     78      while (curSy == '*' || curSy == '/') {
     79        if (curSy == '*') {
    7780          NewSy();
    7881          r *= Term(d);
    79         } else if (CurSy() == '/') {
     82        } else {
    8083          NewSy();
    8184          r /= Term(d);
    8285        }
     86        curSy = CurSy();
    8387      }
    8488      return r;
     
    8791    private double Fact(double[] d) {
    8892      double r = 0.0;
    89       if (CurSy() == '!') {
     93      var curSy = CurSy();
     94      if (curSy == '!') {
    9095        NewSy();
    9196        r = Not(Expr(d));
    92       } else if (CurSy() == '(') {
     97      } else if (curSy == '(') {
    9398        NewSy();
    9499        r = Expr(d);
    95100        if (CurSy() != ')') throw new ArgumentException();
    96101        NewSy();
    97       } else if (CurSy() >= 'a' && CurSy() <= 'z') {
    98         int o = Convert.ToByte(CurSy()) - Convert.ToByte('a');
     102      } else /* if (curSy >= 'a' && curSy <= 'z') */ {
     103        int o = (byte)curSy - (byte)'a';
     104        //int o = Convert.ToByte(CurSy()) - Convert.ToByte('a');
    99105        if (o < 0 || o >= d.Length) throw new ArgumentException();
    100106        r = d[o];
    101107        NewSy();
    102       } else throw new ArgumentException();
     108      }
     109      //} else throw new ArgumentException();
    103110      return r;
    104111    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/Grammar.cs

    r11730 r11732  
    3535      this.rules = new Dictionary<char, List<Sequence>>();
    3636      foreach (var r in orig.rules)
    37         this.rules.Add(r.Key, new List<Sequence>(r.Value.Select(v => new Sequence(v)))); // clone sequences
     37        this.rules.Add(r.Key, new List<Sequence>(r.Value.Select(v => new ReadonlySequence(v)))); // clone sequences
    3838      this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
    3939      this.sentenceSymbol = orig.sentenceSymbol;
    4040      this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
    4141      this.maxPhraseLength = new Dictionary<Sequence, int>();
    42       foreach (var p in orig.maxPhraseLength) this.maxPhraseLength.Add(new Sequence(p.Key), p.Value);
     42      foreach (var p in orig.maxPhraseLength) this.maxPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
    4343      this.minPhraseLength = new Dictionary<Sequence, int>();
    44       foreach (var p in orig.minPhraseLength) this.minPhraseLength.Add(new Sequence(p.Key), p.Value);
     44      foreach (var p in orig.minPhraseLength) this.minPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
    4545    }
    4646
     
    5454      foreach (var r in rules) {
    5555        if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<Sequence>());
    56         this.rules[r.Item1].Add(new Sequence(r.Item2)); // here we store an array of symbols for a phase
     56        this.rules[r.Item1].Add(new ReadonlySequence(r.Item2)); // here we store an array of symbols for a phrase
    5757      }
    5858
    5959      CheckValidity();
    60       CalculatePhaseLengthBounds();
     60      CalculatePhraseLengthBounds();
    6161    }
    6262
     
    7676    }
    7777
    78     private void CalculatePhaseLengthBounds() {
    79       minPhraseLength.Clear();
    80       maxPhraseLength.Clear();
     78    private void CalculatePhraseLengthBounds() {
    8179      // cache phrase lengths
    8280      foreach (var nt in nonTerminalSymbols) {
     
    8482        var max = int.MinValue;
    8583        foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
    86           minPhraseLength[alt] = MinPhraseLength(alt);
    87           maxPhraseLength[alt] = MaxPhraseLength(alt);
     84          CalcAndSetMinPhraseLength(alt);
     85          CalcAndSetMaxPhraseLength(alt);
    8886
    8987          min = Math.Min(min, minPhraseLength[alt]);
    9088          max = Math.Max(max, maxPhraseLength[alt]);
    9189        }
    92         minPhraseLength[new Sequence(nt)] = min;
    93         maxPhraseLength[new Sequence(nt)] = max;
    94       }
    95     }
    96 
     90        minPhraseLength[new ReadonlySequence(nt)] = min;
     91        maxPhraseLength[new ReadonlySequence(nt)] = max;
     92      }
     93    }
    9794
    9895    public IEnumerable<Sequence> GetAlternatives(char nt) {
     
    108105    }
    109106
    110     // caches for this are build in construction of object
    111     public int MinPhraseLength(Sequence phrase) {
     107    #region population of minphraselength cache
     108    private int CalcAndSetMinSymbolLength(char symb) {
     109      if (IsTerminal(symb)) return 1;
     110      else return Math.Min(short.MaxValue, rules[symb].Min(alt => CalcAndSetMinPhraseLength(alt))); // maximal allowed value is short.MaxValue
     111    }
     112    private int CalcAndSetMinPhraseLength(Sequence phrase) {
     113      Debug.Assert(phrase is ReadonlySequence);
    112114      int l;
    113115      if (minPhraseLength.TryGetValue(phrase, out l)) return l;
     
    116118
    117119      foreach (var symb in phrase) {
    118         if (IsNonTerminal(symb)) {
    119           l += MinSymbolLength(symb);
    120         } else {
    121           l++;
    122         }
     120        l += CalcAndSetMinSymbolLength(symb);
    123121      }
    124122
     
    127125      return l;
    128126    }
    129 
     127    #endregion
     128
     129    // read only access to caches
     130    private int MinSymbolLength(char symb) {
     131      if (IsTerminal(symb)) return 1;
     132      else return Math.Min(short.MaxValue, rules[symb].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
     133    }
     134    public int MinPhraseLength(Sequence phrase) {
     135      var minLen = 0;
     136      if (minPhraseLength.TryGetValue(phrase, out minLen)) return minLen;
     137      foreach (var s in phrase) {
     138        minLen += MinSymbolLength(s);
     139      }
     140      return Math.Min(short.MaxValue, minLen); // maximal allowed value is short.MaxValue
     141    }
     142
     143    #region population of maxphraselength cache
     144    private int CalcAndSetMaxSymbolLength(char symb) {
     145      if (IsTerminal(symb)) return 1;
     146      else return Math.Min(short.MaxValue, rules[symb].Max(alt => CalcAndSetMaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
     147    }
    130148    // caches for this are built in the construction of the object
    131     public int MaxPhraseLength(Sequence phrase) {
     149    private int CalcAndSetMaxPhraseLength(Sequence phrase) {
     150      Debug.Assert(phrase is ReadonlySequence);
    132151      int l;
    133152      if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
     
    136155
    137156      foreach (var symb in phrase) {
    138         if (IsNonTerminal(symb)) {
    139           l += MaxSymbolLength(symb);
    140         } else {
    141           l++;
    142         }
     157        l += CalcAndSetMaxSymbolLength(symb);
    143158      }
    144159      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
     
    146161      return l;
    147162    }
    148 
    149     private int MinSymbolLength(char nt) {
    150       if (IsTerminal(nt)) return 1;
    151       else return Math.Min(short.MaxValue, rules[nt].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
    152     }
    153     private int MaxSymbolLength(char nt) {
    154       if (IsTerminal(nt)) return 1;
    155       else return Math.Min(short.MaxValue, rules[nt].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
     163    #endregion
     164
     165    // read only access to caches
     166    public int MaxSymbolLength(char symb) {
     167      if (IsTerminal(symb)) return 1;
     168      else return Math.Min(short.MaxValue, rules[symb].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
     169    }
     170    public int MaxPhraseLength(Sequence phrase) {
     171      var maxLen = 0;
     172      if (maxPhraseLength.TryGetValue(phrase, out maxLen)) return maxLen;
     173      foreach (var s in phrase) {
     174        maxLen += MaxSymbolLength(s);
     175      }
     176      return Math.Min(short.MaxValue, maxLen); // maximal allowed value is short.MaxValue
    156177    }
    157178
     
    159180      if (phrase.Length > maxLen) throw new ArgumentException();
    160181      if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
    161       bool done = phrase.IsTerminal; // terminal phrase means we are done
    162       while (!done) {
     182      while (!phrase.IsTerminal) {
    163183        char nt = phrase.FirstNonTerminal;
    164184
     
    173193        phrase.ReplaceAt(phrase.FirstNonTerminalIndex, 1, selectedAlt);
    174194
    175         done = phrase.All(IsTerminal); // terminal phrase means we are done
    176195      }
    177196      return phrase;
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/HardPalindromeProblem.cs

    r11727 r11732  
    1818    }
    1919
    20     public double GetBestKnownQuality(int maxLen) {
     20    public double BestKnownQuality(int maxLen) {
    2121      // the whole sentence is a palindrome + each symbol occurs only once or twice
    2222      // for odd total length the number of different characters can be larger than len/2 (aba)
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization.csproj

    r11730 r11732  
    4646    <Compile Include="Grammar.cs" />
    4747    <Compile Include="EvenParityProblem.cs" />
     48    <Compile Include="ReadonlySequence.cs" />
    4849    <Compile Include="SentenceSetStatistics.cs" />
    4950    <Compile Include="Sequence.cs" />
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/IProblem.cs

    r11727 r11732  
    66namespace HeuristicLab.Problems.GrammaticalOptimization {
    77  public interface IProblem {
    8     double GetBestKnownQuality(int maxLen);
     8    double BestKnownQuality(int maxLen);
    99    IGrammar Grammar { get; }
    1010    double Evaluate(string sentence);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/PalindromeProblem.cs

    r11727 r11732  
    1818    }
    1919
    20     public double GetBestKnownQuality(int maxLen) {
     20    public double BestKnownQuality(int maxLen) {
    2121      // the whole sentence is a palindrome
    2222      return maxLen;
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/RoyalPairProblem.cs

    r11727 r11732  
    1919    }
    2020
    21     public double GetBestKnownQuality(int maxLen) {
     21    public double BestKnownQuality(int maxLen) {
    2222      return maxLen - 1;
    2323    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/RoyalRoadProblem.cs

    r11727 r11732  
    1717    }
    1818
    19     public double GetBestKnownQuality(int maxLen) {
     19    public double BestKnownQuality(int maxLen) {
    2020      // for now only an upper bound is returned, ideally all fitness cases are predicted correctly
    2121      throw new NotImplementedException();
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/RoyalSymbolProblem.cs

    r11727 r11732  
    1919    }
    2020
    21     public double GetBestKnownQuality(int maxLen) {
     21    public double BestKnownQuality(int maxLen) {
    2222      return maxLen;
    2323    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/RoyalTreeProblem.cs

    r11727 r11732  
    1717    }
    1818
    19     public double GetBestKnownQuality(int maxLen) {
     19    public double BestKnownQuality(int maxLen) {
    2020      // for now only an upper bound is returned, ideally all fitness cases are predicted correctly
    2121      throw new NotImplementedException();
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/SantaFeAntProblem.cs

    r11730 r11732  
    2323    }
    2424
    25     public double GetBestKnownQuality(int maxLen) {
     25    public double BestKnownQuality(int maxLen) {
    2626      // for now only an upper bound is returned, ideally all food pellets are discovered
    2727      return 89;
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/Sequence.cs

    r11730 r11732  
    8484    }
    8585
    86     public void ReplaceAt(int position, int len, Sequence replacement) {
     86    public virtual void ReplaceAt(int position, int len, Sequence replacement) {
    8787      if (replacement == null) throw new ArgumentNullException();
    8888      if (len <= 0) throw new ArgumentException();
     
    125125
    126126    public Sequence Subsequence(int startIdx, int len) {
    127       if (startIdx < 0 || len <= 0) throw new ArgumentException();
     127      if (startIdx < 0 || len < 0) throw new ArgumentException();
    128128      if (startIdx >= this.len) throw new ArgumentException();
    129129      if (startIdx + len > this.len) throw new ArgumentException();
    130       var subsequence = new Sequence {len = len};
     130      var subsequence = new Sequence { len = len };
    131131
    132132      Array.Copy(this.symbols, startIdx, subsequence.symbols, 0, len);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/SymbolicRegressionPoly10Problem.cs

    r11730 r11732  
    5656    }
    5757
    58     public double GetBestKnownQuality(int maxLen) {
     58    public double BestKnownQuality(int maxLen) {
    5959      // for now only an upper bound is returned, ideally we have an R² of 1.0
    6060      // the optimal R² can only be reached for sentences of at least 23 symbols
     
    6767
    6868    public double Evaluate(string sentence) {
    69       return RSq(y, Enumerable.Range(0, N).Select(i => interpreter.Interpret(sentence, x[i])).ToArray());
     69      return HeuristicLab.Common.Extensions.RSq(y, Enumerable.Range(0, N).Select(i => interpreter.Interpret(sentence, x[i])).ToArray());
    7070    }
    7171
    72     private double RSq(IEnumerable<double> xs, IEnumerable<double> ys) {
    73       // two pass implementation, but we don't care
    74       var meanX = xs.Average();
    75       var meanY = ys.Average();
    76 
    77       var s = 0.0;
    78       var ssX = 0.0;
    79       var ssY = 0.0;
    80       foreach (var p in xs.Zip(ys, (x, y) => new { x, y })) {
    81         s += (p.x - meanX) * (p.y - meanY);
    82         ssX += Math.Pow(p.x - meanX, 2);
    83         ssY += Math.Pow(p.y - meanY, 2);
    84       }
    85 
    86       if (s.IsAlmost(0)) return 0;
    87       if (ssX.IsAlmost(0) || ssY.IsAlmost(0)) return 0;
    88       return s * s / (ssX * ssY);
    89     }
    9072
    9173
  • branches/HeuristicLab.Problems.GrammaticalOptimization/Main/Main.csproj

    r11727 r11732  
    5757      <Name>HeuristicLab.Algorithms.GrammaticalOptimization</Name>
    5858    </ProjectReference>
     59    <ProjectReference Include="..\HeuristicLab.Problems.GrammaticalOptimization.SymbReg\HeuristicLab.Problems.GrammaticalOptimization.SymbReg.csproj">
     60      <Project>{17A7A380-86CE-482D-8D22-CBD70CC97F0D}</Project>
     61      <Name>HeuristicLab.Problems.GrammaticalOptimization.SymbReg</Name>
     62    </ProjectReference>
    5963    <ProjectReference Include="..\HeuristicLab.Problems.GrammaticalOptimization\HeuristicLab.Problems.GrammaticalOptimization.csproj">
    6064      <Project>{cb9dccf6-667e-4a13-b82d-dbd6b45a045e}</Project>
  • branches/HeuristicLab.Problems.GrammaticalOptimization/Main/Program.cs

    r11730 r11732  
    1111using HeuristicLab.Algorithms.GrammaticalOptimization;
    1212using HeuristicLab.Problems.GrammaticalOptimization;
     13using HeuristicLab.Problems.GrammaticalOptimization.SymbReg;
    1314
    1415namespace Main {
     
    2223
    2324    private static void RunGridTest() {
    24       int maxIterations = 100000; // for poly-10 with 50000 evaluations no successful try with hl yet
    25       // var globalRandom = new Random(31415);
     25      int maxIterations = 200000; // for poly-10 with 50000 evaluations no successful try with hl yet
     26      //var globalRandom = new Random(31415);
    2627      var localRandSeed = 31415;
    2728      var reps = 20;
    2829
    29       var policyFactories = new Func<Random, int, IPolicy>[]
     30      var policies = new Func<IPolicy>[]
    3031        {
    31           (rand, numActions) => new GaussianThompsonSamplingPolicy(rand, numActions),
    32           (rand, numActions) => new BernoulliThompsonSamplingPolicy(rand, numActions),
    33           (rand, numActions) => new RandomPolicy(rand, numActions),
    34           (rand, numActions) => new EpsGreedyPolicy(rand, numActions, 0.01),
    35           (rand, numActions) => new EpsGreedyPolicy(rand, numActions, 0.05),
    36           (rand, numActions) => new EpsGreedyPolicy(rand, numActions, 0.1),
    37           (rand, numActions) => new EpsGreedyPolicy(rand, numActions, 0.2),
    38           (rand, numActions) => new EpsGreedyPolicy(rand, numActions, 0.5),
    39           (rand, numActions) => new UCTPolicy(numActions, 0.1),
    40           (rand, numActions) => new UCTPolicy(numActions, 0.5),
    41           (rand, numActions) => new UCTPolicy(numActions, 1),
    42           (rand, numActions) => new UCTPolicy(numActions, 2),
    43           (rand, numActions) => new UCTPolicy(numActions, 5),
    44           (rand, numActions) => new UCTPolicy(numActions, 10),
    45           (rand, numActions) => new UCB1Policy(numActions),
    46           (rand, numActions) => new UCB1TunedPolicy(numActions),
    47           (rand, numActions) => new UCBNormalPolicy(numActions),
    48           (rand, numActions) => new BoltzmannExplorationPolicy(rand, numActions, 0.1),
    49           (rand, numActions) => new BoltzmannExplorationPolicy(rand, numActions, 0.5),
    50           (rand, numActions) => new BoltzmannExplorationPolicy(rand, numActions, 1),
    51           (rand, numActions) => new BoltzmannExplorationPolicy(rand, numActions, 5),
    52           (rand, numActions) => new BoltzmannExplorationPolicy(rand, numActions, 10),
    53           (rand, numActions) => new BoltzmannExplorationPolicy(rand, numActions, 20),
    54           (rand, numActions) => new BoltzmannExplorationPolicy(rand, numActions, 100),
    55           (rand, numActions) => new ChernoffIntervalEstimationPolicy(numActions, 0.01),
    56           (rand, numActions) => new ChernoffIntervalEstimationPolicy(numActions, 0.05),
    57           (rand, numActions) => new ChernoffIntervalEstimationPolicy(numActions, 0.1),
    58           (rand, numActions) => new ChernoffIntervalEstimationPolicy(numActions, 0.2),
    59           (rand, numActions) => new ThresholdAscentPolicy(numActions, 10, 0.01),
    60           (rand, numActions) => new ThresholdAscentPolicy(numActions, 10, 0.05),
    61           (rand, numActions) => new ThresholdAscentPolicy(numActions, 10, 0.1),
    62           (rand, numActions) => new ThresholdAscentPolicy(numActions, 10, 0.2),
    63           (rand, numActions) => new ThresholdAscentPolicy(numActions, 100, 0.01),
    64           (rand, numActions) => new ThresholdAscentPolicy(numActions, 100, 0.05),
    65           (rand, numActions) => new ThresholdAscentPolicy(numActions, 100, 0.1),
    66           (rand, numActions) => new ThresholdAscentPolicy(numActions, 100, 0.2),
    67           (rand, numActions) => new ThresholdAscentPolicy(numActions, 1000, 0.01),
    68           (rand, numActions) => new ThresholdAscentPolicy(numActions, 1000, 0.05),
    69           (rand, numActions) => new ThresholdAscentPolicy(numActions, 1000, 0.1),
    70           (rand, numActions) => new ThresholdAscentPolicy(numActions, 1000, 0.2),
    71           (rand, numActions) => new ThresholdAscentPolicy(numActions, 5000, 0.01),
    72           (rand, numActions) => new ThresholdAscentPolicy(numActions, 10000, 0.01),
     32         () => new GaussianThompsonSamplingPolicy(),
     33         () => new GaussianThompsonSamplingPolicy(true),
     34         () => new GenericThompsonSamplingPolicy(new GaussianModel(0.5, 1)),
     35         () => new BernoulliThompsonSamplingPolicy(),
     36         () => new GenericThompsonSamplingPolicy(new BernoulliModel(1, 1)),
     37         () => new RandomPolicy(),
     38         () => new EpsGreedyPolicy(0.01),
     39         () => new EpsGreedyPolicy(0.05),
     40         () => new EpsGreedyPolicy(0.1),
     41         () => new EpsGreedyPolicy(0.2),
     42         () => new EpsGreedyPolicy(0.5),
     43         () => new UCTPolicy(0.1),
     44         () => new UCTPolicy(0.5),
     45         () => new UCTPolicy(1),
     46         () => new UCTPolicy(2),
     47         () => new UCTPolicy( 5),
     48         () => new UCTPolicy( 10),
     49         () => new UCB1Policy(),
     50         () => new UCB1TunedPolicy(),
     51         () => new UCBNormalPolicy(),
     52         () => new BoltzmannExplorationPolicy(0.1),
     53         () => new BoltzmannExplorationPolicy(0.5),
     54         () => new BoltzmannExplorationPolicy(1),
     55         () => new BoltzmannExplorationPolicy(5),
     56         () => new BoltzmannExplorationPolicy(10),
     57         () => new BoltzmannExplorationPolicy(20),
     58         () => new BoltzmannExplorationPolicy(100),
     59         () => new ChernoffIntervalEstimationPolicy( 0.01),
     60         () => new ChernoffIntervalEstimationPolicy( 0.05),
     61         () => new ChernoffIntervalEstimationPolicy( 0.1),
     62         () => new ChernoffIntervalEstimationPolicy( 0.2),
     63         // (rand) => new ThresholdAscentPolicy(10, 0.01),
     64         // (rand) => new ThresholdAscentPolicy(10, 0.05),
     65         // (rand) => new ThresholdAscentPolicy(10, 0.1),
     66         // (rand) => new ThresholdAscentPolicy(10, 0.2),
     67         // (rand) => new ThresholdAscentPolicy(100, 0.01),
     68         // (rand) => new ThresholdAscentPolicy(100, 0.05),
     69         // (rand) => new ThresholdAscentPolicy(100, 0.1),
     70         // (rand) => new ThresholdAscentPolicy(100, 0.2),
     71         // (rand) => new ThresholdAscentPolicy(1000, 0.01),
     72         // (rand) => new ThresholdAscentPolicy(1000, 0.05),
     73         // (rand) => new ThresholdAscentPolicy(1000, 0.1),
     74         // (rand) => new ThresholdAscentPolicy(1000, 0.2),
     75         // (rand) => new ThresholdAscentPolicy(5000, 0.01),
     76         // (rand) => new ThresholdAscentPolicy(10000, 0.01),
    7377        };
    7478
    75       var tasks = new List<Task>();
    76       foreach (var randomTries in new int[] { 1, 10, /* 5, 100 /*, 500, 1000 */}) {
    77         foreach (var policyFactory in policyFactories) {
    78           var myPolicyFactory = policyFactory;
    79           var myRandomTries = randomTries;
    80           var localRand = new Random(localRandSeed);
    81           var options = new ParallelOptions();
    82           options.MaxDegreeOfParallelism = 1;
    83           Parallel.For(0, reps, options, (i) => {
    84             //var t = Task.Run(() => {
    85             Random myLocalRand;
    86             lock (localRand)
    87               myLocalRand = new Random(localRand.Next());
    88 
    89             //for (int i = 0; i < reps; i++) {
    90 
    91             int iterations = 0;
    92             var sw = new Stopwatch();
    93             var globalStatistics = new SentenceSetStatistics();
    94 
    95             var problem = new SymbolicRegressionPoly10Problem();
    96             //var problem = new SantaFeAntProblem();
    97             //var problem = new PalindromeProblem();
    98             //var problem = new HardPalindromeProblem();
    99             //var problem = new RoyalPairProblem();
    100             //var problem = new EvenParityProblem();
    101             var alg = new MctsSampler(problem, 25, myLocalRand, myRandomTries, myPolicyFactory);
    102             //var alg = new ExhaustiveBreadthFirstSearch(problem, 25);
    103             //var alg = new AlternativesContextSampler(problem, 25);
    104 
    105             alg.SolutionEvaluated += (sentence, quality) => {
    106               iterations++;
    107               globalStatistics.AddSentence(sentence, quality);
    108               if (iterations % 10000 == 0) {
    109                 Console.WriteLine("{0,4} {1,7} {2,5} {3,25} {4}", alg.treeDepth, alg.treeSize, myRandomTries, myPolicyFactory(myLocalRand, 1), globalStatistics);
    110               }
    111             };
    112 
    113             sw.Start();
    114 
    115             alg.Run(maxIterations);
    116 
    117             sw.Stop();
    118             //Console.WriteLine("{0,5} {1} {2}", randomTries, policyFactory(1), globalStatistics);
    119             //}
    120             //});
    121             //tasks.Add(t);
    122           });
     79      foreach (var problem in new Tuple<IProblem, int>[]
     80        {
     81          Tuple.Create((IProblem)new SantaFeAntProblem(), 17),
     82          Tuple.Create((IProblem)new SymbolicRegressionPoly10Problem(), 23),
     83        })
     84        foreach (var randomTries in new int[] { 1, 10, /* 5, 100 /*, 500, 1000 */}) {
     85          foreach (var policy in policies) {
     86            var myRandomTries = randomTries;
     87            var localRand = new Random(localRandSeed);
     88            var options = new ParallelOptions();
     89            options.MaxDegreeOfParallelism = 1;
     90            Parallel.For(0, reps, options, (i) => {
     91              //var t = Task.Run(() => {
     92              Random myLocalRand;
     93              lock (localRand)
     94                myLocalRand = new Random(localRand.Next());
     95
     96              //for (int i = 0; i < reps; i++) {
     97
     98              int iterations = 0;
     99              var globalStatistics = new SentenceSetStatistics();
     100
     101              // var problem = new SymbolicRegressionPoly10Problem();
     102              // var problem = new SantaFeAntProblem();
     103              //var problem = new PalindromeProblem();
     104              //var problem = new HardPalindromeProblem();
     105              //var problem = new RoyalPairProblem();
     106              //var problem = new EvenParityProblem();
     107              var alg = new MctsSampler(problem.Item1, problem.Item2, myLocalRand, myRandomTries, policy()); // TODO: Make sure we generate the same random numbers for each experiment
     108              //var alg = new ExhaustiveBreadthFirstSearch(problem, 25);
     109              //var alg = new AlternativesContextSampler(problem, 25);
     110
     111              alg.SolutionEvaluated += (sentence, quality) => {
     112                iterations++;
     113                globalStatistics.AddSentence(sentence, quality);
     114                if (iterations % 10000 == 0) {
     115                  Console.WriteLine("{0,4} {1,7} {2,5} {3,25} {4}", alg.treeDepth, alg.treeSize, myRandomTries, policy(), globalStatistics);
     116                }
     117              };
     118
     119
     120              alg.Run(maxIterations);
     121
     122              //Console.WriteLine("{0,5} {1} {2}", randomTries, policyFactory(1), globalStatistics);
     123              //}
     124              //});
     125              //tasks.Add(t);
     126            });
     127          }
    123128        }
    124       }
    125129      //Task.WaitAll(tasks.ToArray());
    126130    }
    127131
    128132    private static void RunDemo() {
     133      // TODO: test with eps-greedy using max instead of average as value (seems to work well for symb-reg! explore further!)
     134      // TODO: implement GaussianWithUnknownMeanAndVariance Model for Thompson Sampling (verify with unit test if correct mean and variance is identified)
     135      // TODO: separate value function from policy
     136      // TODO: debug and verify implementation variants of Gaussian Thompson Sampling with unit test
     137      // TODO: refactor Policies to use banditInfos (policies are factories for bandit infos and bandit info only has an update routine, each policy works only with it's type of banditinfo)
     138      // TODO: in contextual MCTS store a bandit info for each node in the _graph_ and also update all bandit infos of all parents
     139      // TODO: exhaustive search with priority list
    129140      // TODO: warum funktioniert die alte Implementierung von GaussianThompson besser für SantaFe als die neue? Siehe Vergleich: alte vs. neue implementierung GaussianThompsonSampling
    130141      // TODO: why does GaussianThompsonSampling work so well with MCTS for the artificial ant problem?
     
    133144      // TODO: research thompson sampling for max bandit?
    134145      // TODO: ausführlicher test von strategien für k-armed max bandit
    135       // TODO: verify TA implementation using example from the original paper
    136       // TODO: reference HL.ProblemInstances and try on tower dataset
     146      // TODO: verify TA implementation using example from the original paper     
    137147      // TODO: compare results for different policies also for the symb-reg problem
    138148      // TODO: separate policy from MCTS tree data structure to allow sharing of information over disconnected parts of the tree (semantic equivalence)
     
    144154      // TODO: vergleich bei complete-randomly möglichst kurze sätze generieren vs. einfach zufällig alternativen wählen
    145155      // TODO: reward discounting (für veränderliche reward distributions über zeit). speziellen unit-test dafür erstellen
    146 
    147 
    148       int maxIterations = 10000000;
     156      // TODO: constant optimization
     157
     158
     159      int maxIterations = 100000;
    149160      int iterations = 0;
    150161      var sw = new Stopwatch();
     
    154165      var random = new Random();
    155166
    156       //var problem = new SymbolicRegressionPoly10Problem();
    157       var problem = new SantaFeAntProblem();
     167      var problem = new SymbolicRegressionPoly10Problem();
     168      //var problem = new SantaFeAntProblem(); // good results e.g. with       var alg = new MctsSampler(problem, 17, random, 1, (rand, numActions) => new ThresholdAscentPolicy(numActions, 500, 0.01));
     169      //var problem = new SymbolicRegressionProblem("Tower"); // very good results e.g. new EpsGreedyPolicy(0.2) using max reward as quality !!!
    158170      //var problem = new PalindromeProblem();
    159171      //var problem = new HardPalindromeProblem();
    160172      //var problem = new RoyalPairProblem();
    161173      //var problem = new EvenParityProblem();
    162       //var alg = new MctsSampler(problem, 17, random, 1, (rand, numActions) => new GenericThompsonSamplingPolicy(rand, numActions, new GaussianModel(numActions, 0.5, 10)));
     174      var alg = new MctsSampler(problem, 23, random, 10, new EpsGreedyPolicy(0.2)); // GaussianModelWithUnknownVariance (and Q= 0.99-quantil) works well for Ant
    163175      //var alg = new ExhaustiveBreadthFirstSearch(problem, 17);
    164176      //var alg = new AlternativesContextSampler(problem, random, 17, 4, (rand, numActions) => new RandomPolicy(rand, numActions));
    165177      //var alg = new ExhaustiveDepthFirstSearch(problem, 17);
    166178      // var alg = new AlternativesSampler(problem, 17);
    167       var alg = new RandomSearch(problem, random, 17);
     179      // var alg = new RandomSearch(problem, random, 17);
     180      // var alg = new ExhaustiveRandomFirstSearch(problem, random, 17);
    168181
    169182      alg.FoundNewBestSolution += (sentence, quality) => {
    170183        bestQuality = quality;
    171184        bestSentence = sentence;
    172         Console.WriteLine("{0,10} {1,10:F5} {2,10:F5} {3}", iterations, bestQuality, quality, sentence);
     185        Console.WriteLine("{0,4} {1,7} {2}", alg.treeDepth, alg.treeSize, globalStatistics);
    173186      };
    174187      alg.SolutionEvaluated += (sentence, quality) => {
     
    176189        globalStatistics.AddSentence(sentence, quality);
    177190        if (iterations % 1000 == 0) {
    178           //alg.PrintStats();
     191          alg.PrintStats();
    179192        }
    180193        if (iterations % 10000 == 0) {
    181194          //Console.WriteLine("{0,10} {1,10:F5} {2,10:F5} {3}", iterations, bestQuality, quality, sentence);
    182195          //Console.WriteLine("{0,4} {1,7} {2}", alg.treeDepth, alg.treeSize, globalStatistics);
    183           Console.WriteLine(globalStatistics);
     196          Console.WriteLine("{0,4} {1,7} {2}", alg.treeDepth, alg.treeSize, globalStatistics);
    184197        }
    185198      };
Note: See TracChangeset for help on using the changeset viewer.