using System; using System.Linq; using System.Collections.Generic; using System.Globalization; using HeuristicLab.Algorithms.Bandits; using HeuristicLab.Algorithms.Bandits.Models; using Microsoft.VisualStudio.TestTools.UnitTesting; namespace HeuristicLab.Problems.GrammaticalOptimization.Test { [TestClass] public class TestBanditPolicies { [TestMethod] public void ComparePoliciesForBernoulliBandit() { CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture; var globalRand = new Random(31415); var seedForPolicy = globalRand.Next(); var nArms = 20; //Console.WriteLine("Exp3 (gamma=0.01)"); //TestPolicyBernoulli(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 1)); //Console.WriteLine("Exp3 (gamma=0.05)"); //estPolicyBernoulli(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 1)); Console.WriteLine("Thompson (Bernoulli)"); TestPolicyBernoulli(globalRand, nArms, new BernoulliThompsonSamplingPolicy(new Random(seedForPolicy), nArms)); Console.WriteLine("Generic Thompson (Bernoulli)"); TestPolicyBernoulli(globalRand, nArms, new GenericThompsonSamplingPolicy(new Random(seedForPolicy), nArms, new BernoulliModel(nArms))); Console.WriteLine("Random"); TestPolicyBernoulli(globalRand, nArms, new RandomPolicy(new Random(seedForPolicy), nArms)); Console.WriteLine("UCB1"); TestPolicyBernoulli(globalRand, nArms, new UCB1Policy(nArms)); Console.WriteLine("UCB1Tuned"); TestPolicyBernoulli(globalRand, nArms, new UCB1TunedPolicy(nArms)); Console.WriteLine("UCB1Normal"); TestPolicyBernoulli(globalRand, nArms, new UCBNormalPolicy(nArms)); Console.WriteLine("Eps(0.01)"); TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01)); Console.WriteLine("Eps(0.05)"); TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05)); //Console.WriteLine("Eps(0.1)"); //TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.1)); //Console.WriteLine("Eps(0.2)"); //TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.2)); //Console.WriteLine("Eps(0.5)"); //TestPolicyBernoulli(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.5)); Console.WriteLine("UCT(0.1)"); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 0.1)); Console.WriteLine("UCT(0.5)"); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 0.5)); Console.WriteLine("UCT(1) "); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 1)); Console.WriteLine("UCT(2) "); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 2)); Console.WriteLine("UCT(5) "); TestPolicyBernoulli(globalRand, nArms, new UCTPolicy(nArms, 5)); Console.WriteLine("BoltzmannExploration(0.1)"); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.1)); Console.WriteLine("BoltzmannExploration(0.5)"); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.5)); Console.WriteLine("BoltzmannExploration(1) "); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1)); Console.WriteLine("BoltzmannExploration(10) "); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10)); Console.WriteLine("BoltzmannExploration(100)"); TestPolicyBernoulli(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100)); Console.WriteLine("ChernoffIntervalEstimationPolicy(0.01)"); TestPolicyBernoulli(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.01)); Console.WriteLine("ChernoffIntervalEstimationPolicy(0.05)"); TestPolicyBernoulli(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.05)); Console.WriteLine("ChernoffIntervalEstimationPolicy(0.1) "); TestPolicyBernoulli(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.1)); // not applicable to bernoulli rewards //Console.WriteLine("ThresholdAscent(10, 0.01) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01)); //Console.WriteLine("ThresholdAscent(10, 0.05) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.05)); //Console.WriteLine("ThresholdAscent(10, 0.1) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.1)); //Console.WriteLine("ThresholdAscent(100, 0.01) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01)); //Console.WriteLine("ThresholdAscent(100, 0.05) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.05)); //Console.WriteLine("ThresholdAscent(100, 0.1) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.1)); //Console.WriteLine("ThresholdAscent(1000, 0.01)"); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01)); //Console.WriteLine("ThresholdAscent(1000, 0.05)"); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.05)); //Console.WriteLine("ThresholdAscent(1000, 0.1) "); TestPolicyBernoulli(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.1)); } [TestMethod] public void ComparePoliciesForNormalBandit() { CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture; var globalRand = new Random(31415); var seedForPolicy = globalRand.Next(); var nArms = 20; Console.WriteLine("Thompson (Gaussian orig)"); TestPolicyNormal(globalRand, nArms, new GaussianThompsonSamplingPolicy(new Random(seedForPolicy), nArms, true)); Console.WriteLine("Thompson (Gaussian new)"); TestPolicyNormal(globalRand, nArms, new GaussianThompsonSamplingPolicy(new Random(seedForPolicy), nArms)); Console.WriteLine("Generic Thompson (Gaussian)"); TestPolicyNormal(globalRand, nArms, new GenericThompsonSamplingPolicy(new Random(seedForPolicy), nArms, new GaussianModel(nArms, 0.5, 1))); /* Console.WriteLine("Random"); TestPolicyNormal(globalRand, nArms, new RandomPolicy(new Random(seedForPolicy), nArms)); Console.WriteLine("UCB1"); TestPolicyNormal(globalRand, nArms, new UCB1Policy(nArms)); Console.WriteLine("UCB1Tuned"); TestPolicyNormal(globalRand, nArms, new UCB1TunedPolicy(nArms)); Console.WriteLine("UCB1Normal"); TestPolicyNormal(globalRand, nArms, new UCBNormalPolicy(nArms)); //Console.WriteLine("Exp3 (gamma=0.01)"); //TestPolicyNormal(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 0.01)); //Console.WriteLine("Exp3 (gamma=0.05)"); //TestPolicyNormal(globalRand, nArms, new Exp3Policy(new Random(seedForPolicy), nArms, 0.05)); Console.WriteLine("Eps(0.01)"); TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01)); Console.WriteLine("Eps(0.05)"); TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05)); //Console.WriteLine("Eps(0.1)"); //TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.1)); //Console.WriteLine("Eps(0.2)"); //TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.2)); //Console.WriteLine("Eps(0.5)"); //TestPolicyNormal(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.5)); Console.WriteLine("UCT(0.1)"); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 0.1)); Console.WriteLine("UCT(0.5)"); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 0.5)); Console.WriteLine("UCT(1) "); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 1)); Console.WriteLine("UCT(2) "); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 2)); Console.WriteLine("UCT(5) "); TestPolicyNormal(globalRand, nArms, new UCTPolicy(nArms, 5)); Console.WriteLine("BoltzmannExploration(0.1)"); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.1)); Console.WriteLine("BoltzmannExploration(0.5)"); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 0.5)); Console.WriteLine("BoltzmannExploration(1) "); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1)); Console.WriteLine("BoltzmannExploration(10) "); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10)); Console.WriteLine("BoltzmannExploration(100)"); TestPolicyNormal(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100)); Console.WriteLine("ChernoffIntervalEstimationPolicy(0.01)"); TestPolicyNormal(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.01)); Console.WriteLine("ChernoffIntervalEstimationPolicy(0.05)"); TestPolicyNormal(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.05)); Console.WriteLine("ChernoffIntervalEstimationPolicy(0.1) "); TestPolicyNormal(globalRand, nArms, new ChernoffIntervalEstimationPolicy(nArms, 0.1)); Console.WriteLine("ThresholdAscent(10,0.01) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01)); Console.WriteLine("ThresholdAscent(10,0.05) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.05)); Console.WriteLine("ThresholdAscent(10,0.1) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.1)); Console.WriteLine("ThresholdAscent(100,0.01) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01)); Console.WriteLine("ThresholdAscent(100,0.05) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.05)); Console.WriteLine("ThresholdAscent(100,0.1) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.1)); Console.WriteLine("ThresholdAscent(1000,0.01)"); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01)); Console.WriteLine("ThresholdAscent(1000,0.05)"); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.05)); Console.WriteLine("ThresholdAscent(1000,0.1) "); TestPolicyNormal(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.1)); */ } [TestMethod] public void ComparePoliciesForGaussianMixtureBandit() { CultureInfo.DefaultThreadCurrentCulture = CultureInfo.InvariantCulture; var globalRand = new Random(31415); var seedForPolicy = globalRand.Next(); var nArms = 20; Console.WriteLine("Thompson (Gaussian orig)"); TestPolicyGaussianMixture(globalRand, nArms, new GaussianThompsonSamplingPolicy(new Random(seedForPolicy), nArms, true)); Console.WriteLine("Thompson (Gaussian new)"); TestPolicyGaussianMixture(globalRand, nArms, new GaussianThompsonSamplingPolicy(new Random(seedForPolicy), nArms)); Console.WriteLine("Generic Thompson (Gaussian)"); TestPolicyGaussianMixture(globalRand, nArms, new GenericThompsonSamplingPolicy(new Random(seedForPolicy), nArms, new GaussianModel(nArms, 0.5, 1))); /* Console.WriteLine("Random"); TestPolicyGaussianMixture(globalRand, nArms, new RandomPolicy(new Random(seedForPolicy), nArms)); Console.WriteLine("UCB1"); TestPolicyGaussianMixture(globalRand, nArms, new UCB1Policy(nArms)); Console.WriteLine("UCB1Tuned "); TestPolicyGaussianMixture(globalRand, nArms, new UCB1TunedPolicy(nArms)); Console.WriteLine("UCB1Normal"); TestPolicyGaussianMixture(globalRand, nArms, new UCBNormalPolicy(nArms)); Console.WriteLine("Eps(0.01) "); TestPolicyGaussianMixture(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.01)); Console.WriteLine("Eps(0.05) "); TestPolicyGaussianMixture(globalRand, nArms, new EpsGreedyPolicy(new Random(seedForPolicy), nArms, 0.05)); Console.WriteLine("UCT(1) "); TestPolicyGaussianMixture(globalRand, nArms, new UCTPolicy(nArms, 1)); Console.WriteLine("UCT(2) "); TestPolicyGaussianMixture(globalRand, nArms, new UCTPolicy(nArms, 2)); Console.WriteLine("UCT(5) "); TestPolicyGaussianMixture(globalRand, nArms, new UCTPolicy(nArms, 5)); Console.WriteLine("BoltzmannExploration(1) "); TestPolicyGaussianMixture(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 1)); Console.WriteLine("BoltzmannExploration(10) "); TestPolicyGaussianMixture(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 10)); Console.WriteLine("BoltzmannExploration(100)"); TestPolicyGaussianMixture(globalRand, nArms, new BoltzmannExplorationPolicy(new Random(seedForPolicy), nArms, 100)); Console.WriteLine("ThresholdAscent(10,0.01) "); TestPolicyGaussianMixture(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10, 0.01)); Console.WriteLine("ThresholdAscent(100,0.01) "); TestPolicyGaussianMixture(globalRand, nArms, new ThresholdAscentPolicy(nArms, 100, 0.01)); Console.WriteLine("ThresholdAscent(1000,0.01)"); TestPolicyGaussianMixture(globalRand, nArms, new ThresholdAscentPolicy(nArms, 1000, 0.01)); Console.WriteLine("ThresholdAscent(10000,0.01)"); TestPolicyGaussianMixture(globalRand, nArms, new ThresholdAscentPolicy(nArms, 10000, 0.01)); */ } private void TestPolicyBernoulli(Random globalRand, int nArms, IPolicy policy) { TestPolicy(globalRand, nArms, policy, (banditRandom, nActions) => new BernoulliBandit(banditRandom, nActions)); } private void TestPolicyNormal(Random globalRand, int nArms, IPolicy policy) { TestPolicy(globalRand, nArms, policy, (banditRandom, nActions) => new TruncatedNormalBandit(banditRandom, nActions)); } private void TestPolicyGaussianMixture(Random globalRand, int nArms, IPolicy policy) { TestPolicy(globalRand, nArms, policy, (banditRandom, nActions) => new GaussianMixtureBandit(banditRandom, nActions)); } private void TestPolicy(Random globalRand, int nArms, IPolicy policy, Func banditFactory) { var maxIt = 1E5; var reps = 30; // independent runs var regretForIteration = new Dictionary>(); var numberOfPullsOfSuboptimalArmsForExp = new Dictionary(); var numberOfPullsOfSuboptimalArmsForMax = new Dictionary(); // calculate statistics for (int r = 0; r < reps; r++) { var nextLogStep = 1; var b = banditFactory(new Random(globalRand.Next()), nArms); policy.Reset(); var totalRegret = 0.0; var totalPullsOfSuboptimalArmsExp = 0.0; var totalPullsOfSuboptimalArmsMax = 0.0; for (int i = 0; i <= maxIt; i++) { var selectedAction = policy.SelectAction(); var reward = b.Pull(selectedAction); policy.UpdateReward(selectedAction, reward); // collect stats if (selectedAction != b.OptimalExpectedRewardArm) totalPullsOfSuboptimalArmsExp++; if (selectedAction != b.OptimalMaximalRewardArm) totalPullsOfSuboptimalArmsMax++; totalRegret += b.OptimalExpectedReward - reward; if (i == nextLogStep) { nextLogStep *= 2; if (!regretForIteration.ContainsKey(i)) { regretForIteration.Add(i, new List()); } regretForIteration[i].Add(totalRegret / i); if (!numberOfPullsOfSuboptimalArmsForExp.ContainsKey(i)) { numberOfPullsOfSuboptimalArmsForExp.Add(i, 0.0); } numberOfPullsOfSuboptimalArmsForExp[i] += totalPullsOfSuboptimalArmsExp; if (!numberOfPullsOfSuboptimalArmsForMax.ContainsKey(i)) { numberOfPullsOfSuboptimalArmsForMax.Add(i, 0.0); } numberOfPullsOfSuboptimalArmsForMax[i] += totalPullsOfSuboptimalArmsMax; } } } // print foreach (var p in regretForIteration.Keys.OrderBy(k => k)) { Console.WriteLine("iter {0,8} regret avg {1,7:F5} min {2,7:F5} max {3,7:F5} suboptimal pulls (exp) {4,7:F2} suboptimal pulls (max) {5,7:F2}", p, regretForIteration[p].Average(), regretForIteration[p].Min(), regretForIteration[p].Max(), numberOfPullsOfSuboptimalArmsForExp[p] / (double)reps, numberOfPullsOfSuboptimalArmsForMax[p] / (double)reps ); } } } }