using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  [Obsolete("Replaced by GenericThompsonSamplingPolicy(GaussianModel(0.5, 1.0, 0.1))")]
  public class GaussianThompsonSamplingPolicy : IBanditPolicy {
    private readonly bool compatibility;

    // assumes a Gaussian reward distribution with different means but the same variance for each action;
    // the prior for the mean is also Gaussian with the following parameters
    private readonly double rewardVariance = 0.1; // we assume a known reward variance
    private readonly double priorMean = 0.5;
    private readonly double priorVariance = 1;

    public GaussianThompsonSamplingPolicy(bool compatibility = false) {
      this.compatibility = compatibility;
    }

    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      int aIdx = -1;
      foreach (var aInfo in myActionInfos) {
        aIdx++;
        var tries = aInfo.Tries;
        var sampleMean = aInfo.AvgReward;
        var sampleVariance = aInfo.RewardVariance;

        double theta;
        if (compatibility) {
          // old code used for old experiments (preserved because it performed very well)
          if (tries < 2) return aIdx;
          var mu = sampleMean;
          var variance = sampleVariance;
          var stdDev = Math.Sqrt(variance);
          theta = Rand.RandNormal(random) * stdDev + mu;
        } else {
          // calculate posterior mean and variance (for the mean reward)
          // see Murphy 2007: Conjugate Bayesian analysis of the Gaussian distribution (http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf)
          var posteriorVariance = 1.0 / (tries / rewardVariance + 1.0 / priorVariance);
          var posteriorMean = posteriorVariance * (priorMean / priorVariance + tries * sampleMean / rewardVariance);

          // sample a mean from the posterior
          theta = Rand.RandNormal(random) * Math.Sqrt(posteriorVariance) + posteriorMean;
          // theta already represents the expected reward value => nothing else to do
        }

        // ties are very unlikely to occur (and we don't care)
        if (theta > bestQ) {
          bestQ = theta;
          bestAction = aIdx;
        }
      }
      Debug.Assert(bestAction > -1);
      return bestAction;
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new MeanAndVariancePolicyActionInfo();
    }

    //public override void UpdateReward(int action, double reward) {
    //  Debug.Assert(Actions.Contains(action));
    //  tries[action]++;
    //  var delta = reward - sampleMean[action];
    //  sampleMean[action] += delta / tries[action];
    //  sampleM2[action] += sampleM2[action] + delta * (reward - sampleMean[action]);
    //}

    public override string ToString() {
      return "GaussianThompsonSamplingPolicy";
    }
  }
}
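
// Usage sketch (illustrative only): a minimal bandit loop driving this policy with three actions.
// It assumes that MeanAndVariancePolicyActionInfo exposes an UpdateReward(double) method for
// recording observed rewards (suggested by the commented-out update logic above) and that
// Rand.RandNormal from HeuristicLab.Common returns a standard-normal sample; both are assumptions
// rather than a verified API, and the reward-generating line is a hypothetical stand-in environment.
//
// var policy = new GaussianThompsonSamplingPolicy();
// var actionInfos = Enumerable.Range(0, 3)
//   .Select(_ => policy.CreateActionInfo())
//   .ToArray();
// var random = new Random(31415);
// for (int step = 0; step < 1000; step++) {
//   int a = policy.SelectAction(random, actionInfos);
//   // hypothetical environment: Gaussian rewards with a per-action mean and known variance
//   double reward = 0.1 * a + Math.Sqrt(0.1) * Rand.RandNormal(random);
//   ((MeanAndVariancePolicyActionInfo)actionInfos[a]).UpdateReward(reward);
// }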