Context Navigation

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/GaussianThompsonSamplingPolicy.cs @ 11732

Visit:

Last change on this file since 11732 was 11732, checked in by gkronber, 9 years ago
#2283: refactoring and bug fixes
File size: 2.9 KB

Line
1	using System;
2	using System.Collections.Generic;
3	using System.Diagnostics;
4	using System.Linq;
5	using HeuristicLab.Common;
6
7	namespace HeuristicLab.Algorithms.Bandits {
8
9	public class GaussianThompsonSamplingPolicy : IPolicy {
10	private bool compatibility;
11
12	// assumes a Gaussian reward distribution with different means but the same variances for each action
13	// the prior for the mean is also Gaussian with the following parameters
14	private readonly double rewardVariance = 0.1; // we assume a known variance
15
16	private readonly double priorMean = 0.5;
17	private readonly double priorVariance = 1;
18
19
20	public GaussianThompsonSamplingPolicy(bool compatibility = false) {
21	this.compatibility = compatibility;
22	}
23
24	public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
25	var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();
26	int bestAction = -1;
27	double bestQ = double.NegativeInfinity;
28
29	int aIdx = -1;
30	foreach (var aInfo in myActionInfos) {
31	aIdx++;
32	if (aInfo.Disabled) continue;
33
34	var tries = aInfo.Tries;
35	var sampleMean = aInfo.AvgReward;
36	var sampleVariance = aInfo.RewardVariance;
37
38	double theta;
39	if (compatibility) {
40	if (tries < 2) return aIdx;
41	var mu = sampleMean;
42	var variance = sampleVariance;
43	var stdDev = Math.Sqrt(variance);
44	theta = Rand.RandNormal(random) * stdDev + mu;
45	} else {
46	// calculate posterior mean and variance (for mean reward)
47
48	// see Murphy 2007: Conjugate Bayesian analysis of the Gaussian distribution (http://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf)
49	var posteriorVariance = 1.0 / (tries / rewardVariance + 1.0 / priorVariance);
50	var posteriorMean = posteriorVariance * (priorMean / priorVariance + tries * sampleMean / rewardVariance);
51
52	// sample a mean from the posterior
53	theta = Rand.RandNormal(random) * Math.Sqrt(posteriorVariance) + posteriorMean;
54
55	// theta already represents the expected reward value => nothing else to do
56	}
57
58	if (theta > bestQ) {
59	bestQ = theta;
60	bestAction = aIdx;
61	}
62	}
63	Debug.Assert(bestAction > -1);
64	return bestAction;
65	}
66
67	public IPolicyActionInfo CreateActionInfo() {
68	return new MeanAndVariancePolicyActionInfo();
69	}
70
71
72	//public override void UpdateReward(int action, double reward) {
73	// Debug.Assert(Actions.Contains(action));
74	// tries[action]++;
75	// var delta = reward - sampleMean[action];
76	// sampleMean[action] += delta / tries[action];
77	// sampleM2[action] += sampleM2[action] + delta * (reward - sampleMean[action]);
78	//}
79
80	public override string ToString() {
81	return "GaussianThompsonSamplingPolicy";
82	}
83	}
84	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences