Changeset 11727 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Timestamp:
- 12/29/14 11:02:36 (9 years ago)
- Location:
- branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
- Files:
- 3 added
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj
r11711 r11727 43 43 <Compile Include="Bandits\TruncatedNormalBandit.cs" /> 44 44 <Compile Include="Policies\BanditPolicy.cs" /> 45 <Compile Include="Policies\BernoulliThompsonSamplingPolicy.cs" /> 46 <Compile Include="Policies\GaussianThompsonSamplingPolicy.cs" /> 47 <Compile Include="Policies\Exp3Policy.cs" /> 45 48 <Compile Include="Policies\EpsGreedyPolicy.cs" /> 46 49 <Compile Include="Policies\RandomPolicy.cs" /> … … 51 54 <Compile Include="Properties\AssemblyInfo.cs" /> 52 55 </ItemGroup> 53 <ItemGroup /> 56 <ItemGroup> 57 <ProjectReference Include="..\HeuristicLab.Common\HeuristicLab.Common.csproj"> 58 <Project>{3A2FBBCB-F9DF-4970-87F3-F13337D941AD}</Project> 59 <Name>HeuristicLab.Common</Name> 60 </ProjectReference> 61 </ItemGroup> 54 62 <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> 55 63 <!-- To modify your build process, add your task inside one of the targets below and uncomment it. -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IPolicy.cs
r11708 r11727 6 6 7 7 namespace HeuristicLab.Algorithms.Bandits { 8 // this interface represents a policy for reinforcement learning 8 9 public interface IPolicy { 9 int SelectAction(); 10 void UpdateReward(int action, double reward); 10 IEnumerable<int> Actions { get; } 11 int SelectAction(); // action selection ... 12 void UpdateReward(int action, double reward); // ... and reward update are defined as usual 13 14 // policies must also support disabling of potential actions 15 // for instance if we know that an action in a state has a deterministic 16 // reward we need to sample it only once 17 // it is necessary to sample an action only once 18 void DisableAction(int action); 19 20 // reset causes the policy to be reinitialized to its initial state (as after constructor-call) 11 21 void Reset(); 12 22 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/BanditPolicy.cs
r11711 r11727 1 1 using System; 2 2 using System.Collections.Generic; 3 using System.Diagnostics; 3 4 using System.Linq; 4 5 using System.Text; … … 7 8 namespace HeuristicLab.Algorithms.Bandits { 8 9 public abstract class BanditPolicy : IPolicy { 9 public int NumActions { get; private set; } 10 public BanditPolicy(int numActions) { 11 this.NumActions = numActions; 10 public IEnumerable<int> Actions { get; private set; } 11 private readonly int numInitialActions; 12 13 protected BanditPolicy(int numActions) { 14 this.numInitialActions = numActions; 15 Actions = Enumerable.Range(0, numActions).ToArray(); 12 16 } 13 17 14 18 public abstract int SelectAction(); 15 19 public abstract void UpdateReward(int action, double reward); 16 public abstract void Reset(); 20 21 public virtual void DisableAction(int action) { 22 Debug.Assert(Actions.Contains(action)); 23 24 Actions = Actions.Where(a => a != action).ToArray(); 25 } 26 27 public virtual void Reset() { 28 Actions = Enumerable.Range(0, numInitialActions).ToArray(); 29 } 17 30 } 18 31 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/EpsGreedyPolicy.cs
r11711 r11727 1 1 using System; 2 2 using System.Collections.Generic; 3 using System.Diagnostics; 3 4 using System.Linq; 4 5 using System.Text; … … 11 12 private readonly int[] tries; 12 13 private readonly double[] sumReward; 14 private readonly RandomPolicy randomPolicy; 15 13 16 public EpsGreedyPolicy(Random random, int numActions, double eps) 14 17 : base(numActions) { 15 18 this.random = random; 16 19 this.eps = eps; 17 this.tries = new int[NumActions]; 18 this.sumReward = new double[NumActions]; 20 this.randomPolicy = new RandomPolicy(random, numActions); 21 this.tries = new int[numActions]; 22 this.sumReward = new double[numActions]; 19 23 } 20 24 21 25 public override int SelectAction() { 26 Debug.Assert(Actions.Any()); 22 27 if (random.NextDouble() > eps) { 23 28 // select best 24 29 var maxReward = double.NegativeInfinity; 25 30 int bestAction = -1; 26 for (int i = 0; i < NumActions; i++) {27 if (tries[ i] == 0) return i;28 var avgReward = sumReward[ i] / tries[i];31 foreach (var a in Actions) { 32 if (tries[a] == 0) return a; 33 var avgReward = sumReward[a] / tries[a]; 29 34 if (maxReward < avgReward) { 30 35 maxReward = avgReward; 31 bestAction = i;36 bestAction = a; 32 37 } 33 38 } 39 Debug.Assert(bestAction >= 0); 34 40 return bestAction; 35 41 } else { 36 42 // select random 37 return random .Next(NumActions);43 return randomPolicy.SelectAction(); 38 44 } 39 45 } 40 46 public override void UpdateReward(int action, double reward) { 47 Debug.Assert(Actions.Contains(action)); 48 49 randomPolicy.UpdateReward(action, reward); // does nothing 41 50 tries[action]++; 42 51 sumReward[action] += reward; 43 52 } 53 54 public override void DisableAction(int action) { 55 base.DisableAction(action); 56 randomPolicy.DisableAction(action); 57 sumReward[action] = 0; 58 tries[action] = -1; 59 } 60 44 61 public override void Reset() { 62 base.Reset(); 63 randomPolicy.Reset(); 45 64 Array.Clear(tries, 0, tries.Length); 46 65 Array.Clear(sumReward, 0, sumReward.Length); -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/RandomPolicy.cs
r11711 r11727 1 1 using System; 2 2 using System.Collections.Generic; 3 using System.Diagnostics; 3 4 using System.Linq; 4 5 using System.Text; 5 6 using System.Threading.Tasks; 7 using HeuristicLab.Common; 6 8 7 9 namespace HeuristicLab.Algorithms.Bandits { 8 10 public class RandomPolicy : BanditPolicy { 9 11 private readonly Random random; 12 10 13 public RandomPolicy(Random random, int numActions) 11 14 : base(numActions) { … … 14 17 15 18 public override int SelectAction() { 16 return random.Next(NumActions); 19 Debug.Assert(Actions.Any()); 20 return Actions.SelectRandom(random); 17 21 } 18 22 public override void UpdateReward(int action, double reward) { 19 23 // do nothing 20 24 } 21 public override void Reset() { 22 // do nothing 23 } 25 24 26 } 25 27 } -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1Policy.cs
r11711 r11727 1 1 using System; 2 2 using System.Collections.Generic; 3 using System.Diagnostics; 3 4 using System.Linq; 4 5 using System.Text; … … 12 13 public UCB1Policy(int numActions) 13 14 : base(numActions) { 14 this.tries = new int[ NumActions];15 this.sumReward = new double[ NumActions];15 this.tries = new int[numActions]; 16 this.sumReward = new double[numActions]; 16 17 } 17 18 … … 19 20 int bestAction = -1; 20 21 double bestQ = double.NegativeInfinity; 21 for (int i = 0; i < NumActions; i++) {22 if (tries[ i] == 0) return i;23 var q = sumReward[ i] / tries[i] + Math.Sqrt((2 * Math.Log(totalTries)) / tries[i]);22 foreach (var a in Actions) { 23 if (tries[a] == 0) return a; 24 var q = sumReward[a] / tries[a] + Math.Sqrt((2 * Math.Log(totalTries)) / tries[a]); 24 25 if (q > bestQ) { 25 26 bestQ = q; 26 bestAction = i;27 bestAction = a; 27 28 } 28 29 } … … 30 31 } 31 32 public override void UpdateReward(int action, double reward) { 33 Debug.Assert(Actions.Contains(action)); 32 34 totalTries++; 33 35 tries[action]++; 34 36 sumReward[action] += reward; 35 37 } 38 39 public override void DisableAction(int action) { 40 base.DisableAction(action); 41 totalTries -= tries[action]; 42 tries[action] = -1; 43 sumReward[action] = 0; 44 } 45 36 46 public override void Reset() { 47 base.Reset(); 37 48 totalTries = 0; 38 49 Array.Clear(tries, 0, tries.Length); -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs
r11711 r11727 1 1 using System; 2 2 using System.Collections.Generic; 3 using System.Diagnostics; 3 4 using System.Linq; 4 5 using System.Text; … … 13 14 public UCB1TunedPolicy(int numActions) 14 15 : base(numActions) { 15 this.tries = new int[ NumActions];16 this.sumReward = new double[ NumActions];17 this.sumSqrReward = new double[ NumActions];16 this.tries = new int[numActions]; 17 this.sumReward = new double[numActions]; 18 this.sumSqrReward = new double[numActions]; 18 19 } 19 20 … … 25 26 26 27 public override int SelectAction() { 28 Debug.Assert(Actions.Any()); 27 29 int bestAction = -1; 28 30 double bestQ = double.NegativeInfinity; 29 for (int i = 0; i < NumActions; i++) {30 if (tries[ i] == 0) return i;31 var q = sumReward[ i] / tries[i] + Math.Sqrt((Math.Log(totalTries) / tries[i]) * Math.Min(1.0 / 4, V(i))); // 1/4 is upper bound of bernoulli distributed variable31 foreach (var a in Actions) { 32 if (tries[a] == 0) return a; 33 var q = sumReward[a] / tries[a] + Math.Sqrt((Math.Log(totalTries) / tries[a]) * Math.Min(1.0 / 4, V(a))); // 1/4 is upper bound of bernoulli distributed variable 32 34 if (q > bestQ) { 33 35 bestQ = q; 34 bestAction = i;36 bestAction = a; 35 37 } 36 38 } … … 38 40 } 39 41 public override void UpdateReward(int action, double reward) { 42 Debug.Assert(Actions.Contains(action)); 40 43 totalTries++; 41 44 tries[action]++; … … 43 46 sumSqrReward[action] += reward * reward; 44 47 } 48 49 public override void DisableAction(int action) { 50 base.DisableAction(action); 51 totalTries -= tries[action]; 52 tries[action] = -1; 53 sumReward[action] = 0; 54 sumSqrReward[action] = 0; 55 } 56 45 57 public override void Reset() { 58 base.Reset(); 46 59 totalTries = 0; 47 60 Array.Clear(tries, 0, tries.Length); -
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCBNormalPolicy.cs
r11711 r11727 1 1 using System; 2 2 using System.Collections.Generic; 3 using System.Diagnostics; 3 4 using System.Linq; 4 5 using System.Text; … … 13 14 public UCBNormalPolicy(int numActions) 14 15 : base(numActions) { 15 this.tries = new int[ NumActions];16 this.sumReward = new double[ NumActions];17 this.sumSqrReward = new double[ NumActions];16 this.tries = new int[numActions]; 17 this.sumReward = new double[numActions]; 18 this.sumSqrReward = new double[numActions]; 18 19 } 19 20 20 private double V(int arm) {21 var s = tries[arm];22 return sumSqrReward[arm] / s - Math.Pow(sumReward[arm] / s, 2) + Math.Sqrt(2 * Math.Log(totalTries) / s);23 }24 25 26 21 public override int SelectAction() { 22 Debug.Assert(Actions.Any()); 27 23 int bestAction = -1; 28 24 double bestQ = double.NegativeInfinity; 29 for (int i = 0; i < NumActions; i++) {30 if (totalTries == 0 || tries[ i] == 0 || tries[i] < Math.Ceiling(8 * Math.Log(totalTries))) return i;31 var avgReward = sumReward[ i] / tries[i];25 foreach (var a in Actions) { 26 if (totalTries == 0 || tries[a] == 0 || tries[a] < Math.Ceiling(8 * Math.Log(totalTries))) return a; 27 var avgReward = sumReward[a] / tries[a]; 32 28 var q = avgReward 33 + Math.Sqrt(16 * ((sumSqrReward[ i] - tries[i] * Math.Pow(avgReward, 2)) / (tries[i] - 1)) * (Math.Log(totalTries - 1) / tries[i]));29 + Math.Sqrt(16 * ((sumSqrReward[a] - tries[a] * Math.Pow(avgReward, 2)) / (tries[a] - 1)) * (Math.Log(totalTries - 1) / tries[a])); 34 30 if (q > bestQ) { 35 31 bestQ = q; 36 bestAction = i;32 bestAction = a; 37 33 } 38 34 } … … 40 36 } 41 37 public override void UpdateReward(int action, double reward) { 38 Debug.Assert(Actions.Contains(action)); 42 39 totalTries++; 43 40 tries[action]++; … … 45 42 sumSqrReward[action] += reward * reward; 46 43 } 44 45 public override void DisableAction(int action) { 46 base.DisableAction(action); 47 totalTries -= tries[action]; 48 tries[action] = -1; 49 sumReward[action] = 0; 50 sumSqrReward[action] = 0; 51 } 52 47 53 
public override void Reset() { 54 base.Reset(); 48 55 totalTries = 0; 49 56 Array.Clear(tries, 0, tries.Length);
Note: See TracChangeset for help on using the changeset viewer.