Ignore:
Timestamp:
12/29/14 11:02:36 (8 years ago)
Author:
gkronber
Message:

#2283: worked on grammatical optimization problem solvers (simple MCTS done)

Location:
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
Files:
3 added
8 edited

Legend:

Unmodified
Added
Removed
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj

    r11711 r11727  
    4343    <Compile Include="Bandits\TruncatedNormalBandit.cs" />
    4444    <Compile Include="Policies\BanditPolicy.cs" />
     45    <Compile Include="Policies\BernoulliThompsonSamplingPolicy.cs" />
     46    <Compile Include="Policies\GaussianThompsonSamplingPolicy.cs" />
     47    <Compile Include="Policies\Exp3Policy.cs" />
    4548    <Compile Include="Policies\EpsGreedyPolicy.cs" />
    4649    <Compile Include="Policies\RandomPolicy.cs" />
     
    5154    <Compile Include="Properties\AssemblyInfo.cs" />
    5255  </ItemGroup>
    53   <ItemGroup />
     56  <ItemGroup>
     57    <ProjectReference Include="..\HeuristicLab.Common\HeuristicLab.Common.csproj">
     58      <Project>{3A2FBBCB-F9DF-4970-87F3-F13337D941AD}</Project>
     59      <Name>HeuristicLab.Common</Name>
     60    </ProjectReference>
     61  </ItemGroup>
    5462  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
    5563  <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IPolicy.cs

    r11708 r11727  
    66
    77namespace HeuristicLab.Algorithms.Bandits {
     8  // this interface represents a policy for reinforcement learning
    89  public interface IPolicy {
    9     int SelectAction();
    10     void UpdateReward(int action, double reward);
     10    IEnumerable<int> Actions { get; }
     11    int SelectAction(); // action selection ...
     12    void UpdateReward(int action, double reward); // ... and reward update are defined as usual
     13
     14    // policies must also support disabling of potential actions
     15    // for instance, if we know that an action in a state has a deterministic
     16    // reward, it is necessary to sample that action only once;
     17    // further samples would provide no additional information
     18    void DisableAction(int action);
     19
     20    // reset causes the policy to be reinitialized to its initial state (as after constructor-call)
    1121    void Reset();
    1222  }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/BanditPolicy.cs

    r11711 r11727  
    11using System;
    22using System.Collections.Generic;
     3using System.Diagnostics;
    34using System.Linq;
    45using System.Text;
     
    78namespace HeuristicLab.Algorithms.Bandits {
    89  public abstract class BanditPolicy : IPolicy {
    9     public int NumActions { get; private set; }
    10     public BanditPolicy(int numActions) {
    11       this.NumActions = numActions;
     10    public IEnumerable<int> Actions { get; private set; }
     11    private readonly int numInitialActions;
     12
     13    protected BanditPolicy(int numActions) {
     14      this.numInitialActions = numActions;
     15      Actions = Enumerable.Range(0, numActions).ToArray();
    1216    }
    1317
    1418    public abstract int SelectAction();
    1519    public abstract void UpdateReward(int action, double reward);
    16     public abstract void Reset();
     20
     21    public virtual void DisableAction(int action) {
     22      Debug.Assert(Actions.Contains(action));
     23
     24      Actions = Actions.Where(a => a != action).ToArray();
     25    }
     26
     27    public virtual void Reset() {
     28      Actions = Enumerable.Range(0, numInitialActions).ToArray();
     29    }
    1730  }
    1831}
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/EpsGreedyPolicy.cs

    r11711 r11727  
    11using System;
    22using System.Collections.Generic;
     3using System.Diagnostics;
    34using System.Linq;
    45using System.Text;
     
    1112    private readonly int[] tries;
    1213    private readonly double[] sumReward;
     14    private readonly RandomPolicy randomPolicy;
     15
    1316    public EpsGreedyPolicy(Random random, int numActions, double eps)
    1417      : base(numActions) {
    1518      this.random = random;
    1619      this.eps = eps;
    17       this.tries = new int[NumActions];
    18       this.sumReward = new double[NumActions];
     20      this.randomPolicy = new RandomPolicy(random, numActions);
     21      this.tries = new int[numActions];
     22      this.sumReward = new double[numActions];
    1923    }
    2024
    2125    public override int SelectAction() {
     26      Debug.Assert(Actions.Any());
    2227      if (random.NextDouble() > eps) {
    2328        // select best
    2429        var maxReward = double.NegativeInfinity;
    2530        int bestAction = -1;
    26         for (int i = 0; i < NumActions; i++) {
    27           if (tries[i] == 0) return i;
    28           var avgReward = sumReward[i] / tries[i];
     31        foreach (var a in Actions) {
     32          if (tries[a] == 0) return a;
     33          var avgReward = sumReward[a] / tries[a];
    2934          if (maxReward < avgReward) {
    3035            maxReward = avgReward;
    31             bestAction = i;
     36            bestAction = a;
    3237          }
    3338        }
     39        Debug.Assert(bestAction >= 0);
    3440        return bestAction;
    3541      } else {
    3642        // select random
    37         return random.Next(NumActions);
     43        return randomPolicy.SelectAction();
    3844      }
    3945    }
    4046    public override void UpdateReward(int action, double reward) {
     47      Debug.Assert(Actions.Contains(action));
     48
     49      randomPolicy.UpdateReward(action, reward); // does nothing
    4150      tries[action]++;
    4251      sumReward[action] += reward;
    4352    }
     53
     54    public override void DisableAction(int action) {
     55      base.DisableAction(action);
     56      randomPolicy.DisableAction(action);
     57      sumReward[action] = 0;
     58      tries[action] = -1;
     59    }
     60
    4461    public override void Reset() {
     62      base.Reset();
     63      randomPolicy.Reset();
    4564      Array.Clear(tries, 0, tries.Length);
    4665      Array.Clear(sumReward, 0, sumReward.Length);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/RandomPolicy.cs

    r11711 r11727  
    11using System;
    22using System.Collections.Generic;
     3using System.Diagnostics;
    34using System.Linq;
    45using System.Text;
    56using System.Threading.Tasks;
     7using HeuristicLab.Common;
    68
    79namespace HeuristicLab.Algorithms.Bandits {
    810  public class RandomPolicy : BanditPolicy {
    911    private readonly Random random;
     12
    1013    public RandomPolicy(Random random, int numActions)
    1114      : base(numActions) {
     
    1417
    1518    public override int SelectAction() {
    16       return random.Next(NumActions);
     19      Debug.Assert(Actions.Any());
     20      return Actions.SelectRandom(random);
    1721    }
    1822    public override void UpdateReward(int action, double reward) {
    1923      // do nothing
    2024    }
    21     public override void Reset() {
    22       // do nothing
    23     }
     25
    2426  }
    2527}
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1Policy.cs

    r11711 r11727  
    11using System;
    22using System.Collections.Generic;
     3using System.Diagnostics;
    34using System.Linq;
    45using System.Text;
     
    1213    public UCB1Policy(int numActions)
    1314      : base(numActions) {
    14       this.tries = new int[NumActions];
    15       this.sumReward = new double[NumActions];
     15      this.tries = new int[numActions];
     16      this.sumReward = new double[numActions];
    1617    }
    1718
     
    1920      int bestAction = -1;
    2021      double bestQ = double.NegativeInfinity;
    21       for (int i = 0; i < NumActions; i++) {
    22         if (tries[i] == 0) return i;
    23         var q = sumReward[i] / tries[i] + Math.Sqrt((2 * Math.Log(totalTries)) / tries[i]);
     22      foreach (var a in Actions) {
     23        if (tries[a] == 0) return a;
     24        var q = sumReward[a] / tries[a] + Math.Sqrt((2 * Math.Log(totalTries)) / tries[a]);
    2425        if (q > bestQ) {
    2526          bestQ = q;
    26           bestAction = i;
     27          bestAction = a;
    2728        }
    2829      }
     
    3031    }
    3132    public override void UpdateReward(int action, double reward) {
     33      Debug.Assert(Actions.Contains(action));
    3234      totalTries++;
    3335      tries[action]++;
    3436      sumReward[action] += reward;
    3537    }
     38
     39    public override void DisableAction(int action) {
     40      base.DisableAction(action);
     41      totalTries -= tries[action];
     42      tries[action] = -1;
     43      sumReward[action] = 0;
     44    }
     45
    3646    public override void Reset() {
     47      base.Reset();
    3748      totalTries = 0;
    3849      Array.Clear(tries, 0, tries.Length);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCB1TunedPolicy.cs

    r11711 r11727  
    11using System;
    22using System.Collections.Generic;
     3using System.Diagnostics;
    34using System.Linq;
    45using System.Text;
     
    1314    public UCB1TunedPolicy(int numActions)
    1415      : base(numActions) {
    15       this.tries = new int[NumActions];
    16       this.sumReward = new double[NumActions];
    17       this.sumSqrReward = new double[NumActions];
     16      this.tries = new int[numActions];
     17      this.sumReward = new double[numActions];
     18      this.sumSqrReward = new double[numActions];
    1819    }
    1920
     
    2526
    2627    public override int SelectAction() {
     28      Debug.Assert(Actions.Any());
    2729      int bestAction = -1;
    2830      double bestQ = double.NegativeInfinity;
    29       for (int i = 0; i < NumActions; i++) {
    30         if (tries[i] == 0) return i;
    31         var q = sumReward[i] / tries[i] + Math.Sqrt((Math.Log(totalTries) / tries[i]) * Math.Min(1.0 / 4, V(i))); // 1/4 is upper bound of bernoulli distributed variable
     31      foreach (var a in Actions) {
     32        if (tries[a] == 0) return a;
     33        var q = sumReward[a] / tries[a] + Math.Sqrt((Math.Log(totalTries) / tries[a]) * Math.Min(1.0 / 4, V(a))); // 1/4 is upper bound of bernoulli distributed variable
    3234        if (q > bestQ) {
    3335          bestQ = q;
    34           bestAction = i;
     36          bestAction = a;
    3537        }
    3638      }
     
    3840    }
    3941    public override void UpdateReward(int action, double reward) {
     42      Debug.Assert(Actions.Contains(action));
    4043      totalTries++;
    4144      tries[action]++;
     
    4346      sumSqrReward[action] += reward * reward;
    4447    }
     48
     49    public override void DisableAction(int action) {
     50      base.DisableAction(action);
     51      totalTries -= tries[action];
     52      tries[action] = -1;
     53      sumReward[action] = 0;
     54      sumSqrReward[action] = 0;
     55    }
     56
    4557    public override void Reset() {
     58      base.Reset();
    4659      totalTries = 0;
    4760      Array.Clear(tries, 0, tries.Length);
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Policies/UCBNormalPolicy.cs

    r11711 r11727  
    11using System;
    22using System.Collections.Generic;
     3using System.Diagnostics;
    34using System.Linq;
    45using System.Text;
     
    1314    public UCBNormalPolicy(int numActions)
    1415      : base(numActions) {
    15       this.tries = new int[NumActions];
    16       this.sumReward = new double[NumActions];
    17       this.sumSqrReward = new double[NumActions];
     16      this.tries = new int[numActions];
     17      this.sumReward = new double[numActions];
     18      this.sumSqrReward = new double[numActions];
    1819    }
    1920
    20     private double V(int arm) {
    21       var s = tries[arm];
    22       return sumSqrReward[arm] / s - Math.Pow(sumReward[arm] / s, 2) + Math.Sqrt(2 * Math.Log(totalTries) / s);
    23     }
    24 
    25 
    2621    public override int SelectAction() {
     22      Debug.Assert(Actions.Any());
    2723      int bestAction = -1;
    2824      double bestQ = double.NegativeInfinity;
    29       for (int i = 0; i < NumActions; i++) {
    30         if (totalTries == 0 || tries[i] == 0 || tries[i] < Math.Ceiling(8 * Math.Log(totalTries))) return i;
    31         var avgReward = sumReward[i] / tries[i];
     25      foreach (var a in Actions) {
     26        if (totalTries == 0 || tries[a] == 0 || tries[a] < Math.Ceiling(8 * Math.Log(totalTries))) return a;
     27        var avgReward = sumReward[a] / tries[a];
    3228        var q = avgReward
    33           + Math.Sqrt(16 * ((sumSqrReward[i] - tries[i] * Math.Pow(avgReward, 2)) / (tries[i] - 1)) * (Math.Log(totalTries - 1) / tries[i]));
     29          + Math.Sqrt(16 * ((sumSqrReward[a] - tries[a] * Math.Pow(avgReward, 2)) / (tries[a] - 1)) * (Math.Log(totalTries - 1) / tries[a]));
    3430        if (q > bestQ) {
    3531          bestQ = q;
    36           bestAction = i;
     32          bestAction = a;
    3733        }
    3834      }
     
    4036    }
    4137    public override void UpdateReward(int action, double reward) {
     38      Debug.Assert(Actions.Contains(action));
    4239      totalTries++;
    4340      tries[action]++;
     
    4542      sumSqrReward[action] += reward * reward;
    4643    }
     44
     45    public override void DisableAction(int action) {
     46      base.DisableAction(action);
     47      totalTries -= tries[action];
     48      tries[action] = -1;
     49      sumReward[action] = 0;
     50      sumSqrReward[action] = 0;
     51    }
     52
    4753    public override void Reset() {
     54      base.Reset();
    4855      totalTries = 0;
    4956      Array.Clear(tries, 0, tries.Length);
Note: See TracChangeset for help on using the changeset viewer.