Context Navigation

← Previous Change
Next Change →

HeuristicLab.Algorithms.Bandits

Timestamp:

01/15/15 18:59:07 (9 years ago)

Author:

gkronber

Message:

#2283: worked on generic sequential search alg with bandit policy as parameter

Location:

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits

Files:

: 8 added
: 5 edited

GrammarPolicies/BoltzmanExplorationPolicy.cs (added)
GrammarPolicies/EpsGreedyPolicy.cs (added)
GrammarPolicies/GenericGrammarPolicy.cs (added)
GrammarPolicies/GrammarPolicy.cs (added)
GrammarPolicies/GreedyPolicy.cs (added)
GrammarPolicies/IGrammarPolicy.cs (added)
GrammarPolicies/RandomNoResamplingPolicy.cs (modified) (1 diff)
GrammarPolicies/RandomPolicy.cs (modified) (1 diff)
GrammarPolicies/TDPolicy.cs (added)
GrammarPolicies/UCTPolicy.cs (added)
HeuristicLab.Algorithms.Bandits.csproj (modified) (1 diff)
IBanditPolicyActionInfo.cs (modified) (1 diff)
IPolicy.cs (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/RandomNoResamplingPolicy.cs

-                      r11742
+                      r11770
 using System;
 using System.Collections.Generic;
+using System.Configuration;
 using System.Linq;
+using System.Security.Policy;
 using System.Text;
-using System.Threading.Tasks;
 using HeuristicLab.Common;
 using HeuristicLab.Problems.GrammaticalOptimization;
 namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies {
   public class RandomNoResamplingPolicy : IGrammarPolicy {
+  public class RandomNoResamplingPolicy : GrammarPolicy {
+    private readonly Dictionary<ReadonlySequence, bool> done;
+    private readonly Dictionary<Tuple<ReadonlySequence, ReadonlySequence>, ReadonlySequence> nextState;
+    private readonly HashSet<string> done;
     public RandomNoResamplingPolicy() {
       this.done = new Dictionary<ReadonlySequence, bool>();
+    public RandomNoResamplingPolicy(IProblem problem, bool useCanonicalRepresentation)
+      : base(problem, useCanonicalRepresentation) {
+      this.done = new HashSet<string>();
+    }
+    public ReadonlySequence SelectAction(Random random, ReadonlySequence state, IEnumerable<ReadonlySequence> actions) {
+      var allDone = true;
+      foreach (var a in actions) {
+        var p = Tuple.Create(state, a);
+        allDone &= nextState.ContainsKey(p) && Done(nextState[p]);
+        if (!allDone) break;
+    public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {
+      // only select states that are not yet done
+      afterStates = afterStates.Where(a => !done.Contains(a.ToString())).ToArray();
+      if (!afterStates.Any()) {
+        // fail because all follow states have already been visited => also disable the current state
+        done.Add(CanonicalState(curState.ToString()));
+        selectedState = null;
+        return false;
+      }
+      if(allDone)
+      return actions
+        .Where(a => !nextState.ContainsKey(Tuple.Create(state, a)) || Done(nextState[Tuple.Create(state, a)]))
+        .SelectRandom(random);
+      selectedState = afterStates.SelectRandom(random);
+      return true;
+    }
+    public void UpdateReward(ReadonlySequence state, ReadonlySequence action, double reward, ReadonlySequence newState) {
+      var key = Tuple.Create(state, action);
+      nextState[key] = newState;
+      if (newState.IsTerminal) done[newState] = true;
+      if
+    public override void UpdateReward(IEnumerable<ReadonlySequence> stateTrajectory, double reward) {
+      base.UpdateReward(stateTrajectory, reward);
+      // ignore rewards but update the set of visited terminal states
+      // the last state could be terminal
+      var lastState = stateTrajectory.Last();
+      if (lastState.IsTerminal) done.Add(CanonicalState(lastState.ToString()));
+    }
+    public bool Done(ReadonlySequence state) {
+      return done.ContainsKey(state);
+    public override void Reset() {
+      base.Reset();
+      done.Clear();
+    }
+  }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/RandomPolicy.cs

-                      r11742
+                      r11770
 namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies {
   public class RandomPolicy : IGrammarPolicy {
     public ReadonlySequence SelectAction(Random random, ReadonlySequence state, IEnumerable<ReadonlySequence> actions) {
       return actions.SelectRandom(random);
+  public class RandomPolicy : GrammarPolicy {
+    public RandomPolicy(IProblem problem, bool useCanonicalRepresentation)
+      : base(problem, useCanonicalRepresentation) {
+    }
+    public void UpdateReward(ReadonlySequence state, ReadonlySequence action, double reward, ReadonlySequence newState) {
+      // ignore
+    }
+    public bool Done(ReadonlySequence state) {
+      return false;
+    public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {
+      // never fail => allows re-visits of terminal states
+      selectedState = afterStates.SelectRandom(random);
+      return true;
+    }
+  }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj

-                      r11747
+                      r11770
     <Compile Include="Bandits\IBandit.cs" />
     <Compile Include="Bandits\TruncatedNormalBandit.cs" />
+    <Compile Include="GrammarPolicies\BoltzmanExplorationPolicy.cs" />
+    <Compile Include="GrammarPolicies\GenericGrammarPolicy.cs">
+      <SubType>Code</SubType>
+    </Compile>
+    <Compile Include="GrammarPolicies\TDPolicy.cs" />
+    <Compile Include="GrammarPolicies\UCTPolicy.cs" />
+    <Compile Include="GrammarPolicies\GrammarPolicy.cs" />
+    <Compile Include="GrammarPolicies\EpsGreedyPolicy.cs" />
+    <Compile Include="GrammarPolicies\GreedyPolicy.cs" />
+    <Compile Include="GrammarPolicies\IGrammarPolicy.cs" />
+    <Compile Include="GrammarPolicies\RandomNoResamplingPolicy.cs" />
     <Compile Include="GrammarPolicies\RandomPolicy.cs" />
     <Compile Include="IPolicy.cs" />

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs

-                      r11747
+                      r11770
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+namespace HeuristicLab.Algorithms.Bandits {
+namespace HeuristicLab.Algorithms.Bandits {
   public interface IBanditPolicyActionInfo {
     bool Disabled { get; }

branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IPolicy.cs

-                      r11744
+                      r11770
 using System;
 using System.Collections.Generic;
+using System.Dynamic;
 using System.Linq;
 using System.Text;
 …
 namespace HeuristicLab.Algorithms.Bandits {
+  // this interface represents a policy for reinforcement learning
+  public interface IPolicy<in TState, TAction> {
+    TAction SelectAction(Random random, TState state, IEnumerable<TAction> actions);
+    void UpdateReward(TState state, TAction action, double reward, TState newState); // reward received after taking action in state, together with the resulting new state
+    bool Done(TState state); // for deterministic MDP with deterministic rewards and goal to find a state with max reward
+  }
+  // this interface represents a policy for episodic reinforcement learning (with afterstates)
+  // here we assume that a reward is only received at the end of the episode and the update is done only after an episode is complete
+  // we also assume that the policy can fail to select one of the followStates
+  public interface IPolicy<TState> {
+    bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out TState selectedState); // selectedState \in afterStates
+  public interface IGrammarPolicy : IPolicy<ReadonlySequence, ReadonlySequence> {
+    // stateTrajectory contains the states of the episode; the reward is received at the end (only for the terminal state)
+    void UpdateReward(IEnumerable<TState> stateTrajectory, double reward);
+    void Reset(); // clears all internal state
+    // for introspection
+    double GetValue(TState state);
+    int GetTries(TState state);
+  }
+}

Note: See TracChangeset for help on using the changeset viewer.