Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
01/15/15 18:59:07 (9 years ago)
Author:
gkronber
Message:

#2283: worked on generic sequential search alg with bandit policy as parameter

Location:
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
Files:
8 added
5 edited

Legend:

Unmodified
Added
Removed
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/RandomNoResamplingPolicy.cs

    r11742 r11770  
    11using System;
    22using System.Collections.Generic;
     3using System.Configuration;
    34using System.Linq;
     5using System.Security.Policy;
    46using System.Text;
    5 using System.Threading.Tasks;
    67using HeuristicLab.Common;
    78using HeuristicLab.Problems.GrammaticalOptimization;
    89
    910namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies {
    10   public class RandomNoResamplingPolicy : IGrammarPolicy {
     11  public class RandomNoResamplingPolicy : GrammarPolicy {
    1112
    12     private readonly Dictionary<ReadonlySequence, bool> done;
    13     private readonly Dictionary<Tuple<ReadonlySequence, ReadonlySequence>, ReadonlySequence> nextState;
     13    private readonly HashSet<string> done;
    1414
    15 
    16     public RandomNoResamplingPolicy() {
    17       this.done = new Dictionary<ReadonlySequence, bool>();
     15    public RandomNoResamplingPolicy(IProblem problem, bool useCanonicalRepresentation)
     16      : base(problem, useCanonicalRepresentation) {
     17      this.done = new HashSet<string>();
    1818    }
    1919
    20     public ReadonlySequence SelectAction(Random random, ReadonlySequence state, IEnumerable<ReadonlySequence> actions) {
    21       var allDone = true;
    22       foreach (var a in actions) {
    23         var p = Tuple.Create(state, a);
    24         allDone &= nextState.ContainsKey(p) && Done(nextState[p]);
    25         if (!allDone) break;
     20    public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {
     21      // only select states that are not yet done
     22      afterStates = afterStates.Where(a => !done.Contains(a.ToString())).ToArray();
     23      if (!afterStates.Any()) {
     24        // fail because all follow states have already been visited => also disable the current state
     25        done.Add(CanonicalState(curState.ToString()));
     26        selectedState = null;
     27        return false;
    2628      }
    27       if(allDone)
    28       return actions
    29         .Where(a => !nextState.ContainsKey(Tuple.Create(state, a)) || Done(nextState[Tuple.Create(state, a)]))
    30         .SelectRandom(random);
     29
     30      selectedState = afterStates.SelectRandom(random);
     31      return true;
    3132    }
    3233
    33     public void UpdateReward(ReadonlySequence state, ReadonlySequence action, double reward, ReadonlySequence newState) {
    34       var key = Tuple.Create(state, action);
    35       nextState[key] = newState;
    36       if (newState.IsTerminal) done[newState] = true;
    37       if
     34    public override void UpdateReward(IEnumerable<ReadonlySequence> stateTrajectory, double reward) {
     35      base.UpdateReward(stateTrajectory, reward);
     36      // ignore rewards but update the set of visited terminal states
     37
     38      // the last state could be terminal
     39      var lastState = stateTrajectory.Last();
     40      if (lastState.IsTerminal) done.Add(CanonicalState(lastState.ToString()));
    3841    }
    3942
    40     public bool Done(ReadonlySequence state) {
    41       return done.ContainsKey(state);
     43    public override void Reset() {
     44      base.Reset();
     45      done.Clear();
    4246    }
    4347  }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/RandomPolicy.cs

    r11742 r11770  
    88
    99namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies {
    10   public class RandomPolicy : IGrammarPolicy {
    11     public ReadonlySequence SelectAction(Random random, ReadonlySequence state, IEnumerable<ReadonlySequence> actions) {
    12       return actions.SelectRandom(random);
     10  public class RandomPolicy : GrammarPolicy {
     11    public RandomPolicy(IProblem problem, bool useCanonicalRepresentation)
     12      : base(problem, useCanonicalRepresentation) {
    1313    }
    1414
    15     public void UpdateReward(ReadonlySequence state, ReadonlySequence action, double reward, ReadonlySequence newState) {
    16       // ignore
    17     }
    18 
    19     public bool Done(ReadonlySequence state) {
    20       return false;
     15    public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {
     16      // never fail => allows re-visits of terminal states
     17      selectedState = afterStates.SelectRandom(random);
     18      return true;
    2119    }
    2220  }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj

    r11747 r11770  
    6666    <Compile Include="Bandits\IBandit.cs" />
    6767    <Compile Include="Bandits\TruncatedNormalBandit.cs" />
     68    <Compile Include="GrammarPolicies\BoltzmanExplorationPolicy.cs" />
     69    <Compile Include="GrammarPolicies\GenericGrammarPolicy.cs">
     70      <SubType>Code</SubType>
     71    </Compile>
     72    <Compile Include="GrammarPolicies\TDPolicy.cs" />
     73    <Compile Include="GrammarPolicies\UCTPolicy.cs" />
     74    <Compile Include="GrammarPolicies\GrammarPolicy.cs" />
     75    <Compile Include="GrammarPolicies\EpsGreedyPolicy.cs" />
     76    <Compile Include="GrammarPolicies\GreedyPolicy.cs" />
     77    <Compile Include="GrammarPolicies\IGrammarPolicy.cs" />
     78    <Compile Include="GrammarPolicies\RandomNoResamplingPolicy.cs" />
    6879    <Compile Include="GrammarPolicies\RandomPolicy.cs" />
    6980    <Compile Include="IPolicy.cs" />
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IBanditPolicyActionInfo.cs

    r11747 r11770  
    1 using System;
    2 using System.Collections.Generic;
    3 using System.Linq;
    4 using System.Text;
    5 using System.Threading.Tasks;
    6 
    7 namespace HeuristicLab.Algorithms.Bandits {
     1namespace HeuristicLab.Algorithms.Bandits {
    82  public interface IBanditPolicyActionInfo {
    93    bool Disabled { get; }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IPolicy.cs

    r11744 r11770  
    11using System;
    22using System.Collections.Generic;
     3using System.Dynamic;
    34using System.Linq;
    45using System.Text;
     
    78
    89namespace HeuristicLab.Algorithms.Bandits {
    9   // this interface represents a policy for reinforcement learning
    10   public interface IPolicy<in TState, TAction> {
    11     TAction SelectAction(Random random, TState state, IEnumerable<TAction> actions);
    12     void UpdateReward(TState state, TAction action, double reward, TState newState); // reward received when after taking action in state and new state
    13     bool Done(TState state); // for deterministic MDP with deterministic rewards and goal to find a state with max reward
    14   }
     10  // this interface represents a policy for episodic reinforcement learning (with afterstates)
     11  // here we assume that a reward is only received at the end of the episode and the update is done only after an episode is complete
     12  // we also assume that the policy can fail to select one of the afterStates
     13  public interface IPolicy<TState> {
     14    bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out TState selectedState); // selectedState \in afterStates
    1515
    16   public interface IGrammarPolicy : IPolicy<ReadonlySequence, ReadonlySequence> {
     16    // the state trajectory contains the states of the episode; the reward is received at the end (only for the terminal state)
     17    void UpdateReward(IEnumerable<TState> stateTrajectory, double reward);
    1718
     19    void Reset(); // clears all internal state
     20
     21    // for introspection
     22    double GetValue(TState state);
     23    int GetTries(TState state);
    1824  }
    1925}
Note: See TracChangeset for help on using the changeset viewer.