Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
01/18/15 18:24:58 (10 years ago)
Author:
gkronber
Message:

#2283 fixed compile errors and refactoring

Location:
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
Files:
6 deleted
8 edited

Legend:

Unmodified
Added
Removed
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/EpsGreedyPolicy.cs

    r11742 r11793  
    2626    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
    2727      Debug.Assert(actionInfos.Any());
    28       if (random.NextDouble() > eps) {
     28      if (random.NextDouble() >= eps) { // eps == 0 should be equivalent to pure exploitation, eps == 1 is pure exploration
    2929        // select best
    3030        var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GenericGrammarPolicy.cs

    r11792 r11793  
    1414    private readonly IProblem problem;
    1515    private readonly IBanditPolicy banditPolicy;
    16     private readonly HashSet<string> done;
     16    //private readonly HashSet<string> done;
    1717
    1818    public GenericGrammarPolicy(IProblem problem, IBanditPolicy banditPolicy, bool useCanonicalState = false) {
     
    2121      this.banditPolicy = banditPolicy;
    2222      this.stateInfo = new Dictionary<string, IBanditPolicyActionInfo>();
    23       this.done = new HashSet<string>();
     23      //this.done = new HashSet<string>();
    2424    }
    2525
    26     public bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates,
    27       out ReadonlySequence selectedState) {
    28       // only select states that are not yet done
    29       afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a))).ToArray();
    30       if (!afterStates.Any()) {
     26    public bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) {
     27      // fail if all states are done (corresponding state infos are disabled)
     28      if (afterStates.All(s => GetStateInfo(s).Disabled)) {
    3129        // fail because all follow states have already been visited => also disable the current state (if we can be sure that it has been fully explored)
    3230
    33         done.Add(CanonicalState(curState));
    34         selectedState = null;
     31        GetStateInfo(curState).Disable(0.0); // should the value be max of afterstate values instead of 0.0?
     32        selectedStateIdx = -1;
    3533        return false;
    3634      }
    3735
     36      selectedStateIdx = banditPolicy.SelectAction(random, afterStates.Select(s => GetStateInfo(s)));
    3837
    39       var selectedIdx = banditPolicy.SelectAction(random, afterStates.Select(s => GetStateInfo(s)));
    40       selectedState = afterStates.ElementAt(selectedIdx);
    4138      return true;
    4239    }
    4340
    44     private IBanditPolicyActionInfo GetStateInfo(ReadonlySequence state) {
     41    private IBanditPolicyActionInfo GetStateInfo(string state) {
    4542      var s = CanonicalState(state);
    4643      IBanditPolicyActionInfo info;
     
    5249    }
    5350
    54     public virtual void UpdateReward(IEnumerable<ReadonlySequence> stateTrajectory, double reward) {
     51    public virtual void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {
    5552      // the last state could be terminal
    5653      var lastState = stateTrajectory.Last();
    57       if (lastState.IsTerminal) done.Add(CanonicalState(lastState));
     54      if (problem.Grammar.IsTerminal(lastState)) {
     55        GetStateInfo(lastState).Disable(reward);
     56      }
    5857
    59       foreach (var state in stateTrajectory) {
     58      // update remaining states
     59      foreach (var state in stateTrajectory.Reverse().Skip(1)) {
    6060        GetStateInfo(state).UpdateReward(reward);
    6161      }
     
    6464    public virtual void Reset() {
    6565      stateInfo.Clear();
    66       done.Clear();
     66      //done.Clear();
    6767    }
    6868
    69     public int GetTries(ReadonlySequence state) {
     69    public int GetTries(string state) {
    7070      var s = CanonicalState(state);
    7171      if (stateInfo.ContainsKey(s)) return stateInfo[s].Tries;
     
    7373    }
    7474
    75     public double GetValue(ReadonlySequence state) {
     75    public double GetValue(string state) {
    7676      var s = CanonicalState(state);
    7777      if (stateInfo.ContainsKey(s)) return stateInfo[s].Value;
     
    7979    }
    8080
    81     protected string CanonicalState(ReadonlySequence state) {
     81    protected string CanonicalState(string state) {
    8282      if (useCanonicalState) {
    83         if (state.IsTerminal)
    84           return problem.CanonicalRepresentation(state.ToString());
     83        if (problem.Grammar.IsTerminal(state))
     84          return problem.CanonicalRepresentation(state);
    8585        else {
    8686          // for non-terminal phrases make sure we don't disable canonical states that have not yet been fully explored
     
    8888          // then we are not allowed to disable rS (canonical of lllS) because rS might not have been fully explored
    8989          // solution: we disable the state rS4
    90           return problem.CanonicalRepresentation(state.ToString()) + state.Length;
     90          return problem.CanonicalRepresentation(state) + state.Length;
    9191        }
    9292      } else
    93         return state.ToString();
     93        return state;
    9494    }
    9595  }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GrammarPolicy.cs

    r11770 r11793  
    88
    99namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies {
    10   // stores: tries, avg reward and max reward for each state
     10  // stores: tries, avg reward and max reward for each state (base class for RandomPolicy and TDPolicy
    1111  public abstract class GrammarPolicy : IGrammarPolicy {
    1212    protected Dictionary<string, double> avgReward;
    1313    protected Dictionary<string, int> tries;
    1414    protected Dictionary<string, double> maxReward;
    15     private readonly bool useCanonicalState;
    16     private readonly IProblem problem;
     15    protected readonly bool useCanonicalState;
     16    protected readonly IProblem problem;
    1717
    18     public GrammarPolicy(IProblem problem, bool useCanonicalState = false) {
     18    protected GrammarPolicy(IProblem problem, bool useCanonicalState = false) {
    1919      this.useCanonicalState = useCanonicalState;
    2020      this.problem = problem;
     
    2424    }
    2525
    26     public abstract bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState);
     26    public abstract bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx);
    2727
    28     public virtual void UpdateReward(IEnumerable<ReadonlySequence> stateTrajectory, double reward) {
     28    public virtual void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {
    2929      foreach (var state in stateTrajectory) {
    30         var s = CanonicalState(state.ToString());
     30        var s = CanonicalState(state);
    3131
    3232        if (!tries.ContainsKey(s)) tries.Add(s, 0);
     
    4747    }
    4848
    49     public double AvgReward(ReadonlySequence state) {
    50       var s = CanonicalState(state.ToString());
     49    public double AvgReward(string state) {
     50      var s = CanonicalState(state);
    5151      if (avgReward.ContainsKey(s)) return avgReward[s];
    5252      else return 0.0;
    5353    }
    5454
    55     public double MaxReward(ReadonlySequence state) {
    56       var s = CanonicalState(state.ToString());
     55    public double MaxReward(string state) {
     56      var s = CanonicalState(state);
    5757      if (maxReward.ContainsKey(s)) return maxReward[s];
    5858      else return 0.0;
    5959    }
    6060
    61     public virtual int GetTries(ReadonlySequence state) {
    62       var s = CanonicalState(state.ToString());
     61    public virtual int GetTries(string state) {
     62      var s = CanonicalState(state);
    6363      if (tries.ContainsKey(s)) return tries[s];
    6464      else return 0;
    6565    }
    6666
    67     public virtual double GetValue(ReadonlySequence state) {
     67    public virtual double GetValue(string state) {
    6868      return AvgReward(state);
    6969    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/IGrammarPolicy.cs

    r11770 r11793  
    88
    99namespace HeuristicLab.Algorithms.Bandits.GrammarPolicies {
    10   public interface IGrammarPolicy : IPolicy<ReadonlySequence> {
     10  public interface IGrammarPolicy : IPolicy<string> {
    1111  }
    1212}
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/RandomPolicy.cs

    r11770 r11793  
    1313    }
    1414
    15     public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {
     15    public override bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) {
    1616      // never fail => allows re-visits of terminal states
    17       selectedState = afterStates.SelectRandom(random);
     17      selectedStateIdx = random.Next(afterStates.Count());
    1818      return true;
    1919    }
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/TDPolicy.cs

    r11770 r11793  
    22using System.Collections.Generic;
    33using System.Configuration;
     4using System.Diagnostics;
    45using System.Linq;
    56using System.Security.Policy;
     
    78using System.Threading;
    89using System.Threading.Tasks;
     10using HeuristicLab.Algorithms.Bandits.BanditPolicies;
    911using HeuristicLab.Common;
    1012using HeuristicLab.Problems.GrammaticalOptimization;
     
    1517    private readonly HashSet<string> done;
    1618    private readonly Dictionary<string, double> v;
    17     private EpsGreedyPolicy epsGreedy;
     19    private IGrammarPolicy epsGreedy;
    1820
    1921    public TDPolicy(IProblem problem, bool useCanonicalRepresentation = false)
     
    2123      this.done = new HashSet<string>();
    2224      this.v = new Dictionary<string, double>();
    23       this.epsGreedy = new EpsGreedyPolicy(problem, useCanonicalRepresentation, 0.1);
     25      this.epsGreedy = new GenericGrammarPolicy(problem, new EpsGreedyPolicy(0.1), useCanonicalRepresentation);
    2426    }
    2527
    26     public override bool TrySelect(Random random, ReadonlySequence curState, IEnumerable<ReadonlySequence> afterStates, out ReadonlySequence selectedState) {
     28    public override bool TrySelect(Random random, string curState, IEnumerable<string> afterStates, out int selectedStateIdx) {
    2729      // only select states that are not yet done
    2830      afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a.ToString()))).ToArray();
    2931      if (!afterStates.Any()) {
    3032        // fail because all follow states have already been visited => also disable the current state
    31         done.Add(CanonicalState(curState.ToString()));
    32         selectedState = null;
     33        done.Add(CanonicalState(curState));
     34        selectedStateIdx = -1;
    3335        return false;
    3436      }
     37      throw new NotImplementedException(); // TODO: remap indices of reduced action enumerable to indices of original enumerable
    3538
    3639      //return epsGreedy.TrySelect(random, curState, afterStates, out selectedState);
    3740
    3841      var bestQ = double.NegativeInfinity;
    39       selectedState = null;
     42      int idx = -1;
     43      selectedStateIdx = -1;
    4044      foreach (var state in afterStates) {
     45        idx++;
    4146        // try each state at least once
    4247        if (GetTries(state) == 0) {
    43           selectedState = state;
     48          selectedStateIdx = idx;
    4449          return true;
    4550        }
     
    4752        if (q > bestQ) {
    4853          bestQ = q;
    49           selectedState = state;
     54          selectedStateIdx = idx;
    5055        }
    5156      }
    5257
     58      Debug.Assert(selectedStateIdx > -1);
    5359      return true;
    5460    }
    5561
    56     private double V(ReadonlySequence state) {
    57       var s = CanonicalState(state.ToString());
     62    private double V(string state) {
     63      var s = CanonicalState(state);
    5864      if (v.ContainsKey(s)) return v[s];
    5965      else return 0.0;
    6066    }
    6167
    62     public override void UpdateReward(IEnumerable<ReadonlySequence> stateTrajectory, double reward) {
     68    public override void UpdateReward(IEnumerable<string> stateTrajectory, double reward) {
    6369      base.UpdateReward(stateTrajectory, reward);
    6470      epsGreedy.UpdateReward(stateTrajectory, reward);
    6571      // the last state could be terminal
    6672      var lastState = stateTrajectory.Last();
    67       if (lastState.IsTerminal) done.Add(CanonicalState(lastState.ToString()));
     73      if (problem.Grammar.IsTerminal(lastState)) done.Add(CanonicalState(lastState));
    6874
    69       v[CanonicalState(lastState.ToString())] = V(lastState) + 1.0 / GetTries(lastState) * (reward - V(lastState));
     75      v[CanonicalState(lastState)] = V(lastState) + 1.0 / GetTries(lastState) * (reward - V(lastState));
    7076
    7177      foreach (var p in stateTrajectory.Zip(stateTrajectory.Skip(1), Tuple.Create).Reverse()) {
     
    7379        var next = p.Item2;
    7480
    75         v[CanonicalState(cur.ToString())] = V(cur) + 1.0 / GetTries(cur) * (V(next) - V(cur));
     81        v[CanonicalState(cur)] = V(cur) + 1.0 / GetTries(cur) * (V(next) - V(cur));
    7682        //v[CanonicalState(cur.ToString())] = V(cur) + 0.1 * (V(next) - V(cur));
    7783      }
     
    7985    }
    8086
    81     public override double GetValue(ReadonlySequence state) {
     87    public override double GetValue(string state) {
    8288      return V(state);
    8389    }
    8490
    85     public void Reset() {
     91    public override void Reset() {
    8692      base.Reset();
    8793      epsGreedy.Reset();
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/HeuristicLab.Algorithms.Bandits.csproj

    r11770 r11793  
    6666    <Compile Include="Bandits\IBandit.cs" />
    6767    <Compile Include="Bandits\TruncatedNormalBandit.cs" />
    68     <Compile Include="GrammarPolicies\BoltzmanExplorationPolicy.cs" />
    6968    <Compile Include="GrammarPolicies\GenericGrammarPolicy.cs">
    7069      <SubType>Code</SubType>
    7170    </Compile>
     71    <Compile Include="GrammarPolicies\RandomPolicy.cs">
     72      <SubType>Code</SubType>
     73    </Compile>
    7274    <Compile Include="GrammarPolicies\TDPolicy.cs" />
    73     <Compile Include="GrammarPolicies\UCTPolicy.cs" />
    7475    <Compile Include="GrammarPolicies\GrammarPolicy.cs" />
    75     <Compile Include="GrammarPolicies\EpsGreedyPolicy.cs" />
    76     <Compile Include="GrammarPolicies\GreedyPolicy.cs" />
    7776    <Compile Include="GrammarPolicies\IGrammarPolicy.cs" />
    78     <Compile Include="GrammarPolicies\RandomNoResamplingPolicy.cs" />
    79     <Compile Include="GrammarPolicies\RandomPolicy.cs" />
    8077    <Compile Include="IPolicy.cs" />
    8178    <Compile Include="IBanditPolicy.cs" />
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/IPolicy.cs

    r11770 r11793  
    1111  // here we assume that a reward is only recieved at the end of the episode and the update is done only after an episode is complete
    1212  // we also assume that the policy can fail to select one of the followStates
    13   public interface IPolicy<TState> {
    14     bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out TState selectedState); // selectedState \in afterStates
     13  public interface IPolicy<in TState> {
     14    bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out int selectedStateIdx); // selectedState \in afterStates
    1515
    1616    // state-trajectory are the states of the episode, at the end we recieved the reward (only for the terminal state)
Note: See TracChangeset for help on using the changeset viewer.