
Timestamp:
01/16/15 18:26:35
Author:
gkronber
Message:

#2283 work-in-progress commit (does not compile)

Location:
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits
Files:
7 edited

Legend:

Lines prefixed with "-" were removed, lines prefixed with "+" were added; unprefixed lines are unmodified context. "…" marks skipped lines between hunks.
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ActiveLearningPolicy.cs

    r11747 → r11792

        public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
          var myActionInfos = actionInfos.OfType<DefaultPolicyActionInfo>();
    -     double bestQ = double.NegativeInfinity;
          int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
          const double delta = 0.1;
    …
            double l;
            if (aInfo.Tries == 0) {
    -         u = 1.0;
    -         l = 0.0;
    +         u = double.PositiveInfinity;
    +         l = double.NegativeInfinity;
            } else {
              q = aInfo.SumReward / aInfo.Tries;
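    This hunk replaces the finite default bounds for untried arms with infinite ones: an arm with no samples now always looks at least as attractive as any sampled arm, so it is pulled before the confidence intervals start to matter. A minimal sketch of this optimistic-initialization pattern, with a hypothetical ArmInfo type standing in for DefaultPolicyActionInfo (not the actual HeuristicLab code):

    using System;
    using System.Collections.Generic;

    // Hypothetical, simplified stand-in for the policy's action-info type.
    record ArmInfo(int Tries, double SumReward);

    static class OptimisticSelect {
      // Returns the index of the arm with the highest upper bound.
      // Untried arms get an infinite bound, so each is explored once
      // before any confidence-based comparison kicks in.
      public static int Select(IReadOnlyList<ArmInfo> arms) {
        int best = -1;
        double bestU = double.NegativeInfinity;
        for (int i = 0; i < arms.Count; i++) {
          double u = arms[i].Tries == 0
            ? double.PositiveInfinity               // optimistic bound for untried arms
            : arms[i].SumReward / arms[i].Tries;    // placeholder: real policies add a confidence term
          if (u > bestU) { bestU = u; best = i; }
        }
        return best;
      }
    }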
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ChernoffIntervalEstimationPolicy.cs

    r11742 → r11792

        using System.Text;
        using System.Threading.Tasks;
    +   using HeuristicLab.Common;

        namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
    …
          int k = myActionInfos.Count(a => !a.Disabled);
          int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
    -     int bestAction = -1;
          double bestQ = double.NegativeInfinity;
    +     var bestActions = new List<int>();
          var aIdx = -1;
          foreach (var aInfo in myActionInfos) {
            aIdx++;
            if (aInfo.Disabled) continue;
    -       if (aInfo.Tries == 0) return aIdx;
    +       double q;
    +       if (aInfo.Tries == 0) {
    +         q = double.PositiveInfinity;
    +       } else {

    -       var avgReward = aInfo.SumReward / aInfo.Tries;
    +         var avgReward = aInfo.SumReward / aInfo.Tries;

    -       // page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
    -       // var alpha = Math.Log(2 * totalTries * k / delta);
    -       double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta); // total tries is max tries in the original paper
    -       var q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
    +         // page 5 of "A simple distribution-free approach to the max k-armed bandit problem"
    +         // var alpha = Math.Log(2 * totalTries * k / delta);
    +         double alpha = Math.Log(2.0) + Math.Log(totalTries) + Math.Log(k) - Math.Log(delta);
    +         // total tries is max tries in the original paper
    +         q = avgReward + (alpha + Math.Sqrt(2 * aInfo.Tries * avgReward * alpha + alpha * alpha)) / aInfo.Tries;
    +       }
            if (q > bestQ) {
              bestQ = q;
    -         bestAction = aIdx;
    +         bestActions.Clear();
    +         bestActions.Add(aIdx);
    +       } else if (q == bestQ) {
    +         bestActions.Add(aIdx);
            }
          }
    -     Debug.Assert(bestAction >= 0);
    -     return bestAction;
    +     Debug.Assert(bestActions.Any());
    +     return bestActions.SelectRandom(random);
        }

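    Throughout this changeset the single bestAction variable is replaced by a bestActions list, so that exact ties (common once all untried arms share q = +infinity) are broken uniformly at random instead of always favoring the lowest index. The diff relies on a SelectRandom helper pulled in via the new HeuristicLab.Common using; its actual implementation is not shown, but a minimal extension method with the shape the call sites imply could look like this (an assumption, not the library's code):

    using System;
    using System.Collections.Generic;

    public static class CollectionExtensions {
      // Picks one element uniformly at random. Signature inferred from the
      // call sites above (bestActions.SelectRandom(random)); the actual
      // HeuristicLab.Common implementation may differ.
      public static T SelectRandom<T>(this IList<T> xs, Random random) {
        if (xs.Count == 0) throw new InvalidOperationException("empty list");
        return xs[random.Next(xs.Count)];
      }
    }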
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/ThresholdAscentPolicy.cs

    r11747 → r11792

          get {
            if (Disabled) return knownValue;
    -       if(Tries == 0.0) return 0.0;
    +       if (Tries == 0.0) return 0.0;
            return rewardHistogram[thresholdBin] / (double)Tries;
          }
    …
        UpdateThreshold(myActionInfos);

    -   int bestAction = -1;
    +   var bestActions = new List<int>();
        double bestQ = double.NegativeInfinity;
        int k = myActionInfos.Count(a => !a.Disabled);
    …
          aIdx++;
          if (aInfo.Disabled) continue;
    -     if (aInfo.Tries == 0) return aIdx;
    -     double mu = aInfo.Value; // probability of rewards > T
    -     double q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
    +     double q;
    +     if (aInfo.Tries == 0) {
    +       q = double.PositiveInfinity;
    +     } else {
    +       double mu = aInfo.Value; // probability of rewards > T
    +       q = U(mu, totalTries, aInfo.Tries, k); // totalTries is max iterations in original paper
    +     }
          if (q > bestQ) {
            bestQ = q;
    -       bestAction = aIdx;
    +       bestActions.Clear();
    +       bestActions.Add(aIdx);
    +     } else if (q == bestQ) {
    +       bestActions.Add(aIdx);
          }
        }
    -   Debug.Assert(bestAction > -1);
    -   return bestAction;
    +   Debug.Assert(bestActions.Any());
    +   return bestActions.SelectRandom(random);
      }

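    The U bound called here is not part of the hunk. Threshold Ascent comes from the same Streeter & Smith paper cited in ChernoffIntervalEstimationPolicy.cs, where the upper confidence bound on mu = P(reward > T) has the same Chernoff form; a plausible sketch based on that paper, not the file's actual implementation:

    using System;

    static class ThresholdAscentBound {
      // Chernoff-style upper bound on mu = P(reward > T), cf. page 5 of
      // "A simple distribution-free approach to the max k-armed bandit problem".
      // n = total tries (max iterations in the paper), s = tries of this arm,
      // k = number of enabled arms, delta = confidence parameter.
      // Callers guarantee s > 0 (untried arms short-circuit to +infinity above).
      static double U(double mu, int n, int s, int k, double delta = 0.1) {
        double alpha = Math.Log(2.0) + Math.Log(n) + Math.Log(k) - Math.Log(delta);
        return mu + (alpha + Math.Sqrt(2 * s * mu * alpha + alpha * alpha)) / s;
      }
    }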
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCB1TunedPolicy.cs

    r11742 → r11792

        using System.Text;
        using System.Threading.Tasks;
    +   using HeuristicLab.Common;

        namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
    …
        public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
          var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();
    -     int bestAction = -1;
    -     double bestQ = double.NegativeInfinity;
    +
          int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);

          int aIdx = -1;
    +     double bestQ = double.NegativeInfinity;
    +     var bestActions = new List<int>();
          foreach (var aInfo in myActionInfos) {
            aIdx++;
            if (aInfo.Disabled) continue;
    -       if (aInfo.Tries == 0) return aIdx;
    +       double q;
    +       if (aInfo.Tries == 0) {
    +         q = double.PositiveInfinity;
    +       } else {
    +         var sumReward = aInfo.SumReward;
    +         var tries = aInfo.Tries;

    -       var sumReward = aInfo.SumReward;
    -       var tries = aInfo.Tries;
    -
    -       var avgReward = sumReward / tries;
    -       var q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries))); // 1/4 is upper bound of bernoulli distributed variable
    +         var avgReward = sumReward / tries;
    +         q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries)));
    +         // 1/4 is upper bound of bernoulli distributed variable
    +       }
            if (q > bestQ) {
              bestQ = q;
    -         bestAction = aIdx;
    +         bestActions.Clear();
    +         bestActions.Add(aIdx);
    +       } else if (q == bestQ) {
    +         bestActions.Add(aIdx);
            }
          }
    -     Debug.Assert(bestAction > -1);
    -     return bestAction;
    +     Debug.Assert(bestActions.Any());
    +
    +     return bestActions.SelectRandom(random);
        }

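    The V(aInfo, totalTries) term is referenced but not shown in this hunk. In Auer, Cesa-Bianchi & Fischer (2002), where UCB1-Tuned is defined, V_j(s) is the empirical reward variance of arm j plus an exploration term. A sketch of that formula, assuming the action info can supply a sum of squared rewards (the member names here are hypothetical):

    using System;

    static class Ucb1Tuned {
      // V_j(s) = (1/s) * sum(x^2) - avg^2 + sqrt(2 ln(t) / s), per Auer et al. (2002).
      // sumSquaredReward is a hypothetical input; MeanAndVariancePolicyActionInfo
      // may instead expose the variance directly (cf. RewardVariance below).
      static double V(double sumReward, double sumSquaredReward, int s, int t) {
        double avg = sumReward / s;
        return sumSquaredReward / s - avg * avg + Math.Sqrt(2 * Math.Log(t) / s);
      }
    }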
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/BanditPolicies/UCBNormalPolicy.cs

    r11742 → r11792

        using System.Text;
        using System.Threading.Tasks;
    +   using HeuristicLab.Common;

        namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
    …
        public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
          var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();
    -     int bestAction = -1;
    +     int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
          double bestQ = double.NegativeInfinity;
    -     int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
          int aIdx = -1;
    +     var bestActions = new List<int>();
          foreach (var aInfo in myActionInfos) {
            aIdx++;
            if (aInfo.Disabled) continue;
    -       if (totalTries <= 1 || aInfo.Tries <= 1 || aInfo.Tries <= Math.Ceiling(8 * Math.Log(totalTries))) return aIdx;
    -
    -       var tries = aInfo.Tries;
    -       var avgReward = aInfo.AvgReward;
    -       var rewardVariance = aInfo.RewardVariance;
    -       var estVariance = 16.0 * rewardVariance * (Math.Log(totalTries - 1) / tries);
    -       var q = avgReward + Math.Sqrt(estVariance);
    +       double q;
    +       if (totalTries <= 1 || aInfo.Tries <= 1 || aInfo.Tries <= Math.Ceiling(8 * Math.Log(totalTries))) {
    +         q = double.PositiveInfinity;
    +       } else {
    +         var tries = aInfo.Tries;
    +         var avgReward = aInfo.AvgReward;
    +         var rewardVariance = aInfo.RewardVariance;
    +         var estVariance = 16.0 * rewardVariance * (Math.Log(totalTries - 1) / tries);
    +         q = avgReward + Math.Sqrt(estVariance);
    +       }
            if (q > bestQ) {
              bestQ = q;
    -         bestAction = aIdx;
    +         bestActions.Clear();
    +         bestActions.Add(aIdx);
    +       } else if (q == bestQ) {
    +         bestActions.Add(aIdx);
            }
          }
    -     Debug.Assert(bestAction > -1);
    -     return bestAction;
    +     Debug.Assert(bestActions.Any());
    +     return bestActions.SelectRandom(random);
        }

  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/GrammarPolicies/GenericGrammarPolicy.cs

    r11770 → r11792

          out ReadonlySequence selectedState) {
          // only select states that are not yet done
    -     afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a.ToString()))).ToArray();
    +     afterStates = afterStates.Where(a => !done.Contains(CanonicalState(a))).ToArray();
          if (!afterStates.Any()) {
            // fail because all follow states have already been visited => also disable the current state (if we can be sure that it has been fully explored)
    -       throw new NotImplementedException();
    -       //var curStateCanonical = CanonicalState(curState.ToString());
    -       //if (curState.ToString().Length == curStateCanonical.Length)
    -         done.Add(CanonicalState(curState.ToString()));
    +
    +       done.Add(CanonicalState(curState));
            selectedState = null;
            return false;
    …
        private IBanditPolicyActionInfo GetStateInfo(ReadonlySequence state) {
    -     var s = CanonicalState(state.ToString());
    +     var s = CanonicalState(state);
          IBanditPolicyActionInfo info;
          if (!stateInfo.TryGetValue(s, out info)) {
    …
          // the last state could be terminal
          var lastState = stateTrajectory.Last();
    -     if (lastState.IsTerminal) done.Add(CanonicalState(lastState.ToString()));
    +     if (lastState.IsTerminal) done.Add(CanonicalState(lastState));

          foreach (var state in stateTrajectory) {
    …
        public int GetTries(ReadonlySequence state) {
    -     var s = CanonicalState(state.ToString());
    +     var s = CanonicalState(state);
          if (stateInfo.ContainsKey(s)) return stateInfo[s].Tries;
          else return 0;
    …
        public double GetValue(ReadonlySequence state) {
    -     var s = CanonicalState(state.ToString());
    +     var s = CanonicalState(state);
          if (stateInfo.ContainsKey(s)) return stateInfo[s].Value;
          else return 0.0; // TODO: check alternatives
        }

    -   protected string CanonicalState(string state) {
    -     if (useCanonicalState) return problem.CanonicalRepresentation(state);
    -     else return state;
    +   protected string CanonicalState(ReadonlySequence state) {
    +     if (useCanonicalState) {
    +       if (state.IsTerminal)
    +         return problem.CanonicalRepresentation(state.ToString());
    +       else {
    +         // for non-terminal phrases make sure we don't disable canonical states that have not yet been fully explored
    +         // e.g. if for the ant problem we have the phrase lllS (and we are limited to 4 symbols) and lllr as well as llll are explored
    +         // then we are not allowed to disable rS (canonical of lllS) because rS might not have been fully explored
    +         // solution: we disable the state rS4
    +         return problem.CanonicalRepresentation(state.ToString()) + state.Length;
    +       }
    +     } else
    +       return state.ToString();
        }
      }
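    The new CanonicalState overload keys non-terminal phrases by canonical form plus length, for exactly the reason the comment gives: two phrases of different lengths can share a canonical form, and marking the shorter one done must not disable the longer, still-unexplored one. A tiny illustration of the keying scheme, with a hypothetical Canonical function standing in for problem.CanonicalRepresentation:

    using System;

    class CanonicalKeyDemo {
      // Hypothetical stand-in: collapses e.g. "lll" to "r" (ant-problem rotations).
      static string Canonical(string phrase) => phrase.Replace("lll", "r");

      static void Main() {
        // Terminal phrases are keyed by canonical form alone; non-terminal
        // phrases additionally carry their length, so "lllS" (length 4)
        // maps to "rS4" and cannot collide with a genuine "rS" of length 2.
        Console.WriteLine(Canonical("lllS") + "lllS".Length); // rS4
        Console.WriteLine(Canonical("rS") + "rS".Length);     // rS2
      }
    }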
  • branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Algorithms.Bandits/Models/BernoulliModel.cs

    r11742 → r11792

        public void Update(double reward) {
    -     Debug.Assert(reward.IsAlmost(1.0) || reward.IsAlmost(0.0));
    -     if (reward.IsAlmost(1.0)) {
    +     // Debug.Assert(reward.IsAlmost(1.0) || reward.IsAlmost(0.0));
    +     if (reward > 0) {
            success++;
          } else {
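    This relaxes the model from strictly 0/1 rewards to a thresholded view: any strictly positive reward now counts as a success and anything else as a failure, so non-binary reward scales no longer trip the old assertion. A minimal sketch of the updated counting semantics (simplified; the actual BernoulliModel carries more state, and the 0.5 default below is an assumption):

    // Simplified sketch of the relaxed Update semantics, not the actual class.
    class BernoulliCounts {
      int success, failure;

      public void Update(double reward) {
        // Old behavior asserted reward was (almost) exactly 0.0 or 1.0.
        // New behavior: any strictly positive reward is a success.
        if (reward > 0) success++;
        else failure++;
      }

      // Empirical success probability; 0.5 before any observation (assumed prior).
      public double Estimate =>
        success + failure == 0 ? 0.5 : (double)success / (success + failure);
    }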