1  using System;


2  using System.Collections.Generic;


3  using System.Diagnostics;


4  using System.Linq;


5  using System.Text;


6  using System.Threading.Tasks;


7  using HeuristicLab.Common;


8 


9  namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {


10  // policy for karmed bandit (see Auer et al. 2002)


11  public class UCB1TunedPolicy : IBanditPolicy {


12 


13  public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {


14  var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();


15 


16  int totalTries = myActionInfos.Sum(a => a.Tries);


17 


18  int aIdx = 1;


19  double bestQ = double.NegativeInfinity;


20  var bestActions = new List<int>();


21  foreach (var aInfo in myActionInfos) {


22  aIdx++;


23  double q;


24  if (aInfo.Tries == 0) {


25  q = double.PositiveInfinity;


26  } else {


27  var sumReward = aInfo.SumReward;


28  var tries = aInfo.Tries;


29 


30  var avgReward = sumReward / tries;


31  q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries)));


32  // 1/4 is upper bound of bernoulli distributed variable


33  }


34  if (q > bestQ) {


35  bestQ = q;


36  bestActions.Clear();


37  bestActions.Add(aIdx);


38  } else if (q.IsAlmost(bestQ)) {


39  bestActions.Add(aIdx);


40  }


41  }


42  Debug.Assert(bestActions.Any());


43 


44  return bestActions.SelectRandom(random);


45  }


46 


47  public IBanditPolicyActionInfo CreateActionInfo() {


48  return new MeanAndVariancePolicyActionInfo();


49  }


50 


51  private double V(MeanAndVariancePolicyActionInfo actionInfo, int totalTries) {


52  var s = actionInfo.Tries;


53  return actionInfo.RewardVariance + Math.Sqrt(2 * Math.Log(totalTries) / s);


54  }


55 


56  public override string ToString() {


57  return "UCB1TunedPolicy";


58  }


59  }


60  }

