1  using System;


2  using System.Collections.Generic;


3  using System.Diagnostics;


4  using System.Linq;


5  using System.Text;


6  using System.Threading.Tasks;


7 


namespace HeuristicLab.Algorithms.Bandits.BanditPolicies {
  // Policy for the k-armed bandit problem: UCB1-Tuned (see Auer et al. 2002,
  // "Finite-time Analysis of the Multiarmed Bandit Problem").
  public class UCB1TunedPolicy : IBanditPolicy {

    /// <summary>
    /// Selects the action (0-based index into <paramref name="actionInfos"/>) with the
    /// highest UCB1-Tuned upper confidence bound. Any enabled action that has never been
    /// tried is selected immediately, so every arm is explored at least once.
    /// </summary>
    /// <param name="random">Unused by this deterministic policy; kept for the IBanditPolicy interface.</param>
    /// <param name="actionInfos">Per-action statistics; disabled actions are skipped.</param>
    /// <returns>The 0-based index of the selected action.</returns>
    public int SelectAction(Random random, IEnumerable<IBanditPolicyActionInfo> actionInfos) {
      var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>();
      // BUG FIX: bestAction and aIdx were initialized to 1, which shifted every
      // reported action index by 2 and could return an index past the last action.
      int bestAction = -1;
      double bestQ = double.NegativeInfinity;
      int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);

      int aIdx = -1;
      foreach (var aInfo in myActionInfos) {
        aIdx++;
        if (aInfo.Disabled) continue;
        if (aInfo.Tries == 0) return aIdx; // always explore untried actions first

        var sumReward = aInfo.SumReward;
        var tries = aInfo.Tries;

        var avgReward = sumReward / tries;
        // 1/4 is an upper bound on the variance of a Bernoulli distributed variable,
        // so the exploration term is capped by min(1/4, V_j) per UCB1-Tuned.
        var q = avgReward + Math.Sqrt((Math.Log(totalTries) / tries) * Math.Min(1.0 / 4, V(aInfo, totalTries)));
        if (q > bestQ) {
          bestQ = q;
          bestAction = aIdx;
        }
      }
      // BUG FIX: was bestAction > 1, which rejected legitimate selections of
      // action 0 or 1; the invariant is only that *some* action was selected.
      Debug.Assert(bestAction > -1);
      return bestAction;
    }

    public IBanditPolicyActionInfo CreateActionInfo() {
      return new MeanAndVariancePolicyActionInfo();
    }

    // Variance-based exploration estimate V_j(s) from Auer et al. 2002:
    // sample variance plus a confidence term sqrt(2 ln(t) / s).
    // Callers guarantee Tries > 0 (untried actions return early in SelectAction).
    private double V(MeanAndVariancePolicyActionInfo actionInfo, int totalTries) {
      var s = actionInfo.Tries;
      return actionInfo.RewardVariance + Math.Sqrt(2 * Math.Log(totalTries) / s);
    }

    public override string ToString() {
      return "UCB1TunedPolicy";
    }
  }
}

