1  using System;


2  using System.Collections.Generic;


3  using System.Diagnostics;


4  using System.Linq;


5  using System.Text;


6  using System.Threading.Tasks;


7 


8  namespace HeuristicLab.Algorithms.Bandits {


/// <summary>
/// Bandit policy that scores every enabled action by its average reward plus an
/// exploration bonus, where the bonus is scaled by a variance estimate capped at 1/4
/// (the UCB1-Tuned rule), and selects the action with the highest score.
/// </summary>
public class UCB1TunedPolicy : IPolicy {

  /// <summary>
  /// Chooses the action to try next.
  /// </summary>
  /// <param name="random">Not used by this policy.</param>
  /// <param name="actionInfos">
  /// Statistics of the available actions. Only elements of type
  /// <see cref="MeanAndVariancePolicyActionInfo"/> are considered, so the returned index
  /// refers to the position within that filtered sequence.
  /// </param>
  /// <returns>
  /// The index of the first enabled action that has never been tried, otherwise the index
  /// of the enabled action with the largest score (the earliest one wins on ties).
  /// Returns -1 when no enabled action could be scored; debug builds assert against this.
  /// </returns>
  public int SelectAction(Random random, IEnumerable<IPolicyActionInfo> actionInfos) {
    var myActionInfos = actionInfos.OfType<MeanAndVariancePolicyActionInfo>().ToArray(); // TODO: performance
    int bestAction = -1; // sentinel: no action selected yet
    double bestQ = double.NegativeInfinity;
    int totalTries = myActionInfos.Where(a => !a.Disabled).Sum(a => a.Tries);
    // loop-invariant, so it is computed only once
    double logTotalTries = Math.Log(totalTries);

    for (int a = 0; a < myActionInfos.Length; a++) {
      var info = myActionInfos[a];
      if (info.Disabled) continue;
      // every enabled action is tried once before any scores are compared
      if (info.Tries == 0) return a;

      var sumReward = info.SumReward;
      var tries = info.Tries;

      var avgReward = sumReward / tries;
      // 1/4 is the upper bound on the variance of a Bernoulli-distributed variable
      var q = avgReward + Math.Sqrt((logTotalTries / tries) * Math.Min(1.0 / 4, V(info, totalTries)));
      if (q > bestQ) {
        bestQ = q;
        bestAction = a;
      }
    }
    // index 0 is a valid choice, so only the sentinel -1 indicates a failure
    Debug.Assert(bestAction > -1);
    return bestAction;
  }

  /// <summary>
  /// Creates the per-action statistics object this policy operates on.
  /// </summary>
  /// <returns>A new <see cref="MeanAndVariancePolicyActionInfo"/>.</returns>
  public IPolicyActionInfo CreateActionInfo() {
    return new MeanAndVariancePolicyActionInfo();
  }

  /// <summary>
  /// Variance estimate used in the exploration bonus: the action's reward variance
  /// plus sqrt(2 * ln(totalTries) / tries).
  /// </summary>
  /// <param name="actionInfo">Statistics of the action; its Tries value is used as divisor.</param>
  /// <param name="totalTries">Total number of tries over all enabled actions.</param>
  /// <returns>The (uncapped) variance estimate.</returns>
  private double V(MeanAndVariancePolicyActionInfo actionInfo, int totalTries) {
    var s = actionInfo.Tries;
    return actionInfo.RewardVariance + Math.Sqrt(2 * Math.Log(totalTries) / s);
  }

  /// <summary>
  /// Returns the display name of this policy.
  /// </summary>
  public override string ToString() {
    return "UCB1TunedPolicy";
  }
}


48  }

