using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Text; using System.Threading.Tasks; using HeuristicLab.Common; namespace HeuristicLab.Algorithms.Bandits.BanditPolicies { // stores information that is relevant for most of the policies public class ExtremeHunterActionInfo : IBanditPolicyActionInfo { // hills estimator using a fraction r of the largest sample values private class OnlineHillEstimator { private MinHeap minHeap; private MaxHeap maxHeap; public double R { get; set; } private long n; public double Alpha { get { if (minHeap.Count <= 1) return double.PositiveInfinity; double xk = minHeap.GetMin(); if (xk.IsAlmost(0.0)) return double.PositiveInfinity; var alpha = 1.0 / (minHeap.Count - 1) * minHeap.Skip(1).Sum(x => Math.Log(x) - Math.Log(xk)); Debug.Assert(alpha > 0); return alpha; } } public OnlineHillEstimator(double r = 0.1) { this.R = r; this.minHeap = new MinHeap(); this.maxHeap = new MaxHeap(); n = 0; } private double maxNotInHeap = double.NegativeInfinity; public void Update(double reward) { // if we found a new larger reward then the best n*r so far then insert and drop the smallest if (minHeap.Count > 0 && reward > minHeap.GetMin()) { maxHeap.Add(minHeap.ExtractDominating()); minHeap.Add(reward); } else { maxHeap.Add(reward); } // increase number of observed rewards n++; // now it could be possible that we need to increase the size of the heap by 1 // => add the largest observed reward so far which is not in the heap if (((int)Math.Floor(n * R)) > minHeap.Count) { minHeap.Add(maxHeap.ExtractDominating()); } Debug.Assert(minHeap.Count == ((int)Math.Floor(n * R))); Debug.Assert(maxHeap.Count == 0 || minHeap.Count == 0 || maxHeap.GetMin() <= minHeap.GetMin()); } } public int Tries { get; private set; } public IEnumerable Rewards { get { return rewards; } } private OnlineHillEstimator hillEstimator; private List rewards; public double MaxReward { get; private set; } public double Value { get { return hillEstimator.Alpha; } } public ExtremeHunterActionInfo() { Reset(); } public void UpdateReward(double reward) { if (reward < 0.0) throw new ArgumentException("reward"); MaxReward = Math.Max(MaxReward, reward); Tries++; reward = (1 / (1 - reward)); // transformation from [0..1] rewards.Add(reward); hillEstimator.Update(reward); } public void Reset() { MaxReward = double.NegativeInfinity; this.hillEstimator = new OnlineHillEstimator(); this.rewards = new List(); this.Tries = 0; } public override string ToString() { return string.Format("{0:F3} {1}", Value, Tries); } } }