using System;
using System.Collections.Generic;
using System.Dynamic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using HeuristicLab.Problems.GrammaticalOptimization;

namespace HeuristicLab.Algorithms.Bandits {
  /// <summary>
  /// Represents a policy for episodic reinforcement learning (with afterstates).
  /// </summary>
  /// <remarks>
  /// Assumptions stated by this contract:
  /// a reward is only received at the end of an episode, the update is done only
  /// after an episode is complete, and the policy can fail to select one of the
  /// after-states.
  /// </remarks>
  /// <typeparam name="TState">Type used to represent a state.</typeparam>
  public interface IPolicy<TState> {
    /// <summary>
    /// Tries to select one of the given after-states.
    /// </summary>
    /// <param name="random">Random number generator available to the policy for the selection.</param>
    /// <param name="curState">The current state.</param>
    /// <param name="afterStates">The candidate after-states to choose from.</param>
    /// <param name="selectedState">
    /// The selected state; on success it is an element of <paramref name="afterStates"/>.
    /// </param>
    /// <returns>
    /// Whether a state was selected. The policy is allowed to fail
    /// (presumably signalled by <c>false</c>; confirm against the implementations).
    /// </returns>
    bool TrySelect(Random random, TState curState, IEnumerable<TState> afterStates, out TState selectedState);

    /// <summary>
    /// Updates the policy with the reward received for a completed episode.
    /// </summary>
    /// <param name="stateTrajectory">
    /// The states of the episode. The reward is received only at the end,
    /// i.e. only for the terminal state.
    /// </param>
    /// <param name="reward">The reward received at the end of the episode.</param>
    void UpdateReward(IEnumerable<TState> stateTrajectory, double reward);

    /// <summary>
    /// Clears all internal state.
    /// </summary>
    void Reset();

    // The following members are for introspection.

    /// <summary>
    /// Gets the value the policy associates with <paramref name="state"/> (for introspection).
    /// </summary>
    /// <param name="state">The state to inspect.</param>
    double GetValue(TState state);

    /// <summary>
    /// Gets the number of tries recorded for <paramref name="state"/> (for introspection).
    /// </summary>
    /// <param name="state">The state to inspect.</param>
    int GetTries(TState state);
  }
}