#region License Information
/* HeuristicLab
* Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see .
*/
#endregion
using System;
using System.Diagnostics;
using System.IO;
using System.Reflection;
using System.ServiceModel;
using System.Threading;
using System.Threading.Tasks;
using HeuristicLab.Common;
using HeuristicLab.Core;
using TS = System.Threading.Tasks;
namespace HeuristicLab.Clients.Hive.SlaveCore {
///
/// The core component of the Hive Slave.
/// Handles commands sent from the Hive Server and does all webservice calls for jobs.
///
public class Core : MarshalByRefObject {
private static readonly object locker = new object();
private static HeartbeatManager heartbeatManager;
public static HeartbeatManager HeartbeatManager {
get { return heartbeatManager; }
}
public EventLog ServiceEventLog { get; set; }
private Semaphore waitShutdownSem = new Semaphore(0, 1);
private bool abortRequested;
private ServiceHost slaveComm;
private WcfService wcfService;
private TaskManager taskManager;
private ConfigManager configManager;
private PluginManager pluginManager;
public Core() {
var log = new ThreadSafeLog(SlaveCore.Properties.Settings.Default.MaxLogCount);
this.pluginManager = new PluginManager(WcfService.Instance, log);
this.taskManager = new TaskManager(pluginManager, log);
log.MessageAdded += new EventHandler>(log_MessageAdded);
RegisterTaskManagerEvents();
this.configManager = new ConfigManager(taskManager);
ConfigManager.Instance = this.configManager;
}
///
/// Main method for the client
///
public void Start() {
abortRequested = false;
EventLogManager.ServiceEventLog = ServiceEventLog;
try {
//start the client communication service (pipe between slave and slave gui)
slaveComm = new ServiceHost(typeof(SlaveCommunicationService));
try {
slaveComm.Open();
}
catch (AddressAlreadyInUseException ex) {
if (ServiceEventLog != null) {
EventLogManager.LogException(ex);
}
}
// delete all left over task directories
pluginManager.CleanPluginTemp();
SlaveClientCom.Instance.LogMessage("Hive Slave started");
wcfService = WcfService.Instance;
RegisterServiceEvents();
StartHeartbeats(); // Start heartbeats thread
DispatchMessageQueue(); // dispatch messages until abortRequested
}
catch (Exception ex) {
if (ServiceEventLog != null) {
EventLogManager.LogException(ex);
} else {
//try to log with SlaveClientCom.Instance. if this works the user sees at least a message,
//else an exception will be thrown anyways.
SlaveClientCom.Instance.LogMessage(string.Format("Uncaught exception: {0} {1} Core is going to shutdown.", ex.ToString(), Environment.NewLine));
}
ShutdownCore();
}
finally {
DeregisterServiceEvents();
waitShutdownSem.Release();
}
}
private void StartHeartbeats() {
//Initialize the heartbeat
if (heartbeatManager == null) {
heartbeatManager = new HeartbeatManager();
heartbeatManager.StartHeartbeat();
}
}
private void DispatchMessageQueue() {
MessageQueue queue = MessageQueue.GetInstance();
while (!abortRequested) {
MessageContainer container = queue.GetMessage();
DetermineAction(container);
if (!abortRequested) {
SlaveClientCom.Instance.StatusChanged(configManager.GetStatusForClientConsole());
}
}
}
private void RegisterServiceEvents() {
WcfService.Instance.Connected += new EventHandler(WcfService_Connected);
WcfService.Instance.ExceptionOccured += new EventHandler>(WcfService_ExceptionOccured);
}
private void DeregisterServiceEvents() {
WcfService.Instance.Connected -= WcfService_Connected;
WcfService.Instance.ExceptionOccured -= WcfService_ExceptionOccured;
}
private void WcfService_ExceptionOccured(object sender, EventArgs e) {
SlaveClientCom.Instance.LogMessage(string.Format("Connection to server interruped with exception: {0}", e.Value.Message));
}
private void WcfService_Connected(object sender, EventArgs e) {
SlaveClientCom.Instance.LogMessage("Connected successfully to Hive server");
}
///
/// Reads and analyzes the Messages from the MessageQueue and starts corresponding actions
///
/// The container, containing the message
private void DetermineAction(MessageContainer container) {
SlaveClientCom.Instance.LogMessage(string.Format("Message: {0} for task: {1} ", container.Message.ToString(), container.TaskId));
switch (container.Message) {
case MessageContainer.MessageType.CalculateTask:
CalculateTaskAsync(container.TaskId);
break;
case MessageContainer.MessageType.AbortTask:
AbortTaskAsync(container.TaskId);
break;
case MessageContainer.MessageType.StopTask:
StopTaskAsync(container.TaskId);
break;
case MessageContainer.MessageType.PauseTask:
PauseTaskAsync(container.TaskId);
break;
case MessageContainer.MessageType.StopAll:
DoStopAll();
break;
case MessageContainer.MessageType.PauseAll:
DoPauseAll();
break;
case MessageContainer.MessageType.AbortAll:
DoAbortAll();
break;
case MessageContainer.MessageType.ShutdownSlave:
ShutdownCore();
break;
case MessageContainer.MessageType.Restart:
DoStartSlave();
break;
case MessageContainer.MessageType.Sleep:
Sleep();
break;
case MessageContainer.MessageType.SayHello:
wcfService.Connect(configManager.GetClientInfo());
break;
case MessageContainer.MessageType.NewHBInterval:
int interval = wcfService.GetNewHeartbeatInterval(ConfigManager.Instance.GetClientInfo().Id);
if (interval != -1) {
HeartbeatManager.Interval = TimeSpan.FromSeconds(interval);
}
break;
case MessageContainer.MessageType.ShutdownComputer:
ShutdownComputer();
break;
}
}
private void CalculateTaskAsync(Guid jobId) {
TS.Task.Factory.StartNew(HandleCalculateTask, jobId)
.ContinueWith((t) => {
SlaveStatusInfo.IncrementTasksFailed();
SlaveClientCom.Instance.LogMessage(t.Exception.ToString());
}, TaskContinuationOptions.OnlyOnFaulted);
}
private void StopTaskAsync(Guid jobId) {
TS.Task.Factory.StartNew(HandleStopTask, jobId)
.ContinueWith((t) => {
SlaveStatusInfo.IncrementTasksFailed();
SlaveClientCom.Instance.LogMessage(t.Exception.ToString());
}, TaskContinuationOptions.OnlyOnFaulted);
}
private void PauseTaskAsync(Guid jobId) {
TS.Task.Factory.StartNew(HandlePauseTask, jobId)
.ContinueWith((t) => {
SlaveStatusInfo.IncrementTasksFailed();
SlaveClientCom.Instance.LogMessage(t.Exception.ToString());
}, TaskContinuationOptions.OnlyOnFaulted);
}
private void AbortTaskAsync(Guid jobId) {
TS.Task.Factory.StartNew(HandleAbortTask, jobId)
.ContinueWith((t) => {
SlaveStatusInfo.IncrementTasksFailed();
SlaveClientCom.Instance.LogMessage(t.Exception.ToString());
}, TaskContinuationOptions.OnlyOnFaulted);
}
private void HandleCalculateTask(object taskIdObj) {
Guid taskId = (Guid)taskIdObj;
Task task = null;
int usedCores = 0;
try {
task = wcfService.GetTask(taskId);
if (task == null) throw new TaskNotFoundException(taskId);
lock (locker) {
// the amount of used cores/memory could exceed the amount of available cores/memory
// if HandleCalculateTask is called simultaneously from within two different tasks
if (ConfigManager.Instance.GetFreeCores() < task.CoresNeeded) throw new OutOfCoresException();
if (ConfigManager.Instance.GetFreeMemory() < task.MemoryNeeded) throw new OutOfMemoryException();
SlaveStatusInfo.IncrementUsedCores(task.CoresNeeded); usedCores = task.CoresNeeded;
}
TaskData taskData = wcfService.GetTaskData(taskId);
if (taskData == null) throw new TaskDataNotFoundException(taskId);
task = wcfService.UpdateJobState(taskId, TaskState.Calculating, null);
if (task == null) throw new TaskNotFoundException(taskId);
taskManager.StartTaskAsync(task, taskData);
}
catch (TaskNotFoundException) {
SlaveStatusInfo.DecrementUsedCores(usedCores);
throw;
}
catch (TaskDataNotFoundException) {
SlaveStatusInfo.DecrementUsedCores(usedCores);
throw;
}
catch (TaskAlreadyRunningException) {
SlaveStatusInfo.DecrementUsedCores(usedCores);
throw;
}
catch (OutOfCoresException) {
wcfService.UpdateJobState(taskId, TaskState.Waiting, "No more cores available");
throw;
}
catch (OutOfMemoryException) {
wcfService.UpdateJobState(taskId, TaskState.Waiting, "No more memory available");
throw;
}
catch (Exception e) {
SlaveStatusInfo.DecrementUsedCores(usedCores);
wcfService.UpdateJobState(taskId, TaskState.Failed, e.ToString());
throw;
}
}
private void HandleStopTask(object taskIdObj) {
Guid taskId = (Guid)taskIdObj;
try {
Task task = wcfService.GetTask(taskId);
if (task == null) throw new TaskNotFoundException(taskId);
taskManager.StopTaskAsync(taskId);
}
catch (TaskNotFoundException) {
throw;
}
catch (TaskNotRunningException) {
throw;
}
catch (AppDomainNotCreatedException) {
throw;
}
}
private void HandlePauseTask(object taskIdObj) {
Guid taskId = (Guid)taskIdObj;
try {
Task task = wcfService.GetTask(taskId);
if (task == null) throw new TaskNotFoundException(taskId);
taskManager.PauseTaskAsync(taskId);
}
catch (TaskNotFoundException) {
throw;
}
catch (TaskNotRunningException) {
throw;
}
catch (AppDomainNotCreatedException) {
throw;
}
}
private void HandleAbortTask(object taskIdObj) {
Guid taskId = (Guid)taskIdObj;
try {
taskManager.AbortTask(taskId);
}
catch (TaskNotFoundException) {
throw;
}
}
#region TaskManager Events
private void RegisterTaskManagerEvents() {
this.taskManager.TaskStarted += new EventHandler>(taskManager_TaskStarted);
this.taskManager.TaskPaused += new EventHandler>(taskManager_TaskPaused);
this.taskManager.TaskStopped += new EventHandler>(taskManager_TaskStopped);
this.taskManager.TaskFailed += new EventHandler>>(taskManager_TaskFailed);
this.taskManager.TaskAborted += new EventHandler>(taskManager_TaskAborted);
}
private void taskManager_TaskStarted(object sender, EventArgs e) {
// successfully started, everything is good
}
private void taskManager_TaskPaused(object sender, EventArgs e) {
try {
SlaveStatusInfo.DecrementUsedCores(e.Value.CoresNeeded);
heartbeatManager.AwakeHeartBeatThread();
Task task = wcfService.GetTask(e.Value.TaskId);
if (task == null) throw new TaskNotFoundException(e.Value.TaskId);
task.ExecutionTime = e.Value.ExecutionTime;
TaskData taskData = e.Value2;
wcfService.UpdateTaskData(task, taskData, configManager.GetClientInfo().Id, TaskState.Paused);
}
catch (TaskNotFoundException ex) {
SlaveClientCom.Instance.LogMessage(ex.ToString());
}
catch (Exception ex) {
SlaveClientCom.Instance.LogMessage(ex.ToString());
}
}
private void taskManager_TaskStopped(object sender, EventArgs e) {
try {
SlaveStatusInfo.DecrementUsedCores(e.Value.CoresNeeded);
heartbeatManager.AwakeHeartBeatThread();
Task task = wcfService.GetTask(e.Value.TaskId);
if (task == null) throw new TaskNotFoundException(e.Value.TaskId);
task.ExecutionTime = e.Value.ExecutionTime;
TaskData taskData = e.Value2;
wcfService.UpdateTaskData(task, taskData, configManager.GetClientInfo().Id, TaskState.Finished);
}
catch (TaskNotFoundException ex) {
SlaveClientCom.Instance.LogMessage(ex.ToString());
}
catch (Exception ex) {
SlaveClientCom.Instance.LogMessage(ex.ToString());
}
}
private void taskManager_TaskFailed(object sender, EventArgs> e) {
try {
SlaveStatusInfo.DecrementUsedCores(e.Value.Item1.CoresNeeded);
heartbeatManager.AwakeHeartBeatThread();
SlaveTask slaveTask = e.Value.Item1;
TaskData taskData = e.Value.Item2;
Exception exception = e.Value.Item3;
Task task = wcfService.GetTask(slaveTask.TaskId);
if (task == null) throw new TaskNotFoundException(slaveTask.TaskId);
task.ExecutionTime = slaveTask.ExecutionTime;
if (taskData != null) {
wcfService.UpdateTaskData(task, taskData, configManager.GetClientInfo().Id, TaskState.Failed, exception.ToString());
} else {
wcfService.UpdateJobState(task.Id, TaskState.Failed, exception.ToString());
}
SlaveClientCom.Instance.LogMessage(exception.Message);
}
catch (TaskNotFoundException ex) {
SlaveStatusInfo.IncrementTasksFailed();
SlaveClientCom.Instance.LogMessage(ex.ToString());
}
catch (Exception ex) {
SlaveStatusInfo.IncrementTasksFailed();
SlaveClientCom.Instance.LogMessage(ex.ToString());
}
}
private void taskManager_TaskAborted(object sender, EventArgs e) {
var slaveTask = e.Value;
var task = wcfService.GetTask(slaveTask.TaskId);
wcfService.UpdateJobState(task.Id, TaskState.Aborted, null);
SlaveStatusInfo.DecrementUsedCores(e.Value.CoresNeeded);
}
#endregion
#region Log Events
private void log_MessageAdded(object sender, EventArgs e) {
try {
SlaveClientCom.Instance.LogMessage(e.Value.Split('\t')[1]);
}
catch { }
}
#endregion
///
/// aborts all running jobs, no results are sent back
///
private void DoAbortAll() {
SlaveClientCom.Instance.LogMessage("Aborting all jobs.");
foreach (Guid taskId in taskManager.TaskIds) {
AbortTaskAsync(taskId);
}
}
///
/// wait for jobs to finish, then pause client
///
private void DoPauseAll() {
SlaveClientCom.Instance.LogMessage("Pausing all jobs.");
foreach (Guid taskId in taskManager.TaskIds) {
PauseTaskAsync(taskId);
}
}
///
/// pause slave immediately
///
private void DoStopAll() {
SlaveClientCom.Instance.LogMessage("Stopping all jobs.");
foreach (Guid taskId in taskManager.TaskIds) {
StopTaskAsync(taskId);
}
}
#region Slave Lifecycle Methods
///
/// completly shudown slave
///
public void Shutdown() {
MessageContainer mc = new MessageContainer(MessageContainer.MessageType.ShutdownSlave);
MessageQueue.GetInstance().AddMessage(mc);
waitShutdownSem.WaitOne();
}
private void ShutdownComputer() {
var t = TS.Task.Factory.StartNew(new Action(Shutdown));
t.ContinueWith(c => {
try {
//we assume that *.exe means an executable in the current directory, otherwise it is a command
if (SlaveCore.Properties.Settings.Default.ShutdownCommand.EndsWith(".exe")) {
string dirName = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
Process.Start(Path.Combine(dirName, SlaveCore.Properties.Settings.Default.ShutdownCommand));
} else {
Process.Start(SlaveCore.Properties.Settings.Default.ShutdownCommand);
}
}
catch (Exception ex) {
if (ServiceEventLog != null) {
EventLogManager.LogException(ex);
} else
throw ex;
}
});
}
///
/// complete shutdown, should be called before the the application is exited
///
private void ShutdownCore() {
SlaveClientCom.Instance.LogMessage("Shutdown signal received");
SlaveClientCom.Instance.LogMessage("Stopping heartbeat");
heartbeatManager.StopHeartBeat();
SlaveClientCom.Instance.LogMessage("Stopping checkpointing");
taskManager.StopCheckpointing();
abortRequested = true;
DoAbortAll();
SlaveClientCom.Instance.LogMessage("Logging out");
WcfService.Instance.Disconnect();
SlaveClientCom.Instance.ClientCom.Shutdown();
SlaveClientCom.Close();
if (slaveComm.State != CommunicationState.Closed)
slaveComm.Close();
}
///
/// reinitializes everything and continues operation,
/// can be called after Sleep()
///
private void DoStartSlave() {
SlaveClientCom.Instance.LogMessage("Restart received");
configManager.Asleep = false;
}
///
/// stop slave, except for client gui communication,
/// primarily used by gui if core is running as windows service
///
private void Sleep() {
SlaveClientCom.Instance.LogMessage("Sleep received - not accepting any new jobs");
configManager.Asleep = true;
DoPauseAll();
}
#endregion
}
}