source: trunk/sources/HeuristicLab.Clients.Hive.Slave/3.3/SlaveTask.cs @ 15004

Last change on this file since 15004 was 15004, checked in by jkarder, 5 years ago

#2791: improved checkpointing (task is paused and sent back to the server, new one is assigned via next heartbeat)

File size: 9.1 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.IO;
24using System.Threading;
25using HeuristicLab.Clients.Hive.SlaveCore.Properties;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.PluginInfrastructure.Sandboxing;
29
30namespace HeuristicLab.Clients.Hive.SlaveCore {
31
/// <summary>
/// Manages a single task and its AppDomain: prepares the plugins, creates the
/// sandbox, starts the <see cref="Executor"/> inside it, polls the executor for
/// state changes and tears the sandbox down again.
/// </summary>
public class SlaveTask : MarshalByRefObject {
  private readonly PluginManager pluginManager;
  private readonly ILog log;

  // Released once after the executor has been started (see StartTaskInAppDomain);
  // DisposeAppDomain waits on it (with a timeout) before unloading the AppDomain.
  private readonly Semaphore waitForStartBeforeKillSem;

  private Executor executor;
  private AppDomain appDomain;
  private Thread executorMonitoringThread;
  private TaskData originalTaskData;

  // Written by whichever thread calls DisposeAppDomain and read by the
  // monitoring thread's loop, hence volatile.
  private volatile bool executorMonitoringRun;

  /// <summary>Id of the task handled by this instance; set by <see cref="StartJobAsync"/>.</summary>
  public Guid TaskId { get; private set; }

  /// <summary>True once the plugins were prepared and the AppDomain was created.</summary>
  public bool IsPrepared { get; private set; }

  private int coresNeeded;
  /// <summary>Number of cores this task requires.</summary>
  public int CoresNeeded {
    get { return coresNeeded; }
    set { this.coresNeeded = value; }
  }

  /// <summary>
  /// Execution time reported by the executor. Returns <see cref="TimeSpan.Zero"/>
  /// when no executor exists yet or when querying it fails (the exception is logged).
  /// </summary>
  public TimeSpan ExecutionTime {
    get {
      try {
        return executor != null ? executor.ExecutionTime : TimeSpan.Zero;
      }
      catch (Exception ex) {
        EventLogManager.LogException(ex);
        return TimeSpan.Zero;
      }
    }
  }

  /// <summary>
  /// Creates a task wrapper that is not yet prepared; call <see cref="StartJobAsync"/> to run a task.
  /// </summary>
  /// <param name="pluginManager">Used to prepare and delete the plugins of the task.</param>
  /// <param name="coresNeeded">Number of cores the task requires.</param>
  /// <param name="log">Log that receives status messages.</param>
  public SlaveTask(PluginManager pluginManager, int coresNeeded, ILog log) {
    this.pluginManager = pluginManager;
    this.coresNeeded = coresNeeded;
    this.log = log;
    waitForStartBeforeKillSem = new Semaphore(0, 1);
    executorMonitoringRun = true;
    IsPrepared = false;
  }

  /// <summary>
  /// Prepares the sandbox for <paramref name="task"/> and starts the executor with
  /// <paramref name="taskData"/>. Preparation and the start call are made on the
  /// calling thread; state changes are then reported through the events of this class.
  /// </summary>
  /// <param name="task">The task to run.</param>
  /// <param name="taskData">The data handed to the executor; also kept as fallback for <see cref="GetTaskData"/>.</param>
  /// <remarks>On any exception the AppDomain is disposed and the exception is rethrown.</remarks>
  public void StartJobAsync(Task task, TaskData taskData) {
    try {
      this.TaskId = task.Id;
      originalTaskData = taskData;
      Prepare(task);
      StartTaskInAppDomain(taskData);
    }
    catch (Exception) {
      // make sure to clean up if something went wrong
      DisposeAppDomain();
      throw;
    }
  }

  /// <summary>Asks the executor to pause unless it is already pausing or stopping.</summary>
  /// <exception cref="AppDomainNotCreatedException">The task has not been prepared yet.</exception>
  public void PauseTask() {
    if (!IsPrepared) throw new AppDomainNotCreatedException();
    if (!executor.IsPausing && !executor.IsStopping) executor.Pause();
  }

  /// <summary>Asks the executor to stop unless it is already pausing or stopping.</summary>
  /// <exception cref="AppDomainNotCreatedException">The task has not been prepared yet.</exception>
  public void StopTask() {
    if (!IsPrepared) throw new AppDomainNotCreatedException();
    if (!executor.IsPausing && !executor.IsStopping) executor.Stop();
  }

  /// <summary>
  /// Prepares the plugins of <paramref name="task"/>, creates its AppDomain and marks this instance as prepared.
  /// </summary>
  private void Prepare(Task task) {
    string pluginDir = Path.Combine(pluginManager.PluginTempBaseDir, task.Id.ToString());
    string configFileName;
    pluginManager.PreparePlugins(task, out configFileName);
    appDomain = CreateAppDomain(task, pluginDir, configFileName);
    IsPrepared = true;
  }

  /// <summary>
  /// Creates the sandbox AppDomain for <paramref name="task"/> and instantiates the executor inside it.
  /// </summary>
  /// <returns>The created AppDomain (also stored in the <c>appDomain</c> field).</returns>
  private AppDomain CreateAppDomain(Task task, String pluginDir, string configFileName) {
    // The field is assigned right away so that DisposeAppDomain can unload the
    // sandbox even if creating or configuring the executor below throws.
    appDomain = SandboxManager.CreateAndInitSandbox(task.Id.ToString(), pluginDir, Path.Combine(pluginDir, configFileName));
    appDomain.UnhandledException += AppDomain_UnhandledException;

    log.LogMessage("Creating AppDomain");
    executor = (Executor)appDomain.CreateInstanceAndUnwrap(typeof(Executor).Assembly.GetName().Name, typeof(Executor).FullName);

    executor.TaskId = task.Id;
    executor.CoresNeeded = task.CoresNeeded;
    executor.MemoryNeeded = task.MemoryNeeded;
    return appDomain;
  }

  /// <summary>
  /// Starts the executor, signals that the start call has returned and begins polling the executor.
  /// </summary>
  private void StartTaskInAppDomain(TaskData taskData) {
    executor.Start(taskData.Data);
    waitForStartBeforeKillSem.Release();
    StartExecutorMonitoringThread();
  }

  /// <summary>
  /// Stops the monitoring thread, disposes the executor, unloads the AppDomain
  /// (retrying as configured) and deletes the plugins of the task.
  /// </summary>
  /// <remarks>
  /// Once the AppDomain has been unloaded the field is cleared, so a later call
  /// skips the unload instead of touching the disposed semaphore or the
  /// already unloaded domain.
  /// </remarks>
  /// <exception cref="CannotUnloadAppDomainException">Unloading still failed on the last retry.</exception>
  public void DisposeAppDomain() {
    log.LogMessage(string.Format("Shutting down Appdomain for Task {0}", TaskId));
    StopExecutorMonitoringThread();

    if (executor != null) {
      try {
        executor.Dispose();
      }
      catch (Exception ex) {
        EventLogManager.LogException(ex);
      }
    }

    AppDomain domain = appDomain;
    if (domain != null) {
      domain.UnhandledException -= AppDomain_UnhandledException;
      int repeat = Settings.Default.PluginDeletionRetries;
      while (repeat > 0) {
        try {
          // give a pending start the chance to return before the domain is killed
          waitForStartBeforeKillSem.WaitOne(Settings.Default.ExecutorSemTimeouts);
          AppDomain.Unload(domain);
          appDomain = null; // unloaded: later calls must not try again
          waitForStartBeforeKillSem.Dispose();
          repeat = 0;
        }
        catch (CannotUnloadAppDomainException) {
          log.LogMessage("Could not unload AppDomain, will try again in 1 sec.");
          Thread.Sleep(Settings.Default.PluginDeletionTimeout);
          repeat--;
          if (repeat == 0) {
            throw; // rethrow and let app crash
          }
        }
      }
    }
    pluginManager.DeletePluginsForJob(TaskId);
    GC.Collect();
  }

  /// <summary>
  /// Handles unhandled exceptions of the task's AppDomain: tears the domain down and reports the task as failed.
  /// </summary>
  private void AppDomain_UnhandledException(object sender, UnhandledExceptionEventArgs e) {
    DisposeAppDomain();
    OnTaskFailed(new Exception("Unhandled exception: " + e.ExceptionObject.ToString()));
  }

  /// <summary>
  /// Fetches the current task data from the executor.
  /// </summary>
  /// <returns>
  /// The executor's task data; the data originally passed to <see cref="StartJobAsync"/>
  /// if the executor returns null; or null if querying the executor throws
  /// (the exception is logged).
  /// </returns>
  public TaskData GetTaskData() {
    TaskData data = null;
    try {
      data = executor.GetTaskData();
      // null means that there was a problem executing the task
      if (data == null) return originalTaskData;
    }
    catch (Exception ex) {
      EventLogManager.LogException(ex);
    }
    return data;
  }

  #region ExecutorMonitorThread
  /// <summary>Creates and starts the thread that runs <see cref="MonitorExecutor"/>.</summary>
  private void StartExecutorMonitoringThread() {
    executorMonitoringThread = new Thread(MonitorExecutor);
    executorMonitoringThread.Start();
  }

  /// <summary>
  /// Clears the run flag and posts a stop message to the executor's command queue,
  /// provided the monitoring thread was started and the flag is still set.
  /// </summary>
  private void StopExecutorMonitoringThread() {
    if (executorMonitoringThread != null) {
      if (executorMonitoringRun) {
        executorMonitoringRun = false;
        executor.ExecutorCommandQueue.AddMessage(ExecutorMessageType.StopExecutorMonitoringThread);
      }
    }
  }

  /// <summary>
  /// Because the executor is in an appdomain and is not able to call back
  /// (because of security -> lease time for marshal-by-ref object is 5 min),
  /// we have to poll the executor for events we have to react to (e.g. task finished...)
  /// </summary>
  private void MonitorExecutor() {
    while (executorMonitoringRun) {
      // this call goes through the appdomain border. We have to
      // poll so that the lease gets renewed
      ExecutorMessage message;
      do {
        message = executor.ExecutorCommandQueue.GetMessage();
      } while (message == null);

      switch (message.MessageType) {
        case ExecutorMessageType.TaskStarted:
          OnTaskStarted();
          break;

        // For the terminal states the event is raised before the AppDomain is
        // disposed, so handlers run while the executor still exists.
        case ExecutorMessageType.TaskPaused:
          executorMonitoringRun = false;
          OnTaskPaused();
          DisposeAppDomain();
          break;

        case ExecutorMessageType.TaskStopped:
          executorMonitoringRun = false;
          OnTaskStopped();
          DisposeAppDomain();
          break;

        case ExecutorMessageType.TaskFailed:
          executorMonitoringRun = false;
          OnTaskFailed(new TaskFailedException(executor.CurrentExceptionStr));
          DisposeAppDomain();
          break;

        case ExecutorMessageType.StopExecutorMonitoringThread:
          executorMonitoringRun = false;
          break;
      }
    }
  }
  #endregion

  /// <summary>Raised when the executor reports that the task has started.</summary>
  public event EventHandler<EventArgs<Guid>> TaskStarted;
  private void OnTaskStarted() {
    var handler = TaskStarted;
    if (handler != null) handler(this, new EventArgs<Guid>(this.TaskId));
  }

  /// <summary>Raised when the executor reports that the task has stopped.</summary>
  public event EventHandler<EventArgs<Guid>> TaskStopped;
  private void OnTaskStopped() {
    var handler = TaskStopped;
    if (handler != null) handler(this, new EventArgs<Guid>(this.TaskId));
  }

  /// <summary>Raised when the executor reports that the task has been paused.</summary>
  public event EventHandler<EventArgs<Guid>> TaskPaused;
  private void OnTaskPaused() {
    var handler = TaskPaused;
    if (handler != null) handler(this, new EventArgs<Guid>(this.TaskId));
  }

  /// <summary>Declared for subscribers; nothing in this class raises it.</summary>
  public event EventHandler<EventArgs<Guid>> TaskAborted;
  private void OnTaskAborted() {
    var handler = TaskAborted;
    if (handler != null) handler(this, new EventArgs<Guid>(this.TaskId));
  }

  /// <summary>
  /// Raised when the executor reports a failure or the task's AppDomain has an unhandled exception.
  /// </summary>
  public event EventHandler<EventArgs<Guid, Exception>> TaskFailed;
  private void OnTaskFailed(Exception exception) {
    var handler = TaskFailed;
    if (handler != null) handler(this, new EventArgs<Guid, Exception>(this.TaskId, exception));
  }
}
270}
Note: See TracBrowser for help on using the repository browser.