Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Hive/sources/HeuristicLab.Hive/HeuristicLab.Hive.Slave.Core/3.3/Core.cs @ 4772

Last change on this file since 4772 was 4772, checked in by cneumuel, 14 years ago

#1260

  • added LogServiceReader to display log for slave without writing to local files
  • aborted jobs with child jobs now go back to state WaitForChildJob (instead of Offline)
  • lifecyclemanager now knows about available plugins (does not yet work perfectly)
File size: 21.7 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.IO;
25using System.Threading;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Hive.Contracts;
29using HeuristicLab.Hive.Contracts.BusinessObjects;
30using HeuristicLab.Hive.Contracts.ResponseObjects;
31using HeuristicLab.Hive.Slave.Common;
32using HeuristicLab.Hive.Slave.Communication;
33using HeuristicLab.Hive.Slave.Communication.SlaveFacade;
34using HeuristicLab.Hive.Slave.Core.ConfigurationManager;
35using HeuristicLab.Hive.Slave.Core.JobStorage;
36using HeuristicLab.Hive.Slave.Core.SlaveConsoleService;
37using HeuristicLab.Hive.Slave.ExecutionEngine;
38
39namespace HeuristicLab.Hive.Slave.Core {
40  /// <summary>
41  /// The core component of the Hive Client
42  /// </summary>
43  public class Core : MarshalByRefObject {
44    public static bool abortRequested { get; set; }
45
46    public static ILog Log { get; set; }
47
48    private Dictionary<Guid, Executor> engines = new Dictionary<Guid, Executor>();
49    private Dictionary<Guid, AppDomain> appDomains = new Dictionary<Guid, AppDomain>();
50    private Dictionary<Guid, JobDto> jobs = new Dictionary<Guid, JobDto>();
51
52    private WcfService wcfService;
53    private HeartbeatManager heartbeatManager;
54
55    private bool currentlyFetching;
56    private bool CurrentlyFetching {
57      get {
58        return currentlyFetching;
59      }
60      set {
61        currentlyFetching = value;
62        Logger.Debug("Set CurrentlyFetching to " + currentlyFetching);
63      }
64    }
65
66    public Dictionary<Guid, Executor> ExecutionEngines {
67      get { return engines; }
68    }
69
70    internal Dictionary<Guid, JobDto> Jobs {
71      get { return jobs; }
72    }
73
74    /// <summary>
75    /// Main Method for the client
76    /// </summary>
77    public void Start() {
78      abortRequested = false;
79      Logger.Info("Hive Slave started");
80      SlaveConsoleServer server = new SlaveConsoleServer();
81      server.Start();
82
83      ConfigManager manager = ConfigManager.Instance;
84      manager.Core = this;
85
86      wcfService = WcfService.Instance;
87      RegisterServiceEvents();
88
89      RecoverSettings(); // recover server IP from the settings framework
90      StartHeartbeats(); // Start heartbeats thread
91      DispatchMessageQueue(); // dispatch messages until abortRequested
92
93      DeRegisterServiceEvents();
94      server.Close();
95      Logger.Info("Program shutdown");
96    }
97
98    private void RecoverSettings() {
99      ConnectionContainer cc = ConfigManager.Instance.GetServerIP();
100      if (cc.IPAdress != String.Empty) {
101        wcfService.ServerIp = cc.IPAdress;
102      }
103    }
104
105    private void StartHeartbeats() {
106      //Initialize the heartbeat
107      heartbeatManager = new HeartbeatManager { Interval = new TimeSpan(0, 0, 10) };
108      heartbeatManager.StartHeartbeat();
109    }
110
111    private void DispatchMessageQueue() {
112      MessageQueue queue = MessageQueue.GetInstance();
113      while (!abortRequested) {
114        MessageContainer container = queue.GetMessage();
115        DetermineAction(container);
116      }
117    }
118
119    private void RegisterServiceEvents() {
120      wcfService.GetJobCompleted += new EventHandler<GetJobCompletedEventArgs>(wcfService_GetJobCompleted);
121      wcfService.GetFinishedJobResultCompleted += new EventHandler<StoreFinishedJobResultCompletedEventArgs>(wcfService_StoreFinishedJobResultCompleted);
122      wcfService.ProcessSnapshotCompleted += new EventHandler<ProcessSnapshotCompletedEventArgs>(wcfService_ProcessSnapshotCompleted);
123      wcfService.Connected += new EventHandler(wcfService_Connected);
124    }
125
126    private void DeRegisterServiceEvents() {
127      wcfService.GetJobCompleted -= new EventHandler<GetJobCompletedEventArgs>(wcfService_GetJobCompleted);
128      wcfService.GetFinishedJobResultCompleted -= new EventHandler<StoreFinishedJobResultCompletedEventArgs>(wcfService_StoreFinishedJobResultCompleted);
129      wcfService.ProcessSnapshotCompleted -= new EventHandler<ProcessSnapshotCompletedEventArgs>(wcfService_ProcessSnapshotCompleted);
130      wcfService.Connected -= new EventHandler(wcfService_Connected);
131    }
132
133    /// <summary>
134    /// Reads and analyzes the Messages from the MessageQueue and starts corresponding actions
135    /// </summary>
136    /// <param name="container">The Container, containing the message</param>
137    private void DetermineAction(MessageContainer container) {
138      Logger.Info("Message: " + container.Message.ToString() + " for job: " + container.JobId);
139      switch (container.Message) {
140        //Server requests to abort a job
141        case MessageContainer.MessageType.AbortJob:
142          if (engines.ContainsKey(container.JobId))
143            try {
144              engines[container.JobId].Abort();
145            }
146            catch (AppDomainUnloadedException) {
147              // appdomain already unloaded. Finishing job probably ongoing
148            }
149          else
150            Logger.Error("AbortJob: Engine doesn't exist");
151          break;
152
153        //Job has been successfully aborted
154        case MessageContainer.MessageType.JobAborted:
155          Guid jobId = new Guid(container.JobId.ToString());
156          KillAppDomain(jobId);
157          break;
158
159        //Request a Snapshot from the Execution Engine
160        case MessageContainer.MessageType.RequestSnapshot:
161          if (engines.ContainsKey(container.JobId))
162            engines[container.JobId].RequestSnapshot();
163          else
164            Logger.Error("RequestSnapshot: Engine with Job doesn't exist");
165          break;
166
167        //Snapshot is ready and can be sent back to the Server
168        case MessageContainer.MessageType.SnapshotReady:
169          GetSnapshot(container.JobId);
170          break;
171
172        //Pull a Job from the Server
173        case MessageContainer.MessageType.FetchJob:
174          if (!CurrentlyFetching) {
175            wcfService.GetJobAsync(ConfigManager.Instance.GetClientInfo().Id);
176            CurrentlyFetching = true;
177          } else
178            Logger.Info("Currently fetching, won't fetch this time!");
179          break;
180
181        //A Job has finished and can be sent back to the server
182        case MessageContainer.MessageType.FinishedJob:
183          SendFinishedJob(container.JobId);
184          break;
185
186        //When the timeslice is up
187        case MessageContainer.MessageType.UptimeLimitDisconnect:
188          Logger.Info("Uptime Limit reached, storing jobs and sending them back");
189          ShutdownRunningJobsAndSubmitSnapshots();
190          break;
191
192        //Fetch or Force Fetch Calendar!
193        case MessageContainer.MessageType.FetchOrForceFetchCalendar:
194          Logger.Info("Fetch Calendar from Server");
195          FetchCalendarFromServer();
196          break;
197
198        //Hard shutdown of the client
199        case MessageContainer.MessageType.Shutdown:
200          Logger.Info("Shutdown Signal received");
201          lock (engines) {
202            Logger.Debug("engines locked");
203            foreach (KeyValuePair<Guid, AppDomain> kvp in appDomains) {
204              Logger.Debug("Shutting down Appdomain for " + kvp.Key);
205              appDomains[kvp.Key].UnhandledException -= new UnhandledExceptionEventHandler(appDomain_UnhandledException);
206              AppDomain.Unload(kvp.Value);
207            }
208          }
209          Logger.Debug("Stopping heartbeat");
210          abortRequested = true;
211          heartbeatManager.StopHeartBeat();
212          Logger.Debug("Logging out");
213          WcfService.Instance.Logout(ConfigManager.Instance.GetClientInfo().Id);
214          break;
215
216        case MessageContainer.MessageType.AddChildJob:
217          AddChildJob((MessageContainerWithJob)container);
218          break;
219
220        case MessageContainer.MessageType.PauseJob:
221          // send the job back to hive
222          PauseJob((MessageContainerWithJob)container);
223          break;
224
225        case MessageContainer.MessageType.GetChildJobs:
226          GetChildJobs((MessageContainerWithCallback<SerializedJobList>)container);
227          break;
228
229        case MessageContainer.MessageType.DeleteChildJobs:
230          wcfService.DeleteChildJobs(container.JobId);
231          break;
232      }
233    }
234
235    private void GetChildJobs(MessageContainerWithCallback<SerializedJobList> mc) {
236      ResponseObject<SerializedJobList> response = wcfService.GetChildJobs(mc.JobId);
237      if (response != null && response.StatusMessage == ResponseStatus.Ok) {
238        mc.Callback(response.Obj);
239      } else {
240        if (response != null) {
241          Logger.Error(string.Format("GetChildJobs failed: {0}", response.StatusMessage));
242        } else {
243          Logger.Error("GetChildJobs failed.");
244        }
245      }
246    }
247
248    private void PauseJob(MessageContainerWithJob mc) {
249      ResponseObject<JobDto> response = wcfService.PauseJob(mc.SerializedJob);
250      KillAppDomain(mc.JobId);
251      if (response == null || response.StatusMessage != ResponseStatus.Ok) {
252        Logger.Error("PauseJob failed: " + response.StatusMessage);
253      }
254    }
255
256    private ResponseObject<JobDto> AddChildJob(MessageContainerWithJob mc) {
257      ResponseObject<JobDto> response = wcfService.AddChildJob(mc.JobId, mc.SerializedJob);
258      if (response == null || response.StatusMessage != ResponseStatus.Ok) {
259        Logger.Error("AddChildJob failed: " + response.StatusMessage);
260      }
261      return response;
262    }
263
264    private void ShutdownRunningJobsAndSubmitSnapshots() {
265      //check if there are running jobs
266      if (engines.Count > 0) {
267        //make sure there is no more fetching of jobs while the snapshots get processed
268        CurrentlyFetching = true;
269        //request a snapshot of each running job
270        foreach (KeyValuePair<Guid, Executor> kvp in engines) {
271          kvp.Value.RequestSnapshot();
272        }
273      }
274    }
275
276    //Asynchronous Threads for interaction with the Execution Engine
277    #region Async Threads for the EE
278
279    /// <summary>
280    /// serializes the finished job and submits it to the server. If, at the time, a network connection is unavailable, the Job gets stored on the disk.
281    /// once the connection gets reestablished, the job gets submitted
282    /// </summary>
283    /// <param name="jobId"></param>
284    private void SendFinishedJob(object jobId) {
285      try {
286        Guid jId = (Guid)jobId;
287        Logger.Info("Getting the finished job with id: " + jId);
288        if (!engines.ContainsKey(jId)) {
289          Logger.Info("Engine doesn't exist");
290          return;
291        }
292
293        byte[] sJob = engines[jId].GetFinishedJob();
294
295        try {
296          Logger.Info("Sending the finished job with id: " + jId);
297          wcfService.GetFinishedJobResultAsync(ConfigManager.Instance.GetClientInfo().Id, jId, sJob, engines[jId].ExecutionTime, engines[jId].CurrentException, true);
298        }
299        catch (Exception e) {
300          Logger.Info("Transmitting to server failed. Storing the finished job with id: " + jId + " to hdd (" + e.ToString() + ")");
301          JobStorageManager.PersistObjectToDisc(wcfService.ServerIp, 0, jId, sJob); // [chn] Port is not unique anymore (since we need two ports for http and net.tcp-streaming). also the port is now specified only in app.config. use port 0 for the moment
302        }
303        finally {
304          KillAppDomain(jId); // kill app-domain in every case
305        }
306      }
307      catch (Exception e) {
308        OnExceptionOccured(e);
309      }
310    }
311
312    private void GetSnapshot(object jobId) {
313      try {
314        Logger.Info("Fetching a snapshot for job " + jobId);
315        Guid jId = (Guid)jobId;
316        byte[] obj = engines[jId].GetSnapshot();
317        wcfService.ProcessSnapshotSync(ConfigManager.Instance.GetClientInfo().Id, jId, obj, engines[jId].ExecutionTime, null);
318
319        //Uptime Limit reached, now is a good time to destroy this jobs.
320        Logger.Debug("Checking if uptime limit is reached");
321        if (!UptimeManager.Instance.IsAllowedToCalculate()) {
322          Logger.Debug("Uptime limit reached");
323          Logger.Debug("Killing Appdomain");
324          KillAppDomain(jId);
325          //Still anything running? 
326          if (engines.Count == 0) {
327            Logger.Info("All jobs snapshotted and sent back, disconnecting");
328            WcfService.Instance.Disconnect();
329          } else {
330            Logger.Debug("There are still active Jobs in the Field, not disconnecting");
331          }
332        } else {
333          Logger.Debug("Restarting the job" + jobId);
334          engines[jId].StartOnlyJob();
335          Logger.Info("Restarted the job" + jobId);
336        }
337      }
338      catch (Exception e) {
339        OnExceptionOccured(e);
340      }
341    }
342
343    #endregion
344
345    //Eventhandlers for the communication with the wcf Layer
346    #region wcfService Events
347    /// <summary>
348    /// Login has returned
349    /// </summary>
350    /// <param name="sender"></param>
351    /// <param name="e"></param>
352    void wcfService_LoginCompleted(object sender, LoginCompletedEventArgs e) {
353      if (e.Result.StatusMessage == ResponseStatus.Ok) {
354        CurrentlyFetching = false;
355        Logger.Info("Login completed to Hive Server @ " + DateTime.Now);
356      } else
357        Logger.Error("Error during login: " + e.Result.StatusMessage.ToString());
358    }
359
360    /// <summary>
361    /// A new Job from the wcfService has been received and will be started within a AppDomain.
362    /// </summary>
363    /// <param name="sender"></param>
364    /// <param name="e"></param>
365    void wcfService_GetJobCompleted(object sender, GetJobCompletedEventArgs e) {
366      if (e.Result.StatusMessage != ResponseStatus.GetJob_NoJobsAvailable) {
367        Logger.Info("Received new job with id " + e.Result.Obj.Id);
368        Logger.Debug("Fetching plugins for job " + e.Result.Obj.Id);
369        try {
370          PluginCache.Instance.PreparePlugins(e.Result.Obj.PluginsNeeded);
371          PluginCache.Instance.CopyPluginsForJob(e.Result.Obj.PluginsNeeded, e.Result.Obj.Id);
372
373          Logger.Debug("Plugins fetched for job " + e.Result.Obj.Id);
374          String pluginDir = Path.Combine(PluginCache.Instance.PluginRepositoryDir, e.Result.Obj.Id.ToString());
375
376          AppDomain appDomain = HeuristicLab.PluginInfrastructure.Sandboxing.SandboxManager.CreateAndInitSandbox(pluginDir, null);
377          appDomain.UnhandledException += new UnhandledExceptionEventHandler(appDomain_UnhandledException);
378          lock (engines) {
379            if (!jobs.ContainsKey(e.Result.Obj.Id)) {
380              jobs.Add(e.Result.Obj.Id, e.Result.Obj);
381              appDomains.Add(e.Result.Obj.Id, appDomain);
382              Logger.Debug("Creating AppDomain");
383              Executor engine = (Executor)appDomain.CreateInstanceAndUnwrap(typeof(Executor).Assembly.GetName().Name, typeof(Executor).FullName);
384              Logger.Debug("Created AppDomain");
385              engine.JobId = e.Result.Obj.Id;
386              engine.Queue = MessageQueue.GetInstance();
387              Logger.Debug("Starting Engine for job " + e.Result.Obj.Id);
388              engine.Start(e.Data);
389              engines.Add(e.Result.Obj.Id, engine);
390              SlaveStatusInfo.JobsFetched++;
391              Logger.Info("Increment FetchedJobs to:" + SlaveStatusInfo.JobsFetched);
392            }
393          }
394          heartbeatManager.AwakeHeartBeatThread();
395        }
396        catch (Exception exception) {
397          Logger.Error("Creating the Appdomain and loading the job failed for job " + e.Result.Obj.Id);
398          Logger.Error("Error thrown is: ", exception);
399          CurrentlyFetching = false;
400          KillAppDomain(e.Result.Obj.Id);
401          wcfService.StoreFinishedJobResultsSync(ConfigManager.Instance.GetClientInfo().Id, e.Result.Obj.Id, new byte[] { }, e.Result.Obj.ExecutionTime, exception.ToString(), true);
402        }
403      } else {
404        Logger.Info("No more jobs left!");
405      }
406      CurrentlyFetching = false;
407    }
408
409    /// <summary>
410    /// A finished job has been stored on the server
411    /// </summary>
412    /// <param name="sender"></param>
413    /// <param name="e"></param>
414    void wcfService_StoreFinishedJobResultCompleted(object sender, StoreFinishedJobResultCompletedEventArgs e) {
415      Logger.Info("Job submitted with id " + e.Result.JobId);
416      KillAppDomain(e.Result.JobId);
417      if (e.Result.StatusMessage == ResponseStatus.Ok) {
418        SlaveStatusInfo.JobsProcessed++;
419        Logger.Info("Increased ProcessedJobs to:" + SlaveStatusInfo.JobsProcessed);
420        heartbeatManager.AwakeHeartBeatThread();
421      } else {
422        Logger.Error("Sending of job " + e.Result.JobId + " failed, job has been wasted. Message: " + e.Result.StatusMessage);
423      }
424    }
425
426    /// <summary>
427    /// A snapshot has been stored on the server
428    /// </summary>
429    /// <param name="sender"></param>
430    /// <param name="e"></param>
431    void wcfService_ProcessSnapshotCompleted(object sender, ProcessSnapshotCompletedEventArgs e) {
432      Logger.Info("Snapshot " + e.Result.JobId + " has been transmitted according to plan.");
433    }
434
435
436    /// <summary>
437    /// Connnection to the server has been estabilshed => Login and Send the persistet Jobs from the harddisk.
438    /// </summary>
439    /// <param name="sender"></param>
440    /// <param name="e"></param>
441    void wcfService_Connected(object sender, EventArgs e) {
442      Logger.Info("WCF Service got a connection");
443      if (!UptimeManager.Instance.CalendarAvailable) {
444        Logger.Info("No local calendar available, fetch it");
445        FetchCalendarFromServer();
446      }
447      Logger.Info("CalendarAvailable is " + UptimeManager.Instance.CalendarAvailable + " and IsOnline is: " + UptimeManager.Instance.IsAllowedToCalculate());
448      CurrentlyFetching = false;
449      CheckRunningAppDomains();
450      JobStorageManager.CheckAndSubmitJobsFromDisc();
451    }
452
453    private void FetchCalendarFromServer() {
454      ResponseCalendar calres = wcfService.GetCalendarSync(ConfigManager.Instance.GetClientInfo().Id);
455      if (calres.StatusMessage == ResponseStatus.Ok) {
456        if (UptimeManager.Instance.SetAppointments(false, calres)) {
457          Logger.Info("Remote calendar installed");
458          wcfService.SetCalendarStatus(ConfigManager.Instance.GetClientInfo().Id, CalendarState.Fetched);
459        } else {
460          Logger.Info("Remote calendar installation failed, setting state to " + CalendarState.NotAllowedToFetch);
461          wcfService.SetCalendarStatus(ConfigManager.Instance.GetClientInfo().Id, CalendarState.NotAllowedToFetch);
462        }
463      } else {
464        Logger.Info("Remote calendar installation failed, setting state to " + CalendarState.NotAllowedToFetch);
465        wcfService.SetCalendarStatus(ConfigManager.Instance.GetClientInfo().Id, CalendarState.NotAllowedToFetch);
466      }
467    }
468
469    private void CheckRunningAppDomains() {
470      foreach (KeyValuePair<Guid, Executor> execKVP in engines) {
471        if (execKVP.Value.ExecutionState != ExecutionState.Started && execKVP.Value.CurrentMessage == MessageContainer.MessageType.NoMessage) {
472          Logger.Info("Checking for JobId: " + execKVP.Value.JobId);
473          Thread finThread = new Thread(new ParameterizedThreadStart(SendFinishedJob));
474          finThread.Start(execKVP.Value.JobId);
475        }
476      }
477    }
478
479    #endregion
480
481    public event EventHandler<EventArgs<Exception>> ExceptionOccured;
482    private void OnExceptionOccured(Exception e) {
483      Logger.Error("Error: " + e.ToString());
484      var handler = ExceptionOccured;
485      if (handler != null) handler(this, new EventArgs<Exception>(e));
486    }
487
488    void appDomain_UnhandledException(object sender, UnhandledExceptionEventArgs e) {
489      Logger.Error("Exception in AppDomain: " + e.ExceptionObject.ToString());
490    }
491
492    /// <summary>
493    /// Kill a appdomain with a specific id.
494    /// </summary>
495    /// <param name="id">the GUID of the job</param>
496    private void KillAppDomain(Guid id) {
497      Logger.Debug("Shutting down Appdomain for Job " + id);
498      lock (engines) {
499        try {
500          if (engines.ContainsKey(id))
501            engines[id].Dispose();
502          if (appDomains.ContainsKey(id)) {
503            appDomains[id].UnhandledException -= new UnhandledExceptionEventHandler(appDomain_UnhandledException);
504
505            int repeat = 5;
506            while (repeat > 0) {
507              try {
508                AppDomain.Unload(appDomains[id]);
509                repeat = 0;
510              }
511              catch (CannotUnloadAppDomainException) {
512                Logger.Error("Could not unload AppDomain, will try again in 1 sec.");
513                Thread.Sleep(1000);
514                repeat--;
515                if (repeat == 0) {
516                  throw; // rethrow and let app crash
517                }
518              }
519            }
520            appDomains.Remove(id);
521          }
522
523          engines.Remove(id);
524          jobs.Remove(id);
525          PluginCache.Instance.DeletePluginsForJob(id);
526          GC.Collect();
527        }
528        catch (Exception ex) {
529          Logger.Error("Exception when unloading the appdomain: ", ex);
530        }
531      }
532      GC.Collect();
533    }
534  }
535
536
537}
Note: See TracBrowser for help on using the repository browser.