Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Services.Hive/3.4/LifecycleManager.cs @ 5991

Last change on this file since 5991 was 5786, checked in by cneumuel, 14 years ago

#1233

  • implemented correct numbering of BatchRuns
  • improvements in ExperimentManager
  • fixed bug in server (jobs were scheduled multiple times)
  • added exception handling for task in slave
  • improved timeout handling of jobs (LifecycleManager)
File size: 3.0 KB
Rev | Line
[5095]1using System;
2using System.Linq;
3using HeuristicLab.Services.Hive.Common;
4using HeuristicLab.Services.Hive.Common.DataTransfer;
5
namespace HeuristicLab.Services.Hive {
  /// <summary>
  /// This class offers methods for cleaning up offline slaves and jobs:
  /// slaves whose heartbeat timed out are set offline, jobs that were being
  /// calculated by such slaves (or whose own heartbeat timed out) are set back
  /// to waiting, and parent jobs that can be finished are set to finished.
  /// </summary>
  public class LifecycleManager : ILifecycleManager {
    // Message stored with the state change whenever a job is set back to waiting.
    private const string TimeoutMessage = "Slave timed out.";

    // The services below are looked up through the ServiceLocator on every access
    // instead of being cached in fields.
    private DataAccess.IHiveDao dao {
      get { return ServiceLocator.Instance.HiveDao; }
    }
    // NOTE(review): trans and auth are not used by any method of this class.
    private HeuristicLab.Services.Hive.DataAccess.TransactionManager trans {
      get { return ServiceLocator.Instance.TransactionManager; }
    }
    private IAuthorizationManager auth {
      get { return ServiceLocator.Instance.AuthorizationManager; }
    }
    private ILogger log {
      get { return LogFactory.GetLogger(this.GetType().Namespace); }
    }

    /// <summary>
    /// Runs all cleanup steps in a fixed order: timed out slaves first, then
    /// timed out jobs, then parent jobs that can be finished.
    /// </summary>
    public void Cleanup() {
      log.Log("LifecycleManager.Cleanup()");
      SetTimeoutSlavesOffline();
      SetTimeoutJobsWaiting();
      FinishParentJobs();
    }

    /// <summary>
    /// Searches for slaves which are timed out, puts them and their jobs offline.
    /// A slave counts as timed out when it has never sent a heartbeat or when its
    /// last heartbeat is older than ApplicationConstants.SlaveHeartbeatTimeout.
    /// </summary>
    private void SetTimeoutSlavesOffline() {
      var slaves = dao.GetSlaves(x => x.SlaveState != SlaveState.Offline);
      foreach (Slave slave in slaves) {
        bool timedOut = !slave.LastHeartbeat.HasValue
                        || (DateTime.Now - slave.LastHeartbeat.Value) > ApplicationConstants.SlaveHeartbeatTimeout;
        if (timedOut) {
          slave.SlaveState = SlaveState.Offline;
          // Reschedule the slave's jobs before the slave itself is persisted as offline.
          SetJobsWaiting(slave.Id);
          dao.UpdateSlave(slave);
        }
      }
    }

    /// <summary>
    /// Looks for parent jobs which have FinishWhenChildJobsFinished and set their state to finished.
    /// </summary>
    private void FinishParentJobs() {
      // NOTE(review): the arguments 0 and true are passed through unchanged; their
      // meaning is defined by IHiveDao.GetParentJobs - confirm there.
      var resourceIds = dao.GetResources(x => true).Select(x => x.Id);
      var parentJobsToFinish = dao.GetParentJobs(resourceIds, 0, true);
      foreach (var job in parentJobsToFinish) {
        dao.UpdateJobState(job.Id, JobState.Finished, null, null, string.Empty);
      }
    }

    /// <summary>
    /// Sets all jobs back to waiting which are in state Calculating and whose most
    /// recent state log entry belongs to the given slave.
    /// </summary>
    /// <param name="slaveId">Id of the slave whose jobs are rescheduled.</param>
    private void SetJobsWaiting(Guid slaveId) {
      // Jobs without any state log entry are skipped; calling Last() on an empty
      // state log would throw and abort the whole cleanup run.
      // ToList() snapshots the deferred query so that the updates made inside the
      // loop cannot influence the enumeration.
      var jobs = dao.GetJobs(x => x.State == JobState.Calculating)
                    .Where(x => x.StateLog != null && x.StateLog.Any() && x.StateLog.Last().SlaveId == slaveId)
                    .ToList();
      foreach (var j in jobs) {
        ResetJobToWaiting(j.Id, slaveId);
      }
    }

    /// <summary>
    /// Looks for jobs which have not sent heartbeats for some time and reschedules them for calculation.
    /// Jobs in state Calculating and jobs in state Transferring each have their own timeout.
    /// </summary>
    private void SetTimeoutJobsWaiting() {
      var jobs = dao.GetJobs(x => (x.State == JobState.Calculating && (DateTime.Now - x.LastHeartbeat) > ApplicationConstants.CalculatingJobHeartbeatTimeout)
                               || (x.State == JobState.Transferring && (DateTime.Now - x.LastHeartbeat) > ApplicationConstants.TransferringJobHeartbeatTimeout));
      foreach (var j in jobs) {
        // No slave id is recorded for this state change.
        ResetJobToWaiting(j.Id, null);
      }
    }

    /// <summary>
    /// Sets a single job to state Waiting, clears the Command of the job returned
    /// by the state update and persists that job.
    /// </summary>
    /// <param name="jobId">Id of the job to reset.</param>
    /// <param name="slaveId">Slave id passed on to the state update, or null.</param>
    private void ResetJobToWaiting(Guid jobId, Guid? slaveId) {
      Job job = dao.UpdateJobState(jobId, JobState.Waiting, slaveId, null, TimeoutMessage);
      job.Command = null;
      dao.UpdateJob(job);
    }
  }
}
Note: See TracBrowser for help on using the repository browser.