source: trunk/sources/HeuristicLab.Services.Hive/3.3/Manager/HeartbeatManager.cs @ 12878

Last change on this file since 12878 was 12878, checked in by ascheibe, 7 years ago

#2388 merged hive statistics branch into trunk

File size: 9.0 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using System.Threading;
26using HeuristicLab.Services.Hive.DataAccess;
27using HeuristicLab.Services.Hive.DataAccess.Interfaces;
28using HeuristicLab.Services.Hive.DataTransfer;
29using DA = HeuristicLab.Services.Hive.DataAccess;
30
31namespace HeuristicLab.Services.Hive.Manager {
32  public class HeartbeatManager {
33    private const string MutexName = "HiveTaskSchedulingMutex";
34
35    private IPersistenceManager PersistenceManager {
36      get { return ServiceLocator.Instance.PersistenceManager; }
37    }
38
39    private ITaskScheduler TaskScheduler {
40      get { return ServiceLocator.Instance.TaskScheduler; }
41    }
42
43    /// <summary>
44    /// This method will be called every time a slave sends a heartbeat (-> very often; concurrency is important!)
45    /// </summary>
46    /// <returns>a list of actions the slave should do</returns>
47    public List<MessageContainer> ProcessHeartbeat(Heartbeat heartbeat) {
48      List<MessageContainer> actions = new List<MessageContainer>();
49      var pm = PersistenceManager;
50      var slaveDao = pm.SlaveDao;
51      var taskDao = pm.TaskDao;
52      var slave = pm.UseTransaction(() => slaveDao.GetById(heartbeat.SlaveId));
53      if (slave == null) {
54        actions.Add(new MessageContainer(MessageContainer.MessageType.SayHello));
55      } else {
56        if (heartbeat.HbInterval != slave.HbInterval) {
57          actions.Add(new MessageContainer(MessageContainer.MessageType.NewHBInterval));
58        }
59        if (slaveDao.SlaveHasToShutdownComputer(slave.ResourceId)) {
60          actions.Add(new MessageContainer(MessageContainer.MessageType.ShutdownComputer));
61        }
62        // update slave data 
63        slave.FreeCores = heartbeat.FreeCores;
64        slave.FreeMemory = heartbeat.FreeMemory;
65        slave.CpuUtilization = heartbeat.CpuUtilization;
66        slave.SlaveState = (heartbeat.JobProgress != null && heartbeat.JobProgress.Count > 0)
67          ? DA.SlaveState.Calculating
68          : DA.SlaveState.Idle;
69        slave.LastHeartbeat = DateTime.Now;
70        pm.UseTransaction(() => {
71          slave.IsAllowedToCalculate = slaveDao.SlaveIsAllowedToCalculate(slave.ResourceId);
72          pm.SubmitChanges();
73        });
74
75        // update task data
76        actions.AddRange(UpdateTasks(pm, heartbeat, slave.IsAllowedToCalculate));
77
78        // assign new task
79        if (heartbeat.AssignJob && slave.IsAllowedToCalculate && heartbeat.FreeCores > 0) {
80          bool mutexAquired = false;
81          var mutex = new Mutex(false, MutexName);
82          try {
83            mutexAquired = mutex.WaitOne(Properties.Settings.Default.SchedulingPatience);
84            if (mutexAquired) {
85              var waitingTasks = pm.UseTransaction(() => taskDao.GetWaitingTasks(slave)
86                  .Select(x => new TaskInfoForScheduler {
87                    TaskId = x.TaskId,
88                    JobId = x.JobId,
89                    Priority = x.Priority
90                  })
91                  .ToList()
92              );
93              var availableTasks = TaskScheduler.Schedule(waitingTasks).ToArray();
94              if (availableTasks.Any()) {
95                var task = availableTasks.First();
96                AssignTask(pm, slave, task.TaskId);
97                actions.Add(new MessageContainer(MessageContainer.MessageType.CalculateTask, task.TaskId));
98              }
99            } else {
100              LogFactory.GetLogger(this.GetType().Namespace).Log("HeartbeatManager: The mutex used for scheduling could not be aquired.");
101            }
102          }
103          catch (AbandonedMutexException) {
104            LogFactory.GetLogger(this.GetType().Namespace).Log("HeartbeatManager: The mutex used for scheduling has been abandoned.");
105          }
106          catch (Exception ex) {
107            LogFactory.GetLogger(this.GetType().Namespace).Log(string.Format("HeartbeatManager threw an exception in ProcessHeartbeat: {0}", ex));
108          }
109          finally {
110            if (mutexAquired) mutex.ReleaseMutex();
111          }
112        }
113      }
114      return actions;
115    }
116
117    private void AssignTask(IPersistenceManager pm, DA.Slave slave, Guid taskId) {
118      const DA.TaskState transferring = DA.TaskState.Transferring;
119      DateTime now = DateTime.Now;
120      var taskDao = pm.TaskDao;
121      var stateLogDao = pm.StateLogDao;
122      pm.UseTransaction(() => {
123        var task = taskDao.GetById(taskId);
124        stateLogDao.Save(new DA.StateLog {
125          State = transferring,
126          DateTime = now,
127          TaskId = taskId,
128          SlaveId = slave.ResourceId,
129          UserId = null,
130          Exception = null
131        });
132        task.State = transferring;
133        task.LastHeartbeat = now;
134        pm.SubmitChanges();
135      });
136    }
137
138    /// <summary>
139    /// Update the progress of each task
140    /// Checks if all the task sent by heartbeat are supposed to be calculated by this slave
141    /// </summary>
142    private IEnumerable<MessageContainer> UpdateTasks(IPersistenceManager pm, Heartbeat heartbeat, bool isAllowedToCalculate) {
143      var taskDao = pm.TaskDao;
144      var assignedResourceDao = pm.AssignedResourceDao;
145      var actions = new List<MessageContainer>();
146      if (heartbeat.JobProgress == null || !heartbeat.JobProgress.Any())
147        return actions;
148
149      if (!isAllowedToCalculate && heartbeat.JobProgress.Count != 0) {
150        actions.Add(new MessageContainer(MessageContainer.MessageType.PauseAll));
151      } else {
152        // select all tasks and statelogs with one query
153        var taskIds = heartbeat.JobProgress.Select(x => x.Key).ToList();
154        var taskInfos = pm.UseTransaction(() =>
155          (from task in taskDao.GetAll()
156           where taskIds.Contains(task.TaskId)
157           let lastStateLog = task.StateLogs.OrderByDescending(x => x.DateTime).FirstOrDefault()
158           select new {
159             TaskId = task.TaskId,
160             Command = task.Command,
161             SlaveId = lastStateLog != null ? lastStateLog.SlaveId : default(Guid)
162           }).ToList()
163        );
164
165        // process the jobProgresses
166        foreach (var jobProgress in heartbeat.JobProgress) {
167          var progress = jobProgress;
168          var curTask = taskInfos.SingleOrDefault(x => x.TaskId == progress.Key);
169          if (curTask == null) {
170            actions.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, progress.Key));
171            LogFactory.GetLogger(this.GetType().Namespace).Log("Task on slave " + heartbeat.SlaveId + " does not exist in DB: " + jobProgress.Key);
172          } else {
173            var slaveId = curTask.SlaveId;
174            if (slaveId == Guid.Empty || slaveId != heartbeat.SlaveId) {
175              // assigned slave does not match heartbeat
176              actions.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, curTask.TaskId));
177              LogFactory.GetLogger(this.GetType().Namespace).Log("The slave " + heartbeat.SlaveId + " is not supposed to calculate task: " + curTask.TaskId);
178            } else if (!assignedResourceDao.TaskIsAllowedToBeCalculatedBySlave(curTask.TaskId, heartbeat.SlaveId)) {
179              // assigned resources ids of task do not match with slaveId (and parent resourceGroupIds); this might happen when slave is moved to different group
180              actions.Add(new MessageContainer(MessageContainer.MessageType.PauseTask, curTask.TaskId));
181            } else {
182              // update task execution time
183              pm.UseTransaction(() => {
184                taskDao.UpdateExecutionTime(curTask.TaskId, progress.Value.TotalMilliseconds);
185              });
186              switch (curTask.Command) {
187                case DA.Command.Stop:
188                  actions.Add(new MessageContainer(MessageContainer.MessageType.StopTask, curTask.TaskId));
189                  break;
190                case DA.Command.Pause:
191                  actions.Add(new MessageContainer(MessageContainer.MessageType.PauseTask, curTask.TaskId));
192                  break;
193                case DA.Command.Abort:
194                  actions.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, curTask.TaskId));
195                  break;
196              }
197            }
198          }
199        }
200      }
201      return actions;
202    }
203  }
204}
Note: See TracBrowser for help on using the repository browser.