source: trunk/sources/HeuristicLab.Services.Hive/3.3/Manager/EventManager.cs @ 7434

Last change on this file since 7434 was 7434, checked in by ascheibe, 8 years ago

#1722 When setting timed-out slaves offline the tasks on these slaves shouldn't be set to waiting as this leads to performance problems and deadlocks with big statelog tables. Setting tasks to waiting is done by another method of the cleanup process more efficiently.

File size: 4.8 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Services.Hive.DataAccess;
26using DT = HeuristicLab.Services.Hive.DataTransfer;
27
28
namespace HeuristicLab.Services.Hive {
  /// <summary>
  /// Offers the periodic cleanup routine of the Hive service: it sets timed-out
  /// slaves offline, reschedules timed-out tasks, finishes parent tasks and
  /// records slave statistics.
  /// </summary>
  public class EventManager : IEventManager {
    // All collaborators are resolved on every access instead of being cached in fields.
    private IHiveDao dao {
      get { return ServiceLocator.Instance.HiveDao; }
    }
    private IAuthorizationManager auth {
      get { return ServiceLocator.Instance.AuthorizationManager; }
    }
    private ILogger log {
      get { return LogFactory.GetLogger(this.GetType().Namespace); }
    }
    private DataAccess.ITransactionManager trans {
      get { return ServiceLocator.Instance.TransactionManager; }
    }

    /// <summary>
    /// Runs the complete cleanup in two separate transactions: first the timeout
    /// handling for slaves and tasks, then the finishing of parent tasks together
    /// with the statistics update.
    /// </summary>
    /// <remarks>
    /// The two UseTransaction calls pass different flags (true / false). The meaning
    /// of that flag is defined by ITransactionManager and is not visible in this file.
    /// </remarks>
    public void Cleanup() {
      trans.UseTransaction(() => {
        SetTimeoutSlavesOffline();
        SetTimeoutTasksWaiting();
      }, true);

      trans.UseTransaction(() => {
        FinishParentTasks();
        UpdateStatistics();
      }, false);
    }

    /// <summary>
    /// Stores a statistics snapshot with one entry for every slave that is
    /// currently calculating or idle.
    /// </summary>
    private void UpdateStatistics() {
      var activeSlaves = dao.GetSlaves(x => x.SlaveState == SlaveState.Calculating || x.SlaveState == SlaveState.Idle);

      var stats = new DT.Statistics();
      stats.TimeStamp = DateTime.Now;
      // Values a slave has not reported are recorded as 0.
      stats.SlaveStatistics = activeSlaves.Select(slave => new DT.SlaveStatistics() {
        SlaveId = slave.Id,
        Cores = slave.Cores ?? 0,
        FreeCores = slave.FreeCores ?? 0,
        Memory = slave.Memory ?? 0,
        FreeMemory = slave.FreeMemory ?? 0,
        CpuUtilization = slave.CpuUtilization
      }).ToList();
      //collecting user statistics slows down the db and results in timeouts.
      //we have to find another way to deal with this.
      //until then the next line is commented out...
      //stats.UserStatistics = dao.GetUserStatistics();
      dao.AddStatistics(stats);
    }

    /// <summary>
    /// Searches for slaves which are timed out and sets them offline. A slave counts
    /// as timed out if it never sent a heartbeat or if its last heartbeat is older
    /// than the configured SlaveHeartbeatTimeout.
    /// </summary>
    /// <remarks>
    /// The tasks of these slaves are not modified here; timed-out tasks are
    /// rescheduled by <see cref="SetTimeoutTasksWaiting"/>.
    /// </remarks>
    private void SetTimeoutSlavesOffline() {
      // Read the clock and the setting once so that every slave is judged against the same cutoff.
      var now = DateTime.Now;
      var heartbeatTimeout = HeuristicLab.Services.Hive.Properties.Settings.Default.SlaveHeartbeatTimeout;

      var slaves = dao.GetSlaves(x => x.SlaveState != SlaveState.Offline);
      foreach (DT.Slave slave in slaves) {
        bool timedOut = !slave.LastHeartbeat.HasValue || (now - slave.LastHeartbeat.Value) > heartbeatTimeout;
        if (timedOut) {
          slave.SlaveState = DT.SlaveState.Offline;
          dao.UpdateSlave(slave);
        }
      }
    }

    /// <summary>
    /// Looks for parent tasks which have FinishWhenChildJobsFinished and sets their state to finished
    /// </summary>
    private void FinishParentTasks() {
      // The parent tasks are queried across all known resources.
      // NOTE(review): the meaning of the arguments 0 and true is defined by IHiveDao.GetParentTasks - confirm there.
      var allResourceIds = dao.GetResources(x => true).Select(x => x.Id);
      var parentTasksToFinish = dao.GetParentTasks(allResourceIds, 0, true);
      foreach (var task in parentTasksToFinish) {
        dao.UpdateTaskState(task.Id, TaskState.Finished, null, null, string.Empty);
      }
    }

    /// <summary>
    /// Looks for tasks which have not sent heartbeats for some time and reschedules them for calculation.
    /// Calculating and transferring tasks each have their own timeout setting.
    /// </summary>
    private void SetTimeoutTasksWaiting() {
      // NOTE(review): the predicate is handed to the DAO unchanged. If the DAO translates it
      // (e.g. to SQL), DateTime.Now is not evaluated in this process - confirm before hoisting it.
      var timedOutTasks = dao.GetTasks(x => (x.State == TaskState.Calculating && (DateTime.Now - x.LastHeartbeat) > HeuristicLab.Services.Hive.Properties.Settings.Default.CalculatingJobHeartbeatTimeout)
                                         || (x.State == TaskState.Transferring && (DateTime.Now - x.LastHeartbeat) > HeuristicLab.Services.Hive.Properties.Settings.Default.TransferringJobHeartbeatTimeout));
      foreach (var timedOutTask in timedOutTasks) {
        DT.Task task = dao.UpdateTaskState(timedOutTask.Id, TaskState.Waiting, null, null, "Slave timed out.");
        // Clear any pending command on the task before it is persisted again.
        task.Command = null;
        dao.UpdateTask(task);
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.