Free cookie consent management tool by TermsFeed Policy Generator

source: branches/3105_PythonFormatter/HeuristicLab.Services.Hive/3.3/Manager/EventManager.cs @ 17842

Last change on this file since 17842 was 17574, checked in by jkarder, 5 years ago

#3062: overhauled statistics generation and cleanup

  • switched to a single thread for database cleanup and statistics generation (executed sequentially)
  • switched to preemptive deletion of items that are in status DeletionPending (for jobs: statelogs, taskdata, tasks)
  • added code that aborts tasks whose jobs have already been marked for deletion
  • added method UseTransactionAndSubmit in addition to UseTransaction in PersistenceManager
  • updated DAO methods and introduced more bare metal sql
  • introduced DAO methods for batch deletion
  • fixed usage of enum values in DAO sql queries
  • deleted unnecessary triggers tr_JobDeleteCascade and tr_TaskDeleteCascade in Prepare Hive Database.sql
  • changed scheduling for less interference with janitor and other heartbeats
    • increased scheduling patience from 20 to 70 seconds (to wait longer to get the mutex for scheduling)
    • changed signature of ITaskScheduler.Schedule
    • added base class for TaskSchedulers and moved assignment of tasks to slaves into it
    • changed RoundRobinTaskScheduler to use bare metal sql
  • made MessageContainer a storable type (leftover)
  • updated HiveJanitorServiceInstaller.nsi
File size: 8.3 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Linq;
24using HeuristicLab.Services.Hive.DataAccess;
25using HeuristicLab.Services.Hive.DataAccess.Interfaces;
26
27namespace HeuristicLab.Services.Hive.Manager {
28  public class EventManager : IEventManager {
29    private const string SlaveTimeout = "Slave timed out.";
30    private static readonly TaskState[] CompletedStates = { TaskState.Finished, TaskState.Aborted, TaskState.Failed };
31
32    private IPersistenceManager PersistenceManager {
33      get { return ServiceLocator.Instance.PersistenceManager; }
34    }
35
36    public void Cleanup() {
37      Console.WriteLine("started cleanup");
38      var pm = PersistenceManager;
39
40      // preemptiv delete obsolete entities
41      // speeds up job deletion
42      BatchDelete((p, s) => p.StateLogDao.DeleteObsolete(s), 100, 100, true, pm, "DeleteObsoleteStateLogs");
43      BatchDelete((p, s) => p.TaskDataDao.DeleteObsolete(s), 100, 20, true, pm, "DeleteObsoleteTaskData");
44      BatchDelete((p, s) => p.TaskDao.DeleteObsolete(s), 100, 20, false, pm, "DeleteObsoleteTasks");
45      BatchDelete((p, s) => p.JobDao.DeleteByState(JobState.DeletionPending, s), 100, 20, true, pm, "DeleteObsoleteJobs");
46
47      LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log("HiveJanitor: SetTimeoutSlavesOffline");
48      Console.WriteLine("5");
49      pm.UseTransactionAndSubmit(() => { SetTimeoutSlavesOffline(pm); });
50      LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log("HiveJanitor: SetTimeoutTasksWaiting");
51      Console.WriteLine("6");
52      pm.UseTransactionAndSubmit(() => { SetTimeoutTasksWaiting(pm); });
53      LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log("HiveJanitor: DeleteObsoleteSlaves");
54      Console.WriteLine("7");
55      pm.UseTransactionAndSubmit(() => { DeleteObsoleteSlaves(pm); });
56      LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log("HiveJanitor: AbortObsoleteTasks");
57      Console.WriteLine("8");
58      pm.UseTransactionAndSubmit(() => { AbortObsoleteTasks(pm); });
59      LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log("HiveJanitor: FinishParentTasks");
60      Console.WriteLine("9");
61      pm.UseTransactionAndSubmit(() => { FinishParentTasks(pm); });
62      LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log("HiveJanitor: DONE");
63      Console.WriteLine("10");
64    }
65
66    private void BatchDelete(
67      Func<IPersistenceManager, int, int> deletionFunc,
68      int batchSize,
69      int maxCalls,
70      bool limitIsBatchSize,
71      IPersistenceManager pm,
72      string logMessage
73    ) {
74      int totalDeleted = 0;
75      while (maxCalls > 0) {
76        maxCalls--;
77        LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log($"HiveJanitor: {logMessage}");
78        Console.WriteLine($"HiveJanitor: {logMessage}");
79        var deleted = pm.UseTransactionAndSubmit(() => { return deletionFunc(pm, batchSize); });
80        LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log($"HiveJanitor: {logMessage} DONE (deleted {deleted}, {maxCalls} calls left)");
81        Console.WriteLine($"HiveJanitor: {logMessage} DONE (deleted {deleted}, {maxCalls} calls left)");
82        totalDeleted += deleted;
83        if (limitIsBatchSize && deleted < batchSize || deleted <= 0) return;
84      }
85      LogFactory.GetLogger(typeof(HiveJanitor).Namespace).Log($"HiveJanitor: Possible rows left to delete (total deleted: {totalDeleted}).");
86      Console.WriteLine($"HiveJanitor: Possible rows left to delete (total deleted: {totalDeleted}).");
87    }
88
89    /// <summary>
90    /// Deletes all jobs which are in state "DeletionPending" (this will include all corresponding tasks).
91    /// The state "DeletionPending" is set by HiveJanitor > StatisticsGenerator
92    /// </summary>
93    private void FinishJobDeletion(IPersistenceManager pm) {
94      var jobDao = pm.JobDao;
95      jobDao.DeleteByState(JobState.DeletionPending);
96    }
97
98    /// <summary>
99    /// Searches for slaves which are timed out, puts them and their task offline
100    /// </summary>
101    private void SetTimeoutSlavesOffline(IPersistenceManager pm) {
102      var slaveDao = pm.SlaveDao;
103      var slaves = slaveDao.GetOnlineSlaves();
104      foreach (var slave in slaves) {
105        if (!slave.LastHeartbeat.HasValue ||
106            (DateTime.Now - slave.LastHeartbeat.Value) >
107            Properties.Settings.Default.SlaveHeartbeatTimeout) {
108          slave.SlaveState = SlaveState.Offline;
109        }
110      }
111    }
112
113    /// <summary>
114    /// Looks for parent tasks which have FinishWhenChildJobsFinished and set their state to finished
115    /// </summary>
116    private void FinishParentTasks(IPersistenceManager pm) {
117      var resourceDao = pm.ResourceDao;
118      var taskDao = pm.TaskDao;
119      var resourceIds = resourceDao.GetAll().Select(x => x.ResourceId).ToList();
120      var parentTasksToFinish = taskDao.GetParentTasks(resourceIds, 0, true);
121      foreach (var task in parentTasksToFinish) {
122        task.State = TaskState.Finished;
123        task.StateLogs.Add(new StateLog {
124          State = task.State,
125          SlaveId = null,
126          UserId = null,
127          Exception = string.Empty,
128          DateTime = DateTime.Now
129        });
130      }
131    }
132
133    /// <summary>
134    /// Looks for task which have not sent heartbeats for some time and reschedules them for calculation
135    /// </summary>
136    private void SetTimeoutTasksWaiting(IPersistenceManager pm) {
137      var taskDao = pm.TaskDao;
138      var tasks = taskDao.GetAll().Where(x => (x.State == TaskState.Calculating && (DateTime.Now - x.LastHeartbeat) > Properties.Settings.Default.CalculatingJobHeartbeatTimeout)
139                                           || (x.State == TaskState.Transferring && (DateTime.Now - x.LastHeartbeat) > Properties.Settings.Default.TransferringJobHeartbeatTimeout));
140      foreach (var task in tasks) {
141        task.State = TaskState.Waiting;
142        task.StateLogs.Add(new StateLog {
143          State = task.State,
144          SlaveId = null,
145          UserId = null,
146          Exception = SlaveTimeout,
147          DateTime = DateTime.Now
148        });
149        task.Command = null;
150      }
151    }
152
153    /// <summary>
154    /// Searches for slaves that are disposable and deletes them if they were offline for too long
155    /// </summary>
156    private void DeleteObsoleteSlaves(IPersistenceManager pm) {
157      var slaveDao = pm.SlaveDao;
158      var downtimeDao = pm.DowntimeDao;
159      var slaveIds = slaveDao.GetAll()
160        .Where(x => x.IsDisposable.GetValueOrDefault()
161                 && x.SlaveState == SlaveState.Offline
162                 && (DateTime.Now - x.LastHeartbeat) > Properties.Settings.Default.SweepInterval)
163        .Select(x => x.ResourceId)
164        .ToList();
165      foreach (var id in slaveIds) {
166        bool downtimesAvailable = downtimeDao.GetByResourceId(id).Any();
167        if (!downtimesAvailable) {
168          slaveDao.Delete(id);
169        }
170      }
171    }
172
173    /// <summary>
174    /// Aborts tasks whose jobs have already been marked for deletion
175    /// </summary>
176    /// <param name="pm"></param>
177    private void AbortObsoleteTasks(IPersistenceManager pm) {
178      var jobDao = pm.JobDao;
179      var taskDao = pm.TaskDao;
180
181      var obsoleteTasks = (from jobId in jobDao.GetJobIdsByState(JobState.StatisticsPending)
182                           join task in taskDao.GetAll() on jobId equals task.JobId
183                           where !CompletedStates.Contains(task.State) && task.Command == null
184                           select task).ToList();
185
186      foreach (var t in obsoleteTasks) {
187        t.State = TaskState.Aborted;
188      }
189    }
190  }
191}
Note: See TracBrowser for help on using the repository browser.