Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Grid/3.2/JobManager.cs @ 2055

Last change on this file since 2055 was 2055, checked in by gkronber, 15 years ago

Refactored JobManager and DistributedEngine to fix bugs in the GridExecuter. #644.

File size: 8.3 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using System.Text;
26using System.ServiceModel;
27using HeuristicLab.Grid;
28using System.Threading;
29using HeuristicLab.Core;
30using System.IO;
31using System.Windows.Forms;
32using System.Diagnostics;
33
34namespace HeuristicLab.Grid {
/// <summary>
/// Signals that a grid job could not be executed. It is thrown by
/// <c>JobManager.EndExecuteEngine</c> when the job was flagged as aborted.
/// </summary>
public class JobExecutionException : ApplicationException {
  /// <summary>Creates the exception with the default message.</summary>
  public JobExecutionException() : base() { }

  /// <summary>Creates the exception with a message describing the failure.</summary>
  /// <param name="msg">Human-readable description of the failure.</param>
  public JobExecutionException(string msg) : base(msg) { }

  /// <summary>
  /// Creates the exception with a message and the exception that caused the failure,
  /// so that callers can preserve the original cause when wrapping an error.
  /// </summary>
  /// <param name="msg">Human-readable description of the failure.</param>
  /// <param name="innerException">The underlying cause; may be null.</param>
  public JobExecutionException(string msg, Exception innerException) : base(msg, innerException) { }
}
38
/// <summary>
/// Submits processing engines to a grid server and collects their results.
/// Two worker threads are started by the constructor: one dequeues waiting jobs and
/// starts them on the server (<see cref="StartEngines"/>), the other polls the server
/// for the results of running jobs (<see cref="GetResults"/>).
/// All calls to the server go through <c>SavelyExecute</c>, which serializes them with
/// <c>connectionLock</c> and retries on timeouts and communication errors.
/// </summary>
public class JobManager {
  // upper bound for job.Restarts before a job that cannot be started is flagged as aborted
  private const int MAX_RESTARTS = 5;
  // maximal number of attempts for a single server call
  private const int MAX_CONNECTION_RETRIES = 10;
  // pause (in seconds) between two attempts of a server call
  private const int RETRY_TIMEOUT_SEC = 60;
  // pause (in seconds) after a job was found to be still active at the server
  private const int RESULT_POLLING_TIMEOUT = 5;

  // current channel to the grid server; only accessed while holding connectionLock
  private IGridServer server;
  private readonly string address;
  private readonly object waitingQueueLock = new object();
  private readonly Queue<AsyncGridResult> waitingJobs = new Queue<AsyncGridResult>();
  private readonly object runningQueueLock = new object();
  private readonly Queue<AsyncGridResult> runningJobs = new Queue<AsyncGridResult>();
  private readonly object connectionLock = new object();

  // signaled whenever a job is added to the corresponding queue
  private readonly AutoResetEvent runningWaitHandle = new AutoResetEvent(false);
  private readonly AutoResetEvent waitingWaitHandle = new AutoResetEvent(false);

  // factory that created the current channel; only accessed while holding connectionLock
  private ChannelFactory<IGridServer> factory;

  /// <summary>
  /// Creates a job manager for the grid server at the given address and starts the
  /// two worker threads. The connection itself is opened by <see cref="Reset"/> or
  /// lazily on the first server call.
  /// </summary>
  /// <param name="address">Endpoint address of the grid server.</param>
  public JobManager(string address) {
    this.address = address;
    Thread starterThread = new Thread(StartEngines);
    Thread resultsGatheringThread = new Thread(GetResults);
    // both workers loop forever; as foreground threads they would keep the process
    // alive after all other threads have finished
    starterThread.IsBackground = true;
    resultsGatheringThread.IsBackground = true;
    starterThread.Start();
    resultsGatheringThread.Start();
  }

  /// <summary>
  /// Re-creates the connection to the server and discards all queued jobs
  /// (waiting and running). The wait handles of the discarded jobs are closed.
  /// </summary>
  public void Reset() {
    ResetConnection();
    lock (waitingQueueLock) {
      foreach (AsyncGridResult r in waitingJobs) {
        r.WaitHandle.Close();
      }
      waitingJobs.Clear();
    }
    lock (runningQueueLock) {
      foreach (AsyncGridResult r in runningJobs) {
        r.WaitHandle.Close();
      }
      runningJobs.Clear();
    }
  }

  /// <summary>
  /// Releases the current channel and factory (if any) and opens a new channel
  /// to the configured address.
  /// </summary>
  private void ResetConnection() {
    Trace.TraceInformation("Reset connection in JobManager");
    lock (connectionLock) {
      // release the previous channel and factory, otherwise every reset leaks a connection
      CloseConnection();

      // open a new channel
      NetTcpBinding binding = new NetTcpBinding();
      binding.MaxReceivedMessageSize = 100000000; // 100Mbytes
      binding.ReaderQuotas.MaxStringContentLength = 100000000; // also 100M chars
      binding.ReaderQuotas.MaxArrayLength = 100000000; // also 100M elements;

      factory = new ChannelFactory<IGridServer>(binding);
      server = factory.CreateChannel(new EndpointAddress(address));
    }
  }

  /// <summary>
  /// Aborts the current channel and factory and clears both fields.
  /// Must be called while holding <c>connectionLock</c>.
  /// </summary>
  private void CloseConnection() {
    // Abort() is used instead of Close() because the channel may already be faulted
    ICommunicationObject channel = server as ICommunicationObject;
    server = null;
    if (channel != null) channel.Abort();
    if (factory != null) {
      factory.Abort();
      factory = null;
    }
  }

  /// <summary>
  /// Worker loop of the starter thread: takes jobs from the waiting queue and tries to
  /// start them on the server. Jobs that cannot be started are re-queued until
  /// MAX_RESTARTS is reached and are flagged as aborted afterwards. Started jobs are
  /// moved to the running queue. Any exception ends the loop and is written to the trace.
  /// </summary>
  public void StartEngines() {
    try {
      while (true) {
        AsyncGridResult job = null;
        lock (waitingQueueLock) {
          if (waitingJobs.Count > 0) job = waitingJobs.Dequeue();
        }
        if (job == null) {
          waitingWaitHandle.WaitOne(); // no jobs waiting
          continue;
        }

        Guid currentEngineGuid = TryStartExecuteEngine(job.Engine);
        if (currentEngineGuid != Guid.Empty) {
          // job started successfully
          job.Guid = currentEngineGuid;
          lock (runningQueueLock) {
            runningJobs.Enqueue(job);
            runningWaitHandle.Set();
          }
        } else if (job.Restarts < MAX_RESTARTS) {
          // couldn't start the job -> requeue
          job.Restarts++;
          lock (waitingQueueLock) waitingJobs.Enqueue(job);
          waitingWaitHandle.Set();
        } else {
          // max restart count reached -> give up on this job and flag error
          job.Aborted = true;
          job.SignalFinished();
        }
      }
    }
    catch (Exception e) {
      Trace.TraceError("Exception " + e + " in JobManager.StartEngines() killed the start-engine thread\n" + e.StackTrace);
    }
  }


  /// <summary>
  /// Worker loop of the results-gathering thread: takes jobs from the running queue and
  /// asks the server for their result. If no result is available the job state is
  /// checked; jobs in state <c>JobState.Unknown</c> are moved back to the waiting queue,
  /// all other jobs are re-queued as running and polled again later.
  /// Any exception ends the loop and is written to the trace.
  /// </summary>
  public void GetResults() {
    try {
      while (true) {
        AsyncGridResult job = null;
        lock (runningQueueLock) {
          if (runningJobs.Count > 0) job = runningJobs.Dequeue();
        }
        if (job == null) {
          runningWaitHandle.WaitOne(); // no jobs running
          continue;
        }

        byte[] zippedResult = TryEndExecuteEngine(job.Guid);
        if (zippedResult != null) {
          // successful => store result
          job.ZippedResult = zippedResult;
          // notify consumer that result is ready
          job.SignalFinished();
          continue;
        }

        // there was a problem -> check the state of the job and restart if necessary
        JobState jobState = TryGetJobState(job.Guid);
        if (jobState == JobState.Unknown) {
          job.Restarts++;
          lock (waitingQueueLock) {
            waitingJobs.Enqueue(job);
            waitingWaitHandle.Set();
          }
        } else {
          // job still active at the server
          lock (runningQueueLock) {
            runningJobs.Enqueue(job);
            runningWaitHandle.Set();
          }
          Thread.Sleep(TimeSpan.FromSeconds(RESULT_POLLING_TIMEOUT)); // sleep a while before trying to get the next result
        }
      }
    }
    catch (Exception e) {
      Trace.TraceError("Exception " + e + " in JobManager.GetResults() killed the results-gathering thread\n" + e.StackTrace);
    }
  }

  /// <summary>
  /// Queues the engine for execution on the grid and returns immediately.
  /// </summary>
  /// <param name="engine">The engine to execute.</param>
  /// <returns>The handle that has to be passed to <see cref="EndExecuteEngine"/>.</returns>
  public AsyncGridResult BeginExecuteEngine(ProcessingEngine engine) {
    AsyncGridResult asyncResult = new AsyncGridResult(engine);
    asyncResult.Engine = engine;
    lock (waitingQueueLock) {
      waitingJobs.Enqueue(asyncResult);
    }
    waitingWaitHandle.Set(); // wake up the starter thread
    return asyncResult;
  }

  /// <summary>Serializes and compresses the engine for transmission to the server.</summary>
  private byte[] ZipEngine(IEngine engine) {
    return PersistenceManager.SaveToGZip(engine);
  }

  /// <summary>
  /// Restores the engine from the zipped result stored in the given handle.
  /// </summary>
  /// <param name="asyncResult">Handle returned by <see cref="BeginExecuteEngine"/>.</param>
  /// <returns>The engine restored from <c>asyncResult.ZippedResult</c>.</returns>
  /// <exception cref="JobExecutionException">The job was flagged as aborted.</exception>
  public IEngine EndExecuteEngine(AsyncGridResult asyncResult) {
    if (asyncResult.Aborted) {
      throw new JobExecutionException("Maximal number of job restarts reached. There is a problem with the connection to the grid-server.");
    } else {
      // restore the engine
      return (IEngine)PersistenceManager.RestoreFromGZip(asyncResult.ZippedResult);
    }
  }

  /// <summary>
  /// Sends the zipped engine to the server.
  /// Returns <c>Guid.Empty</c> if all connection retries failed.
  /// </summary>
  private Guid TryStartExecuteEngine(IEngine engine) {
    byte[] zippedEngine = ZipEngine(engine);
    return SavelyExecute(s => s.BeginExecuteEngine(zippedEngine));
  }

  /// <summary>
  /// Asks the server for the result of the given job.
  /// Returns null if all connection retries failed.
  /// </summary>
  private byte[] TryEndExecuteEngine(Guid engineGuid) {
    return SavelyExecute(s => s.TryEndExecuteEngine(engineGuid));
  }

  /// <summary>
  /// Asks the server for the state of the given job.
  /// Returns <c>default(JobState)</c> if all connection retries failed.
  /// </summary>
  private JobState TryGetJobState(Guid engineGuid) {
    return SavelyExecute(s => s.JobState(engineGuid));
  }

  /// <summary>
  /// Executes a server call while holding <c>connectionLock</c> and retries it up to
  /// MAX_CONNECTION_RETRIES times on timeouts and communication errors. The operation
  /// receives the channel that is current at the time of the attempt, so retries after
  /// a connection reset use the new channel instead of the faulted one.
  /// </summary>
  /// <param name="a">The server call to execute.</param>
  /// <returns>The result of the call, or <c>default(TResult)</c> if all attempts failed.</returns>
  private TResult SavelyExecute<TResult>(Func<IGridServer, TResult> a) {
    int retries = 0;
    do {
      try {
        lock (connectionLock) {
          // no channel yet (Reset() has not been called) -> open one now
          // (the lock is re-entrant, ResetConnection acquires it again)
          if (server == null) ResetConnection();
          return a(server);
        }
      }
      catch (TimeoutException) {
        retries++;
        Thread.Sleep(TimeSpan.FromSeconds(RETRY_TIMEOUT_SEC));
      }
      catch (CommunicationException) {
        // the channel is unusable after a communication error -> replace it
        ResetConnection();
        retries++;
        Thread.Sleep(TimeSpan.FromSeconds(RETRY_TIMEOUT_SEC));
      }
    } while (retries < MAX_CONNECTION_RETRIES);
    Trace.TraceWarning("Reached max connection retries");
    return default(TResult);
  }
}
236}
Note: See TracBrowser for help on using the repository browser.