
source: trunk/sources/HeuristicLab.Grid/JobManager.cs @ 428

Last change on this file since 428 was 414, checked in by gkronber, 16 years ago

implemented #216 (ProcessingEngine should terminate on breakpoints)

File size: 10.2 KB
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.ServiceModel;
using HeuristicLab.Grid;
using System.Threading;
using HeuristicLab.Core;
using System.IO;
using System.Windows.Forms;
using System.Diagnostics;

namespace HeuristicLab.Grid {
  public class JobExecutionException : ApplicationException {
    public JobExecutionException(string msg) : base(msg) { }
  }

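  // Manages asynchronous execution of engines on a grid server: consumers
  // enqueue engines via BeginExecuteOperation/BeginExecuteEngine and block on
  // the returned wait handle, while two background threads start the engines
  // on the server and collect their results.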
  public class JobManager {
    private const int MAX_RESTARTS = 5;
    private const int MAX_CONNECTION_RETRIES = 10;
    private const int RETRY_TIMEOUT_SEC = 60;
    private const int CHECK_RESULTS_TIMEOUT = 3;

    private class Job {
      public Guid guid;
      public ProcessingEngine engine;
      public ManualResetEvent waitHandle;
      public int restarts;
    }

    private IGridServer server;
    private string address;
    private object waitingQueueLock = new object();
    private Queue<Job> waitingJobs = new Queue<Job>();
    private object runningQueueLock = new object();
    private Queue<Job> runningJobs = new Queue<Job>();
    private Dictionary<AtomicOperation, byte[]> results = new Dictionary<AtomicOperation, byte[]>();

    private List<IOperation> erroredOperations = new List<IOperation>();
    private object connectionLock = new object();
    private object dictionaryLock = new object();

    private AutoResetEvent runningWaitHandle = new AutoResetEvent(false);
    private AutoResetEvent waitingWaitHandle = new AutoResetEvent(false);

    private ChannelFactory<IGridServer> factory;

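    // The constructor routes trace output to the Windows event log and starts
    // the two worker threads that drive the waiting and running job queues.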
    public JobManager(string address) {
      Trace.Listeners.Clear();
      Trace.Listeners.Add(new EventLogTraceListener("HeuristicLab.Grid"));
      this.address = address;
      Thread starterThread = new Thread(StartEngines);
      Thread resultsGatheringThread = new Thread(GetResults);
      starterThread.Start();
      resultsGatheringThread.Start();
    }

    public void Reset() {
      ResetConnection();
      lock(dictionaryLock) {
        foreach(Job j in waitingJobs) {
          j.waitHandle.Close();
        }
        waitingJobs.Clear();
        foreach(Job j in runningJobs) {
          j.waitHandle.Close();
        }
        runningJobs.Clear();
        results.Clear();
        erroredOperations.Clear();
      }
    }

    private void ResetConnection() {
      Trace.TraceInformation("Reset connection in JobManager");
      lock(connectionLock) {
        // open a new channel
        NetTcpBinding binding = new NetTcpBinding();
        binding.MaxReceivedMessageSize = 100000000; // 100 Mbytes
        binding.ReaderQuotas.MaxStringContentLength = 100000000; // also 100M chars
        binding.ReaderQuotas.MaxArrayLength = 100000000; // also 100M elements
        binding.Security.Mode = SecurityMode.None;

        factory = new ChannelFactory<IGridServer>(binding);
        server = factory.CreateChannel(new EndpointAddress(address));
      }
    }

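    // Starter thread: dequeues waiting jobs and tries to start them on the
    // grid server. Jobs that cannot be started are requeued up to MAX_RESTARTS
    // times; after that the operation is flagged as errored and the consumer
    // is released. Successfully started jobs move to the running queue.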
    public void StartEngines() {
      try {
        while(true) {
          Job job = null;
          lock(waitingQueueLock) {
            if(waitingJobs.Count > 0) job = waitingJobs.Dequeue();
          }
          if(job == null) waitingWaitHandle.WaitOne(); // no jobs waiting
          else {
            Guid currentEngineGuid = TryStartExecuteEngine(job.engine);
            if(currentEngineGuid == Guid.Empty) {
              // couldn't start the job -> requeue
              if(job.restarts < MAX_RESTARTS) {
                job.restarts++;
                lock(waitingQueueLock) waitingJobs.Enqueue(job);
                waitingWaitHandle.Set();
              } else {
                // max restart count reached -> give up on this job and flag error
                lock(dictionaryLock) {
                  erroredOperations.Add(job.engine.InitialOperation);
                  job.waitHandle.Set();
                }
              }
            } else {
              // job started successfully
              job.guid = currentEngineGuid;
              lock(runningQueueLock) {
                runningJobs.Enqueue(job);
                runningWaitHandle.Set();
              }
            }
          }
        }
      } catch(Exception e) {
        Trace.TraceError("Exception " + e + " in JobManager.StartEngines() killed the start-engine thread\n" + e.StackTrace);
      }
    }

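    // Results-gathering thread: polls the grid server for the result of each
    // running job. Finished results are stored in the results dictionary and
    // the consumer's wait handle is signaled; jobs the server no longer knows
    // about are sent back to the waiting queue for a restart.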
    public void GetResults() {
      try {
        while(true) {
          Job job = null;
          lock(runningQueueLock) {
            if(runningJobs.Count > 0) job = runningJobs.Dequeue();
          }
          if(job == null) runningWaitHandle.WaitOne(); // no jobs running
          else {
            byte[] zippedResult = TryEndExecuteEngine(server, job.guid);
            if(zippedResult != null) { // successful
              lock(dictionaryLock) {
                // store result
                results[job.engine.InitialOperation] = zippedResult;
                // notify consumer that result is ready
                job.waitHandle.Set();
              }
            } else {
              // there was a problem -> check the state of the job and restart if necessary
              JobState jobState = TryGetJobState(server, job.guid);
              if(jobState == JobState.Unknown) {
                job.restarts++;
                lock(waitingQueueLock) {
                  waitingJobs.Enqueue(job);
                  waitingWaitHandle.Set();
                }
              } else {
                // job still active at the server
                lock(runningQueueLock) {
                  runningJobs.Enqueue(job);
                  runningWaitHandle.Set();
                }
              }
            }
          }
        }
      } catch(Exception e) {
        Trace.TraceError("Exception " + e + " in JobManager.GetResults() killed the results-gathering thread\n" + e.StackTrace);
      }
    }

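    // Enqueues an engine for execution on the grid and returns a wait handle
    // that is signaled as soon as the result (or an error) is available.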
    public WaitHandle BeginExecuteOperation(IScope globalScope, AtomicOperation operation) {
      return BeginExecuteEngine(new ProcessingEngine(globalScope, operation));
    }

    public WaitHandle BeginExecuteEngine(ProcessingEngine engine) {
      Job job = new Job();
      job.engine = engine;
      job.waitHandle = new ManualResetEvent(false);
      job.restarts = 0;
      lock(waitingQueueLock) {
        waitingJobs.Enqueue(job);
      }
      waitingWaitHandle.Set();
      return job.waitHandle;
    }

    private byte[] ZipEngine(ProcessingEngine engine) {
      return PersistenceManager.SaveToGZip(engine);
    }

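    // Retrieves the result for a previously started operation. Throws a
    // JobExecutionException if the job was given up after MAX_RESTARTS failed
    // attempts.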
    public ProcessingEngine EndExecuteOperation(AtomicOperation operation) {
      if(erroredOperations.Contains(operation)) {
        erroredOperations.Remove(operation);
        throw new JobExecutionException("Maximal number of job restarts reached. There is a problem with the connection to the grid-server.");
      } else {
        byte[] zippedResult = null;
        lock(dictionaryLock) {
          zippedResult = results[operation];
          results.Remove(operation);
        }
        // restore the engine
        return (ProcessingEngine)PersistenceManager.RestoreFromGZip(zippedResult);
      }
    }

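    // The following Try* helpers wrap the WCF calls to the grid server and
    // retry up to MAX_CONNECTION_RETRIES times, waiting RETRY_TIMEOUT_SEC
    // seconds between attempts and re-opening the channel after communication
    // errors.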
    private Guid TryStartExecuteEngine(ProcessingEngine engine) {
      byte[] zippedEngine = ZipEngine(engine);
      int retries = 0;
      Guid guid = Guid.Empty;
      do {
        try {
          lock(connectionLock) {
            guid = server.BeginExecuteEngine(zippedEngine);
          }
          return guid;
        } catch(TimeoutException) {
          retries++;
          Thread.Sleep(TimeSpan.FromSeconds(RETRY_TIMEOUT_SEC));
        } catch(CommunicationException) {
          ResetConnection();
          retries++;
          Thread.Sleep(TimeSpan.FromSeconds(RETRY_TIMEOUT_SEC));
        }
      } while(retries < MAX_CONNECTION_RETRIES);
      Trace.TraceWarning("Reached max connection retries in TryStartExecuteEngine");
      return Guid.Empty;
    }

    private byte[] TryEndExecuteEngine(IGridServer server, Guid engineGuid) {
      int retries = 0;
      do {
        try {
          lock(connectionLock) {
            byte[] zippedResult = server.TryEndExecuteEngine(engineGuid, 100);
            return zippedResult;
          }
        } catch(TimeoutException) {
          retries++;
          Thread.Sleep(TimeSpan.FromSeconds(RETRY_TIMEOUT_SEC));
        } catch(CommunicationException) {
          ResetConnection();
          retries++;
          Thread.Sleep(TimeSpan.FromSeconds(RETRY_TIMEOUT_SEC));
        }
      } while(retries < MAX_CONNECTION_RETRIES);
      Trace.TraceWarning("Reached max connection retries in TryEndExecuteEngine");
      return null;
    }

    private JobState TryGetJobState(IGridServer server, Guid engineGuid) {
      // check if the server is still working on the job
      int retries = 0;
      do {
        try {
          lock(connectionLock) {
            JobState jobState = server.JobState(engineGuid);
            return jobState;
          }
        } catch(TimeoutException) {
          retries++;
          Thread.Sleep(TimeSpan.FromSeconds(RETRY_TIMEOUT_SEC));
        } catch(CommunicationException) {
          ResetConnection();
          retries++;
          Thread.Sleep(TimeSpan.FromSeconds(RETRY_TIMEOUT_SEC));
        }
      } while(retries < MAX_CONNECTION_RETRIES);
      Trace.TraceWarning("Reached max connection retries in TryGetJobState");
      return JobState.Unknown;
    }
  }
}