Changeset 5786


Ignore:
Timestamp:
03/22/11 11:36:53 (11 years ago)
Author:
cneumuel
Message:

#1233

  • implemented correct numbering of BatchRuns
  • improvements in ExperimentManager
  • fixed bug in server (jobs were scheduled multiple times)
  • added exception handling for task in slave
  • improved timeout handling of jobs (LifecycleManager)
Location:
branches/HeuristicLab.Hive-3.4/sources
Files:
4 added
1 deleted
14 edited

Legend:

Unmodified
Added
Removed
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Clients.Hive.Slave/3.4/Core.cs

    r5780 r5786  
    5757    private int coreThreadId;
    5858
    59     private ISlaveCommunication ClientCom;
     59    private ISlaveCommunication clientCom;
    6060    private ServiceHost slaveComm;
    6161
     
    8484        slaveComm.Open();
    8585
    86         ClientCom = SlaveClientCom.Instance.ClientCom;
    87         ClientCom.LogMessage("Hive Slave started");
     86        clientCom = SlaveClientCom.Instance.ClientCom;
     87        clientCom.LogMessage("Hive Slave started");
    8888
    8989        ConfigManager manager = ConfigManager.Instance;
     
    139139
    140140    void WcfService_ExceptionOccured(object sender, EventArgs<Exception> e) {
    141       ClientCom.LogMessage("Connection to server interruped with exception: " + e.Value.Message);
     141      clientCom.LogMessage("Connection to server interruped with exception: " + e.Value.Message);
    142142    }
    143143
    144144    void WcfService_Connected(object sender, EventArgs e) {
    145       ClientCom.LogMessage("Connected successfully to Hive server");
     145      clientCom.LogMessage("Connected successfully to Hive server");
    146146    }
    147147
     
    151151    /// <param name="container">The Container, containing the message</param>
    152152    private void DetermineAction(MessageContainer container) {
    153       ClientCom.LogMessage("Message: " + container.Message.ToString() + " for job: " + container.JobId);
     153      clientCom.LogMessage("Message: " + container.Message.ToString() + " for job: " + container.JobId);
    154154
    155155      if (container is ExecutorMessageContainer<Guid>) {
     
    159159        switch (container.Message) {
    160160          case MessageContainer.MessageType.CalculateJob:
    161             Task.Factory.StartNew(() => {
    162               Job job = wcfService.GetJob(container.JobId);
     161            clientCom.LogMessage("Task.StartNew[0]: jobId: " + container.JobId);
     162            Task.Factory.StartNew((jobIdObj) => {
     163              Guid jobId = (Guid)jobIdObj;
     164              clientCom.LogMessage("Task.StartNew[1]: jobId: " + jobId);
     165              Job job = wcfService.GetJob(jobId);
     166              if (job == null) throw new JobNotFoundException(jobId);
    163167              lock (engines) {
    164168                if (!jobs.ContainsKey(job.Id)) {
     
    167171              }
    168172              JobData jobData = wcfService.GetJobData(job.Id);
     173              if (job == null) throw new JobDataNotFoundException(jobId);
    169174              job = wcfService.UpdateJobState(job.Id, JobState.Calculating, null);
    170175              StartJobInAppDomain(job, jobData);
    171             });
     176            }, container.JobId)
     177            .ContinueWith((t) => {
     178              // handle exception of task
     179              clientCom.LogMessage(t.Exception.ToString());
     180            }, TaskContinuationOptions.OnlyOnFaulted);
    172181            break;
    173182          case MessageContainer.MessageType.ShutdownSlave:
     
    203212        }
    204213      } else {
    205         ClientCom.LogMessage("Unknown MessageContainer: " + container);
     214        clientCom.LogMessage("Unknown MessageContainer: " + container);
    206215      }
    207216    }
     
    217226
    218227        try {
    219           ClientCom.LogMessage("Sending the paused job with id: " + job.Id);
     228          clientCom.LogMessage("Sending the paused job with id: " + job.Id);
    220229          wcfService.UpdateJobData(job, sJob, ConfigManager.Instance.GetClientInfo().Id, JobState.Paused);
    221230          SlaveStatusInfo.JobsProcessed++;    //TODO: count or not count, thats the question
    222231        }
    223232        catch (Exception e) {
    224           ClientCom.LogMessage("Transmitting to server failed. Storing the paused job with id: " + job.Id + " to hdd (" + e.ToString() + ")");
     233          clientCom.LogMessage("Transmitting to server failed. Storing the paused job with id: " + job.Id + " to hdd (" + e.ToString() + ")");
    225234        }
    226235        finally {
     
    240249
    241250        try {
    242           ClientCom.LogMessage("Sending the stoppped job with id: " + job.Id);
     251          clientCom.LogMessage("Sending the stoppped job with id: " + job.Id);
    243252          wcfService.UpdateJobData(job, sJob, ConfigManager.Instance.GetClientInfo().Id, JobState.Paused);
    244253          SlaveStatusInfo.JobsProcessed++;    //TODO: count or not count, thats the question
    245254        }
    246255        catch (Exception e) {
    247           ClientCom.LogMessage("Transmitting to server failed. Storing the paused job with id: " + job.Id + " to hdd (" + e.ToString() + ")");
     256          clientCom.LogMessage("Transmitting to server failed. Storing the paused job with id: " + job.Id + " to hdd (" + e.ToString() + ")");
    248257        }
    249258        finally {
     
    266275      }
    267276
    268       ClientCom.LogMessage("Aborted all jobs!");
     277      clientCom.LogMessage("Aborted all jobs!");
    269278    }
    270279
     
    273282    /// </summary>
    274283    private void DoPauseAll() {
    275       ClientCom.LogMessage("Pause all received");
     284      clientCom.LogMessage("Pause all received");
    276285
    277286      //copy guids because there will be removed items from 'Jobs'
     
    290299    /// </summary>
    291300    private void DoStopAll() {
    292       ClientCom.LogMessage("Stop all received");
     301      clientCom.LogMessage("Stop all received");
    293302
    294303      //copy guids because there will be removed items from 'Jobs'
     
    316325    /// </summary>
    317326    private void ShutdownCore() {
    318       ClientCom.LogMessage("Shutdown Signal received");
    319       ClientCom.LogMessage("Stopping heartbeat");
     327      clientCom.LogMessage("Shutdown Signal received");
     328      clientCom.LogMessage("Stopping heartbeat");
    320329      heartbeatManager.StopHeartBeat();
    321330      abortRequested = true;
    322       ClientCom.LogMessage("Logging out");
     331      clientCom.LogMessage("Logging out");
    323332
    324333
    325334      lock (engines) {
    326         ClientCom.LogMessage("engines locked");
     335        clientCom.LogMessage("engines locked");
    327336        foreach (KeyValuePair<Guid, AppDomain> kvp in appDomains) {
    328           ClientCom.LogMessage("Shutting down Appdomain for " + kvp.Key);
     337          clientCom.LogMessage("Shutting down Appdomain for " + kvp.Key);
    329338          appDomains[kvp.Key].UnhandledException -= new UnhandledExceptionEventHandler(AppDomain_UnhandledException);
    330339          AppDomain.Unload(kvp.Value);
     
    332341      }
    333342      WcfService.Instance.Disconnect();
    334       ClientCom.Shutdown();
     343      clientCom.Shutdown();
    335344      SlaveClientCom.Close();
    336345
     
    344353    /// </summary> 
    345354    private void DoStartSlave() {
    346       ClientCom.LogMessage("Restart received");
     355      clientCom.LogMessage("Restart received");
    347356      StartHeartbeats();
    348       ClientCom.LogMessage("Restart done");
     357      clientCom.LogMessage("Restart done");
    349358    }
    350359
     
    355364    //TODO: do we need an AbortSleep?
    356365    private void Sleep() {
    357       ClientCom.LogMessage("Sleep received");
     366      clientCom.LogMessage("Sleep received");
    358367      heartbeatManager.StopHeartBeat();
    359368      heartbeatManager = null;
    360369      DoStopAll();
    361370      WcfService.Instance.Disconnect();
    362       ClientCom.LogMessage("Sleep done");
     371      clientCom.LogMessage("Sleep done");
    363372    }
    364373
     
    371380    public void PauseWaitJob(JobData data) {
    372381      if (!Jobs.ContainsKey(data.JobId)) {
    373         ClientCom.LogMessage("Can't find job with id " + data.JobId);
     382        clientCom.LogMessage("Can't find job with id " + data.JobId);
    374383      } else {
    375384        Job job = Jobs[data.JobId];
     
    388397    public void SendFinishedJob(Guid jobId) {
    389398      try {
    390         ClientCom.LogMessage("Getting the finished job with id: " + jobId);
     399        clientCom.LogMessage("Getting the finished job with id: " + jobId);
    391400        if (!engines.ContainsKey(jobId)) {
    392           ClientCom.LogMessage("Engine doesn't exist");
     401          clientCom.LogMessage("Engine doesn't exist");
    393402          return;
    394403        }
    395404        if (!jobs.ContainsKey(jobId)) {
    396           ClientCom.LogMessage("Job doesn't exist");
     405          clientCom.LogMessage("Job doesn't exist");
    397406          return;
    398407        }
     
    405414
    406415        try {
    407           ClientCom.LogMessage("Sending the finished job with id: " + jobId);
     416          clientCom.LogMessage("Sending the finished job with id: " + jobId);
    408417          wcfService.UpdateJobData(cJob, sJob, ConfigManager.Instance.GetClientInfo().Id, JobState.Finished);
    409418          SlaveStatusInfo.JobsProcessed++;
    410419        }
    411420        catch (Exception e) {
    412           ClientCom.LogMessage("Transmitting to server failed. Storing the finished job with id: " + jobId + " to hdd (" + e.ToString() + ")");
     421          clientCom.LogMessage("Transmitting to server failed. Storing the finished job with id: " + jobId + " to hdd (" + e.ToString() + ")");
    413422        }
    414423        finally {
     
    428437    /// <param name="e"></param>
    429438    private void StartJobInAppDomain(Job myJob, JobData jobData) {
    430       ClientCom.LogMessage("Received new job with id " + myJob.Id);
     439      clientCom.LogMessage("Received new job with id " + myJob.Id);
    431440      String pluginDir = Path.Combine(PluginCache.Instance.PluginTempBaseDir, myJob.Id.ToString());
    432441      bool pluginsPrepared = false;
     
    435444      try {
    436445        PluginCache.Instance.PreparePlugins(myJob, out configFileName);
    437         ClientCom.LogMessage("Plugins fetched for job " + myJob.Id);
     446        clientCom.LogMessage("Plugins fetched for job " + myJob.Id);
    438447        pluginsPrepared = true;
    439448      }
    440449      catch (Exception exception) {
    441         ClientCom.LogMessage(string.Format("Copying plugins for job {0} failed: {1}", myJob.Id, exception));
     450        clientCom.LogMessage(string.Format("Copying plugins for job {0} failed: {1}", myJob.Id, exception));
    442451      }
    443452
     
    448457          lock (engines) {
    449458            appDomains.Add(myJob.Id, appDomain);
    450             ClientCom.LogMessage("Creating AppDomain");
     459            clientCom.LogMessage("Creating AppDomain");
    451460            Executor engine = (Executor)appDomain.CreateInstanceAndUnwrap(typeof(Executor).Assembly.GetName().Name, typeof(Executor).FullName);
    452             ClientCom.LogMessage("Created AppDomain");
     461            clientCom.LogMessage("Created AppDomain");
    453462            engine.JobId = myJob.Id;
    454463            engine.Core = this;
    455             ClientCom.LogMessage("Starting Engine for job " + myJob.Id);
     464            clientCom.LogMessage("Starting Engine for job " + myJob.Id);
    456465            engines.Add(myJob.Id, engine);
    457466            engine.Start(jobData.Data);
    458467            SlaveStatusInfo.JobsFetched++;
    459             ClientCom.LogMessage("Increment FetchedJobs to:" + SlaveStatusInfo.JobsFetched);
     468            clientCom.LogMessage("Increment FetchedJobs to:" + SlaveStatusInfo.JobsFetched);
    460469          }
    461470        }
    462471        catch (Exception exception) {
    463           ClientCom.LogMessage("Creating the Appdomain and loading the job failed for job " + myJob.Id);
    464           ClientCom.LogMessage("Error thrown is: " + exception.ToString());
     472          clientCom.LogMessage("Creating the Appdomain and loading the job failed for job " + myJob.Id);
     473          clientCom.LogMessage("Error thrown is: " + exception.ToString());
    465474          KillAppDomain(myJob.Id);
    466475        }
     
    471480    public event EventHandler<EventArgs<Exception>> ExceptionOccured;
    472481    private void OnExceptionOccured(Exception e) {
    473       ClientCom.LogMessage("Error: " + e.ToString());
     482      clientCom.LogMessage("Error: " + e.ToString());
    474483      var handler = ExceptionOccured;
    475484      if (handler != null) handler(this, new EventArgs<Exception>(e));
     
    477486
    478487    private void AppDomain_UnhandledException(object sender, UnhandledExceptionEventArgs e) {
    479       ClientCom.LogMessage("Exception in AppDomain: " + e.ExceptionObject.ToString());
     488      clientCom.LogMessage("Exception in AppDomain: " + e.ExceptionObject.ToString());
    480489      KillAppDomain(new Guid(e.ExceptionObject.ToString()));
    481490    }
     
    508517      }
    509518
    510       ClientCom.LogMessage("Shutting down Appdomain for Job " + id);
     519      clientCom.LogMessage("Shutting down Appdomain for Job " + id);
    511520      lock (engines) {
    512521        try {
     
    526535              }
    527536              catch (CannotUnloadAppDomainException) {
    528                 ClientCom.LogMessage("Could not unload AppDomain, will try again in 1 sec.");
     537                clientCom.LogMessage("Could not unload AppDomain, will try again in 1 sec.");
    529538                Thread.Sleep(1000);
    530539                repeat--;
     
    542551        }
    543552        catch (Exception ex) {
    544           ClientCom.LogMessage("Exception when unloading the appdomain: " + ex.ToString());
     553          clientCom.LogMessage("Exception when unloading the appdomain: " + ex.ToString());
    545554        }
    546555      }
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Clients.Hive.Slave/3.4/HeuristicLab.Clients.Hive.Slave-3.4.csproj

    r5721 r5786  
    9898  <ItemGroup>
    9999    <Compile Include="ConfigManager.cs" />
     100    <Compile Include="Exceptions\JobNotFoundException.cs" />
     101    <Compile Include="Exceptions\JobNotDataFoundException.cs" />
    100102    <Compile Include="SlaveClientCom.cs" />
    101103    <Compile Include="Core.cs" />
     
    103105    <Compile Include="Executor.cs" />
    104106    <Compile Include="HeartbeatManager.cs" />
    105     <Compile Include="InvalidStateException.cs" />
     107    <Compile Include="Exceptions\InvalidStateException.cs" />
    106108    <Compile Include="JobStatus.cs" />
    107109    <Compile Include="MessageQueue.cs" />
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Clients.Hive/3.4/ExperimentManager/HiveExperimentClient.cs

    r5718 r5786  
    2222using System;
    2323using System.Collections.Generic;
     24using System.Configuration;
     25using System.IO;
    2426using System.Linq;
    2527using System.Threading;
    2628using HeuristicLab.Clients.Hive.Jobs;
     29using HeuristicLab.Collections;
    2730using HeuristicLab.Common;
    2831using HeuristicLab.Core;
    2932using HeuristicLab.Optimization;
     33using HeuristicLab.PluginInfrastructure;
    3034
    3135namespace HeuristicLab.Clients.Hive {
    32   using System.Configuration;
    33   using System.IO;
    34   using HeuristicLab.Collections;
    35   using HeuristicLab.PluginInfrastructure;
    36 
    3736  /// <summary>
    3837  /// An experiment which contains multiple batch runs of algorithms.
     
    378377
    379378    #region HiveJob Events
    380     void HiveJob_JobStateChanged(object sender, EventArgs e) {
     379    private void HiveJob_JobStateChanged(object sender, EventArgs e) {
    381380      if (HiveJob != null) {
    382381        rootJobId = HiveJob.Job.Id;
     
    520519        HiveJob hj = hiveJob.GetHiveJobByJobId(lightweightJob.Id);
    521520        if (hj != null) {
     521          DateTime lastJobDataUpdate = hj.Job.LastJobDataUpdate;
    522522          hj.UpdateFromLightweightJob(lightweightJob);
    523           if ((hj.Job.State == JobState.Aborted ||
    524                hj.Job.State == JobState.Failed ||
    525                hj.Job.State == JobState.Finished) &&
    526               !hj.IsFinishedOptimizerDownloaded) {
     523
     524          // lastJobDataUpdate equals DateTime.MinValue right after it was uploaded. When the first results are polled, this value is updated
     525          if (lastJobDataUpdate != DateTime.MinValue && lastJobDataUpdate < hj.Job.LastJobDataUpdate) {
    527526            LogMessage(hj.Job.Id, "Downloading optimizer for job");
    528527            OptimizerJob optimizerJob = LoadOptimizerJob(hj.Job.Id);
    529528            if (optimizerJob == null) {
    530529              // something bad happened to this job. set to finished to allow the rest beeing downloaded
    531               hj.IsFinishedOptimizerDownloaded = true;
     530              //hj.IsFinishedOptimizerDownloaded = true;
    532531            } else {
    533               if (lightweightJob.ParentJobId.HasValue) {
    534                 HiveJob parentHiveJob = HiveJob.GetHiveJobByJobId(lightweightJob.ParentJobId.Value);
    535                 parentHiveJob.UpdateChildOptimizer(optimizerJob, hj.Job.Id);
     532              // if the job is paused, download but don't integrate into parent optimizer (to avoid Prepare)
     533              if (hj.Job.State == JobState.Paused) {
     534               
    536535              } else {
    537                 this.HiveJob.IsFinishedOptimizerDownloaded = true;
     536                if (lightweightJob.ParentJobId.HasValue) {
     537                  HiveJob parentHiveJob = HiveJob.GetHiveJobByJobId(lightweightJob.ParentJobId.Value);
     538                  parentHiveJob.UpdateChildOptimizer(optimizerJob, hj.Job.Id);
     539                } else {
     540                  //this.HiveJob.IsFinishedOptimizerDownloaded = true;
     541                }
    538542              }
    539543            }
     
    550554
    551555    private bool AllJobsFinished() {
    552       return HiveJob.GetAllHiveJobs().All(hj => hj.IsFinishedOptimizerDownloaded);
     556      //return HiveJob.GetAllHiveJobs().All(hj => hj.IsFinishedOptimizerDownloaded);
     557      return HiveJob.GetAllHiveJobs().All(j => j.Job.State == JobState.Finished
     558                                            || j.Job.State == JobState.Aborted
     559                                            || j.Job.State == JobState.Failed);
    553560    }
    554561
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Clients.Hive/3.4/ExperimentManager/HiveJobClient.cs

    r5779 r5786  
    7777          DergisterOptimizerEvents();
    7878          optimizerJob = value;
    79           if (optimizerJob.ExecutionState == ExecutionState.Stopped) {
    80             IsFinishedOptimizerDownloaded = true;
    81           }
     79          //if (optimizerJob.ExecutionState == ExecutionState.Stopped) {
     80          //  IsFinishedOptimizerDownloaded = true;
     81          //}
    8282          RegisterOptimizerEvents();
    8383          OnOptimizerJobChanged();
     
    9191    }
    9292
    93     private bool isFinishedOptimizerDownloaded;
    94     public bool IsFinishedOptimizerDownloaded {
    95       get { return isFinishedOptimizerDownloaded; }
    96       set {
    97         if (isFinishedOptimizerDownloaded != value) {
    98           isFinishedOptimizerDownloaded = value;
    99           OnIsFinishedOptimizerDownloadedChanged();
    100         }
    101       }
    102     }
     93    //private bool isFinishedOptimizerDownloaded;
     94    //public bool IsFinishedOptimizerDownloaded {
     95    //  get { return isFinishedOptimizerDownloaded; }
     96    //  set {
     97    //    if (isFinishedOptimizerDownloaded != value) {
     98    //      isFinishedOptimizerDownloaded = value;
     99    //      OnIsFinishedOptimizerDownloadedChanged();
     100    //    }
     101    //  }
     102    //}
    103103
    104104    private bool syncJobsWithOptimizers = true;
     
    331331      }
    332332      if (childIsFinishedOptimizerDownloaded) {
    333         child.IsFinishedOptimizerDownloaded = true;
     333        //child.IsFinishedOptimizerDownloaded = true; // todo: clean up with childIsFinishedOptimizerDownloaded
    334334      }
    335335      syncJobsWithOptimizers = true;
     
    345345      }
    346346      foreach (IRun run in optimizerJob.Optimizer.Runs) {
    347         if (!batchRun.Runs.Contains(run))
     347        if (!batchRun.Runs.Contains(run)) {
     348          run.Name = GetNewRunName(run, batchRun.Runs);
    348349          batchRun.Runs.Add(run);
     350        }
     351      }
     352    }
     353
     354    /// <summary>
     355    /// Parses the run numbers out of runs and renames the run to the next number
     356    /// </summary>
     357    private static string GetNewRunName(IRun run, RunCollection runs) {
     358      int idx = run.Name.IndexOf("Run ") + 4;
     359
     360      if (idx == -1 || runs.Count == 0)
     361        return run.Name;
     362
     363      int maxRunNumber = int.MinValue;
     364      foreach (IRun r in runs) {
     365        int number = GetRunNumber(r.Name);
     366        maxRunNumber = Math.Max(maxRunNumber, number);
     367      }
     368
     369      return run.Name.Substring(0, idx) + (maxRunNumber + 1).ToString();
     370    }
     371
     372    /// <summary>
     373    /// Parses the number of a Run out of its name. Example "Genetic Algorithm Run 3" -> 3
     374    /// </summary>
     375    private static int GetRunNumber(string runName) {
     376      int idx = runName.IndexOf("Run ") + 4;
     377      if (idx == -1) {
     378        return 0;
     379      } else {
     380        return int.Parse(runName.Substring(idx, runName.Length - idx));
    349381      }
    350382    }
     
    403435      if (lightweightJob != null) {
    404436        job.Id = lightweightJob.Id;
    405         job.Id = lightweightJob.Id;
     437        job.ParentJobId = lightweightJob.ParentJobId;
    406438        job.ExecutionTime = lightweightJob.ExecutionTime;
    407439        job.State = lightweightJob.State;
    408440        job.StateLog = new List<StateLog>(lightweightJob.StateLog);
    409         // what about parentJob
     441        job.Command = lightweightJob.Command;
     442        job.LastJobDataUpdate = lightweightJob.LastJobDataUpdate;
     443       
    410444        OnJobStateChanged();
    411445        OnToStringChanged();
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Clients.Hive/3.4/ServiceClients/HiveServiceClient.cs

    r5779 r5786  
    9494       
    9595        [System.Runtime.Serialization.OptionalFieldAttribute()]
     96        private System.DateTime LastJobDataUpdateField;
     97       
     98        [System.Runtime.Serialization.OptionalFieldAttribute()]
    9699        private System.Nullable<System.Guid> ParentJobIdField;
    97100       
     
    132135                    this.ExecutionTimeField = value;
    133136                    this.RaisePropertyChanged("ExecutionTime");
     137                }
     138            }
     139        }
     140       
     141        [System.Runtime.Serialization.DataMemberAttribute()]
     142        public System.DateTime LastJobDataUpdate
     143        {
     144            get
     145            {
     146                return this.LastJobDataUpdateField;
     147            }
     148            set
     149            {
     150                if ((this.LastJobDataUpdateField.Equals(value) != true))
     151                {
     152                    this.LastJobDataUpdateField = value;
     153                    this.RaisePropertyChanged("LastJobDataUpdate");
    134154                }
    135155            }
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Clients.Hive/3.4/ServiceClients/LightweightJob.cs

    r5779 r5786  
    4343      this.State = job.State;
    4444      this.Command = job.Command;
     45      this.LastJobDataUpdate = job.LastJobDataUpdate;
    4546    }
    4647
     
    5253      this.State = original.State;
    5354      this.Command = original.Command;
     55      this.LastJobDataUpdate = original.LastJobDataUpdate;
    5456    }
    5557    public override IDeepCloneable Clone(Cloner cloner) {
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Services.Hive.Common/3.4/ApplicationConstants.cs

    r5779 r5786  
    3333    public static System.Transactions.IsolationLevel IsolationLevelScope = System.Transactions.IsolationLevel.ReadUncommitted;
    3434
    35     public static int HeartbeatTimeout = 120; // value in seconds
     35    public static TimeSpan SlaveHeartbeatTimeout = TimeSpan.FromMinutes(1);
     36
     37    public static TimeSpan CalculatingJobHeartbeatTimeout = TimeSpan.FromMinutes(1);
     38
     39    public static TimeSpan TransferringJobHeartbeatTimeout = TimeSpan.FromMinutes(5);
    3640
    3741    /// <summary>
    3842    /// Interval in which the HL.HiveExperiment will poll results from server
    3943    /// </summary>
    40     public static TimeSpan ResultPollingInterval = new TimeSpan(0, 0, 5);
     44    public static TimeSpan ResultPollingInterval = TimeSpan.FromSeconds(5);
    4145
    4246    /// <summary>
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Services.Hive.Common/3.4/DataTransfer/LightweightJob.cs

    r5779 r5786  
    3939    [DataMember]
    4040    public Command? Command { get; set; }
     41    [DataMember]
     42    public DateTime LastJobDataUpdate { get; set; }
    4143
    4244    public StateLog CurrentStateLog { get { return StateLog.LastOrDefault(); } }
     
    5557      this.State = job.State;
    5658      this.Command = job.Command;
     59      this.LastJobDataUpdate = job.LastJobDataUpdate;
    5760    }
    5861  }
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Services.Hive.DataAccess/3.4/Convert.cs

    r5779 r5786  
    4343        IsParentJob = source.IsParentJob,
    4444        FinishWhenChildJobsFinished = source.FinishWhenChildJobsFinished,
    45         Command = source.Command
     45        Command = source.Command,
     46        LastJobDataUpdate = source.JobData.LastUpdate
    4647      };
    4748    }
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Services.Hive.DataAccess/3.4/HiveDao.cs

    r5779 r5786  
    3535    }
    3636
    37     public HiveDao() {
    38     }
     37    public HiveDao() { }
    3938
    4039    #region Job Methods
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Services.Hive.Tests/DaoTests.cs

    r5779 r5786  
    3232      job1.Command = Command.Pause;
    3333
     34      DT.JobData jobData1 = new DT.JobData();
     35      jobData1.Data = new byte[] { 0, 1, 2, 3, 4, 5 };
     36      jobData1.LastUpdate = DateTime.Now;
     37
    3438      DT.Plugin plugin1 = new DT.Plugin();
    3539      plugin1.Name = "Tests.MyPlugin";
     
    5155
    5256      job1.Id = dao.AddJob(job1);
     57      jobData1.JobId = job1.Id;
     58      dao.AddJobData(jobData1);
    5359
    5460      DT.Job job1loaded = dao.GetJob(job1.Id);
     
    6066      Assert.AreEqual(job1.StateLog.Count, job1loaded.StateLog.Count);
    6167      Assert.AreEqual(job1.Command, job1loaded.Command);
     68      Assert.IsTrue(Math.Abs((job1loaded.LastJobDataUpdate - jobData1.LastUpdate).TotalSeconds) < 1);
    6269      for (int i = 0; i < job1.StateLog.Count; i++) {
    6370        Assert.AreEqual(job1.Id, job1loaded.StateLog[i].JobId);
     
    8895      }
    8996
     97      DT.JobData jobData1Loaded = dao.GetJobData(job1.Id);
     98      Assert.AreEqual(jobData1.JobId, jobData1Loaded.JobId);
     99      Assert.IsTrue(Math.Abs((jobData1.LastUpdate - jobData1Loaded.LastUpdate).TotalSeconds) < 1);
     100      Assert.IsTrue(jobData1.Data.SequenceEqual(jobData1Loaded.Data));
     101     
    90102      dao.DeleteJob(job1.Id);
    91103
    92104      Assert.AreEqual(null, dao.GetJob(job1.Id));
     105      Assert.AreEqual(null, dao.GetJobData(job1.Id));
    93106    }
    94107
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Services.Hive/3.4/HeartbeatManager.cs

    r5779 r5786  
    5252
    5353    private void AssignJob(Slave slave, Job job) {
    54       dao.UpdateJobState(job.Id, JobState.Transferring, slave.Id, null, null);
     54      job = dao.UpdateJobState(job.Id, JobState.Transferring, slave.Id, null, null);
    5555      dao.UpdateSlave(slave);
     56
     57      // from now on the job has some time to send the next heartbeat (ApplicationConstants.TransferringJobHeartbeatTimeout)
     58      job.LastHeartbeat = DateTime.Now;
     59      dao.UpdateJob(job);
    5660    }
    5761
  • branches/HeuristicLab.Hive-3.4/sources/HeuristicLab.Services.Hive/3.4/LifecycleManager.cs

    r5779 r5786  
    2525      log.Log("LifecycleManager.Cleanup()");
    2626      SetTimeoutSlavesOffline();
    27       FinishParentJobs();
     27      SetTimeoutJobsWaiting();
     28      FinishParentJobs();     
    2829    }
    2930
     
    3435      var slaves = dao.GetSlaves(x => x.SlaveState != SlaveState.Offline);
    3536      foreach (Slave slave in slaves) {
    36         if (!slave.LastHeartbeat.HasValue || (DateTime.Now - slave.LastHeartbeat.Value).TotalSeconds > ApplicationConstants.HeartbeatTimeout) {
     37        if (!slave.LastHeartbeat.HasValue || (DateTime.Now - slave.LastHeartbeat.Value) > ApplicationConstants.SlaveHeartbeatTimeout) {
    3738          slave.SlaveState = SlaveState.Offline;
    3839          SetJobsWaiting(slave.Id);
     
    5556      var jobs = dao.GetJobs(x => x.State == JobState.Calculating).Where(x => x.StateLog.Last().SlaveId == slaveId);
    5657      foreach (var j in jobs) {
    57         Job job = dao.UpdateJobState(j.Id, JobState.Waiting, slaveId, null, "Slave timed out");
     58        Job job = dao.UpdateJobState(j.Id, JobState.Waiting, slaveId, null, "Slave timed out.");
    5859        job.Command = null;
    5960        dao.UpdateJob(job);
     
    6162    }
    6263
     64    /// <summary>
     65    /// Looks for jobs which have not sent heartbeats for some time and reschedules them for calculation
     66    /// </summary>
     67    private void SetTimeoutJobsWaiting() {
     68      var jobs = dao.GetJobs(x => (x.State == JobState.Calculating && (DateTime.Now - x.LastHeartbeat) > ApplicationConstants.CalculatingJobHeartbeatTimeout)
     69                               || (x.State == JobState.Transferring && (DateTime.Now - x.LastHeartbeat) > ApplicationConstants.TransferringJobHeartbeatTimeout));
     70      foreach (var j in jobs) {
     71        Job job = dao.UpdateJobState(j.Id, JobState.Waiting, null, null, "Slave timed out.");
     72        job.Command = null;
     73        dao.UpdateJob(job);
     74      }
     75    }
    6376  }
    6477}
  • branches/HeuristicLab.Hive-3.4/sources/MergeConfigs.cmd

    r5095 r5786  
    11copy "%SolutionDir%HeuristicLab.Hive 3.4.dll.config" "%TargetDir%"
    22
    3 %SolutionDir%ConfigMerger "%SolutionDir%HeuristicLab.Services.Hive\3.4\app.config" "%TargetDir%HeuristicLab.Hive 3.4.dll.config"
    43%SolutionDir%ConfigMerger "%SolutionDir%HeuristicLab.Services.Hive.DataAccess\3.4\app.config" "%TargetDir%HeuristicLab.Hive 3.4.dll.config"
    54%SolutionDir%ConfigMerger "%SolutionDir%HeuristicLab.Clients.Hive\3.4\app.config" "%TargetDir%HeuristicLab.Hive 3.4.dll.config"
Note: See TracChangeset for help on using the changeset viewer.