Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
05/11/11 17:59:18 (13 years ago)
Author:
mkommend
Message:

#1524: Excluded variables with more than 100 distinct values from the valid values of the target variable parameter in ClassificationProblemData.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationProblemData.cs

    r5847 r6186  
    3737    private const string ClassNamesParameterName = "ClassNames";
    3838    private const string ClassificationPenaltiesParameterName = "ClassificationPenalties";
     39    private const int MaximumNumberOfClass = 100;
     40    private const int InspectedRowsToDetermineTargets = 500;
    3941
    4042    #region default data
     
    252254    public ClassificationProblemData(Dataset dataset, IEnumerable<string> allowedInputVariables, string targetVariable)
    253255      : base(dataset, allowedInputVariables) {
    254       var variables = InputVariables.Select(x => x.AsReadOnly()).ToList();
    255       Parameters.Add(new ConstrainedValueParameter<StringValue>(TargetVariableParameterName, new ItemSet<StringValue>(variables), variables.Where(x => x.Value == targetVariable).First()));
     256      var validTargetVariableValues = CheckVariablesForPossibleTargetVariables(dataset).Select(x => new StringValue(x).AsReadOnly()).ToList();
     257      var target = validTargetVariableValues.Where(x => x.Value == targetVariable).DefaultIfEmpty(validTargetVariableValues.First()).First();
     258
     259      Parameters.Add(new ConstrainedValueParameter<StringValue>(TargetVariableParameterName, new ItemSet<StringValue>(validTargetVariableValues), target));
    256260      Parameters.Add(new FixedValueParameter<StringMatrix>(ClassNamesParameterName, ""));
    257261      Parameters.Add(new FixedValueParameter<DoubleMatrix>(ClassificationPenaltiesParameterName, ""));
     
    260264      RegisterParameterEvents();
    261265    }
     266
     267    private static IEnumerable<string> CheckVariablesForPossibleTargetVariables(Dataset dataset) {
     268      var validTargetVariables = from v in dataset.VariableNames
     269                                 let DistinctValues = dataset.Rows > InspectedRowsToDetermineTargets ? dataset.GetVariableValues(v, 0, InspectedRowsToDetermineTargets).Distinct().Count()
     270                                                                        : dataset.GetVariableValues(v).Distinct().Count()
     271                                 where DistinctValues < MaximumNumberOfClass
     272                                 select v;
     273
     274      if (!validTargetVariables.Any())
     275        throw new ArgumentException("Import of classification problem data was not successfull, because no target variable was found." +
     276          " A target variable must have at most " + MaximumNumberOfClass + " distinct values to be applicable to classification.");
     277      return validTargetVariables;
     278    }
     279
    262280
    263281    private void ResetTargetVariableDependentMembers() {
Note: See TracChangeset for help on using the changeset viewer.