Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
09/17/09 10:15:56 (15 years ago)
Author:
gkronber
Message:

Fixed #750 (LinearRegressionOperator should first remove columns with many NaN values and after that remove the rows with NaN values).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.LinearRegression/3.2/LinearRegressionOperator.cs

    r2360 r2367  
    56 56      int minTimeOffset = minTimeOffsetData == null ? 0 : minTimeOffsetData.Data;
    57 57
    58         List<int> allowedRows = CalculateAllowedRows(dataset, targetVariable, start, end, minTimeOffset, maxTimeOffset);
    59 58      List<int> allowedColumns = CalculateAllowedColumns(dataset, targetVariable, start, end);
       59      List<int> allowedRows = CalculateAllowedRows(dataset, targetVariable, allowedColumns, start, end, minTimeOffset, maxTimeOffset);
    60 60
    61 61      double[,] inputMatrix = PrepareInputMatrix(dataset, allowedColumns, allowedRows, minTimeOffset, maxTimeOffset);
     
    110 110
    111 111    //returns list of valid row indexes (rows without NaN values)
    112        private List<int> CalculateAllowedRows(Dataset dataset, int targetVariable, int start, int end, int minTimeOffset, int maxTimeOffset) {
        112    private List<int> CalculateAllowedRows(Dataset dataset, int targetVariable, IList<int> allowedColumns, int start, int end, int minTimeOffset, int maxTimeOffset) {
    113 113      List<int> allowedRows = new List<int>();
    114 114      bool add;
    115 115      for (int row = start; row < end; row++) {
    116 116        add = true;
    117            for (int col = 0; col < dataset.Columns && add == true; col++) {
        117        for (int colIndex = 0; colIndex < allowedColumns.Count && add == true; colIndex++) {
    118 118          for (int timeOffset = minTimeOffset; timeOffset <= maxTimeOffset; timeOffset++) {
    119 119            if (
    120 120              row + timeOffset < 0 ||
    121 121              row + timeOffset > dataset.Rows ||
    122                  double.IsNaN(dataset.GetValue(row + timeOffset, col)) ||
        122              double.IsNaN(dataset.GetValue(row + timeOffset, allowedColumns[colIndex])) ||
        123              double.IsInfinity(dataset.GetValue(row + timeOffset, allowedColumns[colIndex])) ||
    123 124              double.IsNaN(dataset.GetValue(row + timeOffset, targetVariable))) {
    124 125              add = false;
     
    133 134    }
    134 135
    135        //returns list of valid column indexes (columns which contain at least one non-zero value)
        136    //returns list of valid column indexes (columns which contain max. 10% NaN (or infinity) and contain at least two different values)
    136 137    private List<int> CalculateAllowedColumns(Dataset dataset, int targetVariable, int start, int end) {
    137 138      List<int> allowedColumns = new List<int>();
        139      double n = end - start;
    138 140      for (int i = 0; i < dataset.Columns; i++) {
    139            if (i == targetVariable) continue;
    140            if (!dataset.GetMinimum(i, start, end).IsAlmost(0.0) ||
    141                !dataset.GetMaximum(i, start, end).IsAlmost(0.0))
        141        double nanRatio = CountNaN(dataset, i, start, end) / n;
        142        if (i != targetVariable && nanRatio < 0.1 && dataset.GetRange(i, start, end) > 0.0) {
    142 143          allowedColumns.Add(i);
        144        }
    143 145      }
    144 146      return allowedColumns;
    145 147    }
        148
        149    private double CountNaN(Dataset dataset, int column, int start, int end) {
        150      double n = 0;
        151      for (int i = start; i < end; i++) {
        152        if (double.IsNaN(dataset.GetValue(i, column)) || double.IsInfinity(dataset.GetValue(i, column)))
        153          n++;
        154      }
        155      return n;
        156    }
        157
    146 158
    147 159    private double[,] PrepareInputMatrix(Dataset dataset, List<int> allowedColumns, List<int> allowedRows, int minTimeOffset, int maxTimeOffset) {
Note: See TracChangeset for help on using the changeset viewer.