Free cookie consent management tool by TermsFeed Policy Generator

Changeset 2367


Ignore:
Timestamp:
09/17/09 10:15:56 (15 years ago)
Author:
gkronber
Message:

Fixed #750 (LinearRegressionOperator should first remove columns with many NaN values and after that remove the rows with NaN values).

Location:
trunk/sources
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Common/3.2/HeuristicLab.Common-3.2.csproj

    r2345 r2367  
    47 47    <DebugType>pdbonly</DebugType>
    48 48    <PlatformTarget>x86</PlatformTarget>
     49    <ErrorReport>prompt</ErrorReport>
     50  </PropertyGroup>
     51  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
     52    <DebugSymbols>true</DebugSymbols>
     53    <OutputPath>bin\x64\Debug\</OutputPath>
     54    <DefineConstants>DEBUG;TRACE</DefineConstants>
     55    <DebugType>full</DebugType>
     56    <PlatformTarget>x64</PlatformTarget>
     57    <ErrorReport>prompt</ErrorReport>
     58  </PropertyGroup>
     59  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
     60    <OutputPath>bin\x64\Release\</OutputPath>
     61    <DefineConstants>TRACE</DefineConstants>
     62    <Optimize>true</Optimize>
     63    <DebugType>pdbonly</DebugType>
     64    <PlatformTarget>x64</PlatformTarget>
    49 65    <ErrorReport>prompt</ErrorReport>
    50 66  </PropertyGroup>
  • trunk/sources/HeuristicLab.DataAnalysis/3.2/Dataset.cs

    r2319 r2367  
    34 34    private Dictionary<int, Dictionary<int, double>>[] cachedRanges;
    35 35    private bool cachedValuesInvalidated = true;
    36    
     36
    37 37    public Dataset()
    38 38      : this(new double[,] { { 0.0 } }) {
     
    128 128
    129 129    #region Modify and get values
    130     public double GetValue(int i, int j) {
    131       return samples[columns * i + j];
     130    public double GetValue(int row, int column) {
     131      return samples[columns * row + column];
    132 132    }
    133 133
     
    259 259        }
    260 260        double range = Statistics.Range(values);
    261         if (!cachedRanges[column].ContainsKey(start)) cachedRanges[column][start]= new Dictionary<int, double>();
     261        if (!cachedRanges[column].ContainsKey(start)) cachedRanges[column][start] = new Dictionary<int, double>();
    262 262        cachedRanges[column][start][end] = range;
    263 263        return range;
  • trunk/sources/HeuristicLab.LinearRegression/3.2/LinearRegressionOperator.cs

    r2360 r2367  
    56 56      int minTimeOffset = minTimeOffsetData == null ? 0 : minTimeOffsetData.Data;
    57 57
    58       List<int> allowedRows = CalculateAllowedRows(dataset, targetVariable, start, end, minTimeOffset, maxTimeOffset);
    59 58      List<int> allowedColumns = CalculateAllowedColumns(dataset, targetVariable, start, end);
     59      List<int> allowedRows = CalculateAllowedRows(dataset, targetVariable, allowedColumns, start, end, minTimeOffset, maxTimeOffset);
    60 60
    61 61      double[,] inputMatrix = PrepareInputMatrix(dataset, allowedColumns, allowedRows, minTimeOffset, maxTimeOffset);
     
    110 110
    111 111    //returns list of valid row indexes (rows without NaN values)
    112     private List<int> CalculateAllowedRows(Dataset dataset, int targetVariable, int start, int end, int minTimeOffset, int maxTimeOffset) {
     112    private List<int> CalculateAllowedRows(Dataset dataset, int targetVariable, IList<int> allowedColumns, int start, int end, int minTimeOffset, int maxTimeOffset) {
    113 113      List<int> allowedRows = new List<int>();
    114 114      bool add;
    115 115      for (int row = start; row < end; row++) {
    116 116        add = true;
    117         for (int col = 0; col < dataset.Columns && add == true; col++) {
     117        for (int colIndex = 0; colIndex < allowedColumns.Count && add == true; colIndex++) {
    118 118          for (int timeOffset = minTimeOffset; timeOffset <= maxTimeOffset; timeOffset++) {
    119 119            if (
    120 120              row + timeOffset < 0 ||
    121 121              row + timeOffset > dataset.Rows ||
    122               double.IsNaN(dataset.GetValue(row + timeOffset, col)) ||
     122              double.IsNaN(dataset.GetValue(row + timeOffset, allowedColumns[colIndex])) ||
     123              double.IsInfinity(dataset.GetValue(row + timeOffset, allowedColumns[colIndex])) ||
    123 124              double.IsNaN(dataset.GetValue(row + timeOffset, targetVariable))) {
    124 125              add = false;
     
    133 134    }
    134 135
    135     //returns list of valid column indexes (columns which contain at least one non-zero value)
     136    //returns list of valid column indexes (columns which contain max. 10% NaN (or infinity) and contain at least two different values)
    136 137    private List<int> CalculateAllowedColumns(Dataset dataset, int targetVariable, int start, int end) {
    137 138      List<int> allowedColumns = new List<int>();
     139      double n = end - start;
    138 140      for (int i = 0; i < dataset.Columns; i++) {
    139         if (i == targetVariable) continue;
    140         if (!dataset.GetMinimum(i, start, end).IsAlmost(0.0) ||
    141             !dataset.GetMaximum(i, start, end).IsAlmost(0.0))
     141        double nanRatio = CountNaN(dataset, i, start, end) / n;
     142        if (i != targetVariable && nanRatio < 0.1 && dataset.GetRange(i, start, end) > 0.0) {
    142 143          allowedColumns.Add(i);
     144        }
    143 145      }
    144 146      return allowedColumns;
    145 147    }
     148
     149    private double CountNaN(Dataset dataset, int column, int start, int end) {
     150      double n = 0;
     151      for (int i = start; i < end; i++) {
     152        if (double.IsNaN(dataset.GetValue(i, column)) || double.IsInfinity(dataset.GetValue(i, column)))
     153          n++;
     154      }
     155      return n;
     156    }
     157
    146 158
    147 159    private double[,] PrepareInputMatrix(Dataset dataset, List<int> allowedColumns, List<int> allowedRows, int minTimeOffset, int maxTimeOffset) {
  • trunk/sources/HeuristicLab.sln

    r2363 r2367  
    4015 4015    {1FC004FC-59AF-4249-B1B6-FF25873A20E4}.Release|Any CPU.ActiveCfg = Release|Any CPU
    4016 4016    {1FC004FC-59AF-4249-B1B6-FF25873A20E4}.Release|Any CPU.Build.0 = Release|Any CPU
    4017     {1FC004FC-59AF-4249-B1B6-FF25873A20E4}.Release|x64.ActiveCfg = Release|x86
     4017    {1FC004FC-59AF-4249-B1B6-FF25873A20E4}.Release|x64.ActiveCfg = Release|x64
     4018    {1FC004FC-59AF-4249-B1B6-FF25873A20E4}.Release|x64.Build.0 = Release|x64
    4018 4019    {1FC004FC-59AF-4249-B1B6-FF25873A20E4}.Release|x86.ActiveCfg = Release|x86
    4019 4020    {1FC004FC-59AF-4249-B1B6-FF25873A20E4}.Release|x86.Build.0 = Release|x86
     
    4047 4048    {3127719F-110E-4558-8845-98559DBB422D}.Release|Any CPU.Build.0 = Release|Any CPU
    4048 4049    {3127719F-110E-4558-8845-98559DBB422D}.Release|x64.ActiveCfg = Release|x64
     4050    {3127719F-110E-4558-8845-98559DBB422D}.Release|x64.Build.0 = Release|x64
    4049 4051    {3127719F-110E-4558-8845-98559DBB422D}.Release|x86.ActiveCfg = Release|x86
    4050 4052    {3127719F-110E-4558-8845-98559DBB422D}.Release|x86.Build.0 = Release|x86
Note: See TracChangeset for help on using the changeset viewer.