#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;

namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Computes per-column and per-row statistics (counts, extrema, central
  /// tendency, dispersion, quantiles) over preprocessing data, delegating the
  /// actual data access to an <see cref="ITransactionalPreprocessingData"/>
  /// and a <see cref="SearchLogic"/>.
  /// </summary>
  public class StatisticsLogic {

    private readonly ITransactionalPreprocessingData preprocessingData;
    private readonly SearchLogic searchLogic;

    /// <summary>
    /// Creates a statistics helper working on the given data and search logic.
    /// </summary>
    /// <param name="thePreprocessingData">Data whose columns are analyzed.</param>
    /// <param name="theSearchLogic">Search logic used for row/column counts, missing values and NaN-free values.</param>
    public StatisticsLogic(ITransactionalPreprocessingData thePreprocessingData, SearchLogic theSearchLogic) {
      preprocessingData = thePreprocessingData;
      searchLogic = theSearchLogic;
    }

    /// <summary>Returns the number of columns reported by the search logic.</summary>
    public int GetColumnCount() {
      return searchLogic.Columns;
    }

    /// <summary>Returns the number of rows reported by the search logic.</summary>
    public int GetRowCount() {
      return searchLogic.Rows;
    }

    /// <summary>Counts the columns whose variable type is <see cref="double"/>.</summary>
    public int GetNumericColumnCount() {
      int count = 0;

      for (int i = 0; i < searchLogic.Columns; ++i) {
        if (preprocessingData.VariableHasType<double>(i)) {
          ++count;
        }
      }
      return count;
    }

    /// <summary>
    /// Returns the number of columns that are not counted by
    /// <see cref="GetNumericColumnCount"/>.
    /// </summary>
    public int GetNominalColumnCount() {
      return searchLogic.Columns - GetNumericColumnCount();
    }

    /// <summary>Returns the number of missing values summed over all columns.</summary>
    public int GetMissingValueCount() {
      int count = 0;
      for (int i = 0; i < searchLogic.Columns; ++i) {
        count += GetMissingValueCount(i);
      }
      return count;
    }

    /// <summary>Returns the number of missing-value indices the search logic reports for one column.</summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public int GetMissingValueCount(int columnIndex) {
      return searchLogic.GetMissingValueIndices(columnIndex).Count();
    }

    /// <summary>
    /// Returns the smallest value of a column of type <typeparamref name="T"/>.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">Returned when the column is not of type <typeparamref name="T"/> or has no values.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public T GetMin<T>(int columnIndex, T defaultValue, bool considerSelection = false) where T : IComparable {
      var min = defaultValue;
      if (preprocessingData.VariableHasType<T>(columnIndex)) {
        var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection);
        if (values.Any()) {
          min = values.Min();
        }
      }
      return min;
    }

    /// <summary>
    /// Returns the largest value of a column of type <typeparamref name="T"/>.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">Returned when the column is not of type <typeparamref name="T"/> or has no values.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public T GetMax<T>(int columnIndex, T defaultValue, bool considerSelection = false) where T : IComparable {
      var max = defaultValue;
      if (preprocessingData.VariableHasType<T>(columnIndex)) {
        var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection);
        if (values.Any()) {
          max = values.Max();
        }
      }
      return max;
    }

    /// <summary>
    /// Returns the median of a <see cref="double"/> column, or
    /// <see cref="double.NaN"/> when the column is not numeric or has no values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public double GetMedian(int columnIndex, bool considerSelection = false) {
      double median = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        var values = GetValuesWithoutNaN<double>(columnIndex, considerSelection);
        if (values.Any()) {
          median = values.Median();
        }
      }
      return median;
    }

    /// <summary>
    /// Returns the arithmetic mean of a <see cref="double"/> column, or
    /// <see cref="double.NaN"/> when the column is not numeric or has no values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public double GetAverage(int columnIndex, bool considerSelection = false) {
      double avg = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        var values = GetValuesWithoutNaN<double>(columnIndex, considerSelection);
        if (values.Any()) {
          avg = values.Average();
        }
      }
      return avg;
    }

    /// <summary>
    /// Returns the median of a <see cref="DateTime"/> column, computed on the
    /// values expressed in seconds. Returns <c>new DateTime()</c> when the
    /// column is not a <see cref="DateTime"/> column or has no values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public DateTime GetMedianDateTime(int columnIndex, bool considerSelection = false) {
      DateTime median = new DateTime();
      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        var seconds = GetDateTimeAsSeconds(columnIndex, considerSelection);
        // Same empty-column guard as GetMedian: keep the default instead of
        // asking for the median of nothing.
        if (seconds.Any()) {
          median = GetSecondsAsDateTime(seconds.Median());
        }
      }
      return median;
    }

    /// <summary>
    /// Returns the mean of a <see cref="DateTime"/> column, computed on the
    /// values expressed in seconds. Returns <c>new DateTime()</c> when the
    /// column is not a <see cref="DateTime"/> column or has no values.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public DateTime GetAverageDateTime(int columnIndex, bool considerSelection = false) {
      DateTime avg = new DateTime();
      if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        var seconds = GetDateTimeAsSeconds(columnIndex, considerSelection);
        // Enumerable.Average throws InvalidOperationException on an empty
        // sequence of doubles, so an empty column keeps the default value.
        if (seconds.Any()) {
          avg = GetSecondsAsDateTime(seconds.Average());
        }
      }
      return avg;
    }

    /// <summary>
    /// Returns the value that occurs most often in a column.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    /// <param name="defaultValue">Returned when the column has no values.</param>
    /// <param name="considerSelection">Forwarded to the search logic when fetching the values.</param>
    public T GetMostCommonValue<T>(int columnIndex, T defaultValue, bool considerSelection = false) {
      var values = GetValuesWithoutNaN<T>(columnIndex, considerSelection);
      if (!values.Any())
        return defaultValue;
      return values.GroupBy(x => x)
                   .OrderByDescending(g => g.Count())
                   .Select(g => g.Key)
                   .First();
    }

    /// <summary>
    /// Returns the standard deviation of a <see cref="double"/> column, or of a
    /// <see cref="DateTime"/> column expressed in seconds;
    /// <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetStandardDeviation(int columnIndex) {
      double stdDev = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        stdDev = GetValuesWithoutNaN<double>(columnIndex).StandardDeviation();
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        stdDev = GetDateTimeAsSeconds(columnIndex).StandardDeviation();
      }
      return stdDev;
    }

    /// <summary>
    /// Returns the variance of a <see cref="double"/> column, or of a
    /// <see cref="DateTime"/> column expressed in seconds;
    /// <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetVariance(int columnIndex) {
      double variance = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        variance = GetValuesWithoutNaN<double>(columnIndex).Variance();
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        variance = GetDateTimeAsSeconds(columnIndex).Variance();
      }
      return variance;
    }

    /// <summary>
    /// Returns the 0.25 quantile of a <see cref="double"/> column, or of a
    /// <see cref="DateTime"/> column expressed in seconds;
    /// <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetOneQuarterPercentile(int columnIndex) {
      double percentile = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        // DefaultIfEmpty feeds a single NaN to Quantile when there are no values.
        percentile = GetValuesWithoutNaN<double>(columnIndex).DefaultIfEmpty(double.NaN).Quantile(0.25);
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        percentile = GetDateTimeAsSeconds(columnIndex).DefaultIfEmpty(double.NaN).Quantile(0.25);
      }
      return percentile;
    }

    /// <summary>
    /// Returns the 0.75 quantile of a <see cref="double"/> column, or of a
    /// <see cref="DateTime"/> column expressed in seconds;
    /// <see cref="double.NaN"/> for any other column type.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public double GetThreeQuarterPercentile(int columnIndex) {
      double percentile = double.NaN;
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        // DefaultIfEmpty feeds a single NaN to Quantile when there are no values.
        percentile = GetValuesWithoutNaN<double>(columnIndex).DefaultIfEmpty(double.NaN).Quantile(0.75);
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        percentile = GetDateTimeAsSeconds(columnIndex).DefaultIfEmpty(double.NaN).Quantile(0.75);
      }
      return percentile;
    }

    /// <summary>
    /// Returns the number of distinct values in a column read as
    /// <typeparamref name="T"/>.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public int GetDifferentValuesCount<T>(int columnIndex) {
      return preprocessingData.GetValues<T>(columnIndex).GroupBy(x => x).Count();
    }

    /// <summary>
    /// Counts, across all columns of the preprocessing data, how many cells of
    /// the given row the search logic reports as missing.
    /// </summary>
    /// <param name="rowIndex">Index of the row to inspect.</param>
    public int GetRowMissingValueCount(int rowIndex) {
      int count = 0;
      for (int i = 0; i < preprocessingData.Columns; ++i) {
        if (searchLogic.IsMissingValue(i, rowIndex)) {
          ++count;
        }
      }
      return count;
    }

    /// <summary>Returns the variable name of a column.</summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public string GetVariableName(int columnIndex) {
      return preprocessingData.GetVariableName(columnIndex);
    }

    /// <summary>
    /// Tells whether a column holds values of type <typeparamref name="T"/>,
    /// as reported by the preprocessing data.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public bool VariableHasType<T>(int columnIndex) {
      return preprocessingData.VariableHasType<T>(columnIndex);
    }

    /// <summary>
    /// Returns a display name for the column's type: "double", "string",
    /// "DateTime", or "Unknown Type" when none of these match.
    /// </summary>
    /// <param name="columnIndex">Index of the column to inspect.</param>
    public string GetColumnTypeAsString(int columnIndex) {
      if (preprocessingData.VariableHasType<double>(columnIndex)) {
        return "double";
      } else if (preprocessingData.VariableHasType<string>(columnIndex)) {
        return "string";
      } else if (preprocessingData.VariableHasType<DateTime>(columnIndex)) {
        return "DateTime";
      }
      return "Unknown Type";
    }

    /// <summary>
    /// Converts the values of a <see cref="DateTime"/> column to seconds
    /// (ticks divided by <see cref="TimeSpan.TicksPerSecond"/>).
    /// </summary>
    private IEnumerable<double> GetDateTimeAsSeconds(int columnIndex, bool considerSelection = false) {
      return GetValuesWithoutNaN<DateTime>(columnIndex, considerSelection).Select(x => (double)x.Ticks / TimeSpan.TicksPerSecond);
    }

    /// <summary>Forwards to <c>searchLogic.GetValuesWithoutNaN</c> for the given column.</summary>
    private IEnumerable<T> GetValuesWithoutNaN<T>(int columnIndex, bool considerSelection = false) {
      return searchLogic.GetValuesWithoutNaN<T>(columnIndex, considerSelection);
    }

    /// <summary>
    /// Converts a number of seconds back to a <see cref="DateTime"/> by adding
    /// it to <c>new DateTime()</c>; inverse of <see cref="GetDateTimeAsSeconds"/>.
    /// </summary>
    private DateTime GetSecondsAsDateTime(double seconds) {
      DateTime dateTime = new DateTime();
      return dateTime.AddSeconds(seconds);
    }

    /// <summary>
    /// Relays subscriptions to the <c>Changed</c> event of the underlying
    /// preprocessing data.
    /// </summary>
    public event DataPreprocessingChangedEventHandler Changed {
      add { preprocessingData.Changed += value; }
      remove { preprocessingData.Changed -= value; }
    }
  }
}