Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
04/04/10 18:53:55 (14 years ago)
Author:
gkronber
Message:

Implemented import of CSV files for regression problems. #938 (Data types and operators for regression problems)

File:
1 copied

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/CsvFileParser.cs

    r3262 r3264  
    11#region License Information
    22/* HeuristicLab
    3  * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
     3 * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
    44 *
    55 * This file is part of HeuristicLab.
     
    2828using System.Text;
    2929
    30 namespace HeuristicLab.DataAnalysis {
    31   public class DatasetParser {
    32     private const string PROBLEMNAME = "PROBLEMNAME";
     30namespace HeuristicLab.Problems.DataAnalysis.Regression {
     31  public class CsvFileParser {
    3332    private const string VARIABLENAMES = "VARIABLENAMES";
    34     private const string TARGETVARIABLE = "TARGETVARIABLE";
    35     private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
    36     private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
    37     private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
    38     private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
    39     private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
    40     private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
    41     private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";
    42     private const string TESTSAMPLESEND = "TESTSAMPLESEND";
    43     private const string NONINPUTVARIABLES = "NONINPUTVARIABLES";
    4433    private Tokenizer tokenizer;
    45     private Dictionary<string, List<Token>> metadata;
    46     private List<List<double>> samplesList;
     34    private List<string> variableNames;
     35    private List<List<double>> rowValues;
    4736
    4837    private int rows;
     
    5847    }
    5948
    60     private double[] samples;
    61     public double[] Samples {
     49    private double[,] values;
     50    public double[,] Values {
    6251      get {
    63         return samples;
    64       }
    65     }
    66 
    67     public string ProblemName {
     52        return values;
     53      }
     54    }
     55
     56    public IEnumerable<string> VariableNames {
    6857      get {
    69         if (metadata.ContainsKey(PROBLEMNAME)) {
    70           return metadata[PROBLEMNAME][0].stringValue;
    71         } else return "-";
    72       }
    73     }
    74 
    75     public string[] VariableNames {
    76       get {
    77         if (metadata.ContainsKey(VARIABLENAMES)) {
    78           List<Token> nameList = metadata[VARIABLENAMES];
    79           string[] names = new string[nameList.Count];
    80           for (int i = 0; i < names.Length; i++) {
    81             names[i] = nameList[i].stringValue;
    82           }
    83           return names;
    84         } else {
     58        if (variableNames.Count > 0) return variableNames;
     59        else {
    8560          string[] names = new string[columns];
    8661          for (int i = 0; i < names.Length; i++) {
     
    9267    }
    9368
    94     public int TargetVariable {
    95       get {
    96         if (metadata.ContainsKey(TARGETVARIABLE)) {
    97           return metadata[TARGETVARIABLE][0].intValue;
    98         } else return 0; // default is the first column
    99       }
    100     }
    101 
    102     public int MaxTreeHeight {
    103       get {
    104         if (metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {
    105           return metadata[MAXIMUMTREEHEIGHT][0].intValue;
    106         } else return 0;
    107       }
    108     }
    109 
    110     public int MaxTreeSize {
    111       get {
    112         if (metadata.ContainsKey(MAXIMUMTREESIZE)) {
    113           return metadata[MAXIMUMTREESIZE][0].intValue;
    114         } else return 0;
    115       }
    116     }
    117 
    118     public int TrainingSamplesStart {
    119       get {
    120         if (metadata.ContainsKey(TRAININGSAMPLESSTART)) {
    121           return metadata[TRAININGSAMPLESSTART][0].intValue;
    122         } else return 0;
    123       }
    124     }
    125 
    126     public int TrainingSamplesEnd {
    127       get {
    128         if (metadata.ContainsKey(TRAININGSAMPLESEND)) {
    129           return metadata[TRAININGSAMPLESEND][0].intValue;
    130         } else return rows;
    131       }
    132     }
    133     public int ValidationSamplesStart {
    134       get {
    135         if (metadata.ContainsKey(VALIDATIONSAMPLESSTART)) {
    136           return metadata[VALIDATIONSAMPLESSTART][0].intValue;
    137         } else return 0;
    138       }
    139     }
    140 
    141     public int ValidationSamplesEnd {
    142       get {
    143         if (metadata.ContainsKey(VALIDATIONSAMPLESEND)) {
    144           return metadata[VALIDATIONSAMPLESEND][0].intValue;
    145         } else return rows;
    146       }
    147     }
    148     public int TestSamplesStart {
    149       get {
    150         if (metadata.ContainsKey(TESTSAMPLESSTART)) {
    151           return metadata[TESTSAMPLESSTART][0].intValue;
    152         } else return 0;
    153       }
    154     }
    155 
    156     public int TestSamplesEnd {
    157       get {
    158         if (metadata.ContainsKey(TESTSAMPLESEND)) {
    159           return metadata[TESTSAMPLESEND][0].intValue;
    160         } else return rows;
    161       }
    162     }
    163 
    164     public List<int> NonInputVariables {
    165       get {
    166         List<int> disallowedVariables = new List<int>();
    167         if (metadata.ContainsKey(NONINPUTVARIABLES)) {
    168           foreach (Token t in metadata[NONINPUTVARIABLES]) {
    169             disallowedVariables.Add(t.intValue);
    170           }
    171         }
    172         return disallowedVariables;
    173       }
    174     }
    175 
    176     public DatasetParser() {
    177       this.metadata = new Dictionary<string, List<Token>>();
    178       samplesList = new List<List<double>>();
    179     }
    180 
    181     public void Reset() {
    182       metadata.Clear();
    183       samplesList.Clear();
    184     }
    185 
    186     public void Import(string importFileName, bool strict) {
    187       TryParse(importFileName, strict);
     69    public CsvFileParser() {
     70      rowValues = new List<List<double>>();
     71      variableNames = new List<string>();
     72    }
     73
     74    private void Reset() {
     75      variableNames.Clear();
     76      rowValues.Clear();
     77    }
     78
     79    public void Parse(string fileName) {
     80      TryParse(fileName);
    18881      // translate the list of samples into a DoubleMatrixData item
    189       samples = new double[samplesList.Count * samplesList[0].Count];
    190       rows = samplesList.Count;
    191       columns = samplesList[0].Count;
    192 
    193       int i = 0;
    194       int j = 0;
    195       foreach (List<double> row in samplesList) {
    196         j = 0;
     82      rows = rowValues.Count;
     83      columns = rowValues[0].Count;
     84      values = new double[rows, columns];
     85
     86      int rowIndex = 0;
     87      int columnIndex = 0;
     88      foreach (List<double> row in rowValues) {
     89        columnIndex = 0;
    19790        foreach (double element in row) {
    198           samples[i * columns + j] = element;
    199           j++;
    200         }
    201         i++;
    202       }
    203     }
    204 
    205     private void TryParse(string importFileName, bool strict) {
     91          values[rowIndex, columnIndex++] = element;
     92        }
     93        rowIndex++;
     94      }
     95    }
     96
     97    private void TryParse(string fileName) {
    20698      Exception lastEx = null;
    207       NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo, CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo };
     99      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo };
    208100      foreach (NumberFormatInfo numberFormat in possibleFormats) {
    209         using (StreamReader reader = new StreamReader(importFileName)) {
     101        using (StreamReader reader = new StreamReader(fileName)) {
    210102          tokenizer = new Tokenizer(reader, numberFormat);
    211103          try {
    212104            // parse the file
    213             Parse(strict);
     105            Parse();
    214106            return; // parsed without errors -> return;
    215107          }
     
    225117    #region tokenizer
    226118    internal enum TokenTypeEnum {
    227       At, Assign, NewLine, SemiColon, String, Double, Int
     119      NewLine, Separator, String, Double
    228120    }
    229121
     
    232124      public string stringValue;
    233125      public double doubleValue;
    234       public int intValue;
    235126
    236127      public Token(TokenTypeEnum type, string value) {
     
    238129        stringValue = value;
    239130        doubleValue = 0.0;
    240         intValue = 0;
    241131      }
    242132
     
    247137
    248138
    249     class Tokenizer {
     139    internal class Tokenizer {
    250140      private StreamReader reader;
    251141      private List<Token> tokens;
    252142      private NumberFormatInfo numberFormatInfo;
    253143
    254       public int CurrentLineNumber = 0;
    255       public string CurrentLine;
    256 
    257       public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
    258       public static Token AtToken = new Token(TokenTypeEnum.At, "@");
    259       public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
    260       public static Token SeparatorToken = new Token(TokenTypeEnum.SemiColon, ";");
    261 
    262       public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) {
     144      private int currentLineNumber = 0;
     145      public int CurrentLineNumber {
     146        get { return currentLineNumber; }
     147        private set { currentLineNumber = value; }
     148      }
     149      private string currentLine;
     150      public string CurrentLine {
     151        get { return currentLine; }
     152        private set { currentLine = value; }
     153      }
     154
     155      private Token newlineToken;
     156      public Token NewlineToken {
     157        get { return newlineToken; }
     158        private set { newlineToken = value; }
     159      }
     160      private Token separatorToken;
     161      public Token SeparatorToken {
     162        get { return separatorToken; }
     163        private set { separatorToken = value; }
     164      }
     165
     166      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
    263167        this.reader = reader;
    264168        this.numberFormatInfo = numberFormatInfo;
     169        separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
     170        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    265171        tokens = new List<Token>();
    266172        ReadNextTokens();
     173      }
     174      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
     175        : this(reader, numberFormatInfo, ';') {
    267176      }
    268177
     
    284193        StringBuilder subStr = new StringBuilder();
    285194        foreach (char c in line) {
    286           if (c == '@' || c == '=' || c == ';') {
     195          if (c == ';') {
    287196            yield return subStr.ToString();
    288197            subStr = new StringBuilder();
     
    297206      private Token MakeToken(string strToken) {
    298207        Token token = new Token(TokenTypeEnum.String, strToken);
    299         if (strToken.Equals(AtToken.stringValue)) {
    300           return AtToken;
    301         } else if (strToken.Equals(AssignmentToken.stringValue)) {
    302           return AssignmentToken;
    303         } else if (strToken.Equals(SeparatorToken.stringValue)) {
     208        if (strToken.Equals(SeparatorToken.stringValue)) {
    304209          return SeparatorToken;
    305         } else if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {
    306           token.type = TokenTypeEnum.Int;
    307           return token;
    308210        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
    309211          token.type = TokenTypeEnum.Double;
     
    335237
    336238    #region parsing
    337     private void Parse(bool strict) {
    338       ParseMetaData(strict);
     239    private void Parse() {
     240      ParseVariableNames();
    339241      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    340       ParseSampleData(strict);
    341       if (samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    342     }
    343 
    344     private void ParseSampleData(bool strict) {
     242      ParseValues();
     243      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
     244    }
     245
     246    private void ParseValues() {
    345247      while (tokenizer.HasNext()) {
    346248        List<double> row = new List<double>();
    347         row.Add(NextValue(tokenizer, strict));
    348         while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) {
    349           Expect(Tokenizer.SeparatorToken);
    350           row.Add(NextValue(tokenizer, strict));
    351         }
    352         Expect(Tokenizer.NewlineToken);
    353         // when parsing strictly all rows have to have the same number of values           
    354         if (strict) {
    355           // the first row defines how many samples are needed
    356           if (samplesList.Count > 0 && samplesList[0].Count != row.Count) {
    357             Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
    358               "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
    359           }
    360         } else if (samplesList.Count > 0) {
    361           // when we are not strict then fill or drop elements as needed
    362           if (samplesList[0].Count > row.Count) {
    363             // fill with NAN
    364             for (int i = row.Count; i < samplesList[0].Count; i++) {
    365               row.Add(double.NaN);
    366             }
    367           } else if (samplesList[0].Count < row.Count) {
    368             // drop last k elements where k = n - length of first row
    369             row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count);
    370           }
    371         }
    372 
     249        row.Add(NextValue(tokenizer));
     250        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
     251          Expect(tokenizer.SeparatorToken);
     252          row.Add(NextValue(tokenizer));
     253        }
     254        Expect(tokenizer.NewlineToken);
     255        // all rows have to have the same number of values           
     256        // the first row defines how many samples are needed
     257        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
     258          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
     259            "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
     260        }
    373261        // add the current row to the collection of rows and start a new row
    374         samplesList.Add(row);
     262        rowValues.Add(row);
    375263        row = new List<double>();
    376264      }
    377265    }
    378266
    379     private double NextValue(Tokenizer tokenizer, bool strict) {
    380       if (tokenizer.Peek() == Tokenizer.SeparatorToken || tokenizer.Peek() == Tokenizer.NewlineToken) return double.NaN;
     267    private double NextValue(Tokenizer tokenizer) {
     268      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
    381269      Token current = tokenizer.Next();
    382       if (current.type == TokenTypeEnum.SemiColon || current.type == TokenTypeEnum.String) {
     270      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
    383271        return double.NaN;
    384272      } else if (current.type == TokenTypeEnum.Double) {
    385273        // just take the value
    386274        return current.doubleValue;
    387       } else if (current.type == TokenTypeEnum.Int) {
    388         // translate the int value to double
    389         return (double)current.intValue;
    390       } else {
    391         // found an unexpected token => throw error when parsing strictly
    392         // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
    393         if (strict) {
    394           Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
    395         } else {
    396           return double.NaN;
    397         }
    398       }
    399       return double.NaN;
    400     }
    401 
    402     private void ParseMetaData(bool strict) {
    403       while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
    404         Expect(Tokenizer.AtToken);
    405 
    406         Token nameToken = tokenizer.Next();
    407         Expect(Tokenizer.AssignmentToken);
     275      }
     276      // found an unexpected token => throw error
     277      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
     278      // this line is never executed because Error() throws an exception
     279      throw new InvalidOperationException();
     280    }
     281
     282    private void ParseVariableNames() {
     283      // if the first line doesn't start with a double value then we assume that the
     284      // first line contains variable names
     285      if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
    408286
    409287        List<Token> tokens = new List<Token>();
     
    411289        valueToken = tokenizer.Next();
    412290        tokens.Add(valueToken);
    413         while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) {
    414           Expect(Tokenizer.SeparatorToken);
     291        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
     292          Expect(tokenizer.SeparatorToken);
    415293          valueToken = tokenizer.Next();
    416           if (valueToken != Tokenizer.NewlineToken) {
     294          if (valueToken != tokenizer.NewlineToken) {
    417295            tokens.Add(valueToken);
    418296          }
    419297        }
    420         if (valueToken != Tokenizer.NewlineToken) {
    421           Expect(Tokenizer.NewlineToken);
    422         }
    423         metadata[nameToken.stringValue] = tokens;
     298        if (valueToken != tokenizer.NewlineToken) {
     299          Expect(tokenizer.NewlineToken);
     300        }
     301        variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
    424302      }
    425303    }
Note: See TracChangeset for help on using the changeset viewer.