Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
12/03/10 10:51:54 (14 years ago)
Author:
gkronber
Message:

Implemented heuristic to determine format for import of data tables and test cases. #1173

File:
1 moved

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs

    r5012 r5013  
    2828
    2929namespace HeuristicLab.Problems.DataAnalysis {
    30   public class CsvFileParser {
     30  public class TableFileParser {
     31    private const int BUFFER_SIZE = 1024;
     32    private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
    3133    private const string VARIABLENAMES = "VARIABLENAMES";
    3234    private Tokenizer tokenizer;
     
    6668    }
    6769
    68     public CsvFileParser() {
     70    public TableFileParser() {
    6971      rowValues = new List<List<double>>();
    7072      variableNames = new List<string>();
     
    7779
    7880    public void Parse(string fileName) {
    79       TryParse(fileName);
     81      NumberFormatInfo numberFormat;
     82      char separator;
     83      DetermineFileFormat(fileName, out numberFormat, out separator);
     84      using (StreamReader reader = new StreamReader(fileName)) {
     85        tokenizer = new Tokenizer(reader, numberFormat, separator);
     86        // parse the file
     87        Parse();
     88      }
     89
    8090      // translate the list of samples into a DoubleMatrixData item
    8191      rows = rowValues.Count;
     
    94104    }
    95105
    96     private void TryParse(string fileName) {
    97       Exception lastEx = null;
    98       NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.InvariantCulture.NumberFormat };
    99       foreach (NumberFormatInfo numberFormat in possibleFormats) {
    100         using (StreamReader reader = new StreamReader(fileName)) {
    101           tokenizer = new Tokenizer(reader, numberFormat);
    102           try {
    103             // parse the file
    104             Parse();
    105             return; // parsed without errors -> return;
    106           }
    107           catch (DataFormatException ex) {
    108             lastEx = ex;
    109           }
    110         }
    111       }
    112       // all number formats threw an exception -> rethrow the last exception
    113       throw lastEx;
     106    private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
     107      using (StreamReader reader = new StreamReader(fileName)) {
     108        // skip first line
     109        reader.ReadLine();
     110        // read a block
     111        char[] buffer = new char[BUFFER_SIZE];
     112        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
     113        // count frequency of special characters
     114        Dictionary<char, int> charCounts = buffer.Take(charsRead)
     115          .GroupBy(c => c)
     116          .ToDictionary(g => g.Key, g => g.Count());
     117
     118        // depending on the characters occurring in the block
     119        // we distinguish a number of different cases based on the following rules:
     120        // many points => it must be English number format, the other frequently occurring char is the separator
     121        // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
     122        //   => check the line in more detail:
     123        //            English: 0, 0, 0, 0
     124        //            German:  0,0 0,0 0,0 ...
     125        //            => if commas are followed by space => English format
     126        // no points no commas => English format (only integer numbers), use the other frequently occurring char as separator
     127        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
     128        if (OccurrencesOf(charCounts, '.') > 10) {
     129          numberFormat = NumberFormatInfo.InvariantInfo;
     130          separator = POSSIBLE_SEPARATORS
     131            .Where(c => OccurrencesOf(charCounts, c) > 10)
     132            .OrderBy(c => -OccurrencesOf(charCounts, c))
     133            .DefaultIfEmpty(' ')
     134            .First();
     135        } else if (OccurrencesOf(charCounts, ',') > 10) {
     136          // no points and many commas
     137          int countCommaNonDigitPairs = 0;
     138          for (int i = 0; i < charsRead - 1; i++) {
     139            if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
     140              countCommaNonDigitPairs++;
     141            }
     142          }
     143          if (countCommaNonDigitPairs > 10) {
     144            // English format (only integer values) with ',' as separator
     145            numberFormat = NumberFormatInfo.InvariantInfo;
     146            separator = ',';
     147          } else {
     148            char[] disallowedSeparators = new char[] { ',' };
     149            // German format (real values)
     150            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de"));
     151            separator = POSSIBLE_SEPARATORS
     152              .Except(disallowedSeparators)
     153              .Where(c => OccurrencesOf(charCounts, c) > 10)
     154              .OrderBy(c => -OccurrencesOf(charCounts, c))
     155              .DefaultIfEmpty(' ')
     156              .First();
     157          }
     158        } else {
     159          // no points and no commas => English format
     160          numberFormat = NumberFormatInfo.InvariantInfo;
     161          separator = POSSIBLE_SEPARATORS
     162            .Where(c => OccurrencesOf(charCounts, c) > 10)
     163            .OrderBy(c => -OccurrencesOf(charCounts, c))
     164            .DefaultIfEmpty(' ')
     165            .First();
     166        }
     167      }
     168    }
     169
     170    private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
     171      return charCounts.ContainsKey(c) ? charCounts[c] : 0;
    114172    }
    115173
     
    140198      private List<Token> tokens;
    141199      private NumberFormatInfo numberFormatInfo;
     200      private char separator;
     201      private const string INTERNAL_SEPARATOR = "#";
    142202
    143203      private int currentLineNumber = 0;
     
    166226        this.reader = reader;
    167227        this.numberFormatInfo = numberFormatInfo;
    168         separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
     228        this.separator = separator;
     229        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    169230        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    170231        tokens = new List<Token>();
    171232        ReadNextTokens();
    172       }
    173       public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
    174         : this(reader, numberFormatInfo, ';') {
    175233      }
    176234
     
    181239                          let trimmedStr = str.Trim()
    182240                          where !string.IsNullOrEmpty(trimmedStr)
    183                           select MakeToken(trimmedStr.Trim());
     241                          select MakeToken(trimmedStr);
    184242
    185243          tokens.AddRange(newTokens);
     
    192250        StringBuilder subStr = new StringBuilder();
    193251        foreach (char c in line) {
    194           if (c == ';') {
     252          if (c == separator) {
    195253            yield return subStr.ToString();
    196254            subStr = new StringBuilder();
    197             yield return c.ToString();
     255            // all separator characters are transformed to the internally used separator character
     256            yield return INTERNAL_SEPARATOR;
    198257          } else {
    199258            subStr.Append(c);
     
    205264      private Token MakeToken(string strToken) {
    206265        Token token = new Token(TokenTypeEnum.String, strToken);
    207         if (strToken.Equals(SeparatorToken.stringValue)) {
     266        if (strToken.Equals(INTERNAL_SEPARATOR)) {
    208267          return SeparatorToken;
    209268        } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
Note: See TracChangeset for help on using the changeset viewer.