Context Navigation

← Previous Changeset
Next Changeset →

Changeset 5013

Timestamp:

12/03/10 10:51:54 (14 years ago)

Author:

gkronber

Message:

Implemented heuristic to determine format for import of data tables and test cases. #1173

Location:

trunk/sources

Files:

: 1 added
: 4 edited
: 1 moved

HeuristicLab.Problems.DataAnalysis.Classification/3.3/ClassificationProblemData.cs (modified) (1 diff)
HeuristicLab.Problems.DataAnalysis/3.3/DataAnalysisProblemData.cs (modified) (1 diff)
HeuristicLab.Problems.DataAnalysis/3.3/HeuristicLab.Problems.DataAnalysis-3.3.csproj (modified) (2 diffs)
HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs (moved) (moved from trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs) (9 diffs)
HeuristicLab.Problems.DataAnalysis/3.3/Tests/HeuristicLab.Problems.DataAnalysis.Tests-3.3.csproj (modified) (1 diff)
HeuristicLab.Problems.DataAnalysis/3.3/Tests/TableFileParserTest.cs (added)

Legend:

: Unmodified
: Added
: Removed

trunk/sources/HeuristicLab.Problems.DataAnalysis.Classification/3.3/ClassificationProblemData.cs

r4836	r5013
225	225
226	226	public override void ImportFromFile(string fileName) {
227		var csvFileParser = new ~~Csv~~FileParser();
	227	var csvFileParser = new TableFileParser();
228	228	csvFileParser.Parse(fileName);
229	229	suppressEvents = true;

trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/DataAnalysisProblemData.cs

r4835	r5013
396	396
397	397	public virtual void ImportFromFile(string fileName) {
398		var csvFileParser = new ~~Csv~~FileParser();
	398	var csvFileParser = new TableFileParser();
399	399	csvFileParser.Parse(fileName);
400	400	suppressEvents = true;

trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/HeuristicLab.Problems.DataAnalysis-3.3.csproj

-                      r4980
+                      r5013
   </ItemGroup>
   <ItemGroup>
+    <Compile Include="TableFileParser.cs" />
     <None Include="HeuristicLab.snk" />
     <None Include="HeuristicLabProblemsDataAnalysisPlugin.cs.frame" />
 …
     <Compile Include="Evaluators\OnlineNormalizedMeanSquaredErrorEvaluator.cs" />
     <Compile Include="DataAnalysisSolution.cs" />
-    <Compile Include="CsvFileParser.cs" />
     <Compile Include="DataAnalysisProblem.cs" />
     <Compile Include="DataAnalysisProblemData.cs" />

trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs

-                      r5012
+                      r5013
 namespace HeuristicLab.Problems.DataAnalysis {
+  public class CsvFileParser {
+  public class TableFileParser {
+    private const int BUFFER_SIZE = 1024;
+    private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
     private const string VARIABLENAMES = "VARIABLENAMES";
     private Tokenizer tokenizer;
 …
+    }
     public CsvFileParser() {
+    public TableFileParser() {
       rowValues = new List<List<double>>();
       variableNames = new List<string>();
 …
     public void Parse(string fileName) {
+      TryParse(fileName);
+      NumberFormatInfo numberFormat;
+      char separator;
+      DetermineFileFormat(fileName, out numberFormat, out separator);
+      using (StreamReader reader = new StreamReader(fileName)) {
+        tokenizer = new Tokenizer(reader, numberFormat, separator);
+        // parse the file
+        Parse();
+      }
       // translate the list of samples into a DoubleMatrixData item
       rows = rowValues.Count;
 …
+    }
+    private void TryParse(string fileName) {
+      Exception lastEx = null;
+      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.InvariantCulture.NumberFormat };
+      foreach (NumberFormatInfo numberFormat in possibleFormats) {
+        using (StreamReader reader = new StreamReader(fileName)) {
+          tokenizer = new Tokenizer(reader, numberFormat);
+          try {
+            // parse the file
+            Parse();
+            return; // parsed without errors -> return;
+          }
+          catch (DataFormatException ex) {
+            lastEx = ex;
+          }
+        }
+      }
+      // all number formats threw an exception -> rethrow the last exception
+      throw lastEx;
+    private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) { // heuristically derives number format and column separator from a sample of the file
+      using (StreamReader reader = new StreamReader(fileName)) {
+        // skip the first line so it does not enter the character statistics (presumably a header line with variable names -- TODO confirm)
+        reader.ReadLine();
+        // read one block of at most BUFFER_SIZE characters; charsRead holds the number of characters actually read
+        char[] buffer = new char[BUFFER_SIZE];
+        int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
+        // count how often each character occurs in the block (every character is counted, not only special ones)
+        Dictionary<char, int> charCounts = buffer.Take(charsRead)
+          .GroupBy(c => c)
+          .ToDictionary(g => g.Key, g => g.Count());
+        // depending on the characters occurring in the block
+        // we distinguish a number of different cases based on the following rules ("many" means more than 10 occurrences in the block):
+        // many points => it must be English number format, the other frequently occurring char is the separator
+        // few points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
+        //   => check the line in more detail:
+        //            English: 0, 0, 0, 0
+        //            German:  0,0 0,0 0,0 ...
+        //            => if more than 10 commas are followed by a non-digit (e.g. a space) => English format
+        // few points and few commas => English format (only integer numbers), use the other frequently occurring char as separator
+        // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
+        if (OccurrencesOf(charCounts, '.') > 10) {
+          numberFormat = NumberFormatInfo.InvariantInfo;
+          separator = POSSIBLE_SEPARATORS
+            .Where(c => OccurrencesOf(charCounts, c) > 10)
+            .OrderBy(c => -OccurrencesOf(charCounts, c)) // negated count => most frequent candidate first
+            .DefaultIfEmpty(' ') // fall back to ' ' when no candidate occurs more than 10 times
+            .First();
+        } else if (OccurrencesOf(charCounts, ',') > 10) {
+          // at most 10 points and more than 10 commas
+          int countCommaNonDigitPairs = 0;
+          for (int i = 0; i < charsRead - 1; i++) { // stops one short so that buffer[i + 1] stays within the characters read
+            if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
+              countCommaNonDigitPairs++;
+            }
+          }
+          if (countCommaNonDigitPairs > 10) {
+            // English format (only integer values) with ',' as separator
+            numberFormat = NumberFormatInfo.InvariantInfo;
+            separator = ',';
+          } else {
+            char[] disallowedSeparators = new char[] { ',' };
+            // German format (real values); ',' is the decimal mark here and therefore excluded from the separator candidates
+            numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de"));
+            separator = POSSIBLE_SEPARATORS
+              .Except(disallowedSeparators)
+              .Where(c => OccurrencesOf(charCounts, c) > 10)
+              .OrderBy(c => -OccurrencesOf(charCounts, c)) // negated count => most frequent candidate first
+              .DefaultIfEmpty(' ') // fall back to ' ' when no candidate occurs more than 10 times
+              .First();
+          }
+        } else {
+          // at most 10 points and at most 10 commas => English format
+          numberFormat = NumberFormatInfo.InvariantInfo;
+          separator = POSSIBLE_SEPARATORS
+            .Where(c => OccurrencesOf(charCounts, c) > 10)
+            .OrderBy(c => -OccurrencesOf(charCounts, c)) // negated count => most frequent candidate first
+            .DefaultIfEmpty(' ') // fall back to ' ' when no candidate occurs more than 10 times
+            .First();
+        }
+      }
+    }
+    private int OccurrencesOf(Dictionary<char, int> charCounts, char c) { // looks up the count stored for c in charCounts
+      return charCounts.ContainsKey(c) ? charCounts[c] : 0; // a character without an entry is reported as 0 occurrences
+    }
 …
       private List<Token> tokens;
       private NumberFormatInfo numberFormatInfo;
+      private char separator;
+      private const string INTERNAL_SEPARATOR = "#";
       private int currentLineNumber = 0;
 …
         this.reader = reader;
         this.numberFormatInfo = numberFormatInfo;
+        separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
+        this.separator = separator;
+        separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
         newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
         tokens = new List<Token>();
         ReadNextTokens();
+      }
-      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
-        : this(reader, numberFormatInfo, ';') {
+      }
 …
                           let trimmedStr = str.Trim()
                           where !string.IsNullOrEmpty(trimmedStr)
                           select MakeToken(trimmedStr.Trim());
+                          select MakeToken(trimmedStr);
           tokens.AddRange(newTokens);
 …
         StringBuilder subStr = new StringBuilder();
         foreach (char c in line) {
           if (c == ';') {
+          if (c == separator) {
             yield return subStr.ToString();
             subStr = new StringBuilder();
+            yield return c.ToString();
+            // all separator characters are transformed to the internally used separator character
+            yield return INTERNAL_SEPARATOR;
           } else {
             subStr.Append(c);
 …
       private Token MakeToken(string strToken) {
         Token token = new Token(TokenTypeEnum.String, strToken);
         if (strToken.Equals(SeparatorToken.stringValue)) {
+        if (strToken.Equals(INTERNAL_SEPARATOR)) {
           return SeparatorToken;
         } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {

trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/Tests/HeuristicLab.Problems.DataAnalysis.Tests-3.3.csproj

r4980	r5013
109	109	</ItemGroup>
110	110	<ItemGroup>
	111	<Compile Include="TableFileParserTest.cs" />
111	112	<Compile Include="SymbolicSimplifierTest.cs" />
112	113	<Compile Include="StatisticCalculatorsTest.cs" />

Note: See TracChangeset for help on using the changeset viewer.