- Timestamp:
- 12/03/10 10:51:54 (14 years ago)
- File:
-
- 1 moved
Legend:
- Unmodified
- Added
- Removed
-
trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs
r5012 r5013 28 28 29 29 namespace HeuristicLab.Problems.DataAnalysis { 30 public class CsvFileParser { 30 public class TableFileParser { 31 private const int BUFFER_SIZE = 1024; 32 private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' }; 31 33 private const string VARIABLENAMES = "VARIABLENAMES"; 32 34 private Tokenizer tokenizer; … … 66 68 } 67 69 68 public CsvFileParser() {70 public TableFileParser() { 69 71 rowValues = new List<List<double>>(); 70 72 variableNames = new List<string>(); … … 77 79 78 80 public void Parse(string fileName) { 79 TryParse(fileName); 81 NumberFormatInfo numberFormat; 82 char separator; 83 DetermineFileFormat(fileName, out numberFormat, out separator); 84 using (StreamReader reader = new StreamReader(fileName)) { 85 tokenizer = new Tokenizer(reader, numberFormat, separator); 86 // parse the file 87 Parse(); 88 } 89 80 90 // translate the list of samples into a DoubleMatrixData item 81 91 rows = rowValues.Count; … … 94 104 } 95 105 96 private void TryParse(string fileName) { 97 Exception lastEx = null; 98 NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.InvariantCulture.NumberFormat }; 99 foreach (NumberFormatInfo numberFormat in possibleFormats) { 100 using (StreamReader reader = new StreamReader(fileName)) { 101 tokenizer = new Tokenizer(reader, numberFormat); 102 try { 103 // parse the file 104 Parse(); 105 return; // parsed without errors -> return; 106 } 107 catch (DataFormatException ex) { 108 lastEx = ex; 109 } 110 } 111 } 112 // all number formats threw an exception -> rethrow the last exception 113 throw lastEx; 106 private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) { 107 using (StreamReader reader = new StreamReader(fileName)) { 108 // skip first line 109 reader.ReadLine(); 110 // read a block 111 char[] buffer = new char[BUFFER_SIZE]; 112 int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE); 113 // count frequency of special characters 114 Dictionary<char, int> charCounts = buffer.Take(charsRead) 115 .GroupBy(c => c) 116 .ToDictionary(g => g.Key, g => g.Count()); 117 118 // depending on the characters occuring in the block 119 // we distinghish a number of different cases based on the the following rules: 120 // many points => it must be English number format, the other frequently occuring char is the separator 121 // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator 122 // => check the line in more detail: 123 // English: 0, 0, 0, 0 124 // German: 0,0 0,0 0,0 ... 125 // => if commas are followed by space => English format 126 // no points no commas => English format (only integer numbers) use the other frequently occuring char as separator 127 // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators) 128 if (OccurrencesOf(charCounts, '.') > 10) { 129 numberFormat = NumberFormatInfo.InvariantInfo; 130 separator = POSSIBLE_SEPARATORS 131 .Where(c => OccurrencesOf(charCounts, c) > 10) 132 .OrderBy(c => -OccurrencesOf(charCounts, c)) 133 .DefaultIfEmpty(' ') 134 .First(); 135 } else if (OccurrencesOf(charCounts, ',') > 10) { 136 // no points and many commas 137 int countCommaNonDigitPairs = 0; 138 for (int i = 0; i < charsRead - 1; i++) { 139 if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) { 140 countCommaNonDigitPairs++; 141 } 142 } 143 if (countCommaNonDigitPairs > 10) { 144 // English format (only integer values) with ',' as separator 145 numberFormat = NumberFormatInfo.InvariantInfo; 146 separator = ','; 147 } else { 148 char[] disallowedSeparators = new char[] { ',' }; 149 // German format (real values) 150 numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de")); 151 separator = POSSIBLE_SEPARATORS 152 .Except(disallowedSeparators) 153 .Where(c => OccurrencesOf(charCounts, c) > 10) 154 .OrderBy(c => -OccurrencesOf(charCounts, c)) 155 .DefaultIfEmpty(' ') 156 .First(); 157 } 158 } else { 159 // no points and no commas => English format 160 numberFormat = NumberFormatInfo.InvariantInfo; 161 separator = POSSIBLE_SEPARATORS 162 .Where(c => OccurrencesOf(charCounts, c) > 10) 163 .OrderBy(c => -OccurrencesOf(charCounts, c)) 164 .DefaultIfEmpty(' ') 165 .First(); 166 } 167 } 168 } 169 170 private int OccurrencesOf(Dictionary<char, int> charCounts, char c) { 171 return charCounts.ContainsKey(c) ? charCounts[c] : 0; 114 172 } 115 173 … … 140 198 private List<Token> tokens; 141 199 private NumberFormatInfo numberFormatInfo; 200 private char separator; 201 private const string INTERNAL_SEPARATOR = "#"; 142 202 143 203 private int currentLineNumber = 0; … … 166 226 this.reader = reader; 167 227 this.numberFormatInfo = numberFormatInfo; 168 separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString()); 228 this.separator = separator; 229 separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR); 169 230 newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine); 170 231 tokens = new List<Token>(); 171 232 ReadNextTokens(); 172 }173 public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)174 : this(reader, numberFormatInfo, ';') {175 233 } 176 234 … … 181 239 let trimmedStr = str.Trim() 182 240 where !string.IsNullOrEmpty(trimmedStr) 183 select MakeToken(trimmedStr .Trim());241 select MakeToken(trimmedStr); 184 242 185 243 tokens.AddRange(newTokens); … … 192 250 StringBuilder subStr = new StringBuilder(); 193 251 foreach (char c in line) { 194 if (c == ';') {252 if (c == separator) { 195 253 yield return subStr.ToString(); 196 254 subStr = new StringBuilder(); 197 yield return c.ToString(); 255 // all separator characters are transformed to the internally used separator character 256 yield return INTERNAL_SEPARATOR; 198 257 } else { 199 258 subStr.Append(c); … … 205 264 private Token MakeToken(string strToken) { 206 265 Token token = new Token(TokenTypeEnum.String, strToken); 207 if (strToken.Equals( SeparatorToken.stringValue)) {266 if (strToken.Equals(INTERNAL_SEPARATOR)) { 208 267 return SeparatorToken; 209 268 } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
Note: See TracChangeset
for help on using the changeset viewer.