Context Navigation

source: branches/2520_PersistenceReintegration/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/PennML/PennMLRegressionInstanceProvider.cs @ 16752

Visit:

Last change on this file since 16752 was 16453, checked in by jkarder, 6 years ago
#2520: updated year of copyrights
File size: 3.7 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2019 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using System.IO.Compression;
27	using System.Linq;
28	using HeuristicLab.Data;
29
namespace HeuristicLab.Problems.Instances.DataAnalysis {
  /// <summary>
  /// Instance provider for the PennML regression benchmark datasets. The datasets are read
  /// from a zip archive that is looked up among the manifest resources of the assembly
  /// defining this type.
  /// </summary>
  public class PennMLRegressionInstanceProvider : ResourceRegressionInstanceProvider {
    /// <summary>Display name of this instance provider.</summary>
    public override string Name {
      get { return "PennML Regression Problems"; }
    }

    /// <summary>Short description of the benchmark collection.</summary>
    public override string Description {
      get { return "A set of datasets used for benchmarking symbolic regression algorithms."; }
    }

    /// <summary>Link to the repository hosting the benchmark collection.</summary>
    public override Uri WebLink {
      get { return new Uri("https://github.com/EpistasisLab/penn-ml-benchmarks"); }
    }

    /// <summary>Publication in which the benchmark collection is used.</summary>
    public override string ReferencePublication {
      get { return "Patryk Orzechowski, William La Cava, Jason H. Moore - Where are we now? A large benchmark study of recent symbolic regression methods"; }
    }

    /// <summary>Base name (without extension) of the archive resource holding the instances.</summary>
    protected override string FileName {
      get { return "PennML"; }
    }

    // the reference publication uses 75% of the samples in each of the datasets for training and the remaining 25% for testing
    private const double trainTestSplit = 0.75;

    /// <summary>
    /// Lazily enumerates one data descriptor per non-empty entry of the instance archive.
    /// </summary>
    /// <remarks>
    /// For every entry the header line supplies the variable names (the last column is the
    /// target, all others are allowed inputs) and the remaining lines are counted to derive
    /// the training/test partitions. Entries without a header line (empty files or directory
    /// entries) are skipped. Because this is an iterator, the archive stays open until the
    /// enumeration finishes or the enumerator is disposed.
    /// </remarks>
    /// <returns>A sequence of descriptors, one per dataset found in the archive.</returns>
    public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
      // NOTE(review): the escaped dot suggests GetResourceName treats its argument as a regex pattern - confirm against the base class
      var instanceArchiveName = GetResourceName(FileName + @"\.zip");
      using (var instancesZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read)) {
        foreach (var entry in instancesZipFile.Entries) {
          NumberFormatInfo numberFormat;
          DateTimeFormatInfo dateFormat;
          char separator;
          using (var stream = entry.Open()) {
            // the method below disposes the stream; the using block additionally covers the case that it fails early
            TableFileParser.DetermineFileFormat(stream, out numberFormat, out dateFormat, out separator);
          }

          // the entry is opened a second time because the first stream has been consumed by the format detection
          using (var stream = entry.Open()) {
            using (var reader = new StreamReader(stream)) {
              var header = reader.ReadLine(); // read the first line

              // ReadLine returns null when the entry has no content at all (e.g. a directory entry);
              // there is nothing to describe in that case, so skip it instead of failing on header.Split
              if (header == null) {
                continue;
              }

              // by convention each dataset from the PennML collection reserves the last column for the target
              var variableNames = header.Split(separator);
              var allowedInputVariables = variableNames.Take(variableNames.Length - 1);
              var target = variableNames.Last();

              // count the lines following the header
              int lines = 0;
              while (reader.ReadLine() != null) {
                lines++;
              }

              // the training range ends where the test range starts, at the rounded share of lines given by trainTestSplit
              var trainEnd = (int)Math.Round(lines * trainTestSplit);
              var trainRange = new IntRange(0, trainEnd);
              var testRange = new IntRange(trainEnd, lines);

              var descriptor = new PennMLRegressionDataDescriptor(entry.Name, variableNames, allowedInputVariables, target, trainRange, testRange);
              yield return descriptor;
            }
          }
        }
      }
    }
  }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences