Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2929_PrioritizedGrammarEnumeration/HeuristicLab.Algorithms.DataAnalysis.PGE/3.3/go-code/go-pge/problems/data.go @ 16230

Last change on this file since 16230 was 16230, checked in by hmaislin, 6 years ago

#2929: Adapted pge plugin to check for null value

File size: 15.3 KB
Line 
1package problems
2
3import (
4  "bufio"
5  "bytes"
6  "fmt"
7  "math"
8  "math/rand"
9  "os"
10  "C"
11  "unsafe"
12)
13
// Point is one sample: a vector of independent values paired with a vector
// of dependent values.
type Point struct {
	indep []float64 // independent values
	depnd []float64 // dependent values
}

// NumIndep returns how many independent values the point holds.
func (d *Point) NumIndep() int {
	return len(d.indep)
}

// SetNumIndep replaces the independent values with sz zeros.
func (d *Point) SetNumIndep(sz int) {
	d.indep = make([]float64, sz)
}

// Indep returns the p-th independent value; p must be a valid index.
func (d *Point) Indep(p int) float64 {
	return d.indep[p]
}

// SetIndep overwrites the p-th independent value; p must be a valid index.
func (d *Point) SetIndep(p int, v float64) {
	d.indep[p] = v
}

// Indeps returns the slice of independent values itself, not a copy.
func (d *Point) Indeps() []float64 {
	return d.indep
}

// SetIndeps stores v as the independent values without copying it.
func (d *Point) SetIndeps(v []float64) {
	d.indep = v
}

// NumDepnd returns how many dependent values the point holds.
func (d *Point) NumDepnd() int {
	return len(d.depnd)
}

// SetNumDepnd replaces the dependent values with sz zeros.
func (d *Point) SetNumDepnd(sz int) {
	d.depnd = make([]float64, sz)
}

// Depnd returns the p-th dependent value; p must be a valid index.
func (d *Point) Depnd(p int) float64 {
	return d.depnd[p]
}

// SetDepnd overwrites the p-th dependent value; p must be a valid index.
func (d *Point) SetDepnd(p int, v float64) {
	d.depnd[p] = v
}

// Depnds returns the slice of dependent values itself, not a copy.
func (d *Point) Depnds() []float64 {
	return d.depnd
}

// SetDepnds stores v as the dependent values without copying it.
func (d *Point) SetDepnds(v []float64) {
	d.depnd = v
}
31
32type PointSet struct {
33  filename string
34  id       int
35
36  numDim     int
37  indepNames []string
38  depndNames []string
39  sysNames   []string
40
41  dataPoints []Point
42  sysVals    []float64
43}
44
45func (d *PointSet) FN() string           { return d.filename }
46func (d *PointSet) SetFN(fn string)      { d.filename = fn }
47func (d *PointSet) ID() int              { return d.id }
48func (d *PointSet) SetID(id int)         { d.id = id }
49func (d *PointSet) SetNumPoints(cnt int) { d.dataPoints = make([]Point, cnt) }
50
51func (d *PointSet) NumIndep() int     { return len(d.indepNames) }
52func (d *PointSet) NumDepnd() int     { return len(d.depndNames) }
53func (d *PointSet) NumDim() int       { return d.numDim } // TODO check to see if TIME is a variable
54func (d *PointSet) SetNumDim(dim int) { d.numDim = dim }  // TODO check to see if TIME is a variable
55func (d *PointSet) NumSys() int       { return len(d.sysNames) }
56func (d *PointSet) NumPoints() int    { return len(d.dataPoints) }
57
58func (d *PointSet) IndepName(xi int) string      { return d.indepNames[xi] }
59func (d *PointSet) GetIndepNames() []string      { return d.indepNames }
60func (d *PointSet) SetIndepNames(names []string) { d.indepNames = names }
61func (d *PointSet) DepndName(xi int) string      { return d.depndNames[xi] }
62func (d *PointSet) GetDepndNames() []string      { return d.depndNames }
63func (d *PointSet) SetDepndNames(names []string) { d.depndNames = names }
64
65func (d *PointSet) SysName(si int) string      { return d.sysNames[si] }
66func (d *PointSet) GetSysNames() []string      { return d.sysNames }
67func (d *PointSet) SetSysNames(names []string) { d.sysNames = names }
68func (d *PointSet) SetSysVals(sv []float64)    { d.sysVals = sv }
69
70func (d *PointSet) Point(p int) *Point    { return &(d.dataPoints[p]) }
71func (d *PointSet) Points() []Point       { return d.dataPoints }
72func (d *PointSet) SetPoints(pts []Point) { d.dataPoints = pts }
73func (d *PointSet) SysVal(p int) float64  { return d.sysVals[p] }
74func (d *PointSet) SysVals() []float64    { return d.sysVals }
75
// The file readers are defined further down: ReadPointSet, ReadBytesPointSet and ReadLakeFile.
77
78type PntSubset struct {
79  ds *PointSet
80
81  index   []int
82  input   []Point
83  output  []Point
84  sysVals []float64
85}
86
87func (s *PntSubset) ID() int            { return s.ds.id }
88func (s *PntSubset) DS() *PointSet      { return s.ds }
89func (s *PntSubset) SetDS(ds *PointSet) { s.ds = ds }
90
91func (s *PntSubset) NumIndep() int  { return s.ds.NumIndep() }
92func (s *PntSubset) NumDepnd() int  { return s.ds.NumDepnd() }
93func (s *PntSubset) NumSys() int    { return s.ds.NumSys() }
94func (s *PntSubset) NumPoints() int { return len(s.index) }
95
96func (s *PntSubset) SysVals() []float64        { return s.sysVals }
97func (s *PntSubset) SetSysVals(svls []float64) { s.sysVals = svls }
98
99func (s *PntSubset) Index(p int) int       { return s.index[p] }
100func (s *PntSubset) Indexes() []int        { return s.index }
101func (s *PntSubset) SetIndexes(idxs []int) { s.index = idxs }
102func (s *PntSubset) Input(p int) *Point    { return &s.input[p] }
103func (s *PntSubset) Output(p int) *Point   { return &s.output[p] }
104
105func (s *PntSubset) AddPoint(p int, input, output *Point) {
106  s.index = append(s.index, p)
107  s.input = append(s.input, *input)
108  s.output = append(s.output, *output)
109}
110
111// using indexes, update the input/output data
112func (s *PntSubset) Refresh() {
113  L := len(s.index)
114  if s.input == nil {
115    s.input = make([]Point, L)
116  }
117  if s.output == nil {
118    s.output = make([]Point, L)
119  }
120
121  for i := 0; i < L; i++ {
122    s.input[i] = *s.ds.Point(s.index[i])
123    if s.index[i]+1 >= s.ds.NumPoints() {
124      continue
125    }
126    s.output[i] = *s.ds.Point(s.index[i] + 1)
127  }
128}
129
130func (trainData *PointSet) InitTrainDataF(indepNames []string, depndNames []string, matrix unsafe.Pointer, nEntries int) {
131  trainData.ReadPointSet("C:/Users/Hansi/Desktop/Data.trn")
132}
133
134func (trainData *PointSet) InitTestDataF(indepNames []string, depndNames []string, matrix unsafe.Pointer, nEntries int) {
135  trainData.ReadPointSet("C:/Users/Hansi/Desktop/Data.tst")
136}
137
138func (trainData *PointSet) InitTrainData(indepNames []string, depndNames []string, matrix unsafe.Pointer, nEntries int) {
139  trainData.indepNames = indepNames  //First line in .trn file
140  trainData.numDim = len(trainData.indepNames)
141  trainData.depndNames = depndNames //Second line in .trn file
142 
143  fmt.Printf("Var Names = %v | %v\n", trainData.depndNames, trainData.indepNames)
144  var nClum int = len(indepNames) + len(depndNames)
145
146  for i := 0; i < nEntries; i++ {
147    var pnt Point
148    for j := 0; j < len(trainData.indepNames); j++ {
149      elemNr := i * nClum + j
150      item := (*float64) (unsafe.Pointer( uintptr(unsafe.Pointer(matrix)) + unsafe.Sizeof(float64(0)) * uintptr(elemNr) ))
151      pnt.indep = append(pnt.indep, *item)
152     
153      if os.Getenv("PGEDEBUG") == "1" {
154        fmt.Printf("Adding independend Test/Train Data (%v,%v): %v\n", i, j, *item)
155      }
156    }
157
158    for j := len(trainData.indepNames); j < (len(trainData.indepNames) + len(trainData.depndNames)); j++ {
159      elemNr := i * nClum + j
160      item := (*float64) (unsafe.Pointer( uintptr(unsafe.Pointer(matrix)) + unsafe.Sizeof(float64(0)) * uintptr(elemNr) ))
161      pnt.depnd = append(pnt.depnd, *item )
162     
163      if os.Getenv("PGEDEBUG") == "1" {
164        fmt.Printf("Adding dependend (right side) Test/Train Data (%v, %v): %v\n", i, j, *item)
165      }
166    }
167
168    if len(pnt.indep) > 0 {
169      trainData.dataPoints = append(trainData.dataPoints, pnt)
170    }
171  }
172  fmt.Printf("Num Points: %v\n", len(trainData.dataPoints))
173 
174  //if os.Getenv("PGEDEBUG") == "2" {
175  //  DebugPrint(trainData)
176  //}
177}
178
179//func (trainData *PointSet) DebugPrint() {
180//  for index, elem in range
181//}
182
183func (d *PointSet) ReadPointSet(filename string) {
184  ftotal, err := os.OpenFile(filename, os.O_RDONLY, 0)
185  if err != nil {
186    fmt.Printf("err: %v\n", err)
187    return
188  }
189  defer ftotal.Close()
190  file := bufio.NewReader(ftotal)
191
192  var word string
193
194  // get independent variables (x_i...)
195  for i := 0; ; i++ {
196    _, err := fmt.Fscanf(file, "%s", &word)
197    if err != nil {
198      break
199    }
200    d.indepNames = append(d.indepNames, word)
201  }
202  d.numDim = len(d.indepNames)
203
204  // get dependent variables (y_j...)
205  for i := 0; ; i++ {
206    _, err := fmt.Fscanf(file, "%s", &word)
207    if err != nil {
208      break
209    }
210    d.depndNames = append(d.depndNames, word)
211  }
212
213  fmt.Printf("Var Names = %v | %v\n", d.depndNames, d.indepNames)
214
215  for i := 0; ; i++ {
216    var pnt Point
217    var dval, ival float64
218    if err != nil {
219      break
220    }
221
222    for j := 0; j < len(d.indepNames); j++ {
223      _, err = fmt.Fscanf(file, "%f", &ival)
224      if err != nil {
225        break
226      }
227      pnt.indep = append(pnt.indep, ival)
228    }
229
230    for j := 0; j < len(d.depndNames); j++ {
231      _, err = fmt.Fscanf(file, "%f\n", &dval)
232      if err != nil {
233        break
234      }
235      pnt.depnd = append(pnt.depnd, dval)
236    }
237
238    if len(pnt.indep) > 0 {
239      d.dataPoints = append(d.dataPoints, pnt)
240    }
241    if i%100 == 0 {
242      fmt.Println("Point(%d): %v\n", i, pnt)
243    }
244  }
245  fmt.Printf("Num Points: %v\n", len(d.dataPoints))
246}
247
248func ReadBytesPointSet(ftotal []byte) (d *PointSet) {
249  d = new(PointSet)
250  // ftotal, err := os.OpenFile(filename, os.O_RDONLY, 0)
251  // if err != nil {
252  //  fmt.Printf("err: %v\n", err)
253  //  return
254  // }
255  // defer ftotal.Close()
256  var err error
257  file := bytes.NewReader(ftotal)
258
259  var word string
260
261  // get independent variables (x_i...)
262  for i := 0; ; i++ {
263    _, err := fmt.Fscanf(file, "%s", &word)
264    if err != nil {
265      break
266    }
267    d.indepNames = append(d.indepNames, word)
268  }
269  d.numDim = len(d.indepNames)
270
271  // get dependent variables (y_j...)
272  for i := 0; ; i++ {
273    _, err := fmt.Fscanf(file, "%s", &word)
274    if err != nil {
275      break
276    }
277    d.depndNames = append(d.depndNames, word)
278  }
279
280  fmt.Printf("Var Names = %v | %v\n", d.depndNames, d.indepNames)
281
282  for i := 0; ; i++ {
283    var pnt Point
284    var dval, ival float64
285    if err != nil {
286      break
287    }
288
289    for j := 0; j < len(d.indepNames); j++ {
290      _, err = fmt.Fscanf(file, "%f", &ival)
291      if err != nil {
292        break
293      }
294      pnt.indep = append(pnt.indep, ival)
295    }
296
297    for j := 0; j < len(d.depndNames); j++ {
298      _, err = fmt.Fscanf(file, "%f\n", &dval)
299      if err != nil {
300        break
301      }
302      pnt.depnd = append(pnt.depnd, dval)
303    }
304
305    if len(pnt.indep) > 0 {
306      d.dataPoints = append(d.dataPoints, pnt)
307    }
308    if i%100 == 0 {
309      fmt.Println("Point(%d): %v\n", i, pnt)
310    }
311  }
312  fmt.Printf("Num Points: %v\n", len(d.dataPoints))
313  return d
314}
315
316func (d *PointSet) WritePointSet(filename string) {
317  fmt.Printf("Writing file: %s\n", filename)
318  ftotal, err := os.Create(filename)
319  if err != nil {
320    fmt.Printf("Error creating file: %s  %v\n", filename, err)
321    return
322  }
323  defer ftotal.Close()
324  file := bufio.NewWriter(ftotal)
325  defer file.Flush()
326
327  // write independent variable names (x_i...)
328  for i := 0; i < d.NumIndep(); i++ {
329    _, err := fmt.Fprintf(file, "%s ", d.IndepName(i))
330    if err != nil {
331      fmt.Errorf("error writing pointset to file: %v\n", err)
332      break
333    }
334  }
335  fmt.Fprintln(file)
336
337  // trite dependent variable names (y_j...)
338  for i := 0; i < d.NumDepnd(); i++ {
339    _, err := fmt.Fprintf(file, "%s ", d.DepndName(i))
340    if err != nil {
341      break
342    }
343  }
344  fmt.Fprintln(file)
345
346  // write points
347  points := d.Points()
348  for i := 0; i < d.NumPoints(); i++ {
349    indep := points[i].Indeps()
350    depnd := points[i].Depnds()
351    for j := 0; j < len(indep); j++ {
352      _, err = fmt.Fprintf(file, "%f ", indep[j])
353      if err != nil {
354        break
355      }
356    }
357    for j := 0; j < len(depnd); j++ {
358      _, err = fmt.Fprintf(file, "%f ", depnd[j])
359      if err != nil {
360        break
361      }
362    }
363    fmt.Fprintln(file)
364  }
365}
366
367func SplitPointSetTrainTest(pnts *PointSet, pcnt_train float64, seed int) (train, test *PointSet) {
368
369  train = new(PointSet)
370  test = new(PointSet)
371
372  train.filename, test.filename = pnts.filename, pnts.filename
373  train.id, test.id = pnts.id, pnts.id
374  train.indepNames, test.indepNames = pnts.indepNames, pnts.indepNames
375  train.depndNames, test.depndNames = pnts.depndNames, pnts.depndNames
376  train.sysNames, test.sysNames = pnts.sysNames, pnts.sysNames
377  train.sysVals, test.sysVals = pnts.sysVals, pnts.sysVals
378
379  L := len(pnts.dataPoints)
380  Tst := int(float64(L) * (1.0 - pcnt_train))
381
382  tmp := make([]Point, L)
383  copy(tmp, pnts.dataPoints)
384
385  rand.Seed(int64(seed))
386
387  for i := 0; i < Tst; i++ {
388    p := rand.Intn(L - i)
389    tmp[i], tmp[p] = tmp[p], tmp[i]
390  }
391
392  test.dataPoints = tmp[:Tst]
393  train.dataPoints = tmp[Tst:]
394
395  return
396}
397
398func (d *PointSet) ReadLakeFile(filename string) {
399  ftotal, err := os.OpenFile(filename, os.O_RDONLY, 0)
400  if err != nil {
401    fmt.Printf("err: %v\n", err)
402    return
403  }
404  defer ftotal.Close()
405  file := bufio.NewReader(ftotal)
406
407  var word string
408
409  // get independent variables (x_i...)
410  for i := 0; ; i++ {
411    _, err := fmt.Fscanf(file, "%s", &word)
412    if err != nil {
413      break
414    }
415    d.indepNames = append(d.indepNames, word)
416  }
417  d.numDim = len(d.indepNames)
418
419  // // get dependent variables (y_j...)
420  // for i := 0; ; i++ {
421  //  _, err := fmt.Fscanf(file, "%s", &word)
422  //  if err != nil {
423  //    break
424  //  }
425  //  d.depndNames = append(d.depndNames, word)
426  // }
427
428  // remove time names from indepNames
429  d.indepNames = d.indepNames[2:]
430
431  // fmt.Printf("Var Names = %v | %v\n", d.depndNames, d.indepNames)
432
433  for i := 0; ; i++ {
434    var pnt Point
435    var ival float64
436    if err != nil {
437      break
438    }
439
440    // read time values and disgaurd
441    var dummy string
442    for j := 0; j < 2; j++ {
443      _, err = fmt.Fscanf(file, "%s", &dummy)
444      // fmt.Println(i, dummy)
445      if err != nil {
446        // fmt.Println("err:", err)
447        break
448      }
449    }
450
451    // append dummy time value
452    pnt.indep = append(pnt.indep, -0.1)
453
454    // read real data
455    for j := 0; j < len(d.indepNames); j++ {
456      _, err = fmt.Fscanf(file, "%f\n", &ival)
457      if err != nil {
458        // fmt.Println("err:", err)
459        break
460      }
461      // hack for PAR
462      // if j == 1 {
463      //  ival /= 2000.0 * (math.Pi / 2.0)
464      // }
465
466      pnt.indep = append(pnt.indep, ival)
467    }
468
469    // fmt.Println(i, pnt)
470
471    // for j := 0; j < len(d.depndNames); j++ {
472    //  _, err = fmt.Fscanf(file, "%f\n", &dval)
473    //  if err != nil {
474    //    break
475    //  }
476    //  pnt.depnd = append(pnt.depnd, dval)
477    // }
478
479    if len(pnt.indep) > 1 {
480      for p := 0; p < len(pnt.indep); p++ {
481        if math.IsNaN(pnt.indep[p]) {
482          // fmt.Println("NaN @ ", i, p)
483          goto skip
484        }
485      }
486      d.dataPoints = append(d.dataPoints, pnt)
487    }
488  skip:
489  }
490
491  // calculate numerical derivatives
492  calcDerivs(d.dataPoints)
493
494  for i := 0; i < len(d.dataPoints); i++ {
495    p := d.dataPoints[i]
496    if len(p.Indeps()) == 0 || len(p.Depnds()) == 0 {
497      fmt.Println("Bad Point @", i)
498    }
499    // if i%100 == 0 {
500    //  fmt.Printf("Point(%d): %v\n", i, d.dataPoints[i])
501    // }
502  }
503
504  // fmt.Printf("Num Points: %v\n", len(d.dataPoints))
505}
506
507/* Calculate the first derivative of four points with: h = 0.25
508 * (from: http://www.trentfguidry.net/post/2010/09/04/Numerical-differentiation-formulas.aspx)
509 *
510 * xF0 = ( -3.0*xF4 + 16.0*xF3 - 36.0*xF2 + 48.0*xF1 - 25.0*xF0) / (12.0*h)
511 * xF1 = (      xF4 -  6.0-xF3 + 18.0*xF2 - 10.0*xF1 -  3.0*xF0) / (12.0*h)
512 * xF2 = (     -xF4 +  8.0*xF3           -   8.0*xF1 +      xF0) / (12.0*h)
513 * xF3 = (  3.0*xF4 + 10.0*xF3 - 18.0*xF2 +  6.0*xF1 -      xF0) / (12.0*h)
514 * xF4 = ( 25.0*xF4 - 48.0*xF3 + 36.0*xF2 - 16.0*xF1 +  3.0*xF0) / (12.0*h)
515 */
516func calcDerivs(pts []Point) {
517  h := 24.0 // / (24.0 * 60.0)
518
519  NP := len(pts)
520  ND := pts[0].NumIndep()
521
522  // for summing and averaging on a point/variable~wise basis
523  cnts := make([][]int, NP)
524  vals := make([][]float64, NP)
525  for p := 0; p < NP; p++ {
526    cnts[p] = make([]int, ND)
527    vals[p] = make([]float64, ND)
528  }
529
530  for p := 0; p < NP-5; p++ {
531    for i := 0; i < ND; i++ {
532      var F [5]float64
533      var dF [5]float64
534
535      for j := 0; j < 5; j++ {
536        F[j] = pts[p+j].Indep(i)
537      }
538      dF[0] = (-3.0*F[4] + 16.0*F[3] - 36.0*F[2] + 48.0*F[1] - 25.0*F[0]) / (12.0 * h)
539      dF[1] = (F[4] - 6.0 - F[3] + 18.0*F[2] - 10.0*F[1] - 3.0*F[0]) / (12.0 * h)
540      dF[2] = (-F[4] + 8.0*F[3] - 8.0*F[1] + F[0]) / (12.0 * h)
541      dF[3] = (3.0*F[4] + 10.0*F[3] - 18.0*F[2] + 6.0*F[1] - F[0]) / (12.0 * h)
542      dF[4] = (25.0*F[4] - 48.0*F[3] + 36.0*F[2] - 16.0*F[1] + 3.0*F[0]) / (12.0 * h)
543
544      for j := 0; j < 5; j++ {
545
546        vals[p+j][i] += dF[j]
547        cnts[p+j][i]++
548      }
549    }
550  }
551
552  for p := 0; p < NP; p++ {
553    depnds := make([]float64, ND)
554    for i := 0; i < ND; i++ {
555      depnds[i] = vals[p][i] / float64(cnts[p][i])
556    }
557    pts[p].SetDepnds(depnds)
558  }
559}
Note: See TracBrowser for help on using the repository browser.