# Find nearest neighbors for GP individuals.
#
# For every generation of a GP run, each individual is mapped to the row of
# `m` whose evaluations have the highest squared Pearson correlation with the
# individual's evaluations. Per generation the script saves a scatter plot, a
# cluster-frequency plot and a hexbin map of the mapped individuals, and it
# records the number of distinct neighbors and the median cluster rank in
# `stats`, which is plotted at the end.
#
# Prerequisites (not created by this script, must exist in the workspace):
#   m            - table with at least columns `c`, `x`, `y`; columns 5:104
#                  are used as the evaluation vectors.
#   clusterStats - table with at least columns `Clusters` and `Rank`.

library(dplyr)
library(hexbin)
library(ggplot2)
# library(doSNOW)  # dopar
# library(foreach) # dopar

# cl <- makeCluster(2) # change the 2 to your number of CPU cores
# registerDoSNOW(cl)

# Fail early with a readable message instead of "object 'm' not found" later.
if (!exists("m") || !exists("clusterStats")) {
  stop("`m` and `clusterStats` must be loaded before running this script.",
       call. = FALSE)
}

# NOTE(review): machine-specific absolute path; all input and output files
# below are resolved relative to this directory.
setwd("C:/Users/P24581/filebox/GPTP 2018/Keijzer4")

gp_log <- read.csv2("symbreg-models-13.05.2018-1412.csv.gz",
                    header = FALSE, sep = "\t", dec = ",")
idx <- seq_len(nrow(gp_log))

# check popSize in gp_log
gp_log[seq(1, 20500, 500), 1]
popSize <- 500

# check qualities
# ggplot(gp_log, aes(x=idx,y=gp_log$V1)) + geom_line();

# generations <- seq(1,34000/popSize,1);
generations <- seq(1, 41, 1)

# Guard against a log that is shorter than generations * popSize; indexing
# past the end would silently produce all-NA rows (and arbitrary neighbors).
if (nrow(gp_log) < max(generations) * popSize) {
  stop("gp_log has ", nrow(gp_log), " rows but ",
       max(generations) * popSize, " are required.", call. = FALSE)
}

numClusters <- max(m$c)

# Every second column starting at column 3 (100 columns) holds the
# evaluations of a GP individual.
gp_evals <- gp_log[, seq(3, 202, 2)]
# NOTE(review): column 5 of `m` is used both here and in the metadata
# selection `m[, 1:5]` below - confirm that this overlap is intended.
all_evals <- m[, 5:104]

# Transposing the reference evaluations does not depend on the generation,
# so do it once instead of once per loop iteration.
all_evals_t <- t(all_evals)

# Per-generation summary; NA marks generations that were not processed.
stats <- data.frame(gen = generations,
                    numDiffClusters = NA_real_,
                    medianRank = NA_real_)

for (gen_i in generations) {
  # Rows of gp_log that belong to generation gen_i.
  selectedRows <- seq((gen_i - 1) * popSize + 1, gen_i * popSize, 1)

  # Squared correlation between every row of `m` (rows of xcorrel) and every
  # individual of this generation (columns of xcorrel).
  xcorrel <- cor(all_evals_t, t(gp_evals[selectedRows, ]))^2
  xcorrel[is.na(xcorrel)] <- 0 # for constant expressions

  # Index (row of `m`) of the best-correlated neighbor per individual.
  # NOTE(review): max.col() breaks ties at random by default, so individuals
  # with an all-zero column (constant expressions) get a random neighbor and
  # results are not exactly reproducible between runs.
  idxOfMaxCorrel <- data.frame(idx = max.col(t(xcorrel)))

  # number of different nearest neighbours
  # A plain scalar is stored here; storing the 1x1 data frame returned by
  # count(distinct(...)) would silently turn the stats column into a list.
  numDistinctNeighbors <- length(unique(idxOfMaxCorrel$idx))
  stats$numDiffClusters[gen_i] <- numDistinctNeighbors

  mapped_gp_log <- m[idxOfMaxCorrel$idx, 1:5]
  # check
  # cor(t(all_evals[128082,]), t(gp_evals[2,]))^2
  # max(cor(t(all_evals[,]), t(gp_evals[2,]))^2)

  # Plots created inside a loop are not auto-printed, so each plot is kept in
  # a variable and handed to ggsave() explicitly instead of via last_plot().
  # NOTE: paste() separates its arguments with spaces, so the files are named
  # e.g. "scatter 1 .png"; the names are kept as they are for compatibility
  # with existing outputs.
  scatter_plot <- ggplot(mapped_gp_log, aes(x = x, y = y)) +
    xlim(-75, 75) +
    ylim(-75, 75) +
    labs(caption = paste("# distinct neighbors", numDistinctNeighbors)) +
    geom_point()
  ggsave(paste("scatter", gen_i, ".png"), plot = scatter_plot)

  # bestFunc <- data.frame(x = seq(0,9.8,0.1),
  #                        t(m[idxOfMaxCorrel$idx[1], 6:104]),
  #                        t(gp_evals[selectedRows[1],1:99]));
  # # best neighbor
  # ggplot(bestFunc, aes(x=x)) +
  #   geom_line(aes(y=scale.default(bestFunc[,2]), color=rep("Nearest Neighbor",nrow(bestFunc)))) +
  #   geom_line(aes(y=scale.default(bestFunc[,3]), color=rep("Best GP Solution",nrow(bestFunc)))) +
  #   guides(fill="legend") +
  #   labs(y="f(x)", color="Function");
  # ggsave(paste("best_function",gen_i,".png"))

  # count number of unclustered
  # count(m, m$c==-1)

  # plot expr freq by ranks
  mapped_gp_log_with_clusters <- dplyr::inner_join(
    mapped_gp_log, clusterStats,
    by = c("c" = "Clusters")
  )
  stats$medianRank[gen_i] <- median(mapped_gp_log_with_clusters$Rank)

  # Number of mapped individuals per cluster, joined back to the cluster
  # statistics to obtain each cluster's rank.
  mapped_gp_log_grouped <- mapped_gp_log_with_clusters %>%
    group_by(c) %>%
    summarise(count = n())
  mapped_gp_log_grouped <- dplyr::inner_join(
    mapped_gp_log_grouped, clusterStats,
    by = c("c" = "Clusters")
  )
  cluster_freq_plot <- ggplot(mapped_gp_log_grouped,
                              aes(x = Rank, y = count)) +
    scale_y_log10() +
    xlim(0, 17540) +
    geom_point()
  ggsave(paste("cluster-freqs", gen_i, ".pdf"),
         plot = cluster_freq_plot, device = pdf, width = 3)

  # plot hex map
  hex_plot <- ggplot(mapped_gp_log, aes(x = x, y = y)) +
    xlim(-60, 60) +
    ylim(-60, 60) +
    theme_void() +
    geom_hex()
  ggsave(paste("hexbin", gen_i, ".pdf"),
         plot = hex_plot, device = pdf, width = 3, height = 2)

  # ggplot(mapped_gp_log, aes(x=c)) +
  #   xlim(-2,numClusters+1) +
  #   geom_histogram(binwidth = 1) +
  #   ylim(0,500);
  # ggsave(paste("cluster_freq",gen_i,".png"))
}

# plot line chart of stats
# NOTE(review): xlim(1, 40) drops the last generation (gen 41) from both
# line charts - confirm that this is intended.
median_rank_plot <- ggplot(stats, aes(x = gen)) +
  xlim(1, 40) +
  geom_line(aes(y = medianRank)) +
  theme_classic() +
  labs(x = "Generations", y = "Median cluster rank") +
  scale_y_log10()
median_rank_plot # shown when the script is run at top level
ggsave("gp-medianClusterRank.pdf", plot = median_rank_plot, device = pdf)

print(stats$numDiffClusters)

num_clusters_plot <- ggplot(stats, aes(x = gen, y = numDiffClusters)) +
  xlim(1, 40) +
  geom_line() +
  theme_classic() +
  labs(x = "Generations", y = "Number of explored clusters")
num_clusters_plot # shown when the script is run at top level
ggsave("gp-numClusters.pdf", plot = num_clusters_plot, device = pdf)