1 | library(largeVis)
|
---|
2 | library(ggplot2)
|
---|
3 | library(dplyr)
|
---|
4 |
|
---|
5 | #setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
|
---|
6 | setwd("C:/reps/HEAL/Publications-2018-GPTP/data");
|
---|
7 | sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz";
|
---|
8 |
|
---|
9 | # read from CSV and store as R binary (must be done once to produce the .rds file)
|
---|
10 | #evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
|
---|
11 | #saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
|
---|
12 |
|
---|
13 | # read from R binary (faster)
|
---|
14 | evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
|
---|
15 |
|
---|
16 | nrow(evalData[evalData$R2.keijzer4 > 0.4,])
|
---|
17 |
|
---|
18 | max(evalData$R2.keijzer4);
|
---|
19 | max(evalData$R2.keijzer9);
|
---|
20 | max(evalData$R2.pagie);
|
---|
21 | max(evalData$R2.nguyen5);
|
---|
22 | max(evalData$R2.nguyen6);
|
---|
23 | max(evalData$R2.nguyen7);
|
---|
24 |
|
---|
25 |
|
---|
26 | #check zero mean, unit variance
|
---|
27 | #mean(t(outputs[2,]))
|
---|
28 | #sd(t(outputs[2,]))
|
---|
29 |
|
---|
30 | # check
|
---|
31 | # plot(t(outputs[4,]))
|
---|
32 |
|
---|
33 | #apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)
|
---|
34 | # check ANN
|
---|
35 | #cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
|
---|
36 | #xs <- rep(seq(1:100),100)
|
---|
37 | #ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
|
---|
38 |
|
---|
39 | #edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
|
---|
40 | #clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);
|
---|
41 |
|
---|
42 | # check cluster
|
---|
43 | #cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
|
---|
44 | #xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
|
---|
45 | #ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
|
---|
46 |
|
---|
47 |
|
---|
48 | lv <- largeVis(t(outputs), dim=2, distance_method="Cosine",
|
---|
49 | perplexity=100, K = 300, n_trees = 50, threads=4,
|
---|
50 | verbose=TRUE) ;
|
---|
51 | #clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 300);
|
---|
52 |
|
---|
53 | #use clusters calculated with C# program
|
---|
54 | clusters <- read.csv2("180801_clusters.txt", header = TRUE, sep=';')
|
---|
55 |
|
---|
56 | # calculate quality distribution for each cluster
|
---|
57 | qualities <- evalData$R2.keijzer4;
|
---|
58 | clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$ClusterId, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
|
---|
59 |
|
---|
60 | clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ;
|
---|
61 | clusterQualityStdDev <- clusterQualities %>% group_by(Clusters) %>% summarize(StdDevQuality = sd(Qualities));
|
---|
62 | clusterQualityCount <- clusterQualities %>% group_by(Clusters) %>% summarize(Count = n());
|
---|
63 | clusterXCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanX = mean(x));
|
---|
64 | clusterYCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanY = mean(y));
|
---|
65 | clusterStats <- clusterQualityAvg %>% full_join(clusterQualityStdDev, by="Clusters") %>% full_join(clusterQualityCount, by="Clusters") %>% full_join(clusterXCenter, by ="Clusters") %>% full_join(clusterYCenter, by="Clusters");
|
---|
66 | clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality));
|
---|
67 | clusterStats$Rank <- seq(1:nrow(clusterStats));
|
---|
68 |
|
---|
69 | # write.csv2(clusters$clusters, "cluster_assignment_180518.csv", sep = " ", dec = ".");
|
---|
70 |
|
---|
71 | ##output target function for paper
|
---|
72 | ## using only one of the following
|
---|
73 | #xi <- seq(0,9.99,0.1);
|
---|
74 | #target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1);
|
---|
75 | #problemName <- "keijzer-4"
|
---|
76 | #target <- target_keijzer4
|
---|
77 | #
|
---|
78 | #
|
---|
79 | #xi <- seq(0, 99, 1)
|
---|
80 | #target_keijzer9 <- log(xi + sqrt(xi*xi+1))
|
---|
81 | #problemName <- "keijzer-9"
|
---|
82 | #target <- target_keijzer9
|
---|
83 | #
|
---|
84 | #xi <- seq(-5, 4.9, 0.1)
|
---|
85 | #target_pagie1d <- 1 / (1+xi^-4)
|
---|
86 | #problemName <- "pagie-1d"
|
---|
87 | #target <- target_pagie1d
|
---|
88 | #
|
---|
89 | #xi <- seq(-1, 1, 0.0201)
|
---|
90 | #target_nguyen5 <- sin(xi*xi)*cos(xi)-1
|
---|
91 | #problemName <- "nguyen-5"
|
---|
92 | #target <- target_nguyen5
|
---|
93 | #
|
---|
94 | #xi <- seq(-1, 1, 0.0201)
|
---|
95 | #target_nguyen6 <- sin(xi)+sin(xi+xi*xi)
|
---|
96 | #problemName <- "nguyen-6"
|
---|
97 | #target <- target_nguyen6
|
---|
98 |
|
---|
99 | target_df <- data.frame(xi, target)
|
---|
100 | ggplot(target_df, aes(x=xi, y=target)) +
|
---|
101 | theme_void() +
|
---|
102 | geom_line(alpha=1, size = 0.3, color='red');
|
---|
103 | ggsave(gsub(" ", "", paste(problemName, ".pdf")),
|
---|
104 | device="pdf",width=2, height=1.5, units="cm");
|
---|
105 |
|
---|
106 | for(i in seq(1:10)) {
|
---|
107 | #i <- 1
|
---|
108 | clusterNumber <- clusterStats$Clusters[i] # number of cluster with smallest quality (error!)
|
---|
109 | #i <- 9358
|
---|
110 | cluster_n <- dplyr::filter(m, c==clusterNumber);
|
---|
111 | for(j in seq(1,nrow(cluster_n))) {
|
---|
112 | # j <- 2
|
---|
113 | other <- t(cluster_n[j,])[5:104]
|
---|
114 | max(other)
|
---|
115 | temp <- data.frame(y=as.vector(target), x=other)
|
---|
116 | cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
|
---|
117 | }
|
---|
118 | cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
|
---|
119 | evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
|
---|
120 |
|
---|
121 | #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
|
---|
122 |
|
---|
123 | size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==clusterNumber]
|
---|
124 |
|
---|
125 | ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
|
---|
126 | theme_void() +
|
---|
127 | geom_line(alpha=max(0.1, 1/size), size = 0.2);
|
---|
128 |
|
---|
129 | ggsave(gsub(" ", "", paste(problemName,"-",as.character(i),
|
---|
130 | ".pdf")),
|
---|
131 | device="pdf",width=2, height=1.5, units="cm");
|
---|
132 | }
|
---|
133 |
|
---|
134 |
|
---|
135 | #funs_in_cluster <- t(outputs)[,clusters$clusterId==clusterNumber]
|
---|
136 | #cor(method="pearson", target_keijzer4, t(outputs[18450, ]))
|
---|
137 | #plot(t(outputs)[,clusters$ClusterId==clusterNumber], target_keijzer4)
|
---|
138 |
|
---|
139 | m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$ClusterId, q=qualities, outputs)
|
---|
140 | m_sub <- m[m$q<1.0,];
|
---|
141 |
|
---|
142 | m <- dplyr::arrange(m, q);
|
---|
143 |
|
---|
144 | # plot mapped points (clusters)
|
---|
145 | ggplot(data=m[,], aes(x=x, y=y)) +
|
---|
146 | geom_point(aes(color=q))+
|
---|
147 | scale_color_gradient2(low="blue", mid="yellow", high="red", midpoint=0.6) +
|
---|
148 | scale_size_area(max_size=1) +
|
---|
149 | labs(color="R²")
|
---|
150 | # theme(legend.position = "none")
|
---|
151 | # scale_color_gradient(low = "red",high = "black")
|
---|
152 | ;
|
---|
153 | ggsave("phenotypic_clusters.pdf", device="pdf", width=11.66, height=5.9, units="cm", dpi=600, scale=2)
|
---|
154 |
|
---|
155 | # qualities in clusters
|
---|
156 | #
|
---|
157 | ggplot(data=m, aes(x=c, y=q)) +
|
---|
158 | geom_boxplot(aes(group=c));
|
---|
159 |
|
---|
160 |
|
---|
161 |
|
---|
162 | # plot mapped points (qualities)
|
---|
163 | #ggplot(data=m, aes(x=x, y=y)) +
|
---|
164 | # geom_point(aes(color=q)) +
|
---|
165 | # scale_color_gradientn(colors=heat.colors(30))
|
---|
166 | #;
|
---|
167 |
|
---|
168 |
|
---|
169 | #write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
|
---|
170 | m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
|
---|
171 |
|
---|
172 | # produce all clusters (TODO: order by cluster size)
|
---|
173 | clusterSizes <- group_by(clusters, clusters$ClusterId) %>% count()
|
---|
174 | clusterSizes <- dplyr::arrange(clusterSizes, clusterSizes$`clusters$ClusterId`)
|
---|
175 |
|
---|
176 | #TODO: check 2, 10
|
---|
177 | #check 143292, 122983
|
---|
178 | #check 118734, 118970
|
---|
179 | #cor(method='pearson',t(outputs[81824,]), t(outputs[92710,]))^2
|
---|
180 | #plot(t(outputs[122735,]))
|
---|
181 |
|
---|
182 | #clusters
|
---|
183 | picIdx <- 0
|
---|
184 | for(i in clusterSizes$`clusters$ClusterId`) {
|
---|
185 | #i <- 9358
|
---|
186 | representative <- t(outputs[i+1,])
|
---|
187 | #representative
|
---|
188 | cluster_n <- dplyr::filter(m, c==i);
|
---|
189 | for(j in seq(1,nrow(cluster_n))) {
|
---|
190 | # j <- 2
|
---|
191 | other <- t(cluster_n[j,])[5:104]
|
---|
192 | max(other)
|
---|
193 | temp <- data.frame(y=as.vector(representative), x=other)
|
---|
194 | cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
|
---|
195 | }
|
---|
196 | cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
|
---|
197 | evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
|
---|
198 |
|
---|
199 | #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
|
---|
200 |
|
---|
201 | ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
|
---|
202 | theme_void() +
|
---|
203 | geom_line(alpha=1, size = 1);
|
---|
204 | size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==i]
|
---|
205 | row_num <- picIdx %/% 50
|
---|
206 | ggsave(paste(as.character(row_num),
|
---|
207 | as.character(picIdx- row_num*50),
|
---|
208 | as.character(i),
|
---|
209 | #as.character(round(clusterStats$meanX[clusterStats$Clusters==i], 3)),
|
---|
210 | #as.character(round(clusterStats$meanY[clusterStats$Clusters==i], 3)),
|
---|
211 | ".png"),
|
---|
212 | width=4, height=3);
|
---|
213 | picIdx <- picIdx+1
|
---|
214 | }
|
---|
215 |
|
---|
216 |
|
---|
217 | # inspect one cluster
|
---|
218 | cluster_n <- dplyr::filter(m, c==0);
|
---|
219 | cluster_evals <- data.frame(x=seq(1,100,1), t(cluster_n[,5:104]))
|
---|
220 | evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
|
---|
221 |
|
---|
222 | p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line();
|
---|
223 | p
|
---|
224 |
|
---|
225 | # plot ranked clusters
|
---|
226 | ggplot(clusterStats, aes(x=Rank, y=AvgQuality)) +
|
---|
227 | geom_point() +
|
---|
228 | labs(y='Avg R²', x=paste('Cluster rank','-',problemName))
|
---|
229 | ggsave(gsub(" ", "", paste(problemName,"-cluster-ranks.pdf")),
|
---|
230 | device="pdf",
|
---|
231 | width=6, height=4.5, units="cm"
|
---|
232 | )
|
---|
233 |
|
---|