Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/ClusteringScript.R @ 16188

Last change on this file since 16188 was 16180, checked in by gkronber, 7 years ago

changed output format for maps for the GPTP paper

File size: 8.4 KB
Line 
1library(largeVis)
2library(ggplot2)
3library(dplyr)
4
5#setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
6setwd("C:/reps/HEAL/Publications-2018-GPTP/data");
7sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz";
8
9# read from CSV and store as R binary (must be done once to produce the .rds file)
10#evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
11#saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
12
13# read from R binary (faster)
14evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
15
16nrow(evalData[evalData$R2.keijzer4 > 0.4,])
17
18max(evalData$R2.keijzer4);
19max(evalData$R2.keijzer9);
20max(evalData$R2.pagie);
21max(evalData$R2.nguyen5);
22max(evalData$R2.nguyen6);
23max(evalData$R2.nguyen7);
24
25
26#check zero mean, unit variance
27#mean(t(outputs[2,]))
28#sd(t(outputs[2,]))
29
30# check
31# plot(t(outputs[4,]))
32
33#apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)
34# check ANN
35#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
36#xs <- rep(seq(1:100),100)
37#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
38
39#edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
40#clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);
41
42# check cluster
43#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
44#xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
45#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
46
47
48lv <- largeVis(t(outputs), dim=2, distance_method="Cosine",
49               perplexity=100, K = 300, n_trees = 50, threads=4,
50               verbose=TRUE) ;
51#clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 300);
52
53#use clusters calculated with C# program
54clusters <- read.csv2("180801_clusters.txt", header = TRUE, sep=';')
55
56# calculate quality distribution for each cluster
57qualities <- evalData$R2.keijzer4;
58clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$ClusterId, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
59
60clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ;
61clusterQualityStdDev <- clusterQualities %>% group_by(Clusters) %>% summarize(StdDevQuality = sd(Qualities));
62clusterQualityCount <- clusterQualities %>% group_by(Clusters) %>% summarize(Count = n());
63clusterXCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanX = mean(x));
64clusterYCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanY = mean(y));
65clusterStats <- clusterQualityAvg %>% full_join(clusterQualityStdDev, by="Clusters") %>% full_join(clusterQualityCount, by="Clusters") %>% full_join(clusterXCenter, by ="Clusters") %>% full_join(clusterYCenter, by="Clusters");
66clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality));
67clusterStats$Rank <- seq(1:nrow(clusterStats));
68
69# write.csv2(clusters$clusters, "cluster_assignment_180518.csv", sep = " ", dec = ".");
70
71##output target function for paper
72## using only one of the following
73#xi <- seq(0,9.99,0.1);
74#target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1);
75#problemName <- "keijzer-4"
76#target <- target_keijzer4
77#
78#
79#xi <- seq(0, 99, 1)
80#target_keijzer9 <- log(xi + sqrt(xi*xi+1))
81#problemName <- "keijzer-9" 
82#target <- target_keijzer9
83#
84#xi <- seq(-5, 4.9, 0.1)
85#target_pagie1d <- 1 / (1+xi^-4)
86#problemName <- "pagie-1d" 
87#target <- target_pagie1d
88#
89#xi <- seq(-1, 1, 0.0201)
90#target_nguyen5 <- sin(xi*xi)*cos(xi)-1
91#problemName <- "nguyen-5" 
92#target <- target_nguyen5
93#
94#xi <- seq(-1, 1, 0.0201)
95#target_nguyen6 <- sin(xi)+sin(xi+xi*xi)
96#problemName <- "nguyen-6" 
97#target <- target_nguyen6
98
99target_df <- data.frame(xi, target)
100ggplot(target_df, aes(x=xi, y=target)) +
101  theme_void() +
102  geom_line(alpha=1, size = 0.3, color='red');
103ggsave(gsub(" ", "", paste(problemName, ".pdf")),
104       device="pdf",width=2, height=1.5, units="cm");
105
106for(i in seq(1:10)) {
107  #i <- 1
108  clusterNumber <- clusterStats$Clusters[i] # number of cluster with smallest quality (error!)
109  #i <- 9358
110  cluster_n <- dplyr::filter(m, c==clusterNumber);
111  for(j in seq(1,nrow(cluster_n))) {
112    #    j <- 2
113    other <- t(cluster_n[j,])[5:104]
114    max(other)
115    temp <- data.frame(y=as.vector(target), x=other)
116    cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
117  }
118  cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
119  evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
120 
121  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
122 
123  size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==clusterNumber]
124
125    ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
126    theme_void() +
127    geom_line(alpha=max(0.1, 1/size), size = 0.2);
128 
129  ggsave(gsub(" ", "", paste(problemName,"-",as.character(i),
130                       ".pdf")),
131         device="pdf",width=2, height=1.5, units="cm");
132}
133
134
135#funs_in_cluster <- t(outputs)[,clusters$clusterId==clusterNumber]
136#cor(method="pearson", target_keijzer4, t(outputs[18450, ]))
137#plot(t(outputs)[,clusters$ClusterId==clusterNumber], target_keijzer4)
138
139m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$ClusterId, q=qualities, outputs)
140m_sub <- m[m$q<1.0,];
141
142m <- dplyr::arrange(m, q);
143
144# plot mapped points (clusters)
145ggplot(data=m[,], aes(x=x, y=y)) +
146  geom_point(aes(color=q))+
147  scale_color_gradient2(low="blue", mid="yellow", high="red", midpoint=0.6) +
148  scale_size_area(max_size=1) +
149  labs(color="R²")
150#  theme(legend.position = "none")
151#  scale_color_gradient(low = "red",high = "black")
152;
153ggsave("phenotypic_clusters.pdf", device="pdf", width=11.66, height=5.9, units="cm", dpi=600, scale=2)
154
155# qualities in clusters
156#
157ggplot(data=m, aes(x=c, y=q)) +
158  geom_boxplot(aes(group=c));
159 
160
161
162# plot mapped points (qualities)
163#ggplot(data=m, aes(x=x, y=y)) +
164#  geom_point(aes(color=q))  +
165#  scale_color_gradientn(colors=heat.colors(30))
166#;
167
168
169#write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
170m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
171
172# produce all clusters (TODO: order by cluster size)
173clusterSizes <- group_by(clusters, clusters$ClusterId) %>% count()
174clusterSizes <- dplyr::arrange(clusterSizes, clusterSizes$`clusters$ClusterId`)
175
176#TODO: check 2, 10
177#check 143292, 122983
178#check 118734, 118970
179#cor(method='pearson',t(outputs[81824,]), t(outputs[92710,]))^2
180#plot(t(outputs[122735,]))
181
182#clusters
183picIdx <- 0
184for(i in clusterSizes$`clusters$ClusterId`) {
185  #i <- 9358
186  representative <- t(outputs[i+1,])
187  #representative
188  cluster_n <- dplyr::filter(m, c==i);
189  for(j in seq(1,nrow(cluster_n))) {
190#    j <- 2
191    other <- t(cluster_n[j,])[5:104]
192    max(other)
193    temp <- data.frame(y=as.vector(representative), x=other)
194    cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
195  }
196  cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
197  evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
198
199  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
200 
201  ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
202    theme_void() +
203    geom_line(alpha=1, size = 1);
204  size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==i]
205  row_num <- picIdx %/% 50
206  ggsave(paste(as.character(row_num),
207               as.character(picIdx- row_num*50),
208               as.character(i),
209         #as.character(round(clusterStats$meanX[clusterStats$Clusters==i], 3)),
210         #as.character(round(clusterStats$meanY[clusterStats$Clusters==i], 3)),
211              ".png"),
212         width=4, height=3);
213  picIdx <- picIdx+1
214}
215
216
217# inspect one cluster
218cluster_n <- dplyr::filter(m, c==0);
219cluster_evals <- data.frame(x=seq(1,100,1), t(cluster_n[,5:104]))
220evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
221
222p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line();
223p
224
225# plot ranked clusters
226ggplot(clusterStats, aes(x=Rank, y=AvgQuality)) +
227  geom_point() +
228  labs(y='Avg R²', x=paste('Cluster rank','-',problemName))
229ggsave(gsub(" ", "", paste(problemName,"-cluster-ranks.pdf")),
230       device="pdf",
231       width=6, height=4.5, units="cm"
232       )
233
Note: See TracBrowser for help on using the repository browser.