Context Navigation

ClusteringScript.R @ 16188

Visit:

Last change on this file since 16188 was 16180, checked in by gkronber, 7 years ago
changed output format for maps for the GPTP paper
File size: 8.4 KB

Line
1	library(largeVis)
2	library(ggplot2)
3	library(dplyr)
4
5	#setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
6	setwd("C:/reps/HEAL/Publications-2018-GPTP/data");
7	sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz";
8
9	# read from CSV and store as R binary (must be done once to produce the .rds file)
10	#evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
11	#saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
12
13	# read from R binary (faster)
14	evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
15
16	nrow(evalData[evalData$R2.keijzer4 > 0.4,])
17
18	max(evalData$R2.keijzer4);
19	max(evalData$R2.keijzer9);
20	max(evalData$R2.pagie);
21	max(evalData$R2.nguyen5);
22	max(evalData$R2.nguyen6);
23	max(evalData$R2.nguyen7);
24
25
26	#check zero mean, unit variance
27	#mean(t(outputs[2,]))
28	#sd(t(outputs[2,]))
29
30	# check
31	# plot(t(outputs[4,]))
32
33	#apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)
34	# check ANN
35	#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
36	#xs <- rep(seq(1:100),100)
37	#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
38
39	#edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
40	#clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);
41
42	# check cluster
43	#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
44	#xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
45	#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
46
47
48	lv <- largeVis(t(outputs), dim=2, distance_method="Cosine",
49	perplexity=100, K = 300, n_trees = 50, threads=4,
50	verbose=TRUE) ;
51	#clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 300);
52
53	#use clusters calculated with C# program
54	clusters <- read.csv2("180801_clusters.txt", header = TRUE, sep=';')
55
56	# calculate quality distribution for each cluster
57	qualities <- evalData$R2.keijzer4;
58	clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$ClusterId, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
59
60	clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ;
61	clusterQualityStdDev <- clusterQualities %>% group_by(Clusters) %>% summarize(StdDevQuality = sd(Qualities));
62	clusterQualityCount <- clusterQualities %>% group_by(Clusters) %>% summarize(Count = n());
63	clusterXCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanX = mean(x));
64	clusterYCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanY = mean(y));
65	clusterStats <- clusterQualityAvg %>% full_join(clusterQualityStdDev, by="Clusters") %>% full_join(clusterQualityCount, by="Clusters") %>% full_join(clusterXCenter, by ="Clusters") %>% full_join(clusterYCenter, by="Clusters");
66	clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality));
67	clusterStats$Rank <- seq(1:nrow(clusterStats));
68
69	# write.csv2(clusters$clusters, "cluster_assignment_180518.csv", sep = " ", dec = ".");
70
71	##output target function for paper
72	## using only one of the following
73	#xi <- seq(0,9.99,0.1);
74	#target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)sin(xi) cos(xi) - 1);
75	#problemName <- "keijzer-4"
76	#target <- target_keijzer4
77	#
78	#
79	#xi <- seq(0, 99, 1)
80	#target_keijzer9 <- log(xi + sqrt(xi*xi+1))
81	#problemName <- "keijzer-9"
82	#target <- target_keijzer9
83	#
84	#xi <- seq(-5, 4.9, 0.1)
85	#target_pagie1d <- 1 / (1+xi^-4)
86	#problemName <- "pagie-1d"
87	#target <- target_pagie1d
88	#
89	#xi <- seq(-1, 1, 0.0201)
90	#target_nguyen5 <- sin(xixi)cos(xi)-1
91	#problemName <- "nguyen-5"
92	#target <- target_nguyen5
93	#
94	#xi <- seq(-1, 1, 0.0201)
95	#target_nguyen6 <- sin(xi)+sin(xi+xi*xi)
96	#problemName <- "nguyen-6"
97	#target <- target_nguyen6
98
99	target_df <- data.frame(xi, target)
100	ggplot(target_df, aes(x=xi, y=target)) +
101	theme_void() +
102	geom_line(alpha=1, size = 0.3, color='red');
103	ggsave(gsub(" ", "", paste(problemName, ".pdf")),
104	device="pdf",width=2, height=1.5, units="cm");
105
106	for(i in seq(1:10)) {
107	#i <- 1
108	clusterNumber <- clusterStats$Clusters[i] # number of cluster with smallest quality (error!)
109	#i <- 9358
110	cluster_n <- dplyr::filter(m, c==clusterNumber);
111	for(j in seq(1,nrow(cluster_n))) {
112	# j <- 2
113	other <- t(cluster_n[j,])[5:104]
114	max(other)
115	temp <- data.frame(y=as.vector(target), x=other)
116	cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
117	}
118	cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
119	evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
120
121	#ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
122
123	size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==clusterNumber]
124
125	ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
126	theme_void() +
127	geom_line(alpha=max(0.1, 1/size), size = 0.2);
128
129	ggsave(gsub(" ", "", paste(problemName,"-",as.character(i),
130	".pdf")),
131	device="pdf",width=2, height=1.5, units="cm");
132	}
133
134
135	#funs_in_cluster <- t(outputs)[,clusters$clusterId==clusterNumber]
136	#cor(method="pearson", target_keijzer4, t(outputs[18450, ]))
137	#plot(t(outputs)[,clusters$ClusterId==clusterNumber], target_keijzer4)
138
139	m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$ClusterId, q=qualities, outputs)
140	m_sub <- m[m$q<1.0,];
141
142	m <- dplyr::arrange(m, q);
143
144	# plot mapped points (clusters)
145	ggplot(data=m[,], aes(x=x, y=y)) +
146	geom_point(aes(color=q))+
147	scale_color_gradient2(low="blue", mid="yellow", high="red", midpoint=0.6) +
148	scale_size_area(max_size=1) +
149	labs(color="RÂ²")
150	# theme(legend.position = "none")
151	# scale_color_gradient(low = "red",high = "black")
152	;
153	ggsave("phenotypic_clusters.pdf", device="pdf", width=11.66, height=5.9, units="cm", dpi=600, scale=2)
154
155	# qualities in clusters
156	#
157	ggplot(data=m, aes(x=c, y=q)) +
158	geom_boxplot(aes(group=c));
159
160
161
162	# plot mapped points (qualities)
163	#ggplot(data=m, aes(x=x, y=y)) +
164	# geom_point(aes(color=q)) +
165	# scale_color_gradientn(colors=heat.colors(30))
166	#;
167
168
169	#write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
170	m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
171
172	# produce all clusters (TODO: order by cluster size)
173	clusterSizes <- group_by(clusters, clusters$ClusterId) %>% count()
174	clusterSizes <- dplyr::arrange(clusterSizes, clusterSizes$`clusters$ClusterId`)
175
176	#TODO: check 2, 10
177	#check 143292, 122983
178	#check 118734, 118970
179	#cor(method='pearson',t(outputs[81824,]), t(outputs[92710,]))^2
180	#plot(t(outputs[122735,]))
181
182	#clusters
183	picIdx <- 0
184	for(i in clusterSizes$`clusters$ClusterId`) {
185	#i <- 9358
186	representative <- t(outputs[i+1,])
187	#representative
188	cluster_n <- dplyr::filter(m, c==i);
189	for(j in seq(1,nrow(cluster_n))) {
190	# j <- 2
191	other <- t(cluster_n[j,])[5:104]
192	max(other)
193	temp <- data.frame(y=as.vector(representative), x=other)
194	cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
195	}
196	cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
197	evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
198
199	#ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
200
201	ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
202	theme_void() +
203	geom_line(alpha=1, size = 1);
204	size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==i]
205	row_num <- picIdx %/% 50
206	ggsave(paste(as.character(row_num),
207	as.character(picIdx- row_num*50),
208	as.character(i),
209	#as.character(round(clusterStats$meanX[clusterStats$Clusters==i], 3)),
210	#as.character(round(clusterStats$meanY[clusterStats$Clusters==i], 3)),
211	".png"),
212	width=4, height=3);
213	picIdx <- picIdx+1
214	}
215
216
217	# inspect one cluster
218	cluster_n <- dplyr::filter(m, c==0);
219	cluster_evals <- data.frame(x=seq(1,100,1), t(cluster_n[,5:104]))
220	evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
221
222	p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line();
223	p
224
225	# plot ranked clusters
226	ggplot(clusterStats, aes(x=Rank, y=AvgQuality)) +
227	geom_point() +
228	labs(y='Avg RÂ²', x=paste('Cluster rank','-',problemName))
229	ggsave(gsub(" ", "", paste(problemName,"-cluster-ranks.pdf")),
230	device="pdf",
231	width=6, height=4.5, units="cm"
232	)
233

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format