Changeset 16174
- Timestamp:
- 09/22/18 07:12:57 (6 years ago)
- Location:
- branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R
- Files:
-
- 2 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/ClusteringScript.R
r15936 r16174 13 13 # read from R binary (faster) 14 14 evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds"); 15 16 nrow(evalData[evalData$R2.keijzer4 > 0.4,]) 15 17 16 18 max(evalData$R2.keijzer4); … … 21 23 max(evalData$R2.nguyen7); 22 24 23 outputs <- evalData[,10:109];24 25 25 26 #check zero mean, unit variance … … 46 47 47 48 lv <- largeVis(t(outputs), dim=2, distance_method="Cosine", 48 perplexity=100, K = 100, n_trees = 150, threads=4, 49 save_neighbors = TRUE, save_edges = TRUE, verbose=TRUE) ; 50 clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 20); 51 49 perplexity=100, K = 300, n_trees = 50, threads=4, 50 verbose=TRUE) ; 51 #clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 300); 52 53 #use clusters calculated with C# program 54 clusters <- read.csv2("180801_clusters.txt", header = TRUE, sep=';') 52 55 53 56 # calculate quality distribution for each cluster 54 57 qualities <- evalData$R2.keijzer4; 55 clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$ clusters, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );58 clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$ClusterId, x=t(lv$coords)[,1], y=t(lv$coords)[,2] ); 56 59 57 60 clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ; … … 63 66 clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality)); 64 67 clusterStats$Rank <- seq(1:nrow(clusterStats)); 65 ggplot(clusterStats, aes(x = Rank, y=AvgQuality)) + geom_point(); 66 67 write.csv2(clusters$clusters, "cluster_assignment_new.csv", sep = " ", dec = "."); 68 69 #check clusters 70 for(i in seq(1:nrow(clusterStats))) { 68 69 # write.csv2(clusters$clusters, "cluster_assignment_180518.csv", sep = " ", dec = "."); 70 71 ##output target function for paper 72 ## using only one of the following 73 #xi <- seq(0,9.99,0.1); 74 #target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1); 75 #problemName <- "keijzer-4" 76 #target <- target_keijzer4 77 # 78 # 79 #xi <- seq(0, 99, 1) 80 #target_keijzer9 <- log(xi + sqrt(xi*xi+1)) 81 #problemName <- "keijzer-9" 82 #target <- target_keijzer9 83 # 84 #xi <- seq(-5, 4.9, 0.1) 85 #target_pagie1d <- 1 / (1+xi^-4) 86 #problemName <- "pagie-1d" 87 #target <- target_pagie1d 88 # 89 #xi <- seq(-1, 1, 0.0201) 90 #target_nguyen5 <- sin(xi*xi)*cos(xi)-1 91 #problemName <- "nguyen-5" 92 #target <- target_nguyen5 93 # 94 #xi <- seq(-1, 1, 0.0201) 95 #target_nguyen6 <- sin(xi)+sin(xi+xi*xi) 96 #problemName <- "nguyen-6" 97 #target <- target_nguyen6 98 99 target_df <- data.frame(xi, target) 100 ggplot(target_df, aes(x=xi, y=target)) + 101 theme_void() + 102 geom_line(alpha=1, size = 0.3, color='red'); 103 ggsave(gsub(" ", "", paste(problemName, ".pdf")), 104 device="pdf",width=2, height=1.5, units="cm"); 105 106 for(i in seq(1:10)) { 107 #i <- 1 71 108 clusterNumber <- clusterStats$Clusters[i] # number of cluster with smallest quality (error!) 72 cluster_i <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==clusterNumber]), "rowNum", "value"); 73 xs <- rep(seq(1:100),nrow(cluster_i)/100) # reps must be the number of functions in the cluster 74 ggplot(cluster_i, aes(x=xs, y=value, c=rowNum)) + 109 #i <- 9358 110 cluster_n <- dplyr::filter(m, c==clusterNumber); 111 for(j in seq(1,nrow(cluster_n))) { 112 # j <- 2 113 other <- t(cluster_n[j,])[5:104] 114 max(other) 115 temp <- data.frame(y=as.vector(target), x=other) 116 cluster_n[j,5:104] <- fitted(lm("y ~ x", temp)) 117 } 118 cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104])) 119 evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals)) 120 121 #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line() 122 123 size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==clusterNumber] 124 125 ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) + 75 126 theme_void() + 76 geom_line(alpha=0.1); 77 78 ggsave(paste(as.character(i), as.character(round(clusterStats$meanX[i], 3)), as.character(round(clusterStats$meanY[i], 3)), ".png")); 127 geom_line(alpha=max(0.1, 1/size), size = 0.2); 128 129 ggsave(gsub(" ", "", paste(problemName,"-",as.character(i), 130 ".pdf")), 131 device="pdf",width=2, height=1.5, units="cm"); 79 132 } 80 133 81 funs_in_cluster <- t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==1748] 82 cor(method="pearson", target_keijzer4, t(outputs[20281, ])) 83 plot(funs_in_cluster[,4], target_keijzer4) 84 85 xi <- seq(0,9.99,0.1); 86 # x³ * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1) 87 target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1); 88 plot(xi, target_keijzer4); 89 90 m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$clusters, q=qualities, outputs) 134 135 #funs_in_cluster <- t(outputs)[,clusters$clusterId==clusterNumber] 136 #cor(method="pearson", target_keijzer4, t(outputs[18450, ])) 137 #plot(t(outputs)[,clusters$ClusterId==clusterNumber], target_keijzer4) 138 139 m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$ClusterId, q=qualities, outputs) 91 140 m_sub <- m[m$q<1.0,]; 92 141 142 m <- dplyr::arrange(m, q); 143 93 144 # plot mapped points (clusters) 94 ggplot(data=m, aes(x=x, y=y)) + 95 geom_point(aes(color=c)) + 96 theme(legend.position = "none") 145 ggplot(data=m[,], aes(x=x, y=y)) + 146 geom_point(aes(color=q))+ 147 scale_color_gradient2(low="blue", mid="yellow", high="red", midpoint=0.6) + 148 scale_size_area(max_size=1) + 149 labs(color="R²", title="Phenotypic Embedding (R² with Keijzer4)") 150 # theme(legend.position = "none") 97 151 # scale_color_gradient(low = "red",high = "black") 98 152 ; 99 153 ggsave("phenotypic_clusters.png") 100 154 155 # qualities in clusters 156 # 157 ggplot(data=m, aes(x=c, y=q)) + 158 geom_boxplot(aes(group=c)); 159 160 101 161 102 162 # plot mapped points (qualities) 103 ggplot(data=m, aes(x=x, y=y)) +104 geom_point(aes(color=q)) +105 scale_color_gradientn(colors=heat.colors(30))106 ;163 #ggplot(data=m, aes(x=x, y=y)) + 164 # geom_point(aes(color=q)) + 165 # scale_color_gradientn(colors=heat.colors(30)) 166 #; 107 167 108 168 … … 110 170 m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv"); 111 171 112 cluster_n <- dplyr::filter(m, c==9); 172 # produce all clusters (TODO: order by cluster size) 173 clusterSizes <- group_by(clusters, clusters$ClusterId) %>% count() 174 clusterSizes <- dplyr::arrange(clusterSizes, clusterSizes$`clusters$ClusterId`) 175 176 #TODO: check 2, 10 177 #check 143292, 122983 178 #check 118734, 118970 179 #cor(method='pearson',t(outputs[81824,]), t(outputs[92710,]))^2 180 #plot(t(outputs[122735,])) 181 182 #clusters 183 picIdx <- 0 184 for(i in clusterSizes$`clusters$ClusterId`) { 185 #i <- 9358 186 representative <- t(outputs[i+1,]) 187 #representative 188 cluster_n <- dplyr::filter(m, c==i); 189 for(j in seq(1,nrow(cluster_n))) { 190 # j <- 2 191 other <- t(cluster_n[j,])[5:104] 192 max(other) 193 temp <- data.frame(y=as.vector(representative), x=other) 194 cluster_n[j,5:104] <- fitted(lm("y ~ x", temp)) 195 } 196 cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104])) 197 evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals)) 198 199 #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line() 200 201 ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) + 202 theme_void() + 203 geom_line(alpha=1, size = 1); 204 size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==i] 205 row_num <- picIdx %/% 50 206 ggsave(paste(as.character(row_num), 207 as.character(picIdx- row_num*50), 208 as.character(i), 209 #as.character(round(clusterStats$meanX[clusterStats$Clusters==i], 3)), 210 #as.character(round(clusterStats$meanY[clusterStats$Clusters==i], 3)), 211 ".png"), 212 width=4, height=3); 213 picIdx <- picIdx+1 214 } 215 216 217 # inspect one cluster 218 cluster_n <- dplyr::filter(m, c==0); 113 219 cluster_evals <- data.frame(x=seq(1,100,1), t(cluster_n[,5:104])) 114 220 evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals)) … … 116 222 p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line(); 117 223 p 224 225 # plot ranked clusters 226 ggplot(clusterStats, aes(x=Rank, y=AvgQuality)) + 227 geom_point() + 228 labs(y='Avg R²', x=paste('Cluster rank','-',problemName)) 229 ggsave(gsub(" ", "", paste(problemName,"-cluster-ranks.pdf")), 230 device="pdf", 231 width=6, height=4.5, units="cm" 232 ) 233 -
branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/FindClustersForGPLog.R
r15936 r16174 1 1 # find nearest neighbors for GP indiviuals 2 2 3 library(dplyr) 4 library(hexbin) 3 5 library(ggplot2); 6 #library(doSNOW); #dopar 7 #library(foreach); # dopar 4 8 5 gp_log <- read.csv2("C:/Users/P24581/filebox/GPTP 2018/symbreg-models-13.05.2018-1418.csv",header = FALSE,sep='\t',dec=','); 9 #cl<-makeCluster(2) #change the 2 to your number of CPU cores 10 #registerDoSNOW(cl) 11 12 setwd("C:/Users/P24581/filebox/GPTP 2018/Keijzer4"); 13 14 gp_log <- read.csv2("symbreg-models-13.05.2018-1412.csv.gz",header = FALSE,sep='\t',dec=','); 6 15 idx <- seq(1:nrow(gp_log)); 7 16 # check popSize in gp_log 8 gp_log[seq(1, 34000,500),1]17 gp_log[seq(1,20500,500),1] 9 18 popSize <- 500; 10 19 … … 13 22 14 23 # generations <- seq(1,34000/popSize,1); 15 generations <- seq(1,15,1); 24 generations <- seq(1,41,1); 25 16 26 numClusters <- max(m$c); 17 27 18 28 gp_evals <- gp_log[,seq(3,202,2)]; 19 all_evals <- m[,6:105]; 29 all_evals <- m[,5:104]; 30 gen_i=1 31 stats <- data.frame(gen=generations, 32 numDiffClusters = -generations, 33 medianRank=-generations) 20 34 for(gen_i in generations) { 21 #gen_i <- 15;35 #gen_i <- 1; 22 36 selectedRows <- seq((gen_i - 1)*popSize + 1, gen_i * popSize,1); 23 min(selectedRows)24 max(selectedRows)37 #min(select2edRows) 38 #max(selectedRows) 25 39 xcorrel <- cor(t(all_evals[,]), t(gp_evals[selectedRows,]))^2 26 mapped_gp_log <- m[max.col(t(xcorrel)), 1:5] 40 xcorrel[is.na(xcorrel)] <- 0; # for constant expressions 41 42 idxOfMaxCorrel <- data.frame(idx=max.col(t(xcorrel))); 43 44 # number of different nearest neighbours 45 numDistinctNeighbors <- count(distinct(.data=idxOfMaxCorrel)) 46 stats$numDiffClusters[gen_i] <- numDistinctNeighbors 47 48 mapped_gp_log <- m[idxOfMaxCorrel$idx, 1:5] 27 49 #check 28 50 #cor(t(all_evals[128082,]), t(gp_evals[2,]))^2 29 51 #max(cor(t(all_evals[,]), t(gp_evals[2,]))^2) 30 ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + geom_point();52 ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + labs(caption=paste("# distinct neighbors",numDistinctNeighbors))+ geom_point(); 31 53 ggsave(paste("scatter",gen_i,".png")) 54 55 # bestFunc <- data.frame(x = seq(0,9.8,0.1), 56 # t(m[idxOfMaxCorrel$idx[1], 6:104]), 57 # t(gp_evals[selectedRows[1],1:99])); 58 # # best neighbor 59 # ggplot(bestFunc, aes(x=x)) + 60 # geom_line(aes(y=scale.default(bestFunc[,2]), color=rep("Nearest Neighbor",nrow(bestFunc)))) + 61 # geom_line(aes(y=scale.default(bestFunc[,3]), color=rep("Best GP Solution",nrow(bestFunc)))) + 62 # guides(fill="legend") + 63 # labs(y="f(x)", color="Function"); 64 # ggsave(paste("best_function",gen_i,".png")) 65 66 # count number of unclustered 67 # count(m, m$c==-1) 32 68 33 ggplot(mapped_gp_log, aes(x=c)) +xlim(0,numClusters+1) + geom_histogram(binwidth = 1); 34 ggsave(paste("cluster_freq",gen_i,".png")) 69 #plot expr freq by ranks 70 mapped_gp_log_with_clusters <- dplyr::inner_join(mapped_gp_log, clusterStats, by=c("c" = "Clusters")) 71 stats$medianRank[gen_i] <- median(mapped_gp_log_with_clusters$Rank) 72 73 mapped_gp_log_grouped <- mapped_gp_log_with_clusters %>% group_by(c) %>% summarise(count = n()) 74 mapped_gp_log_grouped <- dplyr::inner_join(mapped_gp_log_grouped, clusterStats, by=c("c"="Clusters")) 75 ggplot(mapped_gp_log_grouped, aes(x=Rank,y=count)) + 76 scale_y_log10() + 77 xlim(0,17540) + 78 geom_point() 79 ggsave(paste("cluster-freqs",gen_i,".pdf"), 80 device=pdf,width=3) 81 82 83 #plot hex map 84 ggplot(mapped_gp_log, aes(x=x, y=y)) + 85 xlim(-60,60) + 86 ylim(-60,60) + 87 theme_void() + 88 geom_hex() 89 90 ggsave(paste("hexbin",gen_i,".pdf"), 91 device=pdf,width=3,height=2) 92 93 # ggplot(mapped_gp_log, aes(x=c)) + 94 # xlim(-2,numClusters+1) + 95 # geom_histogram(binwidth = 1) + 96 # ylim(0,500); 97 # ggsave(paste("cluster_freq",gen_i,".png")) 35 98 } 99 100 # plot line chart of stats 101 ggplot(stats, aes(x=gen)) + 102 xlim(1,40) + 103 geom_line(aes(y=medianRank)) + 104 theme_classic() + 105 labs(x="Generations", y="Median cluster rank") + 106 scale_y_log10() 107 ggsave("gp-medianClusterRank.pdf", device=pdf) 108 109 print(stats$numDiffClusters) 110 111 stats$numDiffClusters <- unlist(stats$numDiffClusters) 112 113 ggplot(stats, aes(x=gen, y=numDiffClusters)) + 114 xlim(1,40) + 115 geom_line() + 116 theme_classic() + 117 labs(x="Generations", y="Number of explored clusters") 118 ggsave("gp-numClusters.pdf", device=pdf)
Note: See TracChangeset
for help on using the changeset viewer.