Changeset 16174 for branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/FindClustersForGPLog.R
- Timestamp:
- 09/22/18 07:12:57 (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/FindClustersForGPLog.R
r15936 r16174 1 1 # find nearest neighbors for GP indiviuals 2 2 3 library(dplyr) 4 library(hexbin) 3 5 library(ggplot2); 6 #library(doSNOW); #dopar 7 #library(foreach); # dopar 4 8 5 gp_log <- read.csv2("C:/Users/P24581/filebox/GPTP 2018/symbreg-models-13.05.2018-1418.csv",header = FALSE,sep='\t',dec=','); 9 #cl<-makeCluster(2) #change the 2 to your number of CPU cores 10 #registerDoSNOW(cl) 11 12 setwd("C:/Users/P24581/filebox/GPTP 2018/Keijzer4"); 13 14 gp_log <- read.csv2("symbreg-models-13.05.2018-1412.csv.gz",header = FALSE,sep='\t',dec=','); 6 15 idx <- seq(1:nrow(gp_log)); 7 16 # check popSize in gp_log 8 gp_log[seq(1, 34000,500),1]17 gp_log[seq(1,20500,500),1] 9 18 popSize <- 500; 10 19 … … 13 22 14 23 # generations <- seq(1,34000/popSize,1); 15 generations <- seq(1,15,1); 24 generations <- seq(1,41,1); 25 16 26 numClusters <- max(m$c); 17 27 18 28 gp_evals <- gp_log[,seq(3,202,2)]; 19 all_evals <- m[,6:105]; 29 all_evals <- m[,5:104]; 30 gen_i=1 31 stats <- data.frame(gen=generations, 32 numDiffClusters = -generations, 33 medianRank=-generations) 20 34 for(gen_i in generations) { 21 #gen_i <- 15;35 #gen_i <- 1; 22 36 selectedRows <- seq((gen_i - 1)*popSize + 1, gen_i * popSize,1); 23 min(selectedRows)24 max(selectedRows)37 #min(select2edRows) 38 #max(selectedRows) 25 39 xcorrel <- cor(t(all_evals[,]), t(gp_evals[selectedRows,]))^2 26 mapped_gp_log <- m[max.col(t(xcorrel)), 1:5] 40 xcorrel[is.na(xcorrel)] <- 0; # for constant expressions 41 42 idxOfMaxCorrel <- data.frame(idx=max.col(t(xcorrel))); 43 44 # number of different nearest neighbours 45 numDistinctNeighbors <- count(distinct(.data=idxOfMaxCorrel)) 46 stats$numDiffClusters[gen_i] <- numDistinctNeighbors 47 48 mapped_gp_log <- m[idxOfMaxCorrel$idx, 1:5] 27 49 #check 28 50 #cor(t(all_evals[128082,]), t(gp_evals[2,]))^2 29 51 #max(cor(t(all_evals[,]), t(gp_evals[2,]))^2) 30 ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + geom_point();52 ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + labs(caption=paste("# distinct neighbors",numDistinctNeighbors))+ geom_point(); 31 53 ggsave(paste("scatter",gen_i,".png")) 54 55 # bestFunc <- data.frame(x = seq(0,9.8,0.1), 56 # t(m[idxOfMaxCorrel$idx[1], 6:104]), 57 # t(gp_evals[selectedRows[1],1:99])); 58 # # best neighbor 59 # ggplot(bestFunc, aes(x=x)) + 60 # geom_line(aes(y=scale.default(bestFunc[,2]), color=rep("Nearest Neighbor",nrow(bestFunc)))) + 61 # geom_line(aes(y=scale.default(bestFunc[,3]), color=rep("Best GP Solution",nrow(bestFunc)))) + 62 # guides(fill="legend") + 63 # labs(y="f(x)", color="Function"); 64 # ggsave(paste("best_function",gen_i,".png")) 65 66 # count number of unclustered 67 # count(m, m$c==-1) 32 68 33 ggplot(mapped_gp_log, aes(x=c)) +xlim(0,numClusters+1) + geom_histogram(binwidth = 1); 34 ggsave(paste("cluster_freq",gen_i,".png")) 69 #plot expr freq by ranks 70 mapped_gp_log_with_clusters <- dplyr::inner_join(mapped_gp_log, clusterStats, by=c("c" = "Clusters")) 71 stats$medianRank[gen_i] <- median(mapped_gp_log_with_clusters$Rank) 72 73 mapped_gp_log_grouped <- mapped_gp_log_with_clusters %>% group_by(c) %>% summarise(count = n()) 74 mapped_gp_log_grouped <- dplyr::inner_join(mapped_gp_log_grouped, clusterStats, by=c("c"="Clusters")) 75 ggplot(mapped_gp_log_grouped, aes(x=Rank,y=count)) + 76 scale_y_log10() + 77 xlim(0,17540) + 78 geom_point() 79 ggsave(paste("cluster-freqs",gen_i,".pdf"), 80 device=pdf,width=3) 81 82 83 #plot hex map 84 ggplot(mapped_gp_log, aes(x=x, y=y)) + 85 xlim(-60,60) + 86 ylim(-60,60) + 87 theme_void() + 88 geom_hex() 89 90 ggsave(paste("hexbin",gen_i,".pdf"), 91 device=pdf,width=3,height=2) 92 93 # ggplot(mapped_gp_log, aes(x=c)) + 94 # xlim(-2,numClusters+1) + 95 # geom_histogram(binwidth = 1) + 96 # ylim(0,500); 97 # ggsave(paste("cluster_freq",gen_i,".png")) 35 98 } 99 100 # plot line chart of stats 101 ggplot(stats, aes(x=gen)) + 102 xlim(1,40) + 103 geom_line(aes(y=medianRank)) + 104 theme_classic() + 105 labs(x="Generations", y="Median cluster rank") + 106 scale_y_log10() 107 ggsave("gp-medianClusterRank.pdf", device=pdf) 108 109 print(stats$numDiffClusters) 110 111 stats$numDiffClusters <- unlist(stats$numDiffClusters) 112 113 ggplot(stats, aes(x=gen, y=numDiffClusters)) + 114 xlim(1,40) + 115 geom_line() + 116 theme_classic() + 117 labs(x="Generations", y="Number of explored clusters") 118 ggsave("gp-numClusters.pdf", device=pdf)
Note: See TracChangeset
for help on using the changeset viewer.