Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/FindClustersForGPLog.R @ 16174

Last change on this file since 16174 was 16174, checked in by gkronber, 6 years ago

#2886 made several changes to clustering scripts for GPTP paper

File size: 3.8 KB
Line 
1# find nearest neighbors for GP indiviuals
2
3library(dplyr)
4library(hexbin)
5library(ggplot2);
6#library(doSNOW); #dopar
7#library(foreach); # dopar
8
9#cl<-makeCluster(2) #change the 2 to your number of CPU cores
10#registerDoSNOW(cl)
11
12setwd("C:/Users/P24581/filebox/GPTP 2018/Keijzer4");
13
14gp_log <- read.csv2("symbreg-models-13.05.2018-1412.csv.gz",header = FALSE,sep='\t',dec=',');
15idx <- seq(1:nrow(gp_log));
16# check popSize in gp_log
17gp_log[seq(1,20500,500),1]
18popSize <- 500;
19
20# check qualities
21# ggplot(gp_log, aes(x=idx,y=gp_log$V1)) + geom_line();
22
23# generations <- seq(1,34000/popSize,1);
24generations <- seq(1,41,1);
25
26numClusters <- max(m$c);
27
28gp_evals <- gp_log[,seq(3,202,2)];
29all_evals <- m[,5:104];
30gen_i=1
31stats <- data.frame(gen=generations,
32                    numDiffClusters = -generations,
33                    medianRank=-generations)
34for(gen_i in generations) {
35 #gen_i <- 1;
36  selectedRows <- seq((gen_i - 1)*popSize + 1, gen_i * popSize,1);
37  #min(select2edRows)
38  #max(selectedRows)
39  xcorrel <- cor(t(all_evals[,]), t(gp_evals[selectedRows,]))^2
40  xcorrel[is.na(xcorrel)] <- 0; # for constant expressions
41 
42  idxOfMaxCorrel <- data.frame(idx=max.col(t(xcorrel)));
43 
44  # number of different nearest neighbours
45  numDistinctNeighbors <- count(distinct(.data=idxOfMaxCorrel))
46  stats$numDiffClusters[gen_i] <- numDistinctNeighbors
47 
48  mapped_gp_log <- m[idxOfMaxCorrel$idx, 1:5]
49#check
50#cor(t(all_evals[128082,]), t(gp_evals[2,]))^2
51#max(cor(t(all_evals[,]), t(gp_evals[2,]))^2)
52  ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + labs(caption=paste("# distinct neighbors",numDistinctNeighbors))+ geom_point();
53  ggsave(paste("scatter",gen_i,".png"))
54 
55#  bestFunc <- data.frame(x = seq(0,9.8,0.1),
56#                         t(m[idxOfMaxCorrel$idx[1], 6:104]),
57#                         t(gp_evals[selectedRows[1],1:99]));
58#  # best neighbor
59#  ggplot(bestFunc, aes(x=x)) +
60#    geom_line(aes(y=scale.default(bestFunc[,2]), color=rep("Nearest Neighbor",nrow(bestFunc)))) +
61#    geom_line(aes(y=scale.default(bestFunc[,3]), color=rep("Best GP Solution",nrow(bestFunc)))) +
62#    guides(fill="legend") +
63#    labs(y="f(x)", color="Function");
64#  ggsave(paste("best_function",gen_i,".png"))
65 
66# count number of unclustered   
67#  count(m, m$c==-1)
68
69  #plot expr freq by ranks
70  mapped_gp_log_with_clusters <- dplyr::inner_join(mapped_gp_log, clusterStats, by=c("c" = "Clusters"))
71  stats$medianRank[gen_i] <- median(mapped_gp_log_with_clusters$Rank)
72 
73  mapped_gp_log_grouped <- mapped_gp_log_with_clusters %>% group_by(c) %>% summarise(count = n())
74  mapped_gp_log_grouped <- dplyr::inner_join(mapped_gp_log_grouped, clusterStats, by=c("c"="Clusters"))
75  ggplot(mapped_gp_log_grouped, aes(x=Rank,y=count)) +
76    scale_y_log10() +
77    xlim(0,17540) +
78    geom_point()
79  ggsave(paste("cluster-freqs",gen_i,".pdf"),
80         device=pdf,width=3)
81 
82 
83  #plot hex map
84  ggplot(mapped_gp_log, aes(x=x, y=y)) +
85    xlim(-60,60) +
86    ylim(-60,60) +
87    theme_void() +
88    geom_hex()
89 
90  ggsave(paste("hexbin",gen_i,".pdf"),
91         device=pdf,width=3,height=2)
92 
93#    ggplot(mapped_gp_log, aes(x=c)) +
94#  xlim(-2,numClusters+1) +
95#  geom_histogram(binwidth = 1) +
96#  ylim(0,500);
97#  ggsave(paste("cluster_freq",gen_i,".png"))
98}
99
100# plot line chart of stats
101ggplot(stats, aes(x=gen)) +
102  xlim(1,40) +
103  geom_line(aes(y=medianRank)) + 
104  theme_classic() +
105  labs(x="Generations", y="Median cluster rank") +
106  scale_y_log10()
107ggsave("gp-medianClusterRank.pdf", device=pdf)
108
109print(stats$numDiffClusters)
110
111stats$numDiffClusters <- unlist(stats$numDiffClusters)
112
113ggplot(stats, aes(x=gen, y=numDiffClusters)) +
114  xlim(1,40) +
115  geom_line() + 
116  theme_classic() +
117  labs(x="Generations", y="Number of explored clusters")
118ggsave("gp-numClusters.pdf", device=pdf)
Note: See TracBrowser for help on using the repository browser.