Context Navigation

← Previous Changeset
Next Changeset →

Changeset 16174

Timestamp:

09/22/18 07:12:57 (6 years ago)

Author:

gkronber

Message:

#2886 made several changes to clustering scripts for GPTP paper

Location:

branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R

Files:

: 2 added
: 2 edited

ClusteringScript.R (modified) (6 diffs)
Exploration-Spectrum-Clustering.R (added)
FindClustersForGPLog.R (modified) (2 diffs)
genotypic_similarity.r (added)

Legend:

: Unmodified
: Added
: Removed

branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/ClusteringScript.R

-                      r15936
+                      r16174
 # read from R binary (faster)
 evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
+nrow(evalData[evalData$R2.keijzer4 > 0.4,])
 max(evalData$R2.keijzer4);
 …
 max(evalData$R2.nguyen7);
-outputs <- evalData[,10:109];
 #check zero mean, unit variance
 …
 lv <- largeVis(t(outputs), dim=2, distance_method="Cosine",
+               perplexity=100, K = 100, n_trees = 150, threads=4,
+               save_neighbors = TRUE, save_edges = TRUE, verbose=TRUE) ;
+clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 20);
+               perplexity=100, K = 300, n_trees = 50, threads=4,
+               verbose=TRUE) ;
+#clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 300);
+#use clusters calculated with C# program
+clusters <- read.csv2("180801_clusters.txt", header = TRUE, sep=';')
 # calculate quality distribution for each cluster
 qualities <- evalData$R2.keijzer4;
 clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$clusters, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
+clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$ClusterId, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
 clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ;
 …
 clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality));
 clusterStats$Rank <- seq(1:nrow(clusterStats));
+ggplot(clusterStats, aes(x = Rank, y=AvgQuality)) + geom_point();
+write.csv2(clusters$clusters, "cluster_assignment_new.csv", sep = " ", dec = ".");
+#check clusters
+for(i in seq(1:nrow(clusterStats))) {
+# write.csv2(clusters$clusters, "cluster_assignment_180518.csv", sep = " ", dec = ".");
+##output target function for paper
+## using only one of the following
+#xi <- seq(0,9.99,0.1);
+#target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1);
+#problemName <- "keijzer-4"
+#target <- target_keijzer4
+#
+#
+#xi <- seq(0, 99, 1)
+#target_keijzer9 <- log(xi + sqrt(xi*xi+1))
+#problemName <- "keijzer-9"
+#target <- target_keijzer9
+#
+#xi <- seq(-5, 4.9, 0.1)
+#target_pagie1d <- 1 / (1+xi^-4)
+#problemName <- "pagie-1d"
+#target <- target_pagie1d
+#
+#xi <- seq(-1, 1, 0.0201)
+#target_nguyen5 <- sin(xi*xi)*cos(xi)-1
+#problemName <- "nguyen-5"
+#target <- target_nguyen5
+#
+#xi <- seq(-1, 1, 0.0201)
+#target_nguyen6 <- sin(xi)+sin(xi+xi*xi)
+#problemName <- "nguyen-6"
+#target <- target_nguyen6
+target_df <- data.frame(xi, target)
+ggplot(target_df, aes(x=xi, y=target)) +
+  theme_void() +
+  geom_line(alpha=1, size = 0.3, color='red');
+ggsave(gsub(" ", "", paste(problemName, ".pdf")),
+       device="pdf",width=2, height=1.5, units="cm");
+for(i in seq(1:10)) {
+  #i <- 1
   clusterNumber <- clusterStats$Clusters[i] # number of cluster with smallest quality (error!)
+  cluster_i <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==clusterNumber]), "rowNum", "value");
+  xs <- rep(seq(1:100),nrow(cluster_i)/100) # reps must be the number of functions in the cluster
+  ggplot(cluster_i, aes(x=xs, y=value, c=rowNum)) +
+  #i <- 9358
+  cluster_n <- dplyr::filter(m, c==clusterNumber);
+  for(j in seq(1,nrow(cluster_n))) {
+    #    j <- 2
+    other <- t(cluster_n[j,])[5:104]
+    max(other)
+    temp <- data.frame(y=as.vector(target), x=other)
+    cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
+  }
+  cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
+  evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
+  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
+  size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==clusterNumber]
+    ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
     theme_void() +
+    geom_line(alpha=0.1);
+  ggsave(paste(as.character(i), as.character(round(clusterStats$meanX[i], 3)), as.character(round(clusterStats$meanY[i], 3)), ".png"));
+    geom_line(alpha=max(0.1, 1/size), size = 0.2);
+  ggsave(gsub(" ", "", paste(problemName,"-",as.character(i),
+                       ".pdf")),
+         device="pdf",width=2, height=1.5, units="cm");
+}
+funs_in_cluster <- t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==1748]
+cor(method="pearson", target_keijzer4, t(outputs[20281, ]))
+plot(funs_in_cluster[,4], target_keijzer4)
+xi <- seq(0,9.99,0.1);
+#  x³  * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1)
+target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1);
+plot(xi, target_keijzer4);
+m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$clusters, q=qualities, outputs)
+#funs_in_cluster <- t(outputs)[,clusters$clusterId==clusterNumber]
+#cor(method="pearson", target_keijzer4, t(outputs[18450, ]))
+#plot(t(outputs)[,clusters$ClusterId==clusterNumber], target_keijzer4)
+m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$ClusterId, q=qualities, outputs)
 m_sub <- m[m$q<1.0,];
+m <- dplyr::arrange(m, q);
 # plot mapped points (clusters)
+ggplot(data=m, aes(x=x, y=y)) +
+  geom_point(aes(color=c))  +
+  theme(legend.position = "none")
+ggplot(data=m[,], aes(x=x, y=y)) +
+  geom_point(aes(color=q))+
+  scale_color_gradient2(low="blue", mid="yellow", high="red", midpoint=0.6) +
+  scale_size_area(max_size=1) +
+  labs(color="R²", title="Phenotypic Embedding (R² with Keijzer4)")
+#  theme(legend.position = "none")
 #  scale_color_gradient(low = "red",high = "black")
+;
 ggsave("phenotypic_clusters.png")
+# qualities in clusters
+#
+ggplot(data=m, aes(x=c, y=q)) +
+  geom_boxplot(aes(group=c));
 # plot mapped points (qualities)
 ggplot(data=m, aes(x=x, y=y)) +
   geom_point(aes(color=q))  +
   scale_color_gradientn(colors=heat.colors(30))
+;
+#ggplot(data=m, aes(x=x, y=y)) +
+#  geom_point(aes(color=q))  +
+#  scale_color_gradientn(colors=heat.colors(30))
+#;
 …
 m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
+cluster_n <- dplyr::filter(m, c==9);
+# produce all clusters (TODO: order by cluster size)
+clusterSizes <- group_by(clusters, clusters$ClusterId) %>% count()
+clusterSizes <- dplyr::arrange(clusterSizes, clusterSizes$`clusters$ClusterId`)
+#TODO: check 2, 10
+#check 143292, 122983
+#check 118734, 118970
+#cor(method='pearson',t(outputs[81824,]), t(outputs[92710,]))^2
+#plot(t(outputs[122735,]))
+#clusters
+picIdx <- 0
+for(i in clusterSizes$`clusters$ClusterId`) {
+  #i <- 9358
+  representative <- t(outputs[i+1,])
+  #representative
+  cluster_n <- dplyr::filter(m, c==i);
+  for(j in seq(1,nrow(cluster_n))) {
+#    j <- 2
+    other <- t(cluster_n[j,])[5:104]
+    max(other)
+    temp <- data.frame(y=as.vector(representative), x=other)
+    cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
+  }
+  cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
+  evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
+  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
+  ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
+    theme_void() +
+    geom_line(alpha=1, size = 1);
+  size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==i]
+  row_num <- picIdx %/% 50
+  ggsave(paste(as.character(row_num),
+               as.character(picIdx- row_num*50),
+               as.character(i),
+         #as.character(round(clusterStats$meanX[clusterStats$Clusters==i], 3)),
+         #as.character(round(clusterStats$meanY[clusterStats$Clusters==i], 3)),
+              ".png"),
+         width=4, height=3);
+  picIdx <- picIdx+1
+}
+# inspect one cluster
+cluster_n <- dplyr::filter(m, c==0);
 cluster_evals <- data.frame(x=seq(1,100,1), t(cluster_n[,5:104]))
 evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
 …
 p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line();
+p
+# plot ranked clusters
+ggplot(clusterStats, aes(x=Rank, y=AvgQuality)) +
+  geom_point() +
+  labs(y='Avg R²', x=paste('Cluster rank','-',problemName))
+ggsave(gsub(" ", "", paste(problemName,"-cluster-ranks.pdf")),
+       device="pdf",
+       width=6, height=4.5, units="cm"
+       )

branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/FindClustersForGPLog.R

-                      r15936
+                      r16174
 # find nearest neighbors for GP individuals
+library(dplyr)
+library(hexbin)
 library(ggplot2);
+#library(doSNOW); #dopar
+#library(foreach); # dopar
+gp_log <- read.csv2("C:/Users/P24581/filebox/GPTP 2018/symbreg-models-13.05.2018-1418.csv",header = FALSE,sep='\t',dec=',');
+#cl<-makeCluster(2) #change the 2 to your number of CPU cores
+#registerDoSNOW(cl)
+setwd("C:/Users/P24581/filebox/GPTP 2018/Keijzer4");
+gp_log <- read.csv2("symbreg-models-13.05.2018-1412.csv.gz",header = FALSE,sep='\t',dec=',');
 idx <- seq(1:nrow(gp_log));
 # check popSize in gp_log
 gp_log[seq(1,34000,500),1]
+gp_log[seq(1,20500,500),1]
 popSize <- 500;
 …
 # generations <- seq(1,34000/popSize,1);
+generations <- seq(1,15,1);
+generations <- seq(1,41,1);
 numClusters <- max(m$c);
 gp_evals <- gp_log[,seq(3,202,2)];
+all_evals <- m[,6:105];
+all_evals <- m[,5:104];
+gen_i=1
+stats <- data.frame(gen=generations,
+                    numDiffClusters = -generations,
+                    medianRank=-generations)
 for(gen_i in generations) {
 #gen_i <- 15;
+ #gen_i <- 1;
   selectedRows <- seq((gen_i - 1)*popSize + 1, gen_i * popSize,1);
   min(selectedRows)
   max(selectedRows)
+  #min(selectedRows)
+  #max(selectedRows)
   xcorrel <- cor(t(all_evals[,]), t(gp_evals[selectedRows,]))^2
+  mapped_gp_log <- m[max.col(t(xcorrel)), 1:5]
+  xcorrel[is.na(xcorrel)] <- 0; # for constant expressions
+  idxOfMaxCorrel <- data.frame(idx=max.col(t(xcorrel)));
+  # number of different nearest neighbours
+  numDistinctNeighbors <- count(distinct(.data=idxOfMaxCorrel))
+  stats$numDiffClusters[gen_i] <- numDistinctNeighbors
+  mapped_gp_log <- m[idxOfMaxCorrel$idx, 1:5]
 #check
 #cor(t(all_evals[128082,]), t(gp_evals[2,]))^2
 #max(cor(t(all_evals[,]), t(gp_evals[2,]))^2)
   ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + geom_point();
+  ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + labs(caption=paste("# distinct neighbors",numDistinctNeighbors))+ geom_point();
   ggsave(paste("scatter",gen_i,".png"))
+#  bestFunc <- data.frame(x = seq(0,9.8,0.1),
+#                         t(m[idxOfMaxCorrel$idx[1], 6:104]),
+#                         t(gp_evals[selectedRows[1],1:99]));
+#  # best neighbor
+#  ggplot(bestFunc, aes(x=x)) +
+#    geom_line(aes(y=scale.default(bestFunc[,2]), color=rep("Nearest Neighbor",nrow(bestFunc)))) +
+#    geom_line(aes(y=scale.default(bestFunc[,3]), color=rep("Best GP Solution",nrow(bestFunc)))) +
+#    guides(fill="legend") +
+#    labs(y="f(x)", color="Function");
+#  ggsave(paste("best_function",gen_i,".png"))
+# count number of unclustered
+#  count(m, m$c==-1)
+  ggplot(mapped_gp_log, aes(x=c)) +xlim(0,numClusters+1) + geom_histogram(binwidth = 1);
+  ggsave(paste("cluster_freq",gen_i,".png"))
+  #plot expr freq by ranks
+  mapped_gp_log_with_clusters <- dplyr::inner_join(mapped_gp_log, clusterStats, by=c("c" = "Clusters"))
+  stats$medianRank[gen_i] <- median(mapped_gp_log_with_clusters$Rank)
+  mapped_gp_log_grouped <- mapped_gp_log_with_clusters %>% group_by(c) %>% summarise(count = n())
+  mapped_gp_log_grouped <- dplyr::inner_join(mapped_gp_log_grouped, clusterStats, by=c("c"="Clusters"))
+  ggplot(mapped_gp_log_grouped, aes(x=Rank,y=count)) +
+    scale_y_log10() +
+    xlim(0,17540) +
+    geom_point()
+  ggsave(paste("cluster-freqs",gen_i,".pdf"),
+         device=pdf,width=3)
+  #plot hex map
+  ggplot(mapped_gp_log, aes(x=x, y=y)) +
+    xlim(-60,60) +
+    ylim(-60,60) +
+    theme_void() +
+    geom_hex()
+  ggsave(paste("hexbin",gen_i,".pdf"),
+         device=pdf,width=3,height=2)
+#    ggplot(mapped_gp_log, aes(x=c)) +
+#  xlim(-2,numClusters+1) +
+#  geom_histogram(binwidth = 1) +
+#  ylim(0,500);
+#  ggsave(paste("cluster_freq",gen_i,".png"))
+}
+# plot line chart of stats
+ggplot(stats, aes(x=gen)) +
+  xlim(1,40) +
+  geom_line(aes(y=medianRank)) +
+  theme_classic() +
+  labs(x="Generations", y="Median cluster rank") +
+  scale_y_log10()
+ggsave("gp-medianClusterRank.pdf", device=pdf)
+print(stats$numDiffClusters)
+stats$numDiffClusters <- unlist(stats$numDiffClusters)
+ggplot(stats, aes(x=gen, y=numDiffClusters)) +
+  xlim(1,40) +
+  geom_line() +
+  theme_classic() +
+  labs(x="Generations", y="Number of explored clusters")
+ggsave("gp-numClusters.pdf", device=pdf)

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 16174

Legend:

branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/ClusteringScript.R

branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/FindClustersForGPLog.R

Download in other formats: