# Phenotypic cluster analysis of the evaluated sentences.
#
# Loads the pre-computed evaluations, embeds the function outputs in 2D with
# largeVis, attaches externally computed cluster ids, summarises the R²
# distribution per cluster and writes the figures to the working directory.
#
# NOTE(review): `outputs` is used throughout but is never assigned in this
# file; presumably it holds one row of 100 evaluations per function and is
# created elsewhere (or interactively) -- confirm before running.

library(largeVis)
library(ggplot2)
library(dplyr)

# All input and output paths below are relative to this directory.
#setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
setwd("C:/reps/HEAL/Publications-2018-GPTP/data")

sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"

# read from CSV and store as R binary (must be done once to produce the .rds file)
#evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
#saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");

# read from R binary (faster)
evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds")

# Quick interactive sanity checks (values are only shown when auto-printed).
nrow(evalData[evalData$R2.keijzer4 > 0.4, ])

max(evalData$R2.keijzer4)
max(evalData$R2.keijzer9)
max(evalData$R2.pagie)
max(evalData$R2.nguyen5)
max(evalData$R2.nguyen6)
max(evalData$R2.nguyen7)

#check zero mean, unit variance
#mean(t(outputs[2,]))
#sd(t(outputs[2,]))

# check
# plot(t(outputs[4,]))

#apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)

# check ANN
#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
#xs <- rep(seq(1:100),100)
#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();

#edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
#clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);

# check cluster
#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
#xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();

# 2D embedding of the function outputs.
lv <- largeVis(
  t(outputs),
  dim = 2,
  distance_method = "Cosine",
  perplexity = 100,
  K = 300,
  n_trees = 50,
  threads = 4,
  verbose = TRUE
)

#clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 300);

#use clusters calculated with C# program
clusters <- read.csv2("180801_clusters.txt", header = TRUE, sep = ";")

# ---- helpers ----------------------------------------------------------------

# Columns of `m` / `cluster_n` that hold the function outputs; they follow the
# four leading columns x, y, c, q.
# NOTE(review): assumes `outputs` has exactly 100 columns -- confirm.
output_cols <- 5:104

# Linearly scales every row of `cluster_rows` onto `reference`: the outputs of
# each row are replaced by the fitted values of lm(reference ~ row outputs).
# Returns the modified copy of `cluster_rows`.
fit_cluster_to_reference <- function(cluster_rows, reference,
                                     cols = output_cols) {
  reference <- as.vector(reference)
  # seq_len() keeps the loop empty for a cluster without rows.
  for (j in seq_len(nrow(cluster_rows))) {
    other <- t(cluster_rows[j, ])[cols]
    fit_data <- data.frame(y = reference, x = other)
    cluster_rows[j, cols] <- fitted(lm(y ~ x, fit_data))
  }
  cluster_rows
}

# ---- per-cluster quality statistics ------------------------------------------

# calculate quality distribution for each cluster
qualities <- evalData$R2.keijzer4
clusterQualities <- data.frame(
  Qualities = qualities,
  Clusters = clusters$ClusterId,
  x = t(lv$coords)[, 1],
  y = t(lv$coords)[, 2]
)

# One pass over the groups instead of five summaries joined on `Clusters`;
# columns and their order are the same as before.
clusterStats <- clusterQualities %>%
  group_by(Clusters) %>%
  summarize(
    AvgQuality = mean(Qualities),
    StdDevQuality = sd(Qualities),
    Count = n(),
    meanX = mean(x),
    meanY = mean(y)
  ) %>%
  arrange(desc(AvgQuality))
clusterStats$Rank <- seq_len(nrow(clusterStats))

# write.csv2(clusters$clusters, "cluster_assignment_180518.csv", sep = " ", dec = ".");

# Mapping table: embedding coordinates, cluster id, quality and raw outputs.
# Defined here (before the first loop that filters it) so the script runs
# top to bottom.
m <- data.frame(
  x = t(lv$coords)[, 1],
  y = t(lv$coords)[, 2],
  c = clusters$ClusterId,
  q = qualities,
  outputs
)

# Number of functions per cluster, ordered by cluster id. The count column is
# `n`; the id column is literally named `clusters$ClusterId`.
clusterSizes <- group_by(clusters, clusters$ClusterId) %>% count()
clusterSizes <- dplyr::arrange(clusterSizes, clusterSizes$`clusters$ClusterId`)

# ---- target function -----------------------------------------------------------

##output target function for paper
## using only one of the following
# keijzer-4 is active because `qualities` above is taken from R2.keijzer4;
# switch both together when analysing another problem.
xi <- seq(0, 9.99, 0.1)
target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi) * sin(xi) * cos(xi) - 1)
problemName <- "keijzer-4"
target <- target_keijzer4
#
#
#xi <- seq(0, 99, 1)
#target_keijzer9 <- log(xi + sqrt(xi*xi+1))
#problemName <- "keijzer-9"
#target <- target_keijzer9
#
#xi <- seq(-5, 4.9, 0.1)
#target_pagie1d <- 1 / (1+xi^-4)
#problemName <- "pagie-1d"
#target <- target_pagie1d
#
#xi <- seq(-1, 1, 0.0201)
#target_nguyen5 <- sin(xi*xi)*cos(xi)-1
#problemName <- "nguyen-5"
#target <- target_nguyen5
#
#xi <- seq(-1, 1, 0.0201)
#target_nguyen6 <- sin(xi)+sin(xi+xi*xi)
#problemName <- "nguyen-6"
#target <- target_nguyen6

target_df <- data.frame(xi, target)
ggplot(target_df, aes(x = xi, y = target)) +
  theme_void() +
  geom_line(alpha = 1, size = 0.3, color = "red")
# ggsave() without `plot` writes the most recently created plot.
ggsave(paste0(problemName, ".pdf"), device = "pdf", width = 2, height = 1.5, units = "cm")

# ---- best-ranked clusters, scaled onto the target --------------------------------

# clusterStats is sorted by descending average R², so row i is the i-th best
# cluster. Bounded by the number of clusters in case there are fewer than 10.
for (i in seq_len(min(10, nrow(clusterStats)))) {
  #i <- 1
  clusterNumber <- clusterStats$Clusters[i]
  #i <- 9358
  cluster_n <- dplyr::filter(m, c == clusterNumber)
  cluster_n <- fit_cluster_to_reference(cluster_n, target)

  cluster_evals <- data.frame(x = seq(1, 100, 1), fx = t(cluster_n[, output_cols]))
  evals_cluster_n <- tidyr::gather(cluster_evals, "f", "fx", 2:ncol(cluster_evals))
  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()

  # Larger clusters are drawn more transparently, but never below alpha 0.1.
  size <- clusterSizes$n[clusterSizes$`clusters$ClusterId` == clusterNumber]
  # NOTE(review): `c` is not a standard ggplot2 aesthetic; presumably it is
  # there to get one line per function `f` -- verify the grouping.
  ggplot(evals_cluster_n, aes(x = x, y = fx, c = f)) +
    theme_void() +
    geom_line(alpha = max(0.1, 1 / size), size = 0.2)
  ggsave(paste0(problemName, "-", as.character(i), ".pdf"),
         device = "pdf", width = 2, height = 1.5, units = "cm")
}

#funs_in_cluster <- t(outputs)[,clusters$clusterId==clusterNumber]
#cor(method="pearson", target_keijzer4, t(outputs[18450, ]))
#plot(t(outputs)[,clusters$ClusterId==clusterNumber], target_keijzer4)

# ---- embedding scatter plot coloured by quality -----------------------------------

m_sub <- m[m$q < 1.0, ]
# Ascending by quality, so the best points are drawn last (on top).
m <- dplyr::arrange(m, q)

# plot mapped points (clusters)
ggplot(data = m, aes(x = x, y = y)) +
  geom_point(aes(color = q)) +
  scale_color_gradient2(low = "blue", mid = "yellow", high = "red", midpoint = 0.6) +
  scale_size_area(max_size = 1) +
  labs(color = "R²")
#  theme(legend.position = "none")
#  scale_color_gradient(low = "red",high = "black")
ggsave("phenotypic_clusters.pdf", device = "pdf", width = 11.66, height = 5.9,
       units = "cm", dpi = 600, scale = 2)

# qualities in clusters
# ggplot(data=m, aes(x=c, y=q)) + geom_boxplot(aes(group=c));

# plot mapped points (qualities)
#ggplot(data=m, aes(x=x, y=y)) +
#  geom_point(aes(color=q)) +
#  scale_color_gradientn(colors=heat.colors(30))
#;

# ---- cached mapping table -----------------------------------------------------------

#write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
# NOTE(review): if the cache was written with the write.csv2() call above
# (row names enabled by default), read.csv2() returns an extra leading column
# and `output_cols` would be shifted by one -- verify the file layout.
m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv")

# ---- one picture per cluster ----------------------------------------------------------

# produce all clusters (TODO: order by cluster size)
#TODO: check 2, 10
#check 143292, 122983
#check 118734, 118970
#cor(method='pearson',t(outputs[81824,]), t(outputs[92710,]))^2
#plot(t(outputs[122735,]))
#clusters

picIdx <- 0
for (i in clusterSizes$`clusters$ClusterId`) {
  #i <- 9358
  # Row i + 1 of `outputs` serves as the reference curve of cluster i.
  # NOTE(review): presumably cluster ids are 0-based indices of the
  # representative function -- confirm against the clustering program.
  representative <- t(outputs[i + 1, ])
  #representative
  cluster_n <- dplyr::filter(m, c == i)
  cluster_n <- fit_cluster_to_reference(cluster_n, representative)

  cluster_evals <- data.frame(x = seq(1, 100, 1), fx = t(cluster_n[, output_cols]))
  evals_cluster_n <- tidyr::gather(cluster_evals, "f", "fx", 2:ncol(cluster_evals))
  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
  ggplot(evals_cluster_n, aes(x = x, y = fx, c = f)) +
    theme_void() +
    geom_line(alpha = 1, size = 1)

  # File name is "<row> <position in row> <cluster id> .png", 50 pictures per
  # row; the blanks inserted by paste() are kept as in the existing output.
  row_num <- picIdx %/% 50
  ggsave(paste(as.character(row_num),
               as.character(picIdx - row_num * 50),
               as.character(i),
               #as.character(round(clusterStats$meanX[clusterStats$Clusters==i], 3)),
               #as.character(round(clusterStats$meanY[clusterStats$Clusters==i], 3)),
               ".png"),
         width = 4, height = 3)
  picIdx <- picIdx + 1
}

# ---- inspect one cluster -----------------------------------------------------------------

# inspect one cluster
cluster_n <- dplyr::filter(m, c == 0)
cluster_evals <- data.frame(x = seq(1, 100, 1), t(cluster_n[, output_cols]))
evals_cluster_n <- tidyr::gather(cluster_evals, "f", "fx", 2:ncol(cluster_evals))
p <- ggplot(evals_cluster_n, aes(x = x, y = fx, color = f)) + geom_line()
p

# ---- ranked clusters ------------------------------------------------------------------------

# plot ranked clusters
ggplot(clusterStats, aes(x = Rank, y = AvgQuality)) +
  geom_point() +
  labs(y = "Avg R²", x = paste("Cluster rank", "-", problemName))
ggsave(paste0(problemName, "-cluster-ranks.pdf"),
       device = "pdf", width = 6, height = 4.5, units = "cm")