# Phenotypic clustering of evaluated expressions.
#
# Pipeline (top to bottom):
#   1. load the pre-computed evaluation table (.rds),
#   2. embed the 100 output columns in 2-D with largeVis and cluster the
#      embedding with hdbscan,
#   3. compute per-cluster quality statistics (R2.keijzer4 column),
#   4. write one line plot per cluster plus two overview scatter plots,
#   5. re-load a previously exported mapping CSV and plot a single cluster.

library(largeVis)
library(ggplot2)
library(dplyr)

# NOTE(review): the working directory is machine-specific. Every relative path
# below (input .rds/.csv files and all generated .png/.csv files) resolves
# against it.
#setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
setwd("C:/reps/HEAL/Publications-2018-GPTP/data")

sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"

# read from CSV and store as R binary (must be done once to produce the .rds file)
#evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
#saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");

# read from R binary (faster)
evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds")

# Best quality value found for each benchmark column (auto-printed when the
# script is run interactively).
max(evalData$R2.keijzer4)
max(evalData$R2.keijzer9)
max(evalData$R2.pagie)
max(evalData$R2.nguyen5)
max(evalData$R2.nguyen6)
max(evalData$R2.nguyen7)

# Columns 10..109 hold the 100 output values of each row of evalData.
outputs <- evalData[, 10:109]

# Transposed once and reused: one column per row of evalData, one row per
# output column.
output_matrix <- t(outputs)

#check zero mean, unit variance
#mean(t(outputs[2,]))
#sd(t(outputs[2,]))
# check
# plot(t(outputs[4,]))

#apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)

# check ANN
#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
#xs <- rep(seq(1:100),100)
#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();

#edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
#clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);

# check cluster
#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
#xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();

# 2-D embedding of the output vectors, followed by density-based clustering of
# the embedding.
lv <- largeVis(
  output_matrix,
  dim = 2,
  distance_method = "Cosine",
  perplexity = 100,
  K = 100,
  n_trees = 150,
  threads = 4,
  save_neighbors = TRUE,
  save_edges = TRUE,
  verbose = TRUE
)
clusters <- hdbscan(lv, verbose = TRUE, threads = 4, minPts = 10, K = 20)

# Embedding coordinates with one row per point (lv$coords is transposed here
# exactly as in every use below).
coords <- t(lv$coords)

# calculate quality distribution for each cluster
qualities <- evalData$R2.keijzer4
clusterQualities <- data.frame(
  Qualities = qualities,
  Clusters = clusters$clusters,
  x = coords[, 1],
  y = coords[, 2]
)

# One row per cluster label: mean / sd of the quality, member count and the
# centroid in the embedding, ordered from highest to lowest mean quality.
# A single summarize() yields the same columns, in the same order, as joining
# five separate per-statistic summaries on "Clusters".
clusterStats <- clusterQualities %>%
  group_by(Clusters) %>%
  summarize(
    AvgQuality = mean(Qualities),
    StdDevQuality = sd(Qualities),
    Count = n(),
    meanX = mean(x),
    meanY = mean(y)
  ) %>%
  arrange(desc(AvgQuality))
clusterStats$Rank <- seq_len(nrow(clusterStats))

ggplot(clusterStats, aes(x = Rank, y = AvgQuality)) +
  geom_point()

# write.csv2() always writes sep = ";" and dec = ","; attempts to override
# them are ignored with a warning, so they are not passed. The file content is
# unchanged.
write.csv2(clusters$clusters, "cluster_assignment_new.csv")

#check clusters
# One PNG per cluster, visited in clusterStats order (descending mean quality).
for (i in seq_len(nrow(clusterStats))) {
  clusterNumber <- clusterStats$Clusters[i]

  # Skip the row that groups unlabelled points: comparing against NA would
  # turn the whole membership test into NA instead of selecting anything.
  # (Presumably these are hdbscan noise points -- confirm.)
  if (is.na(clusterNumber)) {
    next
  }

  # which() drops NA labels, matching the original `!is.na(...) & ... ==` mask.
  member_idx <- which(clusters$clusters == clusterNumber)
  member_values <- output_matrix[, member_idx, drop = FALSE]

  # Long format in column-major order: all values of the first member, then
  # all values of the second, ... so `xs` restarts at 1 for every member.
  cluster_i <- data.frame(
    xs = rep(seq_len(nrow(member_values)), times = ncol(member_values)),
    rowNum = rep(seq_len(ncol(member_values)), each = nrow(member_values)),
    value = as.vector(member_values)
  )

  # `group` draws one line per member.
  cluster_plot <- ggplot(cluster_i, aes(x = xs, y = value, group = rowNum)) +
    theme_void() +
    geom_line(alpha = 0.1)

  # File name is "<rank> <meanX> <meanY> .png" (paste() separates with spaces).
  ggsave(
    paste(
      i,
      round(clusterStats$meanX[i], 3),
      round(clusterStats$meanY[i], 3),
      ".png"
    ),
    plot = cluster_plot
  )
}

# Target function, defined before its first use below:
# x^3 * exp(-x) * cos(x) * sin(x) * (sin(x)^2 * cos(x) - 1)
xi <- seq(0, 9.99, 0.1)
target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) *
  (sin(xi) * sin(xi) * cos(xi) - 1)

# Manual inspection of one hard-coded cluster (1748) and one hard-coded row
# (20281); both ids are specific to a particular clustering run.
funs_in_cluster <- output_matrix[, which(clusters$clusters == 1748), drop = FALSE]
cor(target_keijzer4, t(outputs[20281, ]), method = "pearson")
plot(funs_in_cluster[, 4], target_keijzer4)
plot(xi, target_keijzer4)

# Embedding coordinates, cluster label, quality and the raw outputs side by
# side (columns: x, y, c, q, then the 100 output columns).
m <- data.frame(
  x = coords[, 1],
  y = coords[, 2],
  c = clusters$clusters,
  q = qualities,
  outputs
)
m_sub <- m[m$q < 1.0, ]

# plot mapped points (clusters)
cluster_map <- ggplot(data = m, aes(x = x, y = y)) +
  geom_point(aes(color = c)) +
  theme(legend.position = "none")
# scale_color_gradient(low = "red",high = "black")
cluster_map
ggsave("phenotypic_clusters.png", plot = cluster_map)

# plot mapped points (qualities)
ggplot(data = m, aes(x = x, y = y)) +
  geom_point(aes(color = q)) +
  scale_color_gradientn(colors = heat.colors(30))

#write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
# From here on `m` is the previously exported mapping, not the one built above.
m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv")

cluster_n <- dplyr::filter(m, c == 9)
# NOTE(review): columns 5:104 are the output columns of the in-memory `m`
# built above. If the CSV was produced by the commented-out write.csv2() call
# it would carry an extra leading row-name column, shifting these indices by
# one -- confirm against the file.
cluster_evals <- data.frame(x = seq(1, 100, 1), t(cluster_n[, 5:104]))
evals_cluster_n <- tidyr::gather(cluster_evals, "f", "fx", 2:ncol(cluster_evals))
p <- ggplot(evals_cluster_n, aes(x = x, y = fx, color = f)) +
  geom_line()
p