Changeset 15927
- Timestamp:
- 05/03/18 19:39:55 (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/ClusteringScript.R
r15924 r15927 3 3 library(dplyr) 4 4 5 setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data"); 6 7 sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"; 8 9 # read from CSV and store as R binary (must be done once to produce the .rds file) 10 #evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=","); 11 #saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds"); 12 13 # read from R binary (faster) 14 evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds"); 15 16 max(evalData$R2.keijzer4); 17 max(evalData$R2.keijzer9); 18 max(evalData$R2.pagie); 19 max(evalData$R2.nguyen5); 20 max(evalData$R2.nguyen6); 21 max(evalData$R2.nguyen7); 22 23 outputs <- evalData[,10:109]; 24 25 #check zero mean, unit variance 26 #mean(t(outputs[2,])) 27 #sd(t(outputs[2,])) 28 29 # check 30 # plot(t(outputs[4,])) 31 32 #apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE) 33 # check ANN 34 #cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value"); 35 #xs <- rep(seq(1:100),100) 36 #ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line(); 37 38 #edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE); 39 #clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE); 40 41 # check cluster 42 #cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value"); 43 #xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster 44 #ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line(); 5 45 6 46 7 sentenceFileName <- "D:/heal/documents/trunk/Publications/2018/GPTP/data/evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"; 8 evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=","); 9 qualities <- evalData$R2.keijzer4; 10 outputs <- evalData[,6:105]; 11 12 lv <- largeVis(outputs, dim=2, K = 50, n_trees = 50) # TODO scale? 13 clusters <- hdbscan(lv, minPts = 3, K=50); 47 lv <- largeVis(t(outputs), dim=2, distance_method="Cosine", 48 perplexity=100, K = 100, n_trees = 150, threads=4, 49 save_neighbors = TRUE, save_edges = TRUE, verbose=TRUE) ; 50 clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 20); 14 51 15 52 53 # calculate quality distribution for each cluster 54 qualities <- evalData$R2.keijzer4; 55 clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$clusters, x=t(lv$coords)[,1], y=t(lv$coords)[,2] ); 56 57 clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ; 58 clusterQualityStdDev <- clusterQualities %>% group_by(Clusters) %>% summarize(StdDevQuality = sd(Qualities)); 59 clusterQualityCount <- clusterQualities %>% group_by(Clusters) %>% summarize(Count = n()); 60 clusterXCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanX = mean(x)); 61 clusterYCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanY = mean(y)); 62 clusterStats <- clusterQualityAvg %>% full_join(clusterQualityStdDev, by="Clusters") %>% full_join(clusterQualityCount, by="Clusters") %>% full_join(clusterXCenter, by ="Clusters") %>% full_join(clusterYCenter, by="Clusters"); 63 clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality)); 64 clusterStats$Rank <- seq(1:nrow(clusterStats)); 65 ggplot(clusterStats, aes(x = Rank, y=AvgQuality)) + geom_point(); 66 67 write.csv2(clusters$clusters, "cluster_assignment_new.csv", sep = " ", dec = "."); 68 69 #check clusters 70 for(i in seq(1:nrow(clusterStats))) { 71 clusterNumber <- clusterStats$Clusters[i] # number of cluster with smallest quality (error!) 72 cluster_i <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==clusterNumber]), "rowNum", "value"); 73 xs <- rep(seq(1:100),nrow(cluster_i)/100) # reps must be the number of functions in the cluster 74 ggplot(cluster_i, aes(x=xs, y=value, c=rowNum)) + 75 theme_void() + 76 geom_line(alpha=0.1); 77 78 ggsave(paste(as.character(i), as.character(round(clusterStats$meanX[i], 3)), as.character(round(clusterStats$meanY[i], 3)), ".png")); 79 } 80 81 funs_in_cluster <- t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==1748] 82 cor(method="pearson", target_keijzer4, t(outputs[20281, ])) 83 plot(funs_in_cluster[,4], target_keijzer4) 84 85 xi <- seq(0,9.99,0.1); 86 # x³ * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1) 87 target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1); 88 plot(xi, target_keijzer4); 16 89 17 90 m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$clusters, q=qualities, outputs) 18 # plot mapped points 19 ggplot(data=m, aes(x=x, y=y)) + geom_point(aes(color=c)) + theme(legend.position = "none") ; 91 m_sub <- m[m$q<1.0,]; 92 93 # plot mapped points (clusters) 94 ggplot(data=m, aes(x=x, y=y)) + 95 geom_point(aes(color=c)) + 96 theme(legend.position = "none") 97 # scale_color_gradient(low = "red",high = "black") 98 ; 99 ggsave("phenotypic_clusters.png") 100 101 102 # plot mapped points (qualities) 103 ggplot(data=m, aes(x=x, y=y)) + 104 geom_point(aes(color=q)) + 105 scale_color_gradientn(colors=heat.colors(30)) 106 ; 107 108 109 write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"); 20 110 21 111 cluster_n <- dplyr::filter(m, c==5); … … 25 115 p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line(); 26 116 p 27
Note: See TracChangeset
for help on using the changeset viewer.