[15903] | 1 | library(largeVis)
|
---|
| 2 | library(ggplot2)
|
---|
[15924] | 3 | library(dplyr)
|
---|
[15903] | 4 |
|
---|
# ---- Data location and loading ----
# NOTE(review): the working directory is machine-specific. Every relative path
# used by this script (inputs and generated files) resolves against it.
#setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
setwd("C:/reps/HEAL/Publications-2018-GPTP/data")
sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"

# The cached R binary uses the CSV name with the trailing ".gz" replaced by
# ".rds", so it is derived here instead of being spelled out a second time.
rdsFileName <- sub("\\.gz$", ".rds", sentenceFileName)

# read from CSV and store as R binary (must be done once to produce the .rds file)
#evalData <- read.csv(sentenceFileName, header = TRUE, sep = ";", dec = ",");
#saveRDS(evalData, rdsFileName);

# read from R binary (faster)
if (!file.exists(rdsFileName)) {
  stop(
    "Cached data file '", rdsFileName, "' not found in '", getwd(),
    "'; run the commented-out read.csv()/saveRDS() lines once to create it.",
    call. = FALSE
  )
}
evalData <- readRDS(rdsFileName)
| 15 |
|
---|
# Largest R2 value found for each benchmark column of evalData
# (each expression auto-prints when the script is run at top level).
max(evalData[["R2.keijzer4"]])
max(evalData[["R2.keijzer9"]])
max(evalData[["R2.pagie"]])
max(evalData[["R2.nguyen5"]])
max(evalData[["R2.nguyen6"]])
max(evalData[["R2.nguyen7"]])

# Keep columns 10 through 109 (100 columns) of evalData as `outputs`.
# NOTE(review): presumably these are the evaluated function outputs - confirm
# against the CSV header.
outputs <- evalData[, seq.int(10, 109)]
| 24 |
|
---|
| 25 | #check zero mean, unit variance
|
---|
| 26 | #mean(t(outputs[2,]))
|
---|
| 27 | #sd(t(outputs[2,]))
|
---|
| 28 |
|
---|
| 29 | # check
|
---|
| 30 | # plot(t(outputs[4,]))
|
---|
| 31 |
|
---|
| 32 | #apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)
|
---|
| 33 | # check ANN
|
---|
| 34 | #cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
|
---|
| 35 | #xs <- rep(seq(1:100),100)
|
---|
| 36 | #ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
|
---|
| 37 |
|
---|
| 38 | #edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
|
---|
| 39 | #clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);
|
---|
| 40 |
|
---|
| 41 | # check cluster
|
---|
| 42 | #cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
|
---|
| 43 | #xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
|
---|
| 44 | #ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
|
---|
| 45 |
|
---|
| 46 |
|
---|
# 2-D largeVis embedding of the outputs. The data frame is transposed first,
# so every row of `outputs` becomes one column of the matrix handed to
# largeVis(). Neighbors and edges are saved on the result.
# NOTE(review): presumably hdbscan() below relies on the saved neighbors and
# edges - confirm against the largeVis documentation.
lv <- largeVis(
  t(outputs),
  dim = 2,
  K = 100,
  n_trees = 150,
  distance_method = "Cosine",
  perplexity = 100,
  threads = 4,
  save_neighbors = TRUE,
  save_edges = TRUE,
  verbose = TRUE
)

# Cluster assignment computed from the largeVis result.
clusters <- hdbscan(
  lv,
  K = 20,
  minPts = 10,
  threads = 4,
  verbose = TRUE
)
| 51 |
|
---|
| 52 |
|
---|
# calculate quality distribution for each cluster
qualities <- evalData$R2.keijzer4

# Transpose the embedding once; columns 1 and 2 are the plot coordinates.
lvCoords <- t(lv$coords)
clusterQualities <- data.frame(
  Qualities = qualities,
  Clusters = clusters$clusters,
  x = lvCoords[, 1],
  y = lvCoords[, 2]
)

# One pass over the grouped data yields every per-cluster statistic:
# mean and standard deviation of the quality, member count, and the
# centre of the cluster in the embedding. Rows whose cluster label is NA
# form their own group and therefore their own row.
clusterStats <- clusterQualities %>%
  group_by(Clusters) %>%
  summarize(
    AvgQuality = mean(Qualities),
    StdDevQuality = sd(Qualities),
    Count = n(),
    meanX = mean(x),
    meanY = mean(y)
  ) %>%
  arrange(desc(AvgQuality))

# Rank 1 is the cluster with the highest average quality.
# seq_len() stays empty for an empty table, unlike seq(1:nrow(...)).
clusterStats$Rank <- seq_len(nrow(clusterStats))

ggplot(clusterStats, aes(x = Rank, y = AvgQuality)) +
  geom_point()
[15924] | 66 |
|
---|
# Store the cluster label of every row. write.csv2() always writes
# ';'-separated fields with ',' as the decimal mark; it ignores `sep` and
# `dec` (with a warning), so those arguments are not passed. The file content
# is the same as before.
# NOTE(review): if a space-separated file was really intended, switch to
# write.table(..., sep = " ", dec = ".") and update whatever reads this file.
write.csv2(clusters$clusters, "cluster_assignment_new.csv")
[15924] | 68 |
|
---|
# check clusters: write one PNG per cluster, overlaying the output curves of
# all cluster members. Clusters are visited in the row order of clusterStats,
# so `i` is the rank of the cluster in that table.
outputMatrix <- t(outputs)         # loop-invariant: one column per row of `outputs`
clusterLabels <- clusters$clusters
for (i in seq_len(nrow(clusterStats))) {
  clusterNumber <- clusterStats$Clusters[i]

  # clusterStats can contain a row for unassigned points (NA label); no column
  # can be selected for it, so there is nothing to draw.
  if (is.na(clusterNumber)) {
    next
  }

  isMember <- !is.na(clusterLabels) & clusterLabels == clusterNumber
  # drop = FALSE keeps a matrix even if only a single member is selected.
  memberOutputs <- as.data.frame(outputMatrix[, isMember, drop = FALSE])

  # Long format: one row per (member, position); values are stacked member by
  # member, so the position index simply repeats once per member.
  cluster_i <- tidyr::gather(memberOutputs, "rowNum", "value")
  cluster_i$xs <- rep(seq_len(nrow(outputMatrix)), times = ncol(memberOutputs))

  # `group` draws one line per member (`c` is not a ggplot2 aesthetic).
  clusterPlot <- ggplot(cluster_i, aes(x = xs, y = value, group = rowNum)) +
    theme_void() +
    geom_line(alpha = 0.1)

  # File name: rank and rounded cluster centre, e.g. "1 0.123 -4.5.png"
  # (previously a stray space preceded the extension).
  fileName <- paste0(
    paste(i, round(clusterStats$meanX[i], 3), round(clusterStats$meanY[i], 3)),
    ".png"
  )
  # Pass the plot explicitly instead of relying on last_plot().
  ggsave(fileName, plot = clusterPlot)
}
[15924] | 80 |
|
---|
# Target function sampled on 100 points (0, 0.1, ..., 9.9). It has to exist
# before the correlation and scatter plot below, which previously used it
# before it was assigned.
# x^3 * exp(-x) * cos(x) * sin(x) * (sin(x)^2 * cos(x) - 1)
xi <- seq(0, 9.99, 0.1)
target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi) * sin(xi) * cos(xi) - 1)

# Outputs (one column per function) of the members of cluster 1748.
# NOTE(review): cluster id 1748 and row 20281 are hard-coded and only
# meaningful for one particular clustering run - confirm before reuse.
funs_in_cluster <- t(outputs)[, !is.na(clusters$clusters) & clusters$clusters == 1748]

# Pearson correlation between the target and row 20281 of `outputs`.
cor(method = "pearson", target_keijzer4, t(outputs[20281, ]))

# Fourth member of the cluster against the target, then the target itself.
plot(funs_in_cluster[, 4], target_keijzer4)
plot(xi, target_keijzer4)
| 89 |
|
---|
# Combined table: embedding coordinates (x, y), cluster label (c),
# quality (q), followed by all columns of `outputs`.
m <- data.frame(
  x = lv$coords[1, ],
  y = lv$coords[2, ],
  c = clusters$clusters,
  q = qualities,
  outputs
)

# Rows with a quality below 1.
m_sub <- m[m[["q"]] < 1.0, ]
[15924] | 92 |
|
---|
# Scatter plot of the embedding, one colour per cluster label; the legend is
# hidden. The plot is then saved by ggsave(), which uses the most recent plot.
ggplot(m, aes(x, y)) +
  geom_point(mapping = aes(colour = c)) +
  # scale_color_gradient(low = "red",high = "black")
  theme(legend.position = "none")
ggsave(filename = "phenotypic_clusters.png")
| 100 |
|
---|
| 101 |
|
---|
# Scatter plot of the embedding, coloured by quality on a 30-step
# heat.colors() gradient.
ggplot(m, aes(x, y)) +
  geom_point(mapping = aes(colour = q)) +
  scale_colour_gradientn(colours = heat.colors(30))
| 107 |
|
---|
| 108 |
|
---|
# Reload the mapping table from disk (this replaces the in-memory `m`).
#write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv")

# Members of cluster 9.
cluster_n <- dplyr::filter(m, c == 9)

# The output values are the last 100 columns of the table. A fixed range such
# as 5:104 only fits the in-memory table: write.csv2() stores the row names as
# an extra leading column by default, which shifts every column by one after
# read.csv2() and would pull in `q` while dropping the last output.
nPoints <- 100
stopifnot(ncol(cluster_n) >= nPoints)
outputCols <- tail(seq_len(ncol(cluster_n)), nPoints)

# One row per position 1..100, one column per cluster member.
cluster_evals <- data.frame(x = seq(1, nPoints, 1), t(cluster_n[, outputCols]))

# Long format (f = member, fx = value). Gathering "everything except x" also
# works when the cluster is empty, where 2:ncol(...) would count backwards.
evals_cluster_n <- tidyr::gather(cluster_evals, "f", "fx", -x)

p <- ggplot(evals_cluster_n, aes(x = x, y = fx, color = f)) +
  geom_line()
p