Changeset 15927


Ignore:
Timestamp:
05/03/18 19:39:55 (4 years ago)
Author:
gkronber
Message:

#2886 made some changes to the R script for clustering and visualzation for the book chapter

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/ClusteringScript.R

    r15924 r15927  
    33library(dplyr)
    44
     5setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
     6
     7sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz";
     8
     9# read from CSV and store as R binary (must be done once to produce the .rds file)
     10#evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
     11#saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
     12
     13# read from R binary (faster)
     14evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
     15
     16max(evalData$R2.keijzer4);
     17max(evalData$R2.keijzer9);
     18max(evalData$R2.pagie);
     19max(evalData$R2.nguyen5);
     20max(evalData$R2.nguyen6);
     21max(evalData$R2.nguyen7);
     22
     23outputs <- evalData[,10:109];
     24
     25#check zero mean, unit variance
     26#mean(t(outputs[2,]))
     27#sd(t(outputs[2,]))
     28
     29# check
     30# plot(t(outputs[4,]))
     31
     32#apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)
     33# check ANN
     34#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
     35#xs <- rep(seq(1:100),100)
     36#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
     37
     38#edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
     39#clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);
     40
     41# check cluster
     42#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
     43#xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
     44#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
    545
    646
    7 sentenceFileName <- "D:/heal/documents/trunk/Publications/2018/GPTP/data/evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz";
    8 evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
    9 qualities <- evalData$R2.keijzer4;
    10 outputs <- evalData[,6:105];
    11 
    12 lv <- largeVis(outputs, dim=2, K = 50, n_trees = 50) # TODO scale?
    13 clusters <- hdbscan(lv, minPts = 3, K=50);
     47lv <- largeVis(t(outputs), dim=2, distance_method="Cosine",
     48               perplexity=100, K = 100, n_trees = 150, threads=4,
     49               save_neighbors = TRUE, save_edges = TRUE, verbose=TRUE) ;
     50clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 20);
    1451
    1552
     53# calculate quality distribution for each cluster
     54qualities <- evalData$R2.keijzer4;
     55clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$clusters, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
     56
     57clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ;
     58clusterQualityStdDev <- clusterQualities %>% group_by(Clusters) %>% summarize(StdDevQuality = sd(Qualities));
     59clusterQualityCount <- clusterQualities %>% group_by(Clusters) %>% summarize(Count = n());
     60clusterXCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanX = mean(x));
     61clusterYCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanY = mean(y));
     62clusterStats <- clusterQualityAvg %>% full_join(clusterQualityStdDev, by="Clusters") %>% full_join(clusterQualityCount, by="Clusters") %>% full_join(clusterXCenter, by ="Clusters") %>% full_join(clusterYCenter, by="Clusters");
     63clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality));
     64clusterStats$Rank <- seq(1:nrow(clusterStats));
     65ggplot(clusterStats, aes(x = Rank, y=AvgQuality)) + geom_point();
     66
     67write.csv2(clusters$clusters, "cluster_assignment_new.csv", sep = " ", dec = ".");
     68
     69#check clusters
     70for(i in seq(1:nrow(clusterStats))) {
     71  clusterNumber <- clusterStats$Clusters[i] # number of cluster with smallest quality (error!)
     72  cluster_i <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==clusterNumber]), "rowNum", "value");
     73  xs <- rep(seq(1:100),nrow(cluster_i)/100) # reps must be the number of functions in the cluster
     74  ggplot(cluster_i, aes(x=xs, y=value, c=rowNum)) +
     75    theme_void() +
     76    geom_line(alpha=0.1);
     77 
     78  ggsave(paste(as.character(i), as.character(round(clusterStats$meanX[i], 3)), as.character(round(clusterStats$meanY[i], 3)), ".png"));
     79}
     80
     81funs_in_cluster <- t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==1748]
     82cor(method="pearson", target_keijzer4, t(outputs[20281, ]))
     83plot(funs_in_cluster[,4], target_keijzer4)
     84
     85xi <- seq(0,9.99,0.1);
     86#  x³  * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1)
     87target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1);
     88plot(xi, target_keijzer4);
    1689
    1790m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$clusters, q=qualities, outputs)
    18 # plot mapped points
    19 ggplot(data=m, aes(x=x, y=y)) + geom_point(aes(color=c))  + theme(legend.position = "none") ;
     91m_sub <- m[m$q<1.0,];
     92
     93# plot mapped points (clusters)
     94ggplot(data=m, aes(x=x, y=y)) +
     95  geom_point(aes(color=c))  +
     96  theme(legend.position = "none")
     97#  scale_color_gradient(low = "red",high = "black")
     98;
     99ggsave("phenotypic_clusters.png")
     100
     101
     102# plot mapped points (qualities)
     103ggplot(data=m, aes(x=x, y=y)) +
     104  geom_point(aes(color=q))  +
     105  scale_color_gradientn(colors=heat.colors(30))
     106;
     107
     108
     109write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz");
    20110
    21111cluster_n <- dplyr::filter(m, c==5);
     
    25115p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line();
    26116p
    27 
Note: See TracChangeset for help on using the changeset viewer.