Free cookie consent management tool by TermsFeed Policy Generator

Changeset 16174


Ignore:
Timestamp:
09/22/18 07:12:57 (4 years ago)
Author:
gkronber
Message:

#2886 made several changes to clustering scripts for GPTP paper

Location:
branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R
Files:
2 added
2 edited

Legend:

Unmodified
Added
Removed
  • branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/ClusteringScript.R

    r15936 r16174  
    1313# read from R binary (faster)
    1414evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
     15
     16nrow(evalData[evalData$R2.keijzer4 > 0.4,])
    1517
    1618max(evalData$R2.keijzer4);
     
    2123max(evalData$R2.nguyen7);
    2224
    23 outputs <- evalData[,10:109];
    2425
    2526#check zero mean, unit variance
     
    4647
    4748lv <- largeVis(t(outputs), dim=2, distance_method="Cosine",
    48                perplexity=100, K = 100, n_trees = 150, threads=4,
    49                save_neighbors = TRUE, save_edges = TRUE, verbose=TRUE) ;
    50 clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 20);
    51 
     49               perplexity=100, K = 300, n_trees = 50, threads=4,
     50               verbose=TRUE) ;
     51#clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 300);
     52
     53#use clusters calculated with C# program
     54clusters <- read.csv2("180801_clusters.txt", header = TRUE, sep=';')
    5255
    5356# calculate quality distribution for each cluster
    5457qualities <- evalData$R2.keijzer4;
    55 clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$clusters, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
     58clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$ClusterId, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
    5659
    5760clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ;
     
    6366clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality));
    6467clusterStats$Rank <- seq(1:nrow(clusterStats));
    65 ggplot(clusterStats, aes(x = Rank, y=AvgQuality)) + geom_point();
    66 
    67 write.csv2(clusters$clusters, "cluster_assignment_new.csv", sep = " ", dec = ".");
    68 
    69 #check clusters
    70 for(i in seq(1:nrow(clusterStats))) {
     68
     69# write.csv2(clusters$clusters, "cluster_assignment_180518.csv", sep = " ", dec = ".");
     70
     71##output target function for paper
     72## using only one of the following
     73#xi <- seq(0,9.99,0.1);
     74#target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1);
     75#problemName <- "keijzer-4"
     76#target <- target_keijzer4
     77#
     78#
     79#xi <- seq(0, 99, 1)
     80#target_keijzer9 <- log(xi + sqrt(xi*xi+1))
     81#problemName <- "keijzer-9" 
     82#target <- target_keijzer9
     83#
     84#xi <- seq(-5, 4.9, 0.1)
     85#target_pagie1d <- 1 / (1+xi^-4)
     86#problemName <- "pagie-1d" 
     87#target <- target_pagie1d
     88#
     89#xi <- seq(-1, 1, 0.0201)
     90#target_nguyen5 <- sin(xi*xi)*cos(xi)-1
     91#problemName <- "nguyen-5" 
     92#target <- target_nguyen5
     93#
     94#xi <- seq(-1, 1, 0.0201)
     95#target_nguyen6 <- sin(xi)+sin(xi+xi*xi)
     96#problemName <- "nguyen-6" 
     97#target <- target_nguyen6
     98
     99target_df <- data.frame(xi, target)
     100ggplot(target_df, aes(x=xi, y=target)) +
     101  theme_void() +
     102  geom_line(alpha=1, size = 0.3, color='red');
     103ggsave(gsub(" ", "", paste(problemName, ".pdf")),
     104       device="pdf",width=2, height=1.5, units="cm");
     105
     106for(i in seq(1:10)) {
     107  #i <- 1
    71108  clusterNumber <- clusterStats$Clusters[i] # number of cluster with smallest quality (error!)
    72   cluster_i <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==clusterNumber]), "rowNum", "value");
    73   xs <- rep(seq(1:100),nrow(cluster_i)/100) # reps must be the number of functions in the cluster
    74   ggplot(cluster_i, aes(x=xs, y=value, c=rowNum)) +
     109  #i <- 9358
     110  cluster_n <- dplyr::filter(m, c==clusterNumber);
     111  for(j in seq(1,nrow(cluster_n))) {
     112    #    j <- 2
     113    other <- t(cluster_n[j,])[5:104]
     114    max(other)
     115    temp <- data.frame(y=as.vector(target), x=other)
     116    cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
     117  }
     118  cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
     119  evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
     120 
     121  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
     122 
     123  size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==clusterNumber]
     124
     125    ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
    75126    theme_void() +
    76     geom_line(alpha=0.1);
    77  
    78   ggsave(paste(as.character(i), as.character(round(clusterStats$meanX[i], 3)), as.character(round(clusterStats$meanY[i], 3)), ".png"));
     127    geom_line(alpha=max(0.1, 1/size), size = 0.2);
     128 
     129  ggsave(gsub(" ", "", paste(problemName,"-",as.character(i),
     130                       ".pdf")),
     131         device="pdf",width=2, height=1.5, units="cm");
    79132}
    80133
    81 funs_in_cluster <- t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==1748]
    82 cor(method="pearson", target_keijzer4, t(outputs[20281, ]))
    83 plot(funs_in_cluster[,4], target_keijzer4)
    84 
    85 xi <- seq(0,9.99,0.1);
    86 #  x³  * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1)
    87 target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1);
    88 plot(xi, target_keijzer4);
    89 
    90 m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$clusters, q=qualities, outputs)
     134
     135#funs_in_cluster <- t(outputs)[,clusters$clusterId==clusterNumber]
     136#cor(method="pearson", target_keijzer4, t(outputs[18450, ]))
     137#plot(t(outputs)[,clusters$ClusterId==clusterNumber], target_keijzer4)
     138
     139m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$ClusterId, q=qualities, outputs)
    91140m_sub <- m[m$q<1.0,];
    92141
     142m <- dplyr::arrange(m, q);
     143
    93144# plot mapped points (clusters)
    94 ggplot(data=m, aes(x=x, y=y)) +
    95   geom_point(aes(color=c))  +
    96   theme(legend.position = "none")
     145ggplot(data=m[,], aes(x=x, y=y)) +
     146  geom_point(aes(color=q))+
     147  scale_color_gradient2(low="blue", mid="yellow", high="red", midpoint=0.6) +
     148  scale_size_area(max_size=1) +
     149  labs(color="R²", title="Phenotypic Embedding (R² with Keijzer4)")
     150#  theme(legend.position = "none")
    97151#  scale_color_gradient(low = "red",high = "black")
    98152;
    99153ggsave("phenotypic_clusters.png")
    100154
     155# qualities in clusters
     156#
     157ggplot(data=m, aes(x=c, y=q)) +
     158  geom_boxplot(aes(group=c));
     159 
     160
    101161
    102162# plot mapped points (qualities)
    103 ggplot(data=m, aes(x=x, y=y)) +
    104   geom_point(aes(color=q))  +
    105   scale_color_gradientn(colors=heat.colors(30))
    106 ;
     163#ggplot(data=m, aes(x=x, y=y)) +
     164#  geom_point(aes(color=q))  +
     165#  scale_color_gradientn(colors=heat.colors(30))
     166#;
    107167
    108168
     
    110170m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
    111171
    112 cluster_n <- dplyr::filter(m, c==9);
     172# produce all clusters (TODO: order by cluster size)
     173clusterSizes <- group_by(clusters, clusters$ClusterId) %>% count()
     174clusterSizes <- dplyr::arrange(clusterSizes, clusterSizes$`clusters$ClusterId`)
     175
     176#TODO: check 2, 10
     177#check 143292, 122983
     178#check 118734, 118970
     179#cor(method='pearson',t(outputs[81824,]), t(outputs[92710,]))^2
     180#plot(t(outputs[122735,]))
     181
     182#clusters
     183picIdx <- 0
     184for(i in clusterSizes$`clusters$ClusterId`) {
     185  #i <- 9358
     186  representative <- t(outputs[i+1,])
     187  #representative
     188  cluster_n <- dplyr::filter(m, c==i);
     189  for(j in seq(1,nrow(cluster_n))) {
     190#    j <- 2
     191    other <- t(cluster_n[j,])[5:104]
     192    max(other)
     193    temp <- data.frame(y=as.vector(representative), x=other)
     194    cluster_n[j,5:104] <- fitted(lm("y ~ x", temp))
     195  }
     196  cluster_evals <- data.frame(x=seq(1,100,1), fx=t(cluster_n[,5:104]))
     197  evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
     198
     199  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()
     200 
     201  ggplot(evals_cluster_n, aes(x=x, y=fx,c=f)) +
     202    theme_void() +
     203    geom_line(alpha=1, size = 1);
     204  size <- clusterSizes$n[clusterSizes$`clusters$ClusterId`==i]
     205  row_num <- picIdx %/% 50
     206  ggsave(paste(as.character(row_num),
     207               as.character(picIdx- row_num*50),
     208               as.character(i),
     209         #as.character(round(clusterStats$meanX[clusterStats$Clusters==i], 3)),
     210         #as.character(round(clusterStats$meanY[clusterStats$Clusters==i], 3)),
     211              ".png"),
     212         width=4, height=3);
     213  picIdx <- picIdx+1
     214}
     215
     216
     217# inspect one cluster
     218cluster_n <- dplyr::filter(m, c==0);
    113219cluster_evals <- data.frame(x=seq(1,100,1), t(cluster_n[,5:104]))
    114220evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals))
     
    116222p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line();
    117223p
     224
     225# plot ranked clusters
     226ggplot(clusterStats, aes(x=Rank, y=AvgQuality)) +
     227  geom_point() +
     228  labs(y='Avg R²', x=paste('Cluster rank','-',problemName))
     229ggsave(gsub(" ", "", paste(problemName,"-cluster-ranks.pdf")),
     230       device="pdf",
     231       width=6, height=4.5, units="cm"
     232       )
     233
  • branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/FindClustersForGPLog.R

    r15936 r16174  
    11# find nearest neighbors for GP indiviuals
    22
     3library(dplyr)
     4library(hexbin)
    35library(ggplot2);
     6#library(doSNOW); #dopar
     7#library(foreach); # dopar
    48
    5 gp_log <- read.csv2("C:/Users/P24581/filebox/GPTP 2018/symbreg-models-13.05.2018-1418.csv",header = FALSE,sep='\t',dec=',');
     9#cl<-makeCluster(2) #change the 2 to your number of CPU cores
     10#registerDoSNOW(cl)
     11
     12setwd("C:/Users/P24581/filebox/GPTP 2018/Keijzer4");
     13
     14gp_log <- read.csv2("symbreg-models-13.05.2018-1412.csv.gz",header = FALSE,sep='\t',dec=',');
    615idx <- seq(1:nrow(gp_log));
    716# check popSize in gp_log
    8 gp_log[seq(1,34000,500),1]
     17gp_log[seq(1,20500,500),1]
    918popSize <- 500;
    1019
     
    1322
    1423# generations <- seq(1,34000/popSize,1);
    15 generations <- seq(1,15,1);
     24generations <- seq(1,41,1);
     25
    1626numClusters <- max(m$c);
    1727
    1828gp_evals <- gp_log[,seq(3,202,2)];
    19 all_evals <- m[,6:105];
     29all_evals <- m[,5:104];
     30gen_i=1
     31stats <- data.frame(gen=generations,
     32                    numDiffClusters = -generations,
     33                    medianRank=-generations)
    2034for(gen_i in generations) {
    21 #gen_i <- 15;
     35 #gen_i <- 1;
    2236  selectedRows <- seq((gen_i - 1)*popSize + 1, gen_i * popSize,1);
    23   min(selectedRows)
    24   max(selectedRows)
     37  #min(select2edRows)
     38  #max(selectedRows)
    2539  xcorrel <- cor(t(all_evals[,]), t(gp_evals[selectedRows,]))^2
    26   mapped_gp_log <- m[max.col(t(xcorrel)), 1:5]
     40  xcorrel[is.na(xcorrel)] <- 0; # for constant expressions
     41 
     42  idxOfMaxCorrel <- data.frame(idx=max.col(t(xcorrel)));
     43 
     44  # number of different nearest neighbours
     45  numDistinctNeighbors <- count(distinct(.data=idxOfMaxCorrel))
     46  stats$numDiffClusters[gen_i] <- numDistinctNeighbors
     47 
     48  mapped_gp_log <- m[idxOfMaxCorrel$idx, 1:5]
    2749#check
    2850#cor(t(all_evals[128082,]), t(gp_evals[2,]))^2
    2951#max(cor(t(all_evals[,]), t(gp_evals[2,]))^2)
    30   ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + geom_point();
     52  ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + labs(caption=paste("# distinct neighbors",numDistinctNeighbors))+ geom_point();
    3153  ggsave(paste("scatter",gen_i,".png"))
     54 
     55#  bestFunc <- data.frame(x = seq(0,9.8,0.1),
     56#                         t(m[idxOfMaxCorrel$idx[1], 6:104]),
     57#                         t(gp_evals[selectedRows[1],1:99]));
     58#  # best neighbor
     59#  ggplot(bestFunc, aes(x=x)) +
     60#    geom_line(aes(y=scale.default(bestFunc[,2]), color=rep("Nearest Neighbor",nrow(bestFunc)))) +
     61#    geom_line(aes(y=scale.default(bestFunc[,3]), color=rep("Best GP Solution",nrow(bestFunc)))) +
     62#    guides(fill="legend") +
     63#    labs(y="f(x)", color="Function");
     64#  ggsave(paste("best_function",gen_i,".png"))
     65 
     66# count number of unclustered   
     67#  count(m, m$c==-1)
    3268
    33   ggplot(mapped_gp_log, aes(x=c)) +xlim(0,numClusters+1) + geom_histogram(binwidth = 1);
    34   ggsave(paste("cluster_freq",gen_i,".png"))
     69  #plot expr freq by ranks
     70  mapped_gp_log_with_clusters <- dplyr::inner_join(mapped_gp_log, clusterStats, by=c("c" = "Clusters"))
     71  stats$medianRank[gen_i] <- median(mapped_gp_log_with_clusters$Rank)
     72 
     73  mapped_gp_log_grouped <- mapped_gp_log_with_clusters %>% group_by(c) %>% summarise(count = n())
     74  mapped_gp_log_grouped <- dplyr::inner_join(mapped_gp_log_grouped, clusterStats, by=c("c"="Clusters"))
     75  ggplot(mapped_gp_log_grouped, aes(x=Rank,y=count)) +
     76    scale_y_log10() +
     77    xlim(0,17540) +
     78    geom_point()
     79  ggsave(paste("cluster-freqs",gen_i,".pdf"),
     80         device=pdf,width=3)
     81 
     82 
     83  #plot hex map
     84  ggplot(mapped_gp_log, aes(x=x, y=y)) +
     85    xlim(-60,60) +
     86    ylim(-60,60) +
     87    theme_void() +
     88    geom_hex()
     89 
     90  ggsave(paste("hexbin",gen_i,".pdf"),
     91         device=pdf,width=3,height=2)
     92 
     93#    ggplot(mapped_gp_log, aes(x=c)) +
     94#  xlim(-2,numClusters+1) +
     95#  geom_histogram(binwidth = 1) +
     96#  ylim(0,500);
     97#  ggsave(paste("cluster_freq",gen_i,".png"))
    3598}
     99
     100# plot line chart of stats
     101ggplot(stats, aes(x=gen)) +
     102  xlim(1,40) +
     103  geom_line(aes(y=medianRank)) + 
     104  theme_classic() +
     105  labs(x="Generations", y="Median cluster rank") +
     106  scale_y_log10()
     107ggsave("gp-medianClusterRank.pdf", device=pdf)
     108
     109print(stats$numDiffClusters)
     110
     111stats$numDiffClusters <- unlist(stats$numDiffClusters)
     112
     113ggplot(stats, aes(x=gen, y=numDiffClusters)) +
     114  xlim(1,40) +
     115  geom_line() + 
     116  theme_classic() +
     117  labs(x="Generations", y="Number of explored clusters")
     118ggsave("gp-numClusters.pdf", device=pdf)
Note: See TracChangeset for help on using the changeset viewer.