Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
09/22/18 07:12:57 (6 years ago)
Author:
gkronber
Message:

#2886 made several changes to clustering scripts for GPTP paper

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/FindClustersForGPLog.R

    r15936 r16174  
    11# find nearest neighbors for GP indiviuals
    22
     3library(dplyr)
     4library(hexbin)
    35library(ggplot2);
     6#library(doSNOW); #dopar
     7#library(foreach); # dopar
    48
    5 gp_log <- read.csv2("C:/Users/P24581/filebox/GPTP 2018/symbreg-models-13.05.2018-1418.csv",header = FALSE,sep='\t',dec=',');
     9#cl<-makeCluster(2) #change the 2 to your number of CPU cores
     10#registerDoSNOW(cl)
     11
     12setwd("C:/Users/P24581/filebox/GPTP 2018/Keijzer4");
     13
     14gp_log <- read.csv2("symbreg-models-13.05.2018-1412.csv.gz",header = FALSE,sep='\t',dec=',');
    615idx <- seq(1:nrow(gp_log));
    716# check popSize in gp_log
    8 gp_log[seq(1,34000,500),1]
     17gp_log[seq(1,20500,500),1]
    918popSize <- 500;
    1019
     
    1322
    1423# generations <- seq(1,34000/popSize,1);
    15 generations <- seq(1,15,1);
     24generations <- seq(1,41,1);
     25
    1626numClusters <- max(m$c);
    1727
    1828gp_evals <- gp_log[,seq(3,202,2)];
    19 all_evals <- m[,6:105];
     29all_evals <- m[,5:104];
     30gen_i=1
     31stats <- data.frame(gen=generations,
     32                    numDiffClusters = -generations,
     33                    medianRank=-generations)
    2034for(gen_i in generations) {
    21 #gen_i <- 15;
     35 #gen_i <- 1;
    2236  selectedRows <- seq((gen_i - 1)*popSize + 1, gen_i * popSize,1);
    23   min(selectedRows)
    24   max(selectedRows)
     37  #min(select2edRows)
     38  #max(selectedRows)
    2539  xcorrel <- cor(t(all_evals[,]), t(gp_evals[selectedRows,]))^2
    26   mapped_gp_log <- m[max.col(t(xcorrel)), 1:5]
     40  xcorrel[is.na(xcorrel)] <- 0; # for constant expressions
     41 
     42  idxOfMaxCorrel <- data.frame(idx=max.col(t(xcorrel)));
     43 
     44  # number of different nearest neighbours
     45  numDistinctNeighbors <- count(distinct(.data=idxOfMaxCorrel))
     46  stats$numDiffClusters[gen_i] <- numDistinctNeighbors
     47 
     48  mapped_gp_log <- m[idxOfMaxCorrel$idx, 1:5]
    2749#check
    2850#cor(t(all_evals[128082,]), t(gp_evals[2,]))^2
    2951#max(cor(t(all_evals[,]), t(gp_evals[2,]))^2)
    30   ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + geom_point();
     52  ggplot(mapped_gp_log, aes(x=x, y=y)) + xlim(-75,75) + ylim(-75,75) + labs(caption=paste("# distinct neighbors",numDistinctNeighbors))+ geom_point();
    3153  ggsave(paste("scatter",gen_i,".png"))
     54 
     55#  bestFunc <- data.frame(x = seq(0,9.8,0.1),
     56#                         t(m[idxOfMaxCorrel$idx[1], 6:104]),
     57#                         t(gp_evals[selectedRows[1],1:99]));
     58#  # best neighbor
     59#  ggplot(bestFunc, aes(x=x)) +
     60#    geom_line(aes(y=scale.default(bestFunc[,2]), color=rep("Nearest Neighbor",nrow(bestFunc)))) +
     61#    geom_line(aes(y=scale.default(bestFunc[,3]), color=rep("Best GP Solution",nrow(bestFunc)))) +
     62#    guides(fill="legend") +
     63#    labs(y="f(x)", color="Function");
     64#  ggsave(paste("best_function",gen_i,".png"))
     65 
     66# count number of unclustered   
     67#  count(m, m$c==-1)
    3268
    33   ggplot(mapped_gp_log, aes(x=c)) +xlim(0,numClusters+1) + geom_histogram(binwidth = 1);
    34   ggsave(paste("cluster_freq",gen_i,".png"))
     69  #plot expr freq by ranks
     70  mapped_gp_log_with_clusters <- dplyr::inner_join(mapped_gp_log, clusterStats, by=c("c" = "Clusters"))
     71  stats$medianRank[gen_i] <- median(mapped_gp_log_with_clusters$Rank)
     72 
     73  mapped_gp_log_grouped <- mapped_gp_log_with_clusters %>% group_by(c) %>% summarise(count = n())
     74  mapped_gp_log_grouped <- dplyr::inner_join(mapped_gp_log_grouped, clusterStats, by=c("c"="Clusters"))
     75  ggplot(mapped_gp_log_grouped, aes(x=Rank,y=count)) +
     76    scale_y_log10() +
     77    xlim(0,17540) +
     78    geom_point()
     79  ggsave(paste("cluster-freqs",gen_i,".pdf"),
     80         device=pdf,width=3)
     81 
     82 
     83  #plot hex map
     84  ggplot(mapped_gp_log, aes(x=x, y=y)) +
     85    xlim(-60,60) +
     86    ylim(-60,60) +
     87    theme_void() +
     88    geom_hex()
     89 
     90  ggsave(paste("hexbin",gen_i,".pdf"),
     91         device=pdf,width=3,height=2)
     92 
     93#    ggplot(mapped_gp_log, aes(x=c)) +
     94#  xlim(-2,numClusters+1) +
     95#  geom_histogram(binwidth = 1) +
     96#  ylim(0,500);
     97#  ggsave(paste("cluster_freq",gen_i,".png"))
    3598}
     99
     100# plot line chart of stats
     101ggplot(stats, aes(x=gen)) +
     102  xlim(1,40) +
     103  geom_line(aes(y=medianRank)) + 
     104  theme_classic() +
     105  labs(x="Generations", y="Median cluster rank") +
     106  scale_y_log10()
     107ggsave("gp-medianClusterRank.pdf", device=pdf)
     108
     109print(stats$numDiffClusters)
     110
     111stats$numDiffClusters <- unlist(stats$numDiffClusters)
     112
     113ggplot(stats, aes(x=gen, y=numDiffClusters)) +
     114  xlim(1,40) +
     115  geom_line() + 
     116  theme_classic() +
     117  labs(x="Generations", y="Number of explored clusters")
     118ggsave("gp-numClusters.pdf", device=pdf)
Note: See TracChangeset for help on using the changeset viewer.