library(largeVis)
library(ggplot2)
library(dplyr)

#setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
# Name of the gzipped CSV export. In the visible part of this script it is
# only referenced by the commented-out one-time CSV -> .rds conversion.
sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"
# Every relative path used below (input files and saved figures) is resolved
# against this directory.
setwd("C:/reps/HEAL/Publications-2018-GPTP/data")

# read from CSV and store as R binary (must be done once to produce the .rds file)
#evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
#saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");

# Load the evaluation table from its R binary form (faster than the CSV).
evalData <- readRDS(
  "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds"
)

# Number of rows selected by the condition R2.keijzer4 > 0.4.
nrow(evalData[evalData$R2.keijzer4 > 0.4, ])

# Largest value found in each of the R2 columns.
max(evalData$R2.keijzer4)
max(evalData$R2.keijzer9)
max(evalData$R2.pagie)
max(evalData$R2.nguyen5)
max(evalData$R2.nguyen6)
max(evalData$R2.nguyen7)


#check zero mean, unit variance
#mean(t(outputs[2,]))
#sd(t(outputs[2,]))

# check
# plot(t(outputs[4,]))

#apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)
# check ANN
#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
#xs <- rep(seq(1:100),100)
#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();

#edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
#clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);

# check cluster
#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
#xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();


# Embed the transposed outputs into two dimensions with largeVis, using the
# cosine distance.
# NOTE(review): `outputs` is not created in the visible part of this script;
# it has to exist before this point.
lv <- largeVis(
  t(outputs),
  dim = 2,
  distance_method = "Cosine",
  perplexity = 100,
  K = 300,
  n_trees = 50,
  threads = 4,
  verbose = TRUE
)
# Alternative (disabled): cluster directly on the embedding.
#clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 300);

# Use the cluster assignment produced by the external C# program instead.
clusters <- read.csv2("180801_clusters.txt", header = TRUE, sep = ";")

# Per-cluster quality statistics (mean, standard deviation, member count and
# embedding centroid), ranked by descending average quality.
# NOTE(review): qualities, cluster ids and embedding coordinates are combined
# purely by position — presumably all three share the same row order; confirm.
qualities <- evalData$R2.keijzer4
lvCoords <- t(lv$coords)

# data.frame() silently recycles a shorter column when its length divides the
# longest one, so insist on matching lengths before combining.
stopifnot(
  length(qualities) == length(clusters$ClusterId),
  length(qualities) == nrow(lvCoords)
)

clusterQualities <- data.frame(
  Qualities = qualities,
  Clusters = clusters$ClusterId,
  x = lvCoords[, 1],
  y = lvCoords[, 2]
)

# Group once and reuse the grouped table for every summary.
clusterQualitiesByCluster <- group_by(clusterQualities, Clusters)
clusterQualityAvg <- summarize(clusterQualitiesByCluster, AvgQuality = mean(Qualities))
clusterQualityStdDev <- summarize(clusterQualitiesByCluster, StdDevQuality = sd(Qualities))
clusterQualityCount <- summarize(clusterQualitiesByCluster, Count = n())
clusterXCenter <- summarize(clusterQualitiesByCluster, meanX = mean(x))
clusterYCenter <- summarize(clusterQualitiesByCluster, meanY = mean(y))

clusterStats <- clusterQualityAvg %>%
  full_join(clusterQualityStdDev, by = "Clusters") %>%
  full_join(clusterQualityCount, by = "Clusters") %>%
  full_join(clusterXCenter, by = "Clusters") %>%
  full_join(clusterYCenter, by = "Clusters")

# Rank 1 is the cluster with the highest average quality.
clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality))
# seq_len() yields an empty rank vector for an empty table, whereas
# seq(1:0) would yield c(1, 2) and fail on assignment.
clusterStats$Rank <- seq_len(nrow(clusterStats))

# write.csv2(clusters$clusters, "cluster_assignment_180518.csv", sep = " ", dec = ".");

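## Target function for the paper figures.
## Uncomment exactly one of the following blocks: it defines xi, target and
## problemName, all of which the plotting code below requires.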
#xi <- seq(0,9.99,0.1);
#target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi) * cos(xi) - 1);
#problemName <- "keijzer-4"
#target <- target_keijzer4
#
#
#xi <- seq(0, 99, 1)
#target_keijzer9 <- log(xi + sqrt(xi*xi+1))
#problemName <- "keijzer-9"  
#target <- target_keijzer9
#
#xi <- seq(-5, 4.9, 0.1)
#target_pagie1d <- 1 / (1+xi^-4)
#problemName <- "pagie-1d"  
#target <- target_pagie1d
#
#xi <- seq(-1, 1, 0.0201)
#target_nguyen5 <- sin(xi*xi)*cos(xi)-1
#problemName <- "nguyen-5"  
#target <- target_nguyen5
#
#xi <- seq(-1, 1, 0.0201)
#target_nguyen6 <- sin(xi)+sin(xi+xi*xi)
#problemName <- "nguyen-6"  
#target <- target_nguyen6

# Draw the selected target function as a bare red line and store it as
# "<problemName>.pdf" (blanks are stripped from the file name).
# xi, target and problemName must already be defined at this point.
target_df <- data.frame(xi, target)
ggplot(target_df, aes(x = xi, y = target)) +
  geom_line(alpha = 1, size = 0.3, color = "red") +
  theme_void()
ggsave(
  gsub(" ", "", paste0(problemName, ".pdf")),
  device = "pdf",
  width = 2,
  height = 1.5,
  units = "cm"
)

# Plot the (up to) ten clusters with the highest average quality. Every member
# of a cluster is linearly rescaled onto the target function (least-squares
# fit of target ~ member) and all rescaled members are drawn as faint lines
# into "<problemName>-<rank>.pdf".
# NOTE: this section reads `m` and `clusterSizes`, which this script only
# creates further down; both must exist before the loop is run.
# NOTE(review): columns 5:104 are assumed to hold the 100 output values, i.e.
# the layout x, y, c, q, outputs — confirm this also holds when `m` is read
# back from the mapping CSV.
outputCols <- 5:104
topClusterCount <- min(10, nrow(clusterStats))
for (i in seq_len(topClusterCount)) {
  # clusterStats is sorted by descending AvgQuality, so row i is the cluster
  # with the i-th highest average quality.
  clusterNumber <- clusterStats$Clusters[i]
  cluster_n <- dplyr::filter(m, c == clusterNumber)
  if (nrow(cluster_n) == 0) {
    warning("cluster ", clusterNumber, " has no rows in m; skipping",
            call. = FALSE)
    next
  }

  # Replace each member by its best linear fit to the target so that members
  # differing only in scale and offset coincide in the plot.
  for (j in seq_len(nrow(cluster_n))) {
    other <- unlist(cluster_n[j, outputCols], use.names = FALSE)
    temp <- data.frame(y = as.vector(target), x = other)
    cluster_n[j, outputCols] <- fitted(lm(y ~ x, data = temp))
  }

  # One column per member, then long format (f = member, fx = value).
  cluster_evals <- data.frame(x = seq(1, 100, 1), fx = t(cluster_n[, outputCols]))
  evals_cluster_n <- tidyr::gather(cluster_evals, "f", "fx", 2:ncol(cluster_evals))

  #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()

  # Larger clusters are drawn more transparently, but never below alpha 0.1.
  size <- clusterSizes$n[clusterSizes$`clusters$ClusterId` == clusterNumber]

  # group = f draws one line per member.
  p <- ggplot(evals_cluster_n, aes(x = x, y = fx, group = f)) +
    theme_void() +
    geom_line(alpha = max(0.1, 1 / size), size = 0.2)

  # Pass the plot explicitly instead of relying on the last plot created.
  ggsave(
    gsub(" ", "", paste(problemName, "-", as.character(i), ".pdf")),
    plot = p,
    device = "pdf",
    width = 2,
    height = 1.5,
    units = "cm"
  )
}


#funs_in_cluster <- t(outputs)[,clusters$clusterId==clusterNumber]
#cor(method="pearson", target_keijzer4, t(outputs[18450, ]))
#plot(t(outputs)[,clusters$ClusterId==clusterNumber], target_keijzer4)

# One row per evaluated function: embedding coordinates (x, y), cluster id (c),
# quality (q), followed by the columns of `outputs`.
m <- data.frame(
  x = t(lv$coords)[, 1],
  y = t(lv$coords)[, 2],
  c = clusters$ClusterId,
  q = qualities,
  outputs
)
# Rows with quality below 1, taken before the table is re-sorted.
m_sub <- m[m$q < 1.0, ]

# Sort by ascending quality.
m <- m %>% dplyr::arrange(q)

# Scatter plot of the 2-d embedding, coloured by quality (blue -> yellow -> red
# with the midpoint at 0.6), saved as phenotypic_clusters.pdf.
ggplot(m, aes(x = x, y = y, color = q)) +
  geom_point() +
  scale_color_gradient2(
    low = "blue",
    mid = "yellow",
    high = "red",
    midpoint = 0.6
  ) +
  scale_size_area(max_size = 1) +
  labs(color = "R²")
# Disabled alternatives:
#  theme(legend.position = "none")
#  scale_color_gradient(low = "red",high = "black")
ggsave(
  "phenotypic_clusters.pdf",
  device = "pdf",
  width = 11.66,
  height = 5.9,
  units = "cm",
  dpi = 600,
  scale = 2
)

# Distribution of quality within each cluster: one box per cluster id.
ggplot(m, aes(x = c, y = q, group = c)) +
  geom_boxplot()
  


# plot mapped points (qualities)
#ggplot(data=m, aes(x=x, y=y)) +
#  geom_point(aes(color=q))  +
#  scale_color_gradientn(colors=heat.colors(30)) 
#;


#write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
# Replace the in-memory mapping table by the version stored on disk.
m <- read.csv2(
  "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv"
)

# Number of members per cluster, ordered by cluster id
# (TODO: order by cluster size).
# The grouping column is literally named `clusters$ClusterId`; other parts of
# this script look it up under that name.
clusterSizes <- clusters %>%
  group_by(clusters$ClusterId) %>%
  count()
clusterSizes <- dplyr::arrange(
  clusterSizes,
  clusterSizes$`clusters$ClusterId`
)

#TODO: check 2, 10
#check 143292, 122983
#check 118734, 118970
#cor(method='pearson',t(outputs[81824,]), t(outputs[92710,]))^2
#plot(t(outputs[122735,]))

# Render one PNG per cluster. Every member is linearly rescaled onto the
# cluster's representative (least-squares fit of representative ~ member) and
# all rescaled members are drawn into one plot. The file name is
# "<row> <column> <cluster id> .png", where row and column are the position of
# the picture in a grid that is 50 pictures wide.
# NOTE(review): columns 5:104 are assumed to hold the 100 output values, i.e.
# the layout x, y, c, q, outputs — confirm for `m` as read from the CSV.
outputCols <- 5:104
picIdx <- 0
for (i in clusterSizes$`clusters$ClusterId`) {
  cluster_n <- dplyr::filter(m, c == i)
  if (nrow(cluster_n) == 0) {
    # Nothing to draw; the grid position is still consumed below.
    warning("cluster ", i, " has no rows in m; skipping", call. = FALSE)
  } else {
    # NOTE(review): the cluster id is used as a zero-based row index into
    # `outputs` — presumably each cluster is named after the index of its
    # representative function; confirm.
    representative <- as.vector(t(outputs[i + 1, ]))

    # Replace each member by its best linear fit to the representative.
    for (j in seq_len(nrow(cluster_n))) {
      other <- unlist(cluster_n[j, outputCols], use.names = FALSE)
      temp <- data.frame(y = representative, x = other)
      cluster_n[j, outputCols] <- fitted(lm(y ~ x, data = temp))
    }

    # One column per member, then long format (f = member, fx = value).
    cluster_evals <- data.frame(x = seq(1, 100, 1), fx = t(cluster_n[, outputCols]))
    evals_cluster_n <- tidyr::gather(cluster_evals, "f", "fx", 2:ncol(cluster_evals))

    #ggplot(cluster_evals, aes(x=x, y=cluster_evals$fx.4)) + geom_line()

    # group = f draws one line per member.
    p <- ggplot(evals_cluster_n, aes(x = x, y = fx, group = f)) +
      theme_void() +
      geom_line(alpha = 1, size = 1)

    row_num <- picIdx %/% 50
    # Pass the plot explicitly instead of relying on the last plot created.
    ggsave(
      paste(
        as.character(row_num),
        as.character(picIdx - row_num * 50),
        as.character(i),
        #as.character(round(clusterStats$meanX[clusterStats$Clusters==i], 3)),
        #as.character(round(clusterStats$meanY[clusterStats$Clusters==i], 3)),
        ".png"
      ),
      plot = p,
      width = 4,
      height = 3
    )
  }
  picIdx <- picIdx + 1
}


# Inspect a single cluster (id 0): draw the raw, unscaled outputs of all its
# members, one coloured line per member.
cluster_n <- m %>% dplyr::filter(c == 0)
cluster_evals <- data.frame(
  x = seq(1, 100, 1),
  t(cluster_n[, 5:104])
)
evals_cluster_n <- tidyr::gather(
  cluster_evals,
  "f",
  "fx",
  2:ncol(cluster_evals)
)

p <- ggplot(evals_cluster_n, aes(x = x, y = fx, color = f))
p <- p + geom_line()
p

# Average quality of every cluster against its rank, saved as
# "<problemName>-cluster-ranks.pdf" (blanks are stripped from the file name).
ggplot(clusterStats, aes(x = Rank, y = AvgQuality)) +
  geom_point() +
  labs(
    x = paste("Cluster rank", "-", problemName),
    y = "Avg R²"
  )
ggsave(
  gsub(" ", "", paste0(problemName, "-cluster-ranks.pdf")),
  device = "pdf",
  width = 6,
  height = 4.5,
  units = "cm"
)