Context Navigation

ClusteringScript.R @ 15985

Visit:

Last change on this file since 15985 was 15936, checked in by gkronber, 7 years ago
#2886 mapping GP solutions in R
File size: 5.1 KB

Rev	Line
[15903]	1	library(largeVis)
	2	library(ggplot2)
[15924]	3	library(dplyr)
[15903]	4
[15936]	5	#setwd("D:/heal/documents/trunk/Publications/2018/GPTP/data");
	6	setwd("C:/reps/HEAL/Publications-2018-GPTP/data"); # machine-specific data directory; the relative file names below are resolved against it
[15927]	7	sentenceFileName <- "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"; # in the visible script only the commented-out read.csv call uses this name
[15903]	8
[15927]	9	# read from CSV and store as R binary (must be done once to produce the .rds file)
	10	#evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=",");
	11	#saveRDS(evalData, "evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
	12
	13	# read from R binary (faster)
	14	evalData <- readRDS("evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.rds");
	15
	16	max(evalData$R2.keijzer4); # largest R2 value per benchmark column (auto-printed when run interactively)
	17	max(evalData$R2.keijzer9);
	18	max(evalData$R2.pagie);
	19	max(evalData$R2.nguyen5);
	20	max(evalData$R2.nguyen6);
	21	max(evalData$R2.nguyen7);
	22
	23	outputs <- evalData[,10:109]; # 100 columns selected by position; presumably the function outputs at 100 sample points -- TODO confirm against the CSV header
	24
	25	#check zero mean, unit variance
	26	#mean(t(outputs[2,]))
	27	#sd(t(outputs[2,]))
	28
	29	# check
	30	# plot(t(outputs[4,]))
	31
	32	#apprNN <- randomProjectionTreeSearch(t(outputs), K=100, n_trees=50, distance_method="Euclidean", verbose=TRUE)
	33	# check ANN
	34	#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,apprNN[,5]]), "rowNum", "value");
	35	#xs <- rep(seq(1:100),100)
	36	#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
	37
	38	#edgeMatrix <- buildEdgeMatrix(t(outputs),apprNN, verbose=TRUE);
	39	#clusters <- hdbscan(edgeMatrix, apprNN, minPts = 10, K = 5, verbose=TRUE);
	40
	41	# check cluster
	42	#cluster_1 <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==3]), "rowNum", "value");
	43	#xs <- rep(seq(1:100),nrow(cluster_1)/100) # reps must be the number of functions in the cluster
	44	#ggplot(cluster_1, aes(x=xs, y=value, c=rowNum)) + geom_line();
	45
	46
	47	lv <- largeVis(t(outputs), dim=2, distance_method="Cosine", # 2-D embedding; t() turns each evalData row into one column
	48	perplexity=100, K = 100, n_trees = 150, threads=4,
	49	save_neighbors = TRUE, save_edges = TRUE, verbose=TRUE) ; # presumably neighbors/edges are kept because hdbscan(lv, ...) below needs them -- TODO confirm
	50	clusters <- hdbscan(lv, verbose=TRUE, threads=4, minPts = 10, K = 20); # clusters$clusters is used below as the cluster id of each function
	51
	52
	53	# calculate quality distribution for each cluster (quality = R2.keijzer4 column)
[15924]	54	qualities <- evalData$R2.keijzer4;
[15927]	55	clusterQualities <- data.frame(Qualities = qualities, Clusters = clusters$clusters, x=t(lv$coords)[,1], y=t(lv$coords)[,2] );
[15903]	56	# one summary table per statistic, each keyed by Clusters, joined into clusterStats afterwards
[15927]	57	clusterQualityAvg <- clusterQualities %>% group_by(Clusters) %>% summarize(AvgQuality = mean(Qualities)) ;
	58	clusterQualityStdDev <- clusterQualities %>% group_by(Clusters) %>% summarize(StdDevQuality = sd(Qualities));
	59	clusterQualityCount <- clusterQualities %>% group_by(Clusters) %>% summarize(Count = n());
	60	clusterXCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanX = mean(x));
	61	clusterYCenter <- clusterQualities %>% group_by(Clusters) %>% summarize(meanY = mean(y));
	62	clusterStats <- clusterQualityAvg %>% full_join(clusterQualityStdDev, by="Clusters") %>% full_join(clusterQualityCount, by="Clusters") %>% full_join(clusterXCenter, by ="Clusters") %>% full_join(clusterYCenter, by="Clusters");
	63	clusterStats <- dplyr::arrange(clusterStats, desc(AvgQuality)); # highest AvgQuality first
	64	clusterStats$Rank <- seq_len(nrow(clusterStats)); # seq_len() is empty for zero rows; seq(1:0) would give 1:2
	65	ggplot(clusterStats, aes(x = Rank, y=AvgQuality)) + geom_point();
[15924]	66
[15927]	67	write.csv2(clusters$clusters, "cluster_assignment_new.csv"); # write.csv2 always uses sep=";" and dec=","; the former sep/dec overrides were ignored with a warning
[15924]	68
[15927]	69	#check clusters: save one overlay plot per cluster, in clusterStats row order
	70	for(i in seq_len(nrow(clusterStats))) {
	71	clusterNumber <- clusterStats$Clusters[i] # id of the cluster in row i (clusterStats is sorted by descending AvgQuality)
	72	cluster_i <- tidyr::gather(dplyr::tbl_df(t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==clusterNumber]), "rowNum", "value");
	73	xs <- rep(seq(1:100),nrow(cluster_i)/100) # reps must be the number of functions in the cluster
	74	p <- ggplot(cluster_i, aes(x=xs, y=value, c=rowNum)) +
	75	theme_void() +
	76	geom_line(alpha=0.1);
	77	# pass the plot explicitly instead of relying on last_plot() inside the loop
	78	ggsave(paste(as.character(i), as.character(round(clusterStats$meanX[i], 3)), as.character(round(clusterStats$meanY[i], 3)), ".png"), plot = p);
	79	}
[15924]	80
[15927]	81	xi <- seq(0,9.99,0.1); # 100 sample points; defined first because the comparison below needs target_keijzer4
	82	# x^3 * exp(-x) * cos(x) * sin(x) * (sin(x)^2 * cos(x) - 1)
	83	target_keijzer4 <- xi^3 * exp(-xi) * cos(xi) * sin(xi) * (sin(xi)*sin(xi)*cos(xi) - 1);
	84	plot(xi, target_keijzer4);
	85
	86	funs_in_cluster <- t(outputs)[,!is.na(clusters$clusters) & clusters$clusters==1748] # hard-coded cluster id; presumably specific to one clustering run
	87	cor(method="pearson", target_keijzer4, t(outputs[20281, ])) # hard-coded row index of evalData
	88	plot(funs_in_cluster[,4], target_keijzer4)
	89
[15924]	90	m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$clusters, q=qualities, outputs) # per function: 2-D coordinates, cluster id, quality, then the output columns
[15927]	91	m_sub <- m[m$q<1.0,]; # NOTE(review): m_sub is not used anywhere in the visible script
[15924]	92
[15927]	93	# plot mapped points (clusters): scatter of the 2-D coordinates colored by cluster id
	94	ggplot(data=m, aes(x=x, y=y)) +
	95	geom_point(aes(color=c)) +
	96	theme(legend.position = "none")
	97	# scale_color_gradient(low = "red",high = "black")
	98	;
	99	ggsave("phenotypic_clusters.png") # no plot argument given, so ggsave uses its default (the last plot)
	100
	101
	102	# plot mapped points (qualities): same scatter colored by q; displayed only, no ggsave call follows
	103	ggplot(data=m, aes(x=x, y=y)) +
	104	geom_point(aes(color=q)) +
	105	scale_color_gradientn(colors=heat.colors(30))
	106	;
	107
	108
[15936]	109	#write.csv2(m, "mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv");
	110	m <- read.csv2("mapping_evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv"); # replaces the m built above with the mapping stored on disk
[15927]	111
[15936]	112	cluster_n <- dplyr::filter(m, c==9); # rows assigned to cluster 9 (hard-coded id)
[15924]	113	cluster_evals <- data.frame(x=seq(1,100,1), t(cluster_n[,5:104])) # NOTE(review): if the CSV was written by the commented-out write.csv2 (row names included), read.csv2 adds a leading column and 5:104 is shifted by one -- verify
	114	evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals)) # long format: key column f, value column fx, x kept as is
	115
	116	p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line(); # one colored line per value of f
	117	p

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format