library(heatmaply)
#> Loading required package: plotly
#> Loading required package: ggplot2
#>
#> Attaching package: 'plotly'
#> The following object is masked from 'package:ggplot2':
#>
#> last_plot
#> The following object is masked from 'package:stats':
#>
#> filter
#> The following object is masked from 'package:graphics':
#>
#> layout
#> Loading required package: viridis
#> Loading required package: viridisLite
#>
#> ---------------------
#> Welcome to heatmaply version 0.11.1
#> Type ?heatmaply for the main documentation.
#> The github page is: https://github.com/talgalili/heatmaply/
#>
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/heatmaply/issues
#> Or contact: <tal.galili@gmail.com>
#>
#> To suppress this message use: suppressPackageStartupMessages(library(heatmaply))
#> ---------------------
library(dendextend)
#>
#> ---------------------
#> Welcome to dendextend version 1.5.2
#> Type citation('dendextend') for how to cite the package.
#>
#> Type browseVignettes(package = 'dendextend') for the package vignette.
#> The github page is: https://github.com/talgalili/dendextend/
#>
#> Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
#> Or contact: <tal.galili@gmail.com>
#>
#> To suppress this message use: suppressPackageStartupMessages(library(dendextend))
#> ---------------------
#>
#> Attaching package: 'dendextend'
#> The following object is masked from 'package:stats':
#>
#> cutree
Here eval=FALSE
as we have the final version in the heatmaplyExamples package.
Preparing the data:
# measles
# bil gates (20 Feb 2015): https://twitter.com/BillGates/status/568749655112228865
# http://graphics.wsj.com/infectious-diseases-and-vaccines/#b02g20t20w15
# http://www.opiniomics.org/recreating-a-famous-visualisation/
# http://www.r-bloggers.com/recreating-the-vaccination-heatmaps-in-r-2/
# source: http://www.r-bloggers.com/recreating-the-vaccination-heatmaps-in-r/
# measles <- read.csv("https://raw.githubusercontent.com/blmoore/blogR/master/data/measles_incidence.csv", header=T,
# skip=2, stringsAsFactors=F)
# dir.create("data")
# write.csv(measles, file = "data\\measles.csv")
# measles[measles == "-"] <- NA
measles[,-(1:2)] <- apply(measles[,-(1:2)], 2, as.numeric)
dim(measles)
# head(measles)
d2 <- aggregate(measles, list(measles[, "YEAR"]), "sum", na.rm = T)
rownames(measles) <- d2[,1]
d2 <- d2[, -c(1:4)]
d2 <- t(measles)
# head(measles)
dim(measles)
d2[1:5,1:5]
# head(md2)
#
# # per week
# d22 <- aggregate(measles, list(measles$YEAR, measles$WEEK), "sum", na.rm = T)
# rownames(measles2) <- paste(measles2[,1], d22[,2], sep = "_")
# d22 <- d22[, -c(1:5)]
# d22 <- t(measles2)
# md22 <- reshape2::melt(measles2)
# dim(measles2) # 51 3952
# pander::pander(sqrt(measles))
if(!file.exists("data\\sqrt_d2.csv")) write.csv(sqrt(measles), file = "data\\sqrt_d2.csv")
Let’s have measles in a long format.
library(heatmaplyExamples)
data(measles)
dim(measles)
#> [1] 59 72
md2 <- reshape2::melt(measles)
head(md2)
#> Var1 Var2 value
#> 1 ALABAMA 1930 4389
#> 2 ALASKA 1930 0
#> 3 AMERICAN.SAMOA 1930 0
#> 4 ARIZONA 1930 2107
#> 5 ARKANSAS 1930 996
#> 6 CALIFORNIA 1930 43585
We can look at the general timeline trend with different transformations.
library(ggplot2)
ggplot(md2, aes(x = Var2, y = value)) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(md2, aes(x = Var2, y = sqrt(value))) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(md2, aes(x = Var2, y = log(value+.1))) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# ggplot(md22, aes(x = Var2, y = value)) + geom_line(aes(group = Var1)) # + geom_smooth(size = 2)
#
# we miss the per state information and the patters of similarity
We can try several heatmaps settings until we get one that works. Notice the effect of the color scheme.
library(gplots)
#>
#> Attaching package: 'gplots'
#> The following object is masked from 'package:stats':
#>
#> lowess
library(viridis)
heatmap.2(measles)
heatmap.2(measles, margins = c(3, 9))
heatmap.2(measles,trace = "none", margins = c(3, 9))
heatmap.2(measles, trace = "none", margins = c(3, 9), dendrogram = "row", Colv = NULL)
# heatmap.2(sqrt(measles), Colv = NULL, trace = "none", margins = c(3, 9))
library(viridis)
heatmap.2(measles, Colv = NULL, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(measles, Colv = NULL, trace = "none", col =
#> viridis(200), : Discrepancy: Colv is FALSE, while dendrogram is `both'.
#> Omitting column dendogram.
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, trace = "none", col =
#> viridis(200), : Discrepancy: Colv is FALSE, while dendrogram is `both'.
#> Omitting column dendogram.
# hist(measles, col = )
library(ggplot2)
library(viridis)
hp <- qplot(x = as.numeric(measles), fill = ..count.., geom = "histogram")
hp + scale_fill_viridis(direction = -1) + xlab("Measles incedence") + theme_bw()
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using scaling doesn’t make much sense in this data.
Decisions for clustering (highlight_branches helps identify the topology of the dendrogram, it colors each branch based on its height):
DATA <- measles
d <- dist(sqrt(DATA))
library(dendextend)
dend_expend(d)[[3]]
#> dist_methods hclust_methods optim
#> 1 unknown ward.D 0.8435986
#> 2 unknown ward.D2 0.8499247
#> 3 unknown single 0.8600853
#> 4 unknown complete 0.8656224
#> 5 unknown average 0.8774877
#> 6 unknown mcquitty 0.8447603
#> 7 unknown median 0.8682800
#> 8 unknown centroid 0.8508883
# average seems to make the most sense
dend_row <- d %>% hclust(method = "average") %>% as.dendrogram
dend_row %>% highlight_branches %>% plot
What would be the optimal number of clusters? (it seems that 3 would be good)
library(dendextend)
dend_k <- find_k(dend_row)
plot(dend_k)
Let’s plot it:
Rowv <- dend_row %>% color_branches(k = 3)
heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace =
#> "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
#> column dendogram.
Rowv <- dend_row %>% color_branches(k = 3) %>% seriate_dendrogram(x=d)
heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace =
#> "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
#> column dendogram.
All of this work (other then finding that the hclust method should be average) can simply be done using the following code to produce an interactive heatmap using heatmaply:
library(heatmaply)
heatmaply(sqrt(measles), Colv = NULL, hclust_method = "average",
fontsize_row = 8,fontsize_col = 6,
k_row = NA, margins = c(60,170, 70,40),
xlab = "Year", ylab = "State", main = "Project Tycho's heatmap visualization \nof the measles vaccine"
)
We can also use plot_method = “plotly”, row_dend_left = TRUE to make the plot more similar to heatmap.2 using the plotly engine (this is a bit more experimental. The default, plot_method = “ggplot”, is often safer to use - but it does not yet support row_dend_left = TRUE properly).
heatmaply(sqrt(measles), Colv = NULL, hclust_method = "average",
fontsize_row = 8,fontsize_col = 6,
k_row = NA, margins = c(60,50, 70,90),
xlab = "Year", ylab = "State", main = "Project Tycho's heatmap visualization \nof the measles vaccine",
plot_method = "plotly", row_dend_left = TRUE
)
# save.image("example.Rdata")
sessionInfo()
#> R version 3.4.2 (2017-09-28)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 16.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/openblas-base/libblas.so.3
#> LAPACK: /usr/lib/libopenblasp-r0.2.18.so
#>
#> locale:
#> [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
#> [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
#> [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] gplots_3.0.1 heatmaplyExamples_0.2.0 dendextend_1.5.2
#> [4] heatmaply_0.11.1 viridis_0.4.0 viridisLite_0.2.0
#> [7] plotly_4.7.1 ggplot2_2.2.1.9000
#>
#> loaded via a namespace (and not attached):
#> [1] httr_1.3.1 tidyr_0.6.3 jsonlite_1.5
#> [4] foreach_1.4.3 gtools_3.5.0 shiny_1.0.5
#> [7] assertthat_0.2.0 stats4_3.4.2 yaml_2.1.14
#> [10] robustbase_0.92-7 backports_1.1.0 lattice_0.20-35
#> [13] glue_1.1.1 digest_0.6.12 RColorBrewer_1.1-2
#> [16] colorspace_1.3-2 httpuv_1.3.5 Matrix_1.2-11
#> [19] htmltools_0.3.6 plyr_1.8.4 pkgconfig_2.0.1
#> [22] xtable_1.8-2 purrr_0.2.3 mvtnorm_1.0-6
#> [25] scales_0.5.0.9000 gdata_2.18.0 whisker_0.3-2
#> [28] tibble_1.3.4 mgcv_1.8-22 nnet_7.3-12
#> [31] lazyeval_0.2.0 mime_0.5 magrittr_1.5
#> [34] mclust_5.3 evaluate_0.10.1 nlme_3.1-131
#> [37] MASS_7.3-47 class_7.3-14 tools_3.4.2
#> [40] registry_0.3 data.table_1.10.4 trimcluster_0.1-2
#> [43] stringr_1.2.0 kernlab_0.9-25 munsell_0.4.3
#> [46] cluster_2.0.6 fpc_2.1-10 bindrcpp_0.2
#> [49] compiler_3.4.2 caTools_1.17.1 rlang_0.1.2.9000
#> [52] grid_3.4.2 iterators_1.0.8 htmlwidgets_0.9
#> [55] crosstalk_1.0.0 labeling_0.3 bitops_1.0-6
#> [58] rmarkdown_1.6 gtable_0.2.0 codetools_0.2-15
#> [61] flexmix_2.3-14 TSP_1.1-5 reshape2_1.4.2
#> [64] R6_2.2.2 seriation_1.2-2 gridExtra_2.2.1
#> [67] knitr_1.16 prabclus_2.2-6 dplyr_0.7.2
#> [70] bindr_0.1 rprojroot_1.2 KernSmooth_2.23-15
#> [73] modeltools_0.2-21 stringi_1.1.5 Rcpp_0.12.12
#> [76] gclus_1.3.1 DEoptimR_1.0-8 diptest_0.75-7