I’m switching to R recently, and one of the challenges is to get the right package from the mass world. Rstudio provides CRAN download logs from 2012-10-01, which contain all hits to http://cran.rstudio.com/  related to packages.
Well, precisely we are going to find the popularity of packages among users who downloads from cran.rstudio.com, but I’ll not emphasize this later on.
The stats are calculated using logs from 2015-01-01 to 2015-05-14.
This download script is inspired by Felix Schönbrodt in this post , and changed quite a log.
Download files library ('lubridate' )dir.create("CRANlogs" ) base_url = 'http://cran-logs.rstudio.com/'  start = as.Date('2015-01-01' ) today = as.Date('2015-05-14' ) all_days = seq(start, today, by = 'day' ) missing_days = setdiff(as.character(all_days), tools::file_path_sans_ext(dir("CRANlogs" ), TRUE )) i = 0  total = length(missing_days) for  (dt in  missing_days) {  i = i + 1    print(paste0("Processing " , i, "/" , total, '...' ))   yearstr = year(as.Date(dt))   url = paste0(base_url, yearstr, '/' , dt, '.csv.gz' )   download.file(url, paste0('CRANlogs/' , dt, '.csv.gz' )) } 
Import data library (lubridate)library (data.table)library (dplyr)library (tidyr)DT = data.table() files = list.files("CRANlogs" , pattern='*.gz' , full.names=TRUE ) for  (file in  files) {    print(paste("Reading" , file, "..." ))     dt = tbl_dt(read.csv(file, as.is=TRUE ))     dt = dt[, `:=`(datetime=ymd_hms(paste(date, time)),                    r_version=as.factor(r_version),                    r_arch=as.factor(r_arch),                    r_os=as.factor(r_os),                    package=as.factor(package),                    date=NULL , time=NULL , ip_id=NULL , version=NULL )]     DT = rbind(DT, dt) } rm(dt) DT = tbl_dt(DT) setkey(DT, package, country) save(DT, file="CRANlogs/CRANlogs.RData" ) gc() 
Analysis Overview Let’s look through our table.
## Source: local data frame [6 x 7]
## 
##   size r_version r_arch      r_os package country            datetime
## 1 2579     3.1.3 x86_64   mingw32    MCTM      ES 2015-05-14 17:21:06
## 2 2579     3.2.0   i386   mingw32    MCTM      GB 2015-05-14 19:56:34
## 3 2579     3.1.3   i386   mingw32    MCTM      HK 2015-05-14 11:42:55
## 4 2579     3.2.0 x86_64   mingw32    MCTM      JP 2015-05-14 16:30:23
## 5 2579     3.1.3 x86_64 linux-gnu    MCTM      US 2015-05-14 15:17:15
## 6 2579     3.1.3 x86_64 linux-gnu    MCTM      US 2015-05-14 19:16:00
## [1] 41345731        7
##      NAME       NROW NCOL    MB
## [1,] DT   41,345,731    7 1,420
##      COLS                                                KEY            
## [1,] size,r_version,r_arch,r_os,package,country,datetime package,country
## Total: 1,420MB
We’ve got a 41345731 by 7 big table, using 1,420MB memory!
Now we will look into various statistics. Whenever the table is too long, we will show the top 10 rows only.
Most popular packages: library (dplyr)library (ggplot2)by_package = DT[, .N, keyby=package][order(-N)]; by_package 
## Source: local data table [7,845 x 2]
## 
##         package      N
## 1          Rcpp 614948
## 2       ggplot2 528963
## 3       stringr 474101
## 4        digest 471773
## 5          plyr 463810
## 6      reshape2 431532
## 7    colorspace 422595
## 8  RColorBrewer 407160
## 9    manipulate 356546
## 10       scales 350245
## ..          ...    ...
A histogram of downloads of packages Packages with download counts \<= 5e4 are ignored.
Downloads Time Series of Top 2 Rcpp and ggplot2 top2 = DT[.(c("Rcpp" , "ggplot2" ))] top2[, `:=`(date=as.Date(datetime))]; 
## Source: local data table [1,143,911 x 8]
## 
##       size r_version r_arch         r_os package country
## 1  2778359     3.0.3   i386      mingw32    Rcpp      NA
## 2  3148107     3.1.1 x86_64      mingw32    Rcpp      NA
## 3  3147121     3.1.1 x86_64      mingw32    Rcpp      NA
## 4  2667601     3.1.2 x86_64      mingw32    Rcpp      NA
## 5  2170103     3.1.2 x86_64 darwin13.4.0    Rcpp      A1
## 6  3028965     3.1.2 x86_64    linux-gnu    Rcpp      A1
## 7  3029813     3.1.2 x86_64 darwin13.4.0    Rcpp      A1
## 8  3148107     3.1.2 x86_64 darwin13.4.0    Rcpp      A1
## 9  3148107     3.1.0 x86_64      mingw32    Rcpp      A1
## 10 3148107     3.0.3 x86_64      mingw32    Rcpp      A1
## ..     ...       ...    ...          ...     ...     ...
## Variables not shown: datetime (time), date (date)
top2[, `:=`(week=week(datetime))]; 
## Source: local data table [1,143,911 x 9]
## 
##       size r_version r_arch         r_os package country
## 1  2778359     3.0.3   i386      mingw32    Rcpp      NA
## 2  3148107     3.1.1 x86_64      mingw32    Rcpp      NA
## 3  3147121     3.1.1 x86_64      mingw32    Rcpp      NA
## 4  2667601     3.1.2 x86_64      mingw32    Rcpp      NA
## 5  2170103     3.1.2 x86_64 darwin13.4.0    Rcpp      A1
## 6  3028965     3.1.2 x86_64    linux-gnu    Rcpp      A1
## 7  3029813     3.1.2 x86_64 darwin13.4.0    Rcpp      A1
## 8  3148107     3.1.2 x86_64 darwin13.4.0    Rcpp      A1
## 9  3148107     3.1.0 x86_64      mingw32    Rcpp      A1
## 10 3148107     3.0.3 x86_64      mingw32    Rcpp      A1
## ..     ...       ...    ...          ...     ...     ...
## Variables not shown: datetime (time), date (date), week (int)
top2_stat = top2[, .N, by=.(package, date)] qplot(x=date, y=N, data=top2_stat, geom='line' , color=package) 
top2_stat = top2[, .N, by=.(package, week)] qplot(x=week, y=N, data=top2_stat, geom='line' , color=package) 
R Popularity Among Countries We use total downloads of all packages to estimate the popularity of R in a country.
by_country = DT[, .N, keyby=country][order(-N)]; by_country 
## Source: local data table [225 x 2]
## 
##    country        N
## 1       US 15334569
## 2       CN  3517081
## 3       DE  2087872
## 4       FR  1644046
## 5       GB  1619252
## 6       IN  1551254
## 7       JP  1394623
## 8       ES  1299567
## 9       CA  1015317
## 10      KR   885624
## ..     ...      ...
m = ggplot(by_country[1 :25 ], aes(x=country, y=N)) m + geom_bar(stat="identity" ) + coord_flip() 
Popular R arch by_r_arch = DT[, .N, keyby=r_arch][order(-N)]; by_r_arch 
## Source: local data table [17 x 2]
## 
##         r_arch        N
## 1       x86_64 31425351
## 2         i386  5356496
## 3           NA  4186643
## 4         i686   328437
## 5         i486    29263
## 6         i586     9029
## 7       armv7l     5964
## 8          arm     2268
## 9        amd64      928
## 10     powerpc      510
## 11      armv6l      324
## 12      000000      215
## 13   powerpc64      120
## 14 powerpc64le      109
## 15       sparc       72
## 16    armv5tel        1
## 17                    1
m = ggplot(by_r_arch[1 :15 ], aes(x=r_arch, y=N)) title = "Downloads by R arch"  m + geom_bar(stat="identity" ) + scale_y_log10() + coord_flip() + labs(title=title) 
R Versions by_r_version = DT[, .N, keyby=r_version][order(-N)]; by_r_version 
## Source: local data table [45 x 2]
## 
##    r_version        N
## 1      3.1.2 15588548
## 2      3.1.3  6815858
## 3      3.1.1  4538881
## 4      3.2.0  4511164
## 5         NA  4186643
## 6      3.0.2  2298008
## 7      3.1.0  1512864
## 8      3.0.3   662776
## 9      3.0.1   484936
## 10    2.15.2   123897
## ..       ...      ...
m = ggplot(by_r_version[1 :10 ], aes(x=r_version, y=N)) title = "Downloads by R version"  m + geom_bar(stat="identity" ) + coord_flip() + labs(title=title) 
Popular R os by_r_os = DT[, .N, keyby=r_os][order(-N)]; by_r_os 
## Source: local data table [51 x 2]
## 
##            r_os        N
## 1       mingw32 23873832
## 2     linux-gnu  6681662
## 3            NA  4186643
## 4  darwin13.4.0  3502524
## 5  darwin10.8.0  2051859
## 6  darwin13.1.0   740996
## 7  darwin14.1.0   108816
## 8   darwin9.8.0    75073
## 9  darwin14.0.0    45906
## 10 darwin14.3.0    35019
## ..          ...      ...
m = ggplot(by_r_os[1 :10 ], aes(x=r_os, y=N)) title = "Downloads by R os"  m + geom_bar(stat="identity" ) + coord_flip() + labs(title=title) 
Downloads by country*r_arch tt = DT[, .N, keyby=.(country, r_arch)] top_country = by_country[1 :10 ] top_arch = by_r_arch[1 :10 ] selected = tt[.(top_country$country)] setkey(selected, r_arch) selected = selected[.(top_arch$r_arch)] qplot(x=country, y=N, data=selected, fill=r_arch, geom='bar' , stat = "identity" ) 
qplot(x=r_arch, y=N, data=selected, fill=country, geom='bar' , stat = "identity" ) 
Downloads by country*r_os tt = DT[, .N, keyby=.(country, r_os)] top_os = by_r_os[1 :6 ] selected = tt[.(top_country$country)] setkey(selected, r_os) selected = selected[.(top_os$r_os)] qplot(x=country, y=N, data=selected, fill=r_os, geom='bar' , stat = "identity" ) 
qplot(x=r_os, y=N, data=selected, fill=country, geom='bar' , stat = "identity" )