The package adds the following vector functions:
# sample_mode returns the statistical mode
sample_mode(c(1, 2, 2))
sample_mode(c(1, 2))
sample_mode(c(NA, NA, 1))
sample_mode(c(NA, NA, 1), na.rm = TRUE)
# bin creates an integer variable for quantile categories (corresponds to Stata xtile)
v <- c(NA, 1:10)
bin(v, n_quantiles = 3) # 3 groups based on terciles
bin(v, probs = c(0.3, 0.7)) # 3 groups based on two quantiles
bin(v, cutpoints = c(2, 3)) # 3 groups based on two cutpoints
# winsorize (default based on 5 x interquartile range)
v <- c(1:4, 99)
winsorize(v)
winsorize(v, replace = NA)
winsorize(v, probs = c(0.01, 0.99))
winsorize(v, cutpoints = c(1, 50))
# demean on multiple groups (i.e. multiple fixed effects)
demean(c(1,2), fe = c(1,1))
demean(c(NA,2), fe = list(c(1,2), c(1,3)))
demean(c(1,2), fe = list(c(NA,2), c(1,3)))
# setkeep keeps certain columns in place
DT <- data.table(
id = c(1,2),
v1 = c(1,1),
v2 = c(2,1)
)
setkeep(DT, list(id, v2))
setkeep(DT, -id)
setdrop(DT, v1)
# sum_up prints detailed summary statistics (corresponds to Stata summarize)
N <- 100
DT <- data.table(
id = 1:N,
v1 = sample(5, N, TRUE),
v2 = sample(1e6, N, TRUE)
)
sum_up(DT)
sum_up(DT, v2, d = TRUE)
sum_up(DT, starts_with("v"), by = v1)
# tab returns cross tabulation table (faster than tabulate)
tab(DT, id)
tab(DT, id, v1, w = v2)
# duplicates returns duplicated rows
DT <- data.table(a = rep(1:2, each = 3), b = 1:6)
duplicates(DT, by = a)
duplicates(DT, by = list(a,b))
`graph` is a wrapper for `ggplot2` functionalities, useful for interactive exploration of datasets.
N <- 10000
DT <- data.table(
id = sample(c("id1","id2","id3"), N, TRUE),
v1 = sample(c(1:5), N, TRUE),
v2 = rnorm(N, sd = 20),
v3 = sample(runif(100, max=100), N, TRUE)
)
DT[, v4 := (id=="id1")* v2 + rnorm(N, sd = 5)]
graph(DT)
graph(DT, by = id)
graph(DT, by = id, type = "boxplot")
graph(DT, list(v3, v4), along_with = v2)
graph(DT, list(v3, v4), along_with = v2, by = id, type = "loess")
`join` is a wrapper for data.table merge functionalities.
Stata | statar |
---|---|
`merge v1` | `join(x, y, kind = "outer")` |
`merge v1, keep(master matched)` | `join(x, y, kind = "left")` |
`merge v1, keep(matched using)` | `join(x, y, kind = "right")` |
`merge v1, keep(matched)` | `join(x, y, kind = "inner")` |
`merge v1, keep(matched) keepusing(v1)` | `join(x, y, kind = "semi")` |
`merge v1, keep(master) keepusing(v1)` | `join(x, y, kind = "anti")` |
`crossby` | `join(x, y, kind = "cross")` |
```r
# merge m:1 v1
join(x, y, kind = "outer", check = m~1)
```
- The option `gen` specifies the name of a new variable that identifies non-matched and matched rows (as in Stata).
```r
# merge m:1 v1, gen(_merge)
join(x, y, kind = "outer", gen = "_merge")
```
Functions with the prefix `set` modify the input data.table in place. Functions select variables similarly to `dplyr` syntax. Each function has a version that accepts strings, formulas or quoted expressions: its name is the original function's name with the suffix `_` (see the dplyr vignette for more details). For instance, the SE version of `sum_up` is `sum_up_`.
# NSE version
sum_up(DT, list(v2, v3), by = list(id,v1))
# SE version
sum_up_(DT, c("v2","v3"), by = c("id","v1"))