Title: | Simulation for Factorial Designs |
---|---|
Description: | Create datasets with factorial structure through simulation by specifying variable parameters. Extended documentation at <https://debruine.github.io/faux/>. Described in DeBruine (2020) <doi:10.5281/zenodo.2669586>. |
Authors: | Lisa DeBruine [aut, cre] , Anna Krystalli [ctb] , Andrew Heiss [ctb] |
Maintainer: | Lisa DeBruine <[email protected]> |
License: | MIT + file LICENSE |
Version: | 1.2.1.9002 |
Built: | 2024-11-10 04:00:54 UTC |
Source: | https://github.com/debruine/faux |
Add between factors
add_between(.data, .by = NULL, ..., .shuffle = FALSE, .prob = NULL)
add_between(.data, .by = NULL, ..., .shuffle = FALSE, .prob = NULL)
.data |
the data frame |
.by |
the grouping column (groups by row if NULL) |
... |
the names and levels of the new factors |
.shuffle |
whether to assign cells randomly or in "order" |
.prob |
probability of each level, equal if NULL |
data frame
add_random(subj = 4, item = 2) %>% add_between("subj", condition = c("cntl", "test")) %>% add_between("item", version = c("A", "B"))
add_random(subj = 4, item = 2) %>% add_between("subj", condition = c("cntl", "test")) %>% add_between("item", version = c("A", "B"))
Add a contrast to a data frame
add_contrast( data, col, contrast = c("anova", "sum", "treatment", "helmert", "poly", "difference"), levels = NULL, ..., add_cols = TRUE, colnames = NULL )
add_contrast( data, col, contrast = c("anova", "sum", "treatment", "helmert", "poly", "difference"), levels = NULL, ..., add_cols = TRUE, colnames = NULL )
data |
the data frame |
col |
the column to recode |
contrast |
the contrast to recode to |
levels |
the levels of the factor in order |
... |
arguments to pass to the contrast function (base or omit) |
add_cols |
whether to just add the contrast to the existing column or also to create new explicit columns in the dataset (default) |
colnames |
optional list of column names for the added contrasts |
the data frame with the recoded column and added columns (if add_cols == TRUE)
df <- sim_design(between = list(time = 1:6), plot = FALSE) %>% add_contrast("time", "poly") # test all polynomial contrasts lm(y ~ time, df) %>% broom::tidy() # test only the linear and quadratic contrasts lm(y ~ `time^1` + `time^2`, df) %>% broom::tidy()
df <- sim_design(between = list(time = 1:6), plot = FALSE) %>% add_contrast("time", "poly") # test all polynomial contrasts lm(y ~ time, df) %>% broom::tidy() # test only the linear and quadratic contrasts lm(y ~ `time^1` + `time^2`, df) %>% broom::tidy()
Add random factors to a data structure
add_random(.data = NULL, ..., .nested_in = NULL)
add_random(.data = NULL, ..., .nested_in = NULL)
.data |
the data frame |
... |
the new random factor column name and the number of values of the random factor (if crossed) or the n per group (if nested); can be a vector of n per group if nested |
.nested_in |
the column(s) to nest in (if NULL, the factor is crossed with all columns) |
a data frame
# start a data frame data1 <- add_random(school = 3) # nest classes in schools (2 classes per school) data2 <- add_random(data1, class = 2, .nested_in = "school") # nest pupils in each class (different n per class) data3 <- add_random(data2, pupil = c(20, 24, 23, 21, 25, 24), .nested_in = "class") # cross each pupil with 10 questions data4 <- add_random(data3, question = 10) # compare nesting in 2 different factors data <- add_random(A = 2, B = 2) add_random(data, C = 2, .nested_in = "A") add_random(data, C = 2, .nested_in = "B") # specify item names add_random(school = c("Hyndland Primary", "Hyndland Secondary")) %>% add_random(class = list(paste0("P", 1:7), paste0("S", 1:6)), .nested_in = "school")
# start a data frame data1 <- add_random(school = 3) # nest classes in schools (2 classes per school) data2 <- add_random(data1, class = 2, .nested_in = "school") # nest pupils in each class (different n per class) data3 <- add_random(data2, pupil = c(20, 24, 23, 21, 25, 24), .nested_in = "class") # cross each pupil with 10 questions data4 <- add_random(data3, question = 10) # compare nesting in 2 different factors data <- add_random(A = 2, B = 2) add_random(data, C = 2, .nested_in = "A") add_random(data, C = 2, .nested_in = "B") # specify item names add_random(school = c("Hyndland Primary", "Hyndland Secondary")) %>% add_random(class = list(paste0("P", 1:7), paste0("S", 1:6)), .nested_in = "school")
Add random effects to a data frame
add_ranef(.data, .by = NULL, ..., .cors = 0, .empirical = FALSE)
add_ranef(.data, .by = NULL, ..., .cors = 0, .empirical = FALSE)
.data |
the data frame |
.by |
the grouping column (groups by row if NULL) |
... |
the name and standard deviation of each random effect |
.cors |
the correlations among multiple random effects, to be passed to rnorm_multi as r |
.empirical |
logical. To be passed to rnorm_multi as empirical |
data frame with new random effects columns
add_random(rater = 2, stimulus = 2, time = 2) %>% add_ranef("rater", u0r = 1.5) %>% add_ranef("stimulus", u0s = 2.2, u1s = 0.75, .cors = 0.5) %>% add_ranef(c("rater", "stimulus"), u0sr = 1.2)
add_random(rater = 2, stimulus = 2, time = 2) %>% add_ranef("rater", u0r = 1.5) %>% add_ranef("stimulus", u0s = 2.2, u1s = 0.75, .cors = 0.5) %>% add_ranef(c("rater", "stimulus"), u0sr = 1.2)
Recode a categorical column
add_recode(.data, .col, .newcol = paste0(col, ".c"), ...)
add_recode(.data, .col, .newcol = paste0(col, ".c"), ...)
.data |
the data frame |
.col |
the column to recode |
.newcol |
the name of the recoded column (defaults to col.c) |
... |
coding for categorical column |
data frame with new fixed effects columns
add_random(subj = 4, item = 4) %>% add_between("subj", cond = c("cntl", "test")) %>% add_recode("cond", "cond.t", cntl = 0, test = 1)
add_random(subj = 4, item = 4) %>% add_between("subj", cond = c("cntl", "test")) %>% add_recode("cond", "cond.t", cntl = 0, test = 1)
Add within factors
add_within(.data, .by = NULL, ...)
add_within(.data, .by = NULL, ...)
.data |
the data frame |
.by |
the grouping column (groups by row if NULL) |
... |
the names and levels of the new factors |
data frame
add_random(subj = 2, item = 2) %>% add_within("subj", time = c("pre", "post"))
add_random(subj = 2, item = 2) %>% add_within("subj", time = c("pre", "post"))
Average r to Random Intercept SD
average_r2tau_0(average_r, sigma)
average_r2tau_0(average_r, sigma)
average_r |
The average inter-item correlation |
sigma |
Total error variance |
The standard deviation of the random intercept
Convert beta to normal
beta2norm(x, mu = 0, sd = 1, shape1 = NULL, shape2 = NULL, ...)
beta2norm(x, mu = 0, sd = 1, shape1 = NULL, shape2 = NULL, ...)
x |
the beta distributed vector |
mu |
the mean of the normal distribution to convert to |
sd |
the SD of the normal distribution to convert to |
shape1, shape2 |
non-negative parameters of the beta distribution |
... |
further arguments to pass to pbeta (e.g., ncp) |
a vector with a normal distribution
x <- rbeta(10000, 2, 3) y <- beta2norm(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rbeta(10000, 2, 3) y <- beta2norm(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert a binomial distribution to a normal (gaussian) distribution with specified mu and sd
binom2norm(x, mu = 0, sd = 1, size = NULL, prob = NULL)
binom2norm(x, mu = 0, sd = 1, size = NULL, prob = NULL)
x |
the binomially distributed vector |
mu |
the mean of the normal distribution to return |
sd |
the SD of the normal distribution to return |
size |
number of trials (set to max value of x if not specified) |
prob |
the probability of success on each trial (set to mean probability if not specified) |
a vector with a gaussian distribution
x <- rbinom(10000, 20, 0.75) y <- binom2norm(x, 0, 1, 20, 0.75) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rbinom(10000, 20, 0.75) y <- binom2norm(x, 0, 1, 20, 0.75) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Specify any number of within- and between-subject factors with any number of levels.
check_design( within = list(), between = list(), n = 100, mu = 0, sd = 1, r = 0, dv = list(y = "value"), id = list(id = "id"), vardesc = list(), plot = faux_options("plot"), design = NULL, fix_names = FALSE, sep = faux_options("sep") )
check_design( within = list(), between = list(), n = 100, mu = 0, sd = 1, r = 0, dv = list(y = "value"), id = list(id = "id"), vardesc = list(), plot = faux_options("plot"), design = NULL, fix_names = FALSE, sep = faux_options("sep") )
within |
a list of the within-subject factors |
between |
a list of the between-subject factors |
n |
the number of samples required |
mu |
a vector giving the means of the variables |
sd |
the standard deviations of the variables |
r |
the correlations among the variables (can be a single number, full correlation matrix as a matrix or vector, or a vector of the upper right triangle of the correlation matrix) |
dv |
the name of the DV column list(y = "value") |
id |
the name of the ID column list(id = "id") |
vardesc |
a list of variable descriptions having the names of the within- and between-subject factors |
plot |
whether to show a plot of the design |
design |
a design list including within, between, n, mu, sd, r, dv, id |
fix_names |
deprecated |
sep |
separator for factor levels |
Specify n for each between-subject cell; mu and sd for each cell, and r for the within-subject cells for each between-subject cell.
This function returns a validated design list for use in sim_data to simulate a data table with this design, or to archive your design.
See vignette("sim_design", package = "faux") for details.
list
within <- list(time = c("day", "night")) between <- list(pet = c("dog", "cat")) mu <- list(dog = 10, cat = 5) vardesc <- list(time = "Time of Day", pet = "Type of Pet") check_design(within, between, mu = mu, vardesc = vardesc) between <- list(language = c("dutch", "thai"), pet = c("dog", "cat")) mu <- list(dutch_dog = 12, dutch_cat = 7, thai_dog = 8, thai_cat = 3) check_design(within, between, mu = mu)
within <- list(time = c("day", "night")) between <- list(pet = c("dog", "cat")) mu <- list(dog = 10, cat = 5) vardesc <- list(time = "Time of Day", pet = "Type of Pet") check_design(within, between, mu = mu, vardesc = vardesc) between <- list(language = c("dutch", "thai"), pet = c("dog", "cat")) mu <- list(dutch_dog = 12, dutch_cat = 7, thai_dog = 8, thai_cat = 3) check_design(within, between, mu = mu)
Get error terms from an existing data table.
check_mixed_design(data, dv = 1, sub_id = 2, item_id = 3, formula = NULL)
check_mixed_design(data, dv = 1, sub_id = 2, item_id = 3, formula = NULL)
data |
the existing tbl |
dv |
the column name or index containing the DV |
sub_id |
the column name or index for the subject IDs |
item_id |
the column name or index for the item IDs |
formula |
the formula to run in lmer (defaults to null model dv ~ 1 + (1|sub_id) + (1|item_id)) |
a list of parameters
des <- check_mixed_design(fr4, "rating", "rater_id", "face_id") str(des[1:4])
des <- check_mixed_design(fr4, "rating", "rater_id", "face_id") str(des[1:4])
See vignette("codebook", package = "faux") for details.
codebook( data, name = NULL, vardesc = list(), ..., schemaVersion = "Psych-DS 0.1.0", return = c("json", "list", "data"), interactive = FALSE )
codebook( data, name = NULL, vardesc = list(), ..., schemaVersion = "Psych-DS 0.1.0", return = c("json", "list", "data"), interactive = FALSE )
data |
The data frame to generate a codebook for |
name |
The name of this dataset (if NULL, will be the same as 'data', limited to 64 characters) |
vardesc |
Optional variable properties in the format of a named list of vectors (can be named or unnamed and in the same order as the data) from the options "description", "privacy", "dataType", "identifier", "minValue", "maxValue", "levels", "levelsOrdered", "na", "naValue", "alternateName", "privacy", "unitCode", "unitText" |
... |
Further dataset properties (e.g., description, license, author, citation, funder, url, identifier, keywords, privacyPolicy) |
schemaVersion |
defaults to "Psych-DS 0.1.0" |
return |
Whether the output should be in JSON format (json), a list (list) or the reformatted data with the codebook as an attribute (data) |
interactive |
Whether the function should prompt the user to describe columns and factor levels |
a list or json-formatted codebook, or reformatted data with the codebook as an attribute
vardesc = list( description = c("Length of the sepal", "Width of the sepal", "Length of the petal", "Width of the petal", "The flower species"), type = c("float", "float", "float", "float", "string") ) codebook(iris, vardesc = vardesc)
vardesc = list( description = c("Length of the sepal", "Width of the sepal", "Length of the petal", "Width of the petal", "The flower species"), type = c("float", "float", "float", "float", "string") ) codebook(iris, vardesc = vardesc)
Anova coding (also called deviation or simple coding) sets the grand mean as the intercept. Each contrast compares one level with the reference level (base).
contr_code_anova(fct, levels = NULL, base = 1, colnames = NULL)
contr_code_anova(fct, levels = NULL, base = 1, colnames = NULL)
fct |
the factor to contrast code (or a vector) |
levels |
the levels of the factor in order |
base |
the index of the level to use as baseline |
colnames |
optional list of column names for the added contrasts |
the factor with contrasts set
df <- sim_design(between = list(pet = c("cat", "dog")), mu = c(10, 20), plot = FALSE) df$pet <- contr_code_anova(df$pet) lm(y ~ pet, df) %>% broom::tidy() df <- sim_design(between = list(pet = c("cat", "dog", "ferret")), mu = c(2, 4, 9), empirical = TRUE, plot = FALSE) df$pet <- contr_code_anova(df$pet, base = 1) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_anova(df$pet, base = 2) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_anova(df$pet, base = "ferret") lm(y ~ pet, df) %>% broom::tidy()
df <- sim_design(between = list(pet = c("cat", "dog")), mu = c(10, 20), plot = FALSE) df$pet <- contr_code_anova(df$pet) lm(y ~ pet, df) %>% broom::tidy() df <- sim_design(between = list(pet = c("cat", "dog", "ferret")), mu = c(2, 4, 9), empirical = TRUE, plot = FALSE) df$pet <- contr_code_anova(df$pet, base = 1) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_anova(df$pet, base = 2) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_anova(df$pet, base = "ferret") lm(y ~ pet, df) %>% broom::tidy()
Difference coding sets the grand mean as the intercept. Each contrast compares one level with the previous level.
contr_code_difference(fct, levels = NULL, colnames = NULL)
contr_code_difference(fct, levels = NULL, colnames = NULL)
fct |
the factor to contrast code (or a vector) |
levels |
the levels of the factor in order |
colnames |
optional list of column names for the added contrasts |
the factor with contrasts set
df <- sim_design(between = list(pet = c("cat", "dog", "ferret")), mu = c(2, 4, 9), empirical = TRUE, plot = FALSE) df$pet <- contr_code_difference(df$pet) lm(y ~ pet, df) %>% broom::tidy()
df <- sim_design(between = list(pet = c("cat", "dog", "ferret")), mu = c(2, 4, 9), empirical = TRUE, plot = FALSE) df$pet <- contr_code_difference(df$pet) lm(y ~ pet, df) %>% broom::tidy()
Helmert coding sets the grand mean as the intercept. Each contrast compares one level with the mean of previous levels.
contr_code_helmert(fct, levels = NULL, colnames = NULL)
contr_code_helmert(fct, levels = NULL, colnames = NULL)
fct |
the factor to contrast code (or a vector) |
levels |
the levels of the factor in order |
colnames |
optional list of column names for the added contrasts |
the factor with contrasts set
df <- sim_design(between = list(pet = c("cat", "dog")), mu = c(10, 20), plot = FALSE) df$pet <- contr_code_helmert(df$pet) lm(y ~ pet, df) %>% broom::tidy() df <- sim_design(between = list(pet = c("cat", "dog", "ferret")), mu = c(2, 4, 9), empirical = TRUE, plot = FALSE) df$pet <- contr_code_helmert(df$pet) lm(y ~ pet, df) %>% broom::tidy() # reorder the levels to change the comparisons df$pet <- contr_code_helmert(df$pet, levels = c("dog", "cat", "ferret")) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_helmert(df$pet, levels = c("ferret", "dog", "cat")) lm(y ~ pet, df) %>% broom::tidy()
df <- sim_design(between = list(pet = c("cat", "dog")), mu = c(10, 20), plot = FALSE) df$pet <- contr_code_helmert(df$pet) lm(y ~ pet, df) %>% broom::tidy() df <- sim_design(between = list(pet = c("cat", "dog", "ferret")), mu = c(2, 4, 9), empirical = TRUE, plot = FALSE) df$pet <- contr_code_helmert(df$pet) lm(y ~ pet, df) %>% broom::tidy() # reorder the levels to change the comparisons df$pet <- contr_code_helmert(df$pet, levels = c("dog", "cat", "ferret")) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_helmert(df$pet, levels = c("ferret", "dog", "cat")) lm(y ~ pet, df) %>% broom::tidy()
Polynomial coding sets the grand mean as the intercept. Each contrast tests a trend (linear, quadratic, cubic, etc.). This is only suitable for ordered factors.
contr_code_poly(fct, levels = NULL, colnames = NULL)
contr_code_poly(fct, levels = NULL, colnames = NULL)
fct |
the factor to contrast code (or a vector) |
levels |
the levels of the factor in order |
colnames |
optional list of column names for the added contrasts |
the factor with contrasts set
df <- sim_design(within = list(time = 1:6), mu = 1:6 + (1:6-3.5)^2, long = TRUE, plot = FALSE) df$time <- contr_code_poly(df$time) lm(y ~ time, df) %>% broom::tidy()
df <- sim_design(within = list(time = 1:6), mu = 1:6 + (1:6-3.5)^2, long = TRUE, plot = FALSE) df$time <- contr_code_poly(df$time) lm(y ~ time, df) %>% broom::tidy()
Sum coding sets the grand mean as the intercept. Each contrast compares one level with the grand mean.
contr_code_sum(fct, levels = NULL, omit = length(levels), colnames = NULL)
contr_code_sum(fct, levels = NULL, omit = length(levels), colnames = NULL)
fct |
the factor to contrast code (or a vector) |
levels |
the levels of the factor in order |
omit |
the level to omit (defaults to the last level) |
colnames |
optional list of column names for the added contrasts |
the factor with contrasts set
df <- sim_design(between = list(pet = c("cat", "dog", "bird", "ferret")), mu = c(2, 4, 9, 13), empirical = TRUE, plot = FALSE) df$pet <- contr_code_sum(df$pet) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_sum(df$pet, omit = "cat") lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_sum(df$pet, omit = 1) lm(y ~ pet, df) %>% broom::tidy()
df <- sim_design(between = list(pet = c("cat", "dog", "bird", "ferret")), mu = c(2, 4, 9, 13), empirical = TRUE, plot = FALSE) df$pet <- contr_code_sum(df$pet) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_sum(df$pet, omit = "cat") lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_sum(df$pet, omit = 1) lm(y ~ pet, df) %>% broom::tidy()
Treatment coding sets the mean of the reference level (base) as the intercept. Each contrast compares one level with the reference level.
contr_code_treatment(fct, levels = NULL, base = 1, colnames = NULL)
contr_code_treatment(fct, levels = NULL, base = 1, colnames = NULL)
fct |
the factor to contrast code (or a vector) |
levels |
the levels of the factor in order |
base |
the index of the level to use as baseline |
colnames |
optional list of column names for the added contrasts |
the factor with contrasts set
df <- sim_design(between = list(pet = c("cat", "dog")), mu = c(10, 20), plot = FALSE) df$pet <- contr_code_treatment(df$pet) lm(y ~ pet, df) %>% broom::tidy() df <- sim_design(between = list(pet = c("cat", "dog", "ferret")), mu = c(2, 4, 9), empirical = TRUE, plot = FALSE) df$pet <- contr_code_treatment(df$pet) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_treatment(df$pet, base = 2) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_treatment(df$pet, base = "ferret") lm(y ~ pet, df) %>% broom::tidy()
df <- sim_design(between = list(pet = c("cat", "dog")), mu = c(10, 20), plot = FALSE) df$pet <- contr_code_treatment(df$pet) lm(y ~ pet, df) %>% broom::tidy() df <- sim_design(between = list(pet = c("cat", "dog", "ferret")), mu = c(2, 4, 9), empirical = TRUE, plot = FALSE) df$pet <- contr_code_treatment(df$pet) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_treatment(df$pet, base = 2) lm(y ~ pet, df) %>% broom::tidy() df$pet <- contr_code_treatment(df$pet, base = "ferret") lm(y ~ pet, df) %>% broom::tidy()
Given a target r-value, returns the correlation you need to induce in a bivariate normal distribution to have the target correlation after converting distributions.
convert_r( target_r = 0, dist1 = "norm", dist2 = "norm", params1 = list(), params2 = list(), tol = 0.01 )
convert_r( target_r = 0, dist1 = "norm", dist2 = "norm", params1 = list(), params2 = list(), tol = 0.01 )
target_r |
The target correlation |
dist1 |
The target distribution function for variable 1 (e.g., norm, binom, gamma, truncnorm) |
dist2 |
The target distribution function for variable 2 |
params1 |
Arguments to pass to the functions for distribution 1 |
params2 |
Arguments to pass to the functions for distribution 2 |
tol |
Tolerance for optimise function |
See Distributions for distributions and their various arguments to specify in params1 and params2.
r-value to induce in the bivariate normal variables
convert_r(target_r = 0.5, dist1 = "norm", dist2 = "binom", params1 = list(mean = 100, sd = 10), params2 = list(size = 1, prob = 0.5)) convert_r(target_r = 0.5, dist1 = "norm", dist2 = "likert", params1 = list(mean = 100, sd = 10), params2 = list(prob = c(5, 10, 20, 30, 20)))
convert_r(target_r = 0.5, dist1 = "norm", dist2 = "binom", params1 = list(mean = 100, sd = 10), params2 = list(size = 1, prob = 0.5)) convert_r(target_r = 0.5, dist1 = "norm", dist2 = "likert", params1 = list(mean = 100, sd = 10), params2 = list(prob = c(5, 10, 20, 30, 20)))
cormat makes a correlation matrix from a single number, vars\*vars matrix, vars\*vars vector, or a vars\*(vars-1)/2 vector.
cormat(cors = 0, vars = 3)
cormat(cors = 0, vars = 3)
cors |
the correlations among the variables (can be a single number, vars\*vars matrix, vars\*vars vector, or a vars\*(vars-1)/2 vector) |
vars |
the number of variables in the matrix |
matrix
cormat(.5, 3) cormat(c( 1, .2, .3, .4, .2, 1, .5, .6, .3, .5, 1, .7, .4, .6, .7, 1), 4) cormat(c(.2, .3, .4, .5, .6, .7), 4)
cormat(.5, 3) cormat(c( 1, .2, .3, .4, .2, 1, .5, .6, .3, .5, 1, .7, .4, .6, .7, 1), 4) cormat(c(.2, .3, .4, .5, .6, .7), 4)
cormat_from_triangle makes a correlation matrix from a vector of the upper right triangle
cormat_from_triangle(cors)
cormat_from_triangle(cors)
cors |
the correlations among the variables as a vars\*(vars-1)/2 vector |
matrix
cormat_from_triangle(c(.2, .3, .4, .5, .6, .7))
cormat_from_triangle(c(.2, .3, .4, .5, .6, .7))
Get distribution functions
distfuncs(dist = "norm")
distfuncs(dist = "norm")
dist |
The target distribution function (e.g., norm, binom, gamma, truncnorm, likert). If the distribution isn't defined in the packages stats, truncnorm, or faux, use the format "package::dist". |
a list with the r and q functions
qfunc <- distfuncs("norm")$q # returns qnorm p <- seq(0.1, 0.9, .1) qfunc(p) == qnorm(p) rfunc <- distfuncs("norm")$r # returns rnorm rfunc(n = 10, mean = 100, sd = 10)
qfunc <- distfuncs("norm")$q # returns qnorm p <- seq(0.1, 0.9, .1) qfunc(p) == qnorm(p) rfunc <- distfuncs("norm")$r # returns rnorm rfunc(n = 10, mean = 100, sd = 10)
Likert density function
dlikert(x, prob, labels = names(prob))
dlikert(x, prob, labels = names(prob))
x |
the likert distributed vector |
prob |
a vector of probabilities or counts; if named, the output is a factor |
labels |
a vector of values; defaults to names(prob) or 1:length(prob); if numeric, the output is numeric |
a vector of the densities
x <- 1:5 prob <- c(.1, .2, .4, .2, .1) dlikert(x, prob) x <- c("A", "C", "B", "B") prob <- c(A = 10, B = 20, C = 30) dlikert(x, prob) # specify labels if prob not named and not 1:length(prob) labels <- -2:2 x <- sample(labels, 10, replace = TRUE) prob <- rep(1, length(labels)) # uniform probability dlikert(x, prob, labels)
x <- 1:5 prob <- c(.1, .2, .4, .2, .1) dlikert(x, prob) x <- c("A", "C", "B", "B") prob <- c(A = 10, B = 20, C = 30) dlikert(x, prob) # specify labels if prob not named and not 1:length(prob) labels <- -2:2 x <- sample(labels, 10, replace = TRUE) prob <- rep(1, length(labels)) # uniform probability dlikert(x, prob, labels)
A dataset containing attractiveness ratings (on a 1-7 scale from "much less attractive than average" to "much more attractive than average") for the neutral front faces from 2513 people (ages 17-90)
faceratings
faceratings
A data frame with 256326 rows and 9 variables:
rater's ID
rater's sex (female, male, intersex, NA)
rater's age (17-90 years)
rater's preferred sex for romantic relationships (either, men, neither, women, NA)
face's ID
face's sex (female, male)
face's age (in years)
face's ethnic group
attractiveness rating on a scale from 1 (much less attractive than average) to 7 (much more attractive than average)
https://figshare.com/articles/dataset/Face_Research_Lab_London_Set/5047666
The faux package provides functions for simulating datasets with specified structure.
Global faux options are used, for example, to set the default separator for cell names.
faux_options(...)
faux_options(...)
... |
One of four: (1) nothing, then returns all options as a list; (2) a name of an option element, then returns its value; (3) a name-value pair which sets the corresponding option to the new value (and returns nothing); (4) a list with option-value pairs which sets all the corresponding arguments. |
a list of options, values of an option, or nothing
faux_options() # see all options faux_options("sep") # see value of faux.sep ## Not run: # changes cell separator (e.g., A1.B2) faux_options(sep = ".") # changes cell separator back to default (e.g., A1_B2) faux_options(sep = "_") ## End(Not run)
faux_options() # see all options faux_options("sep") # see value of faux.sep ## Not run: # changes cell separator (e.g., A1.B2) faux_options(sep = ".") # changes cell separator back to default (e.g., A1_B2) faux_options(sep = "_") ## End(Not run)
Fréchet-Hoeffding bounds are the limits to a correlation between different distributions.
fh_bounds(dist1 = "norm", dist2 = "norm", params1 = list(), params2 = list())
fh_bounds(dist1 = "norm", dist2 = "norm", params1 = list(), params2 = list())
dist1 |
The target distribution function for variable 1 (e.g., norm, binom, gamma, truncnorm) |
dist2 |
The target distribution function for variable 2 |
params1 |
Arguments to pass to the rdist function for distribution 1 |
params2 |
Arguments to pass to the rdist function for distribution 2 |
a list of the min and max possible values
fh_bounds(dist1 = "pois", dist2 = "unif", params1 = list(lambda = 3), params2 = list(min = 0, max = 100))
fh_bounds(dist1 = "pois", dist2 = "unif", params1 = list(lambda = 3), params2 = list(min = 0, max = 100))
Fixes a factor list that does not have named levels or has special characters in the names
fix_name_labels(x, pattern = NA, replacement = ".")
fix_name_labels(x, pattern = NA, replacement = ".")
x |
the vector or list to fix |
pattern |
regex pattern to replace; defaults to non-word characters and the value of faux_options("sep") (default = _) |
replacement |
the character to use as the replacement; defaults to . (or _ if faux_options("sep") == ".") |
a named list with fixed names
source <- list("full.stop", " space ", "under_score", "plus+", "dash-", "tab\t", "line\nbreak") fix_name_labels(source)
source <- list("full.stop", " space ", "under_score", "plus+", "dash-", "tab\t", "line\nbreak") fix_name_labels(source)
The faceratings dataset cut down for demos to the first 4 raters of each sex and sexpref and the first 4 faces of each sex and ethnicity with non-NA ages
fr4
fr4
A data frame with 768 rows and 9 variables:
rater's ID
rater's sex (female, male)
rater's age (17.4-54.3 years)
rater's preferred sex for romantic relationships (either, men, women)
face's ID
face's sex (female, male)
face's age (19-47 years)
face's ethnic group (black, east_asian, west_asian, white)
attractiveness rating on a scale from 1 (much less attractive than average) to 7 (much more attractive than average)
https://figshare.com/articles/dataset/Face_Research_Lab_London_Set/5047666
Convert gamma to normal
gamma2norm(x, mu = 0, sd = 1, shape = NULL, rate = 1, scale = 1/rate)
gamma2norm(x, mu = 0, sd = 1, shape = NULL, rate = 1, scale = 1/rate)
x |
the gamma distributed vector |
mu |
the mean of the normal distribution to convert to |
sd |
the SD of the normal distribution to convert to |
shape |
gamma distribution parameter (must be positive) |
rate |
an alternative way to specify the scale |
scale |
gamma distribution parameter (must be positive) |
a vector with a normal distribution
x <- rgamma(10000, 2) y <- gamma2norm(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rgamma(10000, 2) y <- gamma2norm(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
You need model coefficients to simulate multilevel data, and can get them from data simulated from parameters using sim_design() or rmulti().
get_coefs(data, formula = NULL, fun = c("lm", "glm", "lmer", "glmer"), ...)
get_coefs(data, formula = NULL, fun = c("lm", "glm", "lmer", "glmer"), ...)
data |
A dataset in long format |
formula |
A formula (can be extracted from datasets created by sim_design) |
fun |
the model function (one of "lm", "glm", "lmer", or "glmer") |
... |
Further arguments to the model function |
a list of the model coefficients
# simulate some data data <- sim_design(within = 2, between = 2, mu = c(1, 0, 1, 1), long = TRUE, empirical = TRUE) # get coefs for the full factorial model get_coefs(data) # a reduced model get_coefs(data, y ~ B1 + W1) # specify a different model function data$y <- norm2binom(data$y) get_coefs(data, fun = "glm", family = binomial)
# simulate some data data <- sim_design(within = 2, between = 2, mu = c(1, 0, 1, 1), long = TRUE, empirical = TRUE) # get coefs for the full factorial model get_coefs(data) # a reduced model get_coefs(data, y ~ B1 + W1) # specify a different model function data$y <- norm2binom(data$y) get_coefs(data, fun = "glm", family = binomial)
Get a data frame of contrast values from a factor vector
get_contrast_vals(v)
get_contrast_vals(v)
v |
a factor vector |
a data frame
dat <- sim_design( between = list(group = c("A", "B")), n = 5, plot = FALSE) get_contrast_vals(dat$group)
dat <- sim_design( between = list(group = c("A", "B")), n = 5, plot = FALSE) get_contrast_vals(dat$group)
Get the design specification from a data table created in faux. This can be used to create more simulated data with the same design.
get_design(data)
get_design(data)
data |
The data table to check |
list with class design
data <- sim_design(2, 2, plot = FALSE) design <- get_design(data) data2 <- sim_design(design, plot = FALSE)
data <- sim_design(2, 2, plot = FALSE) design <- get_design(data) data2 <- sim_design(design, plot = FALSE)
Makes a best guess at the design of a long-format data frame.
get_design_long( data, dv = c(y = "score"), id = c(id = "id"), plot = faux_options("plot") )
get_design_long( data, dv = c(y = "score"), id = c(id = "id"), plot = faux_options("plot") )
data |
the data frame (in long format) |
dv |
the column name that identifies the DV |
id |
the column name(s) that identify a unit of analysis |
plot |
whether to show a plot of the design |
Finds all columns that contain a single value per unit of analysis (between factors), all columns that contain the same values per unit of analysis (within factors), and all columns that differ over units of analysis (dv, continuous factors)
a design list
Generates a table of the correlations and means of numeric columns in a data frame. If data was generated by sim_design
and has a "design" attribute, between, within, dv and id are retrieved from that, unless overridden (e.g., use between = 0 to ignore the between-subject factors stored in the design).
get_params( data, between = NULL, within = NULL, dv = NULL, id = NULL, digits = 2 ) check_sim_stats( data, between = NULL, within = NULL, dv = NULL, id = NULL, digits = 2 )
get_params( data, between = NULL, within = NULL, dv = NULL, id = NULL, digits = 2 ) check_sim_stats( data, between = NULL, within = NULL, dv = NULL, id = NULL, digits = 2 )
data |
the existing tbl |
between |
a vector of column names for between-subject factors |
within |
a vector of column names for within-subject factors (if data is long) |
dv |
the column name(s) of the dv, if NULL all numeric columns will be selected |
id |
the column name(s) of the subject ID, excluded from the table even if numeric |
digits |
how many digits to round to (default = 2) |
a tbl of correlations, means and sds
get_params(iris, "Species")
get_params(iris, "Species")
Get columns from a data table by specifying the index, column name as a string, or unquoted column name. Returns the column names or indices.
getcols(data, ..., as_index = FALSE)
getcols(data, ..., as_index = FALSE)
data |
the existing tbl |
... |
Columns to get |
as_index |
return the column indices (defaults to name) |
vector of column names or indices
getcols(mtcars, 1, cyl, "disp", 5:7)
getcols(mtcars, 1, cyl, "disp", 5:7)
Set design interactively
interactive_design(output = c("faux"), plot = faux_options("plot"))
interactive_design(output = c("faux"), plot = faux_options("plot"))
output |
what type of design to output (faux) |
plot |
whether to show a plot of the design |
list
if(interactive()){ des <- interactive_design() }
if(interactive()){ des <- interactive_design() }
is_pos_def
checks whether a correlation matrix is positive definite
is_pos_def(cor_mat, tol = 1e-08)
is_pos_def(cor_mat, tol = 1e-08)
cor_mat |
a correlation matrix |
tol |
the tolerance for comparing eigenvalues to 0 |
logical value
is_pos_def(matrix(c(1, .5, .5, 1), 2)) # returns TRUE is_pos_def(matrix(c(1, .9, .9, .9, 1, -.2, .9, -.2, 1), 3)) # returns FALSE
is_pos_def(matrix(c(1, .5, .5, 1), 2)) # returns TRUE is_pos_def(matrix(c(1, .9, .9, .9, 1, -.2, .9, -.2, 1), 3)) # returns FALSE
Convert a design list to JSON notation for archiving (e.g. in scienceverse)
json_design(design, filename = NULL, digits = 8, pretty = FALSE, ...)
json_design(design, filename = NULL, digits = 8, pretty = FALSE, ...)
design |
a design list including within, between, n, mu, sd, r, dv, id |
filename |
optional name of the file to save the JSON to |
digits |
number of digits to save |
pretty |
whether to print condensed or readable |
... |
other options to send to jsonlite::toJSON |
a JSON string
des <- check_design(2,2) json_design(des) json_design(des, pretty = TRUE)
des <- check_design(2,2) json_design(des) json_design(des, pretty = TRUE)
Convert data from long to wide format
long2wide( data, within = c(), between = c(), dv = "y", id = "id", sep = faux_options("sep") )
long2wide( data, within = c(), between = c(), dv = "y", id = "id", sep = faux_options("sep") )
data |
the tbl in long format |
within |
the names of the within column(s) |
between |
the names of between column(s) (optional) |
dv |
the name of the DV (value) column |
id |
the names of the column(s) for grouping observations |
sep |
separator for factor levels |
a tbl in wide format
df_long <- sim_design(2, 2, long = TRUE) long2wide(df_long, "A", "B")
df_long <- sim_design(2, 2, long = TRUE) long2wide(df_long, "A", "B")
Make IDs with fixed length and a prefix (e.g., S001, S002, ..., S100).
make_id(n = 100, prefix = "S", digits = 0, suffix = "")
make_id(n = 100, prefix = "S", digits = 0, suffix = "")
n |
the number of IDs to generate (or a vector of numbers) |
prefix |
the prefix to the number (default "S") |
digits |
the number of digits to use for the numeric part. Only used if this is larger than the largest number of digits in n. |
suffix |
the suffix to the number (default "") |
a vector of IDs
make_id(20, "SUBJECT_") make_id(10:30, digits = 3)
make_id(20, "SUBJECT_") make_id(10:30, digits = 3)
Insert NA or another replacement value for some proportion of specified columns to simulate missing data.
messy(data, prop = 0, ..., replace = NA)
messy(data, prop = 0, ..., replace = NA)
data |
the tbl |
prop |
the proportion of data to mess up |
... |
the columns to mess up (as a vector of column names or numbers) |
replace |
the replacement value (defaults to NA) |
the messed up table
messy(iris, 0.1, "Species", replace = "NO SPECIES") messy(iris, 0.5, 1:4)
messy(iris, 0.1, "Species", replace = "NO SPECIES") messy(iris, 0.5, 1:4)
Convert a negative binomial distribution to a normal (gaussian) distribution with specified mu and sd
nbinom2norm(x, mu = 0, sd = 1, size = NULL, prob = NULL)
nbinom2norm(x, mu = 0, sd = 1, size = NULL, prob = NULL)
x |
the negative binomially distributed vector |
mu |
the mean of the normal distribution to return |
sd |
the SD of the normal distribution to return |
size |
number of trials (set to max value of x if not specified) |
prob |
the probability of success on each trial (set to mean probability if not specified) |
a vector with a gaussian distribution
x <- rnbinom(10000, 20, 0.75) y <- nbinom2norm(x, 0, 1, 20, 0.75) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnbinom(10000, 20, 0.75) y <- nbinom2norm(x, 0, 1, 20, 0.75) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Output a nested list in RMarkdown list format
nested_list(x, pre = "", quote = "")
nested_list(x, pre = "", quote = "")
x |
The list |
pre |
Text to prefix to each line (e.g., if you want all lines indented 4 spaces to start, use " ") |
quote |
Text to quote values with (e.g., use "'" to make sure values are not parsed as markdown) |
A character string
x <- list( a = list(a1 = "Named", a2 = "List"), b = list("Unnamed", "List"), c = c(c1 = "Named", c2 = "Vector"), d = c("Unnamed", "Vector"), e = list(e1 = list("A", "B", "C"), e2 = list(a = "A", b = "B"), e3 = c("A", "B", "C"), e4 = 100), f = "single item vector", g = list() ) nested_list(x)
x <- list( a = list(a1 = "Named", a2 = "List"), b = list("Unnamed", "List"), c = c(c1 = "Named", c2 = "Vector"), d = c("Unnamed", "Vector"), e = list(e1 = list("A", "B", "C"), e2 = list(a = "A", b = "B"), e3 = c("A", "B", "C"), e4 = 100), f = "single item vector", g = list() ) nested_list(x)
Convert normal to beta
norm2beta(x, shape1, shape2, mu = mean(x), sd = stats::sd(x), ...)
norm2beta(x, shape1, shape2, mu = mean(x), sd = stats::sd(x), ...)
x |
the normally distributed vector |
shape1 , shape2
|
non-negative parameters of the distribution to return |
mu |
the mean of x (calculated from x if not given) |
sd |
the SD of x (calculated from x if not given) |
... |
further arguments to pass to qbeta (e.g., ncp) |
a vector with a beta distribution
x <- rnorm(10000) y <- norm2beta(x, 1, 3) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2beta(x, 1, 3) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert normal to binomial
norm2binom(x, size = 1, prob = 0.5, mu = mean(x), sd = stats::sd(x))
norm2binom(x, size = 1, prob = 0.5, mu = mean(x), sd = stats::sd(x))
x |
the normally distributed vector |
size |
number of trials (0 or more) |
prob |
the probability of success on each trial (0 to 1) |
mu |
the mean of x (calculated from x if not given) |
sd |
the SD of x (calculated from x if not given) |
a vector with a binomial distribution
x <- rnorm(10000) y <- norm2binom(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2binom(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert normal to gamma
norm2gamma(x, shape, rate = 1, scale = 1/rate, mu = mean(x), sd = stats::sd(x))
norm2gamma(x, shape, rate = 1, scale = 1/rate, mu = mean(x), sd = stats::sd(x))
x |
the normally distributed vector |
shape |
gamma distribution parameter (must be positive) |
rate |
an alternative way to specify the scale |
scale |
gamma distribution parameter (must be positive) |
mu |
the mean of x (calculated from x if not given) |
sd |
the SD of x (calculated from x if not given) |
a vector with a gamma distribution
x <- rnorm(10000) y <- norm2gamma(x, shape = 2) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2gamma(x, shape = 2) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert normal to likert
norm2likert(x, prob, labels = names(prob), mu = mean(x), sd = stats::sd(x))
norm2likert(x, prob, labels = names(prob), mu = mean(x), sd = stats::sd(x))
x |
the normally distributed vector |
prob |
a vector of probabilities or counts; if named, the output is a factor |
labels |
a vector of values, defaults to names(prob) or 1:length(prob), if numeric, the output is numeric |
mu |
the mean of x (calculated from x if not given) |
sd |
the SD of x (calculated from x if not given) |
a vector with the specified distribution
x <- rnorm(10000) y <- norm2likert(x, c(.1, .2, .35, .2, .1, .05)) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram") y <- norm2likert(x, c(40, 30, 20, 10)) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram") y <- norm2likert(x, c(lower = .5, upper = .5)) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2likert(x, c(.1, .2, .35, .2, .1, .05)) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram") y <- norm2likert(x, c(40, 30, 20, 10)) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram") y <- norm2likert(x, c(lower = .5, upper = .5)) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
See the help for 'qnbinom()' for further info about prob versus mu parameter specification. Thanks for the suggested code, David Hugh-Jones!
norm2nbinom( x, size, prob, mu, lower.tail = TRUE, log.p = FALSE, x_mu = mean(x), x_sd = stats::sd(x) )
norm2nbinom( x, size, prob, mu, lower.tail = TRUE, log.p = FALSE, x_mu = mean(x), x_sd = stats::sd(x) )
x |
the normally distributed vector |
size |
target for number of successful trials, or dispersion parameter (the shape parameter of the gamma mixing distribution). (size > 0) |
prob |
the probability of success on each trial (0 to 1) |
mu |
alternative parametrization via mean (only specify one of prob or mu) |
lower.tail |
logical; if TRUE (default), probabilities are $P[X \le x]$, otherwise $P[X > x]$ |
log.p |
logical; if TRUE, probabilities p are given as log(p) |
x_mu |
the mean of x (calculated from x if not given) |
x_sd |
the SD of x (calculated from x if not given) |
a vector with a negative binomial distribution
x <- rnorm(10000) y <- norm2nbinom(x, 1, prob = 0.5) z <- norm2nbinom(x, 1, mu = 1) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2nbinom(x, 1, prob = 0.5) z <- norm2nbinom(x, 1, mu = 1) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert a normal distribution to a normal (gaussian) distribution with specified mu and sd
norm2norm(x, mu = 0, sd = 1, x_mu = mean(x), x_sd = stats::sd(x))
norm2norm(x, mu = 0, sd = 1, x_mu = mean(x), x_sd = stats::sd(x))
x |
the normally distributed vector |
mu |
the mean of the normal distribution to return |
sd |
the SD of the normal distribution to return |
x_mu |
the mean of x (calculated from x if not given) |
x_sd |
the SD of x (calculated from x if not given) |
a vector with a gaussian distribution
x <- rnorm(10000) y <- norm2norm(x, 100, 10) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2norm(x, 100, 10) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert normal to poisson
norm2pois(x, lambda, mu = mean(x), sd = stats::sd(x))
norm2pois(x, lambda, mu = mean(x), sd = stats::sd(x))
x |
the normally distributed vector |
lambda |
the mean of the distribution to return |
mu |
the mean of x (calculated from x if not given) |
sd |
the SD of x (calculated from x if not given) |
a vector with a poisson distribution
x <- rnorm(10000) y <- norm2pois(x, 2) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2pois(x, 2) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert a normal (gaussian) distribution to a truncated normal distribution with specified minimum and maximum
norm2trunc( x, min = -Inf, max = Inf, mu = mean(x), sd = stats::sd(x), x_mu = mean(x), x_sd = stats::sd(x) )
norm2trunc( x, min = -Inf, max = Inf, mu = mean(x), sd = stats::sd(x), x_mu = mean(x), x_sd = stats::sd(x) )
x |
the normally distributed vector |
min |
the minimum of the truncated distribution to return |
max |
the maximum of the truncated distribution to return |
mu |
the mean of the distribution to return (calculated from x if not given) |
sd |
the SD of the distribution to return (calculated from x if not given) |
x_mu |
the mean of x (calculated from x if not given) |
x_sd |
the SD of x (calculated from x if not given) |
a vector with a truncated normal distribution
x <- rnorm(10000) y <- norm2trunc(x, 1, 7, 3.5, 2) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2trunc(x, 1, 7, 3.5, 2) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert a normal (gaussian) distribution to a uniform distribution with specified minimum and maximum
norm2unif(x, min = 0, max = 1, mu = mean(x), sd = stats::sd(x))
norm2unif(x, min = 0, max = 1, mu = mean(x), sd = stats::sd(x))
x |
the normally distributed vector |
min |
the minimum of the uniform distribution to return |
max |
the maximum of the uniform distribution to return |
mu |
the mean of x (calculated from x if not given) |
sd |
the SD of x (calculated from x if not given) |
a vector with a uniform distribution
x <- rnorm(10000) y <- norm2unif(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- rnorm(10000) y <- norm2unif(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Likert distribution function
plikert(q, prob, labels = names(prob))
plikert(q, prob, labels = names(prob))
q |
the vector of quantiles |
prob |
a vector of probabilities or counts; if named, the output is a factor |
labels |
a vector of values, defaults to names(prob) or 1:length(prob), if numeric, the output is numeric |
a vector of the cumulative probabilities
q <- 1:5 prob <- c(.1, .2, .4, .2, .1) plikert(q, prob) q <- c("A", "C", "B", "B") prob <- c(A = 10, B = 20, C = 30) plikert(q, prob) # specify labels if prob not named and not 1:length(prob) labels <- -2:2 q <- labels prob <- rep(1, length(labels)) # uniform probability plikert(q, prob, labels)
q <- 1:5 prob <- c(.1, .2, .4, .2, .1) plikert(q, prob) q <- c("A", "C", "B", "B") prob <- c(A = 10, B = 20, C = 30) plikert(q, prob) # specify labels if prob not named and not 1:length(prob) labels <- -2:2 q <- labels prob <- rep(1, length(labels)) # uniform probability plikert(q, prob, labels)
Plots the specified within and between design. See vignette("plots", package = "faux")
for examples and details.
plot_design(x, ..., geoms = NULL, palette = "Dark2", labeller = "label_value") ## S3 method for class 'design' plot(x, ...) ## S3 method for class 'faux' plot(x, ...)
plot_design(x, ..., geoms = NULL, palette = "Dark2", labeller = "label_value") ## S3 method for class 'design' plot(x, ...) ## S3 method for class 'faux' plot(x, ...)
x |
A list of design parameters created by check_design() or a data tbl (in long format) |
... |
A list of factor names to determine visualisation (see vignette) in the order color, x, facet row(s), facet col(s) |
geoms |
A list of ggplot2 geoms to display, defaults to "pointrangeSD" (mean ± 1SD) for designs and c("violin", "box") for data, options are: pointrangeSD, pointrangeSE, violin, box, jitter |
palette |
A brewer palette, defaults to "Dark2" (see ggplot2::scale_colour_brewer) |
labeller |
How to label the facets (see ggplot2::facet_grid). "label_value" is used by default. |
plot
plot(design)
: Plotting from a faux design list
plot(faux)
: Plotting from a faux data table
within <- list(time = c("day", "night")) between <- list(pet = c("dog", "cat")) des <- check_design(within, between, plot = FALSE) plot_design(des) data <- sim_design(within, between, plot = FALSE) plot_design(data)
within <- list(time = c("day", "night")) between <- list(pet = c("dog", "cat")) des <- check_design(within, between, plot = FALSE) plot_design(des) data <- sim_design(within, between, plot = FALSE) plot_design(data)
pos_def_limits
returns min and max possible values for a positive definite matrix with a specified missing value
pos_def_limits(..., steps = 0.01, tol = 1e-08)
pos_def_limits(..., steps = 0.01, tol = 1e-08)
... |
the correlations among the variables as a vars\*(vars-1)/2 vector |
steps |
the tolerance for min and max values |
tol |
the tolerance for comparing eigenvalues to 0 |
dataframe with min and max values
pos_def_limits(.8, .2, NA)
pos_def_limits(.8, .2, NA)
Likert quantile function
qlikert(p, prob, labels = names(prob))
qlikert(p, prob, labels = names(prob))
p |
the vector of probabilities |
prob |
a vector of probabilities or counts; if named, the output is a factor |
labels |
a vector of values, defaults to names(prob) or 1:length(prob), if numeric, the output is numeric |
a vector of the quantiles
p <- seq(0, 1, .1) prob <- c(.1, .2, .4, .2, .1) qlikert(p, prob) p <- seq(0, 1, .1) prob <- c(A = 10, B = 20, C = 30) qlikert(p, prob) # specify labels if prob not named and not 1:length(prob) labels <- -2:2 p <- seq(0, 1, .1) prob <- rep(1, length(labels)) # uniform probability qlikert(p, prob, labels)
p <- seq(0, 1, .1) prob <- c(.1, .2, .4, .2, .1) qlikert(p, prob) p <- seq(0, 1, .1) prob <- c(A = 10, B = 20, C = 30) qlikert(p, prob) # specify labels if prob not named and not 1:length(prob) labels <- -2:2 p <- seq(0, 1, .1) prob <- rep(1, length(labels)) # uniform probability qlikert(p, prob, labels)
Check readline input
readline_check( prompt, type = c("numeric", "integer", "length", "grep"), min = -Inf, max = Inf, warning = NULL, default = NULL, ... )
readline_check( prompt, type = c("numeric", "integer", "length", "grep"), min = -Inf, max = Inf, warning = NULL, default = NULL, ... )
prompt |
the prompt for readline |
type |
what type of check to perform, one of c("numeric", "integer", "length", "grep") |
min |
the minimum value |
max |
the maximum value |
warning |
an optional custom warning message |
default |
the default option to return if the entry is blank, NULL allows no default, the default value will be displayed after the text as [default] |
... |
other arguments to pass to grep |
the validated result of readline
if(interactive()){ readline_check("Type a number: ", "numeric") readline_check("Type two characters: ", "length", min = 2, max = 2) readline_check("Type at least 3 characters: ", "length", min = 3) readline_check("Type no more than 4 characters: ", "length", max = 44) readline_check("Type a letter and a number: ", "grep", pattern = "^[a-zA-Z]\\d$") }
if(interactive()){ readline_check("Type a number: ", "numeric") readline_check("Type two characters: ", "length", min = 2, max = 2) readline_check("Type at least 3 characters: ", "length", min = 3) readline_check("Type no more than 4 characters: ", "length", max = 44) readline_check("Type a letter and a number: ", "grep", pattern = "^[a-zA-Z]\\d$") }
Random Likert distribution
rlikert(n, prob, labels = names(prob))
rlikert(n, prob, labels = names(prob))
n |
the number of observations |
prob |
a vector of probabilities or counts; if named, the output is a factor |
labels |
a vector of values, defaults to names(prob) or 1:length(prob), if numeric, the output is numeric |
a vector sampled from a likert distribution with the specified parameters
# no names or labels returns integer vector of values 1:length(prob) prob <- c(.1, .2, .4, .2, .1) rlikert(10, prob) # named prob returns factor prob <- c(A = 10, B = 20, C = 30) rlikert(10, prob) # specify labels if prob not named and not 1:length(prob) labels <- -2:2 prob <- rep(1, length(labels)) # uniform probability rlikert(10, prob, labels)
# no names or labels returns integer vector of values 1:length(prob) prob <- c(.1, .2, .4, .2, .1) rlikert(10, prob) # named prob returns factor prob <- c(A = 10, B = 20, C = 30) rlikert(10, prob) # specify labels if prob not named and not 1:length(prob) labels <- -2:2 prob <- rep(1, length(labels)) # uniform probability rlikert(10, prob, labels)
Multiple correlated distributions
rmulti( n = 100, dist = c(A = "norm", B = "norm"), params = list(), r = 0, empirical = FALSE, as.matrix = FALSE )
rmulti( n = 100, dist = c(A = "norm", B = "norm"), params = list(), r = 0, empirical = FALSE, as.matrix = FALSE )
n |
the number of samples required |
dist |
A named vector of the distributions of each variable |
params |
A list of lists of the arguments to pass to each distribution function |
r |
the correlations among the variables (can be a single number, vars\*vars matrix, vars\*vars vector, or a vars\*(vars-1)/2 vector) |
empirical |
logical. If true, params specify the sample parameters, not the population parameters |
as.matrix |
logical. If true, returns a matrix |
a tbl of vars vectors
dist <- c(A = "norm", B = "pois", C = "binom") params <- list(A = list(mean = 100, sd = 10), B = list(lambda = 5), C = list(size = 10, prob = 0.5)) x <- rmulti(100, dist, params, c(0.2, 0.4, 0.6), empirical = TRUE) get_params(x)
dist <- c(A = "norm", B = "pois", C = "binom") params <- list(A = list(mean = 100, sd = 10), B = list(lambda = 5), C = list(size = 10, prob = 0.5)) x <- rmulti(100, dist, params, c(0.2, 0.4, 0.6), empirical = TRUE) get_params(x)
Make normally distributed vectors with specified relationships. See vignette("rnorm_multi", package = "faux")
for details.
rnorm_multi( n = 100, vars = NULL, mu = 0, sd = 1, r = 0, varnames = NULL, empirical = FALSE, as.matrix = FALSE, seed = NULL )
rnorm_multi( n = 100, vars = NULL, mu = 0, sd = 1, r = 0, varnames = NULL, empirical = FALSE, as.matrix = FALSE, seed = NULL )
n |
the number of samples required |
vars |
the number of variables to return |
mu |
a vector giving the means of the variables (numeric vector of length 1 or vars) |
sd |
the standard deviations of the variables (numeric vector of length 1 or vars) |
r |
the correlations among the variables (can be a single number, vars\*vars matrix, vars\*vars vector, or a vars\*(vars-1)/2 vector) |
varnames |
optional names for the variables (string vector of length vars); if not given, defaults to the column names of r when r is a matrix with column names |
empirical |
logical. If true, mu, sd and r specify the empirical not population mean, sd and covariance |
as.matrix |
logical. If true, returns a matrix |
seed |
DEPRECATED use set.seed() instead before running this function |
a tbl of vars vectors
# 4 10-item vectors each correlated r = .5 rnorm_multi(10, 4, r = 0.5) # set r with the upper right triangle b <- rnorm_multi(100, 3, c(0, .5, 1), 1, r = c(0.2, -0.5, 0.5), varnames=c("A", "B", "C")) cor(b) # set r with a correlation matrix and column names from mu names c <- rnorm_multi( n = 100, mu = c(A = 0, B = 0.5, C = 1), r = c( 1, 0.2, -0.5, 0.2, 1, 0.5, -0.5, 0.5, 1) ) cor(c)
# 4 10-item vectors each correlated r = .5 rnorm_multi(10, 4, r = 0.5) # set r with the upper right triangle b <- rnorm_multi(100, 3, c(0, .5, 1), 1, r = c(0.2, -0.5, 0.5), varnames=c("A", "B", "C")) cor(b) # set r with a correlation matrix and column names from mu names c <- rnorm_multi( n = 100, mu = c(A = 0, B = 0.5, C = 1), r = c( 1, 0.2, -0.5, 0.2, 1, 0.5, -0.5, 0.5, 1) ) cor(c)
rnorm_pre
Produces a random normally distributed vector with the specified correlation to one or more existing vectors
rnorm_pre(x, mu = 0, sd = 1, r = 0, empirical = FALSE, threshold = 1e-12)
rnorm_pre(x, mu = 0, sd = 1, r = 0, empirical = FALSE, threshold = 1e-12)
x |
the existing vector or data table of all vectors |
mu |
desired mean of returned vector |
sd |
desired SD of returned vector |
r |
desired correlation(s) between existing and returned vectors |
empirical |
logical. If true, mu, sd and r specify the empirical not population mean, sd and covariance |
threshold |
for checking correlation matrix |
vector
v1 <- rnorm(10) v2 <- rnorm_pre(v1, 0, 1, 0.5) cor(v1, v2) x <- rnorm_multi(50, 2, .5) x$y <- rnorm_pre(x, r = c(0.5, 0.25)) cor(x)
v1 <- rnorm(10) v2 <- rnorm_pre(v1, 0, 1, 0.5) cor(v1, v2) x <- rnorm_multi(50, 2, .5) x$y <- rnorm_pre(x, r = c(0.5, 0.25)) cor(x)
Sample Parameters from Population Parameters
sample_from_pop(n = 100, mu = 0, sd = 1, r = 0)
sample_from_pop(n = 100, mu = 0, sd = 1, r = 0)
n |
sample size |
mu |
population mean |
sd |
population SD |
r |
population r |
list of sample parameters (mu, sd, r)
sample_from_pop(10)
sample_from_pop(10)
Add a design specification to a data table
set_design(data, design)
set_design(data, design)
data |
The data table |
design |
The design list |
A data frame with a design attribute
design <- check_design() data <- data.frame(id = 1:100, y = rnorm(100)) %>% set_design(design)
design <- check_design() data <- data.frame(id = 1:100, y = rnorm(100)) %>% set_design(design)
Generates a data table with a specified within and between design. See vignette("sim_design", package = "faux")
for examples and details.
sim_design( within = list(), between = list(), n = 100, mu = 0, sd = 1, r = 0, empirical = FALSE, long = faux_options("long"), dv = list(y = "value"), id = list(id = "id"), vardesc = list(), plot = faux_options("plot"), interactive = FALSE, design = NULL, rep = 1, nested = TRUE, seed = NULL, sep = faux_options("sep") )
sim_design( within = list(), between = list(), n = 100, mu = 0, sd = 1, r = 0, empirical = FALSE, long = faux_options("long"), dv = list(y = "value"), id = list(id = "id"), vardesc = list(), plot = faux_options("plot"), interactive = FALSE, design = NULL, rep = 1, nested = TRUE, seed = NULL, sep = faux_options("sep") )
within |
a list of the within-subject factors |
between |
a list of the between-subject factors |
n |
the number of samples required |
mu |
the means of the variables |
sd |
the standard deviations of the variables |
r |
the correlations among the variables (can be a single number, full correlation matrix as a matrix or vector, or a vector of the upper right triangle of the correlation matrix) |
empirical |
logical. If true, mu, sd and r specify the empirical not population mean, sd and covariance |
long |
Whether the returned tbl is in wide or long format (defaults to value of 'faux_options("long")') |
dv |
the name of the dv for long plots (defaults to y) |
id |
the name of the id column (defaults to id) |
vardesc |
a list of variable descriptions having the names of the within- and between-subject factors |
plot |
whether to show a plot of the design |
interactive |
whether to run the function interactively |
design |
a design list including within, between, n, mu, sd, r, dv, id, and vardesc |
rep |
the number of data frames to return (default 1); if greater than 1, the returned data frame is nested by rep (if nested = TRUE) |
nested |
Whether to nest data frames by rep if rep > 1 |
seed |
DEPRECATED use set.seed() instead before running this function |
sep |
separator for factor levels |
a tbl
Produces a data table with the same distributions and correlations as an existing data table. Only returns numeric columns and simulates all numeric variables from a continuous normal distribution (for now).
sim_df( data, n = 100, within = c(), between = c(), id = "id", dv = "value", empirical = FALSE, long = faux_options("long"), seed = NULL, missing = FALSE, sep = faux_options("sep") )
sim_df( data, n = 100, within = c(), between = c(), id = "id", dv = "value", empirical = FALSE, long = faux_options("long"), seed = NULL, missing = FALSE, sep = faux_options("sep") )
data |
the existing tbl |
n |
the number of samples to return per group |
within |
a list of the within-subject factor columns (if long format) |
between |
a list of the between-subject factor columns |
id |
the names of the column(s) for grouping observations |
dv |
the name of the DV (value) column |
empirical |
Should the returned data have these exact parameters? (versus be sampled from a population with these parameters) |
long |
whether to return the data table in long format |
seed |
DEPRECATED use set.seed() instead before running this function |
missing |
simulate missing data? |
sep |
separator for factor levels |
See vignette("sim_df", package = "faux")
for details.
a tbl
iris100 <- sim_df(iris, 100) iris_species <- sim_df(iris, 100, between = "Species") # set the names of within factors and (the separator character) # if you want to return a long version longdf <- sim_df(iris, between = "Species", within = c("type", "dim"), sep = ".", long = TRUE) # or if you are simulating data from a table in long format widedf <- sim_df(longdf, between = "Species", within = c("type", "dim"), sep = ".")
iris100 <- sim_df(iris, 100) iris_species <- sim_df(iris, 100, between = "Species") # set the names of within factors and (the separator character) # if you want to return a long version longdf <- sim_df(iris, between = "Species", within = c("type", "dim"), sep = ".", long = TRUE) # or if you are simulating data from a table in long format widedf <- sim_df(longdf, between = "Species", within = c("type", "dim"), sep = ".")
This function is mainly used internally, such as for simulating missing data patterns, but is available in case anyone finds it useful.
sim_joint_dist(data, ..., n = 100, empirical = FALSE)
sim_joint_dist(data, ..., n = 100, empirical = FALSE)
data |
the existing tbl |
... |
columns to calculate the joint distribution from, if none are chosen, all columns with 10 or fewer unique values will be chosen |
n |
the number of total observations to return |
empirical |
Should the returned data have the exact same distribution of conditions? (versus be sampled from a population with this distribution) |
data table
sim_joint_dist(ggplot2::diamonds, cut, color, n = 10)
sim_joint_dist(ggplot2::diamonds, cut, color, n = 10)
Makes a basic cross-classified design with random intercepts for subjects and items. See vignette("sim_mixed", package = "faux")
for examples and details.
sim_mixed_cc( sub_n = 100, item_n = 20, grand_i = 0, sub_sd = 1, item_sd = 1, error_sd = 1, empirical = FALSE, seed = NULL )
sim_mixed_cc( sub_n = 100, item_n = 20, grand_i = 0, sub_sd = 1, item_sd = 1, error_sd = 1, empirical = FALSE, seed = NULL )
sub_n |
the number of subjects |
item_n |
the number of items |
grand_i |
the grand intercept (overall mean) |
sub_sd |
the SD of subject random intercepts (or a sub_n-length named vector of random intercepts for each subject) |
item_sd |
the SD of item random intercepts (or an item_n-length named vector of random intercepts for each item) |
error_sd |
the SD of the error term |
empirical |
Should the returned data have these exact parameters? (versus be sampled from a population with these parameters) |
seed |
DEPRECATED: use set.seed() before running this function instead |
a tbl
sim_mixed_cc(10, 10)
sim_mixed_cc(10, 10)
sim_mixed_df()
produces a data table with the same distributions of
by-subject and by-item random intercepts as an existing data table.
sim_mixed_df( data, sub_n = NULL, item_n = NULL, dv = "y", sub_id = "sub_id", item_id = "item_id" )
sim_mixed_df( data, sub_n = NULL, item_n = NULL, dv = "y", sub_id = "sub_id", item_id = "item_id" )
data |
the existing tbl |
sub_n |
the number of subjects to simulate (if NULL, returns data for the same subjects) |
item_n |
the number of items to simulate (if NULL, returns data for the same items) |
dv |
the column name or index containing the DV |
sub_id |
the column name or index for the subject IDs |
item_id |
the column name or index for the item IDs |
a tbl
sim_mixed_df(faceratings, 10, 10, "rating", "rater_id", "face_id")
sim_mixed_df(faceratings, 10, 10, "rating", "rater_id", "face_id")
Standardized Alpha to Average R
std_alpha2average_r(std_alpha, n)
std_alpha2average_r(std_alpha, n)
std_alpha |
The standardized alpha |
n |
The number of items |
The average inter-item correlation
std_alpha2average_r(.8, 10)
std_alpha2average_r(.8, 10)
Convert a truncated normal distribution to a normal (gaussian) distribution
trunc2norm(x, min = NULL, max = NULL, mu = mean(x), sd = stats::sd(x))
trunc2norm(x, min = NULL, max = NULL, mu = mean(x), sd = stats::sd(x))
x |
the truncated normally distributed vector |
min |
the minimum of the truncated distribution (calculated from x if not given) |
max |
the maximum of the truncated distribution (calculated from x if not given) |
mu |
the mean of the distribution to return (calculated from x if not given) |
sd |
the SD of the distribution to return (calculated from x if not given) |
a vector with a gaussian distribution
x <- truncnorm::rtruncnorm(10000, 1, 7, 3.5, 2) y <- trunc2norm(x, 1, 7) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- truncnorm::rtruncnorm(10000, 1, 7, 3.5, 2) y <- trunc2norm(x, 1, 7) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Convert a uniform distribution to a normal (gaussian) distribution with specified mu and sd
unif2norm(x, mu = 0, sd = 1, min = NULL, max = NULL)
unif2norm(x, mu = 0, sd = 1, min = NULL, max = NULL)
x |
the uniformly distributed vector |
mu |
the mean of the normal distribution to return |
sd |
the SD of the normal distribution to return |
min |
the minimum possible value of x (calculated from x if not given) |
max |
the maximum possible value of x (calculated from x if not given) |
a vector with a gaussian distribution
x <- runif(10000) y <- unif2norm(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
x <- runif(10000) y <- unif2norm(x) g <- ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x, y)) ggExtra::ggMarginal(g, type = "histogram")
Make unique pairs of level names for correlations
unique_pairs(v)
unique_pairs(v)
v |
a vector of level names or a number of levels |
a vector of all unique pairs
unique_pairs(c("O", "C", "E", "A", "N")) unique_pairs(3)
unique_pairs(c("O", "C", "E", "A", "N")) unique_pairs(3)
Convert data from wide to long format
wide2long( data, within_factors = c(), within_cols = c(), dv = "y", id = "id", sep = faux_options("sep") )
wide2long( data, within_factors = c(), within_cols = c(), dv = "y", id = "id", sep = faux_options("sep") )
data |
the tbl in wide format |
within_factors |
the names of the within factors |
within_cols |
the names (or indices) of the within-subject (value) columns |
dv |
the name of the dv column (defaults to "y") |
id |
the name of the ID column(s); if they don't exist, a new column will be made (defaults to "id") |
sep |
separator for within-columns (to be used in strsplit, so can be regex), defaults to "_" |
a tbl in long format
wide2long(iris, c("Feature", "Measure"), 1:4, sep = "\\.")
wide2long(iris, c("Feature", "Measure"), 1:4, sep = "\\.")