Load required libraries.
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(grid)
library(wordbankr)
library(langcog)
theme_set(theme_mikabr())
Load in Wordbank data.
# All word-type CDI items across instruments. item_id looks like "item_123":
# characters 1-5 are the "item_" prefix, so characters 6+ hold the numeric id.
items <- get_item_data() %>%
  filter(type == "word")
items$num_item_id <- as.numeric(substr(items$item_id, 6, nchar(items$item_id)))
Function for getting vocabulary composition data for a given instrument.
# Vocabulary-composition data for one (language, form) instrument.
# Returns one row per child x measure x lexical category with:
#   prop       - proportion of that category's words the child knows
#   vocab_num  - child's total vocabulary count across the three categories
#   vocab      - vocab_num as a proportion of all items kept from the form
#   prop_vocab - share of the child's vocabulary contributed by the category
get_vocab_comp <- function(input_language, input_form) {
# Items on this instrument, restricted to the three categories of interest.
lang_vocab_items <- filter(items, language == input_language, form == input_form) %>%
filter(lexical_category %in% c("nouns", "predicates", "function_words"))
# Child-by-item responses; blank out NA so the comparisons below yield FALSE.
lang_vocab_data <- get_instrument_data(instrument_language = input_language,
instrument_form = input_form,
items = lang_vocab_items$item_id,
iteminfo = lang_vocab_items) %>%
mutate(value = ifelse(is.na(value), "", value),
produces = value == "produces",
# "produces" counts toward comprehension as well (production implies it).
understands = value == "produces" | value == "understands") %>%
select(-value) %>%
# NOTE(review): gather() is superseded by pivot_longer() in current tidyr.
gather(measure, value, produces, understands)
num_words <- nrow(lang_vocab_items)
# Per child x measure x category: count of known words and category proportion.
lang_vocab_summary <- lang_vocab_data %>%
group_by(data_id, measure, lexical_category) %>%
summarise(num_true = sum(value),
prop = sum(value) / n())
# summarise() above peeled off lexical_category, so this second summarise()
# aggregates per child x measure: total vocabulary size (count and proportion).
lang_vocab_sizes <- lang_vocab_summary %>%
summarise(vocab_num = sum(num_true),
vocab = sum(num_true) / num_words)
lang_vocab_summary %>%
# Implicit join key is the shared columns (data_id, measure).
left_join(lang_vocab_sizes) %>%
mutate(prop_vocab = num_true / vocab_num) %>%
select(-num_true) %>%
mutate(language = input_language, form = input_form)
}
Get vocabulary composition data for all instruments.
# One row per unique (language, form) instrument present in the item data.
instruments <- distinct(items, language, form)
# Vocabulary composition for every instrument, stacked into one data frame.
vocab_comp_data <- bind_rows(
  map2(instruments$language, instruments$form, get_vocab_comp)
)
Show sample size of each instrument.
# Children per instrument: within an instrument, every measure x category cell
# has one row per child, so collapsing duplicates leaves a single n per
# (language, form).
sample_sizes <- vocab_comp_data %>%
  group_by(language, form, measure, lexical_category) %>%
  tally() %>%
  ungroup() %>%
  distinct(language, form, n)
# kable() comes from knitr — presumably attached by the Rmd setup; confirm.
kable(sample_sizes)
language | form | n |
---|---|---|
British Sign Language | WG | 161 |
Cantonese | WS | 987 |
Croatian | WG | 250 |
Croatian | WS | 377 |
Danish | WS | 3714 |
English | WG | 2454 |
English | WS | 5824 |
German | WS | 1183 |
Hebrew | WG | 62 |
Hebrew | WS | 253 |
Italian | WG | 648 |
Italian | WS | 752 |
Mandarin | TC | 652 |
Mandarin | WS | 1056 |
Norwegian | WG | 3025 |
Norwegian | WS | 12969 |
Russian | WG | 768 |
Russian | WS | 1037 |
Spanish | WG | 778 |
Spanish | WS | 1094 |
Swedish | WG | 474 |
Swedish | WS | 900 |
Turkish | WG | 1115 |
Turkish | WS | 2422 |
Base plot for looking at vocabulary composition.
# Scaffold for vocabulary-composition plots: proportion of each lexical
# category known vs. total vocabulary size, faceted by language, for one
# form and measure. Callers add geoms (points, smooths) on top.
base_plot <- function(input_form, input_measure) {
  # Data for the requested form/measure, with display labels for categories.
  plot_data <- vocab_comp_data %>%
    filter(form == input_form, measure == input_measure) %>%
    mutate(lexical_category = factor(
      lexical_category,
      levels = c("nouns", "predicates", "function_words"),
      labels = c("Nouns ", "Predicates ", "Function Words")
    ))
  ggplot(plot_data, aes(x = vocab, y = prop, colour = lexical_category)) +
    facet_wrap(~language) +
    # Diagonal = category tracked exactly in proportion to vocabulary size.
    geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "Proportion of Category\n") +
    scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "\nVocabulary Size") +
    scale_colour_solarized(name = "") +
    theme(legend.position = "top",
          legend.key = element_blank(),
          legend.background = element_rect(fill = "transparent"))
}
Plot WS productive vocabulary composition as a function of vocabulary size for each language.
# WS (Words & Sentences) production data, raw jittered scatter per language.
base_plot("WS", "produces") + geom_jitter(size = 0.7)
Plot WG productive vocabulary composition as a function of vocabulary size for each language.
# WG (Words & Gestures) production data, raw jittered scatter per language.
base_plot("WG", "produces") + geom_jitter(size = 0.7)
Plot WG receptive vocabulary composition as a function of vocabulary size for each language.
# WG comprehension data, raw jittered scatter per language.
base_plot("WG", "understands") + geom_jitter(size = 0.7)
Plot WS productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
# Overlay no-intercept cubic fits. method = "clm" refers to a constrained-lm
# fitter defined outside this file (presumably langcog) — confirm it is on the
# search path; geom_smooth dispatches on the function named by `method`.
base_plot("WS", "produces") +
geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)
Plot WG productive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
# WG production with no-intercept cubic fits ("clm" fitter defined elsewhere).
base_plot("WG", "produces") +
geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)
Plot WG receptive vocabulary composition as a function of vocabulary size for each language with cubic constrained lm curves.
# WG comprehension with no-intercept cubic fits ("clm" fitter defined elsewhere).
base_plot("WG", "understands") +
geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1)
Function for resampling data and computing area estimate for each sample.
# Bootstrap the "area between the fitted composition curve and the diagonal".
# For each of nboot resamples (children resampled with replacement within each
# instrument x measure), refit the no-intercept cubic in every cell and
# integrate it over [0, 1], subtracting the diagonal's area of 1/2.
# Returns long data: one row per cell per bootstrap sample.
sample_areas <- function(d, nboot = 1000) {
# Area statistic for one cell. clm() is defined outside this file (presumably
# langcog's constrained lm — confirm); model$solution holds the coefficients
# of x^3, x^2, x, whose integral on [0, 1] is a/4 + b/3 + c/2.
poly_area <- function(group_data) {
model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1,
data = group_data)
return((model$solution %*% c(1/4, 1/3, 1/2) - 0.5)[1])
}
counter <- 1
# One bootstrap replicate: resample children, compute the area per cell, and
# store it in a column named after the current counter value so successive
# replicates can be joined side by side below.
sample_area <- function(d) {
d_frame <- d %>%
group_by(language, form, measure) %>%
sample_frac(replace = TRUE) %>%
group_by(language, form, measure, lexical_category) %>%
do(area = poly_area(.)) %>%
mutate(area = area[1]) %>%
rename_(.dots = setNames("area", counter))
counter <<- counter + 1 # increment counter outside scope
return(d_frame)
}
# NOTE(review): rename_() is deprecated and the <<- counter is fragile; the
# pattern works here because replicate() calls sample_area() sequentially.
areas <- replicate(nboot, sample_area(d), simplify = FALSE)
# Join the nboot one-column replicates into a wide table keyed on the cell
# identifiers, then melt to long: `sample` is the replicate id ("1".."nboot").
Reduce(left_join, areas) %>%
gather(sample, area, -language, -form, -measure, -lexical_category)
}
Resample data and find the mean and CI of the area estimate.
# 1000 bootstrap replicates of the area statistic for every instrument cell.
areas <- sample_areas(vocab_comp_data, 1000)
# Bootstrap mean and CI per cell; ci_lower/ci_upper come from langcog.
area_summary <- areas %>%
group_by(language, form, measure, lexical_category) %>%
summarise(mean = mean(area),
ci_lower = ci_lower(area),
ci_upper = ci_upper(area)) %>%
ungroup() %>%
mutate(language = factor(language),
instrument = paste(language, form))
# Order languages by their WS/produces noun-area estimate (ties broken
# alphabetically) so that subsequent plots list languages in a meaningful order.
area_order <- filter(area_summary, form == "WS", measure == "produces",
lexical_category == "nouns")
language_levels <- area_order$language[order(area_order$mean,
area_order$language,
decreasing = FALSE)]
# Keep WS production and both WG measures; drop WS comprehension (WS does not
# measure comprehension meaningfully here).
area_summary_ordered <- area_summary %>%
filter(form %in% c("WS", "WG"),
!(form == "WS" & measure == "understands")) %>%
ungroup() %>%
mutate(language = factor(language, levels = language_levels),
lexical_category = factor(lexical_category,
levels = c("nouns", "predicates", "function_words"),
labels = c("Nouns", "Predicates", "Function Words")))
Plot each lexical category’s area estimate by language, form, and measure.
# Dot-and-CI plot of area estimates: one point per language, faceted by
# lexical category (rows) and instrument form x measure (columns).
ggplot(area_summary_ordered,
aes(y = language, x = mean, colour = lexical_category)) +
facet_grid(lexical_category ~ form + measure) +
geom_point() +
# Horizontal segments span the bootstrap CI for each estimate.
geom_segment(aes(x = ci_lower, xend = ci_upper,
y = language, yend = language)) +
# Zero = category represented exactly in proportion to vocabulary size.
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
scale_colour_solarized(name = "", guide = FALSE) +
scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
xlab("\nRelative representation in early vocabulary")
Data and models.
# WS sample sizes with languages in reverse area order (largest noun area first).
sample_sizes_ordered <- sample_sizes %>%
filter(form == "WS") %>%
mutate(language = factor(language, levels = rev(language_levels)))
# "Language (N = n)" labels in the same order as the language factor levels,
# used below to keep facet order consistent with language ordering.
language_size_order <- unlist(map(
levels(sample_sizes_ordered$language),
function(language) sprintf("%s (N = %s)", language,
sample_sizes_ordered$n[which(sample_sizes_ordered$language == language)])
))
# WS production data with display labels and the per-language N attached.
vocab_comp_data_ordered <- vocab_comp_data %>%
filter(form == "WS", measure == "produces") %>%
mutate(language = factor(language, levels = rev(language_levels)),
lexical_category = factor(lexical_category,
levels = c("nouns", "predicates", "function_words"),
labels = c("Nouns ", "Predicates ", "Function Words"))) %>%
# Implicit join on the shared columns (language, form).
left_join(sample_sizes_ordered) %>%
mutate(language_with_size = sprintf("%s (N = %s)", language, n),
language_with_size = factor(language_with_size, levels = language_size_order))
# Data + model overlay: jittered per-child points with the fitted no-intercept
# cubic per category, faceted by language (with N in the strip label).
ggplot(vocab_comp_data_ordered, aes(x = vocab, y = prop, colour = lexical_category)) +
facet_wrap(~language_with_size, ncol = 5) +
geom_jitter(size = 0.7, alpha = 0.5) +
# "clm" is the constrained-lm fitter defined elsewhere (see earlier plots).
geom_smooth(method = "clm", formula = y ~ I(x ^ 3) + I(x ^ 2) + x - 1, size = 1, se = FALSE) +
geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
name = "Proportion of Category\n") +
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
name = "\nVocabulary Size") +
scale_colour_solarized(name = "") +
theme_mikabr(base_size = 20) +
theme(legend.position = "top",
legend.key = element_blank(),
legend.background = element_rect(fill = "transparent"))
#ggsave("BUCLD/data_models.png", width = 15, height = 10)
Demo of area estimation.
# Two contrasting languages for the area-estimation demo figure.
demo_langs <- c("English", "Mandarin")
demo_data <- filter(vocab_comp_data, form == "WS", language %in% demo_langs) %>%
mutate(panel = paste(language, "(data)"),
lexical_category = factor(lexical_category,
levels = c("nouns", "predicates", "function_words"),
labels = c("Nouns", "Predicates", "Function Words")))
# Evaluation grid for plotting the fitted curves.
pts <- seq(0, 1, 0.01)
# One fitted no-intercept cubic per language x category, stored as a list
# column (do() + clm, the constrained fitter defined elsewhere).
models <- demo_data %>%
group_by(language, lexical_category) %>%
do(model = clm(prop ~ I(vocab ^ 3) + I(vocab ^ 2) + vocab - 1, data = .))
# Fitted-curve predictions for one language/category pair, evaluated on the
# global pts grid. Reads the corresponding model out of the global `models`.
get_lang_lexcat_predictions <- function(lang, lexcat) {
  model_row <- filter(models, language == lang, lexical_category == lexcat)
  fit <- model_row$model[[1]]
  data.frame(vocab = pts,
             prop = predict(fit, newdata = data.frame(vocab = pts)),
             lexical_category = lexcat,
             language = lang)
}
# Predictions for every lexical category in one language, stacked row-wise.
# Uses lapply() instead of the original sapply(..., simplify = FALSE): the
# two are equivalent here, but lapply is the type-stable idiom.
get_lang_predictions <- function(lang) {
  bind_rows(lapply(unique(demo_data$lexical_category),
                   function(lexcat) get_lang_lexcat_predictions(lang, lexcat)))
}
# Fitted curves for both demo languages in one data frame (lapply replaces the
# equivalent sapply(..., simplify = FALSE)).
predictions <- bind_rows(lapply(demo_langs, get_lang_predictions))
# Polygon outlines for shading the area between each fitted curve and the
# diagonal: the curve runs with vocab ascending (predictions), then the
# diagonal is traced back with vocab descending to close the shape.
# The original wrapped rev(pts) in rep() with no `times` argument, which is a
# no-op (rep(x) returns x); the redundant call is dropped.
diagonal <- expand.grid(vocab = rev(pts),
                        language = demo_langs,
                        lexical_category = unique(demo_data$lexical_category))
diagonal$prop <- diagonal$vocab
area_poly <- bind_rows(predictions, diagonal) %>%
  mutate(panel = paste(language, "(models)"))
# Area-estimation demo (English only): fitted curve per category with the
# curve-to-diagonal region shaded via the closed polygons built above.
ggplot(filter(predictions, language == "English"), aes(x = vocab, y = prop)) +
facet_grid(. ~ lexical_category) +
geom_line(aes(colour = lexical_category), size = 1) +
geom_polygon(data = filter(area_poly, language == "English"),
aes(fill = lexical_category), alpha = 0.2) +
geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") +
# Axes stripped down to the unit square; no breaks or titles.
scale_y_continuous(limits = c(0, 1), breaks = c(),
name = "") +
scale_x_continuous(limits = c(0, 1), breaks = c(),
name = "") +
scale_colour_solarized(guide = FALSE) +
scale_fill_solarized(guide = FALSE) +
theme(legend.position = c(0.061, 0.91),
legend.text = element_text(size = 8),
legend.key.height = unit(0.9, "char"),
legend.key.width = unit(0.88, "char"),
legend.background = element_rect(fill = "transparent"),
strip.background = element_blank(),
strip.text.x = element_blank())
#ggsave("BUCLD/area_demo.png", width = 9, height = 3.5)
WS area estimates.
# WS-only version of the area-estimate plot: point + CI segment per language,
# one facet row per lexical category, languages in the order set earlier.
ggplot(filter(area_summary_ordered, form == "WS"),
aes(y = language, x = mean, col = lexical_category)) +
facet_grid(lexical_category ~ .) +
geom_point() +
geom_segment(aes(x = ci_lower, xend = ci_upper,
y = language, yend = language)) +
# Zero = proportional representation; left/right = under-/over-representation.
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
scale_colour_solarized(name = "", guide = FALSE) +
scale_y_discrete(name = "", limits = levels(area_summary_ordered$language)) +
xlab("\nRelative representation in early vocabulary") +
theme_mikabr(base_size = 20)
#ggsave("BUCLD/diffs.png", width = 8, height = 11)