This analysis examines how vocabulary development differs by children’s reported gender, replicating the results of:
Eriksson, M., Marschik, P. B., Tulviste, T., Almgren, M., Pérez Pereira, M., Wehberg, S., … Gallego, C. (2012). Differences between girls and boys in emerging language skills: Evidence from 10 language communities. British Journal of Developmental Psychology, 30, 326–343.
Load required libraries.
library(wordbankr)
library(langcog)
library(dplyr)
library(ggplot2)
library(directlabels)
# Value handed to the wordbankr data accessors as their `mode` argument.
mode <- "local"

# Install theme_mikabr() as the default ggplot2 theme, and keep its text font
# family so the direct labels on the plot can use the same typeface.
theme_set(theme_mikabr())
font <- theme_mikabr()$text$family
Get administration data and filter to administrations of Words & Sentences that have sex/gender coded.
# Administrations of the "WS" form with a recorded sex, reduced to the
# columns used by the rest of the analysis.
vocab_admins <- get_administration_data(mode = mode) %>%
  filter(!is.na(sex), form == "WS") %>%
  select(data_id, language, form, age, sex, production)
Get item information to find the number of items on each language’s form.
# One row per language: `n` is the number of word-type items on its "WS" form.
num_words <- get_item_data(mode = mode) %>%
  filter(form == "WS", type == "word") %>%
  count(language)
Normalize productive vocabulary size as a proportion of items and calculate median vocabulary size for each language, sex/gender, and age.
# Express each administration's productive vocabulary as a proportion of the
# number of items for its language, then take the median of that proportion
# for every language / sex / age combination.
vocab_data <- vocab_admins %>%
  # Name the join key explicitly: an implicit natural join emits a message and
  # would silently change meaning if the two tables ever shared another column.
  left_join(num_words, by = "language") %>%
  mutate(production = as.numeric(production) / n) %>%
  group_by(language, sex, age) %>%
  summarise(median = median(production)) %>%
  # summarise() only peels off the last grouping variable; drop the remaining
  # groups so later operations on vocab_data are not applied per group.
  ungroup()
Plot vocabulary size over age by gender.
# Median productive vocabulary against age, one line per sex, one panel per
# language. Hebrew is left out of the panels.
# NOTE(review): the x-axis breaks and limits are computed from every language
# in vocab_data, including the excluded Hebrew rows.
vocab_data %>%
  filter(language != "Hebrew") %>%
  ggplot(aes(x = age, y = median, colour = sex, label = sex)) +
  geom_line(size = 1) +
  # Label each line directly at its last point, nudged right by 0.2, instead
  # of using a legend.
  geom_dl(method = list(dl.trans(x = x + 0.2), "last.qp", cex = 1,
                        fontfamily = font)) +
  facet_wrap(~language) +
  scale_colour_solarized() +
  scale_x_continuous(
    name = "\nAge (months)",
    breaks = seq(from = min(vocab_data$age), to = max(vocab_data$age), by = 2),
    # One extra unit on the upper limit leaves room for the direct labels.
    limits = range(vocab_data$age) + c(0, 1)
  ) +
  scale_y_continuous(
    name = "Median Productive Vocabulary (proportion of total words)\n",
    limits = c(0, 1)
  ) +
  theme(legend.position = "none")