Load required libraries.

library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(grid)
library(knitr)
library(wordbankr)
library(langcog)
theme_set(theme_mikabr())

Load item data from Wordbank.

items <- get_item_data() %>%
  filter(type == "word") %>%
  # item_id has the form "item_123"; extract the numeric part
  mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))))
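As a quick sanity check, the relevant fields of the item data can be inspected (purely illustrative; the column selection just mirrors the fields used below):

items %>%
  select(language, form, item_id, lexical_category) %>%
  head()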

Define a function that gets vocabulary composition data for a given instrument (a language and form combination).

get_vocab_comp <- function(input_language, input_form) {
  
  # restrict to items in the three lexical categories of interest
  lang_vocab_items <- items %>%
    filter(language == input_language, form == input_form,
           lexical_category %in% c("nouns", "predicates", "function_words"))
  
  # code each response as production and/or comprehension
  # (production implies comprehension)
  lang_vocab_data <- get_instrument_data(instrument_language = input_language,
                                         instrument_form = input_form,
                                         items = lang_vocab_items$item_id,
                                         iteminfo = lang_vocab_items) %>%
    mutate(value = ifelse(is.na(value), "", value),
           produces = value == "produces",
           understands = value == "produces" | value == "understands") %>%
    select(-value) %>%
    gather(measure, value, produces, understands)
  
  num_words <- nrow(lang_vocab_items)
  
  # for each child and measure, the number and proportion of words
  # known in each lexical category
  lang_vocab_summary <- lang_vocab_data %>%
    group_by(data_id, measure, lexical_category) %>%
    summarise(num_true = sum(value),
              prop = sum(value) / n())
  
  # for each child and measure, total vocabulary size
  # (count and proportion of all items)
  lang_vocab_sizes <- lang_vocab_summary %>%
    summarise(vocab_num = sum(num_true),
              vocab = sum(num_true) / num_words)
  
  lang_vocab_summary %>%
    left_join(lang_vocab_sizes, by = c("data_id", "measure")) %>%
    mutate(prop_vocab = num_true / vocab_num) %>%
    select(-num_true) %>%
    mutate(language = input_language, form = input_form)
  
}
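As a quick check, the function can be run for a single instrument before mapping over all of them (`eng_ws` is just an illustrative name):

eng_ws <- get_vocab_comp("English", "WS")
head(eng_ws)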

Get vocabulary composition data for all instruments.

instruments <- items %>%
  select(language, form) %>%
  distinct()

vocab_comp_data <- map2(instruments$language, instruments$form, get_vocab_comp) %>%
  bind_rows()
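The combined result has one row per child, measure, and lexical category; a quick structural check (illustrative):

glimpse(vocab_comp_data)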

Show the sample size (number of administrations) of each instrument.

sample_sizes <- vocab_comp_data %>%
  group_by(language, form, measure, lexical_category) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  # n is identical across measures and lexical categories,
  # so collapse to one row per instrument
  select(language, form, n) %>%
  distinct()
kable(sample_sizes)
| language              | form |     n |
|:----------------------|:-----|------:|
| British Sign Language | WG   |   161 |
| Cantonese             | WS   |   987 |
| Croatian              | WG   |   250 |
| Croatian              | WS   |   377 |
| Danish                | WS   |  3714 |
| English               | WG   |  2454 |
| English               | WS   |  5824 |
| German                | WS   |  1183 |
| Hebrew                | WG   |    62 |
| Hebrew                | WS   |   253 |
| Italian               | WG   |   648 |
| Italian               | WS   |   752 |
| Mandarin              | TC   |   652 |
| Mandarin              | WS   |  1056 |
| Norwegian             | WG   |  3025 |
| Norwegian             | WS   | 12969 |
| Russian               | WG   |   768 |
| Russian               | WS   |  1037 |
| Spanish               | WG   |   778 |
| Spanish               | WS   |  1094 |
| Swedish               | WG   |   474 |
| Swedish               | WS   |   900 |
| Turkish               | WG   |  1115 |
| Turkish               | WS   |  2422 |

Base plot for examining vocabulary composition as a function of vocabulary size.

base_plot <- function(input_form, input_measure) {
  vocab_comp_data %>%
    filter(form == input_form, measure == input_measure) %>%
    mutate(lexical_category = factor(lexical_category,
                                     levels = c("nouns", "predicates", "function_words"),
                                     labels = c("Nouns  ", "Predicates  ", "Function Words"))) %>%
    ggplot(aes(x = vocab, y = prop, colour = lexical_category)) +
    facet_wrap(~language) +
    # dashed diagonal marks where a category is acquired at the same
    # rate as the vocabulary overall
    geom_abline(slope = 1, intercept = 0, color = "gray", linetype = "dashed") + 
    scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "Proportion of Category\n") +
    scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2),
                       name = "\nVocabulary Size") +
    scale_colour_solarized(name = "") +
    theme(legend.position = "top",
          legend.key = element_blank(),
          legend.background = element_rect(fill = "transparent"))
}

Plot WS productive vocabulary composition as a function of vocabulary size for each language.

base_plot("WS", "produces") + geom_jitter(size = 0.7)