Get items, words, and uni_lemmas.

items <- get_item_data(mode = "local") %>%
  mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))),
         definition = tolower(definition))

words <- items %>%
  filter(type == "word", !is.na(uni_lemma), form == "WG")

uni_lemmas <- words %>%
  filter(language == "English") %>%
  select(lexical_class, uni_lemma)

Get AoA estimates.

aoa_data <- read_csv("aoa_data.csv")

Get frequency estimates.

instruments <- aoa_data %>%
  select(language, form) %>%
  distinct()

inst_freqs <- function(language, form) {
  freq_file <- sprintf("predictors/frequency/freqs/freqs_%s.csv", tolower(language))
  if (file.exists(freq_file)) {
    read_csv(freq_file) %>%
      rename(definition = item) %>%
      mutate(language = language, form = form,
             defintion = tolower(definition))
  }
}

freqs <- map2(instruments$language, instruments$form, inst_freqs) %>%
  bind_rows() %>%
  right_join(words)
#  right_join(freq_words)

uni_freqs <- freqs %>%
  group_by(language, form, lexical_class, uni_lemma) %>%
  summarise(frequency = sum(frequency))

Get estimates of valence, arousal, and dominance

valence <- read_csv("predictors/valence/valence.csv") %>%
  select(Word, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum) %>%
  rename(word = Word, valence = V.Mean.Sum, arousal = A.Mean.Sum,
         dominance = D.Mean.Sum)

#missing_valence <- setdiff(uni_lemmas$uni_lemma, valence$word)
# write_csv(data.frame(uni_lemma = missing_babiness),
#           "predictors/valence/valence_replace.csv")

replacements_valence <- read_csv("predictors/valence/valence_replace.csv")
uni_valences <- uni_lemmas %>%
  left_join(replacements_valence) %>%
  rowwise() %>%
  mutate(word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
  select(-replacement) %>%
  left_join(valence)

#uni_valences <- read_csv("predictors/valence/uni_lemma_valence.csv")

Get estimates of iconicity and babiness.

babiness <- read_csv("predictors/babiness_iconicity/english_iconicity.csv") %>%
  group_by(word) %>%
  summarise(iconicity = mean(rating),
            babiness = mean(babyAVG))

#missing_babiness <- setdiff(uni_lemmas$uni_lemma, babiness$word)
# write_csv(data.frame(uni_lemma = missing_babiness),
#           "predictors/babiness_iconicity/babiness_iconicity_replace.csv")

replacements_babiness <- read_csv("predictors/babiness_iconicity/babiness_iconicity_replace.csv")
uni_babiness <- uni_lemmas %>%
  left_join(replacements_babiness) %>%
  rowwise() %>%
  mutate(word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
  select(-replacement) %>%
  left_join(babiness)

#uni_babiness <- read_csv("predictors/babiness_iconicity/babiness_iconicity.csv") 

Get estimates of concreteness.

concreteness <- read_csv("predictors/concreteness/concreteness.csv")

#missing_concreteness <- setdiff(uni_lemmas$uni_lemma, concreteness$Word)
# write_csv(data.frame(uni_lemma = missing_concreteness),
#           "predictors/concreteness/concreteness_replace.csv")

replacements_concreteness <- read_csv("predictors/concreteness/concreteness_replace.csv")
uni_concreteness <- uni_lemmas %>%
  left_join(replacements_concreteness) %>%
  rowwise() %>%
  mutate(Word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
  select(-replacement) %>%
  left_join(concreteness) %>%
  rename(concreteness = Conc.M) %>%
  select(uni_lemma, concreteness)

Get (English) phoneme and syllable counts.

phonemes <- read_csv("predictors/phonemes/english_phonemes.csv") %>%
  mutate(num_syllables = unlist(map(strsplit(syllables, " "), length)),
         num_phonemes = nchar(gsub("[', ]", "", syllables))) %>%
  select(-phones, -syllables)

Put together data and predictors.

uni_joined <- aoa_data %>%
  left_join(uni_freqs) %>%
  left_join(uni_valences) %>%
  left_join(uni_babiness) %>%
  left_join(uni_concreteness) %>%
  left_join(phonemes) %>%
  mutate(frequency = log(frequency))

Function to get number of characters from item definitions.

num_characters <- function(words) {
  strsplit(words, ", ") %>%
    map(function(word_set) {
      word_set %>%
        unlist() %>%
        strsplit(" [(].*[)]") %>%
        unlist() %>%
        strsplit("/") %>%
        unlist() %>%
        nchar() %>%
        mean()
    }) %>%
    unlist()
}

Analysis 1: English

english_predictors <- c("frequency",  "concreteness", "num_characters",
                        "valence", "arousal", "babiness")
# "iconicity" "dominance" "num_syllables" "num_phonemes"

woof_concreteness <- unique(uni_joined$concreteness[uni_joined$uni_lemma == "woof woof"])
mean_babiness <- mean(filter(uni_joined, language == "English")$babiness, na.rm = TRUE)
mean_iconicity <- mean(filter(uni_joined, language == "English")$iconicity, na.rm = TRUE)

english_data <- uni_joined %>%
  filter(language == "English", measure == "understands") %>%
  rowwise() %>%
  mutate(num_characters = num_characters(words),
         valence = if (is.na(valence)) 5 else valence,
         arousal = if (is.na(arousal)) 1 else arousal,
         concreteness = if (is.na(concreteness)) woof_concreteness else concreteness,
         babiness = if (is.na(babiness)) mean_babiness else babiness,
         iconicity = if (is.na(iconicity)) mean_iconicity else iconicity) %>%
  ungroup() %>%
  #  mutate(valence = abs(valence - 5)) %>%
  group_by(language, measure) %>%
  mutate_each_(funs(as.numeric(scale(.))), english_predictors)

english_na_lemmas <- english_data %>%
  select_(.dots = c("uni_lemma", english_predictors)) %>%
  gather(predictor, value, -uni_lemma) %>%
  filter(is.na(value)) %>%
  group_by(uni_lemma) %>%
  summarise(num_na = n(), na_preds = paste(predictor, collapse = ", "))
english_model_data <- english_data %>%
  ungroup() %>%
  select_(.dots = c("language", "lexical_class", "uni_lemma", "aoa", english_predictors)) %>%
  filter(complete.cases(.)) %>%
  mutate(lexical_class = factor(lexical_class,
                                levels = c("nouns", "adjectives", "verbs",
                                           "function_words", "other"),
                                labels = c("Nouns", "Adjectives", "Verbs",
                                           "Function Words", "Other")))

english_formula <- as.formula(
  sprintf("aoa ~ %s", paste(english_predictors, collapse = " + "))
)

english_model <- lm(english_formula, data = english_model_data)
english_predictions <- english_model_data
english_predictions$predicted_aoa <- english_model$fitted.values

#cor(english_predictions$aoa, english_predictions$predicted_aoa)

english_coef <- tidy(english_model) %>%
  filter(term != "(Intercept)")

english_predictors_ordered <- english_predictors %>%
  factor(levels = english_coef$term[order(abs(english_coef$estimate))])

english_predictor_levels <- english_coef$term[order(abs(english_coef$estimate),
                                                    decreasing = TRUE)]
english_coef <- english_coef %>%
  mutate(term = factor(term, levels = english_predictor_levels))
english_plot_data <- english_data %>%
  gather_("predictor", "value", english_predictors) %>%
  mutate(predictor = factor(predictor, levels = english_predictor_levels),
         lexical_class = factor(lexical_class,
                                levels = c("nouns", "adjectives", "verbs",
                                           "function_words", "other"),
                                labels = c("Nouns", "Adjectives", "Verbs",
                                           "Function Words", "Other")))

ggplot(english_plot_data, aes(x = value, y = aoa, colour = lexical_class)) +
  facet_grid(lexical_class ~ predictor) +
  geom_point(cex = 0.7) +
  geom_smooth(method = "lm", se = FALSE, cex = 1) +
  scale_colour_solarized(name = "") +
  xlab("\nPredictor Z-Score") +
  ylab("Age of Acquisition (months)\n") +
  theme(legend.position = "bottom",
        legend.direction = "horizontal")

ggplot(english_predictions, aes(x = predicted_aoa, y = aoa)) +
  geom_text(aes(label = uni_lemma, colour = lexical_class), cex = 4, show_guide = FALSE) +
  geom_smooth(aes(colour = lexical_class), weight = 1, method = "lm", se = FALSE) +
  geom_smooth(colour = "black", weight = 2, method = "lm") +
  scale_colour_solarized(name = "") +
  scale_x_continuous(name = "\nModel Predicted Age of Acquisition (months)",
                     limits = c(6, 27), breaks = seq(6, 26, by = 2)) +
  scale_y_continuous(name = "Age of Acquisition (months)\n",
                     limits = c(6, 27), breaks = seq(6, 26, by = 2)) +
  coord_fixed() +
  theme(panel.grid.minor = element_blank(),
        legend.position = c(0.12, 0.9),
        legend.background = element_rect(fill = "transparent"))

ggplot(english_coef, aes(x = term, y = abs(estimate), fill = term)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = abs(estimate) - 1.96 * std.error,
                     ymax = abs(estimate) + 1.96 * std.error)) +
  scale_fill_solarized(guide = FALSE) +
  xlab("") +
  ylab("Coefficient Magnitude\n") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

english_predictor_means <- english_data %>%
  filter(aoa >= 6, aoa <= 25) %>%
  mutate(aoa = cut(floor(aoa), 4)) %>% #breaks = c(4, 8, 12, 16, 20, 24))) %>%
  gather_("predictor", "value", english_predictors) %>%
  group_by(predictor, aoa) %>%
  filter(!is.na(value)) %>%
  multi_boot_standard(column = "value")
ggplot(english_predictor_means, aes(x = aoa, y = mean, color = predictor)) +
  facet_grid(. ~ predictor) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_line(aes(group = predictor)) +
  scale_colour_solarized(guide = FALSE) +
  xlab("\nAge of Acquisition (months)") +
  ylab("Mean Predictor Z-Score\n") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))


Analysis 2: Cross-Linguistic

crossling_predictors <- c("frequency", "num_characters", "concreteness",
                          "valence", "arousal", "babiness")

woof_concreteness <- unique(uni_joined$concreteness[uni_joined$uni_lemma == "woof woof"])
mean_babiness <- mean(filter(uni_joined, language == "English")$babiness, na.rm = TRUE)
mean_iconicity <- mean(filter(uni_joined, language == "English")$iconicity, na.rm = TRUE)

crossling_data <- uni_joined %>%
  filter(measure == "understands") %>%
  rowwise() %>%
  mutate(num_characters = num_characters(words),
         valence = if (is.na(valence)) 5 else valence,
         arousal = if (is.na(arousal)) 1 else arousal,
         concreteness = if (is.na(concreteness)) woof_concreteness else concreteness,
         babiness = if (is.na(babiness)) mean_babiness else babiness,
         iconicity = if (is.na(iconicity)) mean_iconicity else iconicity) %>%
  group_by(language, measure) %>%
  mutate_each_(funs(as.numeric(scale(.))), crossling_predictors)

crossling_na_lemmas <- crossling_data %>%
  select_(.dots = c("language", "uni_lemma", crossling_predictors)) %>%
  gather_("predictor", "value", crossling_predictors) %>%
  filter(is.na(value)) %>%
  group_by(language, uni_lemma) %>%
  summarise(num_na = n(), na_preds = paste(predictor, collapse = ", "))
crossling_model_data <- crossling_data %>%
  ungroup() %>%
  select_("language", "lexical_class", "uni_lemma", "aoa", crossling_predictors) %>%
  filter(complete.cases(.)) %>%
  mutate(lexical_class = factor(lexical_class,
                                levels = c("nouns", "adjectives", "verbs",
                                           "function_words", "other"),
                                labels = c("Nouns", "Adjectives", "Verbs",
                                           "Function Words", "Other")))

crossling_model <- lmer(aoa ~ frequency + num_characters + concreteness + valence + 
                          arousal + babiness + (1 + frequency + num_characters + 
                                                  concreteness + valence + arousal + 
                                                  babiness | language),
                        data = crossling_data)

crossling_coef <- data.frame(term = row.names(summary(crossling_model)$coefficients),
                             estimate = summary(crossling_model)$coefficients[,"Estimate"],
                             std.error = summary(crossling_model)$coefficients[,"Std. Error"],
                             row.names = NULL) %>%
  filter(term != "(Intercept)")

crossling_predictors_ordered <- crossling_predictors %>%
  factor(levels = crossling_coef$term[order(abs(crossling_coef$estimate))])

crossling_predictor_levels <- crossling_coef$term[order(abs(crossling_coef$estimate),
                                                        decreasing = TRUE)]
crossling_coef <- crossling_coef %>%
  mutate(term = factor(term, levels = crossling_predictor_levels))
ggplot(crossling_coef, aes(x = term, y = abs(estimate), fill = term)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = abs(estimate) - 1.96 * std.error,
                     ymax = abs(estimate) + 1.96 * std.error)) +
  scale_fill_solarized(guide = FALSE) +
  xlab("") +
  ylab("Coefficient Magnitude\n") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

crossling_predictor_means_by_lang <- crossling_data %>%
  filter(aoa <= 25) %>%
  mutate(aoa = cut(floor(aoa), breaks = c(4, 8, 12, 16, 20, 24))) %>%
  gather_("predictor", "value", crossling_predictors) %>%
  group_by(language, predictor, aoa) %>%
  filter(!is.na(value)) %>%
  multi_boot_standard(column = "value")
crossling_predictor_means_by_lang %>%
  ungroup() %>%
  mutate(predictor = factor(predictor, levels = c(english_predictor_levels,
                                                  "num_characters"))) %>%
  ggplot(aes(x = aoa, y = mean, color = language)) +
  facet_grid(. ~ predictor) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_line(aes(group = language)) +
  scale_colour_solarized(name = "") +
  xlab("\nAge of Acquisition (months)") +
  ylab("Mean Predictor Z-Score\n") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1),
        legend.position = "bottom",
        legend.direction = "horizontal")

crossling_predictor_means <- crossling_data %>%
  filter(aoa < 25) %>%
  mutate(aoa = cut(floor(aoa), breaks = c(4, 8, 12, 16, 20, 24))) %>%
  gather_("predictor", "value", crossling_predictors) %>%
  group_by(predictor, aoa) %>%
  filter(!is.na(value)) %>%
  multi_boot_standard(column = "value")
crossling_predictor_means %>%
  ungroup() %>%
  mutate(predictor = factor(predictor, levels = c(english_predictor_levels,
                                                  "num_characters"))) %>%
  ggplot(aes(x = aoa, y = mean, color = predictor)) +
  facet_grid(. ~ predictor) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_line(aes(group = predictor)) +
  scale_colour_solarized(guide = FALSE) +
  xlab("\nAge of Acquisition (months)") +
  ylab("Mean Predictor Z-Score\n") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))


Summary Plots

joined_coef <- bind_rows(mutate(english_coef, type = "English"),
                         mutate(crossling_coef, type = "Cross-Linguistic")) %>%
  mutate(type = factor(type, levels = c("English", "Cross-Linguistic")),
         term = factor(term, levels = english_predictor_levels))
ggplot(joined_coef, aes(x = term, y = abs(estimate), fill = term)) +
  facet_wrap("type", scales = "free_x") +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = abs(estimate) - 1.96 * std.error,
                     ymax = abs(estimate) + 1.96 * std.error)) +
  scale_fill_solarized(guide = FALSE) +
  xlab("") +
  ylab("Coefficient Magnitude\n") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

english_predictor_means_2 <- english_data %>%
  filter(floor(aoa) > 4, floor(aoa) <= 24) %>%
  mutate(aoa = cut(floor(aoa), breaks = c(4, 14, 24))) %>%
  gather_("predictor", "value", english_predictors) %>%
  group_by(predictor, aoa) %>%
  filter(!is.na(value)) %>%
  multi_boot_standard(column = "value")

crossling_predictor_means_2 <- crossling_data %>%
  filter(floor(aoa) > 4, floor(aoa) <= 24) %>%
  mutate(aoa = cut(floor(aoa), breaks = c(4, 14, 24))) %>%
  gather_("predictor", "value", crossling_predictors) %>%
  group_by(predictor, aoa) %>%
  filter(!is.na(value)) %>%
  multi_boot_standard(column = "value")

level_base <- english_predictor_means_2 %>% filter(aoa == "(4,14]")
predictor_mean_levels <- level_base$predictor[order(level_base$mean, decreasing = TRUE)]

joined_predictor_means <- bind_rows(
  mutate(english_predictor_means_2, type = "English"),
  mutate(crossling_predictor_means_2, type = "Cross-Linguistic")) %>%
  mutate(type = factor(type, levels = c("English", "Cross-Linguistic")))
joined_predictor_means %>%
  mutate(predictor = factor(predictor, levels = predictor_mean_levels)) %>%
  ggplot(aes(x = aoa, y = mean, color = predictor)) +
  facet_wrap(~type) +
  geom_point(position = position_dodge(width = 0.1)) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper, group = predictor),
                  alpha = 0.4,
                  position = position_dodge(width = 0.1)) +
  geom_line(aes(group = predictor), position = position_dodge(width = 0.1)) +
  geom_dl(aes(label = predictor),
          method = list("first.qp", dl.trans(x = x - 0.3), cex = 0.6)) +
  geom_dl(aes(label = predictor),
          method = list("last.qp", dl.trans(x = x + 0.3), cex = 0.6)) +
  scale_colour_solarized(guide = FALSE) +
  xlab("\nAge of Acquisition (months)") +
  ylab("Mean Predictor Z-Score\n")