Get items, words, and uni_lemmas.

items <- get_item_data(mode = "local") %>%
  mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))),
         definition = tolower(definition))

words <- items %>%
  filter(type == "word", !is.na(uni_lemma), form == "WG")

uni_lemmas <- words %>%
  filter(language == "English") %>%
  select(lexical_class, uni_lemma)
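
The joins below assume one row per uni_lemma/lexical_class combination; depending on the item data, the same uni_lemma can map to more than one item, so a quick duplicate count (followed by a distinct() if it comes out nonzero) is a cheap safeguard.

sum(duplicated(uni_lemmas))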

Get AoA estimates.

aoa_data <- read_csv("aoa_data.csv")
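
The rest of this script assumes aoa_data.csv has (at least) the columns it uses downstream; the list below is inferred from those later uses, not from the file itself.

stopifnot(all(c("language", "form", "measure", "uni_lemma", "words", "aoa") %in%
                names(aoa_data)))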

Get frequency estimates.

instruments <- aoa_data %>%
  select(language, form) %>%
  distinct()

inst_freqs <- function(language, form) {
  freq_file <- sprintf("predictors/frequency/freqs/freqs_%s.csv", tolower(language))
  if (file.exists(freq_file)) {
    read_csv(freq_file) %>%
      rename(definition = item) %>%
      mutate(language = language, form = form,
           definition = tolower(definition))
  }
}
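
The path is built from the lower-cased language name, so for example the English frequency file is expected at the location below; if a file is missing, the if with no else returns NULL and bind_rows() simply skips that instrument.

sprintf("predictors/frequency/freqs/freqs_%s.csv", tolower("English"))
#> [1] "predictors/frequency/freqs/freqs_english.csv"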

freqs <- map2(instruments$language, instruments$form, inst_freqs) %>%
  bind_rows() %>%
  right_join(words)

uni_freqs <- freqs %>%
  group_by(language, form, lexical_class, uni_lemma) %>%
  summarise(frequency = sum(frequency))

Get estimates of valence, arousal, and dominance.

valence <- read_csv("predictors/valence/valence.csv") %>%
  select(Word, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum) %>%
  rename(word = Word, valence = V.Mean.Sum, arousal = A.Mean.Sum,
         dominance = D.Mean.Sum)

#missing_valence <- setdiff(uni_lemmas$uni_lemma, valence$word)
# write_csv(data.frame(uni_lemma = missing_valence),
#           "predictors/valence/valence_replace.csv")

replacements_valence <- read_csv("predictors/valence/valence_replace.csv")
uni_valences <- uni_lemmas %>%
  left_join(replacements_valence) %>%
  rowwise() %>%
  mutate(word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
  select(-replacement) %>%
  left_join(valence)
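
As a coverage check, the count below gives the number of uni_lemmas that still lack a valence rating after the replacements are applied; these carry through as NAs into uni_joined.

sum(is.na(uni_valences$valence))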

#uni_valences <- read_csv("predictors/valence/uni_lemma_valence.csv")

Get estimates of iconicity and babiness.

babiness <- read_csv("predictors/babiness_iconicity/english_iconicity.csv") %>%
  group_by(word) %>%
  summarise(iconicity = mean(rating),
            babiness = mean(babyAVG))

#missing_babiness <- setdiff(uni_lemmas$uni_lemma, babiness$word)
# write_csv(data.frame(uni_lemma = missing_babiness),
#           "predictors/babiness_iconicity/babiness_iconicity_replace.csv")

replacements_babiness <- read_csv("predictors/babiness_iconicity/babiness_iconicity_replace.csv")
uni_babiness <- uni_lemmas %>%
  left_join(replacements_babiness) %>%
  rowwise() %>%
  mutate(word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
  select(-replacement) %>%
  left_join(babiness)

#uni_babiness <- read_csv("predictors/babiness_iconicity/babiness_iconicity.csv") 

Get estimates of concreteness.

concreteness <- read_csv("predictors/concreteness/concreteness.csv")

#missing_concreteness <- setdiff(uni_lemmas$uni_lemma, concreteness$Word)
# write_csv(data.frame(uni_lemma = missing_concreteness),
#           "predictors/concreteness/concreteness_replace.csv")

replacements_concreteness <- read_csv("predictors/concreteness/concreteness_replace.csv")
uni_concreteness <- uni_lemmas %>%
  left_join(replacements_concreteness) %>%
  rowwise() %>%
  mutate(Word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
  select(-replacement) %>%
  left_join(concreteness) %>%
  rename(concreteness = Conc.M) %>%
  select(uni_lemma, concreteness)

Get (English) phoneme and syllable counts.

phonemes <- read_csv("predictors/phonemes/english_phonemes.csv") %>%
  mutate(num_syllables = unlist(map(strsplit(syllables, " "), length)),
         num_phonemes = nchar(gsub("[', ]", "", syllables))) %>%
  select(-phones, -syllables)
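
To make the counting concrete, here is a sketch on a made-up syllables string; the exact formatting of english_phonemes.csv (space-separated syllables with single-character phonemes, possibly quoted and comma-separated) is an assumption here, not taken from the file.

example_syllables <- "'b,e' 'b,i'"             # assumed format: two syllables
length(strsplit(example_syllables, " ")[[1]])  # 2 syllables
nchar(gsub("[', ]", "", example_syllables))    # 4 phonemes ("bebi")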

Put together data and predictors.

uni_joined <- aoa_data %>%
  left_join(uni_freqs) %>%
  left_join(uni_valences) %>%
  left_join(uni_babiness) %>%
  left_join(uni_concreteness) %>%
  left_join(phonemes) %>%
  mutate(frequency = log(frequency))
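
Note that log() maps a frequency of zero to -Inf; the count below is just a sanity check that no such rows slip into the models unnoticed.

sum(is.infinite(uni_joined$frequency))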

Function to get number of characters from item definitions.

num_characters <- function(words) {
  # Each item's definition is a comma-separated list of forms; parenthetical
  # glosses (e.g. "(animal sound)") are stripped, slash-separated alternatives
  # are split apart, and character counts are averaged per item.
  strsplit(words, ", ") %>%
    map(function(word_set) {
      word_set %>%
        unlist() %>%
        strsplit(" [(].*[)]") %>%
        unlist() %>%
        strsplit("/") %>%
        unlist() %>%
        nchar() %>%
        mean()
    }) %>%
    unlist()
}
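
A usage sketch on invented definitions formatted like CDI items (a parenthetical gloss is stripped; slash-separated alternatives are averaged):

num_characters(c("woof woof (animal sound)",  # nchar("woof woof") = 9
                 "mommy/mama"))               # mean(nchar(c("mommy", "mama"))) = 4.5
#> [1] 9.0 4.5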

Analysis 1: English

english_predictors <- c("frequency", "concreteness", "num_characters",
                        "valence", "arousal", "babiness")
# "iconicity" "dominance" "num_syllables" "num_phonemes"

woof_concreteness <- unique(uni_joined$concreteness[uni_joined$uni_lemma == "woof woof"])
mean_babiness <- mean(filter(uni_joined, language == "English")$babiness, na.rm = TRUE)
mean_iconicity <- mean(filter(uni_joined, language == "English")$iconicity, na.rm = TRUE)

english_data <- uni_joined %>%
  filter(language == "English", measure == "understands") %>%
  rowwise() %>%
  mutate(num_characters = num_characters(words),
         valence = if (is.na(valence)) 5 else valence,
         arousal = if (is.na(arousal)) 1 else arousal,
         concreteness = if (is.na(concreteness)) woof_concreteness else concreteness,
         babiness = if (is.na(babiness)) mean_babiness else babiness,
         iconicity = if (is.na(iconicity)) mean_iconicity else iconicity) %>%
  ungroup() %>%
  #  mutate(valence = abs(valence - 5)) %>%
  group_by(language, measure) %>%
  mutate(across(all_of(english_predictors), ~ as.numeric(scale(.x))))
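
Since english_data is still grouped by language and measure at this point, a quick way to confirm the scaling is to check that each predictor now has mean near 0 and sd near 1 (NAs excluded):

english_data %>%
  summarise(across(all_of(english_predictors),
                   list(mean = ~ mean(.x, na.rm = TRUE), sd = ~ sd(.x, na.rm = TRUE))))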

english_na_lemmas <- english_data %>%
  select_(.dots = c("uni_lemma", english_predictors)) %>%
  gather(predictor, value, -uni_lemma) %>%
  filter(is.na(value)) %>%
  group_by(uni_lemma) %>%
  summarise(num_na = n(), na_preds = paste(predictor, collapse = ", "))
english_model_data <- english_data %>%
  ungroup() %>%
  select_(.dots = c("language", "lexical_class", "uni_lemma", "aoa", english_predictors)) %>%
  filter(complete.cases(.)) %>%
  mutate(lexical_class = factor(lexical_class,
                                levels = c("nouns", "adjectives", "verbs",
                                           "function_words", "other"),
                                labels = c("Nouns", "Adjectives", "Verbs",
                                           "Function Words", "Other")))

english_formula <- as.formula(
  sprintf("aoa ~ %s", paste(english_predictors, collapse = " + "))
)
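
For reference, printing the formula shows exactly what the model fits given the predictor set above (this just echoes english_formula, nothing new is assumed):

english_formula
#> aoa ~ frequency + concreteness + num_characters + valence + arousal + babiness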

english_model <- lm(english_formula, data = english_model_data)
english_predictions <- english_model_data
english_predictions$predicted_aoa <- english_model$fitted.values

#cor(english_predictions$aoa, english_predictions$predicted_aoa)

english_coef <- tidy(english_model) %>%
  filter(term != "(Intercept)")

english_predictors_ordered <- english_predictors %>%
  factor(levels = english_coef$term[order(abs(english_coef$estimate))])

english_predictor_levels <- english_coef$term[order(abs(english_coef$estimate),
                                                    decreasing = TRUE)]
english_coef <- english_coef %>%
  mutate(term = factor(term, levels = english_predictor_levels))
english_plot_data <- english_data %>%
  gather_("predictor", "value", english_predictors) %>%
  mutate(predictor = factor(predictor, levels = english_predictor_levels),
         lexical_class = factor(lexical_class,
                                levels = c("nouns", "adjectives", "verbs",
                                           "function_words", "other"),
                                labels = c("Nouns", "Adjectives", "Verbs",
                                           "Function Words", "Other")))

ggplot(english_plot_data, aes(x = value, y = aoa, colour = lexical_class)) +
  facet_grid(lexical_class ~ predictor) +
  geom_point(size = 0.7) +
  geom_smooth(method = "lm", se = FALSE, size = 1) +
  scale_colour_solarized(name = "") +
  xlab("\nPredictor Z-Score") +
  ylab("Age of Acquisition (months)\n") +
  theme(legend.position = "bottom",
        legend.direction = "horizontal")

ggplot(english_predictions, aes(x = predicted_aoa, y = aoa)) +
  geom_text(aes(label = uni_lemma, colour = lexical_class), size = 4, show.legend = FALSE) +
  geom_smooth(aes(colour = lexical_class), size = 1, method = "lm", se = FALSE) +
  geom_smooth(colour = "black", size = 2, method = "lm") +
  scale_colour_solarized(name = "") +
  scale_x_continuous(name = "\nModel Predicted Age of Acquisition (months)",
                     limits = c(6, 27), breaks = seq(6, 26, by = 2)) +
  scale_y_continuous(name = "Age of Acquisition (months)\n",
                     limits = c(6, 27), breaks = seq(6, 26, by = 2)) +
  coord_fixed() +
  theme(panel.grid.minor = element_blank(),
        legend.position = c(0.12, 0.9),
        legend.background = element_rect(fill = "transparent"))