Get items, words, and uni_lemmas.
items <- get_item_data(mode = "local") %>%
mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))),
definition = tolower(definition))
words <- items %>%
filter(type == "word", !is.na(uni_lemma), form == "WG")
uni_lemmas <- words %>%
filter(language == "English") %>%
select(lexical_class, uni_lemma)
Get AoA estimates.
aoa_data <- read_csv("aoa_data.csv")
Get frequency estimates.
instruments <- aoa_data %>%
select(language, form) %>%
distinct()
inst_freqs <- function(language, form) {
freq_file <- sprintf("predictors/frequency/freqs/freqs_%s.csv", tolower(language))
if (file.exists(freq_file)) {
read_csv(freq_file) %>%
rename(definition = item) %>%
mutate(language = language, form = form,
defintion = tolower(definition))
}
}
freqs <- map2(instruments$language, instruments$form, inst_freqs) %>%
bind_rows() %>%
right_join(words)
# right_join(freq_words)
uni_freqs <- freqs %>%
group_by(language, form, lexical_class, uni_lemma) %>%
summarise(frequency = sum(frequency))
Get estimates of valence, arousal, and dominance
valence <- read_csv("predictors/valence/valence.csv") %>%
select(Word, V.Mean.Sum, A.Mean.Sum, D.Mean.Sum) %>%
rename(word = Word, valence = V.Mean.Sum, arousal = A.Mean.Sum,
dominance = D.Mean.Sum)
#missing_valence <- setdiff(uni_lemmas$uni_lemma, valence$word)
# write_csv(data.frame(uni_lemma = missing_babiness),
# "predictors/valence/valence_replace.csv")
replacements_valence <- read_csv("predictors/valence/valence_replace.csv")
uni_valences <- uni_lemmas %>%
left_join(replacements_valence) %>%
rowwise() %>%
mutate(word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
select(-replacement) %>%
left_join(valence)
#uni_valences <- read_csv("predictors/valence/uni_lemma_valence.csv")
Get estimates of iconicity and babiness.
babiness <- read_csv("predictors/babiness_iconicity/english_iconicity.csv") %>%
group_by(word) %>%
summarise(iconicity = mean(rating),
babiness = mean(babyAVG))
#missing_babiness <- setdiff(uni_lemmas$uni_lemma, babiness$word)
# write_csv(data.frame(uni_lemma = missing_babiness),
# "predictors/babiness_iconicity/babiness_iconicity_replace.csv")
replacements_babiness <- read_csv("predictors/babiness_iconicity/babiness_iconicity_replace.csv")
uni_babiness <- uni_lemmas %>%
left_join(replacements_babiness) %>%
rowwise() %>%
mutate(word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
select(-replacement) %>%
left_join(babiness)
#uni_babiness <- read_csv("predictors/babiness_iconicity/babiness_iconicity.csv")
Get estimates of concreteness.
concreteness <- read_csv("predictors/concreteness/concreteness.csv")
#missing_concreteness <- setdiff(uni_lemmas$uni_lemma, concreteness$Word)
# write_csv(data.frame(uni_lemma = missing_concreteness),
# "predictors/concreteness/concreteness_replace.csv")
replacements_concreteness <- read_csv("predictors/concreteness/concreteness_replace.csv")
uni_concreteness <- uni_lemmas %>%
left_join(replacements_concreteness) %>%
rowwise() %>%
mutate(Word = if (!is.na(replacement) & replacement != "") replacement else uni_lemma) %>%
select(-replacement) %>%
left_join(concreteness) %>%
rename(concreteness = Conc.M) %>%
select(uni_lemma, concreteness)
Get (English) phoneme and syllable counts.
phonemes <- read_csv("predictors/phonemes/english_phonemes.csv") %>%
mutate(num_syllables = unlist(map(strsplit(syllables, " "), length)),
num_phonemes = nchar(gsub("[', ]", "", syllables))) %>%
select(-phones, -syllables)
Put together data and predictors.
uni_joined <- aoa_data %>%
left_join(uni_freqs) %>%
left_join(uni_valences) %>%
left_join(uni_babiness) %>%
left_join(uni_concreteness) %>%
left_join(phonemes) %>%
mutate(frequency = log(frequency))
Function to get number of characters from item definitions.
num_characters <- function(words) {
strsplit(words, ", ") %>%
map(function(word_set) {
word_set %>%
unlist() %>%
strsplit(" [(].*[)]") %>%
unlist() %>%
strsplit("/") %>%
unlist() %>%
nchar() %>%
mean()
}) %>%
unlist()
}
english_predictors <- c("frequency", "concreteness", "num_characters",
"valence", "arousal", "babiness")
# "iconicity" "dominance" "num_syllables" "num_phonemes"
woof_concreteness <- unique(uni_joined$concreteness[uni_joined$uni_lemma == "woof woof"])
mean_babiness <- mean(filter(uni_joined, language == "English")$babiness, na.rm = TRUE)
mean_iconicity <- mean(filter(uni_joined, language == "English")$iconicity, na.rm = TRUE)
english_data <- uni_joined %>%
filter(language == "English", measure == "understands") %>%
rowwise() %>%
mutate(num_characters = num_characters(words),
valence = if (is.na(valence)) 5 else valence,
arousal = if (is.na(arousal)) 1 else arousal,
concreteness = if (is.na(concreteness)) woof_concreteness else concreteness,
babiness = if (is.na(babiness)) mean_babiness else babiness,
iconicity = if (is.na(iconicity)) mean_iconicity else iconicity) %>%
ungroup() %>%
# mutate(valence = abs(valence - 5)) %>%
group_by(language, measure) %>%
mutate_each_(funs(as.numeric(scale(.))), english_predictors)
english_na_lemmas <- english_data %>%
select_(.dots = c("uni_lemma", english_predictors)) %>%
gather(predictor, value, -uni_lemma) %>%
filter(is.na(value)) %>%
group_by(uni_lemma) %>%
summarise(num_na = n(), na_preds = paste(predictor, collapse = ", "))
english_model_data <- english_data %>%
ungroup() %>%
select_(.dots = c("language", "lexical_class", "uni_lemma", "aoa", english_predictors)) %>%
filter(complete.cases(.)) %>%
mutate(lexical_class = factor(lexical_class,
levels = c("nouns", "adjectives", "verbs",
"function_words", "other"),
labels = c("Nouns", "Adjectives", "Verbs",
"Function Words", "Other")))
english_formula <- as.formula(
sprintf("aoa ~ %s", paste(english_predictors, collapse = " + "))
)
english_model <- lm(english_formula, data = english_model_data)
english_predictions <- english_model_data
english_predictions$predicted_aoa <- english_model$fitted.values
#cor(english_predictions$aoa, english_predictions$predicted_aoa)
english_coef <- tidy(english_model) %>%
filter(term != "(Intercept)")
english_predictors_ordered <- english_predictors %>%
factor(levels = english_coef$term[order(abs(english_coef$estimate))])
english_predictor_levels <- english_coef$term[order(abs(english_coef$estimate),
decreasing = TRUE)]
english_coef <- english_coef %>%
mutate(term = factor(term, levels = english_predictor_levels))
english_plot_data <- english_data %>%
gather_("predictor", "value", english_predictors) %>%
mutate(predictor = factor(predictor, levels = english_predictor_levels),
lexical_class = factor(lexical_class,
levels = c("nouns", "adjectives", "verbs",
"function_words", "other"),
labels = c("Nouns", "Adjectives", "Verbs",
"Function Words", "Other")))
ggplot(english_plot_data, aes(x = value, y = aoa, colour = lexical_class)) +
facet_grid(lexical_class ~ predictor) +
geom_point(cex = 0.7) +
geom_smooth(method = "lm", se = FALSE, cex = 1) +
scale_colour_solarized(name = "") +
xlab("\nPredictor Z-Score") +
ylab("Age of Acquisition (months)\n") +
theme(legend.position = "bottom",
legend.direction = "horizontal")
ggplot(english_predictions, aes(x = predicted_aoa, y = aoa)) +
geom_text(aes(label = uni_lemma, colour = lexical_class), cex = 4, show_guide = FALSE) +
geom_smooth(aes(colour = lexical_class), weight = 1, method = "lm", se = FALSE) +
geom_smooth(colour = "black", weight = 2, method = "lm") +
scale_colour_solarized(name = "") +
scale_x_continuous(name = "\nModel Predicted Age of Acquisition (months)",
limits = c(6, 27), breaks = seq(6, 26, by = 2)) +
scale_y_continuous(name = "Age of Acquisition (months)\n",
limits = c(6, 27), breaks = seq(6, 26, by = 2)) +
coord_fixed() +
theme(panel.grid.minor = element_blank(),
legend.position = c(0.12, 0.9),
legend.background = element_rect(fill = "transparent"))
ggplot(english_coef, aes(x = term, y = abs(estimate), fill = term)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = abs(estimate) - 1.96 * std.error,
ymax = abs(estimate) + 1.96 * std.error)) +
scale_fill_solarized(guide = FALSE) +
xlab("") +
ylab("Coefficient Magnitude\n") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
english_predictor_means <- english_data %>%
filter(aoa >= 6, aoa <= 25) %>%
mutate(aoa = cut(floor(aoa), 4)) %>% #breaks = c(4, 8, 12, 16, 20, 24))) %>%
gather_("predictor", "value", english_predictors) %>%
group_by(predictor, aoa) %>%
filter(!is.na(value)) %>%
multi_boot_standard(column = "value")
ggplot(english_predictor_means, aes(x = aoa, y = mean, color = predictor)) +
facet_grid(. ~ predictor) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_line(aes(group = predictor)) +
scale_colour_solarized(guide = FALSE) +
xlab("\nAge of Acquisition (months)") +
ylab("Mean Predictor Z-Score\n") +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
crossling_predictors <- c("frequency", "num_characters", "concreteness",
"valence", "arousal", "babiness")
woof_concreteness <- unique(uni_joined$concreteness[uni_joined$uni_lemma == "woof woof"])
mean_babiness <- mean(filter(uni_joined, language == "English")$babiness, na.rm = TRUE)
mean_iconicity <- mean(filter(uni_joined, language == "English")$iconicity, na.rm = TRUE)
crossling_data <- uni_joined %>%
filter(measure == "understands") %>%
rowwise() %>%
mutate(num_characters = num_characters(words),
valence = if (is.na(valence)) 5 else valence,
arousal = if (is.na(arousal)) 1 else arousal,
concreteness = if (is.na(concreteness)) woof_concreteness else concreteness,
babiness = if (is.na(babiness)) mean_babiness else babiness,
iconicity = if (is.na(iconicity)) mean_iconicity else iconicity) %>%
group_by(language, measure) %>%
mutate_each_(funs(as.numeric(scale(.))), crossling_predictors)
crossling_na_lemmas <- crossling_data %>%
select_(.dots = c("language", "uni_lemma", crossling_predictors)) %>%
gather_("predictor", "value", crossling_predictors) %>%
filter(is.na(value)) %>%
group_by(language, uni_lemma) %>%
summarise(num_na = n(), na_preds = paste(predictor, collapse = ", "))
crossling_model_data <- crossling_data %>%
ungroup() %>%
select_("language", "lexical_class", "uni_lemma", "aoa", crossling_predictors) %>%
filter(complete.cases(.)) %>%
mutate(lexical_class = factor(lexical_class,
levels = c("nouns", "adjectives", "verbs",
"function_words", "other"),
labels = c("Nouns", "Adjectives", "Verbs",
"Function Words", "Other")))
crossling_model <- lmer(aoa ~ frequency + num_characters + concreteness + valence +
arousal + babiness + (1 + frequency + num_characters +
concreteness + valence + arousal +
babiness | language),
data = crossling_data)
crossling_coef <- data.frame(term = row.names(summary(crossling_model)$coefficients),
estimate = summary(crossling_model)$coefficients[,"Estimate"],
std.error = summary(crossling_model)$coefficients[,"Std. Error"],
row.names = NULL) %>%
filter(term != "(Intercept)")
crossling_predictors_ordered <- crossling_predictors %>%
factor(levels = crossling_coef$term[order(abs(crossling_coef$estimate))])
crossling_predictor_levels <- crossling_coef$term[order(abs(crossling_coef$estimate),
decreasing = TRUE)]
crossling_coef <- crossling_coef %>%
mutate(term = factor(term, levels = crossling_predictor_levels))
ggplot(crossling_coef, aes(x = term, y = abs(estimate), fill = term)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = abs(estimate) - 1.96 * std.error,
ymax = abs(estimate) + 1.96 * std.error)) +
scale_fill_solarized(guide = FALSE) +
xlab("") +
ylab("Coefficient Magnitude\n") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
crossling_predictor_means_by_lang <- crossling_data %>%
filter(aoa <= 25) %>%
mutate(aoa = cut(floor(aoa), breaks = c(4, 8, 12, 16, 20, 24))) %>%
gather_("predictor", "value", crossling_predictors) %>%
group_by(language, predictor, aoa) %>%
filter(!is.na(value)) %>%
multi_boot_standard(column = "value")
crossling_predictor_means_by_lang %>%
ungroup() %>%
mutate(predictor = factor(predictor, levels = c(english_predictor_levels,
"num_characters"))) %>%
ggplot(aes(x = aoa, y = mean, color = language)) +
facet_grid(. ~ predictor) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_line(aes(group = language)) +
scale_colour_solarized(name = "") +
xlab("\nAge of Acquisition (months)") +
ylab("Mean Predictor Z-Score\n") +
theme(axis.text.x = element_text(angle = 30, hjust = 1),
legend.position = "bottom",
legend.direction = "horizontal")
crossling_predictor_means <- crossling_data %>%
filter(aoa < 25) %>%
mutate(aoa = cut(floor(aoa), breaks = c(4, 8, 12, 16, 20, 24))) %>%
gather_("predictor", "value", crossling_predictors) %>%
group_by(predictor, aoa) %>%
filter(!is.na(value)) %>%
multi_boot_standard(column = "value")
crossling_predictor_means %>%
ungroup() %>%
mutate(predictor = factor(predictor, levels = c(english_predictor_levels,
"num_characters"))) %>%
ggplot(aes(x = aoa, y = mean, color = predictor)) +
facet_grid(. ~ predictor) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_line(aes(group = predictor)) +
scale_colour_solarized(guide = FALSE) +
xlab("\nAge of Acquisition (months)") +
ylab("Mean Predictor Z-Score\n") +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
joined_coef <- bind_rows(mutate(english_coef, type = "English"),
mutate(crossling_coef, type = "Cross-Linguistic")) %>%
mutate(type = factor(type, levels = c("English", "Cross-Linguistic")),
term = factor(term, levels = english_predictor_levels))
ggplot(joined_coef, aes(x = term, y = abs(estimate), fill = term)) +
facet_wrap("type", scales = "free_x") +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = abs(estimate) - 1.96 * std.error,
ymax = abs(estimate) + 1.96 * std.error)) +
scale_fill_solarized(guide = FALSE) +
xlab("") +
ylab("Coefficient Magnitude\n") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
english_predictor_means_2 <- english_data %>%
filter(floor(aoa) > 4, floor(aoa) <= 24) %>%
mutate(aoa = cut(floor(aoa), breaks = c(4, 14, 24))) %>%
gather_("predictor", "value", english_predictors) %>%
group_by(predictor, aoa) %>%
filter(!is.na(value)) %>%
multi_boot_standard(column = "value")
crossling_predictor_means_2 <- crossling_data %>%
filter(floor(aoa) > 4, floor(aoa) <= 24) %>%
mutate(aoa = cut(floor(aoa), breaks = c(4, 14, 24))) %>%
gather_("predictor", "value", crossling_predictors) %>%
group_by(predictor, aoa) %>%
filter(!is.na(value)) %>%
multi_boot_standard(column = "value")
level_base <- english_predictor_means_2 %>% filter(aoa == "(4,14]")
predictor_mean_levels <- level_base$predictor[order(level_base$mean, decreasing = TRUE)]
joined_predictor_means <- bind_rows(
mutate(english_predictor_means_2, type = "English"),
mutate(crossling_predictor_means_2, type = "Cross-Linguistic")) %>%
mutate(type = factor(type, levels = c("English", "Cross-Linguistic")))
joined_predictor_means %>%
mutate(predictor = factor(predictor, levels = predictor_mean_levels)) %>%
ggplot(aes(x = aoa, y = mean, color = predictor)) +
facet_wrap(~type) +
geom_point(position = position_dodge(width = 0.1)) +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper, group = predictor),
alpha = 0.4,
position = position_dodge(width = 0.1)) +
geom_line(aes(group = predictor), position = position_dodge(width = 0.1)) +
geom_dl(aes(label = predictor),
method = list("first.qp", dl.trans(x = x - 0.3), cex = 0.6)) +
geom_dl(aes(label = predictor),
method = list("last.qp", dl.trans(x = x + 0.3), cex = 0.6)) +
scale_colour_solarized(guide = FALSE) +
xlab("\nAge of Acquisition (months)") +
ylab("Mean Predictor Z-Score\n")