http://langcog.github.io/wordbankr/
clothing <- get_item_data(
language = "English (American)",
form = "WS"
) %>%
filter(category == "clothing")
# selecting hat / scarf / mittens
knitted_things <- c("hat", "scarf", "mittens")
# get the items for my selected knitted things
my_knits <- get_item_data(
language = "English (American)",
form = "WS"
) %>%
filter(definition %in% knitted_things)
my_knits
# A tibble: 3 x 11
item_id definition language form type category lexical_category
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 item_165 hat English (… WS word clothing nouns
2 item_168 mittens English (… WS word clothing nouns
3 item_172 scarf English (… WS word clothing nouns
# … with 4 more variables: lexical_class <chr>, uni_lemma <chr>,
# complexity_category <chr>, num_item_id <dbl>
Can I get median total vocab size?
# then get instrument data for those items
words <- get_instrument_data(
language = "English (American)",
form = "WS",
items = my_knits$item_id,
administrations = TRUE,
iteminfo = TRUE
) %>%
mutate(produces = value == "produces") %>%
select(-value) %>%
drop_na(produces) %>%
filter(type == "word", !is.na(uni_lemma))
words
# A tibble: 16,530 x 25
data_id num_item_id age comprehension production language form
<dbl> <dbl> <int> <int> <int> <chr> <chr>
1 129242 165 27 497 497 English (… WS
2 129243 165 21 369 369 English (… WS
3 129244 165 26 190 190 English (… WS
4 129245 165 27 264 264 English (… WS
5 129246 165 19 159 159 English (… WS
6 129247 165 30 513 513 English (… WS
7 129248 165 25 444 444 English (… WS
8 129249 165 24 582 582 English (… WS
9 129250 165 28 558 558 English (… WS
10 129251 165 18 7 7 English (… WS
# … with 16,520 more rows, and 18 more variables: birth_order <fct>,
# ethnicity <fct>, sex <fct>, zygosity <chr>, norming <lgl>,
# mom_ed <fct>, longitudinal <lgl>, source_name <chr>,
# license <chr>, item_id <chr>, definition <chr>, type <chr>,
# category <chr>, lexical_category <chr>, lexical_class <chr>,
# uni_lemma <chr>, complexity_category <chr>, produces <lgl>
words %>%
distinct(license)
# A tibble: 1 x 1
license
<chr>
1 CC-BY
What proportion of kids at each age produce each word?
word_summary
# A tibble: 24 x 4
age lemma prop_produce kids_n
<int> <chr> <dbl> <int>
1 16 hat 0.325 956
2 16 mittens 0.0262 956
3 16 scarf 0.00314 956
4 18 hat 0.543 442
5 18 mittens 0.104 442
6 18 scarf 0.0271 442
7 20 hat 0.675 274
8 20 mittens 0.165 273
9 20 scarf 0.0696 273
10 22 hat 0.785 200
# … with 14 more rows
ggplot(filter(word_summary, lemma == "scarf"),
aes(x = age, y = kids_n)) +
geom_col(fill = "darkcyan", alpha = .5, width = 2)
words_line <-
ggplot(word_summary, aes(x = age, y = prop_produce, color = lemma)) +
geom_smooth(se = FALSE, lwd = .5) +
geom_point(size = 2) +
labs(x = "Age (months)", y = "Proportion of Children Producing")
words_line
words_line + scale_color_paletteer_d("nationalparkcolors::Saguaro")
words_line + scale_color_paletteer_d("nationalparkcolors::Badlands")
words_line + scale_color_paletteer_d("lisa::PabloPicasso")
words_line + scale_color_paletteer_d("lisa::MarcChagall")
paletteer_d("nationalparkcolors::Badlands", n = 3) %>%
prismatic::check_color_blindness()
paletteer_d("nationalparkcolors::Saguaro", n = 3) %>%
prismatic::check_color_blindness()
paletteer_d("colorblindr::OkabeIto", n = 3) %>%
prismatic::check_color_blindness()
words_line + scale_color_paletteer_d("colorblindr::OkabeIto")
ggplot(word_summary, aes(x = age, y = prop_produce)) +
geom_col(aes(fill = lemma), alpha = .8,
position = "identity") +
scale_fill_paletteer_d("colorblindr::OkabeIto")
ggplot(word_summary, aes(x = age, y = prop_produce)) +
geom_col(aes(fill = lemma), alpha = .8) +
scale_fill_paletteer_d("colorblindr::OkabeIto", guide = FALSE) +
facet_wrap(~lemma)
ggplot(word_summary, aes(x = age, y = prop_produce)) +
geom_col(aes(fill = lemma), alpha = .8, position="dodge") +
scale_fill_paletteer_d("colorblindr::OkabeIto")
words_rose <-
ggplot(word_summary,
aes(x = age, y = prop_produce, fill = lemma)) +
geom_col(width = 2) +
coord_polar()
words_rose +
scale_fill_paletteer_d("nationalparkcolors::Saguaro") +
theme(axis.text.y = element_blank())
words_rose +
scale_fill_paletteer_d("nationalparkcolors::Everglades") +
theme(axis.text.y = element_blank())
Could be great for parameters, or facetting?
knitted_langs <- get_crossling_data(uni_lemmas = knitted_things)
knitted_langs %>%
# grouped already? that is tricky
ungroup() %>%
filter(n_children > 100) %>%
distinct(language)
# A tibble: 5 x 1
language
<chr>
1 Danish
2 English (American)
3 Norwegian
4 Swedish
5 Turkish
# oof item ids are different
knitted_langs %>%
# grouped already? that is tricky
ungroup() %>%
filter(n_children > 100) %>%
distinct(item_id, language, uni_lemma, definition)
# A tibble: 7 x 4
language item_id definition uni_lemma
<chr> <chr> <chr> <chr>
1 Danish item_112 hue hat
2 English (American) item_136 hat hat
3 Norwegian item_132 lue hat
4 Norwegian item_143 votter mittens
5 Swedish item_100 mössa hat
6 Swedish item_110 vantar mittens
7 Turkish item_58 Şapka hat
Now let’s export both data frames.