Knitting hats, scarves, and mittens

Data

http://langcog.github.io/wordbankr/

Packages

library(tidyverse)
library(wordbankr)
library(here)
library(paletteer)
library(prismatic)
ggplot2::theme_set(theme_minimal())

Getting knitted things

clothing <- get_item_data(
  language = "English (American)", 
  form = "WS"
  ) %>%
  filter(category == "clothing")

# selecting hat / scarf / mittens
knitted_things <- c("hat", "scarf", "mittens")

# get the items for my selected knitted things
my_knits <- get_item_data(
  language = "English (American)", 
  form = "WS"
  ) %>%
  filter(definition %in% knitted_things)

my_knits

# A tibble: 3 x 11
  item_id  definition language   form  type  category lexical_category
  <chr>    <chr>      <chr>      <chr> <chr> <chr>    <chr>           
1 item_165 hat        English (… WS    word  clothing nouns           
2 item_168 mittens    English (… WS    word  clothing nouns           
3 item_172 scarf      English (… WS    word  clothing nouns           
# … with 4 more variables: lexical_class <chr>, uni_lemma <chr>,
#   complexity_category <chr>, num_item_id <dbl>

Get word production data

Can I get median total vocab size?

# then get instrument data for those items
words <- get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = my_knits$item_id,
  administrations = TRUE,
  iteminfo = TRUE
  ) %>% 
  mutate(produces = value == "produces") %>% 
  select(-value) %>% 
  drop_na(produces) %>%
  filter(type == "word", !is.na(uni_lemma))

words

# A tibble: 16,530 x 25
   data_id num_item_id   age comprehension production language   form 
     <dbl>       <dbl> <int>         <int>      <int> <chr>      <chr>
 1  129242         165    27           497        497 English (… WS   
 2  129243         165    21           369        369 English (… WS   
 3  129244         165    26           190        190 English (… WS   
 4  129245         165    27           264        264 English (… WS   
 5  129246         165    19           159        159 English (… WS   
 6  129247         165    30           513        513 English (… WS   
 7  129248         165    25           444        444 English (… WS   
 8  129249         165    24           582        582 English (… WS   
 9  129250         165    28           558        558 English (… WS   
10  129251         165    18             7          7 English (… WS   
# … with 16,520 more rows, and 18 more variables: birth_order <fct>,
#   ethnicity <fct>, sex <fct>, zygosity <chr>, norming <lgl>,
#   mom_ed <fct>, longitudinal <lgl>, source_name <chr>,
#   license <chr>, item_id <chr>, definition <chr>, type <chr>,
#   category <chr>, lexical_category <chr>, lexical_class <chr>,
#   uni_lemma <chr>, complexity_category <chr>, produces <lgl>

Check licenses

words %>% 
  distinct(license)

# A tibble: 1 x 1
  license
  <chr>  
1 CC-BY

Summarize

What proportion of kids at each age produce each word?

word_summary <- words %>% 
  group_by(age, uni_lemma) %>%
  summarise(kids_produce = sum(produces, na.rm = TRUE),
            kids_n = n_distinct(data_id),
            prop_produce = kids_produce / kids_n) %>% 
  ungroup() %>% 
  # grab even ages only
  filter(age %% 2 == 0) %>% 
  select(age, lemma = uni_lemma, prop_produce, kids_n)

word_summary

# A tibble: 24 x 4
     age lemma   prop_produce kids_n
   <int> <chr>          <dbl>  <int>
 1    16 hat          0.325      956
 2    16 mittens      0.0262     956
 3    16 scarf        0.00314    956
 4    18 hat          0.543      442
 5    18 mittens      0.104      442
 6    18 scarf        0.0271     442
 7    20 hat          0.675      274
 8    20 mittens      0.165      273
 9    20 scarf        0.0696     273
10    22 hat          0.785      200
# … with 14 more rows

Age distribution

ggplot(filter(word_summary, lemma == "scarf"), 
       aes(x = age, y = kids_n)) +
  geom_col(fill = "darkcyan", alpha = .5, width = 2)

Line plot

words_line <-
  ggplot(word_summary, aes(x = age, y = prop_produce, color = lemma)) + 
  geom_smooth(se = FALSE, lwd = .5) +
  geom_point(size = 2)  +
  labs(x = "Age (months)", y = "Proportion of Children Producing") 
words_line

Play with color on line plots

words_line + scale_color_paletteer_d("nationalparkcolors::Saguaro")

words_line + scale_color_paletteer_d("nationalparkcolors::Badlands")

words_line + scale_color_paletteer_d("lisa::PabloPicasso")

words_line + scale_color_paletteer_d("lisa::MarcChagall")

Check colorblindness

paletteer_d("nationalparkcolors::Badlands", n = 3) %>%
  prismatic::check_color_blindness()

paletteer_d("nationalparkcolors::Saguaro", n = 3) %>%
  prismatic::check_color_blindness()

paletteer_d("colorblindr::OkabeIto", n = 3) %>%
  prismatic::check_color_blindness()

Better for colorblind

words_line + scale_color_paletteer_d("colorblindr::OkabeIto")

Play with more plots

Bar plot

ggplot(word_summary, aes(x = age, y = prop_produce)) +
  geom_col(aes(fill = lemma), alpha = .8, 
           position = "identity") +
  scale_fill_paletteer_d("colorblindr::OkabeIto")

Facetted bar plot

ggplot(word_summary, aes(x = age, y = prop_produce)) +
  geom_col(aes(fill = lemma), alpha = .8) +
  scale_fill_paletteer_d("colorblindr::OkabeIto", guide = FALSE) + 
  facet_wrap(~lemma)

Dodged bar plot

ggplot(word_summary, aes(x = age, y = prop_produce)) +
  geom_col(aes(fill = lemma), alpha = .8, position="dodge") +
  scale_fill_paletteer_d("colorblindr::OkabeIto")

Rose chart

words_rose <-
  ggplot(word_summary, 
         aes(x = age, y = prop_produce, fill = lemma)) +
  geom_col(width = 2) +
  coord_polar()

words_rose + 
  scale_fill_paletteer_d("nationalparkcolors::Saguaro") + 
  theme(axis.text.y = element_blank())

words_rose + 
  scale_fill_paletteer_d("nationalparkcolors::Everglades") + 
  theme(axis.text.y = element_blank())

Crosslinguistic data

Could be great for parameters, or facetting?

knitted_langs <- get_crossling_data(uni_lemmas = knitted_things)

knitted_langs %>% 
  # grouped already? that is tricky
  ungroup() %>% 
  filter(n_children > 100) %>% 
  distinct(language)

# A tibble: 5 x 1
  language          
  <chr>             
1 Danish            
2 English (American)
3 Norwegian         
4 Swedish           
5 Turkish

# oof item ids are different
knitted_langs %>% 
  # grouped already? that is tricky
  ungroup() %>% 
  filter(n_children > 100) %>% 
  distinct(item_id, language, uni_lemma, definition)

# A tibble: 7 x 4
  language           item_id  definition uni_lemma
  <chr>              <chr>    <chr>      <chr>    
1 Danish             item_112 hue        hat      
2 English (American) item_136 hat        hat      
3 Norwegian          item_132 lue        hat      
4 Norwegian          item_143 votter     mittens  
5 Swedish            item_100 mössa      hat      
6 Swedish            item_110 vantar     mittens  
7 Turkish            item_58  Şapka      hat

Exporting

Now let’s export both data frames.

write_csv(words, 
          here::here("data","words.csv"))
write_csv(word_summary, 
          here::here("data","word_summary.csv"))