I downloaded all available equity data for Duke and UNC from 2003 through 2021.
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.2 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
General format information: everything is in separate files, but the format is the same (1 row per school per year).
Some of these counts appear to be inaccurate–for example, neither the UNC nor the Duke 2021 women’s basketball team is listed as having a head coach. A quick google search shows this is clearly incorrect. There are no part time female head coaches listed at all, in any sport–I have a feeling this might be the source of the error (but have no way to prove it).
To avoid drawing conclusions that rest on this data issue, and also for simplicity, I am just going to use the total number of coaches, not split out by coach gender or full time/part time status or head/assistant status.
# Load and transform the data on number of male and female coaches for mens and womens teamscommoncols <-c("Survey Year", "UNITID", "Institution Name", "OPE ID", "State CD", "Classification Name", "Classification Other", "Sanction Code", "Sanction Name", "Male Undergraduates", "Female Undergraduates", "Type of Varsity Sports Teams")# Load and transform data on number of athletesathletes <-read_csv("raw/Athletics_Participation_All_Sports_and_Men's_Women's_and_Coed_Teams_2003_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021.csv", show_col_types =FALSE) |>select(-contains("Coed"),-contains("Total")) |>pivot_longer(names_to =c("sport", "teamgender"),names_pattern ="(.*) ([[:alpha:]]*en)'s .*",values_to ="nplayers",cols =-any_of(commoncols)) |>rename(school ="Institution Name",year ="Survey Year",maleug ="Male Undergraduates",femaleug ="Female Undergraduates",division ="Classification Name") |>filter(!is.na(nplayers)) |>mutate(teamgender =tolower(teamgender),school =ifelse(school =="Duke University", "Duke", "UNC")) |>select(school, year, division, sport, teamgender, nplayers) |>filter(sport !="Unduplicated Count")reshape_sports <-function(f){ tomerge <-read_csv(paste0("raw/", f), show_col_types =FALSE) |># we can calculate totals--so drop them; they complicate mergingselect(-contains("Total")) |>pivot_longer(names_to =c("sport", "coachgender", "coachlevel", "assignstatus"),names_pattern ="(.*) ([[:alpha:]]*ale) (.*) Coach (.T)",values_to ="ncoaches",cols =-any_of(commoncols) ) |>rename(teamgender ="Type of Varsity Sports Teams",school ="Institution Name",year ="Survey Year",maleug ="Male Undergraduates",femaleug ="Female Undergraduates",division ="Classification Name") |>select(school, year, division, teamgender, sport, coachgender, coachlevel, assignstatus, ncoaches) |>mutate(teamgender =ifelse(teamgender =="Men's Team", "men", "women"),coachgender =ifelse(coachgender =="Male", "men", "women"),coachlevel =ifelse(coachlevel =="Assistant", "assistant", "head"),school =ifelse(school =="Duke University", "Duke", "UNC")) |>pivot_wider(names_from =c(assignstatus, coachlevel, coachgender), values_from = ncoaches) # |># select(where(~any(!is.na(.))))return(tomerge)}assistants <-tibble()heads <-tibble()coachfiles <-list.files("raw/")[grepl("Coaches_", list.files("raw/"))]for(f in coachfiles){ tomerge <-reshape_sports(f)if(str_detect(f, "Assistant")){if(nrow(assistants) ==0){ assistants <- tomerge } else { assistants <-full_join(assistants, tomerge) } } elseif(str_detect(f, "Head")){if(nrow(heads) ==0){ heads <- tomerge } else { heads <-full_join(heads, tomerge) } }}
# Load and transform the data on average salaries for coaches on mens and womens teamscoachsalaries <-tibble()salaryfiles <-list.files("raw/")[grepl("Salaries", list.files("raw/"))]for(f in salaryfiles){ coachtype <-str_extract(f, "^[[:alpha:]]*") tomerge <-read_csv(paste0("raw/", f), show_col_types =FALSE) |>select(-contains("Coed"), -contains("Total")) |>pivot_longer(names_to =c("teamgender", "population"),names_pattern ="(.*)'s Team .* per (.*)",values_to ="avsalary",cols =c(-any_of(commoncols), -contains("Number"))) |>mutate(population =ifelse(population =="FTE", paste0(coachtype, " FTE"), population))if(nrow(coachsalaries) ==0){ coachsalaries <- tomerge } else { coachsalaries <-full_join(coachsalaries, tomerge) }}
Joining with `by = join_by(`Survey Year`, UNITID, `OPE ID`, `Institution Name`,
`State CD`, `Classification Name`, `Classification Other`, `Sanction Code`,
`Sanction Name`, `Male Undergraduates`, `Female Undergraduates`, `Men's Team
Number of FTEs Included in Average`, `Women's Team Number of FTEs Included in
Average`, teamgender, population, avsalary)`
coachsalaries <- coachsalaries |>rename(men_fte_n ="Men's Team Number of FTEs Included in Average",women_fte_n ="Women's Team Number of FTEs Included in Average",men_asstcoach_n ="Men's Team Number of Assistant Coaches Included in Average",women_asstcoach_n ="Women's Team Number of Assistant Coaches Included in Average",men_headcoach_n ="Men's Team Number of Head Coaches Included in Average",women_headcoach_n ="Women's Team Number of Head Coaches Included in Average") |>mutate(ncoaches =case_when(teamgender =="Men"&str_detect(population, "FTE") ~ men_fte_n, teamgender =="Women"&str_detect(population, "FTE") ~ women_fte_n, teamgender =="Men"& population =="Assistant Coach"~ men_asstcoach_n, teamgender =="Women"& population =="Assistant Coach"~ women_asstcoach_n, teamgender =="Men"& population =="Head Coach"~ men_headcoach_n, teamgender =="Women"& population =="Head Coach"~ women_headcoach_n, )) |>mutate(coachlevel =tolower(str_extract(population, "^[[:alpha:]]*")),emptype =tolower(str_extract(population, "[[:alpha:]]*$"))) |>rename(school ="Institution Name",year ="Survey Year",maleug ="Male Undergraduates",femaleug ="Female Undergraduates",division ="Classification Name") |>select(school, year, division, teamgender, coachlevel, emptype, avsalary, ncoaches) |>mutate(teamgender =tolower(teamgender),school =ifelse(school =="Duke University", "Duke", "UNC"))saveRDS(coachsalaries, file ="../slides/data/sports_coachsalaries.rds")saveRDS(coachsalaries, file ="sports_coachsalaries.rds")
# Load and transform data on aid given to student athletes and recruiting expenses (whole athletic program)aid <-tibble()
# Load and transform data on expenses and revenues (sport-specific)money <-tibble()