This file cleans data downloaded from the Equity in Athletics Data Analysis initiative, managed by the U.S. Department of Education.

https://ope.ed.gov/athletics/#/

I downloaded all available equity data for Duke and UNC from 2003 through 2021.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

General format information: everything is in separate files, but the format is the same (1 row per school per year).

Some of these counts appear to be inaccurate–for example, neither the UNC nor the Duke 2021 women’s basketball team is listed as having a head coach. A quick google search shows this is clearly incorrect. There are no part time female head coaches listed at all, in any sport–I have a feeling this might be the source of the error (but have no way to prove it).

To avoid drawing conclusions that rest on this data issue, and also for simplicity, I am just going to use the total number of coaches, not split out by coach gender or full time/part time status or head/assistant status.

# Load and transform the data on number of male and female coaches for mens and womens teams
commoncols <- c("Survey Year", "UNITID", "Institution Name", "OPE ID", "State CD", "Classification Name", "Classification Other", "Sanction Code", "Sanction Name", "Male Undergraduates", "Female Undergraduates", "Type of Varsity Sports Teams")


# Load and transform data on number of athletes
athletes <- read_csv("raw/Athletics_Participation_All_Sports_and_Men's_Women's_and_Coed_Teams_2003_2004_2005_2006_2007_2008_2009_2010_2011_2012_2013_2014_2015_2016_2017_2018_2019_2020_2021.csv", 
                          show_col_types = FALSE) |> 
  select(-contains("Coed"),
         -contains("Total")) |> 
  pivot_longer(names_to = c("sport", "teamgender"),
               names_pattern = "(.*) ([[:alpha:]]*en)'s .*",
               values_to = "nplayers",
               cols = -any_of(commoncols)) |> 
  rename(
    school = "Institution Name",
    year = "Survey Year",
    maleug = "Male Undergraduates",
    femaleug = "Female Undergraduates",
    division = "Classification Name") |> 
  filter(!is.na(nplayers)) |> 
  mutate(teamgender = tolower(teamgender),
         school = ifelse(school == "Duke University", "Duke", "UNC")) |> 
  select(school, year, division, sport, teamgender, nplayers) |> 
  filter(sport != "Unduplicated Count")

reshape_sports <- function(f){
  tomerge <- read_csv(paste0("raw/", f), show_col_types = FALSE) |> 
    # we can calculate totals--so drop them; they complicate merging
    select(-contains("Total")) |> 
    pivot_longer(names_to = c("sport", "coachgender", "coachlevel", "assignstatus"),
                 names_pattern = "(.*) ([[:alpha:]]*ale) (.*) Coach (.T)",
                 values_to = "ncoaches",
                 cols = -any_of(commoncols)
    ) |> 
    rename(teamgender = "Type of Varsity Sports Teams",
           school = "Institution Name",
           year = "Survey Year",
           maleug = "Male Undergraduates",
           femaleug = "Female Undergraduates",
           division = "Classification Name") |> 
    select(school, year, division, teamgender, sport, coachgender, coachlevel, assignstatus, ncoaches) |> 
    mutate(teamgender = ifelse(teamgender == "Men's Team", "men", "women"),
           coachgender = ifelse(coachgender == "Male", "men", "women"),
           coachlevel = ifelse(coachlevel == "Assistant", "assistant", "head"),
           school = ifelse(school == "Duke University", "Duke", "UNC")) |> 
    pivot_wider(
      names_from = c(assignstatus, coachlevel, coachgender), 
      values_from = ncoaches) 
  
  # |>
  #   select(where(~any(!is.na(.))))
  
  return(tomerge)
}

assistants <- tibble()
heads <- tibble()
coachfiles <- list.files("raw/")[grepl("Coaches_", list.files("raw/"))]
for(f in coachfiles){
  tomerge <- reshape_sports(f)
  
  if(str_detect(f, "Assistant")){
    if(nrow(assistants) == 0){
      assistants <- tomerge
    } else {
      assistants <- full_join(assistants, tomerge)
    }
  } else if(str_detect(f, "Head")){
    if(nrow(heads) == 0){
      heads <- tomerge
    } else {
      heads <- full_join(heads, tomerge)
    }
  }
}
Joining with `by = join_by(school, year, division, teamgender, sport,
FT_assistant_men, PT_assistant_men, FT_assistant_women, PT_assistant_women)`
Joining with `by = join_by(school, year, division, teamgender, sport,
FT_head_men, PT_head_men, FT_head_women, PT_head_women)`
teams <- assistants |> 
  full_join(heads)
Joining with `by = join_by(school, year, division, teamgender, sport)`
teams <- teams |> 
  filter(any(
    !is.na(PT_head_women),
    !is.na(PT_head_men),
    !is.na(PT_assistant_women),
    !is.na(PT_assistant_men),
    !is.na(FT_head_women),
    !is.na(FT_head_men),
    !is.na(FT_assistant_women),
    !is.na(FT_assistant_men)
  ))

teams[is.na(teams)] <- 0

teams <- teams |> 
  mutate(
    ncoaches = 
      PT_head_men + FT_head_men + PT_head_women + FT_head_women + 
      PT_assistant_men + FT_assistant_men + PT_assistant_women + FT_assistant_women,
    coaches_head = PT_head_men + FT_head_men + PT_head_women + FT_head_women,
    coaches_assistant = PT_assistant_men + FT_assistant_men + PT_assistant_women + FT_assistant_women,
    coaches_FT = FT_head_men + FT_head_women + FT_assistant_men + FT_assistant_women,
    coaches_PT = PT_head_men + PT_head_women + PT_assistant_men + PT_assistant_women,
    coaches_women = FT_head_women + PT_head_women + FT_assistant_women + PT_assistant_women,
    coaches_men = FT_head_men + PT_head_men + FT_assistant_men + PT_assistant_men,
  ) |> 
  filter(ncoaches > 0)


# Merge coach data and athletes data
npeople <- full_join(teams, athletes) |> 
  mutate(player_coach_ratio = nplayers/ncoaches) |> 
  filter(!is.na(nplayers)) |> 
  select(-starts_with("coaches")) |> 
  select(-starts_with("PT"), -starts_with("FT"))
Joining with `by = join_by(school, year, division, teamgender, sport)`
# npeople_indiv <- npeople |> 
#   uncount(ncoaches) |> 
#   mutate(coachgender = ifelse(coachgender == "men", "man", "woman"))

saveRDS(npeople, file = "sports_teams.rds")
# saveRDS(npeople_indiv, file = "sports_npeople_indiv.rds")
saveRDS(npeople, file = "../slides/data/sports_teams.rds")
saveRDS(npeople, file = "../project_example/sports_teams.rds")
# Load and transform the data on average salaries for coaches on mens and womens teams

coachsalaries <- tibble()
salaryfiles <- list.files("raw/")[grepl("Salaries", list.files("raw/"))]
for(f in salaryfiles){
  coachtype <- str_extract(f, "^[[:alpha:]]*")
  tomerge <- read_csv(paste0("raw/", f), show_col_types = FALSE) |> 
    select(-contains("Coed"), -contains("Total")) |> 
    pivot_longer(names_to = c("teamgender", "population"),
                 names_pattern = "(.*)'s Team .* per (.*)",
                 values_to = "avsalary",
                 cols = c(-any_of(commoncols), -contains("Number"))) |> 
    mutate(population = ifelse(population == "FTE", paste0(coachtype, " FTE"), population))
  
  if(nrow(coachsalaries) == 0){
    coachsalaries <- tomerge
  } else {
    coachsalaries <- full_join(coachsalaries, tomerge) 
  }
}
Joining with `by = join_by(`Survey Year`, UNITID, `OPE ID`, `Institution Name`,
`State CD`, `Classification Name`, `Classification Other`, `Sanction Code`,
`Sanction Name`, `Male Undergraduates`, `Female Undergraduates`, `Men's Team
Number of FTEs Included in Average`, `Women's Team Number of FTEs Included in
Average`, teamgender, population, avsalary)`
coachsalaries <- coachsalaries |> 
    rename(men_fte_n = "Men's Team Number of FTEs Included in Average",
           women_fte_n = "Women's Team Number of FTEs Included in Average",
           men_asstcoach_n = "Men's Team Number of Assistant Coaches Included in Average",
           women_asstcoach_n = "Women's Team Number of Assistant Coaches Included in Average",
           men_headcoach_n = "Men's Team Number of Head Coaches Included in Average",
           women_headcoach_n = "Women's Team Number of Head Coaches Included in Average") |> 
    mutate(ncoaches = case_when(teamgender == "Men" & str_detect(population, "FTE") ~ men_fte_n,
                                teamgender == "Women" & str_detect(population, "FTE") ~ women_fte_n,
                                teamgender == "Men" & population == "Assistant Coach" ~ men_asstcoach_n,
                                teamgender == "Women" & population == "Assistant Coach" ~ women_asstcoach_n,
                                teamgender == "Men" & population == "Head Coach" ~ men_headcoach_n,
                                teamgender == "Women" & population == "Head Coach" ~ women_headcoach_n,
    )) |> 
  mutate(coachlevel = tolower(str_extract(population, "^[[:alpha:]]*")),
         emptype = tolower(str_extract(population, "[[:alpha:]]*$"))) |> 
  rename(
    school = "Institution Name",
    year = "Survey Year",
    maleug = "Male Undergraduates",
    femaleug = "Female Undergraduates",
    division = "Classification Name") |> 
  select(school, year, division, teamgender, coachlevel, emptype, avsalary, ncoaches) |> 
  mutate(teamgender = tolower(teamgender),
         school = ifelse(school == "Duke University", "Duke", "UNC"))

saveRDS(coachsalaries, file = "../slides/data/sports_coachsalaries.rds")
saveRDS(coachsalaries, file = "sports_coachsalaries.rds")
# Load and transform data on aid given to student athletes and recruiting expenses (whole athletic program)
aid <- tibble()
# Load and transform data on expenses and revenues (sport-specific)
money <- tibble()