Import data

Data sheets include percentages of each category in each frame. Multiply these by number of points (excluding tape, wand, and shadow) per frame to get number of points of each category.

# Find all file names ending in .xlsx 
files <- list.files(path = "data/CPCe", pattern = "^[^~].*.xlsx", 
                    recursive = TRUE, full.names = TRUE)

# Import and tidy data from each file
cpce <- data_frame(file = files) %>%
  # Extract original DCA CPCe data filepath from cell B4
  mutate(DCAfile = as.character(map(file, readxl::read_xlsx,      
                   sheet = "Data Summary", range = "B4", col_names = FALSE))) %>%
  # Read in data from rows 6 to 23 -- contains percent of each category for each frame
  mutate(data = map(file, readxl::read_xlsx, 
                    sheet = "Data Summary", range = cell_rows(6:23))) %>%
  # Transpose data so each category is a column, each frame a row
  mutate(data = map(data, ~ as_tibble(t(.)))) %>%
  # Set first row of transposed data to column names
  mutate(data = map(data, function(x) rename_at(x, vars(names(x)), funs(slice(x, 1))))) %>%
  # Remove first row (old column names)
  mutate(data = map(data, ~ slice(., -1))) %>%
  # Clean column names
  mutate(data = map(data, ~ janitor::clean_names(.))) %>%
  # Remove rows at end that summarize frames (has NA for number_of_frames)
  mutate(data = map(data, ~ filter(., !is.na(number_of_frames)))) %>%
  # Remove unneeded columns
  mutate(data = map(data, ~ select(., -c(1, 2, 4)))) %>%
  # Convert all columns to numeric
  mutate(data = map(data, ~ mutate_all(., funs(as.numeric)))) %>%
  # Convert all category colummns from percent to number of points of each category
  mutate(data = map(data, ~ mutate_at(., -1, funs(./100 * total_points_minus_tape_wand_shadow)))) %>%
  # Round all numbers to nearest whole number (some slightly different due to DCA rounding)
  mutate(data = map(data, ~ mutate_all(., funs(round)))) %>%
  # Convert all columns to integer
  mutate(data = map(data, ~ mutate_all(., funs(as.integer)))) %>%
  # Remove first column (total number of points column) - all columns are now categories
  mutate(data = map(data, ~ select(., -1)))

Add additional metadata (site and date)

Use information contained within the filenames to get site and date.

# Add site, week, transect information
cpce <- cpce %>%
  mutate(basename = tools::file_path_sans_ext(basename(file))) %>%
  mutate(basename = gsub("CPCe.", "", basename)) %>%
  mutate(basename = gsub("-RR", "", basename)) %>%
  mutate(basename = gsub("-LR", "", basename)) %>%
  separate(basename, into = c("site", "week", "transect"), sep = "\ |_|-") %>%
  #mutate(weekn = str_extract(week, "[[:digit:]]{1,2}")) %>%
  mutate(reef = str_sub(site, 1, 2),
         dir  = str_sub(site, 3, 3),
         channel = if_else(grepl("C", site), "control", "channelside"))

# Add date information
# Use information from "POM PERM SITE INVENTORY (5 29 18).xlsx" to add date information
# These are estimates of dates... actual dates may be in "Appendix H"
# Use baseline report table 1 and "MonitoringMatrix_NEW" to get dates
cpce <- cpce %>%
  mutate(date = case_when(
    str_detect(week, "B") ~ case_when(site == "HBN1" ~ as_date("2013-11-07"),
                                      site == "HBN2" ~ as_date("2013-11-07"),
                                      site == "HBN3" ~ as_date("2013-11-07"),
                                      site == "HBNC1" ~ as_date("2013-11-12"),
                                      site == "HBS1" ~ as_date("2013-11-07"),
                                      site == "HBS2" ~ as_date("2013-11-07"),
                                      site == "HBS3" ~ as_date("2013-11-12"),
                                      site == "HBS4" ~ as_date("2013-11-08"),
                                      site == "HBSC1" ~ as_date("2013-11-07"),
                                      site == "R2N1" ~ as_date("2013-11-19"),
                                      site == "R2S1" ~ as_date("2013-11-19"),
                                      site == "R2SC1" ~ as_date("2013-11-18"),
                                      site == "R2NC1" ~ as_date("2013-11-24"),
                                      site == "R2N2" ~ as_date("2013-12-15"),
                                      site == "R2NC2" ~ as_date("2013-12-02"),
                                      site == "R2NC3" ~ as_date("2013-12-10"),
                                      site == "R2S2" ~ as_date("2013-12-15"),
                                      # BL report Table 1 conflicts with Monitoring Matrix for R2SC2
                                      # latest tagged coral data are from 2013-12-15
                                      site == "R2SC2" ~ as_date("2013-12-15"),
                                      # BL report Table 1 conflicts with Monitoring Matrix for R3N1
                                      # latest tagged coral data are from 2013-12-11
                                      site == "R3N1" ~ as_date("2013-12-11"),
                                      site == "R3NC1" ~ as_date("2013-12-10"),
                                      site == "R3S1" ~ as_date("2013-12-30"),
                                      site == "R3S2" ~ as_date("2013-12-30"),
                                      site == "R3S3" ~ as_date("2013-12-30"),
                                      site == "R3SC1" ~ as_date("2013-12-12"),
                                      site == "R3SC2" ~ as_date("2013-12-12"),
                                      site == "R3SC3" ~ as_date("2013-12-12")),
    # Compliance weeks are the same for all reefs, number of weeks since 2013-11-20
    grepl("^C", week) ~ as.Date("2013-11-20") + 
      dweeks(as.numeric(str_extract(week, "[[:digit:]]{1,2}"))),
    # Postconstruction weeks
    week == "PC3" ~ as.Date("2015-07-01"),
    # Impact assessment
    week == "IA" ~ as.Date("2016-08-20")     ### THESE DATES NEED CONFIRMATION
  ))

# Check to see if dates of cpce surveys can be found in DCA filepath
bb <- unlist(str_extract_all(cpce$DCAfile, "[0-9]+-[0-9]+-[0-9]+"))
bb
## [1] "11-19-13" "11-19-13" "11-19-13" "12-15-13" "12-15-13" "12-15-13"
# only a few files actually have dates......

Additional QC

Categories were not all the same for all CPCe analyses. In some of the analyses, “Sand, Pavement, Rubble (SPR)” was used instead of using separate categories for “Sand” and “Rubble”. In these cases, we have to dig deeper into the excel sheets and get the “Sand” subcategory data.

One frame (R3SC2 IA T2 frame 9) contained no data – drop from dataset.

# What files used SPR instead of sand?
spr_files <- cpce %>%
  unnest(data) %>%
  filter(is.na(sand_sa)) %>%
  distinct(file)

# Import and tidy SPR subcategory data from each file in which SPR was used as a category
spr_subcat <- data_frame(file = spr_files$file) %>%
  # Read in data from all rows
  mutate(data = map(file, readxl::read_xlsx,  sheet = "Data Summary", col_types = "text", col_names = FALSE)) %>%
  # Get only lines for SPR subcategories
  mutate(data = map(data, ~ filter(., X__1 == "Number of frames" | 
                                      X__1 == "Sand (S)" | X__1 == "Pavement (P)" | X__1 == "Rubble (R)"))) %>%
  # Transpose data so each category is a column, each frame a row
  mutate(data = map(data, ~ as_tibble(t(.)))) %>%
  # Remove duplicate rows (excel sheet contained rows with n points and rows with percent of points)
  mutate(data = map(data, ~ select(., 1, 5, 6, 7))) %>%
  # Set first row of transposed data to column names
  mutate(data = map(data, function(x) rename_at(x, vars(names(x)), funs(slice(x, 1))))) %>%
  # Remove first row (old column names)
  mutate(data = map(data, ~ slice(., -1))) %>%
  # Clean column names
  mutate(data = map(data, ~ janitor::clean_names(.))) %>%
  # Convert all columns to integer
  mutate(data = map(data, ~ mutate_all(., funs(as.integer)))) %>%
  # Remove rows with NA number of frames (these were summary statistics in excel sheet) 
  mutate(data = map(data, ~ filter(., !is.na(number_of_frames)))) #%>%
  #unnest(data_t)

# Find files in which not all SPR is sand -- 
pr <- spr_subcat %>%
  unnest(data) %>%
  group_by(file) %>%
  summarise_all(sum) %>%
  filter(pavement_p + rubble_r > 0)
pr
## # A tibble: 13 x 5
##    file                        number_of_frames pavement_p rubble_r sand_s
##    <chr>                                  <int>      <int>    <int>  <int>
##  1 data/CPCe/HBN1 BW4 T1.xlsx                56          0        1    443
##  2 data/CPCe/HBN1 BW4 T3.xlsx                71          0        2    545
##  3 data/CPCe/HBS1 C42 T3.xlsx                39         57        0    149
##  4 data/CPCe/HBS2 BW4 T1.xlsx                53          0       23    116
##  5 data/CPCe/HBS2 BW4 T2.xlsx                56          0       19    154
##  6 data/CPCe/HBS2 BW4 T3.xlsx                58          0       12    143
##  7 data/CPCe/HBS2 C11 T3.xlsx                41          0        1    162
##  8 data/CPCe/HBS3 BW4 T1.xlsx                62          0        8     73
##  9 data/CPCe/HBS3 BW4 T2.xlsx                63          0        7     13
## 10 data/CPCe/HBS3 BW4 T3.xlsx                68          0        6     59
## 11 data/CPCe/HBSC1 BW4 T3.xlsx               68          0        1     52
## 12 data/CPCe/HBSC1 C16 T1 - R…               38          0        4     19
## 13 data/CPCe/R3N1 C30 T3.xlsx                40          0        1    250
# -- for these, must use this sand subcategory instead of spr major category...sub in... 

# Merge all imported CPCe data with the sand subcategory data for those datafiles containing pavement or rubble points
bb <- right_join(cpce, spr_subcat, by = "file")

# Replace the sand_sa column (major category sand) with the sand_s subcategory for those files where necessary
bb <- bb %>%
  mutate(data_sub = case_when(
    # when the file did have pavement and rubble, make sand and rubble the subcategory amounts
    file %in% pr$file ~ map2(data.x, data.y, ~ mutate(.x, sand_sa = .y$sand_s, rubble_r = .y$rubble_r)),
    # when the file did not have pavement and rubble, all spr is sand, and rubble is zero
                 TRUE ~ map(data.x, ~ mutate(.x, sand_sa = sand_pavement_rubble_spr, rubble_r = 0))))

# Replace original dataset with modified dataset containing sand subcategory data substituted in
cpce_sub <- left_join(cpce, bb) %>%
  mutate(data_final = case_when(file %in% bb$file ~ data_sub, TRUE ~ data)) %>%
  unnest(data_final) %>%
  select(-sand_pavement_rubble_spr, -tape_wand_shadow_tws) %>%
  drop_na()   # one missing value.. drop

Save CPCe data as .RData object

save(cpce_sub, file = "data/processed/cpce.RData")

Dataset metrics

cpce <- cpce_sub

# Date range for CPCe data
range(cpce$date)
## [1] "2013-11-07" "2016-08-20"
# Group by transect and count number of frames in each transect
transects <- cpce %>%
  group_by(site, week, transect, date) %>%
  count()

# Number of transects
nrow(transects)
## [1] 1772
# Number of still frames analyzed in each transect
## Range
range(transects$n)
## [1] 27 80
## Median
median(transects$n)
## [1] 40
# Number of points analyzed in each still frame
points <- cpce %>%
  mutate(sumpts = rowSums(select_if(., is.integer))) %>%
  select(site, transect, date, sumpts)
## Range
range(points$sumpts)
## [1]  1 10
## Median
median(points$sumpts)
## [1] 8