Data sheets include percentages of each category in each frame. Multiply these by number of points (excluding tape, wand, and shadow) per frame to get number of points of each category.
# Find all file names ending in .xlsx
files <- list.files(path = "data/CPCe", pattern = "^[^~].*.xlsx",
recursive = TRUE, full.names = TRUE)
# Import and tidy data from each file
cpce <- data_frame(file = files) %>%
# Extract original DCA CPCe data filepath from cell B4
mutate(DCAfile = as.character(map(file, readxl::read_xlsx,
sheet = "Data Summary", range = "B4", col_names = FALSE))) %>%
# Read in data from rows 6 to 23 -- contains percent of each category for each frame
mutate(data = map(file, readxl::read_xlsx,
sheet = "Data Summary", range = cell_rows(6:23))) %>%
# Transpose data so each category is a column, each frame a row
mutate(data = map(data, ~ as_tibble(t(.)))) %>%
# Set first row of transposed data to column names
mutate(data = map(data, function(x) rename_at(x, vars(names(x)), funs(slice(x, 1))))) %>%
# Remove first row (old column names)
mutate(data = map(data, ~ slice(., -1))) %>%
# Clean column names
mutate(data = map(data, ~ janitor::clean_names(.))) %>%
# Remove rows at end that summarize frames (has NA for number_of_frames)
mutate(data = map(data, ~ filter(., !is.na(number_of_frames)))) %>%
# Remove unneeded columns
mutate(data = map(data, ~ select(., -c(1, 2, 4)))) %>%
# Convert all columns to numeric
mutate(data = map(data, ~ mutate_all(., funs(as.numeric)))) %>%
# Convert all category colummns from percent to number of points of each category
mutate(data = map(data, ~ mutate_at(., -1, funs(./100 * total_points_minus_tape_wand_shadow)))) %>%
# Round all numbers to nearest whole number (some slightly different due to DCA rounding)
mutate(data = map(data, ~ mutate_all(., funs(round)))) %>%
# Convert all columns to integer
mutate(data = map(data, ~ mutate_all(., funs(as.integer)))) %>%
# Remove first column (total number of points column) - all columns are now categories
mutate(data = map(data, ~ select(., -1)))
Use information contained within the filenames to get site and date.
# Add site, week, transect information
cpce <- cpce %>%
mutate(basename = tools::file_path_sans_ext(basename(file))) %>%
mutate(basename = gsub("CPCe.", "", basename)) %>%
mutate(basename = gsub("-RR", "", basename)) %>%
mutate(basename = gsub("-LR", "", basename)) %>%
separate(basename, into = c("site", "week", "transect"), sep = "\ |_|-") %>%
#mutate(weekn = str_extract(week, "[[:digit:]]{1,2}")) %>%
mutate(reef = str_sub(site, 1, 2),
dir = str_sub(site, 3, 3),
channel = if_else(grepl("C", site), "control", "channelside"))
# Add date information
# Use information from "POM PERM SITE INVENTORY (5 29 18).xlsx" to add date information
# These are estimates of dates... actual dates may be in "Appendix H"
# Use baseline report table 1 and "MonitoringMatrix_NEW" to get dates
cpce <- cpce %>%
mutate(date = case_when(
str_detect(week, "B") ~ case_when(site == "HBN1" ~ as_date("2013-11-07"),
site == "HBN2" ~ as_date("2013-11-07"),
site == "HBN3" ~ as_date("2013-11-07"),
site == "HBNC1" ~ as_date("2013-11-12"),
site == "HBS1" ~ as_date("2013-11-07"),
site == "HBS2" ~ as_date("2013-11-07"),
site == "HBS3" ~ as_date("2013-11-12"),
site == "HBS4" ~ as_date("2013-11-08"),
site == "HBSC1" ~ as_date("2013-11-07"),
site == "R2N1" ~ as_date("2013-11-19"),
site == "R2S1" ~ as_date("2013-11-19"),
site == "R2SC1" ~ as_date("2013-11-18"),
site == "R2NC1" ~ as_date("2013-11-24"),
site == "R2N2" ~ as_date("2013-12-15"),
site == "R2NC2" ~ as_date("2013-12-02"),
site == "R2NC3" ~ as_date("2013-12-10"),
site == "R2S2" ~ as_date("2013-12-15"),
# BL report Table 1 conflicts with Monitoring Matrix for R2SC2
# latest tagged coral data are from 2013-12-15
site == "R2SC2" ~ as_date("2013-12-15"),
# BL report Table 1 conflicts with Monitoring Matrix for R3N1
# latest tagged coral data are from 2013-12-11
site == "R3N1" ~ as_date("2013-12-11"),
site == "R3NC1" ~ as_date("2013-12-10"),
site == "R3S1" ~ as_date("2013-12-30"),
site == "R3S2" ~ as_date("2013-12-30"),
site == "R3S3" ~ as_date("2013-12-30"),
site == "R3SC1" ~ as_date("2013-12-12"),
site == "R3SC2" ~ as_date("2013-12-12"),
site == "R3SC3" ~ as_date("2013-12-12")),
# Compliance weeks are the same for all reefs, number of weeks since 2013-11-20
grepl("^C", week) ~ as.Date("2013-11-20") +
dweeks(as.numeric(str_extract(week, "[[:digit:]]{1,2}"))),
# Postconstruction weeks
week == "PC3" ~ as.Date("2015-07-01"),
# Impact assessment
week == "IA" ~ as.Date("2016-08-20") ### THESE DATES NEED CONFIRMATION
))
# Check to see if dates of cpce surveys can be found in DCA filepath
bb <- unlist(str_extract_all(cpce$DCAfile, "[0-9]+-[0-9]+-[0-9]+"))
bb
## [1] "11-19-13" "11-19-13" "11-19-13" "12-15-13" "12-15-13" "12-15-13"
# only a few files actually have dates......
Categories were not all the same for all CPCe analyses. In some of the analyses, “Sand, Pavement, Rubble (SPR)” was used instead of using separate categories for “Sand” and “Rubble”. In these cases, we have to dig deeper into the excel sheets and get the “Sand” subcategory data.
One frame (R3SC2 IA T2 frame 9) contained no data – drop from dataset.
# What files used SPR instead of sand?
spr_files <- cpce %>%
unnest(data) %>%
filter(is.na(sand_sa)) %>%
distinct(file)
# Import and tidy SPR subcategory data from each file in which SPR was used as a category
spr_subcat <- data_frame(file = spr_files$file) %>%
# Read in data from all rows
mutate(data = map(file, readxl::read_xlsx, sheet = "Data Summary", col_types = "text", col_names = FALSE)) %>%
# Get only lines for SPR subcategories
mutate(data = map(data, ~ filter(., X__1 == "Number of frames" |
X__1 == "Sand (S)" | X__1 == "Pavement (P)" | X__1 == "Rubble (R)"))) %>%
# Transpose data so each category is a column, each frame a row
mutate(data = map(data, ~ as_tibble(t(.)))) %>%
# Remove duplicate rows (excel sheet contained rows with n points and rows with percent of points)
mutate(data = map(data, ~ select(., 1, 5, 6, 7))) %>%
# Set first row of transposed data to column names
mutate(data = map(data, function(x) rename_at(x, vars(names(x)), funs(slice(x, 1))))) %>%
# Remove first row (old column names)
mutate(data = map(data, ~ slice(., -1))) %>%
# Clean column names
mutate(data = map(data, ~ janitor::clean_names(.))) %>%
# Convert all columns to integer
mutate(data = map(data, ~ mutate_all(., funs(as.integer)))) %>%
# Remove rows with NA number of frames (these were summary statistics in excel sheet)
mutate(data = map(data, ~ filter(., !is.na(number_of_frames)))) #%>%
#unnest(data_t)
# Find files in which not all SPR is sand --
pr <- spr_subcat %>%
unnest(data) %>%
group_by(file) %>%
summarise_all(sum) %>%
filter(pavement_p + rubble_r > 0)
pr
## # A tibble: 13 x 5
## file number_of_frames pavement_p rubble_r sand_s
## <chr> <int> <int> <int> <int>
## 1 data/CPCe/HBN1 BW4 T1.xlsx 56 0 1 443
## 2 data/CPCe/HBN1 BW4 T3.xlsx 71 0 2 545
## 3 data/CPCe/HBS1 C42 T3.xlsx 39 57 0 149
## 4 data/CPCe/HBS2 BW4 T1.xlsx 53 0 23 116
## 5 data/CPCe/HBS2 BW4 T2.xlsx 56 0 19 154
## 6 data/CPCe/HBS2 BW4 T3.xlsx 58 0 12 143
## 7 data/CPCe/HBS2 C11 T3.xlsx 41 0 1 162
## 8 data/CPCe/HBS3 BW4 T1.xlsx 62 0 8 73
## 9 data/CPCe/HBS3 BW4 T2.xlsx 63 0 7 13
## 10 data/CPCe/HBS3 BW4 T3.xlsx 68 0 6 59
## 11 data/CPCe/HBSC1 BW4 T3.xlsx 68 0 1 52
## 12 data/CPCe/HBSC1 C16 T1 - R… 38 0 4 19
## 13 data/CPCe/R3N1 C30 T3.xlsx 40 0 1 250
# -- for these, must use this sand subcategory instead of spr major category...sub in...
# Merge all imported CPCe data with the sand subcategory data for those datafiles containing pavement or rubble points
bb <- right_join(cpce, spr_subcat, by = "file")
# Replace the sand_sa column (major category sand) with the sand_s subcategory for those files where necessary
bb <- bb %>%
mutate(data_sub = case_when(
# when the file did have pavement and rubble, make sand and rubble the subcategory amounts
file %in% pr$file ~ map2(data.x, data.y, ~ mutate(.x, sand_sa = .y$sand_s, rubble_r = .y$rubble_r)),
# when the file did not have pavement and rubble, all spr is sand, and rubble is zero
TRUE ~ map(data.x, ~ mutate(.x, sand_sa = sand_pavement_rubble_spr, rubble_r = 0))))
# Replace original dataset with modified dataset containing sand subcategory data substituted in
cpce_sub <- left_join(cpce, bb) %>%
mutate(data_final = case_when(file %in% bb$file ~ data_sub, TRUE ~ data)) %>%
unnest(data_final) %>%
select(-sand_pavement_rubble_spr, -tape_wand_shadow_tws) %>%
drop_na() # one missing value.. drop
save(cpce_sub, file = "data/processed/cpce.RData")
cpce <- cpce_sub
# Date range for CPCe data
range(cpce$date)
## [1] "2013-11-07" "2016-08-20"
# Group by transect and count number of frames in each transect
transects <- cpce %>%
group_by(site, week, transect, date) %>%
count()
# Number of transects
nrow(transects)
## [1] 1772
# Number of still frames analyzed in each transect
## Range
range(transects$n)
## [1] 27 80
## Median
median(transects$n)
## [1] 40
# Number of points analyzed in each still frame
points <- cpce %>%
mutate(sumpts = rowSums(select_if(., is.integer))) %>%
select(site, transect, date, sumpts)
## Range
range(points$sumpts)
## [1] 1 10
## Median
median(points$sumpts)
## [1] 8