Botany 2025

00-Setup

Purpose:

Install all R packages required for ENM workflows
Check that all packages load successfully
Install any needed GitHub packages

Install CRAN Packages

## ---- Define CRAN Packages ----

list_of_packages <- c(
  # Core ENM tools
  "terra", "dismo", "ENMeval", "ENMTools", "biomod2",

  # Spatial / Mapping
  "sf", "rnaturalearth", "ggspatial", "leaflet",
  "fields", "rangeBuilder",

  # Visualization
  "ggplot2", "viridis", "gridExtra",

  # Data manipulation
  "dplyr", "tidyr", "stringr", "gtools",

  # Phylogenetics
  "ape", "phytools",

  # Biodiversity data
  "ridigbio", "gatoRs",

  # Other utilities
  "multcompView", "usdm", "predicts", "rJava"
)

Check for missing CRAN Packages

## ---- Install Missing CRAN Packages ----

# Identify any packages from the list not currently installed.
new.packages <- list_of_packages[!(list_of_packages %in% installed.packages()[,"Package"])]

if (length(new.packages)) {
  cat("\nInstalling missing CRAN packages:\n")
  print(new.packages)
  install.packages(new.packages)
} else {
  cat("\nAll required CRAN packages already installed.\n")
}

## ---- Load All CRAN Packages ----

# Try loading all required CRAN packages.
loaded <- sapply(list_of_packages, require, character.only = TRUE)

if (any(!loaded)) {
  cat("\nWARNING: These CRAN packages failed to load:\n")
  print(list_of_packages[!loaded])
} else {
  cat("\nAll CRAN packages loaded successfully.\n")
}

Install Github Packages

## ---- Install GitHub Packages ----

# devtools is required for installing from GitHub.
if (!require(devtools)) {
  install.packages("devtools")
  library(devtools)
}

# List of GitHub repositories for required packages.
github_repos <- c(
  "vqv/ggbiplot"
)

# Check whether these GitHub packages are installed, and install if not.
for (repo in github_repos) {
  pkg <- tail(strsplit(repo, "/")[[1]], 1)
  if (!pkg %in% installed.packages()[,"Package"]) {
    cat(paste("\nInstalling GitHub package:", repo, "\n"))
    devtools::install_github(repo, upgrade = "never")
  } else {
    cat(paste("\nGitHub package already installed:", pkg, "\n"))
  }
}

# Try loading GitHub packages.
github_packages <- c("ggbiplot")
github_loaded <- sapply(github_packages, require, character.only = TRUE)

if (any(!github_loaded)) {
  cat("\nWARNING: These GitHub packages failed to load:\n")
  print(github_packages[!github_loaded])
} else {
  cat("\nAll GitHub packages loaded successfully.\n")
}

Troubleshooting

This exercise requires the package rJava. This package requires Java (and the Oracle Java JDK) is installed on your system.

Step 1: Do you have Java installed?

system("java -version")

If yes, is it version 17 or 20? CRAN currently recommends that you use these version.

Step 2: Install Java

CRAN says the easiest way to install Java with Adoptium and into /usr/bin/java (on a Mac). Alternatively, other versions of OpenJDK can be found here and here. More information can be found on CRAN here.

For rJava, the newest package requires R >= 3.6.0 and can be downloaded here. Note: We found that homebrew was the best way to install rJava on Macs M1-M4 Chip during the 2025 workshop.

Step 3 and beyond: Debugging options that have worked in the past!

If you are experiencing errors with rJava:

## Remove rJava
remove.packages(rJava)
## Update your R Java configuration 
system("sudo R CMD javareconf")
## Restart R and check to see if the path matches
Sys.getenv("DYLD_FALLBACK_LIBRARY_PATH")
## Reinstall rJava
install.packages("rJava")
library(rJava)

If you are still having issues with rJava - check to see if you are using Java 12 (this should be included in the error message):

system("java -version")

The initial release of java 12 does not work - install an old version instead. To install an old version, navigate to the Oracle website or use homebrew.

system("brew cask install homebrew/cask-versions/java11")

Restart R and redo the previous steps. Required: uninstall the newer versions of java prior to repeating and restart R after.

## Uninstall Java 12, then repeat the following
## Remove rJava
remove.packages(rJava)
## Update your R Java configuration 
system("sudo R CMD javareconf")
## Restart R and check to see if the path matches
Sys.getenv("DYLD_FALLBACK_LIBRARY_PATH")
## Reinstall rJava
install.packages("rJava")
library(rJava)

More debugging: Check out jerrybreak’s comment from 12/30/2021.

01-Data_Download

Purpose:

Download and preview species occurrence records using iDigBio and gatoRs

Set-up

Load Required Packages

library(ridigbio)     # Interface to iDigBio API
library(gatoRs)       # Unified taxonomic/occurrence tools
library(leaflet)      # Interactive mapping

Download from iDigBio

Search the iDigBio API for occurrence record by taxonomic designations.

# Search for specific species (Galax urceolata)
iDigBio_GU <- idig_search_records(rq = list(scientificname = "Galax urceolata"))
# Search for all Diapensiaceae occurrences (limit to 1000 for example)
iDigBio_GU_family <- idig_search_records(rq = list(family = "Diapensiaceae"), limit = 1000)

Search the API with a geographic bounding box (e.g. Eastern USA extent)

# format a request in the structure of a list
rq_input <- list(scientificname = list(type = "exists"),
                 family = "Diapensiaceae",
                 geopoint = list(type = "geo_bounding_box",
                                  top_left = list(lon = -98.16, 
                                                  lat = 48.92),
                                  bottom_right = list(lon = -64.02, 
                                                      lat = 23.06)
                  )
                )
# input request into idigbio search fxn
iDigBio_GU_family_USA <- idig_search_records(rq_input, limit = 1000)

Save results as a Comma Separated File (.csv)

write.csv(iDigBio_GU, 
          file = "data/01_download/iDigBio_GU_2025_06_27.csv",
          row.names = FALSE)
write.csv(iDigBio_GU_family, 
          file = "data/01_download/iDigBio_GU_family_2025_06_27.csv", 
          row.names = FALSE)

Download Using gatoRs

Create a synonym vector for species to search by.

Shortia_galacifolia <- c("Shortia galacifolia", 
                         "Sherwoodia galacifolia")
Galax_urceolata <- c("Galax urceolata", 
                     "Galax aphylla")
Pyxidanthera_barbulata <- c("Pyxidanthera barbulata", 
                            "Pyxidanthera barbulata var. barbulata")
Pyxidanthera_brevifolia <- c("Pyxidanthera brevifolia", 
                             "Pyxidanthera barbulata var. brevifolia")

Use gatoRs to download species occurrence data for each name vector, searching both iDigBio & GBIF. Allows us to retrieve all versions of a name as these data aggregators have their own taxonomic backbone for organizing occurrence data downloads.

gators_download(synonyms.list = Shortia_galacifolia,
                write.file = TRUE,
                filename = "./data/01_download/raw/Shortia_galacifolia_raw_2025_06_27.csv")

gators_download(synonyms.list = Galax_urceolata,
                write.file = TRUE,
                filename = "./data/01_download/raw/Galax_urceolata_raw_2025_06_27.csv")

gators_download(synonyms.list = Pyxidanthera_barbulata,
                write.file = TRUE,
                filename = "./data/01_download/raw/Pyxidanthera_barbulata_raw_2025_06_27.csv")

gators_download(synonyms.list = Pyxidanthera_brevifolia,
                write.file = TRUE,
                filename = "./data/01_download/raw/Pyxidanthera_brevifolia_raw_2025_06_27.csv")

Preview Downloaded Files

Read in Shortia galacifolia’s data we downloaded with gatoRs above.

rawdf <- read.csv("data/01_download/raw/Shortia_galacifolia_raw_2025_06_27.csv")

View number of columns/fields in the dataset, also return the number of rows (individual occurrence records)

names(rawdf) # returns field headers

##  [1] "scientificName"                "genus"                        
##  [3] "specificEpithet"               "infraspecificEpithet"         
##  [5] "ID"                            "occurrenceID"                 
##  [7] "basisOfRecord"                 "eventDate"                    
##  [9] "year"                          "month"                        
## [11] "day"                           "institutionCode"              
## [13] "recordedBy"                    "country"                      
## [15] "county"                        "stateProvince"                
## [17] "locality"                      "latitude"                     
## [19] "longitude"                     "coordinateUncertaintyInMeters"
## [21] "informationWithheld"           "habitat"                      
## [23] "aggregator"

nrow(rawdf) # returns the number of rows = occurrence records

## [1] 1534

Use Leaflet to visualize records interactively - The error message here indicates many points do not have long/lat values

leaflet(rawdf) %>% 
  addTiles() %>% 
  addMarkers(label = paste0(rawdf$longitude,
                            ", ",
                            rawdf$latitude))

02-Data_Cleaning

Purpose:

Clean and harmonize occurrence records for ecological niche modeling.

Set-up

Load Required Packages

library(gatoRs)         # Taxonomic and geographic cleaning
library(fields)         # For calculating spatial distances
library(sf)             # For spatial operations and visualization
library(ggplot2)        # For plotting
library(ggspatial)      # Scale bars and north arrows
library(leaflet)        # For interactive mapping

Read Raw Occurrence Data

Read in raw species-specific data downloaded in 01 Data Downloads. Check how many occurrence records we start with in the raw data.

rawdf <- read.csv("data/01_download/raw/Shortia_galacifolia_raw_2025_06_27.csv")
nrow(rawdf)  # Starting number of records

## [1] 1534

Taxonomic Harmonization

Our initial download includes iDigBio & GBIF’s synonym lists in addition to the name vectors we searched. Let’s inspect and clean to our taxonomic standard. Note that taxa_clean() contains the argument taxa.filter which allow the options exact, fuzzy or interactive to allow us to tighten or loosen the requirements for name matching. Read more by running ?taxa_clean().

# Inspect unique names
unique(rawdf$scientificName)

## [1] "Shortia galacifolia Torr. & A.Gray"              
## [2] "Sherwoodia galacifolia (Torr. & A.Gray) House"   
## [3] "Shortia galacifolia"                             
## [4] "Shortia galacifolia Torr. & A. Gray"             
## [5] "Shortia galacifolia var. galacifolia"            
## [6] "Shortia galacifolia Torrey & A. Gray"            
## [7] "Shortia galacifolia var. brevistyla"             
## [8] "Shortia galacifolia var. brevistyla P. A. Davies"
## [9] "Sherwoodia galacifolia"

# Define our synonym list
search <- c("Shortia galacifolia", "Sherwoodia galacifolia")
# Clean names with fuzzy matching
df <- taxa_clean(df = rawdf,
                 synonyms.list = search,
                 taxa.filter = "fuzzy", # fuzzy allows some leeway over exact matching
                 accepted.name = "Shortia galacifolia")

## Current scientific names:

## [1] "Shortia galacifolia Torr. & A.Gray"              
## [2] "Sherwoodia galacifolia (Torr. & A.Gray) House"   
## [3] "Shortia galacifolia"                             
## [4] "Shortia galacifolia Torr. & A. Gray"             
## [5] "Shortia galacifolia var. galacifolia"            
## [6] "Shortia galacifolia Torrey & A. Gray"            
## [7] "Shortia galacifolia var. brevistyla"             
## [8] "Shortia galacifolia var. brevistyla P. A. Davies"
## [9] "Sherwoodia galacifolia"

## User selected a(n) fuzzy match.

nrow(df)  # Updated count

## [1] 1534

Locality Cleaning

Here we remove any records with missing coordinates, impossible coordinates, coordinates at (0,0), and any that are flagged as skewed. We also round the provided latitude and longitude values to a specified number of decimal places.

# Remove invalid/skewed records and round coordinates
df <- basic_locality_clean(df = df,
                           remove.zero = TRUE,
                           precision = TRUE,
                           digits = 2,
                           remove.skewed = TRUE)
nrow(df)

## [1] 77

Flag and Filter Cultivated Records

Remove coordinates in cultivated zones, botanical gardens, and outside our desired range. Here we set interactive = FALSE, however you can visualize the outliers by setting interactive = TRUE.

df <- process_flagged(df, 
                      interactive = FALSE, 
                      scientific.name = "accepted_name")

## Reading layer `ne_50m_land' from data source 
##   `/private/var/folders/98/rnr3bm497gn395gzqtwk917w0000gn/T/Rtmpcg2m3K/ne_50m_land.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1420 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -89.99893 xmax: 180 ymax: 83.59961
## Geodetic CRS:  WGS 84

nrow(df)

## [1] 69

Duplicate Removal

Here we identify and remove both (1) specimen duplicates and (2) aggregator duplicates based on each specimens coordinates, occurrenceID, and eventDate. To leverage all date information available, set remove.unparseable = FALSE to manually populate the year, month, and day columns. Here, we also confirm all ID (UUID and key) are unique to remove any within-aggregator duplicates that may accumulate due to processing errors.

# Remove duplicate specimens and aggregator artifacts
df <- remove_duplicates(df, 
                        remove.unparseable = TRUE)
nrow(df)

## [1] 56

Spatial Deduplication

Maxent will only retain one point per pixel. To make the ecological niche analysis comparable, we will retain only one point per pixel. Note: Default is a raster with 30 arc sec resolution.

# Retain one point per raster pixel (30 arc-sec default)
df <- one_point_per_pixel(df)
nrow(df)

## [1] 27

Spatial Thinning

Oversampled regions/habitats can bias our model. To reduce bias, we can thin occurrence data.

Step 1: What should your minimum distance be?

First, lets assess the current minimum distance among your occurrence records. Here, we calculate the minimum nearest neighbor distance in km.

# Calculate minimum nearest-neighbor distance
nnDm <- rdist.earth(as.matrix(df[, c("longitude", "latitude")]),
                    miles = FALSE)
nnDmin <- do.call(rbind, 
                  lapply(1:5, function(i) sort(nnDm[, i])[2]))
min(nnDmin)  # e.g., 2.22 km

## [1] 2.226477

Here the current minimum distance is 2.22 km. Based on literature, we find a 2 meters (or 0.002 km) distance was enough to collect unique genets, so we do not need to thin our points.

Step 2: Thin occurrence records using spThin through gatoRs

However when you do need to thin your records, this is a great function in gatoRs to do so!

# thin if you need
df <- thin_points(df, 
                  distance = 0.002, # in km 
                  reps = 100)

Static Map of Cleaned Points

Make points spatial

df_fixed <- st_as_sf(df, 
                     coords = c("longitude", "latitude"), 
                     crs = 4326)

Set up base maps

USA <- borders(database = "usa", 
               colour = "gray80", 
               fill = "gray80")
state <- borders(database = "state", 
                 colour = "black", 
                 fill = NA)

Plot: Use ggplot2 & ggspatial to plot occurrence records. We are using two base maps, USA and State. The geom_sf() fxn adds our cleaned occurrence points as blue based on their lon & lat values. We can also fix coordinate spatial view through coord_sf(). Finally can add a scale and arrow.

simple_map <- ggplot() +
                USA + 
                state +
                geom_sf(data = df_fixed, 
                        color = "blue") +
                coord_sf(xlim = c(min(df$longitude) - 3, 
                                  max(df$longitude) + 3),
                         ylim = c(min(df$latitude) - 3,
                                  max(df$latitude) + 3)) +
                xlab("Longitude") + 
                ylab("Latitude") +
                annotation_scale() +
                annotation_north_arrow(location = "tl", 
                                       height = unit(1, "cm"),
                                       width = unit(1, "cm"))

simple_map

## Scale on map varies by more than 10%, scale bar may be inaccurate

Interactive Map with Leaflet

Another fun way to visualize occurrence data is using leaflet’s interactive maps.

leaflet(df_fixed) %>%
  addMarkers(label = paste0(df$longitude, ", ", df$latitude)) %>%
  addTiles()

Save Cleaned CSV

write.csv(df, "data/02_cleaning/Shortia_galacifolia_2025_06_27_cleaned.csv", row.names = FALSE)

Batch Clean for All Species

Use a for-loop to iterate through all of the species.

files <- list.files("data/01_download/raw", full.names = TRUE)[1:3]
synonymns <- list(
  Galax_urceolata = c("Galax urceolata", 
                      "Galax urceolata (Poir.) Brummitt", 
                      "Galax urceolata (Poiret) Brummitt", 
                      "Galax urceolaa", 
                      "Galax aphylla L.", 
                      "Galax aphylla"),
  Pyxidanthera_barbulata = c("Pyxidanthera barbulata", 
                             "Pyxidanthera barbulata Michx.",
                             "Pyxidanthera barbulata var. barbulata",
                             "Pyxidenthera barbulata"),
  Pyxidanthera_brevifolia = c("Pyxidanthera brevifolia", 
                              "Pyxidanthera brevifolia Wells",
                              "Pyxidanthera barbulata var. brevifolia (Wells) H.E.Ahles",
                              "Pyxidanthera barbulata var. brevifolia")
)

for (i in 1:3) { # for loop just allows us to iterate through each species
  df <- read.csv(files[3])
  search <- synonymns[[3]]

  df <- taxa_clean(df,
                   synonyms.list = search, 
                   taxa.filter = "fuzzy",
                   accepted.name = search[1])
  df <- basic_locality_clean(df, 
                             remove.zero = TRUE, 
                             precision = TRUE, 
                             digits = 2, 
                             remove.skewed = TRUE)
  df <- process_flagged(df, 
                        interactive = FALSE, 
                        scientific.name = "accepted_name") # removing all points for brevifolia
  df <- remove_duplicates(df, 
                          remove.unparseable = TRUE)
  df <- one_point_per_pixel(df)
  df <- thin_points(df, 
                    distance = 0.002, 
                    reps = 100)

  outfile <- paste0("data/02_cleaning/", gsub(" ", "_", search[1]), "_2025_06_27_cleaned.csv")
  write.csv(df, 
            outfile, 
            row.names = FALSE)
  rm(df, search, outfile)
}

Merge Cleaned Files for Maxent

alldf <- list.files("data/02_cleaning", 
                    pattern = "*.csv", 
                    full.names = TRUE)
alldf <- lapply(alldf, read.csv)
alldf <- do.call(rbind, alldf)
write.csv(alldf, "data/02_cleaning/maxent_ready/diapensiaceae_maxentready_2025_06_27.csv", row.names = FALSE)

Map All Records

alldf_fixed <- st_as_sf(alldf, coords = c("longitude", "latitude"), crs = 4326)

all_map <- ggplot() +
  USA + state +
  geom_sf(data = alldf_fixed, aes(col = factor(accepted_name))) +
  coord_sf(xlim = c(min(alldf$longitude) - 3, max(alldf$longitude) + 3),
           ylim = c(min(alldf$latitude) - 3, max(alldf$latitude) + 3)) +
  xlab("Longitude") + ylab("Latitude") +
  labs(color = "Scientific name") +
  annotation_scale() +
  annotation_north_arrow(location = "tl", height = unit(1, "cm"), width = unit(1, "cm"))

all_map

## Scale on map varies by more than 10%, scale bar may be inaccurate

Prepare GeoLocate Batch File

Some records do not contain lat/lon info, but do have useful information in the locality field that can give us a reasonable estimate of location + some error. Here we’ll use the gatoRs fxn need_to_georeference() to isolate these records for batch processing on the georeferencing platform GeoLocate.

Read in raw data and check what needs to be georeferenced

rawdf <- read.csv("data/01_download/raw/Shortia_galacifolia_raw_2025_06_27.csv")
rawdf_GeoRef <- need_to_georeference(rawdf)

Format the columns/fields for GeoLocate’s standards, then write out for batch processing.

# Format columns for GeoLocate submission
rawdf_GeoRef <- rawdf_GeoRef %>%
  dplyr::select("locality string" = locality,
                country,
                state = stateProvince,
                county,
                latitude,
                longitude,
                ID,
                name = scientificName,
                basis = basisOfRecord)

# Add required empty columns
rawdf_GeoRef$'correction status' <- ""
rawdf_GeoRef$precision <- ""
rawdf_GeoRef$'error polygon' <- ""
rawdf_GeoRef$'multiple results' <- ""

# Reorder columns
rawdf_GeoRef2 <- rawdf_GeoRef[, c("locality string", "country", 
                                  "state", "county", "latitude", 
                                  "longitude","correction status", 
                                  "precision", "error polygon", 
                                  "multiple results","ID", 
                                  "name", "basis")]

# write out csv
write.csv(rawdf_GeoRef2, 
          "data/03_georeferencing/Shortia_galacifolia_Needing_GeoRef_2025_06_27.csv", 
          row.names = FALSE)

03-Data_Exploration

Purpose:

Analyze climatic variables associated with species occurrences using PCA and ANOVA.
plot top contributors to PC1 and PC2.

Set-up

Load Required Packages

library(gtools)         # For mixedsort of file names
library(terra)          # For raster and spatial extraction
library(dplyr)          # For data wrangling
library(tidyr)          # For handling NAs and reshaping
library(ggplot2)        # For plotting
library(ggbiplot)       # For PCA biplot visualization
library(multcompView)   # For compact letter display (Tukey HSD)
library(gridExtra)      # For combining plots

Load Cleaned Data

# Read in cleaned occurrence data with geographic coordinates and species ID
alldf <- read.csv("data/02_cleaning/maxent_ready/diapensiaceae_maxentready_2025_06_27.csv")

Load and Stack Climatic Variables from WorldClim

Raster files provided by WorldClim

# List and sort all .tif files in the WorldClim directory
list <- list.files("data/04_climate_processing/BioClim/", 
                   full.names = TRUE)
list <- gtools::mixedsort(sort(list))

# Stack the raster files
envtStack <- terra::rast(list)

Extract Climatic Variables at Occurrence Points

Extract climatic variable from raster cell where occurrence pts sit

# Extract values of environmental rasters at occurrence points
ptExtracted <- terra::extract(envtStack,
                              alldf[, c("longitude", "latitude")])

Organize these data to have species information

# Combine with species names and coordinates
ptExtracteddf <- ptExtracted %>%
  dplyr::mutate(name = as.character(alldf$accepted_name), 
                x = alldf$longitude, 
                y = alldf$latitude)

Drop any points that does not contain climatic data

# Remove any rows with missing climate values
ptExtracteddf <- ptExtracteddf %>% tidyr::drop_na()

Perform PCA on Climatic Variables

Subset the climatic variables, assumed to be columns 2 to 21.

data.bioclim <- ptExtracteddf[, 2:21]
data.species <- ptExtracteddf[, "name"]

Scale and run a Principal Component Analysis (PCA) on climatic variables. A PCA reduces the dimensionality of the climatic data to make it easier to identify broad patterns and which variables are most influential. For our data, it shows how species cluster & differentiate according to the climatic variables associated with their spatial presence.

When you use the command prcomp your loading variables show up as rotational variables.

pca_result <- prcomp(data.bioclim, scale. = TRUE)

Extract variable loadings: saving a matrix of ceofficients (called loadings) for each of the original climatic variable on each PC (principal component). These loadings show how much each original variable contributes to each PC. Variables with larger (pos or neg) loadings are the biggest drivers of variation in the data.

loadings <- pca_result$rotation

Calculate relative contributions of each variable to each PC

loadings_relative <- sweep(abs(loadings), 2, colSums(abs(loadings)), "/") * 100

Visualize PCA

Set up a nice custom theme for plotting aesthetics

# Customize theme
custom_theme <- theme(panel.background = element_blank(),
                      panel.border = element_rect(fill = NA),
                      panel.grid.major = element_blank(),
                      panel.grid.minor = element_blank(),
                      strip.background = element_blank(),
                      axis.ticks = element_line(colour = "black"),
                      plot.margin = unit(c(1, 1, 1, 1), "line"),
                      axis.text = element_text(size = 12),
                      legend.text = element_text(size = 12),
                      legend.title = element_text(size = 12),
                      text = element_text(size = 12))

# Define color palette for species
pal <- c("#D43F3AFF", "#EEA236FF", "#5CB85CFF", "#46B8DAFF")

Create a species biplot. obs.scale indicates the scale factor to apply to observation, var.scale indicates the scale factor to apply to variables, ellipse as TRUE draws a normal data ellipse for each group, and circle as TRUE draws a correlation circle.

This is all basically to show how our data clusters across the top 2 PCs per species!

# PCA Biplot
pca_plot <- ggbiplot(pca_result, 
                     obs.scale = 1, 
                     var.scale = 1,
                     groups = data.species,
                     ellipse = TRUE, 
                     circle = TRUE) +
            scale_color_manual(name = '',
                               values = pal) +
            theme(legend.direction = 'vertical', 
                  legend.position = 'bottom',
                  legend.text = element_text(size = 12, face = "italic")) +
            custom_theme

print(pca_plot)

Identify Top Contributors to PC1 and PC2

Pull out the bioclim layers that comprise PC1 and PC2

# Top 2 contributing variables to PC1 and PC2
top_PC1_vars <- rownames(loadings)[order(abs(loadings[, "PC1"]), 
                                         decreasing = TRUE)][1:2]
top_PC2_vars <- rownames(loadings)[order(abs(loadings[, "PC2"]),
                                         decreasing = TRUE)][1:2]
selected_vars <- unique(c(top_PC1_vars, top_PC2_vars))  # remove duplicates

ANOVA + Tukey HSD + Plotting for Top Variables

We can run an ANOVA (Analysis of Variance) and a Tukey HSD (a comparison test) Post-hoc to test whether there are statistically significant differences in the average values of specific climatic variables between the species.

Broken down, the ANOVA evaluates the difference of the mean value for a climatic variable across all species, and tests for a significant difference.

The Tukey HSD test follows up comparing all pairs of species to see which species pairs differ from eachother.

Here, we’ll use a for-loop to move through each extracted (via species occurrence points) bioclim layer for PC1 & PC2 and conduct the ANOVA and Tukey HSD. Plotted as a boxplot with real data to show relative density.

## ANOVA + Tukey HSD + Plotting for Top Variables ----

plotlist <- list()

for (i in seq_along(selected_vars)) {
  varname <- selected_vars[i]
  
  # Run ANOVA
  model <- aov(as.formula(paste0(varname, " ~ name")), 
               data = ptExtracteddf)
  tukey <- TukeyHSD(model)
  pvals <- tukey$`name`[, 4]
  
  # Convert p-values to significance letters
  letters_out <- multcompLetters(pvals)
  groups_df <- data.frame(name = names(letters_out$Letters),
                          groups = letters_out$Letters)
  
  
  # Set y position for labels
  max_y <- max(ptExtracteddf[[varname]], 
               na.rm = TRUE)
  groups_df$y_position <- max_y * 1.1
  
  # Create plot
  plotlist[[i]] <- ggplot(ptExtracteddf, aes(x = name, 
                                             y = .data[[varname]])) +
                      geom_jitter(aes(color = name), 
                                  width = 0.3, 
                                  alpha = 0.5,
                                  size = 1) +
                      geom_boxplot(fill = NA, 
                                   color = "black", 
                                   size = 0.6) +
                      geom_text(data = groups_df,
                                aes(x = name, 
                                    y = y_position, 
                                    label = groups),
                                inherit.aes = FALSE, 
                                size = 4) +
                      scale_fill_manual(values = pal) +
                      scale_color_manual(values = pal) +
                      ggtitle(paste0(varname)) +
                      ylab(varname) +
                      theme_classic() +
                      theme(legend.position = "none", 
                            axis.text.x = element_text(angle = 45, 
                                                       hjust = 1,
                                                       face = "italic")) +
                      ylim(0, max_y * 1.3)
}

# Combine plots
gridExtra::grid.arrange(grobs = plotlist, ncol = 2)

04_AccessibleArea

Purpose:

Process climatic raster data and occurrence records for Diapensiaceae SDMs.

Set-up

Load Required Packages

library(terra)        # For raster handling
library(sf)           # For vector spatial operations
library(ggplot2)      # For plotting
library(dplyr)        # For data manipulation
library(usdm)         # For VIF calculations
library(stringr)      # For string operations
library(gridExtra)    # For plotting

Load Cleaned Occurrence Data

Load cleaned occurrence data & convert it to spatial data.

# Read in cleaned occurrence data with geographic coordinates and species ID
alldf <- read.csv("data/02_cleaning/maxent_ready/diapensiaceae_maxentready_2025_06_27.csv")

# Convert occurrence data to an sf object
alldfsp <- st_as_sf(alldf,
                    coords = c("longitude", "latitude"),
                    crs = 4326)

Load and Stack Climatic Variables

Load and stack climatic variable raster data

# List and sort all .tif files in the BioClim directory
biolist <- list.files("data/04_climate_processing/BioClim/",
                      pattern = "\\.tif$",
                      full.names = TRUE)

# Stack the raster files into a SpatRaster
biostack <- terra::rast(biolist)

Crop and Mask Bioclim Layers to Accessible Area (M) for Each Species

We need to define the accessible space per species, done commonly using an alpha hull approach. Once we have defined each species’ AA, we can crop & mask climatic layers to these.

To do this, we’ll run a for-loop that processes each species’s AA using the package rangeBuilder’s getDynamicAlphaHull(), crops + masks the climatic raster layers to these AA shapes, then deposits them in a directory called Cropped.

# Define base output directory for cropped rasters
dir  <- "data/04_climate_processing/Cropped"

# Loop through each unique species in the occurrence data
for (species in unique(alldf$accepted_name)) {
  
  # Print current species for progress tracking
  message("Processing species: ", species)
  
  # Subset occurrence data for this species
  species_df <- alldf %>%
    filter(accepted_name == species)
  
  # Convert species occurrences to sf
  species_sf <- st_as_sf(species_df,
                         coords = c("longitude", "latitude"),
                         crs = 4326)
  
  # Run alpha hull via rangeBuilder
  hull <- rangeBuilder::getDynamicAlphaHull(x = species_df,
                                            coordHeaders = c("longitude", 
                                                             "latitude"),
                                            fraction = 1, # min. fraction of 
                                                          ## records we want included 
                                            partCount = 1, # number of polygons
                                            initialAlpha = 20, # initial alpha size,
                                                               ## 20m, scales up from here. 
                                            clipToCoast = "terrestrial",
                                            verbose = FALSE
  )
  
  # Convert hull to sf
  hull_sf <- st_as_sf(hull[[1]])
  
  # Transform hull and points to equal-area projection
  proj <- "+proj=cea +lat_ts=0 +lon_0=0"
  hull_sf_proj <- st_transform(hull_sf, 
                               crs = proj)
  species_sf_proj <- st_transform(species_sf,
                                  crs = proj)
  
  # Calculate nearest-neighbor distances
  dist_matrix <- st_distance(species_sf_proj)
  nearest_neighbor_dists <- apply(dist_matrix, 2, function(x) sort(x)[2])
  buffer_distance <- quantile(nearest_neighbor_dists, 
                              probs = 0.80, 
                              na.rm = TRUE)
  
  # Buffer the hull
  buffer_proj <- st_buffer(hull_sf_proj, 
                           dist = buffer_distance)
  
  # Union polygons in case of multiple pieces
  buffer_proj <- st_union(buffer_proj)
  
  # Transform both hull and buffered hull back to WGS84 for plotting
  hull_sf_wgs84 <- st_transform(hull_sf_proj, 
                                crs = 4326)
  buffer_wgs84 <- st_transform(buffer_proj,
                               crs = 4326)
  
  # plot hull before buffering
  p_before <- ggplot() +
              geom_sf(data = hull_sf_wgs84, 
                      fill = "gray70", 
                      alpha = 0.5, 
                      color = NA) +
              geom_sf(data = species_sf, 
                      color = "black", 
                      size = 0.5) +
              theme_minimal() +
              ggtitle(paste("Alpha Hull (No Buffer) -", species))
  
  # plot after buffering alpha hull
  p_after <- ggplot() +
              geom_sf(data = buffer_wgs84, 
                      fill = "gray70", 
                      alpha = 0.5,
                      color = NA) +
              geom_sf(data = species_sf, 
                      color = "red", 
                      size = 0.5) +
              theme_minimal() +
              ggtitle(paste("Buffered Hull -", species))
  
  gridExtra::grid.arrange(p_before, p_after, ncol = 2)
  
  # Convert buffered area to terra SpatVector
  buffer_vect <- terra::vect(buffer_wgs84)
  
  # Define output directory for this species
  spec <- gsub(" ", "_", species)
  species_dir <- file.path(dir, spec)
  dir.create(species_dir, showWarnings = FALSE)
  
  # Loop through each raster layer and crop/mask to buffered area
  for (i in seq_along(biolist)) {
    
    rast <- biostack[[i]]
    
    # Crop raster to extent
    rast_crop <- crop(rast, buffer_vect)
    
    # Mask raster to buffered hull shape
    rast_mask <- mask(rast_crop, buffer_vect)
    
    # Define output file name
    layer_name <- names(rast)
    outfile <- file.path(species_dir, paste0(layer_name, ".tif"))
    
    # Write cropped raster
    writeRaster(rast_mask,outfile,overwrite = TRUE)
    
    message("  Cropped and saved: ", outfile)
  }
}

Note that for defining AA, you can customize arguments to make more sense for your species of interest. Consider partCount for example, you can increase it to allow for individual polygons of AA per point cluster if you believe that populations are truly separated by some geographic boundary.

VIF Selection for Each Species

Variable Inflation Factor (VIF) can detect for multicollinearity in a set of multiple regression variables. We can compute this using usdm’s vifstep() to test which variables should be excluded due to collinearity.

# Loop through each species
for (species in unique(alldf$accepted_name)) {
  spec <- gsub(" ", "_", species)
  spec_dir <- file.path(dir, spec)
  
  # Read in species cropped rasters and stack them
  cropped_files <- list.files(spec_dir,
                              pattern = "\\.tif$",
                              full.names = TRUE)
  
  species_stack <- terra::rast(cropped_files)
  
  # Calculate VIF and exclude collinear variables
  vif_result <- usdm::vifstep(species_stack, th = 10)
  message("VIF calculated for species: ", species)
  
  reduced_stack <- exclude(species_stack, vif_result)
  
  # Create a folder for VIF-selected layers
  vif_dir <- file.path(spec_dir, "VIF")
  dir.create(vif_dir, showWarnings = FALSE)
  
  # Save each retained raster layer
  for (layer_name in names(reduced_stack)) {
    
    r_layer <- reduced_stack[[layer_name]]
    
    out_file <- file.path(vif_dir, paste0(layer_name, ".tif"))
    
    terra::writeRaster(r_layer,
                       out_file,
                       overwrite = TRUE)
    
    message("Saved layer: ", out_file)
  }
}

Correlation Matrix of Cropped Bioclim Layers for Each Species

We also check the overall correlation between climatic variables using a matrix.

# List species folders in the cropped directory
species_dirs <- list.dirs(dir, recursive = FALSE, full.names = TRUE)

# Loop through each species folder
for (species_dir in species_dirs) {
  
  # Extract species name from folder path
  species_name <- basename(species_dir)
  
  # List all cropped raster files for this species
  cropped_files <- list.files(species_dir,
                              pattern = "\\.tif$",
                              full.names = TRUE)
  
  # Stack the rasters for this species
  cropped_stack <- terra::rast(cropped_files)
    
  # Calculate correlation matrix
  corr <- terra::layerCor(cropped_stack, fun = "cor", na.rm = TRUE)
    
  # Extract absolute Pearson correlation values
  cor_abs <- abs(corr$correlation)
    
  # Define output CSV path
  corr_outfile <- file.path(species_dir,paste0("correlationBioclim_", species_name, ".csv"))
    
  # Write correlation matrix to CSV
  write.csv(cor_abs,corr_outfile,row.names = TRUE)

}

05-Ecological_Niche_Modeling

Purpose:

Build ecological niche models for Diapensiaceae species using ENMeval with VIF-selected environmental variables.

Set-up

Load Required Packages

library(terra)         # For raster data
library(ENMeval)       # For ENM evaluation and tuning
library(predicts)      # For niche modeling tools
library(dplyr)         # For data manipulation
library(ggplot2)       # Optional for additional plots
library(sf)            # For vector spatial operations
library(rnaturalearth) # For downloading shapefiles of countries and states
library(ggspatial)     # Scale bars and north arrows

Load Cleaned Occurrence Data

Read in cleaned occurrence data, then subset to a sample species to demo modeling.

# Load occurrence data
alldf <- read.csv("./data/02_cleaning/maxent_ready/diapensiaceae_maxentready_2025_06_27.csv")

Subset for Single Species

# Example species: Galax urceolata
Galax_urceolata <- alldf %>%
                    filter(accepted_name == "Galax urceolata")

Load VIF-Selected Climate Layers

# List environmental layers
vif_list <- list.files("./data/04_climate_processing/Cropped/Galax_urceolata/VIF/",
                        full.names = TRUE,
                        recursive = FALSE)
# Load as raster stack
vifStack <- terra::rast(vif_list)

Run ENMeval to Generate Models

ENMeval will generate multiple models and test them per the specified method. Two important variables here are the regularization multiplier (RM) value and the feature class (FC). FC will allow for different shapes in response curves (linear, hinge, quadratic, product, and threshold) to be used in the model and RM will influence how many parameters are included in the model.

eval <- ENMevaluate(occs = Galax_urceolata[, c("longitude", "latitude")],
                    envs = vifStack,
                    tune.args = list(fc = c("L", "Q"), rm = 1:2),
                    partitions = "block",
                    n.bg = 10000,
                    parallel = FALSE,
                    algorithm = 'maxent.jar')

## *** Running initial checks... ***

## Loading required namespace: rJava

## * Randomly sampling 10000 background points ...

## * Removed 1 occurrence points with NA predictor variable values.

## * Clamping predictor variable rasters...

## * Model evaluations with spatial block (4-fold) cross validation and lat_lon orientation...

## 
## *** Running ENMeval v2.0.4 with maxent.jar v3.4.3 from dismo package v1.3.16 ***

##   |                                                                              |                                                                      |   0%  |                                                                              |==================                                                    |  25%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================================                  |  75%  |                                                                              |======================================================================| 100%

## ENMevaluate completed in 0 minutes 43.6 seconds.

# Save ENMeval object
save(eval, file = "./data/05_ENMs/Galax_urceolata_ENM_eval.RData")

Visualize Model Predictions

# Extract predictions from ENMeval object
maps <- eval@predictions

# Plot first 6 prediction rasters
terra::plot(maps,
            nc = 2,
            main = names(maps))

Calculate Niche Overlap Between Models

Compares how similar the niche recovery is between the models produced by ENMeval.

# Calculate Schoener's D among all models
mod_overlap <- calc.niche.overlap(maps, 
                                  overlapStat = "D")

##   |                                                                              |                                                                      |   0%  |                                                                              |=======================                                               |  33%  |                                                                              |===============================================                       |  67%  |                                                                              |======================================================================| 100%

print(mod_overlap)

##           fc.L_rm.1 fc.Q_rm.1 fc.L_rm.2 fc.Q_rm.2
## fc.L_rm.1        NA        NA        NA        NA
## fc.Q_rm.1 0.9019665        NA        NA        NA
## fc.L_rm.2 0.9867662 0.8945590        NA        NA
## fc.Q_rm.2 0.9101947 0.9822308 0.9038375        NA

Examine Overall Tuning Results

# Retrieve summary table
results <- eval.results(eval)
head(results)

##   fc rm tune.args auc.train cbi.train auc.diff.avg auc.diff.sd auc.val.avg
## 1  L  1 fc.L_rm.1 0.8736371     0.998   0.06330836  0.03156020   0.8528594
## 2  Q  1 fc.Q_rm.1 0.8682349     1.000   0.08680907  0.03856491   0.8374549
## 3  L  2 fc.L_rm.2 0.8741605     0.997   0.05984640  0.03132040   0.8557653
## 4  Q  2 fc.Q_rm.2 0.8676232     0.999   0.07932712  0.03209146   0.8419728
##   auc.val.sd cbi.val.avg cbi.val.sd or.10p.avg  or.10p.sd   or.mtp.avg
## 1 0.05949740     0.92775 0.09537426  0.1360656 0.08204368 0.0000000000
## 2 0.07674211     0.89450 0.08700383  0.1467213 0.07986933 0.0000000000
## 3 0.05743641     0.93325 0.09205931  0.1278689 0.07165766 0.0004098361
## 4 0.07044999     0.88900 0.09787407  0.1381148 0.07021007 0.0000000000
##      or.mtp.sd     AICc delta.AICc        w.AIC ncoef
## 1 0.0000000000 60588.06    0.00000 9.998765e-01     8
## 2 0.0000000000 60606.06   17.99866 1.234775e-04     8
## 3 0.0008196721 60632.19   44.13664 2.604931e-10     8
## 4 0.0000000000 60678.67   90.61461 2.104912e-20     7

## Plot model tuning results
evalplot.stats(
  e = eval,
  stats = c("or.10p", "auc.val"),
  color = "fc",
  x.var = "rm")

Select Optimal Model Based on Criteria

We want to select the model that has the smallest AICc score, has the lowest omission (model that drops the least number of species occurrence points in the reconstructed niche), and maximum AUC (Area Under Curve) value.

opt.seq <- results %>%
            filter(!is.na(AICc)) %>%        # Exclude models with NA AICc
            filter(AICc == min(AICc)) %>%   # Minimum AICc
            filter(or.10p.avg != 0) %>%     # Exclude zero omission
            filter(or.10p.avg == min(or.10p.avg)) %>%     # Minimum omission
            filter(auc.val.avg == max(auc.val.avg))       # Maximum AUC

opt.seq

##   fc rm tune.args auc.train cbi.train auc.diff.avg auc.diff.sd auc.val.avg
## 1  L  1 fc.L_rm.1 0.8736371     0.998   0.06330836   0.0315602   0.8528594
##   auc.val.sd cbi.val.avg cbi.val.sd or.10p.avg  or.10p.sd or.mtp.avg or.mtp.sd
## 1  0.0594974     0.92775 0.09537426  0.1360656 0.08204368          0         0
##       AICc delta.AICc     w.AIC ncoef
## 1 60588.06          0 0.9998765     8

# Save optimal model summary
write.table(opt.seq,
            file = "./data/05_ENMs/Galax_urceolata_OptModel.txt",
            sep = "\t",
            row.names = FALSE)

Visualize the Variable Contributions

# Retrieve the optimal model from ENMeval object
opt.mod <- eval.models(eval)[[opt.seq$tune.args]]

# Save optimal model as RData
save(opt.mod, file = "./data/05_ENMs/Galax_urceolata_opt_mod.RData")

# Plot variables and their contribution to the model
png("./data/05_ENMs/Galax_urceolata_Variable_Contribution.png",
    width = 1200,
    height = 800,
    res = 150)

# Plot variable contributions
dismo::plot(opt.mod, main = "Variable Contribution - Optimal Model")

dev.off()

## quartz_off_screen 
##                 2

Visualize Response Curves

How does the species niche respond to the variable elevation:

# Plot response curves for optimal model
predicts::partialResponse(eval@models[[opt.seq$tune.args]], var = "elev")

Plot Optimal Model

Build niche map that shows most suitable vs least suitable pixels according to our best model.

opt.pred <- eval.predictions(eval)[[as.character(opt.seq$tune.args)]]

r_df <- as.data.frame(opt.pred, xy = TRUE)
colnames(r_df) <- c("x", "y", "suitability")

#  Occurrences  sf
occ_sf <- st_as_sf(eval@occs, coords = c("longitude", "latitude"), crs = 4326)

# get bounding box for plotting from occurences
bbox_vals <- sf::st_bbox(occ_sf)

# USA states
usa <- ne_states(country = "United States of America", returnclass = "sf")

## The rnaturalearthhires package needs to be installed.

## Installing the rnaturalearthhires package.

## Using GitHub PAT from the git credential store.

## Downloading GitHub repo ropensci/rnaturalearthhires@HEAD

## ── R CMD build ─────────────────────────────────────────────────────────────────
##      checking for file ‘/private/var/folders/98/rnr3bm497gn395gzqtwk917w0000gn/T/Rtmpcg2m3K/remotesd65d4ad079af/ropensci-rnaturalearthhires-e4736f6/DESCRIPTION’ ...  ✔  checking for file ‘/private/var/folders/98/rnr3bm497gn395gzqtwk917w0000gn/T/Rtmpcg2m3K/remotesd65d4ad079af/ropensci-rnaturalearthhires-e4736f6/DESCRIPTION’
##   ─  preparing ‘rnaturalearthhires’:
##      checking DESCRIPTION meta-information ...  ✔  checking DESCRIPTION meta-information
##   ─  checking for LF line-endings in source and make files and shell scripts
##   ─  checking for empty or unneeded directories
##   ─  building ‘rnaturalearthhires_1.0.0.9000.tar.gz’
##      
##

# Plot
p <- ggplot() +
      geom_tile(data = r_df, 
                aes(x = x, 
                    y = y, 
                    fill = suitability)) +
      scale_fill_viridis_c(name = "Suitability") +
      geom_sf(data = occ_sf,
              color = "red", 
              size = 0.1) +
      geom_sf(data = usa, 
              fill = NA, color = "white", linewidth = 0.5) +
      coord_sf(xlim = c(bbox_vals["xmin"] - 2, bbox_vals["xmax"] + 2),
               ylim = c(bbox_vals["ymin"] - 2, bbox_vals["ymax"] + 2),
               expand = FALSE) +
      labs(title = "Predicted Suitability (Optimal ENMeval Model)",
           x = "Longitude",
           y = "Latitude")+
      theme_bw()+
      theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.background = element_rect(fill = "grey80", 
                                            color = NA),
            plot.background = element_rect(fill = "white", 
                                           color = NA)) +
      annotation_north_arrow(location = "tl",
                             height = unit(1, "cm"), 
                             width = unit(1, "cm"))

print(p)

# Save Outputs
ggsave(filename = "./data/05_ENMs/Galax_urceolata_Optimal_Model_Map.png",
       plot = p,
       width = 8,
       height = 5,
       dpi = 300)

terra::writeRaster(x = opt.pred,
            filename = "./data/05_ENMs/Galax_urceolata_ENM_optModel.asc",
            overwrite = TRUE,
            NAflag = -9999)

Loop Through Other Species

# List all unique species
species_list <- unique(alldf$accepted_name)

# Exclude Galax urceolata (already processed)
species_list <- setdiff(species_list, "Galax urceolata")

# Loop over remaining species
for (sp in species_list) {
  cat("Processing species:", sp, "\n")

  ## Subset occurrence data
  sp_df <- alldf %>%
           filter(accepted_name == sp)

  ## List VIF-selected rasters for this species
  sp_vif_path <- paste0("./data/04_climate_processing/Cropped/",
                    gsub(" ", "_", sp),
                    "/VIF/")

  vif_list <- list.files(sp_vif_path,
                         full.names = TRUE,
                         recursive = FALSE)

  ## Load rasters
  vifStack <- terra::rast(vif_list)

  ## Run ENMevaluate
  eval <- ENMevaluate(occs = sp_df[, c("longitude", "latitude")],
                      envs = vifStack,
                      tune.args = list(fc = c("L", "Q"), rm = 1:2),
                      partitions = "block",
                      n.bg = 10000,
                      parallel = FALSE,
                      algorithm = 'maxent.jar')

  ## Save ENMeval object
  save(eval, file = paste0("./data/05_ENMs/", gsub(" ", "_", sp), "_ENM_eval.RData"))

  ## Identify optimal model
  opt.seq <- results %>%
              filter(!is.na(AICc)) %>%
              filter(AICc == min(AICc)) %>%
              filter(or.10p.avg != 0) %>%
              filter(or.10p.avg == min(or.10p.avg)) %>%
              filter(auc.val.avg == max(auc.val.avg))

  ## Save optimal model summary
  write.table(opt.seq,
    file = paste0("./data/05_ENMs/", gsub(" ", "_", sp), "_OptModel.txt"),
    sep = "\t",
    row.names = FALSE)

  ## Plot variable contributions
  opt.mod <- eval.models(eval)[[opt.seq$tune.args]]

  # Save optimal model as RData
  save(opt.mod, file = paste0("data/05_ENMs/", gsub(" ", "_", sp), "_opt_mod.RData"))

  png(filename = paste0("data/05_ENMs/", gsub(" ", "_", sp), "_Variable_Contribution.png"),
      width = 1200,
      height = 800,
      res = 150)

  dismo::plot(opt.mod, main = paste("Variable Contribution -", sp))

  dev.off()

  ## Plot optimal prediction raster
  opt.pred <- eval.predictions(eval)[[as.character(opt.seq$tune.args)]]

  r_df <- as.data.frame(opt.pred, xy = TRUE)
  colnames(r_df) <- c("x", "y", "suitability")

  occ_sf <- st_as_sf(eval@occs, coords = c("longitude", "latitude"), crs = 4326)
  usa <- ne_states(country = "United States of America", returnclass = "sf")

  bbox_vals <- sf::st_bbox(occ_sf)

  p <- ggplot() +
    geom_tile(data = r_df, aes(x = x, y = y, fill = suitability)) +
    scale_fill_viridis_c(name = "Suitability") +
    geom_sf(data = occ_sf, color = "red", size = 0.1) +
    geom_sf(data = usa, fill = NA, color = "white", linewidth = 0.5) +
    coord_sf(
      xlim = c(bbox_vals["xmin"] - 2, bbox_vals["xmax"] + 2),
      ylim = c(bbox_vals["ymin"] - 2, bbox_vals["ymax"] + 2),
      expand = FALSE) +
    labs(
      title = paste("Predicted Suitability -", sp),
      x = "Longitude",
      y = "Latitude"
    ) +
    theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = "grey80", color = NA),
      plot.background = element_rect(fill = "white", color = NA)
    ) +
    annotation_north_arrow(
      location = "tl",
      height = unit(1, "cm"),
      width = unit(1, "cm")
    )

  print(p)

  ## Save raster
  writeRaster(
    x = opt.pred,
    filename = paste0("./data/05_ENMs/", gsub(" ", "_", sp), "_ENM_optModel.asc"),
    overwrite = TRUE,
    NAflag = -9999
  )
  ggsave(
    filename = paste0("./data/05_ENMs/", gsub(" ", "_", sp), "_Optimal_Model_Map.png"),
    plot = p,
    width = 8,
    height = 5,
    dpi = 300
  )

}

## Processing species: Pyxidanthera barbulata

## *** Running initial checks... ***

## * Randomly sampling 10000 background points ...

## * Removed 2 occurrence points with NA predictor variable values.

## * Clamping predictor variable rasters...

## * Model evaluations with spatial block (4-fold) cross validation and lat_lon orientation...

## 
## *** Running ENMeval v2.0.4 with maxent.jar v3.4.3 from dismo package v1.3.16 ***

##   |                                                                              |                                                                      |   0%  |                                                                              |==================                                                    |  25%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================================                  |  75%  |                                                                              |======================================================================| 100%

## ENMevaluate completed in 0 minutes 34.8 seconds.

## Processing species: Pyxidanthera brevifolia

## *** Running initial checks... ***

## * Randomly sampling 10000 background points ...

## * Clamping predictor variable rasters...

## * Model evaluations with spatial block (4-fold) cross validation and lat_lon orientation...

## 
## *** Running ENMeval v2.0.4 with maxent.jar v3.4.3 from dismo package v1.3.16 ***

##   |                                                                              |                                                                      |   0%  |                                                                              |==================                                                    |  25%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================================                  |  75%  |                                                                              |======================================================================| 100%

## ENMevaluate completed in 0 minutes 8.5 seconds.

## Processing species: Shortia galacifolia

## *** Running initial checks... ***

## * Randomly sampling 10000 background points ...

## * Clamping predictor variable rasters...

## * Model evaluations with spatial block (4-fold) cross validation and lat_lon orientation...

## 
## *** Running ENMeval v2.0.4 with maxent.jar v3.4.3 from dismo package v1.3.16 ***

##   |                                                                              |                                                                      |   0%  |                                                                              |==================                                                    |  25%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================================                  |  75%  |                                                                              |======================================================================| 100%

## ENMevaluate completed in 0 minutes 10.3 seconds.

06-Null_ENMs

Purpose:

Assess performance of niche models using null models

Set-up

Load Required Packages

library(ENMeval)
library(ggplot2)
library(dplyr)

Load Cleaned Occurrence Data

Load Occurrence Data

# Load all occurrence records
alldf <- read.csv("data/02_cleaning/maxent_ready/diapensiaceae_maxentready_2025_06_27.csv")

Load Optimal Model Parameters

# Load optimal feature class (fc) and regularization multiplier (rm)
opt.seq <- read.delim("data/05_ENMs/Galax_urceolata_OptModel.txt")

# Extract parameters
fc <- opt.seq$fc
rm <- opt.seq$rm

Load ENMeval Object

# Load saved ENMeval object for Galax urceolata
load("data/05_ENMs/Galax_urceolata_ENM_eval.RData")

Run Null Model Simulations

Save Null Model Comparison Results

# Extract comparison results between empirical and null models
null_comparison_results <- null.emp.results(spec.mod.null)

# Save results as CSV
write.csv(
  null_comparison_results,
  file = "data/05_ENMs/Galax_urceolata_null_comparison_results.csv",
  row.names = FALSE
)

Plot Null Model Results

# Generate histogram plots for AUC and OR.10p statistics
spec.null <- evalplot.nulls(
  spec.mod.null,
  stats = c("or.10p", "auc.val"),
  plot.type = "histogram"
)

# Display plot in R console
plot(spec.null)

# Save plot to PNG
ggsave(
  filename = "Galax_urceolata_null_histogram.png",
  plot = spec.null,
  path = "data/05_ENMs/",
  height = 12,
  width = 13
)

07-ENM_Processing

Purpose:

Project ecological niche models (ENMs) for four species onto current and future climate layers (Eastern Temperate Forests)
Visualize habitat suitability maps
Calculate niche breadth for each species
Calculate pairwise niche overlap
Test for phylogenetic signal in niche overlap
Compare current vs future projections to assess range shifts

Set-up

Increase Java memory limit to avoid OutOfMemoryError. Some ENM packages, especially dismo and older ENMeval workflows, may call Java routines that require more RAM. This sets the Java heap space to 8GB.

options(java.parameters = "-Xmx30g")

Load Required Packages

library(terra)            # For reading and manipulating raster 
                          ## data (modern replacement for raster package)
library(dismo)            # For predicting from ENMs and 
                          ## compatibility with Maxent models
library(ENMeval)          # For calculating niche overlap statistics
library(ENMTools)         # For calculating niche breadth (Levins' B2)
library(ggplot2)          # For creating publication-quality plots
library(rnaturalearth)    # For downloading shapefiles of countries and states
library(viridis)          # For colorblind-friendly color scales
library(ggspatial)        # For adding north arrows to plots
library(dplyr)            # For data manipulation
library(ape)              # For reading and processing phylogenetic trees
library(biomod2)          # For range size change calculations 
                          ## between binary rasters

Project Models onto Current Climate (Eastern Temperate Forests)

Load in climate files for the Eastern Temperate Forest Region

# List all climate raster files (.asc format) for the Eastern Temperate Forests (ETF) region.
# These will be used as environmental predictors for projecting the ENMs.
clim_files <- list.files("./data/04_climate_processing/CurrentEasternTemperateForests", 
                         pattern = "\\.asc$", 
                         full.names = TRUE)

# Read all climate rasters into a single SpatRaster object for efficient processing.
clim_stack <- terra::rast(clim_files)

# Plot a quick preview of the first four layers to check that rasters loaded correctly.
plot(clim_stack[[1:4]])

For a set of species, project where in the Eastern Temperate Forest Region they have suitable niche space given our model.

# Define a vector of the species you want to process.
# Each species has an ENM already built and saved to disk.
species_list <- c("Galax_urceolata",
                  "Pyxidanthera_barbulata",
                  "Pyxidanthera_brevifolia",
                  "Shortia_galacifolia")

# Load the USA state boundaries as an sf object.
# This will be used as a basemap for plotting projections.
usa <- rnaturalearth::ne_states(country = "United States of America",
                                returnclass = "sf")

# Loop through each species to project their ENM onto the current climate layers.
for (species in species_list) {

  cat("\nProjecting current climate for:", species, "\n")

  # Load the optimal ENM for this species.
  # The object `opt.mod` will be loaded into the workspace.
  mod_file <- paste0("data/05_ENMs/", species, "_opt_mod.RData")
  load(mod_file)  # loads object: opt.mod

  # Identify which layers were used to build the ENM.
  # This ensures that your projection uses the same predictor variables
  # as the model was trained on.
  vif_dir <- paste0("data/04_climate_processing/Cropped/", species, "/VIF/")
  species_layers <- list.files(vif_dir, 
                               full.names = TRUE)

  # Read in those specific rasters
  spec_stack <- terra::rast(species_layers)
  layer_names <- names(spec_stack)

  # Subset the full ETF raster stack to only the layers used in the ENM.
  ETF_Rasters <- terra::subset(clim_stack,
                               layer_names)
  #if error, try: ETF_Rasters <- raster::stack(ETF_Rasters)

  # Project the ENM onto the current ETF climate layers.
  proj_file <- paste0("data/06_ENM_processing/", 
                      species,
                      "_EFT_Projection.asc")

  p <- dismo::predict(opt.mod,
                      ETF_Rasters, 
                      filename = proj_file,
                      overwrite = TRUE, 
                      NAflag = -9999)

  # Convert the raster to a data frame for plotting in ggplot.
  p_df <- as.data.frame(p, xy = TRUE)
  colnames(p_df)[3] <- "Habitat_Suitability"

  # Get the bounding box coordinates of the raster for plotting.
  bbox_vals <- terra::ext(p)

  # Create the habitat suitability map.
  p_plot <- ggplot() +
            geom_sf(data = usa, 
                    fill = "grey70") +  # basemap
            geom_tile(data = p_df, aes(x = x, 
                                       y = y, 
                                       fill = Habitat_Suitability)) +
            geom_sf(data = usa, 
                    fill = NA, 
                    color = "white", 
                    linewidth = 0.5) +
            coord_sf(xlim = c(bbox_vals$xmin - 2, 
                              bbox_vals$xmax + 2),
                     ylim = c(bbox_vals$ymin - 2, 
                              bbox_vals$ymax + 2),
                     expand = FALSE) +
            scale_fill_viridis_c(name = "Suitability") +
            labs(title = paste0(gsub("_", " ", species),
                 " - Eastern Temperate Forest Habitat Suitability"),
                 x = "Longitude",
                 y = "Latitude") +
            theme_bw() +
            theme(panel.grid = element_blank(),
                  axis.text = element_text(size = 8),
                  axis.title = element_text(size = 8),
                  plot.title = element_text(size = 10)) +
            annotation_north_arrow(location = "tl",
                                   height = unit(1, "cm"),
                                   width = unit(1, "cm"))

  # Save the plot as a PNG file.
  png_file <- paste0("data/06_ENM_processing/", 
                     species,
                     "_ETF_Projection_Plot.png")

  ggsave(filename = png_file, 
         plot = p_plot, 
         width = 7, 
         height = 5, 
         dpi = 300)
}

Calculate Niche Breadth

For each species, calculate niche breadth across the ETF region. Niche breadth (Levins’ B2) indicates whether a species is a generalist (high value) or specialist (low value).

for (species in species_list) {

  cat("\nCalculating niche breadth for:", species, "\n")

  proj_file <- paste0("data/06_ENM_processing/", species, "_EFT_Projection.asc")

  proj_raster <- terra::rast(proj_file)

  # Compute Levins' B2 niche breadth
  breadth <- ENMTools::raster.breadth(proj_raster)

  cat("Niche breadth:", round(breadth$B2, 3), "\n")
}

Calculate Niche Overlap

We can stack all of the species niche models to evaluate how much they overlap

# Stack all species projections into a single SpatRaster object.
# This allows calculation of pairwise niche overlap (Schoener's D).
enm_stack <- c(terra::rast("data/06_ENM_processing/Galax_urceolata_EFT_Projection.asc"),
               terra::rast("data/06_ENM_processing/Pyxidanthera_barbulata_EFT_Projection.asc"),
               terra::rast("data/06_ENM_processing/Pyxidanthera_brevifolia_EFT_Projection.asc"),
               terra::rast("data/06_ENM_processing/Shortia_galacifolia_EFT_Projection.asc")
)

# Assign species names as layer names in the raster stack.
names(enm_stack) <- c("Galax urceolata",
                      "Pyxidanthera barbulata",
                      "Pyxidanthera brevifolia",
                      "Shortia galacifolia")
enm_stack_raster <- stack(enm_stack)

# Calculate pairwise Schoener's D overlap between species.
# Values range from 0 (no overlap) to 1 (identical niches).
overlap_matrix <- calc.niche.overlap(enm_stack_raster, 
                                     overlapStat = "D")

print(overlap_matrix)

Test Phylogenetic Signal in Niche Overlap

# Read the phylogenetic tree of the species.
tree <- ape::read.tree("data/06_ENM_processing/diapensiaceae_subset.tre")

# Drop the outgroup so only focal species remain.
tree <- ape::drop.tip(tree, "Cyrilla_racemiflora")

# Replace underscores with spaces to match overlap matrix species names.
tree$tip.label <- gsub("_", " ", tree$tip.label)

# Calculate pairwise phylogenetic distances between species.
phylo_dist <- cophenetic(tree)

# Ensure the phylogenetic distance matrix matches the species order in the overlap matrix.
species_names <- rownames(overlap_matrix)
species_names <- gsub("\\.", " ", species_names)
phylo_dist <- phylo_dist[species_names, species_names]

# Initialize vectors to store pairwise values.
pair_names <- NULL
phylo_vals <- NULL
overlap_vals <- NULL

# Loop through the lower triangle of the matrices to extract pairwise comparisons.
for (i in 2:length(species_names)) {
  for (j in 1:(i - 1)) {
    pair_names <- c(pair_names,
                    paste(species_names[i], 
                          species_names[j], sep = " - "))
    phylo_vals <- c(phylo_vals, phylo_dist[i, j])
    overlap_vals <- c(overlap_vals, 
                      overlap_matrix[i, j])
  }
}

# Fit a linear regression between phylogenetic distance and niche overlap.
lm_res <- lm(overlap_vals ~ phylo_vals)
s <- summary(lm_res)
print(s)

# Format regression statistics for display on the plot.
label_text <- paste0("Intercept = ", format(round(coef(lm_res)[1], 3)), "\n",
                     "Slope = ", format(round(coef(lm_res)[2], 5)), "\n",
                     "P-value = ", format(signif(s$coefficients[2, 4], 3)), "\n",
                     "R² = ", format(round(s$r.squared, 3)))

# Plot the relationship between phylogenetic distance and niche overlap.
plot(phylo_vals, overlap_vals,
     xlab = "Phylogenetic Distance",
     ylab = "Niche Overlap (Schoener's D)",
     pch = 19)
abline(lm_res, col = "red", lwd = 2)
text(phylo_vals - 3,
     overlap_vals - 0.02,
     labels = 1:length(pair_names),
     pos = 3,
     cex = 0.8)
text(x = max(phylo_vals) * 0.7,
     y = max(overlap_vals) * 0.9,
     labels = label_text,
     col = "red",
     cex = 0.9, pos = 4)
legend("topleft",
       legend = paste(1:length(pair_names), pair_names, sep = ": "),
       bty = "n", cex = 0.7)

For niche Age Overlap Correlations (AOCs), a high intercept and negative slope indicate niche conservatism, meaning closely related species tend to occupy similar ecological niches. A low intercept combined with a positive slope suggests niche divergence, where closely related species occupy different niches.The results here are pretty meaningless since we’re looking at very few, distantly related species, but it serves the purpose of the demo. A linear regression indicated no significant relationship between phylogenetic distance and niche overlap among the four species, suggesting minimal phylogenetic signal in ecological niche similarity.

Project Models onto Future Climate

# List future climate layers under the SSP370 scenario for 2081-2100.
future_files <- list.files("data/04_climate_processing/ACCESS-CM2_2081-2100_ssp370/", 
                           pattern = "\\.asc$", 
                           full.names = TRUE)

# Add the current elevation layer to the future stack, since elevation remains constant.
elev_file <- list.files("data/04_climate_processing/CurrentEasternTemperateForests", 
                        pattern = "elev\\.asc$", 
                        full.names = TRUE)

futurestack <- terra::rast(c(future_files, elev_file))

# Project each species onto future ETF climate conditions.
for (species in species_list) {

  cat("\nProjecting future climate for:", species, "\n")

  load(paste0("data/05_ENMs/", species, "_opt_mod.RData"))

  vif_dir <- paste0("data/04_climate_processing/Cropped/", species, "/VIF/")

  species_layers <- list.files(vif_dir, full.names = TRUE)
  spec_stack <- terra::rast(species_layers)
  layer_names <- names(spec_stack)

  FutureETF_Rasters <- terra::subset(futurestack, layer_names)
  #if error, try: FutureETF_Rasters <- raster::stack(FutureETF_Rasters)

  future_file <- paste0("data/06_ENM_processing/", species, "_Future_ETF_Projection.asc")

  p_future <- dismo::predict(opt.mod, FutureETF_Rasters, filename = future_file, overwrite = TRUE, NAflag = -9999)

  save(p_future, file = sub("\\.asc$", ".RData", future_file))

  # Plot the future habitat suitability map.
  p_future_df <- as.data.frame(p_future, xy = TRUE)
  colnames(p_future_df)[3] <- "Habitat_Suitability"

  bbox_vals <- terra::ext(p_future)

  p_plot <- ggplot() +
    geom_sf(data = usa, fill = "grey70") +
    geom_tile(data = p_future_df,
              aes(x = x, y = y, fill = Habitat_Suitability)) +
    geom_sf(data = usa, fill = NA, color = "white", linewidth = 0.5) +
    coord_sf(xlim = c(bbox_vals$xmin - 2, bbox_vals$xmax + 2),
             ylim = c(bbox_vals$ymin - 2, bbox_vals$ymax + 2),
             expand = FALSE) +
    scale_fill_viridis_c(name = "Future Suitability") +
    labs(title = paste0(gsub("_", " ", species), " - Future Habitat Suitability"),
         x = "Longitude",
         y = "Latitude") +
    theme_bw() +
    theme(panel.grid = element_blank(),
          axis.text = element_text(size = 8),
          axis.title = element_text(size = 8),
          plot.title = element_text(size = 10)) +
    annotation_north_arrow(location = "tl",
                           height = unit(1, "cm"),
                           width = unit(1, "cm"))

  png_file <- paste0("data/06_ENM_processing/", species, "_Future_ETF_Projection.png")

  ggsave(filename = png_file, plot = p_plot, width = 7, height = 5, dpi = 300)
}

Compare Current and Future Projections

# Initialize a list to hold change maps for each species.
all_dfs <- list()

for (species in species_list) {

  cat("\nComparing projections for:", species, "\n")

  # Load current and future suitability rasters.
  current <- terra::rast(paste0("data/06_ENM_processing/", species, "_EFT_Projection.asc"))
  future <- terra::rast(paste0("data/06_ENM_processing/", species, "_Future_ETF_Projection.asc"))

  # Threshold each raster at 0.7 to produce binary presence/absence maps.
  current_binary <- current >= 0.7
  future_binary <- future >= 0.7

  # Compute range changes between current and future projections.
  RangeSizeDiff <- BIOMOD_RangeSize(as(current_binary, "Raster"),
                                    as(future_binary, "Raster"))

  diff_raster <- RangeSizeDiff$Diff.By.Pixel

  df <- as.data.frame(diff_raster, xy = TRUE)
  colnames(df) <- c("x", "y", "change")

  # Convert numeric change codes to categorical labels for plotting.
  df$fill <- factor(df$change,
                    levels = c(-2, -1, 0, 1),
                    labels = c("lost", "unchanged", "not occupied", "occupied in future"))

  # Add species name for faceting plots.
  df$species <- gsub("_", " ", species)

  all_dfs[[species]] <- df
}

# Combine all species into one data frame for faceted plotting.
combined_df <- bind_rows(all_dfs)

# Determine overall plotting extent.
combined_bbox <- combined_df %>%
  summarise(xmin = min(x, na.rm = TRUE),
            xmax = max(x, na.rm = TRUE),
            ymin = min(y, na.rm = TRUE),
            ymax = max(y, na.rm = TRUE))

# Plot all species' range shifts together.
p_all <- ggplot(combined_df) +
  geom_sf(data = usa, fill = "grey70") +
  geom_tile(aes(x = x, y = y, fill = fill)) +
  geom_sf(data = usa, fill = NA, color = "white", linewidth = 0.5) +
  coord_sf(xlim = c(combined_bbox$xmin - 2, combined_bbox$xmax + 2),
           ylim = c(combined_bbox$ymin - 2, combined_bbox$ymax + 2),
           expand = FALSE) +
  scale_fill_viridis_d(name = "Pixel Change", na.value = "white") +
  facet_wrap(~ species) +
  labs(title = "Current vs Future Habitat Suitability Change",
       x = "Longitude",
       y = "Latitude") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 8),
        axis.title = element_text(size = 8),
        plot.title = element_text(size = 10),
        strip.text = element_text(size = 9)) +
  annotation_north_arrow(
    location = "tl",
    height = unit(1, "cm"),
    width = unit(1, "cm"))

ggsave("data/06_ENM_processing/all_species_range_change_map.png",
       plot = p_all,
       width = 10,
       height = 8,
       dpi = 300)