spatialsample:

A tidy approach to spatial cross-validation

About Me

Data splitting:

Cross-validation:

rsample and friends

spatialsample::boston_canopy |> head()
#> Simple feature collection with 6 features and 18 fields
#> Geometry type: MULTIPOLYGON
#> Dimension:     XY
#> Bounding box:  xmin: 749383.6 ymin: 2913059 xmax: 801174.4 ymax: 2965741
#> Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
#> # A tibble: 6 × 19
#>   grid_id land_area canopy_gain canopy_loss canopy_no_change canopy_area_2014
#>   <chr>       <dbl>       <dbl>       <dbl>            <dbl>            <dbl>
#> 1 AB-4      795045.      15323.       3126.           53676.           56802.
#> 2 I-33      265813.       8849.      11795.           78677.           90472.
#> 3 AO-9      270153        6187.       1184.           26930.           28114.
#> 4 H-10     2691490.      73098.      80362.          345823.          426185.
#> 5 V-7       107890.        219.       3612.             240.            3852.
#> 6 Q-22     2648089.     122211.     154236.         1026632.         1180868.
#> # ℹ 13 more variables: canopy_area_2019 <dbl>, change_canopy_area <dbl>,
#> #   change_canopy_percentage <dbl>, canopy_percentage_2014 <dbl>,
#> #   canopy_percentage_2019 <dbl>, change_canopy_absolute <dbl>,
#> #   mean_temp_morning <dbl>, mean_temp_evening <dbl>, mean_temp <dbl>,
#> #   mean_heat_index_morning <dbl>, mean_heat_index_evening <dbl>,
#> #   mean_heat_index <dbl>, geometry <MULTIPOLYGON [US_survey_foot]>

rsample and friends

library(tidymodels)
vfold_rset <- rsample::vfold_cv(spatialsample::boston_canopy)
vfold_rset
#> #  10-fold cross-validation 
#> # A tibble: 10 × 2
#>    splits           id    
#>    <list>           <chr> 
#>  1 <split [613/69]> Fold01
#>  2 <split [613/69]> Fold02
#>  3 <split [614/68]> Fold03
#>  4 <split [614/68]> Fold04
#>  5 <split [614/68]> Fold05
#>  6 <split [614/68]> Fold06
#>  7 <split [614/68]> Fold07
#>  8 <split [614/68]> Fold08
#>  9 <split [614/68]> Fold09
#> 10 <split [614/68]> Fold10
lobstr::obj_sizes(spatialsample::boston_canopy, vfold_rset)
#> * 984.07 kB
#> *  34.77 kB
vfold_rset |> 
  get_rsplit(1)
#> <Analysis/Assess/Total>
#> <613/69/682>

rsample and friends

vfold_rset |> 
  get_rsplit(1) |> 
  analysis() |> 
  head()
#> Simple feature collection with 6 features and 18 fields
#> Geometry type: MULTIPOLYGON
#> Dimension:     XY
#> Bounding box:  xmin: 749383.6 ymin: 2913059 xmax: 781922.7 ymax: 2965782
#> Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
#> # A tibble: 6 × 19
#>   grid_id land_area canopy_gain canopy_loss canopy_no_change canopy_area_2014
#>   <chr>       <dbl>       <dbl>       <dbl>            <dbl>            <dbl>
#> 1 AB-4      795045.      15323.       3126.           53676.           56802.
#> 2 I-33      265813.       8849.      11795.           78677.           90472.
#> 3 H-10     2691490.      73098.      80362.          345823.          426185.
#> 4 V-7       107890.        219.       3612.             240.            3852.
#> 5 Q-22     2648089.     122211.     154236.         1026632.         1180868.
#> 6 X-4       848558.       8275.       1760.            6872.            8632.
#> # ℹ 13 more variables: canopy_area_2019 <dbl>, change_canopy_area <dbl>,
#> #   change_canopy_percentage <dbl>, canopy_percentage_2014 <dbl>,
#> #   canopy_percentage_2019 <dbl>, change_canopy_absolute <dbl>,
#> #   mean_temp_morning <dbl>, mean_temp_evening <dbl>, mean_temp <dbl>,
#> #   mean_heat_index_morning <dbl>, mean_heat_index_evening <dbl>,
#> #   mean_heat_index <dbl>, geometry <MULTIPOLYGON [US_survey_foot]>

rsample and friends

vfold_rset$splits |> 
  lapply(analysis) |> 
  lapply(lm, formula = canopy_area_2019 ~ land_area * mean_temp) |> 
  mapply(
    FUN = \(mod, rsplit) {
      assessment <- assessment(rsplit)
      yardstick::rmse_vec(assessment[["canopy_area_2019"]], predict(mod, assessment))
    },
    mod = _,
    rsplit = vfold_rset$splits
  ) |> 
  mean()
#> [1] 375792

rsample and friends

workflow() |> 
  add_model(linear_reg()) |> 
  add_formula(canopy_area_2019 ~ land_area * mean_temp) |> 
  fit_resamples(vfold_cv(spatialsample::boston_canopy)) |> 
  collect_metrics()
#> # A tibble: 2 × 6
#>   .metric .estimator       mean     n    std_err .config             
#>   <chr>   <chr>           <dbl> <int>      <dbl> <chr>               
#> 1 rmse    standard   377089.       10 20426.     Preprocessor1_Model1
#> 2 rsq     standard        0.353    10     0.0178 Preprocessor1_Model1

What does “new data” mean?

ggplot(spatialsample::boston_canopy, aes(fill = canopy_area_2019)) + geom_sf() + 
  scale_fill_distiller(name = "Canopy area (2019)", palette = "YlGn", direction = 1)

Are these folds really unrelated?

rsample::vfold_cv(spatialsample::boston_canopy, v = 5)

Spatial clustering

library(spatialsample)
set.seed(1234)
spatial_clustering_cv(boston_canopy, v = 5)

Spatial clustering

library(purrr)
walk(spatial_clustering_cv(boston_canopy, v = 5)$splits, function(x) print(autoplot(x)))

Spatial blocking

spatial_block_cv(boston_canopy, v = 5, n = c(10, 10))

Spatial blocking

walk(spatial_block_cv(boston_canopy, v = 5, n = c(10, 10))$splits,
     function(x) print(autoplot(x)))

Spatial blocking

spatial_block_cv(boston_canopy, v = 5, n = c(10, 10), 
                 method = "continuous", relevant_only = FALSE)

Spatial LODO

folds <- spatial_buffer_vfold_cv(boston_canopy, v = Inf, radius = 1500, buffer = 1500)
walk(folds$splits, function(x) print(autoplot(x)))

Buffering

spatial_clustering_cv(boston_canopy, v = 5, buffer = 1500)
boston_clusters <- spatial_clustering_cv(boston_canopy)
boston_clusters$splits |> 
  lapply(analysis) |> 
  lapply(lm, formula = canopy_area_2019 ~ land_area * mean_temp) |> 
  mapply(
    FUN = \(mod, rsplit) {
      assessment <- assessment(rsplit)
      yardstick::rmse_vec(assessment[["canopy_area_2019"]], predict(mod, assessment))
    },
    mod = _,
    rsplit = boston_clusters$splits
  ) |> 
  mean()
#> [1] 949744.3

tidymodels integration

workflow() |> 
  add_model(linear_reg()) |> 
  add_formula(canopy_area_2019 ~ land_area * mean_temp) |> 
  fit_resamples(vfold_cv(spatialsample::boston_canopy)) |> 
  collect_metrics()
#> # A tibble: 2 × 6
#>   .metric .estimator       mean     n    std_err .config             
#>   <chr>   <chr>           <dbl> <int>      <dbl> <chr>               
#> 1 rmse    standard   377195.       10 21949.     Preprocessor1_Model1
#> 2 rsq     standard        0.359    10     0.0239 Preprocessor1_Model1
workflow() |> 
  add_model(linear_reg()) |> 
  add_formula(canopy_area_2019 ~ land_area * mean_temp) |> 
  fit_resamples(spatial_clustering_cv(spatialsample::boston_canopy)) |> 
  collect_metrics()
#> # A tibble: 2 × 6
#>   .metric .estimator       mean     n     std_err .config             
#>   <chr>   <chr>           <dbl> <int>       <dbl> <chr>               
#> 1 rmse    standard   887760.       10 499093.     Preprocessor1_Model1
#> 2 rsq     standard        0.327    10      0.0557 Preprocessor1_Model1

Other features:


All methods implemented in spatialsample itself

C++ implementation of most computationally-intensive parts

Works with projected & geographic CRS

Arguments accept explicit units

Aware of CRS units, functions do unit conversion

Handles all geometry types\(^*\)

Should we be doing this at all?

Should we be doing this at all?

Should we be doing this at all?

Should we be doing this at all?

From Mila et al. (2022). “Nearest neighbour distance matching Leave-One-Out Cross-Validation for map validation”. Methods in Ecology and Evolution 13(6), pp 1304-1316. doi: 10.1111/2041-210X.13851.

Should we be doing this at all?

When extrapolating into new regions

When working with clustered or convenience samples

When interpolating within the sampled area

When an independent probability sample exists

Thank you!


Find me online:

mm218.dev

@mikemahoney218

@MikeMahoney218@fosstodon.org


Slides available at mm218.dev/ehadec2023

More spatialsample: https://spatialsample.tidymodels.org/