1 Intro

Quick exploratory analysis

2 Setup

This section for reproducibility. We use R in the version 4.3.2. In order to be able to reproduce this analysis you need an openrouteservice API key. Get one here. Put it in a text file file called config.txt in the working directory. It has no structure, just the key in the first line.

YOURAPIKEYXXASDAWDOIJ@(

2.1 libraries

Almost all of the libraries can be isntalled via CRAN with the command install.packages('packagename'). The openrouteservice package is not on CRAN and needs to be installed via github. With the command remotes::install_github("GIScience/openrouteservice-r"); Also tmap in its most recent version 4 is not yet available on CRAN so we need to download and install it via github too.

# main libraries
library(tidyverse)
library(glue)
library(sf)
library(units)
library(ggplot2)
library(tictoc)
library(openrouteservice) # remotes::install_github("GIScience/openrouteservice-r");
library(jsonlite)
library(terra)
library(exactextractr)
library(tmap) # remotes::install_github("r-tmap/tmap")
library(leaflet)

api_key <- readLines("config.txt", n = 1)

# function to  adjust a list of coordinates to the location of the closest road segment via the snapping endpoint of ORS. ORS itself only allows for a maximum distance of 350m to snap.
ors_snap <- function(x, rowids, local=F) {
  
  library(httr)
  
  # Define the body
  body <- list(
    locations = x,
    radius = 100000000 # apparently the maximum snapping distance is 5km
  )
  
  # Define the headers
  headers <- c(
    'Accept' = 'application/json, application/geo+json, application/gpx+xml, img/png; charset=utf-8',
    'Authorization' = api_key,
    'Content-Type' = 'application/json; charset=utf-8'
  )
  
  endpoint <- if(local){'http://localhost:8080/ors/v2/snap/driving-car'} else {'https://api.openrouteservice.org/v2/snap/driving-car'}
  # Make the POST request
  response <- POST( endpoint,
    #'https://api.openrouteservice.org/v2/snap/driving-car', 
                   body = body, 
                   add_headers(headers), 
                   encode = "json")
  
  resp_content <- content(response)
  
  extract_data <- function(x, index) {
    if (is.null(x)) {
      return(data.frame(
        rowid = index,
        lon = NA,
        lat = NA,
        snapped_distance = NA
      ))
    } else {
      return(data.frame(
        rowid = index,
        lon = x$location[[1]],
        lat = x$location[[2]],
        snapped_distance = x$snapped_distance
      ))
    }
  }
  
  # Extract data from the list and create a dataframe
  df <- do.call(rbind, lapply(seq_along(resp_content$locations), function(i) extract_data(resp_content$locations[[i]], i)))
  df$rowid <- rowids
  # Convert the dataframe to an sf object
  sf_df <- df |> 
    drop_na(lon, lat) |>  # Drop rows with NA coordinates
    st_as_sf(coords = c("lon", "lat"), crs = 4326) |>  # Define the coordinate columns and set CRS
    st_sf()
  
  return(sf_df)

}


options(openrouteservice.url = "http://localhost:8080/ors")

3 Processing pipeline

We will need Data on

Cities - greater 50k population in Somalia as our destinations from OSM
Administrative boundaries of Somalia from HDX
Population raster - population raster for the whole country from worldpop

Make sure the data is in the working directory. The cities data is in geojson format, the boundaries in shapefile format and the population raster in tif format.

3.1 Data download

cities <- st_read('SOM_pop_50k.geojson', quiet=T)
pop <- rast('som_ppp_2020_UNadj_constrained.tif')
admin <- st_read('som_admbnda_adm2_ocha_20230308.shp', quiet=T)

3.2 Sample design and preparation

The destinations are defined as greater cities. The origins are sampled locations within Somalia. We decided for regular sampled grid cells of 15km x 15km. We join the population information from worldpop and snap the centroids of the grid cells to the closest road segment via the ORS API.

# create 15km grid
grid1km <-
  st_make_grid(admin |> st_transform(20538), cellsize = 5000, square = F, flat_topped = T)
# convert to sf df
grid1km <- grid1km |> st_as_sf()
names(grid1km) <- 'geometry'
st_geometry(grid1km) <- 'geometry'
# get rid of all cells that are not within somalia
grid1km <-
  grid1km |> st_transform(4326) |> st_filter(admin |> st_union())
# join admin codes
grid1km <-
  grid1km |> st_join(admin |> select(ADM2_PCODE),
                     left = T,
                     largest = T)
# join pop information
grid1km$pop <- exact_extract(pop, grid1km, 'sum', progress = F)
# add rowid
grid1km <- grid1km |> mutate(rowid = row_number())
# add rowid for the cities too
cities <- cities |>  mutate(rowid = row_number())

grid1km <- grid1km |> filter(pop>0)

coord_list <-
  lapply(grid1km$geometry |> st_centroid(), function(x)
    c(st_coordinates(x)[, 1], st_coordinates(x)[, 2]))

# put the lsit of coords to the snapping endpoint of ors
snapped_coords <- ors_snap(coord_list,rowids = grid1km$rowid, local = T)

# Join snapped_coords to grid1km by rowid and create original centroid
grid1km_snapped <- grid1km |>
  left_join(snapped_coords |> tibble(), by = "rowid") |>
  mutate(centroid = st_centroid(geometry.x))
# rework names & geom for sf
names(grid1km_snapped)[5:6] <- c('geometry', 'snapped_centroid')
st_geometry(grid1km_snapped) <- 'geometry'

# separate the snapped and non snapped grid cells for somalia
# snapped
grid1km_snapped_notNA <- grid1km_snapped |> 
  filter(!is.na(snapped_distance))
# not snapped
grid1km_snapped_NA <- grid1km_snapped |> 
  filter(is.na(snapped_distance))

Awesome now we got 17735 snapped and 3045 not snapped grid cells.

3.3 ORS matrix request and post processing

In the next chunk we will calculate the travel times and distances from each grid cell to each city. We will use the openrouteservice API for this.

IMPORTANT

The openrouteservice api is restricted to 2000 matrix requests per day. Each request can bear up to 3500 origin x destination combinations.

In our case we do 17735 * 12 combinations. If we like to increase the number of grid cells and therefore the amount of combinations we can do so by not running one request for one city x all grid cells as we currently do. Instead we could run multiple requests for one city * a subset of grid cells. We only need to ensure that the combination for a single request oes not exceed 3500.

# create a list of coordinates for the grid again but from snapped coords
coord_list <-
  lapply(grid1km_snapped_notNA$snapped_centroid, function(x)
    c(st_coordinates(x)[, 1], st_coordinates(x)[, 2]))

# run through all cities and calculate travel times an distances to each grid cell.
# For 12 cities and 2493 grid cells this is 2493*12 = 29916 results.
for ( i in 1:nrow(cities)) { 
  # log the current iteration
  tic(glue("run {i} / {nrow(cities)}"))
  # extract single city by iteration index
  cities_subset <- cities[i,]
  # get the coordinates as vector
  city_coords <-
    lapply(cities_subset$geometry, function(x)
      c(st_coordinates(x)[, 1], st_coordinates(x)[, 2]))  
  # add the city coordinates at the first position before our grid centroid coordinates
  coord_list_run <- c(city_coords, coord_list)
  
  # run the ors matrix request
  res1 = ors_matrix(
    coord_list_run, # list of all grid and one single city coordinate at the top
    sources = 0, # index of the coordiante which shall serve as origin. In our case the city. If we dont set it we get the matrix for all coords * all cords
    metrics = c("duration", "distance"), # we want duration and distance, not only one of it
    units = "km", # units is km
    api_key = api_key, # api key
    output = 'parsed') # output as parsed json
  
  rm(coord_list_run) # get rid of the list of coordinates as we create a new one the next iteration with a different city at the top
  
  # create a dataframe from the results if it is the first iteration, otherwise create and append to the existing one
  if (i == 1) {
    result_matrix <- data.frame(
    city_id = cities_subset$rowid |> rep((res1$distances |> as.vector())[-1] |>
                                           length()),
    grid_id = grid1km_snapped_notNA$rowid,
    durations = (res1$durations |> as.vector())[-1] |> as.numeric(),
    distances = (res1$distances |> as.vector())[-1] |> as.numeric()
  ) |> tibble()
  } else {
    result_matrix <- rbind(result_matrix,data.frame(
    city_id = cities_subset$rowid |> rep((res1$distances |> as.vector())[-1] |>
                                           length()),
    grid_id = grid1km_snapped_notNA$rowid,
    durations = (res1$durations |> as.vector())[-1] |> as.numeric(),
    distances = (res1$distances |> as.vector())[-1] |> as.numeric()
  ))
  }
  toc()
}

## run 1 / 12: 0.614 sec elapsed
## run 2 / 12: 0.582 sec elapsed
## run 3 / 12: 0.645 sec elapsed
## run 4 / 12: 0.615 sec elapsed
## run 5 / 12: 0.58 sec elapsed
## run 6 / 12: 0.544 sec elapsed
## run 7 / 12: 0.612 sec elapsed
## run 8 / 12: 0.612 sec elapsed
## run 9 / 12: 0.539 sec elapsed
## run 10 / 12: 0.599 sec elapsed
## run 11 / 12: 0.58 sec elapsed
## run 12 / 12: 0.519 sec elapsed

Awesome we now have the distances from all 12 cities to all grids. The resulting dataframe has 29916 rows. Now we want to find the city that is closest from every grid cell in terms of traveltime.

# Identify the lowest duration from each grid city combination
lowest_duration <- result_matrix |> 
  group_by(grid_id) |> 
  slice_min(durations, with_ties = F) |> 
  select(grid_id, city_id, durations, distances) |> 
  rename(shortest_duration = durations)

# Join the lowest duration to the grid data
grid1km_snapped_notNA <- grid1km_snapped_notNA |>
  left_join(lowest_duration, by = c('rowid' = 'grid_id')) |>
  mutate(shortest_duration = shortest_duration / 3600) # convert seconds to hours
names(grid1km_snapped_notNA)[c(6, 7)] <-
  c('duration_h', 'distance_km') # account for naming 

# join back in the not snapped grid cells. Those will bear a NA in the shortest_duration column
grid_final <- bind_rows(grid1km_snapped_notNA,
                        grid1km_snapped_NA)

4 Visualization & assessment

In the following chunks some visualizations are executed to assess the results

4.1 Non spatial

So what is cumulative population/area catchment of each of the cities?

We join the cities again to get the actual names of the cities instead of ids only.

summary_by_city <- grid_final |> group_by(city_id) |> 
  summarize(pop=sum(pop, na.rm=T),count=n(), km2=n()*225) |>
  st_drop_geometry() |> 
  left_join (cities |> 
               st_drop_geometry() |> 
               select(c(name, rowid)), by=c('city_id'='rowid'))

summary_by_city |> 
  ggplot(aes(x=name, y=pop, fill=name)) + 
  geom_bar(stat='identity') + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) + 
  xlab('') + ylab('Population')

summary_by_city |> 
  ggplot(aes(x=name, y=km2, fill=name)) + 
  geom_bar(stat='identity') + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) + 
  xlab('') + ylab('Area (km^2')

19694 grid centroids were not able to snap and therefore create a route to a city. This results in 4431150 km^2 area (94.77%) and 14782846.4804508 ( 93.01%) we cannot assess the closest city for.

What is the relation between Traveltime and population for each grid cell?

grid_final |> 
  ggplot(aes(x = duration_h, y = pop)) +
  geom_point() + theme_minimal() + 
  xlab('Traveltime in h') + ylab('Population')

grid_final |> 
  ggplot(aes(x = duration_h, y = pop |> log())) +
  geom_point() + theme_minimal() + 
  xlab('Traveltime in h') + ylab('Population log scaled')

4.2 Spatial

Maps of population distribution, travel time to the closest city and the closest city with respective administrative boundaries and the greater cities locations.

4.2.1 Population distribution

tmap_mode('view')
tm_shape(grid_final, name = 'Population') +
  tm_polygons(
    'pop',
    fill.scale = tm_scale_intervals(
      values = '-viridis',
      breaks = c(0, 1, 500, 1000, 5000, 10000, 50000, 2000000)
    ),
    fill.title = tm_legend(title = 'Population'),
    lwd = 0.1,
    border = 'white'
  ) +
  grid_final |> group_by(ADM2_PCODE) |> summarise(ADM2_PCODE = first(ADM2_PCODE)) |>
  tm_shape(name = 'Admin(2) boundaries') +
  tm_borders(lwd = 0.5) +
  tm_shape(cities, name = "Greater City") +
  tm_dots()

4.2.2 Traveltime to closest city

tm_shape(grid_final, name = 'Traveltime to next city') +
  tm_polygons(
    'duration_h',
    fill.scale = tm_scale_intervals(
      values = 'hcl.heat',
      breaks = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    ),
    fill.title = tm_legend(title = 'Traveltime to closest city'),
    lwd = 0.1,
    border = 'white'
  ) +
  grid_final |> group_by(ADM2_PCODE) |> summarise(ADM2_PCODE = first(ADM2_PCODE)) |>
  tm_shape(name = 'Admin(2) boundaries') +
  tm_borders(lwd = 0.5) +
  tm_shape(cities, name = "Greater City") +
  tm_dots()

4.2.3 Closest city

tm_shape(grid_final, name = 'Closest City') +
  tm_polygons(
    fill = 'city_id',
    fill.scale = tm_scale_discrete(values = 'brewer.pastel1'),
    fill.title = tm_legend(title = 'Traveltime to closest city'),
    lwd = 0.1,
    border = 'white'
  ) +
  grid_final |> group_by(ADM2_PCODE) |> summarise(ADM2_PCODE = first(ADM2_PCODE)) |>
  tm_shape(name = 'Admin(2) boundaries') +
  tm_borders(lwd = 0.5) +
  tm_shape(cities, name = "Greater City") +
  tm_dots()

5 Output

Grid export in geopackage format.

grid_final |> select(-c(snapped_centroid, centroid)) |>
  st_write('somalia_grid_shortest_duration.gpkg', append = F, quiet = T)

City access Somalia

Marcel Reinmuth

2024-07-29