Quick exploratory analysis
This section for reproducibility. We use R in the version 4.3.2. In
order to be able to reproduce this analysis you need an openrouteservice
API key. Get one here. Put it in a text
file file called config.txt
in the working directory. It
has no structure, just the key in the first line.
YOURAPIKEYXXASDAWDOIJ@(
Almost all of the libraries can be isntalled via CRAN with the
command install.packages('packagename')
. The
openrouteservice package is not on CRAN and needs to be installed via
github. With the command
remotes::install_github("GIScience/openrouteservice-r");
Also tmap in its most recent version 4 is not yet available on CRAN so
we need to download and install it via github too.
# main libraries
library(tidyverse)
library(glue)
library(sf)
library(units)
library(ggplot2)
library(tictoc)
library(openrouteservice) # remotes::install_github("GIScience/openrouteservice-r");
library(jsonlite)
library(terra)
library(exactextractr)
library(tmap) # remotes::install_github("r-tmap/tmap")
library(leaflet)
api_key <- readLines("config.txt", n = 1)
# function to adjust a list of coordinates to the location of the closest road segment via the snapping endpoint of ORS. ORS itself only allows for a maximum distance of 350m to snap.
ors_snap <- function(x, rowids, local=F) {
library(httr)
# Define the body
body <- list(
locations = x,
radius = 100000000 # apparently the maximum snapping distance is 5km
)
# Define the headers
headers <- c(
'Accept' = 'application/json, application/geo+json, application/gpx+xml, img/png; charset=utf-8',
'Authorization' = api_key,
'Content-Type' = 'application/json; charset=utf-8'
)
endpoint <- if(local){'http://localhost:8080/ors/v2/snap/driving-car'} else {'https://api.openrouteservice.org/v2/snap/driving-car'}
# Make the POST request
response <- POST( endpoint,
#'https://api.openrouteservice.org/v2/snap/driving-car',
body = body,
add_headers(headers),
encode = "json")
resp_content <- content(response)
extract_data <- function(x, index) {
if (is.null(x)) {
return(data.frame(
rowid = index,
lon = NA,
lat = NA,
snapped_distance = NA
))
} else {
return(data.frame(
rowid = index,
lon = x$location[[1]],
lat = x$location[[2]],
snapped_distance = x$snapped_distance
))
}
}
# Extract data from the list and create a dataframe
df <- do.call(rbind, lapply(seq_along(resp_content$locations), function(i) extract_data(resp_content$locations[[i]], i)))
df$rowid <- rowids
# Convert the dataframe to an sf object
sf_df <- df |>
drop_na(lon, lat) |> # Drop rows with NA coordinates
st_as_sf(coords = c("lon", "lat"), crs = 4326) |> # Define the coordinate columns and set CRS
st_sf()
return(sf_df)
}
options(openrouteservice.url = "http://localhost:8080/ors")
We will need Data on
Make sure the data is in the working directory. The cities data is in geojson format, the boundaries in shapefile format and the population raster in tif format.
cities <- st_read('SOM_pop_50k.geojson', quiet=T)
pop <- rast('som_ppp_2020_UNadj_constrained.tif')
admin <- st_read('som_admbnda_adm2_ocha_20230308.shp', quiet=T)
The destinations are defined as greater cities. The origins are sampled locations within Somalia. We decided for regular sampled grid cells of 15km x 15km. We join the population information from worldpop and snap the centroids of the grid cells to the closest road segment via the ORS API.
# create 15km grid
grid1km <-
st_make_grid(admin |> st_transform(20538), cellsize = 5000, square = F, flat_topped = T)
# convert to sf df
grid1km <- grid1km |> st_as_sf()
names(grid1km) <- 'geometry'
st_geometry(grid1km) <- 'geometry'
# get rid of all cells that are not within somalia
grid1km <-
grid1km |> st_transform(4326) |> st_filter(admin |> st_union())
# join admin codes
grid1km <-
grid1km |> st_join(admin |> select(ADM2_PCODE),
left = T,
largest = T)
# join pop information
grid1km$pop <- exact_extract(pop, grid1km, 'sum', progress = F)
# add rowid
grid1km <- grid1km |> mutate(rowid = row_number())
# add rowid for the cities too
cities <- cities |> mutate(rowid = row_number())
grid1km <- grid1km |> filter(pop>0)
coord_list <-
lapply(grid1km$geometry |> st_centroid(), function(x)
c(st_coordinates(x)[, 1], st_coordinates(x)[, 2]))
# put the lsit of coords to the snapping endpoint of ors
snapped_coords <- ors_snap(coord_list,rowids = grid1km$rowid, local = T)
# Join snapped_coords to grid1km by rowid and create original centroid
grid1km_snapped <- grid1km |>
left_join(snapped_coords |> tibble(), by = "rowid") |>
mutate(centroid = st_centroid(geometry.x))
# rework names & geom for sf
names(grid1km_snapped)[5:6] <- c('geometry', 'snapped_centroid')
st_geometry(grid1km_snapped) <- 'geometry'
# separate the snapped and non snapped grid cells for somalia
# snapped
grid1km_snapped_notNA <- grid1km_snapped |>
filter(!is.na(snapped_distance))
# not snapped
grid1km_snapped_NA <- grid1km_snapped |>
filter(is.na(snapped_distance))
Awesome now we got 17735 snapped and 3045 not snapped grid cells.
In the next chunk we will calculate the travel times and distances from each grid cell to each city. We will use the openrouteservice API for this.
IMPORTANT
The openrouteservice api is restricted to 2000 matrix requests per day. Each request can bear up to 3500 origin x destination combinations.
In our case we do 17735 * 12 combinations. If we like to increase the number of grid cells and therefore the amount of combinations we can do so by not running one request for one city x all grid cells as we currently do. Instead we could run multiple requests for one city * a subset of grid cells. We only need to ensure that the combination for a single request oes not exceed 3500.
# create a list of coordinates for the grid again but from snapped coords
coord_list <-
lapply(grid1km_snapped_notNA$snapped_centroid, function(x)
c(st_coordinates(x)[, 1], st_coordinates(x)[, 2]))
# run through all cities and calculate travel times an distances to each grid cell.
# For 12 cities and 2493 grid cells this is 2493*12 = 29916 results.
for ( i in 1:nrow(cities)) {
# log the current iteration
tic(glue("run {i} / {nrow(cities)}"))
# extract single city by iteration index
cities_subset <- cities[i,]
# get the coordinates as vector
city_coords <-
lapply(cities_subset$geometry, function(x)
c(st_coordinates(x)[, 1], st_coordinates(x)[, 2]))
# add the city coordinates at the first position before our grid centroid coordinates
coord_list_run <- c(city_coords, coord_list)
# run the ors matrix request
res1 = ors_matrix(
coord_list_run, # list of all grid and one single city coordinate at the top
sources = 0, # index of the coordiante which shall serve as origin. In our case the city. If we dont set it we get the matrix for all coords * all cords
metrics = c("duration", "distance"), # we want duration and distance, not only one of it
units = "km", # units is km
api_key = api_key, # api key
output = 'parsed') # output as parsed json
rm(coord_list_run) # get rid of the list of coordinates as we create a new one the next iteration with a different city at the top
# create a dataframe from the results if it is the first iteration, otherwise create and append to the existing one
if (i == 1) {
result_matrix <- data.frame(
city_id = cities_subset$rowid |> rep((res1$distances |> as.vector())[-1] |>
length()),
grid_id = grid1km_snapped_notNA$rowid,
durations = (res1$durations |> as.vector())[-1] |> as.numeric(),
distances = (res1$distances |> as.vector())[-1] |> as.numeric()
) |> tibble()
} else {
result_matrix <- rbind(result_matrix,data.frame(
city_id = cities_subset$rowid |> rep((res1$distances |> as.vector())[-1] |>
length()),
grid_id = grid1km_snapped_notNA$rowid,
durations = (res1$durations |> as.vector())[-1] |> as.numeric(),
distances = (res1$distances |> as.vector())[-1] |> as.numeric()
))
}
toc()
}
## run 1 / 12: 0.614 sec elapsed
## run 2 / 12: 0.582 sec elapsed
## run 3 / 12: 0.645 sec elapsed
## run 4 / 12: 0.615 sec elapsed
## run 5 / 12: 0.58 sec elapsed
## run 6 / 12: 0.544 sec elapsed
## run 7 / 12: 0.612 sec elapsed
## run 8 / 12: 0.612 sec elapsed
## run 9 / 12: 0.539 sec elapsed
## run 10 / 12: 0.599 sec elapsed
## run 11 / 12: 0.58 sec elapsed
## run 12 / 12: 0.519 sec elapsed
Awesome we now have the distances from all 12 cities to all grids. The resulting dataframe has 29916 rows. Now we want to find the city that is closest from every grid cell in terms of traveltime.
# Identify the lowest duration from each grid city combination
lowest_duration <- result_matrix |>
group_by(grid_id) |>
slice_min(durations, with_ties = F) |>
select(grid_id, city_id, durations, distances) |>
rename(shortest_duration = durations)
# Join the lowest duration to the grid data
grid1km_snapped_notNA <- grid1km_snapped_notNA |>
left_join(lowest_duration, by = c('rowid' = 'grid_id')) |>
mutate(shortest_duration = shortest_duration / 3600) # convert seconds to hours
names(grid1km_snapped_notNA)[c(6, 7)] <-
c('duration_h', 'distance_km') # account for naming
# join back in the not snapped grid cells. Those will bear a NA in the shortest_duration column
grid_final <- bind_rows(grid1km_snapped_notNA,
grid1km_snapped_NA)
In the following chunks some visualizations are executed to assess the results
So what is cumulative population/area catchment of each of the cities?
We join the cities again to get the actual names of the cities instead of ids only.
summary_by_city <- grid_final |> group_by(city_id) |>
summarize(pop=sum(pop, na.rm=T),count=n(), km2=n()*225) |>
st_drop_geometry() |>
left_join (cities |>
st_drop_geometry() |>
select(c(name, rowid)), by=c('city_id'='rowid'))
summary_by_city |>
ggplot(aes(x=name, y=pop, fill=name)) +
geom_bar(stat='identity') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) +
xlab('') + ylab('Population')
summary_by_city |>
ggplot(aes(x=name, y=km2, fill=name)) +
geom_bar(stat='identity') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) +
xlab('') + ylab('Area (km^2')
19694 grid centroids were not able to snap and therefore create a route to a city. This results in 4431150 km^2 area (94.77%) and 14782846.4804508 ( 93.01%) we cannot assess the closest city for.
What is the relation between Traveltime and population for each grid cell?
grid_final |>
ggplot(aes(x = duration_h, y = pop)) +
geom_point() + theme_minimal() +
xlab('Traveltime in h') + ylab('Population')
grid_final |>
ggplot(aes(x = duration_h, y = pop |> log())) +
geom_point() + theme_minimal() +
xlab('Traveltime in h') + ylab('Population log scaled')
Maps of population distribution, travel time to the closest city and the closest city with respective administrative boundaries and the greater cities locations.
tmap_mode('view')
tm_shape(grid_final, name = 'Population') +
tm_polygons(
'pop',
fill.scale = tm_scale_intervals(
values = '-viridis',
breaks = c(0, 1, 500, 1000, 5000, 10000, 50000, 2000000)
),
fill.title = tm_legend(title = 'Population'),
lwd = 0.1,
border = 'white'
) +
grid_final |> group_by(ADM2_PCODE) |> summarise(ADM2_PCODE = first(ADM2_PCODE)) |>
tm_shape(name = 'Admin(2) boundaries') +
tm_borders(lwd = 0.5) +
tm_shape(cities, name = "Greater City") +
tm_dots()