1 Intro

Quick exploratory analysis of cigarette vending machine locations in Heidelberg, uncovering insights and patterns to better understand their spatial distribution.

2 Setup

This section for reproducibility tbd.

2.1 libraries

# main libraries
library(tidyverse)
library(glue)
library(sf)
library(units)
library(ggplot2)
library(spatstat)
library(ggeffects)
library(tmap)
library(osmdata)
library(DT)
library(plyr)
library(tictoc)
library(openrouteservice)
library(jsonlite)

api_key <- readLines("config.txt", n = 1)

2.2 Data download & preprocess

# read in localy
# cvm <- st_read("cvm_data.gpkg", layer = "cvm", quiet=T)
# schools <- st_read("cvm_data.gpkg", layer = "schools", quiet=T)
# kindergartens <- st_read("cvm_data.gpkg", layer = "kindergarten", quiet=T)
# hd_boundary <- st_read("cvm_data.gpkg", layer = "hd bound", quiet=T)

# query osm via overpass
hd_boundary <- opq(bbox = 'Heidelberg, Germany') |>
  add_osm_feature(key = 'name', value = 'Heidelberg') |>
  add_osm_feature(key = 'admin_level', value = '6') |>
  osmdata_sf()
hd_boundary <- hd_boundary$osm_multipolygons

hd_districts <- opq(bbox = 'Heidelberg, Germany') |>
  add_osm_feature(key = 'admin_level', value = '9') |>
  osmdata_sf()
hd_districts <- hd_districts$osm_multipolygons

cvm <- opq(bbox = 'Heidelberg, Germany') |>
  add_osm_feature(key = 'vending', value = 'cigarettes') |>
  osmdata_sf()
cvm <- cvm$osm_points

osm_schools <- opq(bbox = 'Heidelberg, Germany') |>
  add_osm_feature(key = 'amenity', value = 'school') |>
  osmdata_sf()

osm_kindergarten <- opq(bbox = 'Heidelberg, Germany') |>
  add_osm_feature(key = 'amenity', value = 'kindergarten') |>
  osmdata_sf()




# Check for overlaps. Are schools/kindergartens all mapped with their building footprint or do we see some mapped as points only

# How many school points do not overlap with school polygons
# osm_schools$osm_points |> 
#   filter(!st_intersects(osm_schools$osm_polygons$geometry |> st_union(), sparse=F)) |> 
#   nrow()
# 
# # How many school polygons do not overlap with school multipolygons/relations
# osm_schools$osm_polygons |> 
#   st_point_on_surface() |> 
#   filter(!st_intersects(osm_schools$osm_multipolygons$geometry |> st_union(), sparse=F)) |> nrow()
# 
# # How many kindergarten points do not overlap with kindergarten polygons
# osm_kindergarten$osm_points |> 
#   filter(!st_intersects(osm_schools$osm_polygons$geometry |> st_union(), sparse=F)) |> 
#   nrow()
# 
# # How many kindergarten polygons do not overlap with kindergarten multipolygons/relations
# osm_kindergarten$osm_polygons |> 
#   st_point_on_surface() |> 
#   filter(!st_intersects(osm_schools$osm_multipolygons$geometry |> st_union(), sparse=F)) |> nrow()

# overview on types and objects
datatable(data.frame(object=c(rep("kindergarten",3),rep("school",3)),type=rep(c("nodes","ways","relations"),2), n=c(osm_kindergarten$osm_lines |> nrow(), osm_kindergarten$osm_polygons |> nrow(), osm_kindergarten$osm_multipolygons |> nrow(), osm_schools$osm_points |> nrow(), osm_schools$osm_polygons |> nrow(), osm_schools$osm_multipolygons |> nrow())))
hd_districts <- hd_districts |>
  filter(st_intersects(st_centroid(geometry), hd_boundary$geometry, sparse=F))


# generate the centroid/point of surface for all school and kindergarten buildings
schools <- osm_schools$osm_polygons |> st_point_on_surface() |>
  filter(st_intersects(geometry, hd_boundary$geometry, sparse=F))
kindergartens <- osm_kindergarten$osm_polygons |> st_point_on_surface() |>
  filter(st_intersects(geometry, hd_boundary$geometry, sparse=F))
cvm <- cvm |>
  filter(st_intersects(geometry, hd_boundary$geometry, sparse=F))

OpenStreetMap (OSM) is a collaborative map of the world created by people like you and me. The data model of OpenStreetMap consists of three main elements: nodes, ways, and relations.

  • Nodes: Nodes are the basic building blocks of the map. Think of them as small points on the map representing specific geographic features, such as a single tree, a streetlamp, a shop, or even a significant landmark like the Eiffel Tower. Each node is defined by its unique latitude and longitude coordinates, which pinpoint its exact location on the Earth’s surface.

  • Ways: Ways are like paths or lines connecting multiple nodes. They represent linear features like roads, rivers, railway tracks, and footpaths. A way is created by connecting multiple nodes in sequence. For example, a road can be represented as a way by connecting a series of nodes that follow the path of the road.

  • Relations: Relations are a way to describe more complex and interconnected features on the map. They group multiple nodes, ways, or even other relations together to represent things like bus routes, building outlines, or multi-part structures. Relations allow us to create more detailed and organized representations of the real world.

Example of a school relation in Heidelberg is the Kurfürst-Friedrich-Gymnasium which has 5 buildings and two distinct locations (https://www.openstreetmap.org/relation/13561009)

Amount of school buildings in Heidelberg: 49

Amount of kindergarten buildings in Heidelberg: 80

Amount of cigarette vending machines: 145

3 ESDA

Calculate distances between each school/kindergarten and CVM location. Aggregate the resulting matrix rowwise (by school/kindergarten) to get the minimum distance towards every distinct CVM.

dist_mat_schools <- st_distance(schools, cvm)
dist_mat_kindergartens <- st_distance(kindergartens, cvm)

distinct_min_schools <- apply(dist_mat_schools, 1, min)
distinct_min_kindergartens <- apply(dist_mat_kindergartens, 1, min)
Type Extreme Distance
kindergarte min 15.579
kindergarten max 2414.505
school min 39.773
school min 2330.165

Furthest distance towards a CVM is ~ 2600 Meter. We create a continuous dataframe with values from 0 to 2600 Meter and add cumulative counts for CVM within the respective distance to/from schools/kindergartens.

breaks <- seq(0, 2600, by = 1)  # Adjust the 'by' value as per your requirement

dist_counts_schools <- table(cut(distinct_min_schools, breaks = breaks, right = FALSE))
dist_counts_kindergartens <- table(cut(distinct_min_kindergartens, breaks = breaks, right = FALSE))

result_df <- data.frame(distance = breaks[2:length(breaks)],
                        cvm_count_schools = as.numeric(dist_counts_schools),
                        cvm_count_kindergartens = as.numeric(dist_counts_kindergartens))

result_df <- result_df |>
  mutate(cvm_count_schools_cum = cumsum(cvm_count_schools),
         cvm_count_kindergartens_cum = cumsum(cvm_count_kindergartens))

4 Viz

plot(result_df$distance, result_df$cvm_count_kindergartens_cum, type = "l", col = "blue", lwd = 2,
     xlab = "Distance in m", ylab = "cumulative count", main = "CVM distance to schools/kindergartens")
lines(result_df$distance, result_df$cvm_count_schools_cum, col = "green", lwd = 2)
legend("topleft", legend = c("kindergartens", "schools"), col = c("blue", "green"), lwd = 2, lty = 1)

# zoom 0 - 150

plot(result_df$distance[1:150], result_df$cvm_count_kindergartens_cum[1:150], type = "l", col = "blue", lwd = 2,
     xlab = "Distance in m", ylab = "cumulative count", main = "CVM distance to schools/kindergartens")
lines(result_df$distance[1:150], result_df$cvm_count_schools_cum[1:150], col = "green", lwd = 2)
legend("topleft", legend = c("kindergartens", "schools"), col = c("blue", "green"), lwd = 2, lty = 1)

The first plot indicates a steep growth in CVM locations within 0 - 500 meters distance to/from schools & kindergartens. However when zoomed in we see minimal growth in school locations but some growth in kindergarten locations below 100m.

But how many are exactly within 100m?

result_df |> 
  filter(distance<=100) |> 
  summarize(schools=sum(cvm_count_schools),
            kindergartens=sum(cvm_count_kindergartens))
##   schools kindergartens
## 1       2            18
tmap_mode("view")
## tmap mode set to interactive viewing
facilities <- rbind(schools |> select(c(amenity)),
                 kindergartens |> select(c(amenity)))

facilities_buff100 <- facilities |> st_transform(25832) |> st_buffer(100) |> st_transform(4326)

facilities_buff200 <- facilities |> st_transform(25832) |> st_buffer(200) |> st_transform(4326)

tm_basemap("OpenStreetMap") +
  tm_shape(facilities) +
  tm_symbols(
    col = "amenity",
    palette = c("darkgreen", "darkred"),
    size = 0.1,
    border.lwd = 0
  ) +
  tm_shape(cvm) +
  tm_symbols(col = "black", size = 0.1) +
  tm_shape(facilities_buff100) +
  tm_borders(col="grey50") +
  tm_shape(facilities_buff200) +
  tm_borders(col="grey25")

5 Model

schools_m <- glm(cvm_count_schools_cum ~ distance, data = result_df, family = poisson)
summary(schools_m)
## 
## Call:
## glm(formula = cvm_count_schools_cum ~ distance, family = poisson, 
##     data = result_df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 3.442e+00  6.491e-03  530.30   <2e-16 ***
## distance    2.314e-04  4.027e-06   57.48   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 12629.8  on 2599  degrees of freedom
## Residual deviance:  9296.5  on 2598  degrees of freedom
## AIC: 23452
## 
## Number of Fisher Scoring iterations: 4
ggpredict(schools_m)
## $distance
## # Predicted counts of cvm_count_schools_cum
## 
## distance | Predicted |         95% CI
## -------------------------------------
##        0 |     31.25 | [30.86, 31.66]
##      400 |     34.29 | [33.94, 34.63]
##      600 |     35.91 | [35.60, 36.23]
##     1000 |     39.39 | [39.13, 39.66]
##     1400 |     43.22 | [42.96, 43.47]
##     1600 |     45.26 | [44.99, 45.54]
##     2000 |     49.65 | [49.29, 50.02]
##     2600 |     57.05 | [56.43, 57.68]
## 
## Not all rows are shown in the ouput. Use `print(..., n = Inf)` to show
##   all rows.
## 
## attr(,"class")
## [1] "ggalleffects" "list"        
## attr(,"model.name")
## [1] "schools_m"
kindergarten_m <- glm(cvm_count_kindergartens_cum ~ distance, data = result_df, family = poisson)
summary(kindergarten_m)
## 
## Call:
## glm(formula = cvm_count_kindergartens_cum ~ distance, family = poisson, 
##     data = result_df)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 3.947e+00  5.067e-03  778.93   <2e-16 ***
## distance    2.181e-04  3.155e-06   69.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 15303  on 2599  degrees of freedom
## Residual deviance: 10486  on 2598  degrees of freedom
## AIC: 26039
## 
## Number of Fisher Scoring iterations: 4
ggpredict(kindergarten_m)
## $distance
## # Predicted counts of cvm_count_kindergartens_cum
## 
## distance | Predicted |         95% CI
## -------------------------------------
##        0 |     51.77 | [51.26, 52.29]
##      400 |     56.49 | [56.05, 56.93]
##      600 |     59.01 | [58.61, 59.42]
##     1000 |     64.39 | [64.05, 64.73]
##     1400 |     70.26 | [69.94, 70.58]
##     1600 |     73.39 | [73.05, 73.74]
##     2000 |     80.08 | [79.62, 80.55]
##     2600 |     91.28 | [90.50, 92.07]
## 
## Not all rows are shown in the ouput. Use `print(..., n = Inf)` to show
##   all rows.
## 
## attr(,"class")
## [1] "ggalleffects" "list"        
## attr(,"model.name")
## [1] "kindergarten_m"
cvm_ppp <- cvm |> st_transform(25832) |> as.ppp()
## Warning in as.ppp.sf(st_transform(cvm, 25832)): only first attribute column is
## used for marks
marks(cvm_ppp) <- NULL

hd_owin <- hd_boundary |> st_transform(25832) |> as.owin()
Window(cvm_ppp) <- hd_owin

ripley_K <- Kest(cvm_ppp, correction = "Ripley")
plot(ripley_K, main = "Ripley's K Function")

The K function indicates the location of CVMs are not subject to complete spatial randomness, but are clustered.

6 Surroundings of CVM’s in Heidelberg

How many CVM’s per Heidelberg district

# join the HD districts
cvm <- st_join(cvm, hd_districts |> select(name))
cvm_nogeom <- cvm
st_geometry(cvm_nogeom) <- NULL

cvm_nogeom |> dplyr::group_by(name) |> 
  dplyr::summarise(count=n()) |> arrange(desc(count)) |> datatable()

Next we use the POI endpoint of openrouteservice to get information of all POIs in 100m distance from each CVM in Heidelberg.

for (i in 1:nrow(cvm)){
  tic(glue("run {i} / {nrow(cvm)}"))
  single_cvm <- cvm[i,]
  
  geometry <- list(
    geojson = list(
      type = "Point",
      coordinates = single_cvm |> st_coordinates() |> c()
    ),
    buffer = 100
  )
  
  json_data <-
    ors_pois(
      request = 'stats',
      geometry = geometry,
      limit = 2000,
      sortby = "distance",
      api_key = api_key,
      output = "text"
    )
  
  
  
  
  parsed_data <- fromJSON(json_data)
  
  result <- lapply(parsed_data$places, function(x) {
  
  if (!(is.atomic(x))) {
    names <- names(x$categories)
    counts <- sapply(x$categories, function(y) y$count)
    data.frame(
      categories = names,
      count = counts,
      stringsAsFactors = FALSE
    )
  } else {
    NULL
  }
})
  result_df <- do.call(rbind, result)
  result_df$places <- rownames(result_df)
  rownames(result_df) <- NULL
  result_df$cvm_osm_id <- single_cvm$osm_id
  result_df$name <- single_cvm$name
  
  
  
  result_df$places <- gsub("\\..*", "", result_df$places)
  

  
  if (i == 1) {
    final <- result_df
  } else {
    final <- rbind.fill(final, result_df)
  }
  toc(quiet = T)
  
}

Pie charts on POIs around CVM’s in Heidelberg and by district.

#categories_agg <- final |> dplyr::group_by(categories, name) |> 
#  dplyr::summarise(total_count=sum(count))

places_acc <- final |> dplyr::group_by(places, name) |> 
  dplyr::summarise(total_count=sum(count))
## `summarise()` has grouped output by 'places'. You can override using the
## `.groups` argument.
ggplot(places_acc, aes(x="", y=total_count, fill=places)) +
  geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0)

ggplot(places_acc, aes(x="", y=total_count, fill=places)) +
  geom_bar(width = 1, stat = "identity")+
  facet_wrap(vars(name)) + coord_polar("y", start=0)