DuckDB: Currentness of Buildings#

In this notebook we demonstrate how to analyze and visualize the up-to-date-ness or currentness of the latest OSM data.

These are the steps you see further down:

  • Set the connection parameters.

  • Prepare your input parameters, e.g. define area of interest and OSM tag filter.

  • Download data using DuckDB.

  • Plot currentness chart with Polars.

  • Create a Map, an interactive Slider to filter the map data.

Getting started#

Set connection params.

import os

s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here

Configure DuckDB.

import duckdb

con = duckdb.connect(
    config={
        'threads': 8,
        'max_memory': '8GB'
    }
)
con.install_extension("spatial")
con.load_extension("spatial")

Set connection to MinIO object storage.

query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
      TYPE S3,
      KEY_ID '{s3_user}',
      SECRET '{s3_password}',
      REGION 'eu-central-1',
      endpoint 'sotm2024.minio.heigit.org',
      use_ssl true,
      url_style 'path'
  );
"""
con.sql(query).show()
┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

Prepare the input parameters for your analysis#

# Set s3 path for parquet input data
parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany/**"
#parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions/**"

# Define status filter
status = 'latest'

# Define location filter
bboxes = {
    'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),
    'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
    'mannheim': (8.41416, 49.410362, 8.58999, 49.590489), 
    'berlin': (13.088345, 52.338271, 13.761161, 52.675509)
}

selected_region = 'heidelberg'
xmin, ymin, xmax, ymax = bboxes[selected_region]
area_of_interest_file =f"../data/{selected_region}.geojson"

# Define geometry type filter
geometry_type = 'Polygon'

Get the Data#

import time
start_time = time.time()

query = f"""
DROP TABLE IF EXISTS osm_data;
CREATE TABLE osm_data AS 
(
SELECT
    a.valid_from,
    a.tags,
    a.geometry   
FROM
    read_parquet('{parquet_data_path}', hive_partitioning=true) as a,
    st_read('{area_of_interest_file}') as aoi
WHERE 1=1
    and status = '{status}'
    and geometry_type = '{geometry_type}'
    and tags['building'][1] is not null
    and tags['building'][1] != 'no'
    -- spatial filtering part
    and (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})
    and (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax})
    and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)

download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 7.882 sec.

Plot currentness chart with polars and hvplot#

import polars as pl

chart_query = """
SELECT
    date_trunc('month', valid_from) as month,
    count(*) as n_edits
    FROM osm_data
    GROUP BY month
    ORDER BY month
"""

df = con.sql(chart_query).pl()
df.plot.step(
    x="month",
    y="n_edits"
)

Display currentness of OSM features on map#

Get data from DucDKB into GeoPandas dataframe.

import geopandas as gpd

map_query = """
    SELECT
        epoch_ms(valid_from) as valid_from,
        geometry
    FROM osm_data;
"""

df = con.sql(map_query).df()

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')

Define map parameters and style.

import datetime
import lonboard
from palettable.matplotlib import Viridis_20


# compute lonboard color style for contious color map
min_valid_from = 1000 * datetime.datetime(2007,1,1).timestamp()
max_valid_from = 1000 * datetime.datetime(2024,6,1).timestamp()

# normalized color values from 0 to 1
valid_from_style = gdf["valid_from"].apply(
    lambda x: (x - min_valid_from) / (max_valid_from - min_valid_from))


# the lonboard map definition
layer = lonboard.SolidPolygonLayer.from_geopandas(
    gdf,
    extensions=[lonboard.layer_extension.DataFilterExtension(filter_size=1)],
    get_filter_value=gdf["valid_from"],  # replace with desired column
    filter_range=[min_valid_from, max_valid_from],  # replace with desired filter range
    get_fill_color=lonboard.colormap.apply_continuous_cmap(valid_from_style, Viridis_20, alpha=1)

)

currentness_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
    layers=[layer]
)

Define Dates Slider Selection Widget and link to map filter range.

from datetime import date, timedelta
import ipywidgets
from traitlets import directional_link

start = datetime.datetime(2007,1,1)
end = datetime.datetime(2024,6,1)
delta = end - start   # returns timedelta
dates = [start + timedelta(days=i) for i in range(delta.days + 1)]
options = [(i.strftime('%d-%b-%Y'), int(1000* i.timestamp())) for i in dates]

date_slider = ipywidgets.SelectionRangeSlider(
    options=options,
    index=(0, len(dates)-1),
    description='Last Edit:',
    layout=ipywidgets.Layout(width='1000px'),
    disabled=False
)

directional_link(
    (date_slider, 'value'),
    (layer, "filter_range"),
    #(slider, 'value')
)
<traitlets.traitlets.directional_link at 0x7cdde89582c0>

Display the map. Have fun exploring and moving around the time slider!

display(currentness_map, date_slider)