DuckDB: Data Extraction from geo-sorted ohsome contributions

DuckDB: Data Extraction from geo-sorted ohsome contributions#

Note

  1. Set the connection params to MinIO s3 object storage and configure DuckDB.

  2. Download the data in 2 steps:

    • Download and filter data with DuckDB in a single step.

    • Export results into geopackage file with GeoPandas.

Getting started#

Set connection params.

import os

s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here

Configure DuckDB.

import duckdb

con = duckdb.connect(
    config={
        'threads': 8,
        'max_memory': '8GB',
        # 'enable_object_cache': True
    }
)
con.install_extension("spatial")
con.load_extension("spatial")

Set connection to MinIO object storage.

query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
      TYPE S3,
      KEY_ID '{s3_user}',
      SECRET '{s3_password}',
      REGION 'eu-central-1',
      endpoint 'sotm2024.minio.heigit.org',
      use_ssl true,
      url_style 'path'
  );
"""
con.sql(query).show()
┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

Download with DuckDB#

In this step we can already filter all OSM contributions by four major factors. We will perform more detailed filtering (e.g. for OSM tags values) later:

  • status (e.g. latest, historic or deleted OSM features)

  • location (using the bounding box coordinates of each OSM feature)

  • geometry type (e.g. for Polygons, Linestrings or Points)

  • time (e.g. the edit timestamp of each OSM contribution)

# Set s3 path for parquet input data
#parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions_germany/**"
parquet_data_path = "s3a://heigit-ohsome-sotm24/data/geo_sort_ext/contributions/**"

# Define status filter
status = 'latest'

# Define location filter
bboxes = {
    'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),
    'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
    'mannheim': (8.41416, 49.410362, 8.58999, 49.590489), 
    'berlin': (13.088345, 52.338271, 13.761161, 52.675509)
}

selected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
area_of_interest_file =f"../data/{selected_region}.geojson"

# Define geometry type filter
geometry_type = 'Polygon'

# Define time filter (optional)
min_timestamp = '2024-01-01T00:00:00'
max_timestamp = '2024-06-01T00:00:00'

Furthermore, we define which attributes / columns this download should contain. Check out the dataset description page to get an overview on all available columns.

Usually you rarely want to extract all available columns as this would reduce speed of the data download. Here we are going to download the following information:

  • user_id

  • osm_id

  • osm_version

  • valid_from

  • tags

  • geometry

import time
start_time = time.time()

query = f"""
DROP TABLE IF EXISTS osm_data;
CREATE TABLE osm_data AS
(
SELECT
    a.user_id,
    a.osm_id,
    a.osm_version,
    a.valid_from,
    a.tags,
    a.geometry   
FROM
    read_parquet('{parquet_data_path}', hive_partitioning=true) a,
    st_read('{area_of_interest_file}') as aoi
WHERE 1=1
    and status = 'latest'
    and geometry_type = 'Polygon'
    and tags['building'][1] is not null
    and tags['building'][1] != 'no'
    -- spatial filtering part
     and (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})
     and (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax})
     and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)


download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 34.878 sec.

Save data as GeoPackage#

Show the structure of the data we have just downloaded.

query = """
DESCRIBE
FROM osm_data;
"""
con.sql(query)
┌─────────────┬───────────────────────┬─────────┬─────────┬─────────┬─────────┐
│ column_name │      column_type      │  null   │   key   │ default │  extra  │
│   varchar   │        varchar        │ varchar │ varchar │ varchar │ varchar │
├─────────────┼───────────────────────┼─────────┼─────────┼─────────┼─────────┤
│ user_id     │ INTEGER               │ YES     │ NULL    │ NULL    │ NULL    │
│ osm_id      │ VARCHAR               │ YES     │ NULL    │ NULL    │ NULL    │
│ osm_version │ INTEGER               │ YES     │ NULL    │ NULL    │ NULL    │
│ valid_from  │ TIMESTAMP             │ YES     │ NULL    │ NULL    │ NULL    │
│ tags        │ MAP(VARCHAR, VARCHAR) │ YES     │ NULL    │ NULL    │ NULL    │
│ geometry    │ VARCHAR               │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────┴───────────────────────┴─────────┴─────────┴─────────┴─────────┘

Inspect a few features.

query = """
SELECT *
FROM osm_data
LIMIT 5;
"""
con.sql(query)
┌─────────┬───────────────┬─────────────┬─────────────────────┬──────────────────────┬─────────────────────────────────┐
│ user_id │    osm_id     │ osm_version │     valid_from      │         tags         │            geometry             │
│  int32  │    varchar    │    int32    │      timestamp      │ map(varchar, varch…  │             varchar             │
├─────────┼───────────────┼─────────────┼─────────────────────┼──────────────────────┼─────────────────────────────────┤
│  202726 │ way/389789385 │           1 │ 2016-01-05 21:45:29 │ {building=yes, sou…  │ POLYGON ((8.6298593 49.379877…  │
│  202726 │ way/97716061  │           2 │ 2016-01-05 21:45:53 │ {building=yes, sou…  │ POLYGON ((8.6300405 49.379283…  │
│  769836 │ way/97720617  │           7 │ 2023-09-21 21:19:00 │ {building=apartmen…  │ POLYGON ((8.6330643 49.379926…  │
│  769836 │ way/97720623  │           7 │ 2023-09-21 21:19:00 │ {building=apartmen…  │ POLYGON ((8.632587599999999 4…  │
│  769836 │ way/97753107  │           7 │ 2023-09-26 19:35:10 │ {building=apartmen…  │ POLYGON ((8.6317677 49.379866…  │
└─────────┴───────────────┴─────────────┴─────────────────────┴──────────────────────┴─────────────────────────────────┘

Count the number of features.

query = f"""
SELECT count(*)
FROM osm_data
"""
con.sql(query)
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        21160 │
└──────────────┘

Export as GeoPackage via GeoPandas.

import geopandas as gpd

start_time = time.time()
query = f"""
    SELECT osm_data.*
    FROM
        osm_data,
        st_read('{area_of_interest_file}') as aoi
    WHERE 1=1
        and ST_Intersects(st_GeomFromText(osm_data.geometry), aoi.geom)
"""
df = con.sql(query).df()

gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')

output_filename = "../data/heidelberg_osm_data.gpkg"
gdf.to_file(output_filename, driver='GPKG')
processing_time = round(time.time() - start_time, 3)
print(f"processing took {processing_time} sec.")
processing took 1.778 sec.

Work with the data in QGIS#

Add your geopackage file in QGIS, e.g. via drag-and-drop or through file manager.