Attribute Completeness for Street Surface Tags#

In this notebook we demonstrate how can assess the completeness of tags in OSM.

These are the steps you see further down:

  • Set the connection parameters.

  • Prepare your input parameters, e.g. define area of interest and attribute filters.

  • Download data using PyIceberg and DuckDB.

  • Filter and process data with DuckDB.

  • Visualize the results on a map.

Getting started#

Set connection params.

from dotenv import load_dotenv
load_dotenv()
True
import os

s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here

Configure DuckDB.

!pip install duckdb==1.0.0
import duckdb

con = duckdb.connect(
    config={
        'threads': 8,
        'max_memory': '8GB',
    }
)
con.install_extension("spatial")
con.load_extension("spatial")

Set the connection params to Iceberg Rest Catalog.

!pip install "pyiceberg[s3fs,duckdb,sql-sqlite,pyarrow]"
from pyiceberg.catalog.rest import RestCatalog

catalog = RestCatalog(
    name="default",
    **{
        "uri": "https://sotm2024.iceberg.ohsome.org",
        "s3.endpoint": "https://sotm2024.minio.heigit.org",
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": s3_user,
        "s3.secret-access-key": s3_password,
        "s3.region": "eu-central-1"
    }
)

Set connection to MinIO object storage.

query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
      TYPE S3,
      KEY_ID '{s3_user}',
      SECRET '{s3_password}',
      REGION 'eu-central-1',
      endpoint 'sotm2024.minio.heigit.org',
      use_ssl true,
      url_style 'path'
  );
"""
con.sql(query).show()
┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

Prepare the input parameters for your analysis#

# Set iceberg table
namespace = 'geo_sort'
tablename = 'contributions'
icebergtable = catalog.load_table((namespace, tablename))


# Define status filter
status = 'latest'


# Define location filter
bboxes = {
    'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),
    'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
    'mannheim': (8.41416, 49.410362, 8.58999, 49.590489)
}

selected_region = 'mannheim'
xmin, ymin, xmax, ymax = bboxes[selected_region]
area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"


# Define geometry type filter
geometry_type = 'LineString'

Get the Data#

First, we do an iceberg table scan with a pre-filter. This is a fast way to download all potential OSM elements that are needed for our analysis.

import time
start_time = time.time()

icebergtable.scan(
    row_filter=(
        f"status = '{status}' "
        f"and geometry_type = '{geometry_type}' "
        f"and (xmax >= {xmin} and xmin <= {xmax}) "
        f"and (ymax >= {ymin} and ymin <= {ymax}) "
    ),
    selected_fields=(
        "osm_id",
        "tags",
        "length",
        "geometry",
    ),
).to_duckdb('raw_osm_data',connection=con)

download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 14.455 sec.

Filter and process data with DuckDB#

Here we extract the tag values for highways and their correspoding road surface type (if mapped). We also clip the road geometry to the area of interest and calculate the length for the clipped geometry.

import time
start_time = time.time()

query = f"""
DROP TABLE IF EXISTS osm_data;
CREATE TABLE osm_data AS
(
SELECT
    tags['highway'][1] as highway_tag_value,
    tags['surface'][1] as surface_tag_value,
    ST_GeomFromText(a.geometry) as osm_geom,
    CASE
        WHEN ST_Within(osm_geom, aoi.geom) THEN osm_geom
        ELSE ST_Intersection(osm_geom, aoi.geom)
    END as clipped_geometry,
    CASE
        WHEN ST_Within(osm_geom, aoi.geom) THEN length
        ELSE ST_Length_Spheroid(clipped_geometry) / 1000
    END as length_km
FROM
    raw_osm_data as a,
    st_read('{area_of_interest_file}') as aoi
WHERE 1=1
    and tags['highway'][1] is not null
    -- spatial filtering part
    and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)

processing_time = round(time.time() - start_time, 3)
print(f"processing took {processing_time} sec.")
processing took 4.59 sec.

Proporation of Roads with Surface Tag#

Let’s inspect how many roads have a surface tag.

query = f"""
SELECT
    SUM(length_km) as length_km_total,
    SUM(CASE
      WHEN surface_tag_value IS NOT NULL THEN length_km
      ELSE 0
    END) as length_km_with_surface_tag,
    round(length_km_with_surface_tag / length_km_total, 3) as proportion_with_surface_tag
FROM osm_data
"""
con.sql(query)
┌──────────────────┬────────────────────────────┬─────────────────────────────┐
│ length_km_total  │ length_km_with_surface_tag │ proportion_with_surface_tag │
│      double      │           double           │           double            │
├──────────────────┼────────────────────────────┼─────────────────────────────┤
│ 3285612.61274888 │          2259704.672583999 │                       0.688 │
└──────────────────┴────────────────────────────┴─────────────────────────────┘

Proporation of Roads with Surface Tag per Road Type#

We can break down by road type. For some road types the surface tag is mapped better than for others.

query = f"""
SELECT
    highway_tag_value,
    SUM(length_km) as length_km_total,
    SUM(CASE
      WHEN surface_tag_value IS NOT NULL THEN length_km
      ELSE 0
    END) as length_km_with_surface_tag,
    round(length_km_with_surface_tag / length_km_total, 3) as proportion_with_surface_tag
FROM osm_data
GROUP BY highway_tag_value
ORDER BY length_km_total DESC
"""
df = con.sql(query).df()

display(df)
highway_tag_value length_km_total length_km_with_surface_tag proportion_with_surface_tag
0 service 694704.602918 278210.148166 0.400
1 footway 546853.269568 307684.651441 0.563
2 track 477267.685579 285274.586297 0.598
3 residential 464907.754558 447053.629756 0.962
4 path 446605.435393 343686.039051 0.770
5 unclassified 118270.021870 97051.021870 0.821
6 tertiary 110620.269124 109950.269124 0.994
7 secondary 83886.049524 83724.049524 0.998
8 primary 70041.951266 69604.951266 0.994
9 living_street 48703.000000 42401.000000 0.871
10 motorway 47689.847847 47511.847847 0.996
11 cycleway 36750.000000 31285.000000 0.851
12 trunk 26581.871319 25673.869133 0.966
13 bridleway 25781.881633 18633.871521 0.723
14 motorway_link 18060.630577 17632.630577 0.976
15 trunk_link 14148.107011 13349.107011 0.944
16 pedestrian 12229.000000 10507.000000 0.859
17 secondary_link 11996.000000 11982.000000 0.999
18 steps 11948.002884 5219.000000 0.437
19 primary_link 10418.000000 10171.000000 0.976
20 construction 2905.231679 620.000000 0.213
21 tertiary_link 1958.000000 1902.000000 0.971
22 corridor 935.000000 339.000000 0.363
23 busway 818.000000 49.000000 0.060
24 proposed 612.000000 0.000000 0.000
25 platform 441.000000 0.000000 0.000
26 raceway 189.000000 189.000000 1.000
27 fi 177.000000 0.000000 0.000
28 bus_stop 82.000000 0.000000 0.000
29 street_lamp 32.000000 0.000000 0.000

Map of roads with missing road surface information#

import geopandas as gpd


query = f"""
SELECT
    highway_tag_value,
    surface_tag_value,
    CASE 
       WHEN surface_tag_value IS NULL THEN 'missing'
       WHEN list_contains(['paved', 'asphalt', 'chipseal', 'concrete', 'concrete:lanes', 'concrete:plates', 'paving_stones', 'sett', 'unhewn_cobblestone', 'cobblestone', 'bricks', 'metal', 'wood'], surface_tag_value) THEN 'paved'
       WHEN list_contains(['unpaved', 'compacted', 'fine_gravel', 'gravel', 'shells', 'rock', 'pebblestone', 'ground', 'dirt', 'earth', 'grass', 'grass_paver', 'metal_grid', 'mud', 'sand', 'woodchips', 'snow', 'ice', 'salt'], surface_tag_value) THEN 'unpaved'
       ELSE 'other'
    END as surface_type,
    ST_AsText(clipped_geometry) as geometry
FROM osm_data
WHERE ST_GeometryType(clipped_geometry) = 'LINESTRING'
"""
df = con.sql(query).df()

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')

Define map parameters and style.

import lonboard
from palettable.matplotlib import Viridis_20


gdf["surface_type_codes"] = gdf["surface_type"].astype('category').cat.codes

surface_type_colormap = {
    "missing": [228,26,28, 255],
    "other": [152,78,163, 255],
    "paved":  [55,126,184, 255],
    "unpaved": [77,175,74, 255]
}


# the lonboard map definition
layer = lonboard.PathLayer.from_geopandas(
    gdf,
    get_color=lonboard.colormap.apply_categorical_cmap(gdf["surface_type"], surface_type_colormap, alpha=1),
    get_width=1.2,
    width_units='pixels',
    extensions=[lonboard.layer_extension.DataFilterExtension(filter_size=1)],
    get_filter_value=gdf["surface_type_codes"].astype('float'),  # replace with desired column
    filter_range=[0.0, 3.0],  # replace with desired filter range
)


highway_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.Positron,
    layers=[layer],
)

Set toggle buttons for different road surface types.

import ipywidgets
from traitlets import directional_link

group_options = [(key, i) for i, key in enumerate(surface_type_colormap.keys()) ]

surface_toggle = ipywidgets.ToggleButtons(
    options=group_options,
    description='Surface Type:',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
)

directional_link(
    (surface_toggle, 'value'),
    (layer, "filter_range"),
    transform= lambda v: (v, v)
)
<traitlets.traitlets.directional_link at 0x7ac9f815d6d0>

Display the map. You can click on the buttons to change the surface type displayed.

display(highway_map, surface_toggle)