Attribute Completeness for Street Surface Tags

Attribute Completeness for Street Surface Tags#

In this notebook we demonstrate how can assess the completeness of tags in OSM.

These are the steps you see further down:

Set the connection parameters.
Prepare your input parameters, e.g. define area of interest and attribute filters.
Download data using PyIceberg and DuckDB.
Filter and process data with DuckDB.
Visualize the results on a map.

Getting started#

Set connection params.

from dotenv import load_dotenv
load_dotenv()

True

import os

s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here

Configure DuckDB.

!pip install duckdb==1.0.0

import duckdb

con = duckdb.connect(
    config={
        'threads': 8,
        'max_memory': '8GB',
    }
)
con.install_extension("spatial")
con.load_extension("spatial")

Set the connection params to Iceberg Rest Catalog.

!pip install "pyiceberg[s3fs,duckdb,sql-sqlite,pyarrow]"

from pyiceberg.catalog.rest import RestCatalog

catalog = RestCatalog(
    name="default",
    **{
        "uri": "https://sotm2024.iceberg.ohsome.org",
        "s3.endpoint": "https://sotm2024.minio.heigit.org",
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": s3_user,
        "s3.secret-access-key": s3_password,
        "s3.region": "eu-central-1"
    }
)

Set connection to MinIO object storage.

query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
      TYPE S3,
      KEY_ID '{s3_user}',
      SECRET '{s3_password}',
      REGION 'eu-central-1',
      endpoint 'sotm2024.minio.heigit.org',
      use_ssl true,
      url_style 'path'
  );
"""
con.sql(query).show()

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

Prepare the input parameters for your analysis#

# Set iceberg table
namespace = 'geo_sort'
tablename = 'contributions'
icebergtable = catalog.load_table((namespace, tablename))


# Define status filter
status = 'latest'


# Define location filter
bboxes = {
    'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),
    'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
    'mannheim': (8.41416, 49.410362, 8.58999, 49.590489)
}

selected_region = 'mannheim'
xmin, ymin, xmax, ymax = bboxes[selected_region]
area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"


# Define geometry type filter
geometry_type = 'LineString'

Get the Data#

First, we do an iceberg table scan with a pre-filter. This is a fast way to download all potential OSM elements that are needed for our analysis.

import time
start_time = time.time()

icebergtable.scan(
    row_filter=(
        f"status = '{status}' "
        f"and geometry_type = '{geometry_type}' "
        f"and (xmax >= {xmin} and xmin <= {xmax}) "
        f"and (ymax >= {ymin} and ymin <= {ymax}) "
    ),
    selected_fields=(
        "osm_id",
        "tags",
        "length",
        "geometry",
    ),
).to_duckdb('raw_osm_data',connection=con)

download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")

download took 14.455 sec.

Filter and process data with DuckDB#

Here we extract the tag values for highways and their correspoding road surface type (if mapped). We also clip the road geometry to the area of interest and calculate the length for the clipped geometry.

import time
start_time = time.time()

query = f"""
DROP TABLE IF EXISTS osm_data;
CREATE TABLE osm_data AS
(
SELECT
    tags['highway'][1] as highway_tag_value,
    tags['surface'][1] as surface_tag_value,
    ST_GeomFromText(a.geometry) as osm_geom,
    CASE
        WHEN ST_Within(osm_geom, aoi.geom) THEN osm_geom
        ELSE ST_Intersection(osm_geom, aoi.geom)
    END as clipped_geometry,
    CASE
        WHEN ST_Within(osm_geom, aoi.geom) THEN length
        ELSE ST_Length_Spheroid(clipped_geometry) / 1000
    END as length_km
FROM
    raw_osm_data as a,
    st_read('{area_of_interest_file}') as aoi
WHERE 1=1
    and tags['highway'][1] is not null
    -- spatial filtering part
    and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)

processing_time = round(time.time() - start_time, 3)
print(f"processing took {processing_time} sec.")

processing took 4.59 sec.

Proporation of Roads with Surface Tag#

Let’s inspect how many roads have a surface tag.

query = f"""
SELECT
    SUM(length_km) as length_km_total,
    SUM(CASE
      WHEN surface_tag_value IS NOT NULL THEN length_km
      ELSE 0
    END) as length_km_with_surface_tag,
    round(length_km_with_surface_tag / length_km_total, 3) as proportion_with_surface_tag
FROM osm_data
"""
con.sql(query)

┌──────────────────┬────────────────────────────┬─────────────────────────────┐
│ length_km_total  │ length_km_with_surface_tag │ proportion_with_surface_tag │
│      double      │           double           │           double            │
├──────────────────┼────────────────────────────┼─────────────────────────────┤
│ 3285612.61274888 │          2259704.672583999 │                       0.688 │
└──────────────────┴────────────────────────────┴─────────────────────────────┘

Proporation of Roads with Surface Tag per Road Type#

We can break down by road type. For some road types the surface tag is mapped better than for others.

query = f"""
SELECT
    highway_tag_value,
    SUM(length_km) as length_km_total,
    SUM(CASE
      WHEN surface_tag_value IS NOT NULL THEN length_km
      ELSE 0
    END) as length_km_with_surface_tag,
    round(length_km_with_surface_tag / length_km_total, 3) as proportion_with_surface_tag
FROM osm_data
GROUP BY highway_tag_value
ORDER BY length_km_total DESC
"""
df = con.sql(query).df()

display(df)

	highway_tag_value	length_km_total	length_km_with_surface_tag	proportion_with_surface_tag
0	service	694704.602918	278210.148166	0.400
1	footway	546853.269568	307684.651441	0.563
2	track	477267.685579	285274.586297	0.598
3	residential	464907.754558	447053.629756	0.962
4	path	446605.435393	343686.039051	0.770
5	unclassified	118270.021870	97051.021870	0.821
6	tertiary	110620.269124	109950.269124	0.994
7	secondary	83886.049524	83724.049524	0.998
8	primary	70041.951266	69604.951266	0.994
9	living_street	48703.000000	42401.000000	0.871
10	motorway	47689.847847	47511.847847	0.996
11	cycleway	36750.000000	31285.000000	0.851
12	trunk	26581.871319	25673.869133	0.966
13	bridleway	25781.881633	18633.871521	0.723
14	motorway_link	18060.630577	17632.630577	0.976
15	trunk_link	14148.107011	13349.107011	0.944
16	pedestrian	12229.000000	10507.000000	0.859
17	secondary_link	11996.000000	11982.000000	0.999
18	steps	11948.002884	5219.000000	0.437
19	primary_link	10418.000000	10171.000000	0.976
20	construction	2905.231679	620.000000	0.213
21	tertiary_link	1958.000000	1902.000000	0.971
22	corridor	935.000000	339.000000	0.363
23	busway	818.000000	49.000000	0.060
24	proposed	612.000000	0.000000	0.000
25	platform	441.000000	0.000000	0.000
26	raceway	189.000000	189.000000	1.000
27	fi	177.000000	0.000000	0.000
28	bus_stop	82.000000	0.000000	0.000
29	street_lamp	32.000000	0.000000	0.000

Map of roads with missing road surface information#

import geopandas as gpd


query = f"""
SELECT
    highway_tag_value,
    surface_tag_value,
    CASE 
       WHEN surface_tag_value IS NULL THEN 'missing'
       WHEN list_contains(['paved', 'asphalt', 'chipseal', 'concrete', 'concrete:lanes', 'concrete:plates', 'paving_stones', 'sett', 'unhewn_cobblestone', 'cobblestone', 'bricks', 'metal', 'wood'], surface_tag_value) THEN 'paved'
       WHEN list_contains(['unpaved', 'compacted', 'fine_gravel', 'gravel', 'shells', 'rock', 'pebblestone', 'ground', 'dirt', 'earth', 'grass', 'grass_paver', 'metal_grid', 'mud', 'sand', 'woodchips', 'snow', 'ice', 'salt'], surface_tag_value) THEN 'unpaved'
       ELSE 'other'
    END as surface_type,
    ST_AsText(clipped_geometry) as geometry
FROM osm_data
WHERE ST_GeometryType(clipped_geometry) = 'LINESTRING'
"""
df = con.sql(query).df()

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')

Define map parameters and style.

import lonboard
from palettable.matplotlib import Viridis_20


gdf["surface_type_codes"] = gdf["surface_type"].astype('category').cat.codes

surface_type_colormap = {
    "missing": [228,26,28, 255],
    "other": [152,78,163, 255],
    "paved":  [55,126,184, 255],
    "unpaved": [77,175,74, 255]
}


# the lonboard map definition
layer = lonboard.PathLayer.from_geopandas(
    gdf,
    get_color=lonboard.colormap.apply_categorical_cmap(gdf["surface_type"], surface_type_colormap, alpha=1),
    get_width=1.2,
    width_units='pixels',
    extensions=[lonboard.layer_extension.DataFilterExtension(filter_size=1)],
    get_filter_value=gdf["surface_type_codes"].astype('float'),  # replace with desired column
    filter_range=[0.0, 3.0],  # replace with desired filter range
)


highway_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.Positron,
    layers=[layer],
)

Set toggle buttons for different road surface types.

import ipywidgets
from traitlets import directional_link

group_options = [(key, i) for i, key in enumerate(surface_type_colormap.keys()) ]

surface_toggle = ipywidgets.ToggleButtons(
    options=group_options,
    description='Surface Type:',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
)

directional_link(
    (surface_toggle, 'value'),
    (layer, "filter_range"),
    transform= lambda v: (v, v)
)

<traitlets.traitlets.directional_link at 0x7ac9f815d6d0>

Display the map. You can click on the buttons to change the surface type displayed.

display(highway_map, surface_toggle)