Attribute Completeness for Street Surface Tags#
In this notebook we demonstrate how can assess the completeness of tags in OSM.
These are the steps you see further down:
Set the connection parameters.
Prepare your input parameters, e.g. define area of interest and attribute filters.
Download data using PyIceberg and DuckDB.
Filter and process data with DuckDB.
Visualize the results on a map.
Getting started#
Set connection params.
from dotenv import load_dotenv
load_dotenv()
True
import os
s3_user = os.environ["S3_ACCESS_KEY_ID"] # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"] # add your password here
Configure DuckDB.
!pip install duckdb==1.0.0
import duckdb
con = duckdb.connect(
config={
'threads': 8,
'max_memory': '8GB',
}
)
con.install_extension("spatial")
con.load_extension("spatial")
Set the connection params to Iceberg Rest Catalog.
!pip install "pyiceberg[s3fs,duckdb,sql-sqlite,pyarrow]"
from pyiceberg.catalog.rest import RestCatalog
catalog = RestCatalog(
name="default",
**{
"uri": "https://sotm2024.iceberg.ohsome.org",
"s3.endpoint": "https://sotm2024.minio.heigit.org",
"py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
"s3.access-key-id": s3_user,
"s3.secret-access-key": s3_password,
"s3.region": "eu-central-1"
}
)
Set connection to MinIO object storage.
query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
TYPE S3,
KEY_ID '{s3_user}',
SECRET '{s3_password}',
REGION 'eu-central-1',
endpoint 'sotm2024.minio.heigit.org',
use_ssl true,
url_style 'path'
);
"""
con.sql(query).show()
┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true │
└─────────┘
Prepare the input parameters for your analysis#
# Set iceberg table
namespace = 'geo_sort'
tablename = 'contributions'
icebergtable = catalog.load_table((namespace, tablename))
# Define status filter
status = 'latest'
# Define location filter
bboxes = {
'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),
'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
'mannheim': (8.41416, 49.410362, 8.58999, 49.590489)
}
selected_region = 'mannheim'
xmin, ymin, xmax, ymax = bboxes[selected_region]
area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"
# Define geometry type filter
geometry_type = 'LineString'
Get the Data#
First, we do an iceberg table scan with a pre-filter. This is a fast way to download all potential OSM elements that are needed for our analysis.
import time
start_time = time.time()
icebergtable.scan(
row_filter=(
f"status = '{status}' "
f"and geometry_type = '{geometry_type}' "
f"and (xmax >= {xmin} and xmin <= {xmax}) "
f"and (ymax >= {ymin} and ymin <= {ymax}) "
),
selected_fields=(
"osm_id",
"tags",
"length",
"geometry",
),
).to_duckdb('raw_osm_data',connection=con)
download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 14.455 sec.
Filter and process data with DuckDB#
Here we extract the tag values for highways and their correspoding road surface type (if mapped). We also clip the road geometry to the area of interest and calculate the length for the clipped geometry.
import time
start_time = time.time()
query = f"""
DROP TABLE IF EXISTS osm_data;
CREATE TABLE osm_data AS
(
SELECT
tags['highway'][1] as highway_tag_value,
tags['surface'][1] as surface_tag_value,
ST_GeomFromText(a.geometry) as osm_geom,
CASE
WHEN ST_Within(osm_geom, aoi.geom) THEN osm_geom
ELSE ST_Intersection(osm_geom, aoi.geom)
END as clipped_geometry,
CASE
WHEN ST_Within(osm_geom, aoi.geom) THEN length
ELSE ST_Length_Spheroid(clipped_geometry) / 1000
END as length_km
FROM
raw_osm_data as a,
st_read('{area_of_interest_file}') as aoi
WHERE 1=1
and tags['highway'][1] is not null
-- spatial filtering part
and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)
processing_time = round(time.time() - start_time, 3)
print(f"processing took {processing_time} sec.")
processing took 4.59 sec.
Proporation of Roads with Surface Tag#
Let’s inspect how many roads have a surface tag.
query = f"""
SELECT
SUM(length_km) as length_km_total,
SUM(CASE
WHEN surface_tag_value IS NOT NULL THEN length_km
ELSE 0
END) as length_km_with_surface_tag,
round(length_km_with_surface_tag / length_km_total, 3) as proportion_with_surface_tag
FROM osm_data
"""
con.sql(query)
┌──────────────────┬────────────────────────────┬─────────────────────────────┐
│ length_km_total │ length_km_with_surface_tag │ proportion_with_surface_tag │
│ double │ double │ double │
├──────────────────┼────────────────────────────┼─────────────────────────────┤
│ 3285612.61274888 │ 2259704.672583999 │ 0.688 │
└──────────────────┴────────────────────────────┴─────────────────────────────┘
Proporation of Roads with Surface Tag per Road Type#
We can break down by road type. For some road types the surface tag is mapped better than for others.
query = f"""
SELECT
highway_tag_value,
SUM(length_km) as length_km_total,
SUM(CASE
WHEN surface_tag_value IS NOT NULL THEN length_km
ELSE 0
END) as length_km_with_surface_tag,
round(length_km_with_surface_tag / length_km_total, 3) as proportion_with_surface_tag
FROM osm_data
GROUP BY highway_tag_value
ORDER BY length_km_total DESC
"""
df = con.sql(query).df()
display(df)
highway_tag_value | length_km_total | length_km_with_surface_tag | proportion_with_surface_tag | |
---|---|---|---|---|
0 | service | 694704.602918 | 278210.148166 | 0.400 |
1 | footway | 546853.269568 | 307684.651441 | 0.563 |
2 | track | 477267.685579 | 285274.586297 | 0.598 |
3 | residential | 464907.754558 | 447053.629756 | 0.962 |
4 | path | 446605.435393 | 343686.039051 | 0.770 |
5 | unclassified | 118270.021870 | 97051.021870 | 0.821 |
6 | tertiary | 110620.269124 | 109950.269124 | 0.994 |
7 | secondary | 83886.049524 | 83724.049524 | 0.998 |
8 | primary | 70041.951266 | 69604.951266 | 0.994 |
9 | living_street | 48703.000000 | 42401.000000 | 0.871 |
10 | motorway | 47689.847847 | 47511.847847 | 0.996 |
11 | cycleway | 36750.000000 | 31285.000000 | 0.851 |
12 | trunk | 26581.871319 | 25673.869133 | 0.966 |
13 | bridleway | 25781.881633 | 18633.871521 | 0.723 |
14 | motorway_link | 18060.630577 | 17632.630577 | 0.976 |
15 | trunk_link | 14148.107011 | 13349.107011 | 0.944 |
16 | pedestrian | 12229.000000 | 10507.000000 | 0.859 |
17 | secondary_link | 11996.000000 | 11982.000000 | 0.999 |
18 | steps | 11948.002884 | 5219.000000 | 0.437 |
19 | primary_link | 10418.000000 | 10171.000000 | 0.976 |
20 | construction | 2905.231679 | 620.000000 | 0.213 |
21 | tertiary_link | 1958.000000 | 1902.000000 | 0.971 |
22 | corridor | 935.000000 | 339.000000 | 0.363 |
23 | busway | 818.000000 | 49.000000 | 0.060 |
24 | proposed | 612.000000 | 0.000000 | 0.000 |
25 | platform | 441.000000 | 0.000000 | 0.000 |
26 | raceway | 189.000000 | 189.000000 | 1.000 |
27 | fi | 177.000000 | 0.000000 | 0.000 |
28 | bus_stop | 82.000000 | 0.000000 | 0.000 |
29 | street_lamp | 32.000000 | 0.000000 | 0.000 |
Map of roads with missing road surface information#
import geopandas as gpd
query = f"""
SELECT
highway_tag_value,
surface_tag_value,
CASE
WHEN surface_tag_value IS NULL THEN 'missing'
WHEN list_contains(['paved', 'asphalt', 'chipseal', 'concrete', 'concrete:lanes', 'concrete:plates', 'paving_stones', 'sett', 'unhewn_cobblestone', 'cobblestone', 'bricks', 'metal', 'wood'], surface_tag_value) THEN 'paved'
WHEN list_contains(['unpaved', 'compacted', 'fine_gravel', 'gravel', 'shells', 'rock', 'pebblestone', 'ground', 'dirt', 'earth', 'grass', 'grass_paver', 'metal_grid', 'mud', 'sand', 'woodchips', 'snow', 'ice', 'salt'], surface_tag_value) THEN 'unpaved'
ELSE 'other'
END as surface_type,
ST_AsText(clipped_geometry) as geometry
FROM osm_data
WHERE ST_GeometryType(clipped_geometry) = 'LINESTRING'
"""
df = con.sql(query).df()
# convert the data to geodata
gdf = gpd.GeoDataFrame(
df,
geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')
Define map parameters and style.
import lonboard
from palettable.matplotlib import Viridis_20
gdf["surface_type_codes"] = gdf["surface_type"].astype('category').cat.codes
surface_type_colormap = {
"missing": [228,26,28, 255],
"other": [152,78,163, 255],
"paved": [55,126,184, 255],
"unpaved": [77,175,74, 255]
}
# the lonboard map definition
layer = lonboard.PathLayer.from_geopandas(
gdf,
get_color=lonboard.colormap.apply_categorical_cmap(gdf["surface_type"], surface_type_colormap, alpha=1),
get_width=1.2,
width_units='pixels',
extensions=[lonboard.layer_extension.DataFilterExtension(filter_size=1)],
get_filter_value=gdf["surface_type_codes"].astype('float'), # replace with desired column
filter_range=[0.0, 3.0], # replace with desired filter range
)
highway_map = lonboard.Map(
basemap_style=lonboard.basemap.CartoBasemap.Positron,
layers=[layer],
)
Set toggle buttons for different road surface types.
import ipywidgets
from traitlets import directional_link
group_options = [(key, i) for i, key in enumerate(surface_type_colormap.keys()) ]
surface_toggle = ipywidgets.ToggleButtons(
options=group_options,
description='Surface Type:',
disabled=False,
button_style='', # 'success', 'info', 'warning', 'danger' or ''
)
directional_link(
(surface_toggle, 'value'),
(layer, "filter_range"),
transform= lambda v: (v, v)
)
<traitlets.traitlets.directional_link at 0x7ac9f815d6d0>
Display the map. You can click on the buttons to change the surface type displayed.
display(highway_map, surface_toggle)