Mapillary Coverage Analysis#
In this notebook we demonstrate how to combine two datasets: Mapillary sequences and OSM road network.
We want to find out which parts of the road network in a city are not yet covered by Mapillary street level imagery.
These are the steps you see further down:
Set the connection parameters.
Prepare your input parameters, e.g. define area of interest and OSM tag filter.
Download data using DuckDB. This time we also download Mapillary data.
Display both datasets on a map.
Overlay OSM and Mapillary data using DuckDB.
Getting started#
Set connection params.
from dotenv import load_dotenv
load_dotenv()
True
import os
s3_user = os.environ["S3_ACCESS_KEY_ID"] # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"] # add your password here
Configure DuckDB, e.g. set available RAM and CPUs.
!pip install duckdb==1.0.0
import duckdb
con = duckdb.connect(
config={
'threads': 8,
'max_memory': '8GB'
}
)
con.install_extension("spatial")
con.load_extension("spatial")
Set the connection params to Iceberg Rest Catalog.
from pyiceberg.catalog.rest import RestCatalog
catalog = RestCatalog(
name="default",
**{
"uri": "https://sotm2024.iceberg.ohsome.org",
"s3.endpoint": "https://sotm2024.minio.heigit.org",
"py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
"s3.access-key-id": s3_user,
"s3.secret-access-key": s3_password,
"s3.region": "eu-central-1"
}
)
Set connection to MinIO object storage.
query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
TYPE S3,
KEY_ID '{s3_user}',
SECRET '{s3_password}',
REGION 'eu-central-1',
endpoint 'sotm2024.minio.heigit.org',
use_ssl true,
url_style 'path'
);
"""
con.sql(query).show()
┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true │
└─────────┘
Prepare the input parameters for your analysis#
# Set iceberg table
namespace = 'geo_sort'
tablename = 'contributions'
icebergtable = catalog.load_table((namespace, tablename))
mapillary_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/mapillary_sequences/*"
# Define status filter
status = 'latest'
# Define location filter
bboxes = {
'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),
'mannheim': (8.41416, 49.410362, 8.58999, 49.590489),
'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
'berlin': (13.088345, 52.338271, 13.761161, 52.675509)
}
selected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"
epsg_codes = {
'heidelberg': 'EPSG:32632',
'mannheim': 'EPSG:32632',
'nairobi': 'EPSG:32737',
'berlin': 'EPSG:32632'
}
epsg_code = epsg_codes[selected_region]
# Define geometry type filter
geometry_type = 'LineString'
Get OSM data#
Download latest OSM data for bounding box.
import time
start_time = time.time()
icebergtable.scan(
row_filter=(
f"status = '{status}' "
f"and geometry_type = '{geometry_type}' "
f"and (xmax >= {xmin} and xmin <= {xmax}) "
f"and (ymax >= {ymin} and ymin <= {ymax}) "
),
selected_fields=(
"user_id",
"osm_id",
"osm_version",
"valid_from",
"tags",
"geometry",
),
).to_duckdb('raw_osm_data',connection=con)
download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 14.303 sec.
Clip OSM highways with to area of interest and calculate road length in kilometer.
import time
start_time = time.time()
query = f"""
DROP TABLE IF EXISTS osm_data;
CREATE TABLE osm_data AS
(
SELECT
a.osm_id,
a.tags,
tags['highway'][1] as highway_value,
ST_GeomFromText(a.geometry) as geometry,
a.geometry.ST_GeomFromText().ST_Length_Spheroid() / 1000 as length_km,
FROM
raw_osm_data as a,
st_read('{area_of_interest_file}') as aoi
WHERE 1=1
and tags['highway'][1] is not null
-- spatial filtering part
and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)
processing_time = round(time.time() - start_time, 3)
print(f"processing took {processing_time} sec.")
processing took 2.446 sec.
Display OSM data on a map.
import geopandas as gpd
map_query = """
SELECT osm_id, tags, ST_AsText(geometry) as geometry, highway_value, length_km FROM osm_data;
"""
osm_data_df = con.sql(map_query).df()
# convert the data to geodata
osm_data_gdf = gpd.GeoDataFrame(
osm_data_df,
geometry=gpd.GeoSeries.from_wkt(osm_data_df['geometry'])
).set_crs('epsg:4326')
import lonboard
# the lonboard map definition
osm_layer = lonboard.PathLayer.from_geopandas(
osm_data_gdf,
get_color=[0,255,255,255],
width_min_pixels=0.8,
)
view_state = {
"longitude": xmin + ((xmax - xmin) / 2),
"latitude": ymin + ((ymax - ymin) / 2),
"zoom": 12
}
osm_map = lonboard.Map(
basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
layers=[osm_layer],
view_state=view_state
)
display(osm_map)
Get Mapillary Data#
We are going to download Mapillary sequences.
import time
start_time = time.time()
query = f"""
DROP TABLE IF EXISTS mapillary_data;
CREATE TABLE mapillary_data AS
(
SELECT
a.id,
ST_GeomFromText(a.geometry) as geometry
FROM
read_parquet('{mapillary_parquet_data_path}') as a,
st_read('{area_of_interest_file}') as aoi
WHERE 1=1
and (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})
and (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax})
and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)
download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 1.716 sec.
Display Mapillary data on a map.
map_query = """
SELECT
id,
ST_AsText(geometry) as geometry
FROM mapillary_data;
"""
df = con.sql(map_query).df()
# convert the data to geodata
gdf = gpd.GeoDataFrame(
df,
geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')
# the lonboard map definition
mapillary_layer = lonboard.PathLayer.from_geopandas(
gdf,
get_color=[255,0,255,255],
width_min_pixels=0.8,
)
view_state = {
"longitude": xmin + ((xmax - xmin) / 2),
"latitude": ymin + ((ymax - ymin) / 2),
"zoom": 12
}
mapillary_map = lonboard.Map(
basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
layers=[mapillary_layer],
view_state=view_state
)
display(mapillary_map)
Find OSM ways not covered by (5m buffered) Mapillary sequence#
query = f"""
CREATE OR REPLACE TABLE not_covered_osm_data AS
(
WITH mapillary_coverage AS
(
SELECT
id,
geometry.ST_Transform('EPSG:4326', '{epsg_code}')
.ST_Buffer(10)
.ST_Transform('{epsg_code}', 'EPSG:4326')
as geometry
FROM mapillary_data
),
match as (
SELECT
a.osm_id,
a.tags,
a.geometry,
SUM(
CASE WHEN b.id IS NOT NULL THEN 1 ELSE 0 END
) as matching_mapillary_sequences
FROM osm_data a
LEFT JOIN mapillary_coverage b ON
ST_Intersects(a.geometry, b.geometry)
GROUP BY a.*
)
select
osm_id,
tags,
tags['highway'][1] as highway_value,
ST_AsText(geometry) as geometry,
ST_Length_Spheroid(geometry) / 1000 as length_km,
from match
where matching_mapillary_sequences = 0
);
"""
con.sql(query)
Display results on a map.
query = "SELECT * FROM not_covered_osm_data"
df = con.sql(query).df()
# convert the data to geodata
gdf = gpd.GeoDataFrame(
df,
geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')
# the lonboard map definition
not_covered_osm_data = lonboard.PathLayer.from_geopandas(
gdf,
get_color=[0,255,255,255],
get_width=3,
width_min_pixels=0.8
)
view_state = {
"longitude": xmin + ((xmax - xmin) / 2),
"latitude": ymin + ((ymax - ymin) / 2),
"zoom": 12
}
mapillary_map = lonboard.Map(
basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
layers=[not_covered_osm_data, mapillary_layer],
view_state=view_state
)
display(mapillary_map)