Mapillary Coverage Analysis#

In this notebook we demonstrate how to combine two datasets: Mapillary sequences and OSM road network.

We want to find out which parts of the road network in a city are not yet covered by Mapillary street level imagery.

These are the steps you see further down:

  • Set the connection parameters.

  • Prepare your input parameters, e.g. define area of interest and OSM tag filter.

  • Download data using DuckDB. This time we also download Mapillary data.

  • Display both datasets on a map.

  • Overlay OSM and Mapillary data using DuckDB.

Getting started#

Set connection params.

from dotenv import load_dotenv
load_dotenv()
True
import os

s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here

Configure DuckDB, e.g. set available RAM and CPUs.

!pip install duckdb==1.0.0
import duckdb

con = duckdb.connect(
    config={
        'threads': 8,
        'max_memory': '8GB'
    }
)
con.install_extension("spatial")
con.load_extension("spatial")

Set the connection params to Iceberg Rest Catalog.

from pyiceberg.catalog.rest import RestCatalog

catalog = RestCatalog(
    name="default",
    **{
        "uri": "https://sotm2024.iceberg.ohsome.org",
        "s3.endpoint": "https://sotm2024.minio.heigit.org",
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": s3_user,
        "s3.secret-access-key": s3_password,
        "s3.region": "eu-central-1"
    }
)

Set connection to MinIO object storage.

query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
      TYPE S3,
      KEY_ID '{s3_user}',
      SECRET '{s3_password}',
      REGION 'eu-central-1',
      endpoint 'sotm2024.minio.heigit.org',
      use_ssl true,
      url_style 'path'
  );
"""
con.sql(query).show()
┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

Prepare the input parameters for your analysis#

# Set iceberg table
namespace = 'geo_sort'
tablename = 'contributions'
icebergtable = catalog.load_table((namespace, tablename))

mapillary_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/mapillary_sequences/*"

# Define status filter
status = 'latest'

# Define location filter
bboxes = {
    'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),
    'mannheim': (8.41416, 49.410362, 8.58999, 49.590489),
    'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
    'berlin': (13.088345, 52.338271, 13.761161, 52.675509)
}

selected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"

epsg_codes = {
    'heidelberg': 'EPSG:32632',
    'mannheim': 'EPSG:32632',
    'nairobi': 'EPSG:32737',
    'berlin': 'EPSG:32632'
}
epsg_code = epsg_codes[selected_region]


# Define geometry type filter
geometry_type = 'LineString'

Get OSM data#

Download latest OSM data for bounding box.

import time
start_time = time.time()

icebergtable.scan(
    row_filter=(
        f"status = '{status}' "
        f"and geometry_type = '{geometry_type}' "
        f"and (xmax >= {xmin} and xmin <= {xmax}) "
        f"and (ymax >= {ymin} and ymin <= {ymax}) "
    ),
    selected_fields=(
        "user_id",
        "osm_id",
        "osm_version",
        "valid_from",
        "tags",
        "geometry",
    ),
).to_duckdb('raw_osm_data',connection=con)

download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 14.303 sec.

Clip OSM highways with to area of interest and calculate road length in kilometer.

import time
start_time = time.time()

query = f"""
DROP TABLE IF EXISTS osm_data;
CREATE TABLE osm_data AS 
(
SELECT
    a.osm_id,
    a.tags,
    tags['highway'][1] as highway_value,
    ST_GeomFromText(a.geometry) as geometry,
    a.geometry.ST_GeomFromText().ST_Length_Spheroid() / 1000 as length_km,
FROM
    raw_osm_data as a,
    st_read('{area_of_interest_file}') as aoi
WHERE 1=1
    and tags['highway'][1] is not null
    -- spatial filtering part
    and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)

processing_time = round(time.time() - start_time, 3)
print(f"processing took {processing_time} sec.")
processing took 2.446 sec.

Display OSM data on a map.

import geopandas as gpd

map_query = """
    SELECT osm_id, tags, ST_AsText(geometry) as geometry, highway_value, length_km FROM osm_data;
"""

osm_data_df = con.sql(map_query).df()

# convert the data to geodata
osm_data_gdf = gpd.GeoDataFrame(
    osm_data_df,
    geometry=gpd.GeoSeries.from_wkt(osm_data_df['geometry'])
).set_crs('epsg:4326')
import lonboard

# the lonboard map definition
osm_layer = lonboard.PathLayer.from_geopandas(
    osm_data_gdf,
    get_color=[0,255,255,255],
    width_min_pixels=0.8,
)

view_state = {
    "longitude": xmin + ((xmax - xmin) / 2),
    "latitude": ymin + ((ymax - ymin) / 2),
    "zoom": 12
}

osm_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
    layers=[osm_layer],
    view_state=view_state
)

display(osm_map)

Get Mapillary Data#

We are going to download Mapillary sequences.

import time
start_time = time.time()

query = f"""
DROP TABLE IF EXISTS mapillary_data;
CREATE TABLE mapillary_data AS 
(
SELECT
    a.id,
    ST_GeomFromText(a.geometry) as geometry
FROM
    read_parquet('{mapillary_parquet_data_path}') as a,
    st_read('{area_of_interest_file}') as aoi
WHERE 1=1
    and (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})
    and (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax})
    and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)

download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 1.716 sec.

Display Mapillary data on a map.

map_query = """
    SELECT
        id,
        ST_AsText(geometry) as geometry
    FROM mapillary_data;
"""

df = con.sql(map_query).df()

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')
# the lonboard map definition
mapillary_layer = lonboard.PathLayer.from_geopandas(
    gdf,
    get_color=[255,0,255,255],
    width_min_pixels=0.8,
)

view_state = {
    "longitude": xmin + ((xmax - xmin) / 2),
    "latitude": ymin + ((ymax - ymin) / 2),
    "zoom": 12
}

mapillary_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
    layers=[mapillary_layer],
    view_state=view_state
)

display(mapillary_map)

Find OSM ways not covered by (5m buffered) Mapillary sequence#

query = f"""
CREATE OR REPLACE TABLE not_covered_osm_data AS
(
WITH mapillary_coverage AS
(
SELECT
    id,
    geometry.ST_Transform('EPSG:4326', '{epsg_code}')
        .ST_Buffer(10)
        .ST_Transform('{epsg_code}', 'EPSG:4326')
    as geometry
FROM mapillary_data
),
match as (
SELECT
    a.osm_id,
    a.tags,
    a.geometry,
    SUM(
        CASE WHEN b.id IS NOT NULL THEN 1 ELSE 0 END
    ) as matching_mapillary_sequences
FROM osm_data a
LEFT JOIN mapillary_coverage b ON 
    ST_Intersects(a.geometry, b.geometry)
GROUP BY a.*
)
select
    osm_id,
    tags,
    tags['highway'][1] as highway_value,
    ST_AsText(geometry) as geometry,
    ST_Length_Spheroid(geometry) / 1000 as length_km,
from match
where matching_mapillary_sequences = 0
);
"""
con.sql(query)

Display results on a map.

query = "SELECT * FROM not_covered_osm_data"
df = con.sql(query).df()

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')
# the lonboard map definition
not_covered_osm_data = lonboard.PathLayer.from_geopandas(
    gdf,
    get_color=[0,255,255,255],
    get_width=3,
    width_min_pixels=0.8
)

view_state = {
    "longitude": xmin + ((xmax - xmin) / 2),
    "latitude": ymin + ((ymax - ymin) / 2),
    "zoom": 12
}

mapillary_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
    layers=[not_covered_osm_data, mapillary_layer],
    view_state=view_state
)

display(mapillary_map)

Calculate share of road network covered by Mapillary#

total_osm_length = osm_data_df["length_km"].sum()
not_covered_osm_length = df["length_km"].sum()
mapillary_coverage = (total_osm_length - not_covered_osm_length) / total_osm_length

print(f"Mapillary coverage: {round(mapillary_coverage, 3)}")
Mapillary coverage: 0.277

Calculate Mapillary coverage per road type. Here we list the top 20 only.

query = """
with uncovered_stats AS
(
SELECT
    a.highway_value,
    SUM(a.length_km) as not_covered_length_km
FROM not_covered_osm_data as a
GROUP by a.highway_value
ORDER BY 2 DESC 
),
total_stats AS
(
SELECT
    a.highway_value,
    SUM(a.length_km) as total_length_km,
FROM osm_data as a
LEFT JOIN uncovered_stats b ON
    a.highway_value = b.highway_value
GROUP by a.highway_value
ORDER BY 2 DESC
)
SELECT
    a.highway_value,
    round(a.total_length_km, 3) as total_length_km,
    round(b.not_covered_length_km, 3) as not_covered_length_km ,
    round((a.total_length_km - b.not_covered_length_km) / a.total_length_km, 3) as mapillary_coverage
FROM total_stats a
LEFT JOIN uncovered_stats b ON
    a.highway_value = b.highway_value
ORDER BY mapillary_coverage DESC
LIMIT 20;
"""
con.sql(query).show()
┌────────────────┬─────────────────┬───────────────────────┬────────────────────┐
│ highway_value  │ total_length_km │ not_covered_length_km │ mapillary_coverage │
│    varchar     │     double      │        double         │       double       │
├────────────────┼─────────────────┼───────────────────────┼────────────────────┤
│ motorway       │          40.216 │                 2.602 │              0.935 │
│ trunk          │         183.263 │                14.096 │              0.923 │
│ motorway_link  │          13.904 │                 1.647 │              0.882 │
│ trunk_link     │          22.641 │                 3.948 │              0.826 │
│ primary_link   │           12.95 │                 2.376 │              0.817 │
│ primary        │         105.382 │                33.001 │              0.687 │
│ cycleway       │          29.469 │                11.735 │              0.602 │
│ secondary      │         365.844 │               152.154 │              0.584 │
│ secondary_link │           9.282 │                 4.035 │              0.565 │
│ tertiary       │         180.208 │                88.922 │              0.507 │
│ steps          │           1.386 │                 0.724 │              0.478 │
│ tertiary_link  │           1.126 │                 0.629 │              0.442 │
│ unclassified   │         354.191 │               219.945 │              0.379 │
│ pedestrian     │           4.059 │                 2.855 │              0.297 │
│ residential    │        3576.012 │              2809.398 │              0.214 │
│ service        │        1391.092 │              1110.059 │              0.202 │
│ footway        │         136.649 │               109.162 │              0.201 │
│ living_street  │           8.259 │                 6.724 │              0.186 │
│ track          │         401.247 │               326.452 │              0.186 │
│ path           │         212.552 │               192.475 │              0.094 │
├────────────────┴─────────────────┴───────────────────────┴────────────────────┤
│ 20 rows                                                             4 columns │
└───────────────────────────────────────────────────────────────────────────────┘