Mapillary Coverage Analysis

Mapillary Coverage Analysis#

In this notebook we demonstrate how to combine two datasets: Mapillary sequences and OSM road network.

We want to find out which parts of the road network in a city are not yet covered by Mapillary street level imagery.

These are the steps you see further down:

Set the connection parameters.
Prepare your input parameters, e.g. define area of interest and OSM tag filter.
Download data using DuckDB. This time we also download Mapillary data.
Display both datasets on a map.
Overlay OSM and Mapillary data using DuckDB.

Getting started#

Set connection params.

from dotenv import load_dotenv
load_dotenv()

True

import os

s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here

Configure DuckDB, e.g. set available RAM and CPUs.

!pip install duckdb==1.2.0

import duckdb

con = duckdb.connect(
    config={
        'threads': 8,
        'max_memory': '8GB'
    }
)
con.install_extension("spatial")
con.load_extension("spatial")

Set the connection params to Iceberg Rest Catalog.

from pyiceberg.catalog.rest import RestCatalog

catalog = RestCatalog(
    name="default",
    **{
        "uri": "https://sotm2024.iceberg.ohsome.org",
        "s3.endpoint": "https://sotm2024.minio.heigit.org",
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": s3_user,
        "s3.secret-access-key": s3_password,
        "s3.region": "eu-central-1"
    }
)

Set connection to MinIO object storage.

query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
      TYPE S3,
      KEY_ID '{s3_user}',
      SECRET '{s3_password}',
      REGION 'eu-central-1',
      endpoint 'sotm2024.minio.heigit.org',
      use_ssl true,
      url_style 'path'
  );
"""
con.sql(query).show()

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

Prepare the input parameters for your analysis#

# Set iceberg table
namespace = 'geo_sort'
tablename = 'contributions'
icebergtable = catalog.load_table((namespace, tablename))

mapillary_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/mapillary_sequences/*"

# Define status filter
status = 'latest'

# Define location filter
bboxes = {
    'heidelberg': (8.629761, 49.379556, 8.742371, 49.437890),
    'mannheim': (8.41416, 49.410362, 8.58999, 49.590489),
    'nairobi': (36.650938, -1.444471, 37.103887, -1.163522),
    'berlin': (13.088345, 52.338271, 13.761161, 52.675509)
}

selected_region = 'nairobi'
xmin, ymin, xmax, ymax = bboxes[selected_region]
area_of_interest_file = f"s3a://heigit-ohsome-sotm24/data/sample_data/{selected_region}.geojson"

epsg_codes = {
    'heidelberg': 'EPSG:32632',
    'mannheim': 'EPSG:32632',
    'nairobi': 'EPSG:32737',
    'berlin': 'EPSG:32632'
}
epsg_code = epsg_codes[selected_region]


# Define geometry type filter
geometry_type = 'LineString'

Get OSM data#

Download latest OSM data for bounding box.

import time
start_time = time.time()

icebergtable.scan(
    row_filter=(
        f"status = '{status}' "
        f"and geometry_type = '{geometry_type}' "
        f"and (xmax >= {xmin} and xmin <= {xmax}) "
        f"and (ymax >= {ymin} and ymin <= {ymax}) "
    ),
    selected_fields=(
        "user_id",
        "osm_id",
        "osm_version",
        "valid_from",
        "tags",
        "geometry",
    ),
).to_duckdb('raw_osm_data',connection=con)

download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")

download took 14.303 sec.

Clip OSM highways with to area of interest and calculate road length in kilometer.

import time
start_time = time.time()

query = f"""
DROP TABLE IF EXISTS osm_data;
CREATE TABLE osm_data AS 
(
SELECT
    a.osm_id,
    a.tags,
    tags['highway'] as highway_value,
    ST_GeomFromText(a.geometry) as geometry,
    a.geometry.ST_GeomFromText().ST_Length_Spheroid() / 1000 as length_km,
FROM
    raw_osm_data as a,
    st_read('{area_of_interest_file}') as aoi
WHERE 1=1
    and tags['highway'] is not null
    -- spatial filtering part
    and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)

processing_time = round(time.time() - start_time, 3)
print(f"processing took {processing_time} sec.")

processing took 2.446 sec.

Display OSM data on a map.

import geopandas as gpd

map_query = """
    SELECT osm_id, tags, ST_AsText(geometry) as geometry, highway_value, length_km FROM osm_data;
"""

osm_data_df = con.sql(map_query).df()

# convert the data to geodata
osm_data_gdf = gpd.GeoDataFrame(
    osm_data_df,
    geometry=gpd.GeoSeries.from_wkt(osm_data_df['geometry'])
).set_crs('epsg:4326')

import lonboard

# the lonboard map definition
osm_layer = lonboard.PathLayer.from_geopandas(
    osm_data_gdf,
    get_color=[0,255,255,255],
    width_min_pixels=0.8,
)

view_state = {
    "longitude": xmin + ((xmax - xmin) / 2),
    "latitude": ymin + ((ymax - ymin) / 2),
    "zoom": 12
}

osm_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
    layers=[osm_layer],
    view_state=view_state
)

display(osm_map)

Get Mapillary Data#

We are going to download Mapillary sequences.

import time
start_time = time.time()

query = f"""
DROP TABLE IF EXISTS mapillary_data;
CREATE TABLE mapillary_data AS 
(
SELECT
    a.id,
    ST_GeomFromText(a.geometry) as geometry
FROM
    read_parquet('{mapillary_parquet_data_path}') as a,
    st_read('{area_of_interest_file}') as aoi
WHERE 1=1
    and (a.bbox.xmax >= {xmin} AND a.bbox.xmin <= {xmax})
    and (a.bbox.ymax >= {ymin} AND a.bbox.ymin <= {ymax})
    and ST_Intersects(st_GeomFromText(a.geometry), aoi.geom)
)
;
"""
con.sql(query)

download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")

download took 1.716 sec.

Display Mapillary data on a map.

map_query = """
    SELECT
        id,
        ST_AsText(geometry) as geometry
    FROM mapillary_data;
"""

df = con.sql(map_query).df()

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')

# the lonboard map definition
mapillary_layer = lonboard.PathLayer.from_geopandas(
    gdf,
    get_color=[255,0,255,255],
    width_min_pixels=0.8,
)

view_state = {
    "longitude": xmin + ((xmax - xmin) / 2),
    "latitude": ymin + ((ymax - ymin) / 2),
    "zoom": 12
}

mapillary_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
    layers=[mapillary_layer],
    view_state=view_state
)

display(mapillary_map)

Find OSM ways not covered by (5m buffered) Mapillary sequence#

query = f"""
CREATE OR REPLACE TABLE not_covered_osm_data AS
(
WITH mapillary_coverage AS
(
SELECT
    id,
    geometry.ST_Transform('EPSG:4326', '{epsg_code}')
        .ST_Buffer(10)
        .ST_Transform('{epsg_code}', 'EPSG:4326')
    as geometry
FROM mapillary_data
),
match as (
SELECT
    a.osm_id,
    a.tags,
    a.geometry,
    SUM(
        CASE WHEN b.id IS NOT NULL THEN 1 ELSE 0 END
    ) as matching_mapillary_sequences
FROM osm_data a
LEFT JOIN mapillary_coverage b ON 
    ST_Intersects(a.geometry, b.geometry)
GROUP BY a.*
)
select
    osm_id,
    tags,
    tags['highway'] as highway_value,
    ST_AsText(geometry) as geometry,
    ST_Length_Spheroid(geometry) / 1000 as length_km,
from match
where matching_mapillary_sequences = 0
);
"""
con.sql(query)

Display results on a map.

query = "SELECT * FROM not_covered_osm_data"
df = con.sql(query).df()

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')

# the lonboard map definition
not_covered_osm_data = lonboard.PathLayer.from_geopandas(
    gdf,
    get_color=[0,255,255,255],
    get_width=3,
    width_min_pixels=0.8
)

view_state = {
    "longitude": xmin + ((xmax - xmin) / 2),
    "latitude": ymin + ((ymax - ymin) / 2),
    "zoom": 12
}

mapillary_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
    layers=[not_covered_osm_data, mapillary_layer],
    view_state=view_state
)

display(mapillary_map)

Calculate share of road network covered by Mapillary#

total_osm_length = osm_data_df["length_km"].sum()
not_covered_osm_length = df["length_km"].sum()
mapillary_coverage = (total_osm_length - not_covered_osm_length) / total_osm_length

print(f"Mapillary coverage: {round(mapillary_coverage, 3)}")

Mapillary coverage: 0.277

Calculate Mapillary coverage per road type. Here we list the top 20 only.

query = """
with uncovered_stats AS
(
SELECT
    a.highway_value,
    SUM(a.length_km) as not_covered_length_km
FROM not_covered_osm_data as a
GROUP by a.highway_value
ORDER BY 2 DESC 
),
total_stats AS
(
SELECT
    a.highway_value,
    SUM(a.length_km) as total_length_km,
FROM osm_data as a
LEFT JOIN uncovered_stats b ON
    a.highway_value = b.highway_value
GROUP by a.highway_value
ORDER BY 2 DESC
)
SELECT
    a.highway_value,
    round(a.total_length_km, 3) as total_length_km,
    round(b.not_covered_length_km, 3) as not_covered_length_km ,
    round((a.total_length_km - b.not_covered_length_km) / a.total_length_km, 3) as mapillary_coverage
FROM total_stats a
LEFT JOIN uncovered_stats b ON
    a.highway_value = b.highway_value
ORDER BY mapillary_coverage DESC
LIMIT 20;
"""
con.sql(query).show()

┌────────────────┬─────────────────┬───────────────────────┬────────────────────┐
│ highway_value  │ total_length_km │ not_covered_length_km │ mapillary_coverage │
│    varchar     │     double      │        double         │       double       │
├────────────────┼─────────────────┼───────────────────────┼────────────────────┤
│ motorway       │          40.216 │                 2.602 │              0.935 │
│ trunk          │         183.263 │                14.096 │              0.923 │
│ motorway_link  │          13.904 │                 1.647 │              0.882 │
│ trunk_link     │          22.641 │                 3.948 │              0.826 │
│ primary_link   │           12.95 │                 2.376 │              0.817 │
│ primary        │         105.382 │                33.001 │              0.687 │
│ cycleway       │          29.469 │                11.735 │              0.602 │
│ secondary      │         365.844 │               152.154 │              0.584 │
│ secondary_link │           9.282 │                 4.035 │              0.565 │
│ tertiary       │         180.208 │                88.922 │              0.507 │
│ steps          │           1.386 │                 0.724 │              0.478 │
│ tertiary_link  │           1.126 │                 0.629 │              0.442 │
│ unclassified   │         354.191 │               219.945 │              0.379 │
│ pedestrian     │           4.059 │                 2.855 │              0.297 │
│ residential    │        3576.012 │              2809.398 │              0.214 │
│ service        │        1391.092 │              1110.059 │              0.202 │
│ footway        │         136.649 │               109.162 │              0.201 │
│ living_street  │           8.259 │                 6.724 │              0.186 │
│ track          │         401.247 │               326.452 │              0.186 │
│ path           │         212.552 │               192.475 │              0.094 │
├────────────────┴─────────────────┴───────────────────────┴────────────────────┤
│ 20 rows                                                             4 columns │
└───────────────────────────────────────────────────────────────────────────────┘