HOT Tasking Manager project analysis#

In this notebook we demonstrate how to combine two datasets: HOT Tasking Manager project data and OSM contributions.

We want to investigate all contributions that have been made in OSM via HOT’s Tasking Manager. We will furthermore filter these contributions by using the OSM Changeset information.

These are the steps you see further down:

  • Set the connection parameters.

  • Prepare your input parameters, e.g. define HOT Tasking Manager project ID.

  • Download data using DuckDB. This time we also download HOT project data.

  • Filter OSM contributions using changeset attributes.

  • Display both datasets on a map.

Getting started#

Set connection params.

from dotenv import load_dotenv
load_dotenv()
True
import os

s3_user = os.environ["S3_ACCESS_KEY_ID"]  # add your user here
s3_password = os.environ["S3_SECRET_ACCESS_KEY"]  # add your password here

Configure DuckDB.

!pip install duckdb==1.0.0
import duckdb

con = duckdb.connect(
    config={
        'threads': 8,
        'max_memory': '8GB',
        # 'enable_object_cache': True
    }
)
con.install_extension("spatial")
con.load_extension("spatial")

Set the connection params to Iceberg Rest Catalog. (We need this for the OSM data.)

from pyiceberg.catalog.rest import RestCatalog

catalog = RestCatalog(
    name="default",
    **{
        "uri": "https://sotm2024.iceberg.ohsome.org",
        "s3.endpoint": "https://sotm2024.minio.heigit.org",
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": s3_user,
        "s3.secret-access-key": s3_password,
        "s3.region": "eu-central-1"
    }
)

Set connection to MinIO object storage. (We need this for HOT Tasking Manager data)

query = f"""
DROP SECRET IF EXISTS "__default_s3";
CREATE SECRET (
      TYPE S3,
      KEY_ID '{s3_user}',
      SECRET '{s3_password}',
      REGION 'eu-central-1',
      endpoint 'sotm2024.minio.heigit.org',
      use_ssl true,
      url_style 'path'
  );
"""
con.sql(query).show()
┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

Prepare the input parameters for your analysis#

hot_tm_parquet_data_path = "s3a://heigit-ohsome-sotm24/data/hot_tasking_manager/**"

# Set iceberg table
namespace = 'geo_sort'
tablename = 'contributions'
icebergtable = catalog.load_table((namespace, tablename))

hot_tm_project_id = 15996

Get Information about HOT Tasking Manager project#

import geopandas as gpd

query = f"""
SELECT *
FROM read_parquet('{hot_tm_parquet_data_path}') a
WHERE project_id = {hot_tm_project_id};
"""
df = con.sql(query).df()

bbox = df["bbox"].values[0]

# convert the data to geodata
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df['geometry'])
).set_crs('epsg:4326')

Display where the project is located.

import lonboard

# the lonboard map definition
hot_tm_layer = lonboard.PolygonLayer.from_geopandas(
    gdf,
    get_fill_color=[0, 0, 0, 0],
    get_line_color=[255, 0, 255],
    get_line_width=10
)

view_state = {
    "longitude": df["centroid"].values[0]["x"],
    "latitude": df["centroid"].values[0]["y"],
    "zoom": 12
}

overview_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.Positron,
    layers=[hot_tm_layer],
    view_state=view_state
)

display(overview_map)

Download data OSM Data#

import time
start_time = time.time()


icebergtable.scan(
    row_filter=(
        f"( status = 'latest' or status = 'history' )"
        f"and (xmax >= {bbox['xmin']} and xmin <= {bbox['xmax']}) "
        f"and (ymax >= {bbox['ymin']} and ymin <= {bbox['ymax']}) "
    ),
    selected_fields=(
        "user_id",
        "osm_id",
        "osm_version",
        "valid_from",
        "valid_to",
        "tags",
        "tags_before",
        "changeset",
        "geometry",
    ),
).to_duckdb('raw_osm_data',connection=con)

download_time = round(time.time() - start_time, 3)
print(f"download took {download_time} sec.")
download took 42.551 sec.

Filter data using OSM changeset information#

query = f"""
    SELECT *
    FROM raw_osm_data
    WHERE 1=1
        and list_contains(changeset.hashtags, 'hotosm-project-{hot_tm_project_id}')
    """
osm_df = con.sql(query).df()

# convert the data to geodata
osm_gdf = gpd.GeoDataFrame(
    osm_df,
    geometry=gpd.GeoSeries.from_wkt(osm_df['geometry'])
).set_crs('epsg:4326')

Display OSM data and HOT Tasking Manager project on map.

# the lonboard map definition
osm_polygons_layer = lonboard.PolygonLayer.from_geopandas(
    osm_gdf[osm_gdf.geom_type == 'Polygon'],
    get_fill_color=[0, 0, 0, 0],
    get_line_width=1
)

osm_lines_layer = lonboard.PathLayer.from_geopandas(
    osm_gdf[osm_gdf.geom_type == 'LineString'],
)


osm_points_layer = lonboard.ScatterplotLayer.from_geopandas(
    osm_gdf[osm_gdf.geom_type == 'Point'],
)



osm_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.Positron,
    layers=[
        hot_tm_layer,
        osm_polygons_layer,
        osm_lines_layer,
        osm_points_layer,
    ],
    view_state=view_state
)

display(osm_map)

Display OSM user statistics#

Let’s analyse who mapped how much in this area.

query = f"""
    SELECT
        user_id,
        count(*) as n_edits,
        SUM(CASE
            WHEN contains(changeset.editor, 'JOSM') THEN 1
            ELSE 0
        END) as josm_edit,
        SUM(CASE
            WHEN contains(changeset.editor, 'iD') THEN 1
            ELSE 0
        END) as iD_edit
    FROM raw_osm_data
    WHERE 1=1
        and list_contains(changeset.hashtags, 'hotosm-project-{hot_tm_project_id}')
    GROUP BY user_id
    ORDER BY n_edits DESC
    """
user_stats_df = con.sql(query).df()

We are going to add OSM usernames to our dataframe.

import requests


def get_current_usernames(user_ids_list):
    api_url = "http://textual.ru/whosthat/whosthat.php"
    user_names_list = []

    for user_id in user_ids_list:
        params = {
            "action": "info",  # Use "info" action to get detailed information
            "id": user_id,
        }
        response = requests.get(api_url, params=params)
        if response.status_code == 200:
            data = response.json()
            if data and isinstance(data, list) and len(data) > 0:
                # The API returns an array of hashes with user information
                # We need to extract the user ID from the first entry
                user_info = data[0]
                user_names_list.append( user_info['names'][-1]["name"] )

    return user_names_list


user_stats_df["username"] = get_current_usernames(user_stats_df["user_id"].values)
user_stats_df["link"] = "https://www.openstreetmap.org/user/" + user_stats_df["username"]

Display a user leaderboard.

display(user_stats_df)
user_id n_edits josm_edit iD_edit username link
0 563947 898 0.0 858.0 notoncebut2x https://www.openstreetmap.org/user/notoncebut2x
1 11010237 734 734.0 0.0 Dannyoung https://www.openstreetmap.org/user/Dannyoung
2 10570996 468 468.0 0.0 Erick tamba https://www.openstreetmap.org/user/Erick tamba
3 21292317 439 0.0 439.0 abhimanyubarun https://www.openstreetmap.org/user/abhimanyubarun
4 17340053 388 388.0 0.0 Sally Mlingi https://www.openstreetmap.org/user/Sally Mlingi
5 13722972 261 186.0 75.0 Jimerveille https://www.openstreetmap.org/user/Jimerveille
6 21288883 251 0.0 251.0 BTR2 https://www.openstreetmap.org/user/BTR2
7 19063413 245 0.0 245.0 Aczel-Divin MOUNTOU https://www.openstreetmap.org/user/Aczel-Divin...
8 20450590 167 0.0 167.0 KOGNOKO ABOULATSAMBO https://www.openstreetmap.org/user/KOGNOKO ABO...
9 9449463 150 150.0 0.0 Mercy14846 https://www.openstreetmap.org/user/Mercy14846
10 14962448 117 0.0 117.0 Abdul-Nayar Rabiatu Sungumu https://www.openstreetmap.org/user/Abdul-Nayar...
11 8386618 104 0.0 104.0 Adele Birkenes https://www.openstreetmap.org/user/Adele Birkenes
12 15320583 75 0.0 75.0 Eldablessing https://www.openstreetmap.org/user/Eldablessing
13 21292063 60 0.0 60.0 GlobalTurtle https://www.openstreetmap.org/user/GlobalTurtle
14 21304240 57 0.0 57.0 sbribri https://www.openstreetmap.org/user/sbribri
15 20155561 51 13.0 38.0 BENYE MULTI MEDIA https://www.openstreetmap.org/user/BENYE MULTI...
16 18790262 50 50.0 0.0 NzahT https://www.openstreetmap.org/user/NzahT
17 20768452 44 0.0 44.0 BAZONZILA MBANI Gloire https://www.openstreetmap.org/user/BAZONZILA M...
18 20982460 40 18.0 22.0 Jaco452 https://www.openstreetmap.org/user/Jaco452
19 21256164 33 17.0 16.0 Perey Wisdom https://www.openstreetmap.org/user/Perey Wisdom
20 21186164 31 7.0 24.0 Ziggyentumi@20 https://www.openstreetmap.org/user/Ziggyentumi@20
21 21314713 29 0.0 29.0 Maryamarab https://www.openstreetmap.org/user/Maryamarab
22 17496710 17 17.0 0.0 dellington_deli https://www.openstreetmap.org/user/dellington_...
23 21230154 15 0.0 15.0 VijayB2345 https://www.openstreetmap.org/user/VijayB2345
24 21125012 15 0.0 15.0 R237497 https://www.openstreetmap.org/user/R237497
25 16690030 15 0.0 15.0 Bouyaossere https://www.openstreetmap.org/user/Bouyaossere
26 13007399 14 0.0 14.0 elodienix https://www.openstreetmap.org/user/elodienix
27 11500943 14 0.0 14.0 Mariannamukulunga https://www.openstreetmap.org/user/Mariannamuk...
28 16739098 11 0.0 11.0 TUNGBANI FRANCIS XAVIER https://www.openstreetmap.org/user/TUNGBANI FR...
29 21366183 9 0.0 9.0 George Nkpagneer Gmagnancha https://www.openstreetmap.org/user/George Nkpa...
30 19297985 9 0.0 9.0 susan27 https://www.openstreetmap.org/user/susan27
31 15320557 8 0.0 8.0 GDC_Christie https://www.openstreetmap.org/user/GDC_Christie
32 21363124 6 0.0 6.0 Ibrahim Wunpini Sulemana https://www.openstreetmap.org/user/Ibrahim Wun...
33 10394726 5 0.0 5.0 Calvin Menttor https://www.openstreetmap.org/user/Calvin Menttor
34 21363912 4 3.0 1.0 Panladawuda https://www.openstreetmap.org/user/Panladawuda
35 21207980 2 0.0 2.0 Kenaurd OKAMA https://www.openstreetmap.org/user/Kenaurd OKAMA
36 21331501 1 0.0 1.0 DEKU PETER MEDODZI https://www.openstreetmap.org/user/DEKU PETER ...

Inspect the contributions made by the mapper with most edits.

top_mapper = user_stats_df.sort_values("n_edits")["user_id"][0]

# the lonboard map definition
osm_polygons_layer = lonboard.PolygonLayer.from_geopandas(
    osm_gdf[(osm_gdf.geom_type == 'Polygon') & (osm_gdf["user_id"] == top_mapper)],
    get_fill_color=[0, 0, 0, 0],
    get_line_width=1
)

osm_lines_layer = lonboard.PathLayer.from_geopandas(
    osm_gdf[(osm_gdf.geom_type == 'LineString') & (osm_gdf["user_id"] == top_mapper)],
)


osm_points_layer = lonboard.ScatterplotLayer.from_geopandas(
    osm_gdf[(osm_gdf.geom_type == 'Point') & (osm_gdf["user_id"] == top_mapper)],
)



osm_map = lonboard.Map(
    basemap_style=lonboard.basemap.CartoBasemap.Positron,
    layers=[
        hot_tm_layer,
        osm_polygons_layer,
        osm_lines_layer,
        osm_points_layer,
    ],
    view_state=view_state
)

display(osm_map)