From 16114f53b8091fbedccce398b39d7fb17a946887 Mon Sep 17 00:00:00 2001 From: Maxime Buron <maxime.buron@inria.fr> Date: Tue, 6 Jun 2023 15:05:42 +0200 Subject: [PATCH] OSM data for MongoDB --- posts.org | 1 + posts/getting-osm-data-ready-for-mongodb.org | 169 ++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 posts/getting-osm-data-ready-for-mongodb.org diff --git a/posts.org b/posts.org index 2030889..1220278 100644 --- a/posts.org +++ b/posts.org @@ -1,5 +1,6 @@ #+TITLE: Posts +- [[file:posts/getting-osm-data-ready-for-mongodb.org][Getting OSM data ready for MongoDB]] - [[file:posts/defense.org][Soutenance]] - [[file:posts/checking-postgres-statistics.org][Checking Posgresql Statistics]] - [[file:posts/starwars-demo.org][Star Wars RDF Integration Demo]] diff --git a/posts/getting-osm-data-ready-for-mongodb.org b/posts/getting-osm-data-ready-for-mongodb.org new file mode 100644 index 0000000..06f0a9c --- /dev/null +++ b/posts/getting-osm-data-ready-for-mongodb.org @@ -0,0 +1,169 @@ +#+TITLE: Getting OSM data ready for MongoDB + +In the following, we will download JSON documents representing nodes, ways and relations from OpenStreetMap using the [[https://overpass-turbo.eu/#][Overpass API]]. Then, we will transform this data in order to have more nested documents. + +* Downloading data + +I will focus on the area of Clermont-Ferrand, but you can do the same with your area. I will present the different Overpass queries I used to download the three types of OSM elements I need. Once you have selected the area using the map and you have typed the query, you can download the corresponding data with "Export>raw data from Overpass API". 
+ +For nodes +#+BEGIN_example +[out:json]; +( +node + [name] + ({{bbox}}); +node + [amenity] + ({{bbox}}); +node + [natural] + ({{bbox}}); + ); +out; +#+END_example + + +For ways +#+BEGIN_example +[out:json]; + +way + [name] + (1,{{bbox}}); + +(._;>;); + +out; +#+END_example + +For relations +#+BEGIN_example +[out:json]; + +relation + [name] + (1,{{bbox}}); + +(._;>;); + +out; +#+END_example + +In my example, I just need the three following commands: +#+BEGIN_src sh +wget https://overpass-api.de/api/interpreter?data=%2F*%0AThis%20is%20an%20example%20Overpass%20query.%0ATry%20it%20out%20by%20pressing%20the%20Run%20button%20above%21%0AYou%20can%20find%20more%20examples%20with%20the%20Load%20tool.%0A*%2F%0A%5Bout%3Ajson%5D%3B%0A%28%0Anode%0A%20%20%5Bname%5D%0A%20%20%2845.72631510756138%2C3.0054473876953125%2C45.83729122987253%2C3.1865501403808594%29%3B%0Anode%0A%20%20%5Bamenity%5D%0A%20%20%2845.72631510756138%2C3.0054473876953125%2C45.83729122987253%2C3.1865501403808594%29%3B%0Anode%0A%20%20%5Bnatural%5D%0A%20%20%2845.72631510756138%2C3.0054473876953125%2C45.83729122987253%2C3.1865501403808594%29%3B%0A%20%20%29%3B%0Aout%3B -O clermont-node.json + +wget https://overpass-api.de/api/interpreter?data=%0A%5Bout%3Ajson%5D%3B%0A%0Away%0A%20%20%5Bname%5D%0A%20%20%2845.726434941923486%2C3.0152320861816406%2C45.8374108259422%2C3.1963348388671875%29%3B%0A%0A%28._%3B%3E%3B%29%3B%0A%0Aout%3B -O clermont-way.json + +wget https://overpass-api.de/api/interpreter?data=%2F*%0AThis%20is%20an%20example%20Overpass%20query.%0ATry%20it%20out%20by%20pressing%20the%20Run%20button%20above%21%0AYou%20can%20find%20more%20examples%20with%20the%20Load%20tool.%0A*%2F%0A%5Bout%3Ajson%5D%3B%0Arelation%0A%20%20%5Bname%5D%0A%20%20%2845.72631510756138%2C3.0054473876953125%2C45.83729122987253%2C3.1865501403808594%29%3B%0A%0A%2F*added%20by%20auto%20repair*%2F%0A%28._%3B%3E%3B%29%3B%0A%2F*end%20of%20auto%20repair*%2F%0Aout%3B -O clermont-relation.json + +#+END_src + +* Extracting an array of OSM elements + 
+The JSON document provided by the Overpass API is not ready to be loaded into MongoDB: we first need to project it onto its elements array. + +#+BEGIN_example +jq .elements download-file.json > file-to-load.json +#+END_example + +In my case, I have +#+BEGIN_src sh + jq .elements clermont-node.json > clermont-nodes.json + jq .elements clermont-way.json > clermont-ways.json + jq .elements clermont-relation.json > clermont-relations.json +#+END_src + +* Ideas for interesting queries + +Query the parks + +On which place is the Tabac du Mazet? +#+BEGIN_src js + db.clermont.find({type: 'node','tags.name': RegExp("Tabac du Mazet")}) + + db.clermont.find({type: 'relation', members: {$elemMatch: {ref: 170201638}}}) +#+END_src + +Creating a 2d index requires changing the structure of the documents to include GeoJSON information. + +Create a small search engine using a text index (you should have results for "Usinage", "Le Rio", +10 results for "association") + +What is the mean elevation of the peaks? + +#+BEGIN_src js +db.clermont.aggregate([{$match: {"node.tags.highway": "traffic_signals"}}, {$project: {"tags.name": 1}}]) +#+END_src + +** About ISIMA + +#+BEGIN_src js +db.clermont.find({"tags.name": /ISIMA/}) + +#+END_src + +* Cleaning and restructuring the data :noexport: + +#+BEGIN_src js + // count the duplicates + db.clermont.aggregate([{$group: { _id: "$id", uniqueIds: {$addToSet: "$_id"}, count: {$sum: 1}}}, {$match: {count: {$gt: 1}}}, {"$count": "count"}]) + // remove them + db.clermont.aggregate([{$group: { _id: "$id", uniqueIds: {$addToSet: "$_id"}, count: {$sum: 1}}}, {$match: {count: {$gt: 1}}}]).forEach(function(doc){doc.uniqueIds.shift(); db.clermont.remove({_id: {$in: doc.uniqueIds}})}) + + db.newclermont.aggregate([{"$set": {"_id": "$id"}},{$unset: "id"}, {$merge: {into:"newclermont2"}}]) + db.newclermont2.aggregate([{"$set": {"url": {"$concat": ["https://www.openstreetmap.org/", "$type", "/", {"$toString": "$_id"}]}}}, {$merge: {into:"newclermont2"}}]) + + 
db.newclermont2.aggregate([{ + $match: { + type: 'way' + } + }, { + $unwind: { + path: '$nodes' + } + }, { + $lookup: { + from: 'newclermont2', + localField: 'nodes', + foreignField: '_id', + as: 'node' + } + }, { + $unwind: { + path: '$node' + } + }, { + $unset: 'nodes' + }, { + $group: { + _id: '$_id', + nodes: { + $push: '$node' + }, + obj: { + $first: '$$ROOT' + } + } + }, { + $set: { + 'obj.nodes': '$nodes' + } + }, { + $replaceRoot: { + newRoot: '$obj' + } + }, { + $unset: 'node' + }, {$merge: {into:"newclermont3"}}]) + + db.newclermont2.aggregate([{$match: {type: {$in:["relation", "node"]}}}, {$merge: {into: "newclermont3"}}]) + + // remove the _id of the node in the ways + db.clermont.updateMany({nodes: {$exists : true}}, {$unset: { "nodes.$[]._id": ""}}) + + // list the ref ids in the relations + db.clermont.aggregate([{$match: {type: "relation"}}, {$project: {"members":1}}, {$unwind: "$members"}, {$group:{_id: null, refs: {$addToSet:"$members.ref"}}}]) + + +#+END_src -- GitLab