From 16114f53b8091fbedccce398b39d7fb17a946887 Mon Sep 17 00:00:00 2001
From: Maxime Buron <maxime.buron@inria.fr>
Date: Tue, 6 Jun 2023 15:05:42 +0200
Subject: [PATCH] OSM data for MongoDB

---
 posts.org                                     |   1 +
 posts/getting-osm-data-ready-for-mongodb.org  | 169 ++++++++++++++++++
 2 files changed, 170 insertions(+)
 create mode 100644 posts/getting-osm-data-ready-for-mongodb.org 

diff --git a/posts.org b/posts.org
index 2030889..1220278 100644
--- a/posts.org
+++ b/posts.org
@@ -1,5 +1,6 @@
 #+TITLE: Posts
 
+- [[file:posts/getting-osm-data-ready-for-mongodb.org ][Getting OSM data ready for MongoDB]]
 - [[file:posts/defense.org][Soutenance]]
 - [[file:posts/checking-postgres-statistics.org][Checking Posgresql Statistics]]
 - [[file:posts/starwars-demo.org][Star Wars RDF Integration Demo]]
diff --git a/posts/getting-osm-data-ready-for-mongodb.org  b/posts/getting-osm-data-ready-for-mongodb.org 
new file mode 100644
index 0000000..06f0a9c
--- /dev/null
+++ b/posts/getting-osm-data-ready-for-mongodb.org 	
@@ -0,0 +1,169 @@
+#+TITLE: Getting OSM data ready for MongoDB
+
+In the following, we will download JSON documents representing node, ways and relations from OpenStreetMap using the [[https://overpass-turbo.eu/#][overpass API]]. Then, we will transform this data in order to have more nested documents.
+
+* Downloading data
+
+I will focus on the area of Clermont-Ferrand, but you can do the same with your area. I will present the different Overpass queries I used to download the three types OSM elements I need. Once you have selected the area using the map and you have typed the query, you can download the corresponding data with "Export>raw data from Overpass API".
+
+For nodes
+#+BEGIN_example
+[out:json];
+(
+node
+  [name]
+  ({{bbox}});
+node
+  [amenity]
+  ({{bbox}});
+node
+  [natural]
+  ({{bbox}});
+  );
+out;
+#+END_example
+
+
+For ways
+#+BEGIN_example
+[out:json];
+
+way
+  [name]
+  (1,{{bbox}});
+
+(._;>;);
+
+out;
+#+END_example
+
+For relations
+#+BEGIN_example
+[out:json];
+
+relation
+  [name]
+  (1,{{bbox}});
+
+(._;>;);
+
+out;
+#+END_example
+
+In my example, I just need the three following commands:
+#+BEGIN_src sh
+wget https://overpass-api.de/api/interpreter?data=%2F*%0AThis%20is%20an%20example%20Overpass%20query.%0ATry%20it%20out%20by%20pressing%20the%20Run%20button%20above%21%0AYou%20can%20find%20more%20examples%20with%20the%20Load%20tool.%0A*%2F%0A%5Bout%3Ajson%5D%3B%0A%28%0Anode%0A%20%20%5Bname%5D%0A%20%20%2845.72631510756138%2C3.0054473876953125%2C45.83729122987253%2C3.1865501403808594%29%3B%0Anode%0A%20%20%5Bamenity%5D%0A%20%20%2845.72631510756138%2C3.0054473876953125%2C45.83729122987253%2C3.1865501403808594%29%3B%0Anode%0A%20%20%5Bnatural%5D%0A%20%20%2845.72631510756138%2C3.0054473876953125%2C45.83729122987253%2C3.1865501403808594%29%3B%0A%20%20%29%3B%0Aout%3B -O clermont-node.json
+
+wget https://overpass-api.de/api/interpreter?data=%0A%5Bout%3Ajson%5D%3B%0A%0Away%0A%20%20%5Bname%5D%0A%20%20%2845.726434941923486%2C3.0152320861816406%2C45.8374108259422%2C3.1963348388671875%29%3B%0A%0A%28._%3B%3E%3B%29%3B%0A%0Aout%3B -O clermont-way.json
+
+wget https://overpass-api.de/api/interpreter?data=%2F*%0AThis%20is%20an%20example%20Overpass%20query.%0ATry%20it%20out%20by%20pressing%20the%20Run%20button%20above%21%0AYou%20can%20find%20more%20examples%20with%20the%20Load%20tool.%0A*%2F%0A%5Bout%3Ajson%5D%3B%0Arelation%0A%20%20%5Bname%5D%0A%20%20%2845.72631510756138%2C3.0054473876953125%2C45.83729122987253%2C3.1865501403808594%29%3B%0A%0A%2F*added%20by%20auto%20repair*%2F%0A%28._%3B%3E%3B%29%3B%0A%2F*end%20of%20auto%20repair*%2F%0Aout%3B -O clermont-relation.json
+
+#+END_src
+
+* Extracting an array of OSM elements
+
+The JSON document provided by the Overpass API is not ready to be loaded in MongoDB, we first need to project it on the elements array.
+
+#+BEGIN_example
+jq .elements download-file.json > file-to-load.json
+#+END_example
+
+In my case, I have
+#+BEGIN_src sh
+  jq .elements clermont-node.json > clermont-nodes.json
+  jq .elements clermont-way.json > clermont-ways.json
+  jq .elements clermont-relation.json > clermont-relations.json
+#+END_src
+
+* Ideas of interesting queries
+
+Query the parks 
+
+On which place is the Tabac du Mazet ?
+#+BEGIN_src js
+  db.clermont.find({type: 'node','tags.name': RegExp("Tabac du Mazet")})
+
+  db.clermont.find({type: 'relation', members: {$elemMatch: {ref: 170201638}}})
+#+END_src
+
+Create a 2d index requires to change the structure of the documents to include GeoJSON information.
+
+Create a small search engine using an text index (you should have results for "Usinage", "Le Rio", +10 results for "association")
+
+What is the mean elevation of the peaks
+
+#+BEGIN_src js
+db.clermont.aggregate([{$match: {"node.tags.highway": "traffic_signals"}}, {$project: {"tags.name": 1}}])
+#+END_src
+
+** About ISIMA
+
+#+BEGIN_src js
+db.clermont.find({"tags.name": /ISIMA/})
+
+#+END_src
+
+* Cleaning and restructure the data                               :noexports:
+
+#+BEGIN_src js
+       // count the duplicates
+         db.clermont.aggregate([{$group: { _id: "$id", uniqueIds: {$addToSet: "$_id"}, count: {$sum: 1}}}, {$match: {count: {$gt: 1}}}, {"$count": "count"}])
+       // remove them
+      db.clermont.aggregate([{$group: { _id: "$id", uniqueIds: {$addToSet: "$_id"}, count: {$sum: 1}}}, {$match: {count: {$gt: 1}}}]).forEach(function(doc){doc.uniqueIds.shift(); db.clermont.remove({_id: {$in: doc.uniqueIds}})})
+
+    db.newclermont.aggregate([{"$set": {"_id": "$id"}},{$unset: "id"}, {$merge: {into:"newclermont2"}}])
+  db.newclermont2.aggregate([{"$set": {"url": {"$concat": ["https://www.openstreetmap.org/", "$type", "/", {"$toString": "$_id"}]}}}, {$merge: {into:"newclermont2"}}])
+
+  db.newclermont2.aggregate([{
+   $match: {
+    type: 'way'
+   }
+  }, {
+   $unwind: {
+    path: '$nodes'
+   }
+  }, {
+   $lookup: {
+    from: 'newclermont2',
+    localField: 'nodes',
+    foreignField: '_id',
+    as: 'node'
+   }
+  }, {
+   $unwind: {
+    path: '$node'
+   }
+  }, {
+   $unset: 'nodes'
+  }, {
+   $group: {
+    _id: '$_id',
+    nodes: {
+     $push: '$node'
+    },
+    obj: {
+     $first: '$$ROOT'
+    }
+   }
+  }, {
+   $set: {
+    'obj.nodes': '$nodes'
+   }
+  }, {
+   $replaceRoot: {
+    newRoot: '$obj'
+   }
+  }, {
+   $unset: 'node'
+  }, {$merge: {into:"newclermont3"}}])
+
+  db.newclermont2.aggregate([{$match: {type: {$in:["relation", "node"]}}}, {$merge: {into: "newclermont3"}}])
+
+      // remove the _id of the node in the ways
+      db.clermont.updateMany({nodes: {$exists : true}}, {$unset: { "nodes.$[]._id": ""}})
+
+    // list the ref ids in the relations
+    db.clermont.aggregate([{$match: {type: "relation"}}, {$project: {"members":1}}, {$unwind: "$members"}, {$group:{_id: null, refs: {$addToSet:"$members.ref"}}}])
+
+
+#+END_src
-- 
GitLab