feat(pipeline): scraping réseau alpha #115

Merged
merged 21 commits on Sep 26, 2023
1 change: 1 addition & 0 deletions .template.env
@@ -87,6 +87,7 @@ MES_AIDES_AIRTABLE_KEY=
MES_AIDES_GARAGES_URL=https://airtable.com/appEvva5gyqqoQRnr/tblnGf4Y5EUEeVHtJ/viw9ZZAUkexq6uDaI
MONENFANT_CRECHES_FILE_URL=
ODSPEP_S3_KEY_PREFIX=sources/odspep/2023-01-23/denormalized/Exports/
RESEAU_ALPHA_HTML_FILE_URL=
SIAO_FILE_URL=
SIRENE_STOCK_ETAB_GEOCODE_FILE_URL=https://data.cquest.org/geo_sirene/v2019/last/StockEtablissementActif_utf8_geo.csv.gz
SIRENE_STOCK_ETAB_HIST_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/88fbb6b4-0320-443e-b739-b4376a012c32
3 changes: 2 additions & 1 deletion analyse/.template.env
@@ -8,4 +8,5 @@ FINESS_FILE_URL=https://www.data.gouv.fr/fr/datasets/r/3dc9b1d5-0157-440d-a7b5-c
CD72_FILE_URL=
CD93_FILE_URL=
CD35_FILE_URL=https://data.ille-et-vilaine.fr/dataset/8d5ec0f0-ebe1-442d-9d99-655b37d5ad07/resource/665776ae-fa25-46ab-9bfd-c4241866f03f/download/annuaire_sociale_fixe.csv
CD62_FILE_URL=
CD62_FILE_URL=
RESEAU_ALPHA_TEST_W_LOCAL_FILES=0
3 changes: 3 additions & 0 deletions analyse/notebooks/reseau-alpha/.gitignore
@@ -0,0 +1,3 @@
structures
services
*.html
331 changes: 331 additions & 0 deletions analyse/notebooks/reseau-alpha/extract.ipynb
@@ -0,0 +1,331 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scraping des structures et services publiés sur le site Web de Réseau alpha\n",
"\n",
"Le scraping commence sur cette page pour l'Essonne : https://www.reseau-alpha.org/trouver-une-formation?form%5BcodePostal%5D%5B%5D=%7C91&form%5BcriteresScolarisation%5D=&form%5BniveauLinguistiqueVise%5D=&form%5Bprogramme%5D=&form%5BmotCle%5D=\n",
"\n",
"Cette page est générée dynamiquement et Scrapy ne peut donc pas en extraire le contenu. Le HTML doit donc être extrait à la main et sauvegardé dans le même dossier que ce notebook sous le nom `structure_list.html`.\n",
"\n",
"Le script permet de scraper une copie locale du HTML pour les formations et les structures. C'est utile pour tester le script sans envoyer de requêtes au site Web original. Pour ce faire :\n",
"\n",
"1. Faire tourner au moins une fois le scrap avec RESEAU_ALPHA_TEST_W_LOCAL_FILES=0 pour télécharger le HTML depuis le site live sur l'ordinateur dans les dossiers `./structures` et `./services`\n",
"2. Set RESEAU_ALPHA_TEST_W_LOCAL_FILES=1\n",
"\n",
"### Structure du script\n",
"\n",
"1. `start_requests` démarre le scraping à partir de la page de résultats de rechercher\n",
"2. `parse` parse cette page pour extraire la liste des formations (pas encore les permanences)\n",
"3. `parse_formation` scrape le contenu de la page de chaque formation et passe le dictionnaire item à la fonction suivante\n",
"4. `parse_structure` scrape la page de la structure liée à la formation en enrichissant le dictionnaire item. Cette fonction est appelée autant de fois qu'il y a de lieux pour la formation\n",
"5. à la fin de `parse_structure`, le dictionnaire item est \"yield\" pour former une ligne du CSV (ou un objet dans l'array JSON)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import scrapy\n",
"from scrapy.crawler import CrawlerProcess\n",
"from pathlib import Path\n",
"from urllib.parse import urlparse \n",
"import re\n",
"import dateparser\n",
"import os\n",
"import dotenv\n",
"import trafilatura\n",
"\n",
"dotenv.load_dotenv(dotenv.find_dotenv())\n",
"TESTING_WITH_LOCAL_FILES = os.getenv(\"ENV_VAR\", 'False').lower() in ('true', '1', 't')\n",
"\n",
"# Local HTML\n",
"base_path = 'file://' + os.path.abspath('')\n",
"structure_base_path = base_path + '/structures'\n",
"formation_base_path = base_path + '/services'\n",
"\n",
"\n",
"\n",
"URL = f\"{base_path}/structure_list.html\"\n",
"if TESTING_WITH_LOCAL_FILES is False:\n",
" os.makedirs(structure_base_path, exist_ok=True)\n",
" os.makedirs(formation_base_path, exist_ok=True)\n",
"\n",
"# Live HTML (don't use too much to avoid being banned!)\n",
"# structure_base_url = 'https://www.reseau-alpha.org/structure/apprentissage-du-francais/'\n",
"\n",
"\n",
"# Structure avec antennes et formations : https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries\n",
"# Structure sans antenne et sans formation : https://www.reseau-alpha.org/structure/apprentissage-du-francais/acafi\n",
"# Formation : https://www.reseau-alpha.org/structure/apprentissage-du-francais/aries/formation/francais-a-visee-professionnelle/b8a73-francais-a-visee-sociale-et-ou-professionnelle\n",
"\n",
"def html_to_markdown(s: str):\n",
" if s is None or s == \"\" :\n",
" return s\n",
" if type(s) == list:\n",
" s = \"<br/>\".join(s)\n",
" return trafilatura.extract(trafilatura.load_html(\"<html>\" + s + \"</html>\"), no_fallback=True, max_tree_size=1000)\n",
"\n",
"def clean_adresse(adresses: list or scrapy.Selector) -> {} or []:\n",
" lieux = []\n",
" for adresse in adresses:\n",
" adresse_text_chunks = adresse.xpath('text()').getall()\n",
" clean_lieu = {\n",
" \"structure_service_adresse_entiere\": \"\",\n",
" \"structure_service_adresse\": \"\",\n",
" \"structure_service_code_postal\": \"\",\n",
" \"structure_service_commune\": \"\"\n",
" }\n",
" for part in adresse_text_chunks:\n",
" part = part.strip()\n",
" if re.match(r'^\\d', part):\n",
" if re.match(r'^\\d{5}', part):\n",
" split_address = part.split(\" - \")\n",
" clean_lieu[\"structure_service_code_postal\"] = split_address[0]\n",
" clean_lieu[\"structure_service_commune\"] = split_address[1]\n",
" else:\n",
" clean_lieu[\"structure_service_adresse\"] = part\n",
" clean_lieu[\"structure_service_adresse_entiere\"] += part + \", \"\n",
" lieux.append(clean_lieu)\n",
" return lieux\n",
"\n",
"def strip(maybe_string):\n",
" if type(maybe_string) == str:\n",
" return maybe_string.strip()\n",
" if maybe_string == None:\n",
" return \"\"\n",
" else:\n",
" return maybe_string"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class AlphaSpider(scrapy.Spider):\n",
" name = \"alpha\"\n",
" custom_settings = {\n",
" \"DOWNLOAD_DELAY\": 0 if TESTING_WITH_LOCAL_FILES else 0.5\n",
" }\n",
"\n",
" def start_requests(self):\n",
" urls = [\n",
" URL\n",
" ]\n",
" for url in urls:\n",
" yield scrapy.Request(url=url, callback=self.parse)\n",
"\n",
" def parse(self, response):\n",
" \n",
" formations_links = response.css('div#div-accordion-formation > div.contact-content a.readon')\n",
" \n",
" if TESTING_WITH_LOCAL_FILES:\n",
" for slug in formations_links.xpath('@href').getall():\n",
" next_page = f\"{formation_base_path}/{slug.split('/')[-1]}\"\n",
" yield scrapy.Request(next_page, callback=self.parse_formation)\n",
" else:\n",
" for a in formations_links:\n",
" yield response.follow(a, callback=self.parse_formation)\n",
"\n",
"\n",
" def parse_formation(self, response):\n",
"\n",
" if TESTING_WITH_LOCAL_FILES is False:\n",
" # Downloading HTML content\n",
" page = response.url.split(\"/\")[-1]\n",
" # Path doesn't deal with file:// URIs\n",
" filepath = Path(formation_base_path[7:]) / page\n",
" filepath.write_bytes(response.body)\n",
"\n",
" formation_entete = response.css('div.entete')\n",
" formation_contenu = response.css('div.entete + div')\n",
" formation_contenu_col1 = response.css('div.entete + div > div:nth-child(1)')\n",
" formation_contenu_col2 = response.css('div.entete + div > div:nth-child(2)')\n",
" formation_inscription_info = formation_contenu_col2.css('div:nth-of-type(1)')\n",
" formation_inscription_contact = formation_contenu_col2.css('div:nth-of-type(2)')\n",
" formation_informations_pratiques = formation_contenu_col2.css('div:nth-of-type(3)')\n",
" formation_lieux_horaires = response.css('div#lieux-formation')\n",
"\n",
"\n",
" # SERVICE\n",
" item = {}\n",
"\n",
" # Nom\n",
" service_nom_1 = strip(response.css(\"div.titre-element > strong::text\").get())\n",
" service_nom_2 = strip(response.css(\"a.underline.red-alpha + div::text\").get())\n",
" item[\"nom\"] = f\"{service_nom_1} ({service_nom_2})\"\n",
"\n",
" # Date de màj\n",
" date_maj_fr = strip(response.css(\"a.underline.red-alpha + div + div::text\").get().split(\":\")[-1])\n",
" item[\"date_maj\"] = dateparser.parse(date_maj_fr).isoformat()\n",
" \n",
" # Description\n",
" contenu_objectif_public = formation_contenu_col1.css(\".row div\").getall()\n",
" contenu_objectif_public += formation_informations_pratiques.get()\n",
" # les descriptions sont très longues et rendent difficiles le test des autres champs\n",
" # item[\"presentation_detail\"] = html_to_markdown(contenu_objectif_public)\n",
"\n",
" # Lien vers la source\n",
" item[\"lien_source\"] = response.url\n",
"\n",
" # Courriel\n",
" item[\"courriel\"] = strip(formation_inscription_contact.css('div.email.red-alpha > a::attr(href)').get()).split(\":\")[-1]\n",
"\n",
" # Adresse\n",
" clean_lieux = clean_adresse(formation_lieux_horaires.css(\"div.adresse\"))\n",
"\n",
" # Téléphone\n",
" item[\"telephone\"] = \"\"\n",
" \n",
" # Contact nom prénom\n",
" item[\"contact_nom_prenom\"] = \"\"\n",
"\n",
" # Thématiques\n",
" item[\"thematiques\"] = [\"apprendre-francais--suivre-formation\"]\n",
" if service_nom_2 == \"Français à visée professionnelle\":\n",
" item[\"thematiques\"].append(\"apprendre-francais--accompagnement-insertion-pro\")\n",
" if service_nom_2 == \"Français à visée sociale et communicative\":\n",
" item[\"thematiques\"].append(\"apprendre-francais--communiquer-vie-tous-les-jours\")\n",
"\n",
" # Hard coded fields\n",
" item[\"zone_diffusion_type\"] = \"departement\"\n",
" item[\"zone_diffusion_code\"] = \"91\"\n",
" item[\"zone_diffusion_nom\"] = \"Essonne\"\n",
" item[\"types\"] = [\"formation\"]\n",
" item[\"cumulable\"] = True\n",
" item[\"contact_public\"] = True\n",
" item[\"modes_accueil\"] = [\"en-presentiel\"]\n",
"\n",
" \n",
" # STRUCTURE\n",
" # ID de la structure\n",
" structure_link_element = formation_entete.css(\"div.titre-element ~ a.underline.red-alpha\")\n",
" item[\"structure_id\"] = structure_link_element.xpath(\"@href\").get().split(\"/\")[-1]\n",
" if TESTING_WITH_LOCAL_FILES:\n",
" structure_link = f\"{structure_base_path}/{item['structure_id']}\"\n",
" else:\n",
" structure_link = structure_link_element.xpath(\"@href\").get()\n",
" \n",
" \n",
"\n",
" # Une ligne/record de service et une structure par lieu\n",
" service_id_suffix = 1\n",
" for lieu in clean_lieux:\n",
" # Id\n",
" item[\"id\"] = f\"{response.url.split('/')[-1]}_{str(service_id_suffix)}\"\n",
" service_id_suffix += 1\n",
" print(lieu)\n",
" item = item | lieu\n",
" yield scrapy.Request(structure_link, callback=self.parse_structure, meta={\"item\": item}, dont_filter=True)\n",
" \n",
" def parse_structure(self, response):\n",
" if TESTING_WITH_LOCAL_FILES is False:\n",
" # Downloading HTML content\n",
" page = response.url.split(\"/\")[-1]\n",
" # Path doesn't deal with file:// URIs\n",
" filepath = Path(structure_base_path[7:]) / page\n",
" filepath.write_bytes(response.body)\n",
"\n",
" item = response.meta.get(\"item\")\n",
" \n",
"\n",
" # Nom\n",
" item[\"structure_nom\"] = strip(response.css('div#structure > strong::text').get())\n",
"\n",
" # Data màj\n",
" item[\"structure_date_maj\"] = strip(response.css('div.structures-dates > div:nth-child(2)').xpath('text()').get())\n",
" item[\"structure_date_maj\"] = item[\"structure_date_maj\"].split(\" : \")[-1]\n",
" item[\"structure_date_maj\"] = dateparser.parse(item[\"structure_date_maj\"]).isoformat()\n",
"\n",
" # Adresse\n",
" # Sur le site Web, une structure a autant d'adresses qu'elle a de lieux pour ses services\n",
" # Certains services sont proposés sur toutes les adresses de la structure, certains non.\n",
"\n",
" # Téléphone\n",
" telephone = response.css('div.lieu div.telephone > a::attr(href)').get()\n",
" if type(telephone) == str:\n",
" # Les numéro de téléphone sont préfixés par tel:\n",
" telephone = telephone.strip()[4:]\n",
" else:\n",
" telephone = \"\"\n",
" item[\"structure_telephone\"] = telephone\n",
" \n",
" # Site Web\n",
" item[\"structure_site_web\"] = strip(response.css('div.lieu div.facebook::text').get())\n",
"\n",
" # Lien source\n",
" item[\"structure_lien_source\"] = response.url\n",
"\n",
" # Labels\n",
" item[\"structure_labels_autres\"] = [\"reseau-alpha\"]\n",
"\n",
" # Thématiques\n",
" item[\"structure_thematiques\"] = [\"apprendre-francais--suivre-formation\"]\n",
"\n",
"\n",
" yield item\n",
"\n",
" \n",
"process = CrawlerProcess(settings={\n",
" \"FEEDS\": {\n",
" # Seul le JSON est utilisable dans le pipeline car le CSV imprime les listes sans square brackets ([])\n",
" # Le CSV est pratique pour tester\n",
" \"alpha.json\": {\n",
" \"format\": \"json\",\n",
" \"overwrite\": True,\n",
" \"ensure_ascii\": False,\n",
" 'encoding': 'utf8',\n",
" 'store_empty': False,\n",
" },\n",
" \"alpha.csv\": {\n",
" \"format\": \"csv\",\n",
" \"overwrite\": True,\n",
" 'encoding': 'utf8',\n",
" },\n",
" },\n",
"})\n",
"process.crawl(AlphaSpider)\n",
"process.start()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv('./alpha.csv', dtype = str, index_col=None)\n",
"df.info()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv-analyse",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
3 changes: 3 additions & 0 deletions analyse/requirements.in
@@ -12,3 +12,6 @@ seaborn
pyairtable
pyproj
minio
scrapy
dateparser
trafilatura