GeotrekCE · juggler31 · Jan 30, 2024 · Jan 22, 2024
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -5,6 +5,10 @@ CHANGELOG
 2.101.5+dev (XXXX-XX-XX)
 ------------------------
 
+**Improvements**
+
+- Allow Apidae Trek parser to handle traces that are not encoded in UTF-8
+
 **Documentation**
 
 - Improve performance in spatial intersection (zoning district and zoning city) for sql views (#3600)
@@ -5529,4 +5533,4 @@ Installation script
 * Fix regex for RAISE NOTICE (fixes #673)
 * Initial public version
 
-See project history in `docs/history.rst` (French).
+See project history in `docs/history.rst` (French).
diff --git a/geotrek/common/utils/file_infos.py b/geotrek/common/utils/file_infos.py
@@ -0,0 +1,13 @@
+from chardet.universaldetector import UniversalDetector
+
+
+def get_encoding_file(file_name):
+    """Return the encoding chardet detects for the file ``file_name`` (utf-8, ascii, ISO-8859-1...)."""
+    detector = UniversalDetector()  # a freshly built detector needs no explicit reset()
+    with open(file_name, 'rb') as stream:  # context manager guarantees the handle is closed
+        for line in stream:
+            detector.feed(line)
+            if detector.done:  # detector has settled on an answer; stop reading early
+                break
+    detector.close()
+    return detector.result["encoding"]
diff --git a/geotrek/trekking/parsers.py b/geotrek/trekking/parsers.py
@@ -3,9 +3,11 @@
 import re
 import zipfile
 from collections import defaultdict
-from datetime import date, timedelta
+from datetime import date, timedelta, datetime
 from decimal import Decimal
 from tempfile import NamedTemporaryFile
+import codecs
+import os
 
 from django.conf import settings
 from django.contrib.gis.gdal import DataSource
@@ -14,6 +16,7 @@
 from django.utils.translation import gettext as _
 from modeltranslation.utils import build_localized_fieldname
 
+from geotrek.common.utils.file_infos import get_encoding_file
 from geotrek.common.models import Label, Theme
 from geotrek.common.parsers import (ApidaeBaseParser, AttachmentParserMixin,
                                     GeotrekParser, GlobalImportError, Parser,
@@ -903,7 +906,9 @@ def _get_geom_from_gpx(data):
         with NamedTemporaryFile(mode='w+b', dir=settings.TMP_DIR) as ntf:
             ntf.write(data)
             ntf.flush()
-            ds = DataSource(ntf.name)
+
+            file_path = ApidaeTrekParser._maybe_fix_encoding_to_utf8(ntf.name)
+            ds = DataSource(file_path)
             for layer_name in ('tracks', 'routes'):
                 layer = ApidaeTrekParser._get_layer(ds, layer_name)
                 geos = ApidaeTrekParser._maybe_get_linestring_from_layer(layer)
@@ -912,6 +917,24 @@ def _get_geom_from_gpx(data):
             geos.transform(settings.SRID)
             return geos
 
+    @staticmethod
+    def _maybe_fix_encoding_to_utf8(file_name):
+        """Re-encode ``file_name`` in place as UTF-8 when another encoding is detected; return ``file_name``."""
+        encoding = get_encoding_file(file_name)
+        # Detection failure (None), UTF-8 and plain ASCII all mean there is nothing to convert.
+        if encoding is None or encoding.lower() in ("utf-8", "ascii"):
+            return file_name
+        # A tempfile-generated name cannot collide the way a timestamp-based one can.
+        with NamedTemporaryFile(mode='wb', dir=settings.TMP_DIR, delete=False) as placeholder:
+            tmp_file_path = placeholder.name
+        with codecs.open(file_name, "r", encoding) as source_file:
+            with codecs.open(tmp_file_path, "w", "utf-8") as target_file:
+                # Copy in chunks of roughly 1 MiB so a large trace is never held in memory at once.
+                for contents in iter(lambda: source_file.read(1_048_576), ""):
+                    target_file.write(contents)
+        os.replace(tmp_file_path, file_name)
+        return file_name
+
     @staticmethod
     def _get_geom_from_kml(data):
         """Given KML data as bytes it returns a geom."""
@@ -935,7 +958,9 @@ def get_first_geom_with_type_in(types, geoms):
         with NamedTemporaryFile(mode='w+b', dir=settings.TMP_DIR) as ntf:
             ntf.write(data)
             ntf.flush()
-            ds = DataSource(ntf.name)
+
+            file_path = ApidaeTrekParser._maybe_fix_encoding_to_utf8(ntf.name)
+            ds = DataSource(file_path)
             geos = get_geos_linestring(ds)
             geos.transform(settings.SRID)
             return geos

diff --git a/geotrek/trekking/tests/data/file_bad_encoding.kml b/geotrek/trekking/tests/data/file_bad_encoding.kml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<kml xmlns="http://www.opengis.net/kml/2.2" xmlns:gx="http://www.google.com/kml/ext/2.2" xmlns:kml="http://www.opengis.net/kml/2.2" xmlns:atom="http://www.w3.org/2005/Atom">
+    <Document>
+        <description>
+            Testé dans le cadre des parseurs donc les caractères: & à @ é ê è doivent être bien affichés
+        </description>
+    </Document>
+</kml>
diff --git a/geotrek/trekking/tests/data/file_good_encoding.kml b/geotrek/trekking/tests/data/file_good_encoding.kml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<kml xmlns="http://www.opengis.net/kml/2.2" xmlns:gx="http://www.google.com/kml/ext/2.2" xmlns:kml="http://www.opengis.net/kml/2.2" xmlns:atom="http://www.w3.org/2005/Atom">
+    <Document>
+        <description>
+            TestÃ© dans le cadre des parseurs donc les caractÃ¨res: & Ã  @ Ã© Ãª Ã¨ doivent Ãªtre bien affichÃ©s
+        </description>
+    </Document>
+</kml>
diff --git a/geotrek/trekking/tests/test_parsers.py b/geotrek/trekking/tests/test_parsers.py
@@ -7,6 +7,7 @@
 from unittest import skipIf
 from unittest.mock import Mock
 from urllib.parse import urlparse
+from shutil import copy as copyfile
 
 from django.conf import settings
 from django.contrib.gis.geos import Point, LineString, MultiLineString, WKTWriter
@@ -16,6 +17,7 @@
 from django.test.utils import override_settings
 
 from geotrek.common.utils import testdata
+from geotrek.common.utils.file_infos import get_encoding_file
 from geotrek.common.models import Theme, FileType, Attachment, Label
 from geotrek.common.tests.mixins import GeotrekParserTestMixin
 from geotrek.core.tests.factories import PathFactory
@@ -1148,6 +1150,34 @@ def test_trek_illustration_is_not_imported_on_missing_file_metadata(self, mocked
         self.assertEqual(Attachment.objects.count(), 0)
 
 
+class TestApidaeTrekParserConvertEncodingFiles(TestCase):
+    """Exercise ``ApidaeTrekParser._maybe_fix_encoding_to_utf8`` on disposable copies of the KML fixtures."""
+    data_dir = "geotrek/trekking/tests/data"
+
+    def _working_copy(self, stem):
+        """Copy ``<stem>.kml`` to ``<stem>_tmp.kml`` and schedule the copy for removal after the test."""
+        import os  # function-scope import keeps this class independent of the module's import list
+        file_name = f'{self.data_dir}/{stem}_tmp.kml'
+        copyfile(f'{self.data_dir}/{stem}.kml', file_name)
+        self.addCleanup(os.remove, file_name)
+        return file_name
+
+    def test_fix_encoding_to_utf8(self):
+        """A fixture that is not UTF-8 is detected as UTF-8 after the call."""
+        file_name = self._working_copy('file_bad_encoding')
+        # The fixture must start out as something other than UTF-8 for the test to be meaningful.
+        self.assertNotEqual(get_encoding_file(file_name), "utf-8")
+        new_file_name = ApidaeTrekParser._maybe_fix_encoding_to_utf8(file_name)
+        self.assertEqual(get_encoding_file(new_file_name), "utf-8")
+
+    def test_not_fix_encoding_to_utf8(self):
+        """A fixture that is already UTF-8 is still detected as UTF-8 after the call."""
+        file_name = self._working_copy('file_good_encoding')
+        self.assertEqual(get_encoding_file(file_name), "utf-8")
+        new_file_name = ApidaeTrekParser._maybe_fix_encoding_to_utf8(file_name)
+        self.assertEqual(get_encoding_file(new_file_name), "utf-8")
+
+
 class TestApidaeTrekThemeParser(ApidaeTrekThemeParser):
 
     url = 'https://example.net/fake/api/'

diff --git a/requirements.txt b/requirements.txt
@@ -51,6 +51,8 @@ cffi==1.16.0
     #   persistent
     #   pyvips
     #   weasyprint
+chardet==5.2.0
+    # via geotrek (setup.py)
 charset-normalizer==3.2.0
     # via requests
 click==8.1.3

diff --git a/setup.py b/setup.py
@@ -34,6 +34,7 @@ def run(self):
     install_requires=[
         'Django==3.2.*',
         'mapentity',
+        'chardet',
         'cairosvg',
         'cairocffi',
         'env_file',