@staticmethod
def _get_file_with_good_encoding(file_name):
    """Return the path of a file holding the content of ``file_name`` as UTF-8.

    The source encoding is guessed with chardet's ``UniversalDetector``.
    When the content is already UTF-8 (or plain ASCII, which is a strict
    subset of UTF-8), or when no encoding could be guessed, ``file_name``
    itself is returned and nothing is written.

    Otherwise the content is transcoded into a new, uniquely named file
    created in ``settings.TMP_DIR`` and the path of that file is returned.
    The new file is not deleted automatically: the caller should remove it
    once it is no longer needed (only when the returned path differs from
    ``file_name``).

    :param file_name: path of the file to inspect.
    :return: path of a file whose content is UTF-8 encoded.
    :raises OSError: if ``file_name`` cannot be read or the temporary file
        cannot be written.
    :raises UnicodeError: if the content cannot be decoded with the guessed
        encoding.
    """
    # Guess the encoding (utf-8, ascii, ISO-8859-1...).
    detector = UniversalDetector()
    detector.reset()
    # The context manager closes the handle even when detection stops early.
    with open(file_name, 'rb') as raw_file:
        for line in raw_file:
            detector.feed(line)
            # Condition kept on 1 line so that coverage stays complete.
            if detector.done: break  # noqa: E701
    detector.close()

    encoding = detector.result["encoding"]
    # None means chardet could not guess anything (e.g. empty file): there is
    # no codec to decode with, so the file is handed back untouched.
    if encoding is None or encoding.lower() in ("utf-8", "ascii"):
        return file_name

    # NOTE(review): an XML declaration naming the former encoding is copied
    # verbatim into the converted file - confirm the reader copes with it.
    block_size = 1_048_576
    # A unique name per call keeps concurrent imports from overwriting each
    # other's converted file. newline='' disables newline translation so only
    # the character encoding changes.
    target_file = NamedTemporaryFile(mode='w', encoding='utf-8', newline='',
                                     dir=settings.TMP_DIR, delete=False)
    try:
        with target_file, open(file_name, 'r', encoding=encoding, newline='') as source_file:
            while True:
                contents = source_file.read(block_size)
                if not contents:
                    break
                target_file.write(contents)
    except Exception:
        # Do not leave a partially written file behind.
        os.remove(target_file.name)
        raise
    return target_file.name
class TestApidaeTrekParserConvertEncodingFiles(TestCase):
    """Checks that ``ApidaeTrekParser`` hands back a UTF-8 version of a file."""

    @staticmethod
    def _detect_encoding(path):
        """Return the encoding chardet guesses for the file at ``path``."""
        detector = UniversalDetector()
        detector.reset()
        # The context manager closes the handle even when detection stops early.
        with open(path, 'rb') as raw_file:
            for line in raw_file:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        return detector.result["encoding"]

    def test_get_file_with_good_encoding(self):
        # Local import: only this test needs it.
        import os

        # Resolved from this module so the test does not depend on the
        # directory the test runner was started from.
        file_name = os.path.join(os.path.dirname(__file__), 'data', 'file_bad_encoding.kml')
        self.assertNotEqual(self._detect_encoding(file_name), "utf-8")

        file_path = ApidaeTrekParser._get_file_with_good_encoding(file_name)
        if file_path != file_name:
            # The converted copy is a leftover of this test only.
            self.addCleanup(os.remove, file_path)

        self.assertEqual(self._detect_encoding(file_path), "utf-8")