from chardet.universaldetector import UniversalDetector


def get_encoding_file(file_name):
    """Guess the character encoding of a file using chardet.

    The file is read in binary mode and fed line by line to a
    ``UniversalDetector`` until the detector reports that it is done or
    the file is exhausted.

    :param file_name: path of the file to inspect.
    :return: the ``"encoding"`` entry of the detector result (utf-8, ascii,
        ISO-8859-1...). Per the chardet documentation this may be ``None``
        when no encoding could be determined -- callers should handle that.
    """
    detector = UniversalDetector()
    detector.reset()
    # Use a context manager so the file handle is released even when the
    # detector finishes early or feeding raises.
    with open(file_name, 'rb') as source:
        for line in source:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result["encoding"]
@staticmethod
def _maybe_fix_encoding_to_utf8(file_name):
    """Rewrite ``file_name`` in place as UTF-8 when it uses another encoding.

    The encoding is guessed with ``get_encoding_file``. The file is left
    untouched when no encoding could be detected, or when the detected
    codec is already UTF-8 or ASCII (ASCII bytes are valid UTF-8 as-is).
    Otherwise the content is decoded with the detected codec, written as
    UTF-8 to a scratch file in ``settings.TMP_DIR`` and atomically moved
    over the original path.

    NOTE(review): only the bytes are transcoded; an XML declaration naming
    the previous encoding is not rewritten -- confirm readers tolerate it.

    :param file_name: path of the file to check and possibly convert.
    :return: ``file_name`` (the path never changes, only the content may).
    :raises LookupError: if the detected encoding is unknown to Python.
    """
    encoding = get_encoding_file(file_name)

    # Nothing reliable to decode with: keep the file as it is rather than
    # letting codecs.open() fall back to the locale default encoding.
    if encoding is None:
        return file_name

    # Normalise the name so aliases and case variants of UTF-8 compare equal.
    codec_name = codecs.lookup(encoding).name
    if codec_name in ("utf-8", "ascii"):
        return file_name

    block_size = 9_048_576
    # A uniquely named scratch file avoids collisions between concurrent
    # conversions; delete=False because it is moved over the original.
    tmp_file = NamedTemporaryFile(mode='wb', dir=settings.TMP_DIR, delete=False)
    try:
        with tmp_file, codecs.open(file_name, "r", encoding) as source_file:
            for contents in iter(lambda: source_file.read(block_size), ""):
                tmp_file.write(contents.encode("utf-8"))
        os.replace(tmp_file.name, file_name)
    except Exception:
        # Do not leave a half-written scratch file behind on failure.
        if os.path.exists(tmp_file.name):
            os.remove(tmp_file.name)
        raise
    return file_name
class TestApidaeTrekParserConvertEncodingFiles(TestCase):
    """Check ``ApidaeTrekParser._maybe_fix_encoding_to_utf8`` on KML fixtures."""

    data_dir = "geotrek/trekking/tests/data"

    def _copy_fixture(self, fixture_name):
        """Copy ``<fixture_name>.kml`` to a scratch file removed after the test.

        The conversion rewrites its input in place, so each test works on a
        copy; registering the cleanup keeps the data directory free of
        leftover ``*_tmp.kml`` files even when an assertion fails.
        """
        import os  # local import: the module-level imports are not all visible here

        scratch_name = f'{self.data_dir}/{fixture_name}_tmp.kml'
        copyfile(f'{self.data_dir}/{fixture_name}.kml', scratch_name)
        self.addCleanup(os.remove, scratch_name)
        return scratch_name

    def test_fix_encoding_to_utf8(self):
        file_name = self._copy_fixture('file_bad_encoding')

        encoding = get_encoding_file(file_name)
        self.assertNotEqual(encoding, "utf-8")

        new_file_name = ApidaeTrekParser._maybe_fix_encoding_to_utf8(file_name)

        encoding = get_encoding_file(new_file_name)
        self.assertEqual(encoding, "utf-8")

    def test_not_fix_encoding_to_utf8(self):
        file_name = self._copy_fixture('file_good_encoding')

        encoding = get_encoding_file(file_name)
        self.assertEqual(encoding, "utf-8")

        new_file_name = ApidaeTrekParser._maybe_fix_encoding_to_utf8(file_name)

        encoding = get_encoding_file(new_file_name)
        self.assertEqual(encoding, "utf-8")