Skip to content

Commit

Permalink
💫 [FIX] Allow Apidae Trek parser to handle traces not in utf-8
Browse files Browse the repository at this point in the history
  • Loading branch information
juggler31 committed Jan 29, 2024
1 parent 216fa43 commit 7ae2cb0
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 4 deletions.
6 changes: 5 additions & 1 deletion docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ CHANGELOG
2.101.5+dev (XXXX-XX-XX)
------------------------

**Improvements**

- Allow Apidae Trek parser to handle traces not in utf-8

**Documentation**

- Improve performance in spatial intersection (zoning district and zoning city) for sql views (#3600)
Expand Down Expand Up @@ -5529,4 +5533,4 @@ Installation script
* Fix regex for RAISE NOTICE (fixes #673)
* Initial public version

See project history in `docs/history.rst` (French).
See project history in `docs/history.rst` (French).
13 changes: 13 additions & 0 deletions geotrek/common/utils/file_infos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from chardet.universaldetector import UniversalDetector


def get_encoding_file(file_name):
    """Guess the text encoding of a file with chardet.

    The file is fed line by line to a ``UniversalDetector`` and reading stops
    as soon as the detector reports that it is done.

    :param file_name: path of the file to inspect.
    :return: the ``"encoding"`` entry of the detector result (utf-8, ascii,
        ISO-8859-1...). NOTE(review): per the chardet documentation this can
        be ``None`` when no guess could be made - callers should handle it.
    """
    detector = UniversalDetector()
    detector.reset()
    # Binary mode: the detector is fed the raw lines of the file. The context
    # manager guarantees the handle is closed, including on the early break.
    with open(file_name, 'rb') as file:
        for line in file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result["encoding"]
31 changes: 28 additions & 3 deletions geotrek/trekking/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import re
import zipfile
from collections import defaultdict
from datetime import date, timedelta
from datetime import date, timedelta, datetime
from decimal import Decimal
from tempfile import NamedTemporaryFile
import codecs
import os

from django.conf import settings
from django.contrib.gis.gdal import DataSource
Expand All @@ -14,6 +16,7 @@
from django.utils.translation import gettext as _
from modeltranslation.utils import build_localized_fieldname

from geotrek.common.utils.file_infos import get_encoding_file
from geotrek.common.models import Label, Theme
from geotrek.common.parsers import (ApidaeBaseParser, AttachmentParserMixin,
GeotrekParser, GlobalImportError, Parser,
Expand Down Expand Up @@ -903,7 +906,9 @@ def _get_geom_from_gpx(data):
with NamedTemporaryFile(mode='w+b', dir=settings.TMP_DIR) as ntf:
ntf.write(data)
ntf.flush()
ds = DataSource(ntf.name)

file_path = ApidaeTrekParser._maybe_fix_encoding_to_utf8(ntf.name)
ds = DataSource(file_path)
for layer_name in ('tracks', 'routes'):
layer = ApidaeTrekParser._get_layer(ds, layer_name)
geos = ApidaeTrekParser._maybe_get_linestring_from_layer(layer)
Expand All @@ -912,6 +917,24 @@ def _get_geom_from_gpx(data):
geos.transform(settings.SRID)
return geos

@staticmethod
def _maybe_fix_encoding_to_utf8(file_name):
    """Rewrite ``file_name`` in place as UTF-8 when it uses another encoding.

    The encoding is guessed with ``get_encoding_file``. When it is already
    UTF-8, or when no encoding could be guessed, the file is left untouched.
    Otherwise the content is transcoded into a scratch file created in
    ``settings.TMP_DIR``, which then replaces the original file.

    :param file_name: path of the file to check and, if needed, convert.
    :return: ``file_name`` (the path is unchanged, only the content may be).
    :raises UnicodeDecodeError: if the content does not match the guessed
        encoding; the original file is left intact in that case.
    """
    encoding = get_encoding_file(file_name)

    # No guess available (None) means there is nothing reliable to transcode
    # from; an already UTF-8 file needs no work. The comparison ignores case
    # so that a differently-cased "UTF-8" label is not pointlessly rewritten.
    if encoding is None or encoding.lower() == "utf-8":
        return file_name

    block_size = 9_048_576
    # A uniquely named scratch file avoids collisions between concurrent
    # imports; delete=False because it is moved over the original on success.
    target_file = NamedTemporaryFile(
        mode="w", encoding="utf-8", newline="",
        dir=settings.TMP_DIR, prefix="fileNameTmp_", delete=False,
    )
    try:
        # newline="" on both sides keeps line endings byte-for-byte identical.
        with target_file, open(file_name, "r", encoding=encoding, newline="") as source_file:
            for contents in iter(lambda: source_file.read(block_size), ""):
                target_file.write(contents)
        os.replace(target_file.name, file_name)
    except Exception:
        # Do not leave a half-written scratch file behind on failure.
        if os.path.exists(target_file.name):
            os.remove(target_file.name)
        raise
    return file_name

@staticmethod
def _get_geom_from_kml(data):
"""Given KML data as bytes it returns a geom."""
Expand All @@ -935,7 +958,9 @@ def get_first_geom_with_type_in(types, geoms):
with NamedTemporaryFile(mode='w+b', dir=settings.TMP_DIR) as ntf:
ntf.write(data)
ntf.flush()
ds = DataSource(ntf.name)

file_path = ApidaeTrekParser._maybe_fix_encoding_to_utf8(ntf.name)
ds = DataSource(file_path)
geos = get_geos_linestring(ds)
geos.transform(settings.SRID)
return geos
Expand Down
8 changes: 8 additions & 0 deletions geotrek/trekking/tests/data/file_bad_encoding.kml.kml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2" xmlns:gx="http://www.google.com/kml/ext/2.2" xmlns:kml="http://www.opengis.net/kml/2.2" xmlns:atom="http://www.w3.org/2005/Atom">
<Document>
<description>
Testé dans le cadre des parseurs donc les caractères: & à @ é ê è doivent être bien affichés
</description>
</Document>
</kml>
8 changes: 8 additions & 0 deletions geotrek/trekking/tests/data/file_good_encoding.kml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2" xmlns:gx="http://www.google.com/kml/ext/2.2" xmlns:kml="http://www.opengis.net/kml/2.2" xmlns:atom="http://www.w3.org/2005/Atom">
<Document>
<description>
Testé dans le cadre des parseurs donc les caractères: & à @ é ê è doivent être bien affichés
</description>
</Document>
</kml>
30 changes: 30 additions & 0 deletions geotrek/trekking/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from unittest import skipIf
from unittest.mock import Mock
from urllib.parse import urlparse
from shutil import copy as copyfile

from django.conf import settings
from django.contrib.gis.geos import Point, LineString, MultiLineString, WKTWriter
Expand All @@ -16,6 +17,7 @@
from django.test.utils import override_settings

from geotrek.common.utils import testdata
from geotrek.common.utils.file_infos import get_encoding_file
from geotrek.common.models import Theme, FileType, Attachment, Label
from geotrek.common.tests.mixins import GeotrekParserTestMixin
from geotrek.core.tests.factories import PathFactory
Expand Down Expand Up @@ -1148,6 +1150,34 @@ def test_trek_illustration_is_not_imported_on_missing_file_metadata(self, mocked
self.assertEqual(Attachment.objects.count(), 0)


class TestApidaeTrekParserConvertEncodingFiles(TestCase):
    """Check that ApidaeTrekParser converts non UTF-8 files and leaves UTF-8 ones readable."""
    data_dir = "geotrek/trekking/tests/data"

    def _copy_fixture(self, source_name, copy_name):
        """Copy a fixture to a scratch file in ``data_dir`` and schedule its removal.

        The parser rewrites the file in place, so tests must never work on the
        tracked fixture itself.
        """
        import os  # local import: the module-level imports of this file are not all visible here

        file_name = f'{self.data_dir}/{copy_name}'
        copyfile(f'{self.data_dir}/{source_name}', file_name)

        def remove_copy():
            if os.path.exists(file_name):
                os.remove(file_name)

        # Keep the data directory clean whatever the outcome of the test.
        self.addCleanup(remove_copy)
        return file_name

    def test_fix_encoding_to_utf8(self):
        # The fixture shipped with this change is named "file_bad_encoding.kml.kml".
        file_name = self._copy_fixture('file_bad_encoding.kml.kml', 'file_bad_encoding_tmp.kml')

        encoding = get_encoding_file(file_name)
        self.assertNotEqual(encoding, "utf-8")

        new_file_name = ApidaeTrekParser._maybe_fix_encoding_to_utf8(file_name)

        encoding = get_encoding_file(new_file_name)
        self.assertEqual(encoding, "utf-8")

    def test_not_fix_encoding_to_utf8(self):
        file_name = self._copy_fixture('file_good_encoding.kml', 'file_good_encoding_tmp.kml')

        encoding = get_encoding_file(file_name)
        self.assertEqual(encoding, "utf-8")

        new_file_name = ApidaeTrekParser._maybe_fix_encoding_to_utf8(file_name)

        encoding = get_encoding_file(new_file_name)
        self.assertEqual(encoding, "utf-8")


class TestApidaeTrekThemeParser(ApidaeTrekThemeParser):

url = 'https://example.net/fake/api/'
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ cffi==1.16.0
# persistent
# pyvips
# weasyprint
chardet==5.2.0
# via geotrek (setup.py)
charset-normalizer==3.2.0
# via requests
click==8.1.3
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def run(self):
install_requires=[
'Django==3.2.*',
'mapentity',
'chardet',
'cairosvg',
'cairocffi',
'env_file',
Expand Down

0 comments on commit 7ae2cb0

Please sign in to comment.