Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Allow Apidae Trek parser to handle traces not in utf-8 #3899

Merged
merged 1 commit into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ CHANGELOG
2.101.5+dev (XXXX-XX-XX)
------------------------

**Improvements**

- Allow Apidae Trek parser to handle traces that are not encoded in UTF-8

**Documentation**

- Improve performance in spatial intersection (zoning district and zoning city) for sql views (#3600)
Expand Down Expand Up @@ -5529,4 +5533,4 @@ Installation script
* Fix regex for RAISE NOTICE (fixes #673)
* Initial public version

See project history in `docs/history.rst` (French).
See project history in `docs/history.rst` (French).
13 changes: 13 additions & 0 deletions geotrek/common/utils/file_infos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from chardet.universaldetector import UniversalDetector


def get_encoding_file(file_name):
    """Guess the character encoding of a file with chardet.

    The file is read in binary mode and fed line by line to a
    ``UniversalDetector`` until the detector reports that it is done or
    the file is exhausted.

    :param file_name: path of the file to inspect.
    :return: the detector's ``result["encoding"]`` value (utf-8, ascii,
        ISO-8859-1...). NOTE(review): chardet documents that this can be
        ``None`` when nothing could be detected - callers should confirm
        they handle that case.
    """
    detector = UniversalDetector()
    detector.reset()
    # The context manager guarantees the handle is closed, including when
    # the loop stops early or feeding raises.
    with open(file_name, 'rb') as file_obj:
        for line in file_obj:
            detector.feed(line)
            if detector.done:
                break
    # close() finalises the detection so that ``result`` is populated.
    detector.close()
    return detector.result["encoding"]
31 changes: 28 additions & 3 deletions geotrek/trekking/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import re
import zipfile
from collections import defaultdict
from datetime import date, timedelta
from datetime import date, timedelta, datetime
from decimal import Decimal
from tempfile import NamedTemporaryFile
import codecs
import os

from django.conf import settings
from django.contrib.gis.gdal import DataSource
Expand All @@ -14,6 +16,7 @@
from django.utils.translation import gettext as _
from modeltranslation.utils import build_localized_fieldname

from geotrek.common.utils.file_infos import get_encoding_file
from geotrek.common.models import Label, Theme
from geotrek.common.parsers import (ApidaeBaseParser, AttachmentParserMixin,
GeotrekParser, GlobalImportError, Parser,
Expand Down Expand Up @@ -903,7 +906,9 @@ def _get_geom_from_gpx(data):
with NamedTemporaryFile(mode='w+b', dir=settings.TMP_DIR) as ntf:
ntf.write(data)
ntf.flush()
ds = DataSource(ntf.name)

file_path = ApidaeTrekParser._maybe_fix_encoding_to_utf8(ntf.name)
ds = DataSource(file_path)
for layer_name in ('tracks', 'routes'):
layer = ApidaeTrekParser._get_layer(ds, layer_name)
geos = ApidaeTrekParser._maybe_get_linestring_from_layer(layer)
Expand All @@ -912,6 +917,24 @@ def _get_geom_from_gpx(data):
geos.transform(settings.SRID)
return geos

@staticmethod
def _maybe_fix_encoding_to_utf8(file_name):
    """Re-encode the file at ``file_name`` to UTF-8, in place, when needed.

    The encoding is guessed with ``get_encoding_file``. The file is left
    untouched when no encoding could be guessed, or when the guess is
    UTF-8 or ASCII (ASCII bytes are already valid UTF-8). Otherwise the
    content is decoded with the guessed encoding, written as UTF-8 to a
    temporary file in ``settings.TMP_DIR`` and moved over ``file_name``.

    :param file_name: path of the file to check and possibly convert.
    :return: ``file_name`` itself; only the file content may have changed.
    :raises UnicodeDecodeError: may be raised when the content cannot be
        decoded with the guessed encoding; the original file is then left
        as it was and the temporary file is removed.
    """
    encoding = get_encoding_file(file_name)

    # Nothing to convert: detection failed (e.g. empty file) or the bytes
    # are already valid UTF-8. Compare case-insensitively because the
    # spelling of the detected name is not under our control.
    if encoding is None or encoding.lower() in ("utf-8", "ascii"):
        return file_name

    # Reserve a uniquely named target file so that concurrent imports can
    # never write to the same temporary path.
    with NamedTemporaryFile(dir=settings.TMP_DIR, delete=False) as reserved:
        tmp_file_path = reserved.name

    BLOCKSIZE = 9_048_576
    try:
        # The reserved file is created private to the current user; carry
        # over the permission bits of the file it is about to replace.
        os.chmod(tmp_file_path, os.stat(file_name).st_mode & 0o7777)
        with codecs.open(file_name, "r", encoding) as source_file, \
                codecs.open(tmp_file_path, "w", "utf-8") as target_file:
            # Copy block by block to keep memory bounded; read() returns
            # an empty string once the source is exhausted.
            for contents in iter(lambda: source_file.read(BLOCKSIZE), ""):
                target_file.write(contents)
        os.replace(tmp_file_path, file_name)
    finally:
        # After a successful replace the temporary path no longer exists;
        # on failure make sure the partial copy does not linger.
        if os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)
    return file_name

@staticmethod
def _get_geom_from_kml(data):
"""Given KML data as bytes it returns a geom."""
Expand All @@ -935,7 +958,9 @@ def get_first_geom_with_type_in(types, geoms):
with NamedTemporaryFile(mode='w+b', dir=settings.TMP_DIR) as ntf:
ntf.write(data)
ntf.flush()
ds = DataSource(ntf.name)

file_path = ApidaeTrekParser._maybe_fix_encoding_to_utf8(ntf.name)
ds = DataSource(file_path)
geos = get_geos_linestring(ds)
geos.transform(settings.SRID)
return geos
Expand Down
8 changes: 8 additions & 0 deletions geotrek/trekking/tests/data/file_bad_encoding.kml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2" xmlns:gx="http://www.google.com/kml/ext/2.2" xmlns:kml="http://www.opengis.net/kml/2.2" xmlns:atom="http://www.w3.org/2005/Atom">
<Document>
<description>
Testé dans le cadre des parseurs donc les caractères: & à @ é ê è doivent être bien affichés
</description>
</Document>
</kml>
8 changes: 8 additions & 0 deletions geotrek/trekking/tests/data/file_good_encoding.kml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2" xmlns:gx="http://www.google.com/kml/ext/2.2" xmlns:kml="http://www.opengis.net/kml/2.2" xmlns:atom="http://www.w3.org/2005/Atom">
<Document>
<description>
Testé dans le cadre des parseurs donc les caractères: & à @ é ê è doivent être bien affichés
</description>
</Document>
</kml>
30 changes: 30 additions & 0 deletions geotrek/trekking/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from unittest import skipIf
from unittest.mock import Mock
from urllib.parse import urlparse
from shutil import copy as copyfile

from django.conf import settings
from django.contrib.gis.geos import Point, LineString, MultiLineString, WKTWriter
Expand All @@ -16,6 +17,7 @@
from django.test.utils import override_settings

from geotrek.common.utils import testdata
from geotrek.common.utils.file_infos import get_encoding_file
from geotrek.common.models import Theme, FileType, Attachment, Label
from geotrek.common.tests.mixins import GeotrekParserTestMixin
from geotrek.core.tests.factories import PathFactory
Expand Down Expand Up @@ -1148,6 +1150,34 @@ def test_trek_illustration_is_not_imported_on_missing_file_metadata(self, mocked
self.assertEqual(Attachment.objects.count(), 0)


class TestApidaeTrekParserConvertEncodingFiles(TestCase):
    """Check ``ApidaeTrekParser._maybe_fix_encoding_to_utf8`` on fixture copies.

    The helper rewrites files in place, so every test works on a copy of
    the fixture; the copy is removed once the test has finished.
    """
    data_dir = "geotrek/trekking/tests/data"

    @staticmethod
    def _remove_quietly(path):
        """Delete ``path`` and ignore the case where it is already gone."""
        import os  # local import: keeps this class independent of the module's import list

        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _copy_fixture(self, fixture_name, copy_name):
        """Copy a fixture inside ``data_dir`` and schedule the copy for removal.

        :return: path of the working copy.
        """
        file_name = f'{self.data_dir}/{copy_name}'
        copyfile(f'{self.data_dir}/{fixture_name}', file_name)
        # Registered right after the copy so the working file is removed
        # even when an assertion fails later in the test.
        self.addCleanup(self._remove_quietly, file_name)
        return file_name

    def test_fix_encoding_to_utf8(self):
        file_name = self._copy_fixture('file_bad_encoding.kml', 'file_bad_encoding_tmp.kml')

        encoding = get_encoding_file(file_name)
        self.assertNotEqual(encoding, "utf-8")

        new_file_name = ApidaeTrekParser._maybe_fix_encoding_to_utf8(file_name)

        encoding = get_encoding_file(new_file_name)
        self.assertEqual(encoding, "utf-8")

    def test_not_fix_encoding_to_utf8(self):
        file_name = self._copy_fixture('file_good_encoding.kml', 'file_good_encoding_tmp.kml')

        encoding = get_encoding_file(file_name)
        self.assertEqual(encoding, "utf-8")

        new_file_name = ApidaeTrekParser._maybe_fix_encoding_to_utf8(file_name)

        encoding = get_encoding_file(new_file_name)
        self.assertEqual(encoding, "utf-8")


class TestApidaeTrekThemeParser(ApidaeTrekThemeParser):

url = 'https://example.net/fake/api/'
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ cffi==1.16.0
# persistent
# pyvips
# weasyprint
chardet==5.2.0
# via geotrek (setup.py)
charset-normalizer==3.2.0
# via requests
click==8.1.3
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def run(self):
install_requires=[
'Django==3.2.*',
'mapentity',
'chardet',
'cairosvg',
'cairocffi',
'env_file',
Expand Down
Loading