Skip to content

Commit

Permalink
💫 fix convert encoding kml or gpx file from parser to utf-8
Browse files Browse the repository at this point in the history
  • Loading branch information
juggler31 committed Jan 23, 2024
1 parent 47a85b1 commit 8037c31
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 2 deletions.
4 changes: 4 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ CHANGELOG
2.101.5+dev (XXXX-XX-XX)
------------------------

**Improvements**

- Convert KML and GPX files retrieved by parsers to UTF-8 encoding

2.101.5 (2024-01-11)
--------------------

Expand Down
36 changes: 34 additions & 2 deletions geotrek/trekking/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from datetime import date, timedelta
from decimal import Decimal
from tempfile import NamedTemporaryFile
import codecs
import os
from chardet.universaldetector import UniversalDetector

from django.conf import settings
from django.contrib.gis.gdal import DataSource
Expand Down Expand Up @@ -903,7 +906,9 @@ def _get_geom_from_gpx(data):
with NamedTemporaryFile(mode='w+b', dir=settings.TMP_DIR) as ntf:
ntf.write(data)
ntf.flush()
ds = DataSource(ntf.name)

file_path = ApidaeTrekParser._get_file_with_good_encoding(ntf.name)
ds = DataSource(file_path)
for layer_name in ('tracks', 'routes'):
layer = ApidaeTrekParser._get_layer(ds, layer_name)
geos = ApidaeTrekParser._maybe_get_linestring_from_layer(layer)
Expand All @@ -912,6 +917,31 @@ def _get_geom_from_gpx(data):
geos.transform(settings.SRID)
return geos

@staticmethod
def _get_file_with_good_encoding(file_name):
    """Return the path of a UTF-8 compatible version of ``file_name``.

    The encoding is guessed with chardet's ``UniversalDetector``. When the
    file is detected as UTF-8 or ASCII (ASCII bytes are already valid
    UTF-8), or when chardet cannot guess any encoding, ``file_name`` is
    returned unchanged. Otherwise the content is transcoded to UTF-8 into a
    file located in ``settings.TMP_DIR`` and that file's path is returned.

    :param file_name: path of the file to inspect
    :return: path of a file whose content can be read as UTF-8
    """
    # Get encoding mode (utf-8, ascii, ISO-8859-1...)
    detector = UniversalDetector()
    detector.reset()
    # Context manager so the handle is closed even when we stop feeding early.
    with open(file_name, 'rb') as raw_file:
        for line in raw_file:
            detector.feed(line)
            # Condition in 1 line to validate coverage.
            if detector.done: break  # noqa: E701
    detector.close()

    encoding = detector.result["encoding"]
    # ``encoding`` is None when chardet could not decide: there is nothing
    # sensible to transcode from, so hand the original file back.
    if not encoding or encoding.lower() in ("utf-8", "ascii"):
        return file_name

    # NOTE(review): this path is shared by every call (and every process
    # using the same TMP_DIR), so concurrent imports can overwrite each
    # other's file. A unique name would need the callers to clean it up.
    # NOTE(review): the XML declaration inside the file is copied verbatim;
    # if it names the original encoding it no longer matches the content.
    tmp_file_path = os.path.join(settings.TMP_DIR, 'fileNameTmp')
    block_size = 1_048_576  # characters per read; only bounds memory usage
    # newline="" disables newline translation so only the encoding changes.
    with open(file_name, "r", encoding=encoding, newline="") as source_file:
        with open(tmp_file_path, "w", encoding="utf-8", newline="") as target_file:
            for contents in iter(lambda: source_file.read(block_size), ""):
                target_file.write(contents)
    return tmp_file_path

@staticmethod
def _get_geom_from_kml(data):
"""Given KML data as bytes it returns a geom."""
Expand All @@ -935,7 +965,9 @@ def get_first_geom_with_type_in(types, geoms):
with NamedTemporaryFile(mode='w+b', dir=settings.TMP_DIR) as ntf:
ntf.write(data)
ntf.flush()
ds = DataSource(ntf.name)

file_path = ApidaeTrekParser._get_file_with_good_encoding(ntf.name)
ds = DataSource(file_path)
geos = get_geos_linestring(ds)
geos.transform(settings.SRID)
return geos
Expand Down
10 changes: 10 additions & 0 deletions geotrek/trekking/tests/data/file_bad_encoding.kml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2" xmlns:gx="http://www.google.com/kml/ext/2.2" xmlns:kml="http://www.opengis.net/kml/2.2" xmlns:atom="http://www.w3.org/2005/Atom">
<Document>
<description>
Testé dans le cadre des parseurs donc les caratères: & à @ é ê è doivent être bien affichés
</description>

</Document>

file_bad_encoding
29 changes: 29 additions & 0 deletions geotrek/trekking/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from unittest import skipIf
from unittest.mock import Mock
from urllib.parse import urlparse
from chardet.universaldetector import UniversalDetector

from django.conf import settings
from django.contrib.gis.geos import Point, LineString, MultiLineString, WKTWriter
Expand Down Expand Up @@ -1148,6 +1149,34 @@ def test_trek_illustration_is_not_imported_on_missing_file_metadata(self, mocked
self.assertEqual(Attachment.objects.count(), 0)


class TestApidaeTrekParserConvertEncodingFiles(TestCase):
    """Check that ``ApidaeTrekParser`` converts non UTF-8 files to UTF-8."""

    @staticmethod
    def _detect_encoding(file_name):
        """Return the encoding chardet guesses for ``file_name`` (utf-8, ascii, ISO-8859-1...)."""
        detector = UniversalDetector()
        detector.reset()
        # Context manager so the file handle is released after detection.
        with open(file_name, 'rb') as raw_file:
            for line in raw_file:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        return detector.result["encoding"]

    def test_get_file_with_good_encoding(self):
        file_name = "geotrek/trekking/tests/data/file_bad_encoding.kml"

        # The fixture must not be detected as UTF-8 before conversion...
        self.assertNotEqual(self._detect_encoding(file_name), "utf-8")

        file_path = ApidaeTrekParser._get_file_with_good_encoding(file_name)

        # ...and the file returned by the parser must be detected as UTF-8.
        self.assertEqual(self._detect_encoding(file_path), "utf-8")


class TestApidaeTrekThemeParser(ApidaeTrekThemeParser):

url = 'https://example.net/fake/api/'
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ cffi==1.16.0
# persistent
# pyvips
# weasyprint
chardet==5.2.0
# via geotrek (setup.py)
charset-normalizer==3.2.0
# via requests
click==8.1.3
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def run(self):
install_requires=[
'Django==3.2.*',
'mapentity',
'chardet',
'cairosvg',
'cairocffi',
'env_file',
Expand Down

0 comments on commit 8037c31

Please sign in to comment.