Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable decoding of tbm dataset name #131

Merged
merged 2 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions pygac/pod_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,20 @@

import datetime
import logging

try:
from enum import IntFlag
except ImportError:
# python version < 3.6, use a simple object without nice representation
IntFlag = object

import numpy as np

from pyorbital.geoloc_instrument_definitions import avhrr_gac
from pyorbital.geoloc import compute_pixels, get_lonlatalt
from pyorbital.geoloc_instrument_definitions import avhrr_gac

from pygac.clock_offsets_converter import get_offsets
from pygac.correct_tsm_issue import TSM_AFFECTED_INTERVALS_POD, get_tsm_idx
from pygac.reader import Reader, ReaderError, NoTLEData
from pygac.reader import DecodingError, NoTLEData, Reader, ReaderError
from pygac.slerp import slerp
from pygac.utils import file_opener

Expand Down Expand Up @@ -322,18 +322,13 @@ def read_header(cls, filename, fileobj=None, header_date="auto"):
fd_.read(tbm_header.itemsize),
dtype=tbm_header, count=1)
try:
data_set_name = _tbm_head['data_set_name'].decode()
except UnicodeDecodeError:
data_set_name = '---'
allowed_empty = (42*b'\x00' + b' ')
if (cls.data_set_pattern.match(data_set_name)
or (_tbm_head['data_set_name'] == allowed_empty)):
tbm_head = _tbm_head.copy()
tbm_head = cls._validate_tbm_header(_tbm_head)
tbm_offset = tbm_header.itemsize
else:
fd_.seek(0)
except DecodingError:
tbm_head = None
tbm_offset = 0

fd_.seek(tbm_offset, 0)
header = cls.choose_header_based_on_timestamp(header_date, fd_)
fd_.seek(tbm_offset, 0)
# need to copy frombuffer to have write access on head
Expand All @@ -344,6 +339,18 @@ def read_header(cls, filename, fileobj=None, header_date="auto"):
cls._validate_header(head)
return tbm_head, head

@classmethod
def _validate_tbm_header(cls, potential_tbm_header):
    """Return a copy of a candidate TBM header once its name is accepted.

    The ``data_set_name`` field is accepted when it equals the blank
    placeholder built below, or when ``cls._decode_data_set_name`` manages
    to decode it.

    Args:
        potential_tbm_header: mapping-like header record exposing a
            ``data_set_name`` entry and a ``copy()`` method.

    Returns:
        ``potential_tbm_header.copy()``.

    Raises:
        DecodingError: propagated from ``cls._decode_data_set_name`` when
            the name is neither blank nor decodable.
    """
    name_field = potential_tbm_header['data_set_name']
    # NOTE(review): confirm this placeholder matches the width of the
    # data_set_name field in the TBM header dtype.
    blank_name = (42*b'\x00' + b' ')
    is_blank = name_field == blank_name
    if not is_blank:
        # Only called for its side effect: it raises when the name is invalid.
        cls._decode_data_set_name(name_field)
    return potential_tbm_header.copy()


@classmethod
def choose_header_based_on_timestamp(cls, header_date, fd_):
"""Choose the header dtype based on the timestamp."""
Expand Down
28 changes: 17 additions & 11 deletions pygac/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,16 +209,10 @@ def _correct_data_set_name(cls, header, filename):
filename (str): path to file
"""
filename = str(filename)
for encoding in "utf-8", "cp500":
data_set_name = header['data_set_name']
try:
data_set_name = cls._decode_data_set_name(data_set_name, encoding)
except DecodingError as err:
LOG.debug(str(err))
else:
header["data_set_name"] = data_set_name
break
else:
data_set_name = header['data_set_name']
try:
header["data_set_name"] = cls._decode_data_set_name(data_set_name)
except DecodingError:
LOG.debug(f'The data_set_name in header {header["data_set_name"]} does not match.'
' Use filename instead.')
match = cls.data_set_pattern.search(filename)
Expand All @@ -232,7 +226,19 @@ def _correct_data_set_name(cls, header, filename):
return header

@classmethod
def _decode_data_set_name(cls, data_set_name, encoding):
def _decode_data_set_name(cls, data_set_name):
    """Decode the raw data set name, trying each supported encoding in turn.

    The encodings "utf-8" and "cp500" are attempted in that order through
    ``cls._decode_data_set_name_for_encoding``; the first successful result
    is returned. Each failed attempt is logged at debug level.

    Args:
        data_set_name: raw (encoded) data set name taken from a header.

    Returns:
        The value returned by ``cls._decode_data_set_name_for_encoding``
        for the first encoding that succeeds.

    Raises:
        DecodingError: if none of the encodings yields a valid name.
    """
    for codec in ("utf-8", "cp500"):
        try:
            decoded = cls._decode_data_set_name_for_encoding(data_set_name, codec)
        except DecodingError as failure:
            LOG.debug(str(failure))
            continue
        return decoded
    # Every candidate encoding was rejected.
    raise DecodingError("Could not reliably decode the dataset name.")

@classmethod
def _decode_data_set_name_for_encoding(cls, data_set_name, encoding):
data_set_name = data_set_name.decode(encoding, errors='ignore')
if not cls.data_set_pattern.match(data_set_name):
raise DecodingError(f'The data_set_name in header {data_set_name} '
Expand Down
18 changes: 9 additions & 9 deletions pygac/tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,20 @@
import os
import sys
import unittest
import pytest

from unittest import mock

import numpy as np
import numpy.testing
from pygac.gac_reader import GACReader, ReaderError
from pygac.lac_reader import LACReader
from pygac.pod_reader import POD_QualityIndicator
import pytest

from pygac.gac_pod import scanline
from pygac.reader import NoTLEData
from pygac.gac_reader import GACReader, ReaderError
from pygac.lac_pod import LACPODReader

from pygac.pod_reader import tbm_header as tbm_header_dtype, header3
from pygac.lac_pod import scanline as lacpod_scanline
from pygac.lac_reader import LACReader
from pygac.pod_reader import POD_QualityIndicator, header3
from pygac.pod_reader import tbm_header as tbm_header_dtype
from pygac.reader import NoTLEData


class TestPath(os.PathLike):
Expand Down Expand Up @@ -688,7 +688,7 @@ def pod_file_with_tbm_header(tmp_path):
number_of_scans = 3

tbm_header = np.zeros(1, dtype=tbm_header_dtype)
tbm_header["data_set_name"] = b"BRN.HRPT.NJ.D00322.S0334.E0319.B3031919.BL "
tbm_header["data_set_name"] = "BRN.HRPT.NJ.D00322.S0334.E0319.B3031919.BL\x80\x80".encode("cp500")
tbm_header["select_flag"] = b"S"
tbm_header["beginning_latitude"] = b"+77"
tbm_header["ending_latitude"] = b"+22"
Expand Down
Loading