Added the Washington dataset
vittoriopippi committed Feb 10, 2025
1 parent 3e248d1 commit 67625df
Showing 1 changed file with 62 additions and 22 deletions.
84 changes: 62 additions & 22 deletions hwd/datasets/washington.py
@@ -1,30 +1,70 @@
 from .base_dataset import BaseDataset
 from pathlib import Path
-import msgpack
+from .shtg.base_dataset import extract_zip, download_file
 
+WASHINGTON_URL = 'https://github.com/aimagelab/HWD/releases/download/washington/washingtondb-v1.0.zip'
+WASHINGTON_ZIP_PATH = Path('~/.cache/washington/washingtondb-v1.0.zip').expanduser()
+WASHINGTON_DIR_PATH = Path('~/.cache/washington').expanduser()
+
+SPECIAL_MAP = {
+    "s_0": "0", "s_1": "1", "s_2": "2", "s_3": "3", "s_4": "4",
+    "s_5": "5", "s_6": "6", "s_7": "7", "s_8": "8", "s_9": "9",
+    "s_0th": "0th", "s_1st": "1st", "s_2nd": "2nd", "s_3rd": "3rd",
+    "s_4th": "4th", "s_5th": "5th", "s_6th": "6th", "s_7th": "7th",
+    "s_8th": "8th", "s_9th": "9th", "s_1th": "1th",
+    "s_pt": ".", "s_cm": ",", "s_s": "S", "s_mi": "_",
+    "s_sq": "Sq", "s_qt": "'", "s_GW": "G.W.", "s_qo": ":",
+    "s_et": "&", "s_bl": "(", "s_br": ")", "s_lb": "L", "s_sl": "Sl",
+}
 
 class WashingtonDataset(BaseDataset):
-    def __init__(self, path, transform=None, nameset=None):
-        """
-        Args:
-            path (string): Path folder of the dataset.
-            transform (callable, optional): Optional transform to be applied
-                on a sample.
-            author_ids (list, optional): List of authors to consider.
-            nameset (string, optional): Name of the dataset.
-            max_samples (int, optional): Maximum number of samples to consider.
-        """
-        imgs_dir = Path(path) / 'lines'
-        imgs = list(imgs_dir.rglob('*.png'))
+    def __init__(self, transform=None, nameset='train', split='cv1', dataset_type='lines'):
+
+        if not WASHINGTON_DIR_PATH.exists():
+            download_file(WASHINGTON_URL, WASHINGTON_ZIP_PATH)
+            extract_zip(WASHINGTON_ZIP_PATH, WASHINGTON_DIR_PATH, delete=True)
+        washington_unzip_path = WASHINGTON_DIR_PATH / 'washingtondb-v1.0'
+
+        filenames = list(washington_unzip_path.rglob(f'sets/{split}/{nameset}.txt'))
+        assert len(filenames) > 0, f'No file found for {nameset} in {washington_unzip_path}'
+
+        split_ids = []
+        for filename in filenames:
+            split_ids.extend(filename.read_text().splitlines())
+        split_ids = set(split_ids)
+
+        if dataset_type == 'lines':
+            imgs_root = washington_unzip_path / 'data' / 'line_images_normalized'
+            labels_path = washington_unzip_path / 'ground_truth' / 'transcription.txt'
+        elif dataset_type == 'words':
+            imgs_root = washington_unzip_path / 'data' / 'word_images_normalized'
+            labels_path = washington_unzip_path / 'ground_truth' / 'word_labels.txt'
+        else:
+            raise ValueError(f'Invalid dataset_type: {dataset_type}. Available types: ["lines", "words"]')
+
+        self.imgs = list(imgs_root.rglob('*.png'))
+        self.imgs = [img for img in self.imgs if img.stem[:6] in split_ids]
+        assert len(self.imgs) > 0, f'No images found for {nameset} in {imgs_root}'
 
-        super().__init__(Path(path), imgs, [0] * len(imgs), [0], nameset=nameset, transform=transform)
+        self.authors = ['0'] * len(self.imgs)  # All samples are from the same author
+        super().__init__(
+            washington_unzip_path,
+            self.imgs,
+            self.authors,
+            ['0'],  # All samples are from the same author
+            transform=transform,
+        )
 
-        if nameset is not None:
-            data_path = Path(path) / f'{nameset}.msgpack'
-            if data_path.exists():
-                with open(data_path, 'rb') as f:
-                    data = dict(msgpack.unpack(f, raw=False))
-                self.imgs = [img for img in self.imgs if img.name in data]
-                self.authors = [0] * len(self.imgs)
-                self.labels = [data[img.name] for img in self.imgs]
+        self.labels_dict = {}
+        self.charset = set()
+        for line in labels_path.read_text().splitlines():
+            img_id, label = line.split()
+            label = label.replace('|', '- -').split('-')
+            for i in range(len(label)):
+                if len(label[i]) > 1:
+                    label[i] = SPECIAL_MAP[label[i]]
+            label = ''.join(label)
+            self.labels_dict[img_id] = label
+        self.labels = [self.labels_dict[img.stem] for img in self.imgs]
+        self.has_labels = True
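For reference, this is how the parsing loop at the end of the diff decodes a ground-truth entry. A minimal sketch, not part of the commit: decode is a hypothetical helper that mirrors the loop body, and the sample line is illustrative rather than taken from the dataset.

# A hypothetical helper mirroring the parsing loop above; SPECIAL_MAP
# here is a small subset of the full map defined in the diff.
SPECIAL_MAP = {"s_0": "0", "s_1": "1", "s_7": "7", "s_8": "8", "s_pt": "."}

def decode(raw_label):
    # '|' separates words and '-' separates character tokens within a word.
    # Replacing '|' with '- -' turns each word boundary into a lone space
    # token, so joining the mapped tokens restores the spaces.
    tokens = raw_label.replace('|', '- -').split('-')
    return ''.join(SPECIAL_MAP[t] if len(t) > 1 else t for t in tokens)

print(decode('O-c-t-s_pt|s_1-s_7-s_8-s_0'))  # -> 'Oct. 1780'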

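After this change the dataset no longer takes a local path and instead bootstraps itself from the release zip. A minimal usage sketch, assuming the class is importable as hwd.datasets.washington (inferred from the file location) and that the extracted washingtondb-v1.0 archive provides cv1 train/valid split lists, as the defaults in the diff suggest:

from hwd.datasets.washington import WashingtonDataset

# The first instantiation downloads washingtondb-v1.0.zip into
# ~/.cache/washington and extracts it; later runs reuse the cache.
train_lines = WashingtonDataset(nameset='train', split='cv1', dataset_type='lines')
valid_words = WashingtonDataset(nameset='valid', split='cv1', dataset_type='words')

# imgs and labels are populated by the constructor shown in the diff.
print(len(train_lines.imgs), train_lines.labels[0])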