# ukiyo-e-face-blip2-captions.py

# Copyright 2024 Shunsuke Kitada and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script was generated from shunk031/cookiecutter-huggingface-datasets.
#
import os
from dataclasses import dataclass
from glob import glob
from typing import List, Union

import datasets as ds
import torch
from datasets.utils.logging import get_logger
from more_itertools import chunked
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor

# Module-level logger obtained from the `datasets` logging utilities.
logger = get_logger(__name__)

# BibTeX entries: the original aligned Ukiyo-e faces dataset, followed by this
# BLIP2-captioned derivative. The second entry points at the public dataset
# page (not the owner-only "/settings" page).
_CITATION = """\
@misc{pinkney2020ukiyoe,
    author = {Pinkney, Justin N. M.},
    title = {Aligned Ukiyo-e faces dataset},
    year={2020},
    howpublished={\\url{https://www.justinpinkney.com/blog/2020/ukiyoe-dataset}}
}
@misc{kitada2024ukiyoe,
    author = {Kitada, Shunsuke},
    title = {Ukiyo-e face blip2 captions dataset},
    year={2024},
    howpublished={\\url{https://huggingface.co/datasets/py-img-gen/ukiyo-e-face-blip2-captions}}
}
"""

# Short human-readable summary stored in the dataset info.
_DESCRIPTION = """\
ukiyo-e-face-blip2-captions is a dataset of ukiyo-e faces with captions generated by the BLIP2 model.
"""

# Page of the original (uncaptioned) image dataset.
_HOMEPAGE = "https://www.justinpinkney.com/blog/2020/ukiyoe-dataset/"

# License identifier stored in the dataset info.
_LICENSE = "cc-by-sa-4.0"


@dataclass
class UkiyoeFaceBlip2CaptionsConfig(ds.BuilderConfig):
    """BuilderConfig for UkiyoeFaceBlip2Captions.

    Extends ``ds.BuilderConfig`` with the settings of the captioning model
    that the builder uses while generating examples.
    """

    # Identifier handed to `from_pretrained` for both the BLIP2 processor and
    # the BLIP2 model.
    model_id: str = "Salesforce/blip2-opt-2.7b"
    # Device the model and its inputs are moved to. The default is computed
    # once, when this class body is executed: "cuda" if available, else "cpu".
    device: Union[str, torch.device] = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )
    # Number of images captioned together in one `generate` call.
    batch_size: int = 512


class UkiyoeFaceBlip2CaptionsDataset(ds.GeneratorBasedBuilder):
    """A class for loading ukiyo-e-face-blip2-captions dataset.

    The images are not downloaded automatically: the archive has to be
    obtained by hand and passed via ``data_dir``. Every ``*.jpg`` file in the
    extracted ``ukiyoe-1024`` directory becomes one example made of the image
    and a caption generated for it by the configured BLIP2 checkpoint.
    """

    BUILDER_CONFIGS = [
        UkiyoeFaceBlip2CaptionsConfig(
            version=ds.Version("2.0.0"),
            description="V2 - Removed 28 bad quality images (poor alignment or not face).",
        )
    ]
    BUILDER_CONFIG_CLASS = UkiyoeFaceBlip2CaptionsConfig

    @property
    def manual_download_instructions(self) -> str:
        """Return the message explaining how to fetch the archive by hand."""
        return (
            "To use Ukiyo-e face dataset, you have to download it manually "
            "from the official website (https://www.justinpinkney.com/blog/2020/ukiyoe-dataset/). "
            # The full `<organisation>/<dataset>` repository id is required;
            # the organisation name alone does not identify a dataset.
            'Then, load the dataset with: `datasets.load_dataset("py-img-gen/ukiyo-e-face-blip2-captions", data_dir="/path/to/ukiyoe-1024-v2.tar")`'
        )

    def _info(self) -> ds.DatasetInfo:
        """Describe the dataset: an ``image`` column and a string ``caption``."""
        features = ds.Features(
            {
                "image": ds.Image(),
                "caption": ds.Value("string"),
            }
        )
        return ds.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(
        self, dl_manager: ds.DownloadManager
    ) -> List[ds.SplitGenerator]:
        """Extract the manually downloaded archive and declare the train split.

        Args:
            dl_manager: Download manager whose ``manual_dir`` holds the path
                given through ``data_dir``.

        Returns:
            A single ``TRAIN`` split generator whose ``image_dir_path`` points
            at the ``ukiyoe-1024`` directory inside the extracted archive.

        Raises:
            AssertionError: If no ``data_dir`` was supplied, or if extraction
                does not return a single path.
        """
        assert (
            dl_manager.manual_dir is not None
        ), "Please download the Ukiyo-e face dataset manually and specify the path to the downloaded files via `data_dir` argument."

        dir_path = os.path.expanduser(dl_manager.manual_dir)
        data_dir = dl_manager.extract(dir_path)
        assert isinstance(data_dir, str)

        image_dir_path = os.path.join(data_dir, "ukiyoe-1024")
        logger.info(f"Loading images from {image_dir_path}")

        return [
            ds.SplitGenerator(
                name=ds.Split.TRAIN,  # type: ignore
                gen_kwargs={
                    "image_dir_path": image_dir_path,
                },
            ),
        ]

    def _generate_examples(self, image_dir_path: str):
        """Caption every ``*.jpg`` in ``image_dir_path`` and yield the examples.

        Images are processed in batches of ``config.batch_size``. Keys are
        consecutive integers starting at 0, assigned in sorted file-name order.

        Args:
            image_dir_path: Directory containing the ``*.jpg`` images.

        Yields:
            ``(key, example)`` pairs where ``example`` has the image file path
            under ``"image"`` and the generated text under ``"caption"``.
        """
        config: UkiyoeFaceBlip2CaptionsConfig = self.config  # type: ignore

        # Single source of truth for the precision of the weights and inputs.
        dtype = torch.float16

        processor = Blip2Processor.from_pretrained(config.model_id)
        assert isinstance(processor, Blip2Processor)

        model = Blip2ForConditionalGeneration.from_pretrained(
            config.model_id, torch_dtype=dtype
        )
        model.to(config.device)  # type: ignore

        # glob() returns entries in filesystem order; sort them so that keys
        # and example order are reproducible across machines and runs.
        image_files = sorted(glob(os.path.join(image_dir_path, "*.jpg")))

        idx = 0
        for batch_image_files in chunked(image_files, config.batch_size):
            batched_images = []
            try:
                for image_file in batch_image_files:
                    batched_images.append(Image.open(image_file))
                inputs = processor(images=batched_images, return_tensors="pt")  # type: ignore
            finally:
                # The tensors built by the processor are all that is needed
                # from here on; release file handles and decoded pixels now
                # instead of keeping a whole batch alive during generation.
                for opened_image in batched_images:
                    opened_image.close()

            # Cast floating-point inputs to the dtype of the weights, as in
            # the documented BLIP-2 usage, rather than relying on the model
            # to reconcile float32 inputs with float16 weights.
            inputs = inputs.to(config.device, dtype)

            with torch.no_grad():
                generated_ids = model.generate(**inputs)

            generated_texts = processor.batch_decode(
                generated_ids, skip_special_tokens=True
            )
            for image_file, generated_text in zip(batch_image_files, generated_texts):
                # The images were closed above, so hand the `Image` feature
                # the file path instead of the PIL object.
                example = {
                    "image": image_file,
                    "caption": generated_text,
                }
                yield idx, example
                idx += 1