From 22effd0225e8cc3ce5243ade4f484885e7cd7e7a Mon Sep 17 00:00:00 2001 From: duydq Date: Mon, 3 Jun 2024 09:42:38 +0700 Subject: [PATCH] Remove cli --- cli/README.md | 199 ----------- cli/dataset_loaders.py | 397 --------------------- cli/write_image.py | 50 --- cli/write_json_baidu.py | 83 ----- cli/write_json_facepp.py | 78 ---- cli/write_json_visage.py | 72 ---- cli/write_scripts/write_baidu.sh | 39 -- cli/write_scripts/write_facepp.sh | 48 --- cli/write_scripts/write_image.sh | 84 ----- cli/write_scripts/write_origin_label.sh | 117 ------ cli/write_scripts/write_visage.sh | 47 --- cli/write_text_label_in_filename.py | 83 ----- cli/write_text_label_in_key_in_some_txt.py | 85 ----- cli/write_text_label_in_some_csv.py | 81 ----- cli/write_text_label_in_some_txt.py | 74 ---- cli/write_text_label_in_txt.py | 81 ----- 16 files changed, 1618 deletions(-) delete mode 100644 cli/README.md delete mode 100644 cli/dataset_loaders.py delete mode 100644 cli/write_image.py delete mode 100644 cli/write_json_baidu.py delete mode 100644 cli/write_json_facepp.py delete mode 100644 cli/write_json_visage.py delete mode 100644 cli/write_scripts/write_baidu.sh delete mode 100644 cli/write_scripts/write_facepp.sh delete mode 100644 cli/write_scripts/write_image.sh delete mode 100644 cli/write_scripts/write_origin_label.sh delete mode 100644 cli/write_scripts/write_visage.sh delete mode 100644 cli/write_text_label_in_filename.py delete mode 100644 cli/write_text_label_in_key_in_some_txt.py delete mode 100644 cli/write_text_label_in_some_csv.py delete mode 100644 cli/write_text_label_in_some_txt.py delete mode 100644 cli/write_text_label_in_txt.py diff --git a/cli/README.md b/cli/README.md deleted file mode 100644 index c049f0d..0000000 --- a/cli/README.md +++ /dev/null @@ -1,199 +0,0 @@ -# Face Dataset -## Sample Label of 3rd Tools -
-Facepp - -```yaml -{ - "request_id": "1699959145,1bb349e4-c83f-4a39-ad4e-8dcdc0cd34fb", - "time_used": 0, - "faces": [ - { - "face_token": "974b332135a90dd90ac8c685fa983a69", - "face_rectangle": { - "top": 158, - "left": 66, - "width": 191, - "height": 191 - }, - "attributes": { - "gender": { - "value": "Female" - }, - "age": { - "value": 3 - }, - "headpose": { - "pitch_angle": 6.6730194, - "roll_angle": -2.4714718, - "yaw_angle": 3.7413023 - }, - "eyestatus": { - "left_eye_status": { - "no_glass_eye_open": 99.357, - "no_glass_eye_close": 0, - "normal_glass_eye_open": 0.635, - "normal_glass_eye_close": 0.001, - "dark_glasses": 0, - "occlusion": 0.007 - }, - "right_eye_status": { - "no_glass_eye_open": 99.763, - "no_glass_eye_close": 0.011, - "normal_glass_eye_open": 0.054, - "normal_glass_eye_close": 0.005, - "dark_glasses": 0.01, - "occlusion": 0.158 - } - }, - "emotion": { - "anger": 22.269, - "disgust": 1.227, - "fear": 0.41, - "happiness": 9.247, - "neutral": 12.255, - "sadness": 54.181, - "surprise": 0.41 - }, - "facequality": { - "value": 90.754, - "threshold": 70.1 - }, - "ethnicity": { - "value": "" - }, - "beauty": { - "male_score": 52.808, - "female_score": 47.62 - }, - "mouthstatus": { - "surgical_mask_or_respirator": 0, - "other_occlusion": 0.004, - "close": 0.245, - "open": 99.751 - }, - "glass": { - "value": "None" - } - } - } - ], - "image_id": "zGJ7eeUqKzu0NJBxIodt7w==", - "face_num": 1 -} -``` - -
- -
-Baidu - -```yaml -{ - "face_list": [ - { - "face_token": "2df98fc10747d685e4dd4794f91a9c26", - "location": { - "left": 18.03, - "top": 41.28, - "width": 88, - "height": 74, - "rotation": -2 - }, - "face_probability": 1, - "angle": { - "yaw": 5.83, - "pitch": 2.12, - "roll": -3.13 - }, - "age": 33, - "expression": { - "type": "none", - "probability": 1 - }, - "face_shape": { - "type": "square", - "probability": 0.73 - }, - "gender": { - "type": "male", - "probability": 0.99 - }, - "glasses": { - "type": "none", - "probability": 1 - }, - "landmark150": {}, - "quality": { - "occlusion": { - "left_eye": 0, - "right_eye": 0, - "nose": 0, - "mouth": 0, - "left_cheek": 0.42, - "right_cheek": 0.52, - "chin_contour": 0.8 - }, - "blur": 0, - "illumination": 129, - "completeness": 1 - }, - "emotion": { - "type": "neutral", - "probability": 0.83 - }, - "mask": { - "type": 0, - "probability": 0.96 - } - } - ], - "face_num": 1 -} -``` - -
- -## Write LMDB with All-Age-Faces Dataset -### Structure Dataset -``` -├── aglined faces -│ ├── 00000A02.jpg -│ ├── 00001A02.jpg -│ ├── ... -├── example -├── image sets -│ ├── train.txt -│ ├── val.txt -├── key points -├── original images -│ ├── 00000A02.jpg -│ ├── 00001A02.jpg -│ ├── ... -``` -Origin label of dataset get from "image sets/train.txt" and "image sets/val.txt" -### CMD -```shell -bash cli/aaf.sh -``` - -## Write LMDB with AFAD -### Structure Dataset -``` -├── 15 -│ ├── 111 -│ | ├── 292943-1.jpg -│ | ├── 292943-2.jpg -│ ├── 112 -│ | ├── 671487-0.jpg -│ | ├── 660728-0.jpg -├── ... -├── 75 -├── README.md -├── AFAD-Full.txt -``` -Origin label of dataset get from "AFAD-Full.txt" -### CMD -```shell -bash cli/afad.sh -``` diff --git a/cli/dataset_loaders.py b/cli/dataset_loaders.py deleted file mode 100644 index 79b2909..0000000 --- a/cli/dataset_loaders.py +++ /dev/null @@ -1,397 +0,0 @@ -import os -import re -import warnings - -from glob import glob -from typing import Any, Dict, Generator, List, Optional, Tuple - -from lmdbsystem.dataloader import DataLoader -from lmdbsystem.utils import ( - csv_line_reader, - dump_pickle, - get_md5_file, - get_relative_path, - json_reader, - normalize_path, - raw_reader, - removesuffix_path, - str2bytes, - text_line_reader, - text_reader, -) - - -class ImageLoader(DataLoader): - def __init__( - self, - directory: str, - suffix: str, - fn_md5_mode: str, - fn_md5_path: str, - ): - self.directory = directory - self.suffix = suffix - self.fn_md5_mode = fn_md5_mode - self.fn_md5_path = fn_md5_path - if fn_md5_mode == "r": - self.dict_filename_md5 = json_reader(fn_md5_path) - elif fn_md5_mode == "w": - self.dict_filename_md5 = {} - else: - raise ValueError(f"Don't support fn_md5_mode: {fn_md5_mode}") - self.file_paths = sorted(glob(f"{directory}/**/*{suffix}", recursive=True)) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - yield self[file_path] - - def __getitem__(self, file_path: str) -> Tuple[Optional[bytes], Optional[bytes]]: - filename = get_relative_path(self.directory, file_path).removesuffix(self.suffix) - value = raw_reader(file_path) - if self.fn_md5_mode == "r": - md5_file = self.dict_filename_md5[filename] - value = dump_pickle((str2bytes(get_md5_file(file_path)), value)) - else: - md5_file = get_md5_file(file_path) - self.dict_filename_md5[filename] = md5_file - key = str2bytes(md5_file) - - return key, value - - -class FaceppLoader(DataLoader): - def __init__( - self, directory: str, suffix: str, fn_md5_path: str, keys_extracted: List[str], values_map: Dict[str, str] - ): - self.directory = directory - self.suffix = suffix - self.keys_extracted = keys_extracted - self.values_map = values_map - self.dict_filename_md5 = json_reader(fn_md5_path) - self.file_paths = sorted(glob(f"{directory}/**/*{suffix}", recursive=True)) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - yield self[file_path] - - def __getitem__(self, file_path: str) -> Tuple[Optional[bytes], Optional[bytes]]: - md5_file = self.dict_filename_md5[get_relative_path(self.directory, file_path).removesuffix(self.suffix)] - key = str2bytes(md5_file) - sub_key = str2bytes(get_md5_file(file_path)) - - data = json_reader(file_path) - if data["face_num"] == 0: - return None, None - - attribute = data["faces"][0]["attributes"] - for _key in self.keys_extracted: - attribute = attribute[_key] - - if self.values_map: - attribute = self.values_map.get(attribute, attribute) - - value = dump_pickle((sub_key, str2bytes(str(attribute)))) - return key, value - - -class BaiduLoader(DataLoader): - def __init__( - self, - directory: str, - suffix: str, - fn_md5_path: str, - keys_extracted: List[str], - key_probability: Optional[float], - values_map: Optional[Dict[str, str]], - ): - self.directory = directory - self.suffix = suffix - self.keys_extracted = keys_extracted - self.key_probability = key_probability - self.values_map = values_map - self.dict_filename_md5 = json_reader(fn_md5_path) - self.file_paths = sorted(glob(f"{directory}/**/*{suffix}", recursive=True)) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - yield self[file_path] - - def __getitem__(self, file_path: str) -> Tuple[Optional[bytes], Optional[bytes]]: - md5_file = self.dict_filename_md5[get_relative_path(self.directory, file_path).removesuffix(self.suffix)] - key = str2bytes(md5_file) - sub_key = str2bytes(get_md5_file(file_path)) - - data = json_reader(file_path) - if data["face_num"] == 0: - return None, None - - attribute = data["face_list"][0] - for _key in self.keys_extracted: - if "probability" in attribute and attribute["probability"] < self.key_probability: - attribute = None - break - attribute = attribute[_key] - - if attribute is None: - return None, None - - if self.values_map: - attribute = self.values_map.get(attribute, attribute) - - value = dump_pickle((sub_key, str2bytes(str(attribute)))) - return key, value - - -class VisageLoader(DataLoader): - def __init__( - self, directory: str, suffix: str, fn_md5_path: str, keys_extracted: List[str], values_map: Dict[str, str] - ): - self.directory = directory - self.suffix = suffix - self.keys_extracted = keys_extracted - self.values_map = values_map - self.dict_filename_md5 = json_reader(fn_md5_path) - self.file_paths = sorted(glob(f"{directory}/**/*{suffix}", recursive=True)) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - yield self[file_path] - - def __getitem__(self, file_path: str) -> Tuple[Optional[bytes], Optional[bytes]]: - md5_file = self.dict_filename_md5[get_relative_path(self.directory, file_path).removesuffix(self.suffix)] - key = str2bytes(md5_file) - sub_key = str2bytes(get_md5_file(file_path)) - - data = json_reader(file_path) - if not data: - return None, None - - attribute = data[self.keys_extracted[0]] - - if self.values_map: - attribute = self.values_map.get(attribute, attribute) - - value = dump_pickle((sub_key, str2bytes(str(attribute)))) - return key, value - - -class LabelInFilenameLoader(DataLoader): - def __init__( - self, - directory: str, - suffix: str, - fn_md5_path: str, - values_index: List[int], - values_map: Dict[str, str], - delimiter: str, - ): - self.directory = directory - self.suffix = suffix - self.values_map = values_map - self.delimiter = delimiter - self.values_index = values_index - self.dict_filename_md5 = json_reader(fn_md5_path) - self.file_paths = sorted(glob(f"{directory}/**/*{suffix}", recursive=True)) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - yield self[file_path] - - def __getitem__(self, file_path: str) -> Tuple[Optional[bytes], Optional[bytes]]: - md5_file = self.dict_filename_md5[get_relative_path(self.directory, file_path).removesuffix(self.suffix)] - key = str2bytes(md5_file) - sub_key = str2bytes(get_md5_file(file_path)) - - line_values = os.path.basename(file_path).removesuffix(self.suffix).split(self.delimiter) - labels = [value.strip() for index, value in enumerate(line_values) if index in self.values_index] - - if self.values_map: - if "type" in self.values_map: - value_type = self.values_map["type"] - labels = [str(eval(value_type)(item)) for item in labels] - else: - labels = [self.values_map.get(item, item) for item in labels] - - value = dump_pickle((sub_key, str2bytes(" ".join(labels)))) - return key, value - - -class LabelInTxtLoader(DataLoader): - def __init__( - self, - directory: str, - suffix: str, - fn_md5_path: str, - values_index: List[int], - values_map: Dict[str, str], - delimiter: str, - ): - self.directory = directory - self.suffix = suffix - self.values_map = values_map - self.delimiter = delimiter - self.values_index = values_index - self.dict_filename_md5 = json_reader(fn_md5_path) - self.file_paths = sorted(glob(f"{directory}/**/*{suffix}", recursive=True)) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - yield self[file_path] - - def __getitem__(self, file_path: str) -> Tuple[Optional[bytes], Optional[bytes]]: - md5_file = self.dict_filename_md5[get_relative_path(self.directory, file_path).removesuffix(self.suffix)] - key = str2bytes(md5_file) - sub_key = str2bytes(get_md5_file(file_path)) - - line_values = text_reader(file_path).split(self.delimiter) - labels = [value.strip() for index, value in enumerate(line_values) if index in self.values_index] - - if self.values_map: - if "type" in self.values_map: - value_type = self.values_map["type"] - labels = [str(eval(value_type)(item)) for item in labels] - else: - labels = [self.values_map.get(item, item) for item in labels] - - value = dump_pickle((sub_key, str2bytes(" ".join(labels)))) - return key, value - - -class LabelInSomeCsvLoader(DataLoader): - def __init__( - self, - file_paths: str, - fn_md5_path: str, - key_index: int, - values_index: List[int], - values_map: Dict[str, str], - delimiter: str, - skip_header: bool = False, - ): - self.file_paths = file_paths - self.key_index = key_index - self.values_index = values_index - self.values_map = values_map - self.delimiter = delimiter - self.skip_header = skip_header - self.dict_filename_md5 = json_reader(fn_md5_path) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - sub_key = str2bytes(get_md5_file(file_path)) - for line_values in csv_line_reader(file_path, self.delimiter, self.skip_header): - yield self[(sub_key, line_values)] - - def __getitem__(self, item: Any) -> Tuple[Optional[bytes], Optional[bytes]]: - sub_key, line_values = item - filename = removesuffix_path(normalize_path(line_values[self.key_index])) - if filename not in self.dict_filename_md5: - warnings.warn(f"File {filename} not in image folder") - return None, None - md5_file = self.dict_filename_md5[filename] - key = str2bytes(md5_file) - - labels = [value.strip() for index, value in enumerate(line_values) if index in self.values_index] - - if self.values_map: - if "type" in self.values_map: - value_type = self.values_map["type"] - labels = [str(eval(value_type)(item)) for item in labels] - else: - labels = [self.values_map.get(item, item) for item in labels] - - value = dump_pickle((sub_key, str2bytes(" ".join(labels)))) - return key, value - - -class LabelInSomeTxtLoader(DataLoader): - def __init__( - self, - file_paths: str, - fn_md5_path: str, - key_index: int, - values_index: List[int], - values_map: Dict[str, str], - delimiter: str, - ): - self.file_paths = file_paths - self.key_index = key_index - self.values_index = values_index - self.values_map = values_map - self.delimiter = delimiter - self.dict_filename_md5 = json_reader(fn_md5_path) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - sub_key = str2bytes(get_md5_file(file_path)) - for line in text_line_reader(file_path): - yield self[(sub_key, line)] - - def __getitem__(self, item: Any) -> Tuple[Optional[bytes], Optional[bytes]]: - sub_key, line = item - line_values = line.split(self.delimiter) - - filename = removesuffix_path(normalize_path(line_values[self.key_index])) - if filename not in self.dict_filename_md5: - warnings.warn(f"File {filename} not in image folder") - return None, None - md5_file = self.dict_filename_md5[filename] - key = str2bytes(md5_file) - - labels = [value.strip() for index, value in enumerate(line_values) if index in self.values_index] - - if self.values_map: - if "type" in self.values_map: - value_type = self.values_map["type"] - labels = [str(eval(value_type)(item)) for item in labels] - else: - labels = [self.values_map.get(item, item) for item in labels] - - value = dump_pickle((sub_key, str2bytes(" ".join(labels)))) - return key, value - - -class LabelInKeyInSomeTxtLoader(DataLoader): - def __init__( - self, - file_paths: str, - fn_md5_path: str, - key_index: int, - pattern_value_in_key: str, - type_value_in_key: str, - values_map: Dict[str, str], - delimiter: str, - ): - self.file_paths = file_paths - self.key_index = key_index - self.pattern = re.compile(pattern_value_in_key) - self.value_type_of_key = type_value_in_key - self.values_map = values_map - self.delimiter = delimiter - self.dict_filename_md5 = json_reader(fn_md5_path) - - def iterator(self) -> Generator[Tuple[Optional[bytes], Optional[bytes]], Any, None]: - for file_path in self.file_paths: - sub_key = str2bytes(get_md5_file(file_path)) - for line in text_line_reader(file_path): - yield self[(sub_key, line)] - - def __getitem__(self, item: Any) -> Tuple[Optional[bytes], Optional[bytes]]: - sub_key, line = item - line_values = line.split(self.delimiter) - - filename = removesuffix_path(normalize_path(line_values[self.key_index])) - if filename not in self.dict_filename_md5: - warnings.warn(f"File {filename} not in image folder") - return None, None - md5_file = self.dict_filename_md5[filename] - key = str2bytes(md5_file) - - res = self.pattern.search(line_values[self.key_index]) - labels = [str(eval(self.value_type_of_key)(res.group(1)))] - - if self.values_map: - labels = [self.values_map.get(item, item) for item in labels] - - value = dump_pickle((sub_key, str2bytes(" ".join(labels)))) - return key, value diff --git a/cli/write_image.py b/cli/write_image.py deleted file mode 100644 index 87f8297..0000000 --- a/cli/write_image.py +++ /dev/null @@ -1,50 +0,0 @@ -import argparse - -from dataset_loaders import ImageLoader - -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.image import ImageWriteAdapter - - -def get_argument(): - parser = argparse.ArgumentParser(description="Convert pdf file to text detection and recognition label.") - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument("--folder", type=str, help="Directory to containing the image file", required=True) - - parser.add_argument("--suffix", default=".jpg", type=str, help="The suffix of image file") - - parser.add_argument( - "--lmdb-map-size", default=32212254720, type=int, help="Map size to dump lmdb file, default 30GB" # 30GB - ) - - parser.add_argument( - "--fn-md5-mode", - type=str, - help='The mode of handle with filename_to_md5 file. Only support ["r", "w"] mode', - required=True, - ) - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - - lmdb_obj = Lmdb(ImageWriteAdapter(path=args.lmdb_file, map_size=args.lmdb_map_size)) - lmdb_obj.write_loader( - ImageLoader( - directory=args.folder, - suffix=args.suffix, - fn_md5_mode=args.fn_md5_mode, - fn_md5_path=args.fn_md5_path, - ), - ) - - -if __name__ == "__main__": - main() diff --git a/cli/write_json_baidu.py b/cli/write_json_baidu.py deleted file mode 100644 index 9dabbce..0000000 --- a/cli/write_json_baidu.py +++ /dev/null @@ -1,83 +0,0 @@ -import argparse -import os - -from dataset_loaders import BaiduLoader - -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.text import TextWriteAdapter - - -def get_argument(): - parser = argparse.ArgumentParser( - description="Convert pdf file to text detection and recognition label.", - ) - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument("--folder", type=str, help="Directory to containing the label file", required=True) - - parser.add_argument("--suffix", default=".json", type=str, help="The suffix of label file") - - parser.add_argument( - "--fn-md5-mode", - type=str, - help='The mode of handle with filename_to_md5 file. Only support ["r", "w"] mode', - required=True, - ) - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - parser.add_argument( - "--keys-extracted", - type=str, - choices=["angle", "age", "expression,type", "gender,type", "glasses,type", "emotion,type", "mask,type"], - help="The key with multi level to extract from the label file", - required=True, - ) - - parser.add_argument( - "--key-probability", - type=float, - help="The minimum probability of value for attribute." 'Only using parameter when keys-extracted has "type"', - ) - - parser.add_argument( - "--values-map", - type=str, - help="List of normalize the value." 'Ex: "Female:0,Male:1,female:0,male:1,111:1,112:0"', - ) - - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - - if args.folder and not os.path.isdir(args.folder): - raise ValueError("Folder not exists") - - if args.folder and not args.suffix: - raise ValueError("Do not empty --suffix argument when handle with some folder") - - keys_extracted = args.keys_extracted.split(",") if args.keys_extracted else [] - - values_map = ( - {value.split(":")[0]: value.split(":")[1] for value in args.values_map.split(",")} if args.values_map else None - ) - - lmdb_obj = Lmdb(TextWriteAdapter(path=args.lmdb_file)) - lmdb_obj.write_loader( - BaiduLoader( - directory=args.folder, - suffix=args.suffix, - fn_md5_path=args.fn_md5_path, - keys_extracted=keys_extracted, - key_probability=args.key_probability, - values_map=values_map, - ), - ) - - -if __name__ == "__main__": - main() diff --git a/cli/write_json_facepp.py b/cli/write_json_facepp.py deleted file mode 100644 index 04ce5ca..0000000 --- a/cli/write_json_facepp.py +++ /dev/null @@ -1,78 +0,0 @@ -import argparse -import os - -from dataset_loaders import FaceppLoader - -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.text import TextWriteAdapter - - -def get_argument(): - parser = argparse.ArgumentParser( - description="Convert pdf file to text detection and recognition label.", - ) - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument("--folder", type=str, help="Directory to containing the label file", required=True) - - parser.add_argument("--suffix", default=".json", type=str, help="The suffix of label file") - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - parser.add_argument( - "--keys-extracted", - type=str, - choices=[ - "gender,value", - "age,value", - "headpose", - "emotion", - "facequality,value", - "ethnicity,value", - "beauty", - "glass,value", - ], - help="The key with multi level to extract from the label file", - required=True, - ) - - parser.add_argument( - "--values-map", - type=str, - help="List of normalize the value." 'Ex: "Female:0,Male:1,female:0,male:1,111:1,112:0"', - ) - - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - - if args.folder and not os.path.isdir(args.folder): - raise ValueError("Folder not exists") - - if args.folder and not args.suffix: - raise ValueError("Do not empty --suffix argument when handle with some folder") - - keys_extracted = args.keys_extracted.split(",") if args.keys_extracted else [] - - values_map = ( - {value.split(":")[0]: value.split(":")[1] for value in args.values_map.split(",")} if args.values_map else None - ) - - lmdb_obj = Lmdb(TextWriteAdapter(path=args.lmdb_file)) - lmdb_obj.write_loader( - FaceppLoader( - directory=args.folder, - suffix=args.suffix, - fn_md5_path=args.fn_md5_path, - keys_extracted=keys_extracted, - values_map=values_map, - ), - ) - - -if __name__ == "__main__": - main() diff --git a/cli/write_json_visage.py b/cli/write_json_visage.py deleted file mode 100644 index f873f6a..0000000 --- a/cli/write_json_visage.py +++ /dev/null @@ -1,72 +0,0 @@ -import argparse -import os - -from dataset_loaders import VisageLoader - -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.text import TextWriteAdapter - - -def get_argument(): - parser = argparse.ArgumentParser( - description="Convert pdf file to text detection and recognition label.", - ) - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument("--folder", type=str, help="Directory to containing the label file", required=True) - - parser.add_argument("--suffix", default=".json", type=str, help="The suffix of label file") - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - parser.add_argument( - "--keys-extracted", - type=str, - choices=[ - "gender", - "age", - ], - help="The key to extract from the label file", - required=True, - ) - - parser.add_argument( - "--values-map", - type=str, - help="List of normalize the value." 'Ex: "Female:0,Male:1,female:0,male:1,111:1,112:0"', - ) - - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - - if args.folder and not os.path.isdir(args.folder): - raise ValueError("Folder not exists") - - if args.folder and not args.suffix: - raise ValueError("Do not empty --suffix argument when handle with some folder") - - keys_extracted = args.keys_extracted.split(",") if args.keys_extracted else [] - - values_map = ( - {value.split(":")[0]: value.split(":")[1] for value in args.values_map.split(",")} if args.values_map else None - ) - - lmdb_obj = Lmdb(TextWriteAdapter(path=args.lmdb_file)) - lmdb_obj.write_loader( - VisageLoader( - directory=args.folder, - suffix=args.suffix, - fn_md5_path=args.fn_md5_path, - keys_extracted=keys_extracted, - values_map=values_map, - ), - ) - - -if __name__ == "__main__": - main() diff --git a/cli/write_scripts/write_baidu.sh b/cli/write_scripts/write_baidu.sh deleted file mode 100644 index a8c68a7..0000000 --- a/cli/write_scripts/write_baidu.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -set -eux - -# Write Label From Baidu Tool -root_data_path="/tmp/facedata/mount" -datasets=( - # "aaf" - # "afad" - # "afd" - "deepglint" - # "fairface" - "ffhq" - # "rfw" - # "utkface" -) -for dataset in "${datasets[@]}"; do - echo "Handling with dataset: $dataset ..." - echo "Building Gender LMDB File ..." - ## Gender - python cli/write_json_baidu.py \ - --lmdb-file "${root_data_path}/${dataset}/${dataset}_gender_baidu.lmdb" \ - --folder "${root_data_path}/${dataset}/label_baidu" \ - --suffix .json \ - --fn-md5-path "${root_data_path}/${dataset}/${dataset}_fn_md5.json" \ - --keys-extracted "gender,type" \ - --key-probability 0.9 \ - --values-map "Female:0,Male:1,female:0,male:1,111:1,112:0" - - echo "Building Age LMDB File ..." - ## Age - python cli/write_json_baidu.py \ - --lmdb-file "${root_data_path}/${dataset}/${dataset}_age_baidu.lmdb" \ - --folder "${root_data_path}/${dataset}/label_baidu" \ - --suffix .json \ - --fn-md5-path "${root_data_path}/${dataset}/${dataset}_fn_md5.json" \ - --keys-extracted "age" - -done diff --git a/cli/write_scripts/write_facepp.sh b/cli/write_scripts/write_facepp.sh deleted file mode 100644 index 91e822f..0000000 --- a/cli/write_scripts/write_facepp.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -set -eux - -# Write Label From Facepp Tool -root_data_path="/tmp/facedata/mount" -datasets=( - "aaf" - "afad" - "afd" - "deepglint" - "fairface" - "ffhq" - "rfw" - "utkface" - # "vggface2_cleandata_p1" - # "vggface2_cleandata_p2" - # "vggface2_cleandata_p3" - # "vggface2_cleandata_p4" - # "vggface2_cleandata_p5" - # "vggface2_cleandata_p6" - # "vggface2_cleandata_p7" - # "vggface2_cleandata_p8" - # "vggface2_cleandata_p9" - # "vggface2_cleandata_p10" -) -for dataset in "${datasets[@]}"; do - echo "Handling with dataset: $dataset ..." - echo "Building Gender LMDB File ..." - ## Gender - python cli/write_json_facepp.py \ - --lmdb-file "${root_data_path}/${dataset}/${dataset}_gender_facepp.lmdb" \ - --folder "${root_data_path}/${dataset}/label_facepp" \ - --suffix .json \ - --fn-md5-path "${root_data_path}/${dataset}/${dataset}_fn_md5.json" \ - --keys-extracted "gender,value" \ - --values-map "Female:0,Male:1,female:0,male:1,111:1,112:0" - - echo "Building Age LMDB File ..." - ## Age - python cli/write_json_facepp.py \ - --lmdb-file "${root_data_path}/${dataset}/${dataset}_age_facepp.lmdb" \ - --folder "${root_data_path}/${dataset}/label_facepp" \ - --suffix .json \ - --fn-md5-path "${root_data_path}/${dataset}/${dataset}_fn_md5.json" \ - --keys-extracted "age,value" - -done diff --git a/cli/write_scripts/write_image.sh b/cli/write_scripts/write_image.sh deleted file mode 100644 index 01b8570..0000000 --- a/cli/write_scripts/write_image.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash - -set -eux - -# Write Image -root_data_path="/tmp/facedata/mount" -datasets=( - "aaf;AAF/All-Age-Faces Dataset/original images;jpg" - "afad;AFAD;jpg" - "afd;generated_yellow-stylegan2;png" - "deepglint;DeepGlint_7000;jpg" - "fairface;fairface-img-margin125-trainval;jpg" - "rfw;test/data;jpg" - "utkface;UTKFace;jpg" - # "vggface2_cleandata_p1;VGGFACE2_Cleandata_p1;jpg" - # "vggface2_cleandata_p2;VGGFACE2_Cleandata_p2;jpg" - # "vggface2_cleandata_p3;VGGFACE2_Cleandata_p3;jpg" - # "vggface2_cleandata_p4;VGGFACE2_Cleandata_p4;jpg" - # "vggface2_cleandata_p5;VGGFACE2_Cleandata_p5;jpg" - # "vggface2_cleandata_p6;VGGFACE2_Cleandata_p6;jpg" - # "vggface2_cleandata_p7;VGGFACE2_Cleandata_p7;jpg" - # "vggface2_cleandata_p8;VGGFACE2_Cleandata_p8;jpg" - # "vggface2_cleandata_p9;VGGFACE2_Cleandata_p9;jpg" - # "vggface2_cleandata_p10;VGGFACE2_Cleandata_p10;jpg" -) -for dataset in "${datasets[@]}"; do - IFS=$';' - dataset_item=($dataset) - unset IFS - echo "Handling with dataset: ${dataset_item[0]} ..." - - python cli/write_image.py \ - --lmdb-file "${root_data_path}/${dataset_item[0]}/${dataset_item[0]}_image.lmdb" \ - --folder "${root_data_path}/${dataset_item[0]}/${dataset_item[1]}" \ - --suffix ".${dataset_item[2]}" \ - --fn-md5-mode w \ - --fn-md5-path "${root_data_path}/${dataset_item[0]}/${dataset_item[0]}_fn_md5.json" - -done - -# Special case for too large dataset -## FFHQ -python cli/write_image.py \ - --lmdb-file "/media/ubuntu/My Passport/ffhq_image.lmdb" \ - --folder "/media/ubuntu/My Passport/ffhq_1024x1024" \ - --suffix .png \ - --lmdb-map-size 214748364800 \ - --fn-md5-mode w \ - --fn-md5-path "${root_data_path}/ffhq/ffhq_fn_md5.json" - -## Write Cleaned Image -#python cli/write_image.py --lmdb-file "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/aaf_image.lmdb" \ -# --folder "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/AAF" \ -# --suffix .jpg \ -# --fn-md5-mode r \ -# --fn-md5-path "/tmp/facedata/lmdb/aaf_fn_md5.json" - -## Write Cleaned Image -#python cli/write_image.py --lmdb-file "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/fairface_image.lmdb" \ -# --folder "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/FairFace" \ -# --suffix .jpg \ -# --fn-md5-mode r \ -# --fn-md5-path "/tmp/facedata/lmdb/fairface_fn_md5.json" - -## Write Cleaned Image -#python cli/write_image.py --lmdb-file "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/ffhq_image.lmdb" \ -# --folder "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/FFHQ" \ -# --suffix .png \ -# --fn-md5-mode r \ -# --fn-md5-path "/tmp/facedata/lmdb/ffhq_fn_md5.json" - -## Write Cleaned Image -#python cli/write_image.py --lmdb-file "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/rfw_image.lmdb" \ -# --folder "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/RFW" \ -# --suffix .jpg \ -# --fn-md5-mode r \ -# --fn-md5-path "/tmp/facedata/lmdb/rfw_fn_md5.json" - -## Write Cleaned Image -#python cli/write_image.py --lmdb-file "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/utkface_image.lmdb" \ -# --folder "/tmp/facedata/SCRFD_500M_KPS_ALIGN_224x224/UTKFace" \ -# --suffix .jpg \ -# --fn-md5-mode r \ -# --fn-md5-path "/tmp/facedata/lmdb/utkface_fn_md5.json" diff --git a/cli/write_scripts/write_origin_label.sh b/cli/write_scripts/write_origin_label.sh deleted file mode 100644 index 2399ddd..0000000 --- a/cli/write_scripts/write_origin_label.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -set -eux - -## Write Label From Dataset -root_data_path="/tmp/facedata/mount" -# AAF -## Gender -python cli/write_text_label_in_some_txt.py \ - --lmdb-file "${root_data_path}/aaf/aaf_gender_origin.lmdb" \ - --files "${root_data_path}/aaf/AAF/All-Age-Faces Dataset/image sets/train.txt,${root_data_path}/aaf/AAF/All-Age-Faces Dataset/image sets/val.txt" \ - --delimiter " " \ - --key-index 0 \ - --values-index 1 \ - --fn-md5-path "${root_data_path}/aaf/aaf_fn_md5.json" -## Age -python cli/write_text_label_in_filename.py \ - --lmdb-file "${root_data_path}/aaf/aaf_age_origin.lmdb" \ - --folder "${root_data_path}/aaf/AAF/All-Age-Faces Dataset/original images" \ - --suffix .jpg \ - --delimiter "A" \ - --values-index 1 \ - --fn-md5-path "${root_data_path}/aaf/aaf_fn_md5.json" \ - --values-map "type:int" - -# AFAD -## Gender -python cli/write_text_label_in_key_in_some_txt.py \ - --lmdb-file "${root_data_path}/afad/afad_gender_origin.lmdb" \ - --files "${root_data_path}/afad/AFAD/AFAD-Full.txt" \ - --delimiter " " \ - --key-index 0 \ - --fn-md5-path "${root_data_path}/afad/afad_fn_md5.json" \ - --pattern-value-in-key "/(\d{3})/" \ - --type-value-in-key int \ - --values-map "Female:0,Male:1,female:0,male:1,111:1,112:0" -## Age -python cli/write_text_label_in_key_in_some_txt.py \ - --lmdb-file "${root_data_path}/afad/afad_age_origin.lmdb" \ - --files "${root_data_path}/afad/AFAD/AFAD-Full.txt" \ - --delimiter " " \ - --key-index 0 \ - --fn-md5-path "${root_data_path}/afad/afad_fn_md5.json" \ - --pattern-value-in-key "/(\d{2})/" \ - --type-value-in-key int - -# FairFace -## Gender -python cli/write_text_label_in_some_csv.py \ - --lmdb-file "${root_data_path}/fairface/fairface_gender_origin.lmdb" \ - --files "${root_data_path}/fairface/fairface_label_train.csv,${root_data_path}/facefair/fairface_label_val.csv" \ - --delimiter "," \ - --key-index 0 \ - --values-index 2 \ - --fn-md5-path "${root_data_path}/fairface/fairface_fn_md5.json" \ - --values-map "Female:0,Male:1,female:0,male:1,111:1,112:0" \ - --skip-header -## Age -python cli/write_text_label_in_some_csv.py \ - --lmdb-file "${root_data_path}/fairface/fairface_age_origin.lmdb" \ - --files "${root_data_path}/fairface/fairface_label_train.csv,${root_data_path}/facefair/fairface_label_val.csv" \ - --delimiter "," \ - --key-index 0 \ - --values-index 1 \ - --fn-md5-path "${root_data_path}/fairface/fairface_fn_md5.json" \ - --skip-header - -# FFHQ -## Age -python cli/write_text_label_in_some_txt.py \ - --lmdb-file "${root_data_path}/ffhq/ffhq_age_human.lmdb" \ - --files "${root_data_path}/ffhq/label_human.txt" \ - --delimiter " " \ - --key-index 0 \ - --values-index 1 \ - --fn-md5-path "${root_data_path}/ffhq/ffhq_fn_md5.json" - -# UTKFace -## Gender -python cli/write_text_label_in_filename.py \ - --lmdb-file "${root_data_path}/utkface/utkface_gender_origin.lmdb" \ - --folder "${root_data_path}/utkface/UTKFace" \ - --suffix .jpg \ - --delimiter "_" \ - --values-index 1 \ - --fn-md5-path "${root_data_path}/utkface/utkface_fn_md5.json" \ - --values-map "0:1,1:0" -## Age -python cli/write_text_label_in_filename.py \ - --lmdb-file "${root_data_path}/utkface/utkface_age_origin.lmdb" \ - --folder "${root_data_path}/utkface/UTKFace" \ - --suffix .jpg \ - --delimiter "_" \ - --values-index 0 \ - --fn-md5-path "${root_data_path}/utkface/utkface_fn_md5.json" - -# VGGFACE2_Cleandata_p1 -## Age -python cli/write_text_label_in_some_txt.py \ - --lmdb-file "${root_data_path}/vggface2_cleandata_p1/vggface2_cleandata_p1_age_origin.lmdb" \ - --files "${root_data_path}/vggface2_cleandata_p1/label_origin.txt" \ - --delimiter "\t" \ - --key-index 0 \ - --values-index 1 \ - --fn-md5-path "${root_data_path}/vggface2_cleandata_p1/vggface2_cleandata_p1_fn_md5.json" - -# VGGFACE2_Cleandata_p2 -## Age -python cli/write_text_label_in_some_txt.py \ - --lmdb-file "${root_data_path}/vggface2_cleandata_p2/vggface2_cleandata_p2_age_origin.lmdb" \ - --files "${root_data_path}/vggface2_cleandata_p2/label_origin.txt" \ - --delimiter "\t" \ - --key-index 0 \ - --values-index 1 \ - --fn-md5-path "${root_data_path}/vggface2_cleandata_p2/vggface2_cleandata_p2_fn_md5.json" - -# To p10 diff --git a/cli/write_scripts/write_visage.sh b/cli/write_scripts/write_visage.sh deleted file mode 100644 index dd9cd2b..0000000 --- a/cli/write_scripts/write_visage.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -set -eux - -# Write Label From Visage Tool -root_data_path="/tmp/facedata/mount" -datasets=( - "aaf" - "afad" - "afd" - # "deepglint" - "fairface" - # "ffhq" - "rfw" - "utkface" - # "vggface2_cleandata_p1" - # "vggface2_cleandata_p2" - # "vggface2_cleandata_p3" - # "vggface2_cleandata_p4" - # "vggface2_cleandata_p5" - # "vggface2_cleandata_p6" - # "vggface2_cleandata_p7" - # "vggface2_cleandata_p8" - # "vggface2_cleandata_p9" - # "vggface2_cleandata_p10" -) -for dataset in "${datasets[@]}"; do - echo "Handling with dataset: $dataset ..." - echo "Building Gender LMDB File ..." - ## Gender - python cli/write_json_visage.py \ - --lmdb-file "${root_data_path}/${dataset}/${dataset}_gender_visage.lmdb" \ - --folder "${root_data_path}/${dataset}/label_visage" \ - --suffix .json \ - --fn-md5-path "${root_data_path}/${dataset}/${dataset}_fn_md5.json" \ - --keys-extracted "gender" \ - --values-map "Female:0,Male:1,female:0,male:1,111:1,112:0" - - echo "Building Age LMDB File ..." - ## Age - python cli/write_json_visage.py \ - --lmdb-file "${root_data_path}/${dataset}/${dataset}_age_visage.lmdb" \ - --folder "${root_data_path}/${dataset}/label_visage" \ - --suffix .json \ - --fn-md5-path "${root_data_path}/${dataset}/${dataset}_fn_md5.json" \ - --keys-extracted "age" -done diff --git a/cli/write_text_label_in_filename.py b/cli/write_text_label_in_filename.py deleted file mode 100644 index 1ea7372..0000000 --- a/cli/write_text_label_in_filename.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -label of data in filename of image -ex: 12_0_0_20170117190914091 -""" - -import argparse -import os - -from cli.dataset_loaders import LabelInFilenameLoader -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.text import TextWriteAdapter - - -def unescaped_str(arg_str): - return arg_str.encode().decode("unicode_escape") - - -def get_argument(): - parser = argparse.ArgumentParser( - description="Convert pdf file to text detection and recognition label.", - ) - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument("--folder", type=str, help="Directory to containing the label file") - - parser.add_argument("--suffix", default=".txt", type=str, help="The suffix of label file") - - parser.add_argument( - "--delimiter", - type=unescaped_str, - choices=["\t", "\n", " ", ",", "_", "A"], - help="punctuation for split the label line", - required=True, - ) - - parser.add_argument( - "--values-index", - type=str, - help="The list of index to extract values from the label line, Except value: 0,1,2,3", - required=True, - ) - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - parser.add_argument( - "--values-map", - type=str, - help="List of normalize the value." 'Ex: "Female:0,Male:1,female:0,male:1,111:1,112:0"', - ) - - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - if args.folder and not os.path.isdir(args.folder): - raise ValueError("Folder not exists") - - if args.folder and (not args.suffix or not args.values_index): - raise ValueError("Do not empty --suffix or --values-index argument when handle with some folder") - - values_index = [int(value) for value in args.values_index.split(",")] - values_map = ( - {value.split(":")[0]: value.split(":")[1] for value in args.values_map.split(",")} if args.values_map else None - ) - - lmdb_obj = Lmdb(TextWriteAdapter(path=args.lmdb_file)) - lmdb_obj.write_loader( - LabelInFilenameLoader( - directory=args.folder, - suffix=args.suffix, - fn_md5_path=args.fn_md5_path, - values_map=values_map, - delimiter=args.delimiter, - values_index=values_index, - ), - ) - - -if __name__ == "__main__": - main() diff --git a/cli/write_text_label_in_key_in_some_txt.py b/cli/write_text_label_in_key_in_some_txt.py deleted file mode 100644 index 50b589a..0000000 --- a/cli/write_text_label_in_key_in_some_txt.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse - -from cli.dataset_loaders import LabelInKeyInSomeTxtLoader -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.text import TextWriteAdapter - - -def unescaped_str(arg_str): - return arg_str.encode().decode("unicode_escape") - - -def get_argument(): - parser = argparse.ArgumentParser( - description="Convert pdf file to text detection and recognition label.", - ) - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument( - "--files", - type=str, - help="The list of file path, Except value: /tmp/test1.txt,/tmp/test2.txt,/tmp/test3.txt", - required=True, - ) - - parser.add_argument( - "--delimiter", - type=unescaped_str, - choices=["\t", "\n", " ", ",", "_"], - help="punctuation for split the label line", - required=True, - ) - - parser.add_argument("--key-index", type=int, help="The index to extract key from the label line", required=True) - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - parser.add_argument( - "--pattern-value-in-key", - type=str, - help="The pattern of value in key", - required=True, - ) - - parser.add_argument( - "--type-value-in-key", - type=str, - choices=["int", "str", "float"], - help="The type of value in key", - required=True, - ) - - parser.add_argument( - "--values-map", - type=str, - help="List of normalize the value." 'Ex: "Female:0,Male:1,female:0,male:1,111:1,112:0"', - ) - - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - - values_map = ( - {value.split(":")[0]: value.split(":")[1] for value in args.values_map.split(",")} if args.values_map else None - ) - - lmdb_obj = Lmdb(TextWriteAdapter(path=args.lmdb_file)) - lmdb_obj.write_loader( - LabelInKeyInSomeTxtLoader( - file_paths=args.files.split(","), - fn_md5_path=args.fn_md5_path, - key_index=args.key_index, - pattern_value_in_key=args.pattern_value_in_key, - type_value_in_key=args.type_value_in_key, - values_map=values_map, - delimiter=args.delimiter, - ), - ) - - -if __name__ == "__main__": - main() diff --git a/cli/write_text_label_in_some_csv.py b/cli/write_text_label_in_some_csv.py deleted file mode 100644 index 1feb4c1..0000000 --- a/cli/write_text_label_in_some_csv.py +++ /dev/null @@ -1,81 +0,0 @@ -import argparse - -from cli.dataset_loaders import LabelInSomeCsvLoader -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.text import TextWriteAdapter - - -def unescaped_str(arg_str): - return arg_str.encode().decode("unicode_escape") - - -def get_argument(): - parser = argparse.ArgumentParser( - description="Convert pdf file to text detection and recognition label.", - ) - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument( - "--files", - type=str, - help="The list of file path, Except value: /tmp/test1.txt,/tmp/test2.txt,/tmp/test3.txt", - required=True, - ) - - parser.add_argument( - "--delimiter", - type=unescaped_str, - choices=["\t", "\n", " ", ",", "_"], - help="punctuation for split the label line", - required=True, - ) - - parser.add_argument("--key-index", type=int, help="The index to extract key from the label line", required=True) - - parser.add_argument( - "--values-index", - type=str, - help="The list of index to extract values from the label line, Except value: 0,1,2,3", - required=True, - ) - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - parser.add_argument( - "--values-map", - type=str, - help="List of normalize the value." 'Ex: "Female:0,Male:1,female:0,male:1,111:1,112:0"', - ) - - parser.add_argument( - "--skip-header", action=argparse.BooleanOptionalAction, help="ignore header of csv if True, default False" - ) - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - - values_index = [int(value) for value in args.values_index.split(",")] - values_map = ( - {value.split(":")[0]: value.split(":")[1] for value in args.values_map.split(",")} if args.values_map else None - ) - - lmdb_obj = Lmdb(TextWriteAdapter(path=args.lmdb_file)) - lmdb_obj.write_loader( - LabelInSomeCsvLoader( - file_paths=args.files.split(","), - fn_md5_path=args.fn_md5_path, - key_index=args.key_index, - values_index=values_index, - values_map=values_map, - delimiter=args.delimiter, - skip_header=args.skip_header | False, - ), - ) - - -if __name__ == "__main__": - main() diff --git a/cli/write_text_label_in_some_txt.py b/cli/write_text_label_in_some_txt.py deleted file mode 100644 index 7aa0bf8..0000000 --- a/cli/write_text_label_in_some_txt.py +++ /dev/null @@ -1,74 +0,0 @@ -import argparse - -from cli.dataset_loaders import LabelInSomeTxtLoader -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.text import TextWriteAdapter - - -def unescaped_str(arg_str): - return arg_str.encode().decode("unicode_escape") - - -def get_argument(): - parser = argparse.ArgumentParser( - description="Convert pdf file to text detection and recognition label.", - ) - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument( - "--files", type=str, help="The list of file path, Except value: /tmp/test1.txt,/tmp/test2.txt,/tmp/test3.txt" - ) - - parser.add_argument( - "--delimiter", - type=unescaped_str, - choices=["\t", "\n", " ", ",", "_"], - help="punctuation for split the label line", - required=True, - ) - - parser.add_argument("--key-index", type=int, help="The index to extract key from the label line", required=True) - - parser.add_argument( - "--values-index", - type=str, - help="The list of index to extract values from the label line, Except value: 0,1,2,3", - required=True, - ) - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - parser.add_argument( - "--values-map", - type=str, - help="List of normalize the value." 'Ex: "Female:0,Male:1,female:0,male:1,111:1,112:0"', - ) - - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - - values_index = [int(value) for value in args.values_index.split(",")] - values_map = ( - {value.split(":")[0]: value.split(":")[1] for value in args.values_map.split(",")} if args.values_map else None - ) - - lmdb_obj = Lmdb(TextWriteAdapter(path=args.lmdb_file)) - lmdb_obj.write_loader( - LabelInSomeTxtLoader( - file_paths=args.files.split(","), - fn_md5_path=args.fn_md5_path, - key_index=args.key_index, - values_index=values_index, - values_map=values_map, - delimiter=args.delimiter, - ), - ) - - -if __name__ == "__main__": - main() diff --git a/cli/write_text_label_in_txt.py b/cli/write_text_label_in_txt.py deleted file mode 100644 index de07091..0000000 --- a/cli/write_text_label_in_txt.py +++ /dev/null @@ -1,81 +0,0 @@ -""" -label of data in txt file, and map 1:1 with image -ex: 12_0_0_20170117190914091 -""" - -import argparse -import os - -from cli.dataset_loaders import LabelInTxtLoader -from lmdbsystem.lmdb import Lmdb -from lmdbsystem.write_adapters.text import TextWriteAdapter - - -def unescaped_str(arg_str): - return arg_str.encode().decode("unicode_escape") - - -def get_argument(): - parser = argparse.ArgumentParser( - description="Convert pdf file to text detection and recognition label.", - ) - - parser.add_argument("--lmdb-file", type=str, help="The path of lmdb file", required=True) - - parser.add_argument("--folder", type=str, help="Directory to containing the label file") - - parser.add_argument("--suffix", default=".txt", type=str, help="The suffix of label file", required=True) - - parser.add_argument( - "--delimiter", - type=unescaped_str, - choices=["\t", "\n", " ", ",", "_"], - help="punctuation for split the label line", - required=True, - ) - - parser.add_argument( - "--values-index", - type=str, - help="The list of index to extract values from the label line, Except value: 0,1,2,3", - required=True, - ) - - parser.add_argument("--fn-md5-path", type=str, help="The path of filename_to_md5 file", required=True) - - parser.add_argument( - "--values-map", - type=str, - help="List of normalize the value." 'Ex: "Female:0,Male:1,female:0,male:1,111:1,112:0"', - ) - - parser.add_argument("--from-filename", action=argparse.BooleanOptionalAction, help="Extract value from filename") - args = parser.parse_args() - return args - - -def main(): - args = get_argument() - if args.folder and not os.path.isdir(args.folder): - raise ValueError("Folder not exists") - - values_index = [int(value) for value in args.values_index.split(",")] - values_map = ( - {value.split(":")[0]: value.split(":")[1] for value in args.values_map.split(",")} if args.values_map else None - ) - - lmdb_obj = Lmdb(TextWriteAdapter(path=args.lmdb_file)) - lmdb_obj.write_loader( - LabelInTxtLoader( - directory=args.folder, - suffix=args.suffix, - fn_md5_path=args.fn_md5_path, - values_map=values_map, - delimiter=args.delimiter, - values_index=values_index, - ), - ) - - -if __name__ == "__main__": - main()