Skip to content

Commit

Permalink
update dats importer
Browse files: browse the repository at this point in the history
  • Loading branch information
bigabig committed Oct 14, 2024
1 parent 7eb9c29 commit 874687f
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 39 deletions.
3 changes: 3 additions & 0 deletions tools/importer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ pip install python-magic
## Usage

```
# import klimawirtschaft
python importer/dats_importer.py --input_dir /ltstorage/shares/projects/dwts/backend/src/dev_notebooks/data/KlimaWirtschaft/json --backend_url http://localhost:19002/ --project_id 86 --tag_name wirtschaft --tag_description wirtschaft --is_json --metadata_keys Newspaper Date Length Section Author Headline --metadata_types STRING DATE NUMBER STRING STRING STRING --doctype text --content_key Article --mime_type text/html --username <your-email>
# import klimaallgemein metadata
python importer/dats_importer_metadata.py --input_dir /ltstorage/shares/projects/dwts/backend/src/dev_notebooks/data/KlimaAllgemein/json2 --backend_url http://localhost:19002/ --project_id 84 --metadata_keys paper paper_db headline date --metadata_types STRING STRING STRING DATE --doctype text --username <your-email>
Expand Down
9 changes: 6 additions & 3 deletions tools/importer/dats_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def bulk_apply_tags(self, sdoc_ids: List[int], tag_ids: List[int]):

# METADATA
def create_project_metadata(
self, proj_id: int, key: str, metatype: str, doctype: str
self, proj_id: int, key: str, metatype: str, doctype: str, description: str
):
# metatype is one of: STRING, DATE, BOOLEAN, NUMBER, LIST
# doctype is one of: text, image, video, audio
Expand All @@ -280,6 +280,7 @@ def create_project_metadata(
"metatype": metatype,
"read_only": False,
"doctype": doctype,
"description": description,
"project_id": proj_id,
}
),
Expand Down Expand Up @@ -343,6 +344,7 @@ def update_sdoc_metadata(

# get project
project = dats.get_proj_by_title(title="test")
assert project is not None
print("got project by title", project)

# get project
Expand All @@ -365,6 +367,7 @@ def update_sdoc_metadata(

# get tag
tag = dats.get_tag_by_name(proj_id=project["id"], name="test tag")
assert tag is not None
print("got tag", tag)

# get tags
Expand Down Expand Up @@ -417,10 +420,10 @@ def update_sdoc_metadata(

# create project metadata
project_metadata = dats.create_project_metadata(
proj_id=project["id"], key="sdoc_id", metatype="STRING", doctype="text"
proj_id=project["id"], key="sdoc_id", metatype="STRING", doctype="text", description="sdoc_id"
)
project_metadata = dats.create_project_metadata(
proj_id=project["id"], key="sdoc_id", metatype="STRING", doctype="image"
proj_id=project["id"], key="sdoc_id", metatype="STRING", doctype="image", description="sdoc_id"
)
print("created project metadata", project_metadata)

Expand Down
47 changes: 11 additions & 36 deletions tools/importer/dats_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,14 +157,14 @@
api.login()
api.me()

# create new project if it does not exist
# if project_id is set, use that
if args.project_id != -1:
project = api.get_proj_by_id(args.project_id)
if project is None:
print(f"Project with ID {args.project_id} does not exist!")
exit()
else:
# create new project if it does not exist
title = args.project_name
project = api.get_proj_by_title(title)
if project is None:
Expand All @@ -180,7 +180,7 @@
for key, metatype in zip(args.metadata_keys, args.metadata_types):
if key not in project_metadata_map:
project_metadata = api.create_project_metadata(
proj_id=project["id"], key=key, metatype=metatype, doctype=args.doctype
proj_id=project["id"], key=key, metatype=metatype, doctype=args.doctype, description=key
)
project_metadata_map[key] = project_metadata

Expand Down Expand Up @@ -211,20 +211,12 @@
if data[args.content_key] == "":
print(f"Skipping file {file.name} because {args.content_key} is empty!")
continue
json_data[filename] = data
if args.mime_type is None:
mime = magic.from_buffer(data[args.content_key], mime=True)
else:
mime = args.mime_type
sdoc_id = api.resolve_sdoc_id_from_proj_and_filename(
proj_id=project["id"], filename=filename
)
if sdoc_id is not None:
print(
f"Skipping file {filename} because it already exists in the project!"
)
continue

json_data[filename] = data
content = str(data[args.content_key])
files.append(("uploaded_files", (filename, content.encode("utf-8"), mime)))
except Exception as e:
Expand All @@ -237,7 +229,7 @@
mime = args.mime_type
files.append(("uploaded_files", (filename, file_bytes, mime)))

# remove duplicate files
# remove duplicate files by name
temp = {upload_file[1][0]: upload_file for upload_file in files}
files = list(temp.values())

Expand Down Expand Up @@ -305,6 +297,7 @@ def upload_file_batch(file_batch: List[Tuple[str, Tuple[str, bytes, str]]]):


# create new tag if it does not exist
api.refresh_login()
tag = api.get_tag_by_name(proj_id=project["id"], name=args.tag_name)
if tag is None:
tag = api.create_tag(
Expand All @@ -323,9 +316,15 @@ def upload_file_batch(file_batch: List[Tuple[str, Tuple[str, bytes, str]]]):

# apply sdoc metadata
applied = set()
idx = 0
for filename, data in tqdm(
json_data.items(), total=len(json_data), desc="Applying metadata to sdocs... "
):
# refresh login
if idx % 1000 == 0:
api.refresh_login()
idx += 1

sdoc_id = api.resolve_sdoc_id_from_proj_and_filename(
proj_id=project["id"], filename=filename
)
Expand All @@ -337,28 +336,4 @@ def upload_file_batch(file_batch: List[Tuple[str, Tuple[str, bytes, str]]]):
)
applied.add(sdoc_id)

# TODO: what about this?
# for image_name in data["image_names"]:
# if image_name:
# sdoc_id = api.resolve_sdoc_id_from_proj_and_filename(
# proj_id=project["id"], filename=filename
# )
# if sdoc_id not in applied and sdoc_id is not None:
# api.create_metadata(sdoc_id=sdoc_id, key="origin", value=data["url"])
# if "published_date" in data and data["published_date"] != "":
# api.create_metadata(
# sdoc_id=sdoc_id,
# key="published_date",
# value=data["published_date"],
# )
# if "visited_date" in data and data["visited_date"] != "":
# api.create_metadata(
# sdoc_id=sdoc_id, key="visited_date", value=data["visited_date"]
# )
# if "author" in data and data["author"] != "":
# api.create_metadata(
# sdoc_id=sdoc_id, key="author", value=data["author"]
# )
# applied.add(sdoc_id)

print("(: FINISHED :)")

0 comments on commit 874687f

Please sign in to comment.