Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Missing Images and Annotations in the 17K-Graffiti Dataset #1

Open
dichrogfx opened this issue Aug 13, 2024 · 0 comments
Open

Missing Images and Annotations in the 17K-Graffiti Dataset #1

dichrogfx opened this issue Aug 13, 2024 · 0 comments

Comments

@dichrogfx
Copy link

Hi,

I'm currently using the 17K-Graffiti dataset for my research, and I've encountered an issue where a significant number of image files are missing from the training set. Specifically, 1131 images referenced in the annotations file could not be found in the dataset I downloaded. Below is the code snippet I used to identify the missing files:

import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm

# Define paths
# Expected dataset layout on disk:
#   <base>/train/graffiti/*.jpg and <base>/test/graffiti/*.jpg  (images)
#   <base>/train_bboxes.pkl and <base>/test_bboxes.pkl          (pickled annotations)
base_path = '../Data/17kGraffiti'
test_path = os.path.join(base_path, 'test/graffiti')
train_path = os.path.join(base_path, 'train/graffiti')
test_labels_path = os.path.join(base_path, 'test_bboxes.pkl')
train_labels_path = os.path.join(base_path, 'train_bboxes.pkl')

# Load labels
def load_labels_pandas(pkl_path):
    """Read the pickled annotation table at *pkl_path* and return it as a DataFrame."""
    labels = pd.read_pickle(pkl_path)
    return labels

# Convert bounding boxes to YOLO format
def convert_bbox_to_yolo(size, bbox):
    """Convert a corner-format box to normalized YOLO format.

    size: (width, height) of the image in pixels.
    bbox: (x_min, y_min, x_max, y_max) in pixels.
    Returns (x_center, y_center, box_width, box_height), each scaled to [0, 1].
    """
    # Precompute the reciprocals once; multiplying by them normalizes
    # pixel coordinates to the [0, 1] range YOLO expects.
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]
    x_min, y_min, x_max, y_max = bbox[0], bbox[1], bbox[2], bbox[3]
    x_center = ((x_min + x_max) / 2.0) * scale_x
    y_center = ((y_min + y_max) / 2.0) * scale_y
    box_w = (x_max - x_min) * scale_x
    box_h = (y_max - y_min) * scale_y
    return (x_center, y_center, box_w, box_h)

# Save YOLO labels
def save_yolo_labels(labels, img_dir, output_dir):
    """Write one YOLO-format '.txt' label file per annotated image.

    labels:     DataFrame with a 'FileName' column (image stem, no extension)
                and a 'bbox' column (iterable of (x_min, y_min, x_max, y_max)
                pixel boxes).
    img_dir:    directory holding '<FileName>.jpg' images; each image is read
                only to obtain its pixel dimensions for normalization.
    output_dir: destination directory for '<FileName>.txt' files, created if
                missing. Each line is '0 x_c y_c w h' (single class 0,
                coordinates normalized to [0, 1]).

    Rows whose image is missing or has an unexpected shape are reported and
    skipped rather than aborting the whole run.
    """
    os.makedirs(output_dir, exist_ok=True)  # atomic create-if-missing; no racy exists() check
    printed = 0  # count debug samples explicitly: a filtered DataFrame keeps
                 # its original (non-0..n) index, so `idx < 5` is unreliable
    for idx, row in tqdm(labels.iterrows(), total=labels.shape[0], desc="Processing Labels"):
        img_file = row['FileName'] + '.jpg'
        img_path = os.path.join(img_dir, img_file)

        try:
            img = mpimg.imread(img_path)
        except FileNotFoundError:
            print(f"Image not found: {img_path}")
            continue

        # Color images come back (h, w, channels); grayscale (h, w).
        if len(img.shape) == 3:
            h, w, _ = img.shape
        elif len(img.shape) == 2:
            h, w = img.shape
        else:
            print(f"Unexpected image shape: {img.shape} for image {img_path}")
            continue

        bboxes = row['bbox']
        yolo_bbox = None  # stays None for rows with an empty box list
        with open(os.path.join(output_dir, row['FileName'] + '.txt'), 'w') as f:
            for bbox in bboxes:
                yolo_bbox = convert_bbox_to_yolo((w, h), bbox)
                f.write(f"0 {yolo_bbox[0]} {yolo_bbox[1]} {yolo_bbox[2]} {yolo_bbox[3]}\n")

        # Show the first few conversions as a sanity check; the None guard
        # prevents the NameError the original hit on rows with no boxes.
        if printed < 5 and yolo_bbox is not None:
            print(f"File: {row['FileName']}")
            print(f"YOLO bbox: {yolo_bbox}\n")
            printed += 1

# Load labels
print("Loading labels...")
test_labels = load_labels_pandas(test_labels_path)
train_labels = load_labels_pandas(train_labels_path)

# Print sample labels
print("Test labels sample:")
print(test_labels.head())
print("\nTrain labels sample:")
print(train_labels.head())

# Check for missing files.
# BUG FIX: the original tested a constant string against the listing, so the
# loop never looked at the current row's filename. Compare each annotated
# stem's '<stem>.jpg' against the directory contents; using a set makes every
# membership test O(1) instead of scanning the whole listing per row.
train_files = set(os.listdir(train_path))
missing_files = []

for filename in train_labels['FileName']:
    if f"{filename}.jpg" not in train_files:
        missing_files.append(filename)

print(f"Number of missing files in training set: {len(missing_files)}")
print("Sample missing files:", missing_files[:5])

# Drop annotations whose image is absent so conversion doesn't warn per row
if missing_files:
    updated_train_labels = train_labels[~train_labels['FileName'].isin(missing_files)]
else:
    updated_train_labels = train_labels

print(f"Updated number of training labels: {len(updated_train_labels)}")

# Convert and save labels in YOLO format
print("Converting and saving labels to YOLO format...")
save_yolo_labels(updated_train_labels, train_path, os.path.join(base_path, 'train/labels'))
save_yolo_labels(test_labels, test_path, os.path.join(base_path, 'test/labels'))
print("Conversion and saving complete.")

Sample Output:

Number of missing files in training set: 1131
Sample missing files: ['10008971653_d32f09b87b_c', '10034339546_8a2486cbc9_c', '10112431913_06b2dfb89a_c', '10121326145_df091e3dd8_c', '10185187695_39e7589395_c']

I also retrieved the annotations from this repository, but it seems that my dataset might be incomplete or incorrectly structured. Could you please verify whether there might be an issue with the provided dataset or advise on how to resolve this?

Thank you in advance for your help!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant