Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Missing Images and Annotations in the 17K-Graffiti Dataset #1

Open
dichrogfx opened this issue Aug 13, 2024 · 0 comments
Open

Missing Images and Annotations in the 17K-Graffiti Dataset #1

dichrogfx opened this issue Aug 13, 2024 · 0 comments

Comments

@dichrogfx
Copy link

Hi,

I'm currently using the 17K-Graffiti dataset for my research, and I've encountered an issue where a significant number of image files are missing from the training set. Specifically, 1131 images referenced in the annotations file could not be found in the dataset I downloaded. Below is the code snippet I used to identify the missing files:

import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm

# Define paths
# Expected dataset layout on disk:
#   <base>/train/graffiti/*.jpg and <base>/test/graffiti/*.jpg  (images)
#   <base>/train_bboxes.pkl and <base>/test_bboxes.pkl          (pickled annotations)
base_path = '../Data/17kGraffiti'
test_path = os.path.join(base_path, 'test/graffiti')
train_path = os.path.join(base_path, 'train/graffiti')
test_labels_path = os.path.join(base_path, 'test_bboxes.pkl')
train_labels_path = os.path.join(base_path, 'train_bboxes.pkl')

# Load labels
def load_labels_pandas(pkl_path):
    """Read the pickled annotation table at *pkl_path* and return it as a DataFrame."""
    labels = pd.read_pickle(pkl_path)
    return labels

# Convert bounding boxes to YOLO format
def convert_bbox_to_yolo(size, bbox):
    """Convert a corner-format box to normalized YOLO format.

    size: (width, height) of the image in pixels.
    bbox: (x_min, y_min, x_max, y_max) in pixels.
    Returns (x_center, y_center, box_width, box_height), each scaled to [0, 1].
    """
    # Precompute the reciprocals once; multiplying by them normalizes
    # pixel coordinates to the [0, 1] range YOLO expects.
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]
    x_min, y_min, x_max, y_max = bbox[0], bbox[1], bbox[2], bbox[3]
    x_center = ((x_min + x_max) / 2.0) * scale_x
    y_center = ((y_min + y_max) / 2.0) * scale_y
    box_w = (x_max - x_min) * scale_x
    box_h = (y_max - y_min) * scale_y
    return (x_center, y_center, box_w, box_h)

# Save YOLO labels
def save_yolo_labels(labels, img_dir, output_dir):
    """Write one YOLO-format '.txt' label file per annotated image.

    labels:     DataFrame with a 'FileName' column (image stem, no extension)
                and a 'bbox' column (iterable of (x_min, y_min, x_max, y_max)
                pixel boxes).
    img_dir:    directory holding '<FileName>.jpg' images; each image is read
                only to obtain its pixel dimensions for normalization.
    output_dir: destination directory for '<FileName>.txt' files, created if
                missing. Each line is '0 x_c y_c w h' (single class 0,
                coordinates normalized to [0, 1]).

    Rows whose image is missing or has an unexpected shape are reported and
    skipped rather than aborting the whole run.
    """
    os.makedirs(output_dir, exist_ok=True)  # atomic create-if-missing; no racy exists() check
    printed = 0  # count debug samples explicitly: a filtered DataFrame keeps
                 # its original (non-0..n) index, so `idx < 5` is unreliable
    for idx, row in tqdm(labels.iterrows(), total=labels.shape[0], desc="Processing Labels"):
        img_file = row['FileName'] + '.jpg'
        img_path = os.path.join(img_dir, img_file)

        try:
            img = mpimg.imread(img_path)
        except FileNotFoundError:
            print(f"Image not found: {img_path}")
            continue

        # Color images come back (h, w, channels); grayscale (h, w).
        if len(img.shape) == 3:
            h, w, _ = img.shape
        elif len(img.shape) == 2:
            h, w = img.shape
        else:
            print(f"Unexpected image shape: {img.shape} for image {img_path}")
            continue

        bboxes = row['bbox']
        yolo_bbox = None  # stays None for rows with an empty box list
        with open(os.path.join(output_dir, row['FileName'] + '.txt'), 'w') as f:
            for bbox in bboxes:
                yolo_bbox = convert_bbox_to_yolo((w, h), bbox)
                f.write(f"0 {yolo_bbox[0]} {yolo_bbox[1]} {yolo_bbox[2]} {yolo_bbox[3]}\n")

        # Show the first few conversions as a sanity check; the None guard
        # prevents the NameError the original hit on rows with no boxes.
        if printed < 5 and yolo_bbox is not None:
            print(f"File: {row['FileName']}")
            print(f"YOLO bbox: {yolo_bbox}\n")
            printed += 1

# Load labels
print("Loading labels...")
test_labels = load_labels_pandas(test_labels_path)
train_labels = load_labels_pandas(train_labels_path)

# Print sample labels
print("Test labels sample:")
print(test_labels.head())
print("\nTrain labels sample:")
print(train_labels.head())

# Check for missing files.
# BUG FIX: the original tested a constant string against the listing, so the
# loop never looked at the current row's filename. Compare each annotated
# stem's '<stem>.jpg' against the directory contents; using a set makes every
# membership test O(1) instead of scanning the whole listing per row.
train_files = set(os.listdir(train_path))
missing_files = []

for filename in train_labels['FileName']:
    if f"{filename}.jpg" not in train_files:
        missing_files.append(filename)

print(f"Number of missing files in training set: {len(missing_files)}")
print("Sample missing files:", missing_files[:5])

# Drop annotations whose image is absent so conversion doesn't warn per row
if missing_files:
    updated_train_labels = train_labels[~train_labels['FileName'].isin(missing_files)]
else:
    updated_train_labels = train_labels

print(f"Updated number of training labels: {len(updated_train_labels)}")

# Convert and save labels in YOLO format
print("Converting and saving labels to YOLO format...")
save_yolo_labels(updated_train_labels, train_path, os.path.join(base_path, 'train/labels'))
save_yolo_labels(test_labels, test_path, os.path.join(base_path, 'test/labels'))
print("Conversion and saving complete.")

Sample Output:

Number of missing files in training set: 1131
Sample missing files: ['10008971653_d32f09b87b_c', '10034339546_8a2486cbc9_c', '10112431913_06b2dfb89a_c', '10121326145_df091e3dd8_c', '10185187695_39e7589395_c']

I also retrieved the annotations from this repository, but it seems that my dataset might be incomplete or incorrectly structured. Could you please verify whether there might be an issue with the provided dataset or advise on how to resolve this?

Thank you in advance for your help!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant