-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
134 lines (102 loc) · 4.66 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import pandas as pd
from PIL import Image
# Project directory that holds the CSV and the image folders
# (update to match the local machine)
project_root = r'C:\Users\azadk\OneDrive\Desktop\projects\bmi_detection'

# Image folders: one for the side pictures, one for the front pictures
sides_folder = rf'{project_root}\illinois_doc_dataset\side\side'
front_folder = rf'{project_root}\illinois_doc_dataset\front\front'

# Read the dataset CSV into a DataFrame
dataset_path = rf'{project_root}\bmi.csv'
df = pd.read_csv(dataset_path)
# Function to delete corrupted images
def delete_corrupted_images(folder_path):
    """Delete unreadable image files from a folder and report their IDs.

    Every regular file in ``folder_path`` is opened with Pillow and checked
    with ``Image.verify()``. A file that fails the check is removed from disk.

    Args:
        folder_path: Directory whose files are checked.

    Returns:
        A list holding, for each file that failed verification, its file name
        without extension and without surrounding whitespace. The ID is
        reported even when the file could not be deleted afterwards.
    """
    corrupted_ids = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        # Sub-directories are not images; opening one raises OSError, which
        # would otherwise be misreported as a corrupted image.
        if not os.path.isfile(file_path):
            continue
        try:
            with Image.open(file_path) as img:
                img.verify()  # Verify the image is not corrupted
        except (OSError, SyntaxError):
            file_id, _ = os.path.splitext(filename)
            corrupted_ids.append(file_id.strip())
            # The image file is closed again by the time this handler runs,
            # so the removal is not blocked by our own open handle.
            try:
                os.remove(file_path)
            except OSError as e:
                print(f"Error deleting corrupted image {file_path}: {e}")
            else:
                # Only claim the deletion once it has actually succeeded.
                print(f"Corrupted image found and deleted: {file_path}")
    return corrupted_ids
# Remove corrupted images from both folders
corrupted_side_ids = delete_corrupted_images(sides_folder)
corrupted_front_ids = delete_corrupted_images(front_folder)
# Combine corrupted IDs
corrupted_ids = set(corrupted_side_ids + corrupted_front_ids)

# Normalise the ID column once (as string, surrounding whitespace removed) so
# the missing-file check and the row filter compare the same representation.
id_strings = df['id'].astype(str).str.strip()

# An ID counts as missing when its side image or its front image is absent
missing_ids = [
    image_id
    for image_id in id_strings
    if not (
        os.path.exists(os.path.join(sides_folder, f"{image_id}.jpg"))
        and os.path.exists(os.path.join(front_folder, f"{image_id}.jpg"))
    )
]

# Combine corrupted and missing IDs
ids_to_remove = corrupted_ids.union(missing_ids)

# Remove rows with corrupted or missing images
df = df[~id_strings.isin(ids_to_remove)]

# Count the number of males and females
gender_counts = df['sex'].value_counts()
num_males = gender_counts.get('Male', 0)
num_females = gender_counts.get('Female', 0)
print(f"Number of males: {num_males}")
print(f"Number of females: {num_females}")

# Balance the dataset by removing extra male rows
if num_males > num_females:
    males_to_drop = num_males - num_females
    # Fixed random_state keeps the sampled subset the same between runs
    male_ids_to_drop = (
        df[df['sex'] == 'Male']
        .sample(n=males_to_drop, random_state=42)['id']
        .tolist()
    )
    df = df[~df['id'].isin(male_ids_to_drop)]
    # Record the dropped IDs in the same normalised string form as the rest
    ids_to_remove.update(str(male_id).strip() for male_id in male_ids_to_drop)
    print(f"Dropped {len(male_ids_to_drop)} male rows to balance genders.")

# Save the updated dataset (written to dataset_path, index column omitted)
df.to_csv(dataset_path, index=False)
print("Updated dataset saved.")
# Function to delete images by IDs
def delete_images(folder_path, ids_to_delete):
    """Delete every file in a folder whose base name is one of the given IDs.

    A file matches when its name without extension, stripped of surrounding
    whitespace, equals one of the IDs. Deletion is best-effort: a file that
    cannot be removed is reported and the loop carries on.

    Args:
        folder_path: Directory to scan.
        ids_to_delete: Iterable of IDs. Each ID is converted with ``str`` and
            stripped before comparison, so integer IDs and one-shot iterators
            are accepted as well as sets or lists of strings.

    Returns:
        None.
    """
    # Materialise once: gives O(1) lookups and keeps a one-shot iterator from
    # being consumed by the first membership test.
    wanted_ids = {str(item).strip() for item in ids_to_delete}
    for filename in os.listdir(folder_path):
        file_id, _ = os.path.splitext(filename)
        if file_id.strip() not in wanted_ids:
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            os.remove(file_path)
        except OSError as e:
            print(f"Error deleting {file_path}: {e}")
        else:
            print(f"Deleted: {file_path}")
# Purge the image files of every removed ID, side folder first, then front
for image_folder in (sides_folder, front_folder):
    delete_images(image_folder, ids_to_remove)
print("Image deletion process completed.")
import os
import pandas as pd
from PIL import Image
# Load your dataset
dataset_path = r'C:\Users\azadk\OneDrive\Desktop\projects\bmi_detection\bmi.csv'  # Update with the actual path
df = pd.read_csv(dataset_path)

# Report, per column, whether every value in it is missing
print(df.isnull().all())

# Encode the 'sex' column as binary (if not already).
# Each value is looked up with itself as the fallback, so values outside the
# table (for example 0/1 from an earlier run of this script) are kept as they
# are. A plain dict-based Series.map would turn them into NaN, and because the
# result is written back to the input file, a second run would wipe the column.
sex_codes = {'Male': 1, 'Female': 0}  # Update as per your dataset
df['sex'] = df['sex'].map(lambda label: sex_codes.get(label, label))
print(df)

# Save the cleaned dataframe twice: over the input CSV and as a separate copy
for output_path in (
    r'C:\Users\azadk\OneDrive\Desktop\projects\bmi_detection\bmi.csv',
    r'C:\Users\azadk\OneDrive\Desktop\projects\bmi_detection\cleaned_bmi.csv',
):
    df.to_csv(output_path, index=False)
    print(f"Cleaned dataset saved to {output_path}")
# Load the CSV file (path is relative to the current working directory)
df = pd.read_csv('bmi.csv')

# Convert height to inches (assumes height is in meters).
# The previous factor of 100 produced centimetres, which does not fit the
# inches-based formula below.
METERS_TO_INCHES = 39.3701
df['height'] = df['height'] * METERS_TO_INCHES

# Calculate BMI with the imperial formula: 703 * weight / height^2
# NOTE(review): the 703 factor presumes weight in pounds and height in inches;
# confirm the units of the 'weight' column.
df['BMI'] = (df['weight'] * 703) / (df['height'] ** 2)

# Save the modified DataFrame to a new CSV file
output_file = 'bmi_with_bmi_column.csv'
df.to_csv(output_file, index=False)
# Report the file that was actually written
print(f"BMI column added and file saved as '{output_file}'.")