-
Notifications
You must be signed in to change notification settings - Fork 0
/
directory_audit_Step1.py
322 lines (269 loc) · 13 KB
/
directory_audit_Step1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
import os
import logging
import pandas as pd
import re
import datetime
import platform
from pathlib import Path
import time
import psutil
# Default destination for generated workbooks: the current user's Desktop.
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")

# Only ERROR-level records are kept; they go to error.log (a relative path,
# so the file is created in the current working directory).
logging.basicConfig(level=logging.ERROR, filename="error.log")
def audit_directory_process(desktop_path):
    """Interactively audit a directory and save the result as an Excel workbook.

    Prompts for the directory to audit, the sub-folders to exclude and an
    output file name, then writes ``<name>.xlsx`` into ``desktop_path`` with
    two sheets: ``RawData`` (one row per file) and ``AuditSheet`` (the
    folder/file hierarchy).

    Args:
        desktop_path: Directory in which the workbook is created.

    Returns:
        None. An invalid directory only prints a message; a failure while
        saving is logged to the error log and reported on stdout.
    """
    directory_path = input("Enter the directory path to audit: ").strip()
    print(f"Path to audit: '{os.path.abspath(directory_path)}'")
    if not os.path.isdir(directory_path):
        print("Invalid directory path. Please enter a valid path.")
        return

    # list_files() reports fully resolved paths, so the hierarchy has to be
    # computed against the resolved root too. Passing the raw user input
    # (e.g. "." or a path through a symlink) makes Path.relative_to() raise
    # ValueError inside generate_hierarchical_structure().
    resolved_root = Path(directory_path).resolve()

    exclude_folders = get_exclusion_list(directory_path)

    # Keep asking until the name is non-empty and limited to safe characters.
    while True:
        output_file_name = input("Enter the desired output file name (without extension): ").strip()
        if output_file_name and re.match("^[a-zA-Z0-9_-]*$", output_file_name):
            output_file = os.path.join(desktop_path, f'{output_file_name}.xlsx')
            break
        print("Invalid file name. Please use only letters, numbers, hyphens, and underscores.")

    file_data = list_files(directory_path, exclude_folders)
    df_files = pd.DataFrame(file_data)
    hierarchical_data = generate_hierarchical_structure(resolved_root, file_data)
    df_hierarchy = pd.DataFrame(hierarchical_data)

    try:
        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            df_files.to_excel(writer, index=False, sheet_name='RawData')
            df_hierarchy.to_excel(writer, index=False, sheet_name='AuditSheet')
        print(f"File list saved to {output_file}")
    except Exception as e:
        # Boundary handler: any writer failure is logged rather than crashing
        # the interactive session.
        logging.error(f"Failed to save the file {output_file}. Error: {e}")
        print(f"An error occurred while saving the file. Please check the error.log for more details.")
def get_file_owner(file_path):
    """Return the account name that owns ``file_path`` (Windows only).

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The owner's account name on Windows, ``"Unknown"`` when the lookup
        fails there, and ``""`` on every other platform (no lookup is
        attempted).
    """
    if platform.system() != 'Windows':
        return ""

    try:
        import win32security  # provided by pywin32; imported lazily so other platforms never need it
        security_descriptor = win32security.GetFileSecurity(
            file_path, win32security.OWNER_SECURITY_INFORMATION
        )
        owner_sid = security_descriptor.GetSecurityDescriptorOwner()
        name, _, _ = win32security.LookupAccountSid(None, owner_sid)
        return name
    except FileNotFoundError as e:
        logging.error(f"File not found when trying to get owner: {file_path}, error: {e}")
        return "Unknown"
    except Exception as e:
        # pywin32 reports API failures (missing file, access denied, SID with
        # no account mapping) as pywintypes.error, which is not an OSError,
        # and a missing pywin32 install raises ImportError. Ownership is
        # best-effort metadata, so log and carry on instead of aborting.
        logging.error(f"Could not determine owner of: {file_path}, error: {e}")
        return "Unknown"
def list_files(dir_path, exclude_folders):
    """Collect one metadata record for every file beneath ``dir_path``.

    Args:
        dir_path: Root directory to walk; it is resolved to an absolute path.
        exclude_folders: Iterable of folder paths whose subtrees are skipped.

    Returns:
        A list of dicts with the keys ``File Name``, ``File Type``,
        ``File Path``, ``File Size (MB)``, ``Date Created``,
        ``Last Modified`` and ``Owner``. Files whose metadata cannot be read
        are logged and left out.

    Note:
        ``Date Created`` is derived from ``st_ctime``, which is the creation
        time on Windows but the last metadata change on Unix.
    """
    file_data = []
    dir_path = Path(dir_path).resolve()
    # A set gives O(1) membership tests while pruning each directory level.
    excluded = {Path(folder).resolve() for folder in exclude_folders}

    for root, dirs, files in os.walk(dir_path, topdown=True):
        root_path = Path(root).resolve()
        # Assigning through dirs[:] edits the list os.walk holds, which is
        # what stops it from descending into the excluded folders.
        dirs[:] = [d for d in dirs if root_path.joinpath(d).resolve() not in excluded]

        for file in files:
            file_path = root_path / file
            try:
                file_stat = os.stat(file_path)
            except FileNotFoundError as e:
                logging.error(f"File not found when accessing metadata: {file_path}, error: {e}")
                continue  # Skip to the next file
            except OSError as e:
                # Permission errors, over-long paths and similar must not
                # abort the whole audit because of a single file.
                logging.error(f"Could not read metadata for: {file_path}, error: {e}")
                continue

            file_size_mb = file_stat.st_size / (1024 * 1024)
            date_created = datetime.datetime.fromtimestamp(file_stat.st_ctime).strftime('%Y-%m-%d')
            last_modified = datetime.datetime.fromtimestamp(file_stat.st_mtime).strftime('%Y-%m-%d')
            file_data.append({
                'File Name': file,
                'File Type': file_path.suffix,
                'File Path': str(file_path),
                'File Size (MB)': round(file_size_mb, 2),
                'Date Created': date_created,
                'Last Modified': last_modified,
                'Owner': get_file_owner(str(file_path)),
            })
    return file_data
def generate_hierarchical_structure(dir_path, file_data):
    """Turn flat file records into indented folder/file rows for the audit sheet.

    Records are sorted by path components. Each folder gets one row, emitted
    the first time a file beneath it is met, followed by the file rows. Names
    are indented four spaces per nesting level and wrapped in Excel
    ``HYPERLINK`` formulas.

    Args:
        dir_path: Base directory; every ``File Path`` must lie beneath it.
        file_data: Iterable of dicts. ``File Path`` is required; ``Owner``,
            ``Date Created``, ``Last Modified`` and ``File Size (MB)`` are
            optional and fall back to ``'Unknown'`` / ``''`` when missing.

    Returns:
        A list of row dicts, one per folder and file, in display order.

    Raises:
        ValueError: If a ``File Path`` is not located under ``dir_path``.
    """
    hierarchical_data = []
    seen_dirs = set()
    dir_path = Path(dir_path)
    sorted_files = sorted(file_data, key=lambda x: Path(x['File Path']).parts)

    for file_info in sorted_files:
        file_path = Path(file_info['File Path'])
        parts = file_path.relative_to(dir_path).parts

        # Walk down the ancestors of this file, emitting each folder once.
        cumulative_path = dir_path
        for depth, part in enumerate(parts[:-1]):
            cumulative_path = cumulative_path / part
            if str(cumulative_path) in seen_dirs:
                continue
            seen_dirs.add(str(cumulative_path))
            folder_size_mb = get_folder_size(cumulative_path) / (1024 * 1024)  # bytes -> MB
            # Folder rows borrow owner and dates from the first file found
            # beneath them; no separate folder metadata is collected.
            hierarchical_data.append(_audit_row(
                cumulative_path, part, depth, 'Folder', "Open Folder",
                round(folder_size_mb, 2), file_info,
            ))

        # File sizes are expected to be in MB already; uploaded sheets may
        # lack the column, so a missing value becomes an empty cell.
        hierarchical_data.append(_audit_row(
            file_path, parts[-1], len(parts) - 1, 'File', "Open File",
            file_info.get('File Size (MB)', ''), file_info,
        ))
    return hierarchical_data


def _excel_hyperlink(target, label):
    """Build an Excel HYPERLINK formula, doubling any embedded double quotes."""
    safe_target = str(target).replace('"', '""')
    safe_label = str(label).replace('"', '""')
    return f'=HYPERLINK("{safe_target}", "{safe_label}")'


def _audit_row(path, label, depth, item_type, link_text, size_mb, file_info):
    """Assemble one audit-sheet row; the reviewer columns start out blank."""
    return {
        'Name': _excel_hyperlink(path, " " * 4 * depth + label),
        'Item Type': item_type,
        'Owner': file_info.get('Owner', 'Unknown'),
        'Date Created': file_info.get('Date Created', ''),
        'Last Modified': file_info.get('Last Modified', ''),
        'Size (MB)': size_mb,
        'Path': _excel_hyperlink(path, link_text),
        'Action': '',
        'Rename as…': '',
        'Move to…': '',
        'Consult staff?': '',
        'Proposed retention': '',
        'Justification': '',
        'Notes': '',
    }
def get_folder_size(folder_path):
    """Return the total size in bytes of the files found under ``folder_path``.

    The tree is traversed with ``os.walk``. Symbolic links are not counted,
    and a file that can no longer be found when its size is read is logged
    and skipped.
    """
    size_in_bytes = 0
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            candidate = os.path.join(current_dir, name)
            if os.path.islink(candidate):
                # Links are ignored so their targets are not counted here.
                continue
            try:
                size_in_bytes += os.path.getsize(candidate)
            except FileNotFoundError as e:
                logging.error(f"File not found: {candidate}, error: {e}")
    return size_in_bytes
def get_exclusion_list(dir_path, retry_limit=3):
    """Ask the user which immediate sub-folders of ``dir_path`` to exclude.

    Folder names are matched case-insensitively against the directories
    directly inside ``dir_path``. The user must confirm the selection with
    ``yes``; an unknown name or a declined confirmation counts as one attempt.

    Args:
        dir_path: Directory whose direct sub-folders may be excluded.
        retry_limit: Number of failed attempts allowed before giving up.

    Returns:
        A list of resolved folder paths (as strings); empty when the user
        enters ``none`` or confirms an empty selection.

    Raises:
        SystemExit: If the user types ``quit``/``exit`` or the retry limit is
            reached.
    """
    dir_path = Path(dir_path).resolve()
    # Get actual folder names and convert them to lowercase for a case-insensitive comparison
    actual_folders = {f.name.lower(): f for f in dir_path.glob('*') if f.is_dir()}
    retry_count = 0

    while True:
        exclude_input = input("Enter the folder names to exclude (comma-separated), 'none' to exclude nothing, or 'quit' to exit: ").strip()
        command = exclude_input.lower()
        if command in ('quit', 'exit'):
            # SystemExit is raised directly: the exit() builtin is installed
            # by the site module and is intended for interactive use.
            raise SystemExit
        if command == 'none':
            print("No directories will be excluded.")
            return []

        folder_names = [name.strip() for name in exclude_input.split(',') if name.strip()]
        valid_paths = []
        invalid_paths = []
        for folder_name in folder_names:
            match = actual_folders.get(folder_name.lower())
            if match is not None:
                valid_paths.append(str(match.resolve()))
            else:
                invalid_paths.append(folder_name)

        if invalid_paths:
            # Shown only when something was actually wrong, so a declined
            # confirmation no longer produces an empty "invalid" list.
            print(f"Invalid or non-existent directories: {', '.join(invalid_paths)}. Please re-enter all folder names correctly.")
        else:
            print("\nFolders selected for exclusion:")
            for path in valid_paths:
                print(path)
            user_confirmation = input("Confirm exclusions (yes/no), or 'quit' to exit: ").strip().lower()
            if user_confirmation in ('quit', 'exit'):
                raise SystemExit
            if user_confirmation == 'yes':
                return valid_paths

        retry_count += 1
        if retry_count >= retry_limit:
            print(f"Exceeded maximum retries ({retry_limit}). Exiting program.")
            raise SystemExit
def find_common_base_directory(file_paths):
    """Return the deepest directory that contains every path in ``file_paths``.

    Starts from the parent of the first path and climbs towards the
    filesystem anchor until the candidate is an ancestor of (or equal to)
    every path.

    Args:
        file_paths: Non-empty sequence of file paths (strings or Paths).

    Returns:
        A ``Path`` for the shared base directory. When the paths share
        nothing, the topmost ancestor of the first path is returned.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    paths = [Path(p) for p in file_paths]
    if not paths:
        raise ValueError("file_paths must contain at least one path")

    common_base = paths[0].parent
    for path in paths[1:]:
        # Compare whole path components, not string prefixes: "/data/foo"
        # is a string prefix of "/data/foobar/x" but not one of its ancestors.
        while common_base != path and common_base not in path.parents:
            parent = common_base.parent
            if parent == common_base:
                # Reached the anchor (root, drive, or "."): nothing above it
                # to try, so stop rather than loop forever.
                return common_base
            common_base = parent
    return common_base
def process_uploaded_file(desktop_path):
    """Build an audit workbook from an existing file listing.

    Prompts for an ``.xlsx`` or ``.csv`` file containing a ``File Path``
    column, derives the folder hierarchy from those paths and writes
    ``<name>_audit_file.xlsx`` into ``desktop_path`` with the sheets
    ``RawData`` and ``AuditSheet``. For ``.xlsx`` input the first sheet that
    has a ``File Path`` column is used; any other extension is read as CSV.

    Args:
        desktop_path: Directory in which the workbook is created.

    Returns:
        None. Problems with the input are reported on stdout; a failure
        while saving is logged to the error log as well.
    """
    file_path_input = input("Enter the path to the .xlsx or .csv file: ").strip()
    file_path = Path(file_path_input)
    # is_file() also rejects directories (including the empty input, which
    # Path turns into "."), which would otherwise fail inside the reader.
    if not file_path.is_file():
        print(f"File not found: {file_path}")
        return

    if file_path.suffix.lower() == '.xlsx':
        df_raw = None
        # The context manager closes the workbook handle even if a sheet
        # fails to parse.
        with pd.ExcelFile(file_path) as xls:
            # Attempt to find the correct sheet based on the 'File Path' column
            for sheet in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet)
                if 'File Path' in df.columns:
                    df_raw = df
                    break
        if df_raw is None:
            print("No suitable sheet found in the Excel file.")
            return
    else:
        df_raw = pd.read_csv(file_path, dtype=str)
        if 'File Path' not in df_raw.columns:
            print("The CSV file does not contain a 'File Path' column.")
            return

    # A header-only table has no paths from which to derive a base directory.
    if df_raw.empty:
        print("The uploaded file does not contain the required data to generate a hierarchy.")
        return

    # If 'Owner' column is not in df_raw, calculate the owner information
    if 'Owner' not in df_raw.columns:
        df_raw['Owner'] = df_raw['File Path'].apply(get_file_owner)

    file_paths = df_raw['File Path'].tolist()
    common_base = find_common_base_directory(file_paths)
    print(f"The determined base directory for hierarchy is: {common_base}")

    # Generate hierarchical data using the common base as the dir_path
    hierarchical_data = generate_hierarchical_structure(common_base, df_raw.to_dict(orient='records'))
    df_hierarchy = pd.DataFrame(hierarchical_data)

    # Keep asking until the name is non-empty and limited to safe characters.
    while True:
        output_file_name = input("Enter the desired output file name (without extension): ").strip()
        if output_file_name and re.match("^[a-zA-Z0-9_-]*$", output_file_name):
            output_file = os.path.join(desktop_path, f'{output_file_name}_audit_file.xlsx')
            break
        print("Invalid file name. Please use only letters, numbers, hyphens, and underscores.")

    try:
        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
            df_raw.to_excel(writer, index=False, sheet_name='RawData')
            df_hierarchy.to_excel(writer, index=False, sheet_name='AuditSheet')
        print(f"Hierarchical structure and raw data saved to {output_file}")
    except Exception as e:
        # Boundary handler: report and log instead of crashing the session.
        print("An error occurred while saving the file. Please check the error.log for more details.")
        logging.error(f"Failed to save the file {output_file}. Error: {e}")
def main():
    """Show the menu, run the chosen action, then print time and memory stats.

    Option 1 audits a directory, option 2 processes an uploaded listing and
    option 3 exits immediately. Three invalid choices end the menu loop. The
    elapsed time includes the time spent waiting for user input.

    Raises:
        SystemExit: When the user selects option 3.
    """
    start_time = time.perf_counter()
    # Track this process's resident memory. psutil.virtual_memory().used is
    # system-wide, so its delta mostly reflects what other programs did.
    process = psutil.Process()
    initial_memory = process.memory_info().rss

    print("Select an option:")
    print("1: Audit a directory")
    print("2: Process an uploaded file for hierarchical structure")
    print("3: Exit")

    max_tries = 0
    while True:
        user_choice = input("Enter your choice (1/2/3): ").strip()
        if user_choice == '1':
            audit_directory_process(desktop_path)
            break
        elif user_choice == '2':
            process_uploaded_file(desktop_path)
            break
        elif user_choice == '3':
            # Raised directly: the exit() builtin comes from the site module
            # and is intended for interactive sessions.
            raise SystemExit
        else:
            max_tries += 1
            print("Invalid choice, please select 1, 2, or 3.")
            if max_tries >= 3:
                print("Maximum attempts exceeded.")
                break

    # Calculate total time taken
    total_time = time.perf_counter() - start_time
    # Calculate memory usage after the program has run
    memory_used = process.memory_info().rss - initial_memory
    print(f"Total time taken: {total_time} seconds")
    print(f"Memory used: {memory_used / (1024 * 1024):.2f} MB")
# Start the interactive menu only when run as a script, not when imported.
if __name__ == "__main__":
    main()