-
Notifications
You must be signed in to change notification settings - Fork 0
/
filescanner.py
47 lines (37 loc) · 1.44 KB
/
filescanner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import pandas as pd
def get_csv_paths(folder_path):
    """Recursively collect the paths of all CSV files under a folder.

    Args:
        folder_path: Directory to scan; every subdirectory is visited too.

    Returns:
        A sorted list of paths, each built by joining the directory that
        os.walk reports with the file name. The list is empty when no CSV
        file is found (os.walk also yields nothing for a folder it cannot
        list, so a missing folder gives an empty list as well).
    """
    csv_files = []
    for root, _, files in os.walk(folder_path):
        for file_name in files:
            # Compare case-insensitively so 'DATA.CSV' is picked up too.
            if file_name.lower().endswith('.csv'):
                csv_files.append(os.path.join(root, file_name))
    # os.walk reports entries in arbitrary filesystem order; sorting keeps
    # the result reproducible across runs and machines.
    return sorted(csv_files)
def get_features_from_csv(csv_path):
    """Return the column names (features) found in a CSV file's header.

    Args:
        csv_path: Location of the CSV file, handed directly to
            pandas.read_csv.

    Returns:
        The column labels of the parsed header as a plain Python list.
    """
    # nrows=0 requests zero data rows, so only the header gets parsed.
    return pd.read_csv(csv_path, nrows=0).columns.tolist()
def main(folder_path='data'):
    """Scan a folder for CSV files, report their features and combine them.

    Prints the discovered CSV paths, the union of their column names, and
    the first rows of a single DataFrame that stacks every file, with each
    file reindexed to the full set of columns.

    Args:
        folder_path: Directory to scan recursively. Defaults to 'data',
            the location this script has always used.
    """
    # Get the paths of all CSV files in the folder and its subfolders
    csv_paths = get_csv_paths(folder_path)

    # Read the CSV files and extract their features
    all_features = []
    datasets = []
    for csv_path in csv_paths:
        all_features.extend(get_features_from_csv(csv_path))
        datasets.append(pd.read_csv(csv_path))

    # De-duplicate while keeping first-seen order. A set would also remove
    # duplicates, but its iteration order for strings changes between
    # interpreter runs, which would shuffle the output columns.
    selected_features = list(dict.fromkeys(all_features))

    print("CSV Paths:")
    print(csv_paths)
    print("\nSelected Features:")
    print(selected_features)

    # Combine datasets while keeping only the selected features
    if datasets:
        combined_dataset = pd.concat(
            [dataset.reindex(columns=selected_features) for dataset in datasets],
            ignore_index=True,
        )
    else:
        # pd.concat raises ValueError when given no objects, so fall back
        # to an empty frame when the folder holds no CSV files.
        combined_dataset = pd.DataFrame(columns=selected_features)

    print("\nCombined Dataset:")
    print(combined_dataset.head())
# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()