-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv_cleaner.py
62 lines (50 loc) · 2.47 KB
/
csv_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Overall, this is set up to take a .csv as either an argument or prompt response and save the cleaned version to "completeCSV.csv".
# The current filter criteria for each row are:
# 1. Row has visible = no -> remove
# 2. Row has stock = 0 or 'Unlimited' -> remove
# 3. Row is tagged with "Inventory Excluded" -> remove
# (for other cases where we don't need to track it week to week)
from numpy import true_divide
import pandas as pd
import sys
from datetime import datetime
# if filepath is passed as argument, use that, otherwise prompt for one
if len(sys.argv) == 2:
filepath = sys.argv[1]
else:
filepath = input("Please enter filepath of .csv to edit: ")
# add validation about csv filetypes only, maybe?
# Open csv into dataframe(df) using pandas to manipulate
df = pd.read_csv(filepath, sep=',')
# for each row, transfer the "Visible" status, title, and product page to any null rows below it
copyforward = ('Title','Product Page','Visible')
for i in copyforward:
df[i] = df[i].fillna(method = 'ffill')
# loop over all rows again, and remove row if visible = no OR stock = 0 or Unlimited
dropList = []
for i in range(0,len(df)):
# T/F criteria to drop row:
hidden = df.loc[i,'Visible']=='No'
out = df.loc[i,'Stock']=='0'
unlimited = df.loc[i,'Stock']=='Unlimited'
# for tag filtering, have to explicitly cast tags as a string since NaN is apparently considered a float? Sorry for the janky workaround.
tags = str(df.loc[i,'Tags'])
tagged = (tags.find("Inventory Excluded") != -1)
#if any criteria are met, add to drop list.
if hidden or out or unlimited or tagged:
dropList.append(i)
# also check if on sale, replace regular price with sale price
onSale = df.loc[i,'On Sale'] == 'Yes'
currentPrice = df.loc[i,'Sale Price']
if onSale:
df.loc[i,'Price'] = currentPrice
# using a list to keep track of what to drop was done since trying to drop within the for loop is...no bueno
# I think drop() was changing len(df) and that was making things unhappy
df.drop(index = dropList, inplace = True)
# Sort primary: Product Page, secondary:SKU
df = df.sort_values(by = ['Product Page','SKU'])
# datetime object containing current date and time (used to generate new file name)
now = datetime.now()
# print to new CSV
completeFile = "Completed_" + now.strftime("%d%m%Y_%H%M") + ".csv"
df.to_csv(completeFile, columns = ['Product Page', 'Title','SKU','Stock','Price'])