# UI final for Demo_With Language Added_Uday_Edited.py
# import Tkinter as tk # python 2
# import tkFont as tkfont # python 2
import tkinter as tk # python 3
from tkinter import font as tkfont # python 3
from tkinter.filedialog import askopenfile
from tkinter.ttk import *
from tkinter import ttk
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup as soup # second alias kept: WalmartScraper calls soup(...) directly
import requests
import re
import os
import pandas as pd
from pandas import DataFrame, read_csv
from requests.compat import quote_plus
from urllib.error import HTTPError, URLError
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
from PIL import Image
import pytesseract
import argparse
import cv2
import unicodedata
import inflect
import PyPDF2
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import csv as my_csv
global title, price, image, year, month, day, entry1, variable # note: global is a no-op at module level
pytesseract.pytesseract.tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"
driver = r'C:\windows\geckodriver' # raw string so the backslashes are not treated as escapes
#path="D:/"
#file="{}".format(os.getpid())
#filename=os.path.join(path,file)
home_path="E:/"
original_scope = ["https://spreadsheets.google.com/feeds", 'https://www.googleapis.com/auth/spreadsheets',
"https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"]
url = 'https://docs.google.com/spreadsheets/d/17dp3qr--cUlNRYoIB7Ur1Gao6NdMVv__YsIFGihkul4'
spreadsheet_name = 'CSV-to-Google-Sheet'
credential_file = 'client_secret.json'
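# convert() below copies a local CSV into a new worksheet of the Google Sheet addressed by `url`.
# A minimal usage sketch (assumes client_secret.json holds valid service-account credentials,
# that the service account has edit access to the spreadsheet, and the CSV path is only an example):
#   convert(original_scope, url, spreadsheet_name, credential_file, "E:/amazon_scraper/1234.csv")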
def convert(scope, url, spreadsheet_name, credential_file, csv_file):
credentials = ServiceAccountCredentials.from_json_keyfile_name(credential_file, scope)
client = gspread.authorize(credentials)
    # opening by name is redundant: the spreadsheet is opened again by URL on the next line
    spreadsheet = client.open_by_url(url)
sheetname=csv_file.split("/")[-1]
print("CSV sheet name is",sheetname)
worksheet = spreadsheet.add_worksheet(title=sheetname, rows="100", cols="20")
# worksheet=spreadsheet.worksheet('data1.csv')
print(spreadsheet)
print(spreadsheet.worksheets())
print(worksheet.id)
print(dir(worksheet))
with open(csv_file, 'r',encoding="utf8") as file_obj:
print(file_obj)
reader = my_csv.reader(file_obj)
# worksheet.insert_rows(reader)
ar = []
for row in reader:
# print(index)
# worksheet.insert_row(row,index+1)
ar.append(row)
worksheet.insert_rows(ar, 1)
'''content = file_obj.read()
client.import_csv(worksheet.id, data=content)
'''
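# Note: add_worksheet fails (gspread raises an APIError) if a worksheet with the same title already
# exists, so re-uploading a CSV with a previously used file name requires removing the old worksheet first.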
def checktypeofscrapping(ar1, ar2):
    # Dispatch to the matching scraper and pass back the created file name
    if ar1 == "Amazon":
        return AmazonScraper(ar2)
    elif ar1 == "Ebay":
        return EbayScraper(ar2)
    elif ar1 == "Text":
        return TextScraper(ar2)
    elif ar1 == "Flipkart":
        return FlipkartScraper(ar2)
    elif ar1 == "Image":
        return ImageDownload(ar2)
    elif ar1 == "Walmart":
        return WalmartScraper(ar2)
# Amazon scraper function
def AmazonScraper(url):
url =url
#url = input("Enter the url: ")
def get_request(pageNo):
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Accept-Encoding": "gzip, deflate",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}
req = requests.get(url + str(pageNo), headers=headers)
return req
def get_content(req):
return req.content
def apply_beautifulsoup(content):
return BeautifulSoup(content, 'lxml')
def get_name(div):
name_span = div.find('span', attrs={'class': 'a-size-medium a-color-base a-text-normal'})
if name_span is not None:
return name_span.text
else:
return 'no-info'
def get_price(div):
price_span = div.find('span', attrs={'class': 'a-offscreen'})
if price_span is not None:
return price_span.text
else:
return 'no-info'
def get_rating(div):
rating_span = div.find('span', attrs={'class': 'a-icon-alt'})
if rating_span is not None:
return rating_span.text
else:
return 'no-info'
all_info = []
for pageNo in range(1, 6):
req = get_request(pageNo)
content = get_content(req)
soup = apply_beautifulsoup(content)
for d in soup.findAll('div', attrs={
'class': 'sg-col-4-of-12 sg-col-8-of-16 sg-col-16-of-24 sg-col-12-of-20 sg-col-24-of-32 sg-col sg-col-28-of-36 sg-col-20-of-28'}):
name = get_name(d)
price = get_price(d)
rating = get_rating(d)
all_info.append([name, price, rating])
df= pd.DataFrame(all_info, columns=['name', 'price', 'ratings'])
#df
# product_info['price_$'] = product_info['price_$'].apply(clean_price)
#product_info.to_csv('D:\Amazon_products.csv')
target_folder=os.path.join(home_path,'_'.join("Amazon_scraper".lower().split(' ')))
if not os.path.exists(target_folder):
os.makedirs(target_folder) # make dir if not present
files ="{}".format(os.getpid())
filename=os.path.join(target_folder,files)
df.to_csv(filename+".csv") # writing data into csv file
print("file name:",filename+".csv")
return filename
#print("If file is empty, try other links")
# eBay scraper function
def EbayScraper(url):
#driver = 'C:\windows\chromedriver'
#Base_url = 'https://www.ebay.com/sch/i.html?_from=R40&_nkw={}&_sacat=0&_pgn=1'
header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
search = url
#Mixing_url = Base_url.format(quote_plus(search))
#Final_url = requests.get(Mixing_url)
# print(Mixing_url)
#print(Final_url)
title = []
price = []
product_url = []
product_image_url = []
html = requests.get(search,headers=header)
soup = BeautifulSoup(html.content, 'html.parser')
# extracting the data
def scrape_data(pass_soup):
# get_url = requests.get(url)
# soup = BeautifulSoup(get_url.content, 'html.parser')
results = pass_soup.find(id="srp-river-results")
product_list = results.find_all(class_='s-item')
for item in product_list:
title_text = item.find(class_='s-item__title').get_text()
price_text = item.find(class_='s-item__price').get_text()
# rating = item.find('div', attrs = {'s-item__reviews'}).find('span', attrs = {'class': 'clipped'}).text
item_url = item.find('a').get('href')
image_url = item.find('img')['src']
title.append(title_text)
price.append(price_text)
# ratings.append(ratings)
product_url.append(item_url)
product_image_url.append(image_url)
# print(len(title))
scrape_data(soup)
pages = soup.find(class_='pagination__items')
print(len(pages))
links = pages.find_all('a')
# print(links)
for link in links:
href = link.get('href')
if (href == search):
print('')
continue
else:
other_url = requests.get(href)
print(other_url)
soup1 = BeautifulSoup(other_url.content, 'html.parser')
scrape_data(soup1)
df = pd.DataFrame({
'title': title,
'price': price,
# 'ratings':ratings,
'product_url': product_url,
'image_url': product_image_url,
})
#product_items.to_csv('D:\Ebay_product_list.csv')
target_folder=os.path.join(home_path,'_'.join("Ebay_scraper".lower().split(' ')))
if not os.path.exists(target_folder):
os.makedirs(target_folder) # make dir if not present
files ="{}".format(os.getpid())
filename=os.path.join(target_folder,files)
df.to_csv(filename+".csv") # writing data into csv file
print("File name: ",filename+".csv")
#print("If your file is empty, use anothor url")
return filename
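# Usage sketch for EbayScraper (the URL is only an example): it expects a full eBay
# search-results page and follows the pagination links found on that page.
#   csv_path = EbayScraper("https://www.ebay.com/sch/i.html?_nkw=laptop")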
# Flipkart scraper function
def FlipkartScraper(url):
#driver = 'C:\windows\chromedriver'
header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
#search="https://www.flipkart.com/search?q=mobil&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"
#search="https://www.flipkart.com/search?q=laptops&as=on&as-show=on&otracker=AS_Query_TrendingAutoSuggest_7_0_na_na_na&otracker1=AS_Query_TrendingAutoSuggest_7_0_na_na_na&as-pos=7&as-type=TRENDING&suggestionId=laptops&requestId=a931c0b1-8729-4296-984f-7c17c5cf6cbb"
search=url
options = Options()
options.add_argument('--headless')
#options.add_argument('--disable-gpu')
#browser = webdriver.Chrome(driver, chrome_options=options)
ProductName = []
ProductPrice = []
ProductDescription = []
ProductRating = []
ProductReviewCount = []
ProductPreviousPrice = []
ProductPercentOff = []
try:
#browser.get('https://www.flipkart.com/search?as=off&as-show=on&otracker=start&page={}&q={}&viewType=grid'.format(x,Searchtext))
addr=search
html = requests.get(addr,headers=header) # get response from webpage
mysoup = BeautifulSoup(html.content,'lxml')
if mysoup.find("div",{"class": "_1-2Iqu row"}):
allcards = mysoup.findAll("div", {"class": "_1-2Iqu row"})
for i in allcards:
# Fetching Name of item
#print(i.find("div",{"class" : "_3wU53n"}).text)
ProductName.append(i.find("div",{"class" : "_3wU53n"}).text)
#Fetching Price
try:
#print(i.find("div",{'class':'_1vC4OE _2rQ-NK'}).text)
ProductPrice.append(i.find("div",{'class':'_1vC4OE _2rQ-NK'}).text)
except:
#print("Either Price is not Available or Item out of Stock")
ProductPrice.append("Either Price is not Available or Item out of Stock")
#Short Description
try:
#print(i.find("li",{"class" : "tVe95H"}).text)
ProductDescription.append(i.find("li",{"class" : "tVe95H"}).text)
except:
# print("No Attribute is listed")
ProductDescription.append("No Attribute is listed")
#Fetching Star Rating (Out of 5)
try:
#print(i.find("div",{"class" : "hGSR34 _2beYZw"}).text)
ProductRating.append(i.find("div",{"class" : "hGSR34 _2beYZw"}).text)
except:
#print("No Rating")
ProductRating.append("No Rating")
#Fetching Count of review and Rating
try:
#print(i.find('span',{'class':'_38sUEc'}).text)
ProductReviewCount.append(i.find("span",{"class" : "_38sUEc"}).text)
except:
#print("No Review")
ProductReviewCount.append("No Review")
# Product Previous Price
try:
#print(i.find('div',{'class':'_3auQ3N _2GcJzG'}).text)
ProductPreviousPrice.append(i.find("div",{"class" : "_3auQ3N _2GcJzG"}).text)
except:
#print("No Previous Price")
ProductPreviousPrice.append("No Previous Price")
# Discount Off on Product
try:
#print(i.find('div',{'class':'VGWI6T'}).text)
ProductPercentOff.append(i.find("div",{"class" : "VGWI6T"}).text)
except:
#print("No Discount")
ProductPercentOff.append("No Discount")
print("----------------------------------------------------------------")
else:
allcards = mysoup.findAll("div", {"class": "_3liAhj"})
for i in allcards:
# Fetching Name of item
#print(i.find("a",{"class" : "_2cLu-l"}).text)
ProductName.append(i.find("a",{"class" : "_2cLu-l"}).text)
# Below Code is For Fetching Price of item
try:
#print(i.find("div",{"class" : "_1vC4OE"}).text)
ProductPrice.append(i.find("div",{"class" : "_1vC4OE"}).text)
except:
# print("Either Price is not Available or Item out of Stock")
ProductPrice.append("Either Price is not Available or Item out of Stock")
#Short Description
try:
#print(i.find("div",{"class" : "_1rcHFq"}).text)
ProductDescription.append(i.find("div",{"class" : "_1rcHFq"}).text)
except:
# print("No Attribute is listed")
ProductDescription.append("No Attribute is listed")
#Fetching Star Rating (Out of 5)
try:
#print(i.find('div',{'class':'hGSR34'}).text)
ProductRating.append(i.find("div",{"class" : "hGSR34 _2beYZw"}).text)
except:
#print("No Rating")
ProductRating.append("No Rating")
#Fetching Count of review and Rating
try:
#print(i.find('span',{'class':'_38sUEc'}).text)
ProductReviewCount.append(i.find("span",{"class" : "_38sUEc"}).text)
except:
#print("No Review")
ProductReviewCount.append("No Review")
# Product Previous Price
try:
# print(i.find('div',{'class':'_3auQ3N'}).text)
ProductPreviousPrice.append(i.find("div",{"class" : "_3auQ3N"}).text)
except:
#print("No Previous Price")
ProductPreviousPrice.append("No Previous Price")
# Discount Off on Product
try:
# print(i.find('div',{'class':'VGWI6T'}).text)
ProductPercentOff.append(i.find("div",{"class" : "VGWI6T"}).text)
except:
#print("No Discount")
ProductPercentOff.append("No Discount")
print("----------------------------------------------------------------\n")
time.sleep(5)
except HTTPError as e:
print(e)
except URLError:
print("Server down or incorrect domain")
else:
print("Excel File Writing Started")
df = DataFrame({'Product Name': ProductName,'Current Product Price': ProductPrice, 'Product Description': ProductDescription, 'Product Rating': ProductRating,'Product Rating & Review Count':ProductReviewCount,'Previous Product Price' : ProductPreviousPrice,'Product Percent Off': ProductPercentOff})
df = df[["Product Name","Product Description","Current Product Price","Previous Product Price","Product Percent Off","Product Rating","Product Rating & Review Count"]]
#df.to_excel('D:\FlipkartDataExtract.xlsx', sheet_name='Flipkart-Data', index=False)
target_folder=os.path.join(home_path,'_'.join("Flipkart_scraper".lower().split(' ')))
if not os.path.exists(target_folder):
os.makedirs(target_folder) # make dir if not present
files ="{}".format(os.getpid())
filename=os.path.join(target_folder,files)
df.to_csv(filename+".csv") # writing data into csv file
#browser.close()
print("file name:",filename+".csv")
return filename
#print("If Your scraped data file is empty, then use other things")
# Image download function
def ImageDownload(url):
#driver = 'C:\windows\chromedriver'
#imageno = int(input("how many images u need to download ( upto 50 images ) : "))
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1):
def scroll_to_end(wd):
wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(sleep_between_interactions)
search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_I=img"
wd.get(search_url.format(q=query))
image_urls = set()
image_count = 0
results_start = 0
while image_count < max_links_to_fetch:
scroll_to_end(wd)
# get all image thumbnail results
thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
number_results = len(thumbnail_results)
print(f"Found:{number_results} search results. Extracting links from{results_start}:{number_results}")
for img in thumbnail_results[results_start:number_results]:
# try to click every thumbnail such that we can get the realimage behind it
try:
img.click()
time.sleep(sleep_between_interactions)
except Exception:
continue
# extract image urls
actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
for actual_image in actual_images:
if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
image_urls.add(actual_image.get_attribute('src'))
image_count = len(image_urls)
if len(image_urls) >= max_links_to_fetch:
print(f"Found: {len(image_urls)} image links Done...")
break
else:
print("Found:", len(image_urls), "image links, looking for more..")
time.sleep(30)
return
load_more_button = wd.find_element_by_css_selector(".mye4qd")
if load_more_button:
wd.execute_script("documnet.querySelector('.mye4qd').click();")
results_start = len(thumbnail_results)
return image_urls
    def persist_image(folder_path: str, url: str, counter):
        try:
            image_content = requests.get(url).content
        except Exception as e:
            print(f"Error - Could not download {url} - {e}")
            return
        try:
            with open(os.path.join(folder_path, 'jpg' + '_' + str(counter) + ".jpg"), 'wb') as f:
                f.write(image_content)
            print(f"Success - saved {url} - as {folder_path}")
        except Exception as e:
            print(f"Error - Could not save {url} - {e}")
    def search_and_download(search_term: str, driver_path: str, target_path=r'E:\Image_scraper', number_images=50):
target_folder = os.path.join(target_path, '_'.join(search_term.lower().split(' ')))
if not os.path.exists(target_folder):
os.makedirs(target_folder) # make dir if not present
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = False
with webdriver.Firefox(capabilities=cap, executable_path=driver_path) as wd:
res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)
counter = 0
for elem in res:
persist_image(target_folder, elem, counter)
counter += 1
# Inputs are given here
DRIVER_PATH = driver
search_term = url#input("Enter search term here : ")
# images=input("enter number of images:")
# imagesno=input("Enter the number of images u need")
# num of images you can pass it from here by default it is 10 if you are not passing
# number_images=10
search_and_download(search_term=search_term, driver_path=DRIVER_PATH)
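# Note: ImageDownload depends on the Selenium 3-era API (find_element(s)_by_css_selector,
# executable_path=, DesiredCapabilities) and on the geckodriver path configured at the top of
# this file; with Selenium 4+ these calls would need porting to the By/Service style API.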
# Text scraper function
def TextScraper(url):
url=url
response=requests.get(url)
html=response.content
soup=BeautifulSoup(html,'lxml')
title=soup.find('title')
print(title.text)
body=soup.find('body')
for x in body.find_all('script'):
x.decompose()
text=body.getText(separator=u'\n').strip()
pattern=re.compile(r'\n+', re.MULTILINE)
text=pattern.sub('\n',text)
print(text)
target_folder=os.path.join(home_path,'_'.join("Text_scraper".lower().split(' ')))
if not os.path.exists(target_folder):
os.makedirs(target_folder) # make dir if not present
files ="{}".format(os.getpid())
filename=os.path.join(target_folder,files)
file1 = open(filename + ".txt", "w",encoding="utf-8")
# \n is placed to indicate EOL (End of Line)
file1.writelines(text)
file1.close()
print(filename+".txt")
return filename
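# Usage sketch for TextScraper (the URL is only an example): the visible page text is written
# to E:/text_scraper/<pid>.txt and the path without the extension is returned.
#   txt_path = TextScraper("https://example.com/article")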
# Walmart scraper function
def WalmartScraper(url):
try:
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
#url="https://www.walmart.com/browse/household-essentials/air-fresheners/1115193_1025739?povid=1115193+%7C+2018-12-25+%7C+LHN%20- %20Best%20Sellers%20-%20Air%20Fresheners"
#url=input("Enter required catagory url: ")
url=url
html = requests.get(url,headers=header) # get response from webpage
data = soup(html.content,'lxml')
url_list = [] # empty list to get sub urls or the given main url
for i in range(1,26):
url_list.append(url + str(i))
# creating empty lists to store the data
item_names = []
price_list = []
item_ratings = []
item_reviews = []
# loop for scrape the product data
for url in url_list:
result = requests.get(url)
data = soup(result.content,'lxml')
product_name = data.findAll('div',{'class':'search-result-product-title gridview'})
product_rating = data.findAll('span',{'class':'seo-avg-rating'})
product_reviews = data.findAll('span',{'class':'stars-reviews-count'})
product_price = data.findAll('span',{'class':'price display-inline-block arrange-fit price price-main'})
for names,rating,reviews,price in zip(product_name,product_rating,product_reviews,product_price):
item_names.append(names.a.span.text.strip())
item_ratings.append(rating.text)
item_reviews.append(reviews.text.replace('ratings',''))
price_list.append(price.findAll('span',{'class':'visuallyhidden'})[0].text)
except HTTPError as e:
print(e)
except URLError:
print("Server down or incorrect domain")
else:
# creating a dataframe
import pandas as pd
df = pd.DataFrame({'Product_Name':item_names, 'Price':price_list, 'Rating':item_ratings,'No_Of_Reviews':item_reviews}, columns=['Product_Name', 'Price', 'Rating', 'No_Of_Reviews'])
df
#df.to_csv('D:\Walmart_product_list.csv') # writing data into csv file
target_folder=os.path.join(home_path,'_'.join("Walmart_scraper".lower().split(' ')))
if not os.path.exists(target_folder):
os.makedirs(target_folder) # make dir if not present
files ="{}".format(os.getpid())
filename=os.path.join(target_folder,files)
df.to_csv(filename+".csv") # writing data into csv file
print("file name:",filename+".csv")
return filename
#print("If your file is empty! try another url")
def recognisetext(file,language):
path =file
print(path)
lang=language
print(lang)
teslanguages = {"Afrikaans": "afr", "Amharic": "amh", "Arabic": "ara", "Assamese": "asm", "Azerbaijani": "aze",
"Belarusian": "bel", "Bengali": "ben", "Tibetan": "bod",
"Bosnian": "bos", "Bulgarian": "bul", "Catalan": "cat", "Valencian": "cat", "Cebuano": "ceb",
"Czech": "ces", "Chinese-Simplified ": "chi_sim",
"Chinese-Traditional ": "chi_tra", "Cherokee": "chr", "Welsh": "cym", "Danish": "dan",
"German": "deu", "Dzongkha": "dzo", "Greek": "ell",
"English": "eng", "Esperanto": "epo", "Estonian": "est", "Basque": "eus", "Persian": "fas",
"Finnish": "fin", "French": "fra", "German Fraktur": "frk",
"Irish": "gle", "Galician": "glg", "Greek": "grc", "Gujarati": "guj", "Haitian": "hat",
"Haitian Creole": "hat", "Hebrew": "heb", "Hindi": "hin",
"Croatian": "hrv", "Hungarian": "hun", "Inuktitut": "iku", "Indonesian": "ind", "Icelandic": "isl",
"Italian": "ita", "Italian - Old": "ita_old",
"Javanese": "jav", "Japanese": "jpn", "Kannada": "kan", "Georgian": "kat",
"Georgian - Old": "kat_old", "Kazakh": "kaz", "Central Khmer": "khm",
"Kirghiz": "kir", "Kyrgyz": "kir", "Korean": "kor", "Kurdish": "kur", "Lao": "lao", "Latin": "lat",
"Latvian": "lav", "Lithuanian": "lit", "Malayalam": "mal",
"Marathi": "mar", "Macedonian": "mkd", "Maltese": "mlt", "Malay": "msa", "Burmese": "mya",
"Nepali": "nep", "Dutch": "nld", "Flemish": "nld", "Norwegian": "nor",
"Oriya": "ori", "Panjabi": "pan", "Punjabi": "pan", "Polish": "pol", "Portuguese": "por",
"Pushto": "pus", "Pashto": "pus", "Romanian": "ron", "Moldavian": "ron",
"Moldovan": "ron", "Russian": "rus", "Sanskrit": "san", "Sinhala": "sin", "Sinhalese": "sin",
"Slovak": "slk", "Slovenian": "slv", "Spanish": "spa", "Castilian": "spa",
"Castilian - Old": "spa_old", "Albanian": "sqi", "Serbian": "srp", "Serbian - Latin": "srp_latn",
"Swahili": "swa", "Swedish": "swe", "Syriac": "syr", "Tamil": "tam",
"Telugu": "tel", "Tajik": "tgk", "Tagalog": "tgl", "Thai": "tha", "Tigrinya": "tir",
"Turkish": "tur", "Uighur": "uig", "Uyghur": "uig", "Ukrainian": "ukr", "Urdu": "urd",
"Uzbek": "uzb", "Uzbek - Cyrillic": "uzb_cyrl", "Vietnamese": "vie", "Yiddish": "yid"}
selected_lang = teslanguages.get(lang)
    extension = path.split('.')[-1] # splitting to get the extension (last component handles dots elsewhere in the path)
    print("File extension is " + extension)
    extensions = ["jpg", "png", "jpeg", "pdf"]
    # checking if the file extension is pdf and then calling the convert_pdf_to_text_py function
if extension == 'pdf':
print(path)
text = convert_pdf_to_text_py(path)
target_folder = os.path.join(home_path, '_'.join("TextRecognition".lower().split(' ')))
if not os.path.exists(target_folder):
os.makedirs(target_folder) # make dir if not present
files = "{}".format(os.getpid())
filename = os.path.join(target_folder, files)
        file1 = open(filename + ".txt", "w", encoding="utf-8") # writing the text to the file with the pid as the file name
# \n is placed to indicate EOL (End of Line)
file1.writelines(text)
file1.close() # to change file access modes
print("filename is", file1)
#print(text)
else:
# Reading the image and getting its attributes
image = cv2.imread(path)
w, h, c = image.shape
print(w, h)
# if (w, h) <= (300, 300):
# print("Please give good image")
# else:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# cv2.imshow("Image", gray)
# check to see if we should apply thresholding to preprocess the image
gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# make a check to see if median blurring should be done to remove
# noise
gray = cv2.medianBlur(gray, 3)
# write the grayscale image to disk as a temporary file so we can
# apply OCR to it
file_name = "{}".format(os.getpid())
#cv2.imwrite(file_name, gray)
# load the image as a PIL/Pillow image, apply OCR, and then delete
# the temporary file
# then running the pytesseract to get the string values from the image
options = "-l {} ".format(selected_lang)
text = pytesseract.image_to_string(Image.open(path), config=options)
target_folder = os.path.join(home_path, '_'.join("TextRecognition".lower().split(' ')))
if not os.path.exists(target_folder):
os.makedirs(target_folder) # make dir if not present
files = "{}".format(os.getpid())
filename = os.path.join(target_folder, files)
file1 = open(filename + ".txt", "w",encoding='utf8') # writing the text to the file with the pid as the file name
# \n is placed to indicate EOL (End of Line)
file1.writelines(text)
file1.close() # to change file access modes
print("filename is", file1)
#name = file_name
#print(name)
#file1 = open(name + ".txt", "w")
# \n is placed to indicate EOL (End of Line)
#file1.writelines(text)
#file1.close() # to change file access modes
#os.remove(name)
# print(text)
# show the output images
# cv2.imshow("Image", image)
# cv2.imshow("Output", gray)
# cv2.waitKey(0)
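# Note: recognisetext assumes the Tesseract language data for the selected language is installed
# in the Tesseract-OCR tessdata folder; otherwise image_to_string fails for the "-l <lang>" option.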
# Function that converts a PDF file to text
def convert_pdf_to_text_py(path):
content = ""
with open(path, "rb") as f:
pdfDoc = PyPDF2.PdfFileReader(f, strict=True)
for i in range(0, pdfDoc.getNumPages()):
content += pdfDoc.getPage(i).extractText() + "\n"
print("The contents of PDF are",content)
return (content)
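# Note: PdfFileReader/getNumPages/getPage/extractText belong to the PyPDF2 1.x/2.x API; PyPDF2 3.x
# removed them in favour of PdfReader, len(reader.pages) and page.extract_text(), so the function
# above assumes an older PyPDF2 release is installed.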
# Calls the selected web scraper with the URL entered by the user
def callwebscrapper(var1, var2):
url = var1
typeo = var2
#print(url)
#print(typeo)
filecreated=checktypeofscrapping(typeo,url)
print("File created is",filecreated)
# Builds the UI that collects the URL and scraper type from the user
def scrapertype():
top = tk.Tk()
ttk.Label(top, text='Enter the Url: ').place(x=20, y=20)
input_text = tk.StringVar() # to treat the input as string
entry1 = Entry(top, textvariable=input_text)
entry1.focus_force()
entry1.pack(pady=10)
OptionList = ['Amazon', 'Ebay', 'Flipkart', 'Walmart', 'Text', 'Image']
variable = tk.StringVar(top)
variable.set(OptionList[0])
opt = tk.OptionMenu(top, variable, *OptionList)
opt.config(width=90, font=('Helvetica', 12))
opt.pack(side="top")
labelTest = tk.Label(text="", font=('Helvetica', 12), fg='red')
labelTest.pack(side="top")
buttoncallweb = tk.Button(top, text="Submit", command=lambda: callwebscrapper(entry1.get(), variable.get()))
buttoncallweb.pack(pady=10)
def calltransformation(function, file):
    # Log the transformation that was requested and the file it will read
print("Function called is ",function)
print("File to read is",file)
file_name, file_ext = os.path.splitext(file)
# Comparing the extension of the file
if (file_ext==".txt"):
words1 = open(file,"rt",encoding='utf-8')
words1 = words1.read()
# Transformation
if function=="Remove_punctuation_txt":
remove_punctuation_txt(words1)
elif function=="Remove_numbers_txt":
remove_number_txt(words1)
elif function=="Lower_case_txt":
lowercase_txt(words1)
elif function=="Upper_case_txt":
uppercase_txt(words1)
elif function=="Remove_non_ASCII_txt":
remove_non_ascii_txt(words1)
elif function=="Convert_number_into_words_txt":
replace_numbers_txt(words1)
elif function=="Extract_date_txt":
extract_date_txt(words1)
elif (file_ext==".csv"):
words3=open(file,"rt",encoding='utf-8')
words3=words3.read()
words2=read_csv(file)
# Transformation
if function=="Remove_numbers":
remove_number(words3)
elif function=="Lower_case":
lowercase(words3)
elif function=="Upper_case":
uppercase(words3)
elif function=="Remove_non_ASCII":
remove_non_ascii(words3)
elif function=="CSV_merge":
csv_merge(words3)
elif function=="Column_remove":
column_rem(words2)
elif function=="Row_remove":
row_rem(words2)
elif function=="Remove_empty_cell":
remove_empty(words2)
elif function=="Merge_txt_csv":
merge_tx_col(words3)
elif function=="Specific_column":
specific_col(file)
elif function=="Row_range":
row_range(file)
    else:
        print("Error: only .txt or .csv files are supported")
def savefile_txt(output):
    with open(r"F:\output.txt", "w", encoding='utf-8') as file:
        file.write(output)
def savefile_csv(output):
    with open(r"F:\output.csv", "w", encoding='utf-8') as file:
        file.write(output)
def savefile_csv2(output):
    output.to_csv(r"F:\output.csv", index=False, encoding='utf-8')
def remove_punctuation_txt(words1):
#Remove punctuation from list of words
# define punctuation
punctuations = '''!()-[]{};:'"=+\,<>./?@#$%^&*_~'''
no_punct = ""
for char in words1:
if char not in punctuations:
no_punct = no_punct + char
output=no_punct
# display the unpunctuated string
savefile_txt(output)
def remove_number_txt(words1):
#Remove numbers from list of words
output = ''.join([i for i in words1 if not i.isdigit()])
savefile_txt(output)
def lowercase_txt(words1):
    # Convert all characters in the text to lowercase
    output = words1.lower()
    # Saving the data in a file
    savefile_txt(output)
def uppercase_txt(words1):
    # Convert all characters in the text to uppercase
    output = words1.upper()
    # Saving the data in a file
    savefile_txt(output)
def remove_non_ascii_txt(words1):
    # Remove non-ASCII characters from the text
    output = unicodedata.normalize('NFKD', words1).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Saving the data in a file
    savefile_txt(output)
def replace_numbers_txt(words1):
    # Replace all integer occurrences in the text with their textual representation
p = inflect.engine()
# split string into list of words
temp_str = words1.split()
# initialise empty list
new_string = []
for word in temp_str:
# if word is a digit, convert the digit
# to numbers and append into the new_string list
if word.isdigit():
temp = p.number_to_words(word)
new_string.append(temp)
# append the word as it is
else:
new_string.append(word)
# join the words of new_string to form a string
output = ' '.join(new_string)
# Saving the data in a file
savefile_txt(output)
def extract_date_txt(words1):
    global year, month, day
    # The regex pattern that matches dates such as 12-05-2020, 12/05/2020 or 12.05.2020
    pattern = r"\d+[/.-]\d+[/.-]\d+"
    # Will return all the strings that are matched
    dates = re.findall(pattern, words1)
    output = ""
    for date in dates:
        # assumes a day-month-year ordering when validating the match
        if "-" in date:
            day, month, year = map(int, date.split("-"))
        elif "/" in date:
            day, month, year = map(int, date.split("/"))
        else:
            day, month, year = map(int, date.split("."))
        if 1 <= day <= 31 and 1 <= month <= 12:
            output = date
    # Saving the data in a file
    savefile_txt(output)
def remove_number(words3):
#Remove numbers from list of words
output = ''.join([i for i in words3 if not i.isdigit()])
# Saving the data in a file
savefile_csv(output)
def lowercase(words3):
    # Convert all characters in the CSV text to lowercase
    output = words3.lower()
    # Saving the data in a file
    savefile_csv(output)
def uppercase(words3):
    # Convert all characters in the CSV text to uppercase
    output = words3.upper()
    # Saving the data in a file
    savefile_csv(output)
def remove_non_ascii(words3):
    # Remove non-ASCII characters from the CSV text
    output = unicodedata.normalize('NFKD', words3).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Saving the data in a file
    savefile_csv(output)
def csv_merge(words3):
# Reading data from file1
data = words3
# Initializing the file
words4 = input("Enter the csv file name: ")
words4=open(words4,'rt')
# Reading data from file2
data2 = words4.read()
# Merging 2 files to add the data of file2
data += "\n"
data += data2
output=data
# Saving the data in a file
savefile_csv(output)
def column_rem(words2):
for row in words2:
print(row)
mention=input("Enter the column to be removed: ")
output=words2.drop([mention], axis=1)
# Saving the data in a file
savefile_csv2(output)
def row_rem(words2):
# The row to be removed from the data
mention=int(input("Enter the row to be removed: "))
output=words2.drop([mention], axis=0)
# Saving the data in a file
savefile_csv2(output)
def remove_empty(words2):
# Remove empty column and row from the data
filtered_data = words2.dropna(axis='columns',how='all')
output = filtered_data.dropna(axis='rows',how='all')
# Saving the data in a file
savefile_csv2(output)
def merge_tx_col(words3):
# Initializing the file
words4 = input("Enter the text file name: ")
words4=open(words4,"rt",encoding='utf-8')
# Reading data from file1
data = words4.read()
# Reading data from file2
data2 = words3
# Merging 2 files and to add the data of file2
data += "\n"
data += data2
output=data
# Saving the data in a file
savefile_csv(output)
def specific_col(file):
words3=read_csv(file)
for row in words3:
print(row)
# To show the specific column in the output
col_name=input("Enter the column name: ")
output = pd.read_csv(file, usecols = [col_name])
savefile_csv2(output)
def row_range(file):
# To show the row from initial to certain range of in the output
row_name=int(input("Enter the row range to be displayed: "))
output = pd.read_csv(file, nrows = row_name)
savefile_csv2(output)
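# Note: every transformation above writes its result to the fixed paths F:\output.txt or
# F:\output.csv through the savefile_* helpers, so each run overwrites the previous output.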
# Opens a file dialog to select the file for the transformation
def selectfile(function):
file = askopenfile(mode='r', filetypes=[('csv', '*.csv'), ('text', '*.txt')])
if file is not None:
print(file.name)
calltransformation(function, file.name)
# Opens a file dialog to select the CSV file that will be uploaded to Google Sheets
def selectfilegoole():
file = askopenfile(mode='r', filetypes=[('csv', '*.csv')])
if file is not None:
print(file.name)
convert(original_scope,url,spreadsheet_name,credential_file, file.name)
# Builds the UI that lets the user pick the transformation type
def transforamtionSelection():
top = tk.Tk()
OptionList = ['Remove_punctuation_txt','Remove_numbers_txt','Lower_case_txt','Upper_case_txt','Remove_non_ASCII_txt','Convert_number_into_words_txt','Extract_date_txt'
,'Remove_numbers','Lower_case','Upper_case','Remove_non_ASCII','CSV_merge','Column_remove','Row_remove','Remove_empty_cell','Merge_txt_csv',
'Specific_column','Row_range']
variable = tk.StringVar(top)
variable.set(OptionList[0])
opt = tk.OptionMenu(top, variable, *OptionList)
opt.config(width=90, font=('Helvetica', 12))
opt.pack(side="top")
labelTest = tk.Label(text="", font=('Helvetica', 12), fg='red')
labelTest.pack(side="top")
selectfilebutton = tk.Button(top, text="Select the file", command=lambda:selectfile(variable.get()))
selectfilebutton.pack(pady=10)