-
Notifications
You must be signed in to change notification settings - Fork 3
/
main.py
121 lines (89 loc) · 3.9 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from argparse import ArgumentParser
from time import sleep
from models import Profile, Post
# parse args
def _debug_flag(value):
    """Turn the text given to -d/--debug into a bool.

    Falsy spellings ('false', '0', 'no', 'off', 'n', 'f' or an empty string,
    case-insensitive) disable debug output. Any other value enables it, which
    keeps invocations such as `-d True` or `-d 1` working as before.
    """
    return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')


parser = ArgumentParser()
# The username is concatenated into the profile URL later in the script, so
# fail here with a usage message instead of a TypeError further down.
parser.add_argument('-u', '--username', dest='username', required=True, help='Profile username')
# nargs='?' + const=True lets a bare `-d` switch debug output on, while
# `-d False` is now really parsed as False instead of the truthy string 'False'.
parser.add_argument('-d', '--debug', dest='debug', nargs='?', const=True, default=False,
                    type=_debug_flag, required=False, help='If True it shows debug output')
args = parser.parse_args()
# Chrome setup: collect the browser flags, then start the driver.
# NOTE(review): `chrome_options`/`executable_path` are the older Selenium
# keyword names -- confirm the pinned Selenium version still accepts them.
driver = '/usr/bin/chromedriver'
options = Options()
for flag in ('--headless', '--window-size=1920x1080'):
    options.add_argument(flag)
chrome = Chrome(executable_path=driver, chrome_options=options)

# Open the requested profile page.
profile_url = 'https://www.instagram.com/' + args.username
chrome.get(profile_url)
if args.debug:
    print('Chrome running at ', chrome.current_url)
# CSS selectors used to locate each piece of scraped data.
selectors = {
    # profile header
    'name': 'header h1',
    'num_posts': 'header ul li:nth-child(1) span',
    'num_followers': 'header ul li:nth-child(2) span',
    'num_following': 'header ul li:nth-child(3) span',

    # post links on the profile page
    'posts': 'main article a',

    # single post page
    'desc': 'article ul li[role=menuitem] div span:not([role=link])',
    'img': 'main article > div img',
    'local': 'header a[href*=locations]',

    # location page
    'lat': 'meta[property*=latitude]',
    'lng': 'meta[property*=longitude]',
}
def _parse_count(text):
    """Convert a counter label from the profile header into an int.

    Plain numbers may use ',' or '.' as a thousands separator ('1,234').
    Abbreviated numbers carry a magnitude suffix ('12.3k', '1.2m',
    '12,3 mil', '1,5 milhões'); there the separator is read as a decimal
    mark and the value is scaled by the suffix.

    Raises ValueError when the text is not a recognizable number.
    """
    label = text.strip().lower()
    suffixes = (
        ('milhões', 1000000),
        ('milhão', 1000000),
        ('mil', 1000),
        ('k', 1000),
        ('m', 1000000),
    )
    for suffix, factor in suffixes:
        if label.endswith(suffix):
            number = label[:-len(suffix)].strip().replace(',', '.')
            # round() absorbs float noise such as 12.3 * 1000 == 12300.000000000002
            return int(round(float(number) * factor))
    return int(label.replace(',', '').replace('.', ''))


# Create Profile instance and get info
profile = Profile(args.username)

name_el = chrome.find_element_by_css_selector(selectors['name'])
profile.name = name_el.text

# All three counters go through the same parser: simply stripping the
# magnitude suffix (as was done before) turned '12,3 mil' into 123.
num_posts_el = chrome.find_element_by_css_selector(selectors['num_posts'])
profile.num_posts = _parse_count(num_posts_el.text)

num_followers_el = chrome.find_element_by_css_selector(selectors['num_followers'])
profile.num_followers = _parse_count(num_followers_el.text)

num_following_el = chrome.find_element_by_css_selector(selectors['num_following'])
profile.num_following = _parse_count(num_following_el.text)

if args.debug:
    print('Saved', profile)
profile.save()
# Scroll until all pictures are visible.
# `urls` keeps the post links in discovery order; `seen_urls` mirrors it as a
# set so the duplicate check does not rescan the whole list for every link.
urls = []
seen_urls = set()

# The page may expose fewer links than the advertised post count; give up
# after this many consecutive scrolls that reveal nothing new instead of
# looping forever.
MAX_STALLED_SCROLLS = 10
stalled_scrolls = 0

while len(urls) < profile.num_posts and stalled_scrolls < MAX_STALLED_SCROLLS:
    if args.debug:
        print('Scroll down... ', end='')
    chrome.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep(1)  # wait before reading the links exposed after the scroll

    found_before = len(urls)
    for a in chrome.find_elements_by_css_selector(selectors['posts']):
        href = a.get_attribute('href')
        if href not in seen_urls:
            seen_urls.add(href)
            urls.append(href)

    if len(urls) == found_before:
        stalled_scrolls += 1
    else:
        stalled_scrolls = 0

    if args.debug:
        print('found', len(urls), 'of', profile.num_posts)

if args.debug and len(urls) < profile.num_posts:
    print('Stopped scrolling: no new posts after', MAX_STALLED_SCROLLS, 'attempts')
# Visit every collected post URL, scrape its data and save it.
# try/finally makes sure the browser is shut down even when scraping a post
# raises (find_element_by_css_selector raises if the image is missing),
# so no Chrome process is left running.
try:
    for i, url in enumerate(urls):
        # Create new Post object
        post = Post(profile)
        post.id_ = i
        post.url = url

        # Navigate to post url
        chrome.get(url)
        if args.debug:
            chrome.get_screenshot_as_file('data/{}/post{}.png'.format(profile.username, i))

        # get description (first comment)
        desc_el = chrome.find_elements_by_css_selector(selectors['desc'])
        if desc_el:
            post.desc = desc_el[0].text.replace('\n', ' ')
        # TODO: explain https://github.com/gevent/gevent/issues/614
        # replace \n by space

        # get image
        img_el = chrome.find_element_by_css_selector(selectors['img'])
        post.download_img(img_el.get_attribute('src'))

        # get locations
        local_el = chrome.find_elements_by_css_selector(selectors['local'])
        if local_el:
            href = local_el[0].get_attribute('href')
            # get_attribute may return None; guard before the substring test
            if href and 'locations' in href:
                chrome.get(href)
                if args.debug:
                    chrome.get_screenshot_as_file('data/{}/loc{}.png'.format(profile.username, i))

                # Coordinates are read from the location page's <meta> tags.
                lat_metas = chrome.find_elements_by_css_selector(selectors['lat'])
                if lat_metas:
                    post.lat = lat_metas[0].get_attribute('content')
                lng_metas = chrome.find_elements_by_css_selector(selectors['lng'])
                if lng_metas:
                    post.lng = lng_metas[0].get_attribute('content')

        if args.debug:
            print('Saved', post)
        post.save()
finally:
    chrome.quit()