-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
61 lines (50 loc) · 1.42 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
# Load the saved page into memory. The file is only read, so read-only mode is
# sufficient (no write permission needed), and the context manager closes the
# handle as soon as the contents have been read.
with open('instagram.html', 'r') as f:
    htmlfile = f.read()
def get_body(htmlfile):
    """Return the contents of the first ``<body>`` element in *htmlfile*.

    Args:
        htmlfile: HTML text to search.

    Returns:
        A ``(content, end)`` tuple. ``content`` is the text between the ``>``
        that closes the opening ``<body ...>`` tag and the next ``</body>``;
        ``end`` is the index of that ``</body>`` within *htmlfile*.
        ``(None, 0)`` is returned when there is no opening tag, the opening
        tag is never closed, or no ``</body>`` follows it.
    """
    tag_start = htmlfile.find('<body')
    if tag_start == -1:
        return None, 0

    # The content begins after the '>' that ends the opening tag. Anchoring on
    # the first double quote instead leaks attribute text into the result and
    # skips real content when the tag carries no attributes.
    tag_end = htmlfile.find('>', tag_start)
    if tag_end == -1:
        return None, 0

    close_tag = htmlfile.find('</body>', tag_end + 1)
    if close_tag == -1:
        # Without this guard the -1 from str.find would be used as a slice
        # bound and silently drop the last character of the document.
        return None, 0

    return htmlfile[tag_end + 1:close_tag], close_tag
def get_only_body(htmlfile):
    """Collect the fragments reported by repeated calls to ``get_body``.

    After each hit the text is cut at the position ``get_body`` returned and
    the scan resumes on the remainder. Scanning stops at the first call that
    yields a falsy fragment (``None`` or an empty string).

    Args:
        htmlfile: HTML text to scan.

    Returns:
        A list of the fragments found, in document order (possibly empty).
    """
    fragments = []
    remaining = htmlfile
    fragment, cut_at = get_body(remaining)
    while fragment:
        fragments.append(fragment)
        # Resume the search from where the previous fragment ended.
        remaining = remaining[cut_at:]
        fragment, cut_at = get_body(remaining)
    return fragments
# Extract the body fragment(s) from the loaded HTML (a list of strings) and
# concatenate them into a single string for the link scan further down.
page = get_only_body(htmlfile)
pageStr = ''.join(page)
def get_next_target(page):
    """Find the next ``display_src`` image link in *page*.

    Args:
        page: Text to search; expected to contain ``"display_src": "<url>"``
            pairs.

    Returns:
        A ``(link, end)`` tuple. ``link`` starts right after the quote that
        closes the ``display_src`` key, so it still carries the ``: "``
        separator in front of the URL, and it stops before the first ``?``
        (query string) or before the value's closing quote, whichever comes
        first. ``end`` is the index of that terminator in *page*.
        ``(None, 0)`` is returned when no ``display_src`` key is present or
        the value is never terminated.
    """
    marker = page.find("""display_src": """)
    if marker == -1:
        return None, 0

    key_quote = page.find('"', marker)
    value_quote = page.find('"', key_quote + 1)
    if value_quote == -1:
        return None, 0

    # A URL without a query string must end at its closing quote; looking for
    # '?' alone would run on to the next '?' anywhere in the page, or use
    # str.find's -1 as a slice bound when there is none.
    candidates = (
        page.find('?', key_quote + 1),
        page.find('"', value_quote + 1),
    )
    terminators = [pos for pos in candidates if pos != -1]
    if not terminators:
        return None, 0

    end = min(terminators)
    return page[key_quote + 1:end], end
def get_all_links(page):
    """Gather every link that ``get_next_target`` reports in *page*.

    The text is consumed front to back: after each hit it is cut at the
    position ``get_next_target`` returned and the search continues on what is
    left. The loop ends at the first call that yields a falsy link (``None``
    or an empty string).

    Args:
        page: Text to scan for links.

    Returns:
        A list of the links found, in order of appearance (possibly empty).
    """
    found = []
    rest = page
    link, cut_at = get_next_target(rest)
    while link:
        found.append(link)
        # Drop everything up to the end of the link just collected.
        rest = rest[cut_at:]
        link, cut_at = get_next_target(rest)
    return found
# Turn every extracted image link into an <img> tag and print the markup.
# get_next_target hands back each link with the ': "' separator still in
# front of the URL, so that prefix is stripped per link. Deleting every ':'
# and rewriting every space and every "https" in the joined text corrupted
# URLs that contain a port, a space, a plain "http:" scheme or a second
# "https".
image_links = [link.lstrip(': "') for link in get_all_links(pageStr)]
if image_links:
    # Same layout as before: tags separated by ' >' and a final '>'.
    pageStr = ' >'.join(' <img src= "%s"' % link for link in image_links) + '>'
else:
    pageStr = ''
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(pageStr)