Commit 34eea30

author long.zhang committed: add comments
0 parents  commit 34eea30

12 files changed: +997, -0 lines changed

.idea/misc.xml       +4
.idea/modules.xml    +8
.idea/tutorial.iml   +11
.idea/workspace.xml  +644
(generated IDE files; contents not rendered)

scrapy.cfg

+11
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = tutorial.settings

[deploy]
#url = http://localhost:6800/
project = tutorial

tutorial/__init__.py

Whitespace-only changes.

tutorial/items.py

+19
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class EtherscanProductRaw(scrapy.Item):
    '''Fields to normalize (i.e. the columns saved to the database)'''
    name = scrapy.Field()
    TxHash = scrapy.Field()
    Block = scrapy.Field()
    From_account = scrapy.Field()
    To_account = scrapy.Field()
    Value = scrapy.Field()
    TxFee = scrapy.Field()
    create_time = scrapy.Field()
    operate_type = scrapy.Field()
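
For reference, a scrapy.Item behaves like a dict restricted to its declared fields, which is how the spider later fills these keys. A minimal sketch (the values below are made-up placeholders, not data from this project):

    from tutorial.items import EtherscanProductRaw

    item = EtherscanProductRaw()
    item['name'] = 'example-address'   # placeholder, not a real Etherscan address
    item['TxHash'] = '0x0'             # placeholder transaction hash
    item['Value'] = 0.0
    print(dict(item))                  # -> {'name': 'example-address', 'TxHash': '0x0', 'Value': 0.0}
    # item['unknown'] = 1              # KeyError: only declared Fields can be set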

tutorial/middlewares.py

+56
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TutorialSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

tutorial/pipelines.py

+42
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from twisted.enterprise import adbapi
import MySQLdb.cursors

class DBPipeline(object):

    def __init__(self):
        self.db_pool = adbapi.ConnectionPool('MySQLdb',
                                             db='capture',
                                             user='root',
                                             passwd='111111',
                                             cursorclass=MySQLdb.cursors.DictCursor,
                                             use_unicode=True)

    def process_item(self, item, spider):
        query = self.db_pool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        try:
            sql = 'select * from transfer_records where TxHash = "{TxHash}"'.format(**item)
            print sql
            tx.execute(sql)
            result = tx.fetchone()
            if result:
                pass
            else:
                sql = 'insert into transfer_records(name,TxHash,Block,From_account,To_account,Value,TxFee,create_time,operate_type) values("{name}","{TxHash}","{Block}","{From_account}","{To_account}",{Value},{TxFee},"{create_time}","{operate_type}")'.format(**item)
                tx.execute(sql)
        except Exception, e:
            print 'e' * 20
            print sql
            print e

    def handle_error(self, e):
        print 'error', e
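
A note on the SQL above: building statements with str.format breaks as soon as a scraped value contains a quote, and it leaves the query open to injection. A safer variant of _conditional_insert, sketched here assuming the same transfer_records table, lets the MySQLdb driver escape values via query parameters:

    def _conditional_insert(self, tx, item):
        # parameterized queries: the driver escapes the values for us
        tx.execute('select 1 from transfer_records where TxHash = %s', (item['TxHash'],))
        if tx.fetchone():
            return  # already stored, skip the insert
        tx.execute(
            'insert into transfer_records'
            '(name, TxHash, Block, From_account, To_account, Value, TxFee, create_time, operate_type) '
            'values (%s, %s, %s, %s, %s, %s, %s, %s, %s)',
            (item['name'], item['TxHash'], item['Block'], item['From_account'],
             item['To_account'], item['Value'], item['TxFee'],
             item['create_time'], item['operate_type']))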

tutorial/settings.py

+96
@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-

# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tutorial.middlewares.TutorialSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tutorial.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'tutorial.pipelines.TutorialPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False


# MySQL database connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'capture'    # database name, change as needed
MYSQL_USER = 'root'         # database user, change as needed
MYSQL_PASSWD = '111111'     # database password, change as needed

MYSQL_PORT = 3306           # database port, used in dbhelper

ITEM_PIPELINES = {
    'tutorial.pipelines.DBPipeline': 300,  # store scraped items in MySQL
}
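
Note that DBPipeline in pipelines.py hard-codes its connection values, so the MYSQL_* settings above are not actually read by this commit. If the pipeline is meant to honor them, Scrapy's from_crawler hook is the usual way to pass settings in; a sketch under that assumption, not part of this commit:

    from twisted.enterprise import adbapi
    import MySQLdb.cursors

    class DBPipeline(object):
        def __init__(self, host, port, db, user, passwd):
            self.db_pool = adbapi.ConnectionPool('MySQLdb', host=host, port=port,
                                                 db=db, user=user, passwd=passwd,
                                                 cursorclass=MySQLdb.cursors.DictCursor,
                                                 use_unicode=True)

        @classmethod
        def from_crawler(cls, crawler):
            s = crawler.settings
            # pull the connection parameters from settings.py instead of hard-coding them
            return cls(host=s.get('MYSQL_HOST'), port=s.getint('MYSQL_PORT'),
                       db=s.get('MYSQL_DBNAME'), user=s.get('MYSQL_USER'),
                       passwd=s.get('MYSQL_PASSWD'))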

tutorial/spiders/__init__.py

+4
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

tutorial/spiders/etherscan.py

+102
@@ -0,0 +1,102 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/12/1 13:57
# @Author  : long.zhang
# @Contact : [email protected]
# @Site    :
# @File    : amazon.py
# @Software: PyCharm
# @Desc    :
from tutorial.items import EtherscanProductRaw
import scrapy
import re
import time

from bs4 import BeautifulSoup

def getDict4str(strsource, match=':'):
    outdict = {}
    lists = strsource.split('\n')
    for list in lists:
        list = list.strip()
        if list:
            strbegin = list.find(match)
            outdict[list[:strbegin]] = list[strbegin + 1:] if strbegin != len(list) else ''
    return outdict

HEADER = '''
accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding:gzip, deflate, br
accept-language:zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7
cache-control:max-age=0
upgrade-insecure-requests:1
User-Agent:{}
'''
header = getDict4str(HEADER.format(
    r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'))

class Etherscan(scrapy.Spider):
    name = 'etherscan'

    def __init__(self, id_name=None, *args, **kwargs):
        super(Etherscan, self).__init__(*args, **kwargs)
        self.start_urls = ['https://etherscan.io/address/{}'.format(id_name)]
        self.id_name = id_name

    def parse(self, response):
        formatUrl = 'https://etherscan.io/txs?a={}&p={}'
        soup = BeautifulSoup(response.body, 'lxml')
        num = int(soup.find('span', {'title': "Normal Transactions"}).getText().strip('\n').strip()[:-4].strip())
        if num % 50 == 0:
            page = num / 36
        else:
            page = num / 36 + 1

        for x in range(page):
            x += 1
            url = formatUrl.format(self.id_name, x)
            print url
            yield scrapy.Request(url=url, meta={"id_name": self.id_name},
                                 callback=self.parse_url)

    def parse_url(self, response):
        soup = BeautifulSoup(response.body, 'lxml')
        datas = soup.find('table', {'class': "table table-hover "}).find('tbody').findAll('tr')
        now = time.time()
        for sourceData in datas:
            infos = sourceData.findAll('td')
            if len(infos) != 8:
                break
            resultData = EtherscanProductRaw()
            resultData['name'] = response.meta.get('id_name')
            if infos[0].find('font'):
                continue

            resultData['TxHash'] = infos[0].find('span').getText().strip()
            resultData['Block'] = int(infos[1].getText().strip())
            resultData['From_account'] = infos[3].find('span').getText().strip()
            resultData['To_account'] = infos[5].find('span').getText().strip()
            value = ''.join(infos[6].getText().split(','))
            resultData['Value'] = float(value[:-5].strip() if value.find('Ether') != -1 else value.strip())
            TxFee = ''.join(infos[7].getText().split(','))
            resultData['TxFee'] = float(TxFee[:-5].strip() if TxFee.find('Ether') != -1 else TxFee.strip())

            # import pdb
            # pdb.set_trace()
            Age = infos[2].find('span').getText().strip()
            pattern_hr = re.compile('\d+ hr')
            pattern_day = re.compile('\d+ day')
            pattern_min = re.compile('\d+ min')
            pattern_count = re.compile('\d+')

            hours = int(pattern_count.findall(pattern_hr.findall(Age)[0])[0]) if pattern_hr.findall(Age) else 0
            day = int(pattern_count.findall(pattern_day.findall(Age)[0])[0]) if pattern_day.findall(Age) else 0
            min = int(pattern_count.findall(pattern_min.findall(Age)[0])[0]) if pattern_min.findall(Age) else 0
            t = now - hours*3600 - day*86400 - min*60
            resultData['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t))
            resultData['operate_type'] = infos[4].find('span').getText().strip()
            yield resultData
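
Because __init__ accepts an id_name argument, the spider can be started with scrapy crawl etherscan -a id_name=<address>, or programmatically as sketched below (the address is a placeholder, not taken from this commit; run from the project root so the project settings are found):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from tutorial.spiders.etherscan import Etherscan

    # load the project settings (including the DBPipeline entry) and run the spider
    process = CrawlerProcess(get_project_settings())
    process.crawl(Etherscan, id_name='0x0000000000000000000000000000000000000000')  # placeholder address
    process.start()  # blocks until the crawl finishes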
