IPProxyPool is an IP proxy pool project that provides proxy IPs; it supports both py2 and py3.
Contributors: super1-chen, fancoo, Leibnizhu
Debian/Ubuntu (apt-based systems): 1. Install sqlite: apt-get install sqlite3 2. Install requests, chardet, web.py, gevent and psutil: pip install requests chardet web.py sqlalchemy gevent psutil 3. Install lxml: apt-get install python-lxml
Windows: 1. Install sqlite. 2. Install requests, chardet, web.py and gevent: pip install requests chardet web.py sqlalchemy gevent 3. Install lxml: pip install lxml, or use an lxml build for Windows.
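The default database is sqlite, accessed through the sqlalchemy ORM; MySQL and MongoDB can be used as well.
To add another database, implement the ISqlHelper interface from the db package and import your class in DataStore, as in the following snippet: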
# Pick the storage helper named by DB_CONFIG['DB_CONNECT_TYPE'] and
# initialise it; any failure is reported to the caller as Con_DB_Fail.
try:
    if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo':
        from db.MongoHelper import MongoHelper as SqlHelper
    else:
        from db.SqlHelper import SqlHelper as SqlHelper
    sqlhelper = SqlHelper()
    sqlhelper.init_db()
except Exception:
    # "except Exception, e" is Python-2-only syntax and the bound name was
    # never used; this spelling parses on both py2 and py3.
    raise Con_DB_Fail
Redis
clone
$ git clone
$ cd IPProxyPool
python IPProxy.py
IPProxyPool----->>>>>>>>beginning
http://0.0.0.0:8000/
IPProxyPool----->>>>>>>>db exists ip:0
IPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...
IPProxyPool----->>>>>>>>Success ip num :134,Fail ip num:7882
GET /
ip
Name | Type | Description |
---|---|---|
types | int | 0: ,1:,2 |
protocol | int | 0: http, 1 https, 2 http/https |
count | int | |
country | str | , |
area | str |
Example: 1. To fetch 5 proxy IPs, request http://127.0.0.1:8000/?types=0&count=5&country= 2. The response is JSON, for example:
[["122.226.189.55", 138, 10], ["183.61.236.54", 3128, 10], ["61.132.241.109", 808, 10], ["183.61.236.53", 3128, 10], ["122.227.246.102", 808, 10]]
import requests
import json

# Ask the local API for five proxies (types=0, no country filter).
r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=')
ip_ports = json.loads(r.text)
# print(...) with a single argument behaves the same on py2 and py3;
# the bare "print x" statement is a syntax error on py3.
print(ip_ports)

# Each entry is a list whose first two fields are the ip and the port.
ip = ip_ports[0][0]
port = ip_ports[0][1]

# Both schemes are routed through the same http:// proxy URL.
proxies = {
    'http': 'http://%s:%s' % (ip, port),
    'https': 'http://%s:%s' % (ip, port)
}
r = requests.get('http://ip.chinaz.com/', proxies=proxies)
r.encoding = 'utf-8'
print(r.text)
GET /delete
ip
Name | Type | Description |
---|---|---|
ip | str | 192.168.1.1 |
port | int | 80 |
types | int | 0: ,1:,2 |
protocol | int | 0: http, 1 https, 2 http/https |
count | int | |
country | str | , |
area | str |
Example: 1. To delete the proxy with IP 120.92.3.127, request http://127.0.0.1:8000/delete?ip=120.92.3.127 2. The response is JSON, for example ["deleteNum", "ok"] or ["deleteNum", 1]
import requests

# Delete the proxy with the given ip through the local API and show the
# JSON reply. print(...) is used so the example runs on both py2 and py3.
r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127')
print(r.text)
# parserList: one dict per proxy-listing site to crawl.
# Keys visible in this excerpt:
#   'urls'       - list of page URLs for the site
#   'type'       - 'xpath' or 'module' in the entries shown here
#   'pattern'    - an XPath expression or a regular expression, matching 'type'
#   'position'   - where 'ip', 'port', 'type' and 'protocol' sit in a match
#   'moduleName' - present only on the 'module' entry
# NOTE(review): how each key is consumed is not visible here - confirm
# against the crawler code.
parserList = [
{
'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))],
'type': 'xpath',
'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
},
# The next line is a documentation elision (further entries omitted from
# this excerpt), not valid Python.
......
{
'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)],
'type': 'module',
'moduleName': 'CnproxyPraser',
'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(\"\:\"(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
}
]
#
# Database configuration.
DB_CONFIG = {
    # Backend selector: 'sqlalchemy' (used here), 'pymongo' or 'redis'.
    'DB_CONNECT_TYPE': 'sqlalchemy',
    # MongoDB example:
    # 'DB_CONNECT_STRING': 'mongodb://localhost:27017/'
    #
    # abspath() is applied first: with a relative __file__ such as
    # 'config.py', dirname() returns '' and the URL would become
    # 'sqlite:////data/proxy.db', i.e. the absolute path /data/proxy.db
    # instead of the data/ directory next to this file.
    'DB_CONNECT_STRING': 'sqlite:///' + os.path.dirname(os.path.abspath(__file__)) + '/data/proxy.db',
    # MySQL example:
    # 'DB_CONNECT_STRING': 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8'
    # Redis example:
    # 'DB_CONNECT_TYPE': 'redis',
    # 'DB_CONNECT_STRING': 'redis://localhost:6379/8',
}
# THREADNUM: size of the gevent pool.
THREADNUM = 5

# API_PORT: port the API web server listens on.
API_PORT = 8000

# UPDATE_TIME: interval for the periodic IP check.
# NOTE(review): 30 * 60 is presumably seconds (30 minutes) - confirm.
UPDATE_TIME = 30 * 60

# MINNUM: minimum number of IPs to keep; the startup log reports
# "now ip num < MINNUM,start crawling..." when the pool is below it.
MINNUM = 50

# TIMEOUT: socket timeout.
# NOTE(review): unit presumably seconds - confirm.
TIMEOUT = 5

# RETRY_TIME: presumably the number of retries - confirm against callers.
RETRY_TIME = 3

# USER_AGENTS: pool of User-Agent header values.
# NOTE(review): presumably one is chosen per request - confirm.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
]

# DEFAULT_SCORE: score given to an IP by default.
# NOTE(review): the original note mentions 20 while the value is 10 - confirm
# which is intended.
DEFAULT_SCORE = 10

# CHECK_PROXY: names the function used to check a proxy; the default is
# {'function': 'checkProxy'}. The accompanying notes mention httpbin.org for
# the default check and point to baidu_check and detect_proxy in Validator.py
# as alternatives, e.g. {'function': 'baidu_check'}.
CHECK_PROXY = {'function': 'checkProxy'}
1.squid
-----------------------------2017-4-6---------------------------- 1.
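2. The proxy check function can now be configured through CHECK_PROXY in config.py.
CHECK_PROXY names the check function; the default is CHECK_PROXY={'function':'checkProxy'}
The default check validates a proxy IP against httpbin.org,
so an IP that passes is only known to reach httpbin.org,
not necessarily the site you want to crawl,
so you can supply your own check function;
see baidu_check and detect_proxy in Validator.py for examples, e.g.
CHECK_PROXY={'function':'baidu_check'}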
3.,
-----------------------------2017-1-16---------------------------- 1.py2py3 2.pymongobug -----------------------------2017-1-11---------------------------- 1.httpbin.orgip 2. country 3.typesprotocolprotocolhttp://www.baidu.comhttps://www.baidu.com 4. -----------------------------2016-12-11----------------------------
1.+50IP 2.web.pyAPIHTTP 3.Mysql,MongoDB 4. 5.ip 6.python3 -----------------------------2016-11-24---------------------------- 1.chardet 2.66ip.cn -----------------------------2016-10-27---------------------------- 1. 2. 3.
-----------------------------2016-7-20---------------------------- 1.bug ,