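# Change notes:
# - insert after removing duplicates
# - use the DB when every entry in proxy.txt has expired
# - fetch proxies from the proxy DB with duplicates removed
# - 0.1 second delay on page requests because of proxy issues
# - make the crawler's stop work
# - apply realtime
# 180 lines, 5.4 KiB, Python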
import re
|
|
import random
|
|
import pymysql
|
|
import os
|
|
from selenium import webdriver
|
|
import sys
|
|
|
|
# File holding one proxy per line; lines starting with '#' are skipped when
# proxies are read from it.
proxy_filename = 'proxy.txt'

# Matches "<ipv4-like address><one non-digit separator><port>": group 1 is the
# dotted address, group 2 the 2-5 digit port.  Written as a raw string so that
# "\d" and "\." reach the regex engine as-is instead of being (invalid) Python
# string escapes.
re_ip = re.compile(r'([\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})[^\d]([\d]{2,5})')

# Seed the shared random generator (no argument: OS entropy or current time).
random.seed()
|
|
|
|
# Driver executable name per browser on non-Windows hosts; None means no
# driver executable is configured for that browser.
linux_driver_path = dict(
    chrome='chromedriver',
    opera='operadriver',
    firefox=None,
    ie=None,
)

# Driver executable name per browser on Windows hosts.
window_driver_path = dict(
    firefox=None,
    chrome='chromedriver.exe',
    ie='IEDriverServer.exe',
    opera='operadriver.exe',
)

# Select the table that matches the platform this process runs on.
if sys.platform == 'win32':
    driver_path = window_driver_path
else:
    driver_path = linux_driver_path
|
|
|
|
# Base capability dictionary per browser name, taken from selenium's
# DesiredCapabilities constants.
platform_desired_capabilities = dict(
    firefox=webdriver.DesiredCapabilities.FIREFOX,
    ie=webdriver.DesiredCapabilities.INTERNETEXPLORER,
    opera=webdriver.DesiredCapabilities.OPERA,
    chrome=webdriver.DesiredCapabilities.CHROME,
)

# Selenium driver class per browser name.
platform_webdriver = dict(
    firefox=webdriver.Firefox,
    chrome=webdriver.Chrome,
    ie=webdriver.Ie,
    opera=webdriver.Opera,
)
|
|
|
|
|
|
# NOTE: unused alternative layout that would merge driver_path,
# platform_desired_capabilities and platform_webdriver into a single
# per-browser mapping; kept for reference only.
# pl_webdriver = {
#     'firefox': {
#         'path': None,
#         'desired_capabilities': webdriver.DesiredCapabilities.FIREFOX,
#         'webdriver': webdriver.Firefox
#     },
#     'chrome': {
#         'path': 'chromedriver.exe' if sys.platform == 'win32' else 'chromedriver',
#         'desired_capabilities': webdriver.DesiredCapabilities.CHROME,
#         'webdriver': webdriver.Chrome
#     },
#     'ie': {
#         'path': 'IEDriverServer.exe' if sys.platform == 'win32' else None,
#         'desired_capabilities': webdriver.DesiredCapabilities.INTERNETEXPLORER,
#         'webdriver': webdriver.Ie
#     },
#     'opera': {
#         'path': 'operadriver.exe' if sys.platform == 'win32' else 'operadriver',
#         'desired_capabilities': webdriver.DesiredCapabilities.OPERA,
#         'webdriver': webdriver.Opera
#     }
# }
|
|
|
|
|
|
def get_driver(platform, proxies):
    """
    Create a selenium driver for ``platform`` that is configured with a
    manual proxy.

    :param platform: 'chrome', 'ie', 'opera', 'firefox'
    :param proxies: format : ip:port ex) '192.168.0.1:9999'
    :return: driver applied proxy
    """
    # Work on a copy so the shared capability table is left untouched.
    caps = platform_desired_capabilities[platform].copy()

    # The same address is used for HTTP, FTP and SSL traffic.
    caps['proxy'] = dict(
        httpProxy=proxies,
        ftpProxy=proxies,
        sslProxy=proxies,
        noProxy=None,
        proxyType='MANUAL',
    )

    make_driver = platform_webdriver[platform]
    executable = driver_path[platform]

    # The IE driver is called with 'capabilities' (not 'desired_capabilities')
    # and always receives executable_path, even when it is None.
    if platform == 'ie':
        return make_driver(executable_path=executable, capabilities=caps)

    # Browsers that have a driver executable configured.
    if executable:
        return make_driver(executable_path=executable, desired_capabilities=caps)

    # No executable configured (firefox).
    return make_driver(capabilities=caps)
|
|
|
|
# Proxy dicts that have been passed to set_proxy_expired() in this process.
_expired_proxies = []


def set_proxy_expired(proxy):
    """
    Mark ``proxy`` as expired in the proxy file.

    The first line of ``proxy_filename`` that starts with the proxy's
    ``ip:port`` is prefixed with ``'# '`` and moved to the end of the file.
    When the proxy file does not exist the call does nothing, and the proxy
    is not recorded in ``_expired_proxies`` either.

    :param proxy: proxy dict; only ``proxy['http']`` is read, and the text
        after the leading ``'http://'`` (7 characters) is used as address
    :return: None
    """
    if not os.path.isfile(proxy_filename):
        return

    if proxy not in _expired_proxies:
        _expired_proxies.append(proxy)

    address = proxy['http'][len('http://'):]
    if not address:
        # An empty address would match every line of the file.
        return

    with open(proxy_filename, 'r') as f:
        lines = f.readlines()

    for idx, line in enumerate(lines):
        # The character right after the address must not be a digit, so that
        # '1.2.3.4:80' does not match the line '1.2.3.4:8080'.
        if line.startswith(address) and not line[len(address):len(address) + 1].isdigit():
            break
    else:
        # Proxy not listed (or already commented out): leave the file alone.
        return

    expired_line = lines.pop(idx)

    # A last line without a trailing newline would otherwise be glued to the
    # line that is appended after it.
    if not expired_line.endswith('\n'):
        expired_line += '\n'
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    lines.append('# ' + expired_line)

    with open(proxy_filename, 'w') as f:
        f.writelines(lines)
|
|
|
|
|
|
def get_proxy_from_file(filename):
    """
    Pick one random proxy from a proxy list file.

    Lines whose first non-blank character is ``#`` are ignored, as are lines
    in which ``re_ip`` finds no address/port pair.

    :param filename: path of the proxy list file
    :return (ip, port): string, string
             if ip, port or filename is invalid, return (None, None)
    """
    try:
        # 'with' closes the file; the original left the handle open.
        with open(filename) as f:
            matches = [re_ip.search(line) for line in f
                       if not line.strip().startswith('#')]
    except (IOError, OSError):
        # Unreadable or missing file: honour the documented (None, None).
        return None, None

    candidates = [m for m in matches if m]
    if not candidates:
        return None, None

    chosen = random.choice(candidates)
    return chosen.group(1), chosen.group(2)
|
|
|
|
|
|
def get_proxy_from_db():
    """
    Pick one random proxy from the ``Proxy`` table of the database.

    Rows with an empty ``Proxy`` or ``Port`` column are skipped.  The lookup
    is best-effort: any database error results in ``(None, None)``.

    :return (ip, port): string, string
             if no proxy is available or the query fails, return (None, None)
    """
    # NOTE(review): host and credentials are hard-coded below; consider
    # loading them from configuration instead.
    try:
        conn = pymysql.connect(host='bigbird.iptime.org',
                               user='admin', passwd='admin123',
                               db='concepters', charset='utf8',
                               cursorclass=pymysql.cursors.DictCursor)
        try:
            with conn.cursor() as cursor:
                cursor.execute("select * from Proxy group by Proxy")
                proxy_lists = [(str(row['Proxy']), str(row['Port']))
                               for row in cursor
                               if row['Proxy'] and row['Port']]
        finally:
            # Close the connection even when the query raises.
            conn.close()
    except Exception:
        # 'Exception' (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate and the program can be stopped.
        return None, None

    if not proxy_lists:
        return None, None
    return random.choice(proxy_lists)
|
|
|
|
|
|
def get_proxy():
    """
    Return a proxy as ``(ip, port)``, preferring the proxy file over the
    database.

    The database is used when the proxy file is missing or when the file
    yields no usable ip/port pair.

    :return (ip, port): whatever get_proxy_from_file / get_proxy_from_db
             returned
    """
    has_proxy_file = os.path.exists(proxy_filename) and os.path.isfile(proxy_filename)
    if not has_proxy_file:
        return get_proxy_from_db()

    ip, port = get_proxy_from_file(proxy_filename)
    if ip and port:
        return ip, port

    # The file had no usable entry: fall back to the database.
    return get_proxy_from_db()
|
|
|
|
|
|
def get_requests_proxy(proxies):
    """
    Build a proxy dict with 'http' and 'https' entries for one address.

    :param proxies: format : ip:port ex) '192.168.0.1:9999'
    :return: dict mapping 'http' to 'http://<proxies>' and 'https' to
             'https://<proxies>'
    """
    http_url = 'http://{}'.format(proxies)
    https_url = 'https://{}'.format(proxies)
    return dict(http=http_url, https=https_url)
|
|
|
|
|
|
def get_proxy_for_requests():
    """
    Fetch a proxy with get_proxy() and return it as a proxy dict built by
    get_requests_proxy().

    :return: dict with 'http' and 'https' entries for the chosen proxy
    :raises TypeError: when get_proxy() returns (None, None), because None
        cannot be concatenated with a string
    """
    host, port_number = get_proxy()
    address = host + ":" + port_number
    return get_requests_proxy(address)
|