Files
clients/WebBasedCrawler/base/proxy.py
mjjo 3d5e2d0c98 - 트위터 크롤러 수정
- 중복 제거 후 insert
  - proxy.txt가 모두 만료되면 db 사용
  - proxy db에서 중복 제거해서 가져오기
  - 프록시 문제로 페이지 요청 시 0.1초 딜레이
  - 크롤러 stop 동작하도록
  - realtime 적용
2017-07-28 14:29:05 +09:00

180 lines
5.4 KiB
Python

import re
import random
import pymysql
import os
from selenium import webdriver
import sys
# Name of the local text file that lists candidate proxies, one per line.
proxy_filename = 'proxy.txt'

# Matches "<ipv4><one non-digit separator><port>"; group 1 is the address and
# group 2 is the 2-5 digit port. Written as a raw string so the backslashes
# reach the regex engine untouched instead of relying on Python leaving
# unknown string escapes alone (which emits a warning on newer interpreters).
re_ip = re.compile(r'([\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})[^\d]([\d]{2,5})')

# Seed the module-level random generator from the system entropy source.
random.seed()
# Driver executable name per browser on non-Windows platforms.
# None means no executable name is configured for that browser.
linux_driver_path = dict(
    chrome='chromedriver',
    opera='operadriver',
    firefox=None,
    ie=None,
)

# Driver executable name per browser on Windows.
window_driver_path = dict(
    firefox=None,
    chrome='chromedriver.exe',
    ie='IEDriverServer.exe',
    opera='operadriver.exe',
)

# Pick the table that matches the platform the interpreter is running on.
if sys.platform == 'win32':
    driver_path = window_driver_path
else:
    driver_path = linux_driver_path
# Base capability dictionary per browser, taken from selenium's
# DesiredCapabilities; get_driver() copies the entry before modifying it.
platform_desired_capabilities = dict(
    firefox=webdriver.DesiredCapabilities.FIREFOX,
    ie=webdriver.DesiredCapabilities.INTERNETEXPLORER,
    opera=webdriver.DesiredCapabilities.OPERA,
    chrome=webdriver.DesiredCapabilities.CHROME,
)

# selenium WebDriver class used to launch each browser.
platform_webdriver = dict(
    firefox=webdriver.Firefox,
    chrome=webdriver.Chrome,
    ie=webdriver.Ie,
    opera=webdriver.Opera,
)
# pl_webdriver = {
# 'firefox': {
# 'path': None,
# 'desired_capabilities': webdriver.DesiredCapabilities.FIREFOX,
# 'webdriver': webdriver.Firefox
# },
# 'chrome': {
# 'path': 'chromedriver.exe' if sys.platform == 'win32' else 'chromedriver',
# 'desired_capabilities': webdriver.DesiredCapabilities.CHROME,
# 'webdriver': webdriver.Chrome
# },
# 'ie': {
# 'path': 'IEDriverServer.exe' if sys.platform == 'win32' else None,
# 'desired_capabilities': webdriver.DesiredCapabilities.INTERNETEXPLORER,
# 'webdriver': webdriver.Ie
# },
# 'opera': {
# 'path': 'operadriver.exe' if sys.platform == 'win32' else 'operadriver',
# 'desired_capabilities': webdriver.DesiredCapabilities.OPERA,
# 'webdriver': webdriver.Opera
# }
# }
def get_driver(platform, proxies):
    """
    Create a selenium driver for the given browser with a manual proxy set.

    :param platform: 'chrome', 'ie', 'opera', 'firefox'
    :param proxies: proxy address as 'ip:port', ex) '192.168.0.1:9999'
    :return: driver instance configured to use the proxy
    """
    # Work on a copy so the shared capability table is never mutated.
    caps = platform_desired_capabilities[platform].copy()

    # The same proxy address is used for http, ftp and ssl traffic.
    caps['proxy'] = {
        'httpProxy': proxies,
        'ftpProxy': proxies,
        'sslProxy': proxies,
        'noProxy': None,
        'proxyType': 'MANUAL',
    }

    factory = platform_webdriver[platform]
    executable = driver_path[platform]

    # IE receives its capabilities through the `capabilities` keyword and is
    # always given the executable path, even when that path is None.
    if platform == 'ie':
        return factory(executable_path=executable, capabilities=caps)

    # No driver executable configured (firefox): pass capabilities only.
    if not executable:
        return factory(capabilities=caps)

    # Remaining browsers take the executable path plus `desired_capabilities`.
    return factory(executable_path=executable, desired_capabilities=caps)
# Proxies that have already been reported as expired in this process.
_expired_proxies = []


def set_proxy_expired(proxy):
    """
    Mark a proxy as expired in the proxy file.

    The matching entry is prefixed with '# ' (so it is skipped when proxies
    are read from the file) and moved to the end of the file. Nothing is done
    when the proxy file does not exist or when the same proxy has already
    been reported.

    :param proxy: mapping with an 'http' entry such as 'http://ip:port'
    """
    if not os.path.isfile(proxy_filename):
        return

    if proxy in _expired_proxies:
        return
    _expired_proxies.append(proxy)

    address = proxy['http'][len('http://'):]

    # Compare the parsed (ip, port) pair rather than a raw string prefix:
    # a prefix test lets '1.2.3.4:80' match a '1.2.3.4:8080' entry, and it
    # misses entries written with a separator other than ':'.
    parsed = re_ip.search(address)
    wanted = parsed.groups() if parsed else None

    with open(proxy_filename, 'r') as f:
        lines = f.readlines()

    expired_idx = -1
    for idx, line in enumerate(lines):
        # Entries that are already commented out are not candidates.
        if line.lstrip().startswith('#'):
            continue
        found = re_ip.search(line)
        if wanted and found:
            is_match = found.groups() == wanted
        else:
            # Address is not in ip:port form; fall back to a prefix match.
            is_match = bool(address) and line.startswith(address)
        if is_match:
            expired_idx = idx
            break

    # Nothing matched: leave the file untouched instead of rewriting it.
    if expired_idx < 0:
        return

    expired_line = '# ' + lines.pop(expired_idx)
    if not expired_line.endswith('\n'):
        expired_line += '\n'
    # A last line without a trailing newline would otherwise be fused with
    # the entry appended after it.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    lines.append(expired_line)

    with open(proxy_filename, 'w') as f:
        f.writelines(lines)
def get_proxy_from_file(filename):
    """
    Pick a random proxy from a text file.

    Lines whose first non-blank character is '#' are ignored; every other
    line that contains an 'ip<separator>port' pair is a candidate.

    :param filename: path of the proxy list file
    :return (ip, port): string, string
        if the file cannot be read or holds no usable entry,
        return (None, None)
    """
    candidates = []
    try:
        # The context manager closes the file even if reading fails.
        with open(filename) as f:
            for line in f:
                if line.strip().startswith('#'):
                    continue
                m = re_ip.search(line)
                if m:
                    candidates.append((m.group(1), m.group(2)))
    except (IOError, OSError):
        # Missing or unreadable file: honour the documented contract.
        return None, None

    if not candidates:
        return None, None
    return random.choice(candidates)
def get_proxy_from_db():
    """
    Pick a random proxy from the Proxy table of the database.

    This is a best-effort lookup: any error while connecting, querying or
    reading rows results in (None, None) instead of an exception.

    :return (ip, port): string, string
        if no proxy is available or the lookup fails, return (None, None)
    """
    # NOTE(review): the connection settings, including the password, are
    # hard-coded here; consider moving them to configuration.
    try:
        conn = pymysql.connect(host='bigbird.iptime.org',
                               user='admin', passwd='admin123',
                               db='concepters', charset='utf8',
                               cursorclass=pymysql.cursors.DictCursor)
        try:
            with conn.cursor() as cursor:
                # NOTE(review): `select *` combined with `group by Proxy` may
                # be rejected when the server runs with ONLY_FULL_GROUP_BY;
                # confirm against the server configuration.
                cursor.execute("select * from Proxy group by Proxy")
                proxy_lists = [(str(row['Proxy']), str(row['Port']))
                               for row in cursor
                               if row['Proxy'] and row['Port']]
        finally:
            # Release the connection even when the query or row handling fails.
            conn.close()
    except Exception:
        # Deliberately broad to keep the lookup best-effort, but unlike a bare
        # `except:` this lets KeyboardInterrupt and SystemExit propagate so
        # the crawler can still be stopped.
        return None, None

    if not proxy_lists:
        return None, None
    return random.choice(proxy_lists)
def get_proxy():
    """
    Return a proxy as (ip, port), preferring the local proxy file.

    The database is consulted when the proxy file is missing or when it
    yields no usable entry.

    :return (ip, port): whatever the chosen source returns; the database
        lookup may return (None, None)
    """
    # No proxy file on disk: go straight to the database.
    if not os.path.exists(proxy_filename) or not os.path.isfile(proxy_filename):
        return get_proxy_from_db()

    file_ip, file_port = get_proxy_from_file(proxy_filename)
    if file_ip and file_port:
        return file_ip, file_port

    # The file had no usable (non-commented) entry left.
    return get_proxy_from_db()
def get_requests_proxy(proxies):
    """
    Build a proxies mapping with 'http' and 'https' entries.

    :param proxies: proxy address as 'ip:port'
    :return: dict mapping 'http' to 'http://<proxies>' and 'https' to
        'https://<proxies>'
    """
    requests_proxies = {}
    requests_proxies['http'] = 'http://{}'.format(proxies)
    requests_proxies['https'] = 'https://{}'.format(proxies)
    return requests_proxies
def get_proxy_for_requests():
    """
    Fetch a proxy and return it as a proxies mapping.

    :return: dict as produced by get_requests_proxy()
    :raises TypeError: when get_proxy() yields (None, None), because the
        address is built by string concatenation
    """
    host, port_number = get_proxy()
    address = host + ":" + port_number
    return get_requests_proxy(address)