"""Helpers for picking a proxy and applying it to Selenium drivers or requests.

Proxies are read from a local text file (``proxy_filename``) when it exists,
otherwise from a MySQL table.
"""
import os
import random
import re
import sys

import pymysql
from selenium import webdriver

import base.proxy_crawler

proxy_filename = 'proxy.txt'
# Captures (ip, port) where ip and port are separated by any single non-digit
# character.  Raw string: the pattern is unchanged, but "\d" / "\." are no
# longer invalid string escapes.
re_ip = re.compile(r'([\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})[^\d]([\d]{2,5})')

random.seed()

# Driver executable per browser; None means no executable path is passed.
linux_driver_path = {
    'chrome': 'chromedriver',
    'opera': 'operadriver',
    'firefox': None,
    'ie': None
}

window_driver_path = {
    'firefox': None,
    'chrome': 'chromedriver.exe',
    'ie': 'IEDriverServer.exe',
    'opera': 'operadriver.exe'
}

driver_path = window_driver_path if sys.platform == 'win32' else linux_driver_path

platform_desired_capabilities = {
    'firefox': webdriver.DesiredCapabilities.FIREFOX,
    'ie': webdriver.DesiredCapabilities.INTERNETEXPLORER,
    'opera': webdriver.DesiredCapabilities.OPERA,
    'chrome': webdriver.DesiredCapabilities.CHROME
}

platform_webdriver = {
    'firefox': webdriver.Firefox,
    'chrome': webdriver.Chrome,
    'ie': webdriver.Ie,
    'opera': webdriver.Opera
}

# pl_webdriver = {
#     'firefox': {
#         'path': None,
#         'desired_capabilities': webdriver.DesiredCapabilities.FIREFOX,
#         'webdriver': webdriver.Firefox
#     },
#     'chrome': {
#         'path': 'chromedriver.exe' if sys.platform == 'win32' else 'chromedriver',
#         'desired_capabilities': webdriver.DesiredCapabilities.CHROME,
#         'webdriver': webdriver.Chrome
#     },
#     'ie': {
#         'path': 'IEDriverServer.exe' if sys.platform == 'win32' else None,
#         'desired_capabilities': webdriver.DesiredCapabilities.INTERNETEXPLORER,
#         'webdriver': webdriver.Ie
#     },
#     'opera': {
#         'path': 'operadriver.exe' if sys.platform == 'win32' else 'operadriver',
#         'desired_capabilities': webdriver.DesiredCapabilities.OPERA,
#         'webdriver': webdriver.Opera
#     }
# }


def get_driver(platform, proxies):
    """
    Create a Selenium driver whose capabilities carry a manual proxy setting.

    :param platform: 'chrome', 'ie', 'opera', 'firefox'
    :param proxies: format : ip:port ex) '192.168.0.1:9999'
    :return: driver applied proxy
    :raises KeyError: if platform is not one of the supported names
    """
    # Copy so the shared DesiredCapabilities template is not mutated.
    desired_capabilities = platform_desired_capabilities[platform].copy()

    # The same address is used for http, ftp and ssl traffic.
    desired_capabilities['proxy'] = {
        'httpProxy': proxies,
        'ftpProxy': proxies,
        'sslProxy': proxies,
        'noProxy': None,
        'proxyType': 'MANUAL',
        # 'autodetect': False
        # 'autodetect': True
    }

    driver_class = platform_webdriver[platform]
    executable = driver_path[platform]

    # The keyword used for the capabilities differs per driver class, so the
    # three call shapes below are kept exactly as they were.
    if platform == 'ie':
        return driver_class(executable_path=executable,
                            capabilities=desired_capabilities)
    if executable:
        return driver_class(executable_path=executable,
                            desired_capabilities=desired_capabilities)
    # for firefox
    return driver_class(capabilities=desired_capabilities)


_expired_proxies = []


def _line_is_proxy(line, address):
    """Return True if line starts with address and the port does not continue.

    A plain prefix test would let '1.2.3.4:80' match a '1.2.3.4:8080' line.
    """
    if not line.startswith(address):
        return False
    return not line[len(address):len(address) + 1].isdigit()


def set_proxy_expired(proxy):
    """
    Remember proxy as expired and comment out its line in the proxy file.

    The matching line is prefixed with '# ' and moved to the end of the file.
    Does nothing when the proxy file does not exist.

    :param proxy: requests-style proxy dict; only proxy['http'] is read, in
                  the form 'http://ip:port'
    """
    if not os.path.isfile(proxy_filename):
        return

    if proxy not in _expired_proxies:
        _expired_proxies.append(proxy)

    address = proxy['http'][len('http://'):]
    if not address:
        # An empty address would match every line of the file.
        return

    with open(proxy_filename, 'r') as f:
        lines = f.readlines()

    expired_idx = next(
        (idx for idx, line in enumerate(lines) if _line_is_proxy(line, address)),
        -1)
    if expired_idx < 0:
        return

    expired_line = '# ' + lines.pop(expired_idx)
    # Keep one entry per line even when the file had no trailing newline.
    if not expired_line.endswith('\n'):
        expired_line += '\n'
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    lines.append(expired_line)

    with open(proxy_filename, 'w') as f:
        f.writelines(lines)


def _read_proxy_lines(filename):
    """Return the non-comment lines of filename that contain an ip/port pair."""
    with open(filename) as f:
        return [line.replace('\n', '') for line in f
                if not line.strip().startswith('#') and re_ip.search(line)]


def get_proxy_from_file(filename, check_url=None):
    """
    Pick a random proxy from a text file.

    When the file has no usable entry, base.proxy_crawler.crawl_proxies is
    called and the file is read once more (presumably the crawler refills the
    file -- verify against base.proxy_crawler).

    :param filename: path of the proxy list file
    :param check_url: valid check url, passed through to the crawler
    :return (ip, port): string, string
                if no usable entry is found, return (None, None)
    :raises OSError: if filename cannot be opened
    """
    proxy_lists = _read_proxy_lines(filename)

    if not proxy_lists:
        base.proxy_crawler.crawl_proxies(check_url)
        proxy_lists = _read_proxy_lines(filename)

    if proxy_lists:
        m = re_ip.search(random.choice(proxy_lists))
        if m:
            return m.group(1), m.group(2)

    return None, None


def get_proxy_from_db():
    """
    Pick a random proxy from the Proxy table of the database.

    :return (ip, port): string, string
                if the query fails or yields no row with both values,
                return (None, None)
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to configuration or environment variables.
    try:
        conn = pymysql.connect(host='bigbird.iptime.org',
                               user='admin',
                               passwd='admin123',
                               db='concepters',
                               charset='utf8',
                               cursorclass=pymysql.cursors.DictCursor)
        try:
            with conn.cursor() as cursor:
                cursor.execute("select * from Proxy group by Proxy")
                proxy_lists = [(str(row['Proxy']), str(row['Port']))
                               for row in cursor
                               if row['Proxy'] and row['Port']]
        finally:
            # Close the connection even when the query raises.
            conn.close()
    except Exception:
        # Best effort: any database problem means "no proxy from the db".
        return None, None

    return random.choice(proxy_lists) if proxy_lists else (None, None)


def get_proxy(check_url=None):
    """
    Return a proxy from the proxy file, falling back to the database.

    :param check_url: valid check url, passed through to get_proxy_from_file
    :return (ip, port): string, string, or (None, None) if none is available
    """
    if os.path.isfile(proxy_filename):
        ip, port = get_proxy_from_file(proxy_filename, check_url)
        if ip and port:
            return ip, port
    return get_proxy_from_db()


def get_requests_proxy(proxies):
    """
    Build a requests-style proxies dict from an 'ip:port' string.

    :param proxies: format : ip:port ex) '192.168.0.1:9999'
    :return: dict with 'http' and 'https' keys
    """
    # NOTE(review): the 'https' entry uses an https:// scheme for the proxy
    # itself; confirm the proxies in use actually accept TLS connections.
    return {
        'http': 'http://{}'.format(proxies),
        'https': 'https://{}'.format(proxies),
    }


def get_proxy_for_requests(check_url=None):
    """
    Pick a proxy and return it as a requests-style proxies dict.

    :param check_url: valid check url, passed through to get_proxy
    :return: dict with 'http' and 'https' keys
    :raises TypeError: if no proxy is available
    """
    ip, port = get_proxy(check_url)
    if ip is None or port is None:
        # Same exception type the bare string concatenation raised before,
        # but with a message that says what went wrong.
        raise TypeError('no proxy available from file or database')
    return get_requests_proxy(ip + ":" + port)