import os
import pickle
import concurrent.futures

import requests
import bs4

from .Logger import Logger


class ProxyHandler:
    """Crawls free HTTP proxies, health-checks them concurrently, and caches
    the live ones on disk as a pickle file."""

    PROXY_FILE_NAME = 'temp/proxy.bin'

    def __init__(self):
        # Ensure the cache directory exists before any pickle read/write
        # (exist_ok avoids the check-then-create race of the original).
        os.makedirs('temp', exist_ok=True)
        self.proxies = []
        # URL the proxies are validated against. NOTE(review): callers are
        # expected to set this before crawl_proxy() runs — with the default
        # '' every health check fails; confirm against calling code.
        self.check_url = ''

    def check_proxy(self, proxy, top_url):
        """Mark *proxy* dead in place if it cannot fetch *top_url* within 2s.

        :param proxy: dict with 'alive'/'http'/'https' keys, passed directly
            as a requests proxy map.
        :param top_url: URL used as the health-check target.
        """
        try:
            resp = requests.get(top_url, proxies=proxy, timeout=2)
        except requests.RequestException:
            # Narrowed from a bare `except:`: still covers timeouts,
            # connection errors, and malformed URLs, but no longer swallows
            # KeyboardInterrupt / SystemExit.
            proxy['alive'] = False
        else:
            if resp.status_code != 200:
                proxy['alive'] = False

    def check_proxy_all(self, proxies, check_url):
        """Health-check every proxy concurrently (mutates the dicts in place).

        :param proxies: list of proxy dicts as built by crawl_proxy().
        :param check_url: target URL handed to each check_proxy() call.
        """
        Logger.log('checking proxies for {}'.format(check_url))
        worker_cnt = 16
        # Context manager guarantees the pool is shut down (all checks
        # finished) even if a submit raises; the original leaked the executor
        # on error and abused a list comprehension for its side effects.
        with concurrent.futures.ThreadPoolExecutor(worker_cnt) as pool:
            for proxy in proxies:
                pool.submit(self.check_proxy, proxy, check_url)

    def has_file(self):
        """Return True if a pickled proxy cache exists on disk."""
        return os.path.exists(self.PROXY_FILE_NAME)

    def load_proxy(self):
        """Load and return the cached proxy list from PROXY_FILE_NAME.

        NOTE(security): pickle.load on a file an attacker can overwrite
        allows arbitrary code execution; acceptable here only because the
        file is a local cache this process itself wrote.
        """
        with open(self.PROXY_FILE_NAME, 'rb') as f:
            proxies = pickle.load(f)
        return proxies

    def crawl_proxy(self):
        """Scrape us-proxy.org, keep proxies that pass the health check,
        persist them to PROXY_FILE_NAME, and return the live list."""
        proxies = []
        resp = requests.get('https://www.us-proxy.org')
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        table = soup.select('table.table')
        trs = table[0].select('tr')
        # Skip the header row; rows with fewer than two cells carry no ip/port.
        for tr in trs[1:]:
            tds = tr.select('td')
            if len(tds) < 2:
                continue
            ip, port = tds[0].text, tds[1].text
            proxies.append(
                {
                    'alive': True,
                    'http': '{}:{}'.format(ip, port),
                    'https': '{}:{}'.format(ip, port),
                }
            )
        self.check_proxy_all(proxies, self.check_url)
        alive_proxies = [proxy for proxy in proxies if proxy['alive']]
        Logger.log('proxies checking end: available : {}'.format(len(alive_proxies)))
        with open(self.PROXY_FILE_NAME, 'wb') as f:
            pickle.dump(alive_proxies, f)
        return alive_proxies

    def get_proxy(self):
        """Return the first live proxy, loading the cache or crawling on demand.

        :returns: a proxy dict, or None when every known proxy is dead.
        """
        if not self.proxies:
            if self.has_file():
                self.proxies = self.load_proxy()
            else:
                self.proxies = self.crawl_proxy()
        for proxy in self.proxies:
            if proxy['alive']:
                return proxy
        return None

    def set_proxy_dead(self, proxy):
        """Mark *proxy* dead and re-persist the list; drop the cache file
        entirely once no live proxy remains."""
        proxy['alive'] = False
        # Loop variable renamed: the original shadowed the `proxy` parameter,
        # which worked only by accident.
        for candidate in self.proxies:
            if candidate['alive']:
                with open(self.PROXY_FILE_NAME, 'wb') as f:
                    pickle.dump(self.proxies, f)
                return
        # No live proxies left: discard the stale cache. The existence guard
        # fixes a FileNotFoundError when the cache file was never written.
        if os.path.exists(self.PROXY_FILE_NAME):
            os.remove(self.PROXY_FILE_NAME)
        self.proxies = []