- torrentkim 사이트로 변경
- 파일 폴더 구성
This commit is contained in:
98
Crawler/ProxyHandler.py
Normal file
98
Crawler/ProxyHandler.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import os
|
||||
import pickle
|
||||
import requests
|
||||
import bs4
|
||||
import concurrent.futures
|
||||
|
||||
from .Logger import Logger
|
||||
|
||||
|
||||
class ProxyHandler:
    """Manage a pool of HTTP(S) proxies.

    Crawls free proxies from us-proxy.org, health-checks them concurrently
    against a target URL, caches the live ones to disk with pickle, and
    hands out live proxies one at a time.
    """

    # Pickle cache of the last known-alive proxy list.
    PROXY_FILE_NAME = 'temp/proxy.bin'

    def __init__(self):
        # exist_ok avoids the exists()/makedirs() race of the original.
        os.makedirs('temp', exist_ok=True)

        self.proxies = []     # in-memory list of proxy dicts
        self.check_url = ''   # URL used to validate proxies (set by caller)

    def check_proxy(self, proxy, top_url):
        """Probe *top_url* through *proxy*; set proxy['alive'] = False on failure.

        A proxy is considered dead when the request raises (connect error,
        timeout, ...) or the response status is not 200.
        """
        try:
            resp = requests.get(top_url, proxies=proxy, timeout=2)
        except requests.RequestException:
            # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
            # still propagate instead of silently marking the proxy dead.
            proxy['alive'] = False
        else:
            if resp.status_code != 200:
                proxy['alive'] = False

    def check_proxy_all(self, proxies, check_url):
        """Health-check every proxy in *proxies* concurrently against *check_url*."""
        Logger.log('checking proxies for {}'.format(check_url))

        worker_cnt = 64
        # `with` waits for all checks and shuts the pool down even on error;
        # a plain loop replaces the original side-effect list comprehension.
        with concurrent.futures.ThreadPoolExecutor(worker_cnt) as pool:
            for proxy in proxies:
                pool.submit(self.check_proxy, proxy, check_url)

    def has_file(self):
        """Return True if a cached proxy list exists on disk."""
        return os.path.exists(self.PROXY_FILE_NAME)

    def load_proxy(self):
        """Load and return the pickled proxy list from disk."""
        with open(self.PROXY_FILE_NAME, 'rb') as f:
            return pickle.load(f)

    def crawl_proxy(self):
        """Scrape us-proxy.org, health-check the results, cache and return
        only the proxies that passed the check.
        """
        proxies = []

        resp = requests.get('https://www.us-proxy.org')
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        table = soup.select('table.table')
        trs = table[0].select('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.select('td')
            if len(tds) < 2:
                continue

            ip, port = tds[0].text, tds[1].text
            proxies.append(
                {
                    'alive': True,
                    'http': '{}:{}'.format(ip, port),
                    'https': '{}:{}'.format(ip, port),
                }
            )

        self.check_proxy_all(proxies, self.check_url)
        alive_proxies = [proxy for proxy in proxies if proxy['alive']]
        Logger.log('proxies checking end: available : {}'.format(len(alive_proxies)))

        with open(self.PROXY_FILE_NAME, 'wb') as f:
            pickle.dump(alive_proxies, f)

        return alive_proxies

    def get_proxy(self):
        """Return the first live proxy, loading the cache or crawling on demand.

        Returns None when every known proxy is dead.
        """
        if not self.proxies:
            self.proxies = self.load_proxy() if self.has_file() else self.crawl_proxy()

        for proxy in self.proxies:
            if proxy['alive']:
                return proxy

        return None

    def set_proxy_dead(self, proxy):
        """Mark *proxy* dead and persist the pool; drop the cache file when
        no live proxies remain so the next get_proxy() triggers a re-crawl.
        """
        proxy['alive'] = False
        # Renamed the loop variable: the original shadowed the `proxy` parameter.
        for candidate in self.proxies:
            if candidate['alive']:
                with open(self.PROXY_FILE_NAME, 'wb') as f:
                    pickle.dump(self.proxies, f)
                return

        os.remove(self.PROXY_FILE_NAME)
        self.proxies = []
|
||||
Reference in New Issue
Block a user