Files
clients/WebBasedCrawler/base/proxy_crawler.py
mjjo 3ceb59e815 트위터 크롤러 max_position 수정
빠진 파일들 추가
2017-08-02 10:52:11 +09:00

134 lines
2.6 KiB
Python

import requests
import bs4
import subprocess as sp
import sys
import queue
import threading
def get_page(url, timeout=10):
    """GET *url* and return the requests.Response object.

    timeout: seconds before giving up on the request. The original code
    passed no timeout, so a single unresponsive host could hang the
    crawler forever; 10s is generous for the proxy-list sites. Existing
    callers (`get_page(url)`) are unaffected.
    """
    resp = requests.get(url, timeout=timeout)
    return resp
def get_proxies_free_proxy():
    """Scrape free-proxy-list.net and return a list of 'ip:port' strings.

    The first table on the page lists one proxy per row; header rows
    have no <td> cells and are skipped.
    """
    resp = get_page('https://free-proxy-list.net')
    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    rows = soup.select('table.table')[0].select('tr')
    return [
        '{}:{}'.format(cells[0].text, cells[1].text)
        for cells in (row.select('td') for row in rows)
        if len(cells) > 0
    ]
def get_proxies_proxy_searcher():
    """Scrape proxysearcher.sourceforge.net and return 'ip:port' strings.

    The site already formats the second column as 'ip:port', so the cell
    text is used verbatim; rows without <td> cells (headers) are skipped.
    """
    resp = get_page('http://proxysearcher.sourceforge.net/Proxy List.php')
    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    rows = soup.select('table.tablesorter')[0].select('tr')
    found = []
    for row in rows:
        cells = row.select('td')
        if cells:
            found.append(cells[1].text)
    return found
# def get_proxies_nntime():
# proxies = []
#
# resp = get_page('http://nntime.com/')
# soup = bs4.BeautifulSoup(resp.text, 'lxml')
# table = soup.select('table.data')
# trs = table[0].select('tr')
#
# for tr in trs[1:]:
# tds = tr.select('td')
# if len(tds) > 0:
# proxy = tds[1].text
# proxies.append(proxy)
#
# return proxies
def check_proxy(qu, proxy, url):
    """Probe *proxy* ('ip:port') by fetching *url* through it.

    Runs as a Thread target: the queue *qu* is the real output channel —
    a proxy that answers with HTTP 200 within the timeout is put on it.
    The boolean return value is only meaningful for direct calls.
    """
    proxy_dict = {
        'http': proxy,
        'https': proxy,
    }
    try:
        # Short timeout on purpose: a proxy that cannot respond within
        # 2 seconds is useless for crawling.
        resp = requests.get(url, proxies=proxy_dict, timeout=2)
    except Exception:
        # Best-effort liveness check: any failure (timeout, connection
        # refused, malformed proxy string, ...) just means "dead proxy".
        return False
    if resp.status_code != 200:
        return False
    qu.put(proxy)
    return True
def crawl_proxies(check_url=None):
    """Collect proxies from the public sources and dump them to proxy.txt.

    check_url: when given, every scraped proxy is verified by fetching
    this URL through it (one thread per proxy) and only responsive
    proxies are kept. When None, the scraped list is written unverified.

    Output file: 'proxy.txt' in the current directory, one sorted
    'ip:port' entry per line.
    """
    print('proxy crawling start')

    proxies = get_proxies_free_proxy()
    proxies += get_proxies_proxy_searcher()
    # proxies += get_proxies_nntime()
    proxies = list(set(proxies))  # de-duplicate entries seen on both sources

    if check_url:
        # Workers push each proxy that answered in time onto this queue.
        qu = queue.Queue()
        threads = [
            threading.Thread(target=check_proxy, args=(qu, proxy, check_url))
            for proxy in proxies
        ]
        # Plain loops, not side-effect list comprehensions: the original
        # built throwaway lists just to call start()/join().
        for th in threads:
            th.start()
        for th in threads:
            th.join()

        # All workers have joined, so empty() is reliable here.
        proxies_alive = []
        while not qu.empty():
            proxies_alive.append(qu.get())
    else:
        proxies_alive = proxies

    proxies_alive.sort()
    print('proxy crawler got {} proxies'.format(len(proxies_alive)))

    with open('proxy.txt', 'w') as f:
        print('proxy crawler dump start')
        for proxy in proxies_alive:
            f.write(proxy + '\n')
        print('proxy crawler dump end')
    print('proxy crawling end')
if __name__ == '__main__':
    # Usage: proxy_crawler.py [check_url] [viewer]
    #   check_url - optional URL used to verify each scraped proxy
    #   viewer    - optional program launched on the resulting proxy.txt
    args = sys.argv[1:]
    check_url = args[0] if len(args) > 0 else None
    viewer = args[1] if len(args) > 1 else None

    crawl_proxies(check_url)

    if viewer:
        sp.Popen([viewer, 'proxy.txt'])