clients/WebBasedCrawler/base/proxy_crawler.py
mjjo 1fb61f0b4c Fix Twitter crawler
- Store proxies in the porxy2 db and use them from there
2017-08-09 15:32:57 +09:00

144 lines
2.8 KiB
Python

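# Scrapes public proxy lists (free-proxy-list.net and proxysearcher.sourceforge.net)
# and, when a check URL is given, keeps only the proxies that respond through it.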
import requests
import bs4
import subprocess as sp
import sys
import queue
import threading


def get_page(url):
    resp = requests.get(url)
    return resp


def get_proxies_free_proxy():
    proxies = []

    resp = get_page('https://free-proxy-list.net')
    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    table = soup.select('table.table')
    trs = table[0].select('tr')

    for tr in trs:
        tds = tr.select('td')
        if len(tds) > 0:
            # First two cells of each row are the IP and port.
            ip = tds[0].text
            port = tds[1].text
            proxies.append({
                'ip': ip,
                'port': int(port),
            })

    return proxies


def get_proxies_proxy_searcher():
    proxies = []

    resp = get_page('http://proxysearcher.sourceforge.net/Proxy List.php')
    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    table = soup.select('table.tablesorter')
    trs = table[0].select('tr')

    for tr in trs:
        tds = tr.select('td')
        if len(tds) > 0:
            # The second cell holds the proxy as 'ip:port'.
            tokens = tds[1].text.split(':')
            proxies.append({
                'ip': tokens[0],
                'port': int(tokens[1]),
            })

    return proxies
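
# Both scrapers return the same shape: a list of {'ip': str, 'port': int} dicts.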


# def get_proxies_nntime():
#     proxies = []
#
#     resp = get_page('http://nntime.com/')
#     soup = bs4.BeautifulSoup(resp.text, 'lxml')
#     table = soup.select('table.data')
#     trs = table[0].select('tr')
#
#     for tr in trs[1:]:
#         tds = tr.select('td')
#         if len(tds) > 0:
#             proxy = tds[1].text
#             proxies.append(proxy)
#
#     return proxies
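

# Thread worker: fetch the check URL through the given proxy and, on a 200
# response, push the proxy onto the shared queue (queue.Queue is thread-safe,
# so no extra locking is needed).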
def check_proxy(qu, proxy, url):
    proxy_dict = {
        'http': '{}:{}'.format(proxy['ip'], proxy['port']),
        'https': '{}:{}'.format(proxy['ip'], proxy['port']),
    }

    try:
        resp = requests.get(url, proxies=proxy_dict, timeout=2)
    except Exception:
        # Connection refused, timeout, malformed response, etc.
        return False
    else:
        if resp.status_code != 200:
            return False

    qu.put(proxy)
    return True
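

# Pipeline: scrape every source, then optionally filter the pool by spinning
# up one checker thread per proxy and collecting the survivors from the queue.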
def crawl_proxies(check_url=None):
    print('proxy crawling start')

    proxies = get_proxies_free_proxy()
    proxies += get_proxies_proxy_searcher()
    # proxies += get_proxies_nntime()
    # proxies = list(set(proxies))

    print('proxy crawled {}'.format(len(proxies)))

    if check_url:
        qu = queue.Queue()
        threads = []

        for proxy in proxies:
            th = threading.Thread(target=check_proxy, args=(qu, proxy, check_url))
            threads.append(th)

        for th in threads:
            th.start()
        for th in threads:
            th.join()

        proxies_alive = []
        while not qu.empty():
            proxy = qu.get()
            proxies_alive.append(proxy)
    else:
        proxies_alive = proxies

    print('proxy crawling end')
    return proxies_alive

    # proxies_alive.sort()
    # print('proxy crawler got {} proxies'.format(len(proxies_alive)))
    #
    # with open('proxy.txt', 'w') as f:
    #     print('proxy crawler dump start')
    #     for proxy in proxies_alive:
    #         # print(proxy)
    #         f.write(proxy + '\n')
    #     print('proxy crawler dump end')
    #
    # print('proxy crawling end')
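

# Usage: python proxy_crawler.py [check_url] [viewer]
# Note: the proxy.txt dump above is commented out, so the viewer call below
# only opens whatever proxy.txt already exists on disk.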
if __name__ == '__main__':
    check_url = None
    if len(sys.argv) > 1:
        check_url = sys.argv[1]

    viewer = None
    if len(sys.argv) > 2:
        viewer = sys.argv[2]

    crawl_proxies(check_url)

    if viewer:
        sp.Popen([viewer, 'proxy.txt'])