"""Crawl public free-proxy lists and optionally filter them for liveness."""
import queue
import subprocess as sp
import sys
import threading

import bs4
import requests
def get_page(url, timeout=10):
    """Fetch *url* and return the raw requests Response.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The requests.Response object (status is NOT checked here).
    """
    # A timeout is essential: both proxy-list sites are flaky, and
    # requests.get() without one can block the whole crawl forever.
    return requests.get(url, timeout=timeout)
def get_proxies_free_proxy():
    """Scrape free-proxy-list.net for proxy endpoints.

    Returns:
        A list of dicts of the form {'ip': str, 'port': int},
        one per data row of the site's main table.
    """
    page = get_page('https://free-proxy-list.net')
    document = bs4.BeautifulSoup(page.text, 'lxml')
    rows = document.select('table.table')[0].select('tr')

    result = []
    for row in rows:
        cells = row.select('td')
        if not cells:
            # Header rows contain only <th> cells; skip them.
            continue
        result.append({
            'ip': cells[0].text,
            'port': int(cells[1].text),
        })

    return result
def get_proxies_proxy_searcher():
    """Scrape proxysearcher.sourceforge.net for proxy endpoints.

    Returns:
        A list of dicts of the form {'ip': str, 'port': int},
        parsed from the site's "ip:port" column.
    """
    page = get_page('http://proxysearcher.sourceforge.net/Proxy List.php')
    document = bs4.BeautifulSoup(page.text, 'lxml')
    rows = document.select('table.tablesorter')[0].select('tr')

    result = []
    for row in rows:
        cells = row.select('td')
        if not cells:
            # Header rows contain only <th> cells; skip them.
            continue
        # The second column holds the endpoint as "ip:port".
        endpoint = cells[1].text.split(':')
        result.append({
            'ip': endpoint[0],
            'port': int(endpoint[1]),
        })

    return result
# def get_proxies_nntime():
#     proxies = []
#
#     resp = get_page('http://nntime.com/')
#     soup = bs4.BeautifulSoup(resp.text, 'lxml')
#     table = soup.select('table.data')
#     trs = table[0].select('tr')
#
#     for tr in trs[1:]:
#         tds = tr.select('td')
#         if len(tds) > 0:
#             proxy = tds[1].text
#             proxies.append(proxy)
#
#     return proxies
def check_proxy(qu, proxy, url):
    """Probe a single proxy by fetching *url* through it.

    Designed to run as a worker thread: live proxies are reported by
    pushing them onto *qu* (queue.Queue is thread-safe), so the caller
    never needs a lock.

    Args:
        qu: queue.Queue that collects proxies which passed the check.
        proxy: dict with 'ip' (str) and 'port' (int) keys.
        url: URL to fetch through the proxy.

    Returns:
        True if the proxy answered with HTTP 200 within 2 seconds,
        False otherwise.
    """
    # Build the endpoint string once; the same address serves both schemes.
    address = '{}:{}'.format(proxy['ip'], proxy['port'])
    proxy_dict = {
        'http': address,
        'https': address,
    }
    try:
        resp = requests.get(url, proxies=proxy_dict, timeout=2)
    except requests.RequestException:
        # Timeout, refused connection, proxy error, ... -> proxy is dead.
        return False

    if resp.status_code != 200:
        return False

    qu.put(proxy)
    return True
def crawl_proxies(check_url=None):
    """Collect proxies from all known sources, optionally liveness-checked.

    Args:
        check_url: If given, every scraped proxy is tested by fetching
            this URL through it and only responsive proxies are returned.
            If None, all scraped proxies are returned unchecked.

    Returns:
        A list of {'ip': str, 'port': int} dicts.
    """
    proxies = get_proxies_free_proxy()
    proxies += get_proxies_proxy_searcher()

    if not check_url:
        return proxies

    # Fan out one checker thread per proxy; each check_proxy call blocks
    # for up to 2 s, so running them sequentially would be far too slow.
    # Live proxies come back via the thread-safe queue.
    qu = queue.Queue()
    threads = [
        threading.Thread(target=check_proxy, args=(qu, proxy, check_url))
        for proxy in proxies
    ]
    for th in threads:
        th.start()
    for th in threads:
        th.join()

    # All workers have joined, so the queue is complete; drain it.
    proxies_alive = []
    while not qu.empty():
        proxies_alive.append(qu.get())

    return proxies_alive
if __name__ == '__main__':
    # Usage: python crawler.py [check_url] [viewer]
    #   check_url - optional URL used to liveness-test each proxy
    #   viewer    - optional program launched on the resulting proxy.txt
    check_url = None
    if len(sys.argv) > 1:
        check_url = sys.argv[1]

    viewer = None
    if len(sys.argv) > 2:
        viewer = sys.argv[2]

    proxies = crawl_proxies(check_url)

    # Dump the results as "ip:port" lines. Previously proxy.txt was never
    # written, so the viewer below was launched on a nonexistent file.
    with open('proxy.txt', 'w') as f:
        for proxy in proxies:
            f.write('{}:{}\n'.format(proxy['ip'], proxy['port']))

    if viewer:
        sp.Popen([viewer, 'proxy.txt'])