Commit: i don't know
@@ -66,7 +66,7 @@ class Crawler:
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
-            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
+            'Accept-Charset': 'utf-8,ISO-8859-1;q=0.7,*;q=0.3',
             'Accept-Encoding': 'gzip, deflate, br',
             'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
             'Connection': 'keep-alive',
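Note: the only change in this hunk reorders the Accept-Charset preferences. A charset listed without an explicit quality value gets the implicit q=1.0, so `utf-8,ISO-8859-1;q=0.7,*;q=0.3` now asks servers to prefer UTF-8 over ISO-8859-1 (q=0.7), which matches the unconditional `resp.content.decode('utf8')` call in the next hunk.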
@@ -94,6 +94,7 @@ class Crawler:
         Logger.log('checking page {}'.format(url))

         resp = self.request_get(url)
+        import pdb; pdb.set_trace()
         soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')

         re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
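Note: the added `import pdb; pdb.set_trace()` drops the crawler into the interactive debugger on every page check and will hang any unattended run, so it reads like a leftover debugging aid. If a breakpoint is meant to stay, gating it behind a flag keeps normal runs non-interactive; a minimal sketch, assuming a hypothetical CRAWLER_DEBUG environment variable:

    import os
    import pdb

    # only break when explicitly requested, e.g. CRAWLER_DEBUG=1 python Main.py
    if os.environ.get('CRAWLER_DEBUG'):
        pdb.set_trace()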
@@ -179,7 +180,7 @@ class Crawler:
                 self.setting.downloaded[file_link.title].append(file_link.episode)
                 self.setting.save()

-                Logger.log("downloaded {}".format(file_link.file_name))
+                Logger.log("downloaded {}".format(file_name))

             except Exception as e:
                 Logger.log(e)
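Note: the log line now formats a bare `file_name` instead of `file_link.file_name`; unless a local `file_name` is defined in the surrounding (unshown) code, this raises a NameError that the `except Exception` below would swallow into the log.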
@@ -3,6 +3,7 @@ import pickle
 import requests
 import bs4
 import concurrent.futures
+import time

 from .Logger import Logger

@@ -64,7 +65,45 @@ class ProxyHandler:
             }
         )

+        resp = requests.get('https://www.socks-proxy.net')
+        soup = bs4.BeautifulSoup(resp.text, 'lxml')
+        table = soup.select('table.table')
+        trs = table[0].select('tr')
+        for tr in trs[1:]:
+            tds = tr.select('td')
+            if len(tds) < 2:
+                continue
+
+            ip, port = tds[0].text, tds[1].text
+            proxies.append(
+                {
+                    'alive': True,
+                    'http': '{}:{}'.format(ip, port),
+                    'https': '{}:{}'.format(ip, port),
+                }
+            )
+
+        resp = requests.get('https://www.sslproxies.org')
+        soup = bs4.BeautifulSoup(resp.text, 'lxml')
+        table = soup.select('table.table')
+        trs = table[0].select('tr')
+        for tr in trs[1:]:
+            tds = tr.select('td')
+            if len(tds) < 2:
+                continue
+
+            ip, port = tds[0].text, tds[1].text
+            proxies.append(
+                {
+                    'alive': True,
+                    'http': '{}:{}'.format(ip, port),
+                    'https': '{}:{}'.format(ip, port),
+                }
+            )
+
+        # print(proxies)
         self.check_proxy_all(proxies, self.check_url)
+        # print(proxies)
         alive_proxies = [proxy for proxy in proxies if proxy['alive']]
         Logger.log('proxies checking end: available : {}'.format(len(alive_proxies)))

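Note: the two blocks added above are identical except for the source URL; both sites share the same free-proxy-list table layout, so the scraping could live in a single helper. A minimal sketch under that assumption, reusing the `requests` and `bs4` imports from this module (`scrape_proxy_table` is a hypothetical name, not part of this commit):

    def scrape_proxy_table(url):
        # collect ip:port pairs from a free-proxy-list style 'table.table'
        result = []
        resp = requests.get(url)
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        table = soup.select('table.table')
        if not table:
            return result
        for tr in table[0].select('tr')[1:]:  # skip the header row
            tds = tr.select('td')
            if len(tds) < 2:
                continue
            ip, port = tds[0].text, tds[1].text
            result.append({
                'alive': True,
                'http': '{}:{}'.format(ip, port),
                'https': '{}:{}'.format(ip, port),
            })
        return result

    # usage: proxies.extend(scrape_proxy_table('https://www.socks-proxy.net'))
    #        proxies.extend(scrape_proxy_table('https://www.sslproxies.org'))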
@@ -77,8 +116,16 @@ class ProxyHandler:
         if len(self.proxies) <= 0:
             if self.has_file():
                 self.proxies = self.load_proxy()
-            else:
+        self.proxies = [proxy for proxy in self.proxies if proxy['alive']]
+
+        if len(self.proxies) <= 0:
+            while True:
                 self.proxies = self.crawl_proxy()
+                if len(self.proxies) > 0:
+                    break
+                else:
+                    Logger.log('there is no available proxy. sleep 10secs..')
+                    time.sleep(10)

         for proxy in self.proxies:
             if proxy['alive']:
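Note: the rewritten logic above first drops dead proxies, then loops, re-crawling and sleeping 10 seconds until at least one proxy comes back; as written, the loop can spin forever if every source is down. A standalone sketch of the same wait-and-retry pattern with an optional attempt cap (`retry_until_nonempty` and `max_attempts` are illustrative, not from the commit):

    import time

    def retry_until_nonempty(fetch, wait_secs=10, max_attempts=None):
        # call fetch() until it returns a non-empty result, sleeping between tries;
        # max_attempts=None reproduces the commit's unbounded loop
        attempt = 0
        while max_attempts is None or attempt < max_attempts:
            result = fetch()
            if result:
                return result
            attempt += 1
            print('no available proxy, sleeping {}s..'.format(wait_secs))
            time.sleep(wait_secs)
        raise RuntimeError('no proxies after {} attempts'.format(max_attempts))

    # usage (hypothetical): self.proxies = retry_until_nonempty(self.crawl_proxy)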
Main.py (16 lines changed, Normal file → Executable file)
@@ -3,3 +3,19 @@ from Crawler.Crawler import Crawler
 if __name__ == '__main__':
     crawler = Crawler()
     crawler.crawl()
+
+# import requests
+# import bs4
+
+# resp = requests.get('http://www.gatherproxy.com')
+# soup = bs4.BeautifulSoup(resp.text, 'lxml')
+# # print(soup)
+# table = soup.select('table#tblproxy')
+# trs = table[0].select('tr')
+# for tr in trs[2:5]:
+#     tds = tr.select('td')
+#     if len(tds) < 2:
+#         continue
+
+#     ip, port = tds[1].text, tds[2].text
+#     print('ip: {}, port: {}'.format(ip, port))
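Note: everything appended to Main.py is commented-out scratch code for a third proxy source (gatherproxy.com, whose table uses the id `tblproxy` and shifts ip/port one column right, to `tds[1]`/`tds[2]`); as committed, none of it executes.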