프록시 추가
This commit is contained in:
@@ -5,12 +5,14 @@ import os
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import re
|
import re
|
||||||
|
import pickle
|
||||||
|
|
||||||
from .Setting import Setting
|
from .Setting import Setting
|
||||||
|
|
||||||
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
|
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
|
||||||
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
|
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
|
||||||
|
|
||||||
|
|
||||||
class PageLink:
|
class PageLink:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.title = ''
|
self.title = ''
|
||||||
@@ -28,6 +30,7 @@ class PageLink:
|
|||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return str(self)
|
return str(self)
|
||||||
|
|
||||||
|
|
||||||
class TorrentFile:
|
class TorrentFile:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.title = ''
|
self.title = ''
|
||||||
@@ -49,36 +52,133 @@ class TorrentFile:
|
|||||||
|
|
||||||
|
|
||||||
class Crawler:
|
class Crawler:
|
||||||
|
PROXY_FILE_NAME = 'proxy.bin'
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.setting = Setting()
|
self.setting = Setting()
|
||||||
|
self.proxies = []
|
||||||
|
self.session = requests.Session()
|
||||||
|
self.cookies = None
|
||||||
|
|
||||||
def print_log(self, files):
|
@staticmethod
|
||||||
|
def print_log(files):
|
||||||
f = open('output/log.txt', 'at')
|
f = open('output/log.txt', 'at')
|
||||||
for file in files:
|
for file in files:
|
||||||
f.write(file.file_name+'\n')
|
f.write(file.file_name+'\n')
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
def crawl_proxy(self):
    """Return a list of proxy records, from the on-disk cache or a fresh scrape.

    Each record is a dict with ``'http'`` and ``'https'`` entries holding
    ``'<ip>:<port>'`` plus an ``'alive'`` flag that starts out ``True``.

    If the cache file named by ``PROXY_FILE_NAME`` exists and holds a
    non-empty list, that list is returned as is.  Otherwise the first
    ``table.table`` element of https://www.us-proxy.org is parsed, taking the
    first two cells of every row after the header as IP and port, and a
    non-empty result is pickled to the cache file.

    Returns:
        The list of proxy records; empty when the page has no usable rows.
    """
    if os.path.exists(self.PROXY_FILE_NAME):
        # NOTE(security): pickle.load executes whatever the file encodes, so
        # the cache must only ever be a file this program wrote itself.
        with open(self.PROXY_FILE_NAME, 'rb') as f:
            cached = pickle.load(f)
        # An empty cache is useless; fall through and scrape again instead of
        # handing back "no proxies" forever.
        if cached:
            return cached

    # Without a timeout a stalled connection would block the crawler forever.
    resp = requests.get('https://www.us-proxy.org', timeout=10)
    soup = BeautifulSoup(resp.text, 'lxml')

    proxies = []
    tables = soup.select('table.table')
    if tables:
        # The first row is skipped, as in the original (presumably the header).
        for tr in tables[0].select('tr')[1:]:
            tds = tr.select('td')
            if len(tds) < 2:
                # Both an IP cell and a port cell are required.
                continue
            address = '{}:{}'.format(tds[0].text.strip(), tds[1].text.strip())
            proxies.append({
                'http': address,
                'https': address,
                'alive': True,
            })

    if proxies:
        with open(self.PROXY_FILE_NAME, 'wb') as f:
            pickle.dump(proxies, f)

    print('proxy cnt : {}'.format(len(proxies)))
    return proxies
|
||||||
|
|
||||||
|
def get_proxy(self):
    """Return the first proxy record still flagged alive, or ``None``.

    When the in-memory pool ``self.proxies`` is empty it is refilled through
    ``crawl_proxy()``, which itself prefers the on-disk cache over a new
    scrape, so the cache does not need to be read separately here.

    Returns:
        A proxy dict whose ``'alive'`` entry is truthy, or ``None`` when the
        pool contains no such record.
    """
    if not self.proxies:
        self.proxies = self.crawl_proxy()

    return next((proxy for proxy in self.proxies if proxy['alive']), None)
|
||||||
|
|
||||||
|
def set_proxy_dead(self, proxy):
    """Flag *proxy* as dead and bring the on-disk cache in line with the pool.

    While at least one record in ``self.proxies`` is still alive, the whole
    pool (including the dead flags) is pickled back to ``PROXY_FILE_NAME``.
    Once none is left, the cache file is deleted and the pool is emptied.

    Args:
        proxy: A record from ``self.proxies``, or ``None``.  ``None`` is
            accepted because ``get_proxy`` returns it when no live proxy
            exists; in that case only the pool bookkeeping below runs.
    """
    if proxy is not None:
        proxy['alive'] = False

    if any(candidate['alive'] for candidate in self.proxies):
        with open(self.PROXY_FILE_NAME, 'wb') as f:
            pickle.dump(self.proxies, f)
        return

    # Pool exhausted: drop the stale cache.  A missing file is fine -- it may
    # never have been written or may already have been removed.
    try:
        os.remove(self.PROXY_FILE_NAME)
    except FileNotFoundError:
        pass
    self.proxies = []
|
||||||
|
|
||||||
|
def request_get(self, url, cookies=None, max_attempts=None):
    """GET *url* through the proxy pool, rotating proxies until a 200 arrives.

    Every attempt uses ``self.session`` with browser-like headers and a
    3-second timeout.  A proxy whose attempt raises a requests error or
    yields a non-200 status is reported via ``set_proxy_dead`` and replaced
    by the next one from ``get_proxy``.  The cookies of the latest response
    are stored in ``self.cookies``.

    Args:
        url: Address to fetch.
        cookies: Optional cookies for this call; when omitted the cookies
            remembered in ``self.cookies`` are sent.  The value is handed to
            ``Session.get`` unchanged, so it presumably has to be a dict or
            cookie jar -- TODO confirm what callers pass.
        max_attempts: Optional cap on the number of attempts.  ``None`` (the
            default) keeps retrying for as long as proxies are available.

    Returns:
        The first response whose ``status_code`` is 200.

    Raises:
        requests.RequestException: The request failed while no proxy was
            available, so there was nothing left to rotate to.
        RuntimeError: A non-200 status was received without a proxy, or
            ``max_attempts`` attempts were used up.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
        'Connection': 'keep-alive',
    }

    attempts = 0
    proxy = self.get_proxy()
    while max_attempts is None or attempts < max_attempts:
        attempts += 1

        # 'alive' is pool bookkeeping, not a URL scheme; keep it out of the
        # mapping handed to requests.
        scheme_proxies = None
        if proxy is not None:
            scheme_proxies = {
                scheme: address
                for scheme, address in proxy.items()
                if scheme != 'alive'
            }

        try:
            resp = self.session.get(
                url,
                proxies=scheme_proxies,
                headers=headers,
                cookies=self.cookies if cookies is None else cookies,
                timeout=3,
            )
        except requests.RequestException:
            # Only transport-level failures count against the proxy; anything
            # else is a programming error and should surface.
            if proxy is None:
                raise
        else:
            self.cookies = resp.cookies
            if resp.status_code == 200:
                return resp
            if proxy is None:
                raise RuntimeError(
                    'GET {} returned status {} and no proxy is available'.format(
                        url, resp.status_code))

        self.set_proxy_dead(proxy)
        proxy = self.get_proxy()

    raise RuntimeError(
        'GET {} did not succeed within {} attempts'.format(url, attempts))
|
||||||
|
|
||||||
def crawl_list(self, url):
|
def crawl_list(self, url):
|
||||||
|
|
||||||
print('checking page {}'.format(url), flush=True)
|
print('checking page {}'.format(url), flush=True)
|
||||||
|
|
||||||
code = requests.get(url)
|
resp = self.request_get(url)
|
||||||
html = code.text
|
html = resp.text
|
||||||
soup = BeautifulSoup(html, 'lxml')
|
soup = BeautifulSoup(html, 'lxml')
|
||||||
|
|
||||||
re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
|
re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
|
||||||
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
|
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
|
||||||
|
|
||||||
links = []
|
links = []
|
||||||
for link in soup.select('#fboardlist > div.tbl_head01.tbl_wrap > table > tbody > tr > td.td_subject > a'):
|
tables = soup.select('table.table')
|
||||||
board_title = link.get_text().strip()
|
trs = tables[0].select('tr.')
|
||||||
|
for tr in trs:
|
||||||
|
tds = tr.select('div.td-subject')
|
||||||
|
title = tds[0].text.strip()
|
||||||
|
link = tds[0].select('a')[0].attrs['href']
|
||||||
|
|
||||||
title_match = re_title.search(board_title)
|
title_match = re_title.search(title)
|
||||||
if not title_match:
|
if not title_match:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
ep_match = re_episode.search(board_title)
|
ep_match = re_episode.search(title)
|
||||||
if not ep_match:
|
if not ep_match:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -93,31 +193,73 @@ class Crawler:
|
|||||||
print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
|
print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
link_url = link.get('href')
|
if not link.startswith('http'):
|
||||||
if not link_url.startswith('http'):
|
|
||||||
top_end = url[8:].find('/')
|
top_end = url[8:].find('/')
|
||||||
if top_end < 0:
|
if top_end < 0:
|
||||||
top_url = url[:8 + top_end]
|
top_url = url[:8 + top_end]
|
||||||
else:
|
else:
|
||||||
top_url = url
|
top_url = url
|
||||||
|
|
||||||
if link_url[0] != '/':
|
if link[0] != '/':
|
||||||
link_url = '/' + link_url
|
link = '/' + link
|
||||||
|
|
||||||
link_url = top_url + link_url
|
link = top_url + link
|
||||||
|
|
||||||
links.append(PageLink(video['title'], ep, link_url))
|
links.append(PageLink(video['title'], ep, link))
|
||||||
|
|
||||||
print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
|
print(' found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
# selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a' # torrentkim
|
||||||
|
# for link in soup.select(selector):
|
||||||
|
# if link.has_attr('rel') and 'nofollow' in link['rel']:
|
||||||
|
# continue
|
||||||
|
#
|
||||||
|
# board_title = link.get_text().strip()
|
||||||
|
#
|
||||||
|
# title_match = re_title.search(board_title)
|
||||||
|
# if not title_match:
|
||||||
|
# continue
|
||||||
|
#
|
||||||
|
# ep_match = re_episode.search(board_title)
|
||||||
|
# if not ep_match:
|
||||||
|
# continue
|
||||||
|
#
|
||||||
|
# title_idx = int(title_match.lastgroup[3:])
|
||||||
|
# video = self.setting.settings['video'][title_idx]
|
||||||
|
# ep = int(ep_match.group(1))
|
||||||
|
#
|
||||||
|
# if ep <= video['ignore_ep_under']:
|
||||||
|
# print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
|
||||||
|
# continue
|
||||||
|
# elif ep in self.setting.downloaded[video['title']]:
|
||||||
|
# print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
|
||||||
|
# continue
|
||||||
|
#
|
||||||
|
# link_url = link.get('href')
|
||||||
|
# if not link_url.startswith('http'):
|
||||||
|
# top_end = url[8:].find('/')
|
||||||
|
# if top_end < 0:
|
||||||
|
# top_url = url[:8 + top_end]
|
||||||
|
# else:
|
||||||
|
# top_url = url
|
||||||
|
#
|
||||||
|
# if link_url[0] != '/':
|
||||||
|
# link_url = '/' + link_url
|
||||||
|
#
|
||||||
|
# link_url = top_url + link_url
|
||||||
|
#
|
||||||
|
# links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
|
||||||
|
#
|
||||||
|
# print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
|
||||||
|
|
||||||
return links
|
return links
|
||||||
|
|
||||||
def crawl_downlink(self, link):
|
def crawl_downlink(self, link):
|
||||||
print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
|
print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
|
||||||
|
|
||||||
code = requests.get(link.url)
|
resp = self.request_get(link.url)
|
||||||
html = code.text
|
soup = BeautifulSoup(resp.text, 'lxml')
|
||||||
soup = BeautifulSoup(html, 'lxml')
|
|
||||||
|
|
||||||
links = []
|
links = []
|
||||||
a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
|
a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
|
||||||
@@ -137,11 +279,9 @@ class Crawler:
|
|||||||
print("start download {}".format(file.file_name), flush=True)
|
print("start download {}".format(file.file_name), flush=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.get(file.url)
|
response = self.request_get(file.url, cookies=file.cookie)
|
||||||
data = response.content
|
with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
|
||||||
f = open(self.setting.settings['download_path'] + file.file_name, 'wb')
|
f.write(response.content)
|
||||||
f.write(data)
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
self.setting.downloaded[file.title].append(file.episode)
|
self.setting.downloaded[file.title].append(file.episode)
|
||||||
self.setting.save()
|
self.setting.save()
|
||||||
@@ -151,16 +291,13 @@ class Crawler:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
|
def crawl_torrent(self):
|
||||||
def crawl(self):
|
|
||||||
|
|
||||||
print('Crawling start')
|
|
||||||
|
|
||||||
page_links = []
|
page_links = []
|
||||||
for url in self.setting.settings['urls']:
|
for org_url in self.setting.settings['urls']:
|
||||||
page = 1
|
page = 1
|
||||||
while page <= self.setting.settings['max_page']:
|
while page <= self.setting.settings['max_page']:
|
||||||
page_links += self.crawl_list(url+str(page))
|
url = org_url.replace('<page>', str(page))
|
||||||
|
page_links += self.crawl_list(url)
|
||||||
page += 1
|
page += 1
|
||||||
|
|
||||||
files = []
|
files = []
|
||||||
@@ -170,4 +307,9 @@ class Crawler:
|
|||||||
for file in files:
|
for file in files:
|
||||||
self.download_files(file)
|
self.download_files(file)
|
||||||
|
|
||||||
|
def crawl(self):
|
||||||
|
print('Crawling start')
|
||||||
|
|
||||||
|
self.crawl_torrent()
|
||||||
|
|
||||||
print('Crawling finished')
|
print('Crawling finished')
|
||||||
|
|||||||
18
Main.py
18
Main.py
@@ -1,21 +1,5 @@
|
|||||||
from Crawler.Crawler import Crawler
|
from Crawler.Crawler import Crawler
|
||||||
|
|
||||||
|
|
||||||
# def do_it():
|
|
||||||
# crawler = Crawler()
|
|
||||||
#
|
|
||||||
# files = []
|
|
||||||
# for url in setting.urls:
|
|
||||||
# for page in range(1, setting.max_page+1):
|
|
||||||
# page_url = url+str(page)
|
|
||||||
# page_links = crawler.crawl_list(page_url)
|
|
||||||
#
|
|
||||||
# # for link in page_links:
|
|
||||||
# # files += crawl_downlink(link)
|
|
||||||
# #
|
|
||||||
# # download_files(files)
|
|
||||||
#
|
|
||||||
# do_it()
|
|
||||||
|
|
||||||
crawler = Crawler()
|
crawler = Crawler()
|
||||||
crawler.crawl()
|
crawler.crawl()
|
||||||
|
|||||||
Reference in New Issue
Block a user