# TorrentCrawler/Crawler/Crawler.py
import os
import re
import urllib.parse

import bs4
import requests

from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger


class PageLink:
    """A link to a content page for one title/episode."""

    # The original defined __init__ twice (a no-arg and a full-arg version);
    # in Python the second silently replaces the first, so default arguments
    # cover both call styles.
    def __init__(self, title='', episode='', url=''):
        self.title = title
        self.episode = episode
        self.url = url

    def __str__(self):
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        return str(self)


class TorrentFile:
    """A downloadable file (torrent or subtitle) found on a content page."""

    # Same duplicate-__init__ fix as PageLink: defaults replace the two
    # overlapping constructor definitions.
    def __init__(self, title='', episode='', file_name='', url=''):
        self.title = title
        self.episode = episode
        self.file_name = file_name
        self.url = url

    def __str__(self):
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        return str(self)


class Crawler:
    def __init__(self):
        self.setting = Setting()
        self.proxy_handler = ProxyHandler()
        if self.setting.settings['urls']:
            # Health-check proxies against the scheme and host of the first
            # configured URL.
            urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
            top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
            self.proxy_handler.check_url = top_url

    @staticmethod
    def print_log(files):
        with open('output/log.txt', 'at') as f:
            for file in files:
                f.write(file.file_name + '\n')

    def request_get(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Charset': 'utf-8,euc-kr;q=0.7,*;q=0.3',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
        }
        proxy = self.proxy_handler.get_proxy()
        # Retry indefinitely, rotating to a fresh proxy whenever a request
        # fails or returns a non-200 status.
        while True:
            try:
                resp = requests.get(url, proxies=proxy, headers=headers, timeout=3)
            except Exception:
                self.proxy_handler.set_proxy_dead(proxy)
                proxy = self.proxy_handler.get_proxy()
                continue
            if resp.status_code != 200:
                self.proxy_handler.set_proxy_dead(proxy)
                proxy = self.proxy_handler.get_proxy()
                continue
            break
        return resp

    def crawl_list(self, url):
        Logger.log('checking page {}'.format(url))
        resp = self.request_get(url)
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
        # Build one alternation of all configured keywords; the name of the
        # matched group ('key<i>') identifies which video entry matched.
        re_title = re.compile('|'.join(['(?P<key' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
        # e.g. in 'Show.Name.E07.720p' this matches '.E07.' and captures '07'.
        re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.select('td.subject')
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            title_idx = int(title_match.lastgroup[3:])  # strip the 'key' prefix
            video = self.setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (earlier episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in self.setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link):
        Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))
        resp = self.request_get(page_link.url)
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        # Both torrent and subtitle links are handled identically. The original
        # had two copies of this loop; the second resolved against link.url,
        # which does not exist on a bs4 Tag, instead of page_link.url.
        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

    def download_files(self, file_link):
        if file_link.episode in self.setting.downloaded[file_link.title]:
            return
        Logger.log('start download {}'.format(file_link.file_name))
        try:
            resp = self.request_get(file_link.url)
            # Prefer the server-supplied file name from Content-Disposition.
            file_name = re.findall('filename="(.+)"', resp.headers['content-disposition'])
            file_name = urllib.parse.unquote(file_name[0])
            with open(os.path.join(self.setting.settings['download_path'], file_name), 'wb') as f:
                f.write(resp.content)
            self.setting.downloaded[file_link.title].append(file_link.episode)
            self.setting.save()
            Logger.log('downloaded {}'.format(file_name))
        except Exception as e:
            Logger.log(e)

    def crawl_torrent(self):
        page_links = []
        for org_url in self.setting.settings['urls']:
            # '<page>' in a configured URL is a placeholder for the page number.
            for page in range(1, self.setting.settings['max_page'] + 1):
                url = org_url.replace('<page>', str(page))
                page_links += self.crawl_list(url)
        files = []
        for link in page_links:
            files += self.crawl_downlink(link)
        for file in files:
            self.download_files(file)

    def crawl(self):
        Logger.log('Crawling start')
        self.crawl_torrent()
        Logger.log('Crawling finished')


class Sorter:
    @staticmethod
    def move_files():
        setting = Setting()
        if 'file_download_path' not in setting.settings or 'file_move_path' not in setting.settings:
            return
        file_list = os.listdir(setting.settings['file_download_path'])
        for filename in file_list:
            for video in setting.settings['video']:
                if video['title'] in filename:
                    old_path = os.path.join(setting.settings['file_download_path'], filename)
                    new_path = os.path.join(setting.settings['file_move_path'], video['title'], filename)
                    os.rename(old_path, new_path)
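

# A minimal usage sketch, not part of the original file: it assumes the
# configuration read by Setting() already defines 'urls', 'video', 'max_page',
# and 'download_path' (plus the optional 'file_download_path'/'file_move_path'
# used by Sorter).
if __name__ == '__main__':
    crawler = Crawler()
    crawler.crawl()      # crawl list pages, resolve download links, fetch files
    Sorter.move_files()  # then move finished downloads into per-title folders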