Files
TorrentCrawler/Crawler/Crawler.py
2018-06-25 00:47:32 +09:00

151 lines
4.3 KiB
Python
Executable File

import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
from .WorkerTorrentKim import WorkerTorrentKim
from .WorkerTfreeca import WorkerTfreeca
from .WorkerTocops import WorkerTocops
class Crawler:
    """Drive a torrent crawl: choose a site-specific worker from the first
    configured URL, fetch list/detail pages through rotating proxies, and
    download torrent files matching the configured keywords.

    NOTE: this file was recovered from an indentation-stripped paste; the
    structure below is the reconstructed original layout.
    """

    def __init__(self):
        self.setting = Setting()
        self.proxy_handler = ProxyHandler()
        self.worker = None
        if len(self.setting.settings['urls']) > 0:
            urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
            # Keep only scheme + host; the proxy handler probes this URL
            # when validating proxies.
            top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
            self.proxy_handler.check_url = top_url
            # Pick the site-specific worker by hostname substring.
            # NOTE(review): 'tcorea' mapping to WorkerTocops looks inconsistent
            # with the other two entries -- confirm the hostname is intentional.
            if 'torrentkim' in top_url:
                self.worker = WorkerTorrentKim()
            elif 'tfreeca' in top_url:
                self.worker = WorkerTfreeca()
            elif 'tcorea' in top_url:
                self.worker = WorkerTocops()

    @staticmethod
    def print_log(files):
        """Append each downloaded file's name to output/log.txt.

        Fix: use a context manager so the handle is closed even if a write
        raises (the original only closed on the success path).
        """
        with open('output/log.txt', 'at') as f:
            for file in files:
                f.write(file.file_name + '\n')

    def request_get(self, url):
        """GET *url* through a proxy, rotating to a fresh proxy until a
        200 response arrives; return the requests.Response.

        Fix: catch requests.RequestException instead of bare Exception so
        programming errors are not silently treated as dead proxies.
        NOTE(review): loops forever if every proxy fails -- consider a
        retry cap.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Charset': 'utf-8,euc-kr;q=0.7,*;q=0.3',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
        }
        proxy = self.proxy_handler.get_proxy()
        while True:
            try:
                resp = requests.get(url, proxies=proxy, headers=headers, timeout=3)
            except requests.RequestException:
                # Network/timeout failure: retire this proxy, try another.
                self.proxy_handler.set_proxy_dead(proxy)
                proxy = self.proxy_handler.get_proxy()
                continue
            if resp.status_code == 200:
                return resp
            # Non-200: the proxy may be blocked or broken; rotate it too.
            self.proxy_handler.set_proxy_dead(proxy)
            proxy = self.proxy_handler.get_proxy()

    def crawl_list(self, url):
        """Fetch one listing page and return the PageLinks whose titles
        match a configured keyword."""
        Logger.log('checking page {}'.format(url))
        resp = self.request_get(url)
        # NOTE(review): pages are decoded as UTF-8; the Accept-Charset also
        # advertises euc-kr -- confirm the target sites always serve UTF-8.
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
        # One alternation per configured keyword, each in a named group so
        # the worker can tell which keyword matched. Case-insensitive.
        re_title = re.compile(
            '|'.join('(?P<key{}>{})'.format(i, video['keyword'])
                     for i, video in enumerate(self.setting.settings['video'])),
            re.I)
        # Episode number like " E123." embedded in the release name.
        re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
        return self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)

    def crawl_downlink(self, page_link):
        """Fetch a content page and return its torrent download links."""
        Logger.log('searching content page : {}({}) : {}'.format(
            page_link.title, page_link.episode, page_link.url))
        resp = self.request_get(page_link.url)
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
        return self.worker.crawl_downlink(page_link, soup)

    def download_files(self, file_link):
        """Download one torrent file unless its episode was already fetched;
        record the episode in settings on success.

        Fix: build the destination with os.path.join -- the original string
        concatenation silently produced a wrong path when 'download_path'
        lacked a trailing slash.
        """
        if file_link.episode in self.setting.downloaded[file_link.title]:
            return
        Logger.log("start download {}".format(file_link.file_name))
        try:
            resp = self.request_get(file_link.url)
            file_name = file_link.file_name
            with open(os.path.join(self.setting.settings['download_path'], file_name), 'wb') as f:
                f.write(resp.content)
            # Persist immediately so a crash later does not re-download.
            self.setting.downloaded[file_link.title].append(file_link.episode)
            self.setting.save()
            Logger.log("downloaded {}".format(file_name))
        except Exception as e:
            # Best-effort: log and move on to the next file.
            Logger.log(e)

    def crawl_torrent(self):
        """Walk every configured listing URL up to 'max_page' pages,
        collect download links, and fetch each file."""
        page_links = []
        for org_url in self.setting.settings['urls']:
            for page in range(1, self.setting.settings['max_page'] + 1):
                # Listing URLs embed a '<page>' placeholder.
                page_links += self.crawl_list(org_url.replace('<page>', str(page)))
        files = []
        for link in page_links:
            files += self.crawl_downlink(link)
        for file in files:
            self.download_files(file)

    def crawl(self):
        """Entry point: refuse to run when disk space is low, else crawl."""
        # Threshold is 4*1024*1024 units; the log message says 4GB, which
        # assumes Util.get_free_space() returns kilobytes -- TODO confirm.
        if Util.get_free_space() < 4*1024*1024:
            Logger.log('Disk space is less than 4GB. Aborted')
            return
        Logger.log('Crawling start')
        self.crawl_torrent()
        Logger.log('Crawling finished')
class Sorter:
    """Move finished downloads into per-title folders under the configured
    move path."""

    @staticmethod
    def move_files():
        """Move each file in 'file_download_path' whose name contains a
        configured video title into <file_move_path>/<title>/.

        Fixes over the original:
        - destination built with os.path.join instead of '+' and '/',
        - destination directory created if missing,
        - break after the first matching title (the original kept scanning
          and raised FileNotFoundError on a second match, because the file
          had already been moved away).
        """
        setting = Setting()
        settings = setting.settings
        if 'file_download_path' not in settings or 'file_move_path' not in settings:
            return
        for filename in os.listdir(settings['file_download_path']):
            for video in settings['video']:
                if video['title'] in filename:
                    old_path = os.path.join(settings['file_download_path'], filename)
                    new_path = os.path.join(settings['file_move_path'], video['title'], filename)
                    os.makedirs(os.path.dirname(new_path), exist_ok=True)
                    os.rename(old_path, new_path)
                    break  # file moved; do not test further titles