# Listing metadata (from the original page, kept as a comment): 151 lines, 4.3 KiB, Python, executable file
import os
import re
import urllib
import urllib.parse

import bs4
import requests

from .DataType import PageLink, TorrentFile
from .Logger import Logger
from .ProxyHandler import ProxyHandler
from .Setting import Setting
from .Util import Util
from .WorkerTfreeca import WorkerTfreeca
from .WorkerTocops import WorkerTocops
from .WorkerTorrentKim import WorkerTorrentKim
class Crawler:
    """Crawls torrent listing sites through rotating proxies and downloads
    .torrent files for configured videos.

    A site-specific worker (TorrentKim / Tfreeca / Tocops) does the actual
    HTML scraping; this class handles settings, proxying, HTTP and files.
    """

    def __init__(self):
        self.setting = Setting()
        self.proxy_handler = ProxyHandler()
        self.worker = None  # site-specific scraper, selected from the first URL

        if len(self.setting.settings['urls']) > 0:
            urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
            top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
            # Proxies are health-checked against the target site itself.
            self.proxy_handler.check_url = top_url

            if 'torrentkim' in top_url:
                self.worker = WorkerTorrentKim()
            elif 'tfreeca' in top_url:
                self.worker = WorkerTfreeca()
            elif 'tcorea' in top_url:
                self.worker = WorkerTocops()
            # else: self.worker stays None — crawl_list would fail for an
            # unrecognized site; only the three hosts above are supported.

    @staticmethod
    def print_log(files):
        """Append the file name of every entry in *files* to output/log.txt.

        :param files: iterable of objects with a ``file_name`` attribute.
        """
        # 'with' guarantees the handle is closed even if a write raises
        # (the original left the file open on error).
        with open('output/log.txt', 'at') as f:
            for file in files:
                f.write(file.file_name + '\n')

    def request_get(self, url):
        """GET *url* via a proxy, rotating to a fresh proxy on any failure.

        A proxy that raises a network error or returns a non-200 status is
        marked dead and replaced.  NOTE: this retries forever — it only
        returns once some proxy yields a 200 response.

        :param url: absolute URL to fetch.
        :returns: the successful ``requests.Response``.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Charset': 'utf-8,euc-kr;q=0.7,*;q=0.3',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
        }

        proxy = self.proxy_handler.get_proxy()
        while True:
            try:
                resp = requests.get(url, proxies=proxy, headers=headers, timeout=3)
            except requests.RequestException:
                # Connection/timeout/protocol error: retire this proxy.
                self.proxy_handler.set_proxy_dead(proxy)
                proxy = self.proxy_handler.get_proxy()
                continue

            if resp.status_code == 200:
                return resp

            # Non-200 also counts against the proxy (blocked, captcha, ...).
            self.proxy_handler.set_proxy_dead(proxy)
            proxy = self.proxy_handler.get_proxy()

    def crawl_list(self, url):
        """Fetch one listing page and return links matching configured videos.

        :param url: listing-page URL (already page-number substituted).
        :returns: list of PageLink objects produced by the site worker.
        """
        Logger.log('checking page {}'.format(url))

        resp = self.request_get(url)
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')

        # One named alternation group per configured keyword:
        # (?P<key0>kw0)|(?P<key1>kw1)|...  so the worker can tell which
        # video matched by group name.
        re_title = re.compile(
            '|'.join('(?P<key{}>{})'.format(i, video['keyword'])
                     for i, video in enumerate(self.setting.settings['video'])),
            re.I)
        # Episode number like " E01." / ".E123," — delimiter required on both sides.
        re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)

        return self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)

    def crawl_downlink(self, page_link):
        """Fetch a content page and extract its .torrent download links.

        :param page_link: PageLink with ``title``, ``episode`` and ``url``.
        :returns: list of file links produced by the site worker.
        """
        Logger.log('searching content page : {}({}) : {}'.format(
            page_link.title, page_link.episode, page_link.url))

        resp = self.request_get(page_link.url)
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')

        return self.worker.crawl_downlink(page_link, soup)

    def download_files(self, file_link):
        """Download *file_link* unless that episode was fetched before.

        On success the episode is recorded in the downloaded history and
        settings are persisted; any failure is logged and swallowed
        (best-effort — one bad file must not stop the crawl).
        """
        if file_link.episode in self.setting.downloaded[file_link.title]:
            return  # already have this episode

        Logger.log("start download {}".format(file_link.file_name))

        try:
            resp = self.request_get(file_link.url)
            file_name = file_link.file_name
            # os.path.join works whether or not download_path ends with a
            # separator (the former '+' concatenation required one).
            with open(os.path.join(self.setting.settings['download_path'], file_name), 'wb') as f:
                f.write(resp.content)

            self.setting.downloaded[file_link.title].append(file_link.episode)
            self.setting.save()

            Logger.log("downloaded {}".format(file_name))
        except Exception as e:
            Logger.log(e)

    def crawl_torrent(self):
        """Run the full pipeline: list pages -> content pages -> downloads."""
        page_links = []
        for org_url in self.setting.settings['urls']:
            # '<page>' placeholder in the URL is substituted with 1..max_page.
            for page in range(1, self.setting.settings['max_page'] + 1):
                page_links += self.crawl_list(org_url.replace('<page>', str(page)))

        files = []
        for link in page_links:
            files += self.crawl_downlink(link)

        for file in files:
            self.download_files(file)

    def crawl(self):
        """Entry point: run a crawl unless free disk space is too low."""
        # NOTE(review): the message says 4GB but the threshold is 4*1024*1024;
        # these agree only if Util.get_free_space() returns KiB — confirm units.
        if Util.get_free_space() < 4 * 1024 * 1024:
            Logger.log('Disk space is less than 4GB. Aborted')
            return

        Logger.log('Crawling start')
        self.crawl_torrent()
        Logger.log('Crawling finished')
class Sorter:
    """Moves completed downloads into per-title directories."""

    @staticmethod
    def move_files():
        """Move each file in ``file_download_path`` whose name contains a
        configured video title into ``file_move_path/<title>/``.

        No-op when either path key is absent from settings.

        :raises OSError: propagated from ``os.rename`` (e.g. when the
            per-title destination directory does not exist).
        """
        setting = Setting()
        settings = setting.settings
        if 'file_download_path' not in settings or 'file_move_path' not in settings:
            return

        for filename in os.listdir(settings['file_download_path']):
            for video in settings['video']:
                if video['title'] in filename:
                    old_path = os.path.join(settings['file_download_path'], filename)
                    # Join path components instead of manual '/' concatenation.
                    new_path = os.path.join(settings['file_move_path'], video['title'], filename)
                    os.rename(old_path, new_path)
                    # Bug fix: the file has moved — without this break a second
                    # matching title would rename a path that no longer exists
                    # and raise FileNotFoundError.
                    break