import requests
import urllib.parse  # `import urllib` alone does not guarantee the `parse` submodule is loaded
import bs4
import re
import os

from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
from .WorkerTorrentKim import WorkerTorrentKim
from .WorkerTfreeca import WorkerTfreeca
from .WorkerTocops import WorkerTocops


class Crawler:
    """Crawl a torrent listing site, resolve download links, and save files.

    The concrete site is detected from the first configured URL and handled
    by a site-specific worker (WorkerTorrentKim / WorkerTfreeca / WorkerTocops).
    All HTTP traffic goes through proxies managed by ProxyHandler.
    """

    # Placeholder in configured list URLs that is substituted with the page
    # number while paging.
    # NOTE(review): the original placeholder token was lost in the mangled
    # source (it read `org_url.replace('', str(page))`, which is certainly
    # wrong — it would interleave the page number between every character).
    # Confirm this token against the actual entries in settings['urls'].
    PAGE_PLACEHOLDER = '{page}'

    def __init__(self):
        self.setting = Setting()
        self.session = requests.session()
        self.proxy_handler = ProxyHandler()
        # Site-specific worker; stays None when no URLs are configured or
        # no known site matches.
        self.worker = None
        if self.setting.settings['urls']:
            urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
            # Keep only scheme + host: used as the proxy liveness-check URL.
            top_url = urllib.parse.urlunparse(
                (urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
            self.proxy_handler.check_url = top_url
            if 'torrentkim' in top_url:
                self.worker = WorkerTorrentKim()
            elif 'tfreeca' in top_url:
                self.worker = WorkerTfreeca()
            elif 'tcorea' in top_url:
                self.worker = WorkerTocops()

    @staticmethod
    def print_log(files):
        """Append the file name of each downloaded torrent to output/log.txt."""
        with open('output/log.txt', 'at') as f:
            for file in files:
                f.write(file.file_name + '\n')

    def request_get(self, url):
        """GET `url` through a proxy, rotating dead proxies until success.

        Retries forever: any connection error or non-200 status marks the
        current proxy dead and fetches a fresh one from ProxyHandler.
        Returns the successful `requests.Response`.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Charset': 'utf-8,euc-kr;q=0.7,*;q=0.3',
            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
        }
        proxy = self.proxy_handler.get_proxy()
        while True:
            try:
                resp = self.session.get(url, proxies=proxy, headers=headers,
                                        timeout=3)
            except requests.RequestException:
                pass  # fall through: retire this proxy and retry
            else:
                if resp.status_code == 200:
                    return resp
            # Request failed or returned non-200: rotate to a new proxy.
            self.proxy_handler.set_proxy_dead(proxy)
            proxy = self.proxy_handler.get_proxy()

    def crawl_list(self, url):
        """Fetch one listing page and return the PageLinks the worker finds."""
        Logger.log('checking page {}'.format(url))
        resp = self.request_get(url)
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
        # One named alternation group per configured video keyword.
        # NOTE(review): the mangled source read "'(?P' + video['keyword'] + ')'",
        # which is invalid regex syntax — the named-group angle brackets were
        # evidently stripped. Group names reconstructed as t0, t1, ...; confirm
        # the workers' expected group-name scheme.
        re_title = re.compile(
            '|'.join('(?P<t{}>{})'.format(i, video['keyword'])
                     for i, video in enumerate(self.setting.settings['video'])),
            re.I)
        # Episode number, e.g. ".E123." in a release name.
        re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
        links = self.worker.crawl_list(url, soup, re_title, re_episode,
                                       self.setting)
        return links

    def crawl_downlink(self, page_link):
        """Fetch a content page and return the torrent-file links on it."""
        Logger.log('searching content page : {}({}) : {}'.format(
            page_link.title, page_link.episode, page_link.url))
        resp = self.request_get(page_link.url)
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
        links = self.worker.crawl_downlink(page_link, soup)
        return links

    def download_files(self, file_link):
        """Download one torrent file unless its episode was already fetched.

        On success the episode is recorded in settings so it is skipped on
        later runs. Failures are logged and swallowed (best-effort).
        """
        if file_link.episode in self.setting.downloaded[file_link.title]:
            return
        Logger.log("start download {}".format(file_link.file_name))
        try:
            resp = self.request_get(file_link.url)
            file_name = file_link.file_name
            path = os.path.join(self.setting.settings['download_path'],
                                file_name)
            with open(path, 'wb') as f:
                f.write(resp.content)
            # Persist immediately so a crash mid-run does not re-download.
            self.setting.downloaded[file_link.title].append(file_link.episode)
            self.setting.save()
            Logger.log("downloaded {}".format(file_name))
        except Exception as e:
            Logger.log(e)

    def crawl_torrent(self):
        """Walk every configured listing URL page by page and download hits."""
        page_links = []
        for org_url in self.setting.settings['urls']:
            for page in range(1, self.setting.settings['max_page'] + 1):
                url = org_url.replace(self.PAGE_PLACEHOLDER, str(page))
                page_links += self.crawl_list(url)
        files = []
        for link in page_links:
            files += self.crawl_downlink(link)
        for file in files:
            self.download_files(file)

    def crawl(self):
        """Entry point: run one crawl cycle, guarded by a free-space check."""
        # NOTE(review): assumes Util.get_free_space() returns KiB, so
        # 4*1024*1024 KiB == 4 GiB as the log message claims — confirm.
        if Util.get_free_space() < 4 * 1024 * 1024:
            Logger.log('Disk space is less than 4GB. Aborted')
            return
        # self.test()
        Logger.log('Crawling start')
        self.crawl_torrent()
        Logger.log('Crawling finished')


class Sorter:
    """Moves finished downloads into per-title directories."""

    @staticmethod
    def move_files():
        """Move files whose names contain a configured title into
        <file_move_path>/<title>/. No-op when the paths are not configured.
        """
        settings = Setting().settings
        if ('file_download_path' not in settings
                or 'file_move_path' not in settings):
            return
        for filename in os.listdir(settings['file_download_path']):
            for video in settings['video']:
                if video['title'] in filename:
                    old_path = os.path.join(settings['file_download_path'],
                                            filename)
                    new_path = os.path.join(settings['file_move_path'],
                                            video['title'], filename)
                    os.rename(old_path, new_path)
                    # File is gone from the download dir; a second matching
                    # title would make os.rename raise, so stop matching.
                    break