From 8c9ddb9ce84d0199a4c469fa6e5056ff7cc3fe17 Mon Sep 17 00:00:00 2001 From: mjjo53 Date: Mon, 25 Jun 2018 00:47:32 +0900 Subject: [PATCH] refactoring and add tocops --- Crawler/Crawler.py | 111 ++++++------------------------------ Crawler/DataType.py | 36 ++++++++++++ Crawler/Util.py | 10 +++- Crawler/WorkerTfreeca.py | 78 +++++++++++++++++++++++++ Crawler/WorkerTocops.py | 97 +++++++++++++++++++++++++++++++ Crawler/WorkerTorrentKim.py | 79 +++++++++++++++++++++++++ 6 files changed, 314 insertions(+), 97 deletions(-) create mode 100644 Crawler/DataType.py create mode 100644 Crawler/WorkerTfreeca.py create mode 100644 Crawler/WorkerTocops.py create mode 100644 Crawler/WorkerTorrentKim.py diff --git a/Crawler/Crawler.py b/Crawler/Crawler.py index 6680838..29993ec 100755 --- a/Crawler/Crawler.py +++ b/Crawler/Crawler.py @@ -8,55 +8,31 @@ from .Setting import Setting from .ProxyHandler import ProxyHandler from .Logger import Logger from .Util import Util +from .DataType import PageLink, TorrentFile - -class PageLink: - def __init__(self): - self.title = '' - self.episode = '' - self.url = '' - - def __init__(self, title, episode, url): - self.title = title - self.episode = episode - self.url = url - - def __str__(self): - return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url) - - def __repr__(self): - return str(self) - - -class TorrentFile: - def __init__(self): - self.title = '' - self.episode = '' - self.file_name = '' - self.url = '' - - def __init__(self, title, episode, file_name, url): - self.title = title - self.file_name = file_name - self.episode = episode - self.url = url - - def __str__(self): - return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url) - - def __repr__(self): - return str(self) +from .WorkerTorrentKim import WorkerTorrentKim +from .WorkerTfreeca import WorkerTfreeca +from .WorkerTocops import WorkerTocops class Crawler: def __init__(self): self.setting = Setting() self.proxy_handler = ProxyHandler() + self.worker = None + if len(self.setting.settings['urls']) > 0: urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0]) top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', '')) self.proxy_handler.check_url = top_url + if 'torrentkim' in top_url: + self.worker = WorkerTorrentKim() + elif 'tfreeca' in top_url: + self.worker = WorkerTfreeca() + elif 'tcorea' in top_url: + self.worker = WorkerTocops() + @staticmethod def print_log(files): f = open('output/log.txt', 'at') @@ -97,41 +73,10 @@ class Crawler: resp = self.request_get(url) soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml') - re_title = re.compile('|'.join(['(?P'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I) + re_title = re.compile('|'.join(['(?P' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I) re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I) - links = [] - tables = soup.select('table.board_list') - trs = tables[0].select('tr') - for tr in trs[1:]: - tds = tr.select('td.subject') - title = tds[0].text.strip() - link = tds[0].select('a')[0].attrs['href'] - - title_match = re_title.search(title) - if not title_match: - continue - - ep_match = re_episode.search(title) - if not ep_match: - continue - - title_idx = int(title_match.lastgroup[3:]) - video = self.setting.settings['video'][title_idx] - ep = int(ep_match.group(1)) - - if ep <= video['ignore_ep_under']: - Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under'])) - continue - elif ep in self.setting.downloaded[video['title']]: - Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep)) - continue - - link = urllib.parse.urljoin(url, link) - links.append(PageLink(video['title'], ep, link)) - - Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link)) - + links = self.worker.crawl_list(url, soup, re_title, re_episode, self.setting) return links def crawl_downlink(self, page_link): @@ -140,28 +85,7 @@ class Crawler: resp = self.request_get(page_link.url) soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml') - links = [] - file_table = soup.select('table#file_table') - a_tags = file_table[0].select('a') - torrent_links = [a for a in a_tags if '.torrent' in a.text] - smi_links = [a for a in a_tags if '.smi' in a.text] - - for link in torrent_links: - file_name = link.text.strip() - sub_url = link.attrs['href'] - url = urllib.parse.urljoin(page_link.url, sub_url) - links.append(TorrentFile(page_link.title, page_link.episode, file_name, url)) - - Logger.log(' found download link : {}({})'.format(file_name, url)) - - for link in smi_links: - file_name = link.text.strip() - sub_url = link.attrs['href'] - url = urllib.parse.urljoin(link.url, sub_url) - links.append(TorrentFile(page_link.title, page_link.episode, file_name, url)) - - Logger.log(' found download link : {}({})'.format(file_name, url)) - + links = self.worker.crawl_downlink(page_link, soup) return links def download_files(self, file_link): @@ -172,8 +96,7 @@ class Crawler: try: resp = self.request_get(file_link.url) - file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition']) - file_name = urllib.parse.unquote(file_name[0]) + file_name = file_link.file_name with open(self.setting.settings['download_path'] + file_name, 'wb') as f: f.write(resp.content) diff --git a/Crawler/DataType.py b/Crawler/DataType.py new file mode 100644 index 0000000..a514631 --- /dev/null +++ b/Crawler/DataType.py @@ -0,0 +1,36 @@ +class PageLink: + def __init__(self): + self.title = '' + self.episode = '' + self.url = '' + + def __init__(self, title, episode, url): + self.title = title + self.episode = episode + self.url = url + + def __str__(self): + return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url) + + def __repr__(self): + return str(self) + + +class TorrentFile: + def __init__(self): + self.title = '' + self.episode = '' + self.file_name = '' + self.url = '' + + def __init__(self, title, episode, file_name, url): + self.title = title + self.file_name = file_name + self.episode = episode + self.url = url + + def __str__(self): + return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url) + + def __repr__(self): + return str(self) diff --git a/Crawler/Util.py b/Crawler/Util.py index 115209f..4d845c5 100644 --- a/Crawler/Util.py +++ b/Crawler/Util.py @@ -1,9 +1,13 @@ import subprocess +import platform class Util: @staticmethod def get_free_space(): - df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE) - output = df.communicate()[0] - device, size, used, available, percent, mountpoint = str(output).split("\\n")[1].split() + if platform.system() == 'Linux': + df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE) + output = df.communicate()[0] + device, size, used, available, percent, mountpoint = str(output).split("\\n")[1].split() + else: + available = 1024*1024*1024*1024; return int(available) diff --git a/Crawler/WorkerTfreeca.py b/Crawler/WorkerTfreeca.py new file mode 100644 index 0000000..a46b6c0 --- /dev/null +++ b/Crawler/WorkerTfreeca.py @@ -0,0 +1,78 @@ +import requests +import urllib +import bs4 +import re +import os + +from .Setting import Setting +from .ProxyHandler import ProxyHandler +from .Logger import Logger +from .Util import Util +from .DataType import PageLink, TorrentFile + + +class WorkerTfreeca: + def crawl_list(self, top_url, soup, re_title, re_episode, setting): + + links = [] + tables = soup.select('table.b_list') + trs = tables[0].select('tr') + for tr in trs[1:]: + tds = tr.select('td.subject') + if len(tds) < 1: + continue + + title = tds[0].text.strip() + link = tds[0].select('a')[1].attrs['href'] + + title_match = re_title.search(title) + if not title_match: + continue + + ep_match = re_episode.search(title) + if not ep_match: + continue + + title_idx = int(title_match.lastgroup[3:]) + video = setting.settings['video'][title_idx] + ep = int(ep_match.group(1)) + + if ep <= video['ignore_ep_under']: + Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under'])) + continue + elif ep in setting.downloaded[video['title']]: + Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep)) + continue + + link = urllib.parse.urljoin(top_url, link) + links.append(PageLink(video['title'], ep, link)) + + Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link)) + + return links + + def crawl_downlink(self, page_link, soup): + + links = [] + file_table = soup.select('table#file_table') + a_tags = file_table[0].select('a') + torrent_links = [a for a in a_tags if '.torrent' in a.text] + smi_links = [a for a in a_tags if '.smi' in a.text] + + for link in torrent_links: + file_name = link.text.strip() + sub_url = link.attrs['href'] + url = urllib.parse.urljoin(page_link.url, sub_url) + links.append(TorrentFile(page_link.title, page_link.episode, file_name, url)) + + Logger.log(' found download link : {}({})'.format(file_name, url)) + + for link in smi_links: + file_name = link.text.strip() + sub_url = link.attrs['href'] + url = urllib.parse.urljoin(link.url, sub_url) + links.append(TorrentFile(page_link.title, page_link.episode, file_name, url)) + + Logger.log(' found download link : {}({})'.format(file_name, url)) + + return links \ No newline at end of file diff --git a/Crawler/WorkerTocops.py b/Crawler/WorkerTocops.py new file mode 100644 index 0000000..06219c5 --- /dev/null +++ b/Crawler/WorkerTocops.py @@ -0,0 +1,97 @@ +import requests +import urllib +import bs4 +import re +import os + +from .Setting import Setting +from .ProxyHandler import ProxyHandler +from .Logger import Logger +from .Util import Util +from .DataType import PageLink, TorrentFile + +class WorkerTocops: + def crawl_list(self, top_url, soup, re_title, re_episode, setting): + + links = [] + tables = soup.select('table.board_list') + trs = tables[0].select('tr') + for tr in trs[1:]: + tds = tr.select('td.subject') + if len(tds) < 1: + continue + + title = tds[0].text.strip() + link = tds[0].select('a')[0].attrs['href'] + + title_match = re_title.search(title) + if not title_match: + continue + + ep_match = re_episode.search(title) + if not ep_match: + continue + + title_idx = int(title_match.lastgroup[3:]) + video = setting.settings['video'][title_idx] + ep = int(ep_match.group(1)) + + if ep <= video['ignore_ep_under']: + Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under'])) + continue + elif ep in setting.downloaded[video['title']]: + Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep)) + continue + + link = urllib.parse.urljoin(top_url, link) + links.append(PageLink(video['title'], ep, link)) + + Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link)) + + return links + + def crawl_downlink(self, page_link, soup): + + links = [] + if '신고된 게시물' in soup.find(id='writeContents').text: + return links + + for a in soup.find_all('a'): + if 'href' in a.attrs and 'javascript:file_download' in a['href']: + jscript = a['href'] + start = len("javascript:file_download('") + end = jscript.index("','") + sub_url = jscript[start:end] + url = urllib.parse.urljoin(page_link.url, sub_url) + + file_name = jscript[end+3:-3] + file_name = urllib.parse.unquote(file_name) + + torrent = TorrentFile(page_link.title, page_link.episode, file_name, url) + links.append(torrent) + Logger.log(' found download link : {}({})'.format(file_name, url)) + + # + # links = [] + # file_table = soup.select('table#file_table') + # a_tags = file_table[0].select('a') + # torrent_links = [a for a in a_tags if '.torrent' in a.text] + # smi_links = [a for a in a_tags if '.smi' in a.text] + # + # for link in torrent_links: + # file_name = link.text.strip() + # sub_url = link.attrs['href'] + # url = urllib.parse.urljoin(page_link.url, sub_url) + # links.append(TorrentFile(page_link.title, page_link.episode, file_name, url)) + # + # Logger.log(' found download link : {}({})'.format(file_name, url)) + # + # for link in smi_links: + # file_name = link.text.strip() + # sub_url = link.attrs['href'] + # url = urllib.parse.urljoin(link.url, sub_url) + # links.append(TorrentFile(page_link.title, page_link.episode, file_name, url)) + # + # Logger.log(' found download link : {}({})'.format(file_name, url)) + + return links \ No newline at end of file diff --git a/Crawler/WorkerTorrentKim.py b/Crawler/WorkerTorrentKim.py new file mode 100644 index 0000000..781b5c7 --- /dev/null +++ b/Crawler/WorkerTorrentKim.py @@ -0,0 +1,79 @@ +import requests +import urllib +import bs4 +import re +import os + +from .Setting import Setting +from .ProxyHandler import ProxyHandler +from .Logger import Logger +from .Util import Util +from .DataType import PageLink, TorrentFile + + +class WorkerTorrentKim: + def crawl_list(self, top_url, soup, re_title, re_episode, setting): + + links = [] + tables = soup.select('table.board_list') + trs = tables[0].select('tr') + for tr in trs[1:]: + tds = tr.select('td.subject') + title = tds[0].text.strip() + link = tds[0].select('a')[0].attrs['href'] + + title_match = re_title.search(title) + if not title_match: + continue + + ep_match = re_episode.search(title) + if not ep_match: + continue + + title_idx = int(title_match.lastgroup[3:]) + video = setting.settings['video'][title_idx] + ep = int(ep_match.group(1)) + + if ep <= video['ignore_ep_under']: + Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under'])) + continue + elif ep in setting.downloaded[video['title']]: + Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep)) + continue + + link = urllib.parse.urljoin(top_url, link) + links.append(PageLink(video['title'], ep, link)) + + Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link)) + + return links + + def crawl_downlink(self, page_link): + Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url)) + + resp = self.request_get(page_link.url) + soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml') + + links = [] + file_table = soup.select('table#file_table') + a_tags = file_table[0].select('a') + torrent_links = [a for a in a_tags if '.torrent' in a.text] + smi_links = [a for a in a_tags if '.smi' in a.text] + + for link in torrent_links: + file_name = link.text.strip() + sub_url = link.attrs['href'] + url = urllib.parse.urljoin(page_link.url, sub_url) + links.append(TorrentFile(page_link.title, page_link.episode, file_name, url)) + + Logger.log(' found download link : {}({})'.format(file_name, url)) + + for link in smi_links: + file_name = link.text.strip() + sub_url = link.attrs['href'] + url = urllib.parse.urljoin(link.url, sub_url) + links.append(TorrentFile(page_link.title, page_link.episode, file_name, url)) + + Logger.log(' found download link : {}({})'.format(file_name, url)) + + return links \ No newline at end of file