Refactor site-specific scraping into worker classes and add Tocops support

This commit is contained in:
2018-06-25 00:47:32 +09:00
parent 9d7afbdc1b
commit 8c9ddb9ce8
6 changed files with 314 additions and 97 deletions

View File

@@ -8,55 +8,31 @@ from .Setting import Setting
from .ProxyHandler import ProxyHandler from .ProxyHandler import ProxyHandler
from .Logger import Logger from .Logger import Logger
from .Util import Util from .Util import Util
from .DataType import PageLink, TorrentFile
from .WorkerTorrentKim import WorkerTorrentKim
class PageLink: from .WorkerTfreeca import WorkerTfreeca
def __init__(self): from .WorkerTocops import WorkerTocops
self.title = ''
self.episode = ''
self.url = ''
def __init__(self, title, episode, url):
self.title = title
self.episode = episode
self.url = url
def __str__(self):
return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)
def __repr__(self):
return str(self)
class TorrentFile:
def __init__(self):
self.title = ''
self.episode = ''
self.file_name = ''
self.url = ''
def __init__(self, title, episode, file_name, url):
self.title = title
self.file_name = file_name
self.episode = episode
self.url = url
def __str__(self):
return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)
def __repr__(self):
return str(self)
class Crawler: class Crawler:
def __init__(self): def __init__(self):
self.setting = Setting() self.setting = Setting()
self.proxy_handler = ProxyHandler() self.proxy_handler = ProxyHandler()
self.worker = None
if len(self.setting.settings['urls']) > 0: if len(self.setting.settings['urls']) > 0:
urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0]) urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', '')) top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
self.proxy_handler.check_url = top_url self.proxy_handler.check_url = top_url
if 'torrentkim' in top_url:
self.worker = WorkerTorrentKim()
elif 'tfreeca' in top_url:
self.worker = WorkerTfreeca()
elif 'tcorea' in top_url:
self.worker = WorkerTocops()
@staticmethod @staticmethod
def print_log(files): def print_log(files):
f = open('output/log.txt', 'at') f = open('output/log.txt', 'at')
@@ -97,41 +73,10 @@ class Crawler:
resp = self.request_get(url) resp = self.request_get(url)
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml') soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I) re_title = re.compile('|'.join(['(?P<key' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I) re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
links = [] links = self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)
tables = soup.select('table.board_list')
trs = tables[0].select('tr')
for tr in trs[1:]:
tds = tr.select('td.subject')
title = tds[0].text.strip()
link = tds[0].select('a')[0].attrs['href']
title_match = re_title.search(title)
if not title_match:
continue
ep_match = re_episode.search(title)
if not ep_match:
continue
title_idx = int(title_match.lastgroup[3:])
video = self.setting.settings['video'][title_idx]
ep = int(ep_match.group(1))
if ep <= video['ignore_ep_under']:
Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
continue
elif ep in self.setting.downloaded[video['title']]:
Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
continue
link = urllib.parse.urljoin(url, link)
links.append(PageLink(video['title'], ep, link))
Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
return links return links
def crawl_downlink(self, page_link): def crawl_downlink(self, page_link):
@@ -140,28 +85,7 @@ class Crawler:
resp = self.request_get(page_link.url) resp = self.request_get(page_link.url)
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml') soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
links = [] links = self.worker.crawl_downlink(page_link, soup)
file_table = soup.select('table#file_table')
a_tags = file_table[0].select('a')
torrent_links = [a for a in a_tags if '.torrent' in a.text]
smi_links = [a for a in a_tags if '.smi' in a.text]
for link in torrent_links:
file_name = link.text.strip()
sub_url = link.attrs['href']
url = urllib.parse.urljoin(page_link.url, sub_url)
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
Logger.log(' found download link : {}({})'.format(file_name, url))
for link in smi_links:
file_name = link.text.strip()
sub_url = link.attrs['href']
url = urllib.parse.urljoin(link.url, sub_url)
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
Logger.log(' found download link : {}({})'.format(file_name, url))
return links return links
def download_files(self, file_link): def download_files(self, file_link):
@@ -172,8 +96,7 @@ class Crawler:
try: try:
resp = self.request_get(file_link.url) resp = self.request_get(file_link.url)
file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition']) file_name = file_link.file_name
file_name = urllib.parse.unquote(file_name[0])
with open(self.setting.settings['download_path'] + file_name, 'wb') as f: with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
f.write(resp.content) f.write(resp.content)

36
Crawler/DataType.py Normal file
View File

@@ -0,0 +1,36 @@
class PageLink:
    """A link to one content page for a specific episode of a tracked title.

    Attributes:
        title:   configured video title this page belongs to
        episode: episode number (int once parsed; '' when unset)
        url:     absolute URL of the content page
    """

    # Bug fix: the original defined __init__ twice; Python keeps only the
    # second definition, so the no-argument form was dead code and
    # PageLink() raised TypeError. A single constructor with defaults
    # supports both call forms.
    def __init__(self, title='', episode='', url=''):
        self.title = title
        self.episode = episode
        self.url = url

    def __str__(self):
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        return str(self)
class TorrentFile:
    """A downloadable file (torrent or subtitle) attached to a content page.

    Attributes:
        title:     configured video title
        episode:   episode number (int once parsed; '' when unset)
        file_name: file name to save the download under
        url:       absolute download URL
    """

    # Bug fix: the original defined __init__ twice; the second definition
    # overrode the first, so the no-argument form was dead code and
    # TorrentFile() raised TypeError. One constructor with defaults
    # covers both call forms.
    def __init__(self, title='', episode='', file_name='', url=''):
        self.title = title
        self.episode = episode
        self.file_name = file_name
        self.url = url

    def __str__(self):
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        return str(self)

View File

@@ -1,9 +1,13 @@
import subprocess import subprocess
import platform
class Util:
    """Miscellaneous host-level helpers."""

    @staticmethod
    def get_free_space():
        """Return available space on the root filesystem as an int.

        On Linux this parses ``df /`` output, so the value is in df's
        block units (typically 1K blocks), not bytes — TODO confirm the
        callers' threshold accounts for that. On any other platform a
        large dummy value is returned so free-space checks never block.
        """
        if platform.system() == 'Linux':
            df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE)
            output = df.communicate()[0]
            # NOTE: str() on the bytes output renders newlines as the
            # two characters '\' + 'n', hence the split on "\\n".
            device, size, used, available, percent, mountpoint = str(output).split("\\n")[1].split()
        else:
            available = 1024 * 1024 * 1024 * 1024  # dummy: effectively "plenty"
        return int(available)

78
Crawler/WorkerTfreeca.py Normal file
View File

@@ -0,0 +1,78 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTfreeca:
    """Site-specific scraper for tfreeca-style boards.

    The generic Crawler fetches and parses each page; this worker only
    extracts links from the already-parsed soup.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect links to wanted episodes.

        top_url:    base URL used to resolve relative links
        soup:       parsed board page (BeautifulSoup)
        re_title:   compiled regex with one named group 'key<N>' per
                    configured video keyword
        re_episode: compiled regex whose group(1) is the episode number
        setting:    Setting object (video list + downloaded episodes)

        Returns a list of PageLink objects, skipping episodes at or below
        'ignore_ep_under' and episodes already downloaded.
        """
        links = []
        tables = soup.select('table.b_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            # the second anchor in the subject cell holds the content link
            link = tds[0].select('a')[1].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to index videos
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Collect torrent and subtitle download links from a content page.

        page_link: PageLink of the content page (its URL is the join base)
        soup:      parsed content page (BeautifulSoup)

        Returns a list of TorrentFile objects (torrents first, then .smi).
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        # Bug fix: the .smi loop previously joined against link.url, but
        # bs4 Tags have no .url attribute (AttributeError whenever a
        # subtitle was present). Both kinds resolve against the page URL.
        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

97
Crawler/WorkerTocops.py Normal file
View File

@@ -0,0 +1,97 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTocops:
    """Site-specific scraper for tocops/tcorea-style boards.

    The generic Crawler fetches and parses each page; this worker only
    extracts links from the already-parsed soup.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect links to wanted episodes.

        top_url:    base URL used to resolve relative links
        soup:       parsed board page (BeautifulSoup)
        re_title:   compiled regex with one named group 'key<N>' per
                    configured video keyword
        re_episode: compiled regex whose group(1) is the episode number
        setting:    Setting object (video list + downloaded episodes)

        Returns a list of PageLink objects, skipping episodes at or below
        'ignore_ep_under' and episodes already downloaded.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to index videos
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Collect download links from a content page.

        page_link: PageLink of the content page (its URL is the join base)
        soup:      parsed content page (BeautifulSoup)

        Returns a list of TorrentFile objects; empty for reported posts.
        (The dead commented-out file_table scraper that followed this
        method has been removed.)
        """
        links = []
        # Reported posts ('신고된 게시물') carry no usable download links.
        if '신고된 게시물' in soup.find(id='writeContents').text:
            return links
        for a in soup.find_all('a'):
            if 'href' in a.attrs and 'javascript:file_download' in a['href']:
                # href looks like: javascript:file_download('<url>','<name>...')
                jscript = a['href']
                start = len("javascript:file_download('")
                end = jscript.index("','")
                sub_url = jscript[start:end]
                url = urllib.parse.urljoin(page_link.url, sub_url)
                # NOTE(review): [-3] assumes three trailing characters
                # after the file name — confirm against actual site markup.
                file_name = jscript[end + 3:-3]
                file_name = urllib.parse.unquote(file_name)
                torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
                links.append(torrent)
                Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

View File

@@ -0,0 +1,79 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTorrentKim:
    """Site-specific scraper for torrentkim-style boards.

    The generic Crawler fetches and parses each page; this worker only
    extracts links from the already-parsed soup.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect links to wanted episodes.

        top_url:    base URL used to resolve relative links
        soup:       parsed board page (BeautifulSoup)
        re_title:   compiled regex with one named group 'key<N>' per
                    configured video keyword
        re_episode: compiled regex whose group(1) is the episode number
        setting:    Setting object (video list + downloaded episodes)

        Returns a list of PageLink objects, skipping episodes at or below
        'ignore_ep_under' and episodes already downloaded.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.select('td.subject')
            # Consistency fix: the sibling workers guard against rows
            # without a subject cell; without this, tds[0] raises
            # IndexError on such rows.
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to index videos
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Collect torrent and subtitle download links from a content page.

        page_link: PageLink of the content page (its URL is the join base)
        soup:      parsed content page (BeautifulSoup)

        Returns a list of TorrentFile objects (torrents first, then .smi).

        Bug fixes vs. the original:
        - The original signature was (self, page_link) and it fetched the
          page via self.request_get, which this class never defines
          (AttributeError at runtime). Crawler.crawl_downlink already
          fetches/parses the page and calls worker.crawl_downlink(
          page_link, soup), matching the other workers — so this method
          now accepts the soup directly.
        - The .smi loop joined against link.url, but bs4 Tags have no
          .url attribute; both kinds resolve against the page URL.
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links