Refactor crawler into per-site workers and add Tocops support
This commit is contained in:
@@ -8,55 +8,31 @@ from .Setting import Setting
|
|||||||
from .ProxyHandler import ProxyHandler
|
from .ProxyHandler import ProxyHandler
|
||||||
from .Logger import Logger
|
from .Logger import Logger
|
||||||
from .Util import Util
|
from .Util import Util
|
||||||
|
from .DataType import PageLink, TorrentFile
|
||||||
|
|
||||||
|
from .WorkerTorrentKim import WorkerTorrentKim
|
||||||
class PageLink:
|
from .WorkerTfreeca import WorkerTfreeca
|
||||||
def __init__(self):
|
from .WorkerTocops import WorkerTocops
|
||||||
self.title = ''
|
|
||||||
self.episode = ''
|
|
||||||
self.url = ''
|
|
||||||
|
|
||||||
def __init__(self, title, episode, url):
|
|
||||||
self.title = title
|
|
||||||
self.episode = episode
|
|
||||||
self.url = url
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return str(self)
|
|
||||||
|
|
||||||
|
|
||||||
class TorrentFile:
|
|
||||||
def __init__(self):
|
|
||||||
self.title = ''
|
|
||||||
self.episode = ''
|
|
||||||
self.file_name = ''
|
|
||||||
self.url = ''
|
|
||||||
|
|
||||||
def __init__(self, title, episode, file_name, url):
|
|
||||||
self.title = title
|
|
||||||
self.file_name = file_name
|
|
||||||
self.episode = episode
|
|
||||||
self.url = url
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return str(self)
|
|
||||||
|
|
||||||
|
|
||||||
class Crawler:
|
class Crawler:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.setting = Setting()
|
self.setting = Setting()
|
||||||
self.proxy_handler = ProxyHandler()
|
self.proxy_handler = ProxyHandler()
|
||||||
|
self.worker = None
|
||||||
|
|
||||||
if len(self.setting.settings['urls']) > 0:
|
if len(self.setting.settings['urls']) > 0:
|
||||||
urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
|
urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
|
||||||
top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
|
top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
|
||||||
self.proxy_handler.check_url = top_url
|
self.proxy_handler.check_url = top_url
|
||||||
|
|
||||||
|
if 'torrentkim' in top_url:
|
||||||
|
self.worker = WorkerTorrentKim()
|
||||||
|
elif 'tfreeca' in top_url:
|
||||||
|
self.worker = WorkerTfreeca()
|
||||||
|
elif 'tcorea' in top_url:
|
||||||
|
self.worker = WorkerTocops()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def print_log(files):
|
def print_log(files):
|
||||||
f = open('output/log.txt', 'at')
|
f = open('output/log.txt', 'at')
|
||||||
@@ -97,41 +73,10 @@ class Crawler:
|
|||||||
resp = self.request_get(url)
|
resp = self.request_get(url)
|
||||||
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
|
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
|
||||||
|
|
||||||
re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
|
re_title = re.compile('|'.join(['(?P<key' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
|
||||||
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
|
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
|
||||||
|
|
||||||
links = []
|
links = self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)
|
||||||
tables = soup.select('table.board_list')
|
|
||||||
trs = tables[0].select('tr')
|
|
||||||
for tr in trs[1:]:
|
|
||||||
tds = tr.select('td.subject')
|
|
||||||
title = tds[0].text.strip()
|
|
||||||
link = tds[0].select('a')[0].attrs['href']
|
|
||||||
|
|
||||||
title_match = re_title.search(title)
|
|
||||||
if not title_match:
|
|
||||||
continue
|
|
||||||
|
|
||||||
ep_match = re_episode.search(title)
|
|
||||||
if not ep_match:
|
|
||||||
continue
|
|
||||||
|
|
||||||
title_idx = int(title_match.lastgroup[3:])
|
|
||||||
video = self.setting.settings['video'][title_idx]
|
|
||||||
ep = int(ep_match.group(1))
|
|
||||||
|
|
||||||
if ep <= video['ignore_ep_under']:
|
|
||||||
Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
|
|
||||||
continue
|
|
||||||
elif ep in self.setting.downloaded[video['title']]:
|
|
||||||
Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
|
|
||||||
continue
|
|
||||||
|
|
||||||
link = urllib.parse.urljoin(url, link)
|
|
||||||
links.append(PageLink(video['title'], ep, link))
|
|
||||||
|
|
||||||
Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
|
|
||||||
|
|
||||||
return links
|
return links
|
||||||
|
|
||||||
def crawl_downlink(self, page_link):
|
def crawl_downlink(self, page_link):
|
||||||
@@ -140,28 +85,7 @@ class Crawler:
|
|||||||
resp = self.request_get(page_link.url)
|
resp = self.request_get(page_link.url)
|
||||||
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
|
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
|
||||||
|
|
||||||
links = []
|
links = self.worker.crawl_downlink(page_link, soup)
|
||||||
file_table = soup.select('table#file_table')
|
|
||||||
a_tags = file_table[0].select('a')
|
|
||||||
torrent_links = [a for a in a_tags if '.torrent' in a.text]
|
|
||||||
smi_links = [a for a in a_tags if '.smi' in a.text]
|
|
||||||
|
|
||||||
for link in torrent_links:
|
|
||||||
file_name = link.text.strip()
|
|
||||||
sub_url = link.attrs['href']
|
|
||||||
url = urllib.parse.urljoin(page_link.url, sub_url)
|
|
||||||
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
|
|
||||||
|
|
||||||
Logger.log(' found download link : {}({})'.format(file_name, url))
|
|
||||||
|
|
||||||
for link in smi_links:
|
|
||||||
file_name = link.text.strip()
|
|
||||||
sub_url = link.attrs['href']
|
|
||||||
url = urllib.parse.urljoin(link.url, sub_url)
|
|
||||||
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
|
|
||||||
|
|
||||||
Logger.log(' found download link : {}({})'.format(file_name, url))
|
|
||||||
|
|
||||||
return links
|
return links
|
||||||
|
|
||||||
def download_files(self, file_link):
|
def download_files(self, file_link):
|
||||||
@@ -172,8 +96,7 @@ class Crawler:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
resp = self.request_get(file_link.url)
|
resp = self.request_get(file_link.url)
|
||||||
file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition'])
|
file_name = file_link.file_name
|
||||||
file_name = urllib.parse.unquote(file_name[0])
|
|
||||||
with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
|
with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
|
||||||
f.write(resp.content)
|
f.write(resp.content)
|
||||||
|
|
||||||
|
|||||||
36
Crawler/DataType.py
Normal file
36
Crawler/DataType.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
class PageLink:
    """A content page matched for one episode of a tracked video.

    The class only stores three values and renders them for log output.
    """

    def __init__(self, title='', episode='', url=''):
        """Store the title, episode and content-page URL.

        Every argument defaults to an empty string.  The class used to declare
        a zero-argument ``__init__`` followed by a three-argument one; Python
        keeps only the last definition in a class body, so ``PageLink()``
        raised ``TypeError``.  A single initializer with defaults supports both
        call forms.
        """
        self.title = title
        self.episode = episode
        self.url = url

    def __str__(self):
        """Render as ``{ title(episode): url }``."""
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of links log readably."""
        return str(self)
|
||||||
|
|
||||||
|
|
||||||
|
class TorrentFile:
    """A downloadable file (torrent or subtitle) found on a content page.

    The class only stores four values and renders them for log output.
    """

    def __init__(self, title='', episode='', file_name='', url=''):
        """Store the title, episode, file name and download URL.

        Every argument defaults to an empty string.  The class used to declare
        a zero-argument ``__init__`` followed by a four-argument one; Python
        keeps only the last definition in a class body, so ``TorrentFile()``
        raised ``TypeError``.  A single initializer with defaults supports both
        call forms while keeping the positional order unchanged.
        """
        self.title = title
        self.episode = episode
        self.file_name = file_name
        self.url = url

    def __str__(self):
        """Render as ``{ title(episode): file_name(url) }``."""
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of files log readably."""
        return str(self)
|
||||||
@@ -1,9 +1,13 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
|
import platform
|
||||||
|
|
||||||
class Util:
|
class Util:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_free_space():
|
def get_free_space():
|
||||||
df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE)
|
if platform.system() == 'Linux':
|
||||||
output = df.communicate()[0]
|
df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE)
|
||||||
device, size, used, available, percent, mountpoint = str(output).split("\\n")[1].split()
|
output = df.communicate()[0]
|
||||||
|
device, size, used, available, percent, mountpoint = str(output).split("\\n")[1].split()
|
||||||
|
else:
|
||||||
|
available = 1024*1024*1024*1024;
|
||||||
return int(available)
|
return int(available)
|
||||||
|
|||||||
78
Crawler/WorkerTfreeca.py
Normal file
78
Crawler/WorkerTfreeca.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import requests
|
||||||
|
import urllib
|
||||||
|
import bs4
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
from .Setting import Setting
|
||||||
|
from .ProxyHandler import ProxyHandler
|
||||||
|
from .Logger import Logger
|
||||||
|
from .Util import Util
|
||||||
|
from .DataType import PageLink, TorrentFile
|
||||||
|
|
||||||
|
|
||||||
|
class WorkerTfreeca:
    """Scraping rules for the tfreeca board layout (list page and content page)."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Collect content-page links for tracked videos from a board listing.

        Args:
            top_url: URL against which relative hrefs in the listing are resolved.
            soup: parsed listing page; must support ``select``.
            re_title: compiled pattern whose matching named group is ``key<i>``,
                where ``i`` indexes ``setting.settings['video']``.
            re_episode: compiled pattern whose group 1 is the episode number.
            setting: object exposing ``settings['video']`` and ``downloaded``.

        Returns:
            A list of ``PageLink`` objects, in listing order.

        Raises:
            IndexError: if the page has no ``table.b_list`` or a subject cell
                has fewer than two anchors.
        """
        links = []
        tables = soup.select('table.b_list')
        trs = tables[0].select('tr')
        # The first row is skipped (presumably the table header -- confirm).
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue

            title = tds[0].text.strip()
            # On this board the second anchor in the subject cell is used.
            link = tds[0].select('a')[1].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue

            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # Group names look like "key<N>"; strip the prefix to get the index.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))

            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))

        return links

    def crawl_downlink(self, page_link, soup):
        """Collect torrent and subtitle download links from a content page.

        Args:
            page_link: the ``PageLink`` being processed; its ``url`` is the
                base for resolving relative hrefs.
            soup: parsed content page; must support ``select``.

        Returns:
            A list of ``TorrentFile`` objects: all ``.torrent`` anchors first,
            then all ``.smi`` anchors.

        Raises:
            IndexError: if the page has no ``table#file_table``.
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]

        # Both kinds are handled identically.  The subtitle loop used to
        # resolve hrefs against ``link.url`` (the anchor, not the page link),
        # which gives no usable base URL; every href is now resolved against
        # the content page URL.
        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))

            Logger.log(' found download link : {}({})'.format(file_name, url))

        return links
|
||||||
97
Crawler/WorkerTocops.py
Normal file
97
Crawler/WorkerTocops.py
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
import requests
|
||||||
|
import urllib
|
||||||
|
import bs4
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
from .Setting import Setting
|
||||||
|
from .ProxyHandler import ProxyHandler
|
||||||
|
from .Logger import Logger
|
||||||
|
from .Util import Util
|
||||||
|
from .DataType import PageLink, TorrentFile
|
||||||
|
|
||||||
|
class WorkerTocops:
    """Scraping rules for the Tocops board layout (list page and content page)."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Collect content-page links for tracked videos from a board listing.

        Args:
            top_url: URL against which relative hrefs in the listing are resolved.
            soup: parsed listing page; must support ``select``.
            re_title: compiled pattern whose matching named group is ``key<i>``,
                where ``i`` indexes ``setting.settings['video']``.
            re_episode: compiled pattern whose group 1 is the episode number.
            setting: object exposing ``settings['video']`` and ``downloaded``.

        Returns:
            A list of ``PageLink`` objects, in listing order.

        Raises:
            IndexError: if the page has no ``table.board_list`` or a subject
                cell has no anchor.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        # The first row is skipped (presumably the table header -- confirm).
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue

            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue

            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # Group names look like "key<N>"; strip the prefix to get the index.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))

            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))

        return links

    def crawl_downlink(self, page_link, soup):
        """Collect download links from ``javascript:file_download(...)`` anchors.

        Args:
            page_link: the ``PageLink`` being processed; its ``url`` is the
                base for resolving the extracted download path.
            soup: parsed content page; must support ``find`` and ``find_all``.

        Returns:
            A list of ``TorrentFile`` objects, empty when the post body carries
            the "reported post" marker text.

        Raises:
            ValueError: if a matching href does not contain the ``','``
                separator between its two arguments.
        """
        links = []

        # The marker text translates roughly to "reported post".  The element
        # may be absent, in which case ``find`` returns None; that is treated
        # as "not reported" instead of failing on ``None.text``.
        contents = soup.find(id='writeContents')
        if contents is not None and '신고된 게시물' in contents.text:
            return links

        # Hrefs are assumed to start with this prefix once they contain
        # 'javascript:file_download' -- the slice below relies on it.
        start = len("javascript:file_download('")

        for a in soup.find_all('a'):
            if 'href' in a.attrs and 'javascript:file_download' in a['href']:
                jscript = a['href']
                end = jscript.index("','")
                sub_url = jscript[start:end]
                url = urllib.parse.urljoin(page_link.url, sub_url)

                # Second argument: skip the 3-char "','" separator and drop
                # the last 3 characters (presumably "');" -- confirm).
                file_name = jscript[end+3:-3]
                file_name = urllib.parse.unquote(file_name)

                torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
                links.append(torrent)
                Logger.log(' found download link : {}({})'.format(file_name, url))

        return links
|
||||||
79
Crawler/WorkerTorrentKim.py
Normal file
79
Crawler/WorkerTorrentKim.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
import requests
|
||||||
|
import urllib
|
||||||
|
import bs4
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
from .Setting import Setting
|
||||||
|
from .ProxyHandler import ProxyHandler
|
||||||
|
from .Logger import Logger
|
||||||
|
from .Util import Util
|
||||||
|
from .DataType import PageLink, TorrentFile
|
||||||
|
|
||||||
|
|
||||||
|
class WorkerTorrentKim:
    """Scraping rules for the TorrentKim board layout (list page and content page)."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Collect content-page links for tracked videos from a board listing.

        Args:
            top_url: URL against which relative hrefs in the listing are resolved.
            soup: parsed listing page; must support ``select``.
            re_title: compiled pattern whose matching named group is ``key<i>``,
                where ``i`` indexes ``setting.settings['video']``.
            re_episode: compiled pattern whose group 1 is the episode number.
            setting: object exposing ``settings['video']`` and ``downloaded``.

        Returns:
            A list of ``PageLink`` objects, in listing order.

        Raises:
            IndexError: if the page has no ``table.board_list`` or a subject
                cell has no anchor.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        # The first row is skipped (presumably the table header -- confirm).
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            # Rows without a subject cell are skipped, as the other workers do,
            # instead of failing on ``tds[0]``.
            if len(tds) < 1:
                continue

            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue

            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # Group names look like "key<N>"; strip the prefix to get the index.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))

            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))

        return links

    def crawl_downlink(self, page_link, soup=None):
        """Collect torrent and subtitle download links from a content page.

        Args:
            page_link: the ``PageLink`` being processed; its ``url`` is the
                base for resolving relative hrefs.
            soup: parsed content page.  Optional only for backward
                compatibility: the method used to take ``page_link`` alone and
                fetch the page itself, while the other workers in this package
                receive the parsed page as a second argument.

        Returns:
            A list of ``TorrentFile`` objects: all ``.torrent`` anchors first,
            then all ``.smi`` anchors.

        Raises:
            IndexError: if the page has no ``table#file_table``.
            AttributeError: on the legacy one-argument path when the instance
                has no ``request_get`` (this class does not define one).
        """
        if soup is None:
            # Legacy path, kept as it was: log, fetch and parse the page here.
            Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))

            resp = self.request_get(page_link.url)
            soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')

        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]

        # Both kinds are handled identically.  The subtitle loop used to
        # resolve hrefs against ``link.url`` (the anchor, not the page link),
        # which gives no usable base URL; every href is now resolved against
        # the content page URL.
        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))

            Logger.log(' found download link : {}({})'.format(file_name, url))

        return links
|
||||||
Reference in New Issue
Block a user