Refactor site-specific scraping into worker classes and add Tocops support

This commit is contained in:
2018-06-25 00:47:32 +09:00
parent 9d7afbdc1b
commit 8c9ddb9ce8
6 changed files with 314 additions and 97 deletions

View File

@@ -8,55 +8,31 @@ from .Setting import Setting
from .ProxyHandler import ProxyHandler from .ProxyHandler import ProxyHandler
from .Logger import Logger from .Logger import Logger
from .Util import Util from .Util import Util
from .DataType import PageLink, TorrentFile
from .WorkerTorrentKim import WorkerTorrentKim
class PageLink: from .WorkerTfreeca import WorkerTfreeca
def __init__(self): from .WorkerTocops import WorkerTocops
self.title = ''
self.episode = ''
self.url = ''
def __init__(self, title, episode, url):
self.title = title
self.episode = episode
self.url = url
def __str__(self):
return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)
def __repr__(self):
return str(self)
class TorrentFile:
def __init__(self):
self.title = ''
self.episode = ''
self.file_name = ''
self.url = ''
def __init__(self, title, episode, file_name, url):
self.title = title
self.file_name = file_name
self.episode = episode
self.url = url
def __str__(self):
return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)
def __repr__(self):
return str(self)
class Crawler: class Crawler:
def __init__(self): def __init__(self):
self.setting = Setting() self.setting = Setting()
self.proxy_handler = ProxyHandler() self.proxy_handler = ProxyHandler()
self.worker = None
if len(self.setting.settings['urls']) > 0: if len(self.setting.settings['urls']) > 0:
urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0]) urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', '')) top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
self.proxy_handler.check_url = top_url self.proxy_handler.check_url = top_url
if 'torrentkim' in top_url:
self.worker = WorkerTorrentKim()
elif 'tfreeca' in top_url:
self.worker = WorkerTfreeca()
elif 'tcorea' in top_url:
self.worker = WorkerTocops()
@staticmethod @staticmethod
def print_log(files): def print_log(files):
f = open('output/log.txt', 'at') f = open('output/log.txt', 'at')
@@ -97,41 +73,10 @@ class Crawler:
resp = self.request_get(url) resp = self.request_get(url)
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml') soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I) re_title = re.compile('|'.join(['(?P<key' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I) re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
links = [] links = self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)
tables = soup.select('table.board_list')
trs = tables[0].select('tr')
for tr in trs[1:]:
tds = tr.select('td.subject')
title = tds[0].text.strip()
link = tds[0].select('a')[0].attrs['href']
title_match = re_title.search(title)
if not title_match:
continue
ep_match = re_episode.search(title)
if not ep_match:
continue
title_idx = int(title_match.lastgroup[3:])
video = self.setting.settings['video'][title_idx]
ep = int(ep_match.group(1))
if ep <= video['ignore_ep_under']:
Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
continue
elif ep in self.setting.downloaded[video['title']]:
Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
continue
link = urllib.parse.urljoin(url, link)
links.append(PageLink(video['title'], ep, link))
Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
return links return links
def crawl_downlink(self, page_link): def crawl_downlink(self, page_link):
@@ -140,28 +85,7 @@ class Crawler:
resp = self.request_get(page_link.url) resp = self.request_get(page_link.url)
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml') soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
links = [] links = self.worker.crawl_downlink(page_link, soup)
file_table = soup.select('table#file_table')
a_tags = file_table[0].select('a')
torrent_links = [a for a in a_tags if '.torrent' in a.text]
smi_links = [a for a in a_tags if '.smi' in a.text]
for link in torrent_links:
file_name = link.text.strip()
sub_url = link.attrs['href']
url = urllib.parse.urljoin(page_link.url, sub_url)
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
Logger.log(' found download link : {}({})'.format(file_name, url))
for link in smi_links:
file_name = link.text.strip()
sub_url = link.attrs['href']
url = urllib.parse.urljoin(link.url, sub_url)
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
Logger.log(' found download link : {}({})'.format(file_name, url))
return links return links
def download_files(self, file_link): def download_files(self, file_link):
@@ -172,8 +96,7 @@ class Crawler:
try: try:
resp = self.request_get(file_link.url) resp = self.request_get(file_link.url)
file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition']) file_name = file_link.file_name
file_name = urllib.parse.unquote(file_name[0])
with open(self.setting.settings['download_path'] + file_name, 'wb') as f: with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
f.write(resp.content) f.write(resp.content)

36
Crawler/DataType.py Normal file
View File

@@ -0,0 +1,36 @@
class PageLink:
    """A link to one content page for a specific episode of a tracked title.

    Attributes:
        title:   configured video title this page belongs to
        episode: episode number (int once parsed; '' when unset)
        url:     absolute URL of the content page
    """

    # Bug fix: the original defined __init__ twice; Python keeps only the
    # second definition, so the no-argument form was dead code and
    # PageLink() raised TypeError. A single constructor with defaults
    # supports both call forms.
    def __init__(self, title='', episode='', url=''):
        self.title = title
        self.episode = episode
        self.url = url

    def __str__(self):
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        return str(self)
class TorrentFile:
    """A downloadable file (torrent or subtitle) attached to a content page.

    Attributes:
        title:     configured video title
        episode:   episode number (int once parsed; '' when unset)
        file_name: file name to save the download under
        url:       absolute download URL
    """

    # Bug fix: the original defined __init__ twice; the second definition
    # overrode the first, so the no-argument form was dead code and
    # TorrentFile() raised TypeError. One constructor with defaults
    # covers both call forms.
    def __init__(self, title='', episode='', file_name='', url=''):
        self.title = title
        self.episode = episode
        self.file_name = file_name
        self.url = url

    def __str__(self):
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        return str(self)

View File

@@ -1,9 +1,13 @@
import subprocess import subprocess
import platform
class Util:
    """Miscellaneous host-level helpers."""

    @staticmethod
    def get_free_space():
        """Return available space on the root filesystem as an int.

        On Linux this parses ``df /`` output, so the value is in df's
        block units (typically 1K blocks), not bytes — TODO confirm the
        callers' threshold accounts for that. On any other platform a
        large dummy value is returned so free-space checks never block.
        """
        if platform.system() == 'Linux':
            df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE)
            output = df.communicate()[0]
            # NOTE: str() on the bytes output renders newlines as the
            # two characters '\' + 'n', hence the split on "\\n".
            device, size, used, available, percent, mountpoint = str(output).split("\\n")[1].split()
        else:
            available = 1024 * 1024 * 1024 * 1024  # dummy: effectively "plenty"
        return int(available)

78
Crawler/WorkerTfreeca.py Normal file
View File

@@ -0,0 +1,78 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTfreeca:
    """Site-specific scraper for tfreeca-style boards.

    The generic Crawler fetches and parses each page; this worker only
    extracts links from the already-parsed soup.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect links to wanted episodes.

        top_url:    base URL used to resolve relative links
        soup:       parsed board page (BeautifulSoup)
        re_title:   compiled regex with one named group 'key<N>' per
                    configured video keyword
        re_episode: compiled regex whose group(1) is the episode number
        setting:    Setting object (video list + downloaded episodes)

        Returns a list of PageLink objects, skipping episodes at or below
        'ignore_ep_under' and episodes already downloaded.
        """
        links = []
        tables = soup.select('table.b_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            # the second anchor in the subject cell holds the content link
            link = tds[0].select('a')[1].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to index videos
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Collect torrent and subtitle download links from a content page.

        page_link: PageLink of the content page (its URL is the join base)
        soup:      parsed content page (BeautifulSoup)

        Returns a list of TorrentFile objects (torrents first, then .smi).
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        # Bug fix: the .smi loop previously joined against link.url, but
        # bs4 Tags have no .url attribute (AttributeError whenever a
        # subtitle was present). Both kinds resolve against the page URL.
        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

97
Crawler/WorkerTocops.py Normal file
View File

@@ -0,0 +1,97 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTocops:
    """Site-specific scraper for tocops/tcorea-style boards.

    The generic Crawler fetches and parses each page; this worker only
    extracts links from the already-parsed soup.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect links to wanted episodes.

        top_url:    base URL used to resolve relative links
        soup:       parsed board page (BeautifulSoup)
        re_title:   compiled regex with one named group 'key<N>' per
                    configured video keyword
        re_episode: compiled regex whose group(1) is the episode number
        setting:    Setting object (video list + downloaded episodes)

        Returns a list of PageLink objects, skipping episodes at or below
        'ignore_ep_under' and episodes already downloaded.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to index videos
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Collect download links from a content page.

        page_link: PageLink of the content page (its URL is the join base)
        soup:      parsed content page (BeautifulSoup)

        Returns a list of TorrentFile objects; empty for reported posts.
        (The dead commented-out file_table scraper that followed this
        method has been removed.)
        """
        links = []
        # Reported posts ('신고된 게시물') carry no usable download links.
        if '신고된 게시물' in soup.find(id='writeContents').text:
            return links
        for a in soup.find_all('a'):
            if 'href' in a.attrs and 'javascript:file_download' in a['href']:
                # href looks like: javascript:file_download('<url>','<name>...')
                jscript = a['href']
                start = len("javascript:file_download('")
                end = jscript.index("','")
                sub_url = jscript[start:end]
                url = urllib.parse.urljoin(page_link.url, sub_url)
                # NOTE(review): [-3] assumes three trailing characters
                # after the file name — confirm against actual site markup.
                file_name = jscript[end + 3:-3]
                file_name = urllib.parse.unquote(file_name)
                torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
                links.append(torrent)
                Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

View File

@@ -0,0 +1,79 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTorrentKim:
    """Site-specific scraper for torrentkim-style boards.

    The generic Crawler fetches and parses each page; this worker only
    extracts links from the already-parsed soup.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect links to wanted episodes.

        top_url:    base URL used to resolve relative links
        soup:       parsed board page (BeautifulSoup)
        re_title:   compiled regex with one named group 'key<N>' per
                    configured video keyword
        re_episode: compiled regex whose group(1) is the episode number
        setting:    Setting object (video list + downloaded episodes)

        Returns a list of PageLink objects, skipping episodes at or below
        'ignore_ep_under' and episodes already downloaded.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.select('td.subject')
            # Consistency fix: the sibling workers guard against rows
            # without a subject cell; without this, tds[0] raises
            # IndexError on such rows.
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to index videos
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Collect torrent and subtitle download links from a content page.

        page_link: PageLink of the content page (its URL is the join base)
        soup:      parsed content page (BeautifulSoup)

        Returns a list of TorrentFile objects (torrents first, then .smi).

        Bug fixes vs. the original:
        - The original signature was (self, page_link) and it fetched the
          page via self.request_get, which this class never defines
          (AttributeError at runtime). Crawler.crawl_downlink already
          fetches/parses the page and calls worker.crawl_downlink(
          page_link, soup), matching the other workers — so this method
          now accepts the soup directly.
        - The .smi loop joined against link.url, but bs4 Tags have no
          .url attribute; both kinds resolve against the page URL.
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links