From 8c9ddb9ce84d0199a4c469fa6e5056ff7cc3fe17 Mon Sep 17 00:00:00 2001
From: mjjo53 <mjjo53@gmail.com>
Date: Mon, 25 Jun 2018 00:47:32 +0900
Subject: [PATCH] refactoring and add tocops

---
 Crawler/Crawler.py          | 111 ++++++------------------------------
 Crawler/DataType.py         |  36 ++++++++++++
 Crawler/Util.py             |  10 +++-
 Crawler/WorkerTfreeca.py    |  78 +++++++++++++++++++++++++
 Crawler/WorkerTocops.py     |  97 +++++++++++++++++++++++++++++++
 Crawler/WorkerTorrentKim.py |  79 +++++++++++++++++++++++++
 6 files changed, 314 insertions(+), 97 deletions(-)
 create mode 100644 Crawler/DataType.py
 create mode 100644 Crawler/WorkerTfreeca.py
 create mode 100644 Crawler/WorkerTocops.py
 create mode 100644 Crawler/WorkerTorrentKim.py

diff --git a/Crawler/Crawler.py b/Crawler/Crawler.py
index 6680838..29993ec 100755
--- a/Crawler/Crawler.py
+++ b/Crawler/Crawler.py
@@ -8,55 +8,31 @@ from .Setting import Setting
 from .ProxyHandler import ProxyHandler
 from .Logger import Logger
 from .Util import Util
+from .DataType import PageLink, TorrentFile
 
-
-class PageLink:
-	def __init__(self):
-		self.title = ''
-		self.episode = ''
-		self.url = ''
-
-	def __init__(self, title, episode, url):
-		self.title = title
-		self.episode = episode
-		self.url = url
-
-	def __str__(self):
-		return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)
-
-	def __repr__(self):
-		return str(self)
-
-
-class TorrentFile:
-	def __init__(self):
-		self.title = ''
-		self.episode = ''
-		self.file_name = ''
-		self.url = ''
-
-	def __init__(self, title, episode, file_name, url):
-		self.title = title
-		self.file_name = file_name
-		self.episode = episode
-		self.url = url
-
-	def __str__(self):
-		return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)
-
-	def __repr__(self):
-		return str(self)
+from .WorkerTorrentKim import WorkerTorrentKim
+from .WorkerTfreeca import WorkerTfreeca
+from .WorkerTocops import  WorkerTocops
 
 
 class Crawler:
 	def __init__(self):
 		self.setting = Setting()
 		self.proxy_handler = ProxyHandler()
+		self.worker = None
+		# worker stays None when no URL is configured or the host matches no known site.
 		if len(self.setting.settings['urls']) > 0:
 			urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
 			top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
 			self.proxy_handler.check_url = top_url
 
+			if 'torrentkim' in top_url:
+				self.worker = WorkerTorrentKim()
+			elif 'tfreeca' in top_url:
+				self.worker = WorkerTfreeca()
+			elif 'tcorea' in top_url:
+				self.worker = WorkerTocops()
+
 	@staticmethod
 	def print_log(files):
 		f = open('output/log.txt', 'at')
@@ -97,41 +73,10 @@ class Crawler:
 		resp = self.request_get(url)
 		soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
 
-		re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
+		re_title = re.compile('|'.join(['(?P<key' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
 		re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
 
-		links = []
-		tables = soup.select('table.board_list')
-		trs = tables[0].select('tr')
-		for tr in trs[1:]:
-			tds = tr.select('td.subject')
-			title = tds[0].text.strip()
-			link = tds[0].select('a')[0].attrs['href']
-
-			title_match = re_title.search(title)
-			if not title_match:
-				continue
-
-			ep_match = re_episode.search(title)
-			if not ep_match:
-				continue
-
-			title_idx = int(title_match.lastgroup[3:])
-			video = self.setting.settings['video'][title_idx]
-			ep = int(ep_match.group(1))
-
-			if ep <= video['ignore_ep_under']:
-				Logger.log('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
-				continue
-			elif ep in self.setting.downloaded[video['title']]:
-				Logger.log('    {}({}) is ignored (already downloaded)'.format(video['title'], ep))
-				continue
-
-			link = urllib.parse.urljoin(url, link)
-			links.append(PageLink(video['title'], ep, link))
-
-			Logger.log('   found content page : {}({}), {}'.format(video['title'], ep, link))
-
+		links = self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)
 		return links
 
 	def crawl_downlink(self, page_link):
@@ -140,28 +85,7 @@ class Crawler:
 		resp = self.request_get(page_link.url)
 		soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
 
-		links = []
-		file_table = soup.select('table#file_table')
-		a_tags = file_table[0].select('a')
-		torrent_links = [a for a in a_tags if '.torrent' in a.text]
-		smi_links = [a for a in a_tags if '.smi' in a.text]
-
-		for link in torrent_links:
-			file_name = link.text.strip()
-			sub_url = link.attrs['href']
-			url = urllib.parse.urljoin(page_link.url, sub_url)
-			links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
-
-			Logger.log('    found download link : {}({})'.format(file_name, url))
-
-		for link in smi_links:
-			file_name = link.text.strip()
-			sub_url = link.attrs['href']
-			url = urllib.parse.urljoin(link.url, sub_url)
-			links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
-
-			Logger.log('    found download link : {}({})'.format(file_name, url))
-
+		links = self.worker.crawl_downlink(page_link, soup)
 		return links
 
 	def download_files(self, file_link):
@@ -172,8 +96,7 @@ class Crawler:
 
 		try:
 			resp = self.request_get(file_link.url)
-			file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition'])
-			file_name = urllib.parse.unquote(file_name[0])
+			file_name = file_link.file_name
 			with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
 				f.write(resp.content)
 
diff --git a/Crawler/DataType.py b/Crawler/DataType.py
new file mode 100644
index 0000000..a514631
--- /dev/null
+++ b/Crawler/DataType.py
@@ -0,0 +1,36 @@
+class PageLink:
+	"""A content page found on a list page: video title, episode and page URL."""
+
+	def __init__(self, title='', episode='', url=''):
+		# One constructor with defaults. Python has no overloading, so the
+		# former zero-argument __init__ was shadowed by the later definition
+		# and PageLink() raised TypeError.
+		self.title = title
+		self.episode = episode
+		self.url = url
+
+	def __str__(self):
+		return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)
+
+	def __repr__(self):
+		return str(self)
+
+
+class TorrentFile:
+	"""A downloadable file of one episode: title, episode, file name and URL."""
+
+	def __init__(self, title='', episode='', file_name='', url=''):
+		# One constructor with defaults. Python has no overloading, so the
+		# former zero-argument __init__ was shadowed by the later definition
+		# and TorrentFile() raised TypeError. Positional order is unchanged:
+		# title, episode, file_name, url.
+		self.title = title
+		self.episode = episode
+		self.file_name = file_name
+		self.url = url
+
+	def __str__(self):
+		return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)
+
+	def __repr__(self):
+		return str(self)
diff --git a/Crawler/Util.py b/Crawler/Util.py
index 115209f..4d845c5 100644
--- a/Crawler/Util.py
+++ b/Crawler/Util.py
@@ -1,9 +1,19 @@
 import subprocess
+import platform
 
 class Util:
 	@staticmethod
 	def get_free_space():
-		df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE)
-		output = df.communicate()[0]
-		device, size, used, available, percent, mountpoint = str(output).split("\\n")[1].split()
+		"""Return the 'Available' column of `df /` as an int.
+
+		`df` is only consulted when platform.system() is 'Linux'; on any
+		other platform a fixed value (1024 ** 4) is returned instead.
+		"""
+		if platform.system() == 'Linux':
+			df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE)
+			output = df.communicate()[0]
+			# Decode the bytes and take the second line, which holds the figures for '/'.
+			device, size, used, available, percent, mountpoint = output.decode().splitlines()[1].split()
+		else:
+			available = 1024 * 1024 * 1024 * 1024
 		return int(available)
diff --git a/Crawler/WorkerTfreeca.py b/Crawler/WorkerTfreeca.py
new file mode 100644
index 0000000..a46b6c0
--- /dev/null
+++ b/Crawler/WorkerTfreeca.py
@@ -0,0 +1,85 @@
+import requests
+import urllib
+import bs4
+import re
+import os
+
+from .Setting import Setting
+from .ProxyHandler import ProxyHandler
+from .Logger import Logger
+from .Util import Util
+from .DataType import PageLink, TorrentFile
+
+
+class WorkerTfreeca:
+	"""Parses tfreeca list pages and content pages."""
+
+	def crawl_list(self, top_url, soup, re_title, re_episode, setting):
+		"""Return PageLinks for wanted episodes that are not downloaded yet.
+
+		top_url is the list page URL (relative hrefs are resolved against it),
+		soup the parsed list page, re_title a pattern with one named group
+		'key<i>' per entry of setting.settings['video'], and re_episode a
+		pattern whose group 1 is the episode number.
+		"""
+		links = []
+		tables = soup.select('table.b_list')
+		trs = tables[0].select('tr')
+		for tr in trs[1:]:
+			tds = tr.select('td.subject')
+			if len(tds) < 1:
+				continue
+
+			title = tds[0].text.strip()
+			# The second anchor of the subject cell is taken as the page link.
+			link = tds[0].select('a')[1].attrs['href']
+
+			title_match = re_title.search(title)
+			if not title_match:
+				continue
+
+			ep_match = re_episode.search(title)
+			if not ep_match:
+				continue
+
+			# lastgroup is 'key<i>'; dropping 'key' leaves the video index.
+			title_idx = int(title_match.lastgroup[3:])
+			video = setting.settings['video'][title_idx]
+			ep = int(ep_match.group(1))
+
+			if ep <= video['ignore_ep_under']:
+				Logger.log('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
+				continue
+			elif ep in setting.downloaded[video['title']]:
+				Logger.log('    {}({}) is ignored (already downloaded)'.format(video['title'], ep))
+				continue
+
+			link = urllib.parse.urljoin(top_url, link)
+			links.append(PageLink(video['title'], ep, link))
+
+			Logger.log('   found content page : {}({}), {}'.format(video['title'], ep, link))
+
+		return links
+
+	def crawl_downlink(self, page_link, soup):
+		"""Return TorrentFiles for the .torrent and .smi links of a content page.
+
+		Torrent links come first, then subtitle links; hrefs are resolved
+		against page_link.url.
+		"""
+		links = []
+		file_table = soup.select('table#file_table')
+		a_tags = file_table[0].select('a')
+		torrent_links = [a for a in a_tags if '.torrent' in a.text]
+		smi_links = [a for a in a_tags if '.smi' in a.text]
+
+		for link in torrent_links + smi_links:
+			file_name = link.text.strip()
+			sub_url = link.attrs['href']
+			# Resolve against the page URL; the anchor tag itself carries no base URL.
+			url = urllib.parse.urljoin(page_link.url, sub_url)
+			links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
+
+			Logger.log('    found download link : {}({})'.format(file_name, url))
+
+		return links
\ No newline at end of file
diff --git a/Crawler/WorkerTocops.py b/Crawler/WorkerTocops.py
new file mode 100644
index 0000000..06219c5
--- /dev/null
+++ b/Crawler/WorkerTocops.py
@@ -0,0 +1,94 @@
+import requests
+import urllib
+import bs4
+import re
+import os
+
+from .Setting import Setting
+from .ProxyHandler import ProxyHandler
+from .Logger import Logger
+from .Util import Util
+from .DataType import PageLink, TorrentFile
+
+class WorkerTocops:
+	"""Parses tocops list pages and content pages."""
+
+	def crawl_list(self, top_url, soup, re_title, re_episode, setting):
+		"""Return PageLinks for wanted episodes that are not downloaded yet.
+
+		top_url is the list page URL (relative hrefs are resolved against it),
+		soup the parsed list page, re_title a pattern with one named group
+		'key<i>' per entry of setting.settings['video'], and re_episode a
+		pattern whose group 1 is the episode number.
+		"""
+		links = []
+		tables = soup.select('table.board_list')
+		trs = tables[0].select('tr')
+		for tr in trs[1:]:
+			tds = tr.select('td.subject')
+			if len(tds) < 1:
+				continue
+
+			title = tds[0].text.strip()
+			link = tds[0].select('a')[0].attrs['href']
+
+			title_match = re_title.search(title)
+			if not title_match:
+				continue
+
+			ep_match = re_episode.search(title)
+			if not ep_match:
+				continue
+
+			# lastgroup is 'key<i>'; dropping 'key' leaves the video index.
+			title_idx = int(title_match.lastgroup[3:])
+			video = setting.settings['video'][title_idx]
+			ep = int(ep_match.group(1))
+
+			if ep <= video['ignore_ep_under']:
+				Logger.log('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
+				continue
+			elif ep in setting.downloaded[video['title']]:
+				Logger.log('    {}({}) is ignored (already downloaded)'.format(video['title'], ep))
+				continue
+
+			link = urllib.parse.urljoin(top_url, link)
+			links.append(PageLink(video['title'], ep, link))
+
+			Logger.log('   found content page : {}({}), {}'.format(video['title'], ep, link))
+
+		return links
+
+	def crawl_downlink(self, page_link, soup):
+		"""Return TorrentFiles for the file_download links of a content page.
+
+		Links are read from anchors whose href contains
+		javascript:file_download('<url>','<quoted file name>'...
+		A page carrying the reported-post notice yields an empty list.
+		"""
+		links = []
+		# The content element may be missing; only skip the page when it is
+		# present and contains the notice text ("reported post" in Korean).
+		contents = soup.find(id='writeContents')
+		if contents is not None and '신고된 게시물' in contents.text:
+			return links
+
+		prefix = "javascript:file_download('"
+		for a in soup.find_all('a'):
+			jscript = a.attrs.get('href', '')
+			if 'javascript:file_download' not in jscript:
+				continue
+
+			# First quoted argument: the download URL, relative to the page.
+			end = jscript.index("','")
+			sub_url = jscript[len(prefix):end]
+			url = urllib.parse.urljoin(page_link.url, sub_url)
+
+			# Second argument: skip the "','" separator and drop the last three
+			# characters (presumably the closing "');" of the call).
+			file_name = urllib.parse.unquote(jscript[end + 3:-3])
+
+			links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
+			Logger.log('    found download link : {}({})'.format(file_name, url))
+
+		return links
\ No newline at end of file
diff --git a/Crawler/WorkerTorrentKim.py b/Crawler/WorkerTorrentKim.py
new file mode 100644
index 0000000..781b5c7
--- /dev/null
+++ b/Crawler/WorkerTorrentKim.py
@@ -0,0 +1,85 @@
+import requests
+import urllib
+import bs4
+import re
+import os
+
+from .Setting import Setting
+from .ProxyHandler import ProxyHandler
+from .Logger import Logger
+from .Util import Util
+from .DataType import PageLink, TorrentFile
+
+
+class WorkerTorrentKim:
+	"""Parses torrentkim list pages and content pages."""
+
+	def crawl_list(self, top_url, soup, re_title, re_episode, setting):
+		"""Return PageLinks for wanted episodes that are not downloaded yet.
+
+		top_url is the list page URL (relative hrefs are resolved against it),
+		soup the parsed list page, re_title a pattern with one named group
+		'key<i>' per entry of setting.settings['video'], and re_episode a
+		pattern whose group 1 is the episode number.
+		"""
+		links = []
+		tables = soup.select('table.board_list')
+		trs = tables[0].select('tr')
+		for tr in trs[1:]:
+			tds = tr.select('td.subject')
+			if len(tds) < 1:
+				continue
+
+			title = tds[0].text.strip()
+			link = tds[0].select('a')[0].attrs['href']
+
+			title_match = re_title.search(title)
+			if not title_match:
+				continue
+
+			ep_match = re_episode.search(title)
+			if not ep_match:
+				continue
+
+			# lastgroup is 'key<i>'; dropping 'key' leaves the video index.
+			title_idx = int(title_match.lastgroup[3:])
+			video = setting.settings['video'][title_idx]
+			ep = int(ep_match.group(1))
+
+			if ep <= video['ignore_ep_under']:
+				Logger.log('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
+				continue
+			elif ep in setting.downloaded[video['title']]:
+				Logger.log('    {}({}) is ignored (already downloaded)'.format(video['title'], ep))
+				continue
+
+			link = urllib.parse.urljoin(top_url, link)
+			links.append(PageLink(video['title'], ep, link))
+
+			Logger.log('   found content page : {}({}), {}'.format(video['title'], ep, link))
+
+		return links
+
+	def crawl_downlink(self, page_link, soup):
+		"""Return TorrentFiles for the .torrent and .smi links of a content page.
+
+		The caller fetches and parses the page and passes it in as soup, the
+		same way as for the other workers. Torrent links come first, then
+		subtitle links; hrefs are resolved against page_link.url.
+		"""
+		links = []
+		file_table = soup.select('table#file_table')
+		a_tags = file_table[0].select('a')
+		torrent_links = [a for a in a_tags if '.torrent' in a.text]
+		smi_links = [a for a in a_tags if '.smi' in a.text]
+
+		for link in torrent_links + smi_links:
+			file_name = link.text.strip()
+			sub_url = link.attrs['href']
+			# Resolve against the page URL; the anchor tag itself carries no base URL.
+			url = urllib.parse.urljoin(page_link.url, sub_url)
+			links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
+
+			Logger.log('    found download link : {}({})'.format(file_name, url))
+
+		return links
\ No newline at end of file