Refactor crawler and add Tocops worker

This commit is contained in:
2018-06-25 00:47:32 +09:00
parent 9d7afbdc1b
commit 8c9ddb9ce8
6 changed files with 314 additions and 97 deletions

97
Crawler/WorkerTocops.py Normal file
View File

@@ -0,0 +1,97 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTocops:
    """Crawler worker for the 'tocops' board site.

    crawl_list extracts content-page links from a board list page;
    crawl_downlink extracts torrent download links from a content page.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect new episode pages.

        top_url    -- base URL used to resolve relative hrefs
        soup       -- bs4.BeautifulSoup of the board list page
        re_title   -- compiled regex with named groups; the name of the
                      matching group encodes an index into
                      setting.settings['video']
        re_episode -- compiled regex; group(1) is the episode number
        setting    -- Setting with .settings['video'] (list of video dicts)
                      and .downloaded (title -> downloaded episode numbers)

        Returns a list of PageLink for episodes that are neither already
        downloaded nor at/below the video's 'ignore_ep_under' threshold.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # trs[0] is the header row
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is the NAME of the matched named group; everything
            # after its first 3 characters is assumed to be the numeric video
            # index (e.g. 'idx7' -> 7) -- TODO confirm against the code that
            # builds re_title.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Extract torrent download links from a content page.

        page_link -- PageLink (provides .url, .title, .episode)
        soup      -- bs4.BeautifulSoup of the content page

        Returns a list of TorrentFile; empty when the post was reported
        ('신고된 게시물') or the content container is missing.
        """
        links = []
        contents = soup.find(id='writeContents')
        # Guard against a missing container: the original dereferenced the
        # find() result unconditionally and raised AttributeError on pages
        # without #writeContents.
        if contents is None or '신고된 게시물' in contents.text:
            return links
        for a in soup.find_all('a'):
            if 'href' in a.attrs and 'javascript:file_download' in a['href']:
                # href looks like: javascript:file_download('<url>','<name>');
                jscript = a['href']
                start = len("javascript:file_download('")
                end = jscript.index("','")
                sub_url = jscript[start:end]
                url = urllib.parse.urljoin(page_link.url, sub_url)
                file_name = jscript[end+3:-3]  # drop the trailing ');
                file_name = urllib.parse.unquote(file_name)
                torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
                links.append(torrent)
                Logger.log(' found download link : {}({})'.format(file_name, url))
        return links