- 다운로드 되는 데까지 구현

2017-07-02 04:15:04 +09:00
parent b7864b94ee
commit 20e048ca45
8 changed files with 321 additions and 147 deletions
--- a/Crawler/Crawler.py
+++ b/Crawler/Crawler.py
@@ -0,0 +1,173 @@
+import sys
+import io
+import os
+
+import requests
+from bs4 import BeautifulSoup
+import re
+
+from .Setting import Setting
+
+# Replace both standard streams with UTF-8 text wrappers built on their
+# detached binary buffers, so every print() in this module is encoded as
+# UTF-8 (presumably so Korean titles survive a non-UTF-8 console - confirm).
+sys.stdout, sys.stderr = (
+	io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8'),
+	io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8'),
+)
+
+class PageLink:
+	"""A board post that was matched to one of the configured videos.
+
+	Attributes:
+		title: Title of the video the post was matched to.
+		episode: Episode value extracted from the post subject.
+		url: Address of the post's content page.
+	"""
+
+	def __init__(self, title='', episode='', url=''):
+		# Python has no constructor overloading: a second __init__ definition
+		# simply replaces the first. Default values give both call forms,
+		# PageLink() and PageLink(title, episode, url), from one constructor.
+		self.title = title
+		self.episode = episode
+		self.url = url
+
+	def __str__(self):
+		return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)
+
+	def __repr__(self):
+		# Same text as str() so that lists of links print readably.
+		return str(self)
+
+class TorrentFile:
+	"""A downloadable attachment found on a post's content page.
+
+	Attributes:
+		title: Title of the video the attachment belongs to.
+		episode: Episode value of the post the attachment came from.
+		file_name: File name shown for the attachment on the page.
+		url: Download address of the attachment.
+	"""
+
+	def __init__(self, title='', episode='', file_name='', url=''):
+		# Python has no constructor overloading: a second __init__ definition
+		# simply replaces the first. Default values give both call forms,
+		# TorrentFile() and TorrentFile(title, episode, file_name, url).
+		self.title = title
+		self.episode = episode
+		self.file_name = file_name
+		self.url = url
+
+	def __str__(self):
+		return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)
+
+	def __repr__(self):
+		# Same text as str() so that lists of files print readably.
+		return str(self)
+
+
+class Crawler:
+	"""Finds board posts for the configured videos and downloads their files.
+
+	The configuration and the record of already-downloaded episodes both
+	come from the Setting instance stored in self.setting.
+	"""
+
+	def __init__(self):
+		self.setting = Setting()
+
+	def print_log(self, files):
+		"""Append the file_name of every entry in files to output/log.txt."""
+		# The context manager closes the log even when a write raises.
+		with open('output/log.txt', 'at') as f:
+			for file in files:
+				f.write(file.file_name + '\n')
+
+	def crawl_list(self, url):
+		"""Scan one board list page for posts that still need downloading.
+
+		A post is accepted when its subject matches one of the configured
+		video keywords, contains an episode marker such as " E12.", and the
+		episode is neither at or below the video's ignore_ep_under value nor
+		already recorded as downloaded.
+
+		Args:
+			url: Address of the list page to fetch.
+
+		Returns:
+			A list of PageLink objects, one per accepted post.
+		"""
+		from urllib.parse import urljoin
+
+		print('checking page {}'.format(url), flush=True)
+
+		html = requests.get(url).text
+		soup = BeautifulSoup(html, 'lxml')
+
+		videos = self.setting.settings['video']
+		# One named group per video (key0, key1, ...): the name of the group
+		# that matched tells which video a subject belongs to.
+		re_title = re.compile('|'.join('(?P<key{}>{})'.format(i, video['keyword']) for i, video in enumerate(videos)), re.I)
+		re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
+
+		links = []
+		for link in soup.select('#fboardlist > div.tbl_head01.tbl_wrap > table > tbody > tr > td.td_subject > a'):
+			board_title = link.get_text().strip()
+
+			title_match = re_title.search(board_title)
+			if not title_match:
+				continue
+
+			ep_match = re_episode.search(board_title)
+			if not ep_match:
+				continue
+
+			# Strip the "key" prefix of the group name to recover the index.
+			video = videos[int(title_match.lastgroup[3:])]
+			ep = int(ep_match.group(1))
+
+			if ep <= video['ignore_ep_under']:
+				print('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
+				continue
+			if ep in self.setting.downloaded[video['title']]:
+				print('    {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
+				continue
+
+			href = link.get('href')
+			if not href:
+				# An anchor without a target cannot be followed.
+				continue
+
+			# Resolve the href against the page address. Absolute links come
+			# back unchanged; relative ones are completed the way a browser
+			# would. The earlier hand-written slicing had its test inverted
+			# and appended the href to the full page URL.
+			link_url = urljoin(url, href)
+
+			links.append(PageLink(video['title'], ep, link_url))
+
+			print('   found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
+
+		return links
+
+	def crawl_downlink(self, link):
+		"""Collect the download links listed on a post's content page.
+
+		Args:
+			link: PageLink describing the content page to fetch.
+
+		Returns:
+			A list of TorrentFile objects, one per download anchor that has a
+			file name; empty when the page has no file section.
+		"""
+		print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
+
+		html = requests.get(link.url).text
+		soup = BeautifulSoup(html, 'lxml')
+
+		file_section = soup.find(id='bo_v_file')
+		if file_section is None:
+			# Without this guard a page lacking the section raised
+			# AttributeError and aborted the whole crawl.
+			print('    no file section found : {}'.format(link.url), flush=True)
+			return []
+
+		links = []
+		for tag in file_section.find_all('a', {'class': 'view_file_download'}):
+			name_tag = tag.find('strong')
+			if name_tag is None:
+				# No <strong> element means there is no file name to save under.
+				continue
+
+			file_name = str(name_tag.text)
+			url = tag.get('href')
+			links.append(TorrentFile(link.title, link.episode, file_name, url))
+
+			print('    found download link : {}({})'.format(file_name, url), flush=True)
+
+		return links
+
+	def download_files(self, file):
+		"""Download one file and record its episode as downloaded.
+
+		Does nothing when the episode is already recorded. Any failure is
+		printed and swallowed so that one bad link does not stop the run.
+
+		Args:
+			file: TorrentFile to fetch.
+		"""
+		if file.episode in self.setting.downloaded[file.title]:
+			return
+
+		print("start download {}".format(file.file_name), flush=True)
+
+		try:
+			response = requests.get(file.url)
+			# Do not save an HTTP error body as if it were the file, and do
+			# not mark the episode as downloaded in that case.
+			response.raise_for_status()
+
+			# NOTE(review): file_name comes from the remote page and is used
+			# unchanged in the target path - consider sanitising it.
+			with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
+				f.write(response.content)
+
+			self.setting.downloaded[file.title].append(file.episode)
+			self.setting.save()
+
+			print("downloaded {}".format(file.file_name), flush=True)
+
+		except Exception as e:
+			print(e, flush=True)
+
+	def crawl(self):
+		"""Run a full pass: list pages, then content pages, then downloads."""
+		print('Crawling start')
+
+		# Each configured url is a prefix to which the page number is appended.
+		page_links = []
+		for url in self.setting.settings['urls']:
+			for page in range(1, self.setting.settings['max_page'] + 1):
+				page_links += self.crawl_list(url + str(page))
+
+		files = []
+		for link in page_links:
+			files += self.crawl_downlink(link)
+
+		for file in files:
+			self.download_files(file)
+
+		print('Crawling finished')
--- a/Crawler/Setting.py
+++ b/Crawler/Setting.py
@@ -0,0 +1,113 @@
+import yaml
+import os
+
+class Setting:
+	"""Crawler configuration and the record of downloaded episodes.
+
+	Attributes:
+		settings: Dict loaded from settings.yml, with defaults filled in.
+		downloaded: Dict loaded from downloaded.yml, mapping each video
+			title to the list of episodes already downloaded.
+	"""
+
+	def __init__(self):
+		self.settings = None
+		self.downloaded = None
+
+		self.load()
+
+	def load_settings(self):
+		"""Read settings.yml, validate it and fill in default values.
+
+		Prints a message and exits the process when the file is missing,
+		cannot be parsed, lacks the video key, a video lacks its title, or
+		the download directory cannot be created.
+		"""
+		if not os.path.isfile('settings.yml'):
+			print('There is no settings.yml', flush=True)
+			exit()
+
+		with open('settings.yml', encoding='utf-8') as setting_file:
+			try:
+				# safe_load builds only plain Python data and, unlike
+				# yaml.load, needs no explicit Loader argument (mandatory
+				# since PyYAML 6).
+				self.settings = yaml.safe_load(setting_file)
+			except (yaml.YAMLError, ValueError) as e:
+				# YAML syntax errors are reported as yaml.YAMLError.
+				print(e, flush=True)
+				exit()
+
+		# An empty file loads as None, so check the type before the key.
+		if not isinstance(self.settings, dict) or 'video' not in self.settings:
+			print('video key is need in settings.yml', flush=True)
+			exit()
+
+		# File-wide defaults do not depend on any single video, so they are
+		# applied once, outside the per-video loop.
+		if 'keyword_append' not in self.settings:
+			self.settings['keyword_append'] = ''
+
+		for i, video in enumerate(self.settings['video']):
+			if 'title' not in video:
+				print('title key is need in video({})'.format(i), flush=True)
+				exit()
+
+			if 'keyword' not in video:
+				video['keyword'] = video['title']
+
+			if 'ignore_ep_under' not in video:
+				video['ignore_ep_under'] = 0
+
+			# keyword is used as a regular-expression fragment by the crawler.
+			video['keyword'] += self.settings['keyword_append']
+
+		# File names are appended directly to download_path, so it has to
+		# end with the platform's path separator.
+		download_path = self.settings.get('download_path') or '.'
+		if not download_path.endswith(os.sep):
+			download_path += os.sep
+		self.settings['download_path'] = download_path
+
+		if not os.path.exists(download_path):
+			try:
+				os.makedirs(download_path)
+			except OSError as e:
+				print(e, flush=True)
+				exit()
+
+	def load_downloaded(self):
+		"""Read downloaded.yml and ensure every video has an episode list.
+
+		A missing or empty file gives an empty record. A file that cannot be
+		parsed stops the program, because carrying on would overwrite it in
+		save() and lose the download history. Must run after load_settings.
+		"""
+		self.downloaded = {}
+
+		if os.path.isfile('downloaded.yml'):
+			with open("downloaded.yml", 'r', encoding='utf-8') as stream:
+				try:
+					# An empty file loads as None; treat it as no history.
+					self.downloaded = yaml.safe_load(stream) or {}
+				except yaml.YAMLError as e:
+					print(e, flush=True)
+					exit()
+
+		for video in self.settings['video']:
+			if video['title'] not in self.downloaded:
+				self.downloaded[video['title']] = []
+
+		self.save()
+
+	def load(self):
+		"""Load the settings first, then the download record that uses them."""
+		self.load_settings()
+		self.load_downloaded()
+
+	@staticmethod
+	def dump_settings_example():
+		"""Write a sample configuration to settings_example.yml."""
+		settings_ex = {
+			'urls': [
+				'https://todaum.com/bbs/board.php?bo_table=torrent_kortv_ent&device=pc&page=',
+				'https://todaum.com/bbs/board.php?bo_table=torrent_kortv_ent&device=pc&page=',
+			],
+			'max_page': 2,
+			'video':
+				[
+					{
+						'title': '무한도전',
+						'ignore_ep_under': 325
+					},
+					{
+						'title': '라디오스타'
+					}
+				],
+			'keyword_append': '.*720p.*NEXT',
+			'download_path': '.',
+
+		}
+
+		with open('settings_example.yml', 'w', encoding='utf-8') as outfile:
+			yaml.dump(settings_ex, outfile, allow_unicode=True, default_flow_style=False)
+
+	@staticmethod
+	def dump_downloaded_example():
+		"""Write a sample download record to downloaded_example.yml."""
+		downloaded_ex = {'무한도전': [1, 2, 3], '라디오스타': [1, 2, 3]}
+
+		with open('downloaded_example.yml', 'w', encoding='utf-8') as outfile:
+			yaml.dump(downloaded_ex, outfile, allow_unicode=True)
+
+	def save(self):
+		"""Write the current download record to downloaded.yml."""
+		with open('downloaded.yml', 'w', encoding='utf-8') as outfile:
+			yaml.dump(self.downloaded, outfile, allow_unicode=True)
--- a/Crawler/init.py
+++ b/Crawler/init.py