- 다운로드 되는 데까지 구현
This commit is contained in:
173
Crawler/Crawler.py
Normal file
173
Crawler/Crawler.py
Normal file
@@ -0,0 +1,173 @@
|
||||
import sys
|
||||
import io
|
||||
import os
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
|
||||
from .Setting import Setting
|
||||
|
||||
# Re-wrap stdout/stderr so all console output is encoded as UTF-8 —
# presumably so the Korean titles print correctly on consoles whose
# default encoding is not UTF-8 (e.g. cp949 on Windows) — TODO confirm.
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
|
||||
|
||||
class PageLink:
    """A board post that contains one episode of a tracked title.

    Built by ``Crawler.crawl_list`` and consumed by
    ``Crawler.crawl_downlink``.
    """

    def __init__(self, title='', episode='', url=''):
        # The original defined __init__ twice; Python keeps only the last
        # def, so the zero-argument form was dead code. Merging the two
        # with defaults keeps the 3-argument call sites working and makes
        # the no-argument form usable again.
        self.title = title      # configured video title (settings.yml)
        self.episode = episode  # episode number parsed from the post title
        self.url = url          # URL of the content page

    def __str__(self):
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        return str(self)
|
||||
|
||||
class TorrentFile:
    """A downloadable torrent attachment found on a content page.

    Built by ``Crawler.crawl_downlink`` and consumed by
    ``Crawler.download_files``.
    """

    def __init__(self, title='', episode='', file_name='', url=''):
        # Duplicate __init__ overloads merged (the second definition
        # silently shadowed the first); defaults restore the no-argument
        # form without breaking existing 4-argument callers.
        self.title = title          # configured video title
        self.episode = episode      # episode number of this file
        self.file_name = file_name  # attachment file name shown on the page
        self.url = url              # direct download URL

    def __str__(self):
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        return str(self)
|
||||
|
||||
|
||||
class Crawler:
    """Crawls the configured torrent board pages, finds new episodes of
    tracked titles and downloads their torrent files.

    Pipeline (see :meth:`crawl`): list pages -> content pages -> downloads.
    Settings and the download history come from :class:`Setting`.
    """

    def __init__(self):
        # Loads settings.yml / downloaded.yml (may exit on bad config).
        self.setting = Setting()

    def print_log(self, files):
        """Append the file name of every entry in *files* to output/log.txt."""
        # Original crashed when the output/ directory did not exist yet.
        os.makedirs('output', exist_ok=True)
        with open('output/log.txt', 'at') as log:
            for file in files:
                log.write(file.file_name + '\n')

    @staticmethod
    def _absolutize(page_url, link_url):
        """Return an absolute URL for *link_url* found on *page_url*.

        Relative board links are resolved against the scheme+host of
        *page_url* only (root-relative), matching the original intent.
        Fixes two defects in the original inline code: the found/not-found
        condition was inverted (a missing '/' produced a negative slice,
        and a present '/' kept the full URL, path included), and the
        scheme length was hard-coded to 8 characters ('https://').
        """
        if link_url.startswith('http'):
            return link_url
        parts = urlparse(page_url)
        base = '{}://{}'.format(parts.scheme, parts.netloc)
        if not link_url.startswith('/'):
            link_url = '/' + link_url
        return base + link_url

    def crawl_list(self, url):
        """Scan one board list page.

        Returns a list of :class:`PageLink` for every post whose title
        matches a tracked video keyword and an episode pattern, skipping
        episodes at or below ``ignore_ep_under`` and already-downloaded ones.
        """
        print('checking page {}'.format(url), flush=True)

        code = requests.get(url)
        soup = BeautifulSoup(code.text, 'lxml')

        # One alternation group per tracked video; the group name 'key<N>'
        # encodes the video's index in the settings list.
        re_title = re.compile(
            '|'.join('(?P<key{}>{})'.format(i, video['keyword'])
                     for i, video in enumerate(self.setting.settings['video'])),
            re.I)
        re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)

        links = []
        for link in soup.select('#fboardlist > div.tbl_head01.tbl_wrap > table > tbody > tr > td.td_subject > a'):
            board_title = link.get_text().strip()

            title_match = re_title.search(board_title)
            if not title_match:
                continue

            ep_match = re_episode.search(board_title)
            if not ep_match:
                continue

            # lastgroup is 'key<N>'; strip the 'key' prefix to recover N.
            title_idx = int(title_match.lastgroup[3:])
            video = self.setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
                continue
            elif ep in self.setting.downloaded[video['title']]:
                print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
                continue

            link_url = self._absolutize(url, link.get('href'))
            links.append(PageLink(video['title'], ep, link_url))
            print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)

        return links

    def crawl_downlink(self, link):
        """Open one content page and return a :class:`TorrentFile` for every
        'view_file_download' anchor in its attachment box (#bo_v_file)."""
        print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)

        code = requests.get(link.url)
        soup = BeautifulSoup(code.text, 'lxml')

        # Guard: a page without an attachment box crashed the crawl before.
        file_box = soup.find(id='bo_v_file')
        a_tags = file_box.find_all('a', {'class': 'view_file_download'}) if file_box else []

        links = []
        for tag in a_tags:
            file_name = str(tag.find('strong').text)
            url = tag.get('href')
            links.append(TorrentFile(link.title, link.episode, file_name, url))
            print(' found download link : {}({})'.format(file_name, url), flush=True)

        return links

    def download_files(self, file):
        """Download one torrent file unless its episode is already recorded;
        on success, record the episode and persist the history."""
        if file.episode in self.setting.downloaded[file.title]:
            return

        print("start download {}".format(file.file_name), flush=True)

        try:
            response = requests.get(file.url)
            # 'with' replaces the original open/close pair so the handle is
            # released even when write() raises.
            with open(self.setting.settings['download_path'] + file.file_name, 'wb') as out:
                out.write(response.content)

            self.setting.downloaded[file.title].append(file.episode)
            self.setting.save()

            print("downloaded {}".format(file.file_name), flush=True)

        except Exception as e:
            # Deliberate best-effort: one failed download is reported but
            # must not abort the remaining downloads.
            print(e)

    def crawl(self):
        """Run the full pipeline over every configured board URL."""
        print('Crawling start')

        page_links = []
        for url in self.setting.settings['urls']:
            # Board URLs end with 'page=', so appending the number selects
            # pages 1..max_page.
            for page in range(1, self.setting.settings['max_page'] + 1):
                page_links += self.crawl_list(url + str(page))

        files = []
        for link in page_links:
            files += self.crawl_downlink(link)

        for file in files:
            self.download_files(file)

        print('Crawling finished')
|
||||
113
Crawler/Setting.py
Normal file
113
Crawler/Setting.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import yaml
|
||||
import os
|
||||
|
||||
class Setting:
    """Loads settings.yml and downloaded.yml and keeps the download
    history persisted on disk.

    Attributes:
        settings:   parsed settings.yml with defaults filled in.
        downloaded: dict mapping video title -> list of downloaded episodes.
    """

    def __init__(self):
        self.settings = None
        self.downloaded = None
        self.load()

    def load_settings(self):
        """Read and validate settings.yml, filling in defaults.

        Exits the process when the file is missing, malformed, or a
        required key is absent.
        """
        if not os.path.isfile('settings.yml'):
            print('There is no settings.yml', flush=True)
            exit()

        with open('settings.yml', encoding='utf-8') as setting_file:
            try:
                # safe_load: settings are plain scalars/lists/dicts, and
                # yaml.load without a Loader can construct arbitrary
                # objects from an untrusted file.
                self.settings = yaml.safe_load(setting_file)
            except yaml.YAMLError as e:
                # Was 'except ValueError', which PyYAML never raises, so
                # parse errors previously escaped uncaught.
                print(e, flush=True)
                exit()

        if 'video' not in self.settings:
            # Message previously said 'settings.json'; the file is YAML.
            print('video key is need in settings.yml', flush=True)
            exit()

        # --- file-wide defaults -----------------------------------------
        if 'keyword_append' not in self.settings:
            self.settings['keyword_append'] = ''

        if 'download_path' not in self.settings:
            self.settings['download_path'] = '.'

        # os.sep instead of the hard-coded backslash so the path also
        # works outside Windows (os.sep IS the backslash on Windows).
        if self.settings['download_path'][-1] != os.sep:
            self.settings['download_path'] += os.sep

        if not os.path.exists(self.settings['download_path']):
            try:
                os.makedirs(self.settings['download_path'])
            except Exception as e:
                print(e, flush=True)
                exit()

        # --- per-video validation and defaults --------------------------
        for i, video in enumerate(self.settings['video']):
            if 'title' not in video:
                print('title key is need in video({})'.format(i), flush=True)
                exit()

            if 'keyword' not in video:
                video['keyword'] = video['title']

            if 'ignore_ep_under' not in video:
                video['ignore_ep_under'] = 0

            # Narrow every video's search keyword with the global append
            # pattern (applied inside the loop so it reaches all videos).
            video['keyword'] += self.settings['keyword_append']

    def load_downloaded(self):
        """Read downloaded.yml (episode history) and ensure every
        configured title has an entry, then persist the result."""
        if os.path.isfile('downloaded.yml'):
            with open("downloaded.yml", 'r', encoding='utf-8') as stream:
                try:
                    self.downloaded = yaml.safe_load(stream)
                except yaml.YAMLError as e:
                    print(e, flush=True)

        # An absent, empty or unreadable file leaves None; normalise to a
        # dict so the crawler can index it without crashing.
        if self.downloaded is None:
            self.downloaded = {}

        for video in self.settings['video']:
            if video['title'] not in self.downloaded:
                self.downloaded[video['title']] = []

        self.save()

    def load(self):
        """(Re)load the settings and the download history."""
        self.load_settings()
        self.load_downloaded()

    @staticmethod
    def dump_settings_example():
        """Write settings_example.yml demonstrating the expected schema."""
        settings_ex = {
            'urls': [
                'https://todaum.com/bbs/board.php?bo_table=torrent_kortv_ent&device=pc&page=',
                'https://todaum.com/bbs/board.php?bo_table=torrent_kortv_ent&device=pc&page=',
            ],
            'max_page': 2,
            'video':
                [
                    {
                        'title': '무한도전',
                        'ignore_ep_under': 325
                    },
                    {
                        'title': '라디오스타'
                    }
                ],
            'keyword_append': '.*720p.*NEXT',
            'download_path': '.',
        }

        with open('settings_example.yml', 'w', encoding='utf-8') as outfile:
            yaml.dump(settings_ex, outfile, allow_unicode=True, default_flow_style=False)

    @staticmethod
    def dump_downloaded_example():
        """Write downloaded_example.yml demonstrating the history schema."""
        downloaded_ex = {'무한도전': [1, 2, 3], '라디오스타': [1, 2, 3]}

        with open('downloaded_example.yml', 'w', encoding='utf-8') as outfile:
            yaml.dump(downloaded_ex, outfile, allow_unicode=True)

    def save(self):
        """Persist the download history to downloaded.yml."""
        with open('downloaded.yml', 'w', encoding='utf-8') as outfile:
            yaml.dump(self.downloaded, outfile, allow_unicode=True)
|
||||
0
Crawler/__init__.py
Normal file
0
Crawler/__init__.py
Normal file
Reference in New Issue
Block a user