Compare commits

...

2 Commits

Author SHA1 Message Date
ef0c1e78ac Modify .gitignore file 2017-08-06 03:21:11 +09:00
6729cab06a Switch to the torrentkim site; organize files and folders 2017-08-06 03:19:23 +09:00
9 changed files with 193 additions and 184 deletions

3
.gitignore vendored
View File

@@ -1,3 +1,6 @@
 output/
 .idea/
 **/__pycache__/
+conf/
+temp/
+download/

View File

@@ -1,16 +1,11 @@
-import sys
-import io
-import os
 import requests
-from bs4 import BeautifulSoup
+import urllib
+import bs4
 import re
-import pickle
 from .Setting import Setting
-
-sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
-sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
+from .ProxyHandler import ProxyHandler
+from .Logger import Logger
 
 
 class PageLink:
@@ -52,13 +47,13 @@ class TorrentFile:
 class Crawler:
-    PROXY_FILE_NAME = 'proxy.bin'
-
     def __init__(self):
         self.setting = Setting()
-        self.proxies = []
-        self.session = requests.Session()
-        self.cookies = None
+        self.proxy_handler = ProxyHandler()
+        if len(self.setting.settings['urls']) > 0:
+            urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
+            top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
+            self.proxy_handler.check_url = top_url
 
     @staticmethod
     def print_log(files):
@staticmethod @staticmethod
def print_log(files): def print_log(files):
@@ -67,66 +62,6 @@ class Crawler:
             f.write(file.file_name+'\n')
         f.close()
 
-    def crawl_proxy(self):
-        proxies = []
-        if os.path.exists(Crawler.PROXY_FILE_NAME):
-            with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
-                proxies = pickle.load(f)
-            return proxies
-        else:
-            resp = requests.get('https://www.us-proxy.org')
-            soup = BeautifulSoup(resp.text, 'lxml')
-            table = soup.select('table.table')
-            trs = table[0].select('tr')
-            cnt = 0
-            for tr in trs[1:]:
-                tds = tr.select('td')
-                if len(tds) > 0:
-                    ip, port = tds[0].text, tds[1].text
-                    proxies.append(
-                        {
-                            'http': '{}:{}'.format(ip, port),
-                            'https': '{}:{}'.format(ip, port),
-                            'alive': True,
-                        }
-                    )
-                    # print('{}:{}'.format(ip, port))
-                    cnt += 1
-            with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
-                pickle.dump(proxies, f)
-            print('proxy cnt : {}'.format(cnt))
-            return proxies
-
-    def get_proxy(self):
-        if len(self.proxies) <= 0:
-            if os.path.exists(Crawler.PROXY_FILE_NAME):
-                with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
-                    self.proxies = pickle.load(f)
-            else:
-                self.proxies = self.crawl_proxy()
-        for proxy in self.proxies:
-            if proxy['alive']:
-                return proxy
-        return None
-
-    def set_proxy_dead(self, proxy):
-        proxy['alive'] = False
-        for proxy in self.proxies:
-            if proxy['alive']:
-                with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
-                    pickle.dump(self.proxies, f)
-                return
-        os.remove(Crawler.PROXY_FILE_NAME)
-        self.proxies = []
-
     def request_get(self, url):
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
@@ -137,19 +72,18 @@ class Crawler:
             'Connection': 'keep-alive',
         }
 
-        proxy = self.get_proxy()
+        proxy = self.proxy_handler.get_proxy()
         while True:
             try:
-                resp = self.session.get(url, proxies=proxy, headers=headers, cookies=self.cookies, timeout=3)
-                self.cookies = resp.cookies
+                resp = requests.get(url, proxies=proxy, headers=headers, timeout=3)
             except Exception as e:
-                self.set_proxy_dead(proxy)
-                proxy = self.get_proxy()
+                self.proxy_handler.set_proxy_dead(proxy)
+                proxy = self.proxy_handler.get_proxy()
                 continue
             else:
                 if resp.status_code != 200:
-                    self.set_proxy_dead(proxy)
-                    proxy = self.get_proxy()
+                    self.proxy_handler.set_proxy_dead(proxy)
+                    proxy = self.proxy_handler.get_proxy()
                     continue
                 else:
                     break
@@ -157,20 +91,19 @@ class Crawler:
         return resp
 
     def crawl_list(self, url):
-        print('checking page {}'.format(url), flush=True)
+        Logger.log('checking page {}'.format(url))
         resp = self.request_get(url)
-        html = resp.text
-        soup = BeautifulSoup(html, 'lxml')
+        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
 
         re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
         re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
 
         links = []
-        tables = soup.select('table.table')
-        trs = tables[0].select('tr.')
-        for tr in trs:
-            tds = tr.select('div.td-subject')
+        tables = soup.select('table.board_list')
+        trs = tables[0].select('tr')
+        for tr in trs[1:]:
+            tds = tr.select('td.subject')
             title = tds[0].text.strip()
             link = tds[0].select('a')[0].attrs['href']
@@ -187,109 +120,69 @@ class Crawler:
             ep = int(ep_match.group(1))
 
             if ep <= video['ignore_ep_under']:
-                print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
+                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                 continue
             elif ep in self.setting.downloaded[video['title']]:
-                print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
+                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                 continue
 
-            if not link.startswith('http'):
-                top_end = url[8:].find('/')
-                if top_end < 0:
-                    top_url = url[:8 + top_end]
-                else:
-                    top_url = url
-
-                if link[0] != '/':
-                    link = '/' + link
-
-                link = top_url + link
+            link = urllib.parse.urljoin(url, link)
 
             links.append(PageLink(video['title'], ep, link))
-            print(' found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
+            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
 
-        # selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a'  # torrentkim
-        # for link in soup.select(selector):
-        #     if link.has_attr('rel') and 'nofollow' in link['rel']:
-        #         continue
-        #
-        #     board_title = link.get_text().strip()
-        #
-        #     title_match = re_title.search(board_title)
-        #     if not title_match:
-        #         continue
-        #
-        #     ep_match = re_episode.search(board_title)
-        #     if not ep_match:
-        #         continue
-        #
-        #     title_idx = int(title_match.lastgroup[3:])
-        #     video = self.setting.settings['video'][title_idx]
-        #     ep = int(ep_match.group(1))
-        #
-        #     if ep <= video['ignore_ep_under']:
-        #         print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
-        #         continue
-        #     elif ep in self.setting.downloaded[video['title']]:
-        #         print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
-        #         continue
-        #
-        #     link_url = link.get('href')
-        #     if not link_url.startswith('http'):
-        #         top_end = url[8:].find('/')
-        #         if top_end < 0:
-        #             top_url = url[:8 + top_end]
-        #         else:
-        #             top_url = url
-        #
-        #         if link_url[0] != '/':
-        #             link_url = '/' + link_url
-        #
-        #         link_url = top_url + link_url
-        #
-        #     links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
-        #
-        #     print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
-
         return links
 
-    def crawl_downlink(self, link):
-        print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
-        resp = self.request_get(link.url)
-        soup = BeautifulSoup(resp.text, 'lxml')
+    def crawl_downlink(self, page_link):
+        Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))
+        resp = self.request_get(page_link.url)
+        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
 
         links = []
-        a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
-        for tag in a_tags:
-            file_name = str(tag.find('strong').text)
-            url = tag.get('href')
-            links.append(TorrentFile(link.title, link.episode, file_name, url))
-            print(' found download link : {}({})'.format(file_name, url), flush=True)
+        file_table = soup.select('table#file_table')
+        a_tags = file_table[0].select('a')
+        torrent_links = [a for a in a_tags if '.torrent' in a.text]
+        smi_links = [a for a in a_tags if '.smi' in a.text]
+
+        for link in torrent_links:
+            file_name = link.text.strip()
+            sub_url = link.attrs['href']
+            url = urllib.parse.urljoin(page_link.url, sub_url)
+            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
+            Logger.log(' found download link : {}({})'.format(file_name, url))
+
+        for link in smi_links:
+            file_name = link.text.strip()
+            sub_url = link.attrs['href']
+            url = urllib.parse.urljoin(link.url, sub_url)
+            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
+            Logger.log(' found download link : {}({})'.format(file_name, url))
 
         return links
 
-    def download_files(self, file):
-        if file.episode in self.setting.downloaded[file.title]:
+    def download_files(self, file_link):
+        if file_link.episode in self.setting.downloaded[file_link.title]:
             return
 
-        print("start download {}".format(file.file_name), flush=True)
+        Logger.log("start download {}".format(file_link.file_name))
         try:
-            response = self.request_get(file.url, cookies=file.cookie)
-            with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
-                f.write(response.content)
+            resp = self.request_get(file_link.url)
+            file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition'])
+            file_name = urllib.parse.unquote(file_name[0])
+            with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
+                f.write(resp.content)
 
-            self.setting.downloaded[file.title].append(file.episode)
+            self.setting.downloaded[file_link.title].append(file_link.episode)
             self.setting.save()
-            print("downloaded {}".format(file.file_name), flush=True)
+            Logger.log("downloaded {}".format(file_link.file_name))
         except Exception as e:
-            print(e)
+            Logger.log(e)
 
     def crawl_torrent(self):
         page_links = []
@@ -308,8 +201,6 @@ class Crawler:
             self.download_files(file)
 
     def crawl(self):
-        print('Crawling start')
+        Logger.log('Crawling start')
         self.crawl_torrent()
-
-        print('Crawling finished')
+        Logger.log('Crawling finished')
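
The rewritten crawl_list still identifies a series by compiling every configured keyword into one alternation with a named group per entry (key0, key1, ...), then reading the match's lastgroup to recover which video entry it was; a second regex pulls the episode number. A minimal standalone sketch of that matching idea, with made-up video entries and a made-up board title (nothing below comes from the repository's real settings):

import re

# Hypothetical video entries mirroring the structure crawl_list reads from settings['video'].
videos = [
    {'title': 'Show A', 'keyword': 'Show.A.*720p'},
    {'title': 'Show B', 'keyword': 'Show.B.*720p'},
]

# One alternation with a named group per keyword: (?P<key0>...)|(?P<key1>...)
re_title = re.compile('|'.join('(?P<key{}>{})'.format(i, v['keyword'])
                               for i, v in enumerate(videos)), re.I)
# Episode number between separators, e.g. ".E07."
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)

board_title = 'Show.B.E07.170806.720p-NEXT'  # made-up example title
title_match = re_title.search(board_title)
ep_match = re_episode.search(board_title)
if title_match and ep_match:
    video = videos[int(title_match.lastgroup[3:])]  # 'key1' -> index 1
    print(video['title'], int(ep_match.group(1)))   # -> Show B 7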

6
Crawler/Logger.py Normal file
View File

@@ -0,0 +1,6 @@
class Logger:
    @staticmethod
    def log(msg):
        print(msg)

98
Crawler/ProxyHandler.py Normal file
View File

@@ -0,0 +1,98 @@
import os
import pickle
import requests
import bs4
import concurrent.futures

from .Logger import Logger


class ProxyHandler:
    PROXY_FILE_NAME = 'temp/proxy.bin'

    def __init__(self):
        if not os.path.exists('temp'):
            os.makedirs('temp')
        self.proxies = []
        self.check_url = ''

    def check_proxy(self, proxy, top_url):
        try:
            resp = requests.get(top_url, proxies=proxy, timeout=2)
        except:
            proxy['alive'] = False
        else:
            if resp.status_code != 200:
                proxy['alive'] = False

    def check_proxy_all(self, proxies, check_url):
        Logger.log('checking proxies for {}'.format(check_url))
        worker_cnt = 64
        pool = concurrent.futures.ThreadPoolExecutor(worker_cnt)
        [pool.submit(self.check_proxy, proxy, check_url) for proxy in proxies]
        pool.shutdown()

    def has_file(self):
        return os.path.exists(self.PROXY_FILE_NAME)

    def load_proxy(self):
        with open(self.PROXY_FILE_NAME, 'rb') as f:
            proxies = pickle.load(f)
        return proxies

    def crawl_proxy(self):
        proxies = []
        resp = requests.get('https://www.us-proxy.org')
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        table = soup.select('table.table')
        trs = table[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td')
            if len(tds) < 2:
                continue
            ip, port = tds[0].text, tds[1].text
            proxies.append(
                {
                    'alive': True,
                    'http': '{}:{}'.format(ip, port),
                    'https': '{}:{}'.format(ip, port),
                }
            )

        self.check_proxy_all(proxies, self.check_url)
        alive_proxies = [proxy for proxy in proxies if proxy['alive']]
        Logger.log('proxies checking end: available : {}'.format(len(alive_proxies)))

        with open(self.PROXY_FILE_NAME, 'wb') as f:
            pickle.dump(alive_proxies, f)

        return alive_proxies

    def get_proxy(self):
        if len(self.proxies) <= 0:
            if self.has_file():
                self.proxies = self.load_proxy()
            else:
                self.proxies = self.crawl_proxy()

        for proxy in self.proxies:
            if proxy['alive']:
                return proxy
        return None

    def set_proxy_dead(self, proxy):
        proxy['alive'] = False

        for proxy in self.proxies:
            if proxy['alive']:
                with open(self.PROXY_FILE_NAME, 'wb') as f:
                    pickle.dump(self.proxies, f)
                return

        os.remove(self.PROXY_FILE_NAME)
        self.proxies = []
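
The retry loop in Crawler.request_get now leans entirely on this class: ask for a live proxy, mark it dead on any failure, and ask again. A minimal standalone sketch of that pattern, assuming the Crawler package is importable and using placeholder URLs (the real crawler derives check_url and the target page from settings['urls']):

import requests

from Crawler.ProxyHandler import ProxyHandler

handler = ProxyHandler()
handler.check_url = 'https://example.com/'  # placeholder; Crawler sets this from settings['urls'][0]

target = 'https://example.com/board'        # placeholder target page
proxy = handler.get_proxy()
while proxy is not None:
    try:
        resp = requests.get(target, proxies=proxy, timeout=3)
        if resp.status_code == 200:
            break                            # success; resp holds the page
    except requests.RequestException:
        pass
    handler.set_proxy_dead(proxy)            # persist the failure to temp/proxy.bin
    proxy = handler.get_proxy()              # rotate to the next live proxy

Note that check_proxy_all fans the liveness checks out over a 64-worker ThreadPoolExecutor, so building the initial pool is bounded by the 2-second per-proxy timeout rather than by the number of proxies scraped.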

View File

@@ -1,33 +1,42 @@
 import yaml
 import os
 
+from .Logger import Logger
+
 
 class Setting:
+    SETTING_FILE = 'conf/settings.yml'
+    DOWNLOADED_FILE = 'temp/downloaded.yml'
+
     def __init__(self):
         self.settings = None
         self.downloaded = None
+
+        if not os.path.exists('temp'):
+            os.mkdir('temp')
+
         self.load()
         pass
 
     def load_settings(self):
-        if not os.path.isfile('settings.yml'):
-            print('There is no settings.yml', flush=True)
+        if not os.path.isfile(self.SETTING_FILE):
+            Logger.log('There is no {}'.format(self.SETTING_FILE))
             exit()
 
-        with open('settings.yml', encoding='utf-8') as setting_file:
+        with open(self.SETTING_FILE, encoding='utf-8') as setting_file:
             try:
                 self.settings = yaml.load(setting_file)
             except ValueError as e:
-                print(e, flush=True)
+                Logger.log(e)
                 exit()
 
         if 'video' not in self.settings:
-            print('video key is need in settings.json', flush=True)
+            Logger.log('video key is need in settings.json')
             exit()
 
         for i, video in enumerate(self.settings['video']):
             if 'title' not in video:
-                print('title key is need in video({})'.format(i), flush=True)
+                Logger.log('title key is need in video({})'.format(i))
                 exit()
 
             if 'keyword' not in video:
@@ -49,18 +58,18 @@ class Setting:
             try:
                 os.makedirs(self.settings['download_path'])
             except Exception as e:
-                print(e, flush=True)
+                Logger.log(e)
                 exit()
 
             video['keyword'] += self.settings['keyword_append']
 
     def load_downloaded(self):
-        if os.path.isfile('downloaded.yml'):
-            with open("downloaded.yml", 'r', encoding='utf-8') as stream:
+        if os.path.isfile(self.DOWNLOADED_FILE):
+            with open(self.DOWNLOADED_FILE, 'r', encoding='utf-8') as stream:
                 try:
                     self.downloaded = yaml.load(stream)
                 except yaml.YAMLError as e:
-                    print(e, flush=True)
+                    Logger.log(e)
         else:
             self.downloaded = {}
@@ -108,6 +117,6 @@ class Setting:
             yaml.dump(downloaded_ex, outfile, allow_unicode=True)
 
     def save(self):
-        with open('downloaded.yml', 'w', encoding='utf-8') as outfile:
+        with open(self.DOWNLOADED_FILE, 'w', encoding='utf-8') as outfile:
             yaml.dump(self.downloaded, outfile, allow_unicode=True)
         pass
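
Setting now reads its configuration from conf/settings.yml and keeps download history in temp/downloaded.yml. The exact schema is not part of this diff, but from the keys the code checks (urls, download_path, keyword_append, and per-video title/keyword/ignore_ep_under), a config along these lines should satisfy the loader; every value below is a made-up illustration:

import yaml

# Hypothetical conf/settings.yml body, inferred from the keys load_settings() validates.
example = """
urls:
  - https://example-board.net/torrent/list.php
download_path: download/
keyword_append: ' 720p'
video:
  - title: Show A
    keyword: Show.A
    ignore_ep_under: 3
"""

settings = yaml.safe_load(example)
assert 'video' in settings
assert all('title' in v and 'keyword' in v for v in settings['video'])
print(settings['video'][0]['title'])  # -> Show A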

View File

@@ -1,5 +1,5 @@
 from Crawler.Crawler import Crawler
 
-crawler = Crawler()
-crawler.crawl()
+if __name__ == '__main__':
+    crawler = Crawler()
+    crawler.crawl()

2
requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
requests
bs4