- Switch to the torrentkim site

- Organize the file and folder structure
2017-08-06 03:19:23 +09:00
parent e9549f3ab5
commit 6729cab06a
8 changed files with 190 additions and 184 deletions

Crawler/Crawler.py

@@ -1,16 +1,11 @@
import sys
import io
import os
import requests
from bs4 import BeautifulSoup
import urllib
import bs4
import re
import pickle
from .Setting import Setting
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
from .ProxyHandler import ProxyHandler
from .Logger import Logger
class PageLink:
@@ -52,13 +47,13 @@ class TorrentFile:
class Crawler:
PROXY_FILE_NAME = 'proxy.bin'
def __init__(self):
self.setting = Setting()
self.proxies = []
self.session = requests.Session()
self.cookies = None
self.proxy_handler = ProxyHandler()
if len(self.setting.settings['urls']) > 0:
urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
self.proxy_handler.check_url = top_url
@staticmethod
def print_log(files):
@@ -67,66 +62,6 @@ class Crawler:
f.write(file.file_name+'\n')
f.close()
def crawl_proxy(self):
proxies = []
if os.path.exists(Crawler.PROXY_FILE_NAME):
with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
proxies = pickle.load(f)
return proxies
else:
resp = requests.get('https://www.us-proxy.org')
soup = BeautifulSoup(resp.text, 'lxml')
table = soup.select('table.table')
trs = table[0].select('tr')
cnt = 0
for tr in trs[1:]:
tds = tr.select('td')
if len(tds) > 0:
ip, port = tds[0].text, tds[1].text
proxies.append(
{
'http': '{}:{}'.format(ip, port),
'https': '{}:{}'.format(ip, port),
'alive': True,
}
)
# print('{}:{}'.format(ip, port))
cnt += 1
with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(proxies, f)
print('proxy cnt : {}'.format(cnt))
return proxies
def get_proxy(self):
if len(self.proxies) <= 0:
if os.path.exists(Crawler.PROXY_FILE_NAME):
with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
self.proxies = pickle.load(f)
else:
self.proxies = self.crawl_proxy()
for proxy in self.proxies:
if proxy['alive']:
return proxy
return None
def set_proxy_dead(self, proxy):
proxy['alive'] = False
for proxy in self.proxies:
if proxy['alive']:
with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(self.proxies, f)
return
os.remove(Crawler.PROXY_FILE_NAME)
self.proxies = []
def request_get(self, url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
@@ -137,19 +72,18 @@ class Crawler:
'Connection': 'keep-alive',
}
proxy = self.get_proxy()
proxy = self.proxy_handler.get_proxy()
while True:
try:
resp = self.session.get(url, proxies=proxy, headers=headers, cookies=self.cookies, timeout=3)
self.cookies = resp.cookies
resp = requests.get(url, proxies=proxy, headers=headers, timeout=3)
except Exception as e:
self.set_proxy_dead(proxy)
proxy = self.get_proxy()
self.proxy_handler.set_proxy_dead(proxy)
proxy = self.proxy_handler.get_proxy()
continue
else:
if resp.status_code != 200:
self.set_proxy_dead(proxy)
proxy = self.get_proxy()
self.proxy_handler.set_proxy_dead(proxy)
proxy = self.proxy_handler.get_proxy()
continue
else:
break
@@ -157,20 +91,19 @@ class Crawler:
return resp
def crawl_list(self, url):
print('checking page {}'.format(url), flush=True)
Logger.log('checking page {}'.format(url))
resp = self.request_get(url)
html = resp.text
soup = BeautifulSoup(html, 'lxml')
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
links = []
tables = soup.select('table.table')
trs = tables[0].select('tr.')
for tr in trs:
tds = tr.select('div.td-subject')
tables = soup.select('table.board_list')
trs = tables[0].select('tr')
for tr in trs[1:]:
tds = tr.select('td.subject')
title = tds[0].text.strip()
link = tds[0].select('a')[0].attrs['href']
@@ -187,109 +120,69 @@ class Crawler:
ep = int(ep_match.group(1))
if ep <= video['ignore_ep_under']:
print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
Logger.log(' {}({}) is ignored (not newer than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
continue
elif ep in self.setting.downloaded[video['title']]:
print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
continue
if not link.startswith('http'):
top_end = url[8:].find('/')
if top_end < 0:
top_url = url[:8 + top_end]
else:
top_url = url
if link[0] != '/':
link = '/' + link
link = top_url + link
link = urllib.parse.urljoin(url, link)
links.append(PageLink(video['title'], ep, link))
print(' found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
# selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a' # torrentkim
# for link in soup.select(selector):
# if link.has_attr('rel') and 'nofollow' in link['rel']:
# continue
#
# board_title = link.get_text().strip()
#
# title_match = re_title.search(board_title)
# if not title_match:
# continue
#
# ep_match = re_episode.search(board_title)
# if not ep_match:
# continue
#
# title_idx = int(title_match.lastgroup[3:])
# video = self.setting.settings['video'][title_idx]
# ep = int(ep_match.group(1))
#
# if ep <= video['ignore_ep_under']:
# print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
# continue
# elif ep in self.setting.downloaded[video['title']]:
# print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
# continue
#
# link_url = link.get('href')
# if not link_url.startswith('http'):
# top_end = url[8:].find('/')
# if top_end < 0:
# top_url = url[:8 + top_end]
# else:
# top_url = url
#
# if link_url[0] != '/':
# link_url = '/' + link_url
#
# link_url = top_url + link_url
#
# links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
#
# print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
return links
def crawl_downlink(self, link):
print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
def crawl_downlink(self, page_link):
Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))
resp = self.request_get(link.url)
soup = BeautifulSoup(resp.text, 'lxml')
resp = self.request_get(page_link.url)
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
links = []
a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
for tag in a_tags:
file_name = str(tag.find('strong').text)
url = tag.get('href')
links.append(TorrentFile(link.title, link.episode, file_name, url))
file_table = soup.select('table#file_table')
a_tags = file_table[0].select('a')
torrent_links = [a for a in a_tags if '.torrent' in a.text]
smi_links = [a for a in a_tags if '.smi' in a.text]
print(' found download link : {}({})'.format(file_name, url), flush=True)
for link in torrent_links:
file_name = link.text.strip()
sub_url = link.attrs['href']
url = urllib.parse.urljoin(page_link.url, sub_url)
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
Logger.log(' found download link : {}({})'.format(file_name, url))
for link in smi_links:
file_name = link.text.strip()
sub_url = link.attrs['href']
url = urllib.parse.urljoin(page_link.url, sub_url)
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
Logger.log(' found download link : {}({})'.format(file_name, url))
return links
def download_files(self, file):
if file.episode in self.setting.downloaded[file.title]:
def download_files(self, file_link):
if file_link.episode in self.setting.downloaded[file_link.title]:
return
print("start download {}".format(file.file_name), flush=True)
Logger.log("start download {}".format(file_link.file_name))
try:
response = self.request_get(file.url, cookies=file.cookie)
with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
f.write(response.content)
resp = self.request_get(file_link.url)
file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition'])
file_name = urllib.parse.unquote(file_name[0])
with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
f.write(resp.content)
self.setting.downloaded[file.title].append(file.episode)
self.setting.downloaded[file_link.title].append(file_link.episode)
self.setting.save()
print("downloaded {}".format(file.file_name), flush=True)
Logger.log("downloaded {}".format(file_link.file_name))
except Exception as e:
print(e)
Logger.log(e)
def crawl_torrent(self):
page_links = []
@@ -308,8 +201,6 @@ class Crawler:
self.download_files(file)
def crawl(self):
print('Crawling start')
Logger.log('Crawling start')
self.crawl_torrent()
print('Crawling finished')
Logger.log('Crawling finished')
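For readers following the crawl_list changes above, a minimal standalone sketch of how the keyword and episode regexes resolve a board title back to a configured video; the keywords and the sample title below are made-up placeholders, not values from a real settings file.

import re

videos = [{'keyword': 'ShowA'}, {'keyword': 'ShowB'}]  # placeholder keywords

# One named group per configured video: (?P<key0>ShowA)|(?P<key1>ShowB)
re_title = re.compile('|'.join('(?P<key{}>{})'.format(i, v['keyword'])
                               for i, v in enumerate(videos)), re.I)
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)

board_title = 'showb.E07.720p-NEXT'
title_match = re_title.search(board_title)
ep_match = re_episode.search(board_title)
if title_match and ep_match:
    # lastgroup is e.g. 'key1'; stripping the 'key' prefix recovers the index
    video_idx = int(title_match.lastgroup[3:])
    episode = int(ep_match.group(1))
    print(video_idx, episode)  # -> 1 7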

Crawler/Logger.py (new file, +6)

@@ -0,0 +1,6 @@
class Logger:
@staticmethod
def log(msg):
print(msg)
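The new Logger class routes every message through a single static method instead of scattered print calls. A hedged sketch of why that indirection helps: the output destination can later be changed in one place, for example to also append to a log file (hypothetical, not part of this commit; the class name and file path below are made up).

import datetime

class FileLogger:
    LOG_FILE = 'temp/crawler.log'  # assumed location, for illustration only

    @staticmethod
    def log(msg):
        line = '{} {}'.format(datetime.datetime.now().isoformat(), msg)
        print(line)
        with open(FileLogger.LOG_FILE, 'a', encoding='utf-8') as f:
            f.write(line + '\n')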

Crawler/ProxyHandler.py (new file, +98)

@@ -0,0 +1,98 @@
import os
import pickle
import requests
import bs4
import concurrent.futures
from .Logger import Logger
class ProxyHandler:
PROXY_FILE_NAME = 'temp/proxy.bin'
def __init__(self):
if not os.path.exists('temp'):
os.makedirs('temp')
self.proxies = []
self.check_url = ''
def check_proxy(self, proxy, top_url):
try:
resp = requests.get(top_url, proxies=proxy, timeout=2)
except:
proxy['alive'] = False
else:
if resp.status_code != 200:
proxy['alive'] = False
def check_proxy_all(self, proxies, check_url):
Logger.log('checking proxies for {}'.format(check_url))
worker_cnt = 64
pool = concurrent.futures.ThreadPoolExecutor(worker_cnt)
[pool.submit(self.check_proxy, proxy, check_url) for proxy in proxies]
pool.shutdown()
def has_file(self):
return os.path.exists(self.PROXY_FILE_NAME)
def load_proxy(self):
with open(self.PROXY_FILE_NAME, 'rb') as f:
proxies = pickle.load(f)
return proxies
def crawl_proxy(self):
proxies = []
resp = requests.get('https://www.us-proxy.org')
soup = bs4.BeautifulSoup(resp.text, 'lxml')
table = soup.select('table.table')
trs = table[0].select('tr')
for tr in trs[1:]:
tds = tr.select('td')
if len(tds) < 2:
continue
ip, port = tds[0].text, tds[1].text
proxies.append(
{
'alive': True,
'http': '{}:{}'.format(ip, port),
'https': '{}:{}'.format(ip, port),
}
)
self.check_proxy_all(proxies, self.check_url)
alive_proxies = [proxy for proxy in proxies if proxy['alive']]
Logger.log('proxy check finished, available: {}'.format(len(alive_proxies)))
with open(self.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(alive_proxies, f)
return alive_proxies
def get_proxy(self):
if len(self.proxies) <= 0:
if self.has_file():
self.proxies = self.load_proxy()
else:
self.proxies = self.crawl_proxy()
for proxy in self.proxies:
if proxy['alive']:
return proxy
return None
def set_proxy_dead(self, proxy):
proxy['alive'] = False
for proxy in self.proxies:
if proxy['alive']:
with open(self.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(self.proxies, f)
return
os.remove(self.PROXY_FILE_NAME)
self.proxies = []
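A sketch of the consumer side, mirroring what Crawler.request_get does after this change: take a live proxy, and on any exception or non-200 response mark it dead and rotate to the next one. The URLs below are placeholders.

import requests
from Crawler.ProxyHandler import ProxyHandler

handler = ProxyHandler()
handler.check_url = 'https://example.org'        # placeholder check URL
url = 'https://example.org/board.php?page=1'     # placeholder page URL

proxy = handler.get_proxy()
while proxy is not None:
    try:
        resp = requests.get(url, proxies=proxy, timeout=3)
    except Exception:
        handler.set_proxy_dead(proxy)
        proxy = handler.get_proxy()
        continue
    if resp.status_code != 200:
        handler.set_proxy_dead(proxy)
        proxy = handler.get_proxy()
        continue
    break  # this proxy worked; resp holds the page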

Crawler/Setting.py

@@ -1,33 +1,42 @@
import yaml
import os
from .Logger import Logger
class Setting:
SETTING_FILE = 'conf/settings.yml'
DOWNLOADED_FILE = 'temp/downloaded.yml'
def __init__(self):
self.settings = None
self.downloaded = None
if not os.path.exists('temp'):
os.mkdir('temp')
self.load()
pass
def load_settings(self):
if not os.path.isfile('settings.yml'):
print('There is no settings.yml', flush=True)
if not os.path.isfile(self.SETTING_FILE):
Logger.log('There is no {}'.format(self.SETTING_FILE))
exit()
with open('settings.yml', encoding='utf-8') as setting_file:
with open(self.SETTING_FILE, encoding='utf-8') as setting_file:
try:
self.settings = yaml.load(setting_file)
except ValueError as e:
print(e, flush=True)
Logger.log(e)
exit()
if 'video' not in self.settings:
print('video key is need in settings.json', flush=True)
Logger.log('video key is needed in settings.yml')
exit()
for i, video in enumerate(self.settings['video']):
if 'title' not in video:
print('title key is need in video({})'.format(i), flush=True)
Logger.log('title key is needed in video({})'.format(i))
exit()
if 'keyword' not in video:
@@ -49,18 +58,18 @@ class Setting:
try:
os.makedirs(self.settings['download_path'])
except Exception as e:
print(e, flush=True)
Logger.log(e)
exit()
video['keyword'] += self.settings['keyword_append']
def load_downloaded(self):
if os.path.isfile('downloaded.yml'):
with open("downloaded.yml", 'r', encoding='utf-8') as stream:
if os.path.isfile(self.DOWNLOADED_FILE):
with open(self.DOWNLOADED_FILE, 'r', encoding='utf-8') as stream:
try:
self.downloaded = yaml.load(stream)
except yaml.YAMLError as e:
print(e, flush=True)
Logger.log(e)
else:
self.downloaded = {}
@@ -108,6 +117,6 @@ class Setting:
yaml.dump(downloaded_ex, outfile, allow_unicode=True)
def save(self):
with open('downloaded.yml', 'w', encoding='utf-8') as outfile:
with open(self.DOWNLOADED_FILE, 'w', encoding='utf-8') as outfile:
yaml.dump(self.downloaded, outfile, allow_unicode=True)
pass
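For reference, a hedged illustration of the structure Setting.load_settings expects back from yaml.load, based only on the keys read in this file; every value below is invented.

settings = {
    'urls': ['https://torrentkim.example/board.php?page=1'],  # list pages to crawl
    'download_path': 'downloads/',   # concatenated directly with the file name
    'keyword_append': ' 720p',       # appended to every video keyword
    'video': [
        {
            'title': 'ShowA',
            'keyword': 'ShowA',
            'ignore_ep_under': 3,    # episodes at or below this number are skipped
        },
    ],
}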


@@ -1,5 +1,5 @@
from Crawler.Crawler import Crawler
crawler = Crawler()
crawler.crawl()
if __name__ == '__main__':
crawler = Crawler()
crawler.crawl()
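With the __main__ guard added above, the entry script can now be imported without immediately starting a crawl; a hypothetical periodic runner built on top of it (not part of this commit) could look like this.

import time
from Crawler.Crawler import Crawler

if __name__ == '__main__':
    while True:
        Crawler().crawl()
        time.sleep(60 * 60)  # re-check the boards once an hour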

requirements.txt (new file, +2)

@@ -0,0 +1,2 @@
requests
bs4
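Note that the code in this commit also relies on lxml (the parser name passed to BeautifulSoup in Crawler.py and ProxyHandler.py) and PyYAML (the yaml module imported in Setting.py); those two packages would presumably need to be installed as well, even though they are not listed here.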