Compare commits
10 Commits
e9549f3ab5
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| b19499b6a4 | |||
| 8c9ddb9ce8 | |||
| 9d7afbdc1b | |||
| 917894fcac | |||
| f32ff66d2f | |||
| 125c201638 | |||
| 34c0cc5a29 | |||
| 493c44999a | |||
| ef0c1e78ac | |||
| 6729cab06a |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,3 +1,6 @@
|
||||
output/
|
||||
.idea/
|
||||
**/__pycache__/
|
||||
conf/
|
||||
temp/
|
||||
download/
|
||||
|
||||
310
Crawler/Crawler.py
Normal file → Executable file
310
Crawler/Crawler.py
Normal file → Executable file
@@ -1,64 +1,38 @@
|
||||
import sys
|
||||
import io
|
||||
import requests
|
||||
import urllib
|
||||
import bs4
|
||||
import re
|
||||
import os
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import pickle
|
||||
|
||||
from .Setting import Setting
|
||||
from .ProxyHandler import ProxyHandler
|
||||
from .Logger import Logger
|
||||
from .Util import Util
|
||||
from .DataType import PageLink, TorrentFile
|
||||
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
|
||||
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
|
||||
|
||||
|
||||
class PageLink:
|
||||
def __init__(self):
|
||||
self.title = ''
|
||||
self.episode = ''
|
||||
self.url = ''
|
||||
|
||||
def __init__(self, title, episode, url):
|
||||
self.title = title
|
||||
self.episode = episode
|
||||
self.url = url
|
||||
|
||||
def __str__(self):
|
||||
return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
|
||||
|
||||
class TorrentFile:
|
||||
def __init__(self):
|
||||
self.title = ''
|
||||
self.episode = ''
|
||||
self.file_name = ''
|
||||
self.url = ''
|
||||
|
||||
def __init__(self, title, episode, file_name, url):
|
||||
self.title = title
|
||||
self.file_name = file_name
|
||||
self.episode = episode
|
||||
self.url = url
|
||||
|
||||
def __str__(self):
|
||||
return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)
|
||||
|
||||
def __repr__(self):
|
||||
return str(self)
|
||||
from .WorkerTorrentKim import WorkerTorrentKim
|
||||
from .WorkerTfreeca import WorkerTfreeca
|
||||
from .WorkerTocops import WorkerTocops
|
||||
|
||||
|
||||
class Crawler:
|
||||
PROXY_FILE_NAME = 'proxy.bin'
|
||||
|
||||
def __init__(self):
|
||||
self.setting = Setting()
|
||||
self.proxies = []
|
||||
self.session = requests.Session()
|
||||
self.cookies = None
|
||||
self.session = requests.session()
|
||||
self.proxy_handler = ProxyHandler()
|
||||
self.worker = None
|
||||
|
||||
if len(self.setting.settings['urls']) > 0:
|
||||
urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
|
||||
top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
|
||||
self.proxy_handler.check_url = top_url
|
||||
|
||||
if 'torrentkim' in top_url:
|
||||
self.worker = WorkerTorrentKim()
|
||||
elif 'tfreeca' in top_url:
|
||||
self.worker = WorkerTfreeca()
|
||||
elif 'tcorea' in top_url:
|
||||
self.worker = WorkerTocops()
|
||||
|
||||
@staticmethod
|
||||
def print_log(files):
|
||||
@@ -67,89 +41,27 @@ class Crawler:
|
||||
f.write(file.file_name+'\n')
|
||||
f.close()
|
||||
|
||||
def crawl_proxy(self):
|
||||
proxies = []
|
||||
|
||||
if os.path.exists(Crawler.PROXY_FILE_NAME):
|
||||
with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
|
||||
proxies = pickle.load(f)
|
||||
return proxies
|
||||
|
||||
else:
|
||||
resp = requests.get('https://www.us-proxy.org')
|
||||
soup = BeautifulSoup(resp.text, 'lxml')
|
||||
table = soup.select('table.table')
|
||||
trs = table[0].select('tr')
|
||||
cnt = 0
|
||||
|
||||
for tr in trs[1:]:
|
||||
tds = tr.select('td')
|
||||
if len(tds) > 0:
|
||||
ip, port = tds[0].text, tds[1].text
|
||||
proxies.append(
|
||||
{
|
||||
'http': '{}:{}'.format(ip, port),
|
||||
'https': '{}:{}'.format(ip, port),
|
||||
'alive': True,
|
||||
}
|
||||
)
|
||||
# print('{}:{}'.format(ip, port))
|
||||
cnt += 1
|
||||
|
||||
with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
|
||||
pickle.dump(proxies, f)
|
||||
|
||||
print('proxy cnt : {}'.format(cnt))
|
||||
return proxies
|
||||
|
||||
def get_proxy(self):
|
||||
if len(self.proxies) <= 0:
|
||||
if os.path.exists(Crawler.PROXY_FILE_NAME):
|
||||
with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
|
||||
self.proxies = pickle.load(f)
|
||||
else:
|
||||
self.proxies = self.crawl_proxy()
|
||||
|
||||
for proxy in self.proxies:
|
||||
if proxy['alive']:
|
||||
return proxy
|
||||
|
||||
return None
|
||||
|
||||
def set_proxy_dead(self, proxy):
|
||||
proxy['alive'] = False
|
||||
for proxy in self.proxies:
|
||||
if proxy['alive']:
|
||||
with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
|
||||
pickle.dump(self.proxies, f)
|
||||
return
|
||||
|
||||
os.remove(Crawler.PROXY_FILE_NAME)
|
||||
self.proxies = []
|
||||
|
||||
def request_get(self, url):
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
|
||||
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
|
||||
'Accept-Charset': 'utf-8,euc-kr;q=0.7,*;q=0.3',
|
||||
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
|
||||
'Connection': 'keep-alive',
|
||||
}
|
||||
|
||||
proxy = self.get_proxy()
|
||||
proxy = self.proxy_handler.get_proxy()
|
||||
while True:
|
||||
try:
|
||||
resp = self.session.get(url, proxies=proxy, headers=headers, cookies=self.cookies, timeout=3)
|
||||
self.cookies = resp.cookies
|
||||
resp = self.session.get(url, proxies=proxy, headers=headers, timeout=3)
|
||||
except Exception as e:
|
||||
self.set_proxy_dead(proxy)
|
||||
proxy = self.get_proxy()
|
||||
self.proxy_handler.set_proxy_dead(proxy)
|
||||
proxy = self.proxy_handler.get_proxy()
|
||||
continue
|
||||
else:
|
||||
if resp.status_code != 200:
|
||||
self.set_proxy_dead(proxy)
|
||||
proxy = self.get_proxy()
|
||||
self.proxy_handler.set_proxy_dead(proxy)
|
||||
proxy = self.proxy_handler.get_proxy()
|
||||
continue
|
||||
else:
|
||||
break
|
||||
@@ -157,139 +69,45 @@ class Crawler:
|
||||
return resp
|
||||
|
||||
def crawl_list(self, url):
|
||||
print('checking page {}'.format(url), flush=True)
|
||||
Logger.log('checking page {}'.format(url))
|
||||
|
||||
resp = self.request_get(url)
|
||||
html = resp.text
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
|
||||
|
||||
re_title = re.compile('|'.join(['(?P<key' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
|
||||
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
|
||||
|
||||
links = []
|
||||
tables = soup.select('table.table')
|
||||
trs = tables[0].select('tr.')
|
||||
for tr in trs:
|
||||
tds = tr.select('div.td-subject')
|
||||
title = tds[0].text.strip()
|
||||
link = tds[0].select('a')[0].attrs['href']
|
||||
|
||||
title_match = re_title.search(title)
|
||||
if not title_match:
|
||||
continue
|
||||
|
||||
ep_match = re_episode.search(title)
|
||||
if not ep_match:
|
||||
continue
|
||||
|
||||
title_idx = int(title_match.lastgroup[3:])
|
||||
video = self.setting.settings['video'][title_idx]
|
||||
ep = int(ep_match.group(1))
|
||||
|
||||
if ep <= video['ignore_ep_under']:
|
||||
print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
|
||||
continue
|
||||
elif ep in self.setting.downloaded[video['title']]:
|
||||
print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
|
||||
continue
|
||||
|
||||
if not link.startswith('http'):
|
||||
top_end = url[8:].find('/')
|
||||
if top_end < 0:
|
||||
top_url = url[:8 + top_end]
|
||||
else:
|
||||
top_url = url
|
||||
|
||||
if link[0] != '/':
|
||||
link = '/' + link
|
||||
|
||||
link = top_url + link
|
||||
|
||||
links.append(PageLink(video['title'], ep, link))
|
||||
|
||||
print(' found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
|
||||
|
||||
|
||||
# selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a' # torrentkim
|
||||
# for link in soup.select(selector):
|
||||
# if link.has_attr('rel') and 'nofollow' in link['rel']:
|
||||
# continue
|
||||
#
|
||||
# board_title = link.get_text().strip()
|
||||
#
|
||||
# title_match = re_title.search(board_title)
|
||||
# if not title_match:
|
||||
# continue
|
||||
#
|
||||
# ep_match = re_episode.search(board_title)
|
||||
# if not ep_match:
|
||||
# continue
|
||||
#
|
||||
# title_idx = int(title_match.lastgroup[3:])
|
||||
# video = self.setting.settings['video'][title_idx]
|
||||
# ep = int(ep_match.group(1))
|
||||
#
|
||||
# if ep <= video['ignore_ep_under']:
|
||||
# print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
|
||||
# continue
|
||||
# elif ep in self.setting.downloaded[video['title']]:
|
||||
# print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
|
||||
# continue
|
||||
#
|
||||
# link_url = link.get('href')
|
||||
# if not link_url.startswith('http'):
|
||||
# top_end = url[8:].find('/')
|
||||
# if top_end < 0:
|
||||
# top_url = url[:8 + top_end]
|
||||
# else:
|
||||
# top_url = url
|
||||
#
|
||||
# if link_url[0] != '/':
|
||||
# link_url = '/' + link_url
|
||||
#
|
||||
# link_url = top_url + link_url
|
||||
#
|
||||
# links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
|
||||
#
|
||||
# print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
|
||||
|
||||
links = self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)
|
||||
return links
|
||||
|
||||
def crawl_downlink(self, link):
|
||||
print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
|
||||
def crawl_downlink(self, page_link):
|
||||
Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))
|
||||
|
||||
resp = self.request_get(link.url)
|
||||
soup = BeautifulSoup(resp.text, 'lxml')
|
||||
|
||||
links = []
|
||||
a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
|
||||
for tag in a_tags:
|
||||
file_name = str(tag.find('strong').text)
|
||||
url = tag.get('href')
|
||||
links.append(TorrentFile(link.title, link.episode, file_name, url))
|
||||
|
||||
print(' found download link : {}({})'.format(file_name, url), flush=True)
|
||||
resp = self.request_get(page_link.url)
|
||||
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
|
||||
|
||||
links = self.worker.crawl_downlink(page_link, soup)
|
||||
return links
|
||||
|
||||
def download_files(self, file):
|
||||
if file.episode in self.setting.downloaded[file.title]:
|
||||
def download_files(self, file_link):
|
||||
if file_link.episode in self.setting.downloaded[file_link.title]:
|
||||
return
|
||||
|
||||
print("start download {}".format(file.file_name), flush=True)
|
||||
Logger.log("start download {}".format(file_link.file_name))
|
||||
|
||||
try:
|
||||
response = self.request_get(file.url, cookies=file.cookie)
|
||||
with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
|
||||
f.write(response.content)
|
||||
resp = self.request_get(file_link.url)
|
||||
file_name = file_link.file_name
|
||||
with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
|
||||
f.write(resp.content)
|
||||
|
||||
self.setting.downloaded[file.title].append(file.episode)
|
||||
self.setting.downloaded[file_link.title].append(file_link.episode)
|
||||
self.setting.save()
|
||||
|
||||
print("downloaded {}".format(file.file_name), flush=True)
|
||||
Logger.log("downloaded {}".format(file_name))
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
Logger.log(e)
|
||||
|
||||
def crawl_torrent(self):
|
||||
page_links = []
|
||||
@@ -308,8 +126,28 @@ class Crawler:
|
||||
self.download_files(file)
|
||||
|
||||
def crawl(self):
|
||||
print('Crawling start')
|
||||
if Util.get_free_space() < 4*1024*1024:
|
||||
Logger.log('Disk space is less than 4GB. Aborted')
|
||||
return
|
||||
|
||||
# self.test()
|
||||
|
||||
Logger.log('Crawling start')
|
||||
self.crawl_torrent()
|
||||
Logger.log('Crawling finished')
|
||||
|
||||
print('Crawling finished')
|
||||
|
||||
class Sorter:
    """Moves finished downloads into one folder per configured video title."""

    @staticmethod
    def move_files():
        """Move files from ``file_download_path`` to ``file_move_path/<title>/``.

        Does nothing unless both ``file_download_path`` and ``file_move_path``
        are present in the settings. A file belongs to the first configured
        video whose ``title`` occurs in the file name.

        Raises:
            OSError: propagated from ``os.listdir``, ``os.makedirs`` or
                ``os.rename`` (for example when source and target are on
                different filesystems).
        """
        settings = Setting().settings
        if 'file_download_path' not in settings or 'file_move_path' not in settings:
            return

        download_dir = settings['file_download_path']
        move_dir = settings['file_move_path']

        for filename in os.listdir(download_dir):
            for video in settings['video']:
                if video['title'] not in filename:
                    continue

                target_dir = os.path.join(move_dir, video['title'])
                # os.rename() does not create missing directories.
                os.makedirs(target_dir, exist_ok=True)
                os.rename(os.path.join(download_dir, filename),
                          os.path.join(target_dir, filename))
                # The file has left download_dir; testing further titles would
                # make os.rename() fail on the now-missing source.
                break
|
||||
|
||||
36
Crawler/DataType.py
Normal file
36
Crawler/DataType.py
Normal file
@@ -0,0 +1,36 @@
|
||||
class PageLink:
    """A board entry that matched a configured video: title, episode, page URL."""

    def __init__(self, title='', episode='', url=''):
        """Store the three fields.

        The original class declared ``__init__`` twice; Python keeps only the
        last definition, so the zero-argument form never worked. Defaults on a
        single constructor provide both call styles.

        Args:
            title: title of the matched video.
            episode: episode identifier.
            url: address of the content page.
        """
        self.title = title
        self.episode = episode
        self.url = url

    def __str__(self):
        """Return ``{ title(episode): url }``."""
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of links print readably."""
        return str(self)
|
||||
|
||||
|
||||
class TorrentFile:
    """A downloadable file found on a content page."""

    def __init__(self, title='', episode='', file_name='', url=''):
        """Store the four fields.

        The original class declared ``__init__`` twice; Python keeps only the
        last definition, so the zero-argument form never worked. Defaults on a
        single constructor provide both call styles.

        Args:
            title: title of the video the file belongs to.
            episode: episode identifier.
            file_name: name to save the file under.
            url: address the file is downloaded from.
        """
        self.title = title
        self.file_name = file_name
        self.episode = episode
        self.url = url

    def __str__(self):
        """Return ``{ title(episode): file_name(url) }``."""
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        """Use the same text as ``__str__`` so lists of files print readably."""
        return str(self)
|
||||
6
Crawler/Logger.py
Normal file
6
Crawler/Logger.py
Normal file
@@ -0,0 +1,6 @@
|
||||
|
||||
class Logger:
    """Minimal logging facade: every message goes to standard output."""

    @staticmethod
    def log(msg):
        """Print ``msg`` followed by a newline.

        Args:
            msg: any object; it is rendered the way ``print`` renders it.
        """
        print(msg)
|
||||
146
Crawler/ProxyHandler.py
Executable file
146
Crawler/ProxyHandler.py
Executable file
@@ -0,0 +1,146 @@
|
||||
import os
|
||||
import pickle
|
||||
import requests
|
||||
import bs4
|
||||
import concurrent.futures
|
||||
import time
|
||||
|
||||
from .Logger import Logger
|
||||
from .Util import Util
|
||||
|
||||
class ProxyHandler:
    """Scrapes public proxy lists, probes them and caches the live ones.

    A proxy is a plain dict with the keys ``'alive'``, ``'http'`` and
    ``'https'``; the same dict is passed to ``requests`` as ``proxies=``.
    """

    PROXY_FILE_NAME = 'temp/proxy.bin'

    # Pages scraped by crawl_proxy(); all are parsed the same way (first
    # ``table.table``, IP in column 0, port in column 1).
    PROXY_LIST_URLS = (
        'https://www.us-proxy.org',
        'https://www.socks-proxy.net',
        'https://www.sslproxies.org',
    )

    # Seconds to wait for a proxy-list page before giving up on that page.
    LIST_TIMEOUT = 10

    def __init__(self):
        """Create the ``temp`` cache directory if needed and start empty."""
        os.makedirs('temp', exist_ok=True)

        self.proxies = []
        self.check_url = ''

    def check_proxy(self, proxy, top_url):
        """Mark ``proxy`` dead unless ``top_url`` answers 200 through it.

        Args:
            proxy: proxy dict; its ``'alive'`` entry is set to ``False`` on
                failure and left untouched on success.
            top_url: URL requested through the proxy.
        """
        try:
            resp = requests.get(top_url, proxies=proxy, timeout=2)
        except Exception:
            # Any request failure means the proxy is unusable. ``Exception``
            # (not a bare ``except``) lets KeyboardInterrupt/SystemExit through.
            proxy['alive'] = False
        else:
            if resp.status_code != 200:
                proxy['alive'] = False

    def check_proxy_all(self, proxies, check_url):
        """Probe every proxy in ``proxies`` concurrently; block until done."""
        Logger.log('checking proxies for {}'.format(check_url))

        worker_cnt = 16
        # Leaving the ``with`` block waits for all submitted checks.
        with concurrent.futures.ThreadPoolExecutor(worker_cnt) as pool:
            for proxy in proxies:
                pool.submit(self.check_proxy, proxy, check_url)

    def has_file(self):
        """Return whether the proxy cache file exists."""
        return os.path.exists(self.PROXY_FILE_NAME)

    def load_proxy(self):
        """Return the proxy list stored in the cache file.

        NOTE(review): ``pickle.load`` executes arbitrary code from a crafted
        file. The cache is written by this class, but it must never be
        replaced by a file from an untrusted source.
        """
        with open(self.PROXY_FILE_NAME, 'rb') as f:
            proxies = pickle.load(f)

        return proxies

    def _scrape_proxy_list(self, url):
        """Return the proxies listed on ``url``, or ``[]`` if it is unusable."""
        try:
            resp = requests.get(url, timeout=self.LIST_TIMEOUT)
        except requests.RequestException as e:
            # One unreachable list must not abort the whole crawl.
            Logger.log('failed to fetch proxy list {} : {}'.format(url, e))
            return []

        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        tables = soup.select('table.table')
        if not tables:
            Logger.log('no proxy table found at {}'.format(url))
            return []

        found = []
        # The first row is skipped, as in the original code.
        for tr in tables[0].select('tr')[1:]:
            tds = tr.select('td')
            if len(tds) < 2:
                continue

            address = '{}:{}'.format(tds[0].text, tds[1].text)
            found.append({'alive': True, 'http': address, 'https': address})

        return found

    def crawl_proxy(self):
        """Scrape all proxy lists, probe them and return the live proxies.

        The live proxies are also pickled to ``PROXY_FILE_NAME`` when
        ``Util.get_free_space()`` reports at least 1024.
        """
        proxies = []
        for url in self.PROXY_LIST_URLS:
            proxies.extend(self._scrape_proxy_list(url))

        self.check_proxy_all(proxies, self.check_url)
        alive_proxies = [proxy for proxy in proxies if proxy['alive']]
        Logger.log('proxies checking end: available : {}'.format(len(alive_proxies)))

        if Util.get_free_space() >= 1024:
            with open(self.PROXY_FILE_NAME, 'wb') as f:
                pickle.dump(alive_proxies, f)

        return alive_proxies

    def get_proxy(self):
        """Return the first live proxy, refilling the list when it is empty.

        An empty list is refilled from the cache file and, if that yields
        nothing, by crawling every 10 seconds until a proxy is found.

        Returns:
            A proxy dict, or ``None`` when the list is non-empty but holds no
            live entry.
        """
        if not self.proxies and self.has_file():
            self.proxies = [proxy for proxy in self.load_proxy() if proxy['alive']]

        while not self.proxies:
            self.proxies = self.crawl_proxy()
            if not self.proxies:
                Logger.log('there is no available proxy. sleep 10secs..')
                time.sleep(10)

        return next((proxy for proxy in self.proxies if proxy['alive']), None)

    def set_proxy_dead(self, proxy):
        """Mark ``proxy`` dead and update the cache.

        While live proxies remain, the list is re-pickled if
        ``Util.get_free_space()`` reports at least 1024. Once none remain,
        the cache file is deleted and the list is emptied so that the next
        ``get_proxy()`` call crawls again.
        """
        proxy['alive'] = False

        if any(candidate['alive'] for candidate in self.proxies):
            # Low disk space only skips the save; the live proxies kept in
            # memory remain usable.
            if Util.get_free_space() >= 1024:
                with open(self.PROXY_FILE_NAME, 'wb') as f:
                    pickle.dump(self.proxies, f)
            return

        try:
            os.remove(self.PROXY_FILE_NAME)
        except FileNotFoundError:
            # The cache may never have been written (e.g. disk was full).
            pass
        self.proxies = []
|
||||
52
Crawler/Setting.py
Normal file → Executable file
52
Crawler/Setting.py
Normal file → Executable file
@@ -1,33 +1,42 @@
|
||||
import yaml
|
||||
import os
|
||||
|
||||
from .Logger import Logger
|
||||
from .Util import Util
|
||||
|
||||
class Setting:
|
||||
SETTING_FILE = 'conf/settings.yml'
|
||||
DOWNLOADED_FILE = 'temp/downloaded.yml'
|
||||
|
||||
def __init__(self):
|
||||
self.settings = None
|
||||
self.downloaded = None
|
||||
|
||||
if not os.path.exists('temp'):
|
||||
os.mkdir('temp')
|
||||
|
||||
self.load()
|
||||
pass
|
||||
|
||||
def load_settings(self):
|
||||
if not os.path.isfile('settings.yml'):
|
||||
print('There is no settings.yml', flush=True)
|
||||
if not os.path.isfile(self.SETTING_FILE):
|
||||
Logger.log('There is no {}'.format(self.SETTING_FILE))
|
||||
exit()
|
||||
|
||||
with open('settings.yml', encoding='utf-8') as setting_file:
|
||||
with open(self.SETTING_FILE, encoding='utf-8') as setting_file:
|
||||
try:
|
||||
self.settings = yaml.load(setting_file)
|
||||
except ValueError as e:
|
||||
print(e, flush=True)
|
||||
Logger.log(e)
|
||||
exit()
|
||||
|
||||
if 'video' not in self.settings:
|
||||
print('video key is need in settings.json', flush=True)
|
||||
Logger.log('video key is need in settings.json')
|
||||
exit()
|
||||
|
||||
for i, video in enumerate(self.settings['video']):
|
||||
if 'title' not in video:
|
||||
print('title key is need in video({})'.format(i), flush=True)
|
||||
Logger.log('title key is need in video({})'.format(i))
|
||||
exit()
|
||||
|
||||
if 'keyword' not in video:
|
||||
@@ -42,25 +51,39 @@ class Setting:
|
||||
if 'download_path' not in self.settings:
|
||||
self.settings['download_path'] = '.'
|
||||
|
||||
if self.settings['download_path'][-1] != '\\':
|
||||
self.settings['download_path'] += '\\'
|
||||
if self.settings['download_path'][-1] != '/':
|
||||
self.settings['download_path'] += '/'
|
||||
|
||||
if not os.path.exists(self.settings['download_path']):
|
||||
try:
|
||||
os.makedirs(self.settings['download_path'])
|
||||
except Exception as e:
|
||||
print(e, flush=True)
|
||||
Logger.log(e)
|
||||
exit()
|
||||
|
||||
if 'download_path' in self.settings and self.settings['file_download_path'][-1] != '/':
|
||||
self.settings['file_download_path'] += '/'
|
||||
|
||||
if 'file_move_path' in self.settings:
|
||||
if self.settings['file_move_path'][-1] != '/':
|
||||
self.settings['file_move_path'] += '/'
|
||||
|
||||
if not os.path.exists(self.settings['file_move_path']):
|
||||
try:
|
||||
os.makedirs(self.settings['file_move_path'])
|
||||
except Exception as e:
|
||||
Logger.log(e)
|
||||
exit()
|
||||
|
||||
video['keyword'] += self.settings['keyword_append']
|
||||
|
||||
def load_downloaded(self):
|
||||
if os.path.isfile('downloaded.yml'):
|
||||
with open("downloaded.yml", 'r', encoding='utf-8') as stream:
|
||||
if os.path.isfile(self.DOWNLOADED_FILE):
|
||||
with open(self.DOWNLOADED_FILE, 'r', encoding='utf-8') as stream:
|
||||
try:
|
||||
self.downloaded = yaml.load(stream)
|
||||
except yaml.YAMLError as e:
|
||||
print(e, flush=True)
|
||||
Logger.log(e)
|
||||
else:
|
||||
self.downloaded = {}
|
||||
|
||||
@@ -108,6 +131,9 @@ class Setting:
|
||||
yaml.dump(downloaded_ex, outfile, allow_unicode=True)
|
||||
|
||||
def save(self):
|
||||
with open('downloaded.yml', 'w', encoding='utf-8') as outfile:
|
||||
if Util.get_free_space() < 1024:
|
||||
return
|
||||
|
||||
with open(self.DOWNLOADED_FILE, 'w', encoding='utf-8') as outfile:
|
||||
yaml.dump(self.downloaded, outfile, allow_unicode=True)
|
||||
pass
|
||||
|
||||
13
Crawler/Util.py
Normal file
13
Crawler/Util.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import subprocess
|
||||
import platform
|
||||
|
||||
class Util:
    """Miscellaneous helpers."""

    @staticmethod
    def get_free_space():
        """Return the space available on ``/`` in 1 KiB units.

        On Linux the value comes from ``shutil.disk_usage``. This replaces
        parsing the ``str()`` form of ``df`` output, which broke whenever the
        data line did not split into exactly six fields, and whose unit
        depended on how ``df`` was configured. On every other platform the
        original fixed value ``1024**4`` is returned.

        Returns:
            int: available space in 1 KiB units, or ``1024**4`` when the
            platform is not Linux.
        """
        if platform.system() != 'Linux':
            return 1024 * 1024 * 1024 * 1024

        # Imported here because the module header only imports subprocess
        # and platform.
        import shutil

        # ``free`` is in bytes; callers compare against KiB thresholds.
        return int(shutil.disk_usage('/').free // 1024)
|
||||
78
Crawler/WorkerTfreeca.py
Normal file
78
Crawler/WorkerTfreeca.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import requests
|
||||
import urllib
|
||||
import bs4
|
||||
import re
|
||||
import os
|
||||
|
||||
from .Setting import Setting
|
||||
from .ProxyHandler import ProxyHandler
|
||||
from .Logger import Logger
|
||||
from .Util import Util
|
||||
from .DataType import PageLink, TorrentFile
|
||||
|
||||
|
||||
class WorkerTfreeca:
    """Page parser for boards laid out as ``table.b_list`` / ``table#file_table``."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Collect content-page links for wanted episodes from a list page.

        Args:
            top_url: URL relative links are resolved against.
            soup: parsed list page.
            re_title: compiled pattern whose named groups are ``key<index>``,
                where ``<index>`` selects an entry of
                ``setting.settings['video']``.
            re_episode: compiled pattern whose group 1 is the episode number.
            setting: object exposing ``settings`` and ``downloaded``.

        Returns:
            list of ``PageLink``, in page order.
        """
        links = []
        tables = soup.select('table.b_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue

            title = tds[0].text.strip()
            # The second anchor of the subject cell carries the page link.
            link = tds[0].select('a')[1].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue

            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # lastgroup is 'key<index>'; strip the 'key' prefix.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))

            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))

        return links

    def crawl_downlink(self, page_link, soup):
        """Collect ``.torrent`` and ``.smi`` download links from a content page.

        Args:
            page_link: the ``PageLink`` the page was loaded from; its ``url``
                is the base for relative download links.
            soup: parsed content page.

        Returns:
            list of ``TorrentFile``: all torrent links first, then all
            subtitle links.
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]

        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            # Always resolve against the page URL. The subtitle loop used
            # ``link.url``, which on an anchor tag is a child-tag lookup
            # yielding None, so relative subtitle links were never resolved.
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))

            Logger.log(' found download link : {}({})'.format(file_name, url))

        return links
|
||||
74
Crawler/WorkerTocops.py
Normal file
74
Crawler/WorkerTocops.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import requests
|
||||
import urllib
|
||||
import bs4
|
||||
import re
|
||||
import os
|
||||
|
||||
from .Setting import Setting
|
||||
from .ProxyHandler import ProxyHandler
|
||||
from .Logger import Logger
|
||||
from .Util import Util
|
||||
from .DataType import PageLink, TorrentFile
|
||||
|
||||
class WorkerTocops:
    """Page parser for boards laid out as ``table.board_list`` with JS download links."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Collect content-page links for wanted episodes from a list page.

        Args:
            top_url: URL relative links are resolved against.
            soup: parsed list page.
            re_title: compiled pattern whose named groups are ``key<index>``,
                where ``<index>`` selects an entry of
                ``setting.settings['video']``.
            re_episode: compiled pattern whose group 1 is the episode number.
            setting: object exposing ``settings`` and ``downloaded``.

        Returns:
            list of ``PageLink``, in page order.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue

            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue

            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # lastgroup is 'key<index>'; strip the 'key' prefix.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))

            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))

        return links

    def crawl_downlink(self, page_link, soup):
        """Collect download links from ``javascript:file_download(...)`` anchors.

        Args:
            page_link: the ``PageLink`` the page was loaded from; its ``url``
                is the base for relative download links.
            soup: parsed content page.

        Returns:
            list of ``TorrentFile``; empty when the page body contains the
            "reported post" notice.
        """
        links = []

        # The literal below is the site's "reported post" notice. A page
        # without the #writeContents element is simply scanned for links
        # instead of raising AttributeError on ``None.text``.
        contents = soup.find(id='writeContents')
        if contents is not None and '신고된 게시물' in contents.text:
            return links

        start = len("javascript:file_download('")
        for a in soup.find_all('a'):
            if 'href' not in a.attrs or 'javascript:file_download' not in a['href']:
                continue

            jscript = a['href']
            # Expected shape: javascript:file_download('<url>','<name>');
            end = jscript.find("','")
            if end < 0:
                # Separator missing: not the two-argument form, so skip it.
                continue

            sub_url = jscript[start:end]
            url = urllib.parse.urljoin(page_link.url, sub_url)

            # Drop the separator in front and the last three characters.
            file_name = jscript[end+3:-3]
            file_name = urllib.parse.unquote(file_name)

            torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
            links.append(torrent)
            Logger.log(' found download link : {}({})'.format(file_name, url))

        return links
|
||||
79
Crawler/WorkerTorrentKim.py
Normal file
79
Crawler/WorkerTorrentKim.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import requests
|
||||
import urllib
|
||||
import bs4
|
||||
import re
|
||||
import os
|
||||
|
||||
from .Setting import Setting
|
||||
from .ProxyHandler import ProxyHandler
|
||||
from .Logger import Logger
|
||||
from .Util import Util
|
||||
from .DataType import PageLink, TorrentFile
|
||||
|
||||
|
||||
class WorkerTorrentKim:
    """Page parser for boards laid out as ``table.board_list`` / ``table#file_table``."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Collect content-page links for wanted episodes from a list page.

        Args:
            top_url: URL relative links are resolved against.
            soup: parsed list page.
            re_title: compiled pattern whose named groups are ``key<index>``,
                where ``<index>`` selects an entry of
                ``setting.settings['video']``.
            re_episode: compiled pattern whose group 1 is the episode number.
            setting: object exposing ``settings`` and ``downloaded``.

        Returns:
            list of ``PageLink``, in page order.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                # Rows without a subject cell cannot be indexed with tds[0].
                continue

            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue

            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # lastgroup is 'key<index>'; strip the 'key' prefix.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))

            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))

        return links

    def crawl_downlink(self, page_link, soup=None):
        """Collect ``.torrent`` and ``.smi`` download links from a content page.

        Args:
            page_link: the ``PageLink`` of the content page; its ``url`` is
                the base for relative download links.
            soup: parsed content page. When omitted the page is fetched here
                with a plain ``requests.get``. The original body called
                ``self.request_get``, which this class does not define.
                NOTE(review): the fallback request uses no proxy; confirm
                that is acceptable for any caller that omits ``soup``.

        Returns:
            list of ``TorrentFile``: all torrent links first, then all
            subtitle links.
        """
        if soup is None:
            Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))

            resp = requests.get(page_link.url, timeout=10)
            soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')

        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]

        for link in torrent_links + smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            # Always resolve against the page URL. The subtitle loop used
            # ``link.url``, which on an anchor tag is a child-tag lookup
            # yielding None, so relative subtitle links were never resolved.
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))

            Logger.log(' found download link : {}({})'.format(file_name, url))

        return links
|
||||
5
Main.py
Normal file → Executable file
5
Main.py
Normal file → Executable file
@@ -1,5 +1,10 @@
|
||||
from Crawler.Crawler import Crawler
|
||||
from Crawler.Crawler import Sorter
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
file_sorter = Sorter()
|
||||
file_sorter.move_files()
|
||||
|
||||
crawler = Crawler()
|
||||
crawler.crawl()
|
||||
|
||||
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
requests
|
||||
bs4
|
||||
PyYAML
|
||||
Reference in New Issue
Block a user