Compare commits

...

10 Commits

Author SHA1 Message Date
b19499b6a4 request with session 2018-06-25 20:23:40 +09:00
8c9ddb9ce8 refactoring and add tocops 2018-06-25 00:47:32 +09:00
9d7afbdc1b 디스크가 4GB 이상 남았을 때만 Crawling 2018-05-24 04:13:49 +09:00
917894fcac file sorter 추가 2018-05-24 03:50:40 +09:00
f32ff66d2f 디스크 용량 확인 및 기타 수정 2018-05-04 21:39:32 +09:00
125c201638 압축 형식 삭제 2017-12-16 21:19:10 +09:00
34c0cc5a29 i don't know 2017-12-16 20:55:25 +09:00
493c44999a 리눅스 환경에서 디버깅 2017-08-06 05:11:53 +09:00
ef0c1e78ac .gitignore 파일 수정 2017-08-06 03:21:11 +09:00
6729cab06a - torrentkim 사이트로 변경
- 파일 폴더 구성
2017-08-06 03:19:23 +09:00
14 changed files with 559 additions and 252 deletions

3
.gitignore vendored
View File

@@ -1,3 +1,6 @@
output/
.idea/
**/__pycache__/
conf/
temp/
download/

312
Crawler/Crawler.py Normal file → Executable file
View File

@@ -1,64 +1,38 @@
import sys
import io
import requests
import urllib
import bs4
import re
import os
import requests
from bs4 import BeautifulSoup
import re
import pickle
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
class PageLink:
def __init__(self):
self.title = ''
self.episode = ''
self.url = ''
def __init__(self, title, episode, url):
self.title = title
self.episode = episode
self.url = url
def __str__(self):
return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)
def __repr__(self):
return str(self)
class TorrentFile:
def __init__(self):
self.title = ''
self.episode = ''
self.file_name = ''
self.url = ''
def __init__(self, title, episode, file_name, url):
self.title = title
self.file_name = file_name
self.episode = episode
self.url = url
def __str__(self):
return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)
def __repr__(self):
return str(self)
from .WorkerTorrentKim import WorkerTorrentKim
from .WorkerTfreeca import WorkerTfreeca
from .WorkerTocops import WorkerTocops
class Crawler:
PROXY_FILE_NAME = 'proxy.bin'
def __init__(self):
self.setting = Setting()
self.proxies = []
self.session = requests.Session()
self.cookies = None
self.session = requests.session()
self.proxy_handler = ProxyHandler()
self.worker = None
if len(self.setting.settings['urls']) > 0:
urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
self.proxy_handler.check_url = top_url
if 'torrentkim' in top_url:
self.worker = WorkerTorrentKim()
elif 'tfreeca' in top_url:
self.worker = WorkerTfreeca()
elif 'tcorea' in top_url:
self.worker = WorkerTocops()
@staticmethod
def print_log(files):
@@ -67,89 +41,27 @@ class Crawler:
f.write(file.file_name+'\n')
f.close()
def crawl_proxy(self):
proxies = []
if os.path.exists(Crawler.PROXY_FILE_NAME):
with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
proxies = pickle.load(f)
return proxies
else:
resp = requests.get('https://www.us-proxy.org')
soup = BeautifulSoup(resp.text, 'lxml')
table = soup.select('table.table')
trs = table[0].select('tr')
cnt = 0
for tr in trs[1:]:
tds = tr.select('td')
if len(tds) > 0:
ip, port = tds[0].text, tds[1].text
proxies.append(
{
'http': '{}:{}'.format(ip, port),
'https': '{}:{}'.format(ip, port),
'alive': True,
}
)
# print('{}:{}'.format(ip, port))
cnt += 1
with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(proxies, f)
print('proxy cnt : {}'.format(cnt))
return proxies
def get_proxy(self):
if len(self.proxies) <= 0:
if os.path.exists(Crawler.PROXY_FILE_NAME):
with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
self.proxies = pickle.load(f)
else:
self.proxies = self.crawl_proxy()
for proxy in self.proxies:
if proxy['alive']:
return proxy
return None
def set_proxy_dead(self, proxy):
proxy['alive'] = False
for proxy in self.proxies:
if proxy['alive']:
with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(self.proxies, f)
return
os.remove(Crawler.PROXY_FILE_NAME)
self.proxies = []
def request_get(self, url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
'Accept-Charset': 'utf-8,euc-kr;q=0.7,*;q=0.3',
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
'Connection': 'keep-alive',
}
proxy = self.get_proxy()
proxy = self.proxy_handler.get_proxy()
while True:
try:
resp = self.session.get(url, proxies=proxy, headers=headers, cookies=self.cookies, timeout=3)
self.cookies = resp.cookies
resp = self.session.get(url, proxies=proxy, headers=headers, timeout=3)
except Exception as e:
self.set_proxy_dead(proxy)
proxy = self.get_proxy()
self.proxy_handler.set_proxy_dead(proxy)
proxy = self.proxy_handler.get_proxy()
continue
else:
if resp.status_code != 200:
self.set_proxy_dead(proxy)
proxy = self.get_proxy()
self.proxy_handler.set_proxy_dead(proxy)
proxy = self.proxy_handler.get_proxy()
continue
else:
break
@@ -157,139 +69,45 @@ class Crawler:
return resp
def crawl_list(self, url):
print('checking page {}'.format(url), flush=True)
Logger.log('checking page {}'.format(url))
resp = self.request_get(url)
html = resp.text
soup = BeautifulSoup(html, 'lxml')
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
re_title = re.compile('|'.join(['(?P<key' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
links = []
tables = soup.select('table.table')
trs = tables[0].select('tr.')
for tr in trs:
tds = tr.select('div.td-subject')
title = tds[0].text.strip()
link = tds[0].select('a')[0].attrs['href']
title_match = re_title.search(title)
if not title_match:
continue
ep_match = re_episode.search(title)
if not ep_match:
continue
title_idx = int(title_match.lastgroup[3:])
video = self.setting.settings['video'][title_idx]
ep = int(ep_match.group(1))
if ep <= video['ignore_ep_under']:
print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
continue
elif ep in self.setting.downloaded[video['title']]:
print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
continue
if not link.startswith('http'):
top_end = url[8:].find('/')
if top_end < 0:
top_url = url[:8 + top_end]
else:
top_url = url
if link[0] != '/':
link = '/' + link
link = top_url + link
links.append(PageLink(video['title'], ep, link))
print(' found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
# selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a' # torrentkim
# for link in soup.select(selector):
# if link.has_attr('rel') and 'nofollow' in link['rel']:
# continue
#
# board_title = link.get_text().strip()
#
# title_match = re_title.search(board_title)
# if not title_match:
# continue
#
# ep_match = re_episode.search(board_title)
# if not ep_match:
# continue
#
# title_idx = int(title_match.lastgroup[3:])
# video = self.setting.settings['video'][title_idx]
# ep = int(ep_match.group(1))
#
# if ep <= video['ignore_ep_under']:
# print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
# continue
# elif ep in self.setting.downloaded[video['title']]:
# print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
# continue
#
# link_url = link.get('href')
# if not link_url.startswith('http'):
# top_end = url[8:].find('/')
# if top_end < 0:
# top_url = url[:8 + top_end]
# else:
# top_url = url
#
# if link_url[0] != '/':
# link_url = '/' + link_url
#
# link_url = top_url + link_url
#
# links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
#
# print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
links = self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)
return links
def crawl_downlink(self, link):
print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
def crawl_downlink(self, page_link):
Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))
resp = self.request_get(link.url)
soup = BeautifulSoup(resp.text, 'lxml')
links = []
a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
for tag in a_tags:
file_name = str(tag.find('strong').text)
url = tag.get('href')
links.append(TorrentFile(link.title, link.episode, file_name, url))
print(' found download link : {}({})'.format(file_name, url), flush=True)
resp = self.request_get(page_link.url)
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
links = self.worker.crawl_downlink(page_link, soup)
return links
def download_files(self, file):
if file.episode in self.setting.downloaded[file.title]:
def download_files(self, file_link):
if file_link.episode in self.setting.downloaded[file_link.title]:
return
print("start download {}".format(file.file_name), flush=True)
Logger.log("start download {}".format(file_link.file_name))
try:
response = self.request_get(file.url, cookies=file.cookie)
with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
f.write(response.content)
resp = self.request_get(file_link.url)
file_name = file_link.file_name
with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
f.write(resp.content)
self.setting.downloaded[file.title].append(file.episode)
self.setting.downloaded[file_link.title].append(file_link.episode)
self.setting.save()
print("downloaded {}".format(file.file_name), flush=True)
Logger.log("downloaded {}".format(file_name))
except Exception as e:
print(e)
Logger.log(e)
def crawl_torrent(self):
page_links = []
@@ -308,8 +126,28 @@ class Crawler:
self.download_files(file)
def crawl(self):
print('Crawling start')
if Util.get_free_space() < 4*1024*1024:
Logger.log('Disk space is less than 4GB. Aborted')
return
# self.test()
Logger.log('Crawling start')
self.crawl_torrent()
Logger.log('Crawling finished')
print('Crawling finished')
class Sorter:
    """Moves finished downloads from the download directory into per-title folders."""

    @staticmethod
    def move_files():
        """Move each file whose name contains a configured video title into
        ``<file_move_path>/<title>/``.

        No-op unless both 'file_download_path' and 'file_move_path' are present
        in settings. Assumes the per-title destination directory already
        exists -- TODO confirm (os.rename does not create parents).
        """
        setting = Setting()
        settings = setting.settings
        if 'file_download_path' not in settings or 'file_move_path' not in settings:
            return
        for filename in os.listdir(settings['file_download_path']):
            for video in settings['video']:
                if video['title'] in filename:
                    old_path = os.path.join(settings['file_download_path'], filename)
                    # Use os.path.join for the title directory instead of '/' concat.
                    new_path = os.path.join(settings['file_move_path'], video['title'], filename)
                    os.rename(old_path, new_path)
                    # BUGFIX: stop after the first matching title; without this,
                    # a second matching title would try to rename the
                    # already-moved file and raise FileNotFoundError.
                    break

36
Crawler/DataType.py Normal file
View File

@@ -0,0 +1,36 @@
class PageLink:
    """A link to one content page for a specific episode of a tracked title.

    Attributes:
        title: the configured video title this page belongs to.
        episode: episode number (int as parsed by the caller).
        url: absolute URL of the content page.
    """

    def __init__(self, title, episode, url):
        # BUGFIX: removed a dead zero-argument __init__ that preceded this one;
        # Python keeps only the last definition, so it was unreachable.
        self.title = title
        self.episode = episode
        self.url = url

    def __str__(self):
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        return str(self)
class TorrentFile:
    """A downloadable file (torrent or subtitle) discovered on a content page.

    Attributes:
        title: the configured video title.
        episode: episode number.
        file_name: display/file name of the download.
        url: absolute download URL.
    """

    def __init__(self, title, episode, file_name, url):
        # BUGFIX: removed a dead zero-argument __init__ that preceded this one;
        # Python keeps only the last definition, so it was unreachable.
        self.title = title
        self.file_name = file_name
        self.episode = episode
        self.url = url

    def __str__(self):
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        return str(self)

6
Crawler/Logger.py Normal file
View File

@@ -0,0 +1,6 @@
class Logger:
    """Minimal logging facade; currently just writes messages to stdout."""

    @staticmethod
    def log(msg):
        """Emit *msg* on standard output."""
        print(msg)

146
Crawler/ProxyHandler.py Executable file
View File

@@ -0,0 +1,146 @@
import os
import pickle
import requests
import bs4
import concurrent.futures
import time
from .Logger import Logger
from .Util import Util
class ProxyHandler:
    """Scrapes public proxy lists, health-checks them, and serves live proxies.

    Live proxies are cached in PROXY_FILE_NAME (pickle) so repeated runs
    don't re-scrape; the cache is rewritten/dropped as proxies die.
    """

    PROXY_FILE_NAME = 'temp/proxy.bin'

    # Free proxy-list sites that all publish rows in an HTML <table class="table">.
    PROXY_SOURCES = (
        'https://www.us-proxy.org',
        'https://www.socks-proxy.net',
        'https://www.sslproxies.org',
    )

    def __init__(self):
        if not os.path.exists('temp'):
            os.makedirs('temp')
        self.proxies = []
        # Caller sets this to the target site's top URL before crawling.
        self.check_url = ''

    def check_proxy(self, proxy, top_url):
        """Mark *proxy* dead in place if it cannot fetch *top_url* within 2s."""
        try:
            resp = requests.get(top_url, proxies=proxy, timeout=2)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            proxy['alive'] = False
        else:
            if resp.status_code != 200:
                proxy['alive'] = False

    def check_proxy_all(self, proxies, check_url):
        """Health-check all *proxies* against *check_url* using a thread pool."""
        Logger.log('checking proxies for {}'.format(check_url))
        worker_cnt = 16
        pool = concurrent.futures.ThreadPoolExecutor(worker_cnt)
        for proxy in proxies:
            pool.submit(self.check_proxy, proxy, check_url)
        pool.shutdown()  # blocks until all checks finish

    def has_file(self):
        """True when a pickled proxy cache exists on disk."""
        return os.path.exists(self.PROXY_FILE_NAME)

    def load_proxy(self):
        """Load and return the pickled proxy list from disk."""
        with open(self.PROXY_FILE_NAME, 'rb') as f:
            proxies = pickle.load(f)
        return proxies

    @staticmethod
    def _scrape_proxy_table(url):
        """Return proxy dicts parsed from the first <table class="table"> on *url*.

        Extracted helper: the original repeated this loop verbatim for each source.
        """
        proxies = []
        resp = requests.get(url)
        soup = bs4.BeautifulSoup(resp.text, 'lxml')
        table = soup.select('table.table')
        trs = table[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td')
            if len(tds) < 2:
                continue
            ip, port = tds[0].text, tds[1].text
            proxies.append(
                {
                    'alive': True,
                    'http': '{}:{}'.format(ip, port),
                    'https': '{}:{}'.format(ip, port),
                }
            )
        return proxies

    def crawl_proxy(self):
        """Scrape every source, health-check the union, persist and return survivors."""
        proxies = []
        for source in self.PROXY_SOURCES:
            proxies.extend(self._scrape_proxy_table(source))
        self.check_proxy_all(proxies, self.check_url)
        alive_proxies = [proxy for proxy in proxies if proxy['alive']]
        Logger.log('proxies checking end: available : {}'.format(len(alive_proxies)))
        # Only persist when at least ~1MB is free (get_free_space is in KiB).
        if Util.get_free_space() >= 1024:
            with open(self.PROXY_FILE_NAME, 'wb') as f:
                pickle.dump(alive_proxies, f)
        return alive_proxies

    def get_proxy(self):
        """Return a live proxy dict, (re)loading or re-crawling lists as needed.

        Blocks (sleeping 10s between attempts) until at least one proxy survives
        a crawl. Returns None only if every cached proxy is dead.
        """
        if len(self.proxies) <= 0:
            if self.has_file():
                self.proxies = self.load_proxy()
                self.proxies = [proxy for proxy in self.proxies if proxy['alive']]
            if len(self.proxies) <= 0:
                while True:
                    self.proxies = self.crawl_proxy()
                    if len(self.proxies) > 0:
                        break
                    else:
                        Logger.log('there is no available proxy. sleep 10secs..')
                        time.sleep(10)
        for proxy in self.proxies:
            if proxy['alive']:
                return proxy
        return None

    def set_proxy_dead(self, proxy):
        """Mark *proxy* dead; persist the list if any survive, else drop the cache."""
        proxy['alive'] = False
        # Renamed loop variable: the original shadowed the `proxy` parameter.
        for candidate in self.proxies:
            if candidate['alive'] and Util.get_free_space() >= 1024:
                with open(self.PROXY_FILE_NAME, 'wb') as f:
                    pickle.dump(self.proxies, f)
                return
        os.remove(self.PROXY_FILE_NAME)
        self.proxies = []

52
Crawler/Setting.py Normal file → Executable file
View File

@@ -1,33 +1,42 @@
import yaml
import os
from .Logger import Logger
from .Util import Util
class Setting:
SETTING_FILE = 'conf/settings.yml'
DOWNLOADED_FILE = 'temp/downloaded.yml'
def __init__(self):
self.settings = None
self.downloaded = None
if not os.path.exists('temp'):
os.mkdir('temp')
self.load()
pass
def load_settings(self):
if not os.path.isfile('settings.yml'):
print('There is no settings.yml', flush=True)
if not os.path.isfile(self.SETTING_FILE):
Logger.log('There is no {}'.format(self.SETTING_FILE))
exit()
with open('settings.yml', encoding='utf-8') as setting_file:
with open(self.SETTING_FILE, encoding='utf-8') as setting_file:
try:
self.settings = yaml.load(setting_file)
except ValueError as e:
print(e, flush=True)
Logger.log(e)
exit()
if 'video' not in self.settings:
print('video key is need in settings.json', flush=True)
Logger.log('video key is need in settings.json')
exit()
for i, video in enumerate(self.settings['video']):
if 'title' not in video:
print('title key is need in video({})'.format(i), flush=True)
Logger.log('title key is need in video({})'.format(i))
exit()
if 'keyword' not in video:
@@ -42,25 +51,39 @@ class Setting:
if 'download_path' not in self.settings:
self.settings['download_path'] = '.'
if self.settings['download_path'][-1] != '\\':
self.settings['download_path'] += '\\'
if self.settings['download_path'][-1] != '/':
self.settings['download_path'] += '/'
if not os.path.exists(self.settings['download_path']):
try:
os.makedirs(self.settings['download_path'])
except Exception as e:
print(e, flush=True)
Logger.log(e)
exit()
if 'download_path' in self.settings and self.settings['file_download_path'][-1] != '/':
self.settings['file_download_path'] += '/'
if 'file_move_path' in self.settings:
if self.settings['file_move_path'][-1] != '/':
self.settings['file_move_path'] += '/'
if not os.path.exists(self.settings['file_move_path']):
try:
os.makedirs(self.settings['file_move_path'])
except Exception as e:
Logger.log(e)
exit()
video['keyword'] += self.settings['keyword_append']
def load_downloaded(self):
if os.path.isfile('downloaded.yml'):
with open("downloaded.yml", 'r', encoding='utf-8') as stream:
if os.path.isfile(self.DOWNLOADED_FILE):
with open(self.DOWNLOADED_FILE, 'r', encoding='utf-8') as stream:
try:
self.downloaded = yaml.load(stream)
except yaml.YAMLError as e:
print(e, flush=True)
Logger.log(e)
else:
self.downloaded = {}
@@ -108,6 +131,9 @@ class Setting:
yaml.dump(downloaded_ex, outfile, allow_unicode=True)
def save(self):
with open('downloaded.yml', 'w', encoding='utf-8') as outfile:
if Util.get_free_space() < 1024:
return
with open(self.DOWNLOADED_FILE, 'w', encoding='utf-8') as outfile:
yaml.dump(self.downloaded, outfile, allow_unicode=True)
pass

13
Crawler/Util.py Normal file
View File

@@ -0,0 +1,13 @@
import subprocess
import platform
class Util:
    """Platform helpers."""

    @staticmethod
    def get_free_space():
        """Return the free space of the root filesystem in KiB (df's 1K blocks).

        On non-Linux platforms a large constant (1 TiB worth of KiB) is
        returned so disk-space checks effectively always pass.
        """
        if platform.system() == 'Linux':
            df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE)
            output = df.communicate()[0]
            # BUGFIX: decode the bytes instead of splitting str(bytes) on a
            # literal "\\n" (a repr hack). Second df line holds the numbers;
            # field 4 is "Available".
            fields = output.decode().split("\n")[1].split()
            available = fields[3]
        else:
            available = 1024 * 1024 * 1024 * 1024
        return int(available)

78
Crawler/WorkerTfreeca.py Normal file
View File

@@ -0,0 +1,78 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTfreeca:
    """Site-specific crawling logic for tfreeca-style boards."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan one board listing page; return PageLinks for wanted episodes.

        Skips rows without a subject cell, titles that match no configured
        keyword, episodes at/below 'ignore_ep_under', and already-downloaded
        episodes.
        """
        links = []
        tables = soup.select('table.b_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                # Header/notice rows have no subject cell.
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[1].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>' where N indexes settings['video'].
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Collect .torrent and .smi download links from a content page."""
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        for link in torrent_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        for link in smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            # BUGFIX: was urljoin(link.url, sub_url); a bs4 tag has no 'url'
            # attribute, so resolve against the content page URL instead.
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

74
Crawler/WorkerTocops.py Normal file
View File

@@ -0,0 +1,74 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTocops:
    """Site-specific crawling logic for tocops/tcorea boards."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan one board listing page; return PageLinks for wanted episodes."""
        found = []
        rows = soup.select('table.board_list')[0].select('tr')
        for row in rows[1:]:
            cells = row.select('td.subject')
            if len(cells) < 1:
                continue
            subject = cells[0].text.strip()
            href = cells[0].select('a')[0].attrs['href']
            m_title = re_title.search(subject)
            if not m_title:
                continue
            m_ep = re_episode.search(subject)
            if not m_ep:
                continue
            # Group names are 'key<N>'; N indexes the configured video list.
            video = setting.settings['video'][int(m_title.lastgroup[3:])]
            ep = int(m_ep.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            if ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            page_url = urllib.parse.urljoin(top_url, href)
            found.append(PageLink(video['title'], ep, page_url))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, page_url))
        return found

    def crawl_downlink(self, page_link, soup):
        """Extract download links from javascript:file_download(...) anchors.

        Returns an empty list for reported ('신고된 게시물') posts.
        """
        results = []
        if '신고된 게시물' in soup.find(id='writeContents').text:
            return results
        for anchor in soup.find_all('a'):
            if 'href' not in anchor.attrs:
                continue
            jscript = anchor['href']
            if 'javascript:file_download' not in jscript:
                continue
            # href looks like: javascript:file_download('<url>','<name>');
            start = len("javascript:file_download('")
            end = jscript.index("','")
            url = urllib.parse.urljoin(page_link.url, jscript[start:end])
            file_name = urllib.parse.unquote(jscript[end + 3:-3])
            results.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return results

View File

@@ -0,0 +1,79 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTorrentKim:
    """Site-specific crawling logic for torrentkim boards."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan one board listing page; return PageLinks for wanted episodes."""
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                # BUGFIX: guard missing here but present in the sibling
                # workers; header rows without a subject cell raised IndexError.
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>' where N indexes settings['video'].
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Collect .torrent and .smi download links from a content page.

        BUGFIX: was declared as crawl_downlink(self, page_link) and called
        self.request_get(), which this class never defines; the Crawler invokes
        worker.crawl_downlink(page_link, soup), so accept the pre-fetched soup
        like WorkerTfreeca/WorkerTocops do.
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        for link in torrent_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        for link in smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            # BUGFIX: was urljoin(link.url, sub_url); a bs4 tag has no 'url'
            # attribute, so resolve against the content page URL instead.
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

9
Main.py Normal file → Executable file
View File

@@ -1,5 +1,10 @@
from Crawler.Crawler import Crawler
from Crawler.Crawler import Sorter
if __name__ == '__main__':
file_sorter = Sorter()
file_sorter.move_files()
crawler = Crawler()
crawler.crawl()
crawler = Crawler()
crawler.crawl()

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
requests
beautifulsoup4
lxml
pyyaml