Compare commits

...

8 Commits

Author SHA1 Message Date
b19499b6a4 request with session 2018-06-25 20:23:40 +09:00
8c9ddb9ce8 refactoring and add tocops 2018-06-25 00:47:32 +09:00
9d7afbdc1b 디스크가 4GB 이상 남았을 때만 Crawling 2018-05-24 04:13:49 +09:00
917894fcac file sorter 추가 2018-05-24 03:50:40 +09:00
f32ff66d2f 디스크 용량 확인 및 기타 수정 2018-05-04 21:39:32 +09:00
125c201638 압축 형식 삭제 2017-12-16 21:19:10 +09:00
34c0cc5a29 i don't know 2017-12-16 20:55:25 +09:00
493c44999a 리눅스 환경에서 디버깅 2017-08-06 05:11:53 +09:00
10 changed files with 406 additions and 108 deletions

145
Crawler/Crawler.py Normal file → Executable file
View File

@@ -2,59 +2,38 @@ import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class PageLink:
    """A link to the content (detail) page of one episode of a tracked title.

    Fix: the original defined ``__init__`` twice; Python has no method
    overloading, so the zero-argument version was dead code silently
    replaced by the second definition. Only the real constructor is kept.
    """

    def __init__(self, title, episode, url):
        self.title = title      # series title as configured in settings
        self.episode = episode  # episode number parsed from the board row
        self.url = url          # absolute URL of the content page

    def __str__(self):
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        return str(self)
class TorrentFile:
    """A downloadable file (.torrent or .smi) found on a content page.

    Fix: the original defined ``__init__`` twice; the zero-argument
    version was dead code (Python keeps only the last definition), so it
    is removed.
    """

    def __init__(self, title, episode, file_name, url):
        self.title = title          # series title as configured in settings
        self.episode = episode      # episode number
        self.file_name = file_name  # file name shown on the board
        self.url = url              # absolute download URL

    def __str__(self):
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        return str(self)
from .WorkerTorrentKim import WorkerTorrentKim
from .WorkerTfreeca import WorkerTfreeca
from .WorkerTocops import WorkerTocops
class Crawler:
def __init__(self):
    """Load settings, open a shared HTTP session and pick a site worker."""
    self.setting = Setting()
    # A single Session so cookies persist across all requests of one run.
    self.session = requests.session()
    self.proxy_handler = ProxyHandler()
    self.worker = None
    if len(self.setting.settings['urls']) > 0:
        urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
        # scheme://netloc only; also used as the proxy liveness-check target.
        top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
        self.proxy_handler.check_url = top_url
        # Dispatch to a site-specific worker based on the configured host.
        if 'torrentkim' in top_url:
            self.worker = WorkerTorrentKim()
        elif 'tfreeca' in top_url:
            self.worker = WorkerTfreeca()
        elif 'tcorea' in top_url:
            # NOTE(review): host substring 'tcorea' maps to the Tocops
            # worker — confirm this is intentional and not a typo.
            self.worker = WorkerTocops()
@staticmethod
def print_log(files):
f = open('output/log.txt', 'at')
@@ -66,16 +45,15 @@ class Crawler:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
'Accept-Charset': 'utf-8,euc-kr;q=0.7,*;q=0.3',
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
'Connection': 'keep-alive',
}
proxy = self.proxy_handler.get_proxy()
while True:
try:
resp = requests.get(url, proxies=proxy, headers=headers, timeout=3)
resp = self.session.get(url, proxies=proxy, headers=headers, timeout=3)
except Exception as e:
self.proxy_handler.set_proxy_dead(proxy)
proxy = self.proxy_handler.get_proxy()
@@ -96,41 +74,10 @@ class Crawler:
resp = self.request_get(url)
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
re_title = re.compile('|'.join(['(?P<key' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
links = []
tables = soup.select('table.board_list')
trs = tables[0].select('tr')
for tr in trs[1:]:
tds = tr.select('td.subject')
title = tds[0].text.strip()
link = tds[0].select('a')[0].attrs['href']
title_match = re_title.search(title)
if not title_match:
continue
ep_match = re_episode.search(title)
if not ep_match:
continue
title_idx = int(title_match.lastgroup[3:])
video = self.setting.settings['video'][title_idx]
ep = int(ep_match.group(1))
if ep <= video['ignore_ep_under']:
Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
continue
elif ep in self.setting.downloaded[video['title']]:
Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
continue
link = urllib.parse.urljoin(url, link)
links.append(PageLink(video['title'], ep, link))
Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
links = self.worker.crawl_list(url, soup, re_title, re_episode, self.setting)
return links
def crawl_downlink(self, page_link):
@@ -139,28 +86,7 @@ class Crawler:
resp = self.request_get(page_link.url)
soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
links = []
file_table = soup.select('table#file_table')
a_tags = file_table[0].select('a')
torrent_links = [a for a in a_tags if '.torrent' in a.text]
smi_links = [a for a in a_tags if '.smi' in a.text]
for link in torrent_links:
file_name = link.text.strip()
sub_url = link.attrs['href']
url = urllib.parse.urljoin(page_link.url, sub_url)
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
Logger.log(' found download link : {}({})'.format(file_name, url))
for link in smi_links:
file_name = link.text.strip()
sub_url = link.attrs['href']
url = urllib.parse.urljoin(link.url, sub_url)
links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
Logger.log(' found download link : {}({})'.format(file_name, url))
links = self.worker.crawl_downlink(page_link, soup)
return links
def download_files(self, file_link):
@@ -171,15 +97,14 @@ class Crawler:
try:
resp = self.request_get(file_link.url)
file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition'])
file_name = urllib.parse.unquote(file_name[0])
file_name = file_link.file_name
with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
f.write(resp.content)
self.setting.downloaded[file_link.title].append(file_link.episode)
self.setting.save()
Logger.log("downloaded {}".format(file_link.file_name))
Logger.log("downloaded {}".format(file_name))
except Exception as e:
Logger.log(e)
@@ -201,6 +126,28 @@ class Crawler:
self.download_files(file)
def crawl(self):
    """Run one crawl pass; aborts when free disk space is below 4 GB."""
    # Util.get_free_space() reports 1 KiB blocks (df output), so
    # 4*1024*1024 blocks == 4 GB.
    if Util.get_free_space() < 4*1024*1024:
        Logger.log('Disk space is less than 4GB. Aborted')
        return
    # self.test()
    Logger.log('Crawling start')
    self.crawl_torrent()
    Logger.log('Crawling finished')
class Sorter:
    """Moves finished downloads into per-title directories."""

    @staticmethod
    def move_files():
        """Move each file whose name contains a tracked title into
        ``<file_move_path>/<title>/``.

        Fixes over the original:
        - ``'x' not in d`` idiom instead of ``not 'x' in d``;
        - destination built with ``os.path.join`` instead of manual
          ``'/'`` concatenation;
        - the per-title directory is created first, so ``os.rename``
          cannot fail with ENOENT on a fresh title;
        - ``break`` after a successful move — the original kept matching
          the already-moved file against further titles, and a second
          match would raise FileNotFoundError.
        """
        setting = Setting()
        if 'file_download_path' not in setting.settings or 'file_move_path' not in setting.settings:
            return
        file_list = os.listdir(setting.settings['file_download_path'])
        for filename in file_list:
            for video in setting.settings['video']:
                if video['title'] in filename:
                    old_path = os.path.join(setting.settings['file_download_path'], filename)
                    dest_dir = os.path.join(setting.settings['file_move_path'], video['title'])
                    os.makedirs(dest_dir, exist_ok=True)
                    # NOTE(review): os.rename fails across filesystems —
                    # confirm download and move paths share a device, or
                    # switch to shutil.move.
                    os.rename(old_path, os.path.join(dest_dir, filename))
                    break

36
Crawler/DataType.py Normal file
View File

@@ -0,0 +1,36 @@
class PageLink:
    """A link to the content (detail) page of one episode of a tracked title.

    Fix: the original defined ``__init__`` twice; Python has no method
    overloading, so the zero-argument version was dead code silently
    replaced by the second definition. Only the real constructor is kept.
    """

    def __init__(self, title, episode, url):
        self.title = title      # series title as configured in settings
        self.episode = episode  # episode number parsed from the board row
        self.url = url          # absolute URL of the content page

    def __str__(self):
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        return str(self)
class TorrentFile:
    """A downloadable file (.torrent or .smi) found on a content page.

    Fix: the original defined ``__init__`` twice; the zero-argument
    version was dead code (Python keeps only the last definition), so it
    is removed.
    """

    def __init__(self, title, episode, file_name, url):
        self.title = title          # series title as configured in settings
        self.episode = episode      # episode number
        self.file_name = file_name  # file name shown on the board
        self.url = url              # absolute download URL

    def __str__(self):
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        return str(self)

60
Crawler/ProxyHandler.py Normal file → Executable file
View File

@@ -3,9 +3,10 @@ import pickle
import requests
import bs4
import concurrent.futures
import time
from .Logger import Logger
from .Util import Util
class ProxyHandler:
PROXY_FILE_NAME = 'temp/proxy.bin'
@@ -29,7 +30,7 @@ class ProxyHandler:
def check_proxy_all(self, proxies, check_url):
Logger.log('checking proxies for {}'.format(check_url))
worker_cnt = 64
worker_cnt = 16
pool = concurrent.futures.ThreadPoolExecutor(worker_cnt)
[pool.submit(self.check_proxy, proxy, check_url) for proxy in proxies]
pool.shutdown()
@@ -64,12 +65,51 @@ class ProxyHandler:
}
)
resp = requests.get('https://www.socks-proxy.net')
soup = bs4.BeautifulSoup(resp.text, 'lxml')
table = soup.select('table.table')
trs = table[0].select('tr')
for tr in trs[1:]:
tds = tr.select('td')
if len(tds) < 2:
continue
ip, port = tds[0].text, tds[1].text
proxies.append(
{
'alive': True,
'http': '{}:{}'.format(ip, port),
'https': '{}:{}'.format(ip, port),
}
)
resp = requests.get('https://www.sslproxies.org')
soup = bs4.BeautifulSoup(resp.text, 'lxml')
table = soup.select('table.table')
trs = table[0].select('tr')
for tr in trs[1:]:
tds = tr.select('td')
if len(tds) < 2:
continue
ip, port = tds[0].text, tds[1].text
proxies.append(
{
'alive': True,
'http': '{}:{}'.format(ip, port),
'https': '{}:{}'.format(ip, port),
}
)
# print(proxies)
self.check_proxy_all(proxies, self.check_url)
# print(proxies)
alive_proxies = [proxy for proxy in proxies if proxy['alive']]
Logger.log('proxies checking end: available : {}'.format(len(alive_proxies)))
with open(self.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(alive_proxies, f)
if Util.get_free_space() >= 1024:
with open(self.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(alive_proxies, f)
return alive_proxies
@@ -77,8 +117,16 @@ class ProxyHandler:
if len(self.proxies) <= 0:
if self.has_file():
self.proxies = self.load_proxy()
else:
self.proxies = [proxy for proxy in self.proxies if proxy['alive']]
if len(self.proxies) <= 0:
while True:
self.proxies = self.crawl_proxy()
if len(self.proxies) > 0:
break
else:
Logger.log('there is no available proxy. sleep 10secs..')
time.sleep(10)
for proxy in self.proxies:
if proxy['alive']:
@@ -89,7 +137,7 @@ class ProxyHandler:
def set_proxy_dead(self, proxy):
proxy['alive'] = False
for proxy in self.proxies:
if proxy['alive']:
if proxy['alive'] and Util.get_free_space() >= 1024:
with open(self.PROXY_FILE_NAME, 'wb') as f:
pickle.dump(self.proxies, f)
return

23
Crawler/Setting.py Normal file → Executable file
View File

@@ -2,7 +2,7 @@ import yaml
import os
from .Logger import Logger
from .Util import Util
class Setting:
SETTING_FILE = 'conf/settings.yml'
@@ -51,8 +51,8 @@ class Setting:
if 'download_path' not in self.settings:
self.settings['download_path'] = '.'
if self.settings['download_path'][-1] != '\\':
self.settings['download_path'] += '\\'
if self.settings['download_path'][-1] != '/':
self.settings['download_path'] += '/'
if not os.path.exists(self.settings['download_path']):
try:
@@ -61,6 +61,20 @@ class Setting:
Logger.log(e)
exit()
if 'download_path' in self.settings and self.settings['file_download_path'][-1] != '/':
self.settings['file_download_path'] += '/'
if 'file_move_path' in self.settings:
if self.settings['file_move_path'][-1] != '/':
self.settings['file_move_path'] += '/'
if not os.path.exists(self.settings['file_move_path']):
try:
os.makedirs(self.settings['file_move_path'])
except Exception as e:
Logger.log(e)
exit()
video['keyword'] += self.settings['keyword_append']
def load_downloaded(self):
@@ -117,6 +131,9 @@ class Setting:
yaml.dump(downloaded_ex, outfile, allow_unicode=True)
def save(self):
if Util.get_free_space() < 1024:
return
with open(self.DOWNLOADED_FILE, 'w', encoding='utf-8') as outfile:
yaml.dump(self.downloaded, outfile, allow_unicode=True)
pass

13
Crawler/Util.py Normal file
View File

@@ -0,0 +1,13 @@
import subprocess
import platform
class Util:
    """Small OS helpers shared by the crawler."""

    @staticmethod
    def get_free_space():
        """Return free space of the root filesystem in 1 KiB blocks.

        On Linux this parses ``df /``; elsewhere a very large value is
        returned so the disk-space guards never trigger.

        Fix: the original did ``str(output).split("\\n")`` — parsing the
        bytes *repr* and splitting on a literal backslash-n, which only
        worked by accident. The output is now decoded properly.
        """
        if platform.system() == 'Linux':
            df = subprocess.Popen(["df", "/"], stdout=subprocess.PIPE)
            output = df.communicate()[0]
            # Second line of `df /`: device size used available use% mount.
            available = output.decode().splitlines()[1].split()[3]
        else:
            available = 1024 * 1024 * 1024 * 1024
        return int(available)

78
Crawler/WorkerTfreeca.py Normal file
View File

@@ -0,0 +1,78 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTfreeca:
    """Site-specific scraping for tfreeca-style boards (``table.b_list``)."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Return PageLinks for board rows matching a tracked title.

        top_url: base URL used to absolutize relative row links.
        soup: parsed board-list page.
        re_title: alternation regex with groups named ``key<N>`` where N
            indexes ``setting.settings['video']``.
        re_episode: regex whose group 1 is the episode number.
        """
        links = []
        tables = soup.select('table.b_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[1].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to get the index.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Return TorrentFiles for .torrent and .smi links on a content page.

        Fix: the .smi loop resolved URLs against ``link.url`` — but
        ``link`` is a bs4 Tag, which has no ``url`` attribute, so every
        subtitle link raised AttributeError. Both loops now resolve
        against ``page_link.url``.
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        for link in torrent_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        for link in smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

74
Crawler/WorkerTocops.py Normal file
View File

@@ -0,0 +1,74 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTocops:
    """Site-specific scraping for tocops-style boards (``table.board_list``)."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Return PageLinks for board rows matching a tracked title.

        top_url: base URL used to absolutize relative row links.
        soup: parsed board-list page.
        re_title: alternation regex with groups named ``key<N>`` where N
            indexes ``setting.settings['video']``.
        re_episode: regex whose group 1 is the episode number.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        # Skip the header row.
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to get the index.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Return TorrentFiles extracted from ``file_download`` hrefs.

        Returns an empty list for reported/blinded posts (marker text in
        the ``writeContents`` element).
        """
        links = []
        if '신고된 게시물' in soup.find(id='writeContents').text:
            return links
        for a in soup.find_all('a'):
            if 'href' in a.attrs and 'javascript:file_download' in a['href']:
                jscript = a['href']
                # Assumed form: javascript:file_download('<url>','<name>'...)
                # — the exact trailing characters are site-specific;
                # [end+3:-3] presumably strips "','" and a "')"-style
                # tail. TODO(review): confirm against live markup.
                start = len("javascript:file_download('")
                end = jscript.index("','")
                sub_url = jscript[start:end]
                url = urllib.parse.urljoin(page_link.url, sub_url)
                file_name = jscript[end+3:-3]
                # File names are percent-encoded on this site.
                file_name = urllib.parse.unquote(file_name)
                torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
                links.append(torrent)
                Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

View File

@@ -0,0 +1,79 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTorrentKim:
    """Site-specific scraping for torrentkim-style boards (``table.board_list``)."""

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Return PageLinks for board rows matching a tracked title.

        top_url: base URL used to absolutize relative row links.
        soup: parsed board-list page.
        re_title: alternation regex with groups named ``key<N>`` where N
            indexes ``setting.settings['video']``.
        re_episode: regex whose group 1 is the episode number.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            # Robustness fix: header/notice rows have no td.subject; the
            # sibling workers already guard against this, the original
            # here raised IndexError.
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is 'key<N>'; strip the 'key' prefix to get the index.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Return TorrentFiles for .torrent and .smi links on a content page.

        Fixes: the original signature was ``(self, page_link)`` although
        Crawler calls ``worker.crawl_downlink(page_link, soup)`` (and the
        sibling workers take the pre-parsed soup); it also called
        ``self.request_get``, which this class never defined — a
        guaranteed AttributeError. The .smi loop additionally resolved
        URLs against ``link.url`` (bs4 Tags have no ``url`` attribute);
        both loops now resolve against ``page_link.url``.
        """
        links = []
        file_table = soup.select('table#file_table')
        a_tags = file_table[0].select('a')
        torrent_links = [a for a in a_tags if '.torrent' in a.text]
        smi_links = [a for a in a_tags if '.smi' in a.text]
        for link in torrent_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        for link in smi_links:
            file_name = link.text.strip()
            sub_url = link.attrs['href']
            url = urllib.parse.urljoin(page_link.url, sub_url)
            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return links

5
Main.py Normal file → Executable file
View File

@@ -1,5 +1,10 @@
from Crawler.Crawler import Crawler, Sorter

if __name__ == '__main__':
    # Sort previously downloaded files into per-title folders, then crawl.
    # move_files is a staticmethod, so no Sorter instance is needed.
    Sorter.move_files()
    crawler = Crawler()
    crawler.crawl()

View File

@@ -1,2 +1,3 @@
requests
beautifulsoup4
lxml
PyYAML