Refactor crawler and add Tocops worker

This commit is contained in:
2018-06-25 00:47:32 +09:00
parent 9d7afbdc1b
commit 8c9ddb9ce8
6 changed files with 314 additions and 97 deletions

97
Crawler/WorkerTocops.py Normal file
View File

@@ -0,0 +1,97 @@
import requests
import urllib
import bs4
import re
import os
from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile
class WorkerTocops:
    """Crawler worker for the 'tocops' board site.

    crawl_list extracts content-page links from a board list page;
    crawl_downlink extracts torrent download links from a content page.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect new episode pages.

        top_url    -- base URL used to resolve relative hrefs
        soup       -- bs4.BeautifulSoup of the board list page
        re_title   -- compiled regex with named groups; the name of the
                      matching group encodes an index into
                      setting.settings['video']
        re_episode -- compiled regex; group(1) is the episode number
        setting    -- Setting with .settings['video'] (list of video dicts)
                      and .downloaded (title -> downloaded episode numbers)

        Returns a list of PageLink for episodes that are neither already
        downloaded nor at/below the video's 'ignore_ep_under' threshold.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        for tr in trs[1:]:  # trs[0] is the header row
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # lastgroup is the NAME of the matched named group; everything
            # after its first 3 characters is assumed to be the numeric video
            # index (e.g. 'idx7' -> 7) -- TODO confirm against the code that
            # builds re_title.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Extract torrent download links from a content page.

        page_link -- PageLink (provides .url, .title, .episode)
        soup      -- bs4.BeautifulSoup of the content page

        Returns a list of TorrentFile; empty when the post was reported
        ('신고된 게시물') or the content container is missing.
        """
        links = []
        contents = soup.find(id='writeContents')
        # Guard against a missing container: the original dereferenced the
        # find() result unconditionally and raised AttributeError on pages
        # without #writeContents.
        if contents is None or '신고된 게시물' in contents.text:
            return links
        for a in soup.find_all('a'):
            if 'href' in a.attrs and 'javascript:file_download' in a['href']:
                # href looks like: javascript:file_download('<url>','<name>');
                jscript = a['href']
                start = len("javascript:file_download('")
                end = jscript.index("','")
                sub_url = jscript[start:end]
                url = urllib.parse.urljoin(page_link.url, sub_url)
                file_name = jscript[end+3:-3]  # drop the trailing ');
                file_name = urllib.parse.unquote(file_name)
                torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
                links.append(torrent)
                Logger.log(' found download link : {}({})'.format(file_name, url))
        return links