Files
TorrentCrawler/Crawler/WorkerTocops.py
2018-06-25 20:23:40 +09:00

75 lines
2.0 KiB
Python

import os
import re
import urllib
import urllib.parse

import bs4
import requests

from .DataType import PageLink, TorrentFile
from .Logger import Logger
from .ProxyHandler import ProxyHandler
from .Setting import Setting
from .Util import Util
class WorkerTocops:
    """Site-specific crawler for the 'tocops' torrent board.

    ``crawl_list`` turns a board list page into ``PageLink`` entries for
    episodes that are wanted and not yet downloaded; ``crawl_downlink``
    turns a content page into ``TorrentFile`` download links.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan one board list page and collect wanted content-page links.

        Args:
            top_url: Absolute URL of the list page, used to resolve
                relative links.
            soup: ``bs4.BeautifulSoup`` of the list page.
            re_title: Compiled regex with named groups; the name of the
                matching group selects the video entry in
                ``setting.settings['video']``.
            re_episode: Compiled regex whose group(1) is the episode number.
            setting: ``Setting`` instance carrying ``settings`` and the
                ``downloaded`` bookkeeping dict.

        Returns:
            list[PageLink]: One entry per new, wanted episode found.
        """
        links = []
        tables = soup.select('table.board_list')
        if not tables:
            # Page layout changed or an error page was served; nothing to do.
            return links
        # First row of the board table is the header; skip it.
        for tr in tables[0].select('tr')[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']
            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue
            # The matched group's NAME encodes the index into the video
            # settings list after a 3-char prefix (e.g. 'xxxN' -> N).
            # NOTE(review): depends on how re_title is built — confirm there.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                Logger.log('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            # .get() guards titles that have never been downloaded yet.
            elif ep in setting.downloaded.get(video['title'], ()):
                Logger.log('    {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log('    found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Extract torrent download links from one content page.

        Args:
            page_link: ``PageLink`` for the page (supplies title, episode
                and the base URL for resolving relative download links).
            soup: ``bs4.BeautifulSoup`` of the content page.

        Returns:
            list[TorrentFile]: One entry per ``file_download`` anchor found;
            empty if the post was reported/removed or has no such anchors.
        """
        links = []
        contents = soup.find(id='writeContents')
        # Reported posts ('신고된 게시물') are replaced by a notice; treat a
        # missing container the same way instead of crashing on None.
        if contents is None or '신고된 게시물' in contents.text:
            return links
        prefix = "javascript:file_download('"
        for a in soup.find_all('a'):
            if 'href' not in a.attrs or 'javascript:file_download' not in a['href']:
                continue
            jscript = a['href']
            # href shape: javascript:file_download('<url>','<name>');
            start = len(prefix)
            end = jscript.index("','")
            sub_url = jscript[start:end]
            url = urllib.parse.urljoin(page_link.url, sub_url)
            # Skip the "','" separator; drop the trailing "');".
            file_name = urllib.parse.unquote(jscript[end + 3:-3])
            torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
            links.append(torrent)
            Logger.log('    found download link : {}({})'.format(file_name, url))
        return links