import requests
import urllib.parse  # explicit submodule import: bare `import urllib` does not bind `urllib.parse` in Python 3
import bs4
import re
import os

from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile


class WorkerTocops:
    """Site-specific crawler worker for a 'board_list'-style torrent board.

    Two passes: `crawl_list` scans a board index page for episode posts that
    match the configured title/episode patterns, and `crawl_downlink` extracts
    the actual file-download URLs from an individual post page.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board index page and collect content-page links.

        Args:
            top_url: Base URL of the board page; relative post links are
                joined against it.
            soup: bs4-parsed board index page containing a
                ``table.board_list`` element.
            re_title: Compiled regex with NAMED alternatives, one per
                configured video; which alternative matched selects the
                video config (see ``lastgroup`` note below).
            re_episode: Compiled regex whose group(1) is the episode number.
            setting: Project ``Setting`` object exposing
                ``settings['video']`` (list of per-video config dicts) and
                ``downloaded`` (mapping title -> collection of episode
                numbers already fetched) — assumed shape, TODO confirm.

        Returns:
            list[PageLink]: one entry per new, not-yet-downloaded episode.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        # First <tr> is the table header — skip it.
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                # Row without a subject cell (notice/ad row) — not a post.
                continue
            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # `lastgroup` is the NAME of the alternative that matched.
            # Stripping the first 3 characters yields the index into the
            # video config list — presumably names look like 'xxxN';
            # TODO(review): confirm the group-naming convention used when
            # re_title is built.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                # Episode at or below the configured cutoff — skip.
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                # Already fetched in a previous run — skip.
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
        return links

    def crawl_downlink(self, page_link, soup):
        """Extract torrent-file download links from a single post page.

        Args:
            page_link: ``PageLink`` identifying the post (title, episode,
                URL); its URL is the base for resolving relative download
                links.
            soup: bs4-parsed post page; the post body lives in the element
                with id ``writeContents``.

        Returns:
            list[TorrentFile]: one entry per ``file_download`` anchor found.
            Empty if the post was reported/taken down.
        """
        links = []
        # Reported/removed post — the body contains this Korean marker text.
        if '신고된 게시물' in soup.find(id='writeContents').text:
            return links

        for a in soup.find_all('a'):
            if 'href' in a.attrs and 'javascript:file_download' in a['href']:
                # href looks like:
                #   javascript:file_download('<sub_url>','<file_name>');
                # Slice out the two quoted arguments by position.
                jscript = a['href']
                start = len("javascript:file_download('")
                end = jscript.index("','")
                sub_url = jscript[start:end]
                url = urllib.parse.urljoin(page_link.url, sub_url)
                # end+3 skips the ',' separator; -3 drops the trailing ');
                file_name = jscript[end+3:-3]
                file_name = urllib.parse.unquote(file_name)
                torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
                links.append(torrent)
                Logger.log(' found download link : {}({})'.format(file_name, url))
        return links