import requests
import urllib.parse  # explicit: urljoin lives in the urllib.parse submodule
import bs4
import re
import os

from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile


class WorkerTorrentKim:
    """Crawler worker for the TorrentKim board layout.

    Scrapes the site's list page for episodes matching the configured
    title/episode regexes, then follows each content page to collect
    .torrent and .smi download links.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a parsed board-list page and return matching content-page links.

        Args:
            top_url: Base URL of the list page; relative hrefs are joined
                against it.
            soup: Parsed list page (``bs4.BeautifulSoup``) containing a
                ``table.board_list``.
            re_title: Compiled regex whose *named* groups identify titles;
                the matched group's name is expected to end in the index of
                the corresponding ``settings['video']`` entry (the first
                3 characters of the group name are a fixed prefix —
                TODO confirm against the caller that builds this regex.)
            re_episode: Compiled regex whose group 1 captures the episode
                number.
            setting: Project ``Setting`` object providing
                ``settings['video']`` entries and ``downloaded`` history.

        Returns:
            list[PageLink]: One entry per new, not-yet-downloaded episode.
        """
        found = []
        board = soup.select('table.board_list')
        rows = board[0].select('tr')

        # rows[0] is the table header; skip it.
        for row in rows[1:]:
            subject_cells = row.select('td.subject')
            if not subject_cells:  # defensive: ad/notice rows without a subject cell
                continue
            title = subject_cells[0].text.strip()
            href = subject_cells[0].select('a')[0].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # The matched group's name encodes the video index after a
            # 3-char prefix (e.g. 'vid7' -> settings['video'][7]).
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log('  {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log('  {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            page_url = urllib.parse.urljoin(top_url, href)
            found.append(PageLink(video['title'], ep, page_url))
            Logger.log('  found content page : {}({}), {}'.format(video['title'], ep, page_url))
        return found

    def crawl_downlink(self, page_link):
        """Fetch a content page and extract its .torrent / .smi file links.

        Args:
            page_link: ``PageLink`` with ``title``, ``episode`` and ``url``
                of the content page to scrape.

        Returns:
            list[TorrentFile]: Torrent links first, then subtitle (.smi)
            links, each with an absolute download URL.
        """
        Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))
        resp = self.request_get(page_link.url)
        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')

        file_table = soup.select('table#file_table')
        anchors = file_table[0].select('a')
        torrent_anchors = [a for a in anchors if '.torrent' in a.text]
        smi_anchors = [a for a in anchors if '.smi' in a.text]

        # Torrents first, then subtitles — same order the caller expects.
        # BUGFIX: the original smi loop joined against `link.url`, but a
        # bs4 Tag has no `.url` attribute; both kinds resolve against the
        # content page's URL.
        collected = []
        for anchor in torrent_anchors + smi_anchors:
            file_name = anchor.text.strip()
            sub_url = anchor.attrs['href']
            abs_url = urllib.parse.urljoin(page_link.url, sub_url)
            collected.append(TorrentFile(page_link.title, page_link.episode, file_name, abs_url))
            Logger.log('  found download link : {}({})'.format(file_name, abs_url))
        return collected