import requests
import urllib.parse  # explicit submodule import; bare `import urllib` does not guarantee urllib.parse is loaded
import bs4
import re
import os

from .Setting import Setting
from .ProxyHandler import ProxyHandler
from .Logger import Logger
from .Util import Util
from .DataType import PageLink, TorrentFile


class WorkerTfreeca:
    """Crawler for the tfreeca board.

    Scans the board list page for episodes of configured videos, then
    extracts torrent (and .smi subtitle) download links from each
    content page.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan the board list page and collect content-page links.

        Args:
            top_url: Base URL used to resolve relative content-page links.
            soup: BeautifulSoup of the board list page (table.b_list).
            re_title: Compiled regex; the *name* of the last matched group
                encodes the video index (chars [3:] of the group name are
                parsed as an int index into settings['video']).
            re_episode: Compiled regex whose group(1) is the episode number.
            setting: Setting instance providing .settings['video'] and
                .downloaded[title] (episodes already fetched).

        Returns:
            list[PageLink]: One entry per episode that is neither below
            the video's 'ignore_ep_under' threshold nor already downloaded.
        """
        links = []
        rows = soup.select('table.b_list')[0].select('tr')
        for row in rows[1:]:  # rows[0] is the table header
            subject_cells = row.select('td.subject')
            if not subject_cells:
                continue
            title = subject_cells[0].text.strip()
            # Second <a> in the subject cell is the content-page link.
            href = subject_cells[0].select('a')[1].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue
            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # Group name is expected to look like '<3-char prefix><index>';
            # the numeric suffix selects the configured video entry.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            if ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            url = urllib.parse.urljoin(top_url, href)
            links.append(PageLink(video['title'], ep, url))
            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, url))
        return links

    def _collect_files(self, page_link, anchors):
        """Build TorrentFile entries from anchor tags on a content page.

        URLs are resolved against page_link.url (the content page), which
        also fixes the original bug where the .smi loop resolved against
        `link.url` — bs4 Tags have no such attribute.
        """
        files = []
        for a in anchors:
            file_name = a.text.strip()
            url = urllib.parse.urljoin(page_link.url, a.attrs['href'])
            files.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
            Logger.log(' found download link : {}({})'.format(file_name, url))
        return files

    def crawl_downlink(self, page_link, soup):
        """Extract download links (torrents first, then .smi subtitles).

        Args:
            page_link: PageLink of the content page being parsed.
            soup: BeautifulSoup of the content page (table#file_table).

        Returns:
            list[TorrentFile]: One entry per .torrent / .smi anchor found.
        """
        a_tags = soup.select('table#file_table')[0].select('a')
        torrent_anchors = [a for a in a_tags if '.torrent' in a.text]
        smi_anchors = [a for a in a_tags if '.smi' in a.text]
        # Preserve original ordering: all torrents, then all subtitles.
        return (self._collect_files(page_link, torrent_anchors)
                + self._collect_files(page_link, smi_anchors))