refactoring and add tocops
This commit is contained in:
97
Crawler/WorkerTocops.py
Normal file
97
Crawler/WorkerTocops.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import requests
|
||||
import urllib
|
||||
import bs4
|
||||
import re
|
||||
import os
|
||||
|
||||
from .Setting import Setting
|
||||
from .ProxyHandler import ProxyHandler
|
||||
from .Logger import Logger
|
||||
from .Util import Util
|
||||
from .DataType import PageLink, TorrentFile
|
||||
|
||||
class WorkerTocops:
    """Site-specific crawler worker for the 'tocops' board.

    Two-phase crawl: `crawl_list` scans a board-list page for new episode
    posts, `crawl_downlink` extracts torrent download URLs from one post.
    """

    def crawl_list(self, top_url, soup, re_title, re_episode, setting):
        """Scan a board-list page and collect links to new episode posts.

        Args:
            top_url: base URL used to resolve relative post links.
            soup: parsed (BeautifulSoup) board-list page containing a
                'table.board_list' element.
            re_title: compiled regex matched against each post title; the
                NAME of its last matched group encodes which configured
                video the title belongs to.
            re_episode: compiled regex whose group(1) is the episode number.
            setting: project Setting object exposing `settings['video']`
                (list of per-title config dicts) and `downloaded`
                (title -> set of already-fetched episode numbers).

        Returns:
            list of PageLink(title, episode, absolute_url) for episodes that
            are neither below the configured `ignore_ep_under` threshold nor
            already downloaded.
        """
        links = []
        tables = soup.select('table.board_list')
        trs = tables[0].select('tr')
        # trs[0] is the table header row — skip it.
        for tr in trs[1:]:
            tds = tr.select('td.subject')
            if len(tds) < 1:
                continue

            title = tds[0].text.strip()
            link = tds[0].select('a')[0].attrs['href']

            title_match = re_title.search(title)
            if not title_match:
                continue

            ep_match = re_episode.search(title)
            if not ep_match:
                continue

            # The last matched group's NAME carries the video index after a
            # 3-character prefix (e.g. 'idx7' -> 7).
            # NOTE(review): assumes the caller builds group names with a
            # fixed 3-char prefix — confirm against the regex constructor.
            title_idx = int(title_match.lastgroup[3:])
            video = setting.settings['video'][title_idx]
            ep = int(ep_match.group(1))

            if ep <= video['ignore_ep_under']:
                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                continue
            elif ep in setting.downloaded[video['title']]:
                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                continue

            # Board hrefs are relative; resolve against the board's top URL.
            link = urllib.parse.urljoin(top_url, link)
            links.append(PageLink(video['title'], ep, link))

            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))

        return links

    def crawl_downlink(self, page_link, soup):
        """Extract torrent download links from one content (post) page.

        Args:
            page_link: PageLink for the post; provides `.url` (base for
                resolving relative download URLs), `.title` and `.episode`.
            soup: parsed content page.

        Returns:
            list of TorrentFile; empty when the post was reported
            ('신고된 게시물') and its contents removed.
        """
        links = []
        # Reported posts have their body replaced by a notice — nothing to do.
        if '신고된 게시물' in soup.find(id='writeContents').text:
            return links

        for a in soup.find_all('a'):
            if 'href' in a.attrs and 'javascript:file_download' in a['href']:
                # href shape: javascript:file_download('<rel-url>','<name>');
                jscript = a['href']
                start = len("javascript:file_download('")
                end = jscript.index("','")
                sub_url = jscript[start:end]
                url = urllib.parse.urljoin(page_link.url, sub_url)

                # Second argument is the percent-encoded file name;
                # [-3:] strips the trailing  ');  of the javascript call.
                file_name = jscript[end+3:-3]
                file_name = urllib.parse.unquote(file_name)

                torrent = TorrentFile(page_link.title, page_link.episode, file_name, url)
                links.append(torrent)
                Logger.log(' found download link : {}({})'.format(file_name, url))

        return links
|
||||
Reference in New Issue
Block a user