# Source: TorrentCrawler/Crawler/Crawler.py (174 lines, 4.2 KiB, Python)
import sys
import io
import os
import requests
from bs4 import BeautifulSoup
import re
from .Setting import Setting
# Rewrap stdout/stderr as UTF-8 so printed titles (non-ASCII, e.g. Korean
# board titles) don't raise UnicodeEncodeError on consoles whose default
# encoding is not UTF-8.
# NOTE(review): detach() invalidates the original stream objects; anything
# that cached a reference to the old sys.stdout breaks — confirm no caller
# imports this module after capturing the streams.
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
class PageLink:
    """A link to a board content page for one episode of a tracked title."""

    # Bug fix: the original defined __init__ twice; Python keeps only the
    # last definition, so the zero-argument form was dead code. A single
    # __init__ with defaults supports both call styles.
    def __init__(self, title='', episode='', url=''):
        """Store the title, episode number, and content-page URL.

        Args:
            title: series title as configured in settings.
            episode: episode number parsed from the board title.
            url: absolute URL of the content page.
        """
        self.title = title
        self.episode = episode
        self.url = url

    def __str__(self):
        # Doubled braces are literal '{'/'}' in str.format.
        return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

    def __repr__(self):
        return str(self)
class TorrentFile:
    """A downloadable torrent file discovered on a content page."""

    # Bug fix: the original defined __init__ twice; only the last definition
    # survives in Python, so the zero-argument form was dead code. Defaults
    # restore that call style while keeping the four-argument form intact.
    def __init__(self, title='', episode='', file_name='', url=''):
        """Store identifying info for one torrent file.

        Args:
            title: series title as configured in settings.
            episode: episode number this file belongs to.
            file_name: file name shown on the board (used as the save name).
            url: direct download URL of the .torrent file.
        """
        self.title = title
        self.file_name = file_name
        self.episode = episode
        self.url = url

    def __str__(self):
        # Doubled braces are literal '{'/'}' in str.format.
        return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

    def __repr__(self):
        return str(self)
class Crawler:
    """Crawls configured torrent board sites, discovers new episodes of the
    configured titles, and downloads their .torrent files.

    Pipeline (see crawl()): paginated board lists -> content pages ->
    download links -> file download + bookkeeping via Setting.
    """

    def __init__(self):
        # Setting supplies settings['video'|'urls'|'max_page'|'download_path']
        # and the per-title 'downloaded' episode lists.
        self.setting = Setting()

    def print_log(self, files):
        """Append each downloaded file's name to output/log.txt."""
        # 'with' closes the handle even if a write fails (the original
        # leaked the file object on error).
        with open('output/log.txt', 'at') as f:
            for file in files:
                f.write(file.file_name + '\n')

    @staticmethod
    def _resolve_link(base_url, href):
        """Resolve an anchor href against the scheme+host of base_url.

        Absolute URLs (starting with 'http') pass through unchanged; any
        other href is treated as root-relative on base_url's host.

        Bug fix: the original inverted the 'found a slash' test, truncating
        the URL to 7 chars when it had no path and keeping the whole URL
        (path included) when it did.
        """
        if href.startswith('http'):
            return href
        # First '/' after the scheme; searching from index 8 skips
        # 'https://' (mirrors the original's url[8:] search).
        path_start = base_url.find('/', 8)
        top_url = base_url if path_start < 0 else base_url[:path_start]
        if not href.startswith('/'):
            href = '/' + href
        return top_url + href

    def crawl_list(self, url):
        """Scan one board-list page; return PageLinks for every configured
        title whose episode is new (not ignored, not already downloaded)."""
        print('checking page {}'.format(url), flush=True)
        soup = BeautifulSoup(requests.get(url).text, 'lxml')
        # One named group per configured keyword; Match.lastgroup then tells
        # us which keyword hit ('keyN' -> settings['video'][N]).
        # NOTE(review): keywords are interpolated as raw regex fragments
        # (not re.escape'd) — presumably intentional so settings may use
        # regex syntax; a malformed keyword raises here. Confirm.
        re_title = re.compile(
            '|'.join('(?P<key{}>{})'.format(i, video['keyword'])
                     for i, video in enumerate(self.setting.settings['video'])),
            re.I)
        # Episode marker such as ' E12.' (delimited by space/dot/comma).
        re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
        links = []
        for link in soup.select('#fboardlist > div.tbl_head01.tbl_wrap > table > tbody > tr > td.td_subject > a'):
            board_title = link.get_text().strip()
            title_match = re_title.search(board_title)
            if not title_match:
                continue
            ep_match = re_episode.search(board_title)
            if not ep_match:
                continue
            # lastgroup is 'keyN' — strip the 3-char 'key' prefix for N.
            video = self.setting.settings['video'][int(title_match.lastgroup[3:])]
            ep = int(ep_match.group(1))
            if ep <= video['ignore_ep_under']:
                print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
                continue
            if ep in self.setting.downloaded[video['title']]:
                print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
                continue
            link_url = self._resolve_link(url, link.get('href'))
            links.append(PageLink(video['title'], ep, link_url))
            print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
        return links

    def crawl_downlink(self, link):
        """Open one content page and return its TorrentFile download links."""
        print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
        soup = BeautifulSoup(requests.get(link.url).text, 'lxml')
        links = []
        # Download anchors live inside the #bo_v_file box; if the box is
        # missing this raises AttributeError (same behavior as the original).
        for tag in soup.find(id='bo_v_file').find_all('a', {'class': 'view_file_download'}):
            file_name = str(tag.find('strong').text)
            file_url = tag.get('href')
            links.append(TorrentFile(link.title, link.episode, file_name, file_url))
            print(' found download link : {}({})'.format(file_name, file_url), flush=True)
        return links

    def download_files(self, file):
        """Download a single torrent file and record its episode as done."""
        if file.episode in self.setting.downloaded[file.title]:
            return
        print("start download {}".format(file.file_name), flush=True)
        try:
            data = requests.get(file.url).content
            # NOTE(review): plain '+' join assumes download_path ends with a
            # path separator — confirm before switching to os.path.join.
            # 'with' closes the handle even on write failure (the original
            # leaked it).
            with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
                f.write(data)
            # Persist immediately so an interrupted run won't re-download.
            self.setting.downloaded[file.title].append(file.episode)
            self.setting.save()
            print("downloaded {}".format(file.file_name), flush=True)
        except Exception as e:
            # Deliberate best-effort: report and continue with the next file.
            print(e)

    def crawl(self):
        """Entry point: board lists -> content pages -> file downloads."""
        print('Crawling start')
        page_links = []
        for url in self.setting.settings['urls']:
            # Board pages are paginated as url + '1', url + '2', ...
            for page in range(1, self.setting.settings['max_page'] + 1):
                page_links += self.crawl_list(url + str(page))
        files = []
        for link in page_links:
            files += self.crawl_downlink(link)
        for file in files:
            self.download_files(file)
        print('Crawling finished')