# TorrentCrawler/Crawler/Crawler.py

import sys
import io
import os

import requests
from bs4 import BeautifulSoup
import re
import pickle

from .Setting import Setting

# Re-wrap the standard streams as UTF-8 text streams so that printed text is
# always encoded as UTF-8 instead of the console's default encoding.
# detach() hands the underlying binary buffer to the new wrapper; the old
# wrapper objects are unusable afterwards.
sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')


class PageLink:
	"""A content page found on a listing page.

	Attributes:
		title: Title of the video the page belongs to.
		episode: Episode number parsed from the listing entry.
		url: URL of the content page.
	"""

	def __init__(self, title='', episode='', url=''):
		"""Create a link; every field defaults to an empty string.

		Python has no constructor overloading: a second ``__init__`` replaces
		the first, so the no-argument form is provided through defaults.
		"""
		self.title = title
		self.episode = episode
		self.url = url

	def __str__(self):
		"""Return ``{ title(episode): url }``."""
		return '{{ {}({}): {} }}'.format(self.title, self.episode, self.url)

	def __repr__(self):
		"""Use the same text as ``str()`` so lists of links print readably."""
		return str(self)


class TorrentFile:
	"""A downloadable file found on a content page.

	Attributes:
		title: Title of the video the file belongs to.
		episode: Episode number of the video.
		file_name: Name of the file as shown on the content page.
		url: Download URL of the file.
	"""

	def __init__(self, title='', episode='', file_name='', url=''):
		"""Create a file record; every field defaults to an empty string.

		Python has no constructor overloading: a second ``__init__`` replaces
		the first, so the no-argument form is provided through defaults.
		"""
		self.title = title
		self.episode = episode
		self.file_name = file_name
		self.url = url

	def __str__(self):
		"""Return ``{ title(episode): file_name(url) }``."""
		return '{{ {}({}): {}({}) }}'.format(self.title, self.episode, self.file_name, self.url)

	def __repr__(self):
		"""Use the same text as ``str()`` so lists of files print readably."""
		return str(self)


class Crawler:
	"""Find new episodes on board listing pages and download their files.

	Requests are sent through HTTP proxies scraped from us-proxy.org. The
	proxy list is cached on disk in ``PROXY_FILE_NAME``; a proxy is marked
	dead as soon as a request through it fails, and the list is scraped again
	once every proxy is dead.
	"""

	PROXY_FILE_NAME = 'proxy.bin'

	# Seconds to wait for the proxy-list site before giving up.
	PROXY_LIST_TIMEOUT = 10

	def __init__(self):
		"""Load the settings and start with an empty proxy list and a new session."""
		self.setting = Setting()
		self.proxies = []
		self.session = requests.Session()
		self.cookies = None

	@staticmethod
	def print_log(files):
		"""Append the ``file_name`` of every item in *files* to ``output/log.txt``."""
		# ``with`` closes the handle even when a write fails.
		with open('output/log.txt', 'at') as f:
			f.writelines(file.file_name + '\n' for file in files)

	@staticmethod
	def _resolve_link(page_url, href):
		"""Return *href* as an absolute URL, resolved against *page_url*."""
		# Function-scope import keeps this class independent of the file's import block.
		from urllib.parse import urljoin
		return urljoin(page_url, href)

	def crawl_proxy(self):
		"""Return the proxy list, read from the cache file or freshly scraped.

		Returns:
			A list of dicts with the keys ``'http'``, ``'https'`` (both
			``'ip:port'``) and ``'alive'`` (``True`` for fresh entries).
		"""
		if os.path.exists(Crawler.PROXY_FILE_NAME):
			# NOTE(security): pickle.load can execute arbitrary code; this is
			# only acceptable because the cache file is written by this class.
			with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
				return pickle.load(f)

		# Without a timeout an unresponsive site would block the crawl forever.
		resp = requests.get('https://www.us-proxy.org', timeout=Crawler.PROXY_LIST_TIMEOUT)
		soup = BeautifulSoup(resp.text, 'lxml')
		tables = soup.select('table.table')
		# The first row is skipped, as in the original; presumably the header row.
		rows = tables[0].select('tr')[1:] if tables else []

		proxies = []
		for tr in rows:
			tds = tr.select('td')
			if len(tds) < 2:
				continue
			address = '{}:{}'.format(tds[0].text, tds[1].text)
			proxies.append({'http': address, 'https': address, 'alive': True})

		# An empty cache file would be reloaded on every call and block re-scraping.
		if proxies:
			with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
				pickle.dump(proxies, f)

		print('proxy cnt : {}'.format(len(proxies)))
		return proxies

	def get_proxy(self):
		"""Return the first proxy still marked alive, or ``None`` if there is none.

		The proxy list is (re)loaded through ``crawl_proxy`` when it is empty.
		"""
		if not self.proxies:
			self.proxies = self.crawl_proxy()

		return next((candidate for candidate in self.proxies if candidate['alive']), None)

	def set_proxy_dead(self, proxy):
		"""Mark *proxy* as dead and update the cache file.

		When no live proxy remains, the cache file is removed and the list is
		cleared so that the next ``get_proxy`` call scrapes a fresh list.

		Args:
			proxy: A dict from ``self.proxies``; ``None`` (no proxy in use) is ignored.
		"""
		if proxy is None:
			return

		proxy['alive'] = False
		if any(candidate['alive'] for candidate in self.proxies):
			with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
				pickle.dump(self.proxies, f)
			return

		try:
			os.remove(Crawler.PROXY_FILE_NAME)
		except FileNotFoundError:
			# Nothing was cached; the list still has to be reset.
			pass
		self.proxies = []

	def request_get(self, url):
		"""GET *url* through a live proxy, rotating proxies until one answers 200.

		A proxy is marked dead when the request raises or the status is not 200.

		Args:
			url: Address to fetch.

		Returns:
			The ``requests`` response with status code 200.

		Raises:
			Exception: The request error, when it happens on a direct
				connection (no proxy available) and so cannot be retried
				through another proxy.
			requests.HTTPError: On a non-200 status over a direct connection.
		"""
		# NOTE(review): 'br' is advertised; confirm the environment can decode
		# brotli responses.
		headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
			'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
			'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
			'Accept-Encoding': 'gzip, deflate, br',
			'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
			'Connection': 'keep-alive',
		}

		while True:
			proxy = self.get_proxy()
			# Pass only the scheme entries, not the 'alive' bookkeeping flag.
			routes = None if proxy is None else {'http': proxy['http'], 'https': proxy['https']}
			try:
				resp = self.session.get(url, proxies=routes, headers=headers, cookies=self.cookies, timeout=3)
			except Exception:
				# Deliberately broad: any failure through a proxy means "try the next one".
				if proxy is None:
					raise
				self.set_proxy_dead(proxy)
				continue

			self.cookies = resp.cookies
			if resp.status_code == 200:
				return resp

			if proxy is None:
				raise requests.HTTPError(
					'unexpected status {} for {}'.format(resp.status_code, url),
					response=resp,
				)
			self.set_proxy_dead(proxy)

	def crawl_list(self, url):
		"""Scan one listing page for new episodes of the configured videos.

		A row is kept when its title matches a configured keyword, contains an
		episode marker such as `` E12 ``, is above the video's
		``ignore_ep_under`` and is not already downloaded.

		Args:
			url: Address of the listing page.

		Returns:
			A list of ``PageLink`` objects with absolute URLs.
		"""
		print('checking page {}'.format(url), flush=True)

		videos = self.setting.settings['video']
		if not videos:
			# An empty pattern would match every title without a named group.
			return []

		resp = self.request_get(url)
		soup = BeautifulSoup(resp.text, 'lxml')

		# One named group per video: the matched group name gives the video index.
		re_title = re.compile(
			'|'.join('(?P<key{}>{})'.format(i, video['keyword']) for i, video in enumerate(videos)),
			re.I,
		)
		re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)

		links = []
		tables = soup.select('table.table')
		if not tables:
			return links

		# The original selector 'tr.' is not valid CSS; take every row and skip
		# those without a subject cell instead.
		for tr in tables[0].select('tr'):
			subjects = tr.select('div.td-subject')
			if not subjects:
				continue
			anchors = subjects[0].select('a')
			href = anchors[0].get('href') if anchors else None
			if not href:
				continue

			title = subjects[0].text.strip()
			title_match = re_title.search(title)
			if not title_match:
				continue

			ep_match = re_episode.search(title)
			if not ep_match:
				continue

			# Group names are 'key<index>'; strip the prefix to get the index.
			video = videos[int(title_match.lastgroup[3:])]
			ep = int(ep_match.group(1))

			if ep <= video['ignore_ep_under']:
				print('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
				continue
			if ep in self.setting.downloaded[video['title']]:
				print('    {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
				continue

			link = self._resolve_link(url, href)
			links.append(PageLink(video['title'], ep, link))

			print('   found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)

		return links

	def crawl_downlink(self, link):
		"""Collect the download links listed on a content page.

		Args:
			link: The ``PageLink`` of the content page.

		Returns:
			A list of ``TorrentFile`` objects; empty when the page has no
			``bo_v_file`` element.
		"""
		print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)

		resp = self.request_get(link.url)
		soup = BeautifulSoup(resp.text, 'lxml')

		files = []
		container = soup.find(id='bo_v_file')
		if container is None:
			return files

		for tag in container.find_all('a', {'class': 'view_file_download'}):
			name_tag = tag.find('strong')
			href = tag.get('href')
			if name_tag is None or not href:
				continue

			file_name = str(name_tag.text)
			# A relative href is made absolute so it can be requested later.
			url = self._resolve_link(link.url, href)
			files.append(TorrentFile(link.title, link.episode, file_name, url))

			print('    found download link : {}({})'.format(file_name, url), flush=True)

		return files

	def download_files(self, file):
		"""Download one file into the configured download directory.

		The episode is skipped when already recorded as downloaded. On success
		it is added to the downloaded list and the settings are saved. Any
		failure is printed and swallowed so the remaining files are still tried.

		Args:
			file: The ``TorrentFile`` to download.
		"""
		if file.episode in self.setting.downloaded[file.title]:
			return

		print("start download {}".format(file.file_name), flush=True)

		try:
			# Cookies are handled inside request_get; TorrentFile carries none.
			response = self.request_get(file.url)
			# The name comes from a scraped page: basename() keeps it inside
			# the download directory.
			target = os.path.join(self.setting.settings['download_path'], os.path.basename(file.file_name))
			with open(target, 'wb') as f:
				f.write(response.content)

			self.setting.downloaded[file.title].append(file.episode)
			self.setting.save()

			print("downloaded {}".format(file.file_name), flush=True)

		except Exception as e:
			print(e)

	def crawl_torrent(self):
		"""Run the pipeline: listing pages, then content pages, then downloads."""
		page_links = []
		for org_url in self.setting.settings['urls']:
			for page in range(1, self.setting.settings['max_page'] + 1):
				page_links += self.crawl_list(org_url.replace('<page>', str(page)))

		files = []
		for link in page_links:
			files += self.crawl_downlink(link)

		for file in files:
			self.download_files(file)

	def crawl(self):
		"""Entry point: run a full crawl and print start and finish markers."""
		print('Crawling start')

		self.crawl_torrent()

		print('Crawling finished')