프록시 추가

2017-07-27 22:16:49 +09:00
parent 20e048ca45
commit e9549f3ab5
2 changed files with 173 additions and 47 deletions
--- a/Crawler/Crawler.py
+++ b/Crawler/Crawler.py
@@ -5,12 +5,14 @@ import os
 import requests
 from bs4 import BeautifulSoup
 import re
 import pickle
 from .Setting import Setting
 sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
 sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
 class PageLink:
 	def __init__(self):
 		self.title = ''
@@ -28,6 +30,7 @@ class PageLink:
 	def __repr__(self):
 		return str(self)
 class TorrentFile:
 	def __init__(self):
 		self.title = ''
@@ -49,36 +52,133 @@ class TorrentFile:
 class Crawler:
 	PROXY_FILE_NAME = 'proxy.bin'
 	def __init__(self):
 		"""Create the HTTP session, load the settings and start with an empty proxy pool."""
 		# One shared session; no response cookies have been captured yet.
 		self.session = requests.Session()
 		self.cookies = None
 		# The proxy pool is filled on demand by get_proxy().
 		self.proxies = []
 		self.setting = Setting()
-	def print_log(self, files):
+	@staticmethod
 	def print_log(files):
 		f = open('output/log.txt', 'at')
 		for file in files:
 			f.write(file.file_name+'\n')
 		f.close()
 	def crawl_proxy(self):
 		proxies = []
 		if os.path.exists(Crawler.PROXY_FILE_NAME):
 			with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
 				proxies = pickle.load(f)
 			return proxies
 		else:
 			resp = requests.get('https://www.us-proxy.org')
 			soup = BeautifulSoup(resp.text, 'lxml')
 			table = soup.select('table.table')
 			trs = table[0].select('tr')
 			cnt = 0
 			for tr in trs[1:]:
 				tds = tr.select('td')
 				if len(tds) > 0:
 					ip, port = tds[0].text, tds[1].text
 					proxies.append(
 						{
 							'http': '{}:{}'.format(ip, port),
 							'https': '{}:{}'.format(ip, port),
 							'alive': True,
 						}
 					)
 					# print('{}:{}'.format(ip, port))
 					cnt += 1
 			with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
 				pickle.dump(proxies, f)
 			print('proxy cnt : {}'.format(cnt))
 			return proxies
 	def get_proxy(self):
 		if len(self.proxies) <= 0:
 			if os.path.exists(Crawler.PROXY_FILE_NAME):
 				with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
 					self.proxies = pickle.load(f)
 			else:
 				self.proxies = self.crawl_proxy()
 		for proxy in self.proxies:
 			if proxy['alive']:
 				return proxy
 		return None
 	def set_proxy_dead(self, proxy):
 		proxy['alive'] = False
 		for proxy in self.proxies:
 			if proxy['alive']:
 				with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
 					pickle.dump(self.proxies, f)
 				return
 		os.remove(Crawler.PROXY_FILE_NAME)
 		self.proxies = []
 	def request_get(self, url):
 		headers = {
 			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
 			'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
 			'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
 			'Accept-Encoding': 'gzip, deflate, br',
 			'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
 			'Connection': 'keep-alive',
 		}
 		proxy = self.get_proxy()
 		while True:
 			try:
 				resp = self.session.get(url, proxies=proxy, headers=headers, cookies=self.cookies, timeout=3)
 				self.cookies = resp.cookies
 			except Exception as e:
 				self.set_proxy_dead(proxy)
 				proxy = self.get_proxy()
 				continue
 			else:
 				if resp.status_code != 200:
 					self.set_proxy_dead(proxy)
 					proxy = self.get_proxy()
 					continue
 				else:
 					break
 		return resp
 	def crawl_list(self, url):
 		print('checking page {}'.format(url), flush=True)
-		code = requests.get(url)
+		resp = self.request_get(url)
-		html = code.text
+		html = resp.text
 		soup = BeautifulSoup(html, 'lxml')
 		re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
 		re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
 		links = []
-		for link in soup.select('#fboardlist > div.tbl_head01.tbl_wrap > table > tbody > tr > td.td_subject > a'):
+		tables = soup.select('table.table')
-			board_title = link.get_text().strip()
+		trs = tables[0].select('tr.')
 		for tr in trs:
 			tds = tr.select('div.td-subject')
 			title = tds[0].text.strip()
 			link = tds[0].select('a')[0].attrs['href']
-			title_match = re_title.search(board_title)
+			title_match = re_title.search(title)
 			if not title_match:
 				continue
-			ep_match = re_episode.search(board_title)
+			ep_match = re_episode.search(title)
 			if not ep_match:
 				continue
@@ -93,31 +193,73 @@ class Crawler:
 				print('    {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
 				continue
-			link_url = link.get('href')
+			if not link.startswith('http'):
 			if not link_url.startswith('http'):
 				top_end = url[8:].find('/')
 				if top_end < 0:
 					top_url = url[:8 + top_end]
 				else:
 					top_url = url
-				if link_url[0] != '/':
+				if link[0] != '/':
-					link_url = '/' + link_url
+					link = '/' + link
-				link_url = top_url + link_url
+					link = top_url + link
-			links.append(PageLink(video['title'], ep, link_url))
+			links.append(PageLink(video['title'], ep, link))
-			print('   found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
+			print('   found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
 		# selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a'    # torrentkim
 		# for link in soup.select(selector):
 		# 	if link.has_attr('rel') and 'nofollow' in link['rel']:
 		# 		continue
 		#
 		# 	board_title = link.get_text().strip()
 		#
 		# 	title_match = re_title.search(board_title)
 		# 	if not title_match:
 		# 		continue
 		#
 		# 	ep_match = re_episode.search(board_title)
 		# 	if not ep_match:
 		# 		continue
 		#
 		# 	title_idx = int(title_match.lastgroup[3:])
 		# 	video = self.setting.settings['video'][title_idx]
 		# 	ep = int(ep_match.group(1))
 		#
 		# 	if ep <= video['ignore_ep_under']:
 		# 		print('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
 		# 		continue
 		# 	elif ep in self.setting.downloaded[video['title']]:
 		# 		print('    {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
 		# 		continue
 		#
 		# 	link_url = link.get('href')
 		# 	if not link_url.startswith('http'):
 		# 		top_end = url[8:].find('/')
 		# 		if top_end < 0:
 		# 			top_url = url[:8 + top_end]
 		# 		else:
 		# 			top_url = url
 		#
 		# 		if link_url[0] != '/':
 		# 			link_url = '/' + link_url
 		#
 		# 		link_url = top_url + link_url
 		#
 		# 	links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
 		#
 		# 	print('   found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
 		return links
 	def crawl_downlink(self, link):
 		print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
-		code = requests.get(link.url)
+		resp = self.request_get(link.url)
-		html = code.text
+		soup = BeautifulSoup(resp.text, 'lxml')
 		soup = BeautifulSoup(html, 'lxml')
 		links = []
 		a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
@@ -137,11 +279,9 @@ class Crawler:
 		print("start download {}".format(file.file_name), flush=True)
 		try:
-			response = requests.get(file.url)
+			response = self.request_get(file.url, cookies=file.cookie)
-			data = response.content
+			with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
-			f = open(self.setting.settings['download_path'] + file.file_name, 'wb')
+				f.write(response.content)
 			f.write(data)
 			f.close()
 			self.setting.downloaded[file.title].append(file.episode)
 			self.setting.save()
@@ -151,16 +291,13 @@ class Crawler:
 		except Exception as e:
 			print(e)
-
+	def crawl_torrent(self):
 	def crawl(self):
 		print('Crawling start')
 		page_links = []
-		for url in self.setting.settings['urls']:
+		for org_url in self.setting.settings['urls']:
 			page = 1
 			while page <= self.setting.settings['max_page']:
-				page_links += self.crawl_list(url+str(page))
+				url = org_url.replace('<page>', str(page))
 				page_links += self.crawl_list(url)
 				page += 1
 		files = []
@@ -170,4 +307,9 @@ class Crawler:
 		for file in files:
 			self.download_files(file)
 	def crawl(self):
 		"""Run crawl_torrent() between a start and a finish message on stdout."""
 		print('Crawling start')
 		self.crawl_torrent()
 		print('Crawling finished')
--- a/Main.py
+++ b/Main.py
@@ -1,21 +1,5 @@
 from Crawler.Crawler import Crawler
 # def do_it():
 # 	crawler = Crawler()
 #
 # 	files = []
 # 	for url in setting.urls:
 # 		for page in range(1, setting.max_page+1):
 # 			page_url = url+str(page)
 # 			page_links = crawler.crawl_list(page_url)
 #
 # 	#         for link in page_links:
 # 	#             files += crawl_downlink(link)
 # 	#
 # 	# download_files(files)
 #
 # do_it()
 crawler = Crawler()
-crawler.crawl()
+crawler.crawl()