프록시 추가

2017-07-27 22:16:49 +09:00
parent 20e048ca45
commit e9549f3ab5
2 changed files with 173 additions and 47 deletions
--- a/Crawler/Crawler.py
+++ b/Crawler/Crawler.py
@@ -5,12 +5,14 @@ import os
 import requests
 from bs4 import BeautifulSoup
 import re
+import pickle

 from .Setting import Setting

 sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
 sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')

+
 class PageLink:
 	def __init__(self):
 		self.title = ''
@@ -28,6 +30,7 @@ class PageLink:
 	def __repr__(self):
 		return str(self)

+
 class TorrentFile:
 	def __init__(self):
 		self.title = ''
@@ -49,36 +52,133 @@ class TorrentFile:


 class Crawler:
+	PROXY_FILE_NAME = 'proxy.bin'
+
 	def __init__(self):
 		self.setting = Setting()
+		self.proxies = []
+		self.session = requests.Session()
+		self.cookies = None

-	def print_log(self, files):
+	@staticmethod
+	def print_log(files):
+		"""Append the ``file_name`` of every entry in *files* to output/log.txt, one per line.
+
+		:param files: iterable of objects exposing a ``file_name`` attribute.
+		"""
+		# NOTE(review): the handle is not closed if a write raises; a with-block would cover that.
 		f = open('output/log.txt', 'at')
 		for file in files:
 			f.write(file.file_name+'\n')
 		f.close()

+	def crawl_proxy(self, timeout=10):
+		"""Return the proxy list, from the on-disk cache or by scraping us-proxy.org.
+
+		If PROXY_FILE_NAME exists, its pickled contents are returned unchanged.
+		Otherwise the first ``table.table`` element of https://www.us-proxy.org
+		is parsed (header row skipped), the first two cells of each row are read
+		as IP and port, and the resulting list is pickled to PROXY_FILE_NAME.
+
+		:param timeout: seconds to wait for the proxy-list page; without a
+			timeout a stalled server would block the crawler forever.
+		:return: list of dicts with 'http' and 'https' ("ip:port") and 'alive'
+			(True) keys; empty when the page holds no usable table.
+		"""
+		# NOTE(review): pickle.load can execute arbitrary code from a tampered
+		# file; the cache is assumed to be written only by this class.
+		if os.path.exists(Crawler.PROXY_FILE_NAME):
+			with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
+				return pickle.load(f)
+
+		resp = requests.get('https://www.us-proxy.org', timeout=timeout)
+		soup = BeautifulSoup(resp.text, 'lxml')
+		tables = soup.select('table.table')
+
+		proxies = []
+		# Skip the header row; a changed page layout (no table) yields no rows.
+		rows = tables[0].select('tr')[1:] if tables else []
+		for tr in rows:
+			tds = tr.select('td')
+			# Rows with fewer than two cells carry no ip/port pair.
+			if len(tds) < 2:
+				continue
+			address = '{}:{}'.format(tds[0].text.strip(), tds[1].text.strip())
+			proxies.append(
+				{
+					'http': address,
+					'https': address,
+					'alive': True,
+				}
+			)
+
+		# Do not cache an empty result, so that the next call scrapes again
+		# instead of loading a useless list.
+		if proxies:
+			with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
+				pickle.dump(proxies, f)
+
+		print('proxy cnt : {}'.format(len(proxies)))
+		return proxies
+
+	def get_proxy(self):
+		"""Return the first proxy still flagged alive, or None when there is none.
+
+		An empty self.proxies is refilled first: from the pickled cache file
+		when it exists, otherwise through crawl_proxy().
+		"""
+		if not self.proxies:
+			if not os.path.exists(Crawler.PROXY_FILE_NAME):
+				self.proxies = self.crawl_proxy()
+			else:
+				with open(Crawler.PROXY_FILE_NAME, 'rb') as cache:
+					self.proxies = pickle.load(cache)
+
+		# The list order decides which live proxy is handed out.
+		return next((entry for entry in self.proxies if entry['alive']), None)
+
+	def set_proxy_dead(self, proxy):
+		"""Flag *proxy* as dead and persist the updated proxy list.
+
+		While at least one proxy in self.proxies is still alive, the whole list
+		is re-pickled to PROXY_FILE_NAME. Once none is left, the cache file is
+		deleted and self.proxies is emptied, which makes get_proxy() refill it.
+
+		:param proxy: a proxy dict as returned by get_proxy(). None (no proxy was
+			available) is accepted and marks nothing.
+		"""
+		if proxy is not None:
+			proxy['alive'] = False
+
+		if any(candidate['alive'] for candidate in self.proxies):
+			with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
+				pickle.dump(self.proxies, f)
+			return
+
+		# Every proxy is exhausted: drop the cache. The file may never have been
+		# written or may already be gone, which must not abort the crawl.
+		try:
+			os.remove(Crawler.PROXY_FILE_NAME)
+		except FileNotFoundError:
+			pass
+		self.proxies = []
+
+	def request_get(self, url, cookies=None):
+		"""GET *url* through the proxy pool, switching proxies until one answers 200.
+
+		Each attempt uses the proxy returned by get_proxy(). A raised exception
+		or a non-200 status marks that proxy dead via set_proxy_dead() and the
+		next proxy is tried. The cookies of every received response are stored
+		in self.cookies.
+
+		:param url: address to fetch.
+		:param cookies: cookies to send with this request; when None, the
+			cookies of the previous response (self.cookies) are sent.
+		:return: the first response whose status code is 200.
+		:raises RuntimeError: when no proxy is available and the direct request
+			returns a non-200 status. An exception raised by the direct request
+			itself propagates unchanged.
+		"""
+		headers = {
+			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
+			'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+			'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
+			'Accept-Encoding': 'gzip, deflate, br',
+			'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
+			'Connection': 'keep-alive',
+		}
+
+		while True:
+			proxy = self.get_proxy()
+			# Hand only the scheme entries to the HTTP client; 'alive' is
+			# bookkeeping of this class, not a proxy URL.
+			if proxy is None:
+				proxy_urls = None
+			else:
+				proxy_urls = {scheme: proxy[scheme] for scheme in ('http', 'https') if scheme in proxy}
+
+			try:
+				resp = self.session.get(
+					url,
+					proxies=proxy_urls,
+					headers=headers,
+					cookies=self.cookies if cookies is None else cookies,
+					timeout=3,
+				)
+				self.cookies = resp.cookies
+			except Exception:
+				# Any failure is treated as a broken proxy. Without a proxy
+				# there is nothing to rotate to, so the error is passed on.
+				if proxy is None:
+					raise
+				self.set_proxy_dead(proxy)
+				continue
+
+			if resp.status_code == 200:
+				return resp
+
+			if proxy is None:
+				raise RuntimeError('request to {} failed with status {}'.format(url, resp.status_code))
+			self.set_proxy_dead(proxy)

 	def crawl_list(self, url):
-
 		print('checking page {}'.format(url), flush=True)

-		code = requests.get(url)
-		html = code.text
+		resp = self.request_get(url)
+		html = resp.text
 		soup = BeautifulSoup(html, 'lxml')

 		re_title = re.compile('|'.join(['(?P<key'+str(i)+'>'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
 		re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)

 		links = []
-		for link in soup.select('#fboardlist > div.tbl_head01.tbl_wrap > table > tbody > tr > td.td_subject > a'):
-			board_title = link.get_text().strip()
+		tables = soup.select('table.table')
+		trs = tables[0].select('tr.')
+		for tr in trs:
+			tds = tr.select('div.td-subject')
+			title = tds[0].text.strip()
+			link = tds[0].select('a')[0].attrs['href']

-			title_match = re_title.search(board_title)
+			title_match = re_title.search(title)
 			if not title_match:
 				continue

-			ep_match = re_episode.search(board_title)
+			ep_match = re_episode.search(title)
 			if not ep_match:
 				continue

@@ -93,31 +193,73 @@ class Crawler:
 				print('    {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
 				continue

-			link_url = link.get('href')
-			if not link_url.startswith('http'):
+			if not link.startswith('http'):
 				top_end = url[8:].find('/')
 				if top_end < 0:
 					top_url = url[:8 + top_end]
 				else:
 					top_url = url

-				if link_url[0] != '/':
-					link_url = '/' + link_url
+				if link[0] != '/':
+					link = '/' + link

-				link_url = top_url + link_url
+					link = top_url + link

-			links.append(PageLink(video['title'], ep, link_url))
+			links.append(PageLink(video['title'], ep, link))

-			print('   found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
+			print('   found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
+
+
+		# selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a'    # torrentkim
+		# for link in soup.select(selector):
+		# 	if link.has_attr('rel') and 'nofollow' in link['rel']:
+		# 		continue
+		#
+		# 	board_title = link.get_text().strip()
+		#
+		# 	title_match = re_title.search(board_title)
+		# 	if not title_match:
+		# 		continue
+		#
+		# 	ep_match = re_episode.search(board_title)
+		# 	if not ep_match:
+		# 		continue
+		#
+		# 	title_idx = int(title_match.lastgroup[3:])
+		# 	video = self.setting.settings['video'][title_idx]
+		# 	ep = int(ep_match.group(1))
+		#
+		# 	if ep <= video['ignore_ep_under']:
+		# 		print('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
+		# 		continue
+		# 	elif ep in self.setting.downloaded[video['title']]:
+		# 		print('    {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
+		# 		continue
+		#
+		# 	link_url = link.get('href')
+		# 	if not link_url.startswith('http'):
+		# 		top_end = url[8:].find('/')
+		# 		if top_end < 0:
+		# 			top_url = url[:8 + top_end]
+		# 		else:
+		# 			top_url = url
+		#
+		# 		if link_url[0] != '/':
+		# 			link_url = '/' + link_url
+		#
+		# 		link_url = top_url + link_url
+		#
+		# 	links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
+		#
+		# 	print('   found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)

 		return links

 	def crawl_downlink(self, link):
 		print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)

-		code = requests.get(link.url)
-		html = code.text
-		soup = BeautifulSoup(html, 'lxml')
+		resp = self.request_get(link.url)
+		soup = BeautifulSoup(resp.text, 'lxml')

 		links = []
 		a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
@@ -137,11 +279,9 @@ class Crawler:
 		print("start download {}".format(file.file_name), flush=True)

 		try:
-			response = requests.get(file.url)
-			data = response.content
-			f = open(self.setting.settings['download_path'] + file.file_name, 'wb')
-			f.write(data)
-			f.close()
+			response = self.request_get(file.url, cookies=file.cookie)
+			with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
+				f.write(response.content)

 			self.setting.downloaded[file.title].append(file.episode)
 			self.setting.save()
@@ -151,16 +291,13 @@ class Crawler:
 		except Exception as e:
 			print(e)

-
-	def crawl(self):
-
-		print('Crawling start')
-
+	def crawl_torrent(self):
 		page_links = []
-		for url in self.setting.settings['urls']:
+		for org_url in self.setting.settings['urls']:
 			page = 1
 			while page <= self.setting.settings['max_page']:
-				page_links += self.crawl_list(url+str(page))
+				url = org_url.replace('<page>', str(page))
+				page_links += self.crawl_list(url)
 				page += 1

 		files = []
@@ -170,4 +307,9 @@ class Crawler:
 		for file in files:
 			self.download_files(file)

+	def crawl(self):
+		"""Run one full crawl: print a start banner, call crawl_torrent(), print an end banner."""
+		print('Crawling start')
+
+		self.crawl_torrent()
+
 		print('Crawling finished')
--- a/Main.py
+++ b/Main.py
@@ -1,21 +1,5 @@
 from Crawler.Crawler import Crawler


-# def do_it():
-# 	crawler = Crawler()
-#
-# 	files = []
-# 	for url in setting.urls:
-# 		for page in range(1, setting.max_page+1):
-# 			page_url = url+str(page)
-# 			page_links = crawler.crawl_list(page_url)
-#
-# 	#         for link in page_links:
-# 	#             files += crawl_downlink(link)
-# 	#
-# 	# download_files(files)
-#
-# do_it()
-
 crawler = Crawler()
-crawler.crawl()
+crawler.crawl()