diff --git a/Crawler/Crawler.py b/Crawler/Crawler.py
index 346efee..19b252a 100644
--- a/Crawler/Crawler.py
+++ b/Crawler/Crawler.py
@@ -5,12 +5,14 @@
 import os
 import requests
 from bs4 import BeautifulSoup
 import re
+import pickle
 from .Setting import Setting
 
 sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
 sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
 
+
 class PageLink:
     def __init__(self):
         self.title = ''
@@ -28,6 +30,7 @@ class PageLink:
     def __repr__(self):
         return str(self)
 
+
 class TorrentFile:
     def __init__(self):
         self.title = ''
@@ -49,36 +52,133 @@ class TorrentFile:
 
 
 class Crawler:
+    PROXY_FILE_NAME = 'proxy.bin'
+
     def __init__(self):
         self.setting = Setting()
+        self.proxies = []
+        self.session = requests.Session()
+        self.cookies = None
 
-    def print_log(self, files):
+    @staticmethod
+    def print_log(files):
         f = open('output/log.txt', 'at')
         for file in files:
             f.write(file.file_name+'\n')
         f.close()
 
+    def crawl_proxy(self):
+        proxies = []
+
+        if os.path.exists(Crawler.PROXY_FILE_NAME):
+            with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
+                proxies = pickle.load(f)
+            return proxies
+
+        else:
+            resp = requests.get('https://www.us-proxy.org')
+            soup = BeautifulSoup(resp.text, 'lxml')
+            table = soup.select('table.table')
+            trs = table[0].select('tr')
+            cnt = 0
+
+            for tr in trs[1:]:
+                tds = tr.select('td')
+                if len(tds) > 0:
+                    ip, port = tds[0].text, tds[1].text
+                    proxies.append(
+                        {
+                            'http': '{}:{}'.format(ip, port),
+                            'https': '{}:{}'.format(ip, port),
+                            'alive': True,
+                        }
+                    )
+                    # print('{}:{}'.format(ip, port))
+                    cnt += 1
+
+            with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
+                pickle.dump(proxies, f)
+
+            print('proxy cnt : {}'.format(cnt))
+            return proxies
+
+    def get_proxy(self):
+        if len(self.proxies) <= 0:
+            if os.path.exists(Crawler.PROXY_FILE_NAME):
+                with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
+                    self.proxies = pickle.load(f)
+            else:
+                self.proxies = self.crawl_proxy()
+
+        for proxy in self.proxies:
+            if proxy['alive']:
+                return proxy
+
+        return None
+
+    def set_proxy_dead(self, proxy):
+        proxy['alive'] = False
+        for proxy in self.proxies:
+            if proxy['alive']:
+                with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
+                    pickle.dump(self.proxies, f)
+                return
+
+        os.remove(Crawler.PROXY_FILE_NAME)
+        self.proxies = []
+
+    def request_get(self, url, cookies=None):
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
+            'Connection': 'keep-alive',
+        }
+
+        proxy = self.get_proxy()
+        while True:
+            try:
+                resp = self.session.get(url, proxies=proxy, headers=headers, cookies=self.cookies if cookies is None else cookies, timeout=3)
+                self.cookies = resp.cookies
+            except Exception:
+                self.set_proxy_dead(proxy)
+                proxy = self.get_proxy()
+                continue
+            else:
+                if resp.status_code != 200:
+                    self.set_proxy_dead(proxy)
+                    proxy = self.get_proxy()
+                    continue
+                else:
+                    break
+
+        return resp
 
     def crawl_list(self, url):
-        print('checking page {}'.format(url), flush=True)
-        code = requests.get(url)
-        html = code.text
+        resp = self.request_get(url)
+        html = resp.text
         soup = BeautifulSoup(html, 'lxml')
 
         re_title = re.compile('|'.join(['(?P<grp' + str(i) + '>' + video['keyword'] + ')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
         re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
 
         links = []
-        for link in soup.select('#fboardlist > div.tbl_head01.tbl_wrap > table > tbody > tr > td.td_subject > a'):
-            board_title = link.get_text().strip()
+        tables = soup.select('table.table')
+        trs = tables[0].select('tr')
+        for tr in trs:
+            tds = tr.select('div.td-subject')
+            title = tds[0].text.strip()
+            link = tds[0].select('a')[0].attrs['href']
 
-            title_match = re_title.search(board_title)
+            title_match = re_title.search(title)
             if not title_match:
                 continue
 
-            ep_match = re_episode.search(board_title)
+            ep_match = re_episode.search(title)
             if not ep_match:
                 continue
@@ -93,31 +193,73 @@ class Crawler:
                 print('    {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
                 continue
 
-            link_url = link.get('href')
-            if not link_url.startswith('http'):
+            if not link.startswith('http'):
                 top_end = url[8:].find('/')
                 if top_end < 0:
                     top_url = url[:8 + top_end]
                 else:
                     top_url = url
 
-                if link_url[0] != '/':
-                    link_url = '/' + link_url
+                if link[0] != '/':
+                    link = '/' + link
 
-                link_url = top_url + link_url
+                link = top_url + link
 
-            links.append(PageLink(video['title'], ep, link_url))
+            links.append(PageLink(video['title'], ep, link))
 
-            print('    found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
+            print('    found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
+
+
+        # selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a'  # torrentkim
+        # for link in soup.select(selector):
+        #     if link.has_attr('rel') and 'nofollow' in link['rel']:
+        #         continue
+        #
+        #     board_title = link.get_text().strip()
+        #
+        #     title_match = re_title.search(board_title)
+        #     if not title_match:
+        #         continue
+        #
+        #     ep_match = re_episode.search(board_title)
+        #     if not ep_match:
+        #         continue
+        #
+        #     title_idx = int(title_match.lastgroup[3:])
+        #     video = self.setting.settings['video'][title_idx]
+        #     ep = int(ep_match.group(1))
+        #
+        #     if ep <= video['ignore_ep_under']:
+        #         print('    {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
+        #         continue
+        #     elif ep in self.setting.downloaded[video['title']]:
+        #         print('    {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
+        #         continue
+        #
+        #     link_url = link.get('href')
+        #     if not link_url.startswith('http'):
+        #         top_end = url[8:].find('/')
+        #         if top_end < 0:
+        #             top_url = url[:8 + top_end]
+        #         else:
+        #             top_url = url
+        #
+        #         if link_url[0] != '/':
+        #             link_url = '/' + link_url
+        #
+        #         link_url = top_url + link_url
+        #
+        #     links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
+        #
+        #     print('    found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
 
         return links
 
     def crawl_downlink(self, link):
         print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
-        code = requests.get(link.url)
-        html = code.text
-        soup = BeautifulSoup(html, 'lxml')
+        resp = self.request_get(link.url)
+        soup = BeautifulSoup(resp.text, 'lxml')
 
         links = []
         a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
@@ -137,11 +279,9 @@ class Crawler:
         print("start download {}".format(file.file_name), flush=True)
 
         try:
-            response = requests.get(file.url)
-            data = response.content
-            f = open(self.setting.settings['download_path'] + file.file_name, 'wb')
-            f.write(data)
-            f.close()
+            response = self.request_get(file.url, cookies=file.cookie)
+            with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
+                f.write(response.content)
 
             self.setting.downloaded[file.title].append(file.episode)
             self.setting.save()
@@ -151,16 +291,13 @@ class Crawler:
         except Exception as e:
             print(e)
 
-
-    def crawl(self):
-
-        print('Crawling start')
-
+    def crawl_torrent(self):
         page_links = []
-        for url in self.setting.settings['urls']:
+        for org_url in self.setting.settings['urls']:
             page = 1
             while page <= self.setting.settings['max_page']:
-                page_links += self.crawl_list(url+str(page))
+                url = org_url.replace('<page>', str(page))
+                page_links += self.crawl_list(url)
                 page += 1
 
         files = []
@@ -170,4 +307,9 @@ class Crawler:
         for file in files:
             self.download_files(file)
 
+    def crawl(self):
+        print('Crawling start')
+
+        self.crawl_torrent()
+
         print('Crawling finished')
diff --git a/Main.py b/Main.py
index 5cf28ec..01a6fd1 100644
--- a/Main.py
+++ b/Main.py
@@ -1,21 +1,5 @@
 from Crawler.Crawler import Crawler
 
-# def do_it():
-#     crawler = Crawler()
-#
-#     files = []
-#     for url in setting.urls:
-#         for page in range(1, setting.max_page+1):
-#             page_url = url+str(page)
-#             page_links = crawler.crawl_list(page_url)
-#
-#             # for link in page_links:
-#             #     files += crawl_downlink(link)
-#             #
-#             # download_files(files)
-#
-# do_it()
-
 crawler = Crawler()
-crawler.crawl()
\ No newline at end of file
+crawler.crawl()