From 6729cab06a282f0e3b49b2264a91c3f106488d43 Mon Sep 17 00:00:00 2001
From: mjjo53
Date: Sun, 6 Aug 2017 03:19:23 +0900
Subject: [PATCH] - Change to the torrentkim site - Organize file folders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Crawler/Crawler.py         | 231 +++++-------------
 Crawler/Logger.py          |   6 +
 Crawler/ProxyHandler.py    |  98 ++++++++
 Crawler/Setting.py         |  31 ++-
 Main.py                    |   6 +-
 .../downloaded_example.yml |   0
 .../settings_example.yml   |   0
 requirements.txt           |   2 +
 8 files changed, 190 insertions(+), 184 deletions(-)
 create mode 100644 Crawler/Logger.py
 create mode 100644 Crawler/ProxyHandler.py
 rename downloaded_example.yml => conf/downloaded_example.yml (100%)
 rename settings_example.yml => conf/settings_example.yml (100%)
 create mode 100644 requirements.txt

diff --git a/Crawler/Crawler.py b/Crawler/Crawler.py
index 19b252a..441eff0 100644
--- a/Crawler/Crawler.py
+++ b/Crawler/Crawler.py
@@ -1,16 +1,11 @@
-import sys
-import io
-import os
-
 import requests
-from bs4 import BeautifulSoup
+import urllib.parse
+import bs4
 import re
-import pickle
 
 from .Setting import Setting
-
-sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
-sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')
+from .ProxyHandler import ProxyHandler
+from .Logger import Logger
 
 
 class PageLink:
@@ -52,13 +47,13 @@ class TorrentFile:
 
 
 class Crawler:
-    PROXY_FILE_NAME = 'proxy.bin'
-
     def __init__(self):
         self.setting = Setting()
-        self.proxies = []
-        self.session = requests.Session()
-        self.cookies = None
+        self.proxy_handler = ProxyHandler()
+        if len(self.setting.settings['urls']) > 0:
+            urlinfo = urllib.parse.urlparse(self.setting.settings['urls'][0])
+            top_url = urllib.parse.urlunparse((urlinfo.scheme, urlinfo.netloc, '', '', '', ''))
+            self.proxy_handler.check_url = top_url
 
     @staticmethod
     def print_log(files):
@@ -67,66 +62,6 @@ class Crawler:
             f.write(file.file_name+'\n')
         f.close()
 
-    def crawl_proxy(self):
-        proxies = []
-
-        if os.path.exists(Crawler.PROXY_FILE_NAME):
-            with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
-                proxies = pickle.load(f)
-            return proxies
-
-        else:
-            resp = requests.get('https://www.us-proxy.org')
-            soup = BeautifulSoup(resp.text, 'lxml')
-            table = soup.select('table.table')
-            trs = table[0].select('tr')
-            cnt = 0
-
-            for tr in trs[1:]:
-                tds = tr.select('td')
-                if len(tds) > 0:
-                    ip, port = tds[0].text, tds[1].text
-                    proxies.append(
-                        {
-                            'http': '{}:{}'.format(ip, port),
-                            'https': '{}:{}'.format(ip, port),
-                            'alive': True,
-                        }
-                    )
-                    # print('{}:{}'.format(ip, port))
-                    cnt += 1
-
-            with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
-                pickle.dump(proxies, f)
-
-            print('proxy cnt : {}'.format(cnt))
-            return proxies
-
-    def get_proxy(self):
-        if len(self.proxies) <= 0:
-            if os.path.exists(Crawler.PROXY_FILE_NAME):
-                with open(Crawler.PROXY_FILE_NAME, 'rb') as f:
-                    self.proxies = pickle.load(f)
-            else:
-                self.proxies = self.crawl_proxy()
-
-        for proxy in self.proxies:
-            if proxy['alive']:
-                return proxy
-
-        return None
-
-    def set_proxy_dead(self, proxy):
-        proxy['alive'] = False
-        for proxy in self.proxies:
-            if proxy['alive']:
-                with open(Crawler.PROXY_FILE_NAME, 'wb') as f:
-                    pickle.dump(self.proxies, f)
-                return
-
-        os.remove(Crawler.PROXY_FILE_NAME)
-        self.proxies = []
-
     def request_get(self, url):
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
@@ -137,19 +72,18 @@ class Crawler:
             'Connection': 'keep-alive',
         }
 
-        proxy = self.get_proxy()
+        proxy = self.proxy_handler.get_proxy()
         while True:
             try:
-                resp = self.session.get(url, proxies=proxy, headers=headers, cookies=self.cookies, timeout=3)
-                self.cookies = resp.cookies
+                resp = requests.get(url, proxies=proxy, headers=headers, timeout=3)
            except Exception as e:
-                self.set_proxy_dead(proxy)
-                proxy = self.get_proxy()
+                self.proxy_handler.set_proxy_dead(proxy)
+                proxy = self.proxy_handler.get_proxy()
                 continue
             else:
                 if resp.status_code != 200:
-                    self.set_proxy_dead(proxy)
-                    proxy = self.get_proxy()
+                    self.proxy_handler.set_proxy_dead(proxy)
+                    proxy = self.proxy_handler.get_proxy()
                     continue
                 else:
                     break
@@ -157,20 +91,19 @@ class Crawler:
         return resp
 
     def crawl_list(self, url):
-        print('checking page {}'.format(url), flush=True)
+        Logger.log('checking page {}'.format(url))
 
         resp = self.request_get(url)
-        html = resp.text
-        soup = BeautifulSoup(html, 'lxml')
+        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
 
         re_title = re.compile('|'.join(['(?P'+video['keyword']+')' for i, video in enumerate(self.setting.settings['video'])]), re.I)
         re_episode = re.compile(r'[ .,]E([0-9]+)[ .,]', re.I)
 
         links = []
-        tables = soup.select('table.table')
-        trs = tables[0].select('tr.')
-        for tr in trs:
-            tds = tr.select('div.td-subject')
+        tables = soup.select('table.board_list')
+        trs = tables[0].select('tr')
+        for tr in trs[1:]:
+            tds = tr.select('td.subject')
             title = tds[0].text.strip()
 
             link = tds[0].select('a')[0].attrs['href']
@@ -187,109 +120,69 @@ class Crawler:
             ep = int(ep_match.group(1))
 
             if ep <= video['ignore_ep_under']:
-                print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
+                Logger.log(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']))
                 continue
             elif ep in self.setting.downloaded[video['title']]:
-                print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
+                Logger.log(' {}({}) is ignored (already downloaded)'.format(video['title'], ep))
                 continue
 
-            if not link.startswith('http'):
-                top_end = url[8:].find('/')
-                if top_end < 0:
-                    top_url = url[:8 + top_end]
-                else:
-                    top_url = url
-
-                if link[0] != '/':
-                    link = '/' + link
-
-                link = top_url + link
-
+            link = urllib.parse.urljoin(url, link)
             links.append(PageLink(video['title'], ep, link))
 
-            print(' found content page : {}({}), {}'.format(video['title'], ep, link), flush=True)
-
-
-        # selector = '#main_body > table > tbody > tr > td > table > tbody > tr > td.subject > a' # torrentkim
-        # for link in soup.select(selector):
-        #     if link.has_attr('rel') and 'nofollow' in link['rel']:
-        #         continue
-        #
-        #     board_title = link.get_text().strip()
-        #
-        #     title_match = re_title.search(board_title)
-        #     if not title_match:
-        #         continue
-        #
-        #     ep_match = re_episode.search(board_title)
-        #     if not ep_match:
-        #         continue
-        #
-        #     title_idx = int(title_match.lastgroup[3:])
-        #     video = self.setting.settings['video'][title_idx]
-        #     ep = int(ep_match.group(1))
-        #
-        #     if ep <= video['ignore_ep_under']:
-        #         print(' {}({}) is ignored (previous episode than ep{})'.format(video['title'], ep, video['ignore_ep_under']), flush=True)
-        #         continue
-        #     elif ep in self.setting.downloaded[video['title']]:
-        #         print(' {}({}) is ignored (already downloaded)'.format(video['title'], ep), flush=True)
-        #         continue
-        #
-        #     link_url = link.get('href')
-        #     if not link_url.startswith('http'):
-        #         top_end = url[8:].find('/')
-        #         if top_end < 0:
-        #             top_url = url[:8 + top_end]
-        #         else:
-        #             top_url = url
-        #
-        #         if link_url[0] != '/':
-        #             link_url = '/' + link_url
-        #
-        #         link_url = top_url + link_url
-        #
-        #     links.append(PageLink(video['title'], ep, link_url, code.headers.get('Set-Cookie')))
-        #
-        #     print(' found content page : {}({}), {}'.format(video['title'], ep, link_url), flush=True)
+            Logger.log(' found content page : {}({}), {}'.format(video['title'], ep, link))
 
         return links
 
-    def crawl_downlink(self, link):
-        print('searching content page : {}({}) : {}'.format(link.title, link.episode, link.url), flush=True)
+    def crawl_downlink(self, page_link):
+        Logger.log('searching content page : {}({}) : {}'.format(page_link.title, page_link.episode, page_link.url))
 
-        resp = self.request_get(link.url)
-        soup = BeautifulSoup(resp.text, 'lxml')
+        resp = self.request_get(page_link.url)
+        soup = bs4.BeautifulSoup(resp.content.decode('utf8'), 'lxml')
 
         links = []
-        a_tags = soup.find(id='bo_v_file').find_all('a', {'class':'view_file_download'})
-        for tag in a_tags:
-            file_name = str(tag.find('strong').text)
-            url = tag.get('href')
-            links.append(TorrentFile(link.title, link.episode, file_name, url))
+        file_table = soup.select('table#file_table')
+        a_tags = file_table[0].select('a')
+        torrent_links = [a for a in a_tags if '.torrent' in a.text]
+        smi_links = [a for a in a_tags if '.smi' in a.text]
 
-            print(' found download link : {}({})'.format(file_name, url), flush=True)
+        for link in torrent_links:
+            file_name = link.text.strip()
+            sub_url = link.attrs['href']
+            url = urllib.parse.urljoin(page_link.url, sub_url)
+            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
+
+            Logger.log(' found download link : {}({})'.format(file_name, url))
+
+        for link in smi_links:
+            file_name = link.text.strip()
+            sub_url = link.attrs['href']
+            url = urllib.parse.urljoin(page_link.url, sub_url)
+            links.append(TorrentFile(page_link.title, page_link.episode, file_name, url))
+
+            Logger.log(' found download link : {}({})'.format(file_name, url))
 
         return links
 
-    def download_files(self, file):
-        if file.episode in self.setting.downloaded[file.title]:
+    def download_files(self, file_link):
+        if file_link.episode in self.setting.downloaded[file_link.title]:
             return
 
-        print("start download {}".format(file.file_name), flush=True)
+        Logger.log("start download {}".format(file_link.file_name))
         try:
-            response = self.request_get(file.url, cookies=file.cookie)
-            with open(self.setting.settings['download_path'] + file.file_name, 'wb') as f:
-                f.write(response.content)
+            resp = self.request_get(file_link.url)
+            file_name = re.findall("filename=\"(.+)\"", resp.headers['content-disposition'])
+            file_name = urllib.parse.unquote(file_name[0])
+            with open(self.setting.settings['download_path'] + file_name, 'wb') as f:
+                f.write(resp.content)
 
-            self.setting.downloaded[file.title].append(file.episode)
+            self.setting.downloaded[file_link.title].append(file_link.episode)
             self.setting.save()
-            print("downloaded {}".format(file.file_name), flush=True)
+            Logger.log("downloaded {}".format(file_link.file_name))
         except Exception as e:
-            print(e)
+            Logger.log(e)
 
     def crawl_torrent(self):
         page_links = []
@@ -308,8 +201,6 @@ class Crawler:
             self.download_files(file)
 
     def crawl(self):
-        print('Crawling start')
-
+        Logger.log('Crawling start')
         self.crawl_torrent()
-
-        print('Crawling finished')
+        Logger.log('Crawling finished')
diff --git a/Crawler/Logger.py b/Crawler/Logger.py
new file mode 100644
index 0000000..d3f0fc6
--- /dev/null
+++ b/Crawler/Logger.py
@@ -0,0 +1,6 @@
+
+class Logger:
+
+    @staticmethod
+    def log(msg):
+        print(msg)
diff --git a/Crawler/ProxyHandler.py b/Crawler/ProxyHandler.py
new file mode 100644
index 0000000..521546c
--- /dev/null
+++ b/Crawler/ProxyHandler.py
@@ -0,0 +1,98 @@
+import os
+import pickle
+import requests
+import bs4
+import concurrent.futures
+
+from .Logger import Logger
+
+
+class ProxyHandler:
+    PROXY_FILE_NAME = 'temp/proxy.bin'
+
+    def __init__(self):
+        if not os.path.exists('temp'):
+            os.makedirs('temp')
+
+        self.proxies = []
+        self.check_url = ''
+
+    def check_proxy(self, proxy, top_url):
+        try:
+            resp = requests.get(top_url, proxies=proxy, timeout=2)
+        except:
+            proxy['alive'] = False
+        else:
+            if resp.status_code != 200:
+                proxy['alive'] = False
+
+    def check_proxy_all(self, proxies, check_url):
+        Logger.log('checking proxies for {}'.format(check_url))
+
+        worker_cnt = 64
+        pool = concurrent.futures.ThreadPoolExecutor(worker_cnt)
+        [pool.submit(self.check_proxy, proxy, check_url) for proxy in proxies]
+        pool.shutdown()
+
+    def has_file(self):
+        return os.path.exists(self.PROXY_FILE_NAME)
+
+    def load_proxy(self):
+        with open(self.PROXY_FILE_NAME, 'rb') as f:
+            proxies = pickle.load(f)
+
+        return proxies
+
+    def crawl_proxy(self):
+        proxies = []
+
+        resp = requests.get('https://www.us-proxy.org')
+        soup = bs4.BeautifulSoup(resp.text, 'lxml')
+        table = soup.select('table.table')
+        trs = table[0].select('tr')
+        for tr in trs[1:]:
+            tds = tr.select('td')
+            if len(tds) < 2:
+                continue
+
+            ip, port = tds[0].text, tds[1].text
+            proxies.append(
+                {
+                    'alive': True,
+                    'http': '{}:{}'.format(ip, port),
+                    'https': '{}:{}'.format(ip, port),
+                }
+            )
+
+        self.check_proxy_all(proxies, self.check_url)
+        alive_proxies = [proxy for proxy in proxies if proxy['alive']]
+        Logger.log('proxies checking end: available : {}'.format(len(alive_proxies)))
+
+        with open(self.PROXY_FILE_NAME, 'wb') as f:
+            pickle.dump(alive_proxies, f)
+
+        return alive_proxies
+
+    def get_proxy(self):
+        if len(self.proxies) <= 0:
+            if self.has_file():
+                self.proxies = self.load_proxy()
+            else:
+                self.proxies = self.crawl_proxy()
+
+        for proxy in self.proxies:
+            if proxy['alive']:
+                return proxy
+
+        return None
+
+    def set_proxy_dead(self, proxy):
+        proxy['alive'] = False
+        for proxy in self.proxies:
+            if proxy['alive']:
+                with open(self.PROXY_FILE_NAME, 'wb') as f:
+                    pickle.dump(self.proxies, f)
+                return
+
+        os.remove(self.PROXY_FILE_NAME)
+        self.proxies = []
diff --git a/Crawler/Setting.py b/Crawler/Setting.py
index b0fe806..77cb4f8 100644
--- a/Crawler/Setting.py
+++ b/Crawler/Setting.py
@@ -1,33 +1,42 @@
 import yaml
 import os
 
+from .Logger import Logger
+
+
 class Setting:
+    SETTING_FILE = 'conf/settings.yml'
+    DOWNLOADED_FILE = 'temp/downloaded.yml'
+
     def __init__(self):
         self.settings = None
         self.downloaded = None
 
+        if not os.path.exists('temp'):
+            os.mkdir('temp')
+
         self.load()
         pass
 
     def load_settings(self):
-        if not os.path.isfile('settings.yml'):
-            print('There is no settings.yml', flush=True)
+        if not os.path.isfile(self.SETTING_FILE):
+            Logger.log('There is no {}'.format(self.SETTING_FILE))
             exit()
 
-        with open('settings.yml', encoding='utf-8') as setting_file:
+        with open(self.SETTING_FILE, encoding='utf-8') as setting_file:
             try:
                 self.settings = yaml.load(setting_file)
             except ValueError as e:
-                print(e, flush=True)
+                Logger.log(e)
                 exit()
 
         if 'video' not in self.settings:
-            print('video key is need in settings.json', flush=True)
+            Logger.log('video key is need in settings.json')
            exit()
 
         for i, video in enumerate(self.settings['video']):
             if 'title' not in video:
-                print('title key is need in video({})'.format(i), flush=True)
+                Logger.log('title key is need in video({})'.format(i))
                 exit()
 
             if 'keyword' not in video:
@@ -49,18 +58,18 @@ class Setting:
             try:
                 os.makedirs(self.settings['download_path'])
             except Exception as e:
-                print(e, flush=True)
+                Logger.log(e)
                 exit()
 
             video['keyword'] += self.settings['keyword_append']
 
     def load_downloaded(self):
-        if os.path.isfile('downloaded.yml'):
-            with open("downloaded.yml", 'r', encoding='utf-8') as stream:
+        if os.path.isfile(self.DOWNLOADED_FILE):
+            with open(self.DOWNLOADED_FILE, 'r', encoding='utf-8') as stream:
                 try:
                     self.downloaded = yaml.load(stream)
                 except yaml.YAMLError as e:
-                    print(e, flush=True)
+                    Logger.log(e)
 
         else:
             self.downloaded = {}
@@ -108,6 +117,6 @@ class Setting:
             yaml.dump(downloaded_ex, outfile, allow_unicode=True)
 
     def save(self):
-        with open('downloaded.yml', 'w', encoding='utf-8') as outfile:
+        with open(self.DOWNLOADED_FILE, 'w', encoding='utf-8') as outfile:
             yaml.dump(self.downloaded, outfile, allow_unicode=True)
         pass
diff --git a/Main.py b/Main.py
index 01a6fd1..ceec55f 100644
--- a/Main.py
+++ b/Main.py
@@ -1,5 +1,5 @@
 from Crawler.Crawler import Crawler
 
-
-crawler = Crawler()
-crawler.crawl()
+if __name__ == '__main__':
+    crawler = Crawler()
+    crawler.crawl()
diff --git a/downloaded_example.yml b/conf/downloaded_example.yml
similarity index 100%
rename from downloaded_example.yml
rename to conf/downloaded_example.yml
diff --git a/settings_example.yml b/conf/settings_example.yml
similarity index 100%
rename from settings_example.yml
rename to conf/settings_example.yml
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..dc1536f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests
+bs4
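Background for the crawl_list/crawl_downlink changes above: the hand-rolled top_url string slicing is replaced by urllib.parse.urljoin, which resolves both site-relative hrefs and already-absolute download links against the page URL. A minimal sketch of that behaviour; the URLs below are placeholders, not the board actually configured in conf/settings.yml:

```python
# Minimal sketch (not part of the patch) of the urljoin behaviour relied on.
# The URLs are placeholders; the real board URL comes from settings['urls'].
from urllib.parse import urljoin

page = 'https://example.com/bbs/board.php?bo_table=torrent_tv&page=1'

# A site-relative href from a subject cell resolves against the page's host:
print(urljoin(page, '/bbs/link.php?no=12345'))
# https://example.com/bbs/link.php?no=12345

# An already-absolute download link is returned unchanged:
print(urljoin(page, 'https://files.example.com/sample.torrent'))
# https://files.example.com/sample.torrent
```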
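The new ProxyHandler is driven from Crawler.__init__ (which derives check_url from settings['urls'][0]) and from request_get (which calls get_proxy and set_proxy_dead). A minimal standalone sketch of that flow, assuming the patch is applied; the check URL is a placeholder, and running the crawler also needs lxml and PyYAML installed (the code uses the 'lxml' parser and imports yaml), which requirements.txt does not yet list:

```python
# Minimal usage sketch, not part of the patch. ProxyHandler caches live
# proxies in temp/proxy.bin, so the first get_proxy() call crawls
# https://www.us-proxy.org and checks each proxy against check_url.
from Crawler.ProxyHandler import ProxyHandler

handler = ProxyHandler()
handler.check_url = 'https://example.com/'  # placeholder; Crawler uses settings['urls'][0]

proxy = handler.get_proxy()  # dict like {'alive': True, 'http': 'ip:port', 'https': 'ip:port'}
if proxy is None:
    print('no live proxy available')
else:
    print('using proxy {}'.format(proxy['http']))
    # if a later request through this proxy fails, mark it dead so it is skipped:
    handler.set_proxy_dead(proxy)
```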