diff --git a/WebBasedCrawler/base/baseclasses.py b/WebBasedCrawler/base/baseclasses.py
index 9330608..bfe2ab5 100644
--- a/WebBasedCrawler/base/baseclasses.py
+++ b/WebBasedCrawler/base/baseclasses.py
@@ -91,6 +91,7 @@ def find_elements_by_xpath(driver, tag, time=0):
     )
     return elements
 
+
 class Browser:
     def __init__(self, driver=None):
         self.driver = driver
diff --git a/WebBasedCrawler/effect/effectkakaostory.py b/WebBasedCrawler/effect/effectkakaostory.py
index 658d2e2..b16b233 100644
--- a/WebBasedCrawler/effect/effectkakaostory.py
+++ b/WebBasedCrawler/effect/effectkakaostory.py
@@ -60,8 +60,6 @@ class BodyCrawler(object):
         self.soup = None
         self.section_activity = None
         self.set_soup_and_activity()
-        if not self.section_activity:
-            raise NotFoundElementError("section _activity is not Found")
 
     # calling point may differ
     def set_soup_and_activity(self):
@@ -231,11 +229,18 @@ class BodyCrawler(object):
         article_id = self.find_article_id()
         return 'channel' if article_id.startswith('ch/') else 'story'
 
+    def find_error(self):
+        # kakaostory renders div.info_error when the story has been deleted
+        error = self.soup.find('div', class_='info_error')
+        return error is not None
+
     def get(self):
         """
         you need to put 'keyword_id'
         :return: dict for crawled body content
         """
+        if not self.section_activity:
+            raise NotFoundElementError("section _activity is not Found")
         content = dict()
         content['article_id'] = self.find_article_id()
         content['article_nickname'] = self.find_article_nickname()
@@ -421,6 +426,16 @@ class EffectKakaostory(object):
             wait(3)
             body_crawler = BodyCrawler(self.driver)
             reply_crawler = ReplyCrawler(self.driver)
+        except Exception as e:
+            raise effect.effecterror.OutDatedCrawler(str(e))
+
+        try:
+            error = body_crawler.find_error()
+        except Exception as e:
+            raise effect.effecterror.OutDatedCrawler(str(e))
+        if error:
+            raise effect.effecterror.DeletedUrlError("The URL is deleted")
+        try:
             body = body_crawler.get()
             replies = reply_crawler.get()
         except Exception as e:
diff --git a/WebBasedCrawler/effect/resultsender.py b/WebBasedCrawler/effect/resultsender.py
index f4d5c04..5478dc4 100644
--- a/WebBasedCrawler/effect/resultsender.py
+++ b/WebBasedCrawler/effect/resultsender.py
@@ -34,8 +34,8 @@ class ResultSender:
                 val_list.append(str(val))
             else:
                 val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
-        return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")"  # + " on duplicate key update " + \
-        # ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
+        return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \
+            ','.join(map(lambda x: x[0] + '=' + x[1], zip(key_list, val_list)))
 
     def send(self, table_name, dictionary):
         query = self._make_query(table_name, dictionary)
diff --git a/WebBasedCrawler/insta/instacrawl.py b/WebBasedCrawler/insta/instacrawl.py
index 56a458a..b6ba641 100644
--- a/WebBasedCrawler/insta/instacrawl.py
+++ b/WebBasedCrawler/insta/instacrawl.py
@@ -1,1025 +1,1095 @@
-#-*- coding: utf-8 -*-
-'''
-Created on 2015. 12. 8.
- -@author: cococo -''' -import re -import datetime -import insta.instaparser as instaparser -import insta.instaheaders as instaheaders -import requests -import logging -# from multiprocessing import Queue -# import multiprocessing -from queue import Queue -import threading - -from base.baseclasses import SendtoDB -from base.baseclasses import CrawlInit -from base.baseclasses import wait -from base.baseclasses import Browser -from selenium.webdriver.common.keys import Keys -from base.baseclasses import enter_element -import base.proxy - - -def printl(*objects, sep=' ', end='\n', file=None, flush=True): - print(*objects, sep=sep, end=end, file=file, flush=flush) - -insta_url = "https://www.instagram.com/" -insta_tag_url = "https://www.instagram.com/explore/tags/" -insta_query = "https://www.instagram.com/query/" -insta_body_url = 'https://www.instagram.com/p/' - -is_debuging = False -is_debug = False - - -def printd(*objects, sep=' ', end='\n', file=None, flush=True): - if is_debug: - print(*objects, sep=sep, end=end, file=file, flush=flush) - - -num_of_list_ajax = 24 -num_of_reply_ajax = 100 -list_wait_sec = 0.9 -body_wait_sec = 0.5 -reply_wait_sec = 0.8 -num_of_page_down = 20 -num_of_content_process = 10 -requests_timeout = 60 -num_of_retry_proxy = 5 - -logging.basicConfig(level=logging.INFO, - format="%(module)s(%(lineno)s):%(funcName)s:%(message)s") -logging.getLogger('requests').setLevel(logging.WARNING) -logging.getLogger('pymysql').setLevel(logging.WARNING) - - -def click_insta_load_more(driver): - element = driver.find_element_by_css_selector("div._pupj3 > a") - enter_element(element) - - -def push_page_down(driver): - body = driver.find_element_by_tag_name('body') - body.send_keys(Keys.PAGE_DOWN) - - -def focus_driver(driver): - position = driver.get_window_position() - size = driver.get_window_size() - driver.maximize_window() - driver.set_window_size(size['width'], size["height"]) - driver.set_window_position(position['x'], position['y']) - - -def instance_wrapper(func): - # to save nice ip, port of proxy - ip, port = base.proxy.get_proxy() - - def retry_load(*args, **kwargs): - while True: - # use clouser - nonlocal ip, port - proxies = base.proxy.get_requests_proxy(ip + ":" + port) - kwargs['proxies'] = proxies - # retry = num_of_retry_proxy - # while retry: - res = func(*args, **kwargs) - if res: - # printl("id : {2} - connect success - {0}:{1}".format(ip, port, threading.get_ident())) - return res - # if the proxy was not good, get new proxy - # printl('id : {2} - connect failed - {0}:{1}'.format(ip, port, threading.get_ident())) - ip, port = base.proxy.get_proxy() - # retry -= 1 - return retry_load - - -class InstanceWrapper(object): - def __init__(self, func): - self.ip, self.port = base.proxy.get_proxy() - self.func = func - self.num_of_retry_proxy = num_of_retry_proxy - - def do(self, *args, **kwargs): - while True: - proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) - kwargs['proxies'] = proxies - # retry = num_of_retry_proxy - # while retry: - res = self.func(*args, **kwargs) - if res: - # printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) - return res - # if the proxy was not good, get new proxy - # printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) - self.ip, self.port = base.proxy.get_proxy() - # retry -= 1 - - def do_retry(self, *args, **kwargs): - while True: - proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) - kwargs['proxies'] = proxies - 
retry = self.num_of_retry_proxy - while retry: - res = self.func(*args, **kwargs) - if res: - # printl("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) - return res - # if the proxy was not good, get new proxy - # printl('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) - retry -= 1 - self.ip, self.port = base.proxy.get_proxy() - - def do_no_proxy(self, *args, **kwargs): - while True: - retry = self.num_of_retry_proxy - while retry: - proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) - kwargs['proxies'] = proxies - res = self.func(*args, **kwargs) - if res: - printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) - return res - # if the proxy was not good, get new proxy - printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) - retry -= 1 - self.ip, self.port = base.proxy.get_proxy() - - # if get content with proxy failed, set no proxy - # func guarantee returning a instance except the case where a url is invalid - kwargs['proxies'] = None - res = self.func(*args, **kwargs) - # if res: - # printl("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident())) - # printl(args, kwargs) - printd("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident())) - return res - - def change_proxy(self): - self.ip, self.port = base.proxy.get_proxy() - - -@instance_wrapper -def make_list_instance(url, proxies=None): - try: - if insta_tag_url in url: - list_crawler = ListTag(url, proxies) - else: - list_crawler = ListUser(url, proxies) - return list_crawler - except: - return None - - -# @instance_wrapper -def make_content_instance(url, proxies=None): - try: - content = InstaContent(url, {}, url, proxies) - return content - except: - return None - - -def ajax_wrapper(func): - def retry_ajax_load(*args, **kwargs): - retry = num_of_retry_proxy - while retry: - res = func(*args, **kwargs) - if res is not None: - break - retry -= 1 - return res - return retry_ajax_load - - -@ajax_wrapper -def load_ajax_list(ins): - try: - insta_list = ins.load_more() - # if insta_list: - # return insta_list - # else: - # return None - return insta_list - except: - return None - - -@ajax_wrapper -def load_ajax_reply(ins): - try: - replies = ins.load_reply_more() - # if replies: - # return replies - # else: - # return None - return replies - except: - return None - - -# def crawl_content_process(qu, keyword_id, db_num): -# send_to_db = SendtoDB() -# send_to_db.set_db(db_num) -# while True: -# element = qu.get() -# if element is None: -# break -# ok = True -# while ok: -# try: -# ip, port = base.proxy.get_proxy() -# proxies = base.proxy.get_requests_proxy(ip + ":" + port) -# content = InstaContent(element['url'], {}, element['url'], proxies) -# body = content.get_body() -# replies = content.get_reply() -# body['article_url'] = element['url'] -# body['keyword_id'] = keyword_id -# while content.has_previous: -# replies = content.load_reply_more() + replies -# wait(reply_wait_sec) -# for j in range(0, len(replies)): -# replies[j]['article_url'] = body['article_url'] -# replies[j]['platform_id'] = body['platform_id'] -# replies[j]['article_order'] = j -# send_to_db.delete_url(body['article_url']) -# send_to_db.send_body(body) -# if replies: -# send_to_db.send_reply(replies) -# printl(element['url']) -# printl('ok') -# ok = False -# except: -# printl("failed proxy {0}:{1}".format(ip, port)) -# printl('finish 
thread') - - -def crawl_content_process(qu, keyword_id, db_num): - # m_c_i = instance_wrapper(make_content_instance) - m_c_i = InstanceWrapper(make_content_instance) - send_to_db = SendtoDB() - send_to_db.set_db(db_num) - while True: - element = qu.get() - if element is None: - break - ok = True - while ok: - try: - # get a instance of InstaContent by do_no_proxy func. - # if element['url'] is invalid, content is None - content = m_c_i.do_no_proxy(element['url']) - if not content: - break - body = content.get_body() - replies = content.get_reply() - body['article_url'] = element['url'] - body['keyword_id'] = keyword_id - while content.has_previous: - rep = load_ajax_reply(content) - if rep is None: - printl("proxies = ", content.proxies) - m_c_i.change_proxy() - raise Exception("reply load error") - replies = rep + replies - wait(reply_wait_sec) - for j in range(0, len(replies)): - replies[j]['article_url'] = body['article_url'] - replies[j]['platform_id'] = body['platform_id'] - replies[j]['article_order'] = j - send_to_db.delete_url(body['article_url']) - send_to_db.send_body(body) - if replies: - send_to_db.send_reply(replies) - printl(element['url']) - printl('ok') - ok = False - except UnicodeEncodeError as ue: - printl(element['url']) - printl(ue) - break - except Exception as e: - # catch error when send_to_db error occur - printl(element['url']) - printl(e) - printl('finish thread') - - -class InstaInit(CrawlInit): - def __init__(self, before_day=0): - super().__init__(before_day) - self.urls = dict() - self.urls[9] = insta_tag_url - self.urls[10] = insta_url - - def split_searches(self): - search = self.searches() - splited_list = search.split(',') - trimmed_list = list() - if self.platform() == 10: - for x in splited_list: - trimmed_list.append(x.strip()) - else: - for x in splited_list: - trimmed_list.append(self.utf8(x)) - return trimmed_list - - def make_url(self): - urls = list() - for x in self.split_searches(): - url = self.urls[self.platform()] + x - urls.append(url) - return urls - - def get_begin_day(self): - if self.is_realtime(): - date_now = datetime.datetime.now() - result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) - result += datetime.timedelta(days=self.before_day) - return result.date() - else: - return self.start_day() - - def get_end_day(self): - if self.is_realtime(): - date_now = datetime.datetime.now() - result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) - return result.date() - else: - return self.end_day() - - -class ListTag: - def __init__(self, url, proxies=None): - self.__r = None - self.__tag = '' - self.__url = '' - self.list_tag = [] - self.end_cursor = None - self.has_next = False - self.cookies = {} - self.proxies = proxies - self.load_url(url, self.proxies) - - def load_url(self, url, proxies): - self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, - timeout=requests_timeout) - self.log_load_url_before() - self.__r.raise_for_status() - self.__tag = self.__get_tag(url) - self.__set_cookies(self.__r.cookies) - self.__url = url - self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content) - self.log_load_url_after() - return self.list_tag - - def load_more(self): - form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax) - headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data) - self.log_load_more_before(form_data, headers) - 
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, - timeout=requests_timeout) - self.__set_cookies(self.__r.cookies) - self.__r.raise_for_status() - self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content) - self.log_load_more_after() - return self.list_tag - - def __get_tag(self, url): - m = re.search(insta_tag_url + "([^/]*)", url) - if m: - return m.group(1) - else: - raise RuntimeError('Tag Error') - - def get_cookies(self): - return self.cookies - - def get_url(self): - return self.__url - - def set_end_cursor(self, cursor): - self.end_cursor = cursor - - def get_end_cursor(self): - return self.end_cursor - - def __set_cookies(self, cookies): - for k, v in cookies.items(): - self.cookies[k] = v - - def get_list(self): - return self.list_tag - - def get_proxy(self): - return self.proxies - - def log_load_url_before(self): - if is_debuging: - printl("") - printl("") - printl('headers = ', end=' ') - printl(instaheaders.get_headers_for_list_html()) - - def log_load_url_after(self): - if is_debuging: - printl("") - printl('self.__r.cookies=', end='') - printl(self.__r.cookies) - printl('end_cursor = ' + str(self.end_cursor)) - printl('has_next = ', end='') - printl(self.has_next) - printl('proxies = ', end='') - printl(self.proxies) - printl("") - - def log_load_more_before(self, form_data, headers): - if is_debuging: - printl("") - printl("") - printl('end_cursor = ' + str(self.end_cursor)) - printl('form_data' + form_data) - printl('headers = ', end=' ') - printl(headers) - - def log_load_more_after(self): - if is_debuging: - printl("") - printl('self.__r.cookies=', end='') - printl(self.__r.cookies) - printl('end_cursor = ' + str(self.end_cursor)) - printl('has_next = ', end='') - printl(self.has_next) - printl('proxies = ', end='') - printl(self.proxies) - printl("") - - -class ListUser: - def __init__(self, url, proxies=None): - self.__r = None - self.__user = '' - self.__url = '' - self.list_user = [] - self.end_cursor = None - self.has_next = False - self.cookies = {} - self.proxies = proxies - self.load_url(url, self.proxies) - - def load_url(self, url, proxies): - self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, - timeout=requests_timeout) - self.__r.raise_for_status() - self.__url = url - self.__set_cookies(self.__r.cookies) - self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content) - return self.list_user - - def load_more(self): - form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax) - headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data) - self.log_load_more_before(form_data, headers) - self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, - timeout=requests_timeout) - self.__r.raise_for_status() - self.__set_cookies(self.__r.cookies) - - self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content) - self.log_load_more_after() - return self.list_user - - def get_cookies(self): - return self.cookies - - def get_url(self): - return self.__url - - def set_end_cursor(self, cursor): - self.end_cursor = cursor - - def get_end_cursor(self): - return self.end_cursor - - def __set_cookies(self, cookies): - for k, v in cookies.items(): - self.cookies[k] = v - - def get_list(self): - return self.list_user - - def get_proxy(self): - return self.proxies - - def 
log_load_more_before(self, form_data, headers): - if is_debuging: - printl("") - printl("") - printl('end_cursor = ' + str(self.end_cursor)) - printl('form_data' + form_data) - printl('headers = ', end=' ') - printl(headers) - - def log_load_more_after(self): - if is_debuging: - printl("") - printl('self.__r.cookies=', end='') - printl(self.__r.cookies) - printl('end_cursor = ' + str(self.end_cursor)) - printl('has_next = ', end='') - printl(self.has_next) - printl('proxies = ', end='') - printl(self.proxies) - printl("") - - -class InstaContent: - def __init__(self, url, cookies, referer, proxies=None): - self.__r = None - self.__referer = '' - self.__code = '' - self.body = None - self.reply = [] - self.start_cursor = None - self.has_previous = False - self.cookies = {} - self.proxies = proxies - self.load_url(url, cookies, referer, self.proxies) - - def load_url(self, url, cookies, referer, proxies): - self.__set_cookies(cookies) - self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies, - timeout=requests_timeout) - self.__r.raise_for_status() - self.__referer = referer - self.__code = self.__get_code(url) - self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content) - self.__set_cookies(self.__r.cookies) - return self.body, self.reply - - def get_body(self): - return self.body - - def get_reply(self): - return self.reply - - def load_reply_more(self): - form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax) - headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data) - self.log_load_reply_more_before(form_data, headers) - self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, - timeout=requests_timeout) - self.__r.raise_for_status() - self.__set_cookies(self.__r.cookies) - self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content) - self.log_load_reply_more_after() - return self.reply - - def get_cookies(self): - return self.cookies - - def __get_code(self, url): - m = re.search(insta_body_url + "([^/]*)", url) - if m: - return m.group(1) - else: - raise RuntimeError('Tag Error') - - def __set_cookies(self, cookies): - for k, v in cookies.items(): - self.cookies[k] = v - - def get_proxy(self): - return self.proxies - - def log_load_reply_more_before(self, form_data, headers): - if is_debuging: - printl("") - printl("") - printl('start_cursor = ' + self.start_cursor) - printl('form_data' + form_data) - printl('headers = ', end=' ') - printl(headers) - - def log_load_reply_more_after(self): - if is_debuging: - printl("") - printl('self.__r.cookies=', end='') - printl(self.__r.cookies) - printl('start_cursor = ' + str(self.start_cursor)) - printl('has_previous = ', end='') - printl(self.has_previous) - printl('proxies = ', end='') - printl(self.proxies) - printl("") - - -class InstaAlgorithm: - def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, - reload_wait_second=2, num_of_load_content=12, page_down=50): - self.send_to_db = send_to_db - self.crawl_init = crawl_init - self.browser = browser - self.driver = driver - self.keyword_id = keyword_id - self.reload_wait_second = reload_wait_second - self.num_of_load_content = num_of_load_content - self.page_down = page_down - self.list_crawl = [] - - def crawl_content(self, url, cookies, referer): - content = InstaContent(url, cookies, referer) - body = content.get_body() - replies 
= content.get_reply() - body['article_url'] = url - body['keyword_id'] = self.keyword_id - # printl(body['article_url']) - while content.has_previous: - replies = content.load_reply_more() + replies - wait(reply_wait_sec) - for j in range(0, len(replies)): - replies[j]['article_url'] = body['article_url'] - replies[j]['platform_id'] = body['platform_id'] - replies[j]['article_order'] = j - self.send_to_db.delete_url(body['article_url']) - self.send_to_db.send_body(body) - if replies: - self.send_to_db.send_reply(replies) - printl('ok') - printl() - - def start_crawl(self): - self.crawl() - self.close() - - def close(self): - if self.driver and not is_debuging: - self.driver.quit() - self.send_to_db.close() - printl("Finished Crawling :)") - - def crawl(self): - raise NotImplementedError - - def is_until_page(self): - if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl): - return True - else: - return False - - def crawl_contents(self, contents_list, backup_set): - """ - :param contents_list: - :param backup_set: - :return: is_load_more - """ - old_elements = 0 - for element in contents_list: - if element['date'].date() > self.crawl_init.get_end_day(): - # printl(element['url']) - printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) - - elif element['date'].date() < self.crawl_init.get_begin_day(): - printl(element['url']) - printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) - old_elements += 1 - if old_elements > 6: - return False - else: - if not element['url'] in backup_set: - # printl(element['url']) - # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) - # wait(1.5) - # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url()) - self.list_crawl.append(element) - backup_set.add(element['url']) - if self.is_until_page(): - return False - if self.list_crawl: - printl("Number of Lists = {0}".format(len(self.list_crawl))) - return True - - def crawl_list(self): - if self.list_crawl: - printl() - printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S"))) - printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S"))) - printl("Total gathered contents = {0}".format(len(self.list_crawl))) - printl() - for element in self.list_crawl: - try: - printl(element['url']) - printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) - wait(body_wait_sec) - self.crawl_content(element['url'], {}, element['url']) - except Exception as e: - printl(e) - logging.info(e) - - -class InstaAlgorithmNormal(InstaAlgorithm): - def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, - reload_wait_second=2, num_of_load_content=12, page_down=50): - super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, - reload_wait_second, num_of_load_content, page_down) - if self.driver: - self.driver.quit() - - def crawl(self): - real_time = True - while real_time: - printl("Crawling Start") - url_list = self.crawl_init.make_url() - i = 0 - end_cursor = None - backup_set = set() - while i < len(url_list): - # first connect - try: - printl(url_list[i] + "\n") - if insta_tag_url in url_list[i]: - list_crawler = ListTag(url_list[i]) - else: - list_crawler = ListUser(url_list[i]) - wait(1) - insta_list = list_crawler.get_list() - is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next - # ajax load - while is_load_more: - if end_cursor: - list_crawler.end_cursor = end_cursor - end_cursor = None - wait(self.reload_wait_second) - insta_list = 
list_crawler.load_more() - is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next - self.crawl_list() - self.list_crawl.clear() - i += 1 - except Exception as e: - logging.info(e) - end_cursor = list_crawler.end_cursor - printl('end_cursor=' + end_cursor) - if e.args: - wait(300) - real_time = self.crawl_init.is_realtime() - printl("Finished Crawling :)") - - -class InstaAlgorithmMulti(InstaAlgorithm): - def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, - reload_wait_second=2, num_of_load_content=12, page_down=50): - super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, - reload_wait_second, num_of_load_content, page_down) - if self.driver: - self.driver.quit() - self.list_crawl = Queue() - self.total_num = 0 - - def crawl_contents(self, contents_list, backup_set): - """ - :param contents_list: - :param backup_set: - :return: is_load_more - """ - old_elements = 0 - for element in contents_list: - if element['date'].date() > self.crawl_init.get_end_day(): - # printl(element['url']) - printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) - - elif element['date'].date() < self.crawl_init.get_begin_day(): - printl(element['url']) - printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) - old_elements += 1 - if old_elements > 6: - return False - else: - if not element['url'] in backup_set: - # printl(element['url']) - # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) - # wait(1.5) - # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url()) - try: - self.list_crawl.put(element, timeout=10) - except Exception as e: - printl(e) - printl("queue size = ", self.list_crawl.qsize()) - backup_set.add(element['url']) - self.total_num += 1 - if self.is_until_page(): - return False - # if self.list_crawl: - # printl("Number of Lists = {0}".format(len(self.list_crawl))) - return True - - def crawl(self): - real_time = True - while real_time: - printl("Crawling Start") - url_list = self.crawl_init.make_url() - i = 0 - end_cursor = None - backup_set = set() - while i < len(url_list): - # first connect - try: - printl(url_list[i] + "\n") - # insta_content process create and start - # p_list = [multiprocessing.Process(target=crawl_content_process, - # args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num)) - # for i in range(num_of_content_process)] - p_list = [threading.Thread(target=crawl_content_process, - args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num)) - for i in range(num_of_content_process)] - for p in p_list: - p.daemon = True - p.start() - - # crawl list - ok = True - while ok: - try: - list_crawler = make_list_instance(url_list[i]) - ok = False - except Exception as e: - printl(e) - wait(1) - insta_list = list_crawler.get_list() - is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next - # ajax load - while is_load_more: - if end_cursor: - list_crawler.end_cursor = end_cursor - end_cursor = None - wait(self.reload_wait_second) - try: - insta_list = load_ajax_list(list_crawler) - if insta_list is None: - break - is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next - except Exception as e: - printl('is_load_more exception') - printl(e) - is_load_more = False - #self.crawl_list() - #self.list_crawl.close() - printl("end load") - printl("total number of crawled list = {0}".format(self.total_num)) - self.total_num = 0 - - # stop child process - for i in range(num_of_content_process): - self.list_crawl.put(None, 
timeout=10) - - # wait child process - for p in p_list: - p.join() - - for _ in range(self.list_crawl.qsize()): - self.list_crawl.get(block=False) - - i += 1 - except Exception as e: - logging.info(e) - end_cursor = list_crawler.end_cursor - printl('end_cursor=' + end_cursor) - if e.args: - wait(300) - real_time = self.crawl_init.is_realtime() - printl("Finished Crawling :)") - - -class InstaAlgorithmBrowser(InstaAlgorithm): - def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, - reload_wait_second=2, num_of_load_content=12, page_down=50): - super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, - reload_wait_second, num_of_load_content, page_down) - - def url_load(self, url): - if insta_tag_url in url: - list_tag = ListTag(url) - insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source) - return list_tag, insta_list, end_cursor, has_next - else: - list_user = ListUser(url) - insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source) - return list_user, insta_list, end_cursor, has_next - - def crawl(self): - real_time = True - while real_time: - url_list = self.crawl_init.make_url() - i = 0 - end_cursor = None - backup_set = set() - while i < len(url_list): - # first connect - try: - wait(3) - printl(url_list[i] + "\n") - self.driver.get(url_list[i]) - wait(5) - list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i]) - is_load_more = self.crawl_contents(insta_list, backup_set) and has_next - list_crawler.set_end_cursor(end_cursor2) - list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()} - # ajax load - page_down = 0 - while is_load_more: - if page_down == self.page_down: - page_down = 0 - try: - focus_driver(self.driver) - click_insta_load_more(self.driver) - except: - push_page_down(self.driver) - page_down += 1 - if end_cursor: - list_crawler.end_cursor = end_cursor - end_cursor = None - wait(self.reload_wait_second) - insta_list = list_crawler.load_more() - # printl("list length = " + str(len(insta_list))) - is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next - # printl("number of backup_set = {0}".format(len(backup_set))) - i += 1 - self.crawl_list() - self.list_crawl.clear() - except Exception as e: - logging.info(e) - end_cursor = list_crawler.end_cursor - printl('end_cursor=' + end_cursor) - if e.args: - wait(300) - if self.driver: - self.driver.close() - wait(3) - self.driver = self.browser.new_browser() - real_time = self.crawl_init.is_realtime() - printl("Finished Crawling :)") - - -class InstaMainCrawler: - def __init__(self): - self.send_to_db = SendtoDB() - self.crawl_init = InstaInit() - # self.browser = Browser() - self.browser = None - self.driver = None - - def set_keyword_id(self, keyword_id): - self.keyword_id = keyword_id - - def crawl_all(self, backup_set=None): - pass - - def start(self): - self.crawler_start() - - def set_arguments(self, browser, keyword_id, db_num, before_day, until_page): - self.init_keyword_id(keyword_id) - self.init_db(db_num) - self.init_before_day(before_day) - self.init_until_page(until_page) - # self.init_browser(browser) - - def set_driver(self, driver): - self.driver = driver - - def init_browser(self, browser): - try: - self.set_driver(self.browser.get_new_driver(browser)) - except Exception as e: - logging.info(e) - - def init_keyword_id(self, keyword_id): - if type(keyword_id) != int: - self.keyword_id = int(keyword_id) - else: - self.keyword_id = 
keyword_id - self.crawl_init.get_keyword_parameters(keyword_id) - self.crawl_init.disconnect() - - def init_db(self, db_num): - self.send_to_db.set_db(db_num) - - def init_before_day(self, before_day): - self.crawl_init.set_before_day(before_day) - - def init_until_page(self, until_page): - self.crawl_init.set_until_page(until_page) - - def crawler_start(self): - # if self.driver: - # algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db, - # self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down) - # else: - # algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db, - # self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down) - algorithm = InstaAlgorithmMulti(self.driver, self.browser, self.crawl_init, self.send_to_db, - self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down) - algorithm.start_crawl() +#-*- coding: utf-8 -*- +''' +Created on 2015. 12. 8. + +@author: cococo +''' +import re +import datetime +import insta.instaparser as instaparser +import insta.instaheaders as instaheaders +import requests +import logging +# from multiprocessing import Queue +# import multiprocessing +from queue import Queue +import threading +import time +import sys + + +from base.baseclasses import SendtoDB +from base.baseclasses import CrawlInit +from base.baseclasses import wait +# from base.baseclasses import Browser +from selenium.webdriver.common.keys import Keys +from base.baseclasses import enter_element +import base.proxy +import eventlet + +def printl(*objects, sep=' ', end='\n', file=None, flush=True): + print(*objects, sep=sep, end=end, file=file, flush=flush) + +insta_url = "https://www.instagram.com/" +insta_tag_url = "https://www.instagram.com/explore/tags/" +insta_query = "https://www.instagram.com/query/" +insta_body_url = 'https://www.instagram.com/p/' + +is_debuging = False +is_debug = False + + +def printd(*objects, sep=' ', end='\n', file=None, flush=True): + if is_debug: + print(*objects, sep=sep, end=end, file=file, flush=flush) + + +num_of_list_ajax = 24 +num_of_reply_ajax = 100 +list_wait_sec = 0.9 +body_wait_sec = 0.5 +reply_wait_sec = 0.8 +num_of_page_down = 20 +num_of_content_process = 10 +requests_timeout = 60 +num_of_retry_proxy = 5 + +logging.basicConfig(level=logging.INFO, + format="%(module)s(%(lineno)s):%(funcName)s:%(message)s") +logging.getLogger('requests').setLevel(logging.WARNING) +logging.getLogger('pymysql').setLevel(logging.WARNING) + + +def click_insta_load_more(driver): + element = driver.find_element_by_css_selector("div._pupj3 > a") + enter_element(element) + + +def push_page_down(driver): + body = driver.find_element_by_tag_name('body') + body.send_keys(Keys.PAGE_DOWN) + + +def focus_driver(driver): + position = driver.get_window_position() + size = driver.get_window_size() + driver.maximize_window() + driver.set_window_size(size['width'], size["height"]) + driver.set_window_position(position['x'], position['y']) + + +def requests_get(req, timeout=requests_timeout): + body = [] + start = time.time() + for chunk in req.iter_content(1024): + body.append(chunk) + if time.time() > (start + timeout): + req.close() + raise Exception("timeout") + return b''.join(body) + + +eventlet.monkey_patch() + + +def requests_wrapper(func): + if sys.platform == 'win32': + return func + else: + def wrapper(*args, **kwargs): + with eventlet.Timeout(requests_timeout, Exception): + return func(*args, **kwargs) + return wrapper + + +requests.get = 
requests_wrapper(requests.get) +requests.post = requests_wrapper(requests.post) + + +def instance_wrapper(func): + # to save nice ip, port of proxy + ip, port = base.proxy.get_proxy() + + def retry_load(*args, **kwargs): + while True: + # use clouser + nonlocal ip, port + proxies = base.proxy.get_requests_proxy(ip + ":" + port) + kwargs['proxies'] = proxies + # retry = num_of_retry_proxy + # while retry: + res = func(*args, **kwargs) + if res: + # printl("id : {2} - connect success - {0}:{1}".format(ip, port, threading.get_ident())) + return res + # if the proxy was not good, get new proxy + # printl('id : {2} - connect failed - {0}:{1}'.format(ip, port, threading.get_ident())) + ip, port = base.proxy.get_proxy() + # retry -= 1 + return retry_load + + +class InstanceWrapper(object): + def __init__(self, func): + self.ip, self.port = base.proxy.get_proxy() + self.func = func + self.num_of_retry_proxy = num_of_retry_proxy + + def do(self, *args, **kwargs): + while True: + proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) + kwargs['proxies'] = proxies + # retry = num_of_retry_proxy + # while retry: + res = self.func(*args, **kwargs) + if res: + # printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) + return res + # if the proxy was not good, get new proxy + # printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) + self.ip, self.port = base.proxy.get_proxy() + # retry -= 1 + + def do_retry(self, *args, **kwargs): + while True: + proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) + kwargs['proxies'] = proxies + retry = self.num_of_retry_proxy + while retry: + res = self.func(*args, **kwargs) + if res: + # printl("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) + return res + # if the proxy was not good, get new proxy + # printl('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) + retry -= 1 + self.ip, self.port = base.proxy.get_proxy() + + def do_no_proxy(self, *args, **kwargs): + while True: + retry = self.num_of_retry_proxy + while retry: + proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) + kwargs['proxies'] = proxies + res = self.func(*args, **kwargs) + if res: + printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) + return res + # if the proxy was not good, get new proxy + printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) + retry -= 1 + self.ip, self.port = base.proxy.get_proxy() + + # if get content with proxy failed, set no proxy + # func guarantee returning a instance except the case where a url is invalid + kwargs['proxies'] = None + res = self.func(*args, **kwargs) + # if res: + # printl("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident())) + # printl(args, kwargs) + printd("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident())) + return res + + def change_proxy(self): + self.ip, self.port = base.proxy.get_proxy() + + +@instance_wrapper +def make_list_instance(url, proxies=None): + try: + if insta_tag_url in url: + list_crawler = ListTag(url, proxies) + else: + list_crawler = ListUser(url, proxies) + return list_crawler + except Exception as e: + printd(e) + printd("Fail to make list instance") + return None + + +# @instance_wrapper +def make_content_instance(url, proxies=None): + try: + content = InstaContent(url, {}, url, 
proxies)
+        return content
+    except Exception as e:
+        printd(e)
+        printd("Fail to make content instance")
+        return None
+
+
+def ajax_wrapper(func):
+    def retry_ajax_load(*args, **kwargs):
+        retry = num_of_retry_proxy
+        while retry:
+            res = func(*args, **kwargs)
+            if res is not None:
+                break
+            retry -= 1
+        return res
+    return retry_ajax_load
+
+
+@ajax_wrapper
+def load_ajax_list(ins):
+    try:
+        insta_list = ins.load_more()
+        # if insta_list:
+        #     return insta_list
+        # else:
+        #     return None
+        return insta_list
+    except Exception as e:
+        printd(e)
+        printd("Fail to load ajax list")
+        return None
+
+
+@ajax_wrapper
+def load_ajax_reply(ins):
+    try:
+        replies = ins.load_reply_more()
+        # if replies:
+        #     return replies
+        # else:
+        #     return None
+        return replies
+    except Exception as e:
+        printd(e)
+        printd("Fail to load ajax reply")
+        return None
+
+
+# def crawl_content_process(qu, keyword_id, db_num):
+#     send_to_db = SendtoDB()
+#     send_to_db.set_db(db_num)
+#     while True:
+#         element = qu.get()
+#         if element is None:
+#             break
+#         ok = True
+#         while ok:
+#             try:
+#                 ip, port = base.proxy.get_proxy()
+#                 proxies = base.proxy.get_requests_proxy(ip + ":" + port)
+#                 content = InstaContent(element['url'], {}, element['url'], proxies)
+#                 body = content.get_body()
+#                 replies = content.get_reply()
+#                 body['article_url'] = element['url']
+#                 body['keyword_id'] = keyword_id
+#                 while content.has_previous:
+#                     replies = content.load_reply_more() + replies
+#                     wait(reply_wait_sec)
+#                 for j in range(0, len(replies)):
+#                     replies[j]['article_url'] = body['article_url']
+#                     replies[j]['platform_id'] = body['platform_id']
+#                     replies[j]['article_order'] = j
+#                 send_to_db.delete_url(body['article_url'])
+#                 send_to_db.send_body(body)
+#                 if replies:
+#                     send_to_db.send_reply(replies)
+#                 printl(element['url'])
+#                 printl('ok')
+#                 ok = False
+#             except:
+#                 printl("failed proxy {0}:{1}".format(ip, port))
+#     printl('finish thread')
+
+
+def crawl_content_process(qu, keyword_id, db_num):
+    # m_c_i = instance_wrapper(make_content_instance)
+    m_c_i = InstanceWrapper(make_content_instance)
+    send_to_db = SendtoDB()
+    send_to_db.set_db(db_num)
+    while True:
+        try:
+            element = qu.get(timeout=60)
+        except Exception as e:
+            printl("queue get timed out")
+            continue
+
+        if element is None:
+            break
+        ok = True
+        while ok:
+            try:
+                # get an instance of InstaContent via the do_no_proxy func.
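+                # (do_no_proxy makes up to num_of_retry_proxy proxied attempts,
+                # rotating the proxy after each failure, then falls back to a
+                # single direct request and returns whatever that yields)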
+ # if element['url'] is invalid, content is None + content = m_c_i.do_no_proxy(element['url']) + if not content: + break + body = content.get_body() + replies = content.get_reply() + body['article_url'] = element['url'] + body['keyword_id'] = keyword_id + while content.has_previous: + rep = load_ajax_reply(content) + if rep is None: + printl("proxies = ", content.proxies) + m_c_i.change_proxy() + raise Exception("reply load error") + replies = rep + replies + wait(reply_wait_sec) + for j in range(0, len(replies)): + replies[j]['article_url'] = body['article_url'] + replies[j]['platform_id'] = body['platform_id'] + replies[j]['article_order'] = j + send_to_db.delete_url(body['article_url']) + send_to_db.send_body(body) + if replies: + send_to_db.send_reply(replies) + printl(element['url']) + printl('ok') + ok = False + except UnicodeEncodeError as ue: + printl(element['url']) + printl(ue) + break + except Exception as e: + # catch error when send_to_db error occur + printl(element['url']) + printl(e) + qu.task_done() + printl('finish thread') + + +class InstaInit(CrawlInit): + def __init__(self, before_day=0): + super().__init__(before_day) + self.urls = dict() + self.urls[9] = insta_tag_url + self.urls[10] = insta_url + + def split_searches(self): + search = self.searches() + splited_list = search.split(',') + trimmed_list = list() + if self.platform() == 10: + for x in splited_list: + trimmed_list.append(x.strip()) + else: + for x in splited_list: + trimmed_list.append(self.utf8(x)) + return trimmed_list + + def make_url(self): + urls = list() + for x in self.split_searches(): + url = self.urls[self.platform()] + x + urls.append(url) + return urls + + def get_begin_day(self): + if self.is_realtime(): + date_now = datetime.datetime.now() + result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) + result += datetime.timedelta(days=self.before_day) + return result.date() + else: + return self.start_day() + + def get_end_day(self): + if self.is_realtime(): + date_now = datetime.datetime.now() + result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) + return result.date() + else: + return self.end_day() + + +class ListTag: + def __init__(self, url, proxies=None): + self.__r = None + self.__tag = '' + self.__url = '' + self.list_tag = [] + self.end_cursor = None + self.has_next = False + self.cookies = {} + self.proxies = proxies + self.load_url(url, self.proxies) + + def load_url(self, url, proxies): + self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, + timeout=requests_timeout, stream=True) + content = requests_get(self.__r) + self.log_load_url_before() + self.__r.raise_for_status() + self.__tag = self.__get_tag(url) + self.__set_cookies(self.__r.cookies) + self.__url = url + # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content) + self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content) + self.__r.close() + self.log_load_url_after() + return self.list_tag + + def load_more(self): + form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax) + headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data) + self.log_load_more_before(form_data, headers) + self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, + timeout=requests_timeout, stream=True) + content = requests_get(self.__r) + 
self.__set_cookies(self.__r.cookies) + self.__r.raise_for_status() + # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content) + self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content) + self.__r.close() + self.log_load_more_after() + return self.list_tag + + def __get_tag(self, url): + m = re.search(insta_tag_url + "([^/]*)", url) + if m: + return m.group(1) + else: + raise RuntimeError('Tag Error') + + def get_cookies(self): + return self.cookies + + def get_url(self): + return self.__url + + def set_end_cursor(self, cursor): + self.end_cursor = cursor + + def get_end_cursor(self): + return self.end_cursor + + def __set_cookies(self, cookies): + for k, v in cookies.items(): + self.cookies[k] = v + + def get_list(self): + return self.list_tag + + def get_proxy(self): + return self.proxies + + def log_load_url_before(self): + if is_debuging: + printl("") + printl("") + printl('headers = ', end=' ') + printl(instaheaders.get_headers_for_list_html()) + + def log_load_url_after(self): + if is_debuging: + printl("") + printl('self.__r.cookies=', end='') + printl(self.__r.cookies) + printl('end_cursor = ' + str(self.end_cursor)) + printl('has_next = ', end='') + printl(self.has_next) + printl('proxies = ', end='') + printl(self.proxies) + printl("") + + def log_load_more_before(self, form_data, headers): + if is_debuging: + printl("") + printl("") + printl('end_cursor = ' + str(self.end_cursor)) + printl('form_data' + form_data) + printl('headers = ', end=' ') + printl(headers) + + def log_load_more_after(self): + if is_debuging: + printl("") + printl('self.__r.cookies=', end='') + printl(self.__r.cookies) + printl('end_cursor = ' + str(self.end_cursor)) + printl('has_next = ', end='') + printl(self.has_next) + printl('proxies = ', end='') + printl(self.proxies) + printl("") + + +class ListUser: + def __init__(self, url, proxies=None): + self.__r = None + self.__user = '' + self.__url = '' + self.list_user = [] + self.end_cursor = None + self.has_next = False + self.cookies = {} + self.proxies = proxies + self.load_url(url, self.proxies) + + def load_url(self, url, proxies): + self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, + timeout=requests_timeout, stream=True) + content = requests_get(self.__r) + self.__r.raise_for_status() + self.__url = url + self.__set_cookies(self.__r.cookies) + # self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content) + self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content) + self.__r.close() + return self.list_user + + def load_more(self): + form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax) + headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data) + self.log_load_more_before(form_data, headers) + self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, + timeout=requests_timeout, stream=True) + content = requests_get(self.__r) + self.__r.raise_for_status() + self.__set_cookies(self.__r.cookies) + + # self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content) + self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content) + self.__r.close() + self.log_load_more_after() + return self.list_user + + def get_cookies(self): + return self.cookies + + def get_url(self): + return self.__url 
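+    # end_cursor is deliberately read/write: the crawl loops restore a saved
+    # cursor after a failure, and InstaAlgorithmBrowser injects one parsed
+    # from the live page source (see url_load)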
+ + def set_end_cursor(self, cursor): + self.end_cursor = cursor + + def get_end_cursor(self): + return self.end_cursor + + def __set_cookies(self, cookies): + for k, v in cookies.items(): + self.cookies[k] = v + + def get_list(self): + return self.list_user + + def get_proxy(self): + return self.proxies + + def log_load_more_before(self, form_data, headers): + if is_debuging: + printl("") + printl("") + printl('end_cursor = ' + str(self.end_cursor)) + printl('form_data' + form_data) + printl('headers = ', end=' ') + printl(headers) + + def log_load_more_after(self): + if is_debuging: + printl("") + printl('self.__r.cookies=', end='') + printl(self.__r.cookies) + printl('end_cursor = ' + str(self.end_cursor)) + printl('has_next = ', end='') + printl(self.has_next) + printl('proxies = ', end='') + printl(self.proxies) + printl("") + + +class InstaContent: + def __init__(self, url, cookies, referer, proxies=None): + self.__r = None + self.__referer = '' + self.__code = '' + self.body = None + self.reply = [] + self.start_cursor = None + self.has_previous = False + self.cookies = {} + self.proxies = proxies + self.load_url(url, cookies, referer, self.proxies) + + def load_url(self, url, cookies, referer, proxies): + self.__set_cookies(cookies) + self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies, + timeout=requests_timeout, stream=True) + content = requests_get(self.__r) + self.__r.raise_for_status() + self.__referer = referer + self.__code = self.__get_code(url) + # self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content) + self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content) + self.__set_cookies(self.__r.cookies) + self.__r.close() + return self.body, self.reply + + def get_body(self): + return self.body + + def get_reply(self): + return self.reply + + def load_reply_more(self): + form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax) + headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data) + self.log_load_reply_more_before(form_data, headers) + self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, + timeout=requests_timeout, stream=True) + content = requests_get(self.__r) + self.__r.raise_for_status() + self.__set_cookies(self.__r.cookies) + # self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content) + self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(content) + self.__r.close() + self.log_load_reply_more_after() + return self.reply + + def get_cookies(self): + return self.cookies + + def __get_code(self, url): + m = re.search(insta_body_url + "([^/]*)", url) + if m: + return m.group(1) + else: + raise RuntimeError('Tag Error') + + def __set_cookies(self, cookies): + for k, v in cookies.items(): + self.cookies[k] = v + + def get_proxy(self): + return self.proxies + + def log_load_reply_more_before(self, form_data, headers): + if is_debuging: + printl("") + printl("") + printl('start_cursor = ' + self.start_cursor) + printl('form_data' + form_data) + printl('headers = ', end=' ') + printl(headers) + + def log_load_reply_more_after(self): + if is_debuging: + printl("") + printl('self.__r.cookies=', end='') + printl(self.__r.cookies) + printl('start_cursor = ' + str(self.start_cursor)) + printl('has_previous = ', end='') + printl(self.has_previous) + 
printl('proxies = ', end='') + printl(self.proxies) + printl("") + + +class InstaAlgorithm: + def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, + reload_wait_second=2, num_of_load_content=12, page_down=50): + self.send_to_db = send_to_db + self.crawl_init = crawl_init + self.browser = browser + self.driver = driver + self.keyword_id = keyword_id + self.reload_wait_second = reload_wait_second + self.num_of_load_content = num_of_load_content + self.page_down = page_down + self.list_crawl = [] + + def crawl_content(self, url, cookies, referer): + content = InstaContent(url, cookies, referer) + body = content.get_body() + replies = content.get_reply() + body['article_url'] = url + body['keyword_id'] = self.keyword_id + # printl(body['article_url']) + while content.has_previous: + replies = content.load_reply_more() + replies + wait(reply_wait_sec) + for j in range(0, len(replies)): + replies[j]['article_url'] = body['article_url'] + replies[j]['platform_id'] = body['platform_id'] + replies[j]['article_order'] = j + self.send_to_db.delete_url(body['article_url']) + self.send_to_db.send_body(body) + if replies: + self.send_to_db.send_reply(replies) + printl('ok') + printl() + + def start_crawl(self): + self.crawl() + self.close() + + def close(self): + if self.driver and not is_debuging: + self.driver.quit() + self.send_to_db.close() + printl("Finished Crawling :)") + + def crawl(self): + raise NotImplementedError + + def is_until_page(self): + if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl): + return True + else: + return False + + def crawl_contents(self, contents_list, backup_set): + """ + :param contents_list: + :param backup_set: + :return: is_load_more + """ + old_elements = 0 + for element in contents_list: + if element['date'].date() > self.crawl_init.get_end_day(): + # printl(element['url']) + printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + + elif element['date'].date() < self.crawl_init.get_begin_day(): + printl(element['url']) + printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + old_elements += 1 + if old_elements > 6: + return False + else: + if not element['url'] in backup_set: + # printl(element['url']) + # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + # wait(1.5) + # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url()) + self.list_crawl.append(element) + backup_set.add(element['url']) + if self.is_until_page(): + return False + if self.list_crawl: + printl("Number of Lists = {0}".format(len(self.list_crawl))) + return True + + def crawl_list(self): + if self.list_crawl: + printl() + printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S"))) + printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S"))) + printl("Total gathered contents = {0}".format(len(self.list_crawl))) + printl() + for element in self.list_crawl: + try: + printl(element['url']) + printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + wait(body_wait_sec) + self.crawl_content(element['url'], {}, element['url']) + except Exception as e: + printl(e) + logging.info(e) + + +class InstaAlgorithmNormal(InstaAlgorithm): + def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, + reload_wait_second=2, num_of_load_content=12, page_down=50): + super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, + reload_wait_second, num_of_load_content, page_down) + if self.driver: + self.driver.quit() + + def crawl(self): + 
real_time = True + while real_time: + printl("Crawling Start") + url_list = self.crawl_init.make_url() + i = 0 + end_cursor = None + backup_set = set() + while i < len(url_list): + # first connect + try: + printl(url_list[i] + "\n") + if insta_tag_url in url_list[i]: + list_crawler = ListTag(url_list[i]) + else: + list_crawler = ListUser(url_list[i]) + wait(1) + insta_list = list_crawler.get_list() + is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next + # ajax load + while is_load_more: + if end_cursor: + list_crawler.end_cursor = end_cursor + end_cursor = None + wait(self.reload_wait_second) + insta_list = list_crawler.load_more() + is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next + self.crawl_list() + self.list_crawl.clear() + i += 1 + except Exception as e: + logging.info(e) + end_cursor = list_crawler.end_cursor + printl('end_cursor=' + end_cursor) + if e.args: + wait(300) + real_time = self.crawl_init.is_realtime() + printl("Finished Crawling :)") + + +class InstaAlgorithmMulti(InstaAlgorithm): + def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, + reload_wait_second=2, num_of_load_content=12, page_down=50): + super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, + reload_wait_second, num_of_load_content, page_down) + if self.driver: + self.driver.quit() + self.list_crawl = Queue() + self.total_num = 0 + + def crawl_contents(self, contents_list, backup_set): + """ + :param contents_list: + :param backup_set: + :return: is_load_more + """ + old_elements = 0 + for element in contents_list: + if element['date'].date() > self.crawl_init.get_end_day(): + # printl(element['url']) + printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + + elif element['date'].date() < self.crawl_init.get_begin_day(): + printl(element['url']) + printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + old_elements += 1 + if old_elements > 6: + return False + else: + if not element['url'] in backup_set: + # printl(element['url']) + # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + # wait(1.5) + # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url()) + try: + self.list_crawl.put(element, timeout=10) + except Exception as e: + printl(e) + printl("queue size = ", self.list_crawl.qsize()) + backup_set.add(element['url']) + self.total_num += 1 + if self.is_until_page(): + return False + # if self.list_crawl: + # printl("Number of Lists = {0}".format(len(self.list_crawl))) + return True + + def crawl(self): + real_time = True + while real_time: + printl("Crawling Start") + url_list = self.crawl_init.make_url() + i = 0 + end_cursor = None + backup_set = set() + while i < len(url_list): + # first connect + try: + printl(url_list[i] + "\n") + # insta_content process create and start + # p_list = [multiprocessing.Process(target=crawl_content_process, + # args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num)) + # for i in range(num_of_content_process)] + p_list = [threading.Thread(target=crawl_content_process, + args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num)) + for i in range(num_of_content_process)] + for p in p_list: + p.daemon = True + p.start() + + # crawl list + ok = True + while ok: + try: + list_crawler = make_list_instance(url_list[i]) + ok = False + except Exception as e: + printl(e) + wait(1) + insta_list = list_crawler.get_list() + is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next + # ajax load + while 
+
+
+class InstaAlgorithmMulti(InstaAlgorithm):
+    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
+                 reload_wait_second=2, num_of_load_content=12, page_down=50):
+        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
+                         reload_wait_second, num_of_load_content, page_down)
+        if self.driver:
+            self.driver.quit()
+        self.list_crawl = Queue()
+        self.total_num = 0
+
+    def crawl_contents(self, contents_list, backup_set):
+        """
+        :param contents_list:
+        :param backup_set:
+        :return: is_load_more
+        """
+        old_elements = 0
+        for element in contents_list:
+            if element['date'].date() > self.crawl_init.get_end_day():
+                # printl(element['url'])
+                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
+
+            elif element['date'].date() < self.crawl_init.get_begin_day():
+                printl(element['url'])
+                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
+                old_elements += 1
+                if old_elements > 6:
+                    return False
+            else:
+                if not element['url'] in backup_set:
+                    # printl(element['url'])
+                    # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
+                    # wait(1.5)
+                    # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
+                    try:
+                        self.list_crawl.put(element, timeout=10)
+                    except Exception as e:
+                        printl(e)
+                        printl("queue size = ", self.list_crawl.qsize())
+                    backup_set.add(element['url'])
+                    self.total_num += 1
+            if self.is_until_page():
+                return False
+        # if self.list_crawl:
+        #     printl("Number of Lists = {0}".format(len(self.list_crawl)))
+        return True
+
+    def crawl(self):
+        real_time = True
+        while real_time:
+            printl("Crawling Start")
+            url_list = self.crawl_init.make_url()
+            i = 0
+            end_cursor = None
+            backup_set = set()
+            while i < len(url_list):
+                # first connect
+                try:
+                    printl(url_list[i] + "\n")
+                    # insta_content process create and start
+                    # p_list = [multiprocessing.Process(target=crawl_content_process,
+                    #                                   args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
+                    #           for i in range(num_of_content_process)]
+                    p_list = [threading.Thread(target=crawl_content_process,
+                                               args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
+                              for i in range(num_of_content_process)]
+                    for p in p_list:
+                        p.daemon = True
+                        p.start()
+
+                    # crawl list
+                    ok = True
+                    while ok:
+                        try:
+                            list_crawler = make_list_instance(url_list[i])
+                            ok = False
+                        except Exception as e:
+                            printl(e)
+                    wait(1)
+                    insta_list = list_crawler.get_list()
+                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
+                    # ajax load
+                    while is_load_more:
+                        if end_cursor:
+                            list_crawler.end_cursor = end_cursor
+                            end_cursor = None
+                        wait(self.reload_wait_second)
+                        try:
+                            insta_list = load_ajax_list(list_crawler)
+                            if insta_list is None:
+                                break
+                            is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
+                        except Exception as e:
+                            printl('is_load_more exception')
+                            printl(e)
+                            is_load_more = False
+                    # self.crawl_list()
+                    # self.list_crawl.close()
+                    printl("end load")
+                    printl("total number of crawled list = {0}".format(self.total_num))
+                    self.total_num = 0
+
+                    # check task is done in queue
+                    # self.list_crawl.join()
+
+                    # stop child process (use _ so the url index i is not clobbered)
+                    for _ in range(num_of_content_process):
+                        self.list_crawl.put(None, timeout=10)
+
+                    # wait child process
+                    for p in p_list:
+                        p.join()
+
+                    for _ in range(self.list_crawl.qsize()):
+                        self.list_crawl.get(block=False)
+
+                    i += 1
+                except Exception as e:
+                    logging.info(e)
+                    end_cursor = list_crawler.end_cursor
+                    printl('end_cursor=' + end_cursor)
+                    if e.args:
+                        wait(300)
+            real_time = self.crawl_init.is_realtime()
+        printl("Finished Crawling :)")
+
+
+class InstaAlgorithmBrowser(InstaAlgorithm):
+    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
+                 reload_wait_second=2, num_of_load_content=12, page_down=50):
+        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
+                         reload_wait_second, num_of_load_content, page_down)
+
+    def url_load(self, url):
+        if insta_tag_url in url:
+            list_tag = ListTag(url)
+            insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
+            return list_tag, insta_list, end_cursor, has_next
+        else:
+            list_user = ListUser(url)
+            insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
+            return list_user, insta_list, end_cursor, has_next
+
+    def crawl(self):
+        real_time = True
+        while real_time:
+            url_list = self.crawl_init.make_url()
+            i = 0
+            end_cursor = None
+            backup_set = set()
+            while i < len(url_list):
+                # first connect
+                try:
+                    wait(3)
+                    printl(url_list[i] + "\n")
+                    self.driver.get(url_list[i])
+                    wait(5)
+                    list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
+                    is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
+                    list_crawler.set_end_cursor(end_cursor2)
+                    list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()}
+                    # ajax load
+                    page_down = 0
+                    while is_load_more:
+                        if page_down == self.page_down:
+                            page_down = 0
+                            try:
+                                focus_driver(self.driver)
+                                click_insta_load_more(self.driver)
+                            except:
+                                push_page_down(self.driver)
+                        page_down += 1
+                        if end_cursor:
+                            list_crawler.end_cursor = end_cursor
+                            end_cursor = None
+                        wait(self.reload_wait_second)
+                        insta_list = list_crawler.load_more()
+                        # printl("list length = " + str(len(insta_list)))
+                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
+                    # printl("number of backup_set = {0}".format(len(backup_set)))
+                    i += 1
+                    self.crawl_list()
+                    self.list_crawl.clear()
+                except Exception as e:
+                    logging.info(e)
+                    end_cursor = list_crawler.end_cursor
+                    printl('end_cursor=' + end_cursor)
+                    if e.args:
+                        wait(300)
+                    if self.driver:
+                        self.driver.close()
+                    wait(3)
+                    self.driver = self.browser.new_browser()
+            real_time = self.crawl_init.is_realtime()
+        printl("Finished Crawling :)")
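InstaAlgorithmMulti.crawl above is a producer/consumer split: the main thread feeds a queue.Queue while daemon threads drain it, and shutdown is exactly one None sentinel per worker followed by join(). The skeleton in isolation; handle() is a hypothetical per-item worker body:

    import threading
    from queue import Queue

    def handle(item):
        print('crawled', item)        # hypothetical per-item work

    def worker(q):
        while True:
            item = q.get()
            if item is None:          # sentinel: this worker is done
                break
            handle(item)

    def run(items, n_workers=10):
        q = Queue()
        threads = [threading.Thread(target=worker, args=(q,), daemon=True)
                   for _ in range(n_workers)]
        for t in threads:
            t.start()
        for item in items:
            q.put(item)
        for _ in range(n_workers):    # exactly one sentinel per worker
            q.put(None)
        for t in threads:
            t.join()                  # wait until every worker has exited

    run(['url1', 'url2', 'url3'])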
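InstaAlgorithmBrowser hands its logged-in Selenium session over to plain HTTP code by flattening driver.get_cookies() into the name-to-value dict that requests expects. A sketch of that handoff; session_from_driver is an illustrative helper, not part of this module:

    import requests

    def session_from_driver(driver):
        """Reuse a logged-in Selenium session for fast requests-based fetches."""
        cookies = {c['name']: c['value'] for c in driver.get_cookies()}
        s = requests.Session()
        s.cookies.update(cookies)
        s.headers['Referer'] = driver.current_url   # some endpoints check the referer
        return s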
+
+
+class InstaMainCrawler:
+    def __init__(self):
+        self.send_to_db = SendtoDB()
+        self.crawl_init = InstaInit()
+        # self.browser = Browser()
+        self.browser = None
+        self.driver = None
+
+    def set_keyword_id(self, keyword_id):
+        self.keyword_id = keyword_id
+
+    def crawl_all(self, backup_set=None):
+        pass
+
+    def start(self):
+        self.crawler_start()
+
+    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
+        self.init_keyword_id(keyword_id)
+        self.init_db(db_num)
+        self.init_before_day(before_day)
+        self.init_until_page(until_page)
+        # self.init_browser(browser)
+
+    def set_driver(self, driver):
+        self.driver = driver
+
+    def init_browser(self, browser):
+        try:
+            self.set_driver(self.browser.get_new_driver(browser))
+        except Exception as e:
+            logging.info(e)
+
+    def init_keyword_id(self, keyword_id):
+        if type(keyword_id) != int:
+            self.keyword_id = int(keyword_id)
+        else:
+            self.keyword_id = keyword_id
+        self.crawl_init.get_keyword_parameters(keyword_id)
+        self.crawl_init.disconnect()
+
+    def init_db(self, db_num):
+        self.send_to_db.set_db(db_num)
+
+    def init_before_day(self, before_day):
+        self.crawl_init.set_before_day(before_day)
+
+    def init_until_page(self, until_page):
+        self.crawl_init.set_until_page(until_page)
+
+    def crawler_start(self):
+        # if self.driver:
+        #     algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
+        #                                       self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
+        # else:
+        #     algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
+        #                                      self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
+        algorithm = InstaAlgorithmMulti(self.driver, self.browser, self.crawl_init, self.send_to_db,
+                                        self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
+        algorithm.start_crawl()
diff --git a/WebBasedCrawler/insta/instaparser.py b/WebBasedCrawler/insta/instaparser.py
index 7b82a9a..4d3ea3e 100644
--- a/WebBasedCrawler/insta/instaparser.py
+++ b/WebBasedCrawler/insta/instaparser.py
@@ -112,6 +112,7 @@ def parse_body_html(content):
         "article_form": "body",
         "article_profileurl": media["owner"]["profile_pic_url"],
         "article_order": str(media["comments"]["count"]),
+        "article_hit": str(media.get('video_views', 0)),
         "reply_url": str(media["likes"]["count"])
     }
     comments = postpage[0]["media"]["comments"]
diff --git a/WebBasedCrawler/kakao/kakaocrawl.py b/WebBasedCrawler/kakao/kakaocrawl.py
index c0ce984..3b892b8 100644
--- a/WebBasedCrawler/kakao/kakaocrawl.py
+++ b/WebBasedCrawler/kakao/kakaocrawl.py
@@ -336,7 +336,7 @@ class ReplyCrawler(object):
 
     def set_soup_and_activity(self):
         self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
-        # There are many div.section _activity. But element we use is in div.cover_wrapper
+        # There are many div.section _activity. But the element we use is in div.cover_wrapper
         cover_wrapper = self.soup.find('div', class_='cover_wrapper')
         self.section_activity = cover_wrapper.find('div', class_='section _activity')
         self.ul = self.section_activity.find('ul', class_='list _listContainer')
@@ -345,7 +345,7 @@ class ReplyCrawler(object):
         previous_num_of_replies = 0
         while self.has_more():
             self.click_load_more_reply_btn()
-            # check number of replies before and after click_load_more_reply_btn()
+            # check the number of replies before and after click_load_more_reply_btn()
             # If These were equal, the link or ajax failed
             current_num_of_replies = self.get_num_of_replies()
             if previous_num_of_replies == current_num_of_replies:
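The ReplyCrawler loop patched above only survives flaky ajax because of its progress check: click "load more", recount, and stop as soon as the count no longer grows. The guard in isolation, with hypothetical click_more/count_replies/has_more callables:

    def load_all_replies(click_more, count_replies, has_more):
        """Click 'load more' until no pages remain, or a click stops adding replies."""
        previous = 0
        while has_more():
            click_more()
            current = count_replies()
            if previous == current:   # the link or the ajax call failed: stop looping
                break
            previous = current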
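Similarly, the article_hit line added in the instaparser hunk further up reads media.get('video_views', 0) rather than media['video_views'] because photo posts carry no video_views key; .get substitutes a default instead of raising KeyError:

    photo = {"likes": {"count": 3}}                       # no 'video_views' on photos
    video = {"likes": {"count": 9}, "video_views": 120}

    print(str(photo.get('video_views', 0)))   # '0'   - safe default
    print(str(video.get('video_views', 0)))   # '120'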
diff --git a/WebBasedCrawler/naver/navercrawl.py b/WebBasedCrawler/naver/navercrawl.py
index 60abc1e..7513642 100644
--- a/WebBasedCrawler/naver/navercrawl.py
+++ b/WebBasedCrawler/naver/navercrawl.py
@@ -1 +1 @@
-#-*- coding: utf-8 -*- __author__ = 'cococo' from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.action_chains import ActionChains import sys import datetime import re from base.baseclasses import wait from base.baseclasses import print_and_flush from base.baseclasses import Browser from base.baseclasses import SendtoDB from base.baseclasses import enter_element class Asistance: def __init__(self): self.re_clubid = re.compile("search\\.clubid=([\\d]+)") self.re_date = re.compile("search\\.searchdate=([\\d]{4}-[\\d]{2}-[\\d]{2})([\\d]{4}-[\\d]{2}-[\\d]{2})") def clubid(self, url): m = self.re_clubid.search(url) if m is None: return str() else: return m.group(1) def date(self, url): m = self.re_date.search(url) if m is None: return str("Start: ALL, End: ALL") else: return str("Start: " + m.group(1) + ", End: " + m.group(2)) class NaverCafeCrawler: #driver = webdriver.PhantomJS('C:\\Users\\cococo\\AppData\\Roaming\\npm\\node_modules\\phantomjs\\lib\\phantom\\phantomjs.exe') def __init__(self): self.driver = None # webdriver.Ie('C:\\Users\\cococo\\Downloads\\IEDriverServer_x64_2.48.0\\IEDriverServer.exe') # self.driver = webdriver.Chrome('C:\\Users\\cococo\\Downloads\\chromedriver_win32\\chromedriver.exe') # firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX # firefox_capabilities['marionette'] = True # firefox_capabilities['binary'] = ('C:\\Users\\cococo\\Downloads\\wires-0.4.2-win\\wires-0.4.2-win.exe') # webdriver.Chrome() # self.driver = webdriver.Firefox() # self.driver.set_window_size(1600, 900) # self.main_area_crawler = NaverCafeMainAreaCrawler() def set_driver(self, driver): self.driver = driver def suff(self, url): self.driver.get(url) wait(2) def screenshot(self,filename): self.driver.save_screenshot(filename) def html(self): return self.driver.page_source def savepage(self, filename): with open(filename,'w',encoding='UTF8') as f: f.write(self.html()) def naver_login(self, id, password): self.suff('http://www.naver.com') wait(2) element = self.driver.find_element_by_id('id') element.send_keys(id) #element = driver.find_element_by_id('label_pw') element = self.driver.find_element_by_id('pw') element.send_keys(password) element.send_keys(Keys.ENTER) wait(3) #element = self.driver.find_element_by_class_name('btn_login') #self.click_element(element) def cafe_search(self, keyword): element = self.driver.find_element_by_id('topLayerQueryInput') element.send_keys(keyword) wait(1) element.send_keys(Keys.ENTER) #element.send_keys(Keys.RETURN) wait(2) def get_url(self): return self.driver.current_url() def click_element(self, element): ac = ActionChains(self.driver)
#ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) def start(self): self.main_area_crawler.set_driver(self.driver) self.main_area_crawler.crawl_all_cafe_main() def close(self): self.driver.close() def quit(self): self.driver.quit() class NaverCafeBoardCrawler: def __init__(self, driver=None): self.driver = driver self.content_num_set = set() import re self.re_page = re.compile("search\\.page=([\\d]+)") def clear_content_num_set(self): self.content_num_set.clear() def current_url(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return self.driver.current_url def current_page_num_by_url(self): url = self.current_url() m = self.re_page.search(url) if m is None: return self.current_page_num_by_tag() else: return m.group(1) def current_page_num_by_tag(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return str(1) if page_navigate is None: return str(1) tds = page_navigate.find_elements_by_tag_name('td') for td in tds: try: page_on = td.get_attribute('class') if page_on == 'on': return td.text except: continue return str(1) def move_next_content(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') trs = self.driver.find_elements_by_css_selector("tr[align='center']") for tr in trs: try: content_num = tr.find_element_by_css_selector("span[class='m-tcol-c list-count']") if len(str(content_num.text).strip()) < 1: continue if content_num.text in self.content_num_set: continue self.content_num_set.add(content_num.text) sub = tr.find_element_by_css_selector("a[class='m-tcol-c']") enter_element(sub) return True except: pass return False def move_next_page(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return False if page_navigate is None: return False tds = page_navigate.find_elements_by_tag_name('td') is_next = False for td in tds: if is_next: a = td.find_element_by_tag_name("a") enter_element(a) #self.enter_element(td) return True try: page_on = td.get_attribute('class') if page_on == 'on': is_next = True continue except: continue return False def set_driver(self, driver): self.driver = driver def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) class NaverCafeBodyCrawler: def __init__(self, driver=None): self.driver = driver self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_init(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def find_article_title(self): self.find_init() article_title = self.driver.find_element_by_css_selector("span[class='b m-tcol-c']") return article_title.text def find_article_date(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c date']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. 
',' ').replace('.','-') + ":00" return article_date def find_article_data(self): self.find_init() article_data = self.driver.find_element_by_css_selector("div[class='tbody m-tcol-c']") return article_data.text def find_article_nickname(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 4: return onclick_attr_list[3].strip().replace("'", "") else: return str() def find_article_id(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 2: return onclick_attr_list[1].strip().replace("'", "") else: return str() def find_article_hit(self): self.find_init() element = self.driver.find_element_by_css_selector("span[class='b m-tcol-c reply _rosReadcount']") return element.text def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'body' def find_platform_title(self): self.driver.switch_to_default_content() element = self.driver.find_element_by_css_selector("span[class='m-tcol-p']") return element.text def find_article_url(self): self.find_init() element = self.driver.find_element_by_css_selector("a[id='linkUrl']") return element.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def print(self): print("article_id = " + self.find_article_id()) print("article_nickname = " + self.find_article_nickname()) print("article_title = " + self.find_article_title()) print("article_date = " + self.find_article_date()) print("article_hit = " + self.find_article_hit()) print("article_url = " + self.find_article_url()) print("platform_title = " + self.find_platform_title()) print("article_data = " + self.find_article_data()) def get_content(self): content = dict() content["article_id"] = self.find_article_id() content["article_nickname"] = self.find_article_nickname() content["article_title"] = self.find_article_title() content["article_date"] = self.find_article_date() content["article_hit"] = self.find_article_hit() content["article_url"] = self.find_article_url() content["article_data"] = self.find_article_data() content["article_form"] = self.find_article_form() content["platform_title"] = self.find_platform_title() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["platform_id"] = self.find_platform_id() return content class NaverCafeReplyCrawler: def __init__(self, driver=None): self.driver = driver self.article_parent = str() self.reply_list = list() self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_comments_element(self): self.find_init() try: self.reply_elements = self.driver.find_element_by_css_selector("ul[class='cmlist']") if self.reply_elements is None: return False return True except: return False def find_init(self): self.count = 0 self.reply_list.clear() self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def 
set_article_url(self, article_url): self.article_url = article_url def crawl_all(self): has_next_comment_page = True while has_next_comment_page: self.crawl_current_page_reply() has_next_comment_page = self.move_next_comment_page() def move_next_comment_page(self): element = None try: element = self.driver.find_element_by_css_selector("div[id='cmt_paginate']") children = element.find_elements_by_css_selector("*") flag = False for child in children: if flag is True and child.tag_name == "a": enter_element(child) wait(1) self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return True if child.tag_name == "strong": flag = True except Exception as e: print(e) sys.stdout.flush() return False if element is None: return False return False def crawl_current_page_reply(self): lis = self.reply_elements.find_elements_by_tag_name('li') for li in lis: if li.get_attribute('class') == 'reply': self.crawl_reply_reply(li) elif len(li.get_attribute('class')) < 1: self.crawl_reply(li) else: pass def find_article_url(self, li=None): return self.article_url def find_article_date(self, li): element = li.find_element_by_css_selector("span[class='date m-tcol-c filter-50']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. ', ' ').replace('.', '-') + ":00" return article_date def find_article_data(self, li): element = li.find_element_by_css_selector("span[class='comm_body']") article_data = element.text return article_data def find_article_parent(self, li): try: element = li.find_element_by_css_selector("a[class='m-tcol-c filter-50 nick']") article_parent = element.text return article_parent except: return self.article_parent def find_article_id(self, li): element = li.find_element_by_css_selector("input[name='writerid']") article_id = element.get_attribute('value') return article_id def find_article_nickname(self, li): article_nickname = li.find_element_by_css_selector("td[class='p-nick']") return article_nickname.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def crawl_reply(self, li): article_nickname = self.find_article_nickname(li) self.article_parent = article_nickname article_order = self.count self.count += 1 content = dict() content["article_id"] = self.find_article_id(li) content["article_nickname"] = self.find_article_nickname(li) content["article_date"] = self.find_article_date(li) content["article_data"] = self.find_article_data(li) content["article_order"] = article_order content["article_form"] = self.find_article_form() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["article_url"] = self.find_article_url() content["platform_id"] = self.find_platform_id() self.reply_list.append(content) def crawl_reply_reply(self, li): article_parent = self.find_article_parent(li) article_order = self.count self.count += 1 content = dict() content["article_id"] = self.find_article_id(li) content["article_nickname"] = self.find_article_nickname(li) content["article_date"] = self.find_article_date(li) content["article_data"] = self.find_article_data(li) content["article_order"] = article_order content["article_parent"] = article_parent content["article_form"] = self.find_article_form() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["article_url"] = self.find_article_url() content["platform_id"] = 
self.find_platform_id() self.reply_list.append(content) def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'reply' def get_content(self): return self.reply_list def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) # class NaverCafeInit: # pymysql = __import__('pymysql.cursors') # url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=" # url_second = "&search.searchdate=" # url_third = "&search.searchBy=0&search.query=" # url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0" # # def __init__(self, before_day=0): # self.conn = self.pymysql.connect(host ='bigbird.iptime.org', # user='admin', passwd='admin123', # db='concepters', charset='utf8', # cursorclass=self.pymysql.cursors.DictCursor) # self.urls = dict() # self.before_day = before_day # # def set_before_day(self, before_day): # if type(before_day) == str: # self.before_day = int(before_day) # elif type(before_day) == int: # self.before_day = before_day # # def set_until_page(self, until_page): # if type(until_page) == str: # self.before_day = int(until_page) # elif type(until_page) == int: # self.before_day = until_page # # def split_searches(self): # search = self.searches() # splited_list = search.split(',') # trimmed_list = list() # for x in splited_list: # trimmed_list.append(self.euc_kr(x.strip())) # return trimmed_list # # def get_keyword_parameters(self, keyword_id): # query = "select * from keyword where id = " + str(keyword_id) # try: # with self.conn.cursor() as cursor: # cursor.execute(query) # self.params = cursor.fetchone() # return self.params # except Exception as e: # print(e) # sys.stdout.flush() # exit(1) # return dict() # # def get_naver_cafe_list(self): # query = "select url, clubid from navercafelist" # if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: # pass # else: # query += (" where group_num = " + str(self.authorship())) # try: # with self.conn.cursor() as cursor: # cursor.execute(query) # list_result = cursor.fetchall() # for i in list_result: # self.urls[i["url"]] = i["clubid"] # except Exception as e: # print(e) # sys.stdout.flush() # exit(1) # return self.urls # # def start_day(self): # return self.params["start"] # # def end_day(self): # return self.params["end"] # # def keyword_id(self): # return self.params["id"] # # def realtime(self): # return self.params["realtime"] # # def searches(self): # return self.params["searches"] # # def authorship(self): # return self.params["authorship"] # # def platform(self): # return self.params["platform"] # # def is_realtime(self): # if str(self.realtime()) == '0': # return False # else: # return True # # def euc_kr(self, keyword): # byte_code = list(keyword.encode("euc_kr")) # encoded_keyword = "" # for i in byte_code: # if i == 0x20: # encoded_keyword += "+" # else: # encoded_keyword += str(hex(i)).replace("0x", "%").upper() # return encoded_keyword # # def url_all_days(self): # url_list = list() # for key, val in self.urls.items(): # if self.is_realtime(): # today = datetime.date.today() # url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val) # else: # url = self.make_url(self.start_day(), self.end_day(), val) # for i in url: # url_list.append(i) # return url_list # # def 
url_day_by_day(self): # one_day = datetime.timedelta(days=1) # url_list = list() # for key, val in self.urls.items(): # if self.is_realtime(): # end = datetime.date.today() # start = end + datetime.timedelta(days=self.before_day) # else: # start = self.start_day() # end = self.end_day() # while start <= end: # url = self.make_url(start, start, val) # for i in url: # url_list.append(i) # start += one_day # return url_list # # def make_url(self, start_day, end_day, clubid): # urls = list() # for x in self.split_searches(): # url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth # urls.append(url) # return urls # # def disconnect(self): # self.conn.close() # # def date_to_str(self, arg_date): # return arg_date.strftime("%Y-%m-%d") class CrawlInit: pymysql = __import__('pymysql.cursors') def __init__(self, before_day=0): self.conn = self.pymysql.connect(host ='bigbird.iptime.org', user='admin', passwd='admin123', db='concepters', charset='utf8', cursorclass=self.pymysql.cursors.DictCursor) self.urls = dict() self.before_day = before_day def set_before_day(self, before_day): if type(before_day) == str: self.before_day = int(before_day) elif type(before_day) == int: self.before_day = before_day def set_until_page(self, until_page): if type(until_page) == str: self.until_page = int(until_page) elif type(until_page) == int: self.until_page = until_page def get_keyword_parameters(self, keyword_id): query = "select * from keyword where id = " + str(keyword_id) try: with self.conn.cursor() as cursor: cursor.execute(query) self.params = cursor.fetchone() return self.params except Exception as e: print(e) sys.stdout.flush() exit(1) return dict() def get_naver_cafe_list(self): query = "select url, clubid from navercafelist" if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: pass else: query += (" where group_num = " + str(self.authorship())) try: with self.conn.cursor() as cursor: cursor.execute(query) list_result = cursor.fetchall() for i in list_result: self.urls[i["url"]] = i["clubid"] except Exception as e: print(e) sys.stdout.flush() exit(1) return self.urls def start_day(self): return self.params["start"] def end_day(self): return self.params["end"] def keyword_id(self): return self.params["id"] def realtime(self): return self.params["realtime"] def searches(self): return self.params["searches"] def authorship(self): return self.params["authorship"] def platform(self): return self.params["platform"] def is_realtime(self): if str(self.realtime()) == '0': return False else: return True def euc_kr(self, keyword): byte_code = list(keyword.encode("euc_kr")) encoded_keyword = "" for i in byte_code: if i == 0x20: encoded_keyword += "+" else: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def utf8(self, keyword): byte_code = list(keyword.encode("utf-8")) encoded_keyword = "" for i in byte_code: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def disconnect(self): self.conn.close() def date_to_str(self, arg_date): return arg_date.strftime("%Y-%m-%d") class NaverCafeInit(CrawlInit): url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=" url_second = "&search.searchdate=" url_third = "&search.searchBy=0&search.query=" url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0" def __init__(self, before_day=0): super().__init__(before_day) def 
url_all_days(self): url_list = list() for key, val in self.urls.items(): if self.is_realtime(): today = datetime.date.today() url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val) else: url = self.make_url(self.start_day(), self.end_day(), val) for i in url: url_list.append(i) return url_list def url_day_by_day(self): one_day = datetime.timedelta(days=1) url_list = list() for key, val in self.urls.items(): if self.is_realtime(): end = datetime.date.today() start = end + datetime.timedelta(days=self.before_day) else: start = self.start_day() end = self.end_day() while start <= end: url = self.make_url(start, start, val) for i in url: url_list.append(i) start += one_day return url_list def split_searches(self): search = self.searches() splited_list = search.split(',') trimmed_list = list() for x in splited_list: trimmed_list.append(self.euc_kr(x.strip())) return trimmed_list def make_url(self, start_day, end_day, clubid): urls = list() for x in self.split_searches(): url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth urls.append(url) return urls class NaverCafeMainAreaCrawler: def __init__(self): self.board_crawler = NaverCafeBoardCrawler() self.body_crawler = NaverCafeBodyCrawler() self.reply_crawler = NaverCafeReplyCrawler() self.send_to_db = SendtoDB() self.crawl_init = NaverCafeInit() self.browser = Browser() self.naver_cafe = NaverCafeCrawler() def print(self, arg): print(arg) sys.stdout.flush() def set_driver(self, driver): self.board_crawler.set_driver(driver) self.body_crawler.set_driver(driver) self.reply_crawler.set_driver(driver) self.naver_cafe.set_driver(driver) self.driver = driver def copy_list(self, backup_set): for i in backup_set: self.board_crawler.content_num_set.add(i) def crawl_all_cafe_main(self, backup_list=None): self.board_crawler.clear_content_num_set() if backup_list: self.copy_list(backup_list) has_next_table = True while has_next_table: self.print("Page number : " + str(self.board_crawler.current_page_num_by_tag())) # if (int(self.board_crawler.current_page_num_by_url()) % 5) == 1: self.release_memory() while self.board_crawler.move_next_content(): try: self.crawl_body() self.crawl_reply() self.print("ok") except Exception as e: self.print("fail") self.print(e) self.driver.back() wait(1) has_next_table = self.board_crawler.move_next_page() def crawl_body(self): self.body_crawler.set_driver(self.driver) content = self.body_crawler.get_content() content['keyword_id'] = self.keyword_id self.send_to_db.delete_url(content['article_url']) self.send_to_db.send_body(content) self.print(content['article_url']) def crawl_reply(self): self.reply_crawler.set_driver(self.driver) if self.reply_crawler.find_comments_element(): self.reply_crawler.set_article_url(self.body_crawler.find_article_url()) self.reply_crawler.crawl_all() self.send_to_db.send_reply(self.reply_crawler.get_content()) def set_keyword_id(self, keyword_id): self.keyword_id = keyword_id def release_memory_firefox(self): index = self.driver.current_url.find("%26search.page=") if index == -1: temp_url = self.driver.current_url else: temp_url = self.driver.current_url[:index] temp_page = self.board_crawler.current_page_num_by_tag() if temp_page.strip() == "1": url = temp_url else: url = temp_url + "%26search.page=" + temp_page.strip() self.print("Release Memory Process") self.driver.get("about:memory") wait(2) self.driver.execute_script("doMMU()") wait(2) self.driver.execute_script("doGC()") wait(2) 
self.driver.execute_script("doCC()") wait(2) self.driver.get(url) wait(2) print_and_flush("reloaded") def release_memory_others(self): temp_url = self.driver.current_url self.print("Release Memory Process") self.driver.get(temp_url) wait(2) print_and_flush("reloaded") def release_memory(self): if self.browser.info == "firefox": if (int(self.board_crawler.current_page_num_by_tag()) % 5) == 1: self.release_memory_firefox() else: if (int(self.board_crawler.current_page_num_by_tag()) != 1) and ((int(self.board_crawler.current_page_num_by_tag()) % 5) == 1): self.release_memory_others() def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element(element).click().perform() wait(2) def start(self): self.crawl_start() def set_arguments(self, browser, keyword_id, db_num, before_day, until_page): self.init_browser(browser) self.init_keyword_id(keyword_id) self.init_db(db_num) self.init_before_day(before_day) self.init_until_page(until_page) def init_browser(self, browser): self.set_driver(self.browser.get_new_driver(browser)) def init_keyword_id(self, keyword_id): if type(keyword_id) != int: self.keyword_id = int(keyword_id) else: self.keyword_id = keyword_id self.crawl_init.get_keyword_parameters(keyword_id) self.crawl_init.get_naver_cafe_list() self.crawl_init.disconnect() def init_db(self, db_num): self.send_to_db.set_db(db_num) def init_before_day(self, before_day): self.crawl_init.set_before_day(before_day) def init_until_page(self, until_page): self.crawl_init.set_until_page(until_page) def crawl_start(self): naver_id = "ehotnsdl1234" naver_password = "66556655*" asis = Asistance() self.naver_cafe.naver_login(naver_id, naver_password) wait(5) real_time = True while real_time: print_and_flush("Crawler Start") url_list = self.crawl_init.url_all_days() i = 0 backup_set = set() while i < len(url_list): try: print_and_flush(url_list[i] + "\n") print_and_flush("clubid: " + asis.clubid(url_list[i])) print_and_flush(asis.date(url_list[i]) + "\n") self.driver.get(url_list[i]) wait(5) self.crawl_all_cafe_main(backup_set) i += 1 backup_set.clear() except Exception as e: print_and_flush(e) backup_set = self.board_crawler.content_num_set.copy() self.driver.quit() self.set_driver(self.browser.new_browser()) wait(5) self.naver_cafe.naver_login(naver_id, naver_password) wait(3) real_time = self.crawl_init.is_realtime() print_and_flush("Finished Crawling :)") self.send_to_db.close() self.driver.quit() if __name__ == '__main__': """ argv: 0 - navercrawl.py 1 - keyword_id 2 - data db num 3 - before_day """ # crawler = NaverCafeCrawler() # crawler.naver_login('kyounggoon', 'qorwjd123') # crawler.suff('http://cafe.naver.com/imsanbu') # crawler.cafe_search('성형') # crawler.start() # crawler.cafe_search_current_page_list() # crawler = NaverCafeCrawler() # crawler.naver_login('kyounggoon', 'qorwjd123') # crawler.suff('http://cafe.naver.com/imsanbu') # crawler.cafe_search('성형') # crawler.main_area_crawler.send_to_db.set_db("294") # crawler.main_area_crawler.set_keyword_id("111111") # crawler.start() if len(sys.argv) < 4: print("Fail to process execute") exit(1) else: print("Start Python Crawling") #initialization naver_id = "ehotnsdl1234" naver_password = "66556655*" naver_init = NaverCafeInit(int(sys.argv[3])) naver_init.get_keyword_parameters(sys.argv[1]) naver_init.get_naver_cafe_list() naver_init.disconnect() naver_cafe = NaverCafeCrawler() browser = Browser() # arg: chrome, fierfox, ie, opera 
naver_cafe.set_driver(browser.get_new_driver("chrome")) wait(5) naver_cafe.naver_login(naver_id, naver_password) naver_main_area_crawler = NaverCafeMainAreaCrawler() naver_main_area_crawler.set_driver(naver_cafe.driver) naver_main_area_crawler.set_keyword_id(sys.argv[1]) naver_main_area_crawler.send_to_db.set_db(sys.argv[2]) naver_main_area_crawler.browser = browser asis = Asistance() realtime = True while realtime: print_and_flush("Crawler Start") url_list = naver_init.url_all_days() i = 0 backup_set = set() while i < len(url_list): try: print_and_flush(url_list[i] + "\n") print_and_flush("clubid: " + asis.clubid(url_list[i])) print_and_flush(asis.date(url_list[i]) + "\n") naver_cafe.suff(url_list[i]) naver_main_area_crawler.crawl_all_cafe_main(backup_set) i += 1 backup_set.clear() except Exception as e: print_and_flush(e) backup_set = list(naver_main_area_crawler.board_crawler.content_num_set) naver_cafe.set_driver(browser.new_browser()) wait(5) naver_cafe.naver_login(naver_id, naver_password) naver_main_area_crawler.set_driver(naver_cafe.driver) realtime = naver_init.is_realtime() print_and_flush("Finished Crawling :)") naver_cafe.quit() naver_main_area_crawler.send_to_db.close() print("Exit. Bye :)") exit(0) #http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=11262350&search.searchdate=2015-07-302015-10-30&search.searchBy=0&search.query=%BC%BA%C7%FC&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=50&search.media=0 \ No newline at end of file +#-*- coding: utf-8 -*- __author__ = 'cococo' from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.action_chains import ActionChains import sys import datetime import re from base.baseclasses import wait from base.baseclasses import print_and_flush from base.baseclasses import Browser from base.baseclasses import SendtoDB from base.baseclasses import enter_element class Asistance: def __init__(self): self.re_clubid = re.compile("search\\.clubid=([\\d]+)") self.re_date = re.compile("search\\.searchdate=([\\d]{4}-[\\d]{2}-[\\d]{2})([\\d]{4}-[\\d]{2}-[\\d]{2})") def clubid(self, url): m = self.re_clubid.search(url) if m is None: return str() else: return m.group(1) def date(self, url): m = self.re_date.search(url) if m is None: return str("Start: ALL, End: ALL") else: return str("Start: " + m.group(1) + ", End: " + m.group(2)) class NaverCafeCrawler: #driver = webdriver.PhantomJS('C:\\Users\\cococo\\AppData\\Roaming\\npm\\node_modules\\phantomjs\\lib\\phantom\\phantomjs.exe') def __init__(self): self.driver = None # webdriver.Ie('C:\\Users\\cococo\\Downloads\\IEDriverServer_x64_2.48.0\\IEDriverServer.exe') # self.driver = webdriver.Chrome('C:\\Users\\cococo\\Downloads\\chromedriver_win32\\chromedriver.exe') # firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX # firefox_capabilities['marionette'] = True # firefox_capabilities['binary'] = ('C:\\Users\\cococo\\Downloads\\wires-0.4.2-win\\wires-0.4.2-win.exe') # webdriver.Chrome() # self.driver = webdriver.Firefox() # self.driver.set_window_size(1600, 900) # self.main_area_crawler = NaverCafeMainAreaCrawler() def set_driver(self, driver): self.driver = driver def suff(self, url): self.driver.get(url) wait(2) def screenshot(self,filename): self.driver.save_screenshot(filename) def html(self): return self.driver.page_source def savepage(self, filename): with open(filename,'w',encoding='UTF8') as f: f.write(self.html()) def naver_login(self, id, password): self.suff('http://www.naver.com') wait(2) element = 
self.driver.find_element_by_id('id') element.send_keys(id) #element = driver.find_element_by_id('label_pw') element = self.driver.find_element_by_id('pw') element.send_keys(password) element.send_keys(Keys.ENTER) wait(3) #element = self.driver.find_element_by_class_name('btn_login') #self.click_element(element) def cafe_search(self, keyword): element = self.driver.find_element_by_id('topLayerQueryInput') element.send_keys(keyword) wait(1) element.send_keys(Keys.ENTER) #element.send_keys(Keys.RETURN) wait(2) def get_url(self): return self.driver.current_url() def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) def start(self): self.main_area_crawler.set_driver(self.driver) self.main_area_crawler.crawl_all_cafe_main() def close(self): self.driver.close() def quit(self): self.driver.quit() class NaverCafeBoardCrawler: def __init__(self, driver=None): self.driver = driver self.content_num_set = set() import re self.re_page = re.compile("search\\.page=([\\d]+)") def clear_content_num_set(self): self.content_num_set.clear() def current_url(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return self.driver.current_url def current_page_num_by_url(self): url = self.current_url() m = self.re_page.search(url) if m is None: return self.current_page_num_by_tag() else: return m.group(1) def current_page_num_by_tag(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return str(1) if page_navigate is None: return str(1) tds = page_navigate.find_elements_by_tag_name('td') for td in tds: try: page_on = td.get_attribute('class') if page_on == 'on': return td.text except: continue return str(1) def move_next_content(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') trs = self.driver.find_elements_by_css_selector("tr[align='center']") for tr in trs: try: content_num = tr.find_element_by_css_selector("span[class='m-tcol-c list-count']") if len(str(content_num.text).strip()) < 1: continue if content_num.text in self.content_num_set: continue self.content_num_set.add(content_num.text) sub = tr.find_element_by_css_selector("a[class='m-tcol-c']") enter_element(sub) return True except: pass return False def move_next_page(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return False if page_navigate is None: return False tds = page_navigate.find_elements_by_tag_name('td') is_next = False for td in tds: if is_next: a = td.find_element_by_tag_name("a") enter_element(a) #self.enter_element(td) return True try: page_on = td.get_attribute('class') if page_on == 'on': is_next = True continue except: continue return False def set_driver(self, driver): self.driver = driver def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) class NaverCafeBodyCrawler: def __init__(self, driver=None): self.driver = driver self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self,
driver): self.driver = driver def find_init(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def find_article_title(self): self.find_init() article_title = self.driver.find_element_by_css_selector("span[class='b m-tcol-c']") return article_title.text def find_article_date(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c date']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. ', ' ').replace('.', '-') + ":00" return article_date def find_article_data(self): self.find_init() article_data = self.driver.find_element_by_css_selector("div[class='tbody m-tcol-c']") return article_data.text def find_article_nickname(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 4: return onclick_attr_list[3].strip().replace("'", "") else: return str() def find_article_id(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 2: return onclick_attr_list[1].strip().replace("'", "") else: return str() def find_article_hit(self): self.find_init() element = self.driver.find_element_by_css_selector("span[class='b m-tcol-c reply _rosReadcount']") return element.text def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'body' def find_platform_title(self): self.driver.switch_to_default_content() element = self.driver.find_element_by_css_selector("span[class='m-tcol-p']") return element.text def find_article_url(self): self.find_init() element = self.driver.find_element_by_css_selector("a[id='linkUrl']") return element.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def print(self): print("article_id = " + self.find_article_id()) print("article_nickname = " + self.find_article_nickname()) print("article_title = " + self.find_article_title()) print("article_date = " + self.find_article_date()) print("article_hit = " + self.find_article_hit()) print("article_url = " + self.find_article_url()) print("platform_title = " + self.find_platform_title()) print("article_data = " + self.find_article_data()) def get_content(self): content = dict() content["article_id"] = self.find_article_id() content["article_nickname"] = self.find_article_nickname() content["article_title"] = self.find_article_title() content["article_date"] = self.find_article_date() content["article_hit"] = self.find_article_hit() content["article_url"] = self.find_article_url() content["article_data"] = self.find_article_data() content["article_form"] = self.find_article_form() content["platform_title"] = self.find_platform_title() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["platform_id"] = self.find_platform_id() return content class NaverCafeReplyCrawler: def __init__(self, driver=None): self.driver = driver self.article_parent = str() self.reply_list = list() self.init_re() def init_re(self): 
self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_comments_element(self): self.find_init() try: self.reply_elements = self.driver.find_element_by_css_selector("ul[class='cmlist']") if self.reply_elements is None: return False return True except: return False def find_init(self): self.count = 0 self.reply_list.clear() self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def set_article_url(self, article_url): self.article_url = article_url def crawl_all(self): has_next_comment_page = True while has_next_comment_page: self.crawl_current_page_reply() has_next_comment_page = self.move_next_comment_page() def move_next_comment_page(self): element = None try: element = self.driver.find_element_by_css_selector("div[id='cmt_paginate']") children = element.find_elements_by_css_selector("*") flag = False for child in children: if flag is True and child.tag_name == "a": enter_element(child) wait(1) self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return True if child.tag_name == "strong": flag = True except Exception as e: print(e) sys.stdout.flush() return False if element is None: return False return False def crawl_current_page_reply(self): lis = self.reply_elements.find_elements_by_tag_name('li') for li in lis: if li.get_attribute('class') == 'reply': self.crawl_reply_reply(li) elif len(li.get_attribute('class')) < 1: self.crawl_reply(li) else: pass def find_article_url(self, li=None): return self.article_url def find_article_date(self, li): element = li.find_element_by_css_selector("span[class='date m-tcol-c filter-50']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. 
', ' ').replace('.', '-') + ":00" return article_date def find_article_data(self, li): element = li.find_element_by_css_selector("span[class='comm_body']") article_data = element.text return article_data def find_article_parent(self, li): try: element = li.find_element_by_css_selector("a[class='m-tcol-c filter-50 nick']") article_parent = element.text return article_parent except: return self.article_parent def find_article_id(self, li): element = li.find_element_by_css_selector("input[name='writerid']") article_id = element.get_attribute('value') return article_id def find_article_nickname(self, li): article_nickname = li.find_element_by_css_selector("td[class='p-nick']") return article_nickname.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def crawl_reply(self, li): article_nickname = self.find_article_nickname(li) self.article_parent = article_nickname article_order = self.count self.count += 1 content = dict() content["article_id"] = self.find_article_id(li) content["article_nickname"] = self.find_article_nickname(li) content["article_date"] = self.find_article_date(li) content["article_data"] = self.find_article_data(li) content["article_order"] = article_order content["article_form"] = self.find_article_form() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["article_url"] = self.find_article_url() content["platform_id"] = self.find_platform_id() self.reply_list.append(content) def crawl_reply_reply(self, li): article_parent = self.find_article_parent(li) article_order = self.count self.count += 1 content = dict() content["article_id"] = self.find_article_id(li) content["article_nickname"] = self.find_article_nickname(li) content["article_date"] = self.find_article_date(li) content["article_data"] = self.find_article_data(li) content["article_order"] = article_order content["article_parent"] = article_parent content["article_form"] = self.find_article_form() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["article_url"] = self.find_article_url() content["platform_id"] = self.find_platform_id() self.reply_list.append(content) def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'reply' def get_content(self): return self.reply_list def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) # class NaverCafeInit: # pymysql = __import__('pymysql.cursors') # url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=" # url_second = "&search.searchdate=" # url_third = "&search.searchBy=0&search.query=" # url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0" # # def __init__(self, before_day=0): # self.conn = self.pymysql.connect(host ='bigbird.iptime.org', # user='admin', passwd='admin123', # db='concepters', charset='utf8', # cursorclass=self.pymysql.cursors.DictCursor) # self.urls = dict() # self.before_day = before_day # # def set_before_day(self, before_day): # if type(before_day) == str: # self.before_day = int(before_day) # elif type(before_day) == int: # self.before_day = before_day # # def set_until_page(self, until_page): # if 
type(until_page) == str: # self.before_day = int(until_page) # elif type(until_page) == int: # self.before_day = until_page # # def split_searches(self): # search = self.searches() # splited_list = search.split(',') # trimmed_list = list() # for x in splited_list: # trimmed_list.append(self.euc_kr(x.strip())) # return trimmed_list # # def get_keyword_parameters(self, keyword_id): # query = "select * from keyword where id = " + str(keyword_id) # try: # with self.conn.cursor() as cursor: # cursor.execute(query) # self.params = cursor.fetchone() # return self.params # except Exception as e: # print(e) # sys.stdout.flush() # exit(1) # return dict() # # def get_naver_cafe_list(self): # query = "select url, clubid from navercafelist" # if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: # pass # else: # query += (" where group_num = " + str(self.authorship())) # try: # with self.conn.cursor() as cursor: # cursor.execute(query) # list_result = cursor.fetchall() # for i in list_result: # self.urls[i["url"]] = i["clubid"] # except Exception as e: # print(e) # sys.stdout.flush() # exit(1) # return self.urls # # def start_day(self): # return self.params["start"] # # def end_day(self): # return self.params["end"] # # def keyword_id(self): # return self.params["id"] # # def realtime(self): # return self.params["realtime"] # # def searches(self): # return self.params["searches"] # # def authorship(self): # return self.params["authorship"] # # def platform(self): # return self.params["platform"] # # def is_realtime(self): # if str(self.realtime()) == '0': # return False # else: # return True # # def euc_kr(self, keyword): # byte_code = list(keyword.encode("euc_kr")) # encoded_keyword = "" # for i in byte_code: # if i == 0x20: # encoded_keyword += "+" # else: # encoded_keyword += str(hex(i)).replace("0x", "%").upper() # return encoded_keyword # # def url_all_days(self): # url_list = list() # for key, val in self.urls.items(): # if self.is_realtime(): # today = datetime.date.today() # url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val) # else: # url = self.make_url(self.start_day(), self.end_day(), val) # for i in url: # url_list.append(i) # return url_list # # def url_day_by_day(self): # one_day = datetime.timedelta(days=1) # url_list = list() # for key, val in self.urls.items(): # if self.is_realtime(): # end = datetime.date.today() # start = end + datetime.timedelta(days=self.before_day) # else: # start = self.start_day() # end = self.end_day() # while start <= end: # url = self.make_url(start, start, val) # for i in url: # url_list.append(i) # start += one_day # return url_list # # def make_url(self, start_day, end_day, clubid): # urls = list() # for x in self.split_searches(): # url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth # urls.append(url) # return urls # # def disconnect(self): # self.conn.close() # # def date_to_str(self, arg_date): # return arg_date.strftime("%Y-%m-%d") class CrawlInit: pymysql = __import__('pymysql.cursors') def __init__(self, before_day=0): self.conn = self.pymysql.connect(host ='bigbird.iptime.org', user='admin', passwd='admin123', db='concepters', charset='utf8', cursorclass=self.pymysql.cursors.DictCursor) self.urls = dict() self.before_day = before_day def set_before_day(self, before_day): if type(before_day) == str: self.before_day = int(before_day) elif type(before_day) == int: self.before_day = before_day def set_until_page(self, 
until_page): if type(until_page) == str: self.until_page = int(until_page) elif type(until_page) == int: self.until_page = until_page def get_keyword_parameters(self, keyword_id): query = "select * from keyword where id = " + str(keyword_id) try: with self.conn.cursor() as cursor: cursor.execute(query) self.params = cursor.fetchone() return self.params except Exception as e: print(e) sys.stdout.flush() exit(1) return dict() def get_naver_cafe_list(self): query = "select url, clubid from navercafelist" if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: pass else: query += (" where group_num = " + str(self.authorship())) try: with self.conn.cursor() as cursor: cursor.execute(query) list_result = cursor.fetchall() for i in list_result: self.urls[i["url"]] = i["clubid"] except Exception as e: print(e) sys.stdout.flush() exit(1) return self.urls def start_day(self): return self.params["start"] def end_day(self): return self.params["end"] def keyword_id(self): return self.params["id"] def realtime(self): return self.params["realtime"] def searches(self): return self.params["searches"] def authorship(self): return self.params["authorship"] def platform(self): return self.params["platform"] def is_realtime(self): if str(self.realtime()) == '0': return False else: return True def euc_kr(self, keyword): byte_code = list(keyword.encode("euc_kr")) encoded_keyword = "" for i in byte_code: if i == 0x20: encoded_keyword += "+" else: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def utf8(self, keyword): byte_code = list(keyword.encode("utf-8")) encoded_keyword = "" for i in byte_code: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def disconnect(self): self.conn.close() def date_to_str(self, arg_date): return arg_date.strftime("%Y-%m-%d") class NaverCafeInit(CrawlInit): url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=" url_second = "&search.searchdate=" url_third = "&search.searchBy=0&search.query=" url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0" def __init__(self, before_day=0): super().__init__(before_day) def url_all_days(self): url_list = list() for key, val in self.urls.items(): if self.is_realtime(): today = datetime.date.today() url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val) else: url = self.make_url(self.start_day(), self.end_day(), val) for i in url: url_list.append(i) return url_list def url_day_by_day(self): one_day = datetime.timedelta(days=1) url_list = list() for key, val in self.urls.items(): if self.is_realtime(): end = datetime.date.today() start = end + datetime.timedelta(days=self.before_day) else: start = self.start_day() end = self.end_day() while start <= end: url = self.make_url(start, start, val) for i in url: url_list.append(i) start += one_day return url_list def split_searches(self): search = self.searches() splited_list = search.split(',') trimmed_list = list() for x in splited_list: trimmed_list.append(self.euc_kr(x.strip())) return trimmed_list def make_url(self, start_day, end_day, clubid): urls = list() for x in self.split_searches(): url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth urls.append(url) return urls class NaverCafeMainAreaCrawler: def __init__(self): self.board_crawler = NaverCafeBoardCrawler() self.body_crawler = NaverCafeBodyCrawler() 
class NaverCafeMainAreaCrawler:
    def __init__(self):
        self.board_crawler = NaverCafeBoardCrawler()
        self.body_crawler = NaverCafeBodyCrawler()
        self.reply_crawler = NaverCafeReplyCrawler()
        self.send_to_db = SendtoDB()
        self.crawl_init = NaverCafeInit()
        self.browser = Browser()
        self.naver_cafe = NaverCafeCrawler()

    def print(self, arg):
        print(arg)
        sys.stdout.flush()

    def set_driver(self, driver):
        self.board_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.naver_cafe.set_driver(driver)
        self.driver = driver

    def copy_list(self, backup_set):
        for i in backup_set:
            self.board_crawler.content_num_set.add(i)

    def crawl_all_cafe_main(self, backup_list=None):
        self.board_crawler.clear_content_num_set()
        if backup_list:
            self.copy_list(backup_list)
        has_next_table = True
        while has_next_table:
            self.print("Page number : " + str(self.board_crawler.current_page_num_by_tag()))
            self.release_memory()
            while self.board_crawler.move_next_content():
                try:
                    self.crawl_body()
                    self.crawl_reply()
                    self.print("ok")
                except Exception as e:
                    self.print("fail")
                    self.print(e)
                self.driver.back()
                wait(1)
            has_next_table = self.board_crawler.move_next_page()

    def crawl_body(self):
        self.body_crawler.set_driver(self.driver)
        content = self.body_crawler.get_content()
        content['keyword_id'] = self.keyword_id
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
        self.print(content['article_url'])

    def crawl_reply(self):
        self.reply_crawler.set_driver(self.driver)
        if self.reply_crawler.find_comments_element():
            self.reply_crawler.set_article_url(self.body_crawler.find_article_url())
            self.reply_crawler.crawl_all()
            self.send_to_db.send_reply(self.reply_crawler.get_content())

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def release_memory_firefox(self):
        # rebuild the current search URL (minus the page parameter) so we can
        # return to it after flushing Firefox's memory via about:memory
        index = self.driver.current_url.find("%26search.page=")
        if index == -1:
            temp_url = self.driver.current_url
        else:
            temp_url = self.driver.current_url[:index]
        temp_page = self.board_crawler.current_page_num_by_tag()
        if temp_page.strip() == "1":
            url = temp_url
        else:
            url = temp_url + "%26search.page=" + temp_page.strip()
        self.print("Release Memory Process")
        self.driver.get("about:memory")
        wait(2)
        self.driver.execute_script("doMMU()")
        wait(2)
        self.driver.execute_script("doGC()")
        wait(2)
        self.driver.execute_script("doCC()")
        wait(2)
        self.driver.get(url)
        wait(2)
        print_and_flush("reloaded")

    def release_memory_others(self):
        # non-Firefox browsers: a plain reload of the current page is enough
        temp_url = self.driver.current_url
        self.print("Release Memory Process")
        self.driver.get(temp_url)
        wait(2)
        print_and_flush("reloaded")

    def release_memory(self):
        # flush memory on page numbers congruent to 1 mod 5 (6, 11, 16, ...);
        # non-Firefox browsers additionally skip page 1
        if self.browser.info == "firefox":
            if (int(self.board_crawler.current_page_num_by_tag()) % 5) == 1:
                self.release_memory_firefox()
        else:
            if (int(self.board_crawler.current_page_num_by_tag()) != 1) and ((int(self.board_crawler.current_page_num_by_tag()) % 5) == 1):
                self.release_memory_others()

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element(element).click().perform()
        wait(2)

    def start(self):
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        if type(keyword_id) != int:
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.get_naver_cafe_list()
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawl_start(self):
        # NOTE: login credentials are hardcoded here
        naver_id = "ehotnsdl1234"
        naver_password = "66556655*"
        asis = Asistance()
        self.naver_cafe.naver_login(naver_id, naver_password)
        wait(5)
        real_time = True
        while real_time:
            print_and_flush("Crawler Start")
            url_list = self.crawl_init.url_all_days()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    print_and_flush(url_list[i] + "\n")
                    print_and_flush("clubid: " + asis.clubid(url_list[i]))
                    print_and_flush(asis.date(url_list[i]) + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    self.crawl_all_cafe_main(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    # snapshot what was already crawled, restart the browser,
                    # and retry the same URL
                    print_and_flush(e)
                    backup_set = self.board_crawler.content_num_set.copy()
                    self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
                    self.naver_cafe.naver_login(naver_id, naver_password)
                    wait(3)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.send_to_db.close()
        self.driver.quit()
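# The crash-recovery pattern used by crawl_start() above, as a minimal
# self-contained sketch (the helper below is illustrative only and is not
# called anywhere in this module):

def _recovery_loop_sketch(urls, crawl_one, restart_browser):
    """On any failure, keep the already-seen article numbers, restart the
    browser, and retry the same URL; 'seen' lets the retried pass skip
    articles that were stored before the crash."""
    seen = set()
    i = 0
    while i < len(urls):
        try:
            crawl_one(urls[i], seen)  # expected to add crawled ids to 'seen'
            seen.clear()
            i += 1                    # advance only after a clean pass
        except Exception:
            restart_browser()         # 'seen' survives, so the retry skips duplicates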
if __name__ == '__main__':
    """
    argv:
        0 - navercrawl.py
        1 - keyword_id
        2 - data db num
        3 - before_day
    """
    # alternative entry point kept for reference:
    # crawler = NaverCafeCrawler()
    # crawler.naver_login('kyounggoon', 'qorwjd123')
    # crawler.suff('http://cafe.naver.com/imsanbu')
    # crawler.cafe_search('성형')
    # crawler.main_area_crawler.send_to_db.set_db("294")
    # crawler.main_area_crawler.set_keyword_id("111111")
    # crawler.start()
    # crawler.cafe_search_current_page_list()

    if len(sys.argv) < 4:
        print("Failed to start: not enough arguments")
        exit(1)
    else:
        print("Start Python Crawling")

        # initialization
        naver_id = "ehotnsdl1234"
        naver_password = "66556655*"
        naver_init = NaverCafeInit(int(sys.argv[3]))
        naver_init.get_keyword_parameters(sys.argv[1])
        naver_init.get_naver_cafe_list()
        naver_init.disconnect()

        naver_cafe = NaverCafeCrawler()
        browser = Browser()
        # arg: chrome, firefox, ie, opera
        naver_cafe.set_driver(browser.get_new_driver("chrome"))
        wait(5)
        naver_cafe.naver_login(naver_id, naver_password)

        naver_main_area_crawler = NaverCafeMainAreaCrawler()
        naver_main_area_crawler.set_driver(naver_cafe.driver)
        naver_main_area_crawler.set_keyword_id(sys.argv[1])
        naver_main_area_crawler.send_to_db.set_db(sys.argv[2])
        naver_main_area_crawler.browser = browser
        asis = Asistance()

        realtime = True
        while realtime:
            print_and_flush("Crawler Start")
            url_list = naver_init.url_all_days()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    print_and_flush(url_list[i] + "\n")
                    print_and_flush("clubid: " + asis.clubid(url_list[i]))
                    print_and_flush(asis.date(url_list[i]) + "\n")
                    naver_cafe.suff(url_list[i])
                    naver_main_area_crawler.crawl_all_cafe_main(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    print_and_flush(e)
                    # copy as a set for consistency with crawl_start(); the
                    # original built a list here, which copy_list() also accepts
                    backup_set = set(naver_main_area_crawler.board_crawler.content_num_set)
                    naver_cafe.set_driver(browser.new_browser())
                    wait(5)
                    naver_cafe.naver_login(naver_id, naver_password)
                    naver_main_area_crawler.set_driver(naver_cafe.driver)
            realtime = naver_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        naver_cafe.quit()
        naver_main_area_crawler.send_to_db.close()
        print("Exit. Bye :)")
        exit(0)
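# Example invocation (values are illustrative: the keyword_id and db number
# match the commented example above; before_day is presumably a non-positive
# offset, since url_all_days() computes the start date as
# today + timedelta(days=before_day)):
#
#   python navercrawl.py 111111 294 -7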
Bye :)") exit(0) #http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=11262350&search.searchdate=2015-07-302015-10-30&search.searchBy=0&search.query=%BC%BA%C7%FC&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=50&search.media=0 \ No newline at end of file