From cc8122e0746a2f8ca73202c971f1d85f51f7a954 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 7 Dec 2015 03:25:49 +0000 Subject: [PATCH] =?UTF-8?q?WebBasedCrawler=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@229 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- WebBasedCrawler/kakaocrawl.py | 1310 ++++++++++++++++++++++++++++ WebBasedCrawler/navercrawl.py | 1 + WebBasedCrawler/navercrawl11.23.py | 1 + 3 files changed, 1312 insertions(+) create mode 100644 WebBasedCrawler/kakaocrawl.py create mode 100644 WebBasedCrawler/navercrawl.py create mode 100644 WebBasedCrawler/navercrawl11.23.py diff --git a/WebBasedCrawler/kakaocrawl.py b/WebBasedCrawler/kakaocrawl.py new file mode 100644 index 0000000..82feda3 --- /dev/null +++ b/WebBasedCrawler/kakaocrawl.py @@ -0,0 +1,1310 @@ +#-*- coding: utf-8 -*- +__author__ = 'cococo' +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait + +import sys +import re +import datetime +import json +import os +import time + +from navercrawl import wait +from navercrawl import print_and_flush +from navercrawl import SendtoDB +from navercrawl import Browser +from navercrawl import CrawlInit +from selenium.common.exceptions import WebDriverException + +kakaostory_url = 'https://story.kakao.com/' +kakaostory_channel_url = 'https://story.kakao.com/ch/' + +class KakaoBodyCrawler: + def __init__(self, driver=None): + self.driver = driver + self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") + + def set_driver(self, driver): + self.driver = driver + + def set_activity(self, activity): + self.activity = activity + + def find_article_profileurl(self): 
+ img = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a/img") + return img.get_attribute("src") + + def find_article_nickname(self): + a = self.activity.find_element_by_xpath("div/div[@class='add_top']/div[@class='myid']/a") + return a.text + + def find_article_modified_date(self): + try: + span = self.activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span") + except: + return None + ac = ActionChains(self.driver) + ac.move_to_element(span).perform() + wait(0.3) + data_tooltip = span.get_attribute("data-tooltip") + m = self.re_date.search(data_tooltip) + if m is None: + return None + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return temp_date + + def find_article_date(self): + time_modified_date = self.find_article_modified_date() + if time_modified_date is not None: + return time_modified_date + a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']") + a.send_keys(Keys.NULL) + ac = ActionChains(self.driver) + ac.move_to_element(a).perform() + wait(0.2) + data_tooltip = a.get_attribute("data-tooltip") + m = self.re_date.search(data_tooltip) + if m is None: + return "0000-00-00 00:00:00" + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return str(temp_date) + + def find_article_id(self): + a = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a") + href = a.get_attribute("href") + #str_id = href[href.rindex('/') + 1:] + str_id = href.replace(kakaostory_url, "") + return str_id + + def find_article_url(self): + a = 
self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']") + url = a.get_attribute("href") + return url + + def find_platform_name(self): + return "kakaostory" + + def find_platform_form(self): + if self.driver.current_url.find("https://story.kakao.com/ch/") != -1: + return 'channel' + elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1: + return 'tag' + else: + return 'story' + + def find_article_form(self): + return "body" + + def find_article_data(self): + more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/p[@class='more _moreBtnContainer']") + display = more.get_attribute("style") + if display.find('none') == -1: + a = more.find_element_by_tag_name("a") + self.enter_element(a) + try: + content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/div[@class='txt_wrap _content']") + except: + return str("") + return content.text + + def click_element(self, element): + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(element, 0, 0).click().perform() + wait(2) + + def enter_element(self, element): + element.send_keys(Keys.NULL) + element.send_keys(Keys.ENTER) + wait(2) + + def find_platform_id(self): + return self.find_article_id() + + def find_article_title(self): + content = self.find_article_data() + if not content: + return "" + try: + return content.strip().splitlines()[0] + except: + return "" + + def find_feeling_users3(self): + try: + a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]") + except: + return None + self.enter_element(a) + inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']") + like_num = int(str_like.text) + fake_scroll = 
inner_layer.find_element_by_css_selector("div[class='fake_scroll']") + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() + while len(fake_scroll.find_elements_by_tag_name("li")) < like_num: + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.drag_and_drop_by_offset(scroll, 0, 15).perform() + wait(1) + lis = fake_scroll.find_elements_by_tag_name("li") + data = list() + for li in lis: + try: + a = li.find_element_by_xpath("a[@class='link_people']") + href = a.get_attribute('href') + # str_id = href[href.rindex('/') + 1:] + str_id = href.replace(kakaostory_url, "") + img = a.find_element_by_css_selector("img[class='img_thumb']") + profileurl = img.get_attribute('src') + data.append({'id': str_id, 'profileurl': profileurl}) + except WebDriverException: + raise WebDriverException + except Exception as e: + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print_and_flush(e) + feelings = dict() + feelings['data'] = data + feelings['count'] = len(data) + a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']") + self.click_element(a) + wait(1) + return feelings + + def find_reply_users(self): + try: + a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewComments' and not(@style)]") + except: + return None + count = a.find_element_by_css_selector("strong._commentCount").text + if len(count.strip()) < 1: + return None + else: + return int(count.replace(",", "").strip()) + + def find_feeling_users(self): + try: + a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and 
not(@style)]") + except: + return None + self.enter_element(a) + # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']"))) + str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']") + like_num = int(str_like.text.replace(",", "")) + # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") + fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']"))) + start_time = time.time() + while len(fake_scroll.find_elements_by_tag_name("li")) < like_num: + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() + ac.drag_and_drop_by_offset(scroll, 0, 30).perform() + wait(0.5) + if time.time() - start_time > 600.0: + break + ul = fake_scroll.find_element_by_tag_name("ul") + data = list() + try: + a_list = ul.find_elements_by_css_selector("a[class='link_people']") + # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']") + for i in range(0, len(a_list)): + href = a_list[i].get_attribute('href') + str_id = href.replace(kakaostory_url, "") + # profileurl = img_list[i].get_attribute('src') + # data.append({'id': str_id, 'profileurl': profileurl}) + data.append({'id': str_id}) + except WebDriverException: + raise WebDriverException + except Exception as e: + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print_and_flush(e) + feelings = dict() + feelings['data'] = data + feelings['count'] = len(data) + a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']") + self.click_element(a) + wait(1) + return 
feelings + + def find_feeling_users2(self): + try: + a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]") + except: + return None + self.enter_element(a) + inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + scroll_style = scroll.get_attribute("style") + re_height = re.compile("height: ([0-9]*\\.[0-9]+|[0-9]+)px") + re_top = re.compile("top: ([0-9]*\\.[0-9]+|[0-9]+)px") + m_h = re_height.search(scroll_style) + m_t = re_top.search(scroll_style) + if m_t is None: + top = 0.0 + else: + top = float(m_t.group(1)) + if m_h is None: + height = 0.0 + else: + height = float(m_h.group(1)) + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() + i = 0 + data = list() + while height + top < 320: + lis = fake_scroll.find_elements_by_tag_name("li") + for j in range(i, (i+6) if i+6 < len(lis) else len(lis)): + a = lis[j].find_element_by_xpath("a[@class='link_people']") + href = a.get_attribute('href') + str_id = href[href.rindex('/') + 1:] + em = a.find_element_by_css_selector("em[class='tit_userinfo']") + nickname = em.text + span = a.find_element_by_css_selector("span[class='txt_feel']") + emotion = span.text + img = a.find_element_by_css_selector("img[class='img_thumb']") + profileurl = img.get_attribute('src') + data.append({'id': str_id, 'nickname': nickname, 'emotion': emotion, 'profileurl': profileurl}) + i += 6 + move_pixel = 1968.0 / len(fake_scroll.find_elements_by_tag_name("li")) + ac = ActionChains(self.driver) + ac.drag_and_drop_by_offset(scroll, 0, move_pixel).perform() + wait(1) + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + scroll_style = 
scroll.get_attribute("style") + m_h = re_height.search(scroll_style) + m_t = re_top.search(scroll_style) + if m_t is None: + top = 0.0 + else: + top = float(m_t.group(1)) + if m_h is None: + height = 0.0 + else: + height = float(m_h.group(1)) + feelings = dict() + feelings['data'] = data + feelings['count'] = len(data) + a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']") + self.click_element(a) + return feelings + + def find_share_users2(self): + try: + a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]") + except: + return None + self.enter_element(a) + inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']") + re_share = re.compile("\\(([\\d]+)\\)") + m = re_share.search(str_share.text) + if m is None: + share_num = 0 + else: + share_num = int(m.group(1)) + fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() + while len(fake_scroll.find_elements_by_tag_name("li")) < share_num: + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.drag_and_drop_by_offset(scroll, 0, 15).perform() + wait(1) + lis = fake_scroll.find_elements_by_tag_name("li") + data = list() + for li in lis: + try: + a = li.find_element_by_xpath("a[@class='link_people']") + href = a.get_attribute('href') + last_slush = href.rindex('/') + # begin_slush = href[:last_slush].rindex('/') + # str_id = href[begin_slush+1:last_slush] + str_id = href[:last_slush].replace(kakaostory_url, "") + img = a.find_element_by_css_selector("img[class='img_thumb']") + 
profileurl = img.get_attribute('src') + data.append({'id': str_id, 'profileurl': profileurl}) + except WebDriverException: + raise WebDriverException + except Exception as e: + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print_and_flush(e) + shares = dict() + shares['data'] = data + shares['count'] = len(data) + a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']") + self.click_element(a) + return shares + + def find_share_users(self): + try: + a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]") + except: + return None + self.enter_element(a) + # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']"))) + str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']") + re_share = re.compile("\\(([\\d]+)\\)") + m = re_share.search(str_share.text) + if m is None: + share_num = 0 + else: + share_num = int(m.group(1).replace(",", "")) + # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") + fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']"))) + start_time = time.time() + while len(fake_scroll.find_elements_by_tag_name("li")) < share_num: + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() + ac.drag_and_drop_by_offset(scroll, 0, 30).perform() + wait(0.5) + if time.time() - start_time > 600.0: + break + ul = fake_scroll.find_element_by_tag_name("ul") + data = list() + 
try: + a_list = ul.find_elements_by_css_selector("a[class='link_people']") + # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']") + for i in range(0, len(a_list)): + href = a_list[i].get_attribute('href') + last_slush = href.rindex('/') + # begin_slush = href[:last_slush].rindex('/') + # str_id = href[begin_slush+1:last_slush] + str_id = href[:last_slush].replace(kakaostory_url, "") + # profileurl = img_list[i].get_attribute('src') + # data.append({'id': str_id, 'profileurl': profileurl}) + data.append({'id': str_id}) + except WebDriverException: + raise WebDriverException + except Exception as e: + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print_and_flush(e) + shares = dict() + shares['data'] = data + shares['count'] = len(data) + a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']") + self.click_element(a) + return shares + + def find_platform_title(self): + return self.driver.title + + def get_content(self): + content = dict() + content["article_id"] = self.find_article_id() + # print_and_flush("article_id") + content["article_nickname"] = self.find_article_nickname() + # print_and_flush("article_nickname") + content["article_title"] = self.find_article_title() + # print_and_flush("article_title") + content["article_date"] = self.find_article_date() + # print_and_flush("article_date") + #content["article_hit"] = self.find_article_hit() + content["article_url"] = self.find_article_url() + # print_and_flush("article_url") + content["article_data"] = self.find_article_data() + # print_and_flush("article_data") + content["article_form"] = self.find_article_form() + # print_and_flush("article_form") + content["article_profileurl"] = self.find_article_profileurl() + # print_and_flush("article_profileurl") + #content["platform_title"] = self.find_platform_title() + content["platform_title"] = 
content["article_nickname"] + # print_and_flush("platform_title") + content["platform_name"] = self.find_platform_name() + if content["article_url"].find(kakaostory_channel_url) != -1: + content["platform_form"] = "channel" + else: + content["platform_form"] = "story" + # print_and_flush("platform_form") + content["platform_id"] = self.find_platform_id() + # print_and_flush("platform_id") + data = list() + # print_and_flush("start feelings") + feelings = self.find_feeling_users() + # print_and_flush("feelings") + # print_and_flush("done feelings") + if feelings is not None: + data.append({"feelings": feelings}) + content["article_profile"] = str(feelings["count"]) + # print_and_flush("start shares") + shares = self.find_share_users() + # print_and_flush("shares") + # print_and_flush("done shares") + if shares is not None: + data.append({"shares": shares}) + content["reply_url"] = str(shares["count"]) + if data: + json_data = {"data": data} + content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode(json_data) + reply_count = self.find_reply_users() + if type(reply_count) == int: + content["article_order"] = reply_count + return content + + +class KakaoReplyCrawler_backup: + def __init__(self, driver=None, activity=None): + self.driver = driver + self.activity = activity + self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") + self.reply_list = list() + self.order = 0 + + def find_init(self): + self.reply_list.clear() + self.order = 0 + + def set_driver(self, driver): + self.driver = driver + + def set_activity(self, activity): + self.activity = activity + + def has_more(self): + more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']") + if more.get_attribute('style').find('block') != -1: + return True + else: + return False + + def read_more_reply(self): + more = self.activity.find_element_by_css_selector("p[class='more 
_commentMoreBtnContainer']") + a = more.find_element_by_css_selector("a[class='_btnCommentMore']") + self.enter_element(a) + + def read_all_reply(self): + while self.has_more(): + self.read_more_reply() + + def get_reply_lis(self): + ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul") + lis = ul.find_elements_by_tag_name("li") + return lis + + def has_reply(self): + try: + ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul") + lis = ul.find_elements_by_tag_name("li") + if len(lis) > 0: + return True + else: + return False + except: + return False + + def crawl_reply(self, li): + content = dict() + content["article_id"] = self.find_article_id(li) + content["article_nickname"] = self.find_article_nickname(li) + content["article_date"] = self.find_article_date(li) + content["article_data"] = self.find_article_data(li) + content["article_order"] = self.order + content["article_url"] = self.find_article_url(li) + content["platform_id"] = self.find_platform_id(li) + content["article_form"] = self.find_article_form() + content["article_profileurl"] = self.find_article_profileurl(li) + content["platform_name"] = self.find_platform_name() + if content["article_url"].find(kakaostory_channel_url) != -1: + content["platform_form"] = "channel" + else: + content["platform_form"] = "story" + article_parent = self.find_article_parent(li) + if article_parent is not None: + content["article_parent"] = article_parent + self.order += 1 + self.reply_list.append(content) + + def get_content(self): + return self.reply_list + + def crawl_all(self): + self.find_init() + self.read_all_reply() + try: + lis = self.get_reply_lis() + for li in lis: + self.crawl_reply(li) + except WebDriverException: + raise WebDriverException + except Exception as e: + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print_and_flush(e) + + def 
find_article_id(self, li): + a = li.find_element_by_xpath("div[@class='pf']/a") + href = a.get_attribute('href') + str_id = href.replace(kakaostory_url, "").strip() + return str_id + + def find_article_profileurl(self, li): + img = li.find_element_by_xpath("div[@class='pf']/a/img") + return img.get_attribute('src') + + def find_article_nickname(self, li): + a = li.find_element_by_xpath("div[@class='txt']/p/a[@data-profile-popup]") + # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']") + # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel + return a.text + + def find_article_date(self, li): + a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']") + # a.send_keys(Keys.NULL) + # ac = ActionChains(self.driver) + # ac.move_to_element(a).perform() + # wait(0.1) + # data_tooltip = a.get_attribute("data-tooltip") + data_tooltip = a.get_attribute("title") + #a.get_attribute('title') <-- data_tooltip + m = self.re_date.search(data_tooltip) + if m is None: + return "0000-00-00 00:00:00" + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return str(temp_date) + + def find_article_parent(self, li): + comment = li.find_element_by_xpath("div[@class='txt']") + try: + a = comment.find_element_by_xpath("a[@data-profile-popup]") + return a.text + except: + return None + + def find_article_data(self, li): + all_element = li.find_element_by_xpath("div[@class='txt']") + all_text = all_element.text + p = all_element.find_element_by_tag_name('p') + p_text = p.text + return all_text[len(p_text):].strip() + + def find_article_url(self, li): + a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']") + href = a.get_attribute("href") + return 
href[:href.rindex('/')] + + def find_platform_id(self, li): + article_url = self.find_article_url(li) + main_url = article_url[:article_url.rindex('/')] + #return main_url[main_url.rindex('/')+1:] + return main_url.replace(kakaostory_url, "") + + def find_article_form(self, li=None): + return 'reply' + + def find_platform_name(self, li=None): + return 'kakaostory' + + def find_platform_form(self, li=None): + if self.driver.current_url.find("https://story.kakao.com/ch/") != -1: + return 'channel' + elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1: + return 'tag' + else: + return 'story' + + def click_element(self, element): + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(element, 0, 0).click().perform() + wait(2) + + def enter_element(self, element): + element.send_keys(Keys.NULL) + element.send_keys(Keys.ENTER) + wait(2) + + def find_like_count(self, li): + try: + like = li.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']") + return like.text + except: + return '0' + + +class KakaoReplyCrawler: + def __init__(self, driver=None, activity=None): + self.driver = driver + self.activity = activity + self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") + self.reply_list = list() + self.order = 0 + + def find_init(self): + self.reply_list.clear() + self.order = 0 + + def set_driver(self, driver): + self.driver = driver + + def set_activity(self, activity): + self.activity = activity + + def has_more(self): + more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']") + if more.get_attribute('style').find('block') != -1: + return True + else: + return False + + def read_more_reply(self): + more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']") + a = more.find_element_by_css_selector("a[class='_btnCommentMore']") + 
self.enter_element(a) + + def read_all_reply(self): + start_time = time.time() + while self.has_more(): + self.read_more_reply() + if time.time() - start_time > 600.0: + raise WebDriverException + + def get_reply_ul(self): + ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul") + return ul + + def has_reply(self): + try: + ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul") + lis = ul.find_elements_by_tag_name("li") + if len(lis) > 0: + return True + else: + return False + except: + return False + + def crawl_reply(self, ul): + article_id = self.find_article_id(ul) + article_nickname = self.find_article_nickname(ul) + article_date = self.find_article_date(ul) + article_data = self.find_article_data(ul) + article_url = self.find_article_url(ul) + platform_id = self.find_platform_id(ul) + article_profileurl = self.find_article_profileurl(ul) + article_parent = self.find_article_parent(ul) + # print_and_flush(str(len(article_id))) + # print_and_flush(str(len(article_nickname))) + # print_and_flush(str(len(article_date))) + # print_and_flush(str(len(article_data))) + # print_and_flush(str(len(article_url))) + # print_and_flush(str(len(platform_id))) + # print_and_flush(str(len(article_profileurl))) + # print_and_flush(str(len(article_parent))) + if article_url[0].find(kakaostory_channel_url) != -1: + platform_form = "channel" + else: + platform_form = "story" + for i in range(0, len(article_id)): + content = dict() + content["article_id"] = article_id[i] + content["article_nickname"] = article_nickname[i] + content["article_profileurl"] = article_profileurl[i] + content["article_url"] = article_url[i] + content["platform_id"] = platform_id[i] + content["article_date"] = article_date[i] + content["article_data"] = article_data[i] + content["platform_form"] = platform_form + content["article_order"] = i + content["platform_name"] = self.find_platform_name() + content["article_form"] = self.find_article_form() + if 
len(article_parent[i]) > 0: + content["article_parent"] = article_parent[i] + self.reply_list.append(content) + + def get_content(self): + return self.reply_list + + def crawl_all(self): + self.find_init() + self.read_all_reply() + try: + ul = self.get_reply_ul() + self.crawl_reply(ul) + except WebDriverException: + raise WebDriverException + except Exception as e: + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print_and_flush(e) + + def find_article_id(self, ul): + a_list = ul.find_elements_by_xpath("li/div[@class='pf']/a") + str_id_list = list() + for a in a_list: + href = a.get_attribute('href') + str_id = href.replace(kakaostory_url, "").strip() + str_id_list.append(str_id) + return str_id_list + + def find_article_profileurl(self, ul): + img = ul.find_elements_by_xpath("li/div[@class='pf']/a/img") + img_list = list() + for im in img: + img_list.append(im.get_attribute('src')) + return img_list + + def find_article_nickname(self, ul): + a = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@data-profile-popup]") + # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']") + # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel + nickname_list = list() + for i in a: + nickname_list.append(i.text) + return nickname_list + + def find_article_date(self, ul): + a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']") + # a.send_keys(Keys.NULL) + # ac = ActionChains(self.driver) + # ac.move_to_element(a).perform() + # wait(0.1) + # data_tooltip = a.get_attribute("data-tooltip") + date_list = list() + for a in a_list: + data_tooltip = a.get_attribute("title") + m = self.re_date.search(data_tooltip) + if m is None: + date_list.append("0000-00-00 00:00:00") + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), 
int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + date_list.append(str(temp_date)) + return date_list + + def find_article_parent(self, ul): + comments = ul.find_elements_by_xpath("li/div[@class='txt']") + article_parents = list() + for comment in comments: + try: + a = comment.find_element_by_xpath("a[@data-profile-popup]") + article_parents.append(a.text) + except: + article_parents.append("") + return article_parents + + def find_article_data(self, ul): + all_elements = ul.find_elements_by_xpath("li/div[@class='txt']") + all_elements_p = ul.find_elements_by_xpath("li/div[@class='txt']/p") + all_text_list = list() + for i in range(0, len(all_elements)): + all_text = all_elements[i].text + p_text = all_elements_p[i].text + all_text_list.append(all_text[len(p_text):].strip()) + return all_text_list + + def find_article_url(self, ul): + a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']") + article_url_list = list() + for a in a_list: + href = a.get_attribute("href") + article_url_list.append(href[:href.rindex('/')]) + return article_url_list + + def find_platform_id(self, ul): + article_urls = self.find_article_url(ul) + platform_id = list() + for article_url in article_urls: + main_url = article_url[:article_url.rindex('/')] + #return main_url[main_url.rindex('/')+1:] + platform_id.append(main_url.replace(kakaostory_url, "")) + return platform_id + + def find_article_form(self, ul=None): + return 'reply' + + def find_platform_name(self, ul=None): + return 'kakaostory' + + def find_platform_form(self, ul=None): + if self.driver.current_url.find("https://story.kakao.com/ch/") != -1: + return 'channel' + elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1: + return 'tag' + else: + return 'story' + + def click_element(self, element): + ac = ActionChains(self.driver) + 
ac.move_to_element_with_offset(element, 0, 0).click().perform() + wait(2) + + def enter_element(self, element): + element.send_keys(Keys.NULL) + element.send_keys(Keys.ENTER) + wait(2) + + def find_like_count(self, ul): + try: + like = ul.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']") + return like.text + except: + return '0' + + +class KakaoPageCrawler: + def __init__(self, driver=None, begin_date=None, end_date=None): + self.driver = driver + self.activity_data_model_set = set() + self.begin_date = begin_date + self.end_date = end_date + self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") + self.index = 0 + self.activities = None + self.present_activity = 0 + self.previous_activity = 0 + self.reload_count = 0 + + def move_to_url(self, url): + self.driver.get(url) + self.index = 0 + self.activity_data_model_set.clear() + + def init(self): + self.index = 0 + self.previous_activity = 0 + self.activities = None + self.activity_data_model_set.clear() + + def set_date(self, begin_date, end_date): + self.set_begin_date(begin_date) + self.set_end_date(end_date) + + def set_end_date(self, end_date): + if type(end_date) == str: + self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d') + elif type(end_date) == datetime.datetime or type(end_date) == datetime.date: + self.end_date = end_date + else: + self.end_date = datetime.datetime.today() + self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day) + self.end_date += datetime.timedelta(days=1) + + def set_begin_date(self, begin_date): + if type(begin_date) == str: + self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d') + elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date: + self.begin_date = begin_date + else: + self.begin_date = datetime.datetime.today() + self.begin_date 
= datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day) + + def next_activity_backup(self): + try: + if not self.activities: + self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[class='section _activity']"))) + self.index = 0 + if len(self.activities) == 0: + print_and_flush("activities are not found") + self.activities = None + return None + except: + print_and_flush("activities are not found") + self.activities = None + return None + has_more_activities = True + self.present_activity = len(self.activities) + while has_more_activities: + for activity in self.activities[self.previous_activity:]: + if activity.get_attribute("data-model") in self.activity_data_model_set: + continue + self.activity_data_model_set.add(activity.get_attribute("data-model")) + time_date = self.find_article_date(activity) + if self.is_earlier(time_date): + self.activities = None + return None + if self.is_late(time_date): + continue + return activity + self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(self.activities) == self.present_activity: + has_more_activities = self.load_more_activities() + self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + else: + has_more_activities = True + self.previous_activity = self.present_activity + self.present_activity = len(self.activities) + self.activities = None + return None + + def next_activity(self): + try: + if self.activities is None: + self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located( + (By.CSS_SELECTOR, "div[class='section _activity']")) + ) + if len(self.activities) == 0: + print_and_flush("activities are not found") + self.activities = None + return None + except: + print_and_flush("activities are not found") + self.activities = None + return None + while True: + self.index += 1 + if self.index >= 
len(self.activities): + self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if self.index >= len(self.activities): + if self.load_more_activities() is False: + self.activities = None + return None + self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if self.activities[self.index - 1].get_attribute("data-model") in self.activity_data_model_set: + continue + time_date = self.find_article_date(self.activities[self.index - 1]) + time_modified_date = self.find_article_modified_date(self.activities[self.index - 1]) + if time_modified_date is not None: + time_date = time_modified_date + print_and_flush(str(time_date)) + if type(time_date) == str: + continue + if self.is_earlier(time_date): + self.activities = None + return None + if self.is_late(time_date): + continue + return self.activities[self.index - 1] + + def crawling_ok(self): + self.activity_data_model_set.add(self.activities[self.index - 1].get_attribute("data-model")) + + def next_activity_prepare(self): + try: + activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(activities) == 0: + return None + except: + return None + has_more_activities = True + while has_more_activities: + if self.index < len(activities): + temp_index = self.index + self.index += 1 + time_date = self.find_article_date(activities[temp_index]) + if self.is_earlier(time_date): + return None + if self.is_late(time_date): + continue + return activities[temp_index] + else: + has_more_activities = self.load_more_activities() + activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + return None + + def load_more_activities(self): + previous_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + for i in range(0, 5): + print_and_flush("Try load more") + body = self.driver.find_element_by_tag_name("body") + body.send_keys(Keys.NULL) + 
body.send_keys(Keys.END) + wait(4) + present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(previous_activities) != len(present_activities): + wait(2) + self.reload_count = 0 + return True + for i in range(0, 5): + print_and_flush("Try load more") + body = self.driver.find_element_by_tag_name("body") + for j in range(0, 3): + body.send_keys(Keys.PAGE_UP) + wait(0.1) + for j in range(0, 50): + body.send_keys(Keys.PAGE_DOWN) + wait(0.1) + wait(4) + present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(previous_activities) != len(present_activities): + wait(2) + self.reload_count = 0 + return True + for i in range(0, 10): + print_and_flush("Try load more") + self.driver.execute_script("window.scrollBy(0, 800)") + wait(4) + present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(previous_activities) != len(present_activities): + wait(2) + self.reload_count = 0 + return True + if self.reload_count < 10: + print_and_flush("index reload") + self.reload_count += 1 + self.index //= 2 + position = self.driver.get_window_position() + size = self.driver.get_window_size() + self.driver.maximize_window() + self.driver.set_window_size(size['width'], size["height"]) + self.driver.set_window_position(position['x'], position['y']) + return True + if self.reload_count < 15: + print_and_flush("refresh") + self.driver.refresh() + wait(5) + self.index = 0 + self.reload_count += 1 + return True + return False + + def is_earlier(self, time_date): + return True if time_date < self.begin_date else False + + def is_late(self, time_date): + return True if time_date > self.end_date else False + + def set_driver(self, driver): + self.driver = driver + + def find_article_date(self, activity): + a = activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']") + a.send_keys(Keys.NULL) + ac = ActionChains(self.driver) + 
ac.move_to_element(a).perform() + wait(0.5) + ac.move_to_element(a).perform() + wait(0.5) + data_tooltip = a.get_attribute("data-tooltip") + m = self.re_date.search(data_tooltip) + if m is None: + data_tooltip = a.get_attribute("title") + m = self.re_date.search(data_tooltip) + if m is None: + return "0000-00-00 00:00:00" + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return temp_date + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return temp_date + + def find_article_modified_date(self, activity): + try: + span = activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span") + except: + return None + ac = ActionChains(self.driver) + ac.move_to_element(span).perform() + wait(0.8) + data_tooltip = span.get_attribute("data-tooltip") + wait(0.2) + m = self.re_date.search(data_tooltip) + if m is None: + return None + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return temp_date + + +class KakaoMainCrawler: + def __init__(self): + self.page_crawler = KakaoPageCrawler() + self.body_crawler = KakaoBodyCrawler() + self.reply_crawler = KakaoReplyCrawler() + self.send_to_db = SendtoDB() + self.driver = None + self.browser = None + + def set_driver(self, driver): + self.page_crawler.set_driver(driver) + self.body_crawler.set_driver(driver) + self.reply_crawler.set_driver(driver) + self.driver = driver + + def 
set_keyword_id(self, keyword_id): + self.keyword_id = keyword_id + + def crawl_all_current_url(self, backup_set=None): + self.page_crawler.init() + if backup_set: + self.page_crawler.activity_data_model_set = backup_set.copy() + while True: + activity = self.page_crawler.next_activity() + if activity is None: + break + try: + self.crawl_body(activity) + self.crawl_reply(activity) + self.page_crawler.crawling_ok() + print_and_flush("ok") + except WebDriverException as ee: + print_and_flush(ee) + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print_and_flush("fail") + raise WebDriverException + except Exception as e: + print_and_flush("failed") + exc_type, exc_obj, exc_tb = sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + print_and_flush(e) + + def crawl_body(self, activity): + # print_and_flush("start body crawl") + self.body_crawler.set_driver(self.driver) + self.body_crawler.set_activity(activity) + content = self.body_crawler.get_content() + content["keyword_id"] = self.keyword_id + print_and_flush(content["article_url"]) + self.send_to_db.delete_url(content['article_url']) + self.send_to_db.send_body(content) + + def crawl_reply(self, activity): + # print_and_flush("start reply crawl") + self.reply_crawler.set_driver(self.driver) + self.reply_crawler.set_activity(activity) + if self.reply_crawler.has_reply(): + self.reply_crawler.crawl_all() + self.send_to_db.send_reply(self.reply_crawler.get_content()) + + +class KakaoInit(CrawlInit): + def __init__(self, before_day=0): + super().__init__(before_day) + self.urls = dict() + self.urls[6] = "https://story.kakao.com/ch/" + self.urls[7] = "https://story.kakao.com/hashtag/" + self.urls[8] = "https://story.kakao.com/" + + def split_searches(self): + search = self.searches() + splited_list = search.split(',') + trimmed_list = list() + if 
self.platform() == 6 or self.platform() == 8: + for x in splited_list: + trimmed_list.append(x.strip()) + else: + for x in splited_list: + trimmed_list.append(self.utf8(x.strip())) + return trimmed_list + + def make_url(self): + urls = list() + for x in self.split_searches(): + url = self.urls[self.platform()] + x + urls.append(url) + return urls + + def get_begin_day(self): + if self.is_realtime(): + date_now = datetime.datetime.now() + result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) + result += datetime.timedelta(days=self.before_day) + return result + else: + return self.start_day() + + def get_end_day(self): + if self.is_realtime(): + date_now = datetime.datetime.now() + result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) + return result + else: + return self.end_day() + + +if __name__ == '__main__': + """ + argv: + 0 - kakaocrawl.py + 1 - keyword_id + 2 - data db num + 3 - before_day + 4 - until_page + """ + + if len(sys.argv) < 5: + print("Fail to process execute") + exit(1) + else: + print("Start Python Crawling") + + kakao_init = KakaoInit(int(sys.argv[3])) + kakao_init.get_keyword_parameters(sys.argv[1]) + kakao_init.disconnect() + browser = Browser() + kakao_main = KakaoMainCrawler() + kakao_main.set_driver(browser.get_new_driver("chrome")) + # kakao_main.driver.implicitly_wait(5) + wait(3) + kakao_main.set_keyword_id(sys.argv[1]) + kakao_main.send_to_db.set_db(sys.argv[2]) + realtime = True + while realtime: + print_and_flush("Crawler Start") + url_list = kakao_init.make_url() + i = 0 + backup_set = set() + while i < len(url_list): + try: + print_and_flush(url_list[i] + "\n") + kakao_main.driver.get(url_list[i]) + wait(3) + kakao_main.page_crawler.set_date(begin_date=kakao_init.get_begin_day(), + end_date=kakao_init.get_end_day()) + kakao_main.crawl_all_current_url(backup_set) + i += 1 + backup_set.clear() + except Exception as e: + print_and_flush(e) + exc_type, exc_obj, exc_tb = 
sys.exc_info() + fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] + print(exc_type, fname, exc_tb.tb_lineno) + backup_set = kakao_main.page_crawler.activity_data_model_set.copy() + kakao_main.set_driver(browser.new_browser()) + # kakao_main.driver.implicitly_wait(5) + wait(5) + realtime = kakao_init.is_realtime() + print_and_flush("Finished Crawling :)") + # kakao_main.driver.quit() + kakao_main.send_to_db.close() + print_and_flush("ByeBye :)") + + exit(0) \ No newline at end of file diff --git a/WebBasedCrawler/navercrawl.py b/WebBasedCrawler/navercrawl.py new file mode 100644 index 0000000..a8e99f4 --- /dev/null +++ b/WebBasedCrawler/navercrawl.py @@ -0,0 +1 @@ +#-*- coding: utf-8 -*- __author__ = 'cococo' from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.action_chains import ActionChains import threading from time import localtime, strftime import time import os import sys import datetime import psutil import re def fcntwait(n): time.sleep(n) def wait(n): th = threading.Thread(target=fcntwait, args=(n,)) th.start() th.join() def insert_log(msg): pid = os.getpid() tm = strftime("%Y_%m_%d", localtime()) filename = tm + "_" + str(pid) + ".log" total_msg = "[" + strftime("%Y_%m_%d %H:%M:%S", localtime()) + "] " + msg with open(filename, "a") as f: f.write(total_msg) f.flush() def print_and_flush(string): print(string) sys.stdout.flush() class Asistance: def __init__(self): self.re_clubid = re.compile("search\\.clubid=([\\d]+)") self.re_date = re.compile("search\\.searchdate=([\\d]{4}-[\\d]{2}-[\\d]{2})([\\d]{4}-[\\d]{2}-[\\d]{2})") def clubid(self, url): m = self.re_clubid.search(url) if m is None: return str() else: return m.group(1) def date(self, url): m = self.re_date.search(url) if m is None: return str("Start: ALL, End: ALL") else: return str("Start: " + m.group(1) + ", End: " + m.group(2)) class Browser: def __init__(self, driver=None): self.driver = driver self.info = "" def 
get_new_driver(self, name): """ windows system: name = chrome, ie, opera, firefox default driver_exec: chromedriver.exe, IEDriverServer.exe, operadriver.exe linux system: name = chrome, opera, firefox default driver_exec: chromedriver, operadriver """ if sys.platform == "win32": if name == "chrome": return self.new_chrome_browser(driver_exec="chromedriver.exe") elif name == "ie": return self.new_ie_browser(driver_exec="IEDriverServer.exe") elif name == "opera": return self.new_opera_browser(driver_exec="operadriver.exe") elif name == "firefox": return self.new_firefox_browser() else: return None else: if name == "chrome": return self.new_chrome_browser(driver_exec="chromedriver") elif name == "opera": return self.new_opera_browser(driver_exec="operadriver") elif name == "firefox": return self.new_firefox_browser() else: return None def new_chrome_browser(self, driver_exec=None): self.info = "chrome" if driver_exec is not None: self.chrome_driver_path = driver_exec self.chrome_basename = os.path.basename(driver_exec) if self.is_server_executed(self.chrome_basename): port = self.port(self.chrome_basename) self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.CHROME) else: self.driver = webdriver.Chrome(self.chrome_driver_path) return self.driver def new_ie_browser(self, driver_exec=None): self.info = "ie" if driver_exec is not None: self.ie_driver_path = driver_exec self.ie_basename = os.path.basename(driver_exec) if self.is_server_executed(self.ie_basename): port = self.port(self.ie_basename) self.driver = webdriver.Remote("http://127.0.0.1:" + port, webdriver.DesiredCapabilities.INTERNETEXPLORER) else: self.driver = webdriver.Ie(self.ie_driver_path) return self.driver def new_firefox_browser(self): self.info = "firefox" self.driver = webdriver.Firefox() return self.driver def new_opera_browser(self, driver_exec=None): self.info = "opera" if driver_exec is not None: self.opera_driver_path = 
driver_exec self.opera_basename = os.path.basename(driver_exec) if self.is_server_executed(self.opera_basename): port = self.port(self.opera_basename) self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.OPERA) else: self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA, executable_path=self.opera_driver_path) return self.driver def driver(self): return self.driver def is_server_executed(self, driver_basename): for ps in psutil.process_iter(): if ps.name() == driver_basename: conns = ps.connections() for x in conns: if x.status == "LISTEN": return True return False def port(self, driver_basename): for ps in psutil.process_iter(): if ps.name() == driver_basename: conns = ps.connections() for x in conns: if x.status == "LISTEN": return str(x.laddr[1]) return str(9999) def new_browser(self): if self.info == "chrome": return self.new_chrome_browser() elif self.info == "ie": return self.new_ie_browser() elif self.info == "opera": return self.new_opera_browser() elif self.info == "firefox": return self.new_firefox_browser() else: return None class NaverCafeCrawler: #driver = webdriver.PhantomJS('C:\\Users\\cococo\\AppData\\Roaming\\npm\\node_modules\\phantomjs\\lib\\phantom\\phantomjs.exe') def __init__(self): self.driver = None # webdriver.Ie('C:\\Users\\cococo\\Downloads\\IEDriverServer_x64_2.48.0\\IEDriverServer.exe') # self.driver = webdriver.Chrome('C:\\Users\\cococo\\Downloads\\chromedriver_win32\\chromedriver.exe') # firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX # firefox_capabilities['marionette'] = True # firefox_capabilities['binary'] = ('C:\\Users\\cococo\\Downloads\\wires-0.4.2-win\\wires-0.4.2-win.exe') # webdriver.Chrome() # self.driver = webdriver.Firefox() # self.driver.set_window_size(1600, 900) self.main_area_crawler = NaverCafeMainAreaCrawler() def set_driver(self, driver): self.driver = driver def suff(self, url): 
self.driver.get(url) wait(2) def screenshot(self,filename): self.driver.save_screenshot(filename) def html(self): return self.driver.page_source def savepage(self, filename): with open(filename,'w',encoding='UTF8') as f: f.write(self.html()) def naver_login(self, id, password): self.suff('http://www.naver.com') wait(2) element = self.driver.find_element_by_id('id') element.send_keys(id) #element = driver.find_element_by_id('label_pw') element = self.driver.find_element_by_id('pw') element.send_keys(password) element.send_keys(Keys.ENTER) wait(3) #element = self.driver.find_element_by_class_name('btn_login') #self.click_element(element) def cafe_search(self, keyword): element = self.driver.find_element_by_id('topLayerQueryInput') element.send_keys(keyword) wait(1) element.send_keys(Keys.ENTER) #element.send_keys(Keys.RETURN) wait(2) def get_url(self): return self.driver.current_url() def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) def enter_element(self, element): element.send_keys(Keys.NULL) element.send_keys(Keys.ENTER) wait(2) def start(self): self.main_area_crawler.set_driver(self.driver) self.main_area_crawler.crawl_all_cafe_main() def close(self): self.driver.close() def quit(self): self.driver.quit() class NaverCafeMainAreaCrawler: def __init__(self): self.board_crawler = NaverCafeBoardCrawler() self.body_crawler = NaverCafeBodyCrawler() self.reply_crawler = NaverCafeReplyCrawler() self.send_to_db = SendtoDB() self.browser = None def print(self, arg): print(arg) sys.stdout.flush() def set_driver(self, driver): self.board_crawler.set_driver(driver) self.body_crawler.set_driver(driver) self.reply_crawler.set_driver(driver) self.driver = driver def copy_list(self, backup_list): for i in backup_list: self.board_crawler.content_num_list.append(i) def crawl_all_cafe_main(self, backup_list=None): 
self.board_crawler.clear_content_num_list() if backup_list: self.copy_list(backup_list) has_next_table = True while has_next_table: self.print("Page number : " + str(self.board_crawler.current_page_num_by_tag())) # if (int(self.board_crawler.current_page_num_by_url()) % 5) == 1: self.release_memory() while self.board_crawler.move_next_content(): try: self.crawl_body() self.crawl_reply() self.print("ok") except Exception as e: self.print("fail") self.print(e) self.driver.back() wait(1) has_next_table = self.board_crawler.move_next_page() def crawl_body(self): self.body_crawler.set_driver(self.driver) content = self.body_crawler.get_content() content['keyword_id'] = self.keyword_id self.send_to_db.delete_url(content['article_url']) self.send_to_db.send_body(content) self.print(content['article_url']) def crawl_reply(self): self.reply_crawler.set_driver(self.driver) if self.reply_crawler.find_comments_element(): self.reply_crawler.set_article_url(self.body_crawler.find_article_url()) self.reply_crawler.crawl_all() self.send_to_db.send_reply(self.reply_crawler.get_content()) def set_keyword_id(self, keyword_id): self.keyword_id = keyword_id def release_memory_firefox(self): index = self.driver.current_url.find("%26search.page=") if index is -1: temp_url = self.driver.current_url else: temp_url = self.driver.current_url[:index] temp_page = self.board_crawler.current_page_num_by_tag() if temp_page.strip() is "1": url = temp_url else: url = temp_url + "%26search.page=" + temp_page.strip() self.print("Release Memory Process") self.driver.get("about:memory") wait(2) self.driver.execute_script("doMMU()") wait(2) self.driver.execute_script("doGC()") wait(2) self.driver.execute_script("doCC()") wait(2) self.driver.get(url) wait(2) print_and_flush("reloaded") def release_memory_others(self): temp_url = self.driver.current_url self.print("Release Memory Process") self.driver.get(temp_url) wait(2) print_and_flush("reloaded") def release_memory(self): if self.browser.info == 
"firefox": if (int(self.board_crawler.current_page_num_by_tag()) % 5) == 1: self.release_memory_firefox() else: if (int(self.board_crawler.current_page_num_by_tag()) != 1) and ((int(self.board_crawler.current_page_num_by_tag()) % 5) == 1): self.release_memory_others() def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element(element).click().perform() wait(2) def enter_element(self, element): element.send_keys(Keys.NULL) element.send_keys(Keys.ENTER) wait(2) class NaverCafeBoardCrawler: def __init__(self, driver=None): self.driver = driver self.content_num_list = list() import re self.re_page = re.compile("search\\.page=([\\d]+)") def clear_content_num_list(self): self.content_num_list.clear() def current_url(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return self.driver.current_url def current_page_num_by_url(self): url = self.current_url() m = self.re_page.search(url) if m is None: return self.current_page_num_by_tag() else: return m.group(1) def current_page_num_by_tag(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return str(1) if page_navigate is None: return str(1) tds = page_navigate.find_elements_by_tag_name('td') for td in tds: try: page_on = td.get_attribute('class') if page_on == 'on': return td.text except: continue return str(1) def move_next_content(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') trs = self.driver.find_elements_by_css_selector("tr[align='center']") for tr in trs: try: content_num = tr.find_element_by_css_selector("span[class='m-tcol-c list-count']") if len(str(content_num.text).strip()) < 1: continue if content_num.text in self.content_num_list: continue 
self.content_num_list.append(content_num.text) sub = tr.find_element_by_css_selector("a[class='m-tcol-c']") self.enter_element(sub) return True except: pass return False def move_next_page(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return False if page_navigate is None: return False tds = page_navigate.find_elements_by_tag_name('td') is_next = False for td in tds: if is_next: a = td.find_element_by_tag_name("a") self.enter_element(a) #self.enter_element(td) return True try: page_on = td.get_attribute('class') if page_on == 'on': is_next = True continue except: continue return False def set_driver(self, driver): self.driver = driver def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) def enter_element(self, element): element.send_keys(Keys.NULL) element.send_keys(Keys.ENTER) wait(2) class NaverCafeBodyCrawler: def __init__(self, driver=None): self.driver = driver self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_init(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def find_article_title(self): self.find_init() article_title = self.driver.find_element_by_css_selector("span[class='b m-tcol-c']") return article_title.text def find_article_date(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c date']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. 
',' ').replace('.','-') + ":00" return article_date def find_article_data(self): self.find_init() article_data = self.driver.find_element_by_css_selector("div[class='tbody m-tcol-c']") return article_data.text def find_article_nickname(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 4: return onclick_attr_list[3].strip().replace("'", "") else: return str() def find_article_id(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 2: return onclick_attr_list[1].strip().replace("'", "") else: return str() def find_article_hit(self): self.find_init() element = self.driver.find_element_by_css_selector("span[class='b m-tcol-c reply _rosReadcount']") return element.text def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'body' def find_platform_title(self): self.driver.switch_to_default_content() element = self.driver.find_element_by_css_selector("span[class='m-tcol-p']") return element.text def find_article_url(self): self.find_init() element = self.driver.find_element_by_css_selector("a[id='linkUrl']") return element.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def print(self): print("article_id = " + self.find_article_id()) print("article_nickname = " + self.find_article_nickname()) print("article_title = " + self.find_article_title()) print("article_date = " + self.find_article_date()) 
print("article_hit = " + self.find_article_hit()) print("article_url = " + self.find_article_url()) print("platform_title = " + self.find_platform_title()) print("article_data = " + self.find_article_data()) def get_content(self): content = dict() content["article_id"] = self.find_article_id() content["article_nickname"] = self.find_article_nickname() content["article_title"] = self.find_article_title() content["article_date"] = self.find_article_date() content["article_hit"] = self.find_article_hit() content["article_url"] = self.find_article_url() content["article_data"] = self.find_article_data() content["article_form"] = self.find_article_form() content["platform_title"] = self.find_platform_title() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["platform_id"] = self.find_platform_id() return content class NaverCafeReplyCrawler: def __init__(self, driver=None): self.driver = driver self.article_parent = str() self.reply_list = list() self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_comments_element(self): self.find_init() try: self.reply_elements = self.driver.find_element_by_css_selector("ul[class='cmlist']") if self.reply_elements is None: return False return True except: return False def find_init(self): self.count = 0 self.reply_list.clear() self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def set_article_url(self, article_url): self.article_url = article_url def crawl_all(self): has_next_comment_page = True while has_next_comment_page: self.crawl_current_page_reply() has_next_comment_page = self.move_next_comment_page() def move_next_comment_page(self): element = None try: element = self.driver.find_element_by_css_selector("div[id='cmt_paginate']") children = element.find_elements_by_css_selector("*") flag = False for child in children: 
                # -- continuation of move_next_comment_page() (its def line is
                #    above this chunk): once the <strong> marking the current
                #    page was seen, the next <a> sibling is the "next page" link.
                if flag is True and child.tag_name == "a":
                    self.enter_element(child)
                    wait(1)
                    # Re-enter the cafe_main iframe after the navigation.
                    self.driver.switch_to_default_content()
                    self.driver.switch_to_frame('cafe_main')
                    return True
                if child.tag_name == "strong":
                    flag = True
        except Exception as e:
            print(e)
            sys.stdout.flush()
            return False
        if element is None:
            return False
        return False

    def crawl_current_page_reply(self):
        """Crawl every comment <li> on the current comment page.

        An <li> whose class is 'reply' is a reply-to-a-reply; an <li> with
        an empty class attribute is a top-level comment; anything else
        (separators, ads) is skipped.
        """
        lis = self.reply_elements.find_elements_by_tag_name('li')
        for li in lis:
            if li.get_attribute('class') == 'reply':
                self.crawl_reply_reply(li)
            elif len(li.get_attribute('class')) < 1:
                self.crawl_reply(li)
            else:
                pass

    def find_article_url(self, li=None):
        # The URL is fixed once per article via set_article_url(); *li* is
        # accepted only so all find_* helpers share the same signature.
        return self.article_url

    def find_article_date(self, li):
        element = li.find_element_by_css_selector("span[class='date m-tcol-c filter-50']")
        article_date = str(element.text)
        # NOTE(review): str.strip() returns a new string; this call discards
        # its result and is a no-op as written.
        article_date.strip()
        # '2015. 12. 07. 03:25' -> '2015-12-07 03:25' + ':00'
        article_date = article_date.replace('. ', ' ').replace('.', '-') + ":00"
        return article_date

    def find_article_data(self, li):
        """Comment text body."""
        element = li.find_element_by_css_selector("span[class='comm_body']")
        article_data = element.text
        return article_data

    def find_article_parent(self, li):
        """Nickname the reply answers to; falls back to the last top-level
        commenter remembered in self.article_parent."""
        try:
            element = li.find_element_by_css_selector("a[class='m-tcol-c filter-50 nick']")
            article_parent = element.text
            return article_parent
        except:
            return self.article_parent

    def find_article_id(self, li):
        """Writer id taken from the hidden <input name='writerid'>."""
        element = li.find_element_by_css_selector("input[name='writerid']")
        article_id = element.get_attribute('value')
        return article_id

    def find_article_nickname(self, li):
        article_nickname = li.find_element_by_css_selector("td[class='p-nick']")
        return article_nickname.text

    def find_platform_id(self):
        """Cafe id segment of the article URL, or '' when it doesn't match."""
        article_url = str(self.find_article_url())
        m = self.re_platform_id.search(article_url)
        try:
            return m.group(1)
        except:
            return str()

    def crawl_reply(self, li):
        """Collect one top-level comment into self.reply_list; remembers the
        commenter so following replies can reference it as their parent."""
        article_nickname = self.find_article_nickname(li)
        self.article_parent = article_nickname
        article_order = self.count
        self.count += 1
        content = dict()
        content["article_id"] = self.find_article_id(li)
        content["article_nickname"] = self.find_article_nickname(li)
        content["article_date"] = self.find_article_date(li)
        content["article_data"] = self.find_article_data(li)
        content["article_order"] = article_order
        content["article_form"] = self.find_article_form()
        content["platform_name"] = self.find_platform_name()
        content["platform_form"] = self.find_platform_form()
        content["article_url"] = self.find_article_url()
        content["platform_id"] = self.find_platform_id()
        self.reply_list.append(content)

    def crawl_reply_reply(self, li):
        """Collect one reply-to-a-reply; additionally records the parent
        commenter's nickname under 'article_parent'."""
        article_parent = self.find_article_parent(li)
        article_order = self.count
        self.count += 1
        content = dict()
        content["article_id"] = self.find_article_id(li)
        content["article_nickname"] = self.find_article_nickname(li)
        content["article_date"] = self.find_article_date(li)
        content["article_data"] = self.find_article_data(li)
        content["article_order"] = article_order
        content["article_parent"] = article_parent
        content["article_form"] = self.find_article_form()
        content["platform_name"] = self.find_platform_name()
        content["platform_form"] = self.find_platform_form()
        content["article_url"] = self.find_article_url()
        content["platform_id"] = self.find_platform_id()
        self.reply_list.append(content)

    # NOTE(review): the three constants below say 'naver'/'cafe' although this
    # file is kakaocrawl.py -- looks copy-pasted from navercrawl; confirm.
    def find_platform_name(self):
        return 'naver'

    def find_platform_form(self):
        return 'cafe'

    def find_article_form(self):
        return 'reply'

    def get_content(self):
        """All comments collected for the current article."""
        return self.reply_list

    def click_element(self, element):
        """Click *element* via ActionChains at its top-left corner."""
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

    def enter_element(self, element):
        """Focus *element* (NULL key) and activate it with ENTER."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)


class SendtoDB:
    """Thin writer for the per-keyword `data_<n>` MySQL tables.

    NOTE(review): credentials are hard-coded, and queries are built by string
    concatenation (values go through Connection.escape(), but the table-name
    suffix db_num is interpolated raw -- it comes from argv, so keep it
    trusted or validate it as an integer).
    """
    pymysql = __import__('pymysql.cursors')

    def __init__(self, db_num=0):
        self.conn = self.pymysql.connect(host='bigbird.iptime.org',
                                         user='admin', passwd='admin123',
                                         db='concepters', charset='utf8',
                                         cursorclass=self.pymysql.cursors.DictCursor)
        self.db_num = db_num

    def set_db(self, db_num):
        """Select which data_<db_num> table subsequent writes target."""
        self.db_num = str(db_num)

    def make_insert_query_backup(self, dictionary):
        # NOTE(review): older duplicate of make_insert_query(); kept for
        # compatibility but unused by send_body().
        query = "insert into data_" + str(self.db_num) + " ("
        for key in dictionary.keys():
            query += (key + ",")
        query = query[:len(query) - 1] + ")"
        query += " values("
        for key, value in dictionary.items():
            if type(value) == int:
                query += (str(value) + ",")
            else:
                query += self.conn.escape(value) + ","
        query = query[:len(query) - 1] + ")"
        return query

    def make_insert_query(self, dictionary):
        """Build an INSERT for *dictionary*; values escaped via pymysql."""
        query = "insert into data_" + str(self.db_num) + " ("
        key_list = list()
        val_list = list()
        for key, val in dictionary.items():
            key_list.append(key)
            if type(val) == int:
                val_list.append(str(val))
            else:
                val_list.append(self.conn.escape(val))
        return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")"

    def send_body(self, body):
        """Insert one article dict; failures are logged, not raised."""
        if not body:
            return
        self.conn_check()
        with self.conn.cursor() as cursor:
            query = self.make_insert_query(body)
            try:
                cursor.execute(query)
                self.conn.commit()
            except Exception as e:
                print(e)
                sys.stdout.flush()
                print(query)
                sys.stdout.flush()

    def send_reply(self, reply):
        """Insert a list of comment dicts one by one."""
        if not reply:
            return
        for i in reply:
            self.send_body(i)

    def conn_check(self):
        """Reconnect if the connection was dropped."""
        if not self.conn.open:
            self.conn = self.pymysql.connect(host='bigbird.iptime.org',
                                             user='admin', passwd='admin123',
                                             db='concepters', charset='utf8',
                                             cursorclass=self.pymysql.cursors.DictCursor)

    def close(self):
        self.conn.close()

    def delete_url(self, url):
        """Delete prior rows for *url* (poor man's upsert before insert)."""
        query = "delete from data_" + str(self.db_num) + " where article_url = " + self.conn.escape(str(url))
        self.conn_check()
        with self.conn.cursor() as cursor:
            try:
                cursor.execute(query)
                self.conn.commit()
            except Exception as e:
                print(e)
                sys.stdout.flush()
                print(query)
                sys.stdout.flush()

# (A long commented-out draft of NaverCafeInit followed here; the live
#  implementation is CrawlInit / NaverCafeInit below.)
"&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0" # # def __init__(self, before_day=0): # self.conn = self.pymysql.connect(host ='bigbird.iptime.org', # user='admin', passwd='admin123', # db='concepters', charset='utf8', # cursorclass=self.pymysql.cursors.DictCursor) # self.urls = dict() # self.before_day = before_day # # def set_before_day(self, before_day): # if type(before_day) == str: # self.before_day = int(before_day) # elif type(before_day) == int: # self.before_day = before_day # # def set_until_page(self, until_page): # if type(until_page) == str: # self.before_day = int(until_page) # elif type(until_page) == int: # self.before_day = until_page # # def split_searches(self): # search = self.searches() # splited_list = search.split(',') # trimmed_list = list() # for x in splited_list: # trimmed_list.append(self.euc_kr(x.strip())) # return trimmed_list # # def get_keyword_parameters(self, keyword_id): # query = "select * from keyword where id = " + str(keyword_id) # try: # with self.conn.cursor() as cursor: # cursor.execute(query) # self.params = cursor.fetchone() # return self.params # except Exception as e: # print(e) # sys.stdout.flush() # exit(1) # return dict() # # def get_naver_cafe_list(self): # query = "select url, clubid from navercafelist" # if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: # pass # else: # query += (" where group_num = " + str(self.authorship())) # try: # with self.conn.cursor() as cursor: # cursor.execute(query) # list_result = cursor.fetchall() # for i in list_result: # self.urls[i["url"]] = i["clubid"] # except Exception as e: # print(e) # sys.stdout.flush() # exit(1) # return self.urls # # def start_day(self): # return self.params["start"] # # def end_day(self): # return self.params["end"] # # def keyword_id(self): # return self.params["id"] # # def realtime(self): # return self.params["realtime"] # # def searches(self): # 
return self.params["searches"] # # def authorship(self): # return self.params["authorship"] # # def platform(self): # return self.params["platform"] # # def is_realtime(self): # if str(self.realtime()) == '0': # return False # else: # return True # # def euc_kr(self, keyword): # byte_code = list(keyword.encode("euc_kr")) # encoded_keyword = "" # for i in byte_code: # if i == 0x20: # encoded_keyword += "+" # else: # encoded_keyword += str(hex(i)).replace("0x", "%").upper() # return encoded_keyword # # def url_all_days(self): # url_list = list() # for key, val in self.urls.items(): # if self.is_realtime(): # today = datetime.date.today() # url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val) # else: # url = self.make_url(self.start_day(), self.end_day(), val) # for i in url: # url_list.append(i) # return url_list # # def url_day_by_day(self): # one_day = datetime.timedelta(days=1) # url_list = list() # for key, val in self.urls.items(): # if self.is_realtime(): # end = datetime.date.today() # start = end + datetime.timedelta(days=self.before_day) # else: # start = self.start_day() # end = self.end_day() # while start <= end: # url = self.make_url(start, start, val) # for i in url: # url_list.append(i) # start += one_day # return url_list # # def make_url(self, start_day, end_day, clubid): # urls = list() # for x in self.split_searches(): # url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth # urls.append(url) # return urls # # def disconnect(self): # self.conn.close() # # def date_to_str(self, arg_date): # return arg_date.strftime("%Y-%m-%d") class CrawlInit: pymysql = __import__('pymysql.cursors') def __init__(self, before_day=0): self.conn = self.pymysql.connect(host ='bigbird.iptime.org', user='admin', passwd='admin123', db='concepters', charset='utf8', cursorclass=self.pymysql.cursors.DictCursor) self.urls = dict() self.before_day = before_day def 
set_before_day(self, before_day): if type(before_day) == str: self.before_day = int(before_day) elif type(before_day) == int: self.before_day = before_day def set_until_page(self, until_page): if type(until_page) == str: self.until_page = int(until_page) elif type(until_page) == int: self.until_page = until_page def get_keyword_parameters(self, keyword_id): query = "select * from keyword where id = " + str(keyword_id) try: with self.conn.cursor() as cursor: cursor.execute(query) self.params = cursor.fetchone() return self.params except Exception as e: print(e) sys.stdout.flush() exit(1) return dict() def get_naver_cafe_list(self): query = "select url, clubid from navercafelist" if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: pass else: query += (" where group_num = " + str(self.authorship())) try: with self.conn.cursor() as cursor: cursor.execute(query) list_result = cursor.fetchall() for i in list_result: self.urls[i["url"]] = i["clubid"] except Exception as e: print(e) sys.stdout.flush() exit(1) return self.urls def start_day(self): return self.params["start"] def end_day(self): return self.params["end"] def keyword_id(self): return self.params["id"] def realtime(self): return self.params["realtime"] def searches(self): return self.params["searches"] def authorship(self): return self.params["authorship"] def platform(self): return self.params["platform"] def is_realtime(self): if str(self.realtime()) == '0': return False else: return True def euc_kr(self, keyword): byte_code = list(keyword.encode("euc_kr")) encoded_keyword = "" for i in byte_code: if i == 0x20: encoded_keyword += "+" else: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def utf8(self, keyword): byte_code = list(keyword.encode("utf-8")) encoded_keyword = "" for i in byte_code: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def disconnect(self): self.conn.close() def date_to_str(self, arg_date): 
class NaverCafeInit(CrawlInit):
    """CrawlInit specialised to build Naver cafe article-search URLs."""
    url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid="
    url_second = "&search.searchdate="
    url_third = "&search.searchBy=0&search.query="
    url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0"

    def __init__(self, before_day=0):
        super().__init__(before_day)

    def url_all_days(self):
        """One search URL per (cafe, keyword) spanning the whole date range."""
        url_list = list()
        for key, val in self.urls.items():
            if self.is_realtime():
                today = datetime.date.today()
                # before_day is a (negative) look-back offset from today.
                url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val)
            else:
                url = self.make_url(self.start_day(), self.end_day(), val)
            for i in url:
                url_list.append(i)
        return url_list

    def url_day_by_day(self):
        """One search URL per (cafe, keyword, single day) in the range."""
        one_day = datetime.timedelta(days=1)
        url_list = list()
        for key, val in self.urls.items():
            if self.is_realtime():
                end = datetime.date.today()
                start = end + datetime.timedelta(days=self.before_day)
            else:
                start = self.start_day()
                end = self.end_day()
            while start <= end:
                url = self.make_url(start, start, val)
                for i in url:
                    url_list.append(i)
                start += one_day
        return url_list

    def split_searches(self):
        """Split the comma-separated keyword list, EUC-KR-encoding each."""
        search = self.searches()
        splited_list = search.split(',')
        trimmed_list = list()
        for x in splited_list:
            trimmed_list.append(self.euc_kr(x.strip()))
        return trimmed_list

    def make_url(self, start_day, end_day, clubid):
        """Build one search URL per keyword.

        The two dates are deliberately concatenated back-to-back
        (YYYY-MM-DDYYYY-MM-DD) -- that is the format the site expects.
        """
        urls = list()
        for x in self.split_searches():
            url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth
            urls.append(url)
        return urls


if __name__ == '__main__':
    # argv: [1] keyword_id  [2] data db number  [3] before_day (look-back)
    if len(sys.argv) < 4:
        print("Fail to process execute")
        exit(1)
    print("Start Python Crawling")
    # NOTE(review): hard-coded Naver credentials in source; move to config.
    naver_id = "ehotnsdl1234"
    naver_password = "66556655*"
    naver_init = NaverCafeInit(int(sys.argv[3]))
    naver_init.get_keyword_parameters(sys.argv[1])
    naver_init.get_naver_cafe_list()
    naver_init.disconnect()
    naver_cafe = NaverCafeCrawler()
    browser = Browser()
    # arg: chrome, firefox, ie, opera
    naver_cafe.set_driver(browser.get_new_driver("chrome"))
    wait(5)
    naver_cafe.naver_login(naver_id, naver_password)
    naver_main_area_crawler = NaverCafeMainAreaCrawler()
    naver_main_area_crawler.set_driver(naver_cafe.driver)
    naver_main_area_crawler.set_keyword_id(sys.argv[1])
    naver_main_area_crawler.send_to_db.set_db(sys.argv[2])
    naver_main_area_crawler.browser = browser
    asis = Asistance()
    realtime = True
    while realtime:
        print_and_flush("Crawler Start")
        url_list = naver_init.url_all_days()
        i = 0
        backup_list = list()
        while i < len(url_list):
            try:
                print_and_flush(url_list[i] + "\n")
                print_and_flush("clubid: " + asis.clubid(url_list[i]))
                print_and_flush(asis.date(url_list[i]) + "\n")
                naver_cafe.suff(url_list[i])
                naver_main_area_crawler.crawl_all_cafe_main(backup_list)
                i += 1
                backup_list.clear()
            except Exception as e:
                # Best-effort recovery: remember what was already crawled,
                # restart the browser, log back in, then retry this URL.
                print_and_flush(e)
                backup_list = list(naver_main_area_crawler.board_crawler.content_num_list)
                naver_cafe.set_driver(browser.new_browser())
                wait(5)
                naver_cafe.naver_login(naver_id, naver_password)
                naver_main_area_crawler.set_driver(naver_cafe.driver)
        realtime = naver_init.is_realtime()
    print_and_flush("Finished Crawling :)")
    naver_cafe.quit()
    naver_main_area_crawler.send_to_db.close()
    print("Exit. Bye :)")
    exit(0)
Bye :)") exit(0) #http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=11262350&search.searchdate=2015-07-302015-10-30&search.searchBy=0&search.query=%BC%BA%C7%FC&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=50&search.media=0 \ No newline at end of file diff --git a/WebBasedCrawler/navercrawl11.23.py b/WebBasedCrawler/navercrawl11.23.py new file mode 100644 index 0000000..5a8d9af --- /dev/null +++ b/WebBasedCrawler/navercrawl11.23.py @@ -0,0 +1 @@ +#-*- coding: utf-8 -*- __author__ = 'cococo' from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.action_chains import ActionChains import threading from time import localtime, strftime import time import os import sys import datetime import psutil import re def fcntwait(n): time.sleep(n) def wait(n): th = threading.Thread(target=fcntwait, args=(n,)) th.start() th.join() def insert_log(msg): pid = os.getpid() tm = strftime("%Y_%m_%d", localtime()) filename = tm + "_" + str(pid) + ".log" total_msg = "[" + strftime("%Y_%m_%d %H:%M:%S", localtime()) + "] " + msg with open(filename, "a") as f: f.write(total_msg) f.flush() def print_and_flush(string): print(string) sys.stdout.flush() class Asistance: def __init__(self): self.re_clubid = re.compile("search\\.clubid=([\\d]+)") self.re_date = re.compile("search\\.searchdate=([\\d]{4}-[\\d]{2}-[\\d]{2})([\\d]{4}-[\\d]{2}-[\\d]{2})") def clubid(self, url): m = self.re_clubid.search(url) if m is None: return str() else: return m.group(1) def date(self, url): m = self.re_date.search(url) if m is None: return str("Start: ALL, End: ALL") else: return str("Start: " + m.group(1) + ", End: " + m.group(2)) class Browser: def __init__(self, driver=None): self.driver = driver self.info = "" def get_new_driver(self, name): """ windows system: name = chrome, ie, opera, firefox default driver_exec: chromedriver.exe, IEDriverServer.exe, operadriver.exe linux system: name = chrome, 
opera, firefox default driver_exec: chromedriver, operadriver """ if sys.platform == "win32": if name == "chrome": return self.new_chrome_browser(driver_exec="chromedriver.exe") elif name == "ie": return self.new_ie_browser(driver_exec="IEDriverServer.exe") elif name == "opera": return self.new_opera_browser(driver_exec="operadriver.exe") elif name == "firefox": return self.new_firefox_browser() else: return None else: if name == "chrome": return self.new_chrome_browser(driver_exec="chromedriver") elif name == "opera": return self.new_opera_browser(driver_exec="operadriver") elif name == "firefox": return self.new_firefox_browser() else: return None def new_chrome_browser(self, driver_exec=None): self.info = "chrome" if driver_exec is not None: self.chrome_driver_path = driver_exec self.chrome_basename = os.path.basename(driver_exec) if self.is_server_executed(self.chrome_basename): port = self.port(self.chrome_basename) self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.CHROME) else: self.driver = webdriver.Chrome(self.chrome_driver_path) return self.driver def new_ie_browser(self, driver_exec=None): self.info = "ie" if driver_exec is not None: self.ie_driver_path = driver_exec self.ie_basename = os.path.basename(driver_exec) if self.is_server_executed(self.ie_basename): port = self.port(self.ie_basename) self.driver = webdriver.Remote("http://127.0.0.1:" + port, webdriver.DesiredCapabilities.INTERNETEXPLORER) else: self.driver = webdriver.Ie(self.ie_driver_path) return self.driver def new_firefox_browser(self): self.info = "firefox" self.driver = webdriver.Firefox() return self.driver def new_opera_browser(self, driver_exec=None): self.info = "opera" if driver_exec is not None: self.opera_driver_path = driver_exec self.opera_basename = os.path.basename(driver_exec) if self.is_server_executed(self.opera_basename): port = self.port(self.opera_basename) self.driver = 
webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.OPERA) else: self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA, executable_path=self.opera_driver_path) return self.driver def driver(self): return self.driver def is_server_executed(self, driver_basename): for ps in psutil.process_iter(): if ps.name() == driver_basename: conns = ps.connections() for x in conns: if x.status == "LISTEN": return True return False def port(self, driver_basename): for ps in psutil.process_iter(): if ps.name() == driver_basename: conns = ps.connections() for x in conns: if x.status == "LISTEN": return str(x.laddr[1]) return str(9999) def new_browser(self): if self.info == "chrome": return self.new_chrome_browser() elif self.info == "ie": return self.new_ie_browser() elif self.info == "opera": return self.new_opera_browser() elif self.info == "firefox": return self.new_firefox_browser() else: return None class NaverCafeCrawler: #driver = webdriver.PhantomJS('C:\\Users\\cococo\\AppData\\Roaming\\npm\\node_modules\\phantomjs\\lib\\phantom\\phantomjs.exe') def __init__(self): self.driver = None # webdriver.Ie('C:\\Users\\cococo\\Downloads\\IEDriverServer_x64_2.48.0\\IEDriverServer.exe') # self.driver = webdriver.Chrome('C:\\Users\\cococo\\Downloads\\chromedriver_win32\\chromedriver.exe') # firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX # firefox_capabilities['marionette'] = True # firefox_capabilities['binary'] = ('C:\\Users\\cococo\\Downloads\\wires-0.4.2-win\\wires-0.4.2-win.exe') # webdriver.Chrome() # self.driver = webdriver.Firefox() # self.driver.set_window_size(1600, 900) self.main_area_crawler = NaverCafeMainAreaCrawler() def set_driver(self, driver): self.driver = driver def suff(self, url): self.driver.get(url) wait(2) def screenshot(self,filename): self.driver.save_screenshot(filename) def html(self): return self.driver.page_source def savepage(self, filename): with 
    def naver_login(self, id, password):
        """Log into naver.com by typing credentials on the home page."""
        self.suff('http://www.naver.com')
        wait(2)
        element = self.driver.find_element_by_id('id')
        element.send_keys(id)
        element = self.driver.find_element_by_id('pw')
        element.send_keys(password)
        element.send_keys(Keys.ENTER)
        wait(3)

    def cafe_search(self, keyword):
        """Type *keyword* into the cafe search box and submit with ENTER."""
        element = self.driver.find_element_by_id('topLayerQueryInput')
        element.send_keys(keyword)
        wait(1)
        element.send_keys(Keys.ENTER)
        wait(2)

    def get_url(self):
        # NOTE(review): current_url is a property on selenium drivers;
        # calling it like a method raises TypeError -- confirm before use.
        return self.driver.current_url()

    def click_element(self, element):
        """Click *element* via ActionChains at its top-left corner."""
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

    def enter_element(self, element):
        """Focus *element* (NULL key) and activate it with ENTER."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)

    def start(self):
        """Hand the driver to the main-area crawler and run it."""
        self.main_area_crawler.set_driver(self.driver)
        self.main_area_crawler.crawl_all_cafe_main()

    def close(self):
        self.driver.close()

    def quit(self):
        self.driver.quit()


class NaverCafeMainAreaCrawler:
    """Orchestrates board/body/reply crawlers over search-result pages."""

    def __init__(self):
        self.board_crawler = NaverCafeBoardCrawler()
        self.body_crawler = NaverCafeBodyCrawler()
        self.reply_crawler = NaverCafeReplyCrawler()
        self.send_to_db = SendtoDB()
        self.browser = None  # Browser factory, injected by the caller

    def print(self, arg):
        # Intentionally shadows builtin print: print + immediate flush.
        print(arg)
        sys.stdout.flush()

    def set_driver(self, driver):
        """Propagate *driver* to all sub-crawlers."""
        self.board_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.driver = driver

    def copy_list(self, backup_list):
        """Seed the board crawler's seen-article list from a previous run."""
        for i in backup_list:
            self.board_crawler.content_num_list.append(i)

    def crawl_all_cafe_main(self, backup_list=None):
        """Crawl every article (body + replies) on every result page."""
        self.board_crawler.clear_content_num_list()
        if backup_list:
            self.copy_list(backup_list)
        has_next_table = True
        while has_next_table:
            self.print("Page number : " + str(self.board_crawler.current_page_num_by_tag()))
            self.release_memory()
            while self.board_crawler.move_next_content():
                try:
                    self.crawl_body()
                    self.crawl_reply()
                    self.print("ok")
                except Exception as e:
                    # Best effort: log and continue with the next article.
                    self.print("fail")
                    self.print(e)
                self.driver.back()
                wait(1)
            has_next_table = self.board_crawler.move_next_page()

    def crawl_body(self):
        """Crawl the open article and upsert it (delete by URL, then insert)."""
        self.body_crawler.set_driver(self.driver)
        content = self.body_crawler.get_content()
        content['keyword_id'] = self.keyword_id
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
        self.print(content['article_url'])

    def crawl_reply(self):
        """Crawl the open article's comments, if it has any."""
        self.reply_crawler.set_driver(self.driver)
        if self.reply_crawler.find_comments_element():
            self.reply_crawler.set_article_url(self.body_crawler.find_article_url())
            self.reply_crawler.crawl_all()
            self.send_to_db.send_reply(self.reply_crawler.get_content())

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def release_memory_firefox(self):
        """Trigger Firefox GC via about:memory, then reload the result page."""
        index = self.driver.current_url.find("%26search.page=")
        # NOTE(review): `is -1` / `is "1"` are identity tests on literals;
        # they happen to work on CPython but should be `==` -- confirm.
        if index is -1:
            temp_url = self.driver.current_url
        else:
            temp_url = self.driver.current_url[:index]
        temp_page = self.board_crawler.current_page_num_by_tag()
        if temp_page.strip() is "1":
            url = temp_url
        else:
            url = temp_url + "%26search.page=" + temp_page.strip()
        self.print("Release Memory Process")
        self.driver.get("about:memory")
        wait(2)
        self.driver.execute_script("doMMU()")
        wait(2)
        self.driver.execute_script("doGC()")
        wait(2)
        self.driver.execute_script("doCC()")
        wait(2)
        self.driver.get(url)
        wait(2)
        print_and_flush("reloaded")

    def release_memory_others(self):
        """Reload the current page (cheap memory release for non-Firefox)."""
        temp_url = self.driver.current_url
        self.print("Release Memory Process")
        self.driver.get(temp_url)
        wait(2)
        print_and_flush("reloaded")

    def release_memory(self):
        """Every 5th page (page % 5 == 1), run the browser-specific release."""
        if self.browser.info == "firefox":
            if (int(self.board_crawler.current_page_num_by_tag()) % 5) == 1:
                self.release_memory_firefox()
        else:
            if (int(self.board_crawler.current_page_num_by_tag()) != 1) and ((int(self.board_crawler.current_page_num_by_tag()) % 5) == 1):
                self.release_memory_others()
!= 1) and ((int(self.board_crawler.current_page_num_by_tag()) % 5) == 1): self.release_memory_others() def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element(element).click().perform() wait(2) def enter_element(self, element): element.send_keys(Keys.NULL) element.send_keys(Keys.ENTER) wait(2) class NaverCafeBoardCrawler: def __init__(self, driver=None): self.driver = driver self.content_num_list = list() import re self.re_page = re.compile("search\\.page=([\\d]+)") def clear_content_num_list(self): self.content_num_list.clear() def current_url(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return self.driver.current_url def current_page_num_by_url(self): url = self.current_url() m = self.re_page.search(url) if m is None: return self.current_page_num_by_tag() else: return m.group(1) def current_page_num_by_tag(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return str(1) if page_navigate is None: return str(1) tds = page_navigate.find_elements_by_tag_name('td') for td in tds: try: page_on = td.get_attribute('class') if page_on == 'on': return td.text except: continue return str(1) def move_next_content(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') trs = self.driver.find_elements_by_css_selector("tr[align='center']") for tr in trs: try: content_num = tr.find_element_by_css_selector("span[class='m-tcol-c list-count']") if len(str(content_num.text).strip()) < 1: continue if content_num.text in self.content_num_list: continue self.content_num_list.append(content_num.text) sub = tr.find_element_by_css_selector("a[class='m-tcol-c']") self.enter_element(sub) return True except: pass return False def move_next_page(self): 
page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return False if page_navigate is None: return False tds = page_navigate.find_elements_by_tag_name('td') is_next = False for td in tds: if is_next: a = td.find_element_by_tag_name("a") self.enter_element(a) #self.enter_element(td) return True try: page_on = td.get_attribute('class') if page_on == 'on': is_next = True continue except: continue return False def set_driver(self, driver): self.driver = driver def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) def enter_element(self, element): element.send_keys(Keys.NULL) element.send_keys(Keys.ENTER) wait(2) class NaverCafeBodyCrawler: def __init__(self, driver=None): self.driver = driver self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_init(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def find_article_title(self): self.find_init() article_title = self.driver.find_element_by_css_selector("span[class='b m-tcol-c']") return article_title.text def find_article_date(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c date']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. 
',' ').replace('.','-') + ":00" return article_date def find_article_data(self): self.find_init() article_data = self.driver.find_element_by_css_selector("div[class='tbody m-tcol-c']") return article_data.text def find_article_nickname(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 4: return onclick_attr_list[3].strip().replace("'", "") else: return str() def find_article_id(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 2: return onclick_attr_list[1].strip().replace("'", "") else: return str() def find_article_hit(self): self.find_init() element = self.driver.find_element_by_css_selector("span[class='b m-tcol-c reply _rosReadcount']") return element.text def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'body' def find_platform_title(self): self.driver.switch_to_default_content() element = self.driver.find_element_by_css_selector("span[class='m-tcol-p']") return element.text def find_article_url(self): self.find_init() element = self.driver.find_element_by_css_selector("a[id='linkUrl']") return element.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def print(self): print("article_id = " + self.find_article_id()) print("article_nickname = " + self.find_article_nickname()) print("article_title = " + self.find_article_title()) print("article_date = " + self.find_article_date()) 
print("article_hit = " + self.find_article_hit()) print("article_url = " + self.find_article_url()) print("platform_title = " + self.find_platform_title()) print("article_data = " + self.find_article_data()) def get_content(self): content = dict() content["article_id"] = self.find_article_id() content["article_nickname"] = self.find_article_nickname() content["article_title"] = self.find_article_title() content["article_date"] = self.find_article_date() content["article_hit"] = self.find_article_hit() content["article_url"] = self.find_article_url() content["article_data"] = self.find_article_data() content["article_form"] = self.find_article_form() content["platform_title"] = self.find_platform_title() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["platform_id"] = self.find_platform_id() return content class NaverCafeReplyCrawler: def __init__(self, driver=None): self.driver = driver self.article_parent = str() self.reply_list = list() self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_comments_element(self): self.find_init() try: self.reply_elements = self.driver.find_element_by_css_selector("ul[class='cmlist']") if self.reply_elements is None: return False return True except: return False def find_init(self): self.count = 0 self.reply_list.clear() self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def set_article_url(self, article_url): self.article_url = article_url def crawl_all(self): has_next_comment_page = True while has_next_comment_page: self.crawl_current_page_reply() has_next_comment_page = self.move_next_comment_page() def move_next_comment_page(self): element = None try: element = self.driver.find_element_by_css_selector("div[id='cmt_paginate']") children = element.find_elements_by_css_selector("*") flag = False for child in children: 
if flag is True and child.tag_name == "a": self.enter_element(child) wait(1) self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return True if child.tag_name == "strong": flag = True except Exception as e: print(e) sys.stdout.flush() return False if element is None: return False return False def crawl_current_page_reply(self): lis = self.reply_elements.find_elements_by_tag_name('li') for li in lis: if li.get_attribute('class') == 'reply': self.crawl_reply_reply(li) elif len(li.get_attribute('class')) < 1: self.crawl_reply(li) else: pass def find_article_url(self, li=None): return self.article_url def find_article_date(self, li): element = li.find_element_by_css_selector("span[class='date m-tcol-c filter-50']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. ', ' ').replace('.', '-') + ":00" return article_date def find_article_data(self, li): element = li.find_element_by_css_selector("span[class='comm_body']") article_data = element.text return article_data def find_article_parent(self, li): try: element = li.find_element_by_css_selector("a[class='m-tcol-c filter-50 nick']") article_parent = element.text return article_parent except: return self.article_parent def find_article_id(self, li): element = li.find_element_by_css_selector("input[name='writerid']") article_id = element.get_attribute('value') return article_id def find_article_nickname(self, li): article_nickname = li.find_element_by_css_selector("td[class='p-nick']") return article_nickname.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def crawl_reply(self, li): article_nickname = self.find_article_nickname(li) self.article_parent = article_nickname article_order = self.count self.count += 1 content = dict() content["article_id"] = self.find_article_id(li) content["article_nickname"] = self.find_article_nickname(li) 
content["article_date"] = self.find_article_date(li) content["article_data"] = self.find_article_data(li) content["article_order"] = article_order content["article_form"] = self.find_article_form() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["article_url"] = self.find_article_url() content["platform_id"] = self.find_platform_id() self.reply_list.append(content) def crawl_reply_reply(self, li): article_parent = self.find_article_parent(li) article_order = self.count self.count += 1 content = dict() content["article_id"] = self.find_article_id(li) content["article_nickname"] = self.find_article_nickname(li) content["article_date"] = self.find_article_date(li) content["article_data"] = self.find_article_data(li) content["article_order"] = article_order content["article_parent"] = article_parent content["article_form"] = self.find_article_form() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["article_url"] = self.find_article_url() content["platform_id"] = self.find_platform_id() self.reply_list.append(content) def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'reply' def get_content(self): return self.reply_list def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) def enter_element(self, element): element.send_keys(Keys.NULL) element.send_keys(Keys.ENTER) wait(2) class SendtoDB: pymysql = __import__('pymysql.cursors') def __init__(self, db_num=0): self.conn = self.pymysql.connect(host='bigbird.iptime.org', user='admin', passwd='admin123', db='concepters', charset='utf8', cursorclass=self.pymysql.cursors.DictCursor) self.db_num = db_num def set_db(self, db_num): self.db_num = str(db_num) def 
make_insert_query(self, dictionary): query = "insert into data_" + str(self.db_num) + " (" for key in dictionary.keys(): query += (key + ",") query = query[:len(query) - 1] + ")" query += " values(" for key, value in dictionary.items(): if type(value) == int: query += (str(value) + ",") else: query += self.conn.escape(value) + "," query = query[:len(query) - 1] + ")" return query def send_body(self, body): if not body: return self.conn_check() with self.conn.cursor() as cursor: query = self.make_insert_query(body) try: cursor.execute(query) self.conn.commit() except Exception as e: print(e) sys.stdout.flush() print(query) sys.stdout.flush() def send_reply(self, reply): if not reply: return for i in reply: self.send_body(i) def conn_check(self): if not self.conn.open: self.conn = self.pymysql.connect(host='bigbird.iptime.org', user='admin', passwd='admin123', db='concepters', charset='utf8', cursorclass=self.pymysql.cursors.DictCursor) def close(self): self.conn.close() def delete_url(self, url): query = "delete from data_" + str(self.db_num) + " where article_url = " + self.conn.escape(str(url)) self.conn_check() with self.conn.cursor() as cursor: try: cursor.execute(query) self.conn.commit() except Exception as e: print(e) sys.stdout.flush() print(query) sys.stdout.flush() class NaverCafeInit: pymysql = __import__('pymysql.cursors') url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=" url_second = "&search.searchdate=" url_third = "&search.searchBy=0&search.query=" url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0" def __init__(self, before_day=0): self.conn = self.pymysql.connect(host ='bigbird.iptime.org', user='admin', passwd='admin123', db='concepters', charset='utf8', cursorclass=self.pymysql.cursors.DictCursor) self.urls = dict() self.before_day = before_day def set_before_day(self, before_day): self.before_day = before_day def split_searches(self): search = 
self.searches() splited_list = search.split(',') trimmed_list = list() for x in splited_list: trimmed_list.append(self.euc_kr(x.strip())) return trimmed_list def get_keyword_parameters(self, keyword_id): query = "select * from keyword where id = " + str(keyword_id) try: with self.conn.cursor() as cursor: cursor.execute(query) self.params = cursor.fetchone() return self.params except Exception as e: print(e) sys.stdout.flush() exit(1) return dict() def get_naver_cafe_list(self): query = "select url, clubid from navercafelist" if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: pass else: query += (" where group_num = " + str(self.authorship())) try: with self.conn.cursor() as cursor: cursor.execute(query) list_result = cursor.fetchall() for i in list_result: self.urls[i["url"]] = i["clubid"] except Exception as e: print(e) sys.stdout.flush() exit(1) return self.urls def start_day(self): return self.params["start"] def end_day(self): return self.params["end"] def keyword_id(self): return self.params["id"] def realtime(self): return self.params["realtime"] def searches(self): return self.params["searches"] def authorship(self): return self.params["authorship"] def platform(self): return self.params["platform"] def is_realtime(self): if str(self.realtime()) == '0': return False else: return True def euc_kr(self, keyword): byte_code = list(keyword.encode("euc_kr")) encoded_keyword = "" for i in byte_code: if i == 0x20: encoded_keyword += "+" else: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def url_all_days(self): url_list = list() for key, val in self.urls.items(): if self.is_realtime(): today = datetime.date.today() url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val) else: url = self.make_url(self.start_day(), self.end_day(), val) for i in url: url_list.append(i) return url_list def url_day_by_day(self): one_day = datetime.timedelta(days=1) url_list = list() for key, 
val in self.urls.items(): if self.is_realtime(): end = datetime.date.today() start = end + datetime.timedelta(days=self.before_day) else: start = self.start_day() end = self.end_day() while start <= end: url = self.make_url(start, start, val) for i in url: url_list.append(i) start += one_day return url_list def make_url(self, start_day, end_day, clubid): urls = list() for x in self.split_searches(): url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth urls.append(url) return urls def disconnect(self): self.conn.close() def date_to_str(self, arg_date): return arg_date.strftime("%Y-%m-%d") if __name__ == '__main__': """ argv: 0 - navercrawl.py 1 - keyword_id 2 - data db num 3 - before_day """ # crawler = NaverCafeCrawler() # crawler.naver_login('kyounggoon', 'qorwjd123') # crawler.suff('http://cafe.naver.com/imsanbu') # crawler.cafe_search('성형') # crawler.start() # crawler.cafe_search_current_page_list() # crawler = NaverCafeCrawler() # crawler.naver_login('kyounggoon', 'qorwjd123') # crawler.suff('http://cafe.naver.com/imsanbu') # crawler.cafe_search('성형') # crawler.main_area_crawler.send_to_db.set_db("294") # crawler.main_area_crawler.set_keyword_id("111111") # crawler.start() if len(sys.argv) < 4: print("Fail to process execute") exit(1) else: print("Start Python Crawling") #initialization naver_id = "ehotnsdl1234" naver_password = "66556655*" naver_init = NaverCafeInit(int(sys.argv[3])) naver_init.get_keyword_parameters(sys.argv[1]) naver_init.get_naver_cafe_list() naver_init.disconnect() naver_cafe = NaverCafeCrawler() browser = Browser() # arg: chrome, fierfox, ie, opera naver_cafe.set_driver(browser.get_new_driver("ie")) wait(5) naver_cafe.naver_login(naver_id, naver_password) naver_main_area_crawler = NaverCafeMainAreaCrawler() naver_main_area_crawler.set_driver(naver_cafe.driver) naver_main_area_crawler.set_keyword_id(sys.argv[1]) naver_main_area_crawler.send_to_db.set_db(sys.argv[2]) 
naver_main_area_crawler.browser = browser asis = Asistance() realtime = True while realtime: print_and_flush("Crawler Start") url_list = naver_init.url_all_days() i = 0 backup_list = list() while i < len(url_list): try: print_and_flush(url_list[i] + "\n") print_and_flush("clubid: " + asis.clubid(url_list[i])) print_and_flush(asis.date(url_list[i]) + "\n") naver_cafe.suff(url_list[i]) naver_main_area_crawler.crawl_all_cafe_main(backup_list) i += 1 backup_list.clear() except Exception as e: print_and_flush(e) backup_list = list(naver_main_area_crawler.board_crawler.content_num_list) naver_cafe.set_driver(browser.new_browser()) wait(5) naver_cafe.naver_login(naver_id, naver_password) naver_main_area_crawler.set_driver(naver_cafe.driver) realtime = naver_init.is_realtime() print_and_flush("Finished Crawling :)") naver_cafe.quit() naver_main_area_crawler.send_to_db.close() print("Exit. Bye :)") exit(0) #http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=11262350&search.searchdate=2015-07-302015-10-30&search.searchBy=0&search.query=%BC%BA%C7%FC&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=50&search.media=0 \ No newline at end of file