From 563b0a8a42b9778397fdead1dd555931b3131f4b Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 5 Sep 2016 10:21:03 +0000 Subject: [PATCH] =?UTF-8?q?=EC=B9=B4=EC=B9=B4=EC=98=A4=ED=81=AC=EB=A1=A4?= =?UTF-8?q?=EB=9F=AC=20=EC=97=85=EB=8D=B0=EC=9D=B4=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@289 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- WebBasedCrawler/base/baseclasses.py | 10 + WebBasedCrawler/kakao/kakaocrawl.py | 2070 +++++++++++---------------- 2 files changed, 847 insertions(+), 1233 deletions(-) diff --git a/WebBasedCrawler/base/baseclasses.py b/WebBasedCrawler/base/baseclasses.py index 19792b8..9330608 100644 --- a/WebBasedCrawler/base/baseclasses.py +++ b/WebBasedCrawler/base/baseclasses.py @@ -22,6 +22,16 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.desired_capabilities import DesiredCapabilities +is_debug = False + +def printl(*objects, sep=' ', end='\n', file=None, flush=True): + print(*objects, sep=sep, end=end, file=file, flush=flush) + + +def printd(*objects, sep=' ', end='\n', file=None, flush=True): + if is_debug: + print(*objects, sep=sep, end=end, file=file, flush=flush) + def print_and_flush(string): print(string) diff --git a/WebBasedCrawler/kakao/kakaocrawl.py b/WebBasedCrawler/kakao/kakaocrawl.py index f156549..90d9414 100644 --- a/WebBasedCrawler/kakao/kakaocrawl.py +++ b/WebBasedCrawler/kakao/kakaocrawl.py @@ -12,1142 +12,74 @@ from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from selenium.common.exceptions import WebDriverException - +from bs4 import BeautifulSoup from base.baseclasses import wait -from base.baseclasses import print_and_flush +from base.baseclasses import printl from base.baseclasses import SendtoDB from 
base.baseclasses import Browser from base.baseclasses import CrawlInit +from base.baseclasses import enter_element + +from kakao.kakaoexception import KakaoCrawlerException +from kakao.kakaoexception import NotFoundElementError +from kakao.kakaoexception import NotFoundDataError + +try: + import lxml + parser_opt = 'lxml' +except ImportError: + parser_opt = 'html.parser' __author__ = 'cococo' kakaostory_url = 'https://story.kakao.com/' kakaostory_channel_url = 'https://story.kakao.com/ch/' - +limit_reload = 5 +num_of_retry = 3 logging.basicConfig(level=logging.INFO, format="%(module)s(%(lineno)s):%(funcName)s:%(message)s") -class KakaoBodyCrawler: - def __init__(self, driver=None): - self.driver = driver - self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") +re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})" + "[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") - def set_driver(self, driver): - self.driver = driver - def set_activity(self, activity): - self.activity = activity +def get_date(element): + """ - def find_article_profileurl(self): - img = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a/img") - return img.get_attribute("src") + :param element: this may be span.time element + :return: 'yyyy-MM-dd hh:mm:ss' + """ + m = re_date.search(element.attrs.get('title', '')) \ + or re_date.search(element.attrs.get('data-tooltip', '')) - def find_article_nickname(self): - a = self.activity.find_element_by_xpath("div/div[@class='add_top']/div[@class='myid']/a") - return a.text + if m: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + # add 12 hour when the article is written at p.m + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) - def find_article_modified_date(self): - try: - span = 
self.activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span") - except: - return None - ac = ActionChains(self.driver) - ac.move_to_element(span).perform() - wait(0.3) - data_tooltip = span.get_attribute("data-tooltip") - m = self.re_date.search(data_tooltip) - if m is None: - return None - else: - temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), - int(m.group(5)), int(m.group(6))) - if m.group(4) == "오후" and int(m.group(5)) < 12: - temp_date += datetime.timedelta(hours=12) - #return temp_date.strftime("%Y-%m-%d") - return temp_date + # convert datetime.datetime to str + return str(temp_date) + # return invalid date instead of exception + else: + # raise NotFoundElementError("get_date exception") + return "0000-00-00 00:00:00" - def find_article_date(self): - time_modified_date = self.find_article_modified_date() - if time_modified_date is not None: - return time_modified_date - a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']") - a.send_keys(Keys.NULL) - ac = ActionChains(self.driver) - ac.move_to_element(a).perform() - wait(0.2) - data_tooltip = a.get_attribute("data-tooltip") - m = self.re_date.search(data_tooltip) - if m is None: - return "0000-00-00 00:00:00" - else: - temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6))) - if m.group(4) == "오후" and int(m.group(5)) < 12: - temp_date += datetime.timedelta(hours=12) - #return temp_date.strftime("%Y-%m-%d") - return str(temp_date) - def find_article_id(self): - a = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a") - href = a.get_attribute("href") - #str_id = href[href.rindex('/') + 1:] - str_id = href.replace(kakaostory_url, "") - return str_id +# function for click X button on content +def click_kakao_close_button(driver): + btn = driver.find_element_by_css_selector("button._btnClose") + btn.send_keys(Keys.NULL) + 
btn.send_keys(Keys.ENTER) - def find_article_url(self): - a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']") - url = a.get_attribute("href") - return url - def find_platform_name(self): - return "kakaostory" - - def find_platform_form(self): - if self.driver.current_url.find("https://story.kakao.com/ch/") != -1: - return 'channel' - elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1: - return 'tag' - else: - return 'story' - - def find_article_form(self): - return "body" - - def find_article_data(self): - more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']" - "/p[@class='more _moreBtnContainer']") - display = more.get_attribute("style") - if display.find('none') == -1: - a = more.find_element_by_tag_name("a") - self.enter_element(a) - try: - content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']" - "/div[@class='txt_wrap']/div[@class='_content']") - except: - return str("") - return content.text - - def click_element(self, element): - ac = ActionChains(self.driver) - ac.move_to_element_with_offset(element, 0, 0).click().perform() - wait(2) - - def enter_element(self, element): - element.send_keys(Keys.NULL) - element.send_keys(Keys.ENTER) - wait(2) - - def find_platform_id(self): - return self.find_article_id() - - def find_article_title(self): - content = self.find_article_data() - if not content: - return "" - try: - return content.strip().splitlines()[0] - except: - return "" - - def find_feeling_users3(self): - try: - a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]") - except: - return None - self.enter_element(a) - inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") - str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']") - 
like_num = int(str_like.text) - fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") - scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") - ac = ActionChains(self.driver) - ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() - while len(fake_scroll.find_elements_by_tag_name("li")) < like_num: - scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") - ac = ActionChains(self.driver) - ac.drag_and_drop_by_offset(scroll, 0, 15).perform() - wait(1) - lis = fake_scroll.find_elements_by_tag_name("li") - data = list() - for li in lis: - try: - a = li.find_element_by_xpath("a[@class='link_people']") - href = a.get_attribute('href') - # str_id = href[href.rindex('/') + 1:] - str_id = href.replace(kakaostory_url, "") - img = a.find_element_by_css_selector("img[class='img_thumb']") - profileurl = img.get_attribute('src') - data.append({'id': str_id, 'profileurl': profileurl}) - except WebDriverException: - raise WebDriverException - except Exception as e: - logging.info(e) - # print_and_flush(e) - feelings = dict() - feelings['data'] = data - feelings['count'] = len(data) - a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']") - self.click_element(a) - wait(1) - return feelings - - def find_reply_users(self): - try: - a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewComments' and not(@style)]") - except: - return None - count = a.find_element_by_css_selector("strong._commentCount").text - if len(count.strip()) < 1: - return None - else: - return int(count.replace(",", "").strip()) - - def find_feeling_users(self): - try: - #a = self.activity.find_element_by_xpath("div/div[@class='comment ']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]") - a = 
self.activity.find_element_by_xpath("div/div/div/a[@class='_btnViewLikes' and not(@style)]") - except: - return None - self.enter_element(a) - # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") - inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']"))) - str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']") - like_num = int(str_like.text.replace(",", "")) - # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") - fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']"))) - start_time = time.time() - while len(fake_scroll.find_elements_by_tag_name("li")) < like_num: - scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") - ac = ActionChains(self.driver) - ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() - ac.drag_and_drop_by_offset(scroll, 0, 30).perform() - wait(0.5) - if time.time() - start_time > 600.0: - break - ul = fake_scroll.find_element_by_tag_name("ul") - data = list() - try: - a_list = ul.find_elements_by_css_selector("a[class='link_people']") - # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']") - for i in range(0, len(a_list)): - href = a_list[i].get_attribute('href') - str_id = href.replace(kakaostory_url, "") - # profileurl = img_list[i].get_attribute('src') - # data.append({'id': str_id, 'profileurl': profileurl}) - data.append({'id': str_id}) - except WebDriverException: - raise WebDriverException - except Exception as e: - logging.info(e) - # print_and_flush(e) - feelings = dict() - feelings['data'] = data - feelings['count'] = len(data) - a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']") - self.click_element(a) - wait(1) - return feelings - - def find_feeling_users2(self): - try: - a = 
self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]") - except: - return None - self.enter_element(a) - inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") - fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") - scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") - scroll_style = scroll.get_attribute("style") - re_height = re.compile("height: ([0-9]*\\.[0-9]+|[0-9]+)px") - re_top = re.compile("top: ([0-9]*\\.[0-9]+|[0-9]+)px") - m_h = re_height.search(scroll_style) - m_t = re_top.search(scroll_style) - if m_t is None: - top = 0.0 - else: - top = float(m_t.group(1)) - if m_h is None: - height = 0.0 - else: - height = float(m_h.group(1)) - ac = ActionChains(self.driver) - ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() - i = 0 - data = list() - while height + top < 320: - lis = fake_scroll.find_elements_by_tag_name("li") - for j in range(i, (i+6) if i+6 < len(lis) else len(lis)): - a = lis[j].find_element_by_xpath("a[@class='link_people']") - href = a.get_attribute('href') - str_id = href[href.rindex('/') + 1:] - em = a.find_element_by_css_selector("em[class='tit_userinfo']") - nickname = em.text - span = a.find_element_by_css_selector("span[class='txt_feel']") - emotion = span.text - img = a.find_element_by_css_selector("img[class='img_thumb']") - profileurl = img.get_attribute('src') - data.append({'id': str_id, 'nickname': nickname, 'emotion': emotion, 'profileurl': profileurl}) - i += 6 - move_pixel = 1968.0 / len(fake_scroll.find_elements_by_tag_name("li")) - ac = ActionChains(self.driver) - ac.drag_and_drop_by_offset(scroll, 0, move_pixel).perform() - wait(1) - scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") - scroll_style = scroll.get_attribute("style") - m_h = re_height.search(scroll_style) - m_t = 
re_top.search(scroll_style) - if m_t is None: - top = 0.0 - else: - top = float(m_t.group(1)) - if m_h is None: - height = 0.0 - else: - height = float(m_h.group(1)) - feelings = dict() - feelings['data'] = data - feelings['count'] = len(data) - a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']") - self.click_element(a) - return feelings - - def find_share_users2(self): - try: - a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]") - except: - return None - self.enter_element(a) - inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") - str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']") - re_share = re.compile("\\(([\\d]+)\\)") - m = re_share.search(str_share.text) - if m is None: - share_num = 0 - else: - share_num = int(m.group(1)) - fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") - scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") - ac = ActionChains(self.driver) - ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() - while len(fake_scroll.find_elements_by_tag_name("li")) < share_num: - scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") - ac = ActionChains(self.driver) - ac.drag_and_drop_by_offset(scroll, 0, 15).perform() - wait(1) - lis = fake_scroll.find_elements_by_tag_name("li") - data = list() - for li in lis: - try: - a = li.find_element_by_xpath("a[@class='link_people']") - href = a.get_attribute('href') - last_slush = href.rindex('/') - # begin_slush = href[:last_slush].rindex('/') - # str_id = href[begin_slush+1:last_slush] - str_id = href[:last_slush].replace(kakaostory_url, "") - img = a.find_element_by_css_selector("img[class='img_thumb']") - profileurl = img.get_attribute('src') - data.append({'id': str_id, 'profileurl': 
profileurl}) - except WebDriverException: - raise WebDriverException - except Exception as e: - logging.info(e) - # print_and_flush(e) - shares = dict() - shares['data'] = data - shares['count'] = len(data) - a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']") - self.click_element(a) - return shares - - def find_share_users(self): - try: - #a = self.activity.find_element_by_xpath("div/div[@class='comment ']/div[@class='count_group _countContainer']/a[@class='_btnViewShareList' and not(@style)]") - a = self.activity.find_element_by_xpath("div/div/div/a[@class='_btnViewStoryShareList' and not(@style)]") - except: - return None - self.enter_element(a) - # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") - inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']"))) - str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']") - re_share = re.compile("([\\d]+)") - m = re_share.search(str_share.text) - if m is None: - share_num = 0 - else: - share_num = int(m.group(1).replace(",", "")) - # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") - fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']"))) - start_time = time.time() - while len(fake_scroll.find_elements_by_tag_name("li")) < share_num: - scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") - ac = ActionChains(self.driver) - ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() - ac.drag_and_drop_by_offset(scroll, 0, 30).perform() - wait(0.5) - if time.time() - start_time > 600.0: - break - ul = fake_scroll.find_element_by_tag_name("ul") - data = list() - try: - a_list = ul.find_elements_by_css_selector("a[class='link_people']") - # img_list = 
ul.find_elements_by_css_selector("img[class='img_thumb']") - for i in range(0, len(a_list)): - href = a_list[i].get_attribute('href') - last_slush = href.rindex('/') - # begin_slush = href[:last_slush].rindex('/') - # str_id = href[begin_slush+1:last_slush] - str_id = href[:last_slush].replace(kakaostory_url, "") - # profileurl = img_list[i].get_attribute('src') - # data.append({'id': str_id, 'profileurl': profileurl}) - data.append({'id': str_id}) - except WebDriverException: - raise WebDriverException - except Exception as e: - logging.info(e) - # print_and_flush(e) - shares = dict() - shares['data'] = data - shares['count'] = len(data) - a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']") - self.click_element(a) - return shares - - def find_platform_title(self): - return self.driver.title - - def get_content(self): - content = dict() - content["article_id"] = self.find_article_id() - # print_and_flush("article_id") - content["article_nickname"] = self.find_article_nickname() - # print_and_flush("article_nickname") - content["article_title"] = self.find_article_title() - # print_and_flush("article_title") - content["article_date"] = self.find_article_date() - # print_and_flush("article_date") - #content["article_hit"] = self.find_article_hit() - content["article_url"] = self.find_article_url() - # print_and_flush("article_url") - content["article_data"] = self.find_article_data() - # print_and_flush("article_data") - content["article_form"] = self.find_article_form() - # print_and_flush("article_form") - content["article_profileurl"] = self.find_article_profileurl() - # print_and_flush("article_profileurl") - #content["platform_title"] = self.find_platform_title() - content["platform_title"] = content["article_nickname"] - # print_and_flush("platform_title") - content["platform_name"] = self.find_platform_name() - if content["article_url"].find(kakaostory_channel_url) != -1: - content["platform_form"] = "channel" - else: - 
content["platform_form"] = "story" - # print_and_flush("platform_form") - content["platform_id"] = self.find_platform_id() - # print_and_flush("platform_id") - data = list() - # print_and_flush("start feelings") - feelings = self.find_feeling_users() - # print_and_flush("feelings") - # print_and_flush("done feelings") - if feelings is not None: - data.append({"feelings": feelings}) - content["article_profile"] = str(feelings["count"]) - # print_and_flush("start shares") - shares = self.find_share_users() - # print_and_flush("shares") - # print_and_flush("done shares") - if shares is not None: - data.append({"shares": shares}) - content["reply_url"] = str(shares["count"]) - if data: - json_data = {"data": data} - content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode(json_data) - reply_count = self.find_reply_users() - if type(reply_count) == int: - content["article_order"] = reply_count - return content - - -class KakaoReplyCrawler_backup: - def __init__(self, driver=None, activity=None): - self.driver = driver - self.activity = activity - self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") - self.reply_list = list() - self.order = 0 - - def find_init(self): - self.reply_list.clear() - self.order = 0 - - def set_driver(self, driver): - self.driver = driver - - def set_activity(self, activity): - self.activity = activity - - def has_more(self): - more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']") - if more.get_attribute('style').find('block') != -1: - return True - else: - return False - - def read_more_reply(self): - more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']") - a = more.find_element_by_css_selector("a[class='_btnCommentMore']") - self.enter_element(a) - - def read_all_reply(self): - while self.has_more(): - self.read_more_reply() - - def get_reply_lis(self): - ul = 
self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul") - lis = ul.find_elements_by_tag_name("li") - return lis - - def has_reply(self): - try: - ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul") - lis = ul.find_elements_by_tag_name("li") - if len(lis) > 0: - return True - else: - return False - except: - return False - - def crawl_reply(self, li): - content = dict() - content["article_id"] = self.find_article_id(li) - content["article_nickname"] = self.find_article_nickname(li) - content["article_date"] = self.find_article_date(li) - content["article_data"] = self.find_article_data(li) - content["article_order"] = self.order - content["article_url"] = self.find_article_url(li) - content["platform_id"] = self.find_platform_id(li) - content["article_form"] = self.find_article_form() - content["article_profileurl"] = self.find_article_profileurl(li) - content["platform_name"] = self.find_platform_name() - if content["article_url"].find(kakaostory_channel_url) != -1: - content["platform_form"] = "channel" - else: - content["platform_form"] = "story" - article_parent = self.find_article_parent(li) - if article_parent is not None: - content["article_parent"] = article_parent - self.order += 1 - self.reply_list.append(content) - - def get_content(self): - return self.reply_list - - def crawl_all(self): - self.find_init() - self.read_all_reply() - try: - lis = self.get_reply_lis() - for li in lis: - self.crawl_reply(li) - except WebDriverException: - raise WebDriverException - except Exception as e: - logging.info(e) - # print_and_flush(e) - - def find_article_id(self, li): - a = li.find_element_by_xpath("div[@class='pf']/a") - href = a.get_attribute('href') - str_id = href.replace(kakaostory_url, "").strip() - return str_id - - def find_article_profileurl(self, li): - img = li.find_element_by_xpath("div[@class='pf']/a/img") - return img.get_attribute('src') - - def find_article_nickname(self, li): - a = 
li.find_element_by_xpath("div[@class='txt']/p/a[@data-profile-popup]") - # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']") - # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel - return a.text - - def find_article_date(self, li): - a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']") - # a.send_keys(Keys.NULL) - # ac = ActionChains(self.driver) - # ac.move_to_element(a).perform() - # wait(0.1) - # data_tooltip = a.get_attribute("data-tooltip") - data_tooltip = a.get_attribute("title") - #a.get_attribute('title') <-- data_tooltip - m = self.re_date.search(data_tooltip) - if m is None: - return "0000-00-00 00:00:00" - else: - temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6))) - if m.group(4) == "오후" and int(m.group(5)) < 12: - temp_date += datetime.timedelta(hours=12) - #return temp_date.strftime("%Y-%m-%d") - return str(temp_date) - - def find_article_parent(self, li): - comment = li.find_element_by_xpath("div[@class='txt']") - try: - a = comment.find_element_by_xpath("a[@data-profile-popup]") - return a.text - except: - return None - - def find_article_data(self, li): - all_element = li.find_element_by_xpath("div[@class='txt']") - all_text = all_element.text - p = all_element.find_element_by_tag_name('p') - p_text = p.text - return all_text[len(p_text):].strip() - - def find_article_url(self, li): - a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']") - href = a.get_attribute("href") - return href[:href.rindex('/')] - - def find_platform_id(self, li): - article_url = self.find_article_url(li) - main_url = article_url[:article_url.rindex('/')] - #return main_url[main_url.rindex('/')+1:] - return main_url.replace(kakaostory_url, "") - - def find_article_form(self, li=None): - return 'reply' - - def find_platform_name(self, li=None): - return 'kakaostory' - - 
def find_platform_form(self, li=None): - if self.driver.current_url.find("https://story.kakao.com/ch/") != -1: - return 'channel' - elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1: - return 'tag' - else: - return 'story' - - def click_element(self, element): - ac = ActionChains(self.driver) - ac.move_to_element_with_offset(element, 0, 0).click().perform() - wait(2) - - def enter_element(self, element): - element.send_keys(Keys.NULL) - element.send_keys(Keys.ENTER) - wait(2) - - def find_like_count(self, li): - try: - like = li.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']") - return like.text - except: - return '0' - - -class KakaoReplyCrawler: - def __init__(self, driver=None, activity=None): - self.driver = driver - self.activity = activity - self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") - self.reply_list = list() - self.order = 0 - - def find_init(self): - self.reply_list.clear() - self.order = 0 - - def set_driver(self, driver): - self.driver = driver - - def set_activity(self, activity): - self.activity = activity - - def has_more(self): - try: - more = self.activity.find_element_by_css_selector("p[class='more _showMoreCommentContainer']") - except: - try: - more = self.activity.find_element_by_css_selector("p[class='more _showPrevCommentContainer']") - except: - return False - if more.get_attribute('style').find('block') != -1: - return True - else: - return False - - def read_more_reply(self): - try: - more = self.activity.find_element_by_css_selector("p[class='more _showMoreCommentContainer']") - a = more.find_element_by_css_selector("a[class='_btnShowMoreComment']") - except: - more = self.activity.find_element_by_css_selector("p[class='more _showPrevCommentContainer']") - a = more.find_element_by_css_selector("a[class='_btnShowPrevComment']") - self.enter_element(a) - - def 
read_all_reply(self): - start_time = time.time() - while self.has_more(): - self.read_more_reply() - if time.time() - start_time > 600.0: - raise WebDriverException - - def get_reply_ul(self): - ul = self.activity.find_element_by_xpath("div/div/div/ul[@class='list _listContainer']") - return ul - - def has_reply(self): - try: - ul = self.activity.find_element_by_xpath("div/div/div/ul[@class='list _listContainer']") - lis = ul.find_elements_by_tag_name("li") - if len(lis) > 0: - return True - else: - return False - except: - return False - - def crawl_reply(self, ul): - article_id = self.find_article_id(ul) - article_nickname = self.find_article_nickname(ul) - article_date = self.find_article_date(ul) - article_data = self.find_article_data(ul) - article_url = self.find_article_url(ul) - platform_id = self.find_platform_id(ul) - article_profileurl = self.find_article_profileurl(ul) - article_parent = self.find_article_parent(ul) - # print_and_flush(str(len(article_id))) - # print_and_flush(str(len(article_nickname))) - # print_and_flush(str(len(article_date))) - # print_and_flush(str(len(article_data))) - # print_and_flush(str(len(article_url))) - # print_and_flush(str(len(platform_id))) - # print_and_flush(str(len(article_profileurl))) - # print_and_flush(str(len(article_parent))) - if article_url[0].find(kakaostory_channel_url) != -1: - platform_form = "channel" - else: - platform_form = "story" - for i in range(0, len(article_id)): - content = dict() - content["article_id"] = article_id[i] - content["article_nickname"] = article_nickname[i] - content["article_profileurl"] = article_profileurl[i] - content["article_url"] = article_url[i] - content["platform_id"] = platform_id[i] - content["article_date"] = article_date[i] - content["article_data"] = article_data[i] - content["platform_form"] = platform_form - content["article_order"] = i - content["platform_name"] = self.find_platform_name() - content["article_form"] = self.find_article_form() - if 
len(article_parent[i]) > 0: - content["article_parent"] = article_parent[i] - self.reply_list.append(content) - - def get_content(self): - return self.reply_list - - def crawl_all(self): - self.find_init() - self.read_all_reply() - try: - ul = self.get_reply_ul() - self.crawl_reply(ul) - except WebDriverException: - raise WebDriverException - except Exception as e: - logging.info(e) - # print_and_flush(e) - - def find_article_id(self, ul): - a_list = ul.find_elements_by_xpath("li/div[@class='pf']/a") - str_id_list = list() - for a in a_list: - href = a.get_attribute('href') - str_id = href.replace(kakaostory_url, "").strip() - str_id_list.append(str_id) - return str_id_list - - def find_article_profileurl(self, ul): - img = ul.find_elements_by_xpath("li/div[@class='pf']/a/img") - img_list = list() - for im in img: - img_list.append(im.get_attribute('src')) - return img_list - - def find_article_nickname(self, ul): - a = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@data-profile-popup]") - # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']") - # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel - nickname_list = list() - for i in a: - nickname_list.append(i.text) - return nickname_list - - def find_article_date(self, ul): - a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']") - # a.send_keys(Keys.NULL) - # ac = ActionChains(self.driver) - # ac.move_to_element(a).perform() - # wait(0.1) - # data_tooltip = a.get_attribute("data-tooltip") - date_list = list() - for a in a_list: - data_tooltip = a.get_attribute("title") - m = self.re_date.search(data_tooltip) - if m is None: - date_list.append("0000-00-00 00:00:00") - else: - temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6))) - if m.group(4) == "오후" and int(m.group(5)) < 12: - temp_date += datetime.timedelta(hours=12) - 
#return temp_date.strftime("%Y-%m-%d") - date_list.append(str(temp_date)) - return date_list - - def find_article_parent(self, ul): - comments = ul.find_elements_by_xpath("li/div[@class='txt']") - article_parents = list() - for comment in comments: - try: - a = comment.find_element_by_xpath("a[@data-profile-popup]") - article_parents.append(a.text) - except: - article_parents.append("") - return article_parents - - def find_article_data(self, ul): - all_elements = ul.find_elements_by_xpath("li/div[@class='txt']") - all_elements_p = ul.find_elements_by_xpath("li/div[@class='txt']/p") - all_text_list = list() - for i in range(0, len(all_elements)): - all_text = all_elements[i].text - p_text = all_elements_p[i].text - all_text_list.append(all_text[len(p_text):].strip()) - return all_text_list - - def find_article_url(self, ul): - a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']") - article_url_list = list() - for a in a_list: - href = a.get_attribute("href") - article_url_list.append(href[:href.rindex('/')]) - return article_url_list - - def find_platform_id(self, ul): - article_urls = self.find_article_url(ul) - platform_id = list() - for article_url in article_urls: - main_url = article_url[:article_url.rindex('/')] - #return main_url[main_url.rindex('/')+1:] - platform_id.append(main_url.replace(kakaostory_url, "")) - return platform_id - - def find_article_form(self, ul=None): - return 'reply' - - def find_platform_name(self, ul=None): - return 'kakaostory' - - def find_platform_form(self, ul=None): - if self.driver.current_url.find("https://story.kakao.com/ch/") != -1: - return 'channel' - elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1: - return 'tag' - else: - return 'story' - - def click_element(self, element): - ac = ActionChains(self.driver) - ac.move_to_element_with_offset(element, 0, 0).click().perform() - wait(2) - - def enter_element(self, element): - element.send_keys(Keys.NULL) - 
element.send_keys(Keys.ENTER) - wait(2) - - def find_like_count(self, ul): - try: - like = ul.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']") - return like.text - except: - return '0' - - -class KakaoPageCrawler: - def __init__(self, driver=None, begin_date=None, end_date=None): - self.driver = driver - self.activity_data_model_set = set() - self.begin_date = begin_date - self.end_date = end_date - self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") - self.index = 0 - self.activities = None - self.present_activity = 0 - self.previous_activity = 0 - self.reload_count = 0 - - def move_to_url(self, url): - self.driver.get(url) - self.index = 0 - self.activity_data_model_set.clear() - - def init(self): - self.index = 0 - self.previous_activity = 0 - self.activities = None - self.activity_data_model_set.clear() - - def set_date(self, begin_date, end_date): - self.set_begin_date(begin_date) - self.set_end_date(end_date) - - def set_end_date(self, end_date): - if type(end_date) == str: - self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d') - elif type(end_date) == datetime.datetime or type(end_date) == datetime.date: - self.end_date = end_date - else: - self.end_date = datetime.datetime.today() - self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day) - self.end_date += datetime.timedelta(days=1) - - def set_begin_date(self, begin_date): - if type(begin_date) == str: - self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d') - elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date: - self.begin_date = begin_date - else: - self.begin_date = datetime.datetime.today() - self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day) - - def next_activity_backup(self): - try: 
- if not self.activities: - self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[class='section _activity']"))) - self.index = 0 - if len(self.activities) == 0: - print_and_flush("activities are not found") - self.activities = None - return None - except: - print_and_flush("activities are not found") - self.activities = None - return None - has_more_activities = True - self.present_activity = len(self.activities) - while has_more_activities: - for activity in self.activities[self.previous_activity:]: - if activity.get_attribute("data-model") in self.activity_data_model_set: - continue - self.activity_data_model_set.add(activity.get_attribute("data-model")) - time_date = self.find_article_date(activity) - if self.is_earlier(time_date): - self.activities = None - return None - if self.is_late(time_date): - continue - return activity - self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - if len(self.activities) == self.present_activity: - has_more_activities = self.load_more_activities() - self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - else: - has_more_activities = True - self.previous_activity = self.present_activity - self.present_activity = len(self.activities) - self.activities = None - return None - - def next_activity(self): - try: - if self.activities is None: - self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located( - (By.CSS_SELECTOR, "div[class='section _activity']")) - ) - if len(self.activities) == 0: - print_and_flush("activities are not found") - self.activities = None - return None - except: - print_and_flush("activities are not found") - self.activities = None - return None - while True: - self.index += 1 - if self.index > len(self.activities): - self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - if self.index > 
len(self.activities): - if self.load_more_activities() is False: - self.activities = None - return None - self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - if self.activities[self.index - 1].get_attribute("data-model") in self.activity_data_model_set: - continue - time_date = self.find_article_date(self.activities[self.index - 1]) - time_modified_date = self.find_article_modified_date(self.activities[self.index - 1]) - if time_modified_date is not None: - time_date = time_modified_date - print("number of post:", self.index, flush=True) - print(str(time_date), flush=True) - if type(time_date) == str: - continue - if self.is_earlier(time_date): - self.activities = None - return None - if self.is_late(time_date): - continue - return self.activities[self.index - 1] - - def crawling_ok(self): - self.activity_data_model_set.add(self.activities[self.index - 1].get_attribute("data-model")) - - def next_activity_prepare(self): - try: - activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - if len(activities) == 0: - return None - except: - return None - has_more_activities = True - while has_more_activities: - if self.index < len(activities): - temp_index = self.index - self.index += 1 - time_date = self.find_article_date(activities[temp_index]) - if self.is_earlier(time_date): - return None - if self.is_late(time_date): - continue - return activities[temp_index] - else: - has_more_activities = self.load_more_activities() - activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - return None - - def load_more_activities(self): - previous_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - for i in range(0, 4): - print_and_flush("Try load more") - body = self.driver.find_element_by_tag_name("body") - body.send_keys(Keys.NULL) - body.send_keys(Keys.END) - wait(4) - present_activities = 
self.driver.find_elements_by_css_selector("div[class='section _activity']") - if len(previous_activities) != len(present_activities): - wait(2) - self.reload_count = 0 - return True - for i in range(0, 4): - print_and_flush("Try load more") - body = self.driver.find_element_by_tag_name("body") - for j in range(0, 2): - body.send_keys(Keys.PAGE_UP) - wait(0.1) - for j in range(0, 15): - body.send_keys(Keys.PAGE_DOWN) - wait(0.1) - wait(4) - present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - if len(previous_activities) != len(present_activities): - wait(2) - self.reload_count = 0 - return True - for i in range(0, 10): - print_and_flush("Try load more") - self.driver.execute_script("window.scrollBy(0, 800)") - wait(4) - present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") - if len(previous_activities) != len(present_activities): - wait(2) - self.reload_count = 0 - return True - if self.reload_count < 8: - print_and_flush("index reload") - self.reload_count += 1 - self.index -= 1 if self.index > 0 else 0 - position = self.driver.get_window_position() - size = self.driver.get_window_size() - self.driver.maximize_window() - self.driver.set_window_size(size['width'], size["height"]) - self.driver.set_window_position(position['x'], position['y']) - return True - if self.reload_count < 10: - print_and_flush("refresh") - self.driver.refresh() - wait(5) - self.index = 0 - self.reload_count += 1 - return True - return False - - def is_earlier(self, time_date): - return True if time_date < self.begin_date else False - - def is_late(self, time_date): - return True if time_date > self.end_date else False - - def set_driver(self, driver): - self.driver = driver - - def find_article_date(self, activity): - a = activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']") - a.send_keys(Keys.NULL) - ac = ActionChains(self.driver) - ac.move_to_element(a).perform() - 
wait(0.5) - ac.move_to_element(a).perform() - wait(0.5) - data_tooltip = a.get_attribute("data-tooltip") - m = self.re_date.search(data_tooltip) - if m is None: - data_tooltip = a.get_attribute("title") - m = self.re_date.search(data_tooltip) - if m is None: - return "0000-00-00 00:00:00" - else: - temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), - int(m.group(5)), int(m.group(6))) - if m.group(4) == "오후" and int(m.group(5)) < 12: - temp_date += datetime.timedelta(hours=12) - #return temp_date.strftime("%Y-%m-%d") - return temp_date - else: - temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), - int(m.group(5)), int(m.group(6))) - if m.group(4) == "오후" and int(m.group(5)) < 12: - temp_date += datetime.timedelta(hours=12) - #return temp_date.strftime("%Y-%m-%d") - return temp_date - - def find_article_modified_date(self, activity): - try: - span = activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span") - except: - return None - ac = ActionChains(self.driver) - ac.move_to_element(span).perform() - wait(0.8) - data_tooltip = span.get_attribute("data-tooltip") - wait(0.2) - m = self.re_date.search(data_tooltip) - if m is None: - return None - else: - temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), - int(m.group(5)), int(m.group(6))) - if m.group(4) == "오후" and int(m.group(5)) < 12: - temp_date += datetime.timedelta(hours=12) - #return temp_date.strftime("%Y-%m-%d") - return temp_date +def find_element_by_css_selector(driver, css_selector, wait_second=10): + element = WebDriverWait(driver, wait_second).until( + EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) + return element class KakaoInit(CrawlInit): @@ -1195,66 +127,797 @@ class KakaoInit(CrawlInit): return self.end_day() +class BodyCrawler(object): + def __init__(self, driver): + self.driver = driver + self.soup = None + self.section_activity = None + 
self.set_soup_and_activity() + if not self.section_activity: + raise NotFoundElementError("section _activity is not Found") + + # calling point may differ + def set_soup_and_activity(self): + self.soup = BeautifulSoup(self.driver.page_source, parser_opt) + # There are many div.section _activity. But element we use is in div.cover_wrapper + cover_wrapper = self.soup.find('div', class_='cover_wrapper') + self.section_activity = cover_wrapper.find('div', class_='section _activity') + + def find_article_id(self): + a = self.section_activity.find('a', class_='pf_name') + href = a.attrs['href'].replace('https://story.kakao.com/', '') + return href[1:] if href.startswith('/') else href + + def find_article_nickname(self): + a = self.section_activity.find('a', class_='pf_name') + return a.text + + def find_article_url(self): + # in chrome, current_url is equal to article_url + # need to check other browser + return self.driver.current_url + + def find_article_modified_date(self): + # get DOM about modified date + times = None + add_top = self.section_activity.find('div', class_='add_top') + if add_top: + times = add_top.find_all('span', class_='time') + + # written time is default. if the article was modified, modified time is added. + # so if length of times is not equal to 2, there is only written time. 
+ if not times or len(times) < 2: + return None + + # times[0] : written time, times[1] : modified time + # times[1] structure : + # check times[1].span exists + if times[1].span: + + # before mouse over the element(tooltip), the date string is in the title attribute of span + # after mouse over the element(tooltip), the date string is in the data-tooltip attribute of span + m = re_date.search(times[1].span.attrs.get('title', '')) \ + or re_date.search(times[1].span.attrs.get('data-tooltip', '')) + + if m: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + # add 12 hour when the article is written at p.m + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + + # convert datetime.datetime to str + return str(temp_date) + else: + # raise NotFoundDataError('data for find_article_modified is not found') + return None + + # return None instead of exception. + else: + # raise NotFoundElementError('find_article_modified DOM is missing') + return None + + def find_article_date(self): + # modified date is a higher priority than written date + modified_date = self.find_article_modified_date() + if modified_date: + return modified_date + times = None + # get DOMs about date + add_top = self.section_activity.find('div', class_='add_top') + if add_top: + times = add_top.find_all('span', class_='time') + else: + raise NotFoundElementError("find_article_data DOM is missing : add_top") + if not times: + raise NotFoundElementError("find_article_data DOM is missing : time") + + # before mouse over the element(tooltip), the date string is in the title attribute of span + # after mouse over the element(tooltip), the date string is in the data-tooltip attribute of span + m = re_date.search(times[0].attrs.get('title', '')) \ + or re_date.search(times[0].attrs.get('data-tooltip', '')) + + if m: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + 
int(m.group(5)), int(m.group(6))) + # add 12 hour when the article is written at p.m + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + + # convert datetime.datetime to str + return str(temp_date) + # return invalid date instead of exception + else: + # raise NotFoundElementError("find_article_date exception") + return "0000-00-00 00:00:00" + + def find_article_profileurl(self): + profile_area = self.section_activity.find('div', class_='_profileArea pf') + # check a>img + if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'): + return profile_area.a.img.get('src') + # this is not essential, so no exception occur + else: + return '' + + def find_article_data(self): + """ + :return: trimmed article_data + """ + content = self.section_activity.find('div', class_='txt_wrap') + if content and content.text: + # trim + return content.text.strip().replace('\xa0', '\n') + # if there is no content or text, return empty data + else: + return '' + + def find_article_title(self): + # strong.tit_channel is title of channel + # if strong.tit_channel do not exist, + # title is first line of article_data + # this definition is determined by me -_- + # find_article_data return trimmed string + strong = self.section_activity.find('strong', class_='tit_channel') + if strong and strong.text: + return strong.text.replace('\xa0', '') + + article_data = self.find_article_data() + if article_data: + for line in article_data.splitlines(): + # limit title length + return line[0:30] if len(line) > 30 else line + else: + return '' + + def find_article_etc(self, class_name): + """ + this function is used for crawling number of shares, replies and feelings + :param class_name: + :return: a string of number of shares, replies, or feelings + """ + element = self.section_activity.find('strong', class_=class_name) + + # check element has text that indicate the number + if element and element.text: + # It may contain comma ',' to 
recognize easily + # Remove comma ',' to convert from str to int + txt = element.text.replace(',', '') + return txt + # if there is no element or text, return '0' instead of raising exception + else: + # raise NotFoundElementError('find_article_etc is not Found element with ' + class_name) + return '0' + + def find_article_share(self): + return self.find_article_etc('_storyShareCount') + + def find_article_feeling(self): + return self.find_article_etc('_likeCount') + + def find_article_reply_num(self): + return self.find_article_etc('_commentCount') + + def find_platform_form(self): + article_id = self.find_article_id() + return 'channel' if article_id.startswith('ch/') else 'story' + + def get(self): + """ + you need to put 'keyword_id' + :return: dict for crawled body content + """ + content = dict() + content['article_id'] = self.find_article_id() + content['article_nickname'] = self.find_article_nickname() + content['article_data'] = self.find_article_data() + content['article_title'] = self.find_article_title() + content['article_date'] = self.find_article_date() + content['article_url'] = self.find_article_url() + content['article_profileurl'] = self.find_article_profileurl() + content['article_order'] = self.find_article_reply_num() + content['article_parent'] = self.find_article_share() + content['reply_url'] = self.find_article_feeling() + content['platform_form'] = self.find_platform_form() + content['article_form'] = 'body' + content['platform_name'] = 'kakaostory' + content['platform_id'] = content['article_id'] + content['platform_title'] = content['article_nickname'] + return content + + +class ReplyCrawler(object): + def __init__(self, driver): + self.driver = driver + self.soup = None + self.section_activity = None + self.ul = None + self.lis = None + + def set_soup_and_activity(self): + self.soup = BeautifulSoup(self.driver.page_source, parser_opt) + # There are many div.section _activity. 
But element we use is in div.cover_wrapper + cover_wrapper = self.soup.find('div', class_='cover_wrapper') + self.section_activity = cover_wrapper.find('div', class_='section _activity') + self.ul = self.section_activity.find('ul', class_='list _listContainer') + + def load_all_reply(self): + previous_num_of_replies = 0 + while self.has_more(): + self.click_load_more_reply_btn() + # check number of replies before and after click_load_more_reply_btn() + # If These were equal, the link or ajax failed + current_num_of_replies = self.get_num_of_replies() + if previous_num_of_replies == current_num_of_replies: + break + previous_num_of_replies = current_num_of_replies + + def get_num_of_replies(self): + # Find ul element that contains replies + # if raise occur, there is no reply + # for performance, this method may is implemented using bs4 + try: + ul = find_element_by_css_selector(self.driver, + "div.cover_wrapper " + "div[class='section _activity'] " + "ul[class='list _listContainer']", 5) + li = ul.find_elements_by_tag_name('li') + return len(li) + except Exception as e: + return 0 + + def click_load_more_reply_btn(self): + try: + # find a link to load reply and click/enter it + a = find_element_by_css_selector(self.driver, + "div.cover_wrapper " + "div[class='section _activity'] " + "a[class='_btnShowMoreComment']", 5) + enter_element(a) + + # no link is in the browser. Nothing happens instead raise exception. 
But log this event + except Exception as e: + printl("In click_load_more_reply_btn, there is not a link to load replies") + printl(e) + + def has_more(self): + # In the case that raise exception, + # there is no more reply or css selector of the show_more is invalid + # These two case can't be classified by exception because the logic is same + try: + # find show_more element + show_more = find_element_by_css_selector(self.driver, + "div.cover_wrapper " + "div[class='section _activity'] " + "p[class='more _showMoreCommentContainer']", 5) + + # 'display:block;' -> display the button, 'display:none;' -> hide the button + if 'block' in show_more.get_attribute('style'): + return True + else: + return False + # return False in the two case + # First case is that loading replies is finished + # Second case is that css selector to find element is invalid + except Exception as e: + return False + + # find_xxxx functions + + def find_article_id(self): + # Find name placeholder + divs = self.ul.find_all('a', class_='name _namePlaceholder') + # Get article_ids and remove kakaostory url in article_id + article_ids = [div.attrs.get('href', '').replace(kakaostory_url, '') + for div in divs if div.attrs.get('href', '')] + # Refine hrefs. Href may start with '/' + article_id = map(lambda x: x[1:] if x.startswith('/') else x, article_ids) + # Return list because of unification of types + return list(article_id) + + def find_article_nickname(self): + divs = self.ul.find_all('a', class_='name _namePlaceholder') + # If div.text exist, return div.text. Otherwise return empty string + return [div.text if div.text else '' for div in divs] + + def find_article_data(self): + divs = self.ul.find_all('div', class_='txt') + # The div.text has meta-data in div.p.text. 
If meta-data exists, remove it + # When element does not exists, return empty string + return [div.text[len(div.p.text):].replace('\xa0', '\n') + if div.p else div.text if div.text else '' for div in divs] + + def find_article_date(self): + divs = self.ul.find_all('span', class_='time') + return list(map(get_date, divs)) + + def find_article_like(self): + spans = self.ul.find_all('span', class_='like_num _likeCommentCount') + # The number of like exists in span.like_num _likeCommentCount Unless it is present + return [span.text if span.text else '' for span in spans] + + def find_article_profileurl(self): + divs = self.ul.find_all('div', class_='pf') + return list(map(lambda div: div.a.img.attrs.get('src', '') if div.a and div.a.img else '', divs)) + + def get(self): + """ + Need to put platform_title, platform_id, platform_form from body + :return: a list of replies. Need to put platform_title, platform_id + """ + # load all replies + self.load_all_reply() + + # After loading all replies, crawl replies using BeautifulSoup + self.set_soup_and_activity() + + article_ids = self.find_article_id() + article_nicknames = self.find_article_nickname() + article_datas = self.find_article_data() + article_dates = self.find_article_date() + article_profileurls = self.find_article_profileurl() + article_likes = self.find_article_like() + article_url = self.driver.current_url + + replies = [] + # This may occur exception when indices of each elements is not matched + # This exception described above is intended + for i in range(len(article_ids)): + reply = dict() + reply['article_id'] = article_ids[i] + reply['article_nickname'] = article_nicknames[i] + reply['article_data'] = article_datas[i] + reply['article_date'] = article_dates[i] + reply['article_profileurl'] = article_profileurls[i] + reply['reply_url'] = article_likes[i] + reply['platform_name'] = 'kakaostory' + reply['article_form'] = 'reply' + reply['article_url'] = article_url + reply['article_order'] = str(i) + 
replies.append(reply) + return replies + + +class ListTraverse(object): + def __init__(self, driver): + self.driver = driver + self.current_section = None + + def remove_current_section(self): + tag_name = self.current_section.tag_name + data_model = self.current_section.get_attribute("data-model") + css_selector = tag_name + "[data-model='" + data_model + "']" + self.driver.execute_script('document.querySelector("' + css_selector + '").remove()') + self.current_section = None + + # This is the same as the move_first_section function + def move_next_section(self): + self.move_first_section() + + # Load list more + def load_list_more(self): + position = self.driver.get_window_position() + size = self.driver.get_window_size() + self.driver.maximize_window() + self.driver.set_window_size(size['width'], size["height"]) + self.driver.set_window_position(position['x'], position['y']) + for _ in range(2): + self.driver.execute_script("window.scrollBy(0, -400)") + time.sleep(0.3) + for _ in range(4): + self.driver.execute_script("window.scrollBy(0, 800)") + time.sleep(0.3) + + def close_current_section(self): + # click close button on the page section + try: + btn = find_element_by_css_selector(self.driver, "button._btnClose", 5) + btn.send_keys(Keys.NULL) + btn.send_keys(Keys.ENTER) + except Exception as e: + printl("There is not X button on the page") + printl(e) + + # check, verify and close current section + try: + btn = find_element_by_css_selector(self.driver, "button._btnClose", 1) + btn.send_keys(Keys.NULL) + btn.send_keys(Keys.ENTER) + except Exception as e: + pass + + def get_current_section_data_model(self): + return self.current_section.get_attribute('data-model') if self.current_section else "" + + # check body is loaded + def is_loaded_body(self): + try: + section_activity = find_element_by_css_selector(self.driver, + "div.cover_wrapper div[class='section _activity']") + return True if section_activity else False + except WebDriverException as we: + 
printl("Body is not loaded on browser : is_loaded_body") + printl(we) + raise + + # + def check_list_and_load(self): + for _ in range(limit_reload): + num_of_list = self.get_num_of_list() + if not num_of_list: + self.load_list_more() + num_of_list = self.get_num_of_list() + if not num_of_list: + raise WebDriverException("There is no data or ajax error") + + def move_first_section(self): + raise NotImplementedError + + def open_current_section(self): + raise NotImplementedError + + def get_num_of_list(self): + raise NotImplementedError + + def get_date_of_current_section(self): + raise NotImplementedError + + +class ListTag(ListTraverse): + # open url -> move_first_section -> open_current_section -> + # check date -> crawl / ignore -> close_current_section -> remove_current_section -> next_section -> + # open_current_section + + def __init__(self, driver): + ListTraverse.__init__(self, driver) + + # Raising exception is intended when first element is not found + # Set current_section on div + def move_first_section(self): + try: + recent_section_field = \ + find_element_by_css_selector(self.driver, "div.cont_recomm[data-part-name='recentFeeds']", 10) + self.current_section = recent_section_field.find_element_by_css_selector('div.img_item') + except Exception as e: + printl("Do not find first recent section") + raise + + # Raising exception is intended when fail to find a link to a content + def open_current_section(self): + try: + # The element to find is a tag. 
Its class attribute is link_thumb _link or link_txt _link + a = self.current_section.find_element_by_css_selector("a[class$=' _link']") + a.send_keys(Keys.NULL) + a.send_keys(Keys.ENTER) + except WebDriverException as we: + printl("open_current_section error") + printl(we) + printl(self.current_section.get_attribute('data-model')) + raise KakaoCrawlerException("open_current_section error") + except Exception as e: + printl("Unknown Occurs") + printl(e) + raise + + # Raising exception is intended when fail to find the element or the text containing date + def get_date_of_current_section(self): + # Find the element containing date and extract text from it. If not, raise exception. + try: + div = find_element_by_css_selector(self.driver, "div.cover_wrapper") + span = div.find_element_by_css_selector("div.add_top span.time") + text_date = span.get_attribute('title') or span.get_attribute('data-tooltip') + except WebDriverException as we: + printl("Element is not found in get_date_of_current_section") + printl(we) + raise NotFoundElementError("Element is not found in get_date_of_current_section") + except Exception as e: + printl("Unknown Exception") + printl(e) + raise + + # Check the text containing date info is valid. 
If not, raise exception + if text_date and len(text_date) > 6: + m = re_date.search(text_date) or re_date.search(text_date) + if m: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + # add 12 hour when the article is written at p.m + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + + # convert datetime.datetime to str + return str(temp_date) + else: + raise NotFoundDataError("Date is missing when calling get_date_of_current_section") + else: + raise NotFoundDataError("Date is missing when calling get_date_of_current_section") + + def get_num_of_list(self): + items = self.driver.find_elements_by_css_selector("div[class^='img_item']") + return len(items) if items else 0 + + +class ListUser(ListTraverse): + def __init__(self, driver): + ListTraverse.__init__(self, driver) + + def move_first_section(self): + try: + recent_section_field = \ + find_element_by_css_selector(self.driver, "div.feed[data-part-name='content']", 10) + self.current_section = recent_section_field.find_element_by_css_selector("div[class='section _activity']") + except WebDriverException as we: + printl("Do not find first recent section") + printl(we) + raise NotFoundElementError("Do not find first recent section") + except Exception as e: + printl("Unknown exception occur") + printl(e) + raise + + # Raising exception is intended when fail to find a link to a content + def open_current_section(self): + try: + a = self.current_section.find_element_by_css_selector("a[class='time _linkPost']") + a.send_keys(Keys.NULL) + a.send_keys(Keys.ENTER) + except WebDriverException as we: + printl("open_current_section error") + printl(we) + raise NotFoundElementError("Do not find first recent section") + except Exception as e: + printl("Unknown exception occur") + printl(e) + raise + + # Raising exception is intended when fail to find the element or the text containing date + def 
get_date_of_current_section(self): + # Find the element containing date and extract text from it. If not, raise exception. + try: + a = self.current_section.find_element_by_css_selector("a[class='time _linkPost']") + text_date = a.get_attribute('title') or a.get_attribute('data-tooltip') + except WebDriverException as we: + printl("Element is not found in get_date_of_current_section") + printl(we) + raise NotFoundElementError("Element is not found in get_date_of_current_section") + except Exception as e: + printl("Unknown exception occur") + printl(e) + raise + + # Check the text containing date info is valid. If not, raise exception + if text_date and len(text_date) > 6: + m = re_date.search(text_date) or re_date.search(text_date) + if m: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + # add 12 hour when the article is written at p.m + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + + # convert datetime.datetime to str + return str(temp_date) + else: + raise NotFoundDataError("Date is missing when calling get_date_of_current_section") + else: + raise NotFoundDataError("Date is missing when calling get_date_of_current_section") + + def get_num_of_list(self): + items = self.driver.find_elements_by_css_selector("div[class='section _activity']") + return len(items) if items else 0 + + +class CrawlerProcess(object): + def __init__(self, driver, send_to_db, initializer, url, set_backup): + self.driver = driver + self.send_to_db = send_to_db + self.initializer = initializer + self.url = url + self.set_backup = set_backup if set_backup else set() + self.list_traverse = None + self.num_of_web_except = 0 + self.num_of_out_of_date = 0 + + # To catch exception, this function wraps traverse_and_crawl function in try-catch statement. + def start(self): + while True: + try: + self.traverse_and_crawl() + # If WebDriverException occurs, retry crawling. 
+            except WebDriverException as we:
+                printl("WebDriverException occurs")
+                printl(we)
+
+                # If the number of retry is over limit, crawling is terminated.
+                if self.num_of_web_except > num_of_retry:
+                    printl("There may be no data")
+                    printl("Crawling is done")
+                    break
+
+                printl("Retry :", num_of_retry - self.num_of_web_except)
+                self.num_of_web_except += 1
+
+                # test chromedriver can access self.driver
+                # if can't, WebDriverException occur
+                self.driver.get('https://www.google.com')
+                wait(2)
+
+            # not found element or data, this program is terminated
+            # This process is intended for debug
+            except KakaoCrawlerException as ke:
+                printl("KakaoCrawlerException occur. Check kakao website")
+                printl(ke)
+                raise
+
+            # unknown exception occur
+            except Exception as e:
+                printl("Unknown exception occurs")
+                printl(e)
+
+                # If the number of retry is over limit, crawling is terminated.
+                if self.num_of_web_except > num_of_retry:
+                    printl("Crawling is terminated by force")
+                    raise
+
+                printl("Retry :", num_of_retry - self.num_of_web_except)
+                self.num_of_web_except += 1
+
+            # no exception occurs
+            else:
+                printl("Crawling is done")
+                break
+
+    def get_set_backup(self):
+        return self.set_backup
+
+    def convert_datetime_to_date(self, str_date):
+        #return datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
+        return datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S').date()
+
+    def traverse_and_crawl(self):
+        raise NotImplementedError
+
+    def is_terminate(self):
+        self.num_of_out_of_date += 1
+        return True if self.num_of_out_of_date > limit_reload else False
+
+
+class UserProcess(CrawlerProcess):
+    def __init__(self, driver, send_to_db, initializer, url, set_backup=None):
+        CrawlerProcess.__init__(self, driver, send_to_db, initializer, url, set_backup)
+        self.list_traverse = ListUser(driver)
+
+    # move url -> check list and load -> move first section ->
+    # loop: check date, open content, check body and crawling ->
+    # close content -> remove current section -> check list and load 
-> move next + def traverse_and_crawl(self): + self.driver.get(self.url) + self.list_traverse.check_list_and_load() + self.list_traverse.move_first_section() + + self.num_of_out_of_date = 0 + # begin_day and end_day type is datetime.date + begin_day = self.initializer.get_begin_day() + end_day = self.initializer.get_end_day() + + while True: + cs_date = self.convert_datetime_to_date(self.list_traverse.get_date_of_current_section()) + if cs_date > end_day: + printl(str(cs_date), ": continue") + elif cs_date < begin_day: + if self.is_terminate(): + break + else: + current_section_data_model = self.list_traverse.get_current_section_data_model() + + if current_section_data_model not in self.set_backup: + self.set_backup.add(current_section_data_model) + self.list_traverse.open_current_section() + + if self.list_traverse.is_loaded_body(): + body_crawler = BodyCrawler(self.driver) + body = body_crawler.get() + if body: + body['keyword_id'] = self.initializer.keyword_id() + printl(body['article_url']) + self.send_to_db.delete_url(body['article_url']) + self.send_to_db.send_body(body) + + reply_crawler = ReplyCrawler(self.driver) + replies = reply_crawler.get() + + # if reply exists in replies variable + if replies: + # put platform_name, platform_form, platform_id to dict of list + for reply in replies: + reply['platform_id'] = body['platform_id'] + reply['platform_name'] = body['platform_name'] + reply['platform_form'] = body['platform_form'] + self.send_to_db.send_reply(replies) + printl('ok') + else: + raise Exception("Nobody Nobody") + self.list_traverse.close_current_section() + self.list_traverse.remove_current_section() + if not self.list_traverse.get_num_of_list(): + self.list_traverse.check_list_and_load() + self.list_traverse.move_next_section() + + +class TagProcess(CrawlerProcess): + def __init__(self, driver, send_to_db, initializer, url, set_backup=None): + CrawlerProcess.__init__(self, driver, send_to_db, initializer, url, set_backup) + self.list_traverse = 
ListTag(driver) + + # move url -> check list and load -> move first section -> + # loop: open content, check body content and date, and crawling -> + # close content -> remove current section -> check list and load -> move next + def traverse_and_crawl(self): + self.driver.get(self.url) + self.list_traverse.check_list_and_load() + self.list_traverse.move_first_section() + + self.num_of_out_of_date = 0 + # begin_day and end_day type is datetime.date + begin_day = self.initializer.get_begin_day() + end_day = self.initializer.get_end_day() + + while True: + self.list_traverse.open_current_section() + if self.list_traverse.is_loaded_body(): + cs_date = self.convert_datetime_to_date(self.list_traverse.get_date_of_current_section()) + if cs_date > end_day: + printl(str(cs_date), ": continue") + elif cs_date < begin_day: + if self.is_terminate(): + break + else: + current_section_data_model = self.list_traverse.get_current_section_data_model() + + if current_section_data_model not in self.set_backup: + self.set_backup.add(current_section_data_model) + body_crawler = BodyCrawler(self.driver) + body = body_crawler.get() + printl(body['article_url']) + if body: + body['keyword_id'] = self.initializer.keyword_id() + self.send_to_db.delete_url(body['article_url']) + self.send_to_db.send_body(body) + + reply_crawler = ReplyCrawler(self.driver) + replies = reply_crawler.get() + + # if reply exists in replies variable + if replies: + # put platform_name, platform_form, platform_id to dict of list + for reply in replies: + reply['platform_id'] = body['platform_id'] + reply['platform_name'] = body['platform_name'] + reply['platform_form'] = body['platform_form'] + self.send_to_db.send_reply(replies) + printl('ok') + else: + raise Exception("Nobody Nobody") + self.list_traverse.close_current_section() + self.list_traverse.remove_current_section() + if not self.list_traverse.get_num_of_list(): + self.list_traverse.check_list_and_load() + self.list_traverse.move_next_section() + + 
class KakaoMainCrawler: def __init__(self): - self.page_crawler = KakaoPageCrawler() - self.body_crawler = KakaoBodyCrawler() - self.reply_crawler = KakaoReplyCrawler() self.send_to_db = SendtoDB() self.crawl_init = KakaoInit() self.browser = Browser() self.driver = None def set_driver(self, driver): - self.page_crawler.set_driver(driver) - self.body_crawler.set_driver(driver) - self.reply_crawler.set_driver(driver) self.driver = driver def set_keyword_id(self, keyword_id): self.keyword_id = keyword_id - def crawl_all_current_url(self, backup_set=None): - self.page_crawler.init() - if backup_set: - self.page_crawler.activity_data_model_set = backup_set.copy() - while True: - activity = self.page_crawler.next_activity() - if activity is None: - break - try: - self.crawl_body(activity) - self.crawl_reply(activity) - self.page_crawler.crawling_ok() - print_and_flush("ok") - except WebDriverException as ee: - logging.info(ee) - # print_and_flush(e) - print_and_flush("fail") - raise WebDriverException - except Exception as e: - print_and_flush("failed") - logging.info(e) - # print_and_flush(e) - - def crawl_body(self, activity): - # print_and_flush("start body crawl") - self.body_crawler.set_driver(self.driver) - self.body_crawler.set_activity(activity) - content = self.body_crawler.get_content() - content["keyword_id"] = self.keyword_id - print_and_flush(content["article_url"]) - self.send_to_db.delete_url(content['article_url']) - self.send_to_db.send_body(content) - - def crawl_reply(self, activity): - # print_and_flush("start reply crawl") - self.reply_crawler.set_driver(self.driver) - self.reply_crawler.set_activity(activity) - if self.reply_crawler.has_reply(): - self.reply_crawler.crawl_all() - self.send_to_db.send_reply(self.reply_crawler.get_content()) - def start(self): self.crawl_start() @@ -1288,88 +951,29 @@ class KakaoMainCrawler: def crawl_start(self): real_time = True while real_time: - print_and_flush("Crawler Start") + printl("Crawler Start") url_list 
= self.crawl_init.make_url() i = 0 - backup_set = set() while i < len(url_list): try: - print_and_flush(url_list[i] + "\n") - self.driver.get(url_list[i]) - wait(3) - self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(), - end_date=self.crawl_init.get_end_day()) - self.crawl_all_current_url(backup_set) + printl(url_list[i], "\n") + if 'https://story.kakao.com/hashtag/' in url_list[i]: + kakao_process = TagProcess(self.driver, self.send_to_db, self.crawl_init, + url_list[i]) + else: + kakao_process = UserProcess(self.driver, self.send_to_db, self.crawl_init, + url_list[i]) + kakao_process.start() i += 1 - backup_set.clear() except Exception as e: logging.info(e) - # print_and_flush(e) - backup_set = self.page_crawler.activity_data_model_set.copy() - self.driver.quit() + # self.driver.quit() self.set_driver(self.browser.new_browser()) - # kakao_main.driver.implicitly_wait(5) wait(5) + i += 1 real_time = self.crawl_init.is_realtime() - print_and_flush("Finished Crawling :)") - # kakao_main.driver.quit() + printl("Finished Crawling :)") + self.send_to_db.close() - self.driver.quit() + # self.driver.quit() - -if __name__ == '__main__': - """ - argv: - 0 - kakaocrawl.py - 1 - keyword_id - 2 - data db num - 3 - before_day - 4 - until_page - """ - - if len(sys.argv) < 5: - print("Fail to process execute") - exit(1) - else: - print("Start Python Crawling") - - kakao_init = KakaoInit(int(sys.argv[3])) - kakao_init.get_keyword_parameters(sys.argv[1]) - kakao_init.disconnect() - browser = Browser() - kakao_main = KakaoMainCrawler() - kakao_main.set_driver(browser.get_new_driver("chrome")) - # kakao_main.driver.implicitly_wait(5) - wait(3) - kakao_main.set_keyword_id(sys.argv[1]) - kakao_main.send_to_db.set_db(sys.argv[2]) - realtime = True - while realtime: - print_and_flush("Crawler Start") - url_list = kakao_init.make_url() - i = 0 - backup_set = set() - while i < len(url_list): - try: - print_and_flush(url_list[i] + "\n") - 
kakao_main.driver.get(url_list[i]) - wait(3) - kakao_main.page_crawler.set_date(begin_date=kakao_init.get_begin_day(), - end_date=kakao_init.get_end_day()) - kakao_main.crawl_all_current_url(backup_set) - i += 1 - backup_set.clear() - except Exception as e: - logging.info(e) - # print_and_flush(e) - backup_set = kakao_main.page_crawler.activity_data_model_set.copy() - kakao_main.set_driver(browser.new_browser()) - # kakao_main.driver.implicitly_wait(5) - wait(5) - realtime = kakao_init.is_realtime() - print_and_flush("Finished Crawling :)") - # kakao_main.driver.quit() - kakao_main.send_to_db.close() - print_and_flush("ByeBye :)") - - exit(0) \ No newline at end of file