diff --git a/WebBasedCrawler/kakao/kakaocrawl_backup.py b/WebBasedCrawler/kakao/kakaocrawl_backup.py
new file mode 100644
index 0000000..f156549
--- /dev/null
+++ b/WebBasedCrawler/kakao/kakaocrawl_backup.py
@@ -0,0 +1,1375 @@
+# -*- coding: utf-8 -*-
+import sys
+import re
+import datetime
+import json
+import time
+import logging
+
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.common.exceptions import WebDriverException
+
+
+from base.baseclasses import wait
+from base.baseclasses import print_and_flush
+from base.baseclasses import SendtoDB
+from base.baseclasses import Browser
+from base.baseclasses import CrawlInit
+
+__author__ = 'cococo'
+kakaostory_url = 'https://story.kakao.com/'
+kakaostory_channel_url = 'https://story.kakao.com/ch/'
+
+
+logging.basicConfig(level=logging.INFO,
+                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
+
+
+class KakaoBodyCrawler:
+    def __init__(self, driver=None):
+        self.driver = driver
+        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
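+        # Editor's note (assumption): the data-tooltip text this regex parses
+        # appears to look like "2019년 5월 7일 화요일 오후 3:27"; groups 1-3 are
+        # year/month/day, group 4 the AM/PM marker ("오전"/"오후"), and
+        # groups 5-6 the hour and minute.
+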
+    def set_driver(self, driver):
+        self.driver = driver
+
+    def set_activity(self, activity):
+        self.activity = activity
+
+    def find_article_profileurl(self):
+        img = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a/img")
+        return img.get_attribute("src")
+
+    def find_article_nickname(self):
+        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/div[@class='myid']/a")
+        return a.text
+
+    def find_article_modified_date(self):
+        try:
+            span = self.activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span")
+        except Exception:
+            return None
+        ac = ActionChains(self.driver)
+        ac.move_to_element(span).perform()
+        wait(0.3)
+        data_tooltip = span.get_attribute("data-tooltip")
+        m = self.re_date.search(data_tooltip)
+        if m is None:
+            return None
+        else:
+            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
+                                          int(m.group(5)), int(m.group(6)))
+            if m.group(4) == "오후" and int(m.group(5)) < 12:
+                temp_date += datetime.timedelta(hours=12)
+            # return temp_date.strftime("%Y-%m-%d")
+            return temp_date
+
+    def find_article_date(self):
+        time_modified_date = self.find_article_modified_date()
+        if time_modified_date is not None:
+            # NOTE: a datetime is returned here but a string below; callers
+            # appear to accept both.
+            return time_modified_date
+        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']")
+        a.send_keys(Keys.NULL)
+        ac = ActionChains(self.driver)
+        ac.move_to_element(a).perform()
+        wait(0.2)
+        data_tooltip = a.get_attribute("data-tooltip")
+        m = self.re_date.search(data_tooltip)
+        if m is None:
+            return "0000-00-00 00:00:00"
+        else:
+            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6)))
+            if m.group(4) == "오후" and int(m.group(5)) < 12:
+                temp_date += datetime.timedelta(hours=12)
+            # return temp_date.strftime("%Y-%m-%d")
+            return str(temp_date)
+
+    def find_article_id(self):
+        a = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a")
+        href = a.get_attribute("href")
+        # str_id = href[href.rindex('/') + 1:]
+        str_id = href.replace(kakaostory_url, "")
+        return str_id
+
+    def find_article_url(self):
+        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']")
+        url = a.get_attribute("href")
+        return url
+
+    def find_platform_name(self):
+        return "kakaostory"
+
+    def find_platform_form(self):
+        if self.driver.current_url.find("https://story.kakao.com/ch/") != -1:
+            return 'channel'
+        elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1:
+            return 'tag'
+        else:
+            return 'story'
+
+    def find_article_form(self):
+        return "body"
+
+    def find_article_data(self):
+        more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']"
+                                                   "/p[@class='more _moreBtnContainer']")
+        display = more.get_attribute("style")
+        if display.find('none') == -1:
+            a = more.find_element_by_tag_name("a")
+            self.enter_element(a)
+        try:
+            content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']"
+                                                          "/div[@class='txt_wrap']/div[@class='_content']")
+        except Exception:
+            return ""
+        return content.text
+
+    def click_element(self, element):
+        ac = ActionChains(self.driver)
+        ac.move_to_element_with_offset(element, 0, 0).click().perform()
+        wait(2)
+
+    def enter_element(self, element):
+        element.send_keys(Keys.NULL)
+        element.send_keys(Keys.ENTER)
+        wait(2)
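+
+    # Editor's note (assumption): send_keys(Keys.NULL) seems to be used only to
+    # give an element keyboard focus (scrolling it into view) so that the ENTER
+    # press, or a hover tooltip, lands on the intended element.
+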
"").strip()) + + def find_feeling_users(self): + try: + #a = self.activity.find_element_by_xpath("div/div[@class='comment ']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]") + a = self.activity.find_element_by_xpath("div/div/div/a[@class='_btnViewLikes' and not(@style)]") + except: + return None + self.enter_element(a) + # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']"))) + str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']") + like_num = int(str_like.text.replace(",", "")) + # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") + fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']"))) + start_time = time.time() + while len(fake_scroll.find_elements_by_tag_name("li")) < like_num: + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() + ac.drag_and_drop_by_offset(scroll, 0, 30).perform() + wait(0.5) + if time.time() - start_time > 600.0: + break + ul = fake_scroll.find_element_by_tag_name("ul") + data = list() + try: + a_list = ul.find_elements_by_css_selector("a[class='link_people']") + # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']") + for i in range(0, len(a_list)): + href = a_list[i].get_attribute('href') + str_id = href.replace(kakaostory_url, "") + # profileurl = img_list[i].get_attribute('src') + # data.append({'id': str_id, 'profileurl': profileurl}) + data.append({'id': str_id}) + except WebDriverException: + raise WebDriverException + except Exception as e: + logging.info(e) + # print_and_flush(e) + feelings = dict() + feelings['data'] = data + feelings['count'] = len(data) + a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']") + self.click_element(a) + wait(1) + return feelings + + def find_feeling_users2(self): + try: + a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]") + except: + return None + self.enter_element(a) + inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + scroll_style = scroll.get_attribute("style") + re_height = re.compile("height: ([0-9]*\\.[0-9]+|[0-9]+)px") + re_top = re.compile("top: ([0-9]*\\.[0-9]+|[0-9]+)px") + m_h = re_height.search(scroll_style) + m_t = re_top.search(scroll_style) + if m_t is None: + top = 0.0 + else: + top = float(m_t.group(1)) + if m_h is None: + height = 0.0 + else: + height = float(m_h.group(1)) + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() + i = 0 + data = list() + while height + top < 320: + lis = fake_scroll.find_elements_by_tag_name("li") + for j in range(i, (i+6) if i+6 < len(lis) else len(lis)): + a = lis[j].find_element_by_xpath("a[@class='link_people']") + href = a.get_attribute('href') + str_id = href[href.rindex('/') + 1:] + em = 
a.find_element_by_css_selector("em[class='tit_userinfo']") + nickname = em.text + span = a.find_element_by_css_selector("span[class='txt_feel']") + emotion = span.text + img = a.find_element_by_css_selector("img[class='img_thumb']") + profileurl = img.get_attribute('src') + data.append({'id': str_id, 'nickname': nickname, 'emotion': emotion, 'profileurl': profileurl}) + i += 6 + move_pixel = 1968.0 / len(fake_scroll.find_elements_by_tag_name("li")) + ac = ActionChains(self.driver) + ac.drag_and_drop_by_offset(scroll, 0, move_pixel).perform() + wait(1) + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + scroll_style = scroll.get_attribute("style") + m_h = re_height.search(scroll_style) + m_t = re_top.search(scroll_style) + if m_t is None: + top = 0.0 + else: + top = float(m_t.group(1)) + if m_h is None: + height = 0.0 + else: + height = float(m_h.group(1)) + feelings = dict() + feelings['data'] = data + feelings['count'] = len(data) + a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']") + self.click_element(a) + return feelings + + def find_share_users2(self): + try: + a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]") + except: + return None + self.enter_element(a) + inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']") + re_share = re.compile("\\(([\\d]+)\\)") + m = re_share.search(str_share.text) + if m is None: + share_num = 0 + else: + share_num = int(m.group(1)) + fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']") + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(fake_scroll, 0, 0).perform() + while len(fake_scroll.find_elements_by_tag_name("li")) < share_num: + scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']") + ac = ActionChains(self.driver) + ac.drag_and_drop_by_offset(scroll, 0, 15).perform() + wait(1) + lis = fake_scroll.find_elements_by_tag_name("li") + data = list() + for li in lis: + try: + a = li.find_element_by_xpath("a[@class='link_people']") + href = a.get_attribute('href') + last_slush = href.rindex('/') + # begin_slush = href[:last_slush].rindex('/') + # str_id = href[begin_slush+1:last_slush] + str_id = href[:last_slush].replace(kakaostory_url, "") + img = a.find_element_by_css_selector("img[class='img_thumb']") + profileurl = img.get_attribute('src') + data.append({'id': str_id, 'profileurl': profileurl}) + except WebDriverException: + raise WebDriverException + except Exception as e: + logging.info(e) + # print_and_flush(e) + shares = dict() + shares['data'] = data + shares['count'] = len(data) + a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']") + self.click_element(a) + return shares + + def find_share_users(self): + try: + #a = self.activity.find_element_by_xpath("div/div[@class='comment ']/div[@class='count_group _countContainer']/a[@class='_btnViewShareList' and not(@style)]") + a = self.activity.find_element_by_xpath("div/div/div/a[@class='_btnViewStoryShareList' and not(@style)]") + except: + return None + self.enter_element(a) + # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']") + inner_layer = 
+    def find_share_users(self):
+        try:
+            # a = self.activity.find_element_by_xpath("div/div[@class='comment ']/div[@class='count_group _countContainer']/a[@class='_btnViewShareList' and not(@style)]")
+            a = self.activity.find_element_by_xpath("div/div/div/a[@class='_btnViewStoryShareList' and not(@style)]")
+        except Exception:
+            return None
+        self.enter_element(a)
+        # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
+        inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']")))
+        str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']")
+        re_share = re.compile("([\\d]+)")
+        m = re_share.search(str_share.text)
+        if m is None:
+            share_num = 0
+        else:
+            share_num = int(m.group(1).replace(",", ""))
+        # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
+        fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']")))
+        start_time = time.time()
+        while len(fake_scroll.find_elements_by_tag_name("li")) < share_num:
+            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
+            ac = ActionChains(self.driver)
+            ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
+            ac.drag_and_drop_by_offset(scroll, 0, 30).perform()
+            wait(0.5)
+            if time.time() - start_time > 600.0:
+                break
+        ul = fake_scroll.find_element_by_tag_name("ul")
+        data = list()
+        try:
+            a_list = ul.find_elements_by_css_selector("a[class='link_people']")
+            # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']")
+            for i in range(0, len(a_list)):
+                href = a_list[i].get_attribute('href')
+                last_slash = href.rindex('/')
+                # begin_slash = href[:last_slash].rindex('/')
+                # str_id = href[begin_slash+1:last_slash]
+                str_id = href[:last_slash].replace(kakaostory_url, "")
+                # profileurl = img_list[i].get_attribute('src')
+                # data.append({'id': str_id, 'profileurl': profileurl})
+                data.append({'id': str_id})
+        except WebDriverException:
+            raise
+        except Exception as e:
+            logging.info(e)
+        shares = dict()
+        shares['data'] = data
+        shares['count'] = len(data)
+        a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']")
+        self.click_element(a)
+        return shares
+
+    def find_platform_title(self):
+        return self.driver.title
+
+    def get_content(self):
+        content = dict()
+        content["article_id"] = self.find_article_id()
+        content["article_nickname"] = self.find_article_nickname()
+        content["article_title"] = self.find_article_title()
+        content["article_date"] = self.find_article_date()
+        # content["article_hit"] = self.find_article_hit()
+        content["article_url"] = self.find_article_url()
+        content["article_data"] = self.find_article_data()
+        content["article_form"] = self.find_article_form()
+        content["article_profileurl"] = self.find_article_profileurl()
+        # content["platform_title"] = self.find_platform_title()
+        content["platform_title"] = content["article_nickname"]
+        content["platform_name"] = self.find_platform_name()
+        if content["article_url"].find(kakaostory_channel_url) != -1:
+            content["platform_form"] = "channel"
+        else:
+            content["platform_form"] = "story"
+        content["platform_id"] = self.find_platform_id()
+        data = list()
+        feelings = self.find_feeling_users()
+        if feelings is not None:
+            data.append({"feelings": feelings})
+            content["article_profile"] = str(feelings["count"])
+        shares = self.find_share_users()
+        if shares is not None:
+            data.append({"shares": shares})
+            content["reply_url"] = str(shares["count"])
+        if data:
+            json_data = {"data": data}
+            content["etc"] = json.dumps(json_data, indent=4, ensure_ascii=False)
+        reply_count = self.find_reply_users()
+        if isinstance(reply_count, int):
+            content["article_order"] = reply_count
+        return content
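+
+    # Editor's note: the target schema appears to reuse generic columns for the
+    # body row: article_profile holds the like count, reply_url the share count
+    # and article_order the comment count.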
str(feelings["count"]) + # print_and_flush("start shares") + shares = self.find_share_users() + # print_and_flush("shares") + # print_and_flush("done shares") + if shares is not None: + data.append({"shares": shares}) + content["reply_url"] = str(shares["count"]) + if data: + json_data = {"data": data} + content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode(json_data) + reply_count = self.find_reply_users() + if type(reply_count) == int: + content["article_order"] = reply_count + return content + + +class KakaoReplyCrawler_backup: + def __init__(self, driver=None, activity=None): + self.driver = driver + self.activity = activity + self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") + self.reply_list = list() + self.order = 0 + + def find_init(self): + self.reply_list.clear() + self.order = 0 + + def set_driver(self, driver): + self.driver = driver + + def set_activity(self, activity): + self.activity = activity + + def has_more(self): + more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']") + if more.get_attribute('style').find('block') != -1: + return True + else: + return False + + def read_more_reply(self): + more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']") + a = more.find_element_by_css_selector("a[class='_btnCommentMore']") + self.enter_element(a) + + def read_all_reply(self): + while self.has_more(): + self.read_more_reply() + + def get_reply_lis(self): + ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul") + lis = ul.find_elements_by_tag_name("li") + return lis + + def has_reply(self): + try: + ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul") + lis = ul.find_elements_by_tag_name("li") + if len(lis) > 0: + return True + else: + return False + except: + return False + + def crawl_reply(self, li): + content = dict() + content["article_id"] = self.find_article_id(li) + content["article_nickname"] = self.find_article_nickname(li) + content["article_date"] = self.find_article_date(li) + content["article_data"] = self.find_article_data(li) + content["article_order"] = self.order + content["article_url"] = self.find_article_url(li) + content["platform_id"] = self.find_platform_id(li) + content["article_form"] = self.find_article_form() + content["article_profileurl"] = self.find_article_profileurl(li) + content["platform_name"] = self.find_platform_name() + if content["article_url"].find(kakaostory_channel_url) != -1: + content["platform_form"] = "channel" + else: + content["platform_form"] = "story" + article_parent = self.find_article_parent(li) + if article_parent is not None: + content["article_parent"] = article_parent + self.order += 1 + self.reply_list.append(content) + + def get_content(self): + return self.reply_list + + def crawl_all(self): + self.find_init() + self.read_all_reply() + try: + lis = self.get_reply_lis() + for li in lis: + self.crawl_reply(li) + except WebDriverException: + raise WebDriverException + except Exception as e: + logging.info(e) + # print_and_flush(e) + + def find_article_id(self, li): + a = li.find_element_by_xpath("div[@class='pf']/a") + href = a.get_attribute('href') + str_id = href.replace(kakaostory_url, "").strip() + return str_id + + def find_article_profileurl(self, li): + img = li.find_element_by_xpath("div[@class='pf']/a/img") + return img.get_attribute('src') + + def find_article_nickname(self, li): + a = 
li.find_element_by_xpath("div[@class='txt']/p/a[@data-profile-popup]") + # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']") + # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel + return a.text + + def find_article_date(self, li): + a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']") + # a.send_keys(Keys.NULL) + # ac = ActionChains(self.driver) + # ac.move_to_element(a).perform() + # wait(0.1) + # data_tooltip = a.get_attribute("data-tooltip") + data_tooltip = a.get_attribute("title") + #a.get_attribute('title') <-- data_tooltip + m = self.re_date.search(data_tooltip) + if m is None: + return "0000-00-00 00:00:00" + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return str(temp_date) + + def find_article_parent(self, li): + comment = li.find_element_by_xpath("div[@class='txt']") + try: + a = comment.find_element_by_xpath("a[@data-profile-popup]") + return a.text + except: + return None + + def find_article_data(self, li): + all_element = li.find_element_by_xpath("div[@class='txt']") + all_text = all_element.text + p = all_element.find_element_by_tag_name('p') + p_text = p.text + return all_text[len(p_text):].strip() + + def find_article_url(self, li): + a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']") + href = a.get_attribute("href") + return href[:href.rindex('/')] + + def find_platform_id(self, li): + article_url = self.find_article_url(li) + main_url = article_url[:article_url.rindex('/')] + #return main_url[main_url.rindex('/')+1:] + return main_url.replace(kakaostory_url, "") + + def find_article_form(self, li=None): + return 'reply' + + def find_platform_name(self, li=None): + return 'kakaostory' + + def find_platform_form(self, li=None): + if self.driver.current_url.find("https://story.kakao.com/ch/") != -1: + return 'channel' + elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1: + return 'tag' + else: + return 'story' + + def click_element(self, element): + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(element, 0, 0).click().perform() + wait(2) + + def enter_element(self, element): + element.send_keys(Keys.NULL) + element.send_keys(Keys.ENTER) + wait(2) + + def find_like_count(self, li): + try: + like = li.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']") + return like.text + except: + return '0' + + +class KakaoReplyCrawler: + def __init__(self, driver=None, activity=None): + self.driver = driver + self.activity = activity + self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") + self.reply_list = list() + self.order = 0 + + def find_init(self): + self.reply_list.clear() + self.order = 0 + + def set_driver(self, driver): + self.driver = driver + + def set_activity(self, activity): + self.activity = activity + + def has_more(self): + try: + more = self.activity.find_element_by_css_selector("p[class='more _showMoreCommentContainer']") + except: + try: + more = self.activity.find_element_by_css_selector("p[class='more _showPrevCommentContainer']") + except: + return False + if more.get_attribute('style').find('block') != -1: + return True + 
+    def get_reply_ul(self):
+        ul = self.activity.find_element_by_xpath("div/div/div/ul[@class='list _listContainer']")
+        return ul
+
+    def has_reply(self):
+        try:
+            ul = self.activity.find_element_by_xpath("div/div/div/ul[@class='list _listContainer']")
+            lis = ul.find_elements_by_tag_name("li")
+            if len(lis) > 0:
+                return True
+            else:
+                return False
+        except Exception:
+            return False
+
+    def crawl_reply(self, ul):
+        article_id = self.find_article_id(ul)
+        article_nickname = self.find_article_nickname(ul)
+        article_date = self.find_article_date(ul)
+        article_data = self.find_article_data(ul)
+        article_url = self.find_article_url(ul)
+        platform_id = self.find_platform_id(ul)
+        article_profileurl = self.find_article_profileurl(ul)
+        article_parent = self.find_article_parent(ul)
+        if article_url[0].find(kakaostory_channel_url) != -1:
+            platform_form = "channel"
+        else:
+            platform_form = "story"
+        for i in range(0, len(article_id)):
+            content = dict()
+            content["article_id"] = article_id[i]
+            content["article_nickname"] = article_nickname[i]
+            content["article_profileurl"] = article_profileurl[i]
+            content["article_url"] = article_url[i]
+            content["platform_id"] = platform_id[i]
+            content["article_date"] = article_date[i]
+            content["article_data"] = article_data[i]
+            content["platform_form"] = platform_form
+            content["article_order"] = i
+            content["platform_name"] = self.find_platform_name()
+            content["article_form"] = self.find_article_form()
+            if len(article_parent[i]) > 0:
+                content["article_parent"] = article_parent[i]
+            self.reply_list.append(content)
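+
+    # Editor's note: the find_* helpers below return one list per field, so
+    # crawl_reply relies on every XPath matching exactly once per <li>; a
+    # comment missing one of those nodes would shift the lists out of
+    # alignment.
+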
li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel + nickname_list = list() + for i in a: + nickname_list.append(i.text) + return nickname_list + + def find_article_date(self, ul): + a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']") + # a.send_keys(Keys.NULL) + # ac = ActionChains(self.driver) + # ac.move_to_element(a).perform() + # wait(0.1) + # data_tooltip = a.get_attribute("data-tooltip") + date_list = list() + for a in a_list: + data_tooltip = a.get_attribute("title") + m = self.re_date.search(data_tooltip) + if m is None: + date_list.append("0000-00-00 00:00:00") + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + date_list.append(str(temp_date)) + return date_list + + def find_article_parent(self, ul): + comments = ul.find_elements_by_xpath("li/div[@class='txt']") + article_parents = list() + for comment in comments: + try: + a = comment.find_element_by_xpath("a[@data-profile-popup]") + article_parents.append(a.text) + except: + article_parents.append("") + return article_parents + + def find_article_data(self, ul): + all_elements = ul.find_elements_by_xpath("li/div[@class='txt']") + all_elements_p = ul.find_elements_by_xpath("li/div[@class='txt']/p") + all_text_list = list() + for i in range(0, len(all_elements)): + all_text = all_elements[i].text + p_text = all_elements_p[i].text + all_text_list.append(all_text[len(p_text):].strip()) + return all_text_list + + def find_article_url(self, ul): + a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']") + article_url_list = list() + for a in a_list: + href = a.get_attribute("href") + article_url_list.append(href[:href.rindex('/')]) + return article_url_list + + def find_platform_id(self, ul): + article_urls = self.find_article_url(ul) + platform_id = list() + for article_url in article_urls: + main_url = article_url[:article_url.rindex('/')] + #return main_url[main_url.rindex('/')+1:] + platform_id.append(main_url.replace(kakaostory_url, "")) + return platform_id + + def find_article_form(self, ul=None): + return 'reply' + + def find_platform_name(self, ul=None): + return 'kakaostory' + + def find_platform_form(self, ul=None): + if self.driver.current_url.find("https://story.kakao.com/ch/") != -1: + return 'channel' + elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1: + return 'tag' + else: + return 'story' + + def click_element(self, element): + ac = ActionChains(self.driver) + ac.move_to_element_with_offset(element, 0, 0).click().perform() + wait(2) + + def enter_element(self, element): + element.send_keys(Keys.NULL) + element.send_keys(Keys.ENTER) + wait(2) + + def find_like_count(self, ul): + try: + like = ul.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']") + return like.text + except: + return '0' + + +class KakaoPageCrawler: + def __init__(self, driver=None, begin_date=None, end_date=None): + self.driver = driver + self.activity_data_model_set = set() + self.begin_date = begin_date + self.end_date = end_date + self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})") + self.index = 0 + self.activities = None + self.present_activity = 0 + 
+class KakaoPageCrawler:
+    def __init__(self, driver=None, begin_date=None, end_date=None):
+        self.driver = driver
+        self.activity_data_model_set = set()
+        self.begin_date = begin_date
+        self.end_date = end_date
+        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
+        self.index = 0
+        self.activities = None
+        self.present_activity = 0
+        self.previous_activity = 0
+        self.reload_count = 0
+
+    def move_to_url(self, url):
+        self.driver.get(url)
+        self.index = 0
+        self.activity_data_model_set.clear()
+
+    def init(self):
+        self.index = 0
+        self.previous_activity = 0
+        self.activities = None
+        self.activity_data_model_set.clear()
+
+    def set_date(self, begin_date, end_date):
+        self.set_begin_date(begin_date)
+        self.set_end_date(end_date)
+
+    def set_end_date(self, end_date):
+        if type(end_date) == str:
+            self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
+        elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
+            self.end_date = end_date
+        else:
+            self.end_date = datetime.datetime.today()
+        self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
+        self.end_date += datetime.timedelta(days=1)
+
+    def set_begin_date(self, begin_date):
+        if type(begin_date) == str:
+            self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
+        elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
+            self.begin_date = begin_date
+        else:
+            self.begin_date = datetime.datetime.today()
+        self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)
+
+    def next_activity_backup(self):
+        try:
+            if not self.activities:
+                self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[class='section _activity']")))
+                self.index = 0
+                if len(self.activities) == 0:
+                    print_and_flush("activities are not found")
+                    self.activities = None
+                    return None
+        except Exception:
+            print_and_flush("activities are not found")
+            self.activities = None
+            return None
+        has_more_activities = True
+        self.present_activity = len(self.activities)
+        while has_more_activities:
+            for activity in self.activities[self.previous_activity:]:
+                if activity.get_attribute("data-model") in self.activity_data_model_set:
+                    continue
+                self.activity_data_model_set.add(activity.get_attribute("data-model"))
+                time_date = self.find_article_date(activity)
+                if self.is_earlier(time_date):
+                    self.activities = None
+                    return None
+                if self.is_late(time_date):
+                    continue
+                return activity
+            self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
+            if len(self.activities) == self.present_activity:
+                has_more_activities = self.load_more_activities()
+                self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
+            else:
+                has_more_activities = True
+            self.previous_activity = self.present_activity
+            self.present_activity = len(self.activities)
+        self.activities = None
+        return None
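+
+    # Editor's note (assumption): the iteration logic assumes the feed is
+    # rendered newest-first: a post dated before begin_date ends the walk,
+    # while posts after end_date are merely skipped.
+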
1].get_attribute("data-model") in self.activity_data_model_set: + continue + time_date = self.find_article_date(self.activities[self.index - 1]) + time_modified_date = self.find_article_modified_date(self.activities[self.index - 1]) + if time_modified_date is not None: + time_date = time_modified_date + print("number of post:", self.index, flush=True) + print(str(time_date), flush=True) + if type(time_date) == str: + continue + if self.is_earlier(time_date): + self.activities = None + return None + if self.is_late(time_date): + continue + return self.activities[self.index - 1] + + def crawling_ok(self): + self.activity_data_model_set.add(self.activities[self.index - 1].get_attribute("data-model")) + + def next_activity_prepare(self): + try: + activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(activities) == 0: + return None + except: + return None + has_more_activities = True + while has_more_activities: + if self.index < len(activities): + temp_index = self.index + self.index += 1 + time_date = self.find_article_date(activities[temp_index]) + if self.is_earlier(time_date): + return None + if self.is_late(time_date): + continue + return activities[temp_index] + else: + has_more_activities = self.load_more_activities() + activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + return None + + def load_more_activities(self): + previous_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + for i in range(0, 4): + print_and_flush("Try load more") + body = self.driver.find_element_by_tag_name("body") + body.send_keys(Keys.NULL) + body.send_keys(Keys.END) + wait(4) + present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(previous_activities) != len(present_activities): + wait(2) + self.reload_count = 0 + return True + for i in range(0, 4): + print_and_flush("Try load more") + body = self.driver.find_element_by_tag_name("body") + for j in range(0, 2): + body.send_keys(Keys.PAGE_UP) + wait(0.1) + for j in range(0, 15): + body.send_keys(Keys.PAGE_DOWN) + wait(0.1) + wait(4) + present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(previous_activities) != len(present_activities): + wait(2) + self.reload_count = 0 + return True + for i in range(0, 10): + print_and_flush("Try load more") + self.driver.execute_script("window.scrollBy(0, 800)") + wait(4) + present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']") + if len(previous_activities) != len(present_activities): + wait(2) + self.reload_count = 0 + return True + if self.reload_count < 8: + print_and_flush("index reload") + self.reload_count += 1 + self.index -= 1 if self.index > 0 else 0 + position = self.driver.get_window_position() + size = self.driver.get_window_size() + self.driver.maximize_window() + self.driver.set_window_size(size['width'], size["height"]) + self.driver.set_window_position(position['x'], position['y']) + return True + if self.reload_count < 10: + print_and_flush("refresh") + self.driver.refresh() + wait(5) + self.index = 0 + self.reload_count += 1 + return True + return False + + def is_earlier(self, time_date): + return True if time_date < self.begin_date else False + + def is_late(self, time_date): + return True if time_date > self.end_date else False + + def set_driver(self, driver): + self.driver = driver + + def find_article_date(self, activity): + a = 
activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']") + a.send_keys(Keys.NULL) + ac = ActionChains(self.driver) + ac.move_to_element(a).perform() + wait(0.5) + ac.move_to_element(a).perform() + wait(0.5) + data_tooltip = a.get_attribute("data-tooltip") + m = self.re_date.search(data_tooltip) + if m is None: + data_tooltip = a.get_attribute("title") + m = self.re_date.search(data_tooltip) + if m is None: + return "0000-00-00 00:00:00" + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return temp_date + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return temp_date + + def find_article_modified_date(self, activity): + try: + span = activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span") + except: + return None + ac = ActionChains(self.driver) + ac.move_to_element(span).perform() + wait(0.8) + data_tooltip = span.get_attribute("data-tooltip") + wait(0.2) + m = self.re_date.search(data_tooltip) + if m is None: + return None + else: + temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), + int(m.group(5)), int(m.group(6))) + if m.group(4) == "오후" and int(m.group(5)) < 12: + temp_date += datetime.timedelta(hours=12) + #return temp_date.strftime("%Y-%m-%d") + return temp_date + + +class KakaoInit(CrawlInit): + def __init__(self, before_day=0): + super().__init__(before_day) + self.urls = dict() + self.urls[6] = "https://story.kakao.com/ch/" + self.urls[7] = "https://story.kakao.com/hashtag/" + self.urls[8] = "https://story.kakao.com/" + + def split_searches(self): + search = self.searches() + splited_list = search.split(',') + trimmed_list = list() + if self.platform() == 6 or self.platform() == 8: + for x in splited_list: + trimmed_list.append(x.strip()) + else: + for x in splited_list: + trimmed_list.append(self.utf8(x.strip())) + return trimmed_list + + def make_url(self): + urls = list() + for x in self.split_searches(): + url = self.urls[self.platform()] + x + urls.append(url) + return urls + + def get_begin_day(self): + if self.is_realtime(): + date_now = datetime.datetime.now() + result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) + result += datetime.timedelta(days=self.before_day) + return result + else: + return self.start_day() + + def get_end_day(self): + if self.is_realtime(): + date_now = datetime.datetime.now() + result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) + return result + else: + return self.end_day() + + +class KakaoMainCrawler: + def __init__(self): + self.page_crawler = KakaoPageCrawler() + self.body_crawler = KakaoBodyCrawler() + self.reply_crawler = KakaoReplyCrawler() + self.send_to_db = SendtoDB() + self.crawl_init = KakaoInit() + self.browser = Browser() + self.driver = None + + def set_driver(self, driver): + self.page_crawler.set_driver(driver) + self.body_crawler.set_driver(driver) + self.reply_crawler.set_driver(driver) + self.driver = driver + + def set_keyword_id(self, keyword_id): + self.keyword_id = keyword_id + + def crawl_all_current_url(self, backup_set=None): + 
+
+
+class KakaoMainCrawler:
+    def __init__(self):
+        self.page_crawler = KakaoPageCrawler()
+        self.body_crawler = KakaoBodyCrawler()
+        self.reply_crawler = KakaoReplyCrawler()
+        self.send_to_db = SendtoDB()
+        self.crawl_init = KakaoInit()
+        self.browser = Browser()
+        self.driver = None
+
+    def set_driver(self, driver):
+        self.page_crawler.set_driver(driver)
+        self.body_crawler.set_driver(driver)
+        self.reply_crawler.set_driver(driver)
+        self.driver = driver
+
+    def set_keyword_id(self, keyword_id):
+        self.keyword_id = keyword_id
+
+    def crawl_all_current_url(self, backup_set=None):
+        self.page_crawler.init()
+        if backup_set:
+            self.page_crawler.activity_data_model_set = backup_set.copy()
+        while True:
+            activity = self.page_crawler.next_activity()
+            if activity is None:
+                break
+            try:
+                self.crawl_body(activity)
+                self.crawl_reply(activity)
+                self.page_crawler.crawling_ok()
+                print_and_flush("ok")
+            except WebDriverException as ee:
+                logging.info(ee)
+                print_and_flush("fail")
+                raise
+            except Exception as e:
+                print_and_flush("failed")
+                logging.info(e)
+
+    def crawl_body(self, activity):
+        self.body_crawler.set_driver(self.driver)
+        self.body_crawler.set_activity(activity)
+        content = self.body_crawler.get_content()
+        content["keyword_id"] = self.keyword_id
+        print_and_flush(content["article_url"])
+        self.send_to_db.delete_url(content['article_url'])
+        self.send_to_db.send_body(content)
+
+    def crawl_reply(self, activity):
+        self.reply_crawler.set_driver(self.driver)
+        self.reply_crawler.set_activity(activity)
+        if self.reply_crawler.has_reply():
+            self.reply_crawler.crawl_all()
+            self.send_to_db.send_reply(self.reply_crawler.get_content())
+
+    def start(self):
+        self.crawl_start()
+
+    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
+        self.init_browser(browser)
+        self.init_keyword_id(keyword_id)
+        self.init_db(db_num)
+        self.init_before_day(before_day)
+        self.init_until_page(until_page)
+
+    def init_browser(self, browser):
+        self.set_driver(self.browser.get_new_driver(browser))
+
+    def init_keyword_id(self, keyword_id):
+        if type(keyword_id) != int:
+            self.keyword_id = int(keyword_id)
+        else:
+            self.keyword_id = keyword_id
+        self.crawl_init.get_keyword_parameters(keyword_id)
+        self.crawl_init.disconnect()
+
+    def init_db(self, db_num):
+        self.send_to_db.set_db(db_num)
+
+    def init_before_day(self, before_day):
+        self.crawl_init.set_before_day(before_day)
+
+    def init_until_page(self, until_page):
+        self.crawl_init.set_until_page(until_page)
+
+    def crawl_start(self):
+        real_time = True
+        while real_time:
+            print_and_flush("Crawler Start")
+            url_list = self.crawl_init.make_url()
+            i = 0
+            backup_set = set()
+            while i < len(url_list):
+                try:
+                    print_and_flush(url_list[i] + "\n")
+                    self.driver.get(url_list[i])
+                    wait(3)
+                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
+                                               end_date=self.crawl_init.get_end_day())
+                    self.crawl_all_current_url(backup_set)
+                    i += 1
+                    backup_set.clear()
+                except Exception as e:
+                    logging.info(e)
+                    backup_set = self.page_crawler.activity_data_model_set.copy()
+                    self.driver.quit()
+                    self.set_driver(self.browser.new_browser())
+                    # kakao_main.driver.implicitly_wait(5)
+                    wait(5)
+            real_time = self.crawl_init.is_realtime()
+        print_and_flush("Finished Crawling :)")
+        # kakao_main.driver.quit()
+        self.send_to_db.close()
+        self.driver.quit()
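+
+    # Editor's note: on failure, crawl_start snapshots the ids of already-stored
+    # posts into backup_set, restarts the browser and retries the same URL;
+    # crawl_all_current_url seeds its dedup set from that snapshot so finished
+    # posts are not written twice.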
+
+
+if __name__ == '__main__':
+    """
+    argv:
+    0 - kakaocrawl.py
+    1 - keyword_id
+    2 - data db num
+    3 - before_day
+    4 - until_page
+    """
+
+    if len(sys.argv) < 5:
+        print("Failed to start: expected keyword_id, db_num, before_day and until_page arguments")
+        sys.exit(1)
+    else:
+        print("Start Python Crawling")
+
+    kakao_init = KakaoInit(int(sys.argv[3]))
+    kakao_init.get_keyword_parameters(sys.argv[1])
+    kakao_init.disconnect()
+    browser = Browser()
+    kakao_main = KakaoMainCrawler()
+    kakao_main.set_driver(browser.get_new_driver("chrome"))
+    # kakao_main.driver.implicitly_wait(5)
+    wait(3)
+    kakao_main.set_keyword_id(sys.argv[1])
+    kakao_main.send_to_db.set_db(sys.argv[2])
+    realtime = True
+    while realtime:
+        print_and_flush("Crawler Start")
+        url_list = kakao_init.make_url()
+        i = 0
+        backup_set = set()
+        while i < len(url_list):
+            try:
+                print_and_flush(url_list[i] + "\n")
+                kakao_main.driver.get(url_list[i])
+                wait(3)
+                kakao_main.page_crawler.set_date(begin_date=kakao_init.get_begin_day(),
+                                                 end_date=kakao_init.get_end_day())
+                kakao_main.crawl_all_current_url(backup_set)
+                i += 1
+                backup_set.clear()
+            except Exception as e:
+                logging.info(e)
+                backup_set = kakao_main.page_crawler.activity_data_model_set.copy()
+                kakao_main.set_driver(browser.new_browser())
+                # kakao_main.driver.implicitly_wait(5)
+                wait(5)
+        realtime = kakao_init.is_realtime()
+    print_and_flush("Finished Crawling :)")
+    # kakao_main.driver.quit()
+    kakao_main.send_to_db.close()
+    print_and_flush("ByeBye :)")
+
+    sys.exit(0)
\ No newline at end of file
diff --git a/WebBasedCrawler/kakao/kakaoexception.py b/WebBasedCrawler/kakao/kakaoexception.py
new file mode 100644
index 0000000..344bf2f
--- /dev/null
+++ b/WebBasedCrawler/kakao/kakaoexception.py
@@ -0,0 +1,16 @@
+# Base exception for the Kakao crawler
+class KakaoCrawlerException(Exception):
+    pass
+
+
+# Raised when an expected page element is missing
+class NotFoundElementError(KakaoCrawlerException):
+    pass
+
+
+# Raised when expected data is missing
+class NotFoundDataError(KakaoCrawlerException):
+    pass
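+
+
+# Editor's note (assumption): kakaocrawl_backup.py does not import these
+# exceptions yet; the intended use is presumably something like:
+#     raise NotFoundElementError("like-list layer did not appear")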