Files
clients/WebBasedCrawler/kakaocrawl.py
admin cc8122e074 WebBasedCrawler 추가
git-svn-id: svn://192.168.0.12/source@229 8346c931-da38-4b9b-9d4c-e48b93cbd075
2015-12-07 03:25:49 +00:00

1310 lines
55 KiB
Python

#-*- coding: utf-8 -*-
__author__ = 'cococo'
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import sys
import re
import datetime
import json
import os
import time
from navercrawl import wait
from navercrawl import print_and_flush
from navercrawl import SendtoDB
from navercrawl import Browser
from navercrawl import CrawlInit
from selenium.common.exceptions import WebDriverException
# Base URL of a personal KakaoStory profile; stripping this prefix from a
# profile href yields the bare user id.
kakaostory_url = 'https://story.kakao.com/'
# Base URL of a KakaoStory "channel" (official/brand) page.
kakaostory_channel_url = 'https://story.kakao.com/ch/'
class KakaoBodyCrawler:
    """Scrapes the body of a single KakaoStory post.

    The caller supplies a Selenium WebDriver (``set_driver``) and the post's
    root ``<div class='section _activity'>`` element (``set_activity``);
    ``get_content`` then assembles a flat dict of article/platform fields.
    All selectors are tied to the 2015-era story.kakao.com DOM.
    """
    def __init__(self, driver=None):
        self.driver = driver
        # Matches tooltip dates like "2015년 12월 7일 오전 3:25":
        # groups = (year, month, day, AM/PM marker, hour, minute).
        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
    def set_driver(self, driver):
        self.driver = driver
    def set_activity(self, activity):
        self.activity = activity
    def find_article_profileurl(self):
        """Return the author's avatar image URL."""
        img = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a/img")
        return img.get_attribute("src")
    def find_article_nickname(self):
        """Return the author's display name."""
        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/div[@class='myid']/a")
        return a.text
    def find_article_modified_date(self):
        """Return the edit timestamp as a datetime, or None when the post has
        no "edited" marker or its tooltip cannot be parsed."""
        try:
            span = self.activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span")
        except:
            return None
        # Hover over the element so the site fills in data-tooltip.
        ac = ActionChains(self.driver)
        ac.move_to_element(span).perform()
        wait(0.3)
        data_tooltip = span.get_attribute("data-tooltip")
        m = self.re_date.search(data_tooltip)
        if m is None:
            return None
        else:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                          int(m.group(5)), int(m.group(6)))
            # "오후" means PM: convert the 12-hour tooltip time to 24-hour.
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            #return temp_date.strftime("%Y-%m-%d")
            return temp_date
    def find_article_date(self):
        """Return the post's timestamp.

        NOTE(review): when an edit timestamp exists this returns a datetime
        object, otherwise a "YYYY-MM-DD HH:MM:SS" string -- inconsistent
        return types; callers appear to stringify later (confirm).
        """
        time_modified_date = self.find_article_modified_date()
        if time_modified_date is not None:
            return time_modified_date
        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']")
        # Focus + hover so the tooltip attribute is populated.
        a.send_keys(Keys.NULL)
        ac = ActionChains(self.driver)
        ac.move_to_element(a).perform()
        wait(0.2)
        data_tooltip = a.get_attribute("data-tooltip")
        m = self.re_date.search(data_tooltip)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6)))
            # "오후" means PM: convert the 12-hour tooltip time to 24-hour.
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            #return temp_date.strftime("%Y-%m-%d")
            return str(temp_date)
    def find_article_id(self):
        """Return the author's id: profile href with the site prefix stripped."""
        a = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a")
        href = a.get_attribute("href")
        #str_id = href[href.rindex('/') + 1:]
        str_id = href.replace(kakaostory_url, "")
        return str_id
    def find_article_url(self):
        """Return the post's permalink."""
        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']")
        url = a.get_attribute("href")
        return url
    def find_platform_name(self):
        return "kakaostory"
    def find_platform_form(self):
        """Classify the current page: channel page, hashtag search, or story."""
        if self.driver.current_url.find("https://story.kakao.com/ch/") != -1:
            return 'channel'
        elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1:
            return 'tag'
        else:
            return 'story'
    def find_article_form(self):
        return "body"
    def find_article_data(self):
        """Return the post's text, expanding the "more" link first if visible."""
        more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/p[@class='more _moreBtnContainer']")
        display = more.get_attribute("style")
        # The more-button carries display:none when the post is short.
        if display.find('none') == -1:
            a = more.find_element_by_tag_name("a")
            self.enter_element(a)
        try:
            content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/div[@class='txt_wrap _content']")
        except:
            return str("")
        return content.text
    def click_element(self, element):
        """Move to the element's top-left corner and click it."""
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)
    def enter_element(self, element):
        """Focus the element and activate it by sending ENTER."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)
    def find_platform_id(self):
        return self.find_article_id()
    def find_article_title(self):
        """Use the first line of the body text as the title ("" if no body)."""
        content = self.find_article_data()
        if not content:
            return ""
        try:
            return content.strip().splitlines()[0]
        except:
            return ""
    def find_feeling_users3(self):
        """Older variant of find_feeling_users (kept for reference).

        Opens the likes layer, drags its custom scrollbar until every liker
        <li> is rendered, and returns
        {'data': [{'id', 'profileurl'}, ...], 'count': N}; None when the
        post shows no likes button.
        NOTE(review): the scroll loop has no timeout and can hang if the
        list never reaches like_num -- find_feeling_users fixes this.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']")
        like_num = int(str_like.text)
        fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
        # Drag the custom scrollbar thumb until all likers are loaded.
        while len(fake_scroll.find_elements_by_tag_name("li")) < like_num:
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            ac = ActionChains(self.driver)
            ac.drag_and_drop_by_offset(scroll, 0, 15).perform()
            wait(1)
        lis = fake_scroll.find_elements_by_tag_name("li")
        data = list()
        for li in lis:
            try:
                a = li.find_element_by_xpath("a[@class='link_people']")
                href = a.get_attribute('href')
                # str_id = href[href.rindex('/') + 1:]
                str_id = href.replace(kakaostory_url, "")
                img = a.find_element_by_css_selector("img[class='img_thumb']")
                profileurl = img.get_attribute('src')
                data.append({'id': str_id, 'profileurl': profileurl})
            except WebDriverException:
                raise WebDriverException
            except Exception as e:
                # Log and skip this entry; only WebDriver errors propagate.
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
                print_and_flush(e)
        feelings = dict()
        feelings['data'] = data
        feelings['count'] = len(data)
        # Close the layer so the page is usable for the next step.
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']")
        self.click_element(a)
        wait(1)
        return feelings
    def find_reply_users(self):
        """Return the comment count as int, or None when it is unavailable."""
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewComments' and not(@style)]")
        except:
            return None
        count = a.find_element_by_css_selector("strong._commentCount").text
        if len(count.strip()) < 1:
            return None
        else:
            return int(count.replace(",", "").strip())
    def find_feeling_users(self):
        """Collect the ids of users who "liked" the post.

        Opens the likes layer, drags its custom scrollbar until all entries
        are rendered (10-minute cap), and returns
        {'data': [{'id': ...}, ...], 'count': N}; None when the post shows
        no likes button.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']")))
        str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']")
        like_num = int(str_like.text.replace(",", ""))
        # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']")))
        start_time = time.time()
        while len(fake_scroll.find_elements_by_tag_name("li")) < like_num:
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            ac = ActionChains(self.driver)
            ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
            ac.drag_and_drop_by_offset(scroll, 0, 30).perform()
            wait(0.5)
            # Give up after ten minutes rather than hang on a stuck list.
            if time.time() - start_time > 600.0:
                break
        ul = fake_scroll.find_element_by_tag_name("ul")
        data = list()
        try:
            a_list = ul.find_elements_by_css_selector("a[class='link_people']")
            # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']")
            for i in range(0, len(a_list)):
                href = a_list[i].get_attribute('href')
                str_id = href.replace(kakaostory_url, "")
                # profileurl = img_list[i].get_attribute('src')
                # data.append({'id': str_id, 'profileurl': profileurl})
                data.append({'id': str_id})
        except WebDriverException:
            raise WebDriverException
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            print_and_flush(e)
        feelings = dict()
        feelings['data'] = data
        feelings['count'] = len(data)
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']")
        self.click_element(a)
        wait(1)
        return feelings
    def find_feeling_users2(self):
        """Older variant of find_feeling_users driven by scrollbar geometry.

        Reads the custom scrollbar thumb's top/height from its inline style
        and drags until the thumb reaches the bottom of the track (320 px --
        presumably the track height; verify against the site CSS).  Collects
        id/nickname/emotion/profileurl per liker; assumes six new entries
        render per step -- TODO confirm.  Kept for reference.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
        scroll_style = scroll.get_attribute("style")
        # Pull "top: Npx" / "height: Npx" out of the thumb's inline style.
        re_height = re.compile("height: ([0-9]*\\.[0-9]+|[0-9]+)px")
        re_top = re.compile("top: ([0-9]*\\.[0-9]+|[0-9]+)px")
        m_h = re_height.search(scroll_style)
        m_t = re_top.search(scroll_style)
        if m_t is None:
            top = 0.0
        else:
            top = float(m_t.group(1))
        if m_h is None:
            height = 0.0
        else:
            height = float(m_h.group(1))
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
        i = 0
        data = list()
        while height + top < 320:
            lis = fake_scroll.find_elements_by_tag_name("li")
            for j in range(i, (i+6) if i+6 < len(lis) else len(lis)):
                a = lis[j].find_element_by_xpath("a[@class='link_people']")
                href = a.get_attribute('href')
                str_id = href[href.rindex('/') + 1:]
                em = a.find_element_by_css_selector("em[class='tit_userinfo']")
                nickname = em.text
                span = a.find_element_by_css_selector("span[class='txt_feel']")
                emotion = span.text
                img = a.find_element_by_css_selector("img[class='img_thumb']")
                profileurl = img.get_attribute('src')
                data.append({'id': str_id, 'nickname': nickname, 'emotion': emotion, 'profileurl': profileurl})
            i += 6
            move_pixel = 1968.0 / len(fake_scroll.find_elements_by_tag_name("li"))
            ac = ActionChains(self.driver)
            ac.drag_and_drop_by_offset(scroll, 0, move_pixel).perform()
            wait(1)
            # Re-read the thumb geometry after the drag.
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            scroll_style = scroll.get_attribute("style")
            m_h = re_height.search(scroll_style)
            m_t = re_top.search(scroll_style)
            if m_t is None:
                top = 0.0
            else:
                top = float(m_t.group(1))
            if m_h is None:
                height = 0.0
            else:
                height = float(m_h.group(1))
        feelings = dict()
        feelings['data'] = data
        feelings['count'] = len(data)
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']")
        self.click_element(a)
        return feelings
    def find_share_users2(self):
        """Older share-list variant without explicit waits or a timeout;
        kept for reference (see find_share_users)."""
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']")
        # The share count appears in the layer title as "... (N)".
        re_share = re.compile("\\(([\\d]+)\\)")
        m = re_share.search(str_share.text)
        if m is None:
            share_num = 0
        else:
            share_num = int(m.group(1))
        fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
        while len(fake_scroll.find_elements_by_tag_name("li")) < share_num:
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            ac = ActionChains(self.driver)
            ac.drag_and_drop_by_offset(scroll, 0, 15).perform()
            wait(1)
        lis = fake_scroll.find_elements_by_tag_name("li")
        data = list()
        for li in lis:
            try:
                a = li.find_element_by_xpath("a[@class='link_people']")
                href = a.get_attribute('href')
                last_slush = href.rindex('/')
                # begin_slush = href[:last_slush].rindex('/')
                # str_id = href[begin_slush+1:last_slush]
                # Drop the trailing path segment, then the site prefix.
                str_id = href[:last_slush].replace(kakaostory_url, "")
                img = a.find_element_by_css_selector("img[class='img_thumb']")
                profileurl = img.get_attribute('src')
                data.append({'id': str_id, 'profileurl': profileurl})
            except WebDriverException:
                raise WebDriverException
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
                print_and_flush(e)
        shares = dict()
        shares['data'] = data
        shares['count'] = len(data)
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']")
        self.click_element(a)
        return shares
    def find_share_users(self):
        """Collect the ids of users who shared the post.

        Same layer-scrolling approach as find_feeling_users (explicit waits,
        10-minute cap).  Returns {'data': [{'id': ...}, ...], 'count': N};
        None when the post shows no share-list button.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']")))
        str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']")
        # The share count appears in the layer title as "... (N)".
        re_share = re.compile("\\(([\\d]+)\\)")
        m = re_share.search(str_share.text)
        if m is None:
            share_num = 0
        else:
            share_num = int(m.group(1).replace(",", ""))
        # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']")))
        start_time = time.time()
        while len(fake_scroll.find_elements_by_tag_name("li")) < share_num:
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            ac = ActionChains(self.driver)
            ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
            ac.drag_and_drop_by_offset(scroll, 0, 30).perform()
            wait(0.5)
            if time.time() - start_time > 600.0:
                break
        ul = fake_scroll.find_element_by_tag_name("ul")
        data = list()
        try:
            a_list = ul.find_elements_by_css_selector("a[class='link_people']")
            # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']")
            for i in range(0, len(a_list)):
                href = a_list[i].get_attribute('href')
                last_slush = href.rindex('/')
                # begin_slush = href[:last_slush].rindex('/')
                # str_id = href[begin_slush+1:last_slush]
                # Drop the trailing path segment, then the site prefix.
                str_id = href[:last_slush].replace(kakaostory_url, "")
                # profileurl = img_list[i].get_attribute('src')
                # data.append({'id': str_id, 'profileurl': profileurl})
                data.append({'id': str_id})
        except WebDriverException:
            raise WebDriverException
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            print_and_flush(e)
        shares = dict()
        shares['data'] = data
        shares['count'] = len(data)
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']")
        self.click_element(a)
        return shares
    def find_platform_title(self):
        return self.driver.title
    def get_content(self):
        """Assemble the full record dict for the current post.

        Likes go into content["etc"] as JSON and their count into
        "article_profile"; the share count into "reply_url"; the comment
        count into "article_order".  NOTE(review): those field names look
        repurposed by the downstream schema -- confirm against SendtoDB.
        """
        content = dict()
        content["article_id"] = self.find_article_id()
        # print_and_flush("article_id")
        content["article_nickname"] = self.find_article_nickname()
        # print_and_flush("article_nickname")
        content["article_title"] = self.find_article_title()
        # print_and_flush("article_title")
        content["article_date"] = self.find_article_date()
        # print_and_flush("article_date")
        #content["article_hit"] = self.find_article_hit()
        content["article_url"] = self.find_article_url()
        # print_and_flush("article_url")
        content["article_data"] = self.find_article_data()
        # print_and_flush("article_data")
        content["article_form"] = self.find_article_form()
        # print_and_flush("article_form")
        content["article_profileurl"] = self.find_article_profileurl()
        # print_and_flush("article_profileurl")
        #content["platform_title"] = self.find_platform_title()
        content["platform_title"] = content["article_nickname"]
        # print_and_flush("platform_title")
        content["platform_name"] = self.find_platform_name()
        # Channel posts live under the /ch/ URL prefix.
        if content["article_url"].find(kakaostory_channel_url) != -1:
            content["platform_form"] = "channel"
        else:
            content["platform_form"] = "story"
        # print_and_flush("platform_form")
        content["platform_id"] = self.find_platform_id()
        # print_and_flush("platform_id")
        data = list()
        # print_and_flush("start feelings")
        feelings = self.find_feeling_users()
        # print_and_flush("feelings")
        # print_and_flush("done feelings")
        if feelings is not None:
            data.append({"feelings": feelings})
            content["article_profile"] = str(feelings["count"])
        # print_and_flush("start shares")
        shares = self.find_share_users()
        # print_and_flush("shares")
        # print_and_flush("done shares")
        if shares is not None:
            data.append({"shares": shares})
            content["reply_url"] = str(shares["count"])
        if data:
            json_data = {"data": data}
            content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode(json_data)
        reply_count = self.find_reply_users()
        if type(reply_count) == int:
            content["article_order"] = reply_count
        return content
class KakaoReplyCrawler_backup:
    """First version of the comment crawler: walks comment <li> nodes one by
    one.  Superseded by KakaoReplyCrawler (batched element lookups, load
    timeout); kept for reference only.
    """
    def __init__(self, driver=None, activity=None):
        self.driver = driver
        self.activity = activity
        # Tooltip date pattern: (year, month, day, AM/PM marker, hour, minute).
        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
        # Accumulated comment dicts and the running comment index.
        self.reply_list = list()
        self.order = 0
    def find_init(self):
        # Reset state between posts.
        self.reply_list.clear()
        self.order = 0
    def set_driver(self, driver):
        self.driver = driver
    def set_activity(self, activity):
        self.activity = activity
    def has_more(self):
        """True while the "more comments" button is displayed."""
        more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
        if more.get_attribute('style').find('block') != -1:
            return True
        else:
            return False
    def read_more_reply(self):
        more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
        a = more.find_element_by_css_selector("a[class='_btnCommentMore']")
        self.enter_element(a)
    def read_all_reply(self):
        # NOTE(review): no timeout -- can loop forever if the button never
        # hides; KakaoReplyCrawler.read_all_reply adds a 600 s guard.
        while self.has_more():
            self.read_more_reply()
    def get_reply_lis(self):
        """Return all comment <li> elements of the post."""
        ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
        lis = ul.find_elements_by_tag_name("li")
        return lis
    def has_reply(self):
        try:
            ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
            lis = ul.find_elements_by_tag_name("li")
            if len(lis) > 0:
                return True
            else:
                return False
        except:
            return False
    def crawl_reply(self, li):
        """Extract one comment <li> into a dict and append it to reply_list."""
        content = dict()
        content["article_id"] = self.find_article_id(li)
        content["article_nickname"] = self.find_article_nickname(li)
        content["article_date"] = self.find_article_date(li)
        content["article_data"] = self.find_article_data(li)
        content["article_order"] = self.order
        content["article_url"] = self.find_article_url(li)
        content["platform_id"] = self.find_platform_id(li)
        content["article_form"] = self.find_article_form()
        content["article_profileurl"] = self.find_article_profileurl(li)
        content["platform_name"] = self.find_platform_name()
        if content["article_url"].find(kakaostory_channel_url) != -1:
            content["platform_form"] = "channel"
        else:
            content["platform_form"] = "story"
        article_parent = self.find_article_parent(li)
        if article_parent is not None:
            content["article_parent"] = article_parent
        self.order += 1
        self.reply_list.append(content)
    def get_content(self):
        return self.reply_list
    def crawl_all(self):
        """Expand all comments then crawl each one; WebDriver errors
        propagate, anything else is logged and swallowed."""
        self.find_init()
        self.read_all_reply()
        try:
            lis = self.get_reply_lis()
            for li in lis:
                self.crawl_reply(li)
        except WebDriverException:
            raise WebDriverException
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            print_and_flush(e)
    def find_article_id(self, li):
        """Return the commenter's id (profile href minus the site prefix)."""
        a = li.find_element_by_xpath("div[@class='pf']/a")
        href = a.get_attribute('href')
        str_id = href.replace(kakaostory_url, "").strip()
        return str_id
    def find_article_profileurl(self, li):
        img = li.find_element_by_xpath("div[@class='pf']/a/img")
        return img.get_attribute('src')
    def find_article_nickname(self, li):
        a = li.find_element_by_xpath("div[@class='txt']/p/a[@data-profile-popup]")
        # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']")
        # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel
        return a.text
    def find_article_date(self, li):
        """Parse the comment timestamp from the link's title attribute into
        "YYYY-MM-DD HH:MM:SS"; all-zero string when unparseable."""
        a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']")
        # a.send_keys(Keys.NULL)
        # ac = ActionChains(self.driver)
        # ac.move_to_element(a).perform()
        # wait(0.1)
        # data_tooltip = a.get_attribute("data-tooltip")
        data_tooltip = a.get_attribute("title")
        #a.get_attribute('title') <-- data_tooltip
        m = self.re_date.search(data_tooltip)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6)))
            # "오후" means PM: convert the 12-hour time to 24-hour.
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            #return temp_date.strftime("%Y-%m-%d")
            return str(temp_date)
    def find_article_parent(self, li):
        """Return the name linked directly inside the comment body
        (presumably the replied-to user -- verify), else None."""
        comment = li.find_element_by_xpath("div[@class='txt']")
        try:
            a = comment.find_element_by_xpath("a[@data-profile-popup]")
            return a.text
        except:
            return None
    def find_article_data(self, li):
        # <div class='txt'> = header <p> (name/time) followed by the comment
        # text; strip the header prefix from the combined text.
        all_element = li.find_element_by_xpath("div[@class='txt']")
        all_text = all_element.text
        p = all_element.find_element_by_tag_name('p')
        p_text = p.text
        return all_text[len(p_text):].strip()
    def find_article_url(self, li):
        # Drop the trailing path segment so the URL points at the post.
        a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']")
        href = a.get_attribute("href")
        return href[:href.rindex('/')]
    def find_platform_id(self, li):
        article_url = self.find_article_url(li)
        main_url = article_url[:article_url.rindex('/')]
        #return main_url[main_url.rindex('/')+1:]
        return main_url.replace(kakaostory_url, "")
    def find_article_form(self, li=None):
        return 'reply'
    def find_platform_name(self, li=None):
        return 'kakaostory'
    def find_platform_form(self, li=None):
        if self.driver.current_url.find("https://story.kakao.com/ch/") != -1:
            return 'channel'
        elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1:
            return 'tag'
        else:
            return 'story'
    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)
    def enter_element(self, element):
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)
    def find_like_count(self, li):
        """Comment like count as a string; '0' when the badge is absent."""
        try:
            like = li.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']")
            return like.text
        except:
            return '0'
class KakaoReplyCrawler:
    """Comment crawler for a single post.

    Unlike KakaoReplyCrawler_backup it reads each field for ALL comments in
    one batched find_elements call per field, which is far cheaper over a
    remote WebDriver connection, then zips the parallel lists together.
    """
    def __init__(self, driver=None, activity=None):
        self.driver = driver
        self.activity = activity
        # Tooltip date pattern: (year, month, day, AM/PM marker, hour, minute).
        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
        # Accumulated comment dicts; `order` is vestigial here (the index i
        # is used instead in crawl_reply).
        self.reply_list = list()
        self.order = 0
    def find_init(self):
        # Reset state between posts.
        self.reply_list.clear()
        self.order = 0
    def set_driver(self, driver):
        self.driver = driver
    def set_activity(self, activity):
        self.activity = activity
    def has_more(self):
        """True while the "more comments" button is displayed."""
        more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
        if more.get_attribute('style').find('block') != -1:
            return True
        else:
            return False
    def read_more_reply(self):
        more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
        a = more.find_element_by_css_selector("a[class='_btnCommentMore']")
        self.enter_element(a)
    def read_all_reply(self):
        """Click "more" until it disappears; abort after ten minutes."""
        start_time = time.time()
        while self.has_more():
            self.read_more_reply()
            if time.time() - start_time > 600.0:
                raise WebDriverException
    def get_reply_ul(self):
        ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
        return ul
    def has_reply(self):
        try:
            ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
            lis = ul.find_elements_by_tag_name("li")
            if len(lis) > 0:
                return True
            else:
                return False
        except:
            return False
    def crawl_reply(self, ul):
        """Extract every comment under `ul` into dicts on reply_list.

        NOTE(review): the per-field lists are assumed index-aligned, and
        `article_url[0]` raises IndexError on a post with zero comments;
        crawl_all's generic handler swallows that.
        """
        article_id = self.find_article_id(ul)
        article_nickname = self.find_article_nickname(ul)
        article_date = self.find_article_date(ul)
        article_data = self.find_article_data(ul)
        article_url = self.find_article_url(ul)
        platform_id = self.find_platform_id(ul)
        article_profileurl = self.find_article_profileurl(ul)
        article_parent = self.find_article_parent(ul)
        # print_and_flush(str(len(article_id)))
        # print_and_flush(str(len(article_nickname)))
        # print_and_flush(str(len(article_date)))
        # print_and_flush(str(len(article_data)))
        # print_and_flush(str(len(article_url)))
        # print_and_flush(str(len(platform_id)))
        # print_and_flush(str(len(article_profileurl)))
        # print_and_flush(str(len(article_parent)))
        if article_url[0].find(kakaostory_channel_url) != -1:
            platform_form = "channel"
        else:
            platform_form = "story"
        for i in range(0, len(article_id)):
            content = dict()
            content["article_id"] = article_id[i]
            content["article_nickname"] = article_nickname[i]
            content["article_profileurl"] = article_profileurl[i]
            content["article_url"] = article_url[i]
            content["platform_id"] = platform_id[i]
            content["article_date"] = article_date[i]
            content["article_data"] = article_data[i]
            content["platform_form"] = platform_form
            content["article_order"] = i
            content["platform_name"] = self.find_platform_name()
            content["article_form"] = self.find_article_form()
            # Empty string means "not a reply-to" (see find_article_parent).
            if len(article_parent[i]) > 0:
                content["article_parent"] = article_parent[i]
            self.reply_list.append(content)
    def get_content(self):
        return self.reply_list
    def crawl_all(self):
        """Expand all comments and crawl them; WebDriver errors propagate,
        anything else is logged and swallowed."""
        self.find_init()
        self.read_all_reply()
        try:
            ul = self.get_reply_ul()
            self.crawl_reply(ul)
        except WebDriverException:
            raise WebDriverException
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            print_and_flush(e)
    def find_article_id(self, ul):
        """Commenter ids: profile hrefs minus the site prefix, in DOM order."""
        a_list = ul.find_elements_by_xpath("li/div[@class='pf']/a")
        str_id_list = list()
        for a in a_list:
            href = a.get_attribute('href')
            str_id = href.replace(kakaostory_url, "").strip()
            str_id_list.append(str_id)
        return str_id_list
    def find_article_profileurl(self, ul):
        img = ul.find_elements_by_xpath("li/div[@class='pf']/a/img")
        img_list = list()
        for im in img:
            img_list.append(im.get_attribute('src'))
        return img_list
    def find_article_nickname(self, ul):
        a = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@data-profile-popup]")
        # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']")
        # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel
        nickname_list = list()
        for i in a:
            nickname_list.append(i.text)
        return nickname_list
    def find_article_date(self, ul):
        """Parse each comment's timestamp (title attribute) into
        "YYYY-MM-DD HH:MM:SS"; all-zero string when unparseable."""
        a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']")
        # a.send_keys(Keys.NULL)
        # ac = ActionChains(self.driver)
        # ac.move_to_element(a).perform()
        # wait(0.1)
        # data_tooltip = a.get_attribute("data-tooltip")
        date_list = list()
        for a in a_list:
            data_tooltip = a.get_attribute("title")
            m = self.re_date.search(data_tooltip)
            if m is None:
                date_list.append("0000-00-00 00:00:00")
            else:
                temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6)))
                # "오후" means PM: convert the 12-hour time to 24-hour.
                if m.group(4) == "오후" and int(m.group(5)) < 12:
                    temp_date += datetime.timedelta(hours=12)
                #return temp_date.strftime("%Y-%m-%d")
                date_list.append(str(temp_date))
        return date_list
    def find_article_parent(self, ul):
        """Per comment: the name linked directly in the comment body
        (presumably the replied-to user -- verify), or "" when absent."""
        comments = ul.find_elements_by_xpath("li/div[@class='txt']")
        article_parents = list()
        for comment in comments:
            try:
                a = comment.find_element_by_xpath("a[@data-profile-popup]")
                article_parents.append(a.text)
            except:
                article_parents.append("")
        return article_parents
    def find_article_data(self, ul):
        # Each <div class='txt'> = header <p> (name/time) + comment text;
        # strip the header prefix from the combined text.
        all_elements = ul.find_elements_by_xpath("li/div[@class='txt']")
        all_elements_p = ul.find_elements_by_xpath("li/div[@class='txt']/p")
        all_text_list = list()
        for i in range(0, len(all_elements)):
            all_text = all_elements[i].text
            p_text = all_elements_p[i].text
            all_text_list.append(all_text[len(p_text):].strip())
        return all_text_list
    def find_article_url(self, ul):
        # Drop each href's trailing path segment so it points at the post.
        a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']")
        article_url_list = list()
        for a in a_list:
            href = a.get_attribute("href")
            article_url_list.append(href[:href.rindex('/')])
        return article_url_list
    def find_platform_id(self, ul):
        article_urls = self.find_article_url(ul)
        platform_id = list()
        for article_url in article_urls:
            main_url = article_url[:article_url.rindex('/')]
            #return main_url[main_url.rindex('/')+1:]
            platform_id.append(main_url.replace(kakaostory_url, ""))
        return platform_id
    def find_article_form(self, ul=None):
        return 'reply'
    def find_platform_name(self, ul=None):
        return 'kakaostory'
    def find_platform_form(self, ul=None):
        if self.driver.current_url.find("https://story.kakao.com/ch/") != -1:
            return 'channel'
        elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1:
            return 'tag'
        else:
            return 'story'
    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)
    def enter_element(self, element):
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)
    def find_like_count(self, ul):
        # NOTE(review): xpath is li-relative (copied from the backup class);
        # looks unused/vestigial with a ul argument -- verify callers.
        try:
            like = ul.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']")
            return like.text
        except:
            return '0'
class KakaoPageCrawler:
    """Walks a Kakao Story feed page, yielding post ("activity") elements
    whose timestamp falls inside the configured [begin_date, end_date] window.

    Posts already crawled are remembered by their ``data-model`` attribute so
    a crawl can be resumed after a browser restart without duplicating work.
    The page is an infinite-scroll feed; ``load_more_activities`` scrolls in
    several increasingly aggressive ways to force more posts to load.
    """
    def __init__(self, driver=None, begin_date=None, end_date=None):
        self.driver = driver
        # data-model ids of posts already crawled (dedup across reloads/restarts)
        self.activity_data_model_set = set()
        self.begin_date = begin_date
        self.end_date = end_date
        # Parses the timestamp tooltip: year, month, day, AM/PM word, hour, minute.
        # Assumes a Korean-locale tooltip such as "2015년 12월 7일 ... 오후 3:25" — TODO confirm.
        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
        # 1-based cursor into the currently known list of activity elements
        self.index = 0
        self.activities = None
        self.present_activity = 0
        self.previous_activity = 0
        # escalation counter for load_more_activities' recovery strategies
        self.reload_count = 0
    def move_to_url(self, url):
        """Navigate the driver to *url* and reset the crawl cursor/dedup set."""
        self.driver.get(url)
        self.index = 0
        self.activity_data_model_set.clear()
    def init(self):
        """Reset all per-page crawl state (cursor, cached elements, dedup set)."""
        self.index = 0
        self.previous_activity = 0
        self.activities = None
        self.activity_data_model_set.clear()
    def set_date(self, begin_date, end_date):
        """Set both ends of the crawl date window."""
        self.set_begin_date(begin_date)
        self.set_end_date(end_date)
    def set_end_date(self, end_date):
        """Set the window's end; accepts 'YYYY-MM-DD' str, datetime/date, or None (today).

        The value is truncated to midnight and then pushed one day forward so the
        whole end day is included in the window.
        """
        if type(end_date) == str:
            self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
            self.end_date = end_date
        else:
            self.end_date = datetime.datetime.today()
        self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
        self.end_date += datetime.timedelta(days=1)
    def set_begin_date(self, begin_date):
        """Set the window's start; accepts 'YYYY-MM-DD' str, datetime/date, or None (today).

        The value is truncated to midnight.
        """
        if type(begin_date) == str:
            self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
        elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
            self.begin_date = begin_date
        else:
            self.begin_date = datetime.datetime.today()
        self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)
    def next_activity_backup(self):
        """Older implementation of next_activity kept as a fallback.

        Returns the next unseen, in-window activity element, or None when the
        feed is exhausted or a post older than begin_date is reached.
        NOTE(review): unlike next_activity, this marks posts as seen *before*
        they are crawled, so a crash loses the current post.
        """
        try:
            if not self.activities:
                self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[class='section _activity']")))
                self.index = 0
            if len(self.activities) == 0:
                print_and_flush("activities are not found")
                self.activities = None
                return None
        except:
            print_and_flush("activities are not found")
            self.activities = None
            return None
        has_more_activities = True
        self.present_activity = len(self.activities)
        while has_more_activities:
            # only look at elements added since the previous pass
            for activity in self.activities[self.previous_activity:]:
                if activity.get_attribute("data-model") in self.activity_data_model_set:
                    continue
                self.activity_data_model_set.add(activity.get_attribute("data-model"))
                time_date = self.find_article_date(activity)
                if self.is_earlier(time_date):
                    # feed is chronological; anything older than begin_date ends the crawl
                    self.activities = None
                    return None
                if self.is_late(time_date):
                    continue
                return activity
            self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if len(self.activities) == self.present_activity:
                # nothing new appeared: try to force more posts to load
                has_more_activities = self.load_more_activities()
                self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            else:
                has_more_activities = True
            self.previous_activity = self.present_activity
            self.present_activity = len(self.activities)
        self.activities = None
        return None
    def next_activity(self):
        """Return the next unseen activity element inside the date window.

        Returns None when the feed is exhausted, can't be loaded, or a post
        older than begin_date is reached. The caller must call crawling_ok()
        after successfully processing the returned element; otherwise the same
        post is returned again on the next call (crash-safe resume).
        """
        try:
            if self.activities is None:
                self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "div[class='section _activity']"))
                )
            if len(self.activities) == 0:
                print_and_flush("activities are not found")
                self.activities = None
                return None
        except:
            print_and_flush("activities are not found")
            self.activities = None
            return None
        while True:
            self.index += 1
            if self.index >= len(self.activities):
                # cursor ran past the cached list: refresh it, then scroll for more
                self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
                if self.index >= len(self.activities):
                    if self.load_more_activities() is False:
                        self.activities = None
                        return None
                    self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if self.activities[self.index - 1].get_attribute("data-model") in self.activity_data_model_set:
                continue
            time_date = self.find_article_date(self.activities[self.index - 1])
            # a post's edit time (if any) wins over its original post time
            time_modified_date = self.find_article_modified_date(self.activities[self.index - 1])
            if time_modified_date is not None:
                time_date = time_modified_date
            print_and_flush(str(time_date))
            if type(time_date) == str:
                # find_article_date returns the sentinel string "0000-00-00 00:00:00"
                # when the tooltip could not be parsed; skip such posts
                continue
            if self.is_earlier(time_date):
                self.activities = None
                return None
            if self.is_late(time_date):
                continue
            return self.activities[self.index - 1]
    def crawling_ok(self):
        """Mark the activity last returned by next_activity as successfully crawled."""
        self.activity_data_model_set.add(self.activities[self.index - 1].get_attribute("data-model"))
    def next_activity_prepare(self):
        """Variant of next_activity without dedup/edit-time handling.

        NOTE(review): appears unused by the main flow in this file — verify callers.
        """
        try:
            activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if len(activities) == 0:
                return None
        except:
            return None
        has_more_activities = True
        while has_more_activities:
            if self.index < len(activities):
                temp_index = self.index
                self.index += 1
                time_date = self.find_article_date(activities[temp_index])
                if self.is_earlier(time_date):
                    return None
                if self.is_late(time_date):
                    continue
                return activities[temp_index]
            else:
                has_more_activities = self.load_more_activities()
                activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
        return None
    def load_more_activities(self):
        """Try to make the infinite-scroll feed load more posts.

        Escalates through strategies: END key, PAGE_UP/PAGE_DOWN churn, JS
        scrolling, window resize (up to 10 times, halving the cursor), and a
        full page refresh (up to 5 more times). Returns True if the caller
        should re-read the element list, False when every strategy is spent.
        """
        previous_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
        for i in range(0, 5):
            print_and_flush("Try load more")
            body = self.driver.find_element_by_tag_name("body")
            body.send_keys(Keys.NULL)
            body.send_keys(Keys.END)
            wait(4)
            present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if len(previous_activities) != len(present_activities):
                wait(2)
                self.reload_count = 0
                return True
        for i in range(0, 5):
            print_and_flush("Try load more")
            body = self.driver.find_element_by_tag_name("body")
            # jiggle upward first, then page down repeatedly to retrigger lazy loading
            for j in range(0, 3):
                body.send_keys(Keys.PAGE_UP)
                wait(0.1)
            for j in range(0, 50):
                body.send_keys(Keys.PAGE_DOWN)
                wait(0.1)
            wait(4)
            present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if len(previous_activities) != len(present_activities):
                wait(2)
                self.reload_count = 0
                return True
        for i in range(0, 10):
            print_and_flush("Try load more")
            self.driver.execute_script("window.scrollBy(0, 800)")
            wait(4)
            present_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if len(previous_activities) != len(present_activities):
                wait(2)
                self.reload_count = 0
                return True
        if self.reload_count < 10:
            print_and_flush("index reload")
            self.reload_count += 1
            # back the cursor off (some already-seen posts will be skipped via the
            # dedup set) and poke the window geometry to force a re-layout
            self.index //= 2
            position = self.driver.get_window_position()
            size = self.driver.get_window_size()
            self.driver.maximize_window()
            self.driver.set_window_size(size['width'], size["height"])
            self.driver.set_window_position(position['x'], position['y'])
            return True
        if self.reload_count < 15:
            print_and_flush("refresh")
            self.driver.refresh()
            wait(5)
            self.index = 0
            self.reload_count += 1
            return True
        return False
    def is_earlier(self, time_date):
        """True if *time_date* is before the crawl window's start."""
        return True if time_date < self.begin_date else False
    def is_late(self, time_date):
        """True if *time_date* is after the crawl window's end."""
        return True if time_date > self.end_date else False
    def set_driver(self, driver):
        self.driver = driver
    def find_article_date(self, activity):
        """Return the post's timestamp as a datetime.

        Hovers the time link so its tooltip renders, then parses it; falls
        back to the ``title`` attribute. Returns the sentinel string
        "0000-00-00 00:00:00" when neither can be parsed.
        """
        a = activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']")
        a.send_keys(Keys.NULL)
        ac = ActionChains(self.driver)
        # hover twice with pauses: the tooltip attribute is populated lazily
        ac.move_to_element(a).perform()
        wait(0.5)
        ac.move_to_element(a).perform()
        wait(0.5)
        data_tooltip = a.get_attribute("data-tooltip")
        m = self.re_date.search(data_tooltip)
        if m is None:
            data_tooltip = a.get_attribute("title")
            m = self.re_date.search(data_tooltip)
            if m is None:
                return "0000-00-00 00:00:00"
            else:
                temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                              int(m.group(5)), int(m.group(6)))
                # "오후" means PM; convert 12-hour tooltip time to 24-hour
                if m.group(4) == "오후" and int(m.group(5)) < 12:
                    temp_date += datetime.timedelta(hours=12)
                #return temp_date.strftime("%Y-%m-%d")
                return temp_date
        else:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                          int(m.group(5)), int(m.group(6)))
            # "오후" means PM; convert 12-hour tooltip time to 24-hour
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            #return temp_date.strftime("%Y-%m-%d")
            return temp_date
    def find_article_modified_date(self, activity):
        """Return the post's last-edited timestamp, or None when it was never edited
        or the tooltip can't be found/parsed."""
        try:
            span = activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span")
        except:
            return None
        ac = ActionChains(self.driver)
        # hover so the tooltip attribute is populated
        ac.move_to_element(span).perform()
        wait(0.8)
        data_tooltip = span.get_attribute("data-tooltip")
        wait(0.2)
        m = self.re_date.search(data_tooltip)
        if m is None:
            return None
        else:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                          int(m.group(5)), int(m.group(6)))
            # "오후" means PM; convert 12-hour tooltip time to 24-hour
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            #return temp_date.strftime("%Y-%m-%d")
            return temp_date
class KakaoMainCrawler:
    """Top-level orchestrator: iterates a feed's posts via KakaoPageCrawler
    and stores each post body and its replies through SendtoDB."""
    def __init__(self):
        self.page_crawler = KakaoPageCrawler()
        self.body_crawler = KakaoBodyCrawler()
        self.reply_crawler = KakaoReplyCrawler()
        self.send_to_db = SendtoDB()
        self.driver = None
        self.browser = None
    def set_driver(self, driver):
        """Propagate one shared WebDriver instance to every sub-crawler."""
        self.page_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.driver = driver
    def set_keyword_id(self, keyword_id):
        # keyword_id is attached to every stored post body (see crawl_body)
        self.keyword_id = keyword_id
    def crawl_all_current_url(self, backup_set=None):
        """Crawl every in-window post on the page the driver currently shows.

        backup_set: optional set of data-model ids already crawled by a
        previous (crashed) attempt; those posts are skipped.

        Raises WebDriverException (re-raised) so the caller can restart the
        browser; any other per-post error is logged and the crawl continues.
        """
        self.page_crawler.init()
        if backup_set:
            self.page_crawler.activity_data_model_set = backup_set.copy()
        while True:
            activity = self.page_crawler.next_activity()
            if activity is None:
                break
            try:
                self.crawl_body(activity)
                self.crawl_reply(activity)
                # only mark the post as done once body AND replies succeeded
                self.page_crawler.crawling_ok()
                print_and_flush("ok")
            except WebDriverException as ee:
                print_and_flush(ee)
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
                print_and_flush("fail")
                # BUGFIX: was `raise WebDriverException`, which raised a brand-new,
                # message-less exception and discarded the original traceback.
                raise
            except Exception as e:
                # non-WebDriver errors: log them and move on to the next post
                print_and_flush("failed")
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
                print_and_flush(e)
    def crawl_body(self, activity):
        """Extract the post body and replace any previously stored copy in the DB."""
        self.body_crawler.set_driver(self.driver)
        self.body_crawler.set_activity(activity)
        content = self.body_crawler.get_content()
        content["keyword_id"] = self.keyword_id
        print_and_flush(content["article_url"])
        # delete-then-insert keeps re-crawled posts from duplicating
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
    def crawl_reply(self, activity):
        """Crawl and store the post's replies, if it has any."""
        self.reply_crawler.set_driver(self.driver)
        self.reply_crawler.set_activity(activity)
        if self.reply_crawler.has_reply():
            self.reply_crawler.crawl_all()
            self.send_to_db.send_reply(self.reply_crawler.get_content())
class KakaoInit(CrawlInit):
    """Crawl configuration for Kakao Story: maps the platform id to a base
    URL (channel / hashtag / profile) and derives the crawl date window."""
    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> Kakao Story base URL
        self.urls = {
            6: "https://story.kakao.com/ch/",
            7: "https://story.kakao.com/hashtag/",
            8: "https://story.kakao.com/",
        }
    def split_searches(self):
        """Split the comma-separated search string into trimmed terms.

        Channel/profile platforms (6, 8) use the raw term; others are passed
        through the parent's utf8 conversion.
        """
        terms = [term.strip() for term in self.searches().split(',')]
        if self.platform() == 6 or self.platform() == 8:
            return terms
        return [self.utf8(term) for term in terms]
    def make_url(self):
        """Build one crawl URL per search term for the configured platform."""
        base = self.urls[self.platform()]
        return [base + term for term in self.split_searches()]
    def get_begin_day(self):
        """Window start: midnight shifted by before_day in realtime mode,
        otherwise the configured start day."""
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        midnight = datetime.datetime(year=now.year, month=now.month, day=now.day)
        return midnight + datetime.timedelta(days=self.before_day)
    def get_end_day(self):
        """Window end: today's midnight in realtime mode, otherwise the
        configured end day."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day)
if __name__ == '__main__':
    """
    argv:
        0 - kakaocrawl.py
        1 - keyword_id
        2 - data db num
        3 - before_day
        4 - until_page
    """
    if len(sys.argv) < 5:
        print("Fail to process execute")
        exit(1)
    else:
        print("Start Python Crawling")
        # load keyword configuration from the DB, then drop that connection
        kakao_init = KakaoInit(int(sys.argv[3]))
        kakao_init.get_keyword_parameters(sys.argv[1])
        kakao_init.disconnect()
        browser = Browser()
        kakao_main = KakaoMainCrawler()
        kakao_main.set_driver(browser.get_new_driver("chrome"))
        # kakao_main.driver.implicitly_wait(5)
        wait(3)
        kakao_main.set_keyword_id(sys.argv[1])
        kakao_main.send_to_db.set_db(sys.argv[2])
        realtime = True
        # in realtime mode this outer loop repeats the whole crawl indefinitely
        while realtime:
            print_and_flush("Crawler Start")
            url_list = kakao_init.make_url()
            i = 0
            # data-model ids crawled before a browser crash, to resume without duplicates
            backup_set = set()
            while i < len(url_list):
                try:
                    print_and_flush(url_list[i] + "\n")
                    kakao_main.driver.get(url_list[i])
                    wait(3)
                    kakao_main.page_crawler.set_date(begin_date=kakao_init.get_begin_day(),
                                                     end_date=kakao_init.get_end_day())
                    kakao_main.crawl_all_current_url(backup_set)
                    # only advance to the next URL after a fully successful crawl
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    # crash recovery: log, save progress, restart the browser,
                    # and retry the same URL (i is NOT advanced)
                    print_and_flush(e)
                    exc_type, exc_obj, exc_tb = sys.exc_info()
                    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                    print(exc_type, fname, exc_tb.tb_lineno)
                    backup_set = kakao_main.page_crawler.activity_data_model_set.copy()
                    kakao_main.set_driver(browser.new_browser())
                    # kakao_main.driver.implicitly_wait(5)
                    wait(5)
            realtime = kakao_init.is_realtime()
            print_and_flush("Finished Crawling :)")
        # kakao_main.driver.quit()
        kakao_main.send_to_db.close()
        print_and_flush("ByeBye :)")
        exit(0)