#-*- coding: utf-8 -*-
|
|
__author__ = 'cococo'
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.action_chains import ActionChains
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
|
import sys
|
|
import re
|
|
import datetime
|
|
import json
|
|
import os
|
|
import time
|
|
|
|
from navercrawl import wait
|
|
from navercrawl import print_and_flush
|
|
from navercrawl import SendtoDB
|
|
from navercrawl import Browser
|
|
from navercrawl import CrawlInit
|
|
from selenium.common.exceptions import WebDriverException
|
|
|
|
kakaostory_url = 'https://story.kakao.com/'
|
|
kakaostory_channel_url = 'https://story.kakao.com/ch/'
|
|
|
|
class KakaoBodyCrawler:
    """Extracts one Kakao Story post ("body" record) from a feed activity element.

    The caller supplies a Selenium WebDriver and the post's container
    WebElement (via set_activity); get_content() then assembles a dict of
    article_*/platform_* fields plus the like/share user lists.
    """

    def __init__(self, driver=None):
        # Selenium WebDriver used for all page interactions.
        self.driver = driver
        # Parses Kakao's date tooltip into
        # (year, month, day, AM/PM marker, hour, minute) groups.
        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")

    def set_driver(self, driver):
        """Replace the WebDriver used by subsequent calls."""
        self.driver = driver

    def set_activity(self, activity):
        """Set the post container WebElement that all find_* methods read."""
        self.activity = activity

    def find_article_profileurl(self):
        """Return the author's profile-image URL."""
        img = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a/img")
        return img.get_attribute("src")

    def find_article_nickname(self):
        """Return the author's display name."""
        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/div[@class='myid']/a")
        return a.text

    def find_article_modified_date(self):
        """Return the post's edit time as a datetime, or None.

        None is returned when the post has no "edited" marker or when the
        tooltip text does not match the expected date pattern.
        """
        try:
            span = self.activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span")
        except:
            # No edit-time marker on this post.
            return None
        # Hover so the data-tooltip attribute gets populated before reading it.
        ac = ActionChains(self.driver)
        ac.move_to_element(span).perform()
        wait(0.3)
        data_tooltip = span.get_attribute("data-tooltip")
        m = self.re_date.search(data_tooltip)
        if m is None:
            return None
        else:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                          int(m.group(5)), int(m.group(6)))
            # "오후" is the PM marker: convert 12-hour to 24-hour clock.
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            #return temp_date.strftime("%Y-%m-%d")
            # NOTE(review): returns a datetime here, while find_article_date
            # returns a str on its own path — callers see mixed types.
            return temp_date

    def find_article_date(self):
        """Return the post time as "YYYY-MM-DD HH:MM:SS".

        Prefers the modified date when present (that path returns a datetime
        object, not a string). Falls back to "0000-00-00 00:00:00" when the
        tooltip cannot be parsed.
        """
        time_modified_date = self.find_article_modified_date()
        if time_modified_date is not None:
            return time_modified_date
        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']")
        # Focus (no-op key) + hover so the data-tooltip attribute is filled in.
        a.send_keys(Keys.NULL)
        ac = ActionChains(self.driver)
        ac.move_to_element(a).perform()
        wait(0.2)
        data_tooltip = a.get_attribute("data-tooltip")
        m = self.re_date.search(data_tooltip)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6)))
            # "오후" is the PM marker: convert 12-hour to 24-hour clock.
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            #return temp_date.strftime("%Y-%m-%d")
            return str(temp_date)

    def find_article_id(self):
        """Return the author id: the profile href with the site prefix stripped."""
        a = self.activity.find_element_by_xpath("div/div[@class='_profileArea pf']/a")
        href = a.get_attribute("href")
        #str_id = href[href.rindex('/') + 1:]
        str_id = href.replace(kakaostory_url, "")
        return str_id

    def find_article_url(self):
        """Return the permalink URL of the post."""
        a = self.activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']")
        url = a.get_attribute("href")
        return url

    def find_platform_name(self):
        """Platform identifier stored with every record."""
        return "kakaostory"

    def find_platform_form(self):
        """Classify the current page as 'channel', 'tag' or 'story' by its URL."""
        if self.driver.current_url.find("https://story.kakao.com/ch/") != -1:
            return 'channel'
        elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1:
            return 'tag'
        else:
            return 'story'

    def find_article_form(self):
        """This crawler always produces 'body' (post) records."""
        return "body"

    def find_article_data(self):
        """Return the full post text, expanding the "more" button when shown.

        Returns an empty string when the content container is missing.
        """
        more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/p[@class='more _moreBtnContainer']")
        display = more.get_attribute("style")
        # The "more" button is considered visible unless styled display:none.
        if display.find('none') == -1:
            a = more.find_element_by_tag_name("a")
            self.enter_element(a)
        try:
            content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/div[@class='txt_wrap _content']")
        except:
            return str("")
        return content.text

    def click_element(self, element):
        """Click via ActionChains at the element's offset (0, 0), then pause 2 s."""
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

    def enter_element(self, element):
        """Activate an element by sending ENTER (Keys.NULL first), then pause 2 s."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)

    def find_platform_id(self):
        """Platform id equals the article (author) id for body posts."""
        return self.find_article_id()

    def find_article_title(self):
        """Return the first line of the post text, or '' when there is none."""
        content = self.find_article_data()
        if not content:
            return ""
        try:
            return content.strip().splitlines()[0]
        except:
            return ""

    def find_feeling_users3(self):
        """Old like-list scraper (superseded by find_feeling_users).

        Opens the like layer, drags its custom scrollbar until every liker
        <li> is rendered, and collects {'id', 'profileurl'} per liker.
        Returns None when the like button is absent or hidden.
        NOTE(review): unlike find_feeling_users, the scroll loop here has no
        timeout and can spin indefinitely if entries stop loading.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']")
        like_num = int(str_like.text)
        fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
        # Drag the scrollbar 15 px at a time until all likers are loaded.
        while len(fake_scroll.find_elements_by_tag_name("li")) < like_num:
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            ac = ActionChains(self.driver)
            ac.drag_and_drop_by_offset(scroll, 0, 15).perform()
            wait(1)
        lis = fake_scroll.find_elements_by_tag_name("li")
        data = list()
        for li in lis:
            try:
                a = li.find_element_by_xpath("a[@class='link_people']")
                href = a.get_attribute('href')
                # str_id = href[href.rindex('/') + 1:]
                str_id = href.replace(kakaostory_url, "")
                img = a.find_element_by_css_selector("img[class='img_thumb']")
                profileurl = img.get_attribute('src')
                data.append({'id': str_id, 'profileurl': profileurl})
            except WebDriverException:
                raise WebDriverException
            except Exception as e:
                # Log and skip this entry; keep collecting the rest.
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
                print_and_flush(e)
        feelings = dict()
        feelings['data'] = data
        feelings['count'] = len(data)
        # Close the like layer before returning to the feed.
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']")
        self.click_element(a)
        wait(1)
        return feelings

    def find_reply_users(self):
        """Return the comment count as an int, or None when unavailable."""
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewComments' and not(@style)]")
        except:
            return None
        count = a.find_element_by_css_selector("strong._commentCount").text
        if len(count.strip()) < 1:
            return None
        else:
            return int(count.replace(",", "").strip())

    def find_feeling_users(self):
        """Collect the ids of users who liked the post.

        Opens the like layer, drags its custom scrollbar until every entry
        is rendered (capped at 600 s), and returns
        {'data': [{'id': ...}, ...], 'count': N}. Returns None when the like
        button is absent or hidden.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']")))
        str_like = inner_layer.find_element_by_css_selector("span[class='_likeCount']")
        # The count may contain thousands separators, e.g. "1,234".
        like_num = int(str_like.text.replace(",", ""))
        # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']")))
        start_time = time.time()
        # Drag the scrollbar until every liker <li> is present, or give up
        # after ten minutes.
        while len(fake_scroll.find_elements_by_tag_name("li")) < like_num:
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            ac = ActionChains(self.driver)
            ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
            ac.drag_and_drop_by_offset(scroll, 0, 30).perform()
            wait(0.5)
            if time.time() - start_time > 600.0:
                break
        ul = fake_scroll.find_element_by_tag_name("ul")
        data = list()
        try:
            a_list = ul.find_elements_by_css_selector("a[class='link_people']")
            # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']")
            for i in range(0, len(a_list)):
                href = a_list[i].get_attribute('href')
                str_id = href.replace(kakaostory_url, "")
                # profileurl = img_list[i].get_attribute('src')
                # data.append({'id': str_id, 'profileurl': profileurl})
                data.append({'id': str_id})
        except WebDriverException:
            raise WebDriverException
        except Exception as e:
            # Log and return whatever was collected so far.
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            print_and_flush(e)
        feelings = dict()
        feelings['data'] = data
        feelings['count'] = len(data)
        # Close the like layer before returning to the feed.
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']")
        self.click_element(a)
        wait(1)
        return feelings

    def find_feeling_users2(self):
        """Old like-list scraper that pages by scrollbar geometry.

        Reads top/height from the scrollbar's inline style and advances six
        items at a time until the bar reaches the bottom of its track.
        Collects id, nickname, emotion and profile url per liker.
        Superseded by find_feeling_users.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
        scroll_style = scroll.get_attribute("style")
        # The inline style carries the scrollbar geometry ("top: ...px; height: ...px").
        re_height = re.compile("height: ([0-9]*\\.[0-9]+|[0-9]+)px")
        re_top = re.compile("top: ([0-9]*\\.[0-9]+|[0-9]+)px")
        m_h = re_height.search(scroll_style)
        m_t = re_top.search(scroll_style)
        if m_t is None:
            top = 0.0
        else:
            top = float(m_t.group(1))
        if m_h is None:
            height = 0.0
        else:
            height = float(m_h.group(1))
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
        i = 0
        data = list()
        # 320 is presumably the scroll track height in px — loop until the
        # bar's bottom edge reaches it. TODO confirm against the page CSS.
        while height + top < 320:
            lis = fake_scroll.find_elements_by_tag_name("li")
            # Process up to six newly revealed entries per pass.
            for j in range(i, (i+6) if i+6 < len(lis) else len(lis)):
                a = lis[j].find_element_by_xpath("a[@class='link_people']")
                href = a.get_attribute('href')
                str_id = href[href.rindex('/') + 1:]
                em = a.find_element_by_css_selector("em[class='tit_userinfo']")
                nickname = em.text
                span = a.find_element_by_css_selector("span[class='txt_feel']")
                emotion = span.text
                img = a.find_element_by_css_selector("img[class='img_thumb']")
                profileurl = img.get_attribute('src')
                data.append({'id': str_id, 'nickname': nickname, 'emotion': emotion, 'profileurl': profileurl})
            i += 6
            # Drag distance scaled by how many items are currently loaded.
            move_pixel = 1968.0 / len(fake_scroll.find_elements_by_tag_name("li"))
            ac = ActionChains(self.driver)
            ac.drag_and_drop_by_offset(scroll, 0, move_pixel).perform()
            wait(1)
            # Re-read the scrollbar geometry after the drag.
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            scroll_style = scroll.get_attribute("style")
            m_h = re_height.search(scroll_style)
            m_t = re_top.search(scroll_style)
            if m_t is None:
                top = 0.0
            else:
                top = float(m_t.group(1))
            if m_h is None:
                height = 0.0
            else:
                height = float(m_h.group(1))
        feelings = dict()
        feelings['data'] = data
        feelings['count'] = len(data)
        # Close the like layer before returning to the feed.
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _likeListLayerClose']")
        self.click_element(a)
        return feelings

    def find_share_users2(self):
        """Old share-list scraper (superseded by find_share_users).

        Opens the share layer, scrolls until all entries load (no timeout),
        and collects {'id', 'profileurl'} per sharer. Returns None when the
        share button is absent or hidden.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']")
        # Share count appears in parentheses inside the layer title.
        re_share = re.compile("\\(([\\d]+)\\)")
        m = re_share.search(str_share.text)
        if m is None:
            share_num = 0
        else:
            share_num = int(m.group(1))
        fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
        # Drag the scrollbar 15 px at a time until all sharers are loaded.
        while len(fake_scroll.find_elements_by_tag_name("li")) < share_num:
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            ac = ActionChains(self.driver)
            ac.drag_and_drop_by_offset(scroll, 0, 15).perform()
            wait(1)
        lis = fake_scroll.find_elements_by_tag_name("li")
        data = list()
        for li in lis:
            try:
                a = li.find_element_by_xpath("a[@class='link_people']")
                href = a.get_attribute('href')
                # Share links end with an extra path segment; strip it before
                # removing the site prefix to recover the user id.
                last_slush = href.rindex('/')
                # begin_slush = href[:last_slush].rindex('/')
                # str_id = href[begin_slush+1:last_slush]
                str_id = href[:last_slush].replace(kakaostory_url, "")
                img = a.find_element_by_css_selector("img[class='img_thumb']")
                profileurl = img.get_attribute('src')
                data.append({'id': str_id, 'profileurl': profileurl})
            except WebDriverException:
                raise WebDriverException
            except Exception as e:
                # Log and skip this entry; keep collecting the rest.
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
                print_and_flush(e)
        shares = dict()
        shares['data'] = data
        shares['count'] = len(data)
        # Close the share layer before returning to the feed.
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']")
        self.click_element(a)
        return shares

    def find_share_users(self):
        """Collect the ids of users who shared the post.

        Opens the share layer, drags its custom scrollbar until every entry
        is rendered (capped at 600 s), and returns
        {'data': [{'id': ...}, ...], 'count': N}. Returns None when the
        share button is absent or hidden.
        """
        try:
            a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]")
        except:
            return None
        self.enter_element(a)
        # inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
        inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']")))
        str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']")
        # Share count appears in parentheses inside the layer title.
        re_share = re.compile("\\(([\\d]+)\\)")
        m = re_share.search(str_share.text)
        if m is None:
            share_num = 0
        else:
            share_num = int(m.group(1).replace(",", ""))
        # fake_scroll = inner_layer.find_element_by_css_selector("div[class='fake_scroll']")
        fake_scroll = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='fake_scroll']")))
        start_time = time.time()
        # Drag the scrollbar until every sharer <li> is present, or give up
        # after ten minutes.
        while len(fake_scroll.find_elements_by_tag_name("li")) < share_num:
            scroll = fake_scroll.find_element_by_css_selector("div[class='scroll']")
            ac = ActionChains(self.driver)
            ac.move_to_element_with_offset(fake_scroll, 0, 0).perform()
            ac.drag_and_drop_by_offset(scroll, 0, 30).perform()
            wait(0.5)
            if time.time() - start_time > 600.0:
                break
        ul = fake_scroll.find_element_by_tag_name("ul")
        data = list()
        try:
            a_list = ul.find_elements_by_css_selector("a[class='link_people']")
            # img_list = ul.find_elements_by_css_selector("img[class='img_thumb']")
            for i in range(0, len(a_list)):
                href = a_list[i].get_attribute('href')
                # Share links end with an extra path segment; strip it before
                # removing the site prefix to recover the user id.
                last_slush = href.rindex('/')
                # begin_slush = href[:last_slush].rindex('/')
                # str_id = href[begin_slush+1:last_slush]
                str_id = href[:last_slush].replace(kakaostory_url, "")
                # profileurl = img_list[i].get_attribute('src')
                # data.append({'id': str_id, 'profileurl': profileurl})
                data.append({'id': str_id})
        except WebDriverException:
            raise WebDriverException
        except Exception as e:
            # Log and return whatever was collected so far.
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            print_and_flush(e)
        shares = dict()
        shares['data'] = data
        shares['count'] = len(data)
        # Close the share layer before returning to the feed.
        a = inner_layer.find_element_by_css_selector("a[class='btn_close _btnCloseShareLayer']")
        self.click_element(a)
        return shares

    def find_platform_title(self):
        """Current page <title> (unused by get_content, which reuses the nickname)."""
        return self.driver.title

    def get_content(self):
        """Assemble the full record dict for the current post.

        Field repurposing worth knowing (as written to the DB downstream):
        - 'article_profile' holds the like count (as str),
        - 'reply_url' holds the share count (as str),
        - 'article_order' holds the comment count (int),
        - 'etc' holds the like/share user lists as pretty-printed JSON.
        """
        content = dict()
        content["article_id"] = self.find_article_id()
        content["article_nickname"] = self.find_article_nickname()
        content["article_title"] = self.find_article_title()
        content["article_date"] = self.find_article_date()
        #content["article_hit"] = self.find_article_hit()
        content["article_url"] = self.find_article_url()
        content["article_data"] = self.find_article_data()
        content["article_form"] = self.find_article_form()
        content["article_profileurl"] = self.find_article_profileurl()
        #content["platform_title"] = self.find_platform_title()
        # The author's nickname doubles as the platform title.
        content["platform_title"] = content["article_nickname"]
        content["platform_name"] = self.find_platform_name()
        # Classify by the post URL rather than the current page URL.
        if content["article_url"].find(kakaostory_channel_url) != -1:
            content["platform_form"] = "channel"
        else:
            content["platform_form"] = "story"
        content["platform_id"] = self.find_platform_id()
        data = list()
        feelings = self.find_feeling_users()
        if feelings is not None:
            data.append({"feelings": feelings})
            content["article_profile"] = str(feelings["count"])
        shares = self.find_share_users()
        if shares is not None:
            data.append({"shares": shares})
            content["reply_url"] = str(shares["count"])
        if data:
            json_data = {"data": data}
            content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode(json_data)
        reply_count = self.find_reply_users()
        if type(reply_count) == int:
            content["article_order"] = reply_count
        return content
|
|
|
|
|
|
class KakaoReplyCrawler_backup:
    """Older comment crawler that processes each comment <li> individually.

    Kept as a backup of KakaoReplyCrawler, which performs the same
    extraction with batched element lookups per <ul> instead.
    """

    def __init__(self, driver=None, activity=None):
        # Selenium WebDriver used for all page interactions.
        self.driver = driver
        # Post container WebElement whose comments are crawled.
        self.activity = activity
        # Parses Kakao's date tooltip into
        # (year, month, day, AM/PM marker, hour, minute) groups.
        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
        # Accumulated per-comment dicts, in page order.
        self.reply_list = list()
        # Running index assigned to article_order.
        self.order = 0

    def find_init(self):
        """Reset accumulated state before crawling a new post."""
        self.reply_list.clear()
        self.order = 0

    def set_driver(self, driver):
        """Replace the WebDriver used by subsequent calls."""
        self.driver = driver

    def set_activity(self, activity):
        """Set the post container WebElement whose comments are crawled."""
        self.activity = activity

    def has_more(self):
        """True while the "more comments" button is displayed (display:block)."""
        more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
        if more.get_attribute('style').find('block') != -1:
            return True
        else:
            return False

    def read_more_reply(self):
        """Press the "more comments" button once to load the next batch."""
        more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
        a = more.find_element_by_css_selector("a[class='_btnCommentMore']")
        self.enter_element(a)

    def read_all_reply(self):
        """Keep expanding until every comment is loaded.

        NOTE(review): unlike KakaoReplyCrawler.read_all_reply, this loop has
        no timeout and can spin indefinitely if the button never hides.
        """
        while self.has_more():
            self.read_more_reply()

    def get_reply_lis(self):
        """Return the list of comment <li> elements."""
        ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
        lis = ul.find_elements_by_tag_name("li")
        return lis

    def has_reply(self):
        """True when the post has at least one comment."""
        try:
            ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
            lis = ul.find_elements_by_tag_name("li")
            if len(lis) > 0:
                return True
            else:
                return False
        except:
            return False

    def crawl_reply(self, li):
        """Extract one comment <li> into a dict and append it to reply_list."""
        content = dict()
        content["article_id"] = self.find_article_id(li)
        content["article_nickname"] = self.find_article_nickname(li)
        content["article_date"] = self.find_article_date(li)
        content["article_data"] = self.find_article_data(li)
        content["article_order"] = self.order
        content["article_url"] = self.find_article_url(li)
        content["platform_id"] = self.find_platform_id(li)
        content["article_form"] = self.find_article_form()
        content["article_profileurl"] = self.find_article_profileurl(li)
        content["platform_name"] = self.find_platform_name()
        # Classify by the post URL rather than the current page URL.
        if content["article_url"].find(kakaostory_channel_url) != -1:
            content["platform_form"] = "channel"
        else:
            content["platform_form"] = "story"
        article_parent = self.find_article_parent(li)
        if article_parent is not None:
            content["article_parent"] = article_parent
        self.order += 1
        self.reply_list.append(content)

    def get_content(self):
        """Return the list of comment dicts collected so far."""
        return self.reply_list

    def crawl_all(self):
        """Expand and crawl every comment.

        WebDriver errors propagate; any other exception is logged and
        swallowed, leaving reply_list partially filled.
        """
        self.find_init()
        self.read_all_reply()
        try:
            lis = self.get_reply_lis()
            for li in lis:
                self.crawl_reply(li)
        except WebDriverException:
            raise WebDriverException
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, fname, exc_tb.tb_lineno)
            print_and_flush(e)

    def find_article_id(self, li):
        """Commenter id: profile href with the site prefix stripped."""
        a = li.find_element_by_xpath("div[@class='pf']/a")
        href = a.get_attribute('href')
        str_id = href.replace(kakaostory_url, "").strip()
        return str_id

    def find_article_profileurl(self, li):
        """Commenter profile-image URL."""
        img = li.find_element_by_xpath("div[@class='pf']/a/img")
        return img.get_attribute('src')

    def find_article_nickname(self, li):
        """Commenter display name."""
        a = li.find_element_by_xpath("div[@class='txt']/p/a[@data-profile-popup]")
        # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']")
        # a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel
        return a.text

    def find_article_date(self, li):
        """Comment time as "YYYY-MM-DD HH:MM:SS", read from the title attribute.

        Returns "0000-00-00 00:00:00" when the attribute does not parse.
        """
        a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']")
        # The earlier hover-to-read data-tooltip approach was replaced by
        # reading the title attribute directly (no hover needed).
        data_tooltip = a.get_attribute("title")
        m = self.re_date.search(data_tooltip)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6)))
            # "오후" is the PM marker: convert 12-hour to 24-hour clock.
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            #return temp_date.strftime("%Y-%m-%d")
            return str(temp_date)

    def find_article_parent(self, li):
        """Return the mentioned user's name when the comment is a reply, else None."""
        comment = li.find_element_by_xpath("div[@class='txt']")
        try:
            a = comment.find_element_by_xpath("a[@data-profile-popup]")
            return a.text
        except:
            return None

    def find_article_data(self, li):
        """Comment text: the container text minus its leading header <p>."""
        all_element = li.find_element_by_xpath("div[@class='txt']")
        all_text = all_element.text
        p = all_element.find_element_by_tag_name('p')
        p_text = p.text
        return all_text[len(p_text):].strip()

    def find_article_url(self, li):
        """Post URL with the trailing path segment stripped."""
        a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='time _linkPost']")
        href = a.get_attribute("href")
        return href[:href.rindex('/')]

    def find_platform_id(self, li):
        """Post author's id, derived from the comment's post URL."""
        article_url = self.find_article_url(li)
        main_url = article_url[:article_url.rindex('/')]
        #return main_url[main_url.rindex('/')+1:]
        return main_url.replace(kakaostory_url, "")

    def find_article_form(self, li=None):
        """Records produced here are comments ('reply')."""
        return 'reply'

    def find_platform_name(self, li=None):
        """Platform identifier stored with every record."""
        return 'kakaostory'

    def find_platform_form(self, li=None):
        """Classify the current page as 'channel', 'tag' or 'story' by its URL."""
        if self.driver.current_url.find("https://story.kakao.com/ch/") != -1:
            return 'channel'
        elif self.driver.current_url.find("https://story.kakao.com/hashtag/") != -1:
            return 'tag'
        else:
            return 'story'

    def click_element(self, element):
        """Click via ActionChains at the element's offset (0, 0), then pause 2 s."""
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

    def enter_element(self, element):
        """Activate an element by sending ENTER (Keys.NULL first), then pause 2 s."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)

    def find_like_count(self, li):
        """Like count of one comment as a string; '0' when no like badge exists."""
        try:
            like = li.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']")
            return like.text
        except:
            return '0'
|
|
|
|
|
|
class KakaoReplyCrawler:
|
|
def __init__(self, driver=None, activity=None):
|
|
self.driver = driver
|
|
self.activity = activity
|
|
self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
|
|
self.reply_list = list()
|
|
self.order = 0
|
|
|
|
def find_init(self):
|
|
self.reply_list.clear()
|
|
self.order = 0
|
|
|
|
def set_driver(self, driver):
|
|
self.driver = driver
|
|
|
|
def set_activity(self, activity):
|
|
self.activity = activity
|
|
|
|
def has_more(self):
|
|
more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
|
|
if more.get_attribute('style').find('block') != -1:
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
def read_more_reply(self):
|
|
more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
|
|
a = more.find_element_by_css_selector("a[class='_btnCommentMore']")
|
|
self.enter_element(a)
|
|
|
|
def read_all_reply(self):
|
|
start_time = time.time()
|
|
while self.has_more():
|
|
self.read_more_reply()
|
|
if time.time() - start_time > 600.0:
|
|
raise WebDriverException
|
|
|
|
def get_reply_ul(self):
|
|
ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
|
|
return ul
|
|
|
|
def has_reply(self):
|
|
try:
|
|
ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
|
|
lis = ul.find_elements_by_tag_name("li")
|
|
if len(lis) > 0:
|
|
return True
|
|
else:
|
|
return False
|
|
except:
|
|
return False
|
|
|
|
def crawl_reply(self, ul):
|
|
article_id = self.find_article_id(ul)
|
|
article_nickname = self.find_article_nickname(ul)
|
|
article_date = self.find_article_date(ul)
|
|
article_data = self.find_article_data(ul)
|
|
article_url = self.find_article_url(ul)
|
|
platform_id = self.find_platform_id(ul)
|
|
article_profileurl = self.find_article_profileurl(ul)
|
|
article_parent = self.find_article_parent(ul)
|
|
# print_and_flush(str(len(article_id)))
|
|
# print_and_flush(str(len(article_nickname)))
|
|
# print_and_flush(str(len(article_date)))
|
|
# print_and_flush(str(len(article_data)))
|
|
# print_and_flush(str(len(article_url)))
|
|
# print_and_flush(str(len(platform_id)))
|
|
# print_and_flush(str(len(article_profileurl)))
|
|
# print_and_flush(str(len(article_parent)))
|
|
if article_url[0].find(kakaostory_channel_url) != -1:
|
|
platform_form = "channel"
|
|
else:
|
|
platform_form = "story"
|
|
for i in range(0, len(article_id)):
|
|
content = dict()
|
|
content["article_id"] = article_id[i]
|
|
content["article_nickname"] = article_nickname[i]
|
|
content["article_profileurl"] = article_profileurl[i]
|
|
content["article_url"] = article_url[i]
|
|
content["platform_id"] = platform_id[i]
|
|
content["article_date"] = article_date[i]
|
|
content["article_data"] = article_data[i]
|
|
content["platform_form"] = platform_form
|
|
content["article_order"] = i
|
|
content["platform_name"] = self.find_platform_name()
|
|
content["article_form"] = self.find_article_form()
|
|
if len(article_parent[i]) > 0:
|
|
content["article_parent"] = article_parent[i]
|
|
self.reply_list.append(content)
|
|
|
|
def get_content(self):
|
|
return self.reply_list
|
|
|
|
def crawl_all(self):
|
|
self.find_init()
|
|
self.read_all_reply()
|
|
try:
|
|
ul = self.get_reply_ul()
|
|
self.crawl_reply(ul)
|
|
except WebDriverException:
|
|
raise WebDriverException
|
|
except Exception as e:
|
|
exc_type, exc_obj, exc_tb = sys.exc_info()
|
|
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
|
print(exc_type, fname, exc_tb.tb_lineno)
|
|
print_and_flush(e)
|
|
|
|
def find_article_id(self, ul):
|
|
a_list = ul.find_elements_by_xpath("li/div[@class='pf']/a")
|
|
str_id_list = list()
|
|
for a in a_list:
|
|
href = a.get_attribute('href')
|
|
str_id = href.replace(kakaostory_url, "").strip()
|
|
str_id_list.append(str_id)
|
|
return str_id_list
|
|
|
|
def find_article_profileurl(self, ul):
|
|
img = ul.find_elements_by_xpath("li/div[@class='pf']/a/img")
|
|
img_list = list()
|
|
for im in img:
|
|
img_list.append(im.get_attribute('src'))
|
|
return img_list
|
|
|
|
def find_article_nickname(self, ul):
|
|
a = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@data-profile-popup]")
|
|
# a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder']")
|
|
# a = li.find_element_by_xpath("div[@class='txt']/p/a[@class='name _namePlaceholder channel']") for channel
|
|
nickname_list = list()
|
|
for i in a:
|
|
nickname_list.append(i.text)
|
|
return nickname_list
|
|
|
|
def find_article_date(self, ul):
|
|
a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']")
|
|
# a.send_keys(Keys.NULL)
|
|
# ac = ActionChains(self.driver)
|
|
# ac.move_to_element(a).perform()
|
|
# wait(0.1)
|
|
# data_tooltip = a.get_attribute("data-tooltip")
|
|
date_list = list()
|
|
for a in a_list:
|
|
data_tooltip = a.get_attribute("title")
|
|
m = self.re_date.search(data_tooltip)
|
|
if m is None:
|
|
date_list.append("0000-00-00 00:00:00")
|
|
else:
|
|
temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(5)), int(m.group(6)))
|
|
if m.group(4) == "오후" and int(m.group(5)) < 12:
|
|
temp_date += datetime.timedelta(hours=12)
|
|
#return temp_date.strftime("%Y-%m-%d")
|
|
date_list.append(str(temp_date))
|
|
return date_list
|
|
|
|
def find_article_parent(self, ul):
|
|
comments = ul.find_elements_by_xpath("li/div[@class='txt']")
|
|
article_parents = list()
|
|
for comment in comments:
|
|
try:
|
|
a = comment.find_element_by_xpath("a[@data-profile-popup]")
|
|
article_parents.append(a.text)
|
|
except:
|
|
article_parents.append("")
|
|
return article_parents
|
|
|
|
def find_article_data(self, ul):
|
|
all_elements = ul.find_elements_by_xpath("li/div[@class='txt']")
|
|
all_elements_p = ul.find_elements_by_xpath("li/div[@class='txt']/p")
|
|
all_text_list = list()
|
|
for i in range(0, len(all_elements)):
|
|
all_text = all_elements[i].text
|
|
p_text = all_elements_p[i].text
|
|
all_text_list.append(all_text[len(p_text):].strip())
|
|
return all_text_list
|
|
|
|
def find_article_url(self, ul):
|
|
a_list = ul.find_elements_by_xpath("li/div[@class='txt']/p/a[@class='time _linkPost']")
|
|
article_url_list = list()
|
|
for a in a_list:
|
|
href = a.get_attribute("href")
|
|
article_url_list.append(href[:href.rindex('/')])
|
|
return article_url_list
|
|
|
|
def find_platform_id(self, ul):
    """Derive each author's account id from the reply post URLs.

    Strips one more path segment off each URL returned by
    find_article_url and removes the kakaostory site prefix, leaving
    only the account identifier.
    """
    account_ids = list()
    for post_url in self.find_article_url(ul):
        profile_url = post_url[:post_url.rindex('/')]
        account_ids.append(profile_url.replace(kakaostory_url, ""))
    return account_ids
|
|
|
|
def find_article_form(self, ul=None):
    """Every record produced by this crawler is a reply record."""
    return 'reply'
|
|
|
|
def find_platform_name(self, ul=None):
    """Constant platform tag stored with every record."""
    return 'kakaostory'
|
|
|
|
def find_platform_form(self, ul=None):
    """Classify the page currently loaded in the driver.

    Returns 'channel' for channel pages, 'tag' for hashtag searches and
    'story' for anything else.
    """
    current = self.driver.current_url
    if "https://story.kakao.com/ch/" in current:
        return 'channel'
    if "https://story.kakao.com/hashtag/" in current:
        return 'tag'
    return 'story'
|
|
|
|
def click_element(self, element):
    """Click *element* by moving the pointer to its top-left corner,
    then pause so the page can react."""
    actions = ActionChains(self.driver)
    actions.move_to_element_with_offset(element, 0, 0)
    actions.click()
    actions.perform()
    wait(2)
|
|
|
|
def enter_element(self, element):
    """Focus *element* (NULL keystroke) and confirm it with ENTER,
    then pause so the page can react."""
    for keystroke in (Keys.NULL, Keys.ENTER):
        element.send_keys(keystroke)
    wait(2)
|
|
|
|
def find_like_count(self, ul):
    """Return the like counter of a reply as text, or '0' when the reply
    renders no like badge at all."""
    try:
        like = ul.find_element_by_xpath("div[@class='txt']/p/span[@class='_likedComment']/a/span[@class='like_num _likeCommentCount']")
        return like.text
    except Exception:
        # Was a bare except:, which also swallowed KeyboardInterrupt /
        # SystemExit; a missing counter element just means zero likes.
        return '0'
|
|
|
|
|
|
class KakaoPageCrawler:
    """Iterates over the activity stream of a kakaostory page.

    Keeps a set of already-delivered activity data-model ids so a
    reloaded or re-scrolled page never yields the same activity twice,
    and restricts results to the configured [begin_date, end_date]
    window.
    """

    def __init__(self, driver=None, begin_date=None, end_date=None):
        self.driver = driver
        # data-model ids of activities already handed to the caller.
        self.activity_data_model_set = set()
        self.begin_date = begin_date
        self.end_date = end_date
        # Tooltip date parser; groups: 1 year, 2 month, 3 day,
        # 4 meridiem word (find_* code treats "오후" as afternoon),
        # 5 hour, 6 minute.
        self.re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
        self.index = 0
        self.activities = None
        self.present_activity = 0
        self.previous_activity = 0
        # Back-off counter for the resize/refresh fallbacks in
        # load_more_activities().
        self.reload_count = 0

    def move_to_url(self, url):
        """Navigate the driver to *url* and restart deduplication."""
        self.driver.get(url)
        self.index = 0
        self.activity_data_model_set.clear()

    def init(self):
        """Reset all iteration state without navigating."""
        self.index = 0
        self.previous_activity = 0
        self.activities = None
        self.activity_data_model_set.clear()

    def set_date(self, begin_date, end_date):
        """Set both window boundaries; see set_begin_date / set_end_date."""
        self.set_begin_date(begin_date)
        self.set_end_date(end_date)

    def set_end_date(self, end_date):
        """Set the upper boundary of the crawl window.

        Accepts a 'YYYY-MM-DD' string, a datetime/date, or anything else
        (falls back to today).  The value is normalized to midnight and
        shifted one day forward so posts from the whole end day still
        pass the is_late() check.
        """
        if isinstance(end_date, str):
            self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        elif isinstance(end_date, (datetime.datetime, datetime.date)):
            self.end_date = end_date
        else:
            self.end_date = datetime.datetime.today()
        self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
        self.end_date += datetime.timedelta(days=1)

    def set_begin_date(self, begin_date):
        """Set the lower boundary of the crawl window, normalized to
        midnight.  Accepts the same input types as set_end_date."""
        if isinstance(begin_date, str):
            self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
        elif isinstance(begin_date, (datetime.datetime, datetime.date)):
            self.begin_date = begin_date
        else:
            self.begin_date = datetime.datetime.today()
        self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)

    def next_activity_backup(self):
        """Legacy batch-style iterator; next_activity() is the active
        implementation.

        Returns the next unseen in-window activity element, or None when
        the stream is exhausted or cannot be loaded.
        """
        try:
            if not self.activities:
                self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[class='section _activity']")))
                self.index = 0
            if len(self.activities) == 0:
                print_and_flush("activities are not found")
                self.activities = None
                return None
        except Exception:
            print_and_flush("activities are not found")
            self.activities = None
            return None
        has_more_activities = True
        self.present_activity = len(self.activities)
        while has_more_activities:
            for activity in self.activities[self.previous_activity:]:
                if activity.get_attribute("data-model") in self.activity_data_model_set:
                    continue
                self.activity_data_model_set.add(activity.get_attribute("data-model"))
                # NOTE(review): find_article_date may return a placeholder
                # string; unlike next_activity(), this legacy path does not
                # guard against comparing it with a datetime.
                time_date = self.find_article_date(activity)
                if self.is_earlier(time_date):
                    self.activities = None
                    return None
                if self.is_late(time_date):
                    continue
                return activity
            self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if len(self.activities) == self.present_activity:
                has_more_activities = self.load_more_activities()
                self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            else:
                has_more_activities = True
            self.previous_activity = self.present_activity
            self.present_activity = len(self.activities)
        self.activities = None
        return None

    def next_activity(self):
        """Return the next unseen activity element whose (possibly
        modified) date lies inside the crawl window.

        Returns None when no further activities can be loaded.  The
        caller must confirm a successful crawl via crawling_ok() before
        the activity is marked as seen.
        """
        try:
            if self.activities is None:
                self.activities = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, "div[class='section _activity']"))
                )
            if len(self.activities) == 0:
                print_and_flush("activities are not found")
                self.activities = None
                return None
        except Exception:
            print_and_flush("activities are not found")
            self.activities = None
            return None
        while True:
            self.index += 1
            if self.index >= len(self.activities):
                # Refresh the element list; scroll for more when exhausted.
                self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
                if self.index >= len(self.activities):
                    if self.load_more_activities() is False:
                        self.activities = None
                        return None
                    self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if self.activities[self.index - 1].get_attribute("data-model") in self.activity_data_model_set:
                continue
            time_date = self.find_article_date(self.activities[self.index - 1])
            # An edit timestamp, when present, supersedes the posting date.
            time_modified_date = self.find_article_modified_date(self.activities[self.index - 1])
            if time_modified_date is not None:
                time_date = time_modified_date
            print_and_flush(str(time_date))
            if isinstance(time_date, str):
                # Unparsable tooltip -> placeholder string; skip the post.
                continue
            if self.is_earlier(time_date):
                self.activities = None
                return None
            if self.is_late(time_date):
                continue
            return self.activities[self.index - 1]

    def crawling_ok(self):
        """Mark the activity last returned by next_activity() as crawled."""
        self.activity_data_model_set.add(self.activities[self.index - 1].get_attribute("data-model"))

    def next_activity_prepare(self):
        """Sequential pre-scan variant: walks activities in page order
        without deduplication; returns None when exhausted."""
        try:
            activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
            if len(activities) == 0:
                return None
        except Exception:
            return None
        has_more_activities = True
        while has_more_activities:
            if self.index < len(activities):
                temp_index = self.index
                self.index += 1
                time_date = self.find_article_date(activities[temp_index])
                if self.is_earlier(time_date):
                    return None
                if self.is_late(time_date):
                    continue
                return activities[temp_index]
            else:
                has_more_activities = self.load_more_activities()
                activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
        return None

    def _activity_count_grew(self, previous_count):
        """True when the page now holds more activity sections than
        *previous_count*; also resets the reload back-off counter."""
        present = self.driver.find_elements_by_css_selector("div[class='section _activity']")
        if len(present) != previous_count:
            wait(2)
            self.reload_count = 0
            return True
        return False

    def load_more_activities(self):
        """Coax the infinite-scroll page into appending more activities.

        Tries, in order: the END key, PAGE_UP/PAGE_DOWN cycling, and a
        JavaScript scroll.  When none of those work it falls back to a
        window resize (reload_count < 10) and then a full page refresh
        (reload_count < 15).  Returns False only when every strategy is
        spent.
        """
        previous_count = len(self.driver.find_elements_by_css_selector("div[class='section _activity']"))
        for _ in range(5):
            print_and_flush("Try load more")
            body = self.driver.find_element_by_tag_name("body")
            body.send_keys(Keys.NULL)
            body.send_keys(Keys.END)
            wait(4)
            if self._activity_count_grew(previous_count):
                return True
        for _ in range(5):
            print_and_flush("Try load more")
            body = self.driver.find_element_by_tag_name("body")
            for _ in range(3):
                body.send_keys(Keys.PAGE_UP)
                wait(0.1)
            for _ in range(50):
                body.send_keys(Keys.PAGE_DOWN)
                wait(0.1)
            wait(4)
            if self._activity_count_grew(previous_count):
                return True
        for _ in range(10):
            print_and_flush("Try load more")
            self.driver.execute_script("window.scrollBy(0, 800)")
            wait(4)
            if self._activity_count_grew(previous_count):
                return True
        if self.reload_count < 10:
            # Nudge the layout engine: halve the cursor and resize the window.
            print_and_flush("index reload")
            self.reload_count += 1
            self.index //= 2
            position = self.driver.get_window_position()
            size = self.driver.get_window_size()
            self.driver.maximize_window()
            self.driver.set_window_size(size['width'], size["height"])
            self.driver.set_window_position(position['x'], position['y'])
            return True
        if self.reload_count < 15:
            # Last resort: reload the page and restart the index.
            print_and_flush("refresh")
            self.driver.refresh()
            wait(5)
            self.index = 0
            self.reload_count += 1
            return True
        return False

    def is_earlier(self, time_date):
        """True when *time_date* precedes the begin boundary."""
        return time_date < self.begin_date

    def is_late(self, time_date):
        """True when *time_date* passes the end boundary."""
        return time_date > self.end_date

    def set_driver(self, driver):
        """Replace the webdriver (e.g. after a browser restart)."""
        self.driver = driver

    def _parse_tooltip(self, tooltip):
        """Parse a Korean date tooltip into a datetime, or None when the
        pattern does not match.  The meridiem word "오후" (PM) shifts
        hours below 12 forward by twelve."""
        m = self.re_date.search(tooltip)
        if m is None:
            return None
        parsed = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                   int(m.group(5)), int(m.group(6)))
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            parsed += datetime.timedelta(hours=12)
        return parsed

    def find_article_date(self, activity):
        """Read the posting date of *activity* from its timestamp link.

        Hovers the link twice so the tooltip gets rendered, then tries
        the data-tooltip attribute and falls back to title.  Returns a
        datetime, or the placeholder string "0000-00-00 00:00:00" when
        neither attribute parses.
        """
        a = activity.find_element_by_xpath("div/div[@class='add_top']/a[@class='time _linkPost']")
        a.send_keys(Keys.NULL)
        ac = ActionChains(self.driver)
        ac.move_to_element(a).perform()
        wait(0.5)
        ac.move_to_element(a).perform()
        wait(0.5)
        parsed = self._parse_tooltip(a.get_attribute("data-tooltip"))
        if parsed is None:
            parsed = self._parse_tooltip(a.get_attribute("title"))
        if parsed is None:
            return "0000-00-00 00:00:00"
        return parsed

    def find_article_modified_date(self, activity):
        """Return the edit timestamp of *activity*, or None when the post
        was never modified or its tooltip cannot be parsed."""
        try:
            span = activity.find_element_by_xpath("div/div[@class='add_top']/span[@class='time']/span")
        except Exception:
            return None
        ac = ActionChains(self.driver)
        ac.move_to_element(span).perform()
        wait(0.8)
        data_tooltip = span.get_attribute("data-tooltip")
        wait(0.2)
        return self._parse_tooltip(data_tooltip)
|
|
|
|
|
|
class KakaoMainCrawler:
    """Orchestrates page iteration, body crawling, reply crawling and the
    database writes for one kakaostory URL at a time."""

    def __init__(self):
        self.page_crawler = KakaoPageCrawler()
        self.body_crawler = KakaoBodyCrawler()
        self.reply_crawler = KakaoReplyCrawler()
        self.send_to_db = SendtoDB()
        self.driver = None
        self.browser = None

    def set_driver(self, driver):
        """Propagate a (possibly new) webdriver to every sub-crawler."""
        self.page_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        # Attached to every crawled body record in crawl_body().
        self.keyword_id = keyword_id

    def _print_traceback_location(self):
        """Print type, file name and line number of the active exception."""
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)

    def crawl_all_current_url(self, backup_set=None):
        """Crawl every in-window activity on the currently loaded page.

        backup_set -- optional set of data-model ids already crawled in a
        previous (crashed) run; they are skipped instead of re-sent.
        WebDriver failures are re-raised so the caller can restart the
        browser; all other per-activity errors are logged and skipped.
        """
        self.page_crawler.init()
        if backup_set:
            self.page_crawler.activity_data_model_set = backup_set.copy()
        while True:
            activity = self.page_crawler.next_activity()
            if activity is None:
                break
            try:
                self.crawl_body(activity)
                self.crawl_reply(activity)
                # Only mark as crawled after both body and reply succeeded.
                self.page_crawler.crawling_ok()
                print_and_flush("ok")
            except WebDriverException as ee:
                print_and_flush(ee)
                self._print_traceback_location()
                print_and_flush("fail")
                # Re-raise the original exception; the previous code raised
                # the bare WebDriverException class, which discarded the
                # failure details before the outer restart logic saw them.
                raise
            except Exception as e:
                print_and_flush("failed")
                self._print_traceback_location()
                print_and_flush(e)

    def crawl_body(self, activity):
        """Extract the body record of *activity* and (re)write it to the DB."""
        self.body_crawler.set_driver(self.driver)
        self.body_crawler.set_activity(activity)
        content = self.body_crawler.get_content()
        content["keyword_id"] = self.keyword_id
        print_and_flush(content["article_url"])
        # Delete any row for the same URL first so a re-crawl replaces it.
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)

    def crawl_reply(self, activity):
        """Extract and store the replies of *activity*, when it has any."""
        self.reply_crawler.set_driver(self.driver)
        self.reply_crawler.set_activity(activity)
        if self.reply_crawler.has_reply():
            self.reply_crawler.crawl_all()
            self.send_to_db.send_reply(self.reply_crawler.get_content())
|
|
|
|
|
|
class KakaoInit(CrawlInit):
    """Crawl configuration for kakaostory: builds the search URLs and the
    crawl date window from the keyword parameters loaded by CrawlInit."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> base URL the search term is appended to.
        self.urls = dict()
        self.urls[6] = "https://story.kakao.com/ch/"        # channel
        self.urls[7] = "https://story.kakao.com/hashtag/"   # hashtag search
        self.urls[8] = "https://story.kakao.com/"           # story/profile

    def split_searches(self):
        """Split the comma-separated search string into trimmed terms.

        Channel and story searches (platforms 6 and 8) are used verbatim;
        all other platforms (hashtags) are passed through CrawlInit.utf8
        so they are safe to embed in a URL path.
        """
        terms = [term.strip() for term in self.searches().split(',')]
        if self.platform() in (6, 8):
            return terms
        return [self.utf8(term) for term in terms]

    def make_url(self):
        """Combine the platform base URL with every search term."""
        base = self.urls[self.platform()]
        return [base + term for term in self.split_searches()]

    @staticmethod
    def _midnight(moment):
        """Strip the time-of-day component from *moment*."""
        return datetime.datetime(year=moment.year, month=moment.month, day=moment.day)

    def get_begin_day(self):
        """Lower crawl boundary: today's midnight shifted by before_day
        for realtime crawls, otherwise the configured start day."""
        if self.is_realtime():
            return self._midnight(datetime.datetime.now()) + datetime.timedelta(days=self.before_day)
        return self.start_day()

    def get_end_day(self):
        """Upper crawl boundary: today's midnight for realtime crawls,
        otherwise the configured end day."""
        if self.is_realtime():
            return self._midnight(datetime.datetime.now())
        return self.end_day()
|
|
|
|
|
|
if __name__ == '__main__':
    """
    argv:
    0 - kakaocrawl.py
    1 - keyword_id
    2 - data db num
    3 - before_day
    4 - until_page
    """

    # All four positional arguments are required; bail out otherwise.
    if len(sys.argv) < 5:
        print("Fail to process execute")
        exit(1)
    else:
        print("Start Python Crawling")

    # Load the keyword parameters for this run, then drop the config
    # connection before the (long) crawl starts.
    kakao_init = KakaoInit(int(sys.argv[3]))
    kakao_init.get_keyword_parameters(sys.argv[1])
    kakao_init.disconnect()
    browser = Browser()
    kakao_main = KakaoMainCrawler()
    kakao_main.set_driver(browser.get_new_driver("chrome"))
    # kakao_main.driver.implicitly_wait(5)
    wait(3)
    kakao_main.set_keyword_id(sys.argv[1])
    kakao_main.send_to_db.set_db(sys.argv[2])
    realtime = True
    # Realtime keywords are crawled in an endless loop; one-shot keywords
    # leave the loop after the first full pass (see is_realtime below).
    while realtime:
        print_and_flush("Crawler Start")
        url_list = kakao_init.make_url()
        i = 0
        # data-model ids saved from a crashed attempt so the retry of the
        # same URL does not re-send already-stored activities.
        backup_set = set()
        while i < len(url_list):
            try:
                print_and_flush(url_list[i] + "\n")
                kakao_main.driver.get(url_list[i])
                wait(3)
                kakao_main.page_crawler.set_date(begin_date=kakao_init.get_begin_day(),
                                                 end_date=kakao_init.get_end_day())
                kakao_main.crawl_all_current_url(backup_set)
                # URL finished cleanly: advance and forget the backup.
                i += 1
                backup_set.clear()
            except Exception as e:
                # Log the failure with its source location, remember what
                # was already crawled, restart the browser and retry the
                # same URL (i is intentionally not incremented).
                print_and_flush(e)
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
                backup_set = kakao_main.page_crawler.activity_data_model_set.copy()
                kakao_main.set_driver(browser.new_browser())
                # kakao_main.driver.implicitly_wait(5)
                wait(5)
        realtime = kakao_init.is_realtime()
    print_and_flush("Finished Crawling :)")
    # kakao_main.driver.quit()
    kakao_main.send_to_db.close()
    print_and_flush("ByeBye :)")

    exit(0)