# (removed non-Python extraction-artifact header: "508 lines / 21 KiB / Python")
import datetime
import json
import re

from bs4 import BeautifulSoup

import effect.effecterror
from base.baseclasses import enter_element
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import wait
from effect.resultsender import ResultSender
from effect.resultsender import get_settings
from kakao.kakaoexception import NotFoundElementError

# Prefer the fast lxml parser for BeautifulSoup when it is installed;
# otherwise fall back to the built-in html.parser.
try:
    import lxml  # probe only: we just need to know whether it is importable
except ImportError:
    parser_opt = 'html.parser'
else:
    parser_opt = 'lxml'
SEPERATOR = '!@#'

kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
limit_reload = 5
num_of_retry = 3


# Matches Kakao Story tooltip dates such as '2020년 3월 5일 오후 3:20'.
# groups: (1) year (2) month (3) day (4) meridiem ('오전'/'오후')
#         (5) hour, 12-hour clock (6) minute
re_date = re.compile(r"^(\d{4})\D+(\d{1,2})\D+(\d{1,2})"
                     r"\w+\s+(\w+)\s+(\d{1,2})\D(\d{1,2})")


def get_date(element):
    """
    Parse a Kakao Story timestamp tooltip into a normalized date string.

    The date text lives either in the 'title' attribute (before mouse-over)
    or in the 'data-tooltip' attribute (after mouse-over) of the element.

    :param element: span.time element (anything exposing an ``attrs`` dict)
    :return: 'yyyy-MM-dd hh:mm:ss' in 24-hour time, or the sentinel
             '0000-00-00 00:00:00' when no date could be parsed
    """
    m = re_date.search(element.attrs.get('title', '')) \
        or re_date.search(element.attrs.get('data-tooltip', ''))

    if not m:
        # return an invalid-date sentinel instead of raising NotFoundElementError
        return "0000-00-00 00:00:00"

    hour = int(m.group(5))
    # Convert the Korean 12-hour clock to 24-hour time:
    #  - '오후' (p.m.) adds 12, except 12 p.m. (noon) which stays 12;
    #  - BUGFIX: '오전' (a.m.) 12 o'clock is midnight and must become hour 0
    #    (the original code left it as 12).
    if m.group(4) == "오후" and hour < 12:
        hour += 12
    elif m.group(4) == "오전" and hour == 12:
        hour = 0

    temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)),
                                  int(m.group(3)), hour, int(m.group(6)))
    # str() of a datetime yields exactly 'yyyy-MM-dd hh:mm:ss'
    return str(temp_date)
class BodyCrawler(object):
    """
    Crawls the body (the article itself) of the Kakao Story page currently
    loaded in the Selenium driver.

    page_source is parsed once in __init__; call set_soup_and_activity()
    again if the page changes afterwards.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.set_soup_and_activity()

    # calling point may differ
    def set_soup_and_activity(self):
        """Parse driver.page_source and cache the article container element."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many 'div.section _activity' elements; the one we need is
        # inside div.cover_wrapper
        self.section_activity = self.soup.find('div', class_='section _activity')

    def find_article_id(self):
        """Return the author id: the profile href with the site prefix stripped."""
        link = self.section_activity.find('a', class_='pf_name')
        href = link.attrs['href'].replace('https://story.kakao.com/', '')
        # a relative href keeps a single leading '/'
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Return the author's display name."""
        return self.section_activity.find('a', class_='pf_name').text

    def find_article_url(self):
        """Return the article URL (in Chrome, current_url equals the article URL)."""
        # need to check other browsers
        return self.driver.current_url

    def find_article_modified_date(self):
        """
        Return the modified date as 'yyyy-MM-dd hh:mm:ss', or None when the
        article was never edited or the DOM/tooltip is missing.
        """
        add_top = self.section_activity.find('div', class_='add_top')
        times = add_top.find_all('span', class_='time') if add_top else None

        # The written time is always present; a second span.time appears only
        # after the article has been modified.
        if not times or len(times) < 2:
            return None

        # times[0]: written time, times[1]: modified time.
        # times[1] structure: <span><span title/data-tooltip=...></span></span>
        if not times[1].span:
            # return None instead of raising NotFoundElementError
            return None

        # Delegate the title/data-tooltip parsing to the module-level
        # get_date(), which returns a sentinel string on parse failure.
        parsed = get_date(times[1].span)
        return None if parsed == "0000-00-00 00:00:00" else parsed

    def find_article_date(self):
        """
        Return the written date as 'yyyy-MM-dd hh:mm:ss'.

        :raises NotFoundElementError: when the date DOM is missing
        """
        # modified date is a higher priority than written date
        # modified_date = self.find_article_modified_date()
        # if modified_date:
        #     return modified_date

        add_top = self.section_activity.find('div', class_='add_top')
        if not add_top:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        times = add_top.find_all('span', class_='time')
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")

        # times[0] is the written time; get_date() handles the
        # title/data-tooltip attribute lookup and returns an invalid-date
        # sentinel instead of raising when the text cannot be parsed.
        return get_date(times[0])

    def find_article_profileurl(self):
        """Return the author's profile image URL ('' when absent; not essential)."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        # guard the a > img chain before dereferencing
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        return ''

    def find_article_data(self):
        """
        :return: trimmed article text ('' when there is no content)
        """
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            # non-breaking spaces act as line separators in the source markup
            return content.text.strip().replace('\xa0', '\n')
        return ''

    def find_article_title(self):
        """
        Return the article title: strong.tit_channel (channel title) when it
        exists, otherwise the first line of the article text capped at 30
        characters. (This definition is a project convention.)
        """
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')

        article_data = self.find_article_data()
        if not article_data:
            return ''
        # a non-empty string always yields at least one line from splitlines()
        return article_data.splitlines()[0][:30]

    def find_article_etc(self, class_name):
        """
        Crawl one of the counters (shares, replies, feelings).

        :param class_name: class of the strong element holding the number
        :return: the number as a string ('0' when the element is missing)
        """
        element = self.section_activity.find('strong', class_=class_name)
        if element and element.text:
            # the displayed number may contain thousands-separator commas;
            # strip them so the caller can int() the result
            return element.text.replace(',', '')
        # missing element/text: return '0' instead of raising NotFoundElementError
        return '0'

    def find_article_share(self):
        """Return the share count as a string."""
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        """Return the feeling (like) count as a string."""
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        """Return the reply count as a string."""
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        """Return 'channel' for channel articles ('ch/...' ids), else 'story'."""
        return 'channel' if self.find_article_id().startswith('ch/') else 'story'

    def find_error(self):
        """Return True when the page shows the error box (deleted/invalid URL)."""
        return bool(self.soup.find('div', class_='info_error'))

    def get(self):
        """
        Crawl the whole article body.

        Caller still needs to put 'keyword_id' into the result.

        :return: dict of crawled body content
        :raises NotFoundElementError: when the article container is missing
        """
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        # NOTE: the next three keys reuse unrelated field names by project
        # convention: reply count -> 'article_order', share count ->
        # 'article_parent', feeling count -> 'reply_url'.
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content
class ReplyCrawler(object):
    """
    Crawls every reply (comment) of the Kakao Story article currently loaded
    in the Selenium driver.

    Typical use: construct with the driver, then call get(). get() first
    clicks the "show more comments" control until all replies are loaded,
    then parses the final page_source with BeautifulSoup.
    """

    def __init__(self, driver):
        # Selenium WebDriver already showing the article page
        self.driver = driver
        # the following are filled later by set_soup_and_activity()
        self.soup = None
        self.section_activity = None
        self.ul = None
        # NOTE(review): self.lis is never assigned anywhere else in this
        # class — it appears to be unused
        self.lis = None

    def set_soup_and_activity(self):
        """Parse driver.page_source and cache the reply-list container (<ul>)."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity elements, but the one we use
        # is inside div.cover_wrapper
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Keep clicking the 'show more' control until no additional replies load."""
        previous_num_of_replies = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # Compare the number of replies before and after the click;
            # if it did not grow, the link/ajax failed — stop to avoid looping.
            current_num_of_replies = self.get_num_of_replies()
            if previous_num_of_replies == current_num_of_replies:
                break
            previous_num_of_replies = current_num_of_replies

    def get_num_of_replies(self):
        """
        Return the number of currently loaded replies.

        :return: count of <li> items in the reply list, or 0 when the list
                 element cannot be found (i.e. there is no reply)
        """
        # Find the ul element that contains the replies via Selenium.
        # NOTE: for performance this method could be implemented with bs4.
        try:
            ul = find_element_by_css_selector(self.driver,
                                              #"div.cover_wrapper "
                                              "div[class='section _activity'] "
                                              "ul[class='list _listContainer']", 5)
            li = ul.find_elements_by_tag_name('li')
            return len(li)
        except Exception as e:
            # element not found -> treat as zero replies
            return 0

    def click_load_more_reply_btn(self):
        """Click/enter the 'show more comments' link when it is present."""
        try:
            # find the link that loads more replies and click/enter it
            a = find_element_by_css_selector(self.driver,
                                             #"div.cover_wrapper "
                                             "div[class='section _activity'] "
                                             "a[class='_btnShowMoreComment']", 5)
            enter_element(a)

        # The link is absent: nothing happens instead of raising.
        except Exception as e:
            pass
            # printl("In click_load_more_reply_btn, there is not a link to load replies")
            # printl(e)

    def has_more(self):
        """
        Return True when the 'show more comments' control is displayed.

        Returns False both when loading is finished and when the CSS selector
        no longer matches (outdated markup); the two cases cannot be told
        apart here because the resulting logic/exception is the same.
        """
        try:
            # find the show_more element
            show_more = find_element_by_css_selector(self.driver,
                                                     # "div.cover_wrapper "
                                                     "div[class='section _activity'] "
                                                     "p[class='more _showMoreCommentContainer']", 5)

            # 'display:block;' -> the button is shown, 'display:none;' -> hidden
            if 'block' in show_more.get_attribute('style'):
                return True
            else:
                return False
        # False in both cases: loading is finished, or the selector is invalid
        except Exception as e:
            return False

    # find_xxxx functions

    def find_article_id(self):
        """Return a list of commenter ids (profile hrefs without the site URL)."""
        # find the name placeholders
        divs = self.ul.find_all('a', class_='name _namePlaceholder')
        # collect the ids and strip the kakaostory URL prefix
        article_ids = [div.attrs.get('href', '').replace(kakaostory_url, '')
                       for div in divs if div.attrs.get('href', '')]
        # refine hrefs: an href may still start with '/'
        article_id = map(lambda x: x[1:] if x.startswith('/') else x, article_ids)
        # return a list for unification of types
        return list(article_id)

    def find_article_nickname(self):
        """Return a list of commenter display names ('' when missing)."""
        divs = self.ul.find_all('a', class_='name _namePlaceholder')
        # if div.text exists use it, otherwise the empty string
        return [div.text if div.text else '' for div in divs]

    def find_article_data(self):
        """Return a list of reply texts with the leading metadata removed."""
        divs = self.ul.find_all('div', class_='txt')
        # div.text starts with meta-data held in div.p.text; strip that prefix
        # when it exists. When the element/text is missing, use ''.
        return [div.text[len(div.p.text):].replace('\xa0', '\n')
                if div.p else div.text if div.text else '' for div in divs]

    def find_article_date(self):
        """Return a list of reply dates ('yyyy-MM-dd hh:mm:ss') via get_date()."""
        divs = self.ul.find_all('span', class_='time')
        return list(map(get_date, divs))

    def find_article_like(self):
        """Return a list of per-reply like counts as strings ('' when missing)."""
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        # the like count lives in span.like_num._likeCommentCount when present
        return [span.text if span.text else '' for span in spans]

    def find_article_profileurl(self):
        """Return a list of commenter profile-image URLs ('' when missing)."""
        divs = self.ul.find_all('div', class_='pf')
        return list(map(lambda div: div.a.img.attrs.get('src', '') if div.a and div.a.img else '', divs))

    def get(self):
        """
        Crawl all replies.

        Caller needs to add platform_title, platform_id, platform_form from
        the body result.

        :return: a list of reply dicts
        """
        # load all replies first (clicks through 'show more')
        self.load_all_reply()

        # after loading all replies, crawl them using BeautifulSoup
        self.set_soup_and_activity()

        article_ids = self.find_article_id()
        article_nicknames = self.find_article_nickname()
        article_datas = self.find_article_data()
        article_dates = self.find_article_date()
        article_profileurls = self.find_article_profileurl()
        article_likes = self.find_article_like()
        article_url = self.driver.current_url

        replies = []
        # An IndexError here means the per-field lists came out with
        # different lengths (mismatched DOM); that exception is intended.
        for i in range(len(article_ids)):
            reply = dict()
            reply['article_id'] = article_ids[i]
            reply['article_nickname'] = article_nicknames[i]
            reply['article_data'] = article_datas[i]
            reply['article_date'] = article_dates[i]
            reply['article_profileurl'] = article_profileurls[i]
            # NOTE: like count is stored under 'reply_url' by project convention
            reply['reply_url'] = article_likes[i]
            reply['platform_name'] = 'kakaostory'
            reply['article_form'] = 'reply'
            reply['article_url'] = article_url
            reply['article_order'] = str(i)
            replies.append(reply)
        return replies
class EffectKakaostory(object):
    """
    Crawl a single Kakao Story article (body + replies), compute engagement
    statistics, and store the result through ResultSender.
    """

    def __init__(self, event_num, event_code, url, driver):
        self.event_num = event_num
        self.event_code = event_code
        self.url = url
        self.driver = driver

    def start(self):
        """
        Run the whole pipeline: load the page, crawl body and replies, build
        the statistics row and send it to the 'stats_s1_effect' table.

        :raises effect.effecterror.OutDatedCrawler: page structure changed or nothing crawled
        :raises effect.effecterror.DeletedUrlError: the article URL was deleted
        :raises effect.effecterror.UnknownError: statistics or settings failure
        :raises effect.effecterror.DBQueryError: result could not be stored
        """
        try:
            self.driver.get(self.url)
            wait(3)
            body_crawler = BodyCrawler(self.driver)
            reply_crawler = ReplyCrawler(self.driver)
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))

        try:
            error = body_crawler.find_error()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if error:
            raise effect.effecterror.DeletedUrlError("The URL is Deleted")

        try:
            body = body_crawler.get()
            replies = reply_crawler.get()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))

        # an empty article_id means nothing was actually crawled
        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")

        try:
            result = self.statistics(body, replies)
            result['lastupdate_filter'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            result['status'] = 'OK'
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))

        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')

        try:
            # fall back to ResultSender defaults when no settings were loaded
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """
        Build the statistics row for one crawled article.

        :param body: dict returned by BodyCrawler.get()
        :param replies: list of dicts returned by ReplyCrawler.get()
        :return: dict with view/reply/like/interaction/engagement counts and
                 the per-day reply buzz JSON
        """
        result = {}
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        # BUGFIX: the defaults used to be passed as int()'s *base* argument
        # (int(body.get('article_order'), 0)), which raises TypeError on a
        # missing key and misinterprets the string otherwise.
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """
        Return the number of distinct commenters, excluding the article's
        author when they commented on their own article.
        """
        set_reply_id = set()
        for i in replies:
            set_reply_id.add(i.get('article_id', ''))
        return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """
        Return a JSON array [{'date': 'yyyymmdd', 'value': n}, ...] counting
        replies per day from the article date through today.
        """
        # NOTE(review): assumes body['article_date'] is a parseable
        # 'yyyy-MM-dd hh:mm:ss' string, not the '0000-...' sentinel — confirm.
        start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)

        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            # BUGFIX: the lookup key must use the same '%Y%m%d' format as the
            # date_dict keys; it used to be '%m-%d-%Y', so no reply was ever
            # counted and every bucket stayed 0.
            reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            if reply_date in date_dict:
                date_dict[reply_date] = date_dict[reply_date] + 1

        json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]

        return json.dumps(json_array, sort_keys=True)