Files
clients/WebBasedCrawler/effect/effectkakaostory.py
admin 40f29bdf51 effect
git-svn-id: svn://192.168.0.12/source@335 8346c931-da38-4b9b-9d4c-e48b93cbd075
2017-01-06 07:21:32 +00:00

508 lines
21 KiB
Python

import datetime
import json
import effect.effecterror
import re
from kakao.kakaoexception import NotFoundElementError
from bs4 import BeautifulSoup
from base.baseclasses import wait
from effect.resultsender import get_settings
from effect.resultsender import ResultSender
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
# Prefer the faster lxml parser for BeautifulSoup; fall back to the
# pure-Python stdlib parser when lxml is not installed.
try:
    import lxml
    parser_opt = 'lxml'
except ImportError:
    parser_opt = 'html.parser'
# NOTE(review): 'SEPERATOR' is a misspelling of 'SEPARATOR'; kept as-is since
# other modules may import it by this name. Unused within this file.
SEPERATOR = '!@#'
# Base URL for personal stories; channel pages live under the '/ch/' prefix.
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
# Retry/reload tuning knobs — unused in this file; presumably consumed by
# importing modules. TODO confirm against callers.
limit_reload = 5
num_of_retry = 3
re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})"
"[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
def get_date(element):
"""
:param element: this may be span.time element
:return: 'yyyy-MM-dd hh:mm:ss'
"""
m = re_date.search(element.attrs.get('title', '')) \
or re_date.search(element.attrs.get('data-tooltip', ''))
if m:
temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
int(m.group(5)), int(m.group(6)))
# add 12 hour when the article is written at p.m
if m.group(4) == "오후" and int(m.group(5)) < 12:
temp_date += datetime.timedelta(hours=12)
# convert datetime.datetime to str
return str(temp_date)
# return invalid date instead of exception
else:
# raise NotFoundElementError("get_date exception")
return "0000-00-00 00:00:00"
class BodyCrawler(object):
    """Crawls the main post (body) from the currently loaded Kakaostory page.

    The HTML is snapshotted once with BeautifulSoup in __init__; every
    find_* method reads from that snapshot, not from the live driver.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.set_soup_and_activity()

    # calling point may differ
    def set_soup_and_activity(self):
        """Snapshot driver.page_source and locate the activity section."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many 'div.section _activity' elements, but the one we use
        # is inside div.cover_wrapper.
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')

    def find_article_id(self):
        """Return the author id from the profile link, with the site prefix
        and any leading '/' stripped."""
        a = self.section_activity.find('a', class_='pf_name')
        href = a.attrs['href'].replace('https://story.kakao.com/', '')
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Return the author's display name."""
        a = self.section_activity.find('a', class_='pf_name')
        return a.text

    def find_article_url(self):
        # In Chrome, current_url is equal to the article URL.
        # NOTE(review): behavior on other browsers is unverified.
        return self.driver.current_url

    def find_article_modified_date(self):
        """Return the modified date as 'yyyy-MM-dd hh:mm:ss', or None when
        the article was never modified or the expected DOM is missing."""
        times = None
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        # The written time is always present; a second span.time is added only
        # when the article was modified, so fewer than two means "no edit".
        if not times or len(times) < 2:
            return None
        # times[0] : written time, times[1] : modified time
        # times[1] structure : <span><span ...> </span></span>
        if not times[1].span:
            return None
        # get_date() reads the 'title'/'data-tooltip' attributes and returns a
        # sentinel instead of raising; translate that sentinel back to None.
        parsed = get_date(times[1].span)
        return None if parsed == "0000-00-00 00:00:00" else parsed

    def find_article_date(self):
        """Return the written date as 'yyyy-MM-dd hh:mm:ss'.

        :raises NotFoundElementError: when the date DOM is missing entirely
        """
        # modified date is a higher priority than written date
        # modified_date = self.find_article_modified_date()
        # if modified_date:
        #     return modified_date
        times = None
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        else:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")
        # Delegate tooltip parsing to the shared get_date() helper; it returns
        # the invalid-date sentinel instead of raising when nothing matches.
        return get_date(times[0])

    def find_article_profileurl(self):
        """Return the author's profile image URL, or '' when absent (this
        field is not essential, so no exception is raised)."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        return ''

    def find_article_data(self):
        """
        :return: trimmed article_data, '' when there is no content/text
        """
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            return content.text.strip().replace('\xa0', '\n')
        return ''

    def find_article_title(self):
        """Return strong.tit_channel (channel title) when present; otherwise
        the first line of the body text truncated to 30 characters.
        (This definition is the original author's own convention.)"""
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')
        # find_article_data returns a trimmed string.
        article_data = self.find_article_data()
        if not article_data:
            return ''
        for line in article_data.splitlines():
            # Limit title length to the first 30 characters of the first line.
            return line[0:30] if len(line) > 30 else line

    def find_article_etc(self, class_name):
        """
        this function is used for crawling number of shares, replies and feelings
        :param class_name: class of the strong element carrying the counter
        :return: a string of number of shares, replies, or feelings; '0' when missing
        """
        element = self.section_activity.find('strong', class_=class_name)
        if element and element.text:
            # The site renders thousands separators; strip commas so the value
            # can later be converted with int().
            return element.text.replace(',', '')
        # raise NotFoundElementError('find_article_etc is not Found element with ' + class_name)
        return '0'

    def find_article_share(self):
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        """'channel' for channel posts (id starts with 'ch/'), else 'story'."""
        article_id = self.find_article_id()
        return 'channel' if article_id.startswith('ch/') else 'story'

    def find_error(self):
        """Return True when the page shows the error box (deleted article)."""
        return bool(self.soup.find('div', class_='info_error'))

    def get(self):
        """
        you need to put 'keyword_id'
        :return: dict for crawled body content
        :raises NotFoundElementError: when the activity section is absent
        """
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        # Field reuse: the counters ride in generic columns below
        # (reply count -> article_order, shares -> article_parent,
        #  feelings -> reply_url).
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content
class ReplyCrawler(object):
    """Crawls every reply of the currently loaded Kakaostory article.

    Replies are loaded incrementally by pressing the "show more" button via
    the driver, then parsed in a single BeautifulSoup pass.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.ul = None
        self.lis = None

    def set_soup_and_activity(self):
        """Snapshot page_source and locate the reply list container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many 'div.section _activity' elements, but the one we use
        # is inside div.cover_wrapper.
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Click 'show more' until every reply is present in the DOM."""
        previous_num_of_replies = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # Compare the reply count before and after the click; when it did
            # not grow, the link or AJAX call failed, so stop looping.
            current_num_of_replies = self.get_num_of_replies()
            if previous_num_of_replies == current_num_of_replies:
                break
            previous_num_of_replies = current_num_of_replies

    def get_num_of_replies(self):
        """Return the number of li elements in the reply list; 0 when the
        list is absent (i.e. there are no replies)."""
        # For performance this could also be implemented with bs4.
        try:
            ul = find_element_by_css_selector(self.driver,
                                              # "div.cover_wrapper "
                                              "div[class='section _activity'] "
                                              "ul[class='list _listContainer']", 5)
            return len(ul.find_elements_by_tag_name('li'))
        except Exception:
            # No reply list in the page means zero replies.
            return 0

    def click_load_more_reply_btn(self):
        """Best effort: press the 'load more comments' link when present."""
        try:
            a = find_element_by_css_selector(self.driver,
                                             # "div.cover_wrapper "
                                             "div[class='section _activity'] "
                                             "a[class='_btnShowMoreComment']", 5)
            enter_element(a)
        except Exception:
            # The link is not in the page; deliberately do nothing.
            pass
            # printl("In click_load_more_reply_btn, there is not a link to load replies")

    def has_more(self):
        """Return True while the 'show more comments' container is displayed."""
        # An exception here means either loading finished or the selector
        # became invalid; the two cases are indistinguishable and both map
        # to False.
        try:
            show_more = find_element_by_css_selector(self.driver,
                                                     # "div.cover_wrapper "
                                                     "div[class='section _activity'] "
                                                     "p[class='more _showMoreCommentContainer']", 5)
            # 'display:block;' shows the button, 'display:none;' hides it.
            return 'block' in show_more.get_attribute('style')
        except Exception:
            return False

    # find_xxxx functions
    def find_article_id(self):
        """Return the commenter ids, site prefix and leading '/' stripped."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        hrefs = [a.attrs.get('href', '').replace(kakaostory_url, '')
                 for a in anchors if a.attrs.get('href', '')]
        return [h[1:] if h.startswith('/') else h for h in hrefs]

    def find_article_nickname(self):
        """Return the commenter display names ('' when the anchor is empty)."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        return [a.text if a.text else '' for a in anchors]

    def find_article_data(self):
        """Return the reply texts; div.text starts with metadata held in
        div.p.text, which is cut off when present."""
        def clean(div):
            # Purpose: strip the leading metadata (div.p.text) from div.text.
            if div.p:
                return div.text[len(div.p.text):].replace('\xa0', '\n')
            return div.text if div.text else ''
        return [clean(div) for div in self.ul.find_all('div', class_='txt')]

    def find_article_date(self):
        """Return each reply timestamp parsed from its span.time tooltip."""
        return [get_date(span) for span in self.ul.find_all('span', class_='time')]

    def find_article_like(self):
        """Return the per-reply like counters ('' when the span is empty)."""
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        return [span.text if span.text else '' for span in spans]

    def find_article_profileurl(self):
        """Return each commenter's avatar URL ('' when missing)."""
        return [div.a.img.attrs.get('src', '') if div.a and div.a.img else ''
                for div in self.ul.find_all('div', class_='pf')]

    def get(self):
        """
        Need to put platform_title, platform_id, platform_form from body
        :return: a list of replies. Need to put platform_title, platform_id
        """
        # Load all replies, then crawl them in one BeautifulSoup pass.
        self.load_all_reply()
        self.set_soup_and_activity()
        article_ids = self.find_article_id()
        article_nicknames = self.find_article_nickname()
        article_datas = self.find_article_data()
        article_dates = self.find_article_date()
        article_profileurls = self.find_article_profileurl()
        article_likes = self.find_article_like()
        article_url = self.driver.current_url
        replies = []
        # Indexing (rather than zip) is deliberate: when the per-field lists
        # disagree in length, an IndexError is raised on purpose.
        for i in range(len(article_ids)):
            reply = dict()
            reply['article_id'] = article_ids[i]
            reply['article_nickname'] = article_nicknames[i]
            reply['article_data'] = article_datas[i]
            reply['article_date'] = article_dates[i]
            reply['article_profileurl'] = article_profileurls[i]
            reply['reply_url'] = article_likes[i]
            reply['platform_name'] = 'kakaostory'
            reply['article_form'] = 'reply'
            reply['article_url'] = article_url
            reply['article_order'] = str(i)
            replies.append(reply)
        return replies
class EffectKakaostory(object):
    """Top-level effect crawler: loads one article URL, crawls the body and
    replies, aggregates statistics and stores them via ResultSender."""

    def __init__(self, event_num, event_code, url, driver):
        self.event_num = event_num
        self.event_code = event_code
        self.url = url
        self.driver = driver

    def start(self):
        """Run the whole crawl, translating every failure into a domain error.

        :raises effect.effecterror.OutDatedCrawler: page/DOM layout changed
        :raises effect.effecterror.DeletedUrlError: the article was deleted
        :raises effect.effecterror.UnknownError: statistics or config failure
        :raises effect.effecterror.DBQueryError: result could not be stored
        """
        try:
            self.driver.get(self.url)
            wait(3)
            body_crawler = BodyCrawler(self.driver)
            reply_crawler = ReplyCrawler(self.driver)
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        try:
            error = body_crawler.find_error()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if error:
            raise effect.effecterror.DeletedUrlError("The URL is Deleted")
        try:
            body = body_crawler.get()
            replies = reply_crawler.get()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")
        try:
            result = self.statistics(body, replies)
            result['lastupdate_filter'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            result['status'] = 'OK'
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))
        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
        try:
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """Aggregate the crawled body and replies into one stats row dict."""
        result = {}
        # BodyCrawler never sets 'article_hit', so viewcount defaults to 0.
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        # BUGFIX: the default 0 belongs to dict.get(); the original wrote
        # int(body.get('article_order'), 0), which passes 0 as int()'s *base*
        # argument — it crashes on a missing key and misparses zero-padded
        # strings. Same fix for likecount below.
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """Return the number of distinct commenters, excluding the author."""
        set_reply_id = set()
        for i in replies:
            set_reply_id.add(i.get('article_id', ''))
        return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """Build a replies-per-day histogram from the article date until today.

        :return: JSON array string of {'date': 'YYYYMMDD', 'value': count}
        """
        start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)
        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            # BUGFIX: the keys above are 'YYYYMMDD'; the original formatted
            # reply dates as '%m-%d-%Y', so no reply ever matched a bucket and
            # every count stayed 0.
            reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            if reply_date in date_dict:
                date_dict[reply_date] = date_dict[reply_date] + 1
        json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
        return json.dumps(json_array, sort_keys=True)