Files
clients/WebBasedCrawler/effect/effectkakaostory.py
admin 40f29bdf51 effect
git-svn-id: svn://192.168.0.12/source@335 8346c931-da38-4b9b-9d4c-e48b93cbd075
2017-01-06 07:21:32 +00:00

508 lines
21 KiB
Python

import datetime
import json
import effect.effecterror
import re
from kakao.kakaoexception import NotFoundElementError
from bs4 import BeautifulSoup
from base.baseclasses import wait
from effect.resultsender import get_settings
from effect.resultsender import ResultSender
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
# Prefer the faster lxml parser for BeautifulSoup; fall back to the
# pure-Python stdlib parser when lxml is not installed.
try:
    import lxml
    parser_opt = 'lxml'
except ImportError:
    parser_opt = 'html.parser'
# NOTE(review): 'SEPERATOR' is a misspelling of 'SEPARATOR'; kept as-is since
# other modules may import it by this name. Unused within this file.
SEPERATOR = '!@#'
# Base URL for personal stories; channel pages live under the '/ch/' prefix.
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
# Retry/reload tuning knobs — unused in this file; presumably consumed by
# importing modules. TODO confirm against callers.
limit_reload = 5
num_of_retry = 3
re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})"
"[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
def get_date(element):
"""
:param element: this may be span.time element
:return: 'yyyy-MM-dd hh:mm:ss'
"""
m = re_date.search(element.attrs.get('title', '')) \
or re_date.search(element.attrs.get('data-tooltip', ''))
if m:
temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
int(m.group(5)), int(m.group(6)))
# add 12 hour when the article is written at p.m
if m.group(4) == "오후" and int(m.group(5)) < 12:
temp_date += datetime.timedelta(hours=12)
# convert datetime.datetime to str
return str(temp_date)
# return invalid date instead of exception
else:
# raise NotFoundElementError("get_date exception")
return "0000-00-00 00:00:00"
class BodyCrawler(object):
    """Crawls the main post (body) from the currently loaded Kakaostory page.

    The HTML is snapshotted once with BeautifulSoup in __init__; every
    find_* method reads from that snapshot, not from the live driver.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.set_soup_and_activity()

    # calling point may differ
    def set_soup_and_activity(self):
        """Snapshot driver.page_source and locate the activity section."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many 'div.section _activity' elements, but the one we use
        # is inside div.cover_wrapper.
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')

    def find_article_id(self):
        """Return the author id from the profile link, with the site prefix
        and any leading '/' stripped."""
        a = self.section_activity.find('a', class_='pf_name')
        href = a.attrs['href'].replace('https://story.kakao.com/', '')
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Return the author's display name."""
        a = self.section_activity.find('a', class_='pf_name')
        return a.text

    def find_article_url(self):
        # In Chrome, current_url is equal to the article URL.
        # NOTE(review): behavior on other browsers is unverified.
        return self.driver.current_url

    def find_article_modified_date(self):
        """Return the modified date as 'yyyy-MM-dd hh:mm:ss', or None when
        the article was never modified or the expected DOM is missing."""
        times = None
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        # The written time is always present; a second span.time is added only
        # when the article was modified, so fewer than two means "no edit".
        if not times or len(times) < 2:
            return None
        # times[0] : written time, times[1] : modified time
        # times[1] structure : <span><span ...> </span></span>
        if not times[1].span:
            return None
        # get_date() reads the 'title'/'data-tooltip' attributes and returns a
        # sentinel instead of raising; translate that sentinel back to None.
        parsed = get_date(times[1].span)
        return None if parsed == "0000-00-00 00:00:00" else parsed

    def find_article_date(self):
        """Return the written date as 'yyyy-MM-dd hh:mm:ss'.

        :raises NotFoundElementError: when the date DOM is missing entirely
        """
        # modified date is a higher priority than written date
        # modified_date = self.find_article_modified_date()
        # if modified_date:
        #     return modified_date
        times = None
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        else:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")
        # Delegate tooltip parsing to the shared get_date() helper; it returns
        # the invalid-date sentinel instead of raising when nothing matches.
        return get_date(times[0])

    def find_article_profileurl(self):
        """Return the author's profile image URL, or '' when absent (this
        field is not essential, so no exception is raised)."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        return ''

    def find_article_data(self):
        """
        :return: trimmed article_data, '' when there is no content/text
        """
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            return content.text.strip().replace('\xa0', '\n')
        return ''

    def find_article_title(self):
        """Return strong.tit_channel (channel title) when present; otherwise
        the first line of the body text truncated to 30 characters.
        (This definition is the original author's own convention.)"""
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')
        # find_article_data returns a trimmed string.
        article_data = self.find_article_data()
        if not article_data:
            return ''
        for line in article_data.splitlines():
            # Limit title length to the first 30 characters of the first line.
            return line[0:30] if len(line) > 30 else line

    def find_article_etc(self, class_name):
        """
        this function is used for crawling number of shares, replies and feelings
        :param class_name: class of the strong element carrying the counter
        :return: a string of number of shares, replies, or feelings; '0' when missing
        """
        element = self.section_activity.find('strong', class_=class_name)
        if element and element.text:
            # The site renders thousands separators; strip commas so the value
            # can later be converted with int().
            return element.text.replace(',', '')
        # raise NotFoundElementError('find_article_etc is not Found element with ' + class_name)
        return '0'

    def find_article_share(self):
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        """'channel' for channel posts (id starts with 'ch/'), else 'story'."""
        article_id = self.find_article_id()
        return 'channel' if article_id.startswith('ch/') else 'story'

    def find_error(self):
        """Return True when the page shows the error box (deleted article)."""
        return bool(self.soup.find('div', class_='info_error'))

    def get(self):
        """
        you need to put 'keyword_id'
        :return: dict for crawled body content
        :raises NotFoundElementError: when the activity section is absent
        """
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        # Field reuse: the counters ride in generic columns below
        # (reply count -> article_order, shares -> article_parent,
        #  feelings -> reply_url).
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content
class ReplyCrawler(object):
    """Crawls every reply of the currently loaded Kakaostory article.

    Replies are loaded incrementally by pressing the "show more" button via
    the driver, then parsed in a single BeautifulSoup pass.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.ul = None
        self.lis = None

    def set_soup_and_activity(self):
        """Snapshot page_source and locate the reply list container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many 'div.section _activity' elements, but the one we use
        # is inside div.cover_wrapper.
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Click 'show more' until every reply is present in the DOM."""
        previous_num_of_replies = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # Compare the reply count before and after the click; when it did
            # not grow, the link or AJAX call failed, so stop looping.
            current_num_of_replies = self.get_num_of_replies()
            if previous_num_of_replies == current_num_of_replies:
                break
            previous_num_of_replies = current_num_of_replies

    def get_num_of_replies(self):
        """Return the number of li elements in the reply list; 0 when the
        list is absent (i.e. there are no replies)."""
        # For performance this could also be implemented with bs4.
        try:
            ul = find_element_by_css_selector(self.driver,
                                              # "div.cover_wrapper "
                                              "div[class='section _activity'] "
                                              "ul[class='list _listContainer']", 5)
            return len(ul.find_elements_by_tag_name('li'))
        except Exception:
            # No reply list in the page means zero replies.
            return 0

    def click_load_more_reply_btn(self):
        """Best effort: press the 'load more comments' link when present."""
        try:
            a = find_element_by_css_selector(self.driver,
                                             # "div.cover_wrapper "
                                             "div[class='section _activity'] "
                                             "a[class='_btnShowMoreComment']", 5)
            enter_element(a)
        except Exception:
            # The link is not in the page; deliberately do nothing.
            pass
            # printl("In click_load_more_reply_btn, there is not a link to load replies")

    def has_more(self):
        """Return True while the 'show more comments' container is displayed."""
        # An exception here means either loading finished or the selector
        # became invalid; the two cases are indistinguishable and both map
        # to False.
        try:
            show_more = find_element_by_css_selector(self.driver,
                                                     # "div.cover_wrapper "
                                                     "div[class='section _activity'] "
                                                     "p[class='more _showMoreCommentContainer']", 5)
            # 'display:block;' shows the button, 'display:none;' hides it.
            return 'block' in show_more.get_attribute('style')
        except Exception:
            return False

    # find_xxxx functions
    def find_article_id(self):
        """Return the commenter ids, site prefix and leading '/' stripped."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        hrefs = [a.attrs.get('href', '').replace(kakaostory_url, '')
                 for a in anchors if a.attrs.get('href', '')]
        return [h[1:] if h.startswith('/') else h for h in hrefs]

    def find_article_nickname(self):
        """Return the commenter display names ('' when the anchor is empty)."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        return [a.text if a.text else '' for a in anchors]

    def find_article_data(self):
        """Return the reply texts; div.text starts with metadata held in
        div.p.text, which is cut off when present."""
        def clean(div):
            # Purpose: strip the leading metadata (div.p.text) from div.text.
            if div.p:
                return div.text[len(div.p.text):].replace('\xa0', '\n')
            return div.text if div.text else ''
        return [clean(div) for div in self.ul.find_all('div', class_='txt')]

    def find_article_date(self):
        """Return each reply timestamp parsed from its span.time tooltip."""
        return [get_date(span) for span in self.ul.find_all('span', class_='time')]

    def find_article_like(self):
        """Return the per-reply like counters ('' when the span is empty)."""
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        return [span.text if span.text else '' for span in spans]

    def find_article_profileurl(self):
        """Return each commenter's avatar URL ('' when missing)."""
        return [div.a.img.attrs.get('src', '') if div.a and div.a.img else ''
                for div in self.ul.find_all('div', class_='pf')]

    def get(self):
        """
        Need to put platform_title, platform_id, platform_form from body
        :return: a list of replies. Need to put platform_title, platform_id
        """
        # Load all replies, then crawl them in one BeautifulSoup pass.
        self.load_all_reply()
        self.set_soup_and_activity()
        article_ids = self.find_article_id()
        article_nicknames = self.find_article_nickname()
        article_datas = self.find_article_data()
        article_dates = self.find_article_date()
        article_profileurls = self.find_article_profileurl()
        article_likes = self.find_article_like()
        article_url = self.driver.current_url
        replies = []
        # Indexing (rather than zip) is deliberate: when the per-field lists
        # disagree in length, an IndexError is raised on purpose.
        for i in range(len(article_ids)):
            reply = dict()
            reply['article_id'] = article_ids[i]
            reply['article_nickname'] = article_nicknames[i]
            reply['article_data'] = article_datas[i]
            reply['article_date'] = article_dates[i]
            reply['article_profileurl'] = article_profileurls[i]
            reply['reply_url'] = article_likes[i]
            reply['platform_name'] = 'kakaostory'
            reply['article_form'] = 'reply'
            reply['article_url'] = article_url
            reply['article_order'] = str(i)
            replies.append(reply)
        return replies
class EffectKakaostory(object):
    """Top-level effect crawler: loads one article URL, crawls the body and
    replies, aggregates statistics and stores them via ResultSender."""

    def __init__(self, event_num, event_code, url, driver):
        self.event_num = event_num
        self.event_code = event_code
        self.url = url
        self.driver = driver

    def start(self):
        """Run the whole crawl, translating every failure into a domain error.

        :raises effect.effecterror.OutDatedCrawler: page/DOM layout changed
        :raises effect.effecterror.DeletedUrlError: the article was deleted
        :raises effect.effecterror.UnknownError: statistics or config failure
        :raises effect.effecterror.DBQueryError: result could not be stored
        """
        try:
            self.driver.get(self.url)
            wait(3)
            body_crawler = BodyCrawler(self.driver)
            reply_crawler = ReplyCrawler(self.driver)
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        try:
            error = body_crawler.find_error()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if error:
            raise effect.effecterror.DeletedUrlError("The URL is Deleted")
        try:
            body = body_crawler.get()
            replies = reply_crawler.get()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")
        try:
            result = self.statistics(body, replies)
            result['lastupdate_filter'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            result['status'] = 'OK'
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))
        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
        try:
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """Aggregate the crawled body and replies into one stats row dict."""
        result = {}
        # BodyCrawler never sets 'article_hit', so viewcount defaults to 0.
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        # BUGFIX: the default 0 belongs to dict.get(); the original wrote
        # int(body.get('article_order'), 0), which passes 0 as int()'s *base*
        # argument — it crashes on a missing key and misparses zero-padded
        # strings. Same fix for likecount below.
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """Return the number of distinct commenters, excluding the author."""
        set_reply_id = set()
        for i in replies:
            set_reply_id.add(i.get('article_id', ''))
        return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """Build a replies-per-day histogram from the article date until today.

        :return: JSON array string of {'date': 'YYYYMMDD', 'value': count}
        """
        start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)
        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            # BUGFIX: the keys above are 'YYYYMMDD'; the original formatted
            # reply dates as '%m-%d-%Y', so no reply ever matched a bucket and
            # every count stayed 0.
            reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            if reply_date in date_dict:
                date_dict[reply_date] = date_dict[reply_date] + 1
        json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
        return json.dumps(json_array, sort_keys=True)