992 lines
41 KiB
Python
992 lines
41 KiB
Python
#-*- coding: utf-8 -*-
|
|
import sys
|
|
import re
|
|
import datetime
|
|
import json
|
|
import time
|
|
import logging
|
|
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.action_chains import ActionChains
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.common.exceptions import WebDriverException
|
|
from bs4 import BeautifulSoup
|
|
|
|
from base.baseclasses import wait
|
|
from base.baseclasses import printl
|
|
from base.baseclasses import SendtoDB
|
|
from base.baseclasses import Browser
|
|
from base.baseclasses import CrawlInit
|
|
from base.baseclasses import enter_element
|
|
|
|
from kakao.kakaoexception import KakaoCrawlerException
|
|
from kakao.kakaoexception import NotFoundElementError
|
|
from kakao.kakaoexception import NotFoundDataError
|
|
|
|
# Prefer the faster lxml parser for BeautifulSoup; fall back to the
# stdlib html.parser when lxml is not installed.
try:
    import lxml
    parser_opt = 'lxml'
except ImportError:
    parser_opt = 'html.parser'

__author__ = 'cococo'

# Base URLs of Kakao Story pages used to strip/build article ids.
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
# Maximum attempts when (re)loading a list / counting out-of-date sections.
limit_reload = 5
# Maximum retries after an exception during one crawl run.
num_of_retry = 3

logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
|
|
|
|
|
|
# Matches Kakao's Korean timestamp text, e.g. "2020년 3월 5일 오후 3:07".
# Groups: 1=year 2=month 3=day 4=meridiem("오전"/"오후") 5=hour 6=minute.
re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})"
                     "[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")


def get_date(element):
    """Extract a 'yyyy-MM-dd hh:mm:ss' timestamp from a span.time element.

    The raw date text lives either in the 'title' attribute (before mouse
    over) or in 'data-tooltip' (after mouse over).

    :param element: bs4 tag, typically a span.time element
    :return: 'yyyy-MM-dd hh:mm:ss', or '0000-00-00 00:00:00' when no
        parsable date is present
    """
    match = re_date.search(element.attrs.get('title', '')) \
        or re_date.search(element.attrs.get('data-tooltip', ''))
    if not match:
        # Signal "unknown" with an obviously invalid date instead of raising.
        return "0000-00-00 00:00:00"

    meridiem = match.group(4)
    hour = int(match.group(5))
    parsed = datetime.datetime(int(match.group(1)), int(match.group(2)),
                               int(match.group(3)), hour, int(match.group(6)))
    # Kakao shows a 12-hour clock: shift afternoon ("오후") hours forward
    # and the midnight hour ("오전" 12) back to 0.
    if meridiem == "오후" and hour < 12:
        parsed += datetime.timedelta(hours=12)
    if meridiem == "오전" and hour == 12:
        parsed -= datetime.timedelta(hours=12)
    return str(parsed)
|
|
|
|
|
|
def click_kakao_close_button(driver):
    """Press the X (close) button on the currently open content layer."""
    close_btn = driver.find_element_by_css_selector("button._btnClose")
    # Focus the button with a no-op key first, then activate it with ENTER.
    close_btn.send_keys(Keys.NULL)
    close_btn.send_keys(Keys.ENTER)
|
|
|
|
|
|
def find_element_by_css_selector(driver, css_selector, wait_second=10):
    """Wait up to wait_second seconds for the element matching css_selector.

    :param driver: selenium webdriver
    :param css_selector: CSS selector string to locate the element
    :param wait_second: maximum seconds to wait (default 10)
    :return: the located WebElement
    :raises selenium TimeoutException: when the element never appears
    """
    condition = EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
    return WebDriverWait(driver, wait_second).until(condition)
|
|
|
|
|
|
class KakaoInit(CrawlInit):
    """Crawl configuration for Kakao Story (channel / hashtag / user URLs)."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> base URL used to build the crawl target URLs
        self.urls = {
            6: "https://story.kakao.com/ch/",
            7: "https://story.kakao.com/hashtag/",
            8: "https://story.kakao.com/",
        }

    def split_searches(self):
        """Split the comma-separated search string into trimmed keywords.

        For hashtag searches (platform other than 6/8) each keyword is
        additionally passed through self.utf8().
        """
        keywords = [token.strip() for token in self.searches().split(',')]
        if self.platform() in (6, 8):
            return keywords
        return [self.utf8(keyword) for keyword in keywords]

    def make_url(self):
        """Build one crawl URL per keyword for the configured platform."""
        base = self.urls[self.platform()]
        return [base + keyword for keyword in self.split_searches()]

    def get_begin_day(self):
        """Return the first day (datetime.date) of the crawl window."""
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        # Midnight today, shifted by before_day days.
        begin = datetime.datetime(year=now.year, month=now.month, day=now.day)
        begin += datetime.timedelta(days=self.before_day)
        return begin.date()

    def get_end_day(self):
        """Return the last day (datetime.date) of the crawl window."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day).date()
|
|
|
|
|
|
class BodyCrawler(object):
    """Crawl the body (main article) of the currently opened Kakao Story post.

    Expects that the content layer is already open in the driver; the
    constructor parses the page and locates the article section.

    :raises NotFoundElementError: when the article section is missing
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.set_soup_and_activity()
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")

    def set_soup_and_activity(self):
        """Parse the current page and locate the article container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity; the one we need is inside
        # div.cover_wrapper.
        cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = cover_wrapper.find('div', class_='section _activity')

    def find_article_id(self):
        """Return the writer's id (profile href minus the site URL prefix)."""
        a = self.section_activity.find('a', class_='pf_name')
        href = a.attrs['href'].replace('https://story.kakao.com/', '')
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Return the writer's display name."""
        a = self.section_activity.find('a', class_='pf_name')
        return a.text

    def find_article_url(self):
        """Return the article URL."""
        # in chrome, current_url is equal to article_url
        # need to check other browser
        return self.driver.current_url

    def find_article_modified_date(self):
        """Return the modified timestamp, or None when absent/unparsable.

        A second span.time only exists when the article has been edited:
        times[0] is the written time, times[1] the modified time.
        """
        add_top = self.section_activity.find('div', class_='add_top')
        times = add_top.find_all('span', class_='time') if add_top else None
        if not times or len(times) < 2:
            return None
        # times[1] structure: <span><span ...> </span></span>
        if not times[1].span:
            return None
        # Delegate to get_date(): it reads the 'title'/'data-tooltip'
        # attributes and returns the all-zero date when nothing matches.
        parsed = get_date(times[1].span)
        return None if parsed == "0000-00-00 00:00:00" else parsed

    def find_article_date(self):
        """Return the article timestamp, preferring the modified date.

        :return: 'yyyy-MM-dd hh:mm:ss' ('0000-00-00 00:00:00' on parse failure)
        :raises NotFoundElementError: when the date DOM is missing
        """
        # modified date is a higher priority than written date
        modified_date = self.find_article_modified_date()
        if modified_date:
            return modified_date
        add_top = self.section_activity.find('div', class_='add_top')
        if not add_top:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        times = add_top.find_all('span', class_='time')
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")
        # get_date() parses times[0]; returns the invalid zero date instead
        # of raising when no date matches.
        return get_date(times[0])

    def find_article_profileurl(self):
        """Return the writer's profile image URL, or '' when absent."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        # this is not essential, so no exception occurs
        return ''

    def find_article_data(self):
        """:return: trimmed article body text ('' when empty)."""
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            return content.text.strip().replace('\xa0', '\n')
        return ''

    def find_article_title(self):
        """Return the title: the channel title, else the first body line.

        strong.tit_channel holds the channel title; when it is missing,
        the first line of the article body (capped at 30 chars) is used.
        """
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')
        article_data = self.find_article_data()
        if article_data:
            for line in article_data.splitlines():
                # limit title length
                return line[0:30] if len(line) > 30 else line
        return ''

    def find_article_etc(self, class_name):
        """Return a count string (shares / replies / feelings).

        :param class_name: class of the strong element holding the count
        :return: the number as a string; '0' when the element is missing
        """
        element = self.section_activity.find('strong', class_=class_name)
        if element and element.text:
            # Remove thousands separators so the value can parse as int.
            return element.text.replace(',', '')
        return '0'

    def find_article_share(self):
        """Return the share count as a string."""
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        """Return the feeling (like) count as a string."""
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        """Return the reply count as a string."""
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        """Return 'channel' for ch/... ids, otherwise 'story'."""
        article_id = self.find_article_id()
        return 'channel' if article_id.startswith('ch/') else 'story'

    def get(self):
        """Crawl the article body into a dict.

        Caller must add 'keyword_id' before sending to the DB.
        NOTE(review): the counts are stored under reused columns —
        article_order=reply count, article_parent=share count,
        reply_url=feeling count — presumably to fit the shared DB schema;
        confirm against the schema before renaming.

        :return: dict for crawled body content
        """
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content
|
|
|
|
|
|
class ReplyCrawler(object):
    """Crawl every reply under the currently opened Kakao Story article."""

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.ul = None
        self.lis = None

    def set_soup_and_activity(self):
        """Parse the current page and locate the reply list container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # Several 'section _activity' divs exist; the one we need lives
        # inside div.cover_wrapper.
        wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = wrapper.find('div', class_='section _activity')
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Keep clicking 'more' until every reply is present on the page."""
        seen = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # If the count did not grow, the link/ajax failed; stop looping.
            loaded = self.get_num_of_replies()
            if loaded == seen:
                break
            seen = loaded

    def get_num_of_replies(self):
        """Count reply <li> items currently rendered; 0 when none/missing."""
        try:
            container = find_element_by_css_selector(
                self.driver,
                "div.cover_wrapper "
                "div[class='section _activity'] "
                "ul[class='list _listContainer']", 5)
            return len(container.find_elements_by_tag_name('li'))
        except Exception:
            # No list element means no replies.
            return 0

    def click_load_more_reply_btn(self):
        """Activate the 'show more comments' link; log when it is absent."""
        try:
            more_link = find_element_by_css_selector(
                self.driver,
                "div.cover_wrapper "
                "div[class='section _activity'] "
                "a[class='_btnShowMoreComment']", 5)
            enter_element(more_link)
        except Exception as e:
            # A missing link is not fatal; just record the event.
            printl("In click_load_more_reply_btn, there is not a link to load replies")
            printl(e)

    def has_more(self):
        """Whether the 'show more' button is displayed (style display:block)."""
        try:
            more_button = find_element_by_css_selector(
                self.driver,
                "div.cover_wrapper "
                "div[class='section _activity'] "
                "p[class='more _showMoreCommentContainer']", 5)
            # 'display:block;' shows the button; 'display:none;' hides it.
            return 'block' in more_button.get_attribute('style')
        except Exception:
            # Either loading finished or the selector no longer matches;
            # the two cases are indistinguishable here.
            return False

    # find_xxxx functions

    def find_article_id(self):
        """Return the writer id of every reply, without the site prefix."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        ids = []
        for anchor in anchors:
            href = anchor.attrs.get('href', '')
            if not href:
                continue
            href = href.replace(kakaostory_url, '')
            # An href may start with a leading '/'; drop it.
            if href.startswith('/'):
                href = href[1:]
            ids.append(href)
        return ids

    def find_article_nickname(self):
        """Return every reply writer's display name ('' when empty)."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        return [anchor.text or '' for anchor in anchors]

    def find_article_data(self):
        """Return reply bodies with the per-reply metadata (div.p) stripped."""
        texts = []
        for box in self.ul.find_all('div', class_='txt'):
            if box.p:
                # box.text starts with the metadata text; slice it off.
                texts.append(box.text[len(box.p.text):].replace('\xa0', '\n'))
            else:
                texts.append(box.text or '')
        return texts

    def find_article_date(self):
        """Return each reply's timestamp string."""
        return [get_date(span) for span in self.ul.find_all('span', class_='time')]

    def find_article_like(self):
        """Return each reply's like count text ('' when empty)."""
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        return [span.text or '' for span in spans]

    def find_article_profileurl(self):
        """Return each reply writer's profile image URL ('' when absent)."""
        urls = []
        for box in self.ul.find_all('div', class_='pf'):
            if box.a and box.a.img:
                urls.append(box.a.img.attrs.get('src', ''))
            else:
                urls.append('')
        return urls

    def get(self):
        """Crawl all replies of the opened article.

        Caller must add platform_title, platform_id and platform_form
        (taken from the body record) to each returned dict.

        :return: a list of reply dicts
        """
        self.load_all_reply()
        # With everything on the page, parse it once with BeautifulSoup.
        self.set_soup_and_activity()

        ids = self.find_article_id()
        nicknames = self.find_article_nickname()
        datas = self.find_article_data()
        dates = self.find_article_date()
        profileurls = self.find_article_profileurl()
        likes = self.find_article_like()
        url = self.driver.current_url

        # An IndexError here means the per-field lists diverged; leaving
        # it unhandled (to surface the inconsistency) is intentional.
        replies = []
        for order, article_id in enumerate(ids):
            replies.append({
                'article_id': article_id,
                'article_nickname': nicknames[order],
                'article_data': datas[order],
                'article_date': dates[order],
                'article_profileurl': profileurls[order],
                'reply_url': likes[order],
                'platform_name': 'kakaostory',
                'article_form': 'reply',
                'article_url': url,
                'article_order': str(order),
            })
        return replies
|
|
|
|
|
|
class ListTraverse(object):
    """Base class for walking a Kakao Story list of sections (articles).

    Subclasses implement locating/opening sections for a concrete list
    type (hashtag feed vs. user/channel feed).
    """

    def __init__(self, driver):
        self.driver = driver
        # selenium element of the section currently being processed
        self.current_section = None

    def remove_current_section(self):
        """Delete the current section from the DOM so the next becomes first."""
        tag_name = self.current_section.tag_name
        data_model = self.current_section.get_attribute("data-model")
        # NOTE(review): data_model is interpolated unescaped into the JS
        # querySelector string; assumed never to contain quotes — confirm.
        css_selector = tag_name + "[data-model='" + data_model + "']"
        self.driver.execute_script('document.querySelector("' + css_selector + '").remove()')
        self.current_section = None

    def move_next_section(self):
        """Advance to the next section.

        After remove_current_section() the next section IS the first one,
        so this simply delegates to move_first_section().
        """
        self.move_first_section()

    def load_list_more(self):
        """Trigger the list's lazy loading by resizing and scrolling."""
        position = self.driver.get_window_position()
        size = self.driver.get_window_size()
        # Maximize then restore to force a layout refresh before scrolling.
        self.driver.maximize_window()
        self.driver.set_window_size(size['width'], size["height"])
        self.driver.set_window_position(position['x'], position['y'])
        for _ in range(2):
            self.driver.execute_script("window.scrollBy(0, -400)")
            time.sleep(0.3)
        for _ in range(4):
            self.driver.execute_script("window.scrollBy(0, 800)")
            time.sleep(0.3)

    def close_current_section(self):
        """Close the opened content layer via its X button (best effort)."""
        try:
            btn = find_element_by_css_selector(self.driver, "button._btnClose", 5)
            btn.send_keys(Keys.NULL)
            btn.send_keys(Keys.ENTER)
        except Exception as e:
            printl("There is not X button on the page")
            printl(e)

        # Press again with a short wait to verify the layer really closed.
        try:
            btn = find_element_by_css_selector(self.driver, "button._btnClose", 1)
            btn.send_keys(Keys.NULL)
            btn.send_keys(Keys.ENTER)
        except Exception:
            pass

    def get_current_section_data_model(self):
        """Return the section's data-model id, or '' when no section is set."""
        return self.current_section.get_attribute('data-model') if self.current_section else ""

    def is_loaded_body(self):
        """Check the article body layer is present; re-raise driver errors."""
        try:
            section_activity = find_element_by_css_selector(
                self.driver,
                "div.cover_wrapper div[class='section _activity']")
            return bool(section_activity)
        except WebDriverException as we:
            printl("Body is not loaded on browser : is_loaded_body")
            printl(we)
            raise

    def check_list_and_load(self):
        """Ensure the list has items, reloading up to limit_reload times.

        Fix: the previous version raised on the first failed reload, which
        made the limit_reload retry loop useless. Now it retries the full
        number of times and only raises once all attempts are exhausted.

        :raises WebDriverException: when no item appears after all attempts
        """
        for _ in range(limit_reload):
            if self.get_num_of_list():
                return
            self.load_list_more()
        if not self.get_num_of_list():
            raise WebDriverException("There is no data or ajax error")

    def move_first_section(self):
        """Point current_section at the first section (subclass specific)."""
        raise NotImplementedError

    def open_current_section(self):
        """Open the content layer of current_section (subclass specific)."""
        raise NotImplementedError

    def get_num_of_list(self):
        """Return how many sections are listed (subclass specific)."""
        raise NotImplementedError

    def get_date_of_current_section(self):
        """Return the current section's date string (subclass specific)."""
        raise NotImplementedError
|
|
|
|
|
|
class ListTag(ListTraverse):
    """Traverse the hashtag search result list.

    Flow: open url -> move_first_section -> open_current_section ->
    check date -> crawl / ignore -> close_current_section ->
    remove_current_section -> next_section -> open_current_section ...
    """

    def __init__(self, driver):
        ListTraverse.__init__(self, driver)

    def move_first_section(self):
        """Set current_section to the first recent-feed item.

        :raises Exception: re-raised when the first element is not found
            (intended: the caller's retry logic handles it)
        """
        try:
            recent_section_field = \
                find_element_by_css_selector(self.driver, "div.cont_recomm[data-part-name='recentFeeds']", 10)
            self.current_section = recent_section_field.find_element_by_css_selector('div.img_item')
        except Exception:
            printl("Do not find first recent section")
            raise

    def open_current_section(self):
        """Open the content layer by activating the section's link.

        :raises KakaoCrawlerException: when the link cannot be driven
        """
        try:
            # The link's class attribute is 'link_thumb _link' or 'link_txt _link'.
            a = self.current_section.find_element_by_css_selector("a[class$=' _link']")
            a.send_keys(Keys.NULL)
            a.send_keys(Keys.ENTER)
        except WebDriverException as we:
            printl("open_current_section error")
            printl(we)
            printl(self.current_section.get_attribute('data-model'))
            raise KakaoCrawlerException("open_current_section error")
        except Exception as e:
            printl("Unknown Occurs")
            printl(e)
            raise

    def get_date_of_current_section(self):
        """Return 'yyyy-MM-dd hh:mm:ss' of the opened content.

        :raises NotFoundElementError: when the date element is missing
        :raises NotFoundDataError: when the text holds no parsable date
        """
        try:
            div = find_element_by_css_selector(self.driver, "div.cover_wrapper")
            span = div.find_element_by_css_selector("div.add_top span.time")
            # Before mouse-over the date is in 'title'; afterwards in 'data-tooltip'.
            text_date = span.get_attribute('title') or span.get_attribute('data-tooltip')
        except WebDriverException as we:
            printl("Element is not found in get_date_of_current_section")
            printl(we)
            raise NotFoundElementError("Element is not found in get_date_of_current_section")
        except Exception as e:
            printl("Unknown Exception")
            printl(e)
            raise

        # Check the text containing date info is valid. If not, raise exception.
        if not text_date or len(text_date) <= 6:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        # Fix: was 're_date.search(text_date) or re_date.search(text_date)' —
        # the duplicated identical search has been collapsed.
        m = re_date.search(text_date)
        if not m:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                      int(m.group(5)), int(m.group(6)))
        # 12-hour clock handling, kept consistent with get_date(): shift
        # "오후" (PM) hours forward; the "오전" 12 (midnight) shift back to 0
        # was previously missing here.
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            temp_date += datetime.timedelta(hours=12)
        if m.group(4) == "오전" and int(m.group(5)) == 12:
            temp_date -= datetime.timedelta(hours=12)
        return str(temp_date)

    def get_num_of_list(self):
        """Count listed hashtag feed items."""
        items = self.driver.find_elements_by_css_selector("div[class^='img_item']")
        return len(items) if items else 0
|
|
|
|
|
|
class ListUser(ListTraverse):
    """Traverse a user's (or channel's) activity feed list."""

    def __init__(self, driver):
        ListTraverse.__init__(self, driver)

    def move_first_section(self):
        """Set current_section to the first feed section.

        :raises NotFoundElementError: when the first section is missing
        """
        try:
            recent_section_field = \
                find_element_by_css_selector(self.driver, "div.feed[data-part-name='content']", 10)
            self.current_section = recent_section_field.find_element_by_css_selector("div[class='section _activity']")
        except WebDriverException as we:
            printl("Do not find first recent section")
            printl(we)
            raise NotFoundElementError("Do not find first recent section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise

    def open_current_section(self):
        """Open the content layer via the section's time link.

        :raises NotFoundElementError: when the link cannot be driven
        """
        try:
            a = self.current_section.find_element_by_css_selector("a[class='time _linkPost']")
            a.send_keys(Keys.NULL)
            a.send_keys(Keys.ENTER)
        except WebDriverException as we:
            printl("open_current_section error")
            printl(we)
            # NOTE(review): message looks copy-pasted from move_first_section;
            # kept as-is in case anything matches on it.
            raise NotFoundElementError("Do not find first recent section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise

    def get_date_of_current_section(self):
        """Return 'yyyy-MM-dd hh:mm:ss' of the current section.

        :raises NotFoundElementError: when the date element is missing
        :raises NotFoundDataError: when the text holds no parsable date
        """
        try:
            a = self.current_section.find_element_by_css_selector("a[class='time _linkPost']")
            # Before mouse-over the date is in 'title'; afterwards in 'data-tooltip'.
            text_date = a.get_attribute('title') or a.get_attribute('data-tooltip')
        except WebDriverException as we:
            printl("Element is not found in get_date_of_current_section")
            printl(we)
            raise NotFoundElementError("Element is not found in get_date_of_current_section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise

        # Check the text containing date info is valid. If not, raise exception.
        if not text_date or len(text_date) <= 6:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        # Fix: the duplicated 'search(...) or search(...)' has been collapsed.
        m = re_date.search(text_date)
        if not m:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                      int(m.group(5)), int(m.group(6)))
        # 12-hour clock handling, kept consistent with get_date(): the
        # "오전" 12 (midnight) shift back to 0 was previously missing here.
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            temp_date += datetime.timedelta(hours=12)
        if m.group(4) == "오전" and int(m.group(5)) == 12:
            temp_date -= datetime.timedelta(hours=12)
        return str(temp_date)

    def get_num_of_list(self):
        """Count listed feed sections."""
        items = self.driver.find_elements_by_css_selector("div[class='section _activity']")
        return len(items) if items else 0
|
|
|
|
|
|
class CrawlerProcess(object):
    """Drive one crawl of a single URL with retry handling.

    Subclasses implement traverse_and_crawl() for a concrete list type.
    """

    def __init__(self, driver, send_to_db, initializer, url, set_backup):
        self.driver = driver
        self.send_to_db = send_to_db
        self.initializer = initializer
        self.url = url
        # data-model ids already crawled, to skip duplicates across retries
        self.set_backup = set_backup if set_backup else set()
        self.list_traverse = None
        self.num_of_web_except = 0
        self.num_of_out_of_date = 0

    def start(self):
        """Run traverse_and_crawl(), retrying up to num_of_retry times.

        WebDriverException and unknown exceptions trigger a retry;
        KakaoCrawlerException terminates immediately (for debugging a
        site change).
        """
        while True:
            try:
                self.traverse_and_crawl()
            except WebDriverException as we:
                printl("WebDriverException occurs")
                printl(we)
                if self.num_of_web_except > num_of_retry:
                    printl("There may be no data")
                    printl("Crawling is done")
                    break
                printl("Retry :", num_of_retry - self.num_of_web_except)
                self.num_of_web_except += 1
                # Probe that the driver is still alive; if not, this raises
                # WebDriverException and aborts the retry loop.
                self.driver.get('https://www.google.com')
                wait(2)
            except KakaoCrawlerException as ke:
                # Element/data missing: terminate so the site change is noticed.
                printl("KakaoCrawlerException occur. Check kakao website")
                printl(ke)
                raise
            except Exception as e:
                printl("Unknown occurs")
                printl(e)
                if self.num_of_web_except > num_of_retry:
                    printl("Crawling is terminated by force")
                    raise
                printl("Retry :", num_of_retry - self.num_of_web_except)
                self.num_of_web_except += 1
            else:
                # No exception: the crawl completed.
                printl("Crawling is done")
                break

    def get_set_backup(self):
        """Return the set of already-crawled data-model ids."""
        return self.set_backup

    def convert_datetime_to_date(self, str_date):
        """Convert a 'yyyy-MM-dd hh:mm:ss' string to a datetime.date."""
        return datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S').date()

    def traverse_and_crawl(self):
        """Traverse the list and crawl each article (subclass specific)."""
        # Fix: this was a bare 'NotImplementedError' expression, which
        # evaluated to nothing and silently returned None; it must be raised.
        raise NotImplementedError

    def is_terminate(self):
        """Count one more out-of-date section; True once over limit_reload."""
        self.num_of_out_of_date += 1
        return self.num_of_out_of_date > limit_reload
|
|
|
|
|
|
class UserProcess(CrawlerProcess):
    # Crawls a user/channel feed: the date is readable from the list itself,
    # so out-of-range sections are skipped without opening them.

    def __init__(self, driver, send_to_db, initializer, url, set_backup=None):
        CrawlerProcess.__init__(self, driver, send_to_db, initializer, url, set_backup)
        self.list_traverse = ListUser(driver)

    # move url -> check list and load -> move first section ->
    # loop: check date, open content, check body and crawling ->
    # close content -> remove current section -> check list and load -> move next
    def traverse_and_crawl(self):
        """Traverse the feed list and crawl each article inside the date window.

        :raises WebDriverException: when the list cannot be (re)loaded
        :raises Exception: "Nobody Nobody" when a body crawl yields nothing
        """
        self.driver.get(self.url)
        self.list_traverse.check_list_and_load()
        self.list_traverse.move_first_section()

        self.num_of_out_of_date = 0
        # begin_day and end_day type is datetime.date
        begin_day = self.initializer.get_begin_day()
        end_day = self.initializer.get_end_day()

        while True:
            cs_date = self.convert_datetime_to_date(self.list_traverse.get_date_of_current_section())
            if cs_date > end_day:
                # Too new: skip and keep walking towards the date window.
                printl(str(cs_date), ": continue")
            elif cs_date < begin_day:
                # Too old: stop after this happens more than limit_reload times.
                if self.is_terminate():
                    break
            else:
                current_section_data_model = self.list_traverse.get_current_section_data_model()

                # Skip sections already crawled in a previous attempt.
                if current_section_data_model not in self.set_backup:
                    self.set_backup.add(current_section_data_model)
                    self.list_traverse.open_current_section()

                    if self.list_traverse.is_loaded_body():
                        body_crawler = BodyCrawler(self.driver)
                        body = body_crawler.get()
                        if body:
                            body['keyword_id'] = self.initializer.keyword_id()
                            printl(body['article_url'])
                            # Replace any previous record for this URL.
                            self.send_to_db.delete_url(body['article_url'])
                            self.send_to_db.send_body(body)

                            reply_crawler = ReplyCrawler(self.driver)
                            replies = reply_crawler.get()

                            # if reply exists in replies variable
                            if replies:
                                # put platform_name, platform_form, platform_id to dict of list
                                for reply in replies:
                                    reply['platform_id'] = body['platform_id']
                                    reply['platform_name'] = body['platform_name']
                                    reply['platform_form'] = body['platform_form']
                                self.send_to_db.send_reply(replies)
                            printl('ok')
                        else:
                            raise Exception("Nobody Nobody")
                    self.list_traverse.close_current_section()
            # Drop the processed section and advance to the next one.
            self.list_traverse.remove_current_section()
            if not self.list_traverse.get_num_of_list():
                self.list_traverse.check_list_and_load()
            self.list_traverse.move_next_section()
|
|
|
|
|
|
class TagProcess(CrawlerProcess):
    # Crawls a hashtag result list: the date is only visible after opening
    # the content, so every section is opened before the date check.

    def __init__(self, driver, send_to_db, initializer, url, set_backup=None):
        CrawlerProcess.__init__(self, driver, send_to_db, initializer, url, set_backup)
        self.list_traverse = ListTag(driver)

    # move url -> check list and load -> move first section ->
    # loop: open content, check body content and date, and crawling ->
    # close content -> remove current section -> check list and load -> move next
    def traverse_and_crawl(self):
        """Traverse the hashtag list and crawl each article inside the date window.

        :raises WebDriverException: when the list cannot be (re)loaded
        :raises Exception: "Nobody Nobody" when a body crawl yields nothing
        """
        self.driver.get(self.url)
        self.list_traverse.check_list_and_load()
        self.list_traverse.move_first_section()

        self.num_of_out_of_date = 0
        # begin_day and end_day type is datetime.date
        begin_day = self.initializer.get_begin_day()
        end_day = self.initializer.get_end_day()

        while True:
            # Unlike UserProcess, the section must be opened before the
            # article date can be read.
            self.list_traverse.open_current_section()
            if self.list_traverse.is_loaded_body():
                cs_date = self.convert_datetime_to_date(self.list_traverse.get_date_of_current_section())
                if cs_date > end_day:
                    # Too new: skip and keep walking towards the date window.
                    printl(str(cs_date), ": continue")
                elif cs_date < begin_day:
                    # Too old: stop after this happens more than limit_reload times.
                    if self.is_terminate():
                        break
                else:
                    current_section_data_model = self.list_traverse.get_current_section_data_model()

                    # Skip sections already crawled in a previous attempt.
                    if current_section_data_model not in self.set_backup:
                        self.set_backup.add(current_section_data_model)
                        body_crawler = BodyCrawler(self.driver)
                        body = body_crawler.get()
                        printl(body['article_url'])
                        if body:
                            body['keyword_id'] = self.initializer.keyword_id()
                            # Replace any previous record for this URL.
                            self.send_to_db.delete_url(body['article_url'])
                            self.send_to_db.send_body(body)

                            reply_crawler = ReplyCrawler(self.driver)
                            replies = reply_crawler.get()

                            # if reply exists in replies variable
                            if replies:
                                # put platform_name, platform_form, platform_id to dict of list
                                for reply in replies:
                                    reply['platform_id'] = body['platform_id']
                                    reply['platform_name'] = body['platform_name']
                                    reply['platform_form'] = body['platform_form']
                                self.send_to_db.send_reply(replies)
                            printl('ok')
                        else:
                            raise Exception("Nobody Nobody")
            # Close the opened layer, drop the section and advance.
            self.list_traverse.close_current_section()
            self.list_traverse.remove_current_section()
            if not self.list_traverse.get_num_of_list():
                self.list_traverse.check_list_and_load()
            self.list_traverse.move_next_section()
|
|
|
|
|
|
class KakaoMainCrawler:
    """Top-level entry point: wires DB sender, crawl config and browser."""

    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = KakaoInit()
        self.browser = Browser()
        self.driver = None

    def set_driver(self, driver):
        """Replace the active webdriver."""
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        """Set keyword_id directly without loading crawl parameters."""
        self.keyword_id = keyword_id

    def start(self):
        """Run the crawler (alias for crawl_start)."""
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Initialize every runtime argument in one call."""
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        """Create and register a new driver for the given browser name."""
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        """Store keyword_id (coerced to int) and load its crawl parameters."""
        # int() accepts both int and numeric str; replaces the previous
        # verbose type(...) != int branch with identical results.
        self.keyword_id = int(keyword_id)
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        """Select the target database."""
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        """Set how many days back the crawl window starts."""
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        """Set the page limit for the crawl."""
        self.crawl_init.set_until_page(until_page)

    def crawl_start(self):
        """Crawl every configured URL; repeat forever in realtime mode."""
        real_time = True
        while real_time:
            printl("Crawler Start")
            # The original advanced an index in both the try and except
            # paths, i.e. exactly once per URL — a plain for loop.
            for url in self.crawl_init.make_url():
                try:
                    printl(url, "\n")
                    # Hashtag URLs use tag traversal; everything else is
                    # treated as a user/channel feed.
                    if 'https://story.kakao.com/hashtag/' in url:
                        kakao_process = TagProcess(self.driver, self.send_to_db, self.crawl_init,
                                                   url)
                    else:
                        kakao_process = UserProcess(self.driver, self.send_to_db, self.crawl_init,
                                                    url)
                    kakao_process.start()
                except Exception as e:
                    logging.info(e)
                    # check for exception
                    # self.driver.quit()
                    # Restart the browser and continue with the next URL.
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
            printl("Finished Crawling :)")

        self.send_to_db.close()
        self.driver.quit()
|
|
|