Files
clients/WebBasedCrawler/kakao/kakaocrawl.py
admin a8014e257e effect update time 업로드하도록 수정
기타 오류 수정


git-svn-id: svn://192.168.0.12/source@333 8346c931-da38-4b9b-9d4c-e48b93cbd075
2016-12-30 07:51:45 +00:00

992 lines
41 KiB
Python

#-*- coding: utf-8 -*-
import sys
import re
import datetime
import json
import time
import logging
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from base.baseclasses import wait
from base.baseclasses import printl
from base.baseclasses import SendtoDB
from base.baseclasses import Browser
from base.baseclasses import CrawlInit
from base.baseclasses import enter_element
from kakao.kakaoexception import KakaoCrawlerException
from kakao.kakaoexception import NotFoundElementError
from kakao.kakaoexception import NotFoundDataError
# Prefer the faster lxml parser when it is installed; otherwise fall back to
# the stdlib html.parser (both are valid BeautifulSoup parser names).
try:
    import lxml
    parser_opt = 'lxml'
except ImportError:
    parser_opt = 'html.parser'
__author__ = 'cococo'
# Base urls for KakaoStory stories and channels.
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
# Max consecutive reload attempts / out-of-date sections before giving up
# (used by ListTraverse.check_list_and_load and CrawlerProcess.is_terminate).
limit_reload = 5
# Max retries after an exception inside CrawlerProcess.start.
num_of_retry = 3
logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
# Timestamp pattern with 6 capture groups: year, month, day, AM/PM marker
# (오전/오후 -- see its use in get_date), hour, minute.
re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})"
                     "[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
def get_date(element, date_pattern=None):
    """Parse a KakaoStory tooltip timestamp out of a tag-like object.

    Before the tooltip is hovered the date string lives in the tag's
    'title' attribute; after hovering it moves to 'data-tooltip', so both
    attributes are tried.

    :param element: tag carrying the date (e.g. a span.time element); only
        its ``attrs`` mapping is read
    :param date_pattern: compiled regex with the same 6 groups as the
        module-level ``re_date``; defaults to ``re_date`` (parameterized so
        the function can be reused/tested with other formats)
    :return: 'yyyy-MM-dd hh:mm:ss', or the invalid placeholder
        '0000-00-00 00:00:00' when nothing matches (instead of raising)
    """
    if date_pattern is None:
        date_pattern = re_date
    m = date_pattern.search(element.attrs.get('title', '')) \
        or date_pattern.search(element.attrs.get('data-tooltip', ''))
    if not m:
        # raise NotFoundElementError("get_date exception")
        return "0000-00-00 00:00:00"
    parsed = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                               int(m.group(5)), int(m.group(6)))
    # group(4) is the AM/PM marker: add 12 hours for afternoon (오후)
    if m.group(4) == "오후" and int(m.group(5)) < 12:
        parsed += datetime.timedelta(hours=12)
    # 오전 12시 means midnight (00:xx), so subtract 12 hours
    if m.group(4) == "오전" and int(m.group(5)) == 12:
        parsed -= datetime.timedelta(hours=12)
    # convert datetime.datetime to str
    return str(parsed)
def click_kakao_close_button(driver):
    """Press the X (close) button of the currently opened content overlay.

    Sending NULL first gives the button focus; ENTER then activates it.
    """
    close_btn = driver.find_element_by_css_selector("button._btnClose")
    for key in (Keys.NULL, Keys.ENTER):
        close_btn.send_keys(key)
def find_element_by_css_selector(driver, css_selector, wait_second=10):
    """Wait up to wait_second seconds for css_selector to appear in the DOM.

    :return: the located WebElement (raises TimeoutException on timeout)
    """
    locator = (By.CSS_SELECTOR, css_selector)
    waiter = WebDriverWait(driver, wait_second)
    return waiter.until(EC.presence_of_element_located(locator))
class KakaoInit(CrawlInit):
    """Crawl configuration for KakaoStory: builds target urls per platform."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> url prefix (6: channel, 7: hashtag, 8: user story)
        self.urls = {
            6: "https://story.kakao.com/ch/",
            7: "https://story.kakao.com/hashtag/",
            8: "https://story.kakao.com/",
        }

    def split_searches(self):
        """Split the comma-separated keyword string and trim every term."""
        stripped = [term.strip() for term in self.searches().split(',')]
        if self.platform() in (6, 8):
            return stripped
        # hashtag terms (and any other platform) go through utf8 quoting
        return [self.utf8(term) for term in stripped]

    def make_url(self):
        """Build one crawl url per search term."""
        prefix = self.urls[self.platform()]
        return [prefix + term for term in self.split_searches()]

    def get_begin_day(self):
        """First day of the crawl window as a datetime.date."""
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        midnight = datetime.datetime(year=now.year, month=now.month, day=now.day)
        return (midnight + datetime.timedelta(days=self.before_day)).date()

    def get_end_day(self):
        """Last day of the crawl window as a datetime.date."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day).date()
class BodyCrawler(object):
    """Crawl the body (main post) of the article currently shown by driver.

    __init__ parses driver.page_source immediately, so the article detail
    view must already be open when the crawler is constructed.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None               # BeautifulSoup of driver.page_source
        self.section_activity = None   # bs4 tag of the article body section
        self.set_soup_and_activity()
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")

    # calling point may differ
    def set_soup_and_activity(self):
        """Parse the current page source and locate the article section."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity. But element we use is in div.cover_wrapper
        cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = cover_wrapper.find('div', class_='section _activity')

    @staticmethod
    def _parse_date_attrs(tag, date_pattern=None):
        """Parse 'yyyy-MM-dd hh:mm:ss' from a tooltip tag, or None on failure.

        Before the tooltip is hovered the date string is in the 'title'
        attribute; after hovering it moves to 'data-tooltip'.  Extracted to
        remove the date-parsing duplication that used to live in both
        find_article_modified_date and find_article_date.

        :param tag: bs4-like tag whose ``attrs`` mapping holds the date text
        :param date_pattern: compiled regex with re_date's 6 groups
            (defaults to the module-level re_date)
        """
        if date_pattern is None:
            date_pattern = re_date
        m = date_pattern.search(tag.attrs.get('title', '')) \
            or date_pattern.search(tag.attrs.get('data-tooltip', ''))
        if not m:
            return None
        parsed = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                   int(m.group(5)), int(m.group(6)))
        # add 12 hours when the article is written at p.m. (오후)
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            parsed += datetime.timedelta(hours=12)
        # 오전 12시 is midnight (00:xx), so subtract 12 hours
        if m.group(4) == "오전" and int(m.group(5)) == 12:
            parsed -= datetime.timedelta(hours=12)
        return str(parsed)

    def find_article_id(self):
        """Return the author's id: the profile href without host or leading '/'."""
        a = self.section_activity.find('a', class_='pf_name')
        href = a.attrs['href'].replace('https://story.kakao.com/', '')
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Return the author's display name."""
        a = self.section_activity.find('a', class_='pf_name')
        return a.text

    def find_article_url(self):
        # in chrome, current_url is equal to article_url
        # need to check other browser
        return self.driver.current_url

    def find_article_modified_date(self):
        """Return the modified date, or None when the article was never edited."""
        times = None
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        # written time is default. if the article was modified, modified time is added.
        # so if length of times is not equal to 2, there is only written time.
        if not times or len(times) < 2:
            return None
        # times[0] : written time, times[1] : modified time
        # times[1] structure : <span><span ...> </span></span>
        if not times[1].span:
            # return None instead of exception
            return None
        return self._parse_date_attrs(times[1].span)

    def find_article_date(self):
        """Return the article date; the modified date wins over the written one."""
        modified_date = self.find_article_modified_date()
        if modified_date:
            return modified_date
        add_top = self.section_activity.find('div', class_='add_top')
        if not add_top:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        times = add_top.find_all('span', class_='time')
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")
        # return an invalid placeholder date instead of raising on parse failure
        return self._parse_date_attrs(times[0]) or "0000-00-00 00:00:00"

    def find_article_profileurl(self):
        """Return the author's profile image url, or '' when missing."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        # this is not essential, so no exception occurs when absent
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        return ''

    def find_article_data(self):
        """
        :return: trimmed article body text, or '' when there is no text
        """
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            # trim and normalize non-breaking spaces to newlines
            return content.text.strip().replace('\xa0', '\n')
        return ''

    def find_article_title(self):
        """Return the channel title, or the first body line capped at 30 chars.

        strong.tit_channel is the title of a channel; when it is absent the
        title is defined (by the original author's own convention) as the
        first line of article_data.
        """
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')
        article_data = self.find_article_data()
        if article_data:
            for line in article_data.splitlines():
                # limit title length
                return line[0:30] if len(line) > 30 else line
        else:
            return ''

    def find_article_etc(self, class_name):
        """Return a counter value (shares/replies/feelings) as a string.

        :param class_name: class of the strong element holding the number
        :return: the number as a string; '0' when the element is missing
        """
        element = self.section_activity.find('strong', class_=class_name)
        if element and element.text:
            # remove thousands separators so callers can int() the value
            return element.text.replace(',', '')
        # raise NotFoundElementError('find_article_etc is not Found element with ' + class_name)
        return '0'

    def find_article_share(self):
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        # channel ids start with 'ch/' (see kakaostory_channel_url)
        article_id = self.find_article_id()
        return 'channel' if article_id.startswith('ch/') else 'story'

    def get(self):
        """
        you need to put 'keyword_id'
        :return: dict for crawled body content
        """
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content
class ReplyCrawler(object):
    """Crawl every reply (comment) of the article currently shown by driver.

    Call get() after an article detail view is open: it clicks the
    'show more comments' link until no new replies appear, then scrapes all
    replies from the final page source with BeautifulSoup.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None              # BeautifulSoup of the fully loaded page
        self.section_activity = None  # bs4 tag of the article section
        self.ul = None                # bs4 tag of the reply list container
        self.lis = None               # NOTE(review): never assigned elsewhere in this class

    def set_soup_and_activity(self):
        """Parse the current page source and locate the reply list (self.ul)."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity. But a element we use is in div.cover_wrapper
        cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = cover_wrapper.find('div', class_='section _activity')
        # NOTE(review): when the article has no replies this find may return
        # None, in which case the find_* helpers below would raise -- presumably
        # callers rely on their broad except; confirm.
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Keep clicking 'load more' until the reply count stops growing."""
        previous_num_of_replies = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # check the number of replies before and after click_load_more_reply_btn()
            # If These were equal, the link or ajax failed
            current_num_of_replies = self.get_num_of_replies()
            if previous_num_of_replies == current_num_of_replies:
                break
            previous_num_of_replies = current_num_of_replies

    def get_num_of_replies(self):
        """Return how many reply li elements are in the DOM (0 when none/error)."""
        # Find ul element that contains replies
        # if raise occur, there is no reply
        # for performance, this method may is implemented using bs4
        try:
            ul = find_element_by_css_selector(self.driver,
                                              "div.cover_wrapper "
                                              "div[class='section _activity'] "
                                              "ul[class='list _listContainer']", 5)
            li = ul.find_elements_by_tag_name('li')
            return len(li)
        except Exception as e:
            return 0

    def click_load_more_reply_btn(self):
        """Best-effort click of the 'show more comments' link."""
        try:
            # find a link to load reply and click/enter it
            a = find_element_by_css_selector(self.driver,
                                             "div.cover_wrapper "
                                             "div[class='section _activity'] "
                                             "a[class='_btnShowMoreComment']", 5)
            enter_element(a)
        # no link is in the browser. Nothing happens instead raise exception. But log this event
        except Exception as e:
            printl("In click_load_more_reply_btn, there is not a link to load replies")
            printl(e)

    def has_more(self):
        """Return True while the 'show more comments' container is visible."""
        # In the case that raise exception,
        # there is no more reply or css selector of the show_more is invalid
        # These two case can't be classified by exception because the logic is same
        try:
            # find show_more element
            show_more = find_element_by_css_selector(self.driver,
                                                     "div.cover_wrapper "
                                                     "div[class='section _activity'] "
                                                     "p[class='more _showMoreCommentContainer']", 5)
            # 'display:block;' -> display the button, 'display:none;' -> hide the button
            if 'block' in show_more.get_attribute('style'):
                return True
            else:
                return False
        # return False in the two case
        # First case is that loading replies is finished
        # Second case is that css selector to find element is invalid
        except Exception as e:
            return False

    # find_xxxx functions -- each returns one list with one entry per reply,
    # scraped from self.ul (set by set_soup_and_activity)
    def find_article_id(self):
        """Return the reply authors' ids (href paths without host / leading '/')."""
        # Find name placeholder
        divs = self.ul.find_all('a', class_='name _namePlaceholder')
        # Get article_ids and remove kakaostory url in article_id
        article_ids = [div.attrs.get('href', '').replace(kakaostory_url, '')
                       for div in divs if div.attrs.get('href', '')]
        # Refine hrefs. Href may start with '/'
        article_id = map(lambda x: x[1:] if x.startswith('/') else x, article_ids)
        # Return list because of unification of types
        return list(article_id)

    def find_article_nickname(self):
        """Return the reply authors' display names ('' when missing)."""
        divs = self.ul.find_all('a', class_='name _namePlaceholder')
        # If div.text exist, return div.text. Otherwise return empty string
        return [div.text if div.text else '' for div in divs]

    def find_article_data(self):
        """Return the reply texts with author meta-data stripped off the front."""
        divs = self.ul.find_all('div', class_='txt')
        # The div.text has meta-data in div.p.text. If meta-data exists, remove it
        # When element does not exists, return empty string
        return [div.text[len(div.p.text):].replace('\xa0', '\n')
                if div.p else div.text if div.text else '' for div in divs]

    def find_article_date(self):
        """Return the reply dates parsed from the span.time tooltips."""
        divs = self.ul.find_all('span', class_='time')
        return list(map(get_date, divs))

    def find_article_like(self):
        """Return the per-reply like counts as strings ('' when missing)."""
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        # The number of like exists in span.like_num _likeCommentCount Unless it is present
        return [span.text if span.text else '' for span in spans]

    def find_article_profileurl(self):
        """Return the reply authors' profile image urls ('' when missing)."""
        divs = self.ul.find_all('div', class_='pf')
        return list(map(lambda div: div.a.img.attrs.get('src', '') if div.a and div.a.img else '', divs))

    def get(self):
        """
        Need to put platform_title, platform_id, platform_form from body
        :return: a list of replies. Need to put platform_title, platform_id
        """
        # load all replies
        self.load_all_reply()
        # After loading all replies, crawl replies using BeautifulSoup
        self.set_soup_and_activity()
        article_ids = self.find_article_id()
        article_nicknames = self.find_article_nickname()
        article_datas = self.find_article_data()
        article_dates = self.find_article_date()
        article_profileurls = self.find_article_profileurl()
        article_likes = self.find_article_like()
        article_url = self.driver.current_url
        replies = []
        # This may occur exception when indices of each elements is not matched
        # This exception described above is intended
        for i in range(len(article_ids)):
            reply = dict()
            reply['article_id'] = article_ids[i]
            reply['article_nickname'] = article_nicknames[i]
            reply['article_data'] = article_datas[i]
            reply['article_date'] = article_dates[i]
            reply['article_profileurl'] = article_profileurls[i]
            reply['reply_url'] = article_likes[i]
            reply['platform_name'] = 'kakaostory'
            reply['article_form'] = 'reply'
            reply['article_url'] = article_url
            reply['article_order'] = str(i)
            replies.append(reply)
        return replies
class ListTraverse(object):
    """Base class for walking a KakaoStory list page section by section.

    Subclasses (ListTag, ListUser) implement how to locate, open and date
    the sections; this class provides the shared scrolling/closing/removal
    mechanics around self.current_section.
    """

    def __init__(self, driver):
        self.driver = driver
        self.current_section = None   # WebElement of the section being visited

    def remove_current_section(self):
        """Delete current_section from the DOM so the next one becomes first."""
        tag_name = self.current_section.tag_name
        data_model = self.current_section.get_attribute("data-model")
        css_selector = tag_name + "[data-model='" + data_model + "']"
        self.driver.execute_script('document.querySelector("' + css_selector + '").remove()')
        self.current_section = None

    # This is the same as the move_first_section function
    def move_next_section(self):
        self.move_first_section()

    # Load list more
    def load_list_more(self):
        """Jiggle window size and scroll to trigger the page's infinite-scroll load."""
        position = self.driver.get_window_position()
        size = self.driver.get_window_size()
        # maximize then restore so the page recalculates its viewport
        self.driver.maximize_window()
        self.driver.set_window_size(size['width'], size["height"])
        self.driver.set_window_position(position['x'], position['y'])
        for _ in range(2):
            self.driver.execute_script("window.scrollBy(0, -400)")
            time.sleep(0.3)
        for _ in range(4):
            self.driver.execute_script("window.scrollBy(0, 800)")
            time.sleep(0.3)

    def close_current_section(self):
        """Close the opened content overlay (best-effort, tried twice)."""
        # click close button on the page section
        try:
            btn = find_element_by_css_selector(self.driver, "button._btnClose", 5)
            btn.send_keys(Keys.NULL)
            btn.send_keys(Keys.ENTER)
        except Exception as e:
            printl("There is not X button on the page")
            printl(e)
        # check, verify and close current section
        try:
            btn = find_element_by_css_selector(self.driver, "button._btnClose", 1)
            btn.send_keys(Keys.NULL)
            btn.send_keys(Keys.ENTER)
        except Exception as e:
            pass

    def get_current_section_data_model(self):
        """Return the section's data-model id ('' when no section is selected)."""
        return self.current_section.get_attribute('data-model') if self.current_section else ""

    # check body is loaded
    def is_loaded_body(self):
        """Return True once the article body section is present; re-raise on driver errors."""
        try:
            section_activity = find_element_by_css_selector(self.driver,
                                                            "div.cover_wrapper div[class='section _activity']")
            return True if section_activity else False
        except WebDriverException as we:
            printl("Body is not loaded on browser : is_loaded_body")
            printl(we)
            raise

    # Retry loading the list up to limit_reload times, then raise when it is
    # still empty.  NOTE(review): indentation reconstructed -- the final check
    # is placed after the retry loop; confirm against the original layout.
    def check_list_and_load(self):
        for _ in range(limit_reload):
            num_of_list = self.get_num_of_list()
            if not num_of_list:
                self.load_list_more()
        num_of_list = self.get_num_of_list()
        if not num_of_list:
            raise WebDriverException("There is no data or ajax error")

    # Abstract hooks implemented by ListTag / ListUser
    def move_first_section(self):
        raise NotImplementedError

    def open_current_section(self):
        raise NotImplementedError

    def get_num_of_list(self):
        raise NotImplementedError

    def get_date_of_current_section(self):
        raise NotImplementedError
class ListTag(ListTraverse):
    """Traverse the result list of a hashtag page (story.kakao.com/hashtag/...).

    Flow: open url -> move_first_section -> open_current_section ->
    check date -> crawl / ignore -> close_current_section ->
    remove_current_section -> next_section -> open_current_section ...
    """

    def __init__(self, driver):
        ListTraverse.__init__(self, driver)

    # Raising exception is intended when first element is not found
    # Set current_section on div
    def move_first_section(self):
        """Point current_section at the first recent-feed item."""
        try:
            recent_section_field = \
                find_element_by_css_selector(self.driver, "div.cont_recomm[data-part-name='recentFeeds']", 10)
            self.current_section = recent_section_field.find_element_by_css_selector('div.img_item')
        except Exception as e:
            printl("Do not find first recent section")
            raise

    # Raising exception is intended when fail to find a link to a content
    def open_current_section(self):
        """Open the content behind current_section via its link."""
        try:
            # The element to find is 'a' tag. Its class attribute is link_thumb _link or link_txt _link
            a = self.current_section.find_element_by_css_selector("a[class$=' _link']")
            a.send_keys(Keys.NULL)
            a.send_keys(Keys.ENTER)
        except WebDriverException as we:
            printl("open_current_section error")
            printl(we)
            printl(self.current_section.get_attribute('data-model'))
            raise KakaoCrawlerException("open_current_section error")
        except Exception as e:
            printl("Unknown Occurs")
            printl(e)
            raise

    # Raising exception is intended when fail to find the element or the text containing date
    def get_date_of_current_section(self):
        """Return the opened content's date as 'yyyy-MM-dd hh:mm:ss'.

        :raises NotFoundElementError: when the date element is missing
        :raises NotFoundDataError: when no parseable date text is found
        """
        # Find the element containing date and extract text from it. If not, raise exception.
        # The date string is in 'title' before the tooltip is hovered and in
        # 'data-tooltip' afterwards.
        try:
            div = find_element_by_css_selector(self.driver, "div.cover_wrapper")
            span = div.find_element_by_css_selector("div.add_top span.time")
            text_date = span.get_attribute('title') or span.get_attribute('data-tooltip')
        except WebDriverException as we:
            printl("Element is not found in get_date_of_current_section")
            printl(we)
            raise NotFoundElementError("Element is not found in get_date_of_current_section")
        except Exception as e:
            printl("Unknown Exception")
            printl(e)
            raise
        # Check the text containing date info is valid. If not, raise exception
        if not text_date or len(text_date) <= 6:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        # fix: the original searched the same string twice ("search(x) or search(x)")
        m = re_date.search(text_date)
        if not m:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                      int(m.group(5)), int(m.group(6)))
        # add 12 hour when the article is written at p.m
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            temp_date += datetime.timedelta(hours=12)
        # fix: 오전 12시 is midnight (00:xx) -- consistent with get_date() and
        # BodyCrawler; the original dropped this correction here
        if m.group(4) == "오전" and int(m.group(5)) == 12:
            temp_date -= datetime.timedelta(hours=12)
        # convert datetime.datetime to str
        return str(temp_date)

    def get_num_of_list(self):
        """Return how many hashtag list items are currently loaded."""
        items = self.driver.find_elements_by_css_selector("div[class^='img_item']")
        return len(items) if items else 0
class ListUser(ListTraverse):
    """Traverse the activity feed of a user or channel page."""

    def __init__(self, driver):
        ListTraverse.__init__(self, driver)

    def move_first_section(self):
        """Point current_section at the first feed section."""
        try:
            recent_section_field = \
                find_element_by_css_selector(self.driver, "div.feed[data-part-name='content']", 10)
            self.current_section = recent_section_field.find_element_by_css_selector("div[class='section _activity']")
        except WebDriverException as we:
            printl("Do not find first recent section")
            printl(we)
            raise NotFoundElementError("Do not find first recent section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise

    # Raising exception is intended when fail to find a link to a content
    def open_current_section(self):
        """Open the post behind current_section via its timestamp link."""
        try:
            a = self.current_section.find_element_by_css_selector("a[class='time _linkPost']")
            a.send_keys(Keys.NULL)
            a.send_keys(Keys.ENTER)
        except WebDriverException as we:
            printl("open_current_section error")
            printl(we)
            raise NotFoundElementError("Do not find first recent section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise

    # Raising exception is intended when fail to find the element or the text containing date
    def get_date_of_current_section(self):
        """Return the current section's date as 'yyyy-MM-dd hh:mm:ss'.

        :raises NotFoundElementError: when the date element is missing
        :raises NotFoundDataError: when no parseable date text is found
        """
        # Find the element containing date and extract text from it. If not, raise exception.
        # The date string is in 'title' before the tooltip is hovered and in
        # 'data-tooltip' afterwards.
        try:
            a = self.current_section.find_element_by_css_selector("a[class='time _linkPost']")
            text_date = a.get_attribute('title') or a.get_attribute('data-tooltip')
        except WebDriverException as we:
            printl("Element is not found in get_date_of_current_section")
            printl(we)
            raise NotFoundElementError("Element is not found in get_date_of_current_section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise
        # Check the text containing date info is valid. If not, raise exception
        if not text_date or len(text_date) <= 6:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        # fix: the original searched the same string twice ("search(x) or search(x)")
        m = re_date.search(text_date)
        if not m:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                      int(m.group(5)), int(m.group(6)))
        # add 12 hour when the article is written at p.m
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            temp_date += datetime.timedelta(hours=12)
        # fix: 오전 12시 is midnight (00:xx) -- consistent with get_date() and
        # BodyCrawler; the original dropped this correction here
        if m.group(4) == "오전" and int(m.group(5)) == 12:
            temp_date -= datetime.timedelta(hours=12)
        # convert datetime.datetime to str
        return str(temp_date)

    def get_num_of_list(self):
        """Return how many feed sections are currently loaded."""
        items = self.driver.find_elements_by_css_selector("div[class='section _activity']")
        return len(items) if items else 0
class CrawlerProcess(object):
    """Base crawl driver for one url with retry handling.

    Subclasses (UserProcess, TagProcess) implement traverse_and_crawl().
    """

    def __init__(self, driver, send_to_db, initializer, url, set_backup):
        self.driver = driver
        self.send_to_db = send_to_db
        self.initializer = initializer
        self.url = url
        # data-model ids already crawled, to avoid duplicate uploads
        self.set_backup = set_backup if set_backup else set()
        self.list_traverse = None
        self.num_of_web_except = 0    # exception/retry counter
        self.num_of_out_of_date = 0   # consecutive sections older than the window

    # To catch exception, this function wraps traverse_and_crawl function in try-catch statement.
    def start(self):
        """Run traverse_and_crawl, retrying up to num_of_retry times on errors."""
        while True:
            try:
                self.traverse_and_crawl()
            # If WebDriverException occurs, retry crawling.
            except WebDriverException as we:
                printl("WebDriverException occurs")
                printl(we)
                # If the number of retry is over limit, crawling is terminated.
                if self.num_of_web_except > num_of_retry:
                    printl("There may be no data")
                    printl("Crawling is done")
                    break
                printl("Retry :", num_of_retry - self.num_of_web_except)
                self.num_of_web_except += 1
                # test chromedriver can access self.driver
                # if can't, WebDriverException occur
                self.driver.get('https://www.google.com')
                wait(2)
            # not found element or data, this program is terminated
            # This process is intended for debug
            except KakaoCrawlerException as ke:
                printl("KakaoCrawlerException occur. Check kakao website")
                printl(ke)
                raise
            # unknown exception occur
            except Exception as e:
                printl("Unknown occurs")
                printl(e)
                # If the number of retry is over limit, crawling is terminated.
                if self.num_of_web_except > num_of_retry:
                    printl("Crawling is terminated by force")
                    raise
                printl("Retry :", num_of_retry - self.num_of_web_except)
                self.num_of_web_except += 1
            # no exception occurs
            else:
                printl("Crawling is done")
                break

    def get_set_backup(self):
        """Return the set of already-crawled data-model ids."""
        return self.set_backup

    def convert_datetime_to_date(self, str_date):
        """Convert a 'yyyy-MM-dd hh:mm:ss' string to a datetime.date."""
        return datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S').date()

    def traverse_and_crawl(self):
        # BUG FIX: the original body was the bare expression
        # ``NotImplementedError`` which evaluated the class and did nothing,
        # silently succeeding when a subclass forgot to override this hook.
        raise NotImplementedError

    def is_terminate(self):
        """Count one out-of-date section; True once limit_reload is exceeded."""
        self.num_of_out_of_date += 1
        return self.num_of_out_of_date > limit_reload
class UserProcess(CrawlerProcess):
    """Crawl process for user/channel feed pages.

    Flow: move url -> check list and load -> move first section ->
    loop: check date, open content, check body and crawling ->
    close content -> remove current section -> check list and load -> move next
    """

    def __init__(self, driver, send_to_db, initializer, url, set_backup=None):
        CrawlerProcess.__init__(self, driver, send_to_db, initializer, url, set_backup)
        self.list_traverse = ListUser(driver)

    def traverse_and_crawl(self):
        """Walk the feed, uploading body+replies for sections inside the date window."""
        self.driver.get(self.url)
        self.list_traverse.check_list_and_load()
        self.list_traverse.move_first_section()
        self.num_of_out_of_date = 0
        # begin_day and end_day type is datetime.date
        begin_day = self.initializer.get_begin_day()
        end_day = self.initializer.get_end_day()
        while True:
            # feed sections expose their date before being opened
            cs_date = self.convert_datetime_to_date(self.list_traverse.get_date_of_current_section())
            if cs_date > end_day:
                # newer than the crawl window: skip, keep traversing
                printl(str(cs_date), ": continue")
            elif cs_date < begin_day:
                # older than the window: stop after enough consecutive misses
                if self.is_terminate():
                    break
            else:
                current_section_data_model = self.list_traverse.get_current_section_data_model()
                # skip sections crawled in a previous pass
                if current_section_data_model not in self.set_backup:
                    self.set_backup.add(current_section_data_model)
                    self.list_traverse.open_current_section()
                    if self.list_traverse.is_loaded_body():
                        body_crawler = BodyCrawler(self.driver)
                        body = body_crawler.get()
                        if body:
                            body['keyword_id'] = self.initializer.keyword_id()
                            printl(body['article_url'])
                            # delete-then-insert so re-crawls update in place
                            self.send_to_db.delete_url(body['article_url'])
                            self.send_to_db.send_body(body)
                            reply_crawler = ReplyCrawler(self.driver)
                            replies = reply_crawler.get()
                            # if reply exists in replies variable
                            if replies:
                                # put platform_name, platform_form, platform_id to dict of list
                                for reply in replies:
                                    reply['platform_id'] = body['platform_id']
                                    reply['platform_name'] = body['platform_name']
                                    reply['platform_form'] = body['platform_form']
                                self.send_to_db.send_reply(replies)
                            printl('ok')
                        else:
                            raise Exception("Nobody Nobody")
                    self.list_traverse.close_current_section()
            # drop the visited section from the DOM and advance
            self.list_traverse.remove_current_section()
            if not self.list_traverse.get_num_of_list():
                self.list_traverse.check_list_and_load()
            self.list_traverse.move_next_section()
class TagProcess(CrawlerProcess):
    """Crawl process for hashtag list pages (story.kakao.com/hashtag/...).

    Flow: move url -> check list and load -> move first section ->
    loop: open content, check body content and date, and crawling ->
    close content -> remove current section -> check list and load -> move next
    """

    def __init__(self, driver, send_to_db, initializer, url, set_backup=None):
        CrawlerProcess.__init__(self, driver, send_to_db, initializer, url, set_backup)
        self.list_traverse = ListTag(driver)

    def traverse_and_crawl(self):
        """Walk the hashtag list, uploading body+replies inside the date window."""
        self.driver.get(self.url)
        self.list_traverse.check_list_and_load()
        self.list_traverse.move_first_section()
        self.num_of_out_of_date = 0
        # begin_day and end_day type is datetime.date
        begin_day = self.initializer.get_begin_day()
        end_day = self.initializer.get_end_day()
        while True:
            # unlike UserProcess, the date is only available after opening
            self.list_traverse.open_current_section()
            if self.list_traverse.is_loaded_body():
                cs_date = self.convert_datetime_to_date(self.list_traverse.get_date_of_current_section())
                if cs_date > end_day:
                    # newer than the crawl window: skip, keep traversing
                    printl(str(cs_date), ": continue")
                elif cs_date < begin_day:
                    # older than the window: stop after enough consecutive misses
                    if self.is_terminate():
                        break
                else:
                    current_section_data_model = self.list_traverse.get_current_section_data_model()
                    # skip sections crawled in a previous pass
                    if current_section_data_model not in self.set_backup:
                        self.set_backup.add(current_section_data_model)
                        body_crawler = BodyCrawler(self.driver)
                        body = body_crawler.get()
                        printl(body['article_url'])
                        if body:
                            body['keyword_id'] = self.initializer.keyword_id()
                            # delete-then-insert so re-crawls update in place
                            self.send_to_db.delete_url(body['article_url'])
                            self.send_to_db.send_body(body)
                            reply_crawler = ReplyCrawler(self.driver)
                            replies = reply_crawler.get()
                            # if reply exists in replies variable
                            if replies:
                                # put platform_name, platform_form, platform_id to dict of list
                                for reply in replies:
                                    reply['platform_id'] = body['platform_id']
                                    reply['platform_name'] = body['platform_name']
                                    reply['platform_form'] = body['platform_form']
                                self.send_to_db.send_reply(replies)
                            printl('ok')
                        else:
                            raise Exception("Nobody Nobody")
            # always close/remove the opened section and advance
            self.list_traverse.close_current_section()
            self.list_traverse.remove_current_section()
            if not self.list_traverse.get_num_of_list():
                self.list_traverse.check_list_and_load()
            self.list_traverse.move_next_section()
class KakaoMainCrawler:
    """Top-level entry point: builds urls from keyword config and runs one
    TagProcess or UserProcess per url, restarting the browser on failure."""

    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = KakaoInit()
        self.browser = Browser()
        self.driver = None

    def set_driver(self, driver):
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def start(self):
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Initialize every runtime parameter in one call."""
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        """Store keyword_id as int and load its crawl parameters from the DB."""
        # idiom fix: isinstance instead of comparing type() to int
        if not isinstance(keyword_id, int):
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawl_start(self):
        """Crawl every configured url; loop forever while realtime mode is on."""
        real_time = True
        while real_time:
            printl("Crawler Start")
            url_list = self.crawl_init.make_url()
            i = 0
            while i < len(url_list):
                try:
                    printl(url_list[i], "\n")
                    # hashtag urls get the tag traversal, everything else the feed traversal
                    if 'https://story.kakao.com/hashtag/' in url_list[i]:
                        kakao_process = TagProcess(self.driver, self.send_to_db, self.crawl_init,
                                                   url_list[i])
                    else:
                        kakao_process = UserProcess(self.driver, self.send_to_db, self.crawl_init,
                                                    url_list[i])
                    kakao_process.start()
                    i += 1
                except Exception as e:
                    logging.info(e)
                    # on failure: restart the browser and skip this url
                    # check for exception
                    # self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
                    i += 1
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
        self.send_to_db.close()
        self.driver.quit()