992 lines
41 KiB
Python
992 lines
41 KiB
Python
#-*- coding: utf-8 -*-
|
|
import sys
|
|
import re
|
|
import datetime
|
|
import json
|
|
import time
|
|
import logging
|
|
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.action_chains import ActionChains
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.common.exceptions import WebDriverException
|
|
from bs4 import BeautifulSoup
|
|
|
|
from base.baseclasses import wait
|
|
from base.baseclasses import printl
|
|
from base.baseclasses import SendtoDB
|
|
from base.baseclasses import Browser
|
|
from base.baseclasses import CrawlInit
|
|
from base.baseclasses import enter_element
|
|
|
|
from kakao.kakaoexception import KakaoCrawlerException
|
|
from kakao.kakaoexception import NotFoundElementError
|
|
from kakao.kakaoexception import NotFoundDataError
|
|
|
|
# Prefer the faster lxml parser for BeautifulSoup; fall back to the
# stdlib html.parser when lxml is not installed.
try:
    import lxml
    parser_opt = 'lxml'
except ImportError:
    parser_opt = 'html.parser'

__author__ = 'cococo'

# Base URLs of Kakao Story pages used to strip/build article ids.
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
# Maximum attempts when (re)loading a list / counting out-of-date sections.
limit_reload = 5
# Maximum retries after an exception during one crawl run.
num_of_retry = 3

logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
|
|
|
|
|
|
# Matches Kakao's Korean timestamp text, e.g. "2020년 3월 5일 오후 3:07".
# Groups: 1=year 2=month 3=day 4=meridiem("오전"/"오후") 5=hour 6=minute.
re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})"
                     "[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")


def get_date(element):
    """Extract a 'yyyy-MM-dd hh:mm:ss' timestamp from a span.time element.

    The raw date text lives either in the 'title' attribute (before mouse
    over) or in 'data-tooltip' (after mouse over).

    :param element: bs4 tag, typically a span.time element
    :return: 'yyyy-MM-dd hh:mm:ss', or '0000-00-00 00:00:00' when no
        parsable date is present
    """
    match = re_date.search(element.attrs.get('title', '')) \
        or re_date.search(element.attrs.get('data-tooltip', ''))
    if not match:
        # Signal "unknown" with an obviously invalid date instead of raising.
        return "0000-00-00 00:00:00"

    meridiem = match.group(4)
    hour = int(match.group(5))
    parsed = datetime.datetime(int(match.group(1)), int(match.group(2)),
                               int(match.group(3)), hour, int(match.group(6)))
    # Kakao shows a 12-hour clock: shift afternoon ("오후") hours forward
    # and the midnight hour ("오전" 12) back to 0.
    if meridiem == "오후" and hour < 12:
        parsed += datetime.timedelta(hours=12)
    if meridiem == "오전" and hour == 12:
        parsed -= datetime.timedelta(hours=12)
    return str(parsed)
|
|
|
|
|
|
def click_kakao_close_button(driver):
    """Press the X (close) button on the currently open content layer."""
    close_btn = driver.find_element_by_css_selector("button._btnClose")
    # Focus the button with a no-op key first, then activate it with ENTER.
    close_btn.send_keys(Keys.NULL)
    close_btn.send_keys(Keys.ENTER)
|
|
|
|
|
|
def find_element_by_css_selector(driver, css_selector, wait_second=10):
    """Wait up to wait_second seconds for the element matching css_selector.

    :param driver: selenium webdriver
    :param css_selector: CSS selector string to locate the element
    :param wait_second: maximum seconds to wait (default 10)
    :return: the located WebElement
    :raises selenium TimeoutException: when the element never appears
    """
    condition = EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
    return WebDriverWait(driver, wait_second).until(condition)
|
|
|
|
|
|
class KakaoInit(CrawlInit):
    """Crawl configuration for Kakao Story (channel / hashtag / user URLs)."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> base URL used to build the crawl target URLs
        self.urls = {
            6: "https://story.kakao.com/ch/",
            7: "https://story.kakao.com/hashtag/",
            8: "https://story.kakao.com/",
        }

    def split_searches(self):
        """Split the comma-separated search string into trimmed keywords.

        For hashtag searches (platform other than 6/8) each keyword is
        additionally passed through self.utf8().
        """
        keywords = [token.strip() for token in self.searches().split(',')]
        if self.platform() in (6, 8):
            return keywords
        return [self.utf8(keyword) for keyword in keywords]

    def make_url(self):
        """Build one crawl URL per keyword for the configured platform."""
        base = self.urls[self.platform()]
        return [base + keyword for keyword in self.split_searches()]

    def get_begin_day(self):
        """Return the first day (datetime.date) of the crawl window."""
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        # Midnight today, shifted by before_day days.
        begin = datetime.datetime(year=now.year, month=now.month, day=now.day)
        begin += datetime.timedelta(days=self.before_day)
        return begin.date()

    def get_end_day(self):
        """Return the last day (datetime.date) of the crawl window."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day).date()
|
|
|
|
|
|
class BodyCrawler(object):
    """Crawl the body (main article) of the currently opened Kakao Story post.

    Expects that the content layer is already open in the driver; the
    constructor parses the page and locates the article section.

    :raises NotFoundElementError: when the article section is missing
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.set_soup_and_activity()
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")

    def set_soup_and_activity(self):
        """Parse the current page and locate the article container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity; the one we need is inside
        # div.cover_wrapper.
        cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = cover_wrapper.find('div', class_='section _activity')

    def find_article_id(self):
        """Return the writer's id (profile href minus the site URL prefix)."""
        a = self.section_activity.find('a', class_='pf_name')
        href = a.attrs['href'].replace('https://story.kakao.com/', '')
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Return the writer's display name."""
        a = self.section_activity.find('a', class_='pf_name')
        return a.text

    def find_article_url(self):
        """Return the article URL."""
        # in chrome, current_url is equal to article_url
        # need to check other browser
        return self.driver.current_url

    def find_article_modified_date(self):
        """Return the modified timestamp, or None when absent/unparsable.

        A second span.time only exists when the article has been edited:
        times[0] is the written time, times[1] the modified time.
        """
        add_top = self.section_activity.find('div', class_='add_top')
        times = add_top.find_all('span', class_='time') if add_top else None
        if not times or len(times) < 2:
            return None
        # times[1] structure: <span><span ...> </span></span>
        if not times[1].span:
            return None
        # Delegate to get_date(): it reads the 'title'/'data-tooltip'
        # attributes and returns the all-zero date when nothing matches.
        parsed = get_date(times[1].span)
        return None if parsed == "0000-00-00 00:00:00" else parsed

    def find_article_date(self):
        """Return the article timestamp, preferring the modified date.

        :return: 'yyyy-MM-dd hh:mm:ss' ('0000-00-00 00:00:00' on parse failure)
        :raises NotFoundElementError: when the date DOM is missing
        """
        # modified date is a higher priority than written date
        modified_date = self.find_article_modified_date()
        if modified_date:
            return modified_date
        add_top = self.section_activity.find('div', class_='add_top')
        if not add_top:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        times = add_top.find_all('span', class_='time')
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")
        # get_date() parses times[0]; returns the invalid zero date instead
        # of raising when no date matches.
        return get_date(times[0])

    def find_article_profileurl(self):
        """Return the writer's profile image URL, or '' when absent."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        # this is not essential, so no exception occurs
        return ''

    def find_article_data(self):
        """:return: trimmed article body text ('' when empty)."""
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            return content.text.strip().replace('\xa0', '\n')
        return ''

    def find_article_title(self):
        """Return the title: the channel title, else the first body line.

        strong.tit_channel holds the channel title; when it is missing,
        the first line of the article body (capped at 30 chars) is used.
        """
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')
        article_data = self.find_article_data()
        if article_data:
            for line in article_data.splitlines():
                # limit title length
                return line[0:30] if len(line) > 30 else line
        return ''

    def find_article_etc(self, class_name):
        """Return a count string (shares / replies / feelings).

        :param class_name: class of the strong element holding the count
        :return: the number as a string; '0' when the element is missing
        """
        element = self.section_activity.find('strong', class_=class_name)
        if element and element.text:
            # Remove thousands separators so the value can parse as int.
            return element.text.replace(',', '')
        return '0'

    def find_article_share(self):
        """Return the share count as a string."""
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        """Return the feeling (like) count as a string."""
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        """Return the reply count as a string."""
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        """Return 'channel' for ch/... ids, otherwise 'story'."""
        article_id = self.find_article_id()
        return 'channel' if article_id.startswith('ch/') else 'story'

    def get(self):
        """Crawl the article body into a dict.

        Caller must add 'keyword_id' before sending to the DB.
        NOTE(review): the counts are stored under reused columns —
        article_order=reply count, article_parent=share count,
        reply_url=feeling count — presumably to fit the shared DB schema;
        confirm against the schema before renaming.

        :return: dict for crawled body content
        """
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content
|
|
|
|
|
|
class ReplyCrawler(object):
    """Crawl every reply under the currently opened Kakao Story article."""

    def __init__(self, driver):
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.ul = None
        self.lis = None

    def set_soup_and_activity(self):
        """Parse the current page and locate the reply list container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # Several 'section _activity' divs exist; the one we need lives
        # inside div.cover_wrapper.
        wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = wrapper.find('div', class_='section _activity')
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Keep clicking 'more' until every reply is present on the page."""
        seen = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # If the count did not grow, the link/ajax failed; stop looping.
            loaded = self.get_num_of_replies()
            if loaded == seen:
                break
            seen = loaded

    def get_num_of_replies(self):
        """Count reply <li> items currently rendered; 0 when none/missing."""
        try:
            container = find_element_by_css_selector(
                self.driver,
                "div.cover_wrapper "
                "div[class='section _activity'] "
                "ul[class='list _listContainer']", 5)
            return len(container.find_elements_by_tag_name('li'))
        except Exception:
            # No list element means no replies.
            return 0

    def click_load_more_reply_btn(self):
        """Activate the 'show more comments' link; log when it is absent."""
        try:
            more_link = find_element_by_css_selector(
                self.driver,
                "div.cover_wrapper "
                "div[class='section _activity'] "
                "a[class='_btnShowMoreComment']", 5)
            enter_element(more_link)
        except Exception as e:
            # A missing link is not fatal; just record the event.
            printl("In click_load_more_reply_btn, there is not a link to load replies")
            printl(e)

    def has_more(self):
        """Whether the 'show more' button is displayed (style display:block)."""
        try:
            more_button = find_element_by_css_selector(
                self.driver,
                "div.cover_wrapper "
                "div[class='section _activity'] "
                "p[class='more _showMoreCommentContainer']", 5)
            # 'display:block;' shows the button; 'display:none;' hides it.
            return 'block' in more_button.get_attribute('style')
        except Exception:
            # Either loading finished or the selector no longer matches;
            # the two cases are indistinguishable here.
            return False

    # find_xxxx functions

    def find_article_id(self):
        """Return the writer id of every reply, without the site prefix."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        ids = []
        for anchor in anchors:
            href = anchor.attrs.get('href', '')
            if not href:
                continue
            href = href.replace(kakaostory_url, '')
            # An href may start with a leading '/'; drop it.
            if href.startswith('/'):
                href = href[1:]
            ids.append(href)
        return ids

    def find_article_nickname(self):
        """Return every reply writer's display name ('' when empty)."""
        anchors = self.ul.find_all('a', class_='name _namePlaceholder')
        return [anchor.text or '' for anchor in anchors]

    def find_article_data(self):
        """Return reply bodies with the per-reply metadata (div.p) stripped."""
        texts = []
        for box in self.ul.find_all('div', class_='txt'):
            if box.p:
                # box.text starts with the metadata text; slice it off.
                texts.append(box.text[len(box.p.text):].replace('\xa0', '\n'))
            else:
                texts.append(box.text or '')
        return texts

    def find_article_date(self):
        """Return each reply's timestamp string."""
        return [get_date(span) for span in self.ul.find_all('span', class_='time')]

    def find_article_like(self):
        """Return each reply's like count text ('' when empty)."""
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        return [span.text or '' for span in spans]

    def find_article_profileurl(self):
        """Return each reply writer's profile image URL ('' when absent)."""
        urls = []
        for box in self.ul.find_all('div', class_='pf'):
            if box.a and box.a.img:
                urls.append(box.a.img.attrs.get('src', ''))
            else:
                urls.append('')
        return urls

    def get(self):
        """Crawl all replies of the opened article.

        Caller must add platform_title, platform_id and platform_form
        (taken from the body record) to each returned dict.

        :return: a list of reply dicts
        """
        self.load_all_reply()
        # With everything on the page, parse it once with BeautifulSoup.
        self.set_soup_and_activity()

        ids = self.find_article_id()
        nicknames = self.find_article_nickname()
        datas = self.find_article_data()
        dates = self.find_article_date()
        profileurls = self.find_article_profileurl()
        likes = self.find_article_like()
        url = self.driver.current_url

        # An IndexError here means the per-field lists diverged; leaving
        # it unhandled (to surface the inconsistency) is intentional.
        replies = []
        for order, article_id in enumerate(ids):
            replies.append({
                'article_id': article_id,
                'article_nickname': nicknames[order],
                'article_data': datas[order],
                'article_date': dates[order],
                'article_profileurl': profileurls[order],
                'reply_url': likes[order],
                'platform_name': 'kakaostory',
                'article_form': 'reply',
                'article_url': url,
                'article_order': str(order),
            })
        return replies
|
|
|
|
|
|
class ListTraverse(object):
    """Base class for walking a Kakao Story list of sections (articles).

    Subclasses implement locating/opening sections for a concrete list
    type (hashtag feed vs. user/channel feed).
    """

    def __init__(self, driver):
        self.driver = driver
        # selenium element of the section currently being processed
        self.current_section = None

    def remove_current_section(self):
        """Delete the current section from the DOM so the next becomes first."""
        tag_name = self.current_section.tag_name
        data_model = self.current_section.get_attribute("data-model")
        # NOTE(review): data_model is interpolated unescaped into the JS
        # querySelector string; assumed never to contain quotes — confirm.
        css_selector = tag_name + "[data-model='" + data_model + "']"
        self.driver.execute_script('document.querySelector("' + css_selector + '").remove()')
        self.current_section = None

    def move_next_section(self):
        """Advance to the next section.

        After remove_current_section() the next section IS the first one,
        so this simply delegates to move_first_section().
        """
        self.move_first_section()

    def load_list_more(self):
        """Trigger the list's lazy loading by resizing and scrolling."""
        position = self.driver.get_window_position()
        size = self.driver.get_window_size()
        # Maximize then restore to force a layout refresh before scrolling.
        self.driver.maximize_window()
        self.driver.set_window_size(size['width'], size["height"])
        self.driver.set_window_position(position['x'], position['y'])
        for _ in range(2):
            self.driver.execute_script("window.scrollBy(0, -400)")
            time.sleep(0.3)
        for _ in range(4):
            self.driver.execute_script("window.scrollBy(0, 800)")
            time.sleep(0.3)

    def close_current_section(self):
        """Close the opened content layer via its X button (best effort)."""
        try:
            btn = find_element_by_css_selector(self.driver, "button._btnClose", 5)
            btn.send_keys(Keys.NULL)
            btn.send_keys(Keys.ENTER)
        except Exception as e:
            printl("There is not X button on the page")
            printl(e)

        # Press again with a short wait to verify the layer really closed.
        try:
            btn = find_element_by_css_selector(self.driver, "button._btnClose", 1)
            btn.send_keys(Keys.NULL)
            btn.send_keys(Keys.ENTER)
        except Exception:
            pass

    def get_current_section_data_model(self):
        """Return the section's data-model id, or '' when no section is set."""
        return self.current_section.get_attribute('data-model') if self.current_section else ""

    def is_loaded_body(self):
        """Check the article body layer is present; re-raise driver errors."""
        try:
            section_activity = find_element_by_css_selector(
                self.driver,
                "div.cover_wrapper div[class='section _activity']")
            return bool(section_activity)
        except WebDriverException as we:
            printl("Body is not loaded on browser : is_loaded_body")
            printl(we)
            raise

    def check_list_and_load(self):
        """Ensure the list has items, reloading up to limit_reload times.

        Fix: the previous version raised on the first failed reload, which
        made the limit_reload retry loop useless. Now it retries the full
        number of times and only raises once all attempts are exhausted.

        :raises WebDriverException: when no item appears after all attempts
        """
        for _ in range(limit_reload):
            if self.get_num_of_list():
                return
            self.load_list_more()
        if not self.get_num_of_list():
            raise WebDriverException("There is no data or ajax error")

    def move_first_section(self):
        """Point current_section at the first section (subclass specific)."""
        raise NotImplementedError

    def open_current_section(self):
        """Open the content layer of current_section (subclass specific)."""
        raise NotImplementedError

    def get_num_of_list(self):
        """Return how many sections are listed (subclass specific)."""
        raise NotImplementedError

    def get_date_of_current_section(self):
        """Return the current section's date string (subclass specific)."""
        raise NotImplementedError
|
|
|
|
|
|
class ListTag(ListTraverse):
    """Traverse the hashtag search result list.

    Flow: open url -> move_first_section -> open_current_section ->
    check date -> crawl / ignore -> close_current_section ->
    remove_current_section -> next_section -> open_current_section ...
    """

    def __init__(self, driver):
        ListTraverse.__init__(self, driver)

    def move_first_section(self):
        """Set current_section to the first recent-feed item.

        :raises Exception: re-raised when the first element is not found
            (intended: the caller's retry logic handles it)
        """
        try:
            recent_section_field = \
                find_element_by_css_selector(self.driver, "div.cont_recomm[data-part-name='recentFeeds']", 10)
            self.current_section = recent_section_field.find_element_by_css_selector('div.img_item')
        except Exception:
            printl("Do not find first recent section")
            raise

    def open_current_section(self):
        """Open the content layer by activating the section's link.

        :raises KakaoCrawlerException: when the link cannot be driven
        """
        try:
            # The link's class attribute is 'link_thumb _link' or 'link_txt _link'.
            a = self.current_section.find_element_by_css_selector("a[class$=' _link']")
            a.send_keys(Keys.NULL)
            a.send_keys(Keys.ENTER)
        except WebDriverException as we:
            printl("open_current_section error")
            printl(we)
            printl(self.current_section.get_attribute('data-model'))
            raise KakaoCrawlerException("open_current_section error")
        except Exception as e:
            printl("Unknown Occurs")
            printl(e)
            raise

    def get_date_of_current_section(self):
        """Return 'yyyy-MM-dd hh:mm:ss' of the opened content.

        :raises NotFoundElementError: when the date element is missing
        :raises NotFoundDataError: when the text holds no parsable date
        """
        try:
            div = find_element_by_css_selector(self.driver, "div.cover_wrapper")
            span = div.find_element_by_css_selector("div.add_top span.time")
            # Before mouse-over the date is in 'title'; afterwards in 'data-tooltip'.
            text_date = span.get_attribute('title') or span.get_attribute('data-tooltip')
        except WebDriverException as we:
            printl("Element is not found in get_date_of_current_section")
            printl(we)
            raise NotFoundElementError("Element is not found in get_date_of_current_section")
        except Exception as e:
            printl("Unknown Exception")
            printl(e)
            raise

        # Check the text containing date info is valid. If not, raise exception.
        if not text_date or len(text_date) <= 6:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        # Fix: was 're_date.search(text_date) or re_date.search(text_date)' —
        # the duplicated identical search has been collapsed.
        m = re_date.search(text_date)
        if not m:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                      int(m.group(5)), int(m.group(6)))
        # 12-hour clock handling, kept consistent with get_date(): shift
        # "오후" (PM) hours forward; the "오전" 12 (midnight) shift back to 0
        # was previously missing here.
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            temp_date += datetime.timedelta(hours=12)
        if m.group(4) == "오전" and int(m.group(5)) == 12:
            temp_date -= datetime.timedelta(hours=12)
        return str(temp_date)

    def get_num_of_list(self):
        """Count listed hashtag feed items."""
        items = self.driver.find_elements_by_css_selector("div[class^='img_item']")
        return len(items) if items else 0
|
|
|
|
|
|
class ListUser(ListTraverse):
    """Traverse a user's (or channel's) activity feed list."""

    def __init__(self, driver):
        ListTraverse.__init__(self, driver)

    def move_first_section(self):
        """Set current_section to the first feed section.

        :raises NotFoundElementError: when the first section is missing
        """
        try:
            recent_section_field = \
                find_element_by_css_selector(self.driver, "div.feed[data-part-name='content']", 10)
            self.current_section = recent_section_field.find_element_by_css_selector("div[class='section _activity']")
        except WebDriverException as we:
            printl("Do not find first recent section")
            printl(we)
            raise NotFoundElementError("Do not find first recent section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise

    def open_current_section(self):
        """Open the content layer via the section's time link.

        :raises NotFoundElementError: when the link cannot be driven
        """
        try:
            a = self.current_section.find_element_by_css_selector("a[class='time _linkPost']")
            a.send_keys(Keys.NULL)
            a.send_keys(Keys.ENTER)
        except WebDriverException as we:
            printl("open_current_section error")
            printl(we)
            # NOTE(review): message looks copy-pasted from move_first_section;
            # kept as-is in case anything matches on it.
            raise NotFoundElementError("Do not find first recent section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise

    def get_date_of_current_section(self):
        """Return 'yyyy-MM-dd hh:mm:ss' of the current section.

        :raises NotFoundElementError: when the date element is missing
        :raises NotFoundDataError: when the text holds no parsable date
        """
        try:
            a = self.current_section.find_element_by_css_selector("a[class='time _linkPost']")
            # Before mouse-over the date is in 'title'; afterwards in 'data-tooltip'.
            text_date = a.get_attribute('title') or a.get_attribute('data-tooltip')
        except WebDriverException as we:
            printl("Element is not found in get_date_of_current_section")
            printl(we)
            raise NotFoundElementError("Element is not found in get_date_of_current_section")
        except Exception as e:
            printl("Unknown exception occur")
            printl(e)
            raise

        # Check the text containing date info is valid. If not, raise exception.
        if not text_date or len(text_date) <= 6:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        # Fix: the duplicated 'search(...) or search(...)' has been collapsed.
        m = re_date.search(text_date)
        if not m:
            raise NotFoundDataError("Date is missing when calling get_date_of_current_section")
        temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                      int(m.group(5)), int(m.group(6)))
        # 12-hour clock handling, kept consistent with get_date(): the
        # "오전" 12 (midnight) shift back to 0 was previously missing here.
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            temp_date += datetime.timedelta(hours=12)
        if m.group(4) == "오전" and int(m.group(5)) == 12:
            temp_date -= datetime.timedelta(hours=12)
        return str(temp_date)

    def get_num_of_list(self):
        """Count listed feed sections."""
        items = self.driver.find_elements_by_css_selector("div[class='section _activity']")
        return len(items) if items else 0
|
|
|
|
|
|
class CrawlerProcess(object):
    """Drive one crawl of a single URL with retry handling.

    Subclasses implement traverse_and_crawl() for a concrete list type.
    """

    def __init__(self, driver, send_to_db, initializer, url, set_backup):
        self.driver = driver
        self.send_to_db = send_to_db
        self.initializer = initializer
        self.url = url
        # data-model ids already crawled, to skip duplicates across retries
        self.set_backup = set_backup if set_backup else set()
        self.list_traverse = None
        self.num_of_web_except = 0
        self.num_of_out_of_date = 0

    def start(self):
        """Run traverse_and_crawl(), retrying up to num_of_retry times.

        WebDriverException and unknown exceptions trigger a retry;
        KakaoCrawlerException terminates immediately (for debugging a
        site change).
        """
        while True:
            try:
                self.traverse_and_crawl()
            except WebDriverException as we:
                printl("WebDriverException occurs")
                printl(we)
                if self.num_of_web_except > num_of_retry:
                    printl("There may be no data")
                    printl("Crawling is done")
                    break
                printl("Retry :", num_of_retry - self.num_of_web_except)
                self.num_of_web_except += 1
                # Probe that the driver is still alive; if not, this raises
                # WebDriverException and aborts the retry loop.
                self.driver.get('https://www.google.com')
                wait(2)
            except KakaoCrawlerException as ke:
                # Element/data missing: terminate so the site change is noticed.
                printl("KakaoCrawlerException occur. Check kakao website")
                printl(ke)
                raise
            except Exception as e:
                printl("Unknown occurs")
                printl(e)
                if self.num_of_web_except > num_of_retry:
                    printl("Crawling is terminated by force")
                    raise
                printl("Retry :", num_of_retry - self.num_of_web_except)
                self.num_of_web_except += 1
            else:
                # No exception: the crawl completed.
                printl("Crawling is done")
                break

    def get_set_backup(self):
        """Return the set of already-crawled data-model ids."""
        return self.set_backup

    def convert_datetime_to_date(self, str_date):
        """Convert a 'yyyy-MM-dd hh:mm:ss' string to a datetime.date."""
        return datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S').date()

    def traverse_and_crawl(self):
        """Traverse the list and crawl each article (subclass specific)."""
        # Fix: this was a bare 'NotImplementedError' expression, which
        # evaluated to nothing and silently returned None; it must be raised.
        raise NotImplementedError

    def is_terminate(self):
        """Count one more out-of-date section; True once over limit_reload."""
        self.num_of_out_of_date += 1
        return self.num_of_out_of_date > limit_reload
|
|
|
|
|
|
class UserProcess(CrawlerProcess):
    # Crawls a user/channel feed: the date is readable from the list itself,
    # so out-of-range sections are skipped without opening them.

    def __init__(self, driver, send_to_db, initializer, url, set_backup=None):
        CrawlerProcess.__init__(self, driver, send_to_db, initializer, url, set_backup)
        self.list_traverse = ListUser(driver)

    # move url -> check list and load -> move first section ->
    # loop: check date, open content, check body and crawling ->
    # close content -> remove current section -> check list and load -> move next
    def traverse_and_crawl(self):
        """Traverse the feed list and crawl each article inside the date window.

        :raises WebDriverException: when the list cannot be (re)loaded
        :raises Exception: "Nobody Nobody" when a body crawl yields nothing
        """
        self.driver.get(self.url)
        self.list_traverse.check_list_and_load()
        self.list_traverse.move_first_section()

        self.num_of_out_of_date = 0
        # begin_day and end_day type is datetime.date
        begin_day = self.initializer.get_begin_day()
        end_day = self.initializer.get_end_day()

        while True:
            cs_date = self.convert_datetime_to_date(self.list_traverse.get_date_of_current_section())
            if cs_date > end_day:
                # Too new: skip and keep walking towards the date window.
                printl(str(cs_date), ": continue")
            elif cs_date < begin_day:
                # Too old: stop after this happens more than limit_reload times.
                if self.is_terminate():
                    break
            else:
                current_section_data_model = self.list_traverse.get_current_section_data_model()

                # Skip sections already crawled in a previous attempt.
                if current_section_data_model not in self.set_backup:
                    self.set_backup.add(current_section_data_model)
                    self.list_traverse.open_current_section()

                    if self.list_traverse.is_loaded_body():
                        body_crawler = BodyCrawler(self.driver)
                        body = body_crawler.get()
                        if body:
                            body['keyword_id'] = self.initializer.keyword_id()
                            printl(body['article_url'])
                            # Replace any previous record for this URL.
                            self.send_to_db.delete_url(body['article_url'])
                            self.send_to_db.send_body(body)

                            reply_crawler = ReplyCrawler(self.driver)
                            replies = reply_crawler.get()

                            # if reply exists in replies variable
                            if replies:
                                # put platform_name, platform_form, platform_id to dict of list
                                for reply in replies:
                                    reply['platform_id'] = body['platform_id']
                                    reply['platform_name'] = body['platform_name']
                                    reply['platform_form'] = body['platform_form']
                                self.send_to_db.send_reply(replies)
                            printl('ok')
                        else:
                            raise Exception("Nobody Nobody")
                    self.list_traverse.close_current_section()
            # Drop the processed section and advance to the next one.
            self.list_traverse.remove_current_section()
            if not self.list_traverse.get_num_of_list():
                self.list_traverse.check_list_and_load()
            self.list_traverse.move_next_section()
|
|
|
|
|
|
class TagProcess(CrawlerProcess):
    # Crawls a hashtag result list: the date is only visible after opening
    # the content, so every section is opened before the date check.

    def __init__(self, driver, send_to_db, initializer, url, set_backup=None):
        CrawlerProcess.__init__(self, driver, send_to_db, initializer, url, set_backup)
        self.list_traverse = ListTag(driver)

    # move url -> check list and load -> move first section ->
    # loop: open content, check body content and date, and crawling ->
    # close content -> remove current section -> check list and load -> move next
    def traverse_and_crawl(self):
        """Traverse the hashtag list and crawl each article inside the date window.

        :raises WebDriverException: when the list cannot be (re)loaded
        :raises Exception: "Nobody Nobody" when a body crawl yields nothing
        """
        self.driver.get(self.url)
        self.list_traverse.check_list_and_load()
        self.list_traverse.move_first_section()

        self.num_of_out_of_date = 0
        # begin_day and end_day type is datetime.date
        begin_day = self.initializer.get_begin_day()
        end_day = self.initializer.get_end_day()

        while True:
            # Unlike UserProcess, the section must be opened before the
            # article date can be read.
            self.list_traverse.open_current_section()
            if self.list_traverse.is_loaded_body():
                cs_date = self.convert_datetime_to_date(self.list_traverse.get_date_of_current_section())
                if cs_date > end_day:
                    # Too new: skip and keep walking towards the date window.
                    printl(str(cs_date), ": continue")
                elif cs_date < begin_day:
                    # Too old: stop after this happens more than limit_reload times.
                    if self.is_terminate():
                        break
                else:
                    current_section_data_model = self.list_traverse.get_current_section_data_model()

                    # Skip sections already crawled in a previous attempt.
                    if current_section_data_model not in self.set_backup:
                        self.set_backup.add(current_section_data_model)
                        body_crawler = BodyCrawler(self.driver)
                        body = body_crawler.get()
                        printl(body['article_url'])
                        if body:
                            body['keyword_id'] = self.initializer.keyword_id()
                            # Replace any previous record for this URL.
                            self.send_to_db.delete_url(body['article_url'])
                            self.send_to_db.send_body(body)

                            reply_crawler = ReplyCrawler(self.driver)
                            replies = reply_crawler.get()

                            # if reply exists in replies variable
                            if replies:
                                # put platform_name, platform_form, platform_id to dict of list
                                for reply in replies:
                                    reply['platform_id'] = body['platform_id']
                                    reply['platform_name'] = body['platform_name']
                                    reply['platform_form'] = body['platform_form']
                                self.send_to_db.send_reply(replies)
                            printl('ok')
                        else:
                            raise Exception("Nobody Nobody")
            # Close the opened layer, drop the section and advance.
            self.list_traverse.close_current_section()
            self.list_traverse.remove_current_section()
            if not self.list_traverse.get_num_of_list():
                self.list_traverse.check_list_and_load()
            self.list_traverse.move_next_section()
|
|
|
|
|
|
class KakaoMainCrawler:
    """Top-level entry point: wires DB sender, crawl config and browser."""

    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = KakaoInit()
        self.browser = Browser()
        self.driver = None

    def set_driver(self, driver):
        """Replace the active webdriver."""
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        """Set keyword_id directly without loading crawl parameters."""
        self.keyword_id = keyword_id

    def start(self):
        """Run the crawler (alias for crawl_start)."""
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Initialize every runtime argument in one call."""
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        """Create and register a new driver for the given browser name."""
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        """Store keyword_id (coerced to int) and load its crawl parameters."""
        # int() accepts both int and numeric str; replaces the previous
        # verbose type(...) != int branch with identical results.
        self.keyword_id = int(keyword_id)
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        """Select the target database."""
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        """Set how many days back the crawl window starts."""
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        """Set the page limit for the crawl."""
        self.crawl_init.set_until_page(until_page)

    def crawl_start(self):
        """Crawl every configured URL; repeat forever in realtime mode."""
        real_time = True
        while real_time:
            printl("Crawler Start")
            # The original advanced an index in both the try and except
            # paths, i.e. exactly once per URL — a plain for loop.
            for url in self.crawl_init.make_url():
                try:
                    printl(url, "\n")
                    # Hashtag URLs use tag traversal; everything else is
                    # treated as a user/channel feed.
                    if 'https://story.kakao.com/hashtag/' in url:
                        kakao_process = TagProcess(self.driver, self.send_to_db, self.crawl_init,
                                                   url)
                    else:
                        kakao_process = UserProcess(self.driver, self.send_to_db, self.crawl_init,
                                                    url)
                    kakao_process.start()
                except Exception as e:
                    logging.info(e)
                    # check for exception
                    # self.driver.quit()
                    # Restart the browser and continue with the next URL.
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
            printl("Finished Crawling :)")

        self.send_to_db.close()
        self.driver.quit()
|
|
|