instagram, kakaostory effect 추가
git-svn-id: svn://192.168.0.12/source@308 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
145
WebBasedCrawler/effect/effecterror.py
Normal file
145
WebBasedCrawler/effect/effecterror.py
Normal file
@@ -0,0 +1,145 @@
|
||||
# Numeric error codes for the EffectException hierarchy below.
# They index into error_message and error_message_code, so the three
# structures must stay in lockstep.
DB_OPEN_ERROR = 0
DB_FULL_ERROR = 1
DB_LONG_QUERY_ERROR = 2
DB_QUERY_ERROR = 3
DB_UNKNOWN_ERROR = 4
INTERNET_ERROR = 5
OUT_DATED_CRAWLER = 6
DELETED_URL_ERROR = 7
BLOCK_ERROR = 8
TIMEOUT = 9
NO_PROGRAM = 10
UNKNOWN_ERROR = 11

# Human-readable names, indexed by the numeric codes above.
error_message = [
    "DB_OPEN_ERROR",
    "DB_FULL_ERROR",
    "DB_LONG_QUERY_ERROR",
    "DB_QUERY_ERROR",
    "DB_UNKNOWN_ERROR",
    "INTERNET_ERROR",
    "OUT_DATED_CRAWLER",
    "DELETED_URL_ERROR",
    "BLOCK_ERROR",
    "TIMEOUT",
    "NO_PROGRAM",
    "UNKNOWN_ERROR",
]

# Short wire codes reported to the result server, indexed by the numeric codes.
error_message_code = [
    "e000",
    "e001",
    "e002",
    "e003",
    "e004",
    "e005",
    "e006",
    "e007",
    "e008",
    "e009",
    "e010",
    "e011",
]

# Token that joins the wire code and message in EffectException.__str__ so the
# receiver can split them again.
# NOTE(review): spelling ("SEPERATOR") kept as-is; it is part of the module API.
SEPERATOR = '!@#'
|
||||
|
||||
|
||||
class EffectException(Exception):
    """Base class for crawler 'effect' errors.

    Carries a numeric error code (an index into the module-level
    error_message_code list) plus an optional human-readable message.
    __str__ renders "<code>!@#<message>" so the receiver can split the two
    parts on SEPERATOR.
    """

    def __init__(self, error_no, msg='', *args, **kwargs):
        self.error_no = error_no
        # Raises IndexError for out-of-range codes; callers are expected to
        # pass one of the module-level constants.
        self.error_message_code = error_message_code[self.error_no]
        self.msg = msg
        Exception.__init__(self, *args, **kwargs)

    def __str__(self):
        # Bug fix: the original caught the exception, printed it, and then
        # executed `return s` with `s` unbound, raising UnboundLocalError.
        # Fall back to the bare wire code instead.
        try:
            return self.error_message_code + SEPERATOR + self.msg
        except Exception as e:
            print(e)
            return self.error_message_code
|
||||
|
||||
|
||||
class DBOpenError(EffectException):
    """Raised when the result database cannot be opened."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(DB_OPEN_ERROR, msg, *args, **kwargs)


class DBFullError(EffectException):
    """Raised when the result database is out of space."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(DB_FULL_ERROR, msg, *args, **kwargs)


class DBLongQueryError(EffectException):
    """Raised when a query exceeds the allowed length/time."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(DB_LONG_QUERY_ERROR, msg, *args, **kwargs)


class DBQueryError(EffectException):
    """Raised when an insert/select against the result database fails."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(DB_QUERY_ERROR, msg, *args, **kwargs)


class DBUnknownError(EffectException):
    """Raised for database failures that fit no other category."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(DB_UNKNOWN_ERROR, msg, *args, **kwargs)


class InternetError(EffectException):
    """Raised when the network itself is unreachable."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(INTERNET_ERROR, msg, *args, **kwargs)


class OutDatedCrawler(EffectException):
    """Raised when the target site changed and the crawler no longer matches it."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(OUT_DATED_CRAWLER, msg, *args, **kwargs)


class DeletedUrlError(EffectException):
    """Raised when the target URL has been removed (e.g. HTTP 404)."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(DELETED_URL_ERROR, msg, *args, **kwargs)


class BlockError(EffectException):
    """Raised when the target site blocked the crawler."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(BLOCK_ERROR, msg, *args, **kwargs)


class TimeOutError(EffectException):
    """Raised when a crawl step exceeded its time budget."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(TIMEOUT, msg, *args, **kwargs)


class NoProgramError(EffectException):
    """Raised when a required external program is missing."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(NO_PROGRAM, msg, *args, **kwargs)


class UnknownError(EffectException):
    """Raised for failures that fit no other category."""

    def __init__(self, msg='', *args, **kwargs):
        super().__init__(UNKNOWN_ERROR, msg, *args, **kwargs)
|
||||
|
||||
|
||||
213
WebBasedCrawler/effect/effectinstagram.py
Normal file
213
WebBasedCrawler/effect/effectinstagram.py
Normal file
@@ -0,0 +1,213 @@
|
||||
import re
|
||||
|
||||
import datetime
|
||||
import json
|
||||
import requests
|
||||
import requests.exceptions
|
||||
import time
|
||||
|
||||
import insta.instaheaders as instaheaders
|
||||
import insta.instaparser as instaparser
|
||||
from effect.resultsender import ResultSender
|
||||
from effect.resultsender import get_settings
|
||||
|
||||
from base.baseclasses import printl
|
||||
from base.baseclasses import wait
|
||||
|
||||
import effect.effecterror
|
||||
|
||||
# Debug switches; is_debuging gates the verbose request/response logging in
# InstaContent.log_* below.
is_debuging = False
is_debug = True

# Token joining error code and message (mirrors effect.effecterror.SEPERATOR).
SEPERATOR = '!@#'

# Crawl tuning knobs.
num_of_list_ajax = 24        # posts requested per list ajax call
num_of_reply_ajax = 100      # comments requested per reply ajax call
list_wait_sec = 0.9          # delay between list page fetches
body_wait_sec = 0.5          # delay between body fetches
reply_wait_sec = 0.8         # delay between reply fetches
num_of_page_down = 20
num_of_content_process = 10
requests_timeout = 60        # seconds; also the streaming read budget in requests_get()
num_of_retry_proxy = 5

# Instagram endpoints.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
|
||||
|
||||
|
||||
def requests_get(req, timeout=requests_timeout):
    """Read the body of a streamed requests response under a wall-clock budget.

    :param req: a requests.Response opened with stream=True
    :param timeout: maximum seconds to spend reading before giving up
    :return: the full body as bytes
    :raises Exception: with message "timeout" when the budget is exceeded
            (the response is closed first)
    """
    deadline = time.time() + timeout
    chunks = []
    for piece in req.iter_content(1024):
        chunks.append(piece)
        if time.time() > deadline:
            req.close()
            raise Exception("timeout")
    return b''.join(chunks)
|
||||
|
||||
|
||||
class InstaContent:
    """Fetches one Instagram post (body + comments) over plain HTTP.

    Cookies returned by each response are accumulated in self.cookies and
    replayed on subsequent requests; comment pages are fetched via the
    /query/ ajax endpoint using the cursor returned by the parser.
    """

    def __init__(self, url, cookies, referer, proxies=None):
        self.__r = None              # last requests.Response
        self.__referer = ''          # referer replayed on ajax calls
        self.__code = ''             # shortcode extracted from the post URL
        self.body = None             # parsed body dict (from instaparser)
        self.reply = []              # parsed comment dicts
        self.start_cursor = None     # pagination cursor for older comments
        self.has_previous = False    # True while older comment pages remain
        self.cookies = {}
        self.proxies = proxies
        # Fetch immediately; __init__ performs network I/O.
        self.load_url(url, cookies, referer, self.proxies)

    def load_url(self, url, cookies, referer, proxies):
        """Download and parse the post HTML; returns (body, reply)."""
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        # Stream the body with a hard time budget before checking the status.
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        # self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()
        return self.body, self.reply

    def get_body(self):
        """Return the parsed body dict."""
        return self.body

    def get_reply(self):
        """Return the most recently parsed page of comments."""
        return self.reply

    def load_reply_more(self):
        """Fetch the next (older) page of comments via the ajax endpoint.

        Updates self.reply, self.start_cursor and self.has_previous;
        returns only the newly fetched page.
        """
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        self.log_load_reply_more_before(form_data, headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
                                 timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        # self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(content)
        self.__r.close()
        self.log_load_reply_more_after()
        return self.reply

    def get_cookies(self):
        """Return the accumulated cookie jar (a plain dict)."""
        return self.cookies

    def __get_code(self, url):
        """Extract the post shortcode from an /p/<code>/ URL."""
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        # Merge into the accumulated jar; later responses overwrite older values.
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_proxy(self):
        """Return the proxies mapping passed to requests (may be None)."""
        return self.proxies

    def log_load_reply_more_before(self, form_data, headers):
        # Debug-only dump of the outgoing ajax request.
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            printl('start_cursor = ' + self.start_cursor)
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_reply_more_after(self):
        # Debug-only dump of the ajax response state.
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("<ContentReply End>")
|
||||
|
||||
|
||||
class EffectInsta(object):
    """Crawl one Instagram post and store engagement statistics.

    start() fetches the body and all comment pages, aggregates them via
    statistics(), and inserts the result into the stats_s1_effect table.
    All failures are re-raised as effect.effecterror categories.
    """

    def __init__(self, event_num, event_code, url):
        self.event_num = event_num
        self.event_code = event_code
        self.url = url

    def start(self):
        """Crawl, aggregate, and persist; raises EffectException subclasses."""
        try:
            content = InstaContent(self.url, {}, self.url)
            body = content.get_body()
            replies = content.get_reply()
            # Older pages are prepended so the final list is chronological.
            while content.has_previous:
                replies = content.load_reply_more() + replies
                wait(2)
        except requests.exceptions.HTTPError as e:
            # 4xx/5xx usually means the post was removed.
            raise effect.effecterror.DeletedUrlError(str(e))
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))

        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")

        try:
            result = self.statistics(body, replies)
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))

        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')

        try:
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """Build the row inserted into stats_s1_effect.

        :param body: parsed body dict ('article_hit', 'article_order',
                     'reply_url', 'article_id', 'article_date', ...)
        :param replies: list of parsed comment dicts
        :return: dict of counters keyed by column name
        """
        result = {}
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        # Bug fix: the original wrote int(body.get('key'), 0), passing 0 as
        # int()'s *base* argument; a missing key then produced int(None, 0)
        # and crashed with TypeError. The default belongs inside get().
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """Number of distinct commenters, excluding the post author."""
        set_reply_id = set()
        for i in replies:
            set_reply_id.add(i.get('article_id', ''))
        return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """JSON mapping 'YYYYMMDD' -> reply count, from posting date to today.

        Every day in the range is present (0 when silent); replies dated
        outside the range are ignored.
        """
        start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)

        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            if reply_date in date_dict:
                date_dict[reply_date] = date_dict[reply_date] + 1
        return json.dumps(date_dict, sort_keys=True)
|
||||
|
||||
485
WebBasedCrawler/effect/effectkakaostory.py
Normal file
485
WebBasedCrawler/effect/effectkakaostory.py
Normal file
@@ -0,0 +1,485 @@
|
||||
import datetime
|
||||
import json
|
||||
import effect.effecterror
|
||||
import re
|
||||
|
||||
from kakao.kakaoexception import NotFoundElementError
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from base.baseclasses import wait
|
||||
from effect.resultsender import get_settings
|
||||
from effect.resultsender import ResultSender
|
||||
from base.baseclasses import find_element_by_css_selector
|
||||
from base.baseclasses import enter_element
|
||||
|
||||
# Prefer the faster lxml parser when installed; BeautifulSoup falls back to
# the built-in html.parser otherwise.
try:
    import lxml
    parser_opt = 'lxml'
except ImportError:
    parser_opt = 'html.parser'

# Token joining error code and message (mirrors effect.effecterror.SEPERATOR).
SEPERATOR = '!@#'

# KakaoStory endpoints; article ids are hrefs relative to these.
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
limit_reload = 5     # max page reloads before giving up
num_of_retry = 3     # retry budget for flaky element lookups

# Parses the Korean timestamp tooltip into groups:
# (year, month, day, am/pm word, hour, minute).
re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})"
                     "[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
|
||||
|
||||
|
||||
def get_date(element):
    """Extract a timestamp from a KakaoStory span.time element.

    :param element: this may be span.time element
    :return: 'yyyy-MM-dd hh:mm:ss', or '0000-00-00 00:00:00' when no
             parsable date is found
    """
    # Before mouse-over the date string is in the 'title' attribute;
    # after mouse-over it moves to 'data-tooltip' — try both.
    m = re_date.search(element.attrs.get('title', '')) \
        or re_date.search(element.attrs.get('data-tooltip', ''))

    if m:
        temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                      int(m.group(5)), int(m.group(6)))
        # add 12 hour when the article is written at p.m
        # NOTE(review): '오전 12시' (12 a.m.) would be kept as hour 12 rather
        # than converted to 0 — confirm whether the site renders midnight that way.
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            temp_date += datetime.timedelta(hours=12)

        # convert datetime.datetime to str
        return str(temp_date)
    # return invalid date instead of exception
    else:
        # raise NotFoundElementError("get_date exception")
        return "0000-00-00 00:00:00"
|
||||
|
||||
|
||||
class BodyCrawler(object):
    """Scrape the main article of a loaded KakaoStory page.

    Takes a selenium driver whose current page is the article; parses
    driver.page_source with BeautifulSoup and exposes find_* accessors plus
    get(), which assembles the full body dict.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None              # BeautifulSoup of driver.page_source
        self.section_activity = None  # div.section._activity root element
        self.set_soup_and_activity()
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")

    # calling point may differ
    def set_soup_and_activity(self):
        """(Re)parse the current page source and cache the activity root."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity. But element we use is in div.cover_wrapper
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')

    def find_article_id(self):
        """Author id: the profile href with the site prefix and leading '/' stripped."""
        a = self.section_activity.find('a', class_='pf_name')
        href = a.attrs['href'].replace('https://story.kakao.com/', '')
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Author display name from the profile link text."""
        a = self.section_activity.find('a', class_='pf_name')
        return a.text

    def find_article_url(self):
        # in chrome, current_url is equal to article_url
        # need to check other browser
        return self.driver.current_url

    def find_article_modified_date(self):
        """Modified timestamp as 'yyyy-MM-dd hh:mm:ss', or None when absent."""
        # get DOM about modified date
        times = None
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')

        # written time is default. if the article was modified, modified time is added.
        # so if length of times is not equal to 2, there is only written time.
        if not times or len(times) < 2:
            return None

        # times[0] : written time, times[1] : modified time
        # times[1] structure : <span><span ...> </span></span>
        # check times[1].span exists
        if times[1].span:

            # before mouse over the element(tooltip), the date string is in the title attribute of span
            # after mouse over the element(tooltip), the date string is in the data-tooltip attribute of span
            m = re_date.search(times[1].span.attrs.get('title', '')) \
                or re_date.search(times[1].span.attrs.get('data-tooltip', ''))

            if m:
                temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                              int(m.group(5)), int(m.group(6)))
                # add 12 hour when the article is written at p.m
                if m.group(4) == "오후" and int(m.group(5)) < 12:
                    temp_date += datetime.timedelta(hours=12)

                # convert datetime.datetime to str
                return str(temp_date)
            else:
                # raise NotFoundDataError('data for find_article_modified is not found')
                return None

        # return None instead of exception.
        else:
            # raise NotFoundElementError('find_article_modified DOM is missing')
            return None

    def find_article_date(self):
        """Written timestamp as 'yyyy-MM-dd hh:mm:ss' ('0000-00-00 00:00:00' on parse failure)."""
        # modified date is a higher priority than written date

        # modified_date = self.find_article_modified_date()
        # if modified_date:
        #     return modified_date

        times = None
        # get DOMs about date
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        else:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")

        # before mouse over the element(tooltip), the date string is in the title attribute of span
        # after mouse over the element(tooltip), the date string is in the data-tooltip attribute of span
        m = re_date.search(times[0].attrs.get('title', '')) \
            or re_date.search(times[0].attrs.get('data-tooltip', ''))

        if m:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                          int(m.group(5)), int(m.group(6)))
            # add 12 hour when the article is written at p.m
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)

            # convert datetime.datetime to str
            return str(temp_date)
        # return invalid date instead of exception
        else:
            # raise NotFoundElementError("find_article_date exception")
            return "0000-00-00 00:00:00"

    def find_article_profileurl(self):
        """Author profile-image URL, or '' when missing (not essential)."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        # check a>img
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        # this is not essential, so no exception occur
        else:
            return ''

    def find_article_data(self):
        """
        :return: trimmed article_data ('' when the text wrapper is absent)
        """
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            # trim
            return content.text.strip().replace('\xa0', '\n')
        # if there is no content or text, return empty data
        else:
            return ''

    def find_article_title(self):
        """Channel title when present, else the first line of the article text (max 30 chars)."""
        # strong.tit_channel is title of channel
        # if strong.tit_channel do not exist,
        # title is first line of article_data
        # this definition is determined by me -_-
        # find_article_data return trimmed string
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')

        article_data = self.find_article_data()
        if article_data:
            for line in article_data.splitlines():
                # limit title length
                return line[0:30] if len(line) > 30 else line
        else:
            return ''

    def find_article_etc(self, class_name):
        """
        this function is used for crawling number of shares, replies and feelings
        :param class_name: CSS class of the strong element holding the counter
        :return: a string of number of shares, replies, or feelings ('0' when absent)
        """
        element = self.section_activity.find('strong', class_=class_name)

        # check element has text that indicate the number
        if element and element.text:
            # It may contain comma ',' to recognize easily
            # Remove comma ',' to convert from str to int
            txt = element.text.replace(',', '')
            return txt
        # if there is no element or text, return '0' instead of raising exception
        else:
            # raise NotFoundElementError('find_article_etc is not Found element with ' + class_name)
            return '0'

    def find_article_share(self):
        """Share count as a string."""
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        """Like/feeling count as a string."""
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        """Comment count as a string."""
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        """'channel' for ch/... article ids, otherwise 'story'."""
        article_id = self.find_article_id()
        return 'channel' if article_id.startswith('ch/') else 'story'

    def get(self):
        """
        you need to put 'keyword_id'
        :return: dict for crawled body content
        """
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        # counters are repurposed into generic column names:
        # article_order = comment count, article_parent = share count,
        # reply_url = like count
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content
|
||||
|
||||
|
||||
class ReplyCrawler(object):
    """Scrape all comments of a loaded KakaoStory article.

    Uses selenium to repeatedly click the "more comments" link until every
    comment is present, then parses the final DOM with BeautifulSoup.
    get() is the entry point; the find_* methods each return one column as a
    list aligned by comment index.
    """

    def __init__(self, driver):
        self.driver = driver
        self.soup = None              # set by set_soup_and_activity()
        self.section_activity = None  # div.section._activity root
        self.ul = None                # ul.list._listContainer holding comments
        self.lis = None

    def set_soup_and_activity(self):
        """Parse the current page source and cache the comment container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity. But a element we use is in div.cover_wrapper
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Click 'show more' until no further comments load."""
        previous_num_of_replies = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # check the number of replies before and after click_load_more_reply_btn()
            # If These were equal, the link or ajax failed
            current_num_of_replies = self.get_num_of_replies()
            if previous_num_of_replies == current_num_of_replies:
                break
            previous_num_of_replies = current_num_of_replies

    def get_num_of_replies(self):
        """Count currently loaded comment <li> elements (0 when none/missing)."""
        # Find ul element that contains replies
        # if raise occur, there is no reply
        # for performance, this method may is implemented using bs4
        try:
            ul = find_element_by_css_selector(self.driver,
                                              #"div.cover_wrapper "
                                              "div[class='section _activity'] "
                                              "ul[class='list _listContainer']", 5)
            li = ul.find_elements_by_tag_name('li')
            return len(li)
        except Exception as e:
            return 0

    def click_load_more_reply_btn(self):
        """Activate the 'show more comments' link; silently no-ops when absent."""
        try:
            # find a link to load reply and click/enter it
            a = find_element_by_css_selector(self.driver,
                                             #"div.cover_wrapper "
                                             "div[class='section _activity'] "
                                             "a[class='_btnShowMoreComment']", 5)
            enter_element(a)

        # no link is in the browser. Nothing happens instead raise exception. But log this event
        except Exception as e:
            pass
            # printl("In click_load_more_reply_btn, there is not a link to load replies")
            # printl(e)

    def has_more(self):
        """True while the 'show more' button is visible (style contains 'block')."""
        # In the case that raise exception,
        # there is no more reply or css selector of the show_more is invalid
        # These two case can't be classified by exception because the logic is same
        try:
            # find show_more element
            show_more = find_element_by_css_selector(self.driver,
                                                     # "div.cover_wrapper "
                                                     "div[class='section _activity'] "
                                                     "p[class='more _showMoreCommentContainer']", 5)

            # 'display:block;' -> display the button, 'display:none;' -> hide the button
            if 'block' in show_more.get_attribute('style'):
                return True
            else:
                return False
        # return False in the two case
        # First case is that loading replies is finished
        # Second case is that css selector to find element is invalid
        except Exception as e:
            return False

    # find_xxxx functions

    def find_article_id(self):
        """Commenter ids (profile hrefs, site prefix and leading '/' stripped)."""
        # Find name placeholder
        divs = self.ul.find_all('a', class_='name _namePlaceholder')
        # Get article_ids and remove kakaostory url in article_id
        article_ids = [div.attrs.get('href', '').replace(kakaostory_url, '')
                       for div in divs if div.attrs.get('href', '')]
        # Refine hrefs. Href may start with '/'
        article_id = map(lambda x: x[1:] if x.startswith('/') else x, article_ids)
        # Return list because of unification of types
        return list(article_id)

    def find_article_nickname(self):
        """Commenter display names ('' when missing)."""
        divs = self.ul.find_all('a', class_='name _namePlaceholder')
        # If div.text exist, return div.text. Otherwise return empty string
        return [div.text if div.text else '' for div in divs]

    def find_article_data(self):
        """Comment texts with the leading metadata (div.p) stripped."""
        divs = self.ul.find_all('div', class_='txt')
        # The div.text has meta-data in div.p.text. If meta-data exists, remove it
        # When element does not exists, return empty string
        return [div.text[len(div.p.text):].replace('\xa0', '\n')
                if div.p else div.text if div.text else '' for div in divs]

    def find_article_date(self):
        """Comment timestamps via get_date()."""
        divs = self.ul.find_all('span', class_='time')
        return list(map(get_date, divs))

    def find_article_like(self):
        """Per-comment like counts as strings ('' when missing)."""
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        # The number of like exists in span.like_num _likeCommentCount Unless it is present
        return [span.text if span.text else '' for span in spans]

    def find_article_profileurl(self):
        """Commenter profile-image URLs ('' when missing)."""
        divs = self.ul.find_all('div', class_='pf')
        return list(map(lambda div: div.a.img.attrs.get('src', '') if div.a and div.a.img else '', divs))

    def get(self):
        """
        Need to put platform_title, platform_id, platform_form from body
        :return: a list of replies. Need to put platform_title, platform_id
        """
        # load all replies
        self.load_all_reply()

        # After loading all replies, crawl replies using BeautifulSoup
        self.set_soup_and_activity()

        article_ids = self.find_article_id()
        article_nicknames = self.find_article_nickname()
        article_datas = self.find_article_data()
        article_dates = self.find_article_date()
        article_profileurls = self.find_article_profileurl()
        article_likes = self.find_article_like()
        article_url = self.driver.current_url

        replies = []
        # This may occur exception when indices of each elements is not matched
        # This exception described above is intended
        for i in range(len(article_ids)):
            reply = dict()
            reply['article_id'] = article_ids[i]
            reply['article_nickname'] = article_nicknames[i]
            reply['article_data'] = article_datas[i]
            reply['article_date'] = article_dates[i]
            reply['article_profileurl'] = article_profileurls[i]
            reply['reply_url'] = article_likes[i]
            reply['platform_name'] = 'kakaostory'
            reply['article_form'] = 'reply'
            reply['article_url'] = article_url
            reply['article_order'] = str(i)
            replies.append(reply)
        return replies
|
||||
|
||||
|
||||
class EffectKakaostory(object):
    """Crawl one KakaoStory article via a selenium driver and store statistics.

    start() loads the page, scrapes body and all comments, aggregates them via
    statistics(), and inserts the result into stats_s1_effect. All failures
    are re-raised as effect.effecterror categories.
    """

    def __init__(self, event_num, event_code, url, driver):
        self.event_num = event_num
        self.event_code = event_code
        self.url = url
        self.driver = driver

    def start(self):
        """Crawl, aggregate, and persist; raises EffectException subclasses."""
        try:
            self.driver.get(self.url)
            wait(3)
            body_crawler = BodyCrawler(self.driver)
            reply_crawler = ReplyCrawler(self.driver)
            body = body_crawler.get()
            replies = reply_crawler.get()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))

        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")

        try:
            result = self.statistics(body, replies)
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))

        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')

        try:
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """Build the row inserted into stats_s1_effect.

        :param body: body dict from BodyCrawler.get()
        :param replies: list of reply dicts from ReplyCrawler.get()
        :return: dict of counters keyed by column name
        """
        result = {}
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        # Bug fix: the original wrote int(body.get('key'), 0), passing 0 as
        # int()'s *base* argument; a missing key then produced int(None, 0)
        # and crashed with TypeError. The default belongs inside get().
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """Number of distinct commenters, excluding the article author."""
        set_reply_id = set()
        for i in replies:
            set_reply_id.add(i.get('article_id', ''))
        return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """JSON mapping 'YYYYMMDD' -> reply count, from posting date to today.

        Every day in the range is present (0 when silent); replies dated
        outside the range are ignored.
        """
        start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)

        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            if reply_date in date_dict:
                date_dict[reply_date] = date_dict[reply_date] + 1
        return json.dumps(date_dict, sort_keys=True)
|
||||
|
||||
|
||||
|
||||
88
WebBasedCrawler/effect/resultsender.py
Normal file
88
WebBasedCrawler/effect/resultsender.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import re
|
||||
import configparser
|
||||
|
||||
class ResultSender:
    """Inserts crawled statistics rows into the MySQL result database."""

    # __import__('pymysql.cursors') returns the *top-level* pymysql package
    # (with the cursors submodule loaded as a side effect), so both
    # self.pymysql.connect and self.pymysql.cursors work below.
    pymysql = __import__('pymysql.cursors')
    # Matches emoji code points that a 3-byte 'utf8' MySQL charset cannot store.
    re_emoji = re.compile(u'[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF]+', re.UNICODE)

    # NOTE(review): hard-coded fallback credentials; production callers pass
    # values read from effect.ini via get_settings().
    def __init__(self, host='182.162.171.147', user='admin', passwd='admin123', db='bigbird'):
        self.host = host
        self.user = user
        self.passwd = passwd
        self.db = db
        self.conn = None  # pymysql connection, created lazily by connect()

    def connect(self):
        """Open the MySQL connection (utf8 charset, DictCursor rows)."""
        self.conn = self.pymysql.connect(host=self.host,
                                         user=self.user,
                                         passwd=self.passwd,
                                         db=self.db,
                                         charset='utf8',
                                         cursorclass=self.pymysql.cursors.DictCursor)

    def close(self):
        """Close the connection if one was opened."""
        if self.conn:
            self.conn.close()

    def _make_query(self, table_name, dictionary):
        """Build an INSERT statement; dictionary keys become column names.

        Non-int values are emoji-stripped and escaped with conn.escape().
        NOTE(review): the table and column names are concatenated unescaped —
        callers must never pass untrusted keys or table names.
        """
        query = "insert into " + str(table_name) + " ("
        key_list = list()
        val_list = list()
        for key, val in dictionary.items():
            key_list.append(key)
            if type(val) == int:
                val_list.append(str(val))
            else:
                val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
        return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")"  # + " on duplicate key update " + \
        # ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))

    def send(self, table_name, dictionary):
        """Insert one row built from *dictionary* into *table_name*."""
        query = self._make_query(table_name, dictionary)
        self._exec_query(query)

    def _exec_query(self, query):
        # Reconnect transparently if the server dropped the connection.
        if not self.conn.open:
            self.connect()
        with self.conn.cursor() as cursor:
            cursor.execute(query)
        self.conn.commit()
|
||||
|
||||
|
||||
def get_settings(option='database', filename='effect.ini'):
    """Read one section of an ini file and return it as a configparser section.

    Only the lines belonging to [option] are fed to configparser, so the rest
    of the file may be malformed without breaking the parse.

    :param option: section name to extract
    :param filename: path of the ini file (utf8)
    :return: the section proxy, or None when the file cannot be read or the
             section is missing
    """
    try:
        with open(filename, encoding='utf8') as f:
            file_content = f.readlines()
    # Bug fix: the original bare 'except:' also swallowed KeyboardInterrupt
    # and SystemExit; only file-access failures should yield None.
    except OSError:
        return None

    # Locate the '[option]' header, skipping commented-out lines.
    start = None
    for i, raw_line in enumerate(file_content):
        line_trimmed = raw_line.strip()
        if line_trimmed.startswith('#'):
            continue
        if line_trimmed.startswith('[') and line_trimmed.endswith(']') and line_trimmed[1:-1] == option:
            start = i
            break
    # Bug fix: the original left start at 0 when the section was missing,
    # parsed an unrelated slice, and then raised KeyError on cg[option].
    if start is None:
        return None

    # The section ends just before the next section header, even when that
    # header has been commented out ('#[...]' — configparser ignores it,
    # but its keys below must not leak into this section).
    # Bug fix: the original indexed line_trimmed[1], which raised IndexError
    # on a bare '#' line.
    end = len(file_content)
    for i in range(start + 1, len(file_content)):
        line_trimmed = file_content[i].strip()
        if line_trimmed.startswith('[') or line_trimmed.startswith('#['):
            end = i
            break

    cg = configparser.ConfigParser()
    cg.read_string(''.join(file_content[start:end]))
    # The header matched option exactly above, so this lookup cannot fail.
    return cg[option]
|
||||
|
||||
Reference in New Issue
Block a user