instagram, kakaostory effect 추가

git-svn-id: svn://192.168.0.12/source@308 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2016-11-04 10:44:08 +00:00
parent 732ebaa53b
commit 73acdf3d3c
4 changed files with 931 additions and 0 deletions

View File

@@ -0,0 +1,145 @@
# Numeric error codes shared by the effect crawlers.  For error n,
# error_message[n] is its symbolic name and error_message_code[n] the
# short wire code ('e000'..'e011') sent to the result pipeline.
DB_OPEN_ERROR = 0
DB_FULL_ERROR = 1
DB_LONG_QUERY_ERROR = 2
DB_QUERY_ERROR = 3
DB_UNKNOWN_ERROR = 4
INTERNET_ERROR = 5
OUT_DATED_CRAWLER = 6
DELETED_URL_ERROR = 7
BLOCK_ERROR = 8
TIMEOUT = 9
NO_PROGRAM = 10
UNKNOWN_ERROR = 11

error_message = [
    "DB_OPEN_ERROR",
    "DB_FULL_ERROR",
    "DB_LONG_QUERY_ERROR",
    "DB_QUERY_ERROR",
    "DB_UNKNOWN_ERROR",
    "INTERNET_ERROR",
    "OUT_DATED_CRAWLER",
    "DELETED_URL_ERROR",
    "BLOCK_ERROR",
    "TIMEOUT",
    "NO_PROGRAM",
    "UNKNOWN_ERROR",
]

# 'e000'..'e011', kept index-aligned with error_message by construction.
error_message_code = ["e%03d" % index for index in range(len(error_message))]

# Field separator used when an exception renders as '<code>!@#<message>'.
SEPERATOR = '!@#'
class EffectException(Exception):
    """Base class for effect-crawler errors.

    Carries a numeric error code, its wire code ('e000'..'e011') and a free
    text message; ``str()`` renders '<code>!@#<message>' for the pipeline.

    :param error_no: index into error_message / error_message_code.
    :param msg: human-readable detail appended after the separator.
    :raises IndexError: when error_no is outside the known code table.
    """

    def __init__(self, error_no, msg='', *args, **kwargs):
        self.error_no = error_no
        self.error_message_code = error_message_code[self.error_no]
        self.msg = msg
        Exception.__init__(self, *args, **kwargs)

    def __str__(self):
        # Bug fix: the original wrapped the concatenation in try/except,
        # printed the error and then returned the *unbound* local `s`, so any
        # non-string `msg` turned into an UnboundLocalError.  Coercing msg
        # explicitly makes __str__ total.
        return self.error_message_code + SEPERATOR + str(self.msg)
# One concrete exception per error code.  EffectException.__init__ already
# records error_no, the wire code and the message, so each subclass only
# forwards its code — the original repeated the self.error_no / self.msg
# assignments that the base constructor performs anyway.

class DBOpenError(EffectException):
    """Database connection could not be opened (e000)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, DB_OPEN_ERROR, msg, *args, **kwargs)


class DBFullError(EffectException):
    """Database is full / out of space (e001)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, DB_FULL_ERROR, msg, *args, **kwargs)


class DBLongQueryError(EffectException):
    """Query exceeded its allowed running time (e002)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, DB_LONG_QUERY_ERROR, msg, *args, **kwargs)


class DBQueryError(EffectException):
    """Query failed to execute (e003)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, DB_QUERY_ERROR, msg, *args, **kwargs)


class DBUnknownError(EffectException):
    """Unclassified database failure (e004)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, DB_UNKNOWN_ERROR, msg, *args, **kwargs)


class InternetError(EffectException):
    """Network connectivity failure (e005)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, INTERNET_ERROR, msg, *args, **kwargs)


class OutDatedCrawler(EffectException):
    """Crawler no longer matches the target site's markup/API (e006)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, OUT_DATED_CRAWLER, msg, *args, **kwargs)


class DeletedUrlError(EffectException):
    """Target URL was removed or returns an HTTP error (e007)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, DELETED_URL_ERROR, msg, *args, **kwargs)


class BlockError(EffectException):
    """Crawler was blocked by the target site (e008)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, BLOCK_ERROR, msg, *args, **kwargs)


class TimeOutError(EffectException):
    """Operation exceeded its time limit (e009)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, TIMEOUT, msg, *args, **kwargs)


class NoProgramError(EffectException):
    """Required external program is missing (e010)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, NO_PROGRAM, msg, *args, **kwargs)


class UnknownError(EffectException):
    """Catch-all error (e011)."""
    def __init__(self, msg='', *args, **kwargs):
        EffectException.__init__(self, UNKNOWN_ERROR, msg, *args, **kwargs)

View File

@@ -0,0 +1,213 @@
import re
import datetime
import json
import requests
import requests.exceptions
import time
import insta.instaheaders as instaheaders
import insta.instaparser as instaparser
from effect.resultsender import ResultSender
from effect.resultsender import get_settings
from base.baseclasses import printl
from base.baseclasses import wait
import effect.effecterror
# Debug switches: is_debuging gates the verbose request/response traces below.
is_debuging = False
is_debug = True
# Field separator for serialized strings (matches effect.effecterror).
SEPERATOR = '!@#'
# Page sizes requested from the list / reply AJAX endpoints.
num_of_list_ajax = 24
num_of_reply_ajax = 100
# Polite delays (seconds) between successive requests of each kind.
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
# Per-request timeout (seconds) — both the socket timeout passed to requests
# and the whole-download limit enforced by requests_get().
requests_timeout = 60
num_of_retry_proxy = 5
# Instagram endpoints used by InstaContent.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
def requests_get(req, timeout=None):
    """Drain a streamed HTTP response body under a wall-clock time limit.

    The ``timeout`` passed to ``requests`` only bounds individual socket
    reads, so a slow-drip server could stall a download indefinitely; this
    helper bounds the *total* transfer time instead.

    :param req: a streamed response object (``stream=True``) exposing
        ``iter_content`` and ``close``.
    :param timeout: maximum seconds for the whole download; ``None`` means
        the module-level ``requests_timeout`` (same value the original
        default bound at definition time — behavior unchanged for callers).
    :return: the complete body as ``bytes``.
    :raises TimeoutError: when the limit is exceeded; the response is closed
        first.  (The original raised a bare ``Exception``; ``TimeoutError``
        is still an ``Exception`` subclass, so the callers' broad handlers
        keep working.)
    """
    if timeout is None:
        timeout = requests_timeout
    body = []
    deadline = time.time() + timeout
    for chunk in req.iter_content(1024):
        body.append(chunk)
        if time.time() > deadline:
            req.close()
            raise TimeoutError("timeout")
    return b''.join(body)
class InstaContent:
    """One Instagram post: fetches the body HTML and pages through replies.

    Cookies from every response are merged into ``self.cookies`` so the
    follow-up reply AJAX calls stay in the same session.
    """

    def __init__(self, url, cookies, referer, proxies=None):
        # last requests.Response (streamed)
        self.__r = None
        self.__referer = ''
        # post shortcode extracted from the /p/<code>/ URL
        self.__code = ''
        self.body = None
        self.reply = []
        # paging cursor for "load more replies" AJAX
        self.start_cursor = None
        # True while older reply pages remain
        self.has_previous = False
        self.cookies = {}
        self.proxies = proxies
        self.load_url(url, cookies, referer, self.proxies)

    def load_url(self, url, cookies, referer, proxies):
        """GET the post page, parse body/replies and absorb response cookies.

        :return: (body dict, reply list) as produced by instaparser.
        """
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        # drain the streamed body with a wall-clock limit before status check
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        # self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def load_reply_more(self):
        """POST the reply-paging AJAX; replaces self.reply with the new page.

        Also advances ``start_cursor`` / ``has_previous`` from the response.
        :return: the newly loaded page of replies.
        """
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        self.log_load_reply_more_before(form_data, headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
                                 timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        # self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(content)
        self.__r.close()
        self.log_load_reply_more_after()
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        """Shortcode segment after /p/ in *url*.

        :raises RuntimeError: when the URL does not contain the /p/ prefix.
        """
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        # merge (not replace) so cookies accumulate across requests
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_proxy(self):
        return self.proxies

    def log_load_reply_more_before(self, form_data, headers):
        # debug-only trace of the outgoing reply AJAX request
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            printl('start_cursor = ' + self.start_cursor)
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_reply_more_after(self):
        # debug-only trace of the reply AJAX response
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("<ContentReply End>")
class EffectInsta(object):
    """Crawls one Instagram post, aggregates effect statistics and stores them.

    :param event_num: numeric event id written into the result row.
    :param event_code: event code (kept for interface parity; unused here).
    :param url: URL of the Instagram post to measure.
    """

    def __init__(self, event_num, event_code, url):
        self.event_num = event_num
        self.event_code = event_code
        self.url = url

    def start(self):
        """Crawl body + all replies, aggregate and send one row to the DB.

        :raises effect.effecterror.DeletedUrlError: the post URL returned an HTTP error.
        :raises effect.effecterror.OutDatedCrawler: crawl/parse failed or yielded no data.
        :raises effect.effecterror.UnknownError: aggregation or settings lookup failed.
        :raises effect.effecterror.DBQueryError: the DB insert failed.
        """
        try:
            content = InstaContent(self.url, {}, self.url)
            body = content.get_body()
            replies = content.get_reply()
            # page backwards through older replies until exhausted
            while content.has_previous:
                replies = content.load_reply_more() + replies
                wait(2)
        except requests.exceptions.HTTPError as e:
            raise effect.effecterror.DeletedUrlError(str(e))
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")
        try:
            result = self.statistics(body, replies)
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))
        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
        try:
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """Aggregate crawled body/replies into one result-row dict.

        Bug fix: the original called ``int(body.get('article_order'), 0)`` —
        the intended default 0 was passed to ``int`` as its *base* argument,
        which raises TypeError whenever the key is missing (and rejects any
        value that is not a valid base-0 literal).  Same for 'reply_url'.
        """
        result = {}
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """Number of distinct replier ids, excluding the post author."""
        set_reply_id = set()
        for i in replies:
            set_reply_id.add(i.get('article_id', ''))
        return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """JSON string mapping each day ('yyyymmdd') from the post date up to
        today onto the number of replies written on that day."""
        start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)
        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            if reply_date in date_dict:
                date_dict[reply_date] = date_dict[reply_date] + 1
        return json.dumps(date_dict, sort_keys=True)

View File

@@ -0,0 +1,485 @@
import datetime
import json
import effect.effecterror
import re
from kakao.kakaoexception import NotFoundElementError
from bs4 import BeautifulSoup
from base.baseclasses import wait
from effect.resultsender import get_settings
from effect.resultsender import ResultSender
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
try:
import lxml
parser_opt = 'lxml'
except ImportError:
parser_opt = 'html.parser'
# Field separator for serialized strings (matches effect.effecterror).
SEPERATOR = '!@#'
# Base URLs of kakaostory profiles and channel pages.
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
limit_reload = 5
num_of_retry = 3
# Korean date string parser; groups are
# (year, month, day, meridiem word e.g. "오후", hour, minute).
# NOTE(review): the expected input shape (e.g. "2016년 11월 4일 오후 3:20") is
# inferred from how get_date consumes the groups — confirm against live markup.
re_date = re.compile("^([\\d]{4})[\\D]+([\\d]{1,2})[\\D]+([\\d]{1,2})"
                     "[\\w]+[\\s]+([\\w]+)[\\s]+([\\d]{1,2})[\\D]([\\d]{1,2})")
def get_date(element):
    """Extract a timestamp from a kakaostory time element.

    :param element: bs4 tag (typically ``span.time``); the date string is in
        its ``title`` attribute before the tooltip mouse-over and in
        ``data-tooltip`` afterwards.
    :return: 'yyyy-MM-dd hh:mm:ss' string, or "0000-00-00 00:00:00" when no
        date could be parsed (callers prefer an invalid date over a raise).
    """
    m = re_date.search(element.attrs.get('title', '')) \
        or re_date.search(element.attrs.get('data-tooltip', ''))
    if m:
        temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                      int(m.group(5)), int(m.group(6)))
        # group(4) is the Korean meridiem marker; add 12 hours for "오후" (p.m.)
        # NOTE(review): "오전 12시" (12 a.m.) is not normalised to hour 0 —
        # confirm whether the site ever renders midnight that way.
        if m.group(4) == "오후" and int(m.group(5)) < 12:
            temp_date += datetime.timedelta(hours=12)
        # convert datetime.datetime to str
        return str(temp_date)
    # return invalid date instead of exception
    else:
        # raise NotFoundElementError("get_date exception")
        return "0000-00-00 00:00:00"
class BodyCrawler(object):
    """Scrapes the main post ("body") fields from a kakaostory article page.

    Works on ``driver.page_source`` via BeautifulSoup; everything of interest
    lives inside the ``div.section _activity`` element.
    """

    def __init__(self, driver):
        # selenium webdriver, already navigated to the article page
        self.driver = driver
        self.soup = None
        self.section_activity = None
        self.set_soup_and_activity()
        if not self.section_activity:
            raise NotFoundElementError("section _activity is not Found")

    # calling point may differ
    def set_soup_and_activity(self):
        """(Re)parse the current page source and cache div.section _activity."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity. But element we use is in div.cover_wrapper
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')

    def find_article_id(self):
        """Author id: profile-link href minus the kakaostory prefix / leading '/'."""
        a = self.section_activity.find('a', class_='pf_name')
        href = a.attrs['href'].replace('https://story.kakao.com/', '')
        return href[1:] if href.startswith('/') else href

    def find_article_nickname(self):
        """Display name shown on the author's profile link."""
        a = self.section_activity.find('a', class_='pf_name')
        return a.text

    def find_article_url(self):
        # in chrome, current_url is equal to article_url
        # need to check other browser
        return self.driver.current_url

    def find_article_modified_date(self):
        """Modified timestamp as 'yyyy-MM-dd hh:mm:ss', or None when absent."""
        # get DOM about modified date
        times = None
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        # written time is default. if the article was modified, modified time is added.
        # so if length of times is not equal to 2, there is only written time.
        if not times or len(times) < 2:
            return None
        # times[0] : written time, times[1] : modified time
        # times[1] structure : <span><span ...> </span></span>
        # check times[1].span exists
        if times[1].span:
            # before mouse over the element(tooltip), the date string is in the title attribute of span
            # after mouse over the element(tooltip), the date string is in the data-tooltip attribute of span
            m = re_date.search(times[1].span.attrs.get('title', '')) \
                or re_date.search(times[1].span.attrs.get('data-tooltip', ''))
            if m:
                temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                              int(m.group(5)), int(m.group(6)))
                # add 12 hour when the article is written at p.m
                if m.group(4) == "오후" and int(m.group(5)) < 12:
                    temp_date += datetime.timedelta(hours=12)
                # convert datetime.datetime to str
                return str(temp_date)
            else:
                # raise NotFoundDataError('data for find_article_modified is not found')
                return None
        # return None instead of exception.
        else:
            # raise NotFoundElementError('find_article_modified DOM is missing')
            return None

    def find_article_date(self):
        """Written timestamp as 'yyyy-MM-dd hh:mm:ss' ("0000-..." when unparsable).

        :raises NotFoundElementError: when the date DOM is missing entirely.
        """
        # modified date is a higher priority than written date
        # modified_date = self.find_article_modified_date()
        # if modified_date:
        #     return modified_date
        times = None
        # get DOMs about date
        add_top = self.section_activity.find('div', class_='add_top')
        if add_top:
            times = add_top.find_all('span', class_='time')
        else:
            raise NotFoundElementError("find_article_data DOM is missing : add_top")
        if not times:
            raise NotFoundElementError("find_article_data DOM is missing : time")
        # before mouse over the element(tooltip), the date string is in the title attribute of span
        # after mouse over the element(tooltip), the date string is in the data-tooltip attribute of span
        m = re_date.search(times[0].attrs.get('title', '')) \
            or re_date.search(times[0].attrs.get('data-tooltip', ''))
        if m:
            temp_date = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                          int(m.group(5)), int(m.group(6)))
            # add 12 hour when the article is written at p.m
            if m.group(4) == "오후" and int(m.group(5)) < 12:
                temp_date += datetime.timedelta(hours=12)
            # convert datetime.datetime to str
            return str(temp_date)
        # return invalid date instead of exception
        else:
            # raise NotFoundElementError("find_article_date exception")
            return "0000-00-00 00:00:00"

    def find_article_profileurl(self):
        """Author avatar image URL, or '' when not present (non-essential field)."""
        profile_area = self.section_activity.find('div', class_='_profileArea pf')
        # check a>img
        if profile_area.a and profile_area.a.img and profile_area.a.img.get('src'):
            return profile_area.a.img.get('src')
        # this is not essential, so no exception occur
        else:
            return ''

    def find_article_data(self):
        """
        :return: trimmed article_data ('' when the post has no text)
        """
        content = self.section_activity.find('div', class_='txt_wrap')
        if content and content.text:
            # trim; non-breaking spaces act as line separators in the markup
            return content.text.strip().replace('\xa0', '\n')
        # if there is no content or text, return empty data
        else:
            return ''

    def find_article_title(self):
        """Channel title if present, else the first line of the body (max 30 chars)."""
        # strong.tit_channel is title of channel
        # if strong.tit_channel do not exist,
        # title is first line of article_data
        # this definition is determined by me -_-
        # find_article_data return trimmed string
        strong = self.section_activity.find('strong', class_='tit_channel')
        if strong and strong.text:
            return strong.text.replace('\xa0', '')
        article_data = self.find_article_data()
        if article_data:
            for line in article_data.splitlines():
                # limit title length
                return line[0:30] if len(line) > 30 else line
        else:
            return ''

    def find_article_etc(self, class_name):
        """
        this function is used for crawling number of shares, replies and feelings
        :param class_name: CSS class of the strong element holding the counter
        :return: a string of number of shares, replies, or feelings ('0' fallback)
        """
        element = self.section_activity.find('strong', class_=class_name)
        # check element has text that indicate the number
        if element and element.text:
            # It may contain comma ',' to recognize easily
            # Remove comma ',' to convert from str to int
            txt = element.text.replace(',', '')
            return txt
        # if there is no element or text, return '0' instead of raising exception
        else:
            # raise NotFoundElementError('find_article_etc is not Found element with ' + class_name)
            return '0'

    def find_article_share(self):
        return self.find_article_etc('_storyShareCount')

    def find_article_feeling(self):
        return self.find_article_etc('_likeCount')

    def find_article_reply_num(self):
        return self.find_article_etc('_commentCount')

    def find_platform_form(self):
        # NOTE(review): find_article_id strips a leading '/', so channel ids
        # are expected to look like 'ch/<name>' — confirm against live hrefs.
        article_id = self.find_article_id()
        return 'channel' if article_id.startswith('ch/') else 'story'

    def get(self):
        """
        you need to put 'keyword_id'
        :return: dict for crawled body content (counts are stored as strings;
            note reply count lands in 'article_order', shares in
            'article_parent' and feelings in 'reply_url')
        """
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_nickname'] = self.find_article_nickname()
        content['article_data'] = self.find_article_data()
        content['article_title'] = self.find_article_title()
        content['article_date'] = self.find_article_date()
        content['article_url'] = self.find_article_url()
        content['article_profileurl'] = self.find_article_profileurl()
        content['article_order'] = self.find_article_reply_num()
        content['article_parent'] = self.find_article_share()
        content['reply_url'] = self.find_article_feeling()
        content['platform_form'] = self.find_platform_form()
        content['article_form'] = 'body'
        content['platform_name'] = 'kakaostory'
        content['platform_id'] = content['article_id']
        content['platform_title'] = content['article_nickname']
        return content
class ReplyCrawler(object):
    """Loads every reply on a kakaostory article page and scrapes its fields.

    Selenium drives the "show more" button until all replies are in the DOM,
    then BeautifulSoup extracts the per-reply columns.
    """

    def __init__(self, driver):
        # selenium webdriver, already navigated to the article page
        self.driver = driver
        self.soup = None
        self.section_activity = None
        # ul.list _listContainer holding the reply <li> rows
        self.ul = None
        self.lis = None

    def set_soup_and_activity(self):
        """Parse the current page source and cache the reply list container."""
        self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
        # There are many div.section _activity. But a element we use is in div.cover_wrapper
        # cover_wrapper = self.soup.find('div', class_='cover_wrapper')
        self.section_activity = self.soup.find('div', class_='section _activity')
        self.ul = self.section_activity.find('ul', class_='list _listContainer')

    def load_all_reply(self):
        """Keep clicking "load more" until no button remains or the count stalls."""
        previous_num_of_replies = 0
        while self.has_more():
            self.click_load_more_reply_btn()
            # check the number of replies before and after click_load_more_reply_btn()
            # If These were equal, the link or ajax failed
            current_num_of_replies = self.get_num_of_replies()
            if previous_num_of_replies == current_num_of_replies:
                break
            previous_num_of_replies = current_num_of_replies

    def get_num_of_replies(self):
        """Number of <li> reply rows in the live DOM; 0 when list is missing."""
        # Find ul element that contains replies
        # if raise occur, there is no reply
        # for performance, this method may be implemented using bs4
        try:
            ul = find_element_by_css_selector(self.driver,
                                              #"div.cover_wrapper "
                                              "div[class='section _activity'] "
                                              "ul[class='list _listContainer']", 5)
            li = ul.find_elements_by_tag_name('li')
            return len(li)
        except Exception as e:
            return 0

    def click_load_more_reply_btn(self):
        """Click/enter the "show more comments" link; silently no-op if absent."""
        try:
            # find a link to load reply and click/enter it
            a = find_element_by_css_selector(self.driver,
                                             #"div.cover_wrapper "
                                             "div[class='section _activity'] "
                                             "a[class='_btnShowMoreComment']", 5)
            enter_element(a)
        # no link is in the browser. Nothing happens instead raise exception. But log this event
        except Exception as e:
            pass
            # printl("In click_load_more_reply_btn, there is not a link to load replies")
            # printl(e)

    def has_more(self):
        """True while the "show more comments" container is displayed."""
        # In the case that raise exception,
        # there is no more reply or css selector of the show_more is invalid
        # These two case can't be classified by exception because the logic is same
        try:
            # find show_more element
            show_more = find_element_by_css_selector(self.driver,
                                                     # "div.cover_wrapper "
                                                     "div[class='section _activity'] "
                                                     "p[class='more _showMoreCommentContainer']", 5)
            # 'display:block;' -> display the button, 'display:none;' -> hide the button
            if 'block' in show_more.get_attribute('style'):
                return True
            else:
                return False
        # return False in the two case
        # First case is that loading replies is finished
        # Second case is that css selector to find element is invalid
        except Exception as e:
            return False

    # find_xxxx functions — each returns one column as a list, index-aligned
    # across methods (one entry per reply row).
    def find_article_id(self):
        """Replier ids with the kakaostory prefix and leading '/' stripped."""
        # Find name placeholder
        divs = self.ul.find_all('a', class_='name _namePlaceholder')
        # Get article_ids and remove kakaostory url in article_id
        article_ids = [div.attrs.get('href', '').replace(kakaostory_url, '')
                       for div in divs if div.attrs.get('href', '')]
        # Refine hrefs. Href may start with '/'
        article_id = map(lambda x: x[1:] if x.startswith('/') else x, article_ids)
        # Return list because of unification of types
        return list(article_id)

    def find_article_nickname(self):
        divs = self.ul.find_all('a', class_='name _namePlaceholder')
        # If div.text exist, return div.text. Otherwise return empty string
        return [div.text if div.text else '' for div in divs]

    def find_article_data(self):
        divs = self.ul.find_all('div', class_='txt')
        # The div.text has meta-data in div.p.text. If meta-data exists, remove it
        # When element does not exists, return empty string
        return [div.text[len(div.p.text):].replace('\xa0', '\n')
                if div.p else div.text if div.text else '' for div in divs]

    def find_article_date(self):
        divs = self.ul.find_all('span', class_='time')
        return list(map(get_date, divs))

    def find_article_like(self):
        spans = self.ul.find_all('span', class_='like_num _likeCommentCount')
        # The number of like exists in span.like_num _likeCommentCount Unless it is present
        return [span.text if span.text else '' for span in spans]

    def find_article_profileurl(self):
        divs = self.ul.find_all('div', class_='pf')
        return list(map(lambda div: div.a.img.attrs.get('src', '') if div.a and div.a.img else '', divs))

    def get(self):
        """
        Need to put platform_title, platform_id, platform_form from body
        :return: a list of replies. Need to put platform_title, platform_id
        """
        # load all replies
        self.load_all_reply()
        # After loading all replies, crawl replies using BeautifulSoup
        self.set_soup_and_activity()
        article_ids = self.find_article_id()
        article_nicknames = self.find_article_nickname()
        article_datas = self.find_article_data()
        article_dates = self.find_article_date()
        article_profileurls = self.find_article_profileurl()
        article_likes = self.find_article_like()
        article_url = self.driver.current_url
        replies = []
        # This may occur exception when indices of each elements is not matched
        # This exception described above is intended
        for i in range(len(article_ids)):
            reply = dict()
            reply['article_id'] = article_ids[i]
            reply['article_nickname'] = article_nicknames[i]
            reply['article_data'] = article_datas[i]
            reply['article_date'] = article_dates[i]
            reply['article_profileurl'] = article_profileurls[i]
            reply['reply_url'] = article_likes[i]
            reply['platform_name'] = 'kakaostory'
            reply['article_form'] = 'reply'
            reply['article_url'] = article_url
            reply['article_order'] = str(i)
            replies.append(reply)
        return replies
class EffectKakaostory(object):
    """Crawls one kakaostory article via selenium, aggregates effect
    statistics and stores them.

    :param event_num: numeric event id written into the result row.
    :param event_code: event code (kept for interface parity; unused here).
    :param url: URL of the kakaostory article to measure.
    :param driver: selenium webdriver used for navigation and scraping.
    """

    def __init__(self, event_num, event_code, url, driver):
        self.event_num = event_num
        self.event_code = event_code
        self.url = url
        self.driver = driver

    def start(self):
        """Crawl body + all replies, aggregate and send one row to the DB.

        :raises effect.effecterror.OutDatedCrawler: crawl/parse failed or yielded no data.
        :raises effect.effecterror.UnknownError: aggregation or settings lookup failed.
        :raises effect.effecterror.DBQueryError: the DB insert failed.
        """
        try:
            self.driver.get(self.url)
            wait(3)
            body_crawler = BodyCrawler(self.driver)
            reply_crawler = ReplyCrawler(self.driver)
            body = body_crawler.get()
            replies = reply_crawler.get()
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e))
        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")
        try:
            result = self.statistics(body, replies)
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))
        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
        try:
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """Aggregate crawled body/replies into one result-row dict.

        Bug fix: the original called ``int(body.get('article_order'), 0)`` —
        the intended default 0 was passed to ``int`` as its *base* argument,
        which raises TypeError whenever the key is missing (and rejects any
        value that is not a valid base-0 literal).  Same for 'reply_url'.
        """
        result = {}
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """Number of distinct replier ids, excluding the article author."""
        set_reply_id = set()
        for i in replies:
            set_reply_id.add(i.get('article_id', ''))
        return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """JSON string mapping each day ('yyyymmdd') from the article date up
        to today onto the number of replies written on that day."""
        start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)
        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            if reply_date in date_dict:
                date_dict[reply_date] = date_dict[reply_date] + 1
        return json.dumps(date_dict, sort_keys=True)

View File

@@ -0,0 +1,88 @@
import re
import configparser
class ResultSender:
    """Writes one crawled-result row into a MySQL table via pymysql.

    NOTE(review): the default credentials below are hard-coded production
    values — they belong in configuration (see get_settings), not source.
    """
    # __import__('pymysql.cursors') imports the submodule but *returns* the
    # top-level pymysql package, so self.pymysql.cursors below resolves.
    pymysql = __import__('pymysql.cursors')
    # Strips emoji (Misc Symbols & Pictographs / Transport & Map ranges);
    # presumably the target column is utf8 (not utf8mb4) and cannot store
    # 4-byte characters — TODO confirm against the schema.
    re_emoji = re.compile(u'[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF]+', re.UNICODE)

    def __init__(self, host='182.162.171.147', user='admin', passwd='admin123', db='bigbird'):
        self.host = host
        self.user = user
        self.passwd = passwd
        self.db = db
        # live pymysql connection, created by connect()
        self.conn = None

    def connect(self):
        """Open the MySQL connection (utf8 charset, DictCursor rows)."""
        self.conn = self.pymysql.connect(host=self.host,
                                         user=self.user,
                                         passwd=self.passwd,
                                         db=self.db,
                                         charset='utf8',
                                         cursorclass=self.pymysql.cursors.DictCursor)

    def close(self):
        if self.conn:
            self.conn.close()

    def _make_query(self, table_name, dictionary):
        """Build an INSERT statement from a column->value dict.

        Values are escaped via conn.escape (emoji stripped first); ints pass
        through unquoted.  NOTE(review): table_name and the dict keys are
        interpolated verbatim — callers must only pass trusted identifiers,
        never user-controlled strings.
        """
        query = "insert into " + str(table_name) + " ("
        key_list = list()
        val_list = list()
        for key, val in dictionary.items():
            key_list.append(key)
            if type(val) == int:
                val_list.append(str(val))
            else:
                val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
        return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")"  # + " on duplicate key update " + \
        # ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))

    def send(self, table_name, dictionary):
        """Insert *dictionary* as one row of *table_name*."""
        query = self._make_query(table_name, dictionary)
        self._exec_query(query)

    def _exec_query(self, query):
        # reconnect when the server has closed the previous connection
        if not self.conn.open:
            self.connect()
        with self.conn.cursor() as cursor:
            cursor.execute(query)
        self.conn.commit()
def get_settings(option='database', filename='effect.ini'):
    """Read one section of an ini-style settings file.

    Scans *filename* for the ``[option]`` header, slices out just that
    section (up to the next section header, even a commented-out one) and
    parses the slice with configparser.

    Bug fixes vs. the original:
    - when the section was missing, ``start`` stayed 0 and an unrelated
      leading section could be parsed and returned; now returns None;
    - ``line_trimmed[1]`` raised IndexError on a bare ``"#"`` line;
    - bare ``except:`` narrowed to file-read errors.

    :param option: section name to extract (without brackets).
    :param filename: path of the settings file.
    :return: a configparser section proxy (mapping-like), or None when the
        file cannot be read or the section is missing.
    """
    try:
        with open(filename, encoding='utf8') as f:
            file_content = f.readlines()
    except (OSError, UnicodeDecodeError):
        # missing/unreadable file -> caller falls back to built-in defaults
        return None
    start = None
    for i, raw_line in enumerate(file_content):
        line_trimmed = raw_line.strip()
        if line_trimmed.startswith('#'):
            continue
        if line_trimmed.startswith('[') and line_trimmed.endswith(']') and line_trimmed[1:-1] == option:
            start = i
            break
    if start is None:
        return None
    end = len(file_content)
    for i in range(start + 1, len(file_content)):
        line_trimmed = file_content[i].strip()
        # the section ends at the next header, or at a commented-out header
        # ('#[...]') — same terminators the original recognised
        if line_trimmed.startswith('[') or \
                (line_trimmed.startswith('#[') and line_trimmed.endswith(']')):
            end = i
            break
    cg = configparser.ConfigParser()
    cg.read_string(''.join(file_content[start:end]))
    return cg[option]