# Source-viewer header residue: 400 lines, 15 KiB, Python
import datetime
import json
import re
import time

import requests
import requests.exceptions

import effect.effecterror
import insta.instaheaders as instaheaders
import insta.instaparser as instaparser
from base.baseclasses import printl
from base.baseclasses import wait
from effect.resultsender import ResultSender
from effect.resultsender import get_settings

# --- Debug switches ----------------------------------------------------------
# Gates the verbose printl() tracing in InstaContent.log_load_reply_more_*.
is_debuging = False
# Not read anywhere in this file; kept in case other modules import it.
is_debug = True

# Not referenced in this file.
SEPERATOR = '!@#'

# --- Crawler tuning ----------------------------------------------------------
# Only num_of_reply_ajax and requests_timeout are referenced in this file; the
# others are presumably consumed by sibling crawler modules -- confirm before
# removing any of them.
num_of_list_ajax = 24
num_of_reply_ajax = 100   # page size requested per "load more replies" call
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
requests_timeout = 60     # seconds; passed to requests and used by requests_get
num_of_retry_proxy = 5

# --- Endpoints ---------------------------------------------------------------
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"     # POST target for reply paging
insta_body_url = 'https://www.instagram.com/p/'      # prefix used to extract the post code

# --- Buzz record layout ------------------------------------------------------
# DATE .. ACC are positions in BUZZ_KEY, i.e. BUZZ_KEY[DAY] == "day".
DATE = 0
REPLY_DAY = 1
REPLY_ACC = 2
LIKE_DAY = 3
LIKE_ACC = 4
DAY = 5
ACC = 6
# REPLY and LIKE are NOT valid BUZZ_KEY indexes (the list has 7 entries); in
# this file LIKE is only used as a buzz-type selector in EffectInsta.get_buzzs.
REPLY = 7
LIKE = 8

BUZZ_KEY = [
    "date", "reply_day", "reply_acc", "like_day", "like_acc", "day", "acc"
]


def requests_get(req, timeout=requests_timeout):
    """Read the full body of a streamed response under a total time budget.

    The body is pulled in 1 KiB chunks so that the elapsed time can be
    checked between chunks.

    Args:
        req: A response object opened with ``stream=True``. Only its
            ``iter_content`` and ``close`` methods are used.
        timeout: Maximum number of seconds the whole download may take.

    Returns:
        The response body as ``bytes``.

    Raises:
        TimeoutError: If the budget is exceeded; the response is closed
            before raising.
    """
    # A monotonic clock keeps the budget correct even if the wall clock is
    # adjusted (NTP step, DST, manual change) while the download is running.
    deadline = time.monotonic() + timeout
    chunks = []
    for chunk in req.iter_content(1024):
        chunks.append(chunk)
        # The check only runs once a chunk has arrived; a completely stalled
        # socket is bounded by the read timeout given to requests instead.
        if time.monotonic() > deadline:
            req.close()
            raise TimeoutError("timeout")
    return b''.join(chunks)


class InstaContent:
    """A single Instagram post page: parsed body plus pageable replies.

    Constructing an instance immediately downloads and parses ``url``.
    Afterwards ``body``/``reply`` hold the parser output, and while
    ``has_previous`` is true ``load_reply_more()`` fetches the next page of
    replies using ``start_cursor``.
    """

    def __init__(self, url, cookies, referer, proxies=None):
        """Fetch and parse the post at ``url``.

        Args:
            url: Post URL; must start with ``insta_body_url``.
            cookies: Mapping of cookies to send (anything with ``.items()``).
            referer: Referer value used for later reply-paging requests.
            proxies: Optional proxies mapping passed through to ``requests``.
        """
        self.__r = None               # last response object
        self.__referer = ''
        self.__code = ''              # post code extracted from the URL
        self.body = None
        self.reply = []
        self.start_cursor = None
        self.has_previous = False
        self.cookies = {}
        self.proxies = proxies
        self.load_url(url, cookies, referer, self.proxies)

    def load_url(self, url, cookies, referer, proxies):
        """Download ``url``, parse it and store body, replies and paging state.

        Returns:
            The ``(body, reply)`` pair produced by the parser.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx status.
            RuntimeError: If ``url`` is not a post URL.
        """
        self.__set_cookies(cookies)
        self.__r = requests.get(
            url,
            headers=instaheaders.get_headers_for_body_html(self.cookies),
            proxies=proxies,
            timeout=requests_timeout,
            stream=True,
        )
        try:
            # Check the status before pulling the body so an error page is
            # reported as HTTPError rather than downloaded (or timed out) first.
            self.__r.raise_for_status()
            content = requests_get(self.__r)
            self.__referer = referer
            self.__code = self.__get_code(url)
            (self.body, self.reply,
             self.start_cursor, self.has_previous) = instaparser.parse_body_html(content)
            self.__set_cookies(self.__r.cookies)
        finally:
            # Streamed responses hold their connection until closed; release it
            # on the failure paths as well.
            self.__r.close()
        return self.body, self.reply

    def get_body(self):
        """Return the parsed post body."""
        return self.body

    def get_reply(self):
        """Return the most recently loaded page of replies."""
        return self.reply

    def load_reply_more(self):
        """Fetch the next page of replies and update the paging state.

        Replaces ``self.reply`` with the new page and refreshes
        ``start_cursor`` / ``has_previous``.

        Returns:
            The newly loaded page of replies.

        Raises:
            requests.exceptions.HTTPError: On a 4xx/5xx status.
        """
        form_data = instaheaders.get_form_data_for_reply(
            self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        self.log_load_reply_more_before(form_data, headers)
        self.__r = requests.post(
            insta_query,
            headers=headers,
            data=form_data,
            proxies=self.proxies,
            timeout=requests_timeout,
            stream=True,
        )
        try:
            self.__r.raise_for_status()
            content = requests_get(self.__r)
            self.__set_cookies(self.__r.cookies)
            (self.reply, self.start_cursor,
             self.has_previous) = instaparser.parse_reply_ajax(content)
        finally:
            self.__r.close()
        self.log_load_reply_more_after()
        return self.reply

    def get_cookies(self):
        """Return the accumulated cookie dict."""
        return self.cookies

    def __get_code(self, url):
        """Extract the post code (the path segment after ``/p/``) from ``url``.

        Raises:
            RuntimeError: If ``url`` does not contain ``insta_body_url``.
        """
        # Escape the prefix: it is a literal URL, and an unescaped "." would
        # match any character.
        m = re.search(re.escape(insta_body_url) + "([^/]*)", url)
        if not m:
            raise RuntimeError('Tag Error')
        return m.group(1)

    def __set_cookies(self, cookies):
        """Merge ``cookies`` into ``self.cookies`` (new values win)."""
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_proxy(self):
        """Return the proxies mapping given at construction."""
        return self.proxies

    def log_load_reply_more_before(self, form_data, headers):
        """Trace the outgoing reply-paging request when ``is_debuging`` is set."""
        if not is_debuging:
            return
        printl("<ContentReply Start>")
        printl("<ContentReply requests>")
        # str(): start_cursor may still be None and form_data need not be a
        # string, either of which would make plain concatenation raise.
        printl('start_cursor = ' + str(self.start_cursor))
        printl('form_data' + str(form_data))
        printl('headers = ', end=' ')
        printl(headers)

    def log_load_reply_more_after(self):
        """Trace the reply-paging response when ``is_debuging`` is set."""
        if not is_debuging:
            return
        printl("<ContentReply response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('start_cursor = ' + str(self.start_cursor))
        printl('has_previous = ', end='')
        printl(self.has_previous)
        printl('proxies = ', end='')
        printl(self.proxies)
        printl("<ContentReply End>")


class EffectInsta(object):
    """Crawl one Instagram post and store its engagement statistics.

    A "buzz" is a per-day record. Base buzzes are dicts with the keys
    ``date`` (``YYYYMMDD``), ``day`` (count for that day) and ``acc``
    (running total); summary buzzes carry ``reply_day``/``reply_acc`` and
    ``like_day``/``like_acc`` instead. Key names come from ``BUZZ_KEY``.
    """

    def __init__(self, event_num, event_code, url, start_date):
        """Store the event parameters and open the database connection.

        Args:
            event_num: Event identifier; used to look up stored buzz data and
                written into the result row.
            event_code: Stored on the instance; not read in this class.
            url: URL of the post to crawl.
            start_date: First day of the statistics window; dashes are
                stripped, so ``YYYY-MM-DD`` and ``YYYYMMDD`` both work.
        """
        self.event_num = event_num
        self.event_code = event_code
        self.url = url
        self.start_date = start_date.replace("-", "")
        self.database = self.database_init()

    def database_init(self):
        """Create and connect the ResultSender used for reading stored buzzes.

        Raises:
            effect.effecterror.UnknownError: If the settings cannot be read.
        """
        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(
                str(e) + '\n' + 'effect.ini setting error') from e

        database = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
        database.connect()
        return database

    def start(self):
        """Crawl the post, compute statistics and send them to the database.

        Raises:
            effect.effecterror.DeletedUrlError: The crawl got an HTTP error.
            effect.effecterror.OutDatedCrawler: Any other crawl failure, or
                the parsed body has no ``article_id``.
            effect.effecterror.UnknownError: Statistics or settings failure.
            effect.effecterror.DBQueryError: Sending the result failed.
        """
        try:
            content = InstaContent(self.url, {}, self.url)
            body = content.get_body()
            replies = content.get_reply()
            # Each call returns a further page; it is prepended to what has
            # been collected so far.
            while content.has_previous:
                replies = content.load_reply_more() + replies
                wait(2)
        except requests.exceptions.HTTPError as e:
            raise effect.effecterror.DeletedUrlError(str(e)) from e
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e)) from e

        # `not body` also covers a parser that returned None.
        if not body or not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")

        try:
            result = self.statistics(body, replies)
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e)) from e

        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(
                str(e) + '\n' + 'effect.ini setting error') from e

        try:
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            try:
                result['lastupdate_filter'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                result['status'] = 'OK'
                result_sender.send('stats_s1_effect', result)
            finally:
                # Release the connection even when send() fails.
                result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e)) from e

    @staticmethod
    def _int_field(body, key):
        """Return ``body[key]`` as an int; a missing or empty value counts as 0."""
        return int(body.get(key) or 0)

    def statistics(self, body, replies):
        """Build the result row for this post.

        Args:
            body: Parsed post dict. This method reads ``article_hit`` as the
                view count, ``article_order`` as the reply count and
                ``reply_url`` as the like count (presumably how the parser
                reuses its generic columns -- confirm against instaparser).
            replies: List of parsed reply dicts.

        Returns:
            Dict with ``viewcount``, ``event_num``, ``replycount``,
            ``likecount``, ``interactioncount``, ``replybuzz`` (JSON string)
            and ``engagementcount``.
        """
        reply_count = self._int_field(body, 'article_order')
        like_count = self._int_field(body, 'reply_url')

        result = {}
        result['viewcount'] = self._int_field(body, 'article_hit')
        result['event_num'] = self.event_num
        result['replycount'] = reply_count
        result['likecount'] = like_count
        result['interactioncount'] = self.get_replycount(body, replies)

        replybuzz = self.get_reply_buzz(body, replies)
        likebuzzs = self.get_like_buzz(like_count)
        totalbuzz = self.summary_reply_and_like(replybuzz, likebuzzs)
        result['replybuzz'] = json.dumps(totalbuzz, sort_keys=True)
        result['engagementcount'] = like_count + reply_count

        return result

    def get_replycount(self, body, replies):
        """Count distinct reply ``article_id`` values other than the post's own."""
        reply_ids = {reply.get('article_id', '') for reply in replies}
        reply_ids.discard(body.get('article_id'))
        return len(reply_ids)

    def get_reply_buzz(self, body, replies):
        """Return base buzzes (start_date..today) counting replies per day.

        Replies dated outside the window, or carrying no ``article_date``,
        are ignored. ``body`` is unused and kept for call compatibility.
        """
        # Take "today" once so the day list and the counters cannot disagree
        # if the call straddles midnight.
        today = datetime.datetime.now().strftime('%Y%m%d')
        reply_buzzs = self.make_dummy_buzzs(self.start_date, today)
        per_day = {buzz[BUZZ_KEY[DATE]]: 0 for buzz in reply_buzzs}

        for reply in replies:
            raw_date = reply.get('article_date')
            if not raw_date:
                continue
            day = datetime.datetime.strptime(raw_date, '%Y-%m-%d %H:%M:%S').strftime('%Y%m%d')
            if day in per_day:
                per_day[day] += 1

        running_total = 0
        for reply_buzz in reply_buzzs:
            count = per_day[reply_buzz[BUZZ_KEY[DATE]]]
            running_total += count
            reply_buzz[BUZZ_KEY[DAY]] = count
            reply_buzz[BUZZ_KEY[ACC]] = running_total

        return reply_buzzs

    def get_like_buzz(self, like_count):
        """Return base buzzes for likes, merging stored history with today.

        Args:
            like_count: Current total like count of the post.

        Raises:
            effect.effecterror.DBQueryError: On any failure while loading or
                merging the stored buzz data.
        """
        today = datetime.datetime.today().strftime('%Y%m%d')

        try:
            stored = self.database.get_buzz(self.event_num)
            history = json.loads(stored) if stored else []
            history = self.get_buzzs(history, LIKE)
            like_dummy_buzzs = self.make_dummy_buzzs(self.start_date, today)
            like_buzzs = self.fill_buzzs_into_dummy(history, like_dummy_buzzs)
            like_buzzs = self.put_today_buzz(like_buzzs, like_count)
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e)) from e

        return like_buzzs

    def make_base_buzz_instance(self, values):
        """Build a base buzz from ``[date, day_count, acc_count]``."""
        return {
            BUZZ_KEY[DATE]: values[0],
            BUZZ_KEY[DAY]: values[1],
            BUZZ_KEY[ACC]: values[2],
        }

    def make_summary_buzz_instance(self, values):
        """Build a summary buzz from ``[date, reply_buzz, like_buzz]``."""
        date, reply_buzz, like_buzz = values[0], values[1], values[2]
        return {
            BUZZ_KEY[DATE]: date,
            BUZZ_KEY[REPLY_DAY]: reply_buzz[BUZZ_KEY[DAY]],
            BUZZ_KEY[REPLY_ACC]: reply_buzz[BUZZ_KEY[ACC]],
            BUZZ_KEY[LIKE_DAY]: like_buzz[BUZZ_KEY[DAY]],
            BUZZ_KEY[LIKE_ACC]: like_buzz[BUZZ_KEY[ACC]],
        }

    def make_dummy_buzzs(self, start_date, end_date):
        """Return zeroed base buzzes for every day in the inclusive range.

        Both bounds are ``YYYYMMDD`` strings; an inverted range gives ``[]``.
        """
        current = datetime.datetime.strptime(start_date, '%Y%m%d')
        last = datetime.datetime.strptime(end_date, '%Y%m%d')
        one_day = datetime.timedelta(days=1)

        buzzs = []
        while current <= last:
            buzzs.append(self.make_base_buzz_instance([current.strftime('%Y%m%d'), 0, 0]))
            current += one_day

        return buzzs

    def put_today_buzz(self, buzzs, today_acc_buzz_count):
        """Write today's figures into the last buzz of ``buzzs``.

        The day count is the difference to the previous day's running total
        (never below 0); the running total becomes ``today_acc_buzz_count``.
        The list is copied shallowly, so the last dict is updated in place.
        An empty list is returned unchanged.
        """
        result_buzzs = buzzs.copy()
        if not result_buzzs:
            return result_buzzs

        # With a one-day window there is no previous entry; start from 0.
        previous_acc = result_buzzs[-2][BUZZ_KEY[ACC]] if len(result_buzzs) > 1 else 0
        today_buzz_count = today_acc_buzz_count - previous_acc

        result_buzzs[-1][BUZZ_KEY[DAY]] = max(today_buzz_count, 0)
        result_buzzs[-1][BUZZ_KEY[ACC]] = previous_acc + today_buzz_count

        return result_buzzs

    def fill_buzzs_into_dummy(self, buzzs, dummy):
        """Overlay stored buzzes onto the zeroed range and carry totals forward.

        NOTE(review): entries are paired by position, not by date, so the
        stored history is assumed to begin on the same day as ``dummy``.
        The dicts inside ``dummy`` are updated in place.
        """
        dummy_clone = dummy.copy()

        for dummy_buzz, real_buzz in zip(dummy_clone, buzzs):
            for key in (BUZZ_KEY[DATE], BUZZ_KEY[DAY], BUZZ_KEY[ACC]):
                dummy_buzz[key] = real_buzz[key]

        # Days without stored data inherit the previous running total so the
        # accumulated series never drops back to 0.
        acc = BUZZ_KEY[ACC]
        for previous, current in zip(dummy_clone, dummy_clone[1:]):
            if previous[acc] > 0 and current[acc] == 0:
                current[acc] = previous[acc]

        return dummy_clone

    def get_buzzs(self, buzzs, buzz_type):
        """Project summary buzzes onto base buzzes for ``buzz_type``.

        Only ``LIKE`` is handled; any other type yields an empty list.
        """
        if buzz_type != LIKE:
            return []
        return [
            self.make_base_buzz_instance([
                buzz[BUZZ_KEY[DATE]],
                buzz[BUZZ_KEY[LIKE_DAY]],
                buzz[BUZZ_KEY[LIKE_ACC]],
            ])
            for buzz in buzzs
        ]

    def get_date_list(self, buzzs):
        """Return the ``date`` value of every buzz, in order."""
        return [buzz[BUZZ_KEY[DATE]] for buzz in buzzs]

    def is_valid_data(self, reply_buzzs, like_buzzs):
        """Return True when both series cover exactly the same dates."""
        return self.get_date_list(reply_buzzs) == self.get_date_list(like_buzzs)

    def summary_reply_and_like(self, reply_buzzs, like_buzzs):
        """Zip reply and like buzzes into summary buzzes, dated by the reply side.

        Pairs are formed by position; the shorter series bounds the result.
        """
        return [
            self.make_summary_buzz_instance([reply_buzz[BUZZ_KEY[DATE]], reply_buzz, like_buzz])
            for reply_buzz, like_buzz in zip(reply_buzzs, like_buzzs)
        ]