Files
clients/WebBasedCrawler/effect/effectinstagram.py

441 lines
17 KiB
Python

import re
import datetime
import json
import requests
import requests.exceptions
import time
import bs4
import insta.instaheaders as instaheaders
import insta.instaparser as instaparser
from effect.resultsender import ResultSender
from effect.resultsender import get_settings
from base.baseclasses import printl
from base.baseclasses import wait
import effect.effecterror
# Debug switches; is_debuging gates the ContentReply trace logs in InstaContent.
is_debuging = False
is_debug = True
# Field separator token used by the surrounding crawler framework
# (spelling "SEPERATOR" kept as-is: it may be imported by other modules).
SEPERATOR = '!@#'
# Page sizes for Instagram AJAX list / reply requests.
num_of_list_ajax = 24
num_of_reply_ajax = 100
# Politeness delays between requests, in seconds.
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
# Hard wall-clock limit (seconds) for one HTTP download; see requests_get().
requests_timeout = 60
num_of_retry_proxy = 5
# Instagram endpoint roots; insta_body_url is also used to extract post codes.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
# Indexes into BUZZ_KEY below. REPLY and LIKE fall outside BUZZ_KEY's range:
# they are used only as buzz-type tags (see EffectInsta.get_buzzs).
DATE = 0
REPLY_DAY = 1
REPLY_ACC = 2
LIKE_DAY = 3
LIKE_ACC = 4
DAY = 5
ACC = 6
REPLY = 7
LIKE = 8
# Dict keys used by the buzz-record builder methods of EffectInsta.
BUZZ_KEY = [
    "date", "reply_day", "reply_acc", "like_day", "like_acc", "day", "acc"
]
def requests_get(req, timeout=None):
    """Drain a streaming ``requests`` response body under a wall-clock limit.

    Args:
        req: streaming response object exposing ``iter_content`` and ``close``.
        timeout: maximum seconds for the whole download; ``None`` means the
            module-level ``requests_timeout`` (late-bound at call time so a
            runtime change to the setting takes effect).

    Returns:
        The complete body as ``bytes``.

    Raises:
        TimeoutError: when the limit is exceeded (previously a bare
            ``Exception``; callers catch ``Exception`` so this stays
            compatible). The response is closed before raising.
    """
    if timeout is None:
        timeout = requests_timeout
    chunks = []
    deadline = time.time() + timeout
    for chunk in req.iter_content(1024):
        chunks.append(chunk)
        # NOTE: the limit is only checked between chunks, so one blocking
        # read can still overrun it.
        if time.time() > deadline:
            req.close()
            raise TimeoutError("timeout")
    return b''.join(chunks)
class InstaContent:
    """One crawled Instagram post.

    Fetches the post page on construction, parses the body and first comment
    page, then pages backwards through older comments via the GraphQL
    endpoint. Every public method here performs live HTTP requests through
    ``requests``; payload parsing is delegated to ``insta.instaparser``.
    """
    def __init__(self, url, cookies, referer, proxies=None):
        # Most recent requests.Response (streamed; closed after each fetch).
        self.__r = None
        self.__referer = ''
        # Post shortcode taken from the /p/<code> part of the URL.
        self.__code = ''
        self.body = None
        self.reply = []
        # GraphQL paging cursor pointing at the next (older) comment page.
        self.start_cursor = None
        self.has_previous = False
        # Plain-dict cookie jar, merged across requests by __set_cookies.
        self.cookies = {}
        self.proxies = proxies
        # Discovered lazily by find_query_id() on the first load_reply_more().
        self.query_id = ''
        self.content = ''
        # Fetch immediately: __init__ performs live network I/O.
        self.load_url(url, cookies, referer, self.proxies)
    def load_url(self, url, cookies, referer, proxies):
        """GET the post page, parse body/replies, absorb response cookies.

        Returns (body, reply); both are also stored on self.
        """
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        self.content = requests_get(self.__r)
        # NOTE: raise_for_status() runs after the body has been drained.
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        # self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()
        return self.body, self.reply
    def get_body(self):
        # Parsed post dict (populated by load_url).
        return self.body
    def get_reply(self):
        # All replies parsed so far (grows via load_reply_more).
        return self.reply
    def get_query_ids(self, html):
        """Scrape candidate GraphQL query ids from the *_Commons.js bundle
        referenced by the post page markup."""
        doc = bs4.BeautifulSoup(html, "html.parser")
        query_ids = []
        for script in doc.find_all("script"):
            if script.has_attr("src") and "_Commons.js" in script['src']:
                text = requests.get("%s%s" % ('https://www.instagram.com', script['src'])).text
                # Query ids are exactly 17 digits following 'queryId:"'.
                for query_id in re.findall("(?<=queryId:\")[0-9]{17,17}", text):
                    query_ids.append(query_id)
        return query_ids
    def find_query_id(self):
        """Probe each candidate id against the GraphQL endpoint and return
        the first one the server accepts ('' when none work)."""
        potential_query_ids = self.get_query_ids(self.content)
        query_id = ''
        for potential_id in potential_query_ids:
            url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
                potential_id, self.__code, len(self.reply), self.start_cursor)
            try:
                data = requests.get(url).json()
                if data['status'] == 'ok':
                    query_id = potential_id
                    break
            except Exception:
                # no valid JSON retured, most likely wrong query_id resulting in 'Oops, an error occurred.'
                pass
        return query_id
    def load_reply_more(self):
        """Fetch the next (older) page of comments via GraphQL.

        Appends the new page to self.reply and updates the paging cursor.
        Returns only the newly fetched replies (the caller decides ordering).
        """
        if not self.query_id:
            self.query_id = self.find_query_id()
        url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
            self.query_id, self.__code, len(self.reply), self.start_cursor)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()
        self.reply = self.reply+reply
        # printl('{} - reply : {}'.format(self.__referer, len(self.reply)))
        return reply
    def get_cookies(self):
        # Accumulated cookie jar (plain dict).
        return self.cookies
    def __get_code(self, url):
        """Extract the post shortcode from a .../p/<code>/ URL.

        Raises:
            RuntimeError: when the URL does not contain the /p/ segment.
        """
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')
    def __set_cookies(self, cookies):
        # Merge new cookies into the jar, overwriting duplicate keys.
        for k, v in cookies.items():
            self.cookies[k] = v
    def get_proxy(self):
        return self.proxies
    def log_load_reply_more_before(self, form_data, headers):
        # Debug-only trace of the outgoing paging request.
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            printl('start_cursor = ' + self.start_cursor)
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
    def log_load_reply_more_after(self):
        # Debug-only trace of the paging response state.
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("<ContentReply End>")
class EffectInsta(object):
def __init__(self, event_num, event_code, url, start_date):
    """Prepare one effect-measurement job for a single Instagram post.

    Args:
        event_num: numeric event id written into every result row.
        event_code: event code (stored; not read within the visible methods).
        url: post URL handed to InstaContent.
        start_date: 'YYYY-MM-DD' string; stored compacted as 'YYYYMMDD'.
    """
    self.event_num = event_num
    self.event_code = event_code
    self.url = url
    self.start_date = start_date.replace("-", "")
    # Opens a live DB connection immediately (see database_init).
    self.database = self.database_init()
def database_init(self):
    """Create and connect a ResultSender from effect.ini settings.

    Returns:
        A connected ResultSender.

    Raises:
        effect.effecterror.UnknownError: when the settings cannot be read.
    """
    try:
        cg = get_settings()
    except Exception as e:
        raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
    # Falsy settings fall back to ResultSender's built-in defaults.
    database = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
    database.connect()
    return database
def start(self):
    """Crawl the post, page through all replies, compute stats, persist them.

    Raises:
        effect.effecterror.DeletedUrlError: HTTP error while fetching the post.
        effect.effecterror.OutDatedCrawler: parse failure or nothing crawled.
        effect.effecterror.UnknownError: statistics or settings failure.
        effect.effecterror.DBQueryError: failure while writing the result row.
    """
    #content = insta.instacrawl.InstaContent(self.url, {}, self.url)
    try:
        content = InstaContent(self.url, {}, self.url)
        body = content.get_body()
        replies = content.get_reply()
        # Older pages are prepended so `replies` stays oldest-first.
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(2)
    except requests.exceptions.HTTPError as e:
        raise effect.effecterror.DeletedUrlError(str(e))
    except Exception as e:
        # Any non-HTTP failure is treated as a crawler/markup mismatch.
        raise effect.effecterror.OutDatedCrawler(str(e))
    if not body.get('article_id', ''):
        raise effect.effecterror.OutDatedCrawler("NoData Crawled")
    try:
        result = self.statistics(body, replies)
    except Exception as e:
        raise effect.effecterror.UnknownError(str(e)+'statistic')
    #pprint.pprint(body)
    #pprint.pprint(replies)
    #pprint.pprint(result)
    try:
        cg = get_settings()
    except Exception as e:
        raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
    try:
        # NOTE(review): a fresh sender is built here instead of reusing
        # self.database from __init__ -- presumably intentional; confirm.
        result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
        result_sender.connect()
        result['lastupdate_filter'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        result['status'] = 'OK'
        result_sender.send('stats_s1_effect', result)
        result_sender.close()
    except Exception as e:
        raise effect.effecterror.DBQueryError(str(e))
def statistics(self, body, replies):
result = {}
result['viewcount'] = int(body.get('article_hit', 0))
result['event_num'] = self.event_num
result['replycount'] = int(body.get('article_order'), 0)
result['likecount'] = int(body.get('reply_url'), 0)
result['interactioncount'] = self.get_replycount(body, replies)
replybuzz = self.get_reply_buzz(body, replies)
likebuzzs = self.get_like_buzz(int(body.get('reply_url'), 0))
totalbuzz = self.summary_reply_and_like(replybuzz, likebuzzs)
result['replybuzz'] = json.dumps(totalbuzz, sort_keys=True)
result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
return result
def get_replycount(self, body, replies):
    """Count distinct reply article ids, excluding the post's own id.

    Args:
        body: parsed post dict (its 'article_id' is excluded if present).
        replies: list of parsed reply dicts.

    Returns:
        Number of unique reply ids, minus one when the post id appears
        among them.
    """
    unique_ids = {item.get('article_id', '') for item in replies}
    if body.get('article_id') in unique_ids:
        return len(unique_ids) - 1
    return len(unique_ids)
# def get_reply_buzz(self, body, replies):
# start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
# end_date = datetime.datetime.now().date()
# date_dict = dict()
# while start_date <= end_date:
# date_dict[start_date.strftime('%Y%m%d')] = 0
# start_date = start_date + datetime.timedelta(days=1)
#
# for reply in replies:
# str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
# reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
# print(reply_date)
# if reply_date in date_dict:
# date_dict[reply_date] = date_dict[reply_date] + 1
#
# print(date_dict)
#
# json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
#
# return json.dumps(json_array, sort_keys=True)
def get_reply_buzz(self, body, replies):
    """Build per-day reply counts with a running accumulation.

    Covers every calendar day from self.start_date through today.

    Args:
        body: parsed post dict (unused here; kept for interface symmetry).
        replies: parsed reply dicts with 'article_date' timestamps.

    Returns:
        List of {'date', 'day', 'acc'} dicts, one per day, oldest first.
    """
    start_date = datetime.datetime.strptime(self.start_date, '%Y%m%d').date()
    today = datetime.datetime.now().date()
    date_dict = dict()
    while start_date <= today:
        date_dict[start_date.strftime('%Y%m%d')] = 0
        start_date = start_date + datetime.timedelta(days=1)
    for reply in replies:
        # BUG FIX: a reply missing 'article_date' made strptime(None) raise
        # TypeError. Fall back to an epoch-like date that simply lands
        # outside the tracked window (same default the earlier commented-out
        # implementation of this method used).
        str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
        reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
        if reply_date in date_dict:
            date_dict[reply_date] = date_dict[reply_date] + 1
    reply_buzzs = self.make_dummy_buzzs(self.start_date, datetime.datetime.today().strftime('%Y%m%d'))
    reply_acc_count = 0
    for reply_buzz in reply_buzzs:
        date = reply_buzz[BUZZ_KEY[DATE]]
        # .get guards the unlikely case where the dummy range crosses
        # midnight relative to date_dict built above.
        reply_count = date_dict.get(date, 0)
        reply_acc_count += reply_count
        reply_buzz[BUZZ_KEY[DAY]] = reply_count
        reply_buzz[BUZZ_KEY[ACC]] = reply_acc_count
    return reply_buzzs
def get_like_buzz(self, like_count):
    """Load stored like-buzz history and overwrite today's total.

    Args:
        like_count: today's accumulated like count from the crawled post.

    Returns:
        List of {'date', 'day', 'acc'} dicts from self.start_date to today.

    Raises:
        effect.effecterror.DBQueryError: on any failure while reading or
            reshaping the stored buzz JSON (not only genuine DB errors).
    """
    today = datetime.datetime.today().strftime('%Y%m%d')
    try:
        buzzs = self.database.get_buzz(self.event_num)
        # PEP 8: identity comparison for None (was `buzzs != None`).
        if buzzs is not None:
            buzzs = json.loads(buzzs)
        else:
            buzzs = []
        buzzs = self.get_buzzs(buzzs, LIKE)
        like_dummy_buzzs = self.make_dummy_buzzs(self.start_date, today)
        like_buzzs = self.fill_buzzs_into_dummy(buzzs, like_dummy_buzzs)
        like_buzzs = self.put_today_buzz(like_buzzs, like_count)
    except Exception as e:
        raise effect.effecterror.DBQueryError(str(e))
    return like_buzzs
def make_base_buzz_instance(self, values):
    """Build one day-level buzz record.

    Args:
        values: sequence of [date_string, day_count, acc_count].

    Returns:
        dict keyed by BUZZ_KEY date/day/acc entries.
    """
    return {
        BUZZ_KEY[DATE]: values[0],
        BUZZ_KEY[DAY]: values[1],
        BUZZ_KEY[ACC]: values[2],
    }
def make_summary_buzz_instance(self, values):
    """Merge one reply-buzz and one like-buzz record for a single date.

    Args:
        values: sequence of [date_string, reply_buzz_dict, like_buzz_dict].

    Returns:
        dict with date plus reply/like day and accumulated counts.
    """
    return {
        BUZZ_KEY[DATE]: values[0],
        BUZZ_KEY[REPLY_DAY]: values[1][BUZZ_KEY[DAY]],
        BUZZ_KEY[REPLY_ACC]: values[1][BUZZ_KEY[ACC]],
        BUZZ_KEY[LIKE_DAY]: values[2][BUZZ_KEY[DAY]],
        BUZZ_KEY[LIKE_ACC]: values[2][BUZZ_KEY[ACC]],
    }
def make_dummy_buzzs(self, start_date, end_date):
startdate = datetime.datetime.strptime(start_date, '%Y%m%d')
enddate = datetime.datetime.strptime(end_date, '%Y%m%d')
buzzs = []
while startdate <= enddate:
buzz_instance = self.make_base_buzz_instance([startdate.strftime('%Y%m%d'), 0, 0])
buzzs.append(buzz_instance)
startdate += datetime.timedelta(days=1)
return buzzs
def put_today_buzz(self, buzzs, today_acc_buzz_count):
    """Overwrite the final (today's) record with the live accumulated count.

    NOTE(review): the copy is shallow, so the dicts inside ``buzzs`` are
    mutated in place -- callers appear to use only the returned list.

    Args:
        buzzs: day-buzz records ending at today (may be empty).
        today_acc_buzz_count: accumulated count observed right now.

    Returns:
        The (shallow-copied) list with today's day/acc values refreshed.
    """
    today = datetime.date.today().strftime('%Y%m%d')
    result_buzzs = list(buzzs)
    if not result_buzzs:
        result_buzzs.append({
            BUZZ_KEY[ACC]: today_acc_buzz_count,
            BUZZ_KEY[DAY]: today_acc_buzz_count,
            BUZZ_KEY[DATE]: today,
        })
        return result_buzzs
    last = result_buzzs[-1]
    last[BUZZ_KEY[ACC]] = today_acc_buzz_count
    if len(result_buzzs) == 1:
        last[BUZZ_KEY[DAY]] = today_acc_buzz_count
    else:
        # Accumulated counts can shrink (e.g. likes removed); clamp at zero.
        daily = today_acc_buzz_count - result_buzzs[-2][BUZZ_KEY[ACC]]
        last[BUZZ_KEY[DAY]] = max(daily, 0)
    return result_buzzs
def fill_buzzs_into_dummy(self, buzzs, dummy):
    """Overlay stored buzz records onto a zeroed dummy range, then carry
    accumulated totals forward across days with no stored record.

    NOTE(review): the overlay zips positionally, so it assumes ``buzzs``
    starts on the same date as ``dummy`` -- confirm against get_like_buzz.

    Args:
        buzzs: stored day-buzz records (may be shorter than dummy).
        dummy: zeroed records covering the full date range.

    Returns:
        The (shallow-copied) dummy list with values filled in; the dicts
        inside ``dummy`` are mutated in place.
    """
    buzzs_clone = buzzs.copy()
    dummy_clone = dummy.copy()
    for dummy_buzz, real_buzz in zip(dummy_clone, buzzs_clone):
        dummy_buzz[BUZZ_KEY[DATE]] = real_buzz[BUZZ_KEY[DATE]]
        dummy_buzz[BUZZ_KEY[DAY]] = real_buzz[BUZZ_KEY[DAY]]
        dummy_buzz[BUZZ_KEY[ACC]] = real_buzz[BUZZ_KEY[ACC]]
    # FIX: start at index 1 so the first iteration no longer evaluates
    # dummy_clone[-1] (a wraparound read the old trailing `previous_index
    # >= 0` guard merely neutralized). Behavior is unchanged.
    for index in range(1, len(dummy_clone)):
        previous_acc_value = dummy_clone[index - 1][BUZZ_KEY[ACC]]
        if previous_acc_value > 0 and dummy_clone[index][BUZZ_KEY[ACC]] == 0:
            dummy_clone[index][BUZZ_KEY[ACC]] = previous_acc_value
    return dummy_clone
def get_buzzs(self, buzzs, buzz_type):
    """Project stored summary rows down to base {'date','day','acc'} records.

    Args:
        buzzs: stored summary records with per-type day/acc fields.
        buzz_type: only LIKE is implemented; any other type yields [].

    Returns:
        List of base buzz records for the requested type.
    """
    if buzz_type != LIKE:
        return []
    return [
        self.make_base_buzz_instance([
            row[BUZZ_KEY[DATE]],
            row[BUZZ_KEY[LIKE_DAY]],
            row[BUZZ_KEY[LIKE_ACC]],
        ])
        for row in buzzs
    ]
def is_valid_data(self, reply_buzzs, like_buzzs):
    """Return True when the two buzz series cover exactly the same dates.

    NOTE(review): relies on self.get_date_list, which is defined outside
    this excerpt; currently only referenced from commented-out code in
    summary_reply_and_like.
    """
    return self.get_date_list(reply_buzzs) == self.get_date_list(like_buzzs)
def summary_reply_and_like(self, reply_buzzs, like_buzzs):
    """Zip reply and like day-series into combined summary records.

    NOTE(review): the positional zip assumes both series are date-aligned;
    the is_valid_data check remains disabled, as in the original.

    Args:
        reply_buzzs: base buzz records for replies.
        like_buzzs: base buzz records for likes.

    Returns:
        List of summary records, one per paired day.
    """
    return [
        self.make_summary_buzz_instance([
            reply_buzz[BUZZ_KEY[DATE]],
            reply_buzz,
            like_buzz,
        ])
        for reply_buzz, like_buzz in zip(reply_buzzs, like_buzzs)
    ]