219 lines
7.9 KiB
Python
219 lines
7.9 KiB
Python
import re
|
|
|
|
import datetime
|
|
import json
|
|
import requests
|
|
import requests.exceptions
|
|
import time
|
|
|
|
import insta.instaheaders as instaheaders
|
|
import insta.instaparser as instaparser
|
|
from effect.resultsender import ResultSender
|
|
from effect.resultsender import get_settings
|
|
|
|
from base.baseclasses import printl
|
|
from base.baseclasses import wait
|
|
|
|
import effect.effecterror
|
|
|
|
# Debug switches.
# `is_debuging` gates the verbose request/response logging in
# InstaContent.log_load_reply_more_before/after.
# NOTE(review): `is_debug` is not read anywhere in the visible code; it is
# kept because other modules may import it -- confirm before removing.
is_debuging = False
is_debug = True

# NOTE(review): not referenced in the visible code; presumably a field
# separator used by other modules -- confirm. (Spelling kept: the name is
# part of the module's public surface.)
SEPERATOR = '!@#'

# Page sizes for AJAX requests. `num_of_reply_ajax` is handed to
# instaheaders.get_form_data_for_reply() by InstaContent.load_reply_more().
# NOTE(review): `num_of_list_ajax` is not used in the visible code.
num_of_list_ajax = 24
num_of_reply_ajax = 100

# NOTE(review): the values below are not referenced in the visible code;
# by their names they look like throttling delays (seconds) and crawl limits
# for other parts of the crawler -- confirm against their users.
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10

# Passed as `timeout=` to requests.get/post and used as the default overall
# download budget of requests_get().
requests_timeout = 60

# NOTE(review): not referenced in the visible code.
num_of_retry_proxy = 5

# Instagram endpoints. `insta_query` is the POST target for loading more
# replies; `insta_body_url` is the prefix from which the post code is parsed.
# NOTE(review): `insta_url` and `insta_tag_url` are not used in the visible code.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
|
|
|
|
|
|
def requests_get(req, timeout=requests_timeout):
    """Read a streamed response body under an overall time budget.

    The per-request ``timeout`` given to ``requests`` does not bound the
    total download time, so the body is pulled in 1 KiB chunks and the
    elapsed time is checked after every chunk.

    Args:
        req: A response object exposing ``iter_content(chunk_size)`` and
            ``close()`` (callers in this file pass a ``requests`` response
            opened with ``stream=True``).
        timeout: Maximum number of seconds the whole download may take.

    Returns:
        bytes: The concatenated body.

    Raises:
        Exception: With message ``"timeout"`` when the budget is exceeded;
            the response is closed before raising.
    """
    chunks = []
    # A monotonic clock is used so that wall-clock adjustments (NTP, DST,
    # manual changes) can neither trigger nor suppress the timeout.
    deadline = time.monotonic() + timeout
    for chunk in req.iter_content(1024):
        chunks.append(chunk)
        # The check only runs once a chunk has arrived, so it bounds slow
        # trickling downloads; a fully stalled read is not interrupted here.
        if time.monotonic() > deadline:
            req.close()
            raise Exception("timeout")
    return b''.join(chunks)
|
|
|
|
|
|
class InstaContent:
    """Load one Instagram post page and page through its replies.

    Constructing an instance immediately performs the HTTP request for
    ``url`` (see ``load_url``). Afterwards ``body``/``reply`` hold what
    ``instaparser.parse_body_html`` returned, and ``load_reply_more`` can be
    called while ``has_previous`` is true to fetch further replies.

    Attributes set by this class:
        body: Parsed post body (``None`` until a page has been parsed).
        reply: Replies returned by the most recent parse.
        start_cursor: Cursor handed to the next "load more replies" request.
        has_previous: Flag returned by the parser; callers loop on it.
        cookies: Plain dict accumulating cookies across requests.
        proxies: Value passed through to ``requests`` as ``proxies=``.
    """

    def __init__(self, url, cookies, referer, proxies=None):
        """Initialise state and fetch ``url`` right away.

        Args:
            url: Post URL; must contain ``insta_body_url`` followed by the
                post code.
            cookies: Mapping of initial cookies (merged into ``self.cookies``).
            referer: Referer value used for later AJAX requests.
            proxies: Optional proxies value forwarded to ``requests``.
        """
        self.__r = None
        self.__referer = ''
        self.__code = ''
        self.body = None
        self.reply = []
        self.start_cursor = None
        self.has_previous = False
        self.cookies = {}
        self.proxies = proxies
        self.load_url(url, cookies, referer, self.proxies)

    def load_url(self, url, cookies, referer, proxies):
        """Fetch and parse the post page.

        Returns:
            tuple: ``(body, reply)`` as produced by
            ``instaparser.parse_body_html``.

        Raises:
            requests.exceptions.HTTPError: For an HTTP error status.
            RuntimeError: When no post code can be extracted from ``url``.
            Exception: ``"timeout"`` from ``requests_get``.
        """
        self.__set_cookies(cookies)
        self.__r = requests.get(
            url,
            headers=instaheaders.get_headers_for_body_html(self.cookies),
            proxies=proxies,
            timeout=requests_timeout,
            stream=True,
        )
        try:
            # Check the status before downloading the body: there is no
            # point streaming an error page.
            self.__r.raise_for_status()
            content = requests_get(self.__r)
            self.__referer = referer
            self.__code = self.__get_code(url)
            (self.body, self.reply,
             self.start_cursor, self.has_previous) = instaparser.parse_body_html(content)
            self.__set_cookies(self.__r.cookies)
        finally:
            # With stream=True the connection is only released when the
            # response is closed, so close on every path.
            self.__r.close()
        return self.body, self.reply

    def get_body(self):
        """Return the parsed post body."""
        return self.body

    def get_reply(self):
        """Return the replies from the most recent parse."""
        return self.reply

    def load_reply_more(self):
        """POST to ``insta_query`` for the next batch of replies.

        Updates ``reply``, ``start_cursor`` and ``has_previous`` from
        ``instaparser.parse_reply_ajax`` and returns the new ``reply`` value.

        Raises:
            requests.exceptions.HTTPError: For an HTTP error status.
            Exception: ``"timeout"`` from ``requests_get``.
        """
        form_data = instaheaders.get_form_data_for_reply(
            self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        self.log_load_reply_more_before(form_data, headers)
        self.__r = requests.post(
            insta_query,
            headers=headers,
            data=form_data,
            proxies=self.proxies,
            timeout=requests_timeout,
            stream=True,
        )
        try:
            self.__r.raise_for_status()
            content = requests_get(self.__r)
            self.__set_cookies(self.__r.cookies)
            (self.reply, self.start_cursor,
             self.has_previous) = instaparser.parse_reply_ajax(content)
        finally:
            # Release the streamed connection even when a step above fails.
            self.__r.close()
        self.log_load_reply_more_after()
        return self.reply

    def get_cookies(self):
        """Return the accumulated cookie dict."""
        return self.cookies

    def __get_code(self, url):
        """Extract the post code that follows ``insta_body_url`` in ``url``.

        Raises:
            RuntimeError: ``'Tag Error'`` when ``url`` does not contain the
                post URL prefix.
        """
        # re.escape keeps the dots of the prefix literal instead of letting
        # them match arbitrary characters.
        m = re.search(re.escape(insta_body_url) + "([^/]*)", url)
        if m is None:
            raise RuntimeError('Tag Error')
        return m.group(1)

    def __set_cookies(self, cookies):
        """Merge ``cookies`` (any object with ``items()``) into ``self.cookies``."""
        if not cookies:
            # None or an empty mapping: nothing to merge.
            return
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_proxy(self):
        """Return the proxies value given at construction."""
        return self.proxies

    def log_load_reply_more_before(self, form_data, headers):
        """Print request details before a reply request when ``is_debuging``."""
        if not is_debuging:
            return
        printl("<ContentReply Start>")
        printl("<ContentReply requests>")
        # str() because the cursor may be None and form_data need not be a
        # string; plain concatenation would raise TypeError.
        printl('start_cursor = ' + str(self.start_cursor))
        printl('form_data' + str(form_data))
        printl('headers = ', end=' ')
        printl(headers)

    def log_load_reply_more_after(self):
        """Print response details after a reply request when ``is_debuging``."""
        if not is_debuging:
            return
        printl("<ContentReply response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('start_cursor = ' + str(self.start_cursor))
        printl('has_previous = ', end='')
        printl(self.has_previous)
        printl('proxies = ', end='')
        printl(self.proxies)
        printl("<ContentReply End>")
|
|
|
|
|
|
class EffectInsta(object):
    """Crawl one Instagram post, compute statistics and upload them.

    ``start`` is the entry point: it crawls ``url`` with ``InstaContent``,
    builds a statistics dict via ``statistics`` and sends it with
    ``ResultSender`` to ``'stats_s1_effect'``.
    """

    def __init__(self, event_num, event_code, url):
        """Store the event identifiers and the post URL.

        NOTE(review): ``event_code`` is stored but not read in this class.
        """
        self.event_num = event_num
        self.event_code = event_code
        self.url = url

    def start(self):
        """Run crawl -> statistics -> upload.

        Raises:
            effect.effecterror.DeletedUrlError: HTTP error while crawling.
            effect.effecterror.OutDatedCrawler: Any other crawl failure, or
                a crawl that produced no ``article_id``.
            effect.effecterror.UnknownError: Statistics or settings failure.
            effect.effecterror.DBQueryError: Failure while sending results.
        """
        body, replies = self._crawl()

        # A missing body is treated like a body without article_id instead
        # of surfacing as an unmapped AttributeError.
        if not body or not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")

        try:
            result = self.statistics(body, replies)
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e)) from e

        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(
                str(e) + '\n' + 'effect.ini setting error') from e

        self._send(result, cg)

    def _crawl(self):
        """Fetch the post and all earlier replies; map failures to effect errors."""
        try:
            content = InstaContent(self.url, {}, self.url)
            body = content.get_body()
            replies = content.get_reply()
            while content.has_previous:
                # Each newly loaded batch is placed in front of what we have.
                replies = content.load_reply_more() + replies
                wait(2)
        except requests.exceptions.HTTPError as e:
            raise effect.effecterror.DeletedUrlError(str(e)) from e
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e)) from e
        return body, replies

    def _send(self, result, cg):
        """Stamp ``result`` and send it; any failure becomes DBQueryError."""
        try:
            if cg:
                result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name'])
            else:
                result_sender = ResultSender()
            result_sender.connect()
            try:
                result['lastupdate_filter'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                result['status'] = 'OK'
                result_sender.send('stats_s1_effect', result)
            finally:
                # Close once connected, even when send() fails.
                result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e)) from e

    @staticmethod
    def _to_int(value):
        """Convert a crawled counter to int, treating None/empty as 0."""
        return int(value or 0)

    def statistics(self, body, replies):
        """Build the statistics dict for one post.

        Args:
            body: Mapping for the post; reads ``article_hit``,
                ``article_order``, ``reply_url``, ``article_id`` and
                ``article_date``.
            replies: Iterable of reply mappings (``article_id``,
                ``article_date``).

        Returns:
            dict: viewcount, event_num, replycount, likecount,
            interactioncount, replybuzz (JSON string) and engagementcount.
        """
        result = {}
        result['viewcount'] = self._to_int(body.get('article_hit'))
        result['event_num'] = self.event_num
        # NOTE(review): reply and like counts are read from the
        # 'article_order' and 'reply_url' fields as in the original code;
        # presumably the parser reuses those generic fields -- confirm.
        result['replycount'] = self._to_int(body.get('article_order'))
        result['likecount'] = self._to_int(body.get('reply_url'))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result['likecount'] + result['replycount']
        return result

    def get_replycount(self, body, replies):
        """Count distinct reply ``article_id`` values, excluding the post's own.

        Replies without an ``article_id`` all count as the single id ``''``.
        """
        reply_ids = {reply.get('article_id', '') for reply in replies}
        reply_ids.discard(body.get('article_id'))
        return len(reply_ids)

    def get_reply_buzz(self, body, replies):
        """Return per-day reply counts from the post date until today.

        Returns:
            str: JSON array of ``{"date": "YYYYMMDD", "value": count}``
            objects, one per day from ``body['article_date']`` to the current
            date inclusive. Replies dated outside that range are ignored.

        Raises:
            KeyError: When ``body`` has no ``article_date``.
            ValueError: When a date string is not ``%Y-%m-%d %H:%M:%S``.
        """
        stamp_format = '%Y-%m-%d %H:%M:%S'
        # One key format for both building and looking up buckets; using
        # different formats would leave every count at zero.
        key_format = '%Y%m%d'

        day = datetime.datetime.strptime(body['article_date'], stamp_format).date()
        today = datetime.datetime.now().date()
        counts = {}
        while day <= today:
            counts[day.strftime(key_format)] = 0
            day += datetime.timedelta(days=1)

        for reply in replies:
            stamp = reply.get('article_date', '1990-01-01 00:00:00')
            key = datetime.datetime.strptime(stamp, stamp_format).strftime(key_format)
            if key in counts:
                counts[key] += 1

        json_array = [{'date': k, 'value': v} for k, v in counts.items()]
        return json.dumps(json_array, sort_keys=True)
|
|
|
|
|