Files
clients/WebBasedCrawler/effect/effectinstagram.py
admin 40f29bdf51 effect
git-svn-id: svn://192.168.0.12/source@335 8346c931-da38-4b9b-9d4c-e48b93cbd075
2017-01-06 07:21:32 +00:00

219 lines
7.9 KiB
Python

import re
import datetime
import json
import requests
import requests.exceptions
import time
import insta.instaheaders as instaheaders
import insta.instaparser as instaparser
from effect.resultsender import ResultSender
from effect.resultsender import get_settings
from base.baseclasses import printl
from base.baseclasses import wait
import effect.effecterror
# --- module configuration -------------------------------------------------
is_debuging = False          # verbose request/response tracing in InstaContent (misspelled name kept for compatibility)
is_debug = True              # general debug flag used elsewhere in the effect package
SEPERATOR = '!@#'            # field separator token (misspelled name kept for compatibility)
num_of_list_ajax = 24        # page size for tag/list AJAX requests
num_of_reply_ajax = 100      # comment batch size requested per AJAX paging call
list_wait_sec = 0.9          # polite delays (seconds) between the different request types
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
requests_timeout = 60        # wall-clock seconds allowed for a whole streamed download
num_of_retry_proxy = 5
# Instagram endpoints used by the crawler.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'


def requests_get(req, timeout=requests_timeout):
    """Read the full body of a streamed response under a wall-clock deadline.

    requests' own ``timeout`` only bounds connect/read of individual chunks;
    this enforces a limit on the *total* download time.

    :param req: a streamed response object exposing ``iter_content(size)``
                and ``close()`` (normally a ``requests.Response``).
    :param timeout: seconds allowed for the whole download.
    :return: the complete body as ``bytes``.
    :raises TimeoutError: when the deadline passes; the connection is closed
            first so the socket is not leaked.  (Was a bare ``Exception`` —
            ``TimeoutError`` is more specific and still caught by the broad
            ``except Exception`` handlers in callers.)
    """
    deadline = time.time() + timeout  # hoisted: computed once, not per chunk
    body = []
    for chunk in req.iter_content(1024):
        body.append(chunk)
        if time.time() > deadline:
            req.close()
            raise TimeoutError("timeout")
    return b''.join(body)
class InstaContent:
    """One Instagram post: downloads the page, parses body and comments, and
    pages backwards through older comment batches via the query AJAX endpoint.

    Attributes:
        body:          parsed post fields (whatever instaparser returns) or None.
        reply:         list of parsed comments collected so far.
        start_cursor:  paging cursor for the next (older) comment batch.
        has_previous:  True while more comment pages remain.
        cookies:       accumulated cookie jar, re-sent with every request.
        proxies:       optional requests-style proxy mapping.
    """

    def __init__(self, url, cookies, referer, proxies=None):
        self.__r = None        # last streamed requests.Response
        self.__referer = ''    # referer sent with AJAX paging calls
        self.__code = ''       # post shortcode extracted from the URL
        self.body = None
        self.reply = []
        self.start_cursor = None
        self.has_previous = False
        self.cookies = {}
        self.proxies = proxies
        # Fetch immediately; construction implies a network round-trip.
        self.load_url(url, cookies, referer, self.proxies)

    def load_url(self, url, cookies, referer, proxies):
        """GET the post page, parse it, and return ``(body, reply)``.

        :raises requests.exceptions.HTTPError: on 4xx/5xx responses.
        """
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        # Stream the body ourselves so the download obeys a total wall-clock
        # timeout (requests' timeout only bounds individual socket reads).
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def load_reply_more(self):
        """POST the query endpoint for the next (older) batch of comments.

        Replaces ``self.reply`` with the new batch (callers accumulate) and
        updates ``start_cursor`` / ``has_previous``.
        """
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        self.log_load_reply_more_before(form_data, headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
                                 timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(content)
        self.__r.close()
        self.log_load_reply_more_after()
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        """Extract the post shortcode from a ``.../p/<code>/`` URL."""
        # BUG FIX: escape the base URL so its '.' characters match literally
        # instead of acting as regex wildcards.
        m = re.search(re.escape(insta_body_url) + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        # Merge new cookies over the accumulated jar (later values win).
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_proxy(self):
        return self.proxies

    def log_load_reply_more_before(self, form_data, headers):
        """Debug trace of the outgoing AJAX paging request (no-op unless is_debuging)."""
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            # BUG FIX: start_cursor may be None and form_data is not
            # necessarily a str — the original '+' concatenations raised
            # TypeError whenever debug tracing was enabled.
            printl('start_cursor = ' + str(self.start_cursor))
            printl('form_data = ', end='')
            printl(form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_reply_more_after(self):
        """Debug trace of the AJAX paging response (no-op unless is_debuging)."""
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("<ContentReply End>")
class EffectInsta(object):
    """Crawls one Instagram post, computes engagement statistics, and stores
    the result row in the ``stats_s1_effect`` table.
    """

    def __init__(self, event_num, event_code, url):
        self.event_num = event_num    # campaign/event identifier stored with the stats
        self.event_code = event_code  # kept for the caller's bookkeeping
        self.url = url                # full URL of the Instagram post to crawl

    def start(self):
        """Crawl the post, aggregate statistics, and write them to the DB.

        :raises effect.effecterror.DeletedUrlError: HTTP error (post removed/private).
        :raises effect.effecterror.OutDatedCrawler: parse failure or empty result.
        :raises effect.effecterror.UnknownError:    statistics or settings failure.
        :raises effect.effecterror.DBQueryError:    DB connect/insert failure.
        """
        try:
            content = InstaContent(self.url, {}, self.url)
            body = content.get_body()
            replies = content.get_reply()
            # Page backwards through older comment batches; prepend so the
            # final list runs oldest-first.
            while content.has_previous:
                replies = content.load_reply_more() + replies
                wait(2)
        except requests.exceptions.HTTPError as e:
            # 4xx/5xx usually means the post was deleted or made private.
            raise effect.effecterror.DeletedUrlError(str(e))
        except Exception as e:
            # Anything else is treated as a site-layout/parser change.
            raise effect.effecterror.OutDatedCrawler(str(e))
        if not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")
        try:
            result = self.statistics(body, replies)
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e))
        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
        try:
            result_sender = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
            result_sender.connect()
            result['lastupdate_filter'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            result['status'] = 'OK'
            result_sender.send('stats_s1_effect', result)
            result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e))

    def statistics(self, body, replies):
        """Build the statistics row from the parsed body and comment list.

        :param body: parsed post dict (keys like ``article_hit``, ``article_order``).
        :param replies: list of parsed comment dicts.
        :return: dict of counters ready for ``ResultSender.send``.
        """
        result = {}
        result['viewcount'] = int(body.get('article_hit', 0))
        result['event_num'] = self.event_num
        # BUG FIX: was int(body.get('article_order'), 0) — the 0 was passed as
        # the int() *base* argument instead of the .get() default, raising
        # TypeError whenever the key was missing or already an int.
        result['replycount'] = int(body.get('article_order', 0))
        result['likecount'] = int(body.get('reply_url', 0))
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
        return result

    def get_replycount(self, body, replies):
        """Count distinct commenter ids, excluding the post author's own id."""
        set_reply_id = set()
        for i in replies:
            set_reply_id.add(i.get('article_id', ''))
        return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)

    def get_reply_buzz(self, body, replies):
        """Per-day comment counts from the post date through today, as JSON.

        :return: JSON array of ``{'date': 'YYYYMMDD', 'value': count}`` items.
        """
        start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
        end_date = datetime.datetime.now().date()
        date_dict = dict()
        while start_date <= end_date:
            date_dict[start_date.strftime('%Y%m%d')] = 0
            start_date = start_date + datetime.timedelta(days=1)
        for reply in replies:
            str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
            # BUG FIX: must use the same '%Y%m%d' key format as date_dict.
            # The original formatted lookups as '%m-%d-%Y', so no reply ever
            # matched and the buzz histogram was always all zeros.
            reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
            if reply_date in date_dict:
                date_dict[reply_date] = date_dict[reply_date] + 1
        json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
        return json.dumps(json_array, sort_keys=True)