"""Crawl a single Instagram post and report its engagement statistics.

``InstaContent`` downloads a post page (and, page by page, its older
replies) and hands the raw bytes to ``insta.instaparser``.  ``EffectInsta``
drives one crawl, derives counters from the parsed data and sends them to
the result store through ``ResultSender``.
"""
import re
import datetime
import json
import requests
import requests.exceptions
import time
import insta.instaheaders as instaheaders
import insta.instaparser as instaparser
from effect.resultsender import ResultSender
from effect.resultsender import get_settings
from base.baseclasses import printl
from base.baseclasses import wait
import effect.effecterror

# Verbose request/response tracing in InstaContent.load_reply_more.
is_debuging = False
is_debug = True

SEPERATOR = '!@#'
num_of_list_ajax = 24
num_of_reply_ajax = 100
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
requests_timeout = 60
num_of_retry_proxy = 5

insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'


def requests_get(req, timeout=requests_timeout):
    """Read the whole body of a streamed response under a total time budget.

    ``timeout`` on ``requests.get`` only bounds individual socket
    operations; this helper bounds the complete download instead.

    :param req: a response object opened with ``stream=True``.
    :param timeout: maximum number of seconds the download may take.
    :return: the response body as ``bytes``.
    :raises Exception: with message ``"timeout"`` when the budget is
        exceeded; the response is closed before raising.
    """
    chunks = []
    deadline = time.time() + timeout
    for chunk in req.iter_content(1024):
        chunks.append(chunk)
        if time.time() > deadline:
            req.close()
            raise Exception("timeout")
    return b''.join(chunks)


class InstaContent:
    """One Instagram post: its parsed body, its replies and paging state.

    Constructing an instance immediately downloads and parses ``url``.
    """

    def __init__(self, url, cookies, referer, proxies=None):
        """Download and parse the post at ``url``.

        :param url: post URL; must start with ``insta_body_url``.
        :param cookies: mapping of cookies to send with the request.
        :param referer: value later used as referer for reply requests.
        :param proxies: optional proxies mapping passed to ``requests``.
        """
        self.__r = None
        self.__referer = ''
        self.__code = ''
        self.body = None
        self.reply = []
        self.start_cursor = None
        self.has_previous = False
        self.cookies = {}
        self.proxies = proxies
        self.load_url(url, cookies, referer, self.proxies)

    def load_url(self, url, cookies, referer, proxies):
        """Fetch the post page and parse body, replies and paging cursor.

        :return: tuple ``(body, reply)`` as produced by
            ``instaparser.parse_body_html``.
        :raises requests.exceptions.HTTPError: on a 4xx/5xx response.
        :raises RuntimeError: when no post code can be found in ``url``.
        """
        self.__set_cookies(cookies)
        self.__r = requests.get(
            url,
            headers=instaheaders.get_headers_for_body_html(self.cookies),
            proxies=proxies,
            timeout=requests_timeout,
            stream=True,
        )
        # The response is streamed, so it must be closed on every path,
        # including HTTP errors and parser failures.
        try:
            content = requests_get(self.__r)
            self.__r.raise_for_status()
            self.__referer = referer
            self.__code = self.__get_code(url)
            (self.body, self.reply,
             self.start_cursor, self.has_previous) = instaparser.parse_body_html(content)
            self.__set_cookies(self.__r.cookies)
        finally:
            self.__r.close()
        return self.body, self.reply

    def get_body(self):
        """Return the parsed post body."""
        return self.body

    def get_reply(self):
        """Return the most recently loaded page of replies."""
        return self.reply

    def load_reply_more(self):
        """Fetch the next (older) page of replies.

        Updates ``reply``, ``start_cursor`` and ``has_previous``.

        :return: the replies of the page just loaded.
        :raises requests.exceptions.HTTPError: on a 4xx/5xx response.
        """
        form_data = instaheaders.get_form_data_for_reply(
            self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(
            self.cookies, self.__referer, form_data)
        self.log_load_reply_more_before(form_data, headers)
        self.__r = requests.post(
            insta_query,
            headers=headers,
            data=form_data,
            proxies=self.proxies,
            timeout=requests_timeout,
            stream=True,
        )
        # Close the streamed response even when the request or parse fails.
        try:
            content = requests_get(self.__r)
            self.__r.raise_for_status()
            self.__set_cookies(self.__r.cookies)
            (self.reply, self.start_cursor,
             self.has_previous) = instaparser.parse_reply_ajax(content)
        finally:
            self.__r.close()
        self.log_load_reply_more_after()
        return self.reply

    def get_cookies(self):
        """Return the cookies accumulated over all requests so far."""
        return self.cookies

    def __get_code(self, url):
        """Extract the post code that follows ``insta_body_url`` in ``url``.

        :raises RuntimeError: ``'Tag Error'`` when ``url`` has no such part.
        """
        # Escape the prefix so that '.' in the host name matches literally.
        m = re.search(re.escape(insta_body_url) + "([^/]*)", url)
        if m is None:
            raise RuntimeError('Tag Error')
        return m.group(1)

    def __set_cookies(self, cookies):
        """Merge ``cookies`` into the stored cookie dict (new values win)."""
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_proxy(self):
        """Return the proxies mapping this instance was created with."""
        return self.proxies

    def log_load_reply_more_before(self, form_data, headers):
        """Trace the outgoing reply request when ``is_debuging`` is set."""
        if not is_debuging:
            return
        printl("")
        printl("")
        # str(): the cursor may be None and form_data need not be a string.
        printl('start_cursor = ' + str(self.start_cursor))
        printl('form_data' + str(form_data))
        printl('headers = ', end=' ')
        printl(headers)

    def log_load_reply_more_after(self):
        """Trace the paging state after a reply request (debug only)."""
        if not is_debuging:
            return
        printl("")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('start_cursor = ' + str(self.start_cursor))
        printl('has_previous = ', end='')
        printl(self.has_previous)
        printl('proxies = ', end='')
        printl(self.proxies)
        printl("")


class EffectInsta(object):
    """Crawl one post and send its statistics for a given event."""

    # Format of the 'article_date' strings produced by the parser.
    _ARTICLE_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
    # Format of the per-day keys in the reply-buzz histogram.
    _BUZZ_KEY_FORMAT = '%Y%m%d'

    def __init__(self, event_num, event_code, url):
        """Remember the event identifiers and the post ``url`` to crawl."""
        self.event_num = event_num
        self.event_code = event_code
        self.url = url

    def start(self):
        """Crawl the post, compute its statistics and store them.

        :raises effect.effecterror.DeletedUrlError: the post request got an
            HTTP error status.
        :raises effect.effecterror.OutDatedCrawler: crawling failed for any
            other reason, or no article id could be parsed.
        :raises effect.effecterror.UnknownError: computing the statistics or
            reading the settings failed.
        :raises effect.effecterror.DBQueryError: sending the result failed.
        """
        try:
            content = InstaContent(self.url, {}, self.url)
            body = content.get_body()
            replies = content.get_reply()
            # Each further page holds older replies, so prepend it.
            while content.has_previous:
                replies = content.load_reply_more() + replies
                wait(2)
        except requests.exceptions.HTTPError as e:
            raise effect.effecterror.DeletedUrlError(str(e)) from e
        except Exception as e:
            raise effect.effecterror.OutDatedCrawler(str(e)) from e

        # A missing body is reported the same way as a body without an id.
        if not body or not body.get('article_id', ''):
            raise effect.effecterror.OutDatedCrawler("NoData Crawled")

        try:
            result = self.statistics(body, replies)
        except Exception as e:
            raise effect.effecterror.UnknownError(str(e)) from e

        try:
            cg = get_settings()
        except Exception as e:
            raise effect.effecterror.UnknownError(
                str(e) + '\n' + 'effect.ini setting error') from e

        try:
            if cg:
                result_sender = ResultSender(
                    cg['host'], cg['user'], cg['pass'], cg['name'])
            else:
                result_sender = ResultSender()
            result_sender.connect()
            try:
                result['lastupdate_filter'] = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                result['status'] = 'OK'
                result_sender.send('stats_s1_effect', result)
            finally:
                # Release the connection even when send() fails.
                result_sender.close()
        except Exception as e:
            raise effect.effecterror.DBQueryError(str(e)) from e

    @staticmethod
    def _as_count(body, key):
        """Return ``body[key]`` as an int; a missing or empty value is 0."""
        return int(body.get(key) or 0)

    def statistics(self, body, replies):
        """Build the result row for one crawled post.

        The view, reply and like counters are read from the body keys
        ``article_hit``, ``article_order`` and ``reply_url`` respectively.
        NOTE(review): that key-to-counter mapping is taken from the original
        code; confirm it against ``instaparser``.

        :param body: parsed post body (mapping).
        :param replies: list of parsed reply mappings.
        :return: dict with the counters and the reply-buzz JSON string.
        """
        result = {}
        result['viewcount'] = self._as_count(body, 'article_hit')
        result['event_num'] = self.event_num
        # The default belongs to dict.get(); passing 0 as the second argument
        # of int() makes it the numeric base and fails for missing/int values.
        result['replycount'] = self._as_count(body, 'article_order')
        result['likecount'] = self._as_count(body, 'reply_url')
        result['interactioncount'] = self.get_replycount(body, replies)
        result['replybuzz'] = self.get_reply_buzz(body, replies)
        result['engagementcount'] = result['likecount'] + result['replycount']
        return result

    def get_replycount(self, body, replies):
        """Count distinct ``article_id`` values among ``replies``.

        The body's own ``article_id`` is not counted when it also appears
        among the replies.
        """
        reply_ids = {reply.get('article_id', '') for reply in replies}
        reply_ids.discard(body.get('article_id'))
        return len(reply_ids)

    def get_reply_buzz(self, body, replies):
        """Return a JSON histogram of replies per day.

        One entry is produced for every day from the post's date up to
        today, keyed as ``YYYYMMDD``; replies dated outside that range are
        ignored.

        :return: JSON array string of ``{"date": ..., "value": ...}`` objects.
        :raises KeyError: when ``body`` has no ``article_date``.
        :raises ValueError: when a date string has an unexpected format.
        """
        first_day = datetime.datetime.strptime(
            body['article_date'], self._ARTICLE_DATE_FORMAT).date()
        last_day = datetime.datetime.now().date()

        per_day = {}
        # An empty range (post dated in the future) yields no entries.
        for offset in range((last_day - first_day).days + 1):
            day = first_day + datetime.timedelta(days=offset)
            per_day[day.strftime(self._BUZZ_KEY_FORMAT)] = 0

        for reply in replies:
            raw_date = reply.get('article_date', '1990-01-01 00:00:00')
            # Must use the same key format as above, otherwise nothing matches.
            key = datetime.datetime.strptime(
                raw_date, self._ARTICLE_DATE_FORMAT).strftime(self._BUZZ_KEY_FORMAT)
            if key in per_day:
                per_day[key] += 1

        json_array = [{'date': k, 'value': v} for k, v in per_day.items()]
        return json.dumps(json_array, sort_keys=True)