diff --git a/WebBasedCrawler/base/baseclasses.py b/WebBasedCrawler/base/baseclasses.py index 0809ba4..70fc8a8 100644 --- a/WebBasedCrawler/base/baseclasses.py +++ b/WebBasedCrawler/base/baseclasses.py @@ -23,7 +23,6 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.desired_capabilities import DesiredCapabilities -is_debug = False def is_debugger_attached(): for frame in inspect.stack(): @@ -31,6 +30,8 @@ def is_debugger_attached(): return True return False +is_debug = is_debugger_attached() + def printl(*objects, sep=' ', end='\n', file=None, flush=True): if is_debug: cur_frame = inspect.currentframe() diff --git a/WebBasedCrawler/effect/effectinstagram.py b/WebBasedCrawler/effect/effectinstagram.py index 4249e7f..ce27a38 100644 --- a/WebBasedCrawler/effect/effectinstagram.py +++ b/WebBasedCrawler/effect/effectinstagram.py @@ -5,6 +5,7 @@ import json import requests import requests.exceptions import time +import bs4 import insta.instaheaders as instaheaders import insta.instaparser as instaparser @@ -73,18 +74,20 @@ class InstaContent: self.has_previous = False self.cookies = {} self.proxies = proxies + self.query_id = '' + self.content = '' self.load_url(url, cookies, referer, self.proxies) def load_url(self, url, cookies, referer, proxies): self.__set_cookies(cookies) self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies, timeout=requests_timeout, stream=True) - content = requests_get(self.__r) + self.content = requests_get(self.__r) self.__r.raise_for_status() self.__referer = referer self.__code = self.__get_code(url) # self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content) - self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content) + self.body, self.reply, self.start_cursor, self.has_previous = 
instaparser.parse_body_html(self.content) self.__set_cookies(self.__r.cookies) self.__r.close() return self.body, self.reply @@ -95,20 +98,52 @@ class InstaContent: def get_reply(self): return self.reply + def get_query_ids(self, html): + doc = bs4.BeautifulSoup(html, "html.parser") + + query_ids = [] + for script in doc.find_all("script"): + if script.has_attr("src") and "_Commons.js" in script['src']: + text = requests.get("%s%s" % ('https://www.instagram.com', script['src'])).text + for query_id in re.findall("(?<=queryId:\")[0-9]{17,17}", text): + query_ids.append(query_id) + return query_ids + + def find_query_id(self): + potential_query_ids = self.get_query_ids(self.content) + query_id = '' + for potential_id in potential_query_ids: + url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format( + potential_id, self.__code, len(self.reply), self.start_cursor) + try: + data = requests.get(url).json() + if data['status'] == 'ok': + query_id = potential_id + break + except Exception: + # no valid JSON returned, most likely wrong query_id resulting in 'Oops, an error occurred.' 
+ pass + + return query_id + def load_reply_more(self): - form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax) - headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data) - self.log_load_reply_more_before(form_data, headers) - self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, - timeout=requests_timeout, stream=True) + if not self.query_id: + self.query_id = self.find_query_id() + + url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format( + self.query_id, self.__code, len(self.reply), self.start_cursor) + self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies, + timeout=requests_timeout, stream=True) content = requests_get(self.__r) self.__r.raise_for_status() + reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content) self.__set_cookies(self.__r.cookies) - # self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content) - self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(content) self.__r.close() - self.log_load_reply_more_after() - return self.reply + + self.reply = self.reply+reply + printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor)) + + return reply def get_cookies(self): return self.cookies @@ -332,13 +367,19 @@ class EffectInsta(object): def put_today_buzz(self, buzzs, today_acc_buzz_count): today = datetime.date.today().strftime('%Y%m%d') - today_buzz_count = today_acc_buzz_count - buzzs[-2][BUZZ_KEY[ACC]] - # if today_buzz_count < 0: - # today_buzz_count = 0 result_buzzs = buzzs.copy() - result_buzzs[-1][BUZZ_KEY[DAY]] = today_buzz_count if today_buzz_count >=0 else 0 - result_buzzs[-1][BUZZ_KEY[ACC]] = result_buzzs[-2][BUZZ_KEY[ACC]] + today_buzz_count + if len(result_buzzs) == 0: + 
result_buzzs.append({BUZZ_KEY[ACC]:today_acc_buzz_count, BUZZ_KEY[DAY]:today_acc_buzz_count, BUZZ_KEY[DATE]:today}) + elif len(result_buzzs) == 1: + result_buzzs[-1][BUZZ_KEY[ACC]] = today_acc_buzz_count + result_buzzs[-1][BUZZ_KEY[DAY]] = today_acc_buzz_count + else: + result_buzzs[-1][BUZZ_KEY[ACC]] = today_acc_buzz_count + result_buzzs[-1][BUZZ_KEY[DAY]] = today_acc_buzz_count - result_buzzs[-2][BUZZ_KEY[ACC]] + + if result_buzzs[-1][BUZZ_KEY[DAY]] < 0: + result_buzzs[-1][BUZZ_KEY[DAY]] = 0 return result_buzzs diff --git a/WebBasedCrawler/effect/resultsender.py b/WebBasedCrawler/effect/resultsender.py index 435d39d..9d4f210 100644 --- a/WebBasedCrawler/effect/resultsender.py +++ b/WebBasedCrawler/effect/resultsender.py @@ -43,8 +43,11 @@ class ResultSender: if not self.conn.open: self.connect() with self.conn.cursor() as cursor: - cursor.execute(query) - buzz = cursor.fetchone() + try: + cursor.execute(query) + buzz = cursor.fetchone() + except Exception as e: + print(e) return buzz['replybuzz'] if buzz != None else buzz diff --git a/WebBasedCrawler/insta/instacrawl.py b/WebBasedCrawler/insta/instacrawl.py index e398252..1598e7e 100644 --- a/WebBasedCrawler/insta/instacrawl.py +++ b/WebBasedCrawler/insta/instacrawl.py @@ -731,9 +731,9 @@ class InstaContent: self.__set_cookies(self.__r.cookies) self.__r.close() - self.reply += reply + self.reply = self.reply + reply printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor)) - return self.reply + return reply def get_cookies(self): return self.cookies