인스타그램 effect crawler 버그 수정

This commit is contained in:
mjjo
2017-07-04 14:26:14 +09:00
parent 3d806ae5db
commit bc89f4d0f1
4 changed files with 66 additions and 21 deletions

View File

@@ -23,7 +23,6 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
is_debug = False
def is_debugger_attached():
for frame in inspect.stack():
@@ -31,6 +30,8 @@ def is_debugger_attached():
return True
return False
is_debug = is_debugger_attached()
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
if is_debug:
cur_frame = inspect.currentframe()

View File

@@ -5,6 +5,7 @@ import json
import requests
import requests.exceptions
import time
import bs4
import insta.instaheaders as instaheaders
import insta.instaparser as instaparser
@@ -73,18 +74,20 @@ class InstaContent:
self.has_previous = False
self.cookies = {}
self.proxies = proxies
self.query_id = ''
self.content = ''
self.load_url(url, cookies, referer, self.proxies)
def load_url(self, url, cookies, referer, proxies):
    """Fetch a post page and parse its body and first batch of replies.

    The raw page content is kept in ``self.content`` so that
    ``find_query_id`` can later scan it for script references.

    :param url: address of the post page to download.
    :param cookies: cookies handed to ``__set_cookies`` before the request.
    :param referer: value remembered in ``self.__referer``.
    :param proxies: proxy mapping passed straight to ``requests.get``.
    :return: tuple ``(self.body, self.reply)`` as produced by
        ``instaparser.parse_body_html``.
    :raises requests.exceptions.HTTPError: propagated from
        ``raise_for_status`` on an HTTP error status.
    """
    self.__set_cookies(cookies)
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
                            timeout=requests_timeout, stream=True)
    try:
        # Read the body once and keep it; it is parsed below and reused by find_query_id().
        self.content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.content)
        self.__set_cookies(self.__r.cookies)
    finally:
        # stream=True keeps the connection open until closed, so release it even on failure.
        self.__r.close()
    return self.body, self.reply
@@ -95,20 +98,52 @@ class InstaContent:
def get_reply(self):
    """Return the reply list currently stored on this instance."""
    return self.reply
def get_query_ids(self, html):
    """Collect candidate GraphQL query ids referenced by a page.

    Every ``<script>`` tag in *html* whose ``src`` contains ``_Commons.js``
    is downloaded, and each 17-digit number that directly follows
    ``queryId:"`` in the script text is collected.

    :param html: markup of the page to scan.
    :return: list of query id strings in the order they were found
        (may be empty, may contain duplicates).
    :raises requests.exceptions.RequestException: propagated when a script
        download fails or times out.
    """
    from urllib.parse import urljoin

    doc = bs4.BeautifulSoup(html, "html.parser")
    query_ids = []
    for script in doc.find_all("script"):
        if not script.has_attr("src") or "_Commons.js" not in script['src']:
            continue
        # urljoin keeps absolute / protocol-relative src values intact instead of
        # blindly prefixing the host.
        script_url = urljoin('https://www.instagram.com', script['src'])
        # Use the same proxies and timeout as the other requests of this object so
        # the download cannot hang forever or bypass the configured proxy.
        text = requests.get(script_url, proxies=self.proxies, timeout=requests_timeout).text
        query_ids.extend(re.findall(r'(?<=queryId:")[0-9]{17}', text))
    return query_ids
def find_query_id(self):
    """Return the first candidate query id the GraphQL endpoint accepts.

    Candidates come from ``get_query_ids(self.content)``. Each one is tried
    against ``https://www.instagram.com/graphql/query/`` with the current
    shortcode, reply count and cursor; the first whose JSON response has
    ``status == 'ok'`` wins.

    :return: the accepted query id, or ``''`` when no candidate works.
    """
    url_template = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'
    for potential_id in self.get_query_ids(self.content):
        url = url_template.format(potential_id, self.__code, len(self.reply), self.start_cursor)
        try:
            data = requests.get(url, proxies=self.proxies, timeout=requests_timeout).json()
        except (requests.exceptions.RequestException, ValueError):
            # No valid JSON returned (or the request itself failed): most likely a
            # wrong query_id resulting in 'Oops, an error occurred.' - try the next one.
            continue
        if isinstance(data, dict) and data.get('status') == 'ok':
            return potential_id
    return ''
def load_reply_more(self):
    """Fetch the next batch of replies through the GraphQL query endpoint.

    The query id is looked up once via ``find_query_id`` and cached in
    ``self.query_id``. The response is parsed with
    ``instaparser.parse_reply_more``; the new replies are appended to
    ``self.reply`` and the cursor / has_previous state is updated.

    :return: only the replies obtained by this call (not the accumulated list).
    :raises requests.exceptions.HTTPError: propagated from
        ``raise_for_status`` on an HTTP error status.
    """
    if not self.query_id:
        # NOTE(review): find_query_id() returns '' when nothing matches, in which
        # case the request below is still sent with an empty query_id.
        self.query_id = self.find_query_id()
    url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
        self.query_id, self.__code, len(self.reply), self.start_cursor)
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
                            timeout=requests_timeout, stream=True)
    try:
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content)
        self.__set_cookies(self.__r.cookies)
    finally:
        # stream=True keeps the connection open until closed, so release it even on failure.
        self.__r.close()
    self.reply = self.reply + reply
    printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
    return reply
def get_cookies(self):
    """Return the cookies currently stored on this instance."""
    return self.cookies
@@ -332,13 +367,19 @@ class EffectInsta(object):
def put_today_buzz(self, buzzs, today_acc_buzz_count):
    """Return a copy of *buzzs* whose last row reflects today's accumulated count.

    * Empty history: a new row is appended with today's date, and both the
      accumulated and the daily value set to *today_acc_buzz_count*.
    * One row: that row's accumulated and daily values are both set to
      *today_acc_buzz_count*.
    * Two or more rows: the last row's accumulated value is set to
      *today_acc_buzz_count* and its daily value to the difference from the
      previous row's accumulated value, clamped at 0.

    :param buzzs: list of row dicts keyed by the ``BUZZ_KEY`` entries.
    :param today_acc_buzz_count: accumulated count as of today.
    :return: new list; the input list and its row dicts are left untouched.
    """
    today = datetime.date.today().strftime('%Y%m%d')
    result_buzzs = buzzs.copy()
    if not result_buzzs:
        result_buzzs.append({BUZZ_KEY[ACC]: today_acc_buzz_count,
                             BUZZ_KEY[DAY]: today_acc_buzz_count,
                             BUZZ_KEY[DATE]: today})
        return result_buzzs

    # The list copy above is shallow; copy the row being edited as well so the
    # caller's dict is not modified behind its back.
    latest = dict(result_buzzs[-1])
    result_buzzs[-1] = latest
    latest[BUZZ_KEY[ACC]] = today_acc_buzz_count
    if len(result_buzzs) == 1:
        latest[BUZZ_KEY[DAY]] = today_acc_buzz_count
    else:
        day_count = today_acc_buzz_count - result_buzzs[-2][BUZZ_KEY[ACC]]
        # A shrinking accumulated count must not produce a negative daily value.
        latest[BUZZ_KEY[DAY]] = day_count if day_count >= 0 else 0
    return result_buzzs

View File

@@ -43,8 +43,11 @@ class ResultSender:
if not self.conn.open:
self.connect()
with self.conn.cursor() as cursor:
try:
cursor.execute(query)
buzz = cursor.fetchone()
except Exception as e:
print(e)
return buzz['replybuzz'] if buzz != None else buzz

View File

@@ -731,9 +731,9 @@ class InstaContent:
self.__set_cookies(self.__r.cookies)
self.__r.close()
self.reply += reply
self.reply = self.reply + reply
printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
return self.reply
return reply
def get_cookies(self):
    """Return the cookies currently stored on this instance."""
    return self.cookies