인스타그램 effect crawler 버그 수정

This commit is contained in:
mjjo
2017-07-04 14:26:14 +09:00
parent 3d806ae5db
commit bc89f4d0f1
4 changed files with 66 additions and 21 deletions

View File

@@ -23,7 +23,6 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
is_debug = False
def is_debugger_attached(): def is_debugger_attached():
for frame in inspect.stack(): for frame in inspect.stack():
@@ -31,6 +30,8 @@ def is_debugger_attached():
return True return True
return False return False
is_debug = is_debugger_attached()
def printl(*objects, sep=' ', end='\n', file=None, flush=True): def printl(*objects, sep=' ', end='\n', file=None, flush=True):
if is_debug: if is_debug:
cur_frame = inspect.currentframe() cur_frame = inspect.currentframe()

View File

@@ -5,6 +5,7 @@ import json
import requests import requests
import requests.exceptions import requests.exceptions
import time import time
import bs4
import insta.instaheaders as instaheaders import insta.instaheaders as instaheaders
import insta.instaparser as instaparser import insta.instaparser as instaparser
@@ -73,18 +74,20 @@ class InstaContent:
self.has_previous = False self.has_previous = False
self.cookies = {} self.cookies = {}
self.proxies = proxies self.proxies = proxies
self.query_id = ''
self.content = ''
self.load_url(url, cookies, referer, self.proxies) self.load_url(url, cookies, referer, self.proxies)
def load_url(self, url, cookies, referer, proxies):
    """Download a post page and parse its body and first batch of replies.

    Stores the raw page in ``self.content`` and the parsed values in
    ``self.body``, ``self.reply``, ``self.start_cursor`` and
    ``self.has_previous``; also records the referer, the code derived from
    ``url`` and the cookies returned by the server.

    :param url: address of the page to load.
    :param cookies: cookies merged into this object before the request.
    :param referer: value remembered for later requests.
    :param proxies: proxy mapping handed to ``requests.get``.
    :return: tuple ``(self.body, self.reply)``.
    :raises requests.exceptions.HTTPError: from ``raise_for_status()``.
    """
    self.__set_cookies(cookies)
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
                            timeout=requests_timeout, stream=True)
    try:
        self.content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.content)
        self.__set_cookies(self.__r.cookies)
    finally:
        # The request is made with stream=True, so the response has to be
        # closed explicitly even when reading, the status check or parsing fails.
        self.__r.close()
    return self.body, self.reply
@@ -95,20 +98,52 @@ class InstaContent:
def get_reply(self): def get_reply(self):
return self.reply return self.reply
def get_query_ids(self, html):
    """Collect candidate query ids referenced by the page's ``_Commons.js`` script.

    Every ``<script>`` tag in ``html`` whose ``src`` contains ``_Commons.js``
    is downloaded from ``https://www.instagram.com`` and searched for
    17-digit numbers that directly follow ``queryId:"``.

    :param html: markup to scan for script tags.
    :return: list of query id strings in order of appearance (possibly empty).
    """
    doc = bs4.BeautifulSoup(html, "html.parser")
    query_ids = []
    for script in doc.find_all("script"):
        if not (script.has_attr("src") and "_Commons.js" in script['src']):
            continue
        # Use the same proxy and timeout as the other requests made by this
        # object, so a stalled script download cannot hang the crawler.
        text = requests.get("%s%s" % ('https://www.instagram.com', script['src']),
                            proxies=self.proxies, timeout=requests_timeout).text
        query_ids.extend(re.findall(r'(?<=queryId:")[0-9]{17}', text))
    return query_ids
def find_query_id(self):
    """Return the first candidate query id that the GraphQL endpoint accepts.

    Each id found by ``get_query_ids(self.content)`` is probed with a query
    for the current shortcode and cursor; the first one whose JSON answer
    has ``status == 'ok'`` wins.

    :return: the accepted query id, or ``''`` when no candidate works.
    """
    for potential_id in self.get_query_ids(self.content):
        url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
            potential_id, self.__code, len(self.reply), self.start_cursor)
        try:
            data = requests.get(url, proxies=self.proxies, timeout=requests_timeout).json()
            if data['status'] == 'ok':
                return potential_id
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
            # No valid JSON returned (or the request itself failed), most likely
            # a wrong query_id resulting in 'Oops, an error occurred.' - try the next one.
            continue
    return ''
def load_reply_more(self):
    """Fetch the next page of replies and add it to ``self.reply``.

    ``self.query_id`` is resolved through ``find_query_id()`` the first time
    it is needed. The page is requested from the GraphQL endpoint using the
    current shortcode and ``self.start_cursor``; ``self.start_cursor``,
    ``self.has_previous`` and the cookies are updated from the answer.

    :return: the replies parsed from this page only; the accumulated
        replies remain available in ``self.reply``.
    :raises requests.exceptions.HTTPError: from ``raise_for_status()``.
    """
    if not self.query_id:
        self.query_id = self.find_query_id()

    url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
        self.query_id, self.__code, len(self.reply), self.start_cursor)
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
                            timeout=requests_timeout, stream=True)
    try:
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content)
        self.__set_cookies(self.__r.cookies)
    finally:
        # The request is made with stream=True, so the response has to be
        # closed explicitly even when the status check or parsing fails.
        self.__r.close()

    self.reply = self.reply + reply
    printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
    return reply
def get_cookies(self): def get_cookies(self):
return self.cookies return self.cookies
@@ -332,13 +367,19 @@ class EffectInsta(object):
def put_today_buzz(self, buzzs, today_acc_buzz_count): def put_today_buzz(self, buzzs, today_acc_buzz_count):
today = datetime.date.today().strftime('%Y%m%d') today = datetime.date.today().strftime('%Y%m%d')
today_buzz_count = today_acc_buzz_count - buzzs[-2][BUZZ_KEY[ACC]]
# if today_buzz_count < 0:
# today_buzz_count = 0
result_buzzs = buzzs.copy() result_buzzs = buzzs.copy()
result_buzzs[-1][BUZZ_KEY[DAY]] = today_buzz_count if today_buzz_count >=0 else 0 if len(result_buzzs) == 0:
result_buzzs[-1][BUZZ_KEY[ACC]] = result_buzzs[-2][BUZZ_KEY[ACC]] + today_buzz_count result_buzzs.append({BUZZ_KEY[ACC]:today_acc_buzz_count, BUZZ_KEY[DAY]:today_acc_buzz_count, BUZZ_KEY[DATE]:today})
elif len(result_buzzs) == 1:
result_buzzs[-1][BUZZ_KEY[ACC]] = today_acc_buzz_count
result_buzzs[-1][BUZZ_KEY[DAY]] = today_acc_buzz_count
else:
result_buzzs[-1][BUZZ_KEY[ACC]] = today_acc_buzz_count
result_buzzs[-1][BUZZ_KEY[DAY]] = today_acc_buzz_count - result_buzzs[-2][BUZZ_KEY[ACC]]
if result_buzzs[-1][BUZZ_KEY[DAY]] < 0:
result_buzzs[-1][BUZZ_KEY[DAY]] = 0
return result_buzzs return result_buzzs

View File

@@ -43,8 +43,11 @@ class ResultSender:
if not self.conn.open: if not self.conn.open:
self.connect() self.connect()
with self.conn.cursor() as cursor: with self.conn.cursor() as cursor:
cursor.execute(query) try:
buzz = cursor.fetchone() cursor.execute(query)
buzz = cursor.fetchone()
except Exception as e:
print(e)
return buzz['replybuzz'] if buzz != None else buzz return buzz['replybuzz'] if buzz != None else buzz

View File

@@ -731,9 +731,9 @@ class InstaContent:
self.__set_cookies(self.__r.cookies) self.__set_cookies(self.__r.cookies)
self.__r.close() self.__r.close()
self.reply += reply self.reply = self.reply + reply
printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor)) printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
return self.reply return reply
def get_cookies(self): def get_cookies(self):
return self.cookies return self.cookies