- 인스타그램 크롤러 수정

- 로그 수정
- reply 크롤링 버그 수정
This commit is contained in:
mjjo
2017-06-30 15:44:49 +09:00
parent 1e449a45af
commit bba53e2fae
3 changed files with 123 additions and 24 deletions

View File

@@ -45,8 +45,8 @@ def printl(*objects, sep=' ', end='\n', file=None, flush=True):
# method_name = call_frame[frame_no][3]
try:
# print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush)
print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush)
objects = ('{}({}) :'.format(file_path, line_no),) + objects
print(*objects, sep=sep, end=end, file=file, flush=flush)
except Exception as e:
print(e)
else:

View File

@@ -16,6 +16,7 @@ from queue import Queue
import threading
import time
import sys
import bs4
import inspect
@@ -44,8 +45,8 @@ def printl(*objects, sep=' ', end='\n', file=None, flush=True):
# method_name = call_frame[frame_no][3]
try:
# print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush)
print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush)
objects = ('{}({}) :'.format(file_path, line_no),) + objects
print(*objects, sep=sep, end=end, file=file, flush=flush)
except Exception as e:
print(e)
else:
@@ -223,6 +224,12 @@ def make_list_instance(url, proxies=None):
else:
list_crawler = ListUser(url, proxies)
return list_crawler
except requests.exceptions.ProxyError as e:
printd('proxy: '+str(e.args[0].pool.proxy), e)
printd("Fail to make list instance")
return None
except Exception as e:
printd(e)
printd("Fail to make list instance")
@@ -234,9 +241,15 @@ def make_content_instance(url, proxies=None):
try:
content = InstaContent(url, {}, url, proxies)
return content
except requests.exceptions.ProxyError as e:
printd('proxy: '+str(e.args[0].pool.proxy), e)
printd("Fail to make content instance")
return None
except Exception as e:
printd(e)
printd("Fail to make contanet instance")
printd("Fail to make content instance")
return None
@@ -271,10 +284,6 @@ def load_ajax_list(ins):
def load_ajax_reply(ins):
try:
replies = ins.load_reply_more()
# if replies:
# return replies
# else:
# return None
return replies
except Exception as e:
printd(e)
@@ -360,7 +369,8 @@ def crawl_content_process(qu, keyword_id, db_num):
send_to_db.send_body(body)
if replies:
send_to_db.send_reply(replies)
printl("proxies = ", content.proxies['http'][7:])
if content.proxies is not None:
printl("proxies = ", content.proxies['http'][7:])
printl(element['url'])
printl('ok')
ok = False
@@ -436,7 +446,7 @@ class ListTag:
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.log_load_url_before()
# self.log_load_url_before()
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
@@ -444,7 +454,8 @@ class ListTag:
#self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
self.__r.close()
self.log_load_url_after()
printd('tag list, end_cursor: {}'.format(self.end_cursor))
# self.log_load_url_after()
return self.list_tag
def load_more(self):
@@ -453,7 +464,7 @@ class ListTag:
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.log_load_url_before()
# self.log_load_url_before()
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
@@ -461,9 +472,10 @@ class ListTag:
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
self.__r.close()
self.log_load_url_after()
printd('tag list, end_cursor: {}'.format(self.end_cursor))
# self.log_load_url_after()
# 기존 방식 instagram서 post를 막은 듯
# 기존 방식 instagram?<3F>서 post<EFBFBD>?막<>? ??
# form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
# headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
# self.log_load_more_before(form_data, headers)
@@ -571,6 +583,7 @@ class ListUser:
# self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
self.__r.close()
printd('user list, end_cursor: {}'.format(self.end_cursor))
return self.list_user
def load_more(self):
@@ -588,6 +601,7 @@ class ListUser:
# self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
self.__r.close()
printd('user list, end_cursor: {}'.format(self.end_cursor))
# self.log_load_more_after()
return self.list_user
@@ -646,6 +660,8 @@ class InstaContent:
self.has_previous = False
self.cookies = {}
self.proxies = proxies
self.content = ''
self.query_id = ''
self.load_url(url, cookies, referer, self.proxies)
def load_url(self, url, cookies, referer, proxies):
@@ -653,6 +669,7 @@ class InstaContent:
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.content = content
self.__r.raise_for_status()
self.__referer = referer
self.__code = self.__get_code(url)
@@ -660,6 +677,8 @@ class InstaContent:
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
self.__set_cookies(self.__r.cookies)
self.__r.close()
printd('reply, end_cursor: {}'.format(self.start_cursor))
return self.body, self.reply
def get_body(self):
@@ -668,18 +687,56 @@ class InstaContent:
def get_reply(self):
return self.reply
def get_query_ids(self, html):
    """Scrape candidate GraphQL ``query_id`` values from a post's HTML.

    Downloads the "*_Commons.js" script bundle referenced by the page
    and extracts every 17-digit ``queryId:"..."`` constant found in it.

    Parameters:
        html: the post page HTML (as previously fetched into
            ``self.content``) to scan for the Commons bundle reference.

    Returns:
        list of candidate query_id strings (possibly empty).
    """
    doc = bs4.BeautifulSoup(html, "html.parser")
    query_ids = []
    for script in doc.find_all("script"):
        # Only the shared Commons bundle contains the queryId constants.
        if script.has_attr("src") and "_Commons.js" in script['src']:
            # FIX: bound the request with the module-wide timeout
            # (used by every other requests.get in this file) so a
            # stalled CDN response cannot hang the crawler thread.
            text = requests.get('https://www.instagram.com' + script['src'],
                                timeout=requests_timeout).text
            # queryId:"<17 digits>" -- raw string, {17} instead of {17,17}
            for query_id in re.findall(r'(?<=queryId:")[0-9]{17}', text):
                query_ids.append(query_id)
    return query_ids
def find_query_id(self):
    """Probe the scraped candidate query_ids against the GraphQL
    endpoint and return the first one the server accepts, or '' when
    no candidate works."""
    query_url_fmt = ('https://www.instagram.com/graphql/query/'
                     '?query_id={}&shortcode={}&first={}&after={}')
    for candidate in self.get_query_ids(self.content):
        url = query_url_fmt.format(candidate, self.__code,
                                   len(self.reply), self.start_cursor)
        try:
            if requests.get(url).json()['status'] == 'ok':
                return candidate
        except Exception:
            # No valid JSON returned -- most likely a wrong query_id
            # producing an "Oops, an error occurred." page; try the
            # next candidate (best-effort probing, so swallow errors).
            pass
    return ''
def load_reply_more(self):
url = self.__referer + "?max_id="+self.start_cursor
if not self.query_id:
self.query_id = self.find_query_id()
url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
self.query_id, self.__code, len(self.reply), self.start_cursor)
# url = self.__referer + "?max_id="+self.start_cursor
# self.log_load_reply_more_before(form_data, headers)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.__r.raise_for_status()
self.__code = self.__get_code(url)
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content)
self.__set_cookies(self.__r.cookies)
self.__r.close()
self.reply += reply
printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
# self.log_load_reply_more_after()
return self.reply
def get_cookies(self):
@@ -892,12 +949,19 @@ class InstaAlgorithmMulti(InstaAlgorithm):
old_elements = 0
for element in contents_list:
if element['date'].date() > self.crawl_init.get_end_day():
# printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
printl('post is not in range of date (url: {}, date:{}, start:{}, end:{})'.format(
element['url'],
element['date'].strftime("%Y-%m-%d %H:%M:%S"),
self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"),
self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S")))
elif element['date'].date() < self.crawl_init.get_begin_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
printl('post is not in range of date (url: {}, date:{}, start:{}, end:{})'.format(
element['url'],
element['date'].strftime("%Y-%m-%d %H:%M:%S"),
self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"),
self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S")))
old_elements += 1
if old_elements > 6:
return False

View File

@@ -17,15 +17,20 @@ def get_json_from_html(content):
s = content.content.decode('utf-8')
else:
raise TypeError
m = rx_json_html.search(s)
# try:
# json_data =json.loads(s)
# except ValueError as e:
m = rx_json_html.search(s)
if m:
#return json.dumps(json.loads(m.group(1)))
#return json.loads(json.dumps(m.group(1)))
return json.loads(m.group(1))
json_data = json.loads(m.group(1))
else:
raise TypeError("Check requests.response")
return json_data
def parse_list_user_html(content):
json_data = get_json_from_html(content)
@@ -161,3 +166,33 @@ def parse_reply_ajax(content):
"article_form": "reply",
})
return reply, start_cursor, has_previous
def parse_reply_more(content):
    """Parse an Instagram GraphQL comment-page response into reply dicts.

    Parameters:
        content: raw response body (bytes or str) from the
            /graphql/query/ reply-pagination endpoint.

    Returns:
        (reply, start_cursor, has_previous) where ``reply`` is a list
        of reply dicts in this module's article format, ``start_cursor``
        is the cursor for the next page ('' when absent) and
        ``has_previous`` mirrors the response's ``has_next_page``.
    """
    # BUG FIX: json.loads() no longer accepts an `encoding` keyword
    # (deprecated since Python 3.1, removed in 3.9 -> TypeError).
    # Decode explicitly and pass a plain str instead; also tolerate
    # callers that already hand us decoded text.
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    json_data = json.loads(content)
    reply = []
    start_cursor = ''
    has_previous = False
    if json_data["status"] == "ok":
        data = json_data['data']['shortcode_media']['edge_media_to_comment']
        comments = data['edges']
        has_previous = data['page_info']['has_next_page']
        start_cursor = data['page_info']['end_cursor']
        if not start_cursor:
            # end_cursor is null on the last page -- normalise to ''
            start_cursor = ''
        for edge in comments:
            node = edge['node']
            reply.append({
                "article_data": node["text"],
                # created_at is seconds since the epoch; old_date is the
                # module-level epoch base shared by the other parsers.
                "article_date":
                    (old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
                "article_id": node["owner"]["username"],
                "article_nickname": node["owner"]["username"],
                "article_profileurl": node["owner"]["profile_pic_url"],
                "platform_name": "instagram",
                "platform_form": "post",
                "article_form": "reply",
            })
    return reply, start_cursor, has_previous