- 인스타그램 크롤러 수정
- 로그 수정 - reply 크롤링 버그 수정
This commit is contained in:
@@ -45,8 +45,8 @@ def printl(*objects, sep=' ', end='\n', file=None, flush=True):
|
||||
# method_name = call_frame[frame_no][3]
|
||||
|
||||
try:
|
||||
# print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush)
|
||||
print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush)
|
||||
objects = ('{}({}) :'.format(file_path, line_no),) + objects
|
||||
print(*objects, sep=sep, end=end, file=file, flush=flush)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
else:
|
||||
|
||||
@@ -16,6 +16,7 @@ from queue import Queue
|
||||
import threading
|
||||
import time
|
||||
import sys
|
||||
import bs4
|
||||
import inspect
|
||||
|
||||
|
||||
@@ -44,8 +45,8 @@ def printl(*objects, sep=' ', end='\n', file=None, flush=True):
|
||||
# method_name = call_frame[frame_no][3]
|
||||
|
||||
try:
|
||||
# print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush)
|
||||
print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush)
|
||||
objects = ('{}({}) :'.format(file_path, line_no),) + objects
|
||||
print(*objects, sep=sep, end=end, file=file, flush=flush)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
else:
|
||||
@@ -223,6 +224,12 @@ def make_list_instance(url, proxies=None):
|
||||
else:
|
||||
list_crawler = ListUser(url, proxies)
|
||||
return list_crawler
|
||||
|
||||
except requests.exceptions.ProxyError as e:
|
||||
printd('proxy: '+str(e.args[0].pool.proxy), e)
|
||||
printd("Fail to make list instance")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
printd(e)
|
||||
printd("Fail to make list instance")
|
||||
@@ -234,9 +241,15 @@ def make_content_instance(url, proxies=None):
|
||||
try:
|
||||
content = InstaContent(url, {}, url, proxies)
|
||||
return content
|
||||
|
||||
except requests.exceptions.ProxyError as e:
|
||||
printd('proxy: '+str(e.args[0].pool.proxy), e)
|
||||
printd("Fail to make content instance")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
printd(e)
|
||||
printd("Fail to make contanet instance")
|
||||
printd("Fail to make content instance")
|
||||
return None
|
||||
|
||||
|
||||
@@ -271,10 +284,6 @@ def load_ajax_list(ins):
|
||||
def load_ajax_reply(ins):
|
||||
try:
|
||||
replies = ins.load_reply_more()
|
||||
# if replies:
|
||||
# return replies
|
||||
# else:
|
||||
# return None
|
||||
return replies
|
||||
except Exception as e:
|
||||
printd(e)
|
||||
@@ -360,7 +369,8 @@ def crawl_content_process(qu, keyword_id, db_num):
|
||||
send_to_db.send_body(body)
|
||||
if replies:
|
||||
send_to_db.send_reply(replies)
|
||||
printl("proxies = ", content.proxies['http'][7:])
|
||||
if content.proxies is not None:
|
||||
printl("proxies = ", content.proxies['http'][7:])
|
||||
printl(element['url'])
|
||||
printl('ok')
|
||||
ok = False
|
||||
@@ -436,7 +446,7 @@ class ListTag:
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True)
|
||||
content = requests_get(self.__r)
|
||||
|
||||
self.log_load_url_before()
|
||||
# self.log_load_url_before()
|
||||
self.__r.raise_for_status()
|
||||
self.__tag = self.__get_tag(url)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
@@ -444,7 +454,8 @@ class ListTag:
|
||||
#self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
|
||||
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
|
||||
self.__r.close()
|
||||
self.log_load_url_after()
|
||||
printd('tag list, end_cursor: {}'.format(self.end_cursor))
|
||||
# self.log_load_url_after()
|
||||
return self.list_tag
|
||||
|
||||
def load_more(self):
|
||||
@@ -453,7 +464,7 @@ class ListTag:
|
||||
timeout=requests_timeout, stream=True)
|
||||
content = requests_get(self.__r)
|
||||
|
||||
self.log_load_url_before()
|
||||
# self.log_load_url_before()
|
||||
self.__r.raise_for_status()
|
||||
self.__tag = self.__get_tag(url)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
@@ -461,9 +472,10 @@ class ListTag:
|
||||
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
|
||||
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
|
||||
self.__r.close()
|
||||
self.log_load_url_after()
|
||||
printd('tag list, end_cursor: {}'.format(self.end_cursor))
|
||||
# self.log_load_url_after()
|
||||
|
||||
# 기존 방식 instagram에서 post를 막은 듯
|
||||
# 기존 방식 instagram에서 post를 막은 듯
|
||||
# form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
|
||||
# headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
|
||||
# self.log_load_more_before(form_data, headers)
|
||||
@@ -571,6 +583,7 @@ class ListUser:
|
||||
# self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
|
||||
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
|
||||
self.__r.close()
|
||||
printd('user list, end_cursor: {}'.format(self.end_cursor))
|
||||
return self.list_user
|
||||
|
||||
def load_more(self):
|
||||
@@ -588,6 +601,7 @@ class ListUser:
|
||||
# self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
|
||||
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
|
||||
self.__r.close()
|
||||
printd('user list, end_cursor: {}'.format(self.end_cursor))
|
||||
# self.log_load_more_after()
|
||||
return self.list_user
|
||||
|
||||
@@ -646,6 +660,8 @@ class InstaContent:
|
||||
self.has_previous = False
|
||||
self.cookies = {}
|
||||
self.proxies = proxies
|
||||
self.content = ''
|
||||
self.query_id = ''
|
||||
self.load_url(url, cookies, referer, self.proxies)
|
||||
|
||||
def load_url(self, url, cookies, referer, proxies):
|
||||
@@ -653,6 +669,7 @@ class InstaContent:
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
|
||||
timeout=requests_timeout, stream=True)
|
||||
content = requests_get(self.__r)
|
||||
self.content = content
|
||||
self.__r.raise_for_status()
|
||||
self.__referer = referer
|
||||
self.__code = self.__get_code(url)
|
||||
@@ -660,6 +677,8 @@ class InstaContent:
|
||||
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
self.__r.close()
|
||||
|
||||
printd('reply, end_cursor: {}'.format(self.start_cursor))
|
||||
return self.body, self.reply
|
||||
|
||||
def get_body(self):
|
||||
@@ -668,18 +687,56 @@ class InstaContent:
|
||||
def get_reply(self):
|
||||
return self.reply
|
||||
|
||||
def get_query_ids(self, html):
    """Collect candidate GraphQL query ids referenced by a post page.

    Every ``<script>`` tag in ``html`` whose ``src`` contains
    ``_Commons.js`` is downloaded from www.instagram.com and scanned for
    17-digit numbers that directly follow ``queryId:"``.

    :param html: markup of the post page (anything BeautifulSoup accepts).
    :return: list of query id strings in the order they were found;
        empty when no matching script is present.
    :raises requests.exceptions.RequestException: if a script download
        fails or exceeds ``requests_timeout``.
    """
    doc = bs4.BeautifulSoup(html, "html.parser")

    query_ids = []
    for script in doc.find_all("script"):
        if not (script.has_attr("src") and "_Commons.js" in script['src']):
            continue
        # Use the same proxy and timeout as the other requests made by this
        # class; without a timeout requests may wait on the server forever.
        response = requests.get("%s%s" % ('https://www.instagram.com', script['src']),
                                proxies=self.proxies, timeout=requests_timeout)
        query_ids.extend(re.findall(r'(?<=queryId:")[0-9]{17}', response.text))
    return query_ids
|
||||
|
||||
def find_query_id(self):
    """Return the first candidate query id that the GraphQL endpoint accepts.

    Candidates come from ``self.get_query_ids(self.content)``. Each one is
    probed with a comment query built from this post's shortcode, the number
    of replies already held and the current ``start_cursor``; the first
    candidate whose JSON response has ``status == 'ok'`` is returned.

    :return: the accepted query id, or ``''`` when no candidate works.
    """
    for potential_id in self.get_query_ids(self.content):
        url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
            potential_id, self.__code, len(self.reply), self.start_cursor)
        try:
            # Probe through the same proxy and with the same timeout as the
            # real reply request, so a stalled connection cannot hang the crawl.
            data = requests.get(url, proxies=self.proxies, timeout=requests_timeout).json()
            if data['status'] == 'ok':
                return potential_id
        except Exception:
            # Best effort: no valid JSON returned (most likely a wrong
            # query_id resulting in 'Oops, an error occurred.') or the
            # request itself failed -- try the next candidate.
            continue

    return ''
|
||||
|
||||
def load_reply_more(self):
|
||||
url = self.__referer + "?max_id="+self.start_cursor
|
||||
if not self.query_id:
|
||||
self.query_id = self.find_query_id()
|
||||
|
||||
url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
|
||||
self.query_id, self.__code, len(self.reply), self.start_cursor)
|
||||
# url = self.__referer + "?max_id="+self.start_cursor
|
||||
# self.log_load_reply_more_before(form_data, headers)
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
|
||||
timeout=requests_timeout, stream=True)
|
||||
content = requests_get(self.__r)
|
||||
|
||||
self.__r.raise_for_status()
|
||||
self.__code = self.__get_code(url)
|
||||
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
|
||||
reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
self.__r.close()
|
||||
|
||||
self.reply += reply
|
||||
printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
|
||||
# self.log_load_reply_more_after()
|
||||
|
||||
return self.reply
|
||||
|
||||
def get_cookies(self):
|
||||
@@ -892,12 +949,19 @@ class InstaAlgorithmMulti(InstaAlgorithm):
|
||||
old_elements = 0
|
||||
for element in contents_list:
|
||||
if element['date'].date() > self.crawl_init.get_end_day():
|
||||
# printl(element['url'])
|
||||
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
||||
printl('post is not in range of date (url: {}, date:{}, start:{}, end:{})'.format(
|
||||
element['url'],
|
||||
element['date'].strftime("%Y-%m-%d %H:%M:%S"),
|
||||
self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S")))
|
||||
|
||||
elif element['date'].date() < self.crawl_init.get_begin_day():
|
||||
printl(element['url'])
|
||||
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
||||
printl('post is not in range of date (url: {}, date:{}, start:{}, end:{})'.format(
|
||||
element['url'],
|
||||
element['date'].strftime("%Y-%m-%d %H:%M:%S"),
|
||||
self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S")))
|
||||
|
||||
old_elements += 1
|
||||
if old_elements > 6:
|
||||
return False
|
||||
|
||||
@@ -17,15 +17,20 @@ def get_json_from_html(content):
|
||||
s = content.content.decode('utf-8')
|
||||
else:
|
||||
raise TypeError
|
||||
m = rx_json_html.search(s)
|
||||
|
||||
# try:
|
||||
# json_data =json.loads(s)
|
||||
# except ValueError as e:
|
||||
m = rx_json_html.search(s)
|
||||
if m:
|
||||
#return json.dumps(json.loads(m.group(1)))
|
||||
#return json.loads(json.dumps(m.group(1)))
|
||||
return json.loads(m.group(1))
|
||||
json_data = json.loads(m.group(1))
|
||||
else:
|
||||
raise TypeError("Check requests.response")
|
||||
|
||||
return json_data
|
||||
|
||||
|
||||
def parse_list_user_html(content):
|
||||
json_data = get_json_from_html(content)
|
||||
@@ -161,3 +166,33 @@ def parse_reply_ajax(content):
|
||||
"article_form": "reply",
|
||||
})
|
||||
return reply, start_cursor, has_previous
|
||||
|
||||
|
||||
def parse_reply_more(content):
    """Parse one page of comments from a GraphQL comment-query response.

    :param content: JSON response body, either UTF-8 encoded bytes or an
        already decoded ``str``.
    :return: tuple ``(reply, start_cursor, has_previous)`` where ``reply`` is
        a list of reply dicts, ``start_cursor`` is the ``end_cursor`` of the
        page (``''`` when absent) and ``has_previous`` is the page's
        ``has_next_page`` flag. When the response status is not ``"ok"`` the
        result is ``([], '', False)``.
    :raises ValueError: if ``content`` is not valid JSON.
    :raises KeyError: if an ``"ok"`` response lacks the expected fields.
    """
    # json.loads() lost its ``encoding`` keyword in Python 3.9, so decode
    # explicitly instead of passing it.
    if isinstance(content, (bytes, bytearray)):
        content = content.decode('utf-8')
    json_data = json.loads(content)

    reply = []
    start_cursor = ''
    has_previous = False

    if json_data["status"] == "ok":
        data = json_data['data']['shortcode_media']['edge_media_to_comment']
        page_info = data['page_info']
        has_previous = page_info['has_next_page']
        # end_cursor may be null/empty on the last page; normalise to ''.
        start_cursor = page_info['end_cursor'] or ''

        for edge in data['edges']:
            node = edge['node']
            owner = node["owner"]
            # created_at is added to the module-level old_date as a number of
            # seconds (presumably old_date is the Unix epoch -- confirm).
            created = old_date + datetime.timedelta(seconds=node["created_at"])
            reply.append({
                "article_data": node["text"],
                "article_date": created.strftime("%Y-%m-%d %H:%M:%S"),
                "article_id": owner["username"],
                "article_nickname": owner["username"],
                "article_profileurl": owner["profile_pic_url"],
                "platform_name": "instagram",
                "platform_form": "post",
                "article_form": "reply",
            })

    return reply, start_cursor, has_previous
|
||||
|
||||
Reference in New Issue
Block a user