diff --git a/WebBasedCrawler/base/baseclasses.py b/WebBasedCrawler/base/baseclasses.py index 718a864..0809ba4 100644 --- a/WebBasedCrawler/base/baseclasses.py +++ b/WebBasedCrawler/base/baseclasses.py @@ -45,8 +45,8 @@ def printl(*objects, sep=' ', end='\n', file=None, flush=True): # method_name = call_frame[frame_no][3] try: - # print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush) - print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush) + objects = ('{}({}) :'.format(file_path, line_no),) + objects + print(*objects, sep=sep, end=end, file=file, flush=flush) except Exception as e: print(e) else: diff --git a/WebBasedCrawler/insta/instacrawl.py b/WebBasedCrawler/insta/instacrawl.py index 334eb0d..986788b 100644 --- a/WebBasedCrawler/insta/instacrawl.py +++ b/WebBasedCrawler/insta/instacrawl.py @@ -16,6 +16,7 @@ from queue import Queue import threading import time import sys +import bs4 import inspect @@ -44,8 +45,8 @@ def printl(*objects, sep=' ', end='\n', file=None, flush=True): # method_name = call_frame[frame_no][3] try: - # print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush) - print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush) + objects = ('{}({}) :'.format(file_path, line_no),) + objects + print(*objects, sep=sep, end=end, file=file, flush=flush) except Exception as e: print(e) else: @@ -223,6 +224,12 @@ def make_list_instance(url, proxies=None): else: list_crawler = ListUser(url, proxies) return list_crawler + + except requests.exceptions.ProxyError as e: + printd('proxy: '+str(e.args[0].pool.proxy), e) + printd("Fail to make list instance") + return None + except Exception as e: printd(e) printd("Fail to make list instance") @@ -234,9 +241,15 @@ def make_content_instance(url, proxies=None): try: content = 
InstaContent(url, {}, url, proxies) return content + + except requests.exceptions.ProxyError as e: + printd('proxy: '+str(e.args[0].pool.proxy), e) + printd("Fail to make content instance") + return None + except Exception as e: printd(e) - printd("Fail to make contanet instance") + printd("Fail to make content instance") return None @@ -271,10 +284,6 @@ def load_ajax_list(ins): def load_ajax_reply(ins): try: replies = ins.load_reply_more() - # if replies: - # return replies - # else: - # return None return replies except Exception as e: printd(e) @@ -360,7 +369,8 @@ def crawl_content_process(qu, keyword_id, db_num): send_to_db.send_body(body) if replies: send_to_db.send_reply(replies) - printl("proxies = ", content.proxies['http'][7:]) + if content.proxies is not None: + printl("proxies = ", content.proxies['http'][7:]) printl(element['url']) printl('ok') ok = False @@ -436,7 +446,7 @@ class ListTag: self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) - self.log_load_url_before() + # self.log_load_url_before() self.__r.raise_for_status() self.__tag = self.__get_tag(url) self.__set_cookies(self.__r.cookies) @@ -444,7 +454,8 @@ class ListTag: #self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content) self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content) self.__r.close() - self.log_load_url_after() + printd('tag list, end_cursor: {}'.format(self.end_cursor)) + # self.log_load_url_after() return self.list_tag def load_more(self): @@ -453,7 +464,7 @@ class ListTag: timeout=requests_timeout, stream=True) content = requests_get(self.__r) - self.log_load_url_before() + # self.log_load_url_before() self.__r.raise_for_status() self.__tag = self.__get_tag(url) self.__set_cookies(self.__r.cookies) @@ -461,9 +472,10 @@ class ListTag: # self.list_tag, self.end_cursor, self.has_next = 
instaparser.parse_list_tag_html(self.__r.content) self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content) self.__r.close() - self.log_load_url_after() + printd('tag list, end_cursor: {}'.format(self.end_cursor)) + # self.log_load_url_after() - # 기존 방식 instagram에서 post를 막은 듯 + # 기존 방식 instagram에서 post를 막은 듯 # form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax) # headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data) # self.log_load_more_before(form_data, headers) @@ -571,6 +583,7 @@ class ListUser: # self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content) self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content) self.__r.close() + printd('user list, end_cursor: {}'.format(self.end_cursor)) return self.list_user def load_more(self): @@ -588,6 +601,7 @@ class ListUser: # self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content) self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content) self.__r.close() + printd('user list, end_cursor: {}'.format(self.end_cursor)) # self.log_load_more_after() return self.list_user @@ -646,6 +660,8 @@ class InstaContent: self.has_previous = False self.cookies = {} self.proxies = proxies + self.content = '' + self.query_id = '' self.load_url(url, cookies, referer, self.proxies) def load_url(self, url, cookies, referer, proxies): @@ -653,6 +669,7 @@ class InstaContent: self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) + self.content = content self.__r.raise_for_status() self.__referer = referer self.__code = self.__get_code(url) @@ -660,6 +677,8 @@ class InstaContent: self.body, self.reply, self.start_cursor, self.has_previous =
instaparser.parse_body_html(content) self.__set_cookies(self.__r.cookies) self.__r.close() + + printd('reply, end_cursor: {}'.format(self.start_cursor)) return self.body, self.reply def get_body(self): @@ -668,18 +687,56 @@ class InstaContent: def get_reply(self): return self.reply + def get_query_ids(self, html): + doc = bs4.BeautifulSoup(html, "html.parser") + + query_ids = [] + for script in doc.find_all("script"): + if script.has_attr("src") and "_Commons.js" in script['src']: + text = requests.get("%s%s" % ('https://www.instagram.com', script['src'])).text + for query_id in re.findall("(?<=queryId:\")[0-9]{17,17}", text): + query_ids.append(query_id) + return query_ids + + def find_query_id(self): + potential_query_ids = self.get_query_ids(self.content) + query_id = '' + for potential_id in potential_query_ids: + # url = "https://www.instagram.com/graphql/query/?query_id=%s&first=12&after=%s" % (potential_id, self.start_cursor) + url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format( + potential_id, self.__code, len(self.reply), self.start_cursor) + try: + data = requests.get(url).json() + if data['status'] == 'ok': + query_id = potential_id + break + except Exception: + # no valid JSON returned, most likely wrong query_id resulting in 'Oops, an error occurred.' 
+ pass + + return query_id + def load_reply_more(self): - url = self.__referer + "?max_id="+self.start_cursor + if not self.query_id: + self.query_id = self.find_query_id() + + url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format( + self.query_id, self.__code, len(self.reply), self.start_cursor) + # url = self.__referer + "?max_id="+self.start_cursor # self.log_load_reply_more_before(form_data, headers) self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) + self.__r.raise_for_status() - self.__code = self.__get_code(url) - self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content) + reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content) self.__set_cookies(self.__r.cookies) self.__r.close() + + self.reply += reply + printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor)) # self.log_load_reply_more_after() + return self.reply def get_cookies(self): @@ -892,12 +949,19 @@ class InstaAlgorithmMulti(InstaAlgorithm): old_elements = 0 for element in contents_list: if element['date'].date() > self.crawl_init.get_end_day(): - # printl(element['url']) - printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + printl('post is not in range of date (url: {}, date:{}, start:{}, end:{})'.format( + element['url'], + element['date'].strftime("%Y-%m-%d %H:%M:%S"), + self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"), + self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S"))) elif element['date'].date() < self.crawl_init.get_begin_day(): - printl(element['url']) - printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) + printl('post is not in range of date (url: {}, date:{}, start:{}, end:{})'.format( + element['url'], + element['date'].strftime("%Y-%m-%d %H:%M:%S"), + 
self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"), + self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S"))) + old_elements += 1 if old_elements > 6: return False diff --git a/WebBasedCrawler/insta/instaparser.py b/WebBasedCrawler/insta/instaparser.py index f3951bf..45488ef 100644 --- a/WebBasedCrawler/insta/instaparser.py +++ b/WebBasedCrawler/insta/instaparser.py @@ -17,15 +17,20 @@ def get_json_from_html(content): s = content.content.decode('utf-8') else: raise TypeError - m = rx_json_html.search(s) + # try: + # json_data =json.loads(s) + # except ValueError as e: + m = rx_json_html.search(s) if m: #return json.dumps(json.loads(m.group(1))) #return json.loads(json.dumps(m.group(1))) - return json.loads(m.group(1)) + json_data = json.loads(m.group(1)) else: raise TypeError("Check requests.response") + return json_data + def parse_list_user_html(content): json_data = get_json_from_html(content) @@ -161,3 +166,33 @@ def parse_reply_ajax(content): "article_form": "reply", }) return reply, start_cursor, has_previous + + +def parse_reply_more(content): + json_data = json.loads(content.decode('utf-8'), encoding="utf-8") + reply = [] + start_cursor = '' + has_previous = False + if json_data["status"] == "ok": + data = json_data['data']['shortcode_media']['edge_media_to_comment'] + comments = data['edges'] + has_previous = data['page_info']['has_next_page'] + start_cursor = data['page_info']['end_cursor'] + if not start_cursor: + start_cursor = '' + + for edge in comments: + node = edge['node'] + reply.append({ + "article_data": node["text"], + "article_date": + (old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"), + "article_id": node["owner"]["username"], + "article_nickname": node["owner"]["username"], + "article_profileurl": node["owner"]["profile_pic_url"], + "platform_name": "instagram", + "platform_form": "post", + "article_form": "reply", + }) + + return reply, start_cursor, has_previous