is_debug = False  # module-level debug switch; printl/printd change behavior when True


def is_debugger_attached():
    """Return True when a PyDev-based debugger (e.g. PyCharm) is driving this process.

    Detection is purely by stack inspection: any frame whose filename ends
    with ``pydevd.py`` indicates the pydevd debug runner is on the stack.
    """
    for frame in inspect.stack():
        if frame[1].endswith("pydevd.py"):  # frame[1] is the frame's filename
            return True
    return False


def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print *objects*; when ``is_debug`` is set, prefix with the caller's file and line.

    Mirrors the built-in ``print`` signature (``sep``/``end``/``file``/``flush``
    are keyword-only). In debug mode the message is prefixed with
    ``<file>(<line>) : `` of the *caller* — or the caller's caller when invoked
    via ``printd``, so the prefix points at the real call site.

    NOTE(review): the format string has three placeholders, so only the first
    element of *objects* appears in the debug-mode message; ``str.format``
    silently ignores the extra positional arguments.
    """
    if is_debug:
        cur_frame = inspect.currentframe()
        call_frame = inspect.getouterframes(cur_frame, 2)

        # Skip one extra frame when we were reached through printd, so the
        # reported location is printd's caller, not printd itself.
        frame_no = 2 if call_frame[1][3] == 'printd' else 1
        file_path = call_frame[frame_no][1]
        line_no = call_frame[frame_no][2]

        try:
            print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush)
        except Exception as e:
            # Best-effort logging: never let a formatting problem kill the crawler.
            print(e)
    else:
        print(*objects, sep=sep, end=end, file=file, flush=flush)


def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Debug-only print: forwards to :func:`printl` when ``is_debug`` is True, else no-op.

    BUGFIX: the previous version called ``printl(objects, sep, end, file, flush)``,
    which packed the tuple and all four options into ``*objects`` (the options are
    keyword-only), printing garbage to stdout and ignoring ``file``. Forward with
    unpacking and keywords, matching the instacrawl.py implementation.
    """
    if is_debug:
        printl(*objects, sep=sep, end=end, file=file, flush=flush)
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print *objects*; when ``is_debug`` is set, prefix with the caller's file and line.

    Local duplicate of ``base.baseclasses.printl`` — the frame arithmetic below
    depends on its own stack depth, so it cannot simply delegate.

    NOTE(review): the format string has three placeholders, so only the first
    element of *objects* appears in the debug-mode message; ``str.format``
    silently ignores the extra positional arguments.
    """
    if is_debug:
        cur_frame = inspect.currentframe()
        call_frame = inspect.getouterframes(cur_frame, 2)

        # Old-style ternary (cond and a or b): report frame 2 (printd's caller)
        # when the immediate caller is printd, otherwise frame 1 (our caller).
        frame_no = call_frame[1][3] == 'printd' and 2 or 1
        file_path = call_frame[frame_no][1]
        line_no = call_frame[frame_no][2]
        # class_name = ''
        # if 'self' in call_frame[frame_no][0].f_locals:
        #     class_name = str(call_frame[frame_no][0].f_locals['self'].__class__)
        # method_name = call_frame[frame_no][3]

        try:
            # print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush)
            print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush)
        except Exception as e:
            # Best-effort: a formatting failure must not interrupt crawling.
            print(e)
    else:
        print(*objects, sep=sep, end=end, file=file, flush=flush)


# Instagram endpoint constants used throughout this module.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'

# Debug mode now follows debugger attachment instead of a hand-edited flag
# (replaces the former separate is_debuging/is_debug pair).
is_debug = is_debugger_attached()


def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Debug-only print: forwards everything to :func:`printl` when ``is_debug`` is True."""
    if is_debug:
        printl(*objects, sep=sep, end=end, file=file, flush=flush)


# Page size for list pagination requests.
num_of_list_ajax = 24
    def load_more(self):
        """Fetch the next page of the tag listing via plain GET pagination.

        Builds ``<tag url>?max_id=<end_cursor>``, parses the returned HTML with
        ``instaparser.parse_list_tag_html`` and updates ``list_tag``,
        ``end_cursor`` and ``has_next``. Returns the refreshed ``list_tag``.

        Replaces the legacy POST-based pagination against ``insta_query``
        (original Korean note: Instagram appears to have blocked the POST
        query endpoint).

        NOTE(review): log_load_url_before() runs *after* the request has
        already been issued and consumed by requests_get() — confirm the log
        ordering is intentional.
        """
        url = self.__url + "?max_id="+self.end_cursor
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)

        self.log_load_url_before()
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        # self.__url = url
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
        self.__r.close()
        self.log_load_url_after()

        return self.list_tag
    def load_reply_more(self):
        """Fetch older replies for this post via plain GET pagination.

        Builds ``<referer>?max_id=<start_cursor>``, parses the body HTML with
        ``instaparser.parse_body_html`` and updates ``body``, ``reply``,
        ``start_cursor`` and ``has_previous``. Returns the refreshed ``reply``.

        Replaces the legacy POST-based reply pagination against ``insta_query``
        (same endpoint-blocking workaround as ListTag.load_more).

        NOTE(review): assumes ``start_cursor`` is already a string cursor —
        confirm against the parser that sets it.
        """
        url = self.__referer + "?max_id="+self.start_cursor
        # self.log_load_reply_more_before(form_data, headers)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()
        # self.log_load_reply_more_after()
        return self.reply