#-*- coding: utf-8 -*- ''' Created on 2015. 12. 8. @author: cococo ''' import re import datetime import insta.instaparser as instaparser import insta.instaheaders as instaheaders import requests import logging # from multiprocessing import Queue # import multiprocessing from queue import Queue import threading import time import sys import inspect from base.baseclasses import SendtoDB from base.baseclasses import CrawlInit from base.baseclasses import wait from base.baseclasses import is_debugger_attached # from base.baseclasses import Browser from selenium.webdriver.common.keys import Keys from base.baseclasses import enter_element import base.proxy import eventlet def printl(*objects, sep=' ', end='\n', file=None, flush=True): if is_debug: cur_frame = inspect.currentframe() call_frame = inspect.getouterframes(cur_frame, 2) frame_no = call_frame[1][3] == 'printd' and 2 or 1 file_path = call_frame[frame_no][1] line_no = call_frame[frame_no][2] # class_name = '' # if 'self' in call_frame[frame_no][0].f_locals: # class_name = str(call_frame[frame_no][0].f_locals['self'].__class__) # method_name = call_frame[frame_no][3] try: # print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush) print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush) except Exception as e: print(e) else: print(*objects, sep=sep, end=end, file=file, flush=flush) insta_url = "https://www.instagram.com/" insta_tag_url = "https://www.instagram.com/explore/tags/" insta_query = "https://www.instagram.com/query/" insta_body_url = 'https://www.instagram.com/p/' is_debug = is_debugger_attached() def printd(*objects, sep=' ', end='\n', file=None, flush=True): if is_debug: printl(*objects, sep=sep, end=end, file=file, flush=flush) num_of_list_ajax = 24 num_of_reply_ajax = 100 list_wait_sec = 0.9 body_wait_sec = 0.5 reply_wait_sec = 0.8 num_of_page_down = 20 num_of_content_process = 10 requests_timeout = 60 num_of_retry_proxy = 5 logging.basicConfig(level=logging.INFO, format="%(module)s(%(lineno)s):%(funcName)s:%(message)s") logging.getLogger('requests').setLevel(logging.WARNING) logging.getLogger('pymysql').setLevel(logging.WARNING) def click_insta_load_more(driver): element = driver.find_element_by_css_selector("div._pupj3 > a") enter_element(element) def push_page_down(driver): body = driver.find_element_by_tag_name('body') body.send_keys(Keys.PAGE_DOWN) def focus_driver(driver): position = driver.get_window_position() size = driver.get_window_size() driver.maximize_window() driver.set_window_size(size['width'], size["height"]) driver.set_window_position(position['x'], position['y']) def requests_get(req, timeout=requests_timeout): body = [] start = time.time() for chunk in req.iter_content(1024): body.append(chunk) if time.time() > (start + timeout): req.close() raise Exception("timeout") return b''.join(body) eventlet.monkey_patch() def requests_wrapper(func): if sys.platform == 'win32': return func else: def wrapper(*args, **kwargs): with eventlet.Timeout(requests_timeout, Exception): return func(*args, **kwargs) return wrapper requests.get = requests_wrapper(requests.get) requests.post = requests_wrapper(requests.post) def instance_wrapper(func): # to save nice ip, port of proxy ip, port = base.proxy.get_proxy() def retry_load(*args, **kwargs): while True: # use clouser nonlocal ip, port proxies = base.proxy.get_requests_proxy(ip + ":" + port) kwargs['proxies'] = proxies # retry = num_of_retry_proxy # while retry: res = func(*args, **kwargs) if res: # printl("id : {2} - connect success - {0}:{1}".format(ip, port, threading.get_ident())) return res # if the proxy was not good, get new proxy # printl('id : {2} - connect failed - {0}:{1}'.format(ip, port, threading.get_ident())) ip, port = base.proxy.get_proxy() # retry -= 1 return retry_load class InstanceWrapper(object): def __init__(self, func): self.ip, self.port = base.proxy.get_proxy() self.func = func self.num_of_retry_proxy = num_of_retry_proxy def do(self, *args, **kwargs): while True: proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) kwargs['proxies'] = proxies # retry = num_of_retry_proxy # while retry: res = self.func(*args, **kwargs) if res: # printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) return res # if the proxy was not good, get new proxy # printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) self.ip, self.port = base.proxy.get_proxy() # retry -= 1 def do_retry(self, *args, **kwargs): while True: proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) kwargs['proxies'] = proxies retry = self.num_of_retry_proxy while retry: res = self.func(*args, **kwargs) if res: # printl("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) return res # if the proxy was not good, get new proxy # printl('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) retry -= 1 self.ip, self.port = base.proxy.get_proxy() def do_no_proxy(self, *args, **kwargs): while True: retry = self.num_of_retry_proxy while retry: proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port) kwargs['proxies'] = proxies res = self.func(*args, **kwargs) if res: printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident())) return res # if the proxy was not good, get new proxy printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident())) retry -= 1 self.ip, self.port = base.proxy.get_proxy() # if get content with proxy failed, set no proxy # func guarantee returning a instance except the case where a url is invalid kwargs['proxies'] = None res = self.func(*args, **kwargs) # if res: # printl("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident())) # printl(args, kwargs) printd("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident())) return res def change_proxy(self): self.ip, self.port = base.proxy.get_proxy() @instance_wrapper def make_list_instance(url, proxies=None): try: if insta_tag_url in url: list_crawler = ListTag(url, proxies) else: list_crawler = ListUser(url, proxies) return list_crawler except Exception as e: printd(e) printd("Fail to make list instance") return None # @instance_wrapper def make_content_instance(url, proxies=None): try: content = InstaContent(url, {}, url, proxies) return content except Exception as e: printd(e) printd("Fail to make contanet instance") return None def ajax_wrapper(func): def retry_ajax_load(*args, **kwargs): retry = num_of_retry_proxy while retry: res = func(*args, **kwargs) if res is not None: break retry -= 1 return res return retry_ajax_load # @ajax_wrapper def load_ajax_list(ins): try: insta_list = ins.load_more() # if insta_list: # return insta_list # else: # return None return insta_list except Exception as e: printd(e) printd("Fail to load ajax list") return None # @ajax_wrapper def load_ajax_reply(ins): try: replies = ins.load_reply_more() # if replies: # return replies # else: # return None return replies except Exception as e: printd(e) printd("Fail to load ajax reply") return None # def crawl_content_process(qu, keyword_id, db_num): # send_to_db = SendtoDB() # send_to_db.set_db(db_num) # while True: # element = qu.get() # if element is None: # break # ok = True # while ok: # try: # ip, port = base.proxy.get_proxy() # proxies = base.proxy.get_requests_proxy(ip + ":" + port) # content = InstaContent(element['url'], {}, element['url'], proxies) # body = content.get_body() # replies = content.get_reply() # body['article_url'] = element['url'] # body['keyword_id'] = keyword_id # while content.has_previous: # replies = content.load_reply_more() + replies # wait(reply_wait_sec) # for j in range(0, len(replies)): # replies[j]['article_url'] = body['article_url'] # replies[j]['platform_id'] = body['platform_id'] # replies[j]['article_order'] = j # send_to_db.delete_url(body['article_url']) # send_to_db.send_body(body) # if replies: # send_to_db.send_reply(replies) # printl(element['url']) # printl('ok') # ok = False # except: # printl("failed proxy {0}:{1}".format(ip, port)) # printl('finish thread') def crawl_content_process(qu, keyword_id, db_num): # m_c_i = instance_wrapper(make_content_instance) m_c_i = InstanceWrapper(make_content_instance) send_to_db = SendtoDB() send_to_db.set_db(db_num) while True: try: element = qu.get(timeout=60) except Exception as e: printl("[crawl_content_process] queue is empty") continue if element is None: break ok = True while ok: try: # get a instance of InstaContent by do_no_proxy func. # if element['url'] is invalid, content is None content = m_c_i.do_no_proxy(element['url']) if not content: break body = content.get_body() replies = content.get_reply() body['article_url'] = element['url'] body['keyword_id'] = keyword_id while content.has_previous: rep = load_ajax_reply(content) if rep is None: printl("proxies = ", content.proxies) m_c_i.change_proxy() raise Exception("reply load error") replies = rep + replies wait(reply_wait_sec) for j in range(0, len(replies)): replies[j]['article_url'] = body['article_url'] replies[j]['platform_id'] = body['platform_id'] replies[j]['article_order'] = j send_to_db.delete_url(body['article_url']) send_to_db.send_body(body) if replies: send_to_db.send_reply(replies) printl("proxies = ", content.proxies['http'][7:]) printl(element['url']) printl('ok') ok = False except UnicodeEncodeError as ue: printl(element['url']) printl(ue) break except Exception as e: # catch error when send_to_db error occur printl(element['url']) printl(e) qu.task_done() printl('finish thread') class InstaInit(CrawlInit): def __init__(self, before_day=0): super().__init__(before_day) self.urls = dict() self.urls[9] = insta_tag_url self.urls[10] = insta_url def split_searches(self): search = self.searches() splited_list = search.split(',') trimmed_list = list() if self.platform() == 10: for x in splited_list: trimmed_list.append(x.strip()) else: for x in splited_list: trimmed_list.append(self.utf8(x)) return trimmed_list def make_url(self): urls = list() for x in self.split_searches(): url = self.urls[self.platform()] + x urls.append(url) return urls def get_begin_day(self): if self.is_realtime(): date_now = datetime.datetime.now() result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) result += datetime.timedelta(days=self.before_day) return result.date() else: return self.start_day() def get_end_day(self): if self.is_realtime(): date_now = datetime.datetime.now() result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) return result.date() else: return self.end_day() class ListTag: def __init__(self, url, proxies=None): self.__r = None self.__tag = '' self.__url = '' self.list_tag = [] self.end_cursor = None self.has_next = False self.cookies = {} self.proxies = proxies self.load_url(url, self.proxies) def load_url(self, url, proxies): self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) self.log_load_url_before() self.__r.raise_for_status() self.__tag = self.__get_tag(url) self.__set_cookies(self.__r.cookies) self.__url = url #self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content) self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content) self.__r.close() self.log_load_url_after() return self.list_tag def load_more(self): url = self.__url + "?max_id="+self.end_cursor self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) self.log_load_url_before() self.__r.raise_for_status() self.__tag = self.__get_tag(url) self.__set_cookies(self.__r.cookies) # self.__url = url # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content) self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content) self.__r.close() self.log_load_url_after() # 기존 방식 instagram에서 post를 막은 듯 # form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax) # headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data) # self.log_load_more_before(form_data, headers) # self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, # timeout=requests_timeout, stream=True) # content = requests_get(self.__r) # self.__set_cookies(self.__r.cookies) # self.__r.raise_for_status() # # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content) # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content) # self.__r.close() # self.log_load_more_after() return self.list_tag def __get_tag(self, url): m = re.search(insta_tag_url + "([^/]*)", url) if m: return m.group(1) else: raise RuntimeError('Tag Error') def get_cookies(self): return self.cookies def get_url(self): return self.__url def set_end_cursor(self, cursor): self.end_cursor = cursor def get_end_cursor(self): return self.end_cursor def __set_cookies(self, cookies): for k, v in cookies.items(): self.cookies[k] = v def get_list(self): return self.list_tag def get_proxy(self): return self.proxies def log_load_url_before(self): if is_debug: printl("") printl("") printl('headers = ', end=' ') printl(instaheaders.get_headers_for_list_html()) def log_load_url_after(self): if is_debug: printl("") printl('self.__r.cookies=', end='') printl(self.__r.cookies) printl('end_cursor = ' + str(self.end_cursor)) printl('has_next = ', end='') printl(self.has_next) printl('proxies = ', end='') printl(self.proxies) printl("") def log_load_more_before(self, form_data, headers): if is_debug: printl("") printl("") printl('end_cursor = ' + str(self.end_cursor)) printl('form_data' + form_data) printl('headers = ', end=' ') printl(headers) def log_load_more_after(self): if is_debug: printl("") printl('self.__r.cookies=', end='') printl(self.__r.cookies) printl('end_cursor = ' + str(self.end_cursor)) printl('has_next = ', end='') printl(self.has_next) printl('proxies = ', end='') printl(self.proxies) printl("") class ListUser: def __init__(self, url, proxies=None): self.__r = None self.__user = '' self.__url = '' self.list_user = [] self.end_cursor = None self.has_next = False self.cookies = {} self.proxies = proxies self.load_url(url, self.proxies) def load_url(self, url, proxies): self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) self.__r.raise_for_status() self.__url = url self.__set_cookies(self.__r.cookies) # self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content) self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content) self.__r.close() return self.list_user def load_more(self): url = self.__url + "?max_id=" + self.end_cursor self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies, timeout=requests_timeout, stream=True) # form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax) # headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data) # self.log_load_more_before(form_data, headers) # self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies, # timeout=requests_timeout, stream=True) content = requests_get(self.__r) self.__r.raise_for_status() self.__set_cookies(self.__r.cookies) # self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content) self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content) self.__r.close() # self.log_load_more_after() return self.list_user def get_cookies(self): return self.cookies def get_url(self): return self.__url def set_end_cursor(self, cursor): self.end_cursor = cursor def get_end_cursor(self): return self.end_cursor def __set_cookies(self, cookies): for k, v in cookies.items(): self.cookies[k] = v def get_list(self): return self.list_user def get_proxy(self): return self.proxies def log_load_more_before(self, form_data, headers): if is_debug: printl("") printl("") printl('end_cursor = ' + str(self.end_cursor)) printl('form_data' + form_data) printl('headers = ', end=' ') printl(headers) def log_load_more_after(self): if is_debug: printl("") printl('self.__r.cookies=', end='') printl(self.__r.cookies) printl('end_cursor = ' + str(self.end_cursor)) printl('has_next = ', end='') printl(self.has_next) printl('proxies = ', end='') printl(self.proxies) printl("") class InstaContent: def __init__(self, url, cookies, referer, proxies=None): self.__r = None self.__referer = '' self.__code = '' self.body = None self.reply = [] self.start_cursor = None self.has_previous = False self.cookies = {} self.proxies = proxies self.load_url(url, cookies, referer, self.proxies) def load_url(self, url, cookies, referer, proxies): self.__set_cookies(cookies) self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) self.__r.raise_for_status() self.__referer = referer self.__code = self.__get_code(url) # self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content) self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content) self.__set_cookies(self.__r.cookies) self.__r.close() return self.body, self.reply def get_body(self): return self.body def get_reply(self): return self.reply def load_reply_more(self): url = self.__referer + "?max_id="+self.start_cursor # self.log_load_reply_more_before(form_data, headers) self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) self.__r.raise_for_status() self.__code = self.__get_code(url) self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content) self.__set_cookies(self.__r.cookies) self.__r.close() # self.log_load_reply_more_after() return self.reply def get_cookies(self): return self.cookies def __get_code(self, url): m = re.search(insta_body_url + "([^/]*)", url) if m: return m.group(1) else: raise RuntimeError('Tag Error') def __set_cookies(self, cookies): for k, v in cookies.items(): self.cookies[k] = v def get_proxy(self): return self.proxies def log_load_reply_more_before(self, form_data, headers): if is_debug: printl("") printl("") printl('start_cursor = ' + self.start_cursor) printl('form_data' + form_data) printl('headers = ', end=' ') printl(headers) def log_load_reply_more_after(self): if is_debug: printl("") printl('self.__r.cookies=', end='') printl(self.__r.cookies) printl('start_cursor = ' + str(self.start_cursor)) printl('has_previous = ', end='') printl(self.has_previous) printl('proxies = ', end='') printl(self.proxies) printl("") class InstaAlgorithm: def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second=2, num_of_load_content=12, page_down=50): self.send_to_db = send_to_db self.crawl_init = crawl_init self.browser = browser self.driver = driver self.keyword_id = keyword_id self.reload_wait_second = reload_wait_second self.num_of_load_content = num_of_load_content self.page_down = page_down self.list_crawl = [] def crawl_content(self, url, cookies, referer): content = InstaContent(url, cookies, referer) body = content.get_body() replies = content.get_reply() body['article_url'] = url body['keyword_id'] = self.keyword_id # printl(body['article_url']) while content.has_previous: replies = content.load_reply_more() + replies wait(reply_wait_sec) for j in range(0, len(replies)): replies[j]['article_url'] = body['article_url'] replies[j]['platform_id'] = body['platform_id'] replies[j]['article_order'] = j self.send_to_db.delete_url(body['article_url']) self.send_to_db.send_body(body) if replies: self.send_to_db.send_reply(replies) printl('ok') printl() def start_crawl(self): self.crawl() self.close() def close(self): if self.driver and not is_debug: self.driver.quit() self.send_to_db.close() printl("Finished Crawling :)") def crawl(self): raise NotImplementedError def is_until_page(self): if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl): return True else: return False def crawl_contents(self, contents_list, backup_set): """ :param contents_list: :param backup_set: :return: is_load_more """ old_elements = 0 for element in contents_list: if element['date'].date() > self.crawl_init.get_end_day(): # printl(element['url']) printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) elif element['date'].date() < self.crawl_init.get_begin_day(): printl(element['url']) printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) old_elements += 1 if old_elements > 6: return False else: if not element['url'] in backup_set: # printl(element['url']) # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) # wait(1.5) # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url()) printl("element insert to queue {}".format(element['url'])) self.list_crawl.append(element) backup_set.add(element['url']) if self.is_until_page(): return False if self.list_crawl: printl("Number of Lists = {0}".format(len(self.list_crawl))) return True def crawl_list(self): if self.list_crawl: printl() printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S"))) printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S"))) printl("Total gathered contents = {0}".format(len(self.list_crawl))) printl() for element in self.list_crawl: try: printl(element['url']) printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) wait(body_wait_sec) self.crawl_content(element['url'], {}, element['url']) except Exception as e: printl(e) logging.info(e) class InstaAlgorithmNormal(InstaAlgorithm): def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second=2, num_of_load_content=12, page_down=50): super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second, num_of_load_content, page_down) if self.driver: self.driver.quit() def crawl(self): real_time = True while real_time: printl("Crawling Start") url_list = self.crawl_init.make_url() i = 0 end_cursor = None backup_set = set() while i < len(url_list): # first connect try: printl(url_list[i] + "\n") if insta_tag_url in url_list[i]: list_crawler = ListTag(url_list[i]) else: list_crawler = ListUser(url_list[i]) wait(1) insta_list = list_crawler.get_list() is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next # ajax load while is_load_more: if end_cursor: list_crawler.end_cursor = end_cursor end_cursor = None wait(self.reload_wait_second) insta_list = list_crawler.load_more() is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next self.crawl_list() self.list_crawl.clear() i += 1 except Exception as e: logging.info(e) end_cursor = list_crawler.end_cursor printl('end_cursor=' + end_cursor) if e.args: wait(300) real_time = self.crawl_init.is_realtime() printl("Finished Crawling :)") class InstaAlgorithmMulti(InstaAlgorithm): def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second=2, num_of_load_content=12, page_down=50): super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second, num_of_load_content, page_down) if self.driver: self.driver.quit() self.list_crawl = Queue() self.total_num = 0 def crawl_contents(self, contents_list, backup_set): """ :param contents_list: :param backup_set: :return: is_load_more """ old_elements = 0 for element in contents_list: if element['date'].date() > self.crawl_init.get_end_day(): # printl(element['url']) printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) elif element['date'].date() < self.crawl_init.get_begin_day(): printl(element['url']) printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) old_elements += 1 if old_elements > 6: return False else: if not element['url'] in backup_set: # printl(element['url']) # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) # wait(1.5) # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url()) try: self.list_crawl.put(element, timeout=10) except Exception as e: printl(e) printl("queue size = ", self.list_crawl.qsize()) backup_set.add(element['url']) self.total_num += 1 if self.is_until_page(): return False # if self.list_crawl: # printl("Number of Lists = {0}".format(len(self.list_crawl))) return True def crawl(self): real_time = True while real_time: printl("Crawling Start") url_list = self.crawl_init.make_url() i = 0 end_cursor = None backup_set = set() while i < len(url_list): # first connect try: printl(url_list[i] + "\n") # insta_content process create and start # p_list = [multiprocessing.Process(target=crawl_content_process, # args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num)) # for i in range(num_of_content_process)] printl("{} processs start".format(num_of_content_process)) p_list = [threading.Thread(target=crawl_content_process, args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num)) for i in range(num_of_content_process)] for p in p_list: p.daemon = True p.start() # crawl list ok = True while ok: try: list_crawler = make_list_instance(url_list[i]) ok = False except Exception as e: printl(e) wait(1) insta_list = list_crawler.get_list() is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next # ajax load while is_load_more: if end_cursor: list_crawler.end_cursor = end_cursor end_cursor = None wait(self.reload_wait_second) try: insta_list = load_ajax_list(list_crawler) if insta_list is None: break is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next except Exception as e: printl('is_load_more exception') printl(e) is_load_more = False #self.crawl_list() #self.list_crawl.close() printl("end load") printl("total number of crawled list = {0}".format(self.total_num)) self.total_num = 0 # check task is done in queue # self.list_crawl.join() # stop child process for i in range(num_of_content_process): self.list_crawl.put(None, timeout=10) # wait child process for p in p_list: p.join() for _ in range(self.list_crawl.qsize()): self.list_crawl.get(block=False) i += 1 except Exception as e: logging.info(e) end_cursor = list_crawler.end_cursor printl('end_cursor=' + end_cursor) if e.args: wait(300) real_time = self.crawl_init.is_realtime() printl("Finished Crawling :)") class InstaAlgorithmBrowser(InstaAlgorithm): def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second=2, num_of_load_content=12, page_down=50): super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second, num_of_load_content, page_down) def url_load(self, url): if insta_tag_url in url: list_tag = ListTag(url) insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source) return list_tag, insta_list, end_cursor, has_next else: list_user = ListUser(url) insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source) return list_user, insta_list, end_cursor, has_next def crawl(self): real_time = True while real_time: url_list = self.crawl_init.make_url() i = 0 end_cursor = None backup_set = set() while i < len(url_list): # first connect try: wait(3) printl(url_list[i] + "\n") self.driver.get(url_list[i]) wait(5) list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i]) is_load_more = self.crawl_contents(insta_list, backup_set) and has_next list_crawler.set_end_cursor(end_cursor2) list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()} # ajax load page_down = 0 while is_load_more: if page_down == self.page_down: page_down = 0 try: focus_driver(self.driver) click_insta_load_more(self.driver) except: push_page_down(self.driver) page_down += 1 if end_cursor: list_crawler.end_cursor = end_cursor end_cursor = None wait(self.reload_wait_second) insta_list = list_crawler.load_more() # printl("list length = " + str(len(insta_list))) is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next # printl("number of backup_set = {0}".format(len(backup_set))) i += 1 self.crawl_list() self.list_crawl.clear() except Exception as e: logging.info(e) end_cursor = list_crawler.end_cursor printl('end_cursor=' + end_cursor) if e.args: wait(300) if self.driver: self.driver.close() wait(3) self.driver = self.browser.new_browser() real_time = self.crawl_init.is_realtime() printl("Finished Crawling :)") class InstaMainCrawler: def __init__(self): self.send_to_db = SendtoDB() self.crawl_init = InstaInit() #self.browser = Browser() self.browser = None self.driver = None def set_keyword_id(self, keyword_id): self.keyword_id = keyword_id def crawl_all(self, backup_set=None): pass def start(self): self.crawler_start() def set_arguments(self, browser, keyword_id, db_num, before_day, until_page): self.init_keyword_id(keyword_id) self.init_db(db_num) self.init_before_day(before_day) self.init_until_page(until_page) # self.init_browser(browser) def set_driver(self, driver): self.driver = driver def init_browser(self, browser): try: self.set_driver(self.browser.get_new_driver(browser)) except Exception as e: logging.info(e) def init_keyword_id(self, keyword_id): if type(keyword_id) != int: self.keyword_id = int(keyword_id) else: self.keyword_id = keyword_id self.crawl_init.get_keyword_parameters(keyword_id) self.crawl_init.disconnect() def init_db(self, db_num): self.send_to_db.set_db(db_num) def init_before_day(self, before_day): self.crawl_init.set_before_day(before_day) def init_until_page(self, until_page): self.crawl_init.set_until_page(until_page) def crawler_start(self): # if self.driver: # algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db, # self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down) # else: # algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db, # self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down) algorithm = InstaAlgorithmMulti(self.driver, self.browser, self.crawl_init, self.send_to_db, self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down) algorithm.start_crawl()