# -*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.

@author: cococo
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
import logging
# from multiprocessing import Queue
# import multiprocessing
from queue import Queue
import threading
import time
import sys
import bs4
import inspect
from base.baseclasses import SendtoDB
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import is_debugger_attached
# from base.baseclasses import Browser
from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element
import base.proxy
import eventlet


def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    if is_debug:
        cur_frame = inspect.currentframe()
        call_frame = inspect.getouterframes(cur_frame, 2)
        frame_no = 2 if call_frame[1][3] == 'printd' else 1
        file_path = call_frame[frame_no][1]
        line_no = call_frame[frame_no][2]
        # class_name = ''
        # if 'self' in call_frame[frame_no][0].f_locals:
        #     class_name = str(call_frame[frame_no][0].f_locals['self'].__class__)
        # method_name = call_frame[frame_no][3]
        try:
            objects = ('{}({}) :'.format(file_path, line_no),) + objects
            print(*objects, sep=sep, end=end, file=file, flush=flush)
        except Exception as e:
            print(e)
    else:
        print(*objects, sep=sep, end=end, file=file, flush=flush)


insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'

is_debug = is_debugger_attached()


def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    if is_debug:
        printl(*objects, sep=sep, end=end, file=file, flush=flush)


num_of_list_ajax = 24
num_of_reply_ajax = 100
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
requests_timeout = 5
num_of_retry_proxy = 5

logging.basicConfig(level=logging.INFO, format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)


def click_insta_load_more(driver):
    element = driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(element)


def push_page_down(driver):
    body = driver.find_element_by_tag_name('body')
    body.send_keys(Keys.PAGE_DOWN)


def focus_driver(driver):
    position = driver.get_window_position()
    size = driver.get_window_size()
    driver.maximize_window()
    driver.set_window_size(size['width'], size["height"])
    driver.set_window_position(position['x'], position['y'])


def requests_get(req, timeout=requests_timeout):
    body = []
    start = time.time()
    for chunk in req.iter_content(1024):
        body.append(chunk)
        if time.time() > (start + timeout):
            req.close()
            raise Exception("timeout")
    return b''.join(body)


# patch blocking stdlib sockets so eventlet.Timeout can interrupt requests
eventlet.monkey_patch()


def requests_wrapper(func):
    if sys.platform == 'win32':
        return func
    else:
        def wrapper(*args, **kwargs):
            with eventlet.Timeout(requests_timeout, Exception):
                return func(*args, **kwargs)
        return wrapper


# enforce a hard timeout on every requests call
# (skipped on win32, where the eventlet-based wrapper is not used)
requests.get = requests_wrapper(requests.get)
requests.post = requests_wrapper(requests.post)


def instance_wrapper(func):
    # remember the last known-good proxy ip and port
    ip, port = base.proxy.get_proxy()

    def retry_load(*args, **kwargs):
        # closure variables shared across retries
        nonlocal ip, port
        while True:
            proxies = base.proxy.get_requests_proxy(ip + ":" + port)
            kwargs['proxies'] = proxies
            # retry = num_of_retry_proxy
            # while retry:
            res = func(*args, **kwargs)
            if res:
                # printl("id : {2} - connect success - {0}:{1}".format(ip, port, threading.get_ident()))
                return res
            # if the proxy was not good, get a new proxy
            # printl('id : {2} - connect failed - {0}:{1}'.format(ip, port, threading.get_ident()))
            ip, port = base.proxy.get_proxy()
            # retry -= 1
    return retry_load
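

# Illustrative sketch, not used anywhere in this module: instance_wrapper is
# meant to be applied as a decorator, so every call of the wrapped factory
# retries with a fresh proxy from base.proxy until it returns a truthy value.
# The _example_fetch name and its body are hypothetical; the sketch stays
# commented out because the decorator pulls a proxy at definition time.
#
# @instance_wrapper
# def _example_fetch(url, proxies=None):
#     r = requests.get(url, proxies=proxies, timeout=requests_timeout)
#     return r.content if r.ok else None  # a falsy return rotates the proxy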


class InstanceWrapper(object):
    def __init__(self, func):
        self.ip, self.port = base.proxy.get_proxy()
        self.func = func
        self.num_of_retry_proxy = num_of_retry_proxy

    def do(self, *args, **kwargs):
        while True:
            proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
            kwargs['proxies'] = proxies
            # retry = num_of_retry_proxy
            # while retry:
            res = self.func(*args, **kwargs)
            if res:
                # printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                return res
            # if the proxy was not good, get a new proxy
            # printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
            self.ip, self.port = base.proxy.get_proxy()
            # retry -= 1

    def do_retry(self, *args, **kwargs):
        while True:
            proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
            kwargs['proxies'] = proxies
            retry = self.num_of_retry_proxy
            while retry:
                res = self.func(*args, **kwargs)
                if res:
                    # printl("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                    return res
                # if the proxy was not good, get a new proxy
                # printl('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
                retry -= 1
            self.ip, self.port = base.proxy.get_proxy()

    def do_no_proxy(self, *args, **kwargs):
        while True:
            retry = self.num_of_retry_proxy
            while retry:
                proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
                kwargs['proxies'] = proxies
                res = self.func(*args, **kwargs)
                if res:
                    printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                    return res
                # if the proxy was not good, get a new proxy
                printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
                retry -= 1
                self.ip, self.port = base.proxy.get_proxy()
            # if fetching the content through a proxy failed, fall back to no proxy.
            # func is guaranteed to return an instance unless the url itself is invalid.
            kwargs['proxies'] = None
            res = self.func(*args, **kwargs)
            # if res:
            #     printl("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
            #     printl(args, kwargs)
            printd("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
            return res

    def change_proxy(self):
        self.ip, self.port = base.proxy.get_proxy()
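

# Minimal usage sketch for InstanceWrapper (assumed workflow; it mirrors
# crawl_content_process further below). The shortcode in the URL is a
# placeholder. do() rotates proxies forever, do_retry() rotates after
# num_of_retry_proxy failures, and do_no_proxy() finally falls back to a
# direct connection.
def _example_instance_wrapper_usage():
    maker = InstanceWrapper(make_content_instance)  # fetches a first proxy
    content = maker.do_no_proxy('https://www.instagram.com/p/SHORTCODE/')
    if content is not None:
        printd(content.get_body())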


@instance_wrapper
def make_list_instance(url, proxies=None):
    try:
        if insta_tag_url in url:
            list_crawler = ListTag(url, proxies)
        else:
            list_crawler = ListUser(url, proxies)
        return list_crawler
    except requests.exceptions.ProxyError as e:
        printd('proxy: {}'.format(e))
        printd("Fail to make list instance")
        return None
    except Exception as e:
        printd(e)
        printd("Fail to make list instance")
        return None


# @instance_wrapper
def make_content_instance(url, proxies=None):
    try:
        content = InstaContent(url, {}, url, proxies)
        return content
    except requests.exceptions.ProxyError as e:
        printd('proxy: {}'.format(e))
        printd("Fail to make content instance")
        return None
    except Exception as e:
        printd(e)
        printd("Fail to make content instance")
        return None


def ajax_wrapper(func):
    def retry_ajax_load(*args, **kwargs):
        retry = num_of_retry_proxy
        while retry:
            res = func(*args, **kwargs)
            if res is not None:
                break
            retry -= 1
        return res
    return retry_ajax_load


# @ajax_wrapper
def load_ajax_list(ins):
    try:
        insta_list = ins.load_more()
        # if insta_list:
        #     return insta_list
        # else:
        #     return None
        return insta_list
    except Exception as e:
        printd(e)
        printd("Fail to load ajax list")
        return None


# @ajax_wrapper
def load_ajax_reply(ins):
    try:
        replies = ins.load_reply_more()
        return replies
    except Exception as e:
        printd(e)
        printd("Fail to load ajax reply")
        return None


# def crawl_content_process(qu, keyword_id, db_num):
#     send_to_db = SendtoDB()
#     send_to_db.set_db(db_num)
#     while True:
#         element = qu.get()
#         if element is None:
#             break
#         ok = True
#         while ok:
#             try:
#                 ip, port = base.proxy.get_proxy()
#                 proxies = base.proxy.get_requests_proxy(ip + ":" + port)
#                 content = InstaContent(element['url'], {}, element['url'], proxies)
#                 body = content.get_body()
#                 replies = content.get_reply()
#                 body['article_url'] = element['url']
#                 body['keyword_id'] = keyword_id
#                 while content.has_previous:
#                     replies = content.load_reply_more() + replies
#                     wait(reply_wait_sec)
#                 for j in range(0, len(replies)):
#                     replies[j]['article_url'] = body['article_url']
#                     replies[j]['platform_id'] = body['platform_id']
#                     replies[j]['article_order'] = j
#                 send_to_db.delete_url(body['article_url'])
#                 send_to_db.send_body(body)
#                 if replies:
#                     send_to_db.send_reply(replies)
#                 printl(element['url'])
#                 printl('ok')
#                 ok = False
#             except:
#                 printl("failed proxy {0}:{1}".format(ip, port))
#     printl('finish thread')


def crawl_content_process(qu, keyword_id, db_num):
    # m_c_i = instance_wrapper(make_content_instance)
    m_c_i = InstanceWrapper(make_content_instance)
    send_to_db = SendtoDB()
    send_to_db.set_db(db_num)
    while True:
        try:
            element = qu.get(timeout=60)
        except Exception:
            printl("[crawl_content_process] queue is empty")
            continue
        if element is None:
            break
        ok = True
        while ok:
            try:
                # get an instance of InstaContent via do_no_proxy();
                # if element['url'] is invalid, content is None
                # element['url'] = 'https://www.instagram.com/p/BWrBng6l9H3/'  # leftover debug override; keep disabled
                content = m_c_i.do_no_proxy(element['url'])
                if not content:
                    break
                body = content.get_body()
                replies = content.get_reply()
                body['article_url'] = element['url']
                body['keyword_id'] = keyword_id
                while content.has_previous:
                    rep = load_ajax_reply(content)
                    if rep is None:
                        printl("proxies = ", content.proxies)
                        m_c_i.change_proxy()
                        raise Exception("reply load error")
                    # if rep:
                    replies = rep + replies
                    wait(reply_wait_sec)
                for j in range(0, len(replies)):
                    replies[j]['article_url'] = body['article_url']
                    replies[j]['platform_id'] = body['platform_id']
                    replies[j]['article_order'] = j
                send_to_db.delete_url(body['article_url'])
                send_to_db.send_body(body)
                if replies:
                    send_to_db.send_reply(replies)
                if content.proxies is not None:
                    printl("proxies = ", content.proxies['http'][7:])
                printl(element['url'])
                printl('ok')
                ok = False
            except UnicodeEncodeError as ue:
                printl(element['url'])
                printl(ue)
                break
            except Exception as e:
                # catch errors raised by send_to_db and retry the element
                printl(element['url'])
                printl(e)
        qu.task_done()
    printl('finish thread')
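

def _example_content_workers(urls, keyword_id, db_num):
    # Hypothetical driver for crawl_content_process (InstaAlgorithmMulti.crawl
    # below is the real producer). One None sentinel per worker ends its loop.
    qu = Queue()
    workers = [threading.Thread(target=crawl_content_process,
                                args=(qu, keyword_id, db_num), daemon=True)
               for _ in range(num_of_content_process)]
    for w in workers:
        w.start()
    for url in urls:
        qu.put({'url': url})
    for _ in workers:
        qu.put(None)
    for w in workers:
        w.join()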


class InstaInit(CrawlInit):
    def __init__(self, before_day=0):
        super().__init__(before_day)
        self.urls = dict()
        self.urls[9] = insta_tag_url
        self.urls[10] = insta_url

    def split_searches(self):
        search = self.searches()
        split_list = search.split(',')
        trimmed_list = list()
        if self.platform() == 10:
            for x in split_list:
                trimmed_list.append(x.strip())
        else:
            for x in split_list:
                trimmed_list.append(self.utf8(x))
        return trimmed_list

    def make_url(self):
        urls = list()
        for x in self.split_searches():
            url = self.urls[self.platform()] + x
            urls.append(url)
        return urls

    def get_begin_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result.date()
        else:
            return self.start_day()

    def get_end_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            return result.date()
        else:
            return self.end_day()
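

# Worked example for the realtime window above (assuming before_day == -3 and
# "now" == 2015-12-08 14:00): get_begin_day() truncates now to midnight and
# shifts it by before_day days -> 2015-12-05, while get_end_day() stays at
# 2015-12-08, so crawl_contents() keeps posts whose date falls in that closed
# day range.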


class ListTag:
    def __init__(self, url, proxies=None):
        self.__r = None
        self.__tag = ''
        self.__url = ''
        self.list_tag = []
        self.end_cursor = None
        self.has_next = False
        self.cookies = {}
        self.proxies = proxies
        self.load_url(url, self.proxies)

    def load_url(self, url, proxies):
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        # self.log_load_url_before()
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
        self.__r.close()
        printd('tag list, end_cursor: {}'.format(self.end_cursor))
        # self.log_load_url_after()
        return self.list_tag

    def load_more(self):
        url = self.__url + "?max_id=" + self.end_cursor
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        # self.log_load_url_before()
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        # self.__url = url
        # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
        self.__r.close()
        printd('tag list, end_cursor: {}'.format(self.end_cursor))
        # self.log_load_more_after()
        # Old approach: Instagram blocked these POST requests to the query
        # endpoint, so the block below is kept for reference only.
        # form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
        # headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        # self.log_load_more_before(form_data, headers)
        # self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
        #                          timeout=requests_timeout, stream=True)
        # content = requests_get(self.__r)
        # self.__set_cookies(self.__r.cookies)
        # self.__r.raise_for_status()
        # # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
        # self.__r.close()
        # self.log_load_more_after()
        return self.list_tag

    def __get_tag(self, url):
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_tag

    def get_proxy(self):
        return self.proxies

    def log_load_url_before(self):
        if is_debug:
            printl("")
            printl("")
            printl('headers = ', end=' ')
            printl(instaheaders.get_headers_for_list_html())

    def log_load_url_after(self):
        if is_debug:
            printl("")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("")

    def log_load_more_before(self, form_data, headers):
        if is_debug:
            printl("")
            printl("")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data = ' + form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_more_after(self):
        if is_debug:
            printl("")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("")
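

def _example_tag_pagination(tag_url):
    # Pagination sketch (assumed flow): __init__ triggers load_url(), then each
    # load_more() call re-requests the page with ?max_id=<end_cursor> until
    # has_next goes False. ListUser below follows the same protocol.
    crawler = ListTag(tag_url)
    posts = list(crawler.get_list())
    while crawler.has_next:
        wait(list_wait_sec)
        posts += crawler.load_more()
    return posts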


class ListUser:
    def __init__(self, url, proxies=None):
        self.__r = None
        self.__user = ''
        self.__url = ''
        self.list_user = []
        self.end_cursor = None
        self.has_next = False
        self.cookies = {}
        self.proxies = proxies
        self.load_url(url, self.proxies)

    def load_url(self, url, proxies):
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        # self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
        self.__r.close()
        printd('user list, end_cursor: {}'.format(self.end_cursor))
        return self.list_user

    def load_more(self):
        url = self.__url + "?max_id=" + self.end_cursor
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
                                timeout=requests_timeout, stream=True)
        # form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
        # headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        # self.log_load_more_before(form_data, headers)
        # self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
        #                          timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        # self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
        self.__r.close()
        printd('user list, end_cursor: {}'.format(self.end_cursor))
        # self.log_load_more_after()
        return self.list_user

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_user

    def get_proxy(self):
        return self.proxies

    def log_load_more_before(self, form_data, headers):
        if is_debug:
            printl("")
            printl("")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data = ' + form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_more_after(self):
        if is_debug:
            printl("")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("")


class InstaContent:
    def __init__(self, url, cookies, referer, proxies=None):
        self.__r = None
        self.__referer = ''
        self.__code = ''
        self.body = None
        self.reply = []
        self.start_cursor = None
        self.has_previous = False
        self.cookies = {}
        self.proxies = proxies
        self.content = ''
        self.query_id = ''
        self.load_url(url, cookies, referer, self.proxies)

    def load_url(self, url, cookies, referer, proxies):
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.content = content
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        # self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()
        printd('reply, end_cursor: {}'.format(self.start_cursor))
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def get_query_ids(self, html):
        doc = bs4.BeautifulSoup(html, "html.parser")
        query_ids = []
        for script in doc.find_all("script"):
            if script.has_attr("src") and "_Commons.js" in script['src']:
                text = requests.get("%s%s" % ('https://www.instagram.com', script['src'])).text
                for query_id in re.findall("(?<=queryId:\")[0-9]{17}", text):
                    query_ids.append(query_id)
        return query_ids

    def find_query_id(self):
        potential_query_ids = self.get_query_ids(self.content)
        query_id = ''
        for potential_id in potential_query_ids:
            # url = "https://www.instagram.com/graphql/query/?query_id=%s&first=12&after=%s" % (potential_id, self.start_cursor)
            url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
                potential_id, self.__code, len(self.reply), self.start_cursor)
            try:
                data = requests.get(url).json()
                if data['status'] == 'ok':
                    query_id = potential_id
                    break
            except Exception:
                # no valid JSON returned, most likely a wrong query_id resulting in 'Oops, an error occurred.'
                pass
        return query_id

    def load_reply_more(self):
        if not self.query_id:
            self.query_id = self.find_query_id()
        url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
            self.query_id, self.__code, len(self.reply), self.start_cursor)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies),
                                proxies=self.proxies, timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()
        self.reply = self.reply + reply
        printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
        return reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Code Error')

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_proxy(self):
        return self.proxies

    def log_load_reply_more_before(self, form_data, headers):
        if is_debug:
            printl("")
            printl("")
            printl('start_cursor = ' + str(self.start_cursor))
            printl('form_data = ' + form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_reply_more_after(self):
        if is_debug:
            printl("")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("")
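

def _example_reply_pagination(post_url):
    # Reply-pagination sketch (assumed flow, mirroring crawl_content_process):
    # the first comment page comes with the post HTML; older pages come from
    # the GraphQL endpoint once find_query_id() has sniffed a working query_id
    # out of the site's Commons JS bundle.
    content = InstaContent(post_url, {}, post_url)
    while content.has_previous:
        content.load_reply_more()
        wait(reply_wait_sec)
    return content.get_reply()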


class InstaAlgorithm:
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second=2,
                 num_of_load_content=12, page_down=50):
        self.send_to_db = send_to_db
        self.crawl_init = crawl_init
        self.browser = browser
        self.driver = driver
        self.keyword_id = keyword_id
        self.reload_wait_second = reload_wait_second
        self.num_of_load_content = num_of_load_content
        self.page_down = page_down
        self.list_crawl = []

    def crawl_content(self, url, cookies, referer):
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        # printl(body['article_url'])
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(reply_wait_sec)
        for j in range(0, len(replies)):
            replies[j]['article_url'] = body['article_url']
            replies[j]['platform_id'] = body['platform_id']
            replies[j]['article_order'] = j
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()

    def start_crawl(self):
        self.crawl()
        self.close()

    def close(self):
        if self.driver and not is_debug:
            self.driver.quit()
        self.send_to_db.close()
        printl("Finished Crawling :)")

    def crawl(self):
        raise NotImplementedError

    def is_until_page(self):
        if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
            return True
        else:
            return False

    def crawl_contents(self, contents_list, backup_set):
        """
        :param contents_list:
        :param backup_set:
        :return: is_load_more
        """
        old_elements = 0
        for element in contents_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                # printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
            elif element['date'].date() < self.crawl_init.get_begin_day():
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                old_elements += 1
                if old_elements > 6:
                    return False
            else:
                if element['url'] not in backup_set:
                    # printl(element['url'])
                    # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                    # wait(1.5)
                    # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                    printl("element inserted into queue {}".format(element['url']))
                    self.list_crawl.append(element)
                    backup_set.add(element['url'])
            if self.is_until_page():
                return False
        if self.list_crawl:
            printl("Number of Lists = {0}".format(len(self.list_crawl)))
        return True

    def crawl_list(self):
        if self.list_crawl:
            printl()
            printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("Total gathered contents = {0}".format(len(self.list_crawl)))
            printl()
        for element in self.list_crawl:
            try:
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                wait(body_wait_sec)
                self.crawl_content(element['url'], {}, element['url'])
            except Exception as e:
                printl(e)
                logging.info(e)


class InstaAlgorithmNormal(InstaAlgorithm):
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second=2,
                 num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second,
                         num_of_load_content, page_down)
        if self.driver:
            self.driver.quit()

    def crawl(self):
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load
                    while is_load_more:
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    self.crawl_list()
                    self.list_crawl.clear()
                    i += 1
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")


class InstaAlgorithmMulti(InstaAlgorithm):
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second=2,
                 num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second,
                         num_of_load_content, page_down)
        if self.driver:
            self.driver.quit()
        self.list_crawl = Queue()
        self.total_num = 0

    def is_until_page(self):
        # list_crawl is a Queue here, so the inherited len()-based check would
        # raise TypeError; use the running total instead.
        if self.crawl_init.until_page and self.crawl_init.until_page <= self.total_num:
            return True
        else:
            return False

    def crawl_contents(self, contents_list, backup_set):
        """
        :param contents_list:
        :param backup_set:
        :return: is_load_more
        """
        old_elements = 0
        for element in contents_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                printl('post is not in range of date (url: {}, date: {}, start: {}, end: {})'.format(
                    element['url'], element['date'].strftime("%Y-%m-%d %H:%M:%S"),
                    self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"),
                    self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S")))
            elif element['date'].date() < self.crawl_init.get_begin_day():
                printl('post is not in range of date (url: {}, date: {}, start: {}, end: {})'.format(
                    element['url'], element['date'].strftime("%Y-%m-%d %H:%M:%S"),
                    self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"),
                    self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S")))
                old_elements += 1
                if old_elements > 6:
                    return False
            else:
                if element['url'] not in backup_set:
                    # printl(element['url'])
                    # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                    # wait(1.5)
                    # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                    try:
                        self.list_crawl.put(element, timeout=10)
                    except Exception as e:
                        printl(e)
                        printl("queue size = ", self.list_crawl.qsize())
                    backup_set.add(element['url'])
                    self.total_num += 1
            if self.is_until_page():
                return False
        # if self.list_crawl:
        #     printl("Number of Lists = {0}".format(len(self.list_crawl)))
        return True

    def crawl(self):
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    # create and start the content worker threads
                    # p_list = [multiprocessing.Process(target=crawl_content_process,
                    #                                   args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
                    #           for i in range(num_of_content_process)]
                    printl("{} worker threads start".format(num_of_content_process))
                    p_list = [threading.Thread(target=crawl_content_process,
                                               args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
                              for _ in range(num_of_content_process)]
                    for p in p_list:
                        p.daemon = True
                        p.start()
                    # crawl list
                    ok = True
                    while ok:
                        try:
                            list_crawler = make_list_instance(url_list[i])
                            ok = False
                        except Exception as e:
                            printl(e)
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load
                    while is_load_more:
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        try:
                            insta_list = load_ajax_list(list_crawler)
                            if insta_list is None:
                                break
                            is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                        except Exception as e:
                            printl('is_load_more exception')
                            printl(e)
                            is_load_more = False
                    # self.crawl_list()
                    # self.list_crawl.close()
                    printl("end load")
                    printl("total number of crawled list = {0}".format(self.total_num))
                    self.total_num = 0
                    # check task is done in queue
                    # self.list_crawl.join()
                    # stop the worker threads (use _ so the url index i is not clobbered)
                    for _ in range(num_of_content_process):
                        self.list_crawl.put(None, timeout=10)
                    # wait for the worker threads
                    for p in p_list:
                        p.join()
                    for _ in range(self.list_crawl.qsize()):
                        self.list_crawl.get(block=False)
                    i += 1
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")


class InstaAlgorithmBrowser(InstaAlgorithm):
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second=2,
                 num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id, reload_wait_second,
                         num_of_load_content, page_down)

    def url_load(self, url):
        if insta_tag_url in url:
            list_tag = ListTag(url)
            insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
            return list_tag, insta_list, end_cursor, has_next
        else:
            list_user = ListUser(url)
            insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
            return list_user, insta_list, end_cursor, has_next

    def crawl(self):
        real_time = True
        while real_time:
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    wait(3)
                    printl(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
                    is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
                    list_crawler.set_end_cursor(end_cursor2)
                    list_crawler.cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}
                    # ajax load
                    page_down = 0
                    while is_load_more:
                        if page_down == self.page_down:
                            page_down = 0
                            try:
                                focus_driver(self.driver)
                                click_insta_load_more(self.driver)
                            except:
                                push_page_down(self.driver)
                        page_down += 1
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        # printl("list length = " + str(len(insta_list)))
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                        # printl("number of backup_set = {0}".format(len(backup_set)))
                    i += 1
                    self.crawl_list()
                    self.list_crawl.clear()
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
                        if self.driver:
                            self.driver.close()
                        wait(3)
                        self.driver = self.browser.new_browser()
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")


class InstaMainCrawler:
    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()
        # self.browser = Browser()
        self.browser = None
        self.driver = None

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        pass

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)
        # self.init_browser(browser)

    def set_driver(self, driver):
        self.driver = driver

    def init_browser(self, browser):
        try:
            self.set_driver(self.browser.get_new_driver(browser))
        except Exception as e:
            logging.info(e)

    def init_keyword_id(self, keyword_id):
        if type(keyword_id) != int:
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawler_start(self):
        # if self.driver:
        #     algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
        #                                       self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        # else:
        #     algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
        #                                      self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        algorithm = InstaAlgorithmMulti(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                        self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        algorithm.start_crawl()
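

if __name__ == '__main__':
    # Hedged start-up sketch: the argument values below (keyword id, db number,
    # before_day, until_page) are placeholders, not values taken from this
    # repo. until_page=0 leaves the page limit disabled (see is_until_page).
    main_crawler = InstaMainCrawler()
    main_crawler.set_arguments(browser=None, keyword_id=1, db_num=0,
                               before_day=0, until_page=0)
    main_crawler.start()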