Files
clients/WebBasedCrawler/insta/instacrawl.py
admin 1e449a45af - Fix instagram crawler (Tag, Comment, User)
- Fix debug messages


git-svn-id: svn://192.168.0.12/source@351 8346c931-da38-4b9b-9d4c-e48b93cbd075
2017-06-29 03:12:39 +00:00

1134 lines
42 KiB
Python

#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
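# Instagram crawler: fetches tag-explore and user-profile list pages with plain
# HTTP requests, paginates them via the "?max_id=<cursor>" parameter, then pulls
# each post's body and comments through insta.instaparser and stores the result
# with base.baseclasses.SendtoDB. Requests may be routed through rotating
# proxies supplied by base.proxy.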
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
import logging
# from multiprocessing import Queue
# import multiprocessing
from queue import Queue
import threading
import time
import sys
import inspect
from base.baseclasses import SendtoDB
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import is_debugger_attached
# from base.baseclasses import Browser
from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element
import base.proxy
import eventlet
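# Debug-aware print helpers. printl() behaves like print(), but when a debugger
# is attached it prefixes every message with the calling file and line number
# (resolved through the inspect call stack). printd() below only prints while
# debugging.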
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
if is_debug:
cur_frame = inspect.currentframe()
call_frame = inspect.getouterframes(cur_frame, 2)
        frame_no = 2 if call_frame[1][3] == 'printd' else 1
file_path = call_frame[frame_no][1]
line_no = call_frame[frame_no][2]
# class_name = ''
# if 'self' in call_frame[frame_no][0].f_locals:
# class_name = str(call_frame[frame_no][0].f_locals['self'].__class__)
# method_name = call_frame[frame_no][3]
try:
# print('{}({}) [{}.{}] : {}'.format(file_path, line_no, class_name, method_name, *objects), sep=sep, end=end, file=file, flush=flush)
print('{}({}) : {}'.format(file_path, line_no, *objects), sep=sep, end=end, file=file, flush=flush)
except Exception as e:
print(e)
else:
print(*objects, sep=sep, end=end, file=file, flush=flush)
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
is_debug = is_debugger_attached()
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
if is_debug:
printl(*objects, sep=sep, end=end, file=file, flush=flush)
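# Crawl tuning constants: AJAX page sizes, per-step wait times in seconds,
# PAGE_DOWN count for the browser variant, number of content worker threads,
# HTTP timeout and the proxy retry budget.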
num_of_list_ajax = 24
num_of_reply_ajax = 100
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
requests_timeout = 60
num_of_retry_proxy = 5
logging.basicConfig(level=logging.INFO,
format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)
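# Selenium helpers used by the browser-driven variant: click Instagram's
# "load more" link, scroll the page with PAGE_DOWN, and bring the browser
# window to the front.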
def click_insta_load_more(driver):
element = driver.find_element_by_css_selector("div._pupj3 > a")
enter_element(element)
def push_page_down(driver):
body = driver.find_element_by_tag_name('body')
body.send_keys(Keys.PAGE_DOWN)
def focus_driver(driver):
position = driver.get_window_position()
size = driver.get_window_size()
driver.maximize_window()
driver.set_window_size(size['width'], size["height"])
driver.set_window_position(position['x'], position['y'])
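# Read a streaming response in 1 KiB chunks while enforcing a wall-clock limit,
# so a connection that trickles data forever cannot stall a worker.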
def requests_get(req, timeout=requests_timeout):
body = []
start = time.time()
for chunk in req.iter_content(1024):
body.append(chunk)
if time.time() > (start + timeout):
req.close()
raise Exception("timeout")
return b''.join(body)
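# Hard timeout for requests on non-Windows platforms: eventlet.monkey_patch()
# makes blocking socket calls cooperative, and requests_wrapper() runs each
# requests.get/post inside an eventlet.Timeout so a hung connection is aborted
# after requests_timeout seconds. On win32 the wrapper returns the function
# unchanged.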
eventlet.monkey_patch()
def requests_wrapper(func):
if sys.platform == 'win32':
return func
else:
def wrapper(*args, **kwargs):
with eventlet.Timeout(requests_timeout, Exception):
return func(*args, **kwargs)
return wrapper
requests.get = requests_wrapper(requests.get)
requests.post = requests_wrapper(requests.post)
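# instance_wrapper(): decorator that keeps the current proxy (ip, port) in a
# closure, injects it as the 'proxies' keyword argument, and keeps retrying the
# wrapped factory with a fresh proxy from base.proxy until it returns a result.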
def instance_wrapper(func):
    # keep the most recent working proxy (ip, port) in the closure below
ip, port = base.proxy.get_proxy()
def retry_load(*args, **kwargs):
while True:
            # use the closure so the proxy survives across retries
nonlocal ip, port
proxies = base.proxy.get_requests_proxy(ip + ":" + port)
kwargs['proxies'] = proxies
# retry = num_of_retry_proxy
# while retry:
res = func(*args, **kwargs)
if res:
# printl("id : {2} - connect success - {0}:{1}".format(ip, port, threading.get_ident()))
return res
# if the proxy was not good, get new proxy
# printl('id : {2} - connect failed - {0}:{1}'.format(ip, port, threading.get_ident()))
ip, port = base.proxy.get_proxy()
# retry -= 1
return retry_load
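# Class-based counterpart of instance_wrapper() with three retry strategies:
#   do()          - rotate the proxy after every failure, retry indefinitely
#   do_retry()    - rotate in rounds of num_of_retry_proxy attempts
#   do_no_proxy() - try num_of_retry_proxy proxies, then fall back to a direct
#                   (proxy-less) request and return whatever that yields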
class InstanceWrapper(object):
def __init__(self, func):
self.ip, self.port = base.proxy.get_proxy()
self.func = func
self.num_of_retry_proxy = num_of_retry_proxy
def do(self, *args, **kwargs):
while True:
proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
kwargs['proxies'] = proxies
# retry = num_of_retry_proxy
# while retry:
res = self.func(*args, **kwargs)
if res:
# printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
return res
# if the proxy was not good, get new proxy
# printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
self.ip, self.port = base.proxy.get_proxy()
# retry -= 1
def do_retry(self, *args, **kwargs):
while True:
proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
kwargs['proxies'] = proxies
retry = self.num_of_retry_proxy
while retry:
res = self.func(*args, **kwargs)
if res:
# printl("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
return res
# if the proxy was not good, get new proxy
# printl('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
retry -= 1
self.ip, self.port = base.proxy.get_proxy()
def do_no_proxy(self, *args, **kwargs):
while True:
retry = self.num_of_retry_proxy
while retry:
proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
kwargs['proxies'] = proxies
res = self.func(*args, **kwargs)
if res:
printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
return res
# if the proxy was not good, get new proxy
printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
retry -= 1
self.ip, self.port = base.proxy.get_proxy()
            # if fetching through every proxy failed, fall back to no proxy
            # func is guaranteed to return an instance unless the url itself is invalid
kwargs['proxies'] = None
res = self.func(*args, **kwargs)
# if res:
# printl("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
# printl(args, kwargs)
printd("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
return res
def change_proxy(self):
self.ip, self.port = base.proxy.get_proxy()
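# Factory helpers wrapped by the proxy retry logic above: build a ListTag or
# ListUser crawler for a list URL, or an InstaContent for a single post URL,
# returning None on failure so the wrapper knows to switch proxies and retry.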
@instance_wrapper
def make_list_instance(url, proxies=None):
try:
if insta_tag_url in url:
list_crawler = ListTag(url, proxies)
else:
list_crawler = ListUser(url, proxies)
return list_crawler
except Exception as e:
printd(e)
printd("Fail to make list instance")
return None
# @instance_wrapper
def make_content_instance(url, proxies=None):
try:
content = InstaContent(url, {}, url, proxies)
return content
except Exception as e:
printd(e)
printd("Fail to make contanet instance")
return None
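# Pagination helpers: call the crawler's load_more()/load_reply_more() and turn
# any exception into None so the caller can decide to change proxies and retry.
# ajax_wrapper() (currently unused) would cap the number of such retries.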
def ajax_wrapper(func):
def retry_ajax_load(*args, **kwargs):
retry = num_of_retry_proxy
while retry:
res = func(*args, **kwargs)
if res is not None:
break
retry -= 1
return res
return retry_ajax_load
# @ajax_wrapper
def load_ajax_list(ins):
try:
insta_list = ins.load_more()
# if insta_list:
# return insta_list
# else:
# return None
return insta_list
except Exception as e:
printd(e)
printd("Fail to load ajax list")
return None
# @ajax_wrapper
def load_ajax_reply(ins):
try:
replies = ins.load_reply_more()
# if replies:
# return replies
# else:
# return None
return replies
except Exception as e:
printd(e)
printd("Fail to load ajax reply")
return None
# def crawl_content_process(qu, keyword_id, db_num):
# send_to_db = SendtoDB()
# send_to_db.set_db(db_num)
# while True:
# element = qu.get()
# if element is None:
# break
# ok = True
# while ok:
# try:
# ip, port = base.proxy.get_proxy()
# proxies = base.proxy.get_requests_proxy(ip + ":" + port)
# content = InstaContent(element['url'], {}, element['url'], proxies)
# body = content.get_body()
# replies = content.get_reply()
# body['article_url'] = element['url']
# body['keyword_id'] = keyword_id
# while content.has_previous:
# replies = content.load_reply_more() + replies
# wait(reply_wait_sec)
# for j in range(0, len(replies)):
# replies[j]['article_url'] = body['article_url']
# replies[j]['platform_id'] = body['platform_id']
# replies[j]['article_order'] = j
# send_to_db.delete_url(body['article_url'])
# send_to_db.send_body(body)
# if replies:
# send_to_db.send_reply(replies)
# printl(element['url'])
# printl('ok')
# ok = False
# except:
# printl("failed proxy {0}:{1}".format(ip, port))
# printl('finish thread')
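# Content worker run in each thread of InstaAlgorithmMulti: take post URLs from
# the shared queue, fetch the post body plus every older comment page (while
# has_previous is set), stamp the rows with keyword/article metadata and write
# them to the DB. A None item on the queue is the shutdown sentinel.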
def crawl_content_process(qu, keyword_id, db_num):
# m_c_i = instance_wrapper(make_content_instance)
m_c_i = InstanceWrapper(make_content_instance)
send_to_db = SendtoDB()
send_to_db.set_db(db_num)
while True:
try:
element = qu.get(timeout=60)
except Exception as e:
printl("[crawl_content_process] queue is empty")
continue
if element is None:
break
ok = True
while ok:
try:
                # get an InstaContent instance via do_no_proxy();
                # if element['url'] is invalid, content is None
content = m_c_i.do_no_proxy(element['url'])
if not content:
break
body = content.get_body()
replies = content.get_reply()
body['article_url'] = element['url']
body['keyword_id'] = keyword_id
while content.has_previous:
rep = load_ajax_reply(content)
if rep is None:
printl("proxies = ", content.proxies)
m_c_i.change_proxy()
raise Exception("reply load error")
replies = rep + replies
wait(reply_wait_sec)
for j in range(0, len(replies)):
replies[j]['article_url'] = body['article_url']
replies[j]['platform_id'] = body['platform_id']
replies[j]['article_order'] = j
send_to_db.delete_url(body['article_url'])
send_to_db.send_body(body)
if replies:
send_to_db.send_reply(replies)
printl("proxies = ", content.proxies['http'][7:])
printl(element['url'])
printl('ok')
ok = False
except UnicodeEncodeError as ue:
printl(element['url'])
printl(ue)
break
except Exception as e:
                # catch errors raised while sending to the DB
printl(element['url'])
printl(e)
qu.task_done()
printl('finish thread')
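# InstaInit extends CrawlInit with the Instagram URL templates (platform 9 =
# tag search, platform 10 = user page), turns the comma-separated search terms
# into crawl URLs, and derives the begin/end dates of the crawl window.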
class InstaInit(CrawlInit):
def __init__(self, before_day=0):
super().__init__(before_day)
self.urls = dict()
self.urls[9] = insta_tag_url
self.urls[10] = insta_url
def split_searches(self):
search = self.searches()
splited_list = search.split(',')
trimmed_list = list()
if self.platform() == 10:
for x in splited_list:
trimmed_list.append(x.strip())
else:
for x in splited_list:
trimmed_list.append(self.utf8(x))
return trimmed_list
def make_url(self):
urls = list()
for x in self.split_searches():
url = self.urls[self.platform()] + x
urls.append(url)
return urls
def get_begin_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
result += datetime.timedelta(days=self.before_day)
return result.date()
else:
return self.start_day()
def get_end_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
return result.date()
else:
return self.end_day()
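# ListTag downloads a tag-explore page and parses the embedded post list;
# load_more() paginates by re-requesting the same URL with
# "?max_id=<end_cursor>". The older /query/ POST approach is kept below as
# commented-out code.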
class ListTag:
def __init__(self, url, proxies=None):
self.__r = None
self.__tag = ''
self.__url = ''
self.list_tag = []
self.end_cursor = None
self.has_next = False
self.cookies = {}
self.proxies = proxies
self.load_url(url, self.proxies)
def load_url(self, url, proxies):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.log_load_url_before()
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
self.__url = url
#self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
self.__r.close()
self.log_load_url_after()
return self.list_tag
def load_more(self):
url = self.__url + "?max_id="+self.end_cursor
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.log_load_url_before()
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
# self.__url = url
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
self.__r.close()
self.log_load_url_after()
        # old approach - Instagram appears to have blocked this POST endpoint
# form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
# headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
# self.log_load_more_before(form_data, headers)
# self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
# timeout=requests_timeout, stream=True)
# content = requests_get(self.__r)
# self.__set_cookies(self.__r.cookies)
# self.__r.raise_for_status()
# # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
# self.__r.close()
# self.log_load_more_after()
return self.list_tag
def __get_tag(self, url):
m = re.search(insta_tag_url + "([^/]*)", url)
if m:
return m.group(1)
else:
raise RuntimeError('Tag Error')
def get_cookies(self):
return self.cookies
def get_url(self):
return self.__url
def set_end_cursor(self, cursor):
self.end_cursor = cursor
def get_end_cursor(self):
return self.end_cursor
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
def get_list(self):
return self.list_tag
def get_proxy(self):
return self.proxies
def log_load_url_before(self):
if is_debug:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('headers = ', end=' ')
printl(instaheaders.get_headers_for_list_html())
def log_load_url_after(self):
if is_debug:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl('proxies = ', end='')
printl(self.proxies)
printl("<ListTag End>")
def log_load_more_before(self, form_data, headers):
if is_debug:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
def log_load_more_after(self):
if is_debug:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl('proxies = ', end='')
printl(self.proxies)
printl("<ListTag End>")
class ListUser:
def __init__(self, url, proxies=None):
self.__r = None
self.__user = ''
self.__url = ''
self.list_user = []
self.end_cursor = None
self.has_next = False
self.cookies = {}
self.proxies = proxies
self.load_url(url, self.proxies)
def load_url(self, url, proxies):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.__r.raise_for_status()
self.__url = url
self.__set_cookies(self.__r.cookies)
# self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
self.__r.close()
return self.list_user
def load_more(self):
url = self.__url + "?max_id=" + self.end_cursor
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
timeout=requests_timeout, stream=True)
# form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
# headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
# self.log_load_more_before(form_data, headers)
# self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
# timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.__r.raise_for_status()
self.__set_cookies(self.__r.cookies)
# self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
self.__r.close()
# self.log_load_more_after()
return self.list_user
def get_cookies(self):
return self.cookies
def get_url(self):
return self.__url
def set_end_cursor(self, cursor):
self.end_cursor = cursor
def get_end_cursor(self):
return self.end_cursor
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
def get_list(self):
return self.list_user
def get_proxy(self):
return self.proxies
def log_load_more_before(self, form_data, headers):
if is_debug:
printl("<ListUser Start>")
printl("<ListUser requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
def log_load_more_after(self):
if is_debug:
printl("<ListUser response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl('proxies = ', end='')
printl(self.proxies)
printl("<ListUser End>")
class InstaContent:
def __init__(self, url, cookies, referer, proxies=None):
self.__r = None
self.__referer = ''
self.__code = ''
self.body = None
self.reply = []
self.start_cursor = None
self.has_previous = False
self.cookies = {}
self.proxies = proxies
self.load_url(url, cookies, referer, self.proxies)
def load_url(self, url, cookies, referer, proxies):
self.__set_cookies(cookies)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.__r.raise_for_status()
self.__referer = referer
self.__code = self.__get_code(url)
# self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
self.__set_cookies(self.__r.cookies)
self.__r.close()
return self.body, self.reply
def get_body(self):
return self.body
def get_reply(self):
return self.reply
def load_reply_more(self):
url = self.__referer + "?max_id="+self.start_cursor
# self.log_load_reply_more_before(form_data, headers)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.__r.raise_for_status()
self.__code = self.__get_code(url)
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
self.__set_cookies(self.__r.cookies)
self.__r.close()
# self.log_load_reply_more_after()
return self.reply
def get_cookies(self):
return self.cookies
def __get_code(self, url):
m = re.search(insta_body_url + "([^/]*)", url)
if m:
return m.group(1)
else:
            raise RuntimeError('Code Error')
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
def get_proxy(self):
return self.proxies
def log_load_reply_more_before(self, form_data, headers):
if is_debug:
printl("<ContentReply Start>")
printl("<ContentReply requests>")
printl('start_cursor = ' + self.start_cursor)
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
def log_load_reply_more_after(self):
if is_debug:
printl("<ContentReply response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('start_cursor = ' + str(self.start_cursor))
printl('has_previous = ', end='')
printl(self.has_previous)
printl('proxies = ', end='')
printl(self.proxies)
printl("<ContentReply End>")
class InstaAlgorithm:
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
self.send_to_db = send_to_db
self.crawl_init = crawl_init
self.browser = browser
self.driver = driver
self.keyword_id = keyword_id
self.reload_wait_second = reload_wait_second
self.num_of_load_content = num_of_load_content
self.page_down = page_down
self.list_crawl = []
def crawl_content(self, url, cookies, referer):
content = InstaContent(url, cookies, referer)
body = content.get_body()
replies = content.get_reply()
body['article_url'] = url
body['keyword_id'] = self.keyword_id
# printl(body['article_url'])
while content.has_previous:
replies = content.load_reply_more() + replies
wait(reply_wait_sec)
for j in range(0, len(replies)):
replies[j]['article_url'] = body['article_url']
replies[j]['platform_id'] = body['platform_id']
replies[j]['article_order'] = j
self.send_to_db.delete_url(body['article_url'])
self.send_to_db.send_body(body)
if replies:
self.send_to_db.send_reply(replies)
printl('ok')
printl()
def start_crawl(self):
self.crawl()
self.close()
def close(self):
if self.driver and not is_debug:
self.driver.quit()
self.send_to_db.close()
printl("Finished Crawling :)")
def crawl(self):
raise NotImplementedError
def is_until_page(self):
if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
return True
else:
return False
def crawl_contents(self, contents_list, backup_set):
"""
:param contents_list:
:param backup_set:
:return: is_load_more
"""
old_elements = 0
for element in contents_list:
if element['date'].date() > self.crawl_init.get_end_day():
# printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
elif element['date'].date() < self.crawl_init.get_begin_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
old_elements += 1
if old_elements > 6:
return False
else:
                if element['url'] not in backup_set:
# printl(element['url'])
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
# wait(1.5)
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
printl("element insert to queue {}".format(element['url']))
self.list_crawl.append(element)
backup_set.add(element['url'])
if self.is_until_page():
return False
if self.list_crawl:
printl("Number of Lists = {0}".format(len(self.list_crawl)))
return True
def crawl_list(self):
if self.list_crawl:
printl()
printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
printl("Total gathered contents = {0}".format(len(self.list_crawl)))
printl()
for element in self.list_crawl:
try:
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
wait(body_wait_sec)
self.crawl_content(element['url'], {}, element['url'])
except Exception as e:
printl(e)
logging.info(e)
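# InstaAlgorithmNormal: single-threaded variant - gathers the full list for a
# keyword first, then fetches each post sequentially with crawl_content().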
class InstaAlgorithmNormal(InstaAlgorithm):
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second, num_of_load_content, page_down)
if self.driver:
self.driver.quit()
def crawl(self):
real_time = True
while real_time:
printl("Crawling Start")
url_list = self.crawl_init.make_url()
i = 0
end_cursor = None
backup_set = set()
while i < len(url_list):
# first connect
try:
printl(url_list[i] + "\n")
if insta_tag_url in url_list[i]:
list_crawler = ListTag(url_list[i])
else:
list_crawler = ListUser(url_list[i])
wait(1)
insta_list = list_crawler.get_list()
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
# ajax load
while is_load_more:
if end_cursor:
list_crawler.end_cursor = end_cursor
end_cursor = None
wait(self.reload_wait_second)
insta_list = list_crawler.load_more()
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
self.crawl_list()
self.list_crawl.clear()
i += 1
except Exception as e:
logging.info(e)
end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
if e.args:
wait(300)
real_time = self.crawl_init.is_realtime()
printl("Finished Crawling :)")
class InstaAlgorithmMulti(InstaAlgorithm):
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second, num_of_load_content, page_down)
if self.driver:
self.driver.quit()
self.list_crawl = Queue()
self.total_num = 0
def crawl_contents(self, contents_list, backup_set):
"""
:param contents_list:
:param backup_set:
:return: is_load_more
"""
old_elements = 0
for element in contents_list:
if element['date'].date() > self.crawl_init.get_end_day():
# printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
elif element['date'].date() < self.crawl_init.get_begin_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
old_elements += 1
if old_elements > 6:
return False
else:
                if element['url'] not in backup_set:
# printl(element['url'])
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
# wait(1.5)
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
try:
self.list_crawl.put(element, timeout=10)
except Exception as e:
printl(e)
printl("queue size = ", self.list_crawl.qsize())
backup_set.add(element['url'])
self.total_num += 1
                    # self.list_crawl is a Queue here, so is_until_page() (which uses len())
                    # would fail; compare the page limit against the running count instead
                    if self.crawl_init.until_page and self.crawl_init.until_page <= self.total_num:
return False
# if self.list_crawl:
# printl("Number of Lists = {0}".format(len(self.list_crawl)))
return True
def crawl(self):
real_time = True
while real_time:
printl("Crawling Start")
url_list = self.crawl_init.make_url()
i = 0
end_cursor = None
backup_set = set()
while i < len(url_list):
# first connect
try:
printl(url_list[i] + "\n")
                    # create and start the content worker threads (formerly multiprocessing)
# p_list = [multiprocessing.Process(target=crawl_content_process,
# args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
# for i in range(num_of_content_process)]
printl("{} processs start".format(num_of_content_process))
p_list = [threading.Thread(target=crawl_content_process,
args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
for i in range(num_of_content_process)]
for p in p_list:
p.daemon = True
p.start()
# crawl list
ok = True
while ok:
try:
list_crawler = make_list_instance(url_list[i])
ok = False
except Exception as e:
printl(e)
wait(1)
insta_list = list_crawler.get_list()
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
# ajax load
while is_load_more:
if end_cursor:
list_crawler.end_cursor = end_cursor
end_cursor = None
wait(self.reload_wait_second)
try:
insta_list = load_ajax_list(list_crawler)
if insta_list is None:
break
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
except Exception as e:
printl('is_load_more exception')
printl(e)
is_load_more = False
#self.crawl_list()
#self.list_crawl.close()
printl("end load")
printl("total number of crawled list = {0}".format(self.total_num))
self.total_num = 0
                    # optionally wait until every queued task has been processed
# self.list_crawl.join()
                    # stop worker threads: one None sentinel per worker
for i in range(num_of_content_process):
self.list_crawl.put(None, timeout=10)
                    # wait for the worker threads to finish
for p in p_list:
p.join()
for _ in range(self.list_crawl.qsize()):
self.list_crawl.get(block=False)
i += 1
except Exception as e:
logging.info(e)
end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
if e.args:
wait(300)
real_time = self.crawl_init.is_realtime()
printl("Finished Crawling :)")
class InstaAlgorithmBrowser(InstaAlgorithm):
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second, num_of_load_content, page_down)
def url_load(self, url):
if insta_tag_url in url:
list_tag = ListTag(url)
insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
return list_tag, insta_list, end_cursor, has_next
else:
list_user = ListUser(url)
insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
return list_user, insta_list, end_cursor, has_next
def crawl(self):
real_time = True
while real_time:
url_list = self.crawl_init.make_url()
i = 0
end_cursor = None
backup_set = set()
while i < len(url_list):
# first connect
try:
wait(3)
printl(url_list[i] + "\n")
self.driver.get(url_list[i])
wait(5)
list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
list_crawler.set_end_cursor(end_cursor2)
list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()}
# ajax load
page_down = 0
while is_load_more:
if page_down == self.page_down:
page_down = 0
try:
focus_driver(self.driver)
click_insta_load_more(self.driver)
except:
push_page_down(self.driver)
page_down += 1
if end_cursor:
list_crawler.end_cursor = end_cursor
end_cursor = None
wait(self.reload_wait_second)
insta_list = list_crawler.load_more()
# printl("list length = " + str(len(insta_list)))
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
# printl("number of backup_set = {0}".format(len(backup_set)))
i += 1
self.crawl_list()
self.list_crawl.clear()
except Exception as e:
logging.info(e)
end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
if e.args:
wait(300)
if self.driver:
self.driver.close()
wait(3)
self.driver = self.browser.new_browser()
real_time = self.crawl_init.is_realtime()
printl("Finished Crawling :)")
class InstaMainCrawler:
def __init__(self):
self.send_to_db = SendtoDB()
self.crawl_init = InstaInit()
#self.browser = Browser()
self.browser = None
self.driver = None
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all(self, backup_set=None):
pass
def start(self):
self.crawler_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
self.init_keyword_id(keyword_id)
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
# self.init_browser(browser)
def set_driver(self, driver):
self.driver = driver
def init_browser(self, browser):
try:
self.set_driver(self.browser.get_new_driver(browser))
except Exception as e:
logging.info(e)
def init_keyword_id(self, keyword_id):
        if not isinstance(keyword_id, int):
self.keyword_id = int(keyword_id)
else:
self.keyword_id = keyword_id
self.crawl_init.get_keyword_parameters(keyword_id)
self.crawl_init.disconnect()
def init_db(self, db_num):
self.send_to_db.set_db(db_num)
def init_before_day(self, before_day):
self.crawl_init.set_before_day(before_day)
def init_until_page(self, until_page):
self.crawl_init.set_until_page(until_page)
def crawler_start(self):
# if self.driver:
# algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
# self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
# else:
# algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
# self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
algorithm = InstaAlgorithmMulti(self.driver, self.browser, self.crawl_init, self.send_to_db,
self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
algorithm.start_crawl()
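# Illustrative usage sketch (not part of the original module): how a launcher
# might wire the crawler up. The argument values below are hypothetical; the
# real entry point and its parameters live outside this file.
#
#     crawler = InstaMainCrawler()
#     # browser is unused here because InstaAlgorithmMulti never opens Selenium
#     crawler.set_arguments(browser=None, keyword_id=1, db_num=0,
#                           before_day=0, until_page=None)
#     crawler.start()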