# -*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.

@author: cococo
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
import logging
# from multiprocessing import Queue
# import multiprocessing
from queue import Queue
import threading
import time
import sys
import bs4
import inspect


from base.baseclasses import SendtoDB
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import is_debugger_attached
# from base.baseclasses import Browser
from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element
import base.proxy
import eventlet


def printl(*objects, sep=' ', end='\n', file=None, flush=True):
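    """Debug-aware print: when a debugger is attached, prefix each message
    with the calling file path and line number (resolved via inspect) so the
    output can be traced back to its call site; otherwise print normally."""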
    if is_debug:
        cur_frame = inspect.currentframe()
        call_frame = inspect.getouterframes(cur_frame, 2)

        frame_no = 2 if call_frame[1][3] == 'printd' else 1
        file_path = call_frame[frame_no][1]
        line_no = call_frame[frame_no][2]
        # class_name = ''
        # if 'self' in call_frame[frame_no][0].f_locals:
        #     class_name = str(call_frame[frame_no][0].f_locals['self'].__class__)
        # method_name = call_frame[frame_no][3]

        try:
            objects = ('{}({}) :'.format(file_path, line_no),) + objects
            print(*objects, sep=sep, end=end, file=file, flush=flush)
        except Exception as e:
            print(e)
    else:
        print(*objects, sep=sep, end=end, file=file, flush=flush)


insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'

is_debug = is_debugger_attached()


def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    if is_debug:
        printl(*objects, sep=sep, end=end, file=file, flush=flush)
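

# Crawl tuning knobs: AJAX page sizes, inter-request delays (in seconds),
# browser page-down cadence, worker-thread count, request timeout, and the
# proxy retry budget.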
num_of_list_ajax = 24
num_of_reply_ajax = 100
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
requests_timeout = 60
num_of_retry_proxy = 5

logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)


def click_insta_load_more(driver):
    element = driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(element)


def push_page_down(driver):
    body = driver.find_element_by_tag_name('body')
    body.send_keys(Keys.PAGE_DOWN)


def focus_driver(driver):
    position = driver.get_window_position()
    size = driver.get_window_size()
    driver.maximize_window()
    driver.set_window_size(size['width'], size["height"])
    driver.set_window_position(position['x'], position['y'])
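

# Note: requests' timeout argument bounds the connect and each socket read,
# not the whole transfer, so a slow-trickling response can stall a worker
# indefinitely. requests_get() therefore streams the body in 1 KiB chunks
# and enforces a total wall-clock budget itself.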
def requests_get(req, timeout=requests_timeout):
    body = []
    start = time.time()
    for chunk in req.iter_content(1024):
        body.append(chunk)
        if time.time() > (start + timeout):
            req.close()
            raise Exception("timeout")

    return b''.join(body)


eventlet.monkey_patch()
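# monkey_patch() makes the stdlib socket layer cooperative so that the
# eventlet.Timeout used in requests_wrapper() below can actually interrupt a
# blocking requests call; on win32 the wrapper is skipped and requests' own
# timeout is the only guard.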


def requests_wrapper(func):
    if sys.platform == 'win32':
        return func
    else:
        def wrapper(*args, **kwargs):
            with eventlet.Timeout(requests_timeout, Exception):
                return func(*args, **kwargs)
        return wrapper


requests.get = requests_wrapper(requests.get)
requests.post = requests_wrapper(requests.post)
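

# instance_wrapper and InstanceWrapper implement the proxy-rotation policy:
# call the wrapped factory through the current proxy, treat a falsy result
# as a dead proxy, fetch a fresh ip:port from base.proxy, and try again:
# forever (do), with a bounded retry budget (do_retry), or with a final
# no-proxy fallback (do_no_proxy).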
def instance_wrapper(func):
    # keep the last known-good ip/port of the proxy
    ip, port = base.proxy.get_proxy()

    def retry_load(*args, **kwargs):
        # rebind ip/port in the enclosing scope (closure)
        nonlocal ip, port
        while True:
            proxies = base.proxy.get_requests_proxy(ip + ":" + port)
            kwargs['proxies'] = proxies
            # retry = num_of_retry_proxy
            # while retry:
            res = func(*args, **kwargs)
            if res:
                # printl("id : {2} - connect success - {0}:{1}".format(ip, port, threading.get_ident()))
                return res
            # the proxy was not good, so get a new proxy
            # printl('id : {2} - connect failed - {0}:{1}'.format(ip, port, threading.get_ident()))
            ip, port = base.proxy.get_proxy()
            # retry -= 1
    return retry_load


class InstanceWrapper(object):
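    """Stateful counterpart of instance_wrapper: remembers the current proxy
    on the instance, rotates it on failure, and in do_no_proxy() falls back
    to a direct connection once the retry budget is exhausted."""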
    def __init__(self, func):
        self.ip, self.port = base.proxy.get_proxy()
        self.func = func
        self.num_of_retry_proxy = num_of_retry_proxy

    def do(self, *args, **kwargs):
        while True:
            proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
            kwargs['proxies'] = proxies
            # retry = num_of_retry_proxy
            # while retry:
            res = self.func(*args, **kwargs)
            if res:
                # printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                return res
            # the proxy was not good, so get a new proxy
            # printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
            self.ip, self.port = base.proxy.get_proxy()
            # retry -= 1

    def do_retry(self, *args, **kwargs):
        while True:
            proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
            kwargs['proxies'] = proxies
            retry = self.num_of_retry_proxy
            while retry:
                res = self.func(*args, **kwargs)
                if res:
                    # printl("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                    return res
                # the proxy was not good, so get a new proxy
                # printl('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
                retry -= 1
                self.ip, self.port = base.proxy.get_proxy()

    def do_no_proxy(self, *args, **kwargs):
        while True:
            retry = self.num_of_retry_proxy
            while retry:
                proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
                kwargs['proxies'] = proxies
                res = self.func(*args, **kwargs)
                if res:
                    printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                    return res
                # the proxy was not good, so get a new proxy
                printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
                retry -= 1
                self.ip, self.port = base.proxy.get_proxy()

            # if fetching through a proxy kept failing, fall back to no proxy;
            # func guarantees returning an instance except when the url is invalid
            kwargs['proxies'] = None
            res = self.func(*args, **kwargs)
            # if res:
            #     printl("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
            #     printl(args, kwargs)
            printd("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
            return res

    def change_proxy(self):
        self.ip, self.port = base.proxy.get_proxy()


@instance_wrapper
def make_list_instance(url, proxies=None):
    try:
        if insta_tag_url in url:
            list_crawler = ListTag(url, proxies)
        else:
            list_crawler = ListUser(url, proxies)
        return list_crawler

    except requests.exceptions.ProxyError as e:
        printd('proxy: ' + str(e.args[0].pool.proxy), e)
        printd("Fail to make list instance")
        return None

    except Exception as e:
        printd(e)
        printd("Fail to make list instance")
        return None


# @instance_wrapper
def make_content_instance(url, proxies=None):
    try:
        content = InstaContent(url, {}, url, proxies)
        return content

    except requests.exceptions.ProxyError as e:
        printd('proxy: ' + str(e.args[0].pool.proxy), e)
        printd("Fail to make content instance")
        return None

    except Exception as e:
        printd(e)
        printd("Fail to make content instance")
        return None


def ajax_wrapper(func):
    def retry_ajax_load(*args, **kwargs):
        retry = num_of_retry_proxy
        while retry:
            res = func(*args, **kwargs)
            if res is not None:
                break
            retry -= 1
        return res
    return retry_ajax_load


# @ajax_wrapper
def load_ajax_list(ins):
    try:
        insta_list = ins.load_more()
        # if insta_list:
        #     return insta_list
        # else:
        #     return None
        return insta_list
    except Exception as e:
        printd(e)
        printd("Fail to load ajax list")
        return None


# @ajax_wrapper
def load_ajax_reply(ins):
    try:
        replies = ins.load_reply_more()
        return replies
    except Exception as e:
        printd(e)
        printd("Fail to load ajax reply")
        return None


# Old, pre-threading version of crawl_content_process, kept for reference:
# def crawl_content_process(qu, keyword_id, db_num):
#     send_to_db = SendtoDB()
#     send_to_db.set_db(db_num)
#     while True:
#         element = qu.get()
#         if element is None:
#             break
#         ok = True
#         while ok:
#             try:
#                 ip, port = base.proxy.get_proxy()
#                 proxies = base.proxy.get_requests_proxy(ip + ":" + port)
#                 content = InstaContent(element['url'], {}, element['url'], proxies)
#                 body = content.get_body()
#                 replies = content.get_reply()
#                 body['article_url'] = element['url']
#                 body['keyword_id'] = keyword_id
#                 while content.has_previous:
#                     replies = content.load_reply_more() + replies
#                     wait(reply_wait_sec)
#                 for j in range(0, len(replies)):
#                     replies[j]['article_url'] = body['article_url']
#                     replies[j]['platform_id'] = body['platform_id']
#                     replies[j]['article_order'] = j
#                 send_to_db.delete_url(body['article_url'])
#                 send_to_db.send_body(body)
#                 if replies:
#                     send_to_db.send_reply(replies)
#                 printl(element['url'])
#                 printl('ok')
#                 ok = False
#             except:
#                 printl("failed proxy {0}:{1}".format(ip, port))
#     printl('finish thread')


def crawl_content_process(qu, keyword_id, db_num):
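    """Worker loop: pull post elements off the queue, fetch each post's body
    and replies (paginating while more replies are available), and write the
    result through SendtoDB. A None element is the shutdown sentinel."""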
    # m_c_i = instance_wrapper(make_content_instance)
    m_c_i = InstanceWrapper(make_content_instance)
    send_to_db = SendtoDB()
    send_to_db.set_db(db_num)
    while True:
        try:
            element = qu.get(timeout=60)
        except Exception:
            printl("[crawl_content_process] queue is empty")
            continue

        if element is None:
            break
        ok = True
        while ok:
            try:
                # get an instance of InstaContent via do_no_proxy();
                # if element['url'] is invalid, content is None
                content = m_c_i.do_no_proxy(element['url'])
                if not content:
                    break
                body = content.get_body()
                replies = content.get_reply()
                body['article_url'] = element['url']
                body['keyword_id'] = keyword_id
                while content.has_previous:
                    rep = load_ajax_reply(content)
                    if rep is None:
                        printl("proxies = ", content.proxies)
                        m_c_i.change_proxy()
                        raise Exception("reply load error")
                    replies = rep + replies
                    wait(reply_wait_sec)
                for j in range(0, len(replies)):
                    replies[j]['article_url'] = body['article_url']
                    replies[j]['platform_id'] = body['platform_id']
                    replies[j]['article_order'] = j
                send_to_db.delete_url(body['article_url'])
                send_to_db.send_body(body)
                if replies:
                    send_to_db.send_reply(replies)
                if content.proxies is not None:
                    printl("proxies = ", content.proxies['http'][7:])
                printl(element['url'])
                printl('ok')
                ok = False
            except UnicodeEncodeError as ue:
                printl(element['url'])
                printl(ue)
                break
            except Exception as e:
                # catch errors raised while sending to the DB
                printl(element['url'])
                printl(e)
        qu.task_done()
    printl('finish thread')


class InstaInit(CrawlInit):
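    """Builds the seed URLs from the configured keyword: platform id 9 maps
    to hashtag pages, 10 to user pages, and get_begin_day()/get_end_day()
    define the crawl's date window."""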
    def __init__(self, before_day=0):
        super().__init__(before_day)
        self.urls = dict()
        self.urls[9] = insta_tag_url
        self.urls[10] = insta_url

    def split_searches(self):
        search = self.searches()
        splitted_list = search.split(',')
        trimmed_list = list()
        if self.platform() == 10:
            for x in splitted_list:
                trimmed_list.append(x.strip())
        else:
            for x in splitted_list:
                trimmed_list.append(self.utf8(x))
        return trimmed_list

    def make_url(self):
        urls = list()
        for x in self.split_searches():
            url = self.urls[self.platform()] + x
            urls.append(url)
        return urls

    def get_begin_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result.date()
        else:
            return self.start_day()

    def get_end_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            return result.date()
        else:
            return self.end_day()


class ListTag:
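    """Fetches the HTML listing page for a hashtag and paginates it by
    re-requesting the same URL with ?max_id=<end_cursor>."""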
    def __init__(self, url, proxies=None):
        self.__r = None
        self.__tag = ''
        self.__url = ''
        self.list_tag = []
        self.end_cursor = None
        self.has_next = False
        self.cookies = {}
        self.proxies = proxies
        self.load_url(url, self.proxies)

    def load_url(self, url, proxies):
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)

        # self.log_load_url_before()
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
        self.__r.close()
        printd('tag list, end_cursor: {}'.format(self.end_cursor))
        # self.log_load_url_after()
        return self.list_tag

    def load_more(self):
        url = self.__url + "?max_id=" + self.end_cursor
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)

        # self.log_load_url_before()
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        # self.__url = url
        # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
        self.__r.close()
        printd('tag list, end_cursor: {}'.format(self.end_cursor))
        # self.log_load_more_after()

        # Old approach: paginating with a POST to the query endpoint stopped
        # working on Instagram, so it is kept here only for reference.
        # form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
        # headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        # self.log_load_more_before(form_data, headers)
        # self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
        #                          timeout=requests_timeout, stream=True)
        # content = requests_get(self.__r)
        # self.__set_cookies(self.__r.cookies)
        # self.__r.raise_for_status()
        # # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
        # self.__r.close()
        # self.log_load_more_after()

        return self.list_tag

    def __get_tag(self, url):
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_tag

    def get_proxy(self):
        return self.proxies

    def log_load_url_before(self):
        if is_debug:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('headers = ', end=' ')
            printl(instaheaders.get_headers_for_list_html())

    def log_load_url_after(self):
        if is_debug:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("<ListTag End>")

    def log_load_more_before(self, form_data, headers):
        if is_debug:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data = ' + form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_more_after(self):
        if is_debug:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("<ListTag End>")


class ListUser:
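    """Same ?max_id pagination scheme as ListTag, but for a user's profile
    page; the parser additionally extracts the user id from the HTML."""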
    def __init__(self, url, proxies=None):
        self.__r = None
        self.__user = ''
        self.__url = ''
        self.list_user = []
        self.end_cursor = None
        self.has_next = False
        self.cookies = {}
        self.proxies = proxies
        self.load_url(url, self.proxies)

    def load_url(self, url, proxies):
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        # self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
        self.__r.close()
        printd('user list, end_cursor: {}'.format(self.end_cursor))
        return self.list_user

    def load_more(self):
        url = self.__url + "?max_id=" + self.end_cursor
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
                                timeout=requests_timeout, stream=True)
        # form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
        # headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        # self.log_load_more_before(form_data, headers)
        # self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
        #                          timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        # self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
        self.__r.close()
        printd('user list, end_cursor: {}'.format(self.end_cursor))
        # self.log_load_more_after()
        return self.list_user

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_user

    def get_proxy(self):
        return self.proxies

    def log_load_more_before(self, form_data, headers):
        if is_debug:
            printl("<ListUser Start>")
            printl("<ListUser requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data = ' + form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_more_after(self):
        if is_debug:
            printl("<ListUser response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("<ListUser End>")


class InstaContent:
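    """Loads a single post page, parses its body and the first page of
    replies, and pages through older replies via the GraphQL endpoint."""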
    def __init__(self, url, cookies, referer, proxies=None):
        self.__r = None
        self.__referer = ''
        self.__code = ''
        self.body = None
        self.reply = []
        self.start_cursor = None
        self.has_previous = False
        self.cookies = {}
        self.proxies = proxies
        self.content = ''
        self.query_id = ''
        self.load_url(url, cookies, referer, self.proxies)

    def load_url(self, url, cookies, referer, proxies):
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)
        self.content = content
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        # self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()

        printd('reply, end_cursor: {}'.format(self.start_cursor))
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply
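
    # Instagram's GraphQL pagination needs a numeric query_id that is not in
    # the post HTML; it lives in the site's *_Commons.js bundle.
    # get_query_ids() regexes every 17-digit queryId constant out of that
    # bundle, and find_query_id() probes each candidate against the live
    # endpoint until one answers with status 'ok'.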
    def get_query_ids(self, html):
        doc = bs4.BeautifulSoup(html, "html.parser")

        query_ids = []
        for script in doc.find_all("script"):
            if script.has_attr("src") and "_Commons.js" in script['src']:
                text = requests.get("%s%s" % ('https://www.instagram.com', script['src'])).text
                for query_id in re.findall("(?<=queryId:\")[0-9]{17,17}", text):
                    query_ids.append(query_id)
        return query_ids

    def find_query_id(self):
        potential_query_ids = self.get_query_ids(self.content)
        query_id = ''
        for potential_id in potential_query_ids:
            # url = "https://www.instagram.com/graphql/query/?query_id=%s&first=12&after=%s" % (potential_id, self.start_cursor)
            url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
                potential_id, self.__code, len(self.reply), self.start_cursor)
            try:
                data = requests.get(url).json()
                if data['status'] == 'ok':
                    query_id = potential_id
                    break
            except Exception:
                # no valid JSON returned; most likely a wrong query_id,
                # which yields 'Oops, an error occurred.'
                pass

        return query_id

    def load_reply_more(self):
        if not self.query_id:
            self.query_id = self.find_query_id()

        url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
            self.query_id, self.__code, len(self.reply), self.start_cursor)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
                                timeout=requests_timeout, stream=True)
        content = requests_get(self.__r)

        self.__r.raise_for_status()
        reply, self.start_cursor, self.has_previous = instaparser.parse_reply_more(content)
        self.__set_cookies(self.__r.cookies)
        self.__r.close()

        self.reply += reply
        printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Code Error')

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_proxy(self):
        return self.proxies

    def log_load_reply_more_before(self, form_data, headers):
        if is_debug:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            printl('start_cursor = ' + str(self.start_cursor))
            printl('form_data = ' + form_data)
            printl('headers = ', end=' ')
            printl(headers)

    def log_load_reply_more_after(self):
        if is_debug:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl('proxies = ', end='')
            printl(self.proxies)
            printl("<ContentReply End>")


class InstaAlgorithm:
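    """Base crawl flow: walk a listing (tag or user), keep posts that fall in
    the configured date window, then fetch and persist each post. Subclasses
    choose how the listing is walked: plain requests, a worker-thread pool,
    or a Selenium-driven browser."""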
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        self.send_to_db = send_to_db
        self.crawl_init = crawl_init
        self.browser = browser
        self.driver = driver
        self.keyword_id = keyword_id
        self.reload_wait_second = reload_wait_second
        self.num_of_load_content = num_of_load_content
        self.page_down = page_down
        self.list_crawl = []

    def crawl_content(self, url, cookies, referer):
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        # printl(body['article_url'])
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(reply_wait_sec)
        for j in range(0, len(replies)):
            replies[j]['article_url'] = body['article_url']
            replies[j]['platform_id'] = body['platform_id']
            replies[j]['article_order'] = j
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()

    def start_crawl(self):
        self.crawl()
        self.close()

    def close(self):
        if self.driver and not is_debug:
            self.driver.quit()
        self.send_to_db.close()
        printl("Finished Crawling :)")

    def crawl(self):
        raise NotImplementedError

    def is_until_page(self):
        if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
            return True
        else:
            return False

    def crawl_contents(self, contents_list, backup_set):
        """
        :param contents_list: parsed list elements (each with 'url' and 'date')
        :param backup_set: urls already queued, used to skip duplicates
        :return: is_load_more
        """
        old_elements = 0
        for element in contents_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                # printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))

            elif element['date'].date() < self.crawl_init.get_begin_day():
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                old_elements += 1
                if old_elements > 6:
                    return False
            else:
                if element['url'] not in backup_set:
                    # printl(element['url'])
                    # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                    # wait(1.5)
                    # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                    printl("element insert to queue {}".format(element['url']))
                    self.list_crawl.append(element)
                    backup_set.add(element['url'])
                    if self.is_until_page():
                        return False
        if self.list_crawl:
            printl("Number of Lists = {0}".format(len(self.list_crawl)))
        return True

    def crawl_list(self):
        if self.list_crawl:
            printl()
            printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("Total gathered contents = {0}".format(len(self.list_crawl)))
            printl()
        for element in self.list_crawl:
            try:
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                wait(body_wait_sec)
                self.crawl_content(element['url'], {}, element['url'])
            except Exception as e:
                printl(e)
                logging.info(e)


class InstaAlgorithmNormal(InstaAlgorithm):
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)
        if self.driver:
            self.driver.quit()

    def crawl(self):
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load
                    while is_load_more:
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    self.crawl_list()
                    self.list_crawl.clear()
                    i += 1
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")


class InstaAlgorithmMulti(InstaAlgorithm):
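    """Producer/consumer variant: the main thread walks the listing pages and
    feeds post elements into a Queue while num_of_content_process daemon
    threads run crawl_content_process(); one None sentinel per worker shuts
    the pool down."""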
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)
        if self.driver:
            self.driver.quit()
        self.list_crawl = Queue()
        self.total_num = 0

    def crawl_contents(self, contents_list, backup_set):
        """
        :param contents_list: parsed list elements (each with 'url' and 'date')
        :param backup_set: urls already queued, used to skip duplicates
        :return: is_load_more
        """
        old_elements = 0
        for element in contents_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                printl('post is not in range of date (url: {}, date: {}, start: {}, end: {})'.format(
                    element['url'],
                    element['date'].strftime("%Y-%m-%d %H:%M:%S"),
                    self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"),
                    self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S")))

            elif element['date'].date() < self.crawl_init.get_begin_day():
                printl('post is not in range of date (url: {}, date: {}, start: {}, end: {})'.format(
                    element['url'],
                    element['date'].strftime("%Y-%m-%d %H:%M:%S"),
                    self.crawl_init.get_begin_day().strftime("%Y-%m-%d %H:%M:%S"),
                    self.crawl_init.get_end_day().strftime("%Y-%m-%d %H:%M:%S")))

                old_elements += 1
                if old_elements > 6:
                    return False
            else:
                if element['url'] not in backup_set:
                    try:
                        self.list_crawl.put(element, timeout=10)
                    except Exception as e:
                        printl(e)
                        printl("queue size = ", self.list_crawl.qsize())
                    backup_set.add(element['url'])
                    self.total_num += 1
                    if self.is_until_page():
                        return False
        # if self.list_crawl:
        #     printl("Number of Lists = {0}".format(len(self.list_crawl)))
        return True

    def crawl(self):
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    # create and start the content-crawler workers
                    # p_list = [multiprocessing.Process(target=crawl_content_process,
                    #                                   args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
                    #           for i in range(num_of_content_process)]
                    printl("{} processes start".format(num_of_content_process))
                    p_list = [threading.Thread(target=crawl_content_process,
                                               args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
                              for i in range(num_of_content_process)]
                    for p in p_list:
                        p.daemon = True
                        p.start()

                    # crawl list
                    ok = True
                    while ok:
                        try:
                            list_crawler = make_list_instance(url_list[i])
                            ok = False
                        except Exception as e:
                            printl(e)
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load
                    while is_load_more:
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        try:
                            insta_list = load_ajax_list(list_crawler)
                            if insta_list is None:
                                break
                            is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                        except Exception as e:
                            printl('is_load_more exception')
                            printl(e)
                            is_load_more = False
                    # self.crawl_list()
                    # self.list_crawl.close()
                    printl("end load")
                    printl("total number of crawled list = {0}".format(self.total_num))
                    self.total_num = 0

                    # check task is done in queue
                    # self.list_crawl.join()

                    # stop the workers: one None sentinel per worker thread
                    for _ in range(num_of_content_process):
                        self.list_crawl.put(None, timeout=10)

                    # wait for the worker threads to finish
                    for p in p_list:
                        p.join()

                    # drain anything left in the queue
                    for _ in range(self.list_crawl.qsize()):
                        self.list_crawl.get(block=False)

                    i += 1
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")


class InstaAlgorithmBrowser(InstaAlgorithm):
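    """Selenium-driven variant: loads the listing in a real browser, clicks
    'load more' or sends PAGE_DOWN to trigger lazy loading, and reuses the
    browser session's cookies for the request-based pagination."""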
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)

    def url_load(self, url):
        if insta_tag_url in url:
            list_tag = ListTag(url)
            insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
            return list_tag, insta_list, end_cursor, has_next
        else:
            list_user = ListUser(url)
            insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
            return list_user, insta_list, end_cursor, has_next

    def crawl(self):
        real_time = True
        while real_time:
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    wait(3)
                    printl(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
                    is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
                    list_crawler.set_end_cursor(end_cursor2)
                    list_crawler.cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}
                    # ajax load
                    page_down = 0
                    while is_load_more:
                        if page_down == self.page_down:
                            page_down = 0
                            try:
                                focus_driver(self.driver)
                                click_insta_load_more(self.driver)
                            except Exception:
                                push_page_down(self.driver)
                        page_down += 1
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        # printl("list length = " + str(len(insta_list)))
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # printl("number of backup_set = {0}".format(len(backup_set)))
                    i += 1
                    self.crawl_list()
                    self.list_crawl.clear()
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
                    if self.driver:
                        self.driver.close()
                        wait(3)
                        self.driver = self.browser.new_browser()
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")


class InstaMainCrawler:
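    """Entry point: wires SendtoDB, InstaInit, and the chosen crawl algorithm
    together and applies the run arguments (keyword, db, date window)."""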
    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()
        # self.browser = Browser()
        self.browser = None
        self.driver = None

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        pass

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)
        # self.init_browser(browser)

    def set_driver(self, driver):
        self.driver = driver

    def init_browser(self, browser):
        try:
            self.set_driver(self.browser.get_new_driver(browser))
        except Exception as e:
            logging.info(e)

    def init_keyword_id(self, keyword_id):
        if not isinstance(keyword_id, int):
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawler_start(self):
        # if self.driver:
        #     algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
        #                                       self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        # else:
        #     algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
        #                                      self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        algorithm = InstaAlgorithmMulti(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                        self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        algorithm.start_crawl()
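

if __name__ == '__main__':
    # Minimal usage sketch. The argument values here are illustrative
    # assumptions (keyword id, db number, date window), not values taken
    # from a real deployment; init_keyword_id() expects the keyword row to
    # already exist in the database that CrawlInit reads from.
    crawler = InstaMainCrawler()
    crawler.set_arguments(browser=None, keyword_id=1, db_num=0,
                          before_day=-1, until_page=None)
    crawler.start()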