#-*- coding: utf-8 -*-
|
|
'''
|
|
Created on 2015. 12. 8.
|
|
|
|
@author: cococo
|
|
'''
|
|
import re
|
|
import datetime
|
|
import insta.instaparser as instaparser
|
|
import insta.instaheaders as instaheaders
|
|
import requests
|
|
import logging
|
|
|
|
|
|
from base.baseclasses import SendtoDB
|
|
from base.baseclasses import CrawlInit
|
|
from base.baseclasses import wait
|
|
from base.baseclasses import Browser
|
|
from selenium.webdriver.common.keys import Keys
|
|
from base.baseclasses import enter_element
|
|
|
|
|
|
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print wrapper that flushes by default, so crawl progress shows up
    immediately even when stdout is redirected or buffered."""
    options = {'sep': sep, 'end': end, 'file': file, 'flush': flush}
    print(*objects, **options)
|
|
|
|
# --- Instagram endpoints -------------------------------------------------
insta_url = "https://www.instagram.com/"                   # user page base
insta_tag_url = "https://www.instagram.com/explore/tags/"  # tag-search list base
insta_query = "https://www.instagram.com/query/"           # AJAX pagination endpoint
insta_body_url = 'https://www.instagram.com/p/'            # single-post base

# Global debug switch: when True the crawler classes dump request/response
# details and close() keeps the browser open.
# (Spelling "is_debuging" kept as-is: it is referenced throughout the module.)
is_debuging = False

# --- tuning knobs --------------------------------------------------------
num_of_list_ajax = 24    # list entries requested per AJAX page
num_of_reply_ajax = 100  # replies requested per AJAX page
list_wait_sec = 0.9      # delay between list-page AJAX loads
body_wait_sec = 0.5      # delay before fetching each post body
reply_wait_sec = 0.8     # delay between reply-page AJAX loads
num_of_page_down = 20    # PAGE_DOWN presses between "load more" clicks (browser mode)

# Module-wide logging setup; quiet the noisy third-party loggers.
logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)
|
|
|
|
|
|
def click_insta_load_more(driver):
    """Activate Instagram's 'load more' anchor on the current list page."""
    load_more_anchor = driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(load_more_anchor)
|
|
|
|
|
|
def push_page_down(driver):
    """Send one PAGE_DOWN keystroke to the page body to scroll the list."""
    page_body = driver.find_element_by_tag_name('body')
    page_body.send_keys(Keys.PAGE_DOWN)
|
|
|
|
|
|
def focus_driver(driver):
    """Bring the browser window to the front without changing its geometry.

    Maximizing raises/focuses the window; the saved size and position are
    then restored so the window looks untouched afterwards.
    """
    saved_position = driver.get_window_position()
    saved_size = driver.get_window_size()
    driver.maximize_window()
    driver.set_window_size(saved_size['width'], saved_size['height'])
    driver.set_window_position(saved_position['x'], saved_position['y'])
|
|
|
|
|
|
class InstaInit(CrawlInit):
    """Crawl configuration for Instagram: builds the list-page URLs from the
    configured search terms and derives the crawl date window."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> list-page base URL (9: tag search, 10: user page)
        self.urls = {9: insta_tag_url, 10: insta_url}

    def split_searches(self):
        """Split the comma-separated search string into individual terms.

        User searches (platform 10) are whitespace-trimmed; tag searches are
        passed through the base class utf8() conversion instead.
        """
        terms = self.searches().split(',')
        if self.platform() == 10:
            return [term.strip() for term in terms]
        return [self.utf8(term) for term in terms]

    def make_url(self):
        """Return one list-page URL per search term for the active platform."""
        return [self.urls[self.platform()] + term
                for term in self.split_searches()]

    def get_begin_day(self):
        """Start of the crawl window as a date.

        Realtime mode: today's midnight shifted by before_day days
        (before_day comes from the base class — presumably non-positive;
        TODO confirm). Otherwise the configured start day.
        """
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        window_start = datetime.datetime(year=now.year, month=now.month, day=now.day)
        window_start += datetime.timedelta(days=self.before_day)
        return window_start.date()

    def get_end_day(self):
        """End of the crawl window as a date: today in realtime mode,
        otherwise the configured end day."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day).date()
|
|
|
|
|
|
class ListTag:
    """Crawls an Instagram tag list page: one initial HTML load, then AJAX
    pagination through the /query/ endpoint using an end_cursor."""

    def __init__(self, url):
        self.__r = None          # last requests.Response
        self.__tag = ''          # tag name extracted from url
        self.__url = ''          # list page URL, reused as AJAX Referer
        self.list_tag = []       # entries parsed from the latest load
        self.end_cursor = None   # pagination cursor for the next AJAX page
        self.has_next = False    # True while more pages are available
        self.cookies = {}        # cookies accumulated across requests
        self.load_url(url)       # eager: fetch the first page immediately

    def load_url(self, url):
        """Fetch the tag page HTML and parse the first batch of entries."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('headers = ', end=' ')
            printl(instaheaders.get_headers_for_list_html())

        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")

        return self.list_tag

    def load_more(self):
        """Fetch the next page of entries via the AJAX /query/ endpoint."""
        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)  # NOTE(review): assumes form_data is a str — confirm
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        # cookies are recorded before raise_for_status, so they survive an
        # HTTP error response
        self.__set_cookies(self.__r.cookies)
        self.__r.raise_for_status()
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")
        return self.list_tag

    def __get_tag(self, url):
        """Extract the tag name that follows the tag base URL.

        Raises RuntimeError when url is not a tag list URL.
        (The dots in the base URL are unescaped regex metacharacters —
        benign here since they match themselves too.)
        """
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        # allows callers (browser algorithm) to resume pagination mid-stream
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # merge rather than replace, so cookies accumulate across requests
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_tag
|
|
|
|
|
|
class ListUser:
    """Crawls an Instagram user page list: one initial HTML load, then AJAX
    pagination through the /query/ endpoint using an end_cursor."""

    def __init__(self, url):
        self.__r = None          # last requests.Response
        self.__user = ''         # user id, extracted by the HTML parser
        self.__url = ''          # list page URL, reused as AJAX Referer
        self.list_user = []      # entries parsed from the latest load
        self.end_cursor = None   # pagination cursor for the next AJAX page
        self.has_next = False    # True while more pages are available
        self.cookies = {}        # cookies accumulated across requests
        self.load_url(url)       # eager: fetch the first page immediately

    def load_url(self, url):
        """Fetch the user page HTML and parse the first batch of entries."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        # the parser also yields the user id needed for the AJAX form data
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        return self.list_user

    def load_more(self):
        """Fetch the next page of entries via the AJAX /query/ endpoint."""
        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListUser Start>")
            printl("<ListUser requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)  # NOTE(review): assumes form_data is a str — confirm
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        if is_debuging:
            # NOTE(review): this runs before parse_list_ajax below, so the
            # cursor/has_next printed here are still the previous page's values
            printl("<ListUser response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListUser End>")

        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        return self.list_user

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        # allows callers (browser algorithm) to resume pagination mid-stream
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # merge rather than replace, so cookies accumulate across requests
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_user
|
|
|
|
|
|
class InstaContent:
    """Fetches a single Instagram post: the body plus its replies, with
    backwards (older-first) AJAX pagination for the reply pages."""

    def __init__(self, url, cookies, referer):
        self.__r = None            # last requests.Response
        self.__referer = ''        # Referer for the AJAX reply requests
        self.__code = ''           # post shortcode extracted from the /p/ URL
        self.body = None           # parsed post body
        self.reply = []            # replies parsed from the latest load
        self.start_cursor = None   # cursor pointing at earlier replies
        self.has_previous = False  # True while older replies remain
        self.cookies = {}          # cookies accumulated across requests
        self.load_url(url, cookies, referer)  # eager fetch

    def load_url(self, url, cookies, referer):
        """Fetch the post page and parse the body and first reply batch."""
        self.__set_cookies(cookies)  # seed with the caller's session cookies
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.__set_cookies(self.__r.cookies)
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def load_reply_more(self):
        """Fetch the previous (older) batch of replies via AJAX."""
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            printl('start_cursor = ' + self.start_cursor)  # NOTE(review): fails if cursor is None — debug path only
            printl('form_data' + form_data)  # NOTE(review): assumes form_data is a str — confirm
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl("<ContentReply End>")
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        """Extract the post shortcode that follows the /p/ base URL.

        Raises RuntimeError when url is not a post URL.
        (Message says 'Tag Error' though this extracts a post code —
        runtime string kept as-is.)
        """
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        # merge rather than replace, so cookies accumulate across requests
        for k, v in cookies.items():
            self.cookies[k] = v
|
|
|
|
|
|
class InstaAlgorithm:
    """Shared crawl driver: collects list entries inside the configured date
    window, then fetches each post body with its replies and stores them.
    Concrete subclasses implement crawl() (the list-walking strategy)."""

    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        self.send_to_db = send_to_db              # DB sink: delete_url/send_body/send_reply
        self.crawl_init = crawl_init              # crawl config incl. date window
        self.browser = browser                    # browser factory (browser subclass only)
        self.driver = driver                      # selenium driver; may be None
        self.keyword_id = keyword_id              # stamped onto every stored body
        self.reload_wait_second = reload_wait_second    # delay between AJAX list loads
        self.num_of_load_content = num_of_load_content  # NOTE(review): stored but not read here
        self.page_down = page_down                # PAGE_DOWN presses between load-more clicks
        self.list_crawl = []                      # entries queued for the current URL

    def crawl_content(self, url, cookies, referer):
        """Fetch one post, pull in all reply pages, and store everything.

        Any previously stored copy of the post is deleted first so a re-crawl
        replaces rather than duplicates.
        """
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        # printl(body['article_url'])
        # prepend older pages, so replies end up oldest-first
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(reply_wait_sec)
        for j in range(0, len(replies)):
            replies[j]['article_url'] = body['article_url']
            replies[j]['platform_id'] = body['platform_id']
            replies[j]['article_order'] = j
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()

    def start_crawl(self):
        """Run the concrete crawl() implementation, then clean up."""
        self.crawl()
        self.close()

    def close(self):
        """Quit the browser (kept open while debugging) and close the DB."""
        if self.driver and not is_debuging:
            self.driver.quit()
        self.send_to_db.close()
        printl("Finished Crawling :)")

    def crawl(self):
        # subclasses provide the actual list-walking strategy
        raise NotImplementedError

    def is_until_page(self):
        """True once the optional page limit (until_page) has been reached."""
        if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
            return True
        else:
            return False

    def crawl_contents(self, contents_list, backup_set):
        """
        Queue the entries of one list page that fall inside the date window.

        :param contents_list: parsed list entries (each with 'url' and 'date')
        :param backup_set: URLs already queued; mutated in place for dedup
        :return: is_load_more — False once enough too-old entries are seen
                 or the page limit is hit; True to keep paginating
        """
        old_elements = 0
        for element in contents_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                # newer than the window: skip, but log the timestamp
                # printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))

            elif element['date'].date() < self.crawl_init.get_begin_day():
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                old_elements += 1
                # tolerate a few stragglers (e.g. out-of-order posts) before
                # declaring the rest of the feed too old
                if old_elements > 6:
                    return False
            else:
                if not element['url'] in backup_set:
                    # printl(element['url'])
                    # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                    # wait(1.5)
                    # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                    self.list_crawl.append(element)
                    backup_set.add(element['url'])
            if self.is_until_page():
                return False
        if self.list_crawl:
            printl("Number of Lists = {0}".format(len(self.list_crawl)))
        return True

    def crawl_list(self):
        """Fetch and store every queued entry, logging per-item failures so
        one broken post does not abort the whole batch."""
        if self.list_crawl:
            printl()
            printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("Total gathered contents = {0}".format(len(self.list_crawl)))
            printl()
        for element in self.list_crawl:
            try:
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                wait(body_wait_sec)
                # fresh cookie jar per post; the post URL doubles as Referer
                self.crawl_content(element['url'], {}, element['url'])
            except Exception as e:
                printl(e)
                logging.info(e)
|
|
|
|
|
|
class InstaAlgorithmNormal(InstaAlgorithm):
    """Requests-only crawl strategy: pages lists purely via the AJAX
    endpoint, no browser needed (any supplied driver is closed at once)."""

    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)
        # This strategy never drives a browser; release it immediately.
        if self.driver:
            self.driver.quit()

    def crawl(self):
        """Walk every configured search URL; loop forever in realtime mode.

        On an error the current pagination cursor is saved so the retry of
        the same URL resumes where the failed attempt stopped.
        """
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None   # carried across retries of the same URL
            backup_set = set()  # URLs already queued, for deduplication
            while i < len(url_list):
                # Guard against the handler referencing an unbound name when
                # the failure happens before the crawler is constructed.
                list_crawler = None
                try:
                    # first connect: pick the crawler type from the URL shape
                    printl(url_list[i] + "\n")
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load: keep paging while new in-window entries appear
                    while is_load_more:
                        if end_cursor:
                            # resume from the cursor saved by a previous failure
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    self.crawl_list()
                    self.list_crawl.clear()
                    i += 1
                except Exception as e:
                    logging.info(e)
                    if list_crawler is not None:
                        end_cursor = list_crawler.end_cursor
                    # str(): the cursor may legitimately be None here
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)  # back off hard after a real error
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
|
|
|
|
|
|
class InstaAlgorithmBrowser(InstaAlgorithm):
    """Browser-backed crawl strategy: loads list pages with Selenium, then
    pages further via the AJAX endpoint using the browser's cookies."""

    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)

    def url_load(self, url):
        """Build the matching list crawler for url and parse the page the
        browser is currently showing.

        :return: (crawler, entries, end_cursor, has_next)
        """
        if insta_tag_url in url:
            list_tag = ListTag(url)
            insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
            return list_tag, insta_list, end_cursor, has_next
        else:
            list_user = ListUser(url)
            insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
            return list_user, insta_list, end_cursor, has_next

    def crawl(self):
        """Walk every configured search URL via the browser; loop forever in
        realtime mode. On an error the pagination cursor is saved and the
        browser is restarted before the retry."""
        real_time = True
        while real_time:
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None   # carried across retries of the same URL
            backup_set = set()  # URLs already queued, for deduplication
            while i < len(url_list):
                # Guard against the handler referencing an unbound name when
                # the failure happens before the crawler is constructed.
                list_crawler = None
                try:
                    # first connect: render the page in the browser
                    wait(3)
                    printl(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
                    is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
                    list_crawler.set_end_cursor(end_cursor2)
                    # reuse the browser session's cookies for the AJAX calls
                    list_crawler.cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}
                    # ajax load
                    page_down = 0
                    while is_load_more:
                        if page_down == self.page_down:
                            page_down = 0
                            try:
                                focus_driver(self.driver)
                                click_insta_load_more(self.driver)
                            # narrowed from a bare except so Ctrl-C still works
                            except Exception:
                                push_page_down(self.driver)
                        page_down += 1
                        if end_cursor:
                            # resume from the cursor saved by a previous failure
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        # printl("list length = " + str(len(insta_list)))
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                        # printl("number of backup_set = {0}".format(len(backup_set)))
                    i += 1
                    self.crawl_list()
                    self.list_crawl.clear()
                except Exception as e:
                    logging.info(e)
                    if list_crawler is not None:
                        end_cursor = list_crawler.end_cursor
                    # str(): the cursor may legitimately be None here
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)  # back off hard after a real error
                    # restart the browser — it is usually wedged after an error
                    if self.driver:
                        self.driver.close()
                    wait(3)
                    self.driver = self.browser.new_browser()
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
|
|
|
|
|
|
class InstaMainCrawler:
    """Entry point: wires together the DB sink, crawl configuration and
    browser, then runs the appropriate crawl algorithm."""

    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()
        self.browser = Browser()
        self.driver = None  # stays None when no browser can be started

    def set_keyword_id(self, keyword_id):
        # Plain setter; init_keyword_id() additionally loads the keyword's
        # crawl parameters from the DB.
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        # Placeholder kept for interface compatibility with other crawlers.
        pass

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Initialise all runtime parameters in one call."""
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)
        self.init_browser(browser)

    def set_driver(self, driver):
        self.driver = driver

    def init_browser(self, browser):
        # Best effort: on failure self.driver stays None and the
        # requests-only algorithm is used instead of the browser one.
        try:
            self.set_driver(self.browser.get_new_driver(browser))
        except Exception as e:
            logging.info(e)

    def init_keyword_id(self, keyword_id):
        """Store keyword_id as an int and load its crawl parameters.

        :param keyword_id: keyword primary key, as int or numeric string
        :raises ValueError: when a non-int value cannot be converted
        """
        # isinstance() instead of a type() comparison (idiomatic, and
        # accepts int subclasses)
        if not isinstance(keyword_id, int):
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawler_start(self):
        """Run the browser algorithm when a driver is available, otherwise
        fall back to the plain-requests algorithm."""
        if self.driver:
            algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                              self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        else:
            algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                             self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        algorithm.start_crawl()
|