Fix insta crawler
git-svn-id: svn://192.168.0.12/source@278 8346c931-da38-4b9b-9d4c-e48b93cbd075
@@ -17,7 +17,7 @@ from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


def print_and_flush(string):
    print(string)
@@ -139,7 +139,9 @@ class Browser:
    def new_firefox_browser(self):
        self.info = "firefox"
        caps = DesiredCapabilities.FIREFOX
        #caps["marionette"] = True
        self.driver = webdriver.Firefox(capabilities=caps)
        return self.driver

    def new_opera_browser(self, driver_exec=None):
@@ -6,19 +6,60 @@ Created on 2015. 12. 8.
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
import logging


from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_xpath
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
from base.baseclasses import Browser
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys


def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    print(*objects, sep=sep, end=end, file=file, flush=flush)


insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'

is_debuging = False

num_of_list_ajax = 24
num_of_reply_ajax = 100
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20


logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)


def click_insta_load_more(driver):
    element = driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(element)


def push_page_down(driver):
    body = driver.find_element_by_tag_name('body')
    body.send_keys(Keys.PAGE_DOWN)


def focus_driver(driver):
    position = driver.get_window_position()
    size = driver.get_window_size()
    driver.maximize_window()
    driver.set_window_size(size['width'], size["height"])
    driver.set_window_position(position['x'], position['y'])


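# Reading of the helper above: focus_driver brings the browser window to the
# foreground by maximizing it and then restoring its original size and
# position, so the PAGE_DOWN keys sent by push_page_down reach the page.
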
class InstaInit(CrawlInit):
@@ -52,7 +93,7 @@ class InstaInit(CrawlInit):
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result.date()
        else:
            return self.start_day()

@@ -60,452 +101,480 @@ class InstaInit(CrawlInit):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            return result.date()
        else:
            return self.end_day()


class ListTag:
    def __init__(self, url):
        self.__r = None
        self.__tag = ''
        self.__url = ''
        self.list_tag = []
        self.end_cursor = None
        self.has_next = False
        self.cookies = {}
        self.load_url(url)

    def load_url(self, url):
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('headers = ', end=' ')
            printl(instaheaders.get_headers_for_list_html())

        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")

        return self.list_tag

    def load_more(self):
        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__set_cookies(self.__r.cookies)
        self.__r.raise_for_status()
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")
        return self.list_tag

    def __get_tag(self, url):
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_tag


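# Illustrative usage sketch (not part of this commit): paging a tag feed with
# ListTag until Instagram reports no further pages. The tag name "python" is
# hypothetical; wait() and the pacing constants are the ones defined above.
#
#     crawler = ListTag(insta_tag_url + "python")
#     posts = list(crawler.get_list())
#     while crawler.has_next:
#         wait(list_wait_sec)
#         posts += crawler.load_more()   # advances end_cursor by num_of_list_ajax items
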
class ListUser:
    def __init__(self, url):
        self.__r = None
        self.__user = ''
        self.__url = ''
        self.list_user = []
        self.end_cursor = None
        self.has_next = False
        self.cookies = {}
        self.load_url(url)

    def load_url(self, url):
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        return self.list_user

    def load_more(self):
        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListUser Start>")
            printl("<ListUser requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        if is_debuging:
            printl("<ListUser response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListUser End>")

        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        return self.list_user

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_user


class InstaContent:
    def __init__(self, url, cookies, referer):
        self.__r = None
        self.__referer = ''
        self.__code = ''
        self.body = None
        self.reply = []
        self.start_cursor = None
        self.has_previous = False
        self.cookies = {}
        self.load_url(url, cookies, referer)

    def load_url(self, url, cookies, referer):
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.__set_cookies(self.__r.cookies)
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def load_reply_more(self):
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            printl('start_cursor = ' + self.start_cursor)
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl("<ContentReply End>")
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v


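# Illustrative sketch (not part of this commit): InstaContent pages replies
# backwards through start_cursor/has_previous, so older batches are fetched
# later and prepended by the caller. "SOME_CODE" is a hypothetical post code.
#
#     content = InstaContent(insta_body_url + "SOME_CODE/", {}, insta_tag_url + "python")
#     replies = content.get_reply()
#     while content.has_previous:
#         wait(reply_wait_sec)
#         replies = content.load_reply_more() + replies
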
class InstaAlgorithm:
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        self.send_to_db = send_to_db
        self.crawl_init = crawl_init
        self.browser = browser
        self.driver = driver
        self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
        self.keyword_id = keyword_id
        self.reload_wait_second = reload_wait_second
        self.num_of_load_content = num_of_load_content
        self.page_down = page_down
        self.list_crawl = []

    def set_driver(self, driver):
        self.driver = driver

    def crawl_content(self, url, cookies, referer):
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        # printl(body['article_url'])
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(reply_wait_sec)
        for j in range(0, len(replies)):
            replies[j]['article_url'] = body['article_url']
            replies[j]['platform_id'] = body['platform_id']
            replies[j]['article_order'] = j
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()

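    # Reading of crawl_content above: older reply batches are prepended, so
    # `replies` ends up oldest-first and `article_order` numbers the replies
    # in chronological order before they are sent to the database.
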
    def start_crawl(self):
        self.crawl()
        self.close()

    def close(self):
        if self.driver and not is_debuging:
            self.driver.quit()
        self.send_to_db.close()
        printl("Finished Crawling :)")

    def crawl(self):
        raise NotImplementedError

    def is_until_page(self):
        if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
            return True
        else:
            return False

    def crawl_contents(self, contents_list, backup_set):
        """
        :param contents_list:
        :param backup_set:
        :return: is_load_more
        """
        old_elements = 0
        for element in contents_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                # printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
            elif element['date'].date() < self.crawl_init.get_begin_day():
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                old_elements += 1
                if old_elements > 6:
                    return False
            if not element['url'] in backup_set:
                # printl(element['url'])
                # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                # wait(1.5)
                # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                self.list_crawl.append(element)
                backup_set.add(element['url'])
            if self.is_until_page():
                return False
        if self.list_crawl:
            printl("Number of Lists = {0}".format(len(self.list_crawl)))
        return True

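    # Note on the early stop above (reading of the code): once more than six
    # posts older than the begin day appear in a batch, the list crawl is
    # abandoned, assuming the feed arrives roughly newest-first.
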
    def crawl_list(self):
        if self.list_crawl:
            printl()
            printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("Total gathered contents = {0}".format(len(self.list_crawl)))
            printl()
        for element in self.list_crawl:
            try:
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                wait(body_wait_sec)
                self.crawl_content(element['url'], {}, element['url'])
            except Exception as e:
                printl(e)
                logging.info(e)


class InstaAlgorithmNormal(InstaAlgorithm):
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)
        if self.driver:
            self.driver.quit()

    def crawl(self):
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load
                    while is_load_more:
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    self.crawl_list()
                    self.list_crawl.clear()
                    i += 1
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + end_cursor)
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")


class InstaAlgorithmBrowser(InstaAlgorithm):
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)

    def url_load(self, url):
        if insta_tag_url in url:
            list_tag = ListTag(url)
            insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
            return list_tag, insta_list, end_cursor, has_next
        else:
            list_user = ListUser(url)
            insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
            return list_user, insta_list, end_cursor, has_next

    def crawl(self):
        real_time = True
        while real_time:
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    wait(3)
                    printl(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
                    is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
                    list_crawler.set_end_cursor(end_cursor2)
                    list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()}
                    # ajax load
                    page_down = 0
                    while is_load_more:
                        if page_down == self.page_down:
                            page_down = 0
                            try:
                                focus_driver(self.driver)
                                click_insta_load_more(self.driver)
                            except:
                                push_page_down(self.driver)
                        page_down += 1
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        # printl("list length = " + str(len(insta_list)))
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                        # printl("number of backup_set = {0}".format(len(backup_set)))
                    i += 1
                    self.crawl_list()
                    self.list_crawl.clear()
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + end_cursor)
                    if e.args:
                        wait(300)
                    if self.driver:
                        self.driver.close()
                        wait(3)
                    self.driver = self.browser.new_browser()
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")


class InstaMainCrawler:
    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()
        self.browser = Browser()
        self.driver = None

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)
        self.init_browser(browser)

    def set_driver(self, driver):
        self.driver = driver

    def init_browser(self, browser):
        try:
            self.set_driver(self.browser.get_new_driver(browser))
        except Exception as e:
            logging.info(e)

    def init_keyword_id(self, keyword_id):
        if type(keyword_id) != int:

@@ -525,30 +594,10 @@ class InstaMainCrawler:
        self.crawl_init.set_until_page(until_page)

    def crawler_start(self):
        if self.driver:
            algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                              self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        else:
            algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                             self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        algorithm.start_crawl()

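    # crawler_start now only dispatches: with a live Selenium driver the
    # browser-assisted InstaAlgorithmBrowser runs; otherwise the requests-only
    # InstaAlgorithmNormal takes over. Both share the InstaAlgorithm pipeline
    # (crawl_contents -> crawl_list -> crawl_content).
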
WebBasedCrawler/insta/instacrawl_backup.py (new file, 556 lines)
@@ -0,0 +1,556 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.

@author: cococo
'''
import re
import datetime

from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_xpath
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
from base.baseclasses import Browser
from selenium.webdriver.common.action_chains import ActionChains

insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"


class InstaInit(CrawlInit):
    def __init__(self, before_day=0):
        super().__init__(before_day)
        self.urls = dict()
        self.urls[9] = insta_tag_url
        self.urls[10] = insta_url

    def split_searches(self):
        search = self.searches()
        splited_list = search.split(',')
        trimmed_list = list()
        if self.platform() == 10:
            for x in splited_list:
                trimmed_list.append(x.strip())
        else:
            for x in splited_list:
                trimmed_list.append(self.utf8(x))
        return trimmed_list

    def make_url(self):
        urls = list()
        for x in self.split_searches():
            url = self.urls[self.platform()] + x
            urls.append(url)
        return urls

    def get_begin_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result
        else:
            return self.start_day()

    def get_end_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            return result
        else:
            return self.end_day()


class InstaBodyCrawler:
    def __init__(self, driver=None):
        self.driver = driver
        self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")

    def set_driver(self, driver):
        self.driver = driver

    def set_article(self, article=None):
        if article is None:
            try:
                self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
            except Exception as e:
                print_and_flush(e)
                raise Exception
        else:
            self.article = article

    def find_article_url(self):
        a = self.article.find_element_by_xpath("div/section/a")
        return a.get_attribute("href")

    def find_article_profileurl(self):
        img = self.article.find_element_by_xpath("header/a/img[@src]")
        return img.get_attribute("src")

    def find_article_nickname(self):
        a = self.article.find_element_by_xpath("header/div/a")
        return a.text

    def find_article_date(self):
        el_time = self.article.find_element_by_xpath("div/section/a/time")
        str_time = el_time.get_attribute("datetime")
        m = self.re_date.search(str_time)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            return m.group(1) + " " + m.group(2)

    def find_article_data(self):
        ul = self.article.find_element_by_xpath("div/ul")
        try:
            #li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
            span = ul.find_element_by_css_selector("li h1>span")
            return span.text
        except:
            return ""

    def find_article_id(self):
        return self.find_platform_id()

    def find_platform_name(self):
        return 'instagram'

    def find_article_form(self):
        return 'body'

    def find_platform_id(self):
        a = self.article.find_element_by_xpath("header/div/a")
        if a:
            href = a.get_attribute("href")
            str_id = href.replace(insta_url, "").replace("/", "")
            return str_id
        else:
            return None

    def find_like_num(self):
        div = self.article.find_element_by_xpath("div/section[1]/div")
        try:
            span = div.find_element_by_xpath("span/span")
            str_num = span.text
            str_num = str_num.replace(',', '')
            if str_num[-1] == 'm':
                num = float(str_num[0:-1]) * 1000000
            elif str_num[-1] == 'k':
                num = float(str_num[0:-1]) * 1000
            else:
                num = int(str_num)
            return str(num)
        except:
            a_list = div.find_elements_by_tag_name("a")
            if len(a_list) > 1:
                return str(len(a_list))
            else:
                if a_list and a_list[0].get_attribute('title'):
                    return str(1)
                else:
                    return str(0)
            # span = div.find_element_by_xpath("span[1]")
            # if len(span.text.strip()) < 1:
            #     return str(1)
            # else:
            #     return str(0)

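    # Example of the suffix handling above (illustrative): a like count shown
    # as "1.2k" becomes str(1200.0), "3m" becomes str(3000000.0), and a plain
    # "1,024" becomes str(1024) once commas are stripped.
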
    def find_reply_num(self):
        ul = self.article.find_element_by_xpath("div/ul")
        lis = ul.find_elements_by_tag_name("li")
        if len(lis) < 2:
            return "0"
        try:
            li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
            span = li.find_element_by_xpath("button/span[2]")
            str_num = span.text.replace(",", "")
            return str_num
        except:
            return str(len(lis) - 1)

    def get_content(self):
        content = dict()
        content["article_id"] = self.find_article_id()
        content["platform_id"] = self.find_platform_id()
        content["article_url"] = self.find_article_url()
        content["article_profileurl"] = self.find_article_profileurl()
        content["article_nickname"] = self.find_article_nickname()
        content["platform_name"] = self.find_platform_name()
        content["article_date"] = self.find_article_date()
        content["article_data"] = self.find_article_data()
        content["article_form"] = 'body'
        content["platform_form"] = 'post'
        content["platform_title"] = content["article_id"]
        reply_num = self.find_reply_num()
        if int(reply_num) > 0:
            content["article_order"] = int(reply_num)
        like_num = self.find_like_num()
        if int(float(like_num)) > 0:
            content["reply_url"] = int(float(like_num))
        return content

    def find_platform_title(self):
        pass

    def find_article_title(self):
        pass


class InstaReplyCrawler:
    def __init__(self, driver=None, article=None):
        self.driver = driver
        self.activity = article
        self.reply_list = list()

    def find_init(self):
        self.reply_list.clear()

    def set_driver(self, driver):
        self.driver = driver

    def set_article(self, article=None):
        if article is None:
            try:
                self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
            except Exception as e:
                print_and_flush(e)
                raise Exception
        else:
            self.article = article

    def has_more(self, ul):
        try:
            button = ul.find_element_by_css_selector("li>button")
            return True
        except Exception as e:
            return False

    def read_more_reply(self, ul):
        try:
            button = ul.find_element_by_css_selector("li>button")
            enter_element(button)
        except Exception as e:
            print_and_flush(e)

    def read_all_reply(self, ul):
        i = 0
        while i < 200 and self.has_more(ul):
            self.read_more_reply(ul)
            i += 1
        # for i in range(0, 10):
        #     if self.has_more(ul):
        #         self.read_more_reply(ul)
        #     else:
        #         break

    def get_reply_ul(self):
        ul = self.article.find_element_by_xpath("div/ul")
        return ul

    def has_reply(self, ul):
        try:
            lis = ul.find_elements_by_css_selector("li>a")
            if len(lis) > 0:
                return True
        except:
            return False
        return False

    def crawl_all(self):
        self.find_init()
        self.set_article()
        try:
            ul = self.get_reply_ul()
            if self.has_reply(ul):
                self.read_all_reply(ul)
                self.crawl_reply(ul)
        except Exception as e:
            print_and_flush(e)

    def crawl_reply(self, ul):
        article_data = self.find_article_data(ul)
        article_id = self.find_article_id(ul)
        if len(article_data) != len(article_id):
            print_and_flush("article_data != article_id")
        for i in range(0, len(article_id)):
            content = dict()
            content["article_data"] = article_data[i]
            content["article_id"] = article_id[i]
            content["article_nickname"] = article_id[i]
            content["platform_name"] = "instagram"
            content["platform_form"] = "post"
            content["article_form"] = 'reply'
            content["article_order"] = i
            self.reply_list.append(content)

    def get_content(self):
        return self.reply_list

    def find_article_id(self, ul):
        id_list = list()
        a_list = ul.find_elements_by_xpath("li/a")
        for i in a_list:
            id_list.append(i.text)
        return id_list

    def find_article_profileurl(self, ul):
        pass

    def find_article_nickname(self, ul):
        return self.find_article_id(ul)

    def find_article_data(self, ul):
        data_list = list()
        span_list = ul.find_elements_by_css_selector("li>span")
        for i in span_list:
            data_list.append(i.text)
        return data_list

    def find_article_url(self, ul):
        pass

    def find_platform_id(self, ul):
        pass

    def find_article_form(self, ul=None):
        return 'reply'

    def find_platform_name(self, ul=None):
        return 'instagram'

    def find_platform_form(self, ul=None):
        return 'post'

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)


class InstaPageCrawler:
    def __init__(self, driver=None, begin_date=None, end_date=None):
        self.driver = driver
        self.url_set = set()
        self.begin_date = begin_date
        self.end_date = end_date
        self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")

    def set_driver(self, driver):
        self.driver = driver

    def find_article_url(self):
        a = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article/div/section/a', 60)
        return a.get_attribute("href")

    def init(self):
        self.url_set.clear()

    def set_date(self, begin_date, end_date):
        self.set_begin_date(begin_date)
        self.set_end_date(end_date)

    def set_end_date(self, end_date):
        if type(end_date) == str:
            self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
            self.end_date = end_date
        else:
            self.end_date = datetime.datetime.today()
        self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
        self.end_date += datetime.timedelta(days=1)

    def set_begin_date(self, begin_date):
        if type(begin_date) == str:
            self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
        elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
            self.begin_date = begin_date
        else:
            self.begin_date = datetime.datetime.today()
        self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)

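    # Note (reading of the code above): set_end_date normalizes to midnight
    # and then adds one day, so is_late()'s `time_date > self.end_date` check
    # keeps posts from the end day itself inside the crawl window.
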
    def has_next(self):
        try:
            a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
            return True
        except:
            return False

    def move_next(self):
        try:
            a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
            enter_element(a)
            return True
        except:
            return False

    def has_first_page(self):
        try:
            #a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
            #a = find_element_by_css_selector(self.driver, "a[class='_8mlbc _vbtk2 _t5r8b']")
            a = find_element_by_css_selector(self.driver, "div[class='_8fxp6'] a[class='_8mlbc _vbtk2 _t5r8b']")
            enter_element(a)
            return True
        except:
            return False

    def crawling_ok(self, url):
        self.url_set.add(url)

    def is_earlier(self, time_date):
        return True if time_date < self.begin_date else False

    def is_late(self, time_date):
        return True if time_date > self.end_date else False

    def find_article_date(self):
        el_time = find_element_by_xpath(self.driver, "/html/body/div/div/div/div/article/div/section/a/time", 60)
        str_time = el_time.get_attribute("datetime")
        m = self.re_date.search(str_time)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            return m.group(1) + " " + m.group(2)


class InstaMainCrawler:
    def __init__(self):
        self.page_crawler = InstaPageCrawler()
        self.body_crawler = InstaBodyCrawler()
        self.reply_crawler = InstaReplyCrawler()
        self.send_to_db = SendtoDB()
        self.browser = Browser()
        self.crawl_init = InstaInit()
        self.driver = None

    def set_driver(self, driver):
        self.page_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        self.page_crawler.init()
        if backup_set:
            self.page_crawler.url_set = backup_set.copy()
        if not self.page_crawler.has_first_page():
            return
        while True:
            str_date = self.page_crawler.find_article_date()
            date_val = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
            print_and_flush(str_date)
            if self.page_crawler.find_article_url() in self.page_crawler.url_set:
                if self.page_crawler.has_next():
                    self.page_crawler.move_next()
                    continue
                else:
                    break
            # if self.page_crawler.is_earlier(date_val.date()):
            if self.page_crawler.is_late(date_val):
                if self.page_crawler.has_next():
                    self.page_crawler.move_next()
                    continue
                else:
                    break
            # if self.page_crawler.is_late(date_val.date()):
            if self.page_crawler.is_earlier(date_val):
                break
            try:
                wait(3)
                body_content = self.crawl_body()
                self.crawl_reply(body_content)
                self.page_crawler.url_set.add(body_content["article_url"])
                print_and_flush("ok")
            except Exception as e:
                print_and_flush('fail')
                print_and_flush(e)
            if self.page_crawler.has_next():
                self.page_crawler.move_next()
            else:
                break

    def crawl_body(self):
        self.body_crawler.set_driver(self.driver)
        self.body_crawler.set_article()
        content = self.body_crawler.get_content()
        content["keyword_id"] = self.keyword_id
        print_and_flush(content["article_url"])
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
        return content

    def crawl_reply(self, body_content):
        self.reply_crawler.set_driver(self.driver)
        self.reply_crawler.crawl_all()
        content_list = self.reply_crawler.get_content()
        if content_list:
            for i in content_list:
                i['article_url'] = body_content['article_url']
                i['platform_id'] = body_content['platform_id']
            self.send_to_db.send_reply(content_list)

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        if type(keyword_id) != int:
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawler_start(self):
        real_time = True
        while real_time:
            print_and_flush("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    print_and_flush(url_list[i] + "\n")
                    wait(3)
                    self.driver.get(url_list[i])
                    wait(5)
                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
                                               end_date=self.crawl_init.get_end_day())
                    if self.page_crawler.has_first_page():
                        self.crawl_all(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    print_and_flush(e)
                    backup_set = self.page_crawler.url_set.copy()
                    self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.send_to_db.close()
        #self.driver.quit()
WebBasedCrawler/insta/instacrawl_backup2.py (new file, 426 lines)
@@ -0,0 +1,426 @@
#-*- coding: utf-8 -*-
|
||||
'''
|
||||
Created on 2015. 12. 8.
|
||||
|
||||
@author: cococo
|
||||
'''
|
||||
import re
|
||||
import datetime
|
||||
import insta.instaparser as instaparser
|
||||
import insta.instaheaders as instaheaders
|
||||
import requests
|
||||
|
||||
from base.baseclasses import SendtoDB
|
||||
from base.baseclasses import print_and_flush
|
||||
from base.baseclasses import CrawlInit
|
||||
from base.baseclasses import wait
|
||||
|
||||
|
||||
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
|
||||
print(*objects, sep=sep, end=end, file=file, flush=flush)
|
||||
|
||||
insta_url = "https://www.instagram.com/"
|
||||
insta_tag_url = "https://www.instagram.com/explore/tags/"
|
||||
insta_query = "https://www.instagram.com/query/"
|
||||
insta_body_url = 'https://www.instagram.com/p/'
|
||||
|
||||
|
||||
class InstaInit(CrawlInit):
|
||||
def __init__(self, before_day=0):
|
||||
super().__init__(before_day)
|
||||
self.urls = dict()
|
||||
self.urls[9] = insta_tag_url
|
||||
self.urls[10] = insta_url
|
||||
|
||||
def split_searches(self):
|
||||
search = self.searches()
|
||||
splited_list = search.split(',')
|
||||
trimmed_list = list()
|
||||
if self.platform() == 10:
|
||||
for x in splited_list:
|
||||
trimmed_list.append(x.strip())
|
||||
else:
|
||||
for x in splited_list:
|
||||
trimmed_list.append(self.utf8(x))
|
||||
return trimmed_list
|
||||
|
||||
def make_url(self):
|
||||
urls = list()
|
||||
for x in self.split_searches():
|
||||
url = self.urls[self.platform()] + x
|
||||
urls.append(url)
|
||||
return urls
|
||||
|
||||
def get_begin_day(self):
|
||||
if self.is_realtime():
|
||||
date_now = datetime.datetime.now()
|
||||
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
||||
result += datetime.timedelta(days=self.before_day)
|
||||
return result
|
||||
else:
|
||||
return self.start_day()
|
||||
|
||||
def get_end_day(self):
|
||||
if self.is_realtime():
|
||||
date_now = datetime.datetime.now()
|
||||
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
||||
return result
|
||||
else:
|
||||
return self.end_day()
|
||||
|
||||
|
||||
class ListTag:
    def __init__(self, url):
        self.__r = None
        self.__tag = ''
        self.__url = ''
        self.list_tag = []
        self.end_cursor = None
        self.has_next = False
        self.cookies = {}
        self.load_url(url)

    def load_url(self, url):
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        printl("<ListTag Start>")
        printl("<ListTag requests>")
        printl('headers = ', end=' ')
        printl(instaheaders.get_headers_for_list_html())

        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        printl("<ListTag response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', end='')
        printl(self.has_next)
        printl("<ListTag End>")

        return self.list_tag
    def load_more(self):
        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, 12)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        printl("<ListTag Start>")
        printl("<ListTag requests>")
        printl('end_cursor = ' + str(self.end_cursor))
        printl('form_data = ' + form_data)
        printl('headers = ', end=' ')
        printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__set_cookies(self.__r.cookies)
        self.__r.raise_for_status()
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        printl("<ListTag response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', end='')
        printl(self.has_next)
        printl("<ListTag End>")
        return self.list_tag
    def __get_tag(self, url):
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_tag
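A minimal pagination sketch for ListTag ('food' is a hypothetical tag; each load_more() call posts the current end_cursor back to insta_query):

    tags = ListTag(insta_tag_url + 'food/')
    posts = tags.get_list()
    while tags.has_next:
        wait(1)
        posts += tags.load_more()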
class ListUser:
    def __init__(self, url):
        self.__r = None
        self.__user = ''
        self.__url = ''
        self.list_user = []
        self.end_cursor = None
        self.has_next = False
        self.cookies = {}
        self.load_url(url)

    def load_url(self, url):
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        return self.list_user
    def load_more(self):
        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, 24)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        printl("<ListUser Start>")
        printl("<ListUser requests>")
        printl('end_cursor = ' + str(self.end_cursor))
        printl('form_data = ' + form_data)
        printl('headers = ', end=' ')
        printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        # Parse before logging so end_cursor/has_next reflect this response,
        # as ListTag.load_more() already does.
        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        printl("<ListUser response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', end='')
        printl(self.has_next)
        printl("<ListUser End>")
        return self.list_user
    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_user
class InstaContent:
    def __init__(self, url, cookies, referer):
        self.__r = None
        self.__referer = ''
        self.__code = ''
        self.body = None
        self.reply = []
        self.start_cursor = None
        self.has_previous = False
        self.cookies = {}
        self.load_url(url, cookies, referer)

    def load_url(self, url, cookies, referer):
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply
    def load_reply_more(self):
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, 20)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        printl("<ContentReply Start>")
        printl("<ContentReply requests>")
        printl('start_cursor = ' + str(self.start_cursor))
        printl('form_data = ' + form_data)
        printl('headers = ', end=' ')
        printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        printl("<ContentReply response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('start_cursor = ' + str(self.start_cursor))
        printl('has_previous = ', end='')
        printl(self.has_previous)
        printl("<ContentReply End>")
        return self.reply
    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Code Error')

    def __set_cookies(self, cookies):
        for k, v in cookies.items():
            self.cookies[k] = v
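A sketch of fetching one post and paging back through its comments, continuing the ListTag sketch above ('ABC123' is a hypothetical post shortcode):

    content = InstaContent(insta_body_url + 'ABC123/', tags.get_cookies(), tags.get_url())
    body = content.get_body()
    replies = content.get_reply()
    while content.has_previous:
        # Older comment pages are prepended so replies stay in posting order.
        replies = content.load_reply_more() + replies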
class InstaMainCrawler:
    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        pass
    def crawl_content(self, url, cookies, referer):
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        #printl(body['article_url'])
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(2)
        for j, reply in enumerate(replies):
            reply['article_url'] = body['article_url']
            reply['platform_id'] = body['platform_id']
            reply['article_order'] = j
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()
    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        pass

    def init_keyword_id(self, keyword_id):
        if not isinstance(keyword_id, int):
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)
    def crawler_start(self):
        real_time = True

        while real_time:
            print_and_flush("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            list_crawler = None
            while i < len(url_list):
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = list_crawler.has_next
                    # Count stale posts across the whole page, not per element,
                    # so the old_elements > 8 cut-off can actually trigger
                    # (mirrors the ajax loop below).
                    old_elements = 0
                    for element in insta_list:
                        if element['date'].date() > self.crawl_init.get_end_day():
                            printl(element['url'])
                            printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                            continue
                        elif element['date'].date() < self.crawl_init.get_begin_day():
                            printl(element['url'])
                            printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                            old_elements += 1
                            if old_elements > 8:
                                is_load_more = False
                                break
                        else:
                            if element['url'] not in backup_set:
                                printl(element['url'])
                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                                wait(1.5)
                                self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                                backup_set.add(element['url'])

                    # ajax load
                    while is_load_more:
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(1)
                        insta_list = list_crawler.load_more()
                        is_load_more = list_crawler.has_next
                        old_elements = 0
                        printl("list length = " + str(len(insta_list)))
                        for element in insta_list:
                            if element['date'].date() > self.crawl_init.get_end_day():
                                printl(element['url'])
                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                                continue
                            elif element['date'].date() < self.crawl_init.get_begin_day():
                                printl(element['url'])
                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))

                                old_elements += 1
                                if old_elements > 8:
                                    is_load_more = False
                                    break
                            else:
                                if element['url'] not in backup_set:
                                    printl(element['url'])
                                    printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                                    wait(1.5)
                                    try:
                                        self.crawl_content(element['url'], list_crawler.get_cookies(),
                                                           list_crawler.get_url())
                                    except Exception as e:
                                        printl(e)
                                    backup_set.add(element['url'])
                    i += 1
                except Exception as e:
                    printl(e)
                    # list_crawler may still be None if the very first
                    # connect failed before construction finished.
                    if list_crawler:
                        end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
        self.send_to_db.close()
        #self.driver.quit()
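A typical entry point, sketched with hypothetical argument values (browser is accepted for interface compatibility but init_browser() is a no-op here):

    crawler = InstaMainCrawler()
    crawler.set_arguments(browser=None, keyword_id=1, db_num=0, before_day=-1, until_page=10)
    crawler.start()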
99 WebBasedCrawler/insta/instaheaders.py Normal file
@@ -0,0 +1,99 @@
def get_headers_for_list_html():
    return {"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, sdch, br",
            "accept-language": "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/50.0.2661.102 Safari/537.36"
            }
def get_headers_for_body_html(cookies):
    request_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, sdch, br",
        "accept-language": "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
                      " Chrome/50.0.2661.102 Safari/537.36",
        "upgrade-insecure-requests": "1",
        "cache-control": "max-age=0"
    }
    if cookies:
        request_headers['cookie'] = ('mid=' + cookies['mid'] + '; sessionid=' + cookies['sessionid'] +
                                     '; ig_pr=1; ig_vw=1920; csrftoken=' + cookies['csrftoken'] + "; s_network=")
    return request_headers
def get_headers_for_ajax(cookies, referer, form_data):
    request_headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.instagram.com',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.102 Safari/537.36',
        'cookie': 'mid=' + cookies['mid'] + '; sessionid=' + cookies['sessionid'] +
                  '; ig_pr=1; ig_vw=1920; csrftoken=' + cookies['csrftoken'] + "; s_network=",
        'x-csrftoken': cookies['csrftoken'],
        # Header values must be strings; requests rejects an int here.
        'x-instagram-ajax': '1',
        'x-requested-with': 'XMLHttpRequest',
        'referer': referer,
        'content-length': str(len(form_data)),
        'connection': 'keep-alive'
    }
    return request_headers
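For reference, the %xx-escaped payloads built below decode to Instagram's old query DSL; a rough sketch of the hashtag form (TAG, CURSOR, and COUNT are placeholders):

    # q=ig_hashtag(TAG) { media.after(CURSOR, COUNT) {
    #     count, nodes { caption, code, comments { count }, date, ... },
    #     page_info
    # } }&ref=tags::show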
def get_form_data_for_list_user(user_id, end_cursor, count):
    res = 'q=ig_user(' \
          + str(user_id) + \
          ')+%7B+media.after(' \
          + str(end_cursor) + \
          '%2C+' \
          + str(count) + \
          ')+%7B%0A++count%2C%0A++nodes+%7B%0A++++caption%2C%0A++++code%2C%0A++++comments+%7B%0A++++++' \
          'count%0A++++%7D%2C%0A++++date%2C%0A++++dimensions+%7B%0A++++++height%2C%0A++++++width%0A++++' \
          '%7D%2C%0A++++display_src%2C%0A++++id%2C%0A++++is_video%2C%0A++++likes+%7B%0A++++++count%0A++++' \
          '%7D%2C%0A++++owner+%7B%0A++++++id%0A++++%7D%2C%0A++++thumbnail_src%2C%0A++++video_views%0A++%7D%2C%0A++' \
          'page_info%0A%7D%0A+%7D&ref=users%3A%3Ashow'
    return res
def get_form_data_for_list_tag(hash_tag, end_cursor, count):
    res = 'q=ig_hashtag(' \
          + str(hash_tag) + \
          ')+%7B+media.after(' \
          + str(end_cursor) + \
          '%2C+' + str(count) + \
          ')+%7B%0A++count%2C%0A++nodes+%7B%0A++++caption%2C%0A++++code%2C%0A++++comments+%7B%0A++++++count%0A++++' \
          '%7D%2C%0A++++date%2C%0A++++dimensions+%7B%0A++++++height%2C%0A++++++width%0A++++%7D%2C%0A++++' \
          'display_src%2C%0A++++id%2C%0A++++is_video%2C%0A++++likes+%7B%0A++++++count%0A++++%7D%2C%0A++++' \
          'owner+%7B%0A++++++id%0A++++%7D%2C%0A++++thumbnail_src%2C%0A++++video_views%0A++%7D%2C%0A++' \
          'page_info%0A%7D%0A+%7D&ref=tags%3A%3Ashow'
    return res
def get_form_data_for_reply(body_code, start_cursor, count):
    res = 'q=ig_shortcode(' \
          + str(body_code) + ')+%7B%0A++comments.before' \
          '(%0A++++++++++++' \
          + str(start_cursor) + \
          '%2C%0A++++++++++++' \
          + str(count) + \
          '%0A++++++++++)' \
          '+%7B%0A++++count%2C%0A++++nodes+%7B%0A++++++id%2C%0A++++++' \
          'created_at%2C%0A++++++text%2C%0A++++++user+%7B%0A++++++++id%2C%0A++++++++' \
          'profile_pic_url%2C%0A++++++++username%0A++++++%7D%0A++++%7D%2C%0A++++' \
          'page_info%0A++%7D%0A%7D%0A&ref=media%3A%3Ashow'
    return res
158 WebBasedCrawler/insta/instaparser.py Normal file
@@ -0,0 +1,158 @@
import re
import json
import requests
import datetime

# Raw string avoids invalid-escape warnings; this captures the JSON blob
# Instagram embeds as window._sharedData in every page.
rx_json_html = re.compile(r'window\._sharedData\s*=\s*(.*);\s*</script>')
# Unix epoch expressed in UTC+9 (KST); post timestamps are added to this base.
old_date = datetime.datetime(1970, 1, 1, 9)
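A quick illustration of the offset: node["date"] == 1449532800 is 2015-12-08 00:00:00 UTC, so

    # old_date + datetime.timedelta(seconds=1449532800)
    #   -> datetime.datetime(2015, 12, 8, 9, 0), i.e. 09:00 KST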
def get_json_from_html(content):
    if isinstance(content, bytes):
        s = content.decode('utf-8')
    elif isinstance(content, str):
        s = content
    elif isinstance(content, requests.models.Response):
        s = content.content.decode('utf-8')
    else:
        raise TypeError
    m = rx_json_html.search(s)
    if m:
        return json.loads(m.group(1))
    else:
        raise TypeError("Check requests.response")
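A sketch of using the extractor on a saved page ('tagpage.html' is a hypothetical saved response; bytes are accepted directly):

    with open('tagpage.html', 'rb') as f:
        shared_data = get_json_from_html(f.read())
    print(shared_data['entry_data'].keys())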
def parse_list_user_html(content):
    json_data = get_json_from_html(content)
    profilepage = json_data['entry_data']['ProfilePage']

    has_next = False
    end_cursor = None
    body_list = []
    user_id = None

    if profilepage:
        user_id = profilepage[0]["user"]["id"]
        has_next = profilepage[0]["user"]["media"]["page_info"]["has_next_page"]
        end_cursor = profilepage[0]["user"]["media"]["page_info"]["end_cursor"]
        nodes = profilepage[0]["user"]["media"]["nodes"]
        for node in nodes:
            body_list.append(
                {
                    "code": node["code"],
                    "url": "https://www.instagram.com/p/" + node["code"] + "/",
                    "date": old_date + datetime.timedelta(seconds=node["date"])
                }
            )
    return body_list, end_cursor, has_next, user_id
def parse_list_tag_html(content):
    json_data = get_json_from_html(content)
    tagpage = json_data['entry_data']['TagPage']

    has_next = False
    end_cursor = None
    body_list = []
    if tagpage:
        print('start_cursor = ', end='', flush=True)
        print(tagpage[0]["tag"]["media"]["page_info"]["start_cursor"], flush=True)
        end_cursor = tagpage[0]["tag"]["media"]["page_info"]["end_cursor"]
        has_next = tagpage[0]["tag"]["media"]["page_info"]["has_next_page"]
        nodes = tagpage[0]["tag"]["media"]["nodes"]
        for node in nodes:
            body_list.append({
                "code": node["code"],
                "url": "https://www.instagram.com/p/" + node["code"] + "/",
                "date": old_date + datetime.timedelta(seconds=node["date"])
            })
    return body_list, end_cursor, has_next
def parse_list_ajax(content):
    # json.loads ignores (and later Pythons reject) an encoding keyword;
    # decoding the bytes explicitly is all that is needed.
    json_data = json.loads(content.decode('utf-8'))
    has_next = False
    end_cursor = None
    body_list = []

    if json_data["status"] == "ok":
        has_next = json_data["media"]["page_info"]["has_next_page"]
        end_cursor = json_data["media"]["page_info"]["end_cursor"]
        nodes = json_data["media"]["nodes"]
        for node in nodes:
            body_list.append(
                {
                    "code": node["code"],
                    "url": "https://www.instagram.com/p/" + node["code"] + "/",
                    "date": old_date + datetime.timedelta(seconds=node["date"])
                }
            )
    return body_list, end_cursor, has_next
def parse_body_html(content):
    json_data = get_json_from_html(content)
    postpage = json_data["entry_data"]["PostPage"]
    body = {}
    reply = []
    start_cursor = None
    has_previous = False
    if postpage:
        media = postpage[0]["media"]
        body = {
            "article_date": (old_date + datetime.timedelta(seconds=media["date"])).strftime("%Y-%m-%d %H:%M:%S"),
            "article_data": media["caption"],
            "article_id": media["owner"]["username"],
            "article_nickname": media["owner"]["username"],
            "platform_id": media["owner"]["username"],
            "platform_name": "instagram",
            "platform_form": "post",
            "platform_title": media["owner"]["username"],
            "article_form": "body",
            "article_profileurl": media["owner"]["profile_pic_url"],
            "article_order": str(media["comments"]["count"]),
            "reply_url": str(media["likes"]["count"])
        }
        comments = postpage[0]["media"]["comments"]
        has_previous = comments["page_info"]["has_previous_page"]
        start_cursor = comments["page_info"]["start_cursor"]
        nodes = comments["nodes"]
        for node in nodes:
            reply.append({
                "article_data": node["text"],
                "article_date":
                    (old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
                "article_id": node["user"]["username"],
                "article_nickname": node["user"]["username"],
                "article_profileurl": node["user"]["profile_pic_url"],
                "platform_name": "instagram",
                "platform_form": "post",
                "article_form": "reply"
            })
    return body, reply, start_cursor, has_previous
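Note that parse_body_html() appears to repurpose generic DB columns: "article_order" carries the comment count and "reply_url" the like count. Downstream consumers of SendtoDB presumably read them that way, so the names are kept as-is.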
def parse_reply_ajax(content):
    json_data = json.loads(content.decode('utf-8'))
    reply = []
    start_cursor = None
    has_previous = False
    if json_data["status"] == "ok":
        comments = json_data["comments"]
        has_previous = comments["page_info"]["has_previous_page"]
        start_cursor = comments["page_info"]["start_cursor"]
        nodes = comments["nodes"]
        for node in nodes:
            reply.append({
                "article_data": node["text"],
                "article_date":
                    (old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
                "article_id": node["user"]["username"],
                "article_nickname": node["user"]["username"],
                "article_profileurl": node["user"]["profile_pic_url"],
                "platform_name": "instagram",
                "platform_form": "post",
                "article_form": "reply",
            })
    return reply, start_cursor, has_previous
75 WebBasedCrawler/insta/instatest.py Normal file
@@ -0,0 +1,75 @@
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import Browser
from base.baseclasses import enter_element
from selenium.webdriver.common.keys import Keys
def pageup_and_pagedown(_driver):
    body = _driver.find_element_by_tag_name('body')
    for i in range(0, 2):
        body.send_keys(Keys.PAGE_UP)
        wait(0.2)
    for i in range(0, 5):
        body.send_keys(Keys.PAGE_DOWN)
        wait(0.2)
def first_load(_driver):
    element = _driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(element)
def get_urls(_driver, url_set):
    elements = _driver.find_elements_by_css_selector("div._myci9>a")
    for element in elements:
        url_set.add(element.get_attribute('href'))
def remove_myci9(_driver):
    elements = _driver.find_elements_by_css_selector("div._myci9")
    # Keep the last four rows; drop the rest from the DOM.
    for i in range(max(len(elements) - 4, 0)):
        _driver.execute_script("""
            var element = document.querySelector("div._myci9");
            if (element)
                element.parentNode.removeChild(element);
        """)
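Pruning already-harvested rows this way presumably keeps the infinite-scroll page light: without it, thousands of thumbnail nodes accumulate and the browser slows to a crawl during long scraping sessions.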
browser = Browser()
driver = browser.get_new_driver('chrome')

url_sets = set()
wait(5)
url = "https://www.instagram.com/explore/tags/%EC%A4%8C%EB%A7%88%EA%B7%B8%EB%9E%A8/"
#url = 'https://www.instagram.com/explore/tags/%EB%A7%9B%EC%8A%A4%ED%83%80%EA%B7%B8%EB%9E%A8/'
driver.get(url)
#driver.get('https://www.instagram.com/explore/tags/맛스타그램/')
wait(5)


first_load(driver)
wait(3)

print(driver.get_cookies())

with open("c:\\data\\instajumma.txt", 'w') as f:
    try:
        while True:
            for j in range(0, 10):
                pageup_and_pagedown(driver)

            get_urls(driver, url_sets)
            remove_myci9(driver)
            print("url count = {0}\n".format(len(url_sets)), flush=True, file=f)
    finally:
        print("finished")