insta crawler 수정

git-svn-id: svn://192.168.0.12/source@278 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2016-06-28 03:29:33 +00:00
parent bf78651baa
commit 3074db4fa0
7 changed files with 1807 additions and 442 deletions

View File

@@ -17,7 +17,7 @@ from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def print_and_flush(string):
    """Print *string* and flush stdout immediately.

    The function's name promises a flush (so progress shows up promptly under
    piped/buffered output), but the original body never flushed.
    """
    print(string, flush=True)
@@ -139,7 +139,9 @@ class Browser:
def new_firefox_browser(self):
    """Start a Firefox webdriver with explicit capabilities and return it.

    NOTE(review): the original instantiated webdriver.Firefox() twice and kept
    only the second one, leaking the first browser process; only the
    capabilities-configured instance is created now.
    """
    self.info = "firefox"
    caps = DesiredCapabilities.FIREFOX
    # caps["marionette"] = True  # enable for geckodriver/marionette mode
    self.driver = webdriver.Firefox(capabilities=caps)
    return self.driver
def new_opera_browser(self, driver_exec=None):

View File

@@ -6,19 +6,60 @@ Created on 2015. 12. 8.
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
import logging
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_xpath
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
from base.baseclasses import Browser
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Drop-in wrapper around print() that flushes by default."""
    options = {'sep': sep, 'end': end, 'file': file, 'flush': flush}
    print(*objects, **options)
# Instagram endpoint URLs.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"  # AJAX pagination endpoint
insta_body_url = 'https://www.instagram.com/p/'   # single-post page prefix

# When True, the List*/InstaContent crawlers print verbose request/response traces.
is_debuging = False

# Page sizes for the AJAX pagination requests.
num_of_list_ajax = 24
num_of_reply_ajax = 100

# Politeness delays (seconds) between successive requests.
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8

# PAGE_DOWN presses between "load more" click attempts in browser mode.
num_of_page_down = 20

logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
# Silence noisy third-party loggers.
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)
def click_insta_load_more(driver):
    """Press the "load more" link at the bottom of an Instagram list page."""
    load_more_link = driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(load_more_link)
def push_page_down(driver):
    """Send one PAGE_DOWN keypress to the page body to scroll the feed."""
    page_body = driver.find_element_by_tag_name('body')
    page_body.send_keys(Keys.PAGE_DOWN)
def focus_driver(driver):
    """Bring the browser window to the foreground by maximizing it, then
    restore its previous size and position."""
    saved_position = driver.get_window_position()
    saved_size = driver.get_window_size()
    driver.maximize_window()
    driver.set_window_size(saved_size['width'], saved_size['height'])
    driver.set_window_position(saved_position['x'], saved_position['y'])
class InstaInit(CrawlInit):
@@ -52,7 +93,7 @@ class InstaInit(CrawlInit):
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
result += datetime.timedelta(days=self.before_day)
return result
return result.date()
else:
return self.start_day()
@@ -60,452 +101,480 @@ class InstaInit(CrawlInit):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
return result
return result.date()
else:
return self.end_day()
class InstaBodyCrawler:
def __init__(self, driver=None):
class ListTag:
    """Fetches an Instagram tag page and pages through its post list via the
    /query/ AJAX endpoint, accumulating cookies across requests."""

    def __init__(self, url):
        self.__r = None          # last requests.Response
        self.__tag = ''          # tag name parsed from the URL
        self.__url = ''          # original tag page URL (used as AJAX referer)
        self.list_tag = []       # most recently parsed page of posts
        self.end_cursor = None   # pagination cursor for the next AJAX page
        self.has_next = False    # True while more pages remain
        self.cookies = {}
        self.load_url(url)

    def load_url(self, url):
        """GET the tag page HTML and parse the first page of posts."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('headers = ', end=' ')
            printl(instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")
        return self.list_tag

    def load_more(self):
        """POST the query endpoint for the next page using end_cursor."""
        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__set_cookies(self.__r.cookies)
        self.__r.raise_for_status()
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")
        return self.list_tag

    def __get_tag(self, url):
        """Extract the tag name from a .../explore/tags/<tag> URL."""
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        # Allows resuming pagination from an externally saved cursor.
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # Merge newly received cookies into the accumulated jar.
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_tag
class ListUser:
    """Fetches an Instagram user page and pages through the user's posts via
    the /query/ AJAX endpoint; mirrors ListTag but keyed by user id."""

    def __init__(self, url):
        self.__r = None          # last requests.Response
        self.__user = ''         # user id parsed out of the page by the parser
        self.__url = ''          # original user page URL (used as AJAX referer)
        self.list_user = []      # most recently parsed page of posts
        self.end_cursor = None   # pagination cursor for the next AJAX page
        self.has_next = False    # True while more pages remain
        self.cookies = {}
        self.load_url(url)

    def load_url(self, url):
        """GET the user page HTML and parse the first page of posts."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        return self.list_user

    def load_more(self):
        """POST the query endpoint for the next page using end_cursor."""
        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListUser Start>")
            printl("<ListUser requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        if is_debuging:
            printl("<ListUser response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListUser End>")
        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        return self.list_user

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        # Allows resuming pagination from an externally saved cursor.
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # Merge newly received cookies into the accumulated jar.
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_user
class InstaContent:
    """Fetches a single Instagram post page and its comments (replies).

    Replies are paged backwards through the /query/ AJAX endpoint using
    ``start_cursor`` / ``has_previous`` as returned by the parser.
    """

    def __init__(self, url, cookies, referer):
        self.__r = None            # last requests.Response
        self.__referer = ''        # referer URL sent with AJAX calls
        self.__code = ''           # post shortcode parsed from the URL
        self.body = None           # parsed post body dict
        self.reply = []            # parsed replies
        self.start_cursor = None   # cursor for loading earlier replies
        self.has_previous = False  # True while earlier replies remain
        self.cookies = {}          # cookie jar accumulated across requests
        self.load_url(url, cookies, referer)

    def load_url(self, url, cookies, referer):
        """GET the post page and parse the body plus the first reply page."""
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.__set_cookies(self.__r.cookies)
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def load_reply_more(self):
        """POST the query endpoint for the previous (earlier) reply page."""
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            # Fix: str() guards against start_cursor being None -- the original
            # plain concatenation raised TypeError (all sibling debug prints
            # already used str()).
            printl('start_cursor = ' + str(self.start_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl("<ContentReply End>")
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        """Extract the post shortcode from a .../p/<code> URL."""
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            # NOTE(review): message says 'Tag Error' but this parses the post
            # code; kept unchanged in case callers match on it.
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        # Merge newly received cookies into the accumulated jar.
        for k, v in cookies.items():
            self.cookies[k] = v
class InstaAlgorithm:
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
             reload_wait_second=2, num_of_load_content=12, page_down=50):
    """Shared state for the crawl algorithms.

    driver/browser: selenium driver and its factory (driver may be None).
    crawl_init: run configuration (dates, URLs, realtime flag).
    send_to_db: persistence layer.
    keyword_id: id stamped onto every stored row.
    reload_wait_second: delay between AJAX pagination requests.
    num_of_load_content: AJAX page size.
    page_down: PAGE_DOWN presses between "load more" retries (browser mode).
    """
    self.send_to_db = send_to_db
    self.crawl_init = crawl_init
    self.browser = browser
    self.driver = driver
    # Matches "YYYY-MM-DD?HH:MM:SS" at the start of an ISO-like timestamp
    # (the separator character is matched loosely with '.').
    self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
    self.keyword_id = keyword_id
    self.reload_wait_second = reload_wait_second
    self.num_of_load_content = num_of_load_content
    self.page_down = page_down
    self.list_crawl = []  # contents gathered for the current URL, flushed by crawl_list()
def set_driver(self, driver):
    """Replace the selenium driver in use."""
    self.driver = driver
def crawl_content(self, url, cookies, referer):
    """Crawl one post: fetch its body and all replies, then store them.

    Existing rows for the URL are deleted first, so re-crawling replaces data.
    """
    content = InstaContent(url, cookies, referer)
    body = content.get_body()
    replies = content.get_reply()
    body['article_url'] = url
    body['keyword_id'] = self.keyword_id
    # printl(body['article_url'])
    # Page backwards to the oldest reply; earlier pages are prepended.
    while content.has_previous:
        replies = content.load_reply_more() + replies
        wait(reply_wait_sec)
    # Stamp each reply with its parent post and its position.
    for j in range(0, len(replies)):
        replies[j]['article_url'] = body['article_url']
        replies[j]['platform_id'] = body['platform_id']
        replies[j]['article_order'] = j
    self.send_to_db.delete_url(body['article_url'])
    self.send_to_db.send_body(body)
    if replies:
        self.send_to_db.send_reply(replies)
    printl('ok')
    printl()
def set_article(self, article=None):
if article is None:
try:
self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
except Exception as e:
print_and_flush(e)
raise Exception
def start_crawl(self):
    """Run the full crawl, then release the driver and DB connection."""
    self.crawl()
    self.close()
def close(self):
    """Quit the browser (kept open while debugging) and close the DB."""
    if self.driver and not is_debuging:
        self.driver.quit()
    self.send_to_db.close()
    printl("Finished Crawling :)")
def crawl(self):
    """Subclasses implement the actual crawl loop."""
    raise NotImplementedError
def is_until_page(self):
if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
return True
else:
self.article = article
return False
def find_article_url(self):
    """Permalink of the currently selected article element."""
    a = self.article.find_element_by_xpath("div/section/a")
    return a.get_attribute("href")
def crawl_contents(self, contents_list, backup_set):
"""
:param contents_list:
:param backup_set:
:return: is_load_more
"""
old_elements = 0
for element in contents_list:
if element['date'].date() > self.crawl_init.get_end_day():
# printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
def find_article_profileurl(self):
    """Author's avatar image URL."""
    img = self.article.find_element_by_xpath("header/a/img[@src]")
    return img.get_attribute("src")
def find_article_nickname(self):
    """Author's display name (link text in the article header)."""
    a = self.article.find_element_by_xpath("header/div/a")
    return a.text
def find_article_date(self):
    """Post timestamp as 'YYYY-MM-DD HH:MM:SS' (zero timestamp if unparsable)."""
    el_time = self.article.find_element_by_xpath("div/section/a/time")
    str_time = el_time.get_attribute("datetime")
    m = self.re_date.search(str_time)
    if m is None:
        return "0000-00-00 00:00:00"
    else:
        return m.group(1) + " " + m.group(2)
def find_article_data(self):
    """Caption text, or '' when the post has no caption."""
    ul = self.article.find_element_by_xpath("div/ul")
    try:
        #li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
        span = ul.find_element_by_css_selector("li h1>span")
        return span.text
    except:
        return ""
def find_article_id(self):
    # The author's user id doubles as the article id for posts.
    return self.find_platform_id()
def find_platform_name(self):
    """Constant platform tag for stored rows."""
    return 'instagram'
def find_article_form(self):
    """Row type: this crawler extracts post bodies."""
    return 'body'
def find_platform_id(self):
    """Author's user id taken from the profile link, or None."""
    a = self.article.find_element_by_xpath("header/div/a")
    if a:
        href = a.get_attribute("href")
        # Strip the site prefix and slashes, leaving the bare user id.
        str_id = href.replace(insta_url, "").replace("/", "")
        return str_id
    else:
        return None
def find_like_num(self):
div = self.article.find_element_by_xpath("div/section[1]/div")
try:
span = div.find_element_by_xpath("span/span")
str_num = span.text
str_num = str_num.replace(',', '')
if str_num[-1] == 'm':
num = float(str_num[0:-1]) * 1000000
elif str_num[-1] == 'k':
num = float(str_num[0:-1]) * 1000
elif element['date'].date() < self.crawl_init.get_begin_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
old_elements += 1
if old_elements > 6:
return False
else:
num = int(str_num)
return str(num)
except:
a_list = div.find_elements_by_tag_name("a")
if len(a_list) > 1:
return str(len(a_list))
else:
if a_list and a_list[0].get_attribute('title'):
return str(1)
else:
return str(0)
# span = div.find_element_by_xpath("span[1]")
# if len(span.text.strip()) < 1:
# return str(1)
# else:
# return str(0)
if not element['url'] in backup_set:
# printl(element['url'])
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
# wait(1.5)
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
self.list_crawl.append(element)
backup_set.add(element['url'])
if self.is_until_page():
return False
if self.list_crawl:
printl("Number of Lists = {0}".format(len(self.list_crawl)))
return True
def find_reply_num(self):
    """Reply count as a string (falls back to counting <li> items)."""
    ul = self.article.find_element_by_xpath("div/ul")
    lis = ul.find_elements_by_tag_name("li")
    if len(lis) < 2:
        return "0"
    try:
        li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
        span = li.find_element_by_xpath("button/span[2]")
        str_num = span.text.replace(",", "")
        return str_num
    except:
        # First <li> is the caption, the rest are replies.
        return str(len(lis) - 1)
def get_content(self):
    """Assemble the DB row dict for the current post."""
    content = dict()
    content["article_id"] = self.find_article_id()
    content["platform_id"] = self.find_platform_id()
    content["article_url"] = self.find_article_url()
    content["article_profileurl"] = self.find_article_profileurl()
    content["article_nickname"] = self.find_article_nickname()
    content["platform_name"] = self.find_platform_name()
    content["article_date"] = self.find_article_date()
    content["article_data"] = self.find_article_data()
    content["article_form"] = 'body'
    content["platform_form"] = 'post'
    content["platform_title"] = content["article_id"]
    reply_num = self.find_reply_num()
    if int(reply_num) > 0:
        content["article_order"] = int(reply_num)
    like_num = self.find_like_num()
    if int(float(like_num)) > 0:
        # NOTE(review): the like count is stored under "reply_url" -- looks
        # like a schema quirk; confirm against the DB layer.
        content["reply_url"] = int(float(like_num))
    return content
def find_platform_title(self):
    """Not extracted for Instagram posts."""
    pass
def find_article_title(self):
    """Not extracted for Instagram posts."""
    pass
class InstaReplyCrawler:
def __init__(self, driver=None, article=None):
    """Collects reply rows from the currently displayed article element."""
    self.driver = driver
    # NOTE(review): attribute is named 'activity' while the parameter is
    # 'article' -- confirm which name other code reads.
    self.activity = article
    self.reply_list = list()  # accumulated reply row dicts
def find_init(self):
    """Reset the accumulated replies before crawling a new post."""
    self.reply_list.clear()
def set_driver(self, driver):
    """Replace the selenium driver in use."""
    self.driver = driver
def set_article(self, article=None):
if article is None:
def crawl_list(self):
if self.list_crawl:
printl()
printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
printl("Total gathered contents = {0}".format(len(self.list_crawl)))
printl()
for element in self.list_crawl:
try:
self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
wait(body_wait_sec)
self.crawl_content(element['url'], {}, element['url'])
except Exception as e:
print_and_flush(e)
raise Exception
printl(e)
logging.info(e)
class InstaAlgorithmNormal(InstaAlgorithm):
    """HTTP-only crawl algorithm (requests + AJAX paging, no browser)."""

    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)
        # Normal mode works over HTTP only; shut down any passed-in browser.
        if self.driver:
            self.driver.quit()

    def crawl(self):
        """Crawl every configured URL; loop forever in realtime mode.

        On error the same URL is retried (i is not advanced); the pagination
        cursor is preserved across retries via ``end_cursor``.
        """
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # Fix: track the crawler created in this iteration so the
                # except-handler cannot hit an unbound name when the first
                # request itself fails.
                list_crawler = None
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load
                    while is_load_more:
                        if end_cursor:
                            # Resume from the cursor saved by a failed attempt.
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    self.crawl_list()
                    self.list_crawl.clear()
                    i += 1
                except Exception as e:
                    logging.info(e)
                    if list_crawler is not None:
                        end_cursor = list_crawler.end_cursor
                        # Fix: str() -- cursor may be None, and plain
                        # concatenation raised TypeError inside the handler.
                        printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
class InstaAlgorithmBrowser(InstaAlgorithm):
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second, num_of_load_content, page_down)
def url_load(self, url):
if insta_tag_url in url:
list_tag = ListTag(url)
insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
return list_tag, insta_list, end_cursor, has_next
else:
self.article = article
list_user = ListUser(url)
insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
return list_user, insta_list, end_cursor, has_next
def has_more(self, ul):
    """Return True when a "load more" button is still present inside *ul*."""
    try:
        ul.find_element_by_css_selector("li>button")
    except Exception:
        return False
    return True
def read_more_reply(self, ul):
    """Click the "load more" button inside *ul* once (errors are printed)."""
    try:
        button = ul.find_element_by_css_selector("li>button")
        enter_element(button)
    except Exception as e:
        print_and_flush(e)
def read_all_reply(self, ul):
    """Keep clicking "load more" until no button remains (capped at 200)."""
    for _ in range(200):
        if not self.has_more(ul):
            break
        self.read_more_reply(ul)
def get_reply_ul(self):
    """The <ul> element holding the reply list of the current article."""
    ul = self.article.find_element_by_xpath("div/ul")
    return ul
def has_reply(self, ul):
    """True when *ul* contains at least one reply (an <li> with a link)."""
    try:
        lis = ul.find_elements_by_css_selector("li>a")
        if len(lis) > 0:
            return True
    except:
        return False
    return False
def crawl_all(self):
    """Expand and collect every reply of the currently displayed article."""
    self.find_init()
    self.set_article()
    try:
        ul = self.get_reply_ul()
        if self.has_reply(ul):
            self.read_all_reply(ul)
            self.crawl_reply(ul)
    except Exception as e:
        print_and_flush(e)
def crawl_reply(self, ul):
    """Build a reply row dict per (text, author) pair found in *ul*."""
    article_data = self.find_article_data(ul)
    article_id = self.find_article_id(ul)
    # Texts and authors are scraped independently; warn when they diverge.
    if len(article_data) != len(article_id):
        print_and_flush("article_data != article_id")
    for i in range(0, len(article_id)):
        content = dict()
        content["article_data"] = article_data[i]
        content["article_id"] = article_id[i]
        content["article_nickname"] = article_id[i]
        content["platform_name"] = "instagram"
        content["platform_form"] = "post"
        content["article_form"] = 'reply'
        content["article_order"] = i
        self.reply_list.append(content)
def get_content(self):
    """Return the reply rows accumulated by crawl_reply()."""
    return self.reply_list
def find_article_id(self, ul):
    """Reply authors: the link text of each <li> in *ul*."""
    id_list = list()
    a_list = ul.find_elements_by_xpath("li/a")
    for i in a_list:
        id_list.append(i.text)
    return id_list
def find_article_profileurl(self, ul):
    """Not extracted for replies."""
    pass
def find_article_nickname(self, ul):
    # Reply nicknames are the same link texts as the ids.
    return self.find_article_id(ul)
def find_article_data(self, ul):
    """Reply texts: the <span> content of each <li> in *ul*."""
    data_list = list()
    span_list = ul.find_elements_by_css_selector("li>span")
    for i in span_list:
        data_list.append(i.text)
    return data_list
def find_article_url(self, ul):
    """Not extracted for replies (parent URL is stamped on later)."""
    pass
def find_platform_id(self, ul):
    """Not extracted for replies (parent id is stamped on later)."""
    pass
def find_article_form(self, ul=None):
    """Row type: these rows are replies."""
    return 'reply'
def find_platform_name(self, ul=None):
    """Constant platform tag for stored rows."""
    return 'instagram'
def find_platform_form(self, ul=None):
    """Constant platform form for stored rows."""
    return 'post'
def click_element(self, element):
    """Click *element* at its top-left corner via ActionChains, then pause."""
    ac = ActionChains(self.driver)
    ac.move_to_element_with_offset(element, 0, 0).click().perform()
    wait(2)
class InstaPageCrawler:
def __init__(self, driver=None, begin_date=None, end_date=None):
    """Paginates single-post pages, tracking visited URLs and a date window."""
    self.driver = driver
    self.url_set = set()          # URLs already crawled in this pass
    self.begin_date = begin_date  # inclusive lower bound of the window
    self.end_date = end_date      # exclusive upper bound of the window
    # Matches "YYYY-MM-DD?HH:MM:SS" at the start of a timestamp string.
    self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
def set_driver(self, driver):
    """Attach the selenium driver to paginate with."""
    self.driver = driver
def find_article_url(self):
    """Permalink of the post currently shown (waits up to 60s)."""
    a = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article/div/section/a', 60)
    return a.get_attribute("href")
def init(self):
    """Reset the visited-URL set before a new pass."""
    self.url_set.clear()
def set_date(self, begin_date, end_date):
    """Set both ends of the crawl date window."""
    self.set_begin_date(begin_date)
    self.set_end_date(end_date)
def set_end_date(self, end_date):
if type(end_date) == str:
self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
self.end_date = end_date
else:
self.end_date = datetime.datetime.today()
self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
self.end_date += datetime.timedelta(days=1)
def set_begin_date(self, begin_date):
if type(begin_date) == str:
self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
self.begin_date = begin_date
else:
self.begin_date = datetime.datetime.today()
self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)
def has_next(self):
    """True when the right (next-post) pagination arrow is present.

    Fix: the CSS attribute selector was missing its closing ']'
    ("a[class$='RightPaginationArrow'"), which is an invalid selector and
    made the lookup fail unconditionally.
    """
    try:
        a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
        return True
    except:
        return False
def move_next(self):
    """Click the right pagination arrow; True on success.

    Fix: same malformed CSS selector as has_next() -- the closing ']' was
    missing, so the click could never be attempted.
    """
    try:
        a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
        enter_element(a)
        return True
    except:
        return False
def has_first_page(self):
    """Open the first (most recent) post thumbnail; True on success.

    The selectors target obfuscated Instagram class names and are fragile;
    the commented-out lines are earlier selector generations kept for reference.
    """
    try:
        #a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
        #a = find_element_by_css_selector(self.driver, "a[class='_8mlbc _vbtk2 _t5r8b']")
        a = find_element_by_css_selector(self.driver, "div[class='_8fxp6'] a[class='_8mlbc _vbtk2 _t5r8b']")
        enter_element(a)
        return True
    except:
        return False
def crawling_ok(self, url):
    """Mark *url* as successfully crawled."""
    self.url_set.add(url)
def is_earlier(self, time_date):
return True if time_date < self.begin_date else False
def is_late(self, time_date):
return True if time_date > self.end_date else False
def find_article_date(self):
    """Timestamp of the shown post as 'YYYY-MM-DD HH:MM:SS' (waits up to 60s;
    returns the zero timestamp when the attribute does not match)."""
    el_time = find_element_by_xpath(self.driver, "/html/body/div/div/div/div/article/div/section/a/time", 60)
    str_time = el_time.get_attribute("datetime")
    m = self.re_date.search(str_time)
    if m is None:
        return "0000-00-00 00:00:00"
    else:
        return m.group(1) + " " + m.group(2)
def crawl(self):
real_time = True
while real_time:
url_list = self.crawl_init.make_url()
i = 0
end_cursor = None
backup_set = set()
while i < len(url_list):
# first connect
try:
wait(3)
printl(url_list[i] + "\n")
self.driver.get(url_list[i])
wait(5)
list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
list_crawler.set_end_cursor(end_cursor2)
list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()}
# ajax load
page_down = 0
while is_load_more:
if page_down == self.page_down:
page_down = 0
try:
focus_driver(self.driver)
click_insta_load_more(self.driver)
except:
push_page_down(self.driver)
page_down += 1
if end_cursor:
list_crawler.end_cursor = end_cursor
end_cursor = None
wait(self.reload_wait_second)
insta_list = list_crawler.load_more()
# printl("list length = " + str(len(insta_list)))
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
# printl("number of backup_set = {0}".format(len(backup_set)))
i += 1
self.crawl_list()
self.list_crawl.clear()
except Exception as e:
logging.info(e)
end_cursor = list_crawler.end_cursor
printl('end_cursor=' + end_cursor)
if e.args:
wait(300)
if self.driver:
self.driver.close()
wait(3)
self.driver = self.browser.new_browser()
real_time = self.crawl_init.is_realtime()
printl("Finished Crawling :)")
class InstaMainCrawler:
def __init__(self):
    """Wire up the page/body/reply crawlers with shared DB and browser helpers.

    Fix: Browser() was instantiated twice and the first instance discarded;
    a single instance is created now.
    """
    self.page_crawler = InstaPageCrawler()
    self.body_crawler = InstaBodyCrawler()
    self.reply_crawler = InstaReplyCrawler()
    self.send_to_db = SendtoDB()
    self.browser = Browser()
    self.crawl_init = InstaInit()
    self.driver = None
def set_driver(self, driver):
    """Propagate a new selenium driver to every sub-crawler."""
    self.page_crawler.set_driver(driver)
    self.body_crawler.set_driver(driver)
    self.reply_crawler.set_driver(driver)
    self.driver = driver
def set_keyword_id(self, keyword_id):
    """Remember the keyword id stamped onto every stored row."""
    self.keyword_id = keyword_id
def crawl_all(self, backup_set=None):
    """Walk posts via the next-arrow, crawling each one inside the date window.

    Stops when a previously seen URL has no next page, when a post older
    than the window is reached, or when pagination runs out.
    """
    self.page_crawler.init()
    if backup_set:
        # Resume: pre-seed the visited set so known posts are skipped.
        self.page_crawler.url_set = backup_set.copy()
    if not self.page_crawler.has_first_page():
        return
    while True:
        str_date = self.page_crawler.find_article_date()
        date_val = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
        print_and_flush(str_date)
        # Already crawled: advance, or stop when no next page remains.
        if self.page_crawler.find_article_url() in self.page_crawler.url_set:
            if self.page_crawler.has_next():
                self.page_crawler.move_next()
                continue
            else:
                break
        #if self.page_crawler.is_earlier(date_val.date()):
        # Newer than the window: skip forward.
        if self.page_crawler.is_late(date_val):
            if self.page_crawler.has_next():
                self.page_crawler.move_next()
                continue
            else:
                break
        #if self.page_crawler.is_late(date_val.date()):
        # Older than the window: stop the pass.
        if self.page_crawler.is_earlier(date_val):
            break
        try:
            body_content = self.crawl_body()
            self.crawl_reply(body_content)
            self.page_crawler.url_set.add(body_content["article_url"])
            print_and_flush("ok")
        except Exception as e:
            # Best-effort: log the failure and keep paginating.
            print_and_flush('fail')
            print_and_flush(e)
        if self.page_crawler.has_next():
            self.page_crawler.move_next()
        else:
            break
def crawl_body(self):
    """Crawl the shown post's body, store it, and return the row dict."""
    self.body_crawler.set_driver(self.driver)
    self.body_crawler.set_article()
    content = self.body_crawler.get_content()
    content["keyword_id"] = self.keyword_id
    print_and_flush(content["article_url"])
    # Replace any previous crawl of this URL.
    self.send_to_db.delete_url(content['article_url'])
    self.send_to_db.send_body(content)
    return content
def crawl_reply(self, body_content):
    """Crawl all replies of the shown post and store them linked to *body_content*.

    (Removed a dead trailing ``pass`` statement.)
    """
    self.reply_crawler.set_driver(self.driver)
    self.reply_crawler.crawl_all()
    content_list = self.reply_crawler.get_content()
    if content_list:
        # Stamp each reply with its parent post before sending.
        for i in content_list:
            i['article_url'] = body_content['article_url']
            i['platform_id'] = body_content['platform_id']
        self.send_to_db.send_reply(content_list)
def start(self):
    """Entry point: run the crawler loop."""
    self.crawler_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
    """Apply all launch arguments in one call."""
    self.init_browser(browser)
    self.init_keyword_id(keyword_id)
    self.init_db(db_num)
    self.init_before_day(before_day)
    self.init_until_page(until_page)
    # NOTE(review): init_browser is invoked a second time here -- looks like
    # a merge artifact; confirm whether a single initialization suffices.
    self.init_browser(browser)
def set_driver(self, driver):
    # NOTE(review): this re-definition overrides the earlier set_driver that
    # also propagated the driver to the sub-crawlers; likely a diff/merge
    # artifact -- confirm which definition is intended.
    self.driver = driver
def init_browser(self, browser):
    """Create a new driver of the requested browser type."""
    # NOTE(review): the driver is created twice here (once unguarded, once in
    # the try) -- likely a merge artifact; confirm only the guarded call is
    # needed.
    self.set_driver(self.browser.get_new_driver(browser))
    try:
        self.set_driver(self.browser.get_new_driver(browser))
    except Exception as e:
        logging.info(e)
def init_keyword_id(self, keyword_id):
if type(keyword_id) != int:
@@ -525,30 +594,10 @@ class InstaMainCrawler:
self.crawl_init.set_until_page(until_page)
def crawler_start(self):
real_time = True
while real_time:
print_and_flush("Crawling Start")
url_list = self.crawl_init.make_url()
i = 0
backup_set = set()
while i < len(url_list):
try:
print_and_flush(url_list[i] + "\n")
self.driver.get(url_list[i])
wait(3)
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
end_date=self.crawl_init.get_end_day())
if self.page_crawler.has_first_page():
self.crawl_all(backup_set)
i += 1
backup_set.clear()
except Exception as e:
print_and_flush(e)
backup_set = self.page_crawler.url_set.copy()
self.driver.quit()
self.set_driver(self.browser.new_browser())
wait(5)
real_time = self.crawl_init.is_realtime()
print_and_flush("Finished Crawling :)")
self.send_to_db.close()
#self.driver.quit()
if self.driver:
algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
else:
algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
algorithm.start_crawl()

View File

@@ -0,0 +1,556 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import re
import datetime
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_xpath
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
from base.baseclasses import Browser
from selenium.webdriver.common.action_chains import ActionChains
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
class InstaInit(CrawlInit):
    """Instagram-specific crawl configuration: builds the search URLs and
    resolves the begin/end days of the crawl window."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # Platform code -> base URL (9: tag search, 10: user page).
        self.urls = dict()
        self.urls[9] = insta_tag_url
        self.urls[10] = insta_url

    def split_searches(self):
        """Split the comma-separated search string into cleaned terms.

        User searches (platform 10) are whitespace-trimmed; other terms are
        passed through utf8().
        """
        terms = self.searches().split(',')
        if self.platform() == 10:
            return [term.strip() for term in terms]
        return [self.utf8(term) for term in terms]

    def make_url(self):
        """Build one crawl URL per search term."""
        base = self.urls[self.platform()]
        return [base + term for term in self.split_searches()]

    def get_begin_day(self):
        """Window start: midnight today shifted by before_day in realtime
        mode, otherwise the configured start day."""
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        midnight = datetime.datetime(year=now.year, month=now.month, day=now.day)
        return midnight + datetime.timedelta(days=self.before_day)

    def get_end_day(self):
        """Window end: midnight today in realtime mode, otherwise the
        configured end day."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day)
class InstaBodyCrawler:
    """Extracts the post ("body") fields from the currently displayed
    Instagram <article> element via selenium."""

    def __init__(self, driver=None):
        self.driver = driver
        # Matches "YYYY-MM-DD?HH:MM:SS" at the start of the time element's
        # datetime attribute (the separator is matched loosely with '.').
        self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")

    def set_driver(self, driver):
        """Replace the selenium driver in use."""
        self.driver = driver

    def set_article(self, article=None):
        """Locate the <article> element (waits up to 10s) or use the given one."""
        if article is None:
            try:
                self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
            except Exception as e:
                print_and_flush(e)
                raise Exception
        else:
            self.article = article

    def find_article_url(self):
        """Permalink of the post."""
        a = self.article.find_element_by_xpath("div/section/a")
        return a.get_attribute("href")

    def find_article_profileurl(self):
        """Author's avatar image URL."""
        img = self.article.find_element_by_xpath("header/a/img[@src]")
        return img.get_attribute("src")

    def find_article_nickname(self):
        """Author's display name (link text in the article header)."""
        a = self.article.find_element_by_xpath("header/div/a")
        return a.text

    def find_article_date(self):
        """Post timestamp as 'YYYY-MM-DD HH:MM:SS' (zero timestamp if unparsable)."""
        el_time = self.article.find_element_by_xpath("div/section/a/time")
        str_time = el_time.get_attribute("datetime")
        m = self.re_date.search(str_time)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            return m.group(1) + " " + m.group(2)

    def find_article_data(self):
        """Caption text, or '' when the post has no caption."""
        ul = self.article.find_element_by_xpath("div/ul")
        try:
            #li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
            span = ul.find_element_by_css_selector("li h1>span")
            return span.text
        except:
            return ""

    def find_article_id(self):
        # The author's user id doubles as the article id for posts.
        return self.find_platform_id()

    def find_platform_name(self):
        """Constant platform tag for stored rows."""
        return 'instagram'

    def find_article_form(self):
        """Row type: this crawler extracts post bodies."""
        return 'body'

    def find_platform_id(self):
        """Author's user id taken from the profile link, or None."""
        a = self.article.find_element_by_xpath("header/div/a")
        if a:
            href = a.get_attribute("href")
            # Strip the site prefix and slashes, leaving the bare user id.
            str_id = href.replace(insta_url, "").replace("/", "")
            return str_id
        else:
            return None

    def find_like_num(self):
        """Like count as a string; handles 'k'/'m' suffixes and comma grouping.

        When no counter span exists, falls back to counting liker links.
        NOTE(review): k/m-suffixed counts yield a float-formatted string like
        '1200000.0'; callers convert via int(float(...)).
        """
        div = self.article.find_element_by_xpath("div/section[1]/div")
        try:
            span = div.find_element_by_xpath("span/span")
            str_num = span.text
            str_num = str_num.replace(',', '')
            if str_num[-1] == 'm':
                num = float(str_num[0:-1]) * 1000000
            elif str_num[-1] == 'k':
                num = float(str_num[0:-1]) * 1000
            else:
                num = int(str_num)
            return str(num)
        except:
            a_list = div.find_elements_by_tag_name("a")
            if len(a_list) > 1:
                return str(len(a_list))
            else:
                if a_list and a_list[0].get_attribute('title'):
                    return str(1)
                else:
                    return str(0)
            # span = div.find_element_by_xpath("span[1]")
            # if len(span.text.strip()) < 1:
            #     return str(1)
            # else:
            #     return str(0)

    def find_reply_num(self):
        """Reply count as a string (falls back to counting <li> items)."""
        ul = self.article.find_element_by_xpath("div/ul")
        lis = ul.find_elements_by_tag_name("li")
        if len(lis) < 2:
            return "0"
        try:
            li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
            span = li.find_element_by_xpath("button/span[2]")
            str_num = span.text.replace(",", "")
            return str_num
        except:
            # First <li> is the caption, the rest are replies.
            return str(len(lis) - 1)

    def get_content(self):
        """Assemble the DB row dict for the current post."""
        content = dict()
        content["article_id"] = self.find_article_id()
        content["platform_id"] = self.find_platform_id()
        content["article_url"] = self.find_article_url()
        content["article_profileurl"] = self.find_article_profileurl()
        content["article_nickname"] = self.find_article_nickname()
        content["platform_name"] = self.find_platform_name()
        content["article_date"] = self.find_article_date()
        content["article_data"] = self.find_article_data()
        content["article_form"] = 'body'
        content["platform_form"] = 'post'
        content["platform_title"] = content["article_id"]
        reply_num = self.find_reply_num()
        if int(reply_num) > 0:
            content["article_order"] = int(reply_num)
        like_num = self.find_like_num()
        if int(float(like_num)) > 0:
            # NOTE(review): the like count is stored under "reply_url" --
            # looks like a schema quirk; confirm against the DB layer.
            content["reply_url"] = int(float(like_num))
        return content

    def find_platform_title(self):
        """Not extracted for Instagram posts."""
        pass

    def find_article_title(self):
        """Not extracted for Instagram posts."""
        pass
class InstaReplyCrawler:
    """Expands and collects the comments (replies) of the currently
    displayed Instagram article."""

    def __init__(self, driver=None, article=None):
        self.driver = driver
        self.activity = article
        self.reply_list = list()  # accumulated reply records for get_content()

    def find_init(self):
        """Reset the collected replies before a new crawl."""
        self.reply_list.clear()

    def set_driver(self, driver):
        self.driver = driver

    def set_article(self, article=None):
        """Adopt the given <article> element, or locate it in the DOM.

        Raises Exception when the article element cannot be found.
        """
        if article is None:
            try:
                self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
            except Exception as e:
                print_and_flush(e)
                # Fix: raise with a message and chained cause instead of a
                # bare, context-free `raise Exception`.
                raise Exception("instagram article element not found") from e
        else:
            self.article = article

    def has_more(self, ul):
        """True while a 'load more comments' button is present."""
        try:
            ul.find_element_by_css_selector("li>button")
            return True
        except Exception:
            return False

    def read_more_reply(self, ul):
        """Click the 'load more comments' button once (best effort)."""
        try:
            button = ul.find_element_by_css_selector("li>button")
            enter_element(button)
        except Exception as e:
            print_and_flush(e)

    def read_all_reply(self, ul):
        """Keep expanding comments, capped at 200 clicks to avoid endless loops."""
        i = 0
        while i < 200 and self.has_more(ul):
            self.read_more_reply(ul)
            i += 1

    def get_reply_ul(self):
        return self.article.find_element_by_xpath("div/ul")

    def has_reply(self, ul):
        """True when at least one comment author link is visible."""
        try:
            lis = ul.find_elements_by_css_selector("li>a")
            if len(lis) > 0:
                return True
        except Exception:  # fix: was a bare except
            return False
        return False

    def crawl_all(self):
        """Locate the article, expand all comments and collect them."""
        self.find_init()
        self.set_article()
        try:
            ul = self.get_reply_ul()
            if self.has_reply(ul):
                self.read_all_reply(ul)
                self.crawl_reply(ul)
        except Exception as e:
            print_and_flush(e)

    def crawl_reply(self, ul):
        """Pair up author ids and comment texts into reply records."""
        article_data = self.find_article_data(ul)
        article_id = self.find_article_id(ul)
        if len(article_data) != len(article_id):
            print_and_flush("article_data != article_id")
        for i in range(0, len(article_id)):
            content = dict()
            content["article_data"] = article_data[i]
            content["article_id"] = article_id[i]
            content["article_nickname"] = article_id[i]
            content["platform_name"] = "instagram"
            content["platform_form"] = "post"
            content["article_form"] = 'reply'
            content["article_order"] = i
            self.reply_list.append(content)

    def get_content(self):
        return self.reply_list

    def find_article_id(self, ul):
        """Author ids of the visible comments (link texts)."""
        id_list = list()
        a_list = ul.find_elements_by_xpath("li/a")
        for i in a_list:
            id_list.append(i.text)
        return id_list

    def find_article_profileurl(self, ul):
        pass

    def find_article_nickname(self, ul):
        return self.find_article_id(ul)

    def find_article_data(self, ul):
        """Comment texts of the visible comments."""
        data_list = list()
        span_list = ul.find_elements_by_css_selector("li>span")
        for i in span_list:
            data_list.append(i.text)
        return data_list

    def find_article_url(self, ul):
        pass

    def find_platform_id(self, ul):
        pass

    def find_article_form(self, ul=None):
        return 'reply'

    def find_platform_name(self, ul=None):
        return 'instagram'

    def find_platform_form(self, ul=None):
        return 'post'

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)
class InstaPageCrawler:
    """Walks the post-detail pagination arrows and tracks visited URLs plus
    the begin/end crawl window used to decide when to stop."""

    def __init__(self, driver=None, begin_date=None, end_date=None):
        self.driver = driver
        self.url_set = set()            # URLs already crawled this session
        self.begin_date = begin_date
        self.end_date = end_date
        # Matches "YYYY-MM-DD?HH:MM:SS" from the <time datetime="..."> attribute.
        self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")

    def set_driver(self, driver):
        self.driver = driver

    def find_article_url(self):
        """Permalink of the currently displayed post."""
        a = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article/div/section/a', 60)
        return a.get_attribute("href")

    def init(self):
        self.url_set.clear()

    def set_date(self, begin_date, end_date):
        self.set_begin_date(begin_date)
        self.set_end_date(end_date)

    def set_end_date(self, end_date):
        """Normalize end_date to midnight and push it one day forward so that
        is_late() is exclusive only past the whole end day."""
        if type(end_date) == str:
            self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
            self.end_date = end_date
        else:
            self.end_date = datetime.datetime.today()
        self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
        self.end_date += datetime.timedelta(days=1)

    def set_begin_date(self, begin_date):
        """Normalize begin_date (str / datetime / date / None=today) to midnight."""
        if type(begin_date) == str:
            self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
        elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
            self.begin_date = begin_date
        else:
            self.begin_date = datetime.datetime.today()
        self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)

    def has_next(self):
        """True when a right-arrow (next post) link exists."""
        try:
            # BUG FIX: the selector was missing its closing ']'
            # ("a[class$='RightPaginationArrow'"), so find_element always
            # raised and this method always returned False.
            find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
            return True
        except Exception:
            return False

    def move_next(self):
        """Click the right-arrow to advance to the next post."""
        try:
            # BUG FIX: same missing ']' as in has_next().
            a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
            enter_element(a)
            return True
        except Exception:
            return False

    def has_first_page(self):
        """Open the first thumbnail of the most-recent section, if present."""
        try:
            a = find_element_by_css_selector(self.driver, "div[class='_8fxp6'] a[class='_8mlbc _vbtk2 _t5r8b']")
            enter_element(a)
            return True
        except Exception:
            return False

    def crawling_ok(self, url):
        """Mark url as successfully crawled."""
        self.url_set.add(url)

    def is_earlier(self, time_date):
        """True when time_date falls before the crawl window."""
        return time_date < self.begin_date

    def is_late(self, time_date):
        """True when time_date falls after the crawl window."""
        return time_date > self.end_date

    def find_article_date(self):
        """Post datetime as 'YYYY-MM-DD HH:MM:SS'; zeros when unparseable."""
        el_time = find_element_by_xpath(self.driver, "/html/body/div/div/div/div/article/div/section/a/time", 60)
        str_time = el_time.get_attribute("datetime")
        m = self.re_date.search(str_time)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            return m.group(1) + " " + m.group(2)
class InstaMainCrawler:
    """Top-level Selenium crawler: walks search-result pages, crawls each
    post body and its replies, and ships the records to the DB.

    Collaborators (page/body/reply crawlers, DB sender, browser factory,
    keyword config) are created eagerly and share one webdriver.
    """
    def __init__(self):
        self.page_crawler = InstaPageCrawler()
        self.body_crawler = InstaBodyCrawler()
        self.reply_crawler = InstaReplyCrawler()
        self.send_to_db = SendtoDB()
        self.browser = Browser()
        self.crawl_init = InstaInit()
        self.driver = None
    def set_driver(self, driver):
        # One shared webdriver for all sub-crawlers.
        self.page_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.driver = driver
    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id
    def crawl_all(self, backup_set=None):
        """Iterate posts via the pagination arrows until the date window is
        left or pagination ends; backup_set seeds already-crawled URLs so a
        restarted session skips them."""
        self.page_crawler.init()
        if backup_set:
            self.page_crawler.url_set = backup_set.copy()
        if not self.page_crawler.has_first_page():
            return
        while True:
            str_date = self.page_crawler.find_article_date()
            date_val = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
            print_and_flush(str_date)
            # Already crawled: just advance.
            if self.page_crawler.find_article_url() in self.page_crawler.url_set:
                if self.page_crawler.has_next():
                    self.page_crawler.move_next()
                    continue
                else:
                    break
            # if self.page_crawler.is_earlier(date_val.date()):
            # Newer than the window: skip forward (results run newest-first).
            if self.page_crawler.is_late(date_val):
                if self.page_crawler.has_next():
                    self.page_crawler.move_next()
                    continue
                else:
                    break
            # if self.page_crawler.is_late(date_val.date()):
            # Older than the window: everything after this is older too.
            if self.page_crawler.is_earlier(date_val):
                break
            try:
                wait(3)
                body_content = self.crawl_body()
                self.crawl_reply(body_content)
                self.page_crawler.url_set.add(body_content["article_url"])
                print_and_flush("ok")
            except Exception as e:
                print_and_flush('fail')
                print_and_flush(e)
            if self.page_crawler.has_next():
                self.page_crawler.move_next()
            else:
                break
    def crawl_body(self):
        """Crawl the currently displayed post body and send it to the DB.

        The previous record for the same URL is deleted first (re-crawl
        replaces, not duplicates)."""
        self.body_crawler.set_driver(self.driver)
        self.body_crawler.set_article()
        content = self.body_crawler.get_content()
        content["keyword_id"] = self.keyword_id
        print_and_flush(content["article_url"])
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
        return content
    def crawl_reply(self, body_content):
        """Crawl all replies of the current post, tagging each with the body's
        URL and platform id, then send them to the DB."""
        self.reply_crawler.set_driver(self.driver)
        self.reply_crawler.crawl_all()
        content_list = self.reply_crawler.get_content()
        if content_list:
            for i in content_list:
                i['article_url'] = body_content['article_url']
                i['platform_id'] = body_content['platform_id']
            self.send_to_db.send_reply(content_list)
    def start(self):
        self.crawler_start()
    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """One-shot configuration of browser, keyword, DB and window limits."""
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)
    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))
    def init_keyword_id(self, keyword_id):
        if type(keyword_id) != int:
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        # Load keyword parameters once, then drop the config DB connection.
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()
    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)
    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)
    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)
    def crawler_start(self):
        """Main loop: crawl every search URL; on failure restart the browser
        and retry the same URL, carrying over the visited-URL set. Repeats
        forever while the keyword is configured as realtime."""
        real_time = True
        while real_time:
            print_and_flush("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    print_and_flush(url_list[i] + "\n")
                    wait(3)
                    self.driver.get(url_list[i])
                    wait(5)
                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
                                               end_date=self.crawl_init.get_end_day())
                    if self.page_crawler.has_first_page():
                        self.crawl_all(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    print_and_flush(e)
                    # Remember progress, restart the browser, retry same URL.
                    backup_set = self.page_crawler.url_set.copy()
                    self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.send_to_db.close()
        #self.driver.quit()

View File

@@ -0,0 +1,426 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """print() wrapper that flushes by default, so log lines show up
    immediately when stdout is piped or redirected."""
    print(*objects, sep=sep, end=end, file=file, flush=flush)
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
class InstaInit(CrawlInit):
    """Crawl-parameter holder for Instagram: maps the configured platform id
    to a base URL and turns the keyword list into crawl URLs."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> base URL (9: hashtag explore page, 10: user page)
        self.urls = {9: insta_tag_url, 10: insta_url}

    def split_searches(self):
        """Split the comma-separated keyword string: user ids are stripped,
        hashtag terms are utf8-encoded for the URL."""
        pieces = self.searches().split(',')
        if self.platform() == 10:
            return [piece.strip() for piece in pieces]
        return [self.utf8(piece) for piece in pieces]

    def make_url(self):
        """Build one crawl URL per keyword from the platform base URL."""
        base = self.urls[self.platform()]
        return [base + term for term in self.split_searches()]

    def get_begin_day(self):
        """Begin of the crawl window: today shifted by before_day in realtime
        mode, otherwise the configured start day."""
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        begin = datetime.datetime(now.year, now.month, now.day)
        return begin + datetime.timedelta(days=self.before_day)

    def get_end_day(self):
        """End of the crawl window: today at midnight in realtime mode,
        otherwise the configured end day."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(now.year, now.month, now.day)
class ListTag:
    """Loads a hashtag explore page over plain HTTP (no browser), then pages
    through further results with POSTs to the /query/ AJAX endpoint,
    accumulating session cookies between requests."""
    def __init__(self, url):
        self.__r = None           # last requests.Response
        self.__tag = ''           # hashtag parsed out of the page URL
        self.__url = ''           # original page URL (reused as AJAX referer)
        self.list_tag = []        # most recently parsed list of post entries
        self.end_cursor = None    # pagination cursor for the next load_more()
        self.has_next = False     # True while further pages exist
        self.cookies = {}         # cookies accumulated over all requests
        self.load_url(url)
    def load_url(self, url):
        """GET the tag page, store its cookies and parse the embedded
        _sharedData JSON into list_tag / end_cursor / has_next."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        printl("<ListTag Start>")
        printl("<ListTag requests>")
        printl('headers = ', end=' ')
        printl(instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        printl("<ListTag response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', end='')
        printl(self.has_next)
        printl("<ListTag End>")
        return self.list_tag
    def load_more(self):
        """POST the next page request (12 items) using the stored end_cursor."""
        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, 12)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        printl("<ListTag Start>")
        printl("<ListTag requests>")
        printl('end_cursor = ' + str(self.end_cursor))
        printl('form_data' + form_data)
        printl('headers = ', end=' ')
        printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        # NOTE(review): cookies are stored before raise_for_status() here —
        # the opposite order of load_url(); confirm this is intentional.
        self.__set_cookies(self.__r.cookies)
        self.__r.raise_for_status()
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        printl("<ListTag response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', end='')
        printl(self.has_next)
        printl("<ListTag End>")
        return self.list_tag
    def __get_tag(self, url):
        """Extract the hashtag path segment; RuntimeError for non-tag URLs."""
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')
    def get_cookies(self):
        return self.cookies
    def get_url(self):
        return self.__url
    def set_end_cursor(self, cursor):
        self.end_cursor = cursor
    def get_end_cursor(self):
        return self.end_cursor
    def __set_cookies(self, cookies):
        # Merge (not replace) so session cookies survive across requests.
        for k, v in cookies.items():
            self.cookies[k] = v
    def get_list(self):
        return self.list_tag
class ListUser:
    """Loads a user profile page over plain HTTP and pages through the
    user's media with POSTs to the /query/ AJAX endpoint."""
    def __init__(self, url):
        self.__r = None           # last requests.Response
        self.__user = ''          # numeric user id parsed from the page
        self.__url = ''           # original page URL (reused as AJAX referer)
        self.list_user = []       # most recently parsed list of post entries
        self.end_cursor = None    # pagination cursor for the next load_more()
        self.has_next = False     # True while further pages exist
        self.cookies = {}         # cookies accumulated over all requests
        self.load_url(url)
    def load_url(self, url):
        """GET the profile page and parse the embedded _sharedData JSON."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        return self.list_user
    def load_more(self):
        """POST the next page request (24 items) using the stored end_cursor."""
        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, 24)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        printl("<ListUser Start>")
        printl("<ListUser requests>")
        printl('end_cursor = ' + str(self.end_cursor))
        printl('form_data' + form_data)
        printl('headers = ', end=' ')
        printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        # NOTE(review): this debug block prints end_cursor/has_next BEFORE the
        # parse below updates them, so the values logged are the previous
        # page's — confirm whether that is intended.
        printl("<ListUser response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', end='')
        printl(self.has_next)
        printl("<ListUser End>")
        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        return self.list_user
    def get_cookies(self):
        return self.cookies
    def get_url(self):
        return self.__url
    def set_end_cursor(self, cursor):
        self.end_cursor = cursor
    def get_end_cursor(self):
        return self.end_cursor
    def __set_cookies(self, cookies):
        # Merge (not replace) so session cookies survive across requests.
        for k, v in cookies.items():
            self.cookies[k] = v
    def get_list(self):
        return self.list_user
class InstaContent:
    """Fetches one post page and pages backwards through its comments via
    the /query/ AJAX endpoint."""
    def __init__(self, url, cookies, referer):
        self.__r = None             # last requests.Response
        self.__referer = ''         # list-page URL used as AJAX referer
        self.__code = ''            # post shortcode parsed from the URL
        self.body = None            # parsed body record
        self.reply = []             # most recently parsed batch of replies
        self.start_cursor = None    # cursor for the next (older) reply batch
        self.has_previous = False   # True while older replies exist
        self.cookies = {}           # cookies accumulated over all requests
        self.load_url(url, cookies, referer)
    def load_url(self, url, cookies, referer):
        """GET the post page (carrying the session cookies) and parse body,
        visible replies and the reply pagination state."""
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        return self.body, self.reply
    def get_body(self):
        return self.body
    def get_reply(self):
        return self.reply
    def load_reply_more(self):
        """POST for the previous (older) batch of up to 20 replies."""
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, 20)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        printl("<ContentReply Start>")
        printl("<ContentReply requests>")
        printl('start_cursor = ' + self.start_cursor)
        printl('form_data' + form_data)
        printl('headers = ', end=' ')
        printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        printl("<ContentReply response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('start_cursor = ' + str(self.start_cursor))
        printl('has_previous = ', end='')
        printl(self.has_previous)
        printl("<ContentReply End>")
        return self.reply
    def get_cookies(self):
        return self.cookies
    def __get_code(self, url):
        """Extract the post shortcode from a /p/<code>/ URL.

        NOTE(review): the error message says 'Tag Error' although this is a
        post-code lookup — presumably copied from ListTag; confirm."""
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')
    def __set_cookies(self, cookies):
        # Merge (not replace) so session cookies survive across requests.
        for k, v in cookies.items():
            self.cookies[k] = v
class InstaMainCrawler:
    """Top-level HTTP crawler (no browser): walks list pages (tag or user),
    fetches each post with its replies and ships the records to the DB."""
    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()
    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id
    def crawl_all(self, backup_set=None):
        # Kept for interface parity with the Selenium crawler; unused here.
        pass
    def crawl_content(self, url, cookies, referer):
        """Fetch one post, page back through ALL of its replies, then send
        body and replies to the DB (older replies are prepended so the final
        list is oldest-first and article_order reflects that)."""
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        #printl(body['article_url'])
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(2)
        for j in range(0, len(replies)):
            replies[j]['article_url'] = body['article_url']
            replies[j]['platform_id'] = body['platform_id']
            replies[j]['article_order'] = j
        # Re-crawl replaces the previous record for the same URL.
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()
    def start(self):
        self.crawler_start()
    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """One-shot configuration; `browser` is accepted but unused here."""
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)
    def init_browser(self, browser):
        # No browser in the HTTP crawler; kept for interface parity.
        pass
    def init_keyword_id(self, keyword_id):
        if type(keyword_id) != int:
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        # Load keyword parameters once, then drop the config DB connection.
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()
    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)
    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)
    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)
    def crawler_start(self):
        """Main loop: for every search URL crawl the first page, then AJAX
        pages, skipping posts outside the date window and ones already seen.
        On failure, the last end_cursor is remembered so the retry resumes
        from the same page. Repeats forever while the keyword is realtime.

        NOTE(review): element['date'].date() (a date) is compared against
        get_end_day()/get_begin_day(), which return datetime objects in
        InstaInit — on Python 3 that comparison raises TypeError and lands
        in the except branch below; confirm the intended types.
        """
        real_time = True
        while real_time:
            print_and_flush("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = list_crawler.has_next
                    for element in insta_list:
                        # NOTE(review): old_elements is reset inside this
                        # first-page loop, so the >8 cutoff below can never
                        # trigger here (unlike the AJAX loop) — confirm.
                        old_elements = 0
                        if element['date'].date() > self.crawl_init.get_end_day():
                            printl(element['url'])
                            printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                            continue
                        elif element['date'].date() < self.crawl_init.get_begin_day():
                            printl(element['url'])
                            printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                            old_elements += 1
                            if old_elements > 8:
                                is_load_more = False
                                break
                        else:
                            if not element['url'] in backup_set:
                                printl(element['url'])
                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                                wait(1.5)
                                self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                                backup_set.add(element['url'])
                    # ajax load
                    while is_load_more:
                        # Resume from a remembered cursor after a failure.
                        if end_cursor:
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(1)
                        insta_list = list_crawler.load_more()
                        is_load_more = list_crawler.has_next
                        old_elements = 0
                        printl("list length = " + str(len(insta_list)))
                        for element in insta_list:
                            if element['date'].date() > self.crawl_init.get_end_day():
                                printl(element['url'])
                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                                continue
                            elif element['date'].date() < self.crawl_init.get_begin_day():
                                printl(element['url'])
                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                                old_elements += 1
                                # More than 8 too-old posts in one batch:
                                # assume we've scrolled past the window.
                                if old_elements > 8:
                                    is_load_more = False
                                    break
                            else:
                                if not element['url'] in backup_set:
                                    printl(element['url'])
                                    printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                                    wait(1.5)
                                    try:
                                        self.crawl_content(element['url'], list_crawler.get_cookies(),
                                                           list_crawler.get_url())
                                    except Exception as e:
                                        printl(e)
                                    backup_set.add(element['url'])
                        i += 1
                except Exception as e:
                    printl(e)
                    # NOTE(review): if the failure happened before list_crawler
                    # was assigned this raises NameError, and concatenating a
                    # None end_cursor below raises TypeError — confirm.
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + end_cursor)
                    # Back off 5 minutes before retrying (rate limiting).
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
        self.send_to_db.close()
        #self.driver.quit()

View File

@@ -0,0 +1,99 @@
def get_headers_for_list_html():
    """HTTP headers for the initial, anonymous GET of a list page."""
    user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/50.0.2661.102 Safari/537.36")
    return {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, sdch, br",
        "accept-language": "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4",
        "user-agent": user_agent,
    }
def get_headers_for_body_html(cookies):
    """HTTP headers for fetching a post (body) page.

    cookies: dict carrying 'mid', 'sessionid' and 'csrftoken', or a falsy
    value for an anonymous request.

    Fix: the two nearly identical header dicts are collapsed into one base
    dict; the cookie header is appended only when cookies are present.
    """
    request_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, sdch, br",
        "accept-language": "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
                      " Chrome/50.0.2661.102 Safari/537.36",
        "upgrade-insecure-requests": "1",
        "cache-control": "max-age=0"
    }
    if cookies:
        request_headers['cookie'] = ('mid=' + cookies['mid'] + '; sessionid=' + cookies['sessionid'] +
                                     '; ig_pr=1; ig_vw=1920; csrftoken=' + cookies['csrftoken'] + "; s_network=")
    return request_headers
def get_headers_for_ajax(cookies, referer, form_data):
    """HTTP headers for the authenticated AJAX POST to /query/.

    cookies must carry 'mid', 'sessionid' and 'csrftoken'; form_data is the
    already-encoded request body (used for content-length).
    """
    cookie_value = ('mid=' + cookies['mid'] + '; sessionid=' + cookies['sessionid'] +
                    '; ig_pr=1; ig_vw=1920; csrftoken=' + cookies['csrftoken'] + "; s_network=")
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/50.0.2661.102 Safari/537.36')
    return {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.instagram.com',
        'user-agent': user_agent,
        'cookie': cookie_value,
        'x-csrftoken': cookies['csrftoken'],
        'x-instagram-ajax': 1,
        'x-requested-with': 'XMLHttpRequest',
        'referer': referer,
        'content-length': str(len(form_data)),
        'connection': 'keep-alive'
    }
def get_form_data_for_list_user(user_id, end_cursor, count):
    """Build the pre-URL-encoded 'q=ig_user(...)' form body that requests the
    next *count* media nodes of *user_id* after *end_cursor*.

    The %XX sequences are the already-encoded field list and pagination block
    expected verbatim by instagram.com/query/ — do not re-encode.
    """
    res = 'q=ig_user(' \
          + str(user_id) + \
          ')+%7B+media.after(' \
          + str(end_cursor) + \
          '%2C+' \
          + str(count) + \
          ')+%7B%0A++count%2C%0A++nodes+%7B%0A++++caption%2C%0A++++code%2C%0A++++comments+%7B%0A++++++' \
          'count%0A++++%7D%2C%0A++++date%2C%0A++++dimensions+%7B%0A++++++height%2C%0A++++++width%0A++++' \
          '%7D%2C%0A++++display_src%2C%0A++++id%2C%0A++++is_video%2C%0A++++likes+%7B%0A++++++count%0A++++' \
          '%7D%2C%0A++++owner+%7B%0A++++++id%0A++++%7D%2C%0A++++thumbnail_src%2C%0A++++video_views%0A++%7D%2C%0A++' \
          'page_info%0A%7D%0A+%7D&ref=users%3A%3Ashow'
    return res
def get_form_data_for_list_tag(hash_tag, end_cursor, count):
    """Build the pre-URL-encoded 'q=ig_hashtag(...)' form body that requests
    the next *count* media nodes of *hash_tag* after *end_cursor*.

    The %XX sequences are the already-encoded field list and pagination block
    expected verbatim by instagram.com/query/ — do not re-encode.
    """
    res = 'q=ig_hashtag(' \
          + str(hash_tag) + \
          ')+%7B+media.after(' \
          + str(end_cursor) + \
          '%2C+' + str(count) + \
          ')+%7B%0A++count%2C%0A++nodes+%7B%0A++++caption%2C%0A++++code%2C%0A++++comments+%7B%0A++++++count%0A++++' \
          '%7D%2C%0A++++date%2C%0A++++dimensions+%7B%0A++++++height%2C%0A++++++width%0A++++%7D%2C%0A++++' \
          'display_src%2C%0A++++id%2C%0A++++is_video%2C%0A++++likes+%7B%0A++++++count%0A++++%7D%2C%0A++++' \
          'owner+%7B%0A++++++id%0A++++%7D%2C%0A++++thumbnail_src%2C%0A++++video_views%0A++%7D%2C%0A++' \
          'page_info%0A%7D%0A+%7D&ref=tags%3A%3Ashow'
    return res
def get_form_data_for_reply(body_code, start_cursor, count):
    """Build the pre-URL-encoded 'q=ig_shortcode(...)' form body that requests
    the *count* comments of post *body_code* before *start_cursor*.

    The %XX sequences are the already-encoded field list and pagination block
    expected verbatim by instagram.com/query/ — do not re-encode.
    """
    res = 'q=ig_shortcode(' \
          + str(body_code) + ')+%7B%0A++comments.before' \
          '(%0A++++++++++++' \
          + str(start_cursor) + \
          '%2C%0A++++++++++++' \
          + str(count) + \
          '%0A++++++++++)' \
          '+%7B%0A++++count%2C%0A++++nodes+%7B%0A++++++id%2C%0A++++++' \
          'created_at%2C%0A++++++text%2C%0A++++++user+%7B%0A++++++++id%2C%0A++++++++' \
          'profile_pic_url%2C%0A++++++++username%0A++++++%7D%0A++++%7D%2C%0A++++' \
          'page_info%0A++%7D%0A%7D%0A&ref=media%3A%3Ashow'
    return res

View File

@@ -0,0 +1,158 @@
import re
import json
import requests
import datetime
# BUG FIX: lazy (.*?) so the match stops at the FIRST ';</script>' after the
# JSON blob. The old greedy '.*' matched up to the LAST ';</script>' on the
# page, capturing unparseable garbage whenever another inline script followed
# the _sharedData one.
rx_json_html = re.compile(r'window\._sharedData\s*=\s*(.*?);\s*</script>')
# Epoch base used elsewhere to convert Instagram's unix timestamps (+9h shift).
old_date = datetime.datetime(1970, 1, 1, 9)


def get_json_from_html(content):
    """Extract and parse the window._sharedData JSON blob from a page.

    content: raw bytes, a str, or a requests Response.
    Raises TypeError for any other input type, or when the page carries no
    _sharedData assignment (message kept for backward compatibility).
    """
    if type(content) == bytes:
        s = content.decode('utf-8')
    elif type(content) == str:
        s = content
    elif type(content) == requests.models.Response:
        s = content.content.decode('utf-8')
    else:
        raise TypeError
    m = rx_json_html.search(s)
    if m:
        return json.loads(m.group(1))
    else:
        raise TypeError("Check requests.response")
def parse_list_user_html(content):
    """Parse a profile page into (posts, end_cursor, has_next, user_id).

    Each post entry carries the shortcode, its canonical /p/ URL and the
    post datetime (unix seconds shifted via old_date).
    """
    profilepage = get_json_from_html(content)['entry_data']['ProfilePage']
    posts = []
    end_cursor, has_next, user_id = None, False, None
    if profilepage:
        user = profilepage[0]["user"]
        user_id = user["id"]
        page_info = user["media"]["page_info"]
        has_next = page_info["has_next_page"]
        end_cursor = page_info["end_cursor"]
        posts = [
            {
                "code": node["code"],
                "url": "https://www.instagram.com/p/" + node["code"] + "/",
                "date": old_date + datetime.timedelta(seconds=node["date"])
            }
            for node in user["media"]["nodes"]
        ]
    return posts, end_cursor, has_next, user_id
def parse_list_tag_html(content):
    """Parse a tag explore page into (posts, end_cursor, has_next).

    Also prints the page's start_cursor for debugging, matching the
    surrounding modules' verbose logging style.
    """
    tagpage = get_json_from_html(content)['entry_data']['TagPage']
    posts = []
    end_cursor, has_next = None, False
    if tagpage:
        media = tagpage[0]["tag"]["media"]
        page_info = media["page_info"]
        print('start_cursor = ', end='', flush=True)
        print(page_info["start_cursor"], flush=True)
        end_cursor = page_info["end_cursor"]
        has_next = page_info["has_next_page"]
        posts = [
            {
                "code": node["code"],
                "url": "https://www.instagram.com/p/" + node["code"] + "/",
                "date": old_date + datetime.timedelta(seconds=node["date"])
            }
            for node in media["nodes"]
        ]
    return posts, end_cursor, has_next
def parse_list_ajax(content):
    """Parse the JSON body of a list-pagination AJAX response.

    Returns (body_list, end_cursor, has_next); ([], None, False) when the
    response status is not "ok". Each body entry carries the post code, its
    canonical /p/ URL and the post datetime.
    """
    # BUG FIX: json.loads() no longer accepts an 'encoding' keyword — it was a
    # no-op since Python 3.1 and raises TypeError from Python 3.9 on; decode
    # the bytes explicitly instead.
    json_data = json.loads(content.decode('utf-8'))
    has_next = False
    end_cursor = None
    body_list = []
    if json_data["status"] == "ok":
        has_next = json_data["media"]["page_info"]["has_next_page"]
        end_cursor = json_data["media"]["page_info"]["end_cursor"]
        nodes = json_data["media"]["nodes"]
        for node in nodes:
            body_list.append(
                {
                    "code": node["code"],
                    "url": "https://www.instagram.com/p/" + node["code"] + "/",
                    "date": old_date + datetime.timedelta(seconds=node["date"])
                }
            )
    return body_list, end_cursor, has_next
def parse_body_html(content):
    """Parse a post page into (body, replies, start_cursor, has_previous).

    body maps the post into the DB record schema (owner's username is reused
    for the id/nickname/title fields; article_order holds the comment count
    and reply_url the like count, both as strings).
    """
    postpage = get_json_from_html(content)["entry_data"]["PostPage"]
    body = {}
    reply = []
    start_cursor, has_previous = None, False
    if postpage:
        media = postpage[0]["media"]
        owner_name = media["owner"]["username"]
        body = {
            "article_date": (old_date + datetime.timedelta(seconds=media["date"])).strftime("%Y-%m-%d %H:%M:%S"),
            "article_data": media["caption"],
            "article_id": owner_name,
            "article_nickname": owner_name,
            "platform_id": owner_name,
            "platform_name": "instagram",
            "platform_form": "post",
            "platform_title": owner_name,
            "article_form": "body",
            "article_profileurl": media["owner"]["profile_pic_url"],
            "article_order": str(media["comments"]["count"]),
            "reply_url": str(media["likes"]["count"])
        }
        comments = media["comments"]
        has_previous = comments["page_info"]["has_previous_page"]
        start_cursor = comments["page_info"]["start_cursor"]
        reply = [
            {
                "article_data": node["text"],
                "article_date":
                    (old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
                "article_id": node["user"]["username"],
                "article_nickname": node["user"]["username"],
                "article_profileurl": node["user"]["profile_pic_url"],
                "platform_name": "instagram",
                "platform_form": "post",
                "article_form": "reply"
            }
            for node in comments["nodes"]
        ]
    return body, reply, start_cursor, has_previous
def parse_reply_ajax(content):
    """Parse the JSON body of a comments-pagination AJAX response.

    Returns (replies, start_cursor, has_previous); ([], None, False) when the
    response status is not "ok".
    """
    # BUG FIX: json.loads() no longer accepts an 'encoding' keyword — it was a
    # no-op since Python 3.1 and raises TypeError from Python 3.9 on; decode
    # the bytes explicitly instead.
    json_data = json.loads(content.decode('utf-8'))
    reply = []
    start_cursor = None
    has_previous = False
    if json_data["status"] == "ok":
        comments = json_data["comments"]
        has_previous = comments["page_info"]["has_previous_page"]
        start_cursor = comments["page_info"]["start_cursor"]
        nodes = comments["nodes"]
        for node in nodes:
            reply.append({
                "article_data": node["text"],
                "article_date":
                    (old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
                "article_id": node["user"]["username"],
                "article_nickname": node["user"]["username"],
                "article_profileurl": node["user"]["profile_pic_url"],
                "platform_name": "instagram",
                "platform_form": "post",
                "article_form": "reply",
            })
    return reply, start_cursor, has_previous

View File

@@ -0,0 +1,75 @@
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import Browser
from base.baseclasses import enter_element
from selenium.webdriver.common.keys import Keys
def pageup_and_pagedown(_driver):
    """Nudge the page: two PAGE_UP presses then five PAGE_DOWN presses,
    pausing briefly after each so lazy-loaded content can render."""
    body = _driver.find_element_by_tag_name('body')
    for _ in range(2):
        body.send_keys(Keys.PAGE_UP)
        wait(0.2)
    for _ in range(5):
        body.send_keys(Keys.PAGE_DOWN)
        wait(0.2)
def first_load(_driver):
    """Click the 'load more' anchor that reveals the full result grid."""
    anchor = _driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(anchor)
def get_urls(_driver, url_set):
    """Collect the href of every visible post thumbnail into url_set
    (a set, so duplicates across scroll passes are ignored)."""
    anchors = _driver.find_elements_by_css_selector("div._myci9>a")
    url_set.update(anchor.get_attribute('href') for anchor in anchors)
def remove_myci9(_driver):
    """Detach already-seen thumbnail containers from the DOM, keeping the
    last four, so the infinitely scrolling page does not grow without bound.

    Each script call removes the first remaining div._myci9 node.
    """
    visible = _driver.find_elements_by_css_selector("div._myci9")
    for _ in range(max(len(visible) - 4, 0)):
        _driver.execute_script("""
            var element = document.querySelector("div._myci9");
            if (element)
                element.parentNode.removeChild(element);
            """)
# Ad-hoc harvesting script: opens the hashtag page in Chrome, scrolls
# forever, and keeps logging how many unique post URLs were collected.
browser = Browser()
driver = browser.get_new_driver('chrome')
url_sets = set()
wait(5)
# URL-encoded hashtag (Korean); alternate tag kept below for quick swapping.
url = "https://www.instagram.com/explore/tags/%EC%A4%8C%EB%A7%88%EA%B7%B8%EB%9E%A8/"
#url = 'https://www.instagram.com/explore/tags/%EB%A7%9B%EC%8A%A4%ED%83%80%EA%B7%B8%EB%9E%A8/'
driver.get(url)
#driver.get('https://www.instagram.com/explore/tags/맛스타그램/')
wait(5)
first_load(driver)
wait(3)
print(driver.get_cookies())
# NOTE(review): Windows-only hard-coded output path; the loop never exits on
# its own (stop with Ctrl+C) — the finally block only logs, it does not quit
# the browser. Confirm both are intended for this throwaway script.
with open("c:\\data\\instajumma.txt", 'w') as f:
    try:
        while True:
            # Scroll in bursts of 10 so new thumbnails load before harvesting.
            for j in range(0, 10):
                pageup_and_pagedown(driver)
            get_urls(driver, url_sets)
            # Trim the DOM so the endless page does not exhaust memory.
            remove_myci9(driver)
            print("url count = {0}\n".format(len(url_sets)), flush=True, file=f)
    finally:
        print("finished")