Remove commented-out code from the Instagram crawler and delete backup files

Author: mjjo
Date: 2017-06-30 17:26:00 +09:00
parent 5d15913c7a
commit 3d806ae5db
4 changed files with 0 additions and 1589 deletions


@@ -722,8 +722,6 @@ class InstaContent:
url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
self.query_id, self.__code, len(self.reply), self.start_cursor)
- # url = self.__referer + "?max_id="+self.start_cursor
- # self.log_load_reply_more_before(form_data, headers)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
@@ -735,8 +733,6 @@ class InstaContent:
self.reply += reply
printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
- # self.log_load_reply_more_after()
return self.reply
def get_cookies(self):


@@ -1,556 +0,0 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import re
import datetime
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_xpath
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
from base.baseclasses import Browser
from selenium.webdriver.common.action_chains import ActionChains
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
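# Builds per-keyword search URLs (platform 9 = tag pages, 10 = user pages) and resolves the begin/end dates of the crawl window.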
class InstaInit(CrawlInit):
def __init__(self, before_day=0):
super().__init__(before_day)
self.urls = dict()
self.urls[9] = insta_tag_url
self.urls[10] = insta_url
def split_searches(self):
search = self.searches()
splited_list = search.split(',')
trimmed_list = list()
if self.platform() == 10:
for x in splited_list:
trimmed_list.append(x.strip())
else:
for x in splited_list:
trimmed_list.append(self.utf8(x))
return trimmed_list
def make_url(self):
urls = list()
for x in self.split_searches():
url = self.urls[self.platform()] + x
urls.append(url)
return urls
def get_begin_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
result += datetime.timedelta(days=self.before_day)
return result
else:
return self.start_day()
def get_end_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
return result
else:
return self.end_day()
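# Reads one post's metadata (URL, profile image, nickname, date, caption, like and reply counts) from the Selenium <article> element.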
class InstaBodyCrawler:
def __init__(self, driver=None):
self.driver = driver
self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
def set_driver(self, driver):
self.driver = driver
def set_article(self, article=None):
if article is None:
try:
self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
except Exception as e:
print_and_flush(e)
raise
else:
self.article = article
def find_article_url(self):
a = self.article.find_element_by_xpath("div/section/a")
return a.get_attribute("href")
def find_article_profileurl(self):
img = self.article.find_element_by_xpath("header/a/img[@src]")
return img.get_attribute("src")
def find_article_nickname(self):
a = self.article.find_element_by_xpath("header/div/a")
return a.text
def find_article_date(self):
el_time = self.article.find_element_by_xpath("div/section/a/time")
str_time = el_time.get_attribute("datetime")
m = self.re_date.search(str_time)
if m is None:
return "0000-00-00 00:00:00"
else:
return m.group(1) + " " + m.group(2)
def find_article_data(self):
ul = self.article.find_element_by_xpath("div/ul")
try:
#li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
span = ul.find_element_by_css_selector("li h1>span")
return span.text
except:
return ""
def find_article_id(self):
return self.find_platform_id()
def find_platform_name(self):
return 'instagram'
def find_article_form(self):
return 'body'
def find_platform_id(self):
a = self.article.find_element_by_xpath("header/div/a")
if a:
href = a.get_attribute("href")
str_id = href.replace(insta_url, "").replace("/", "")
return str_id
else:
return None
def find_like_num(self):
div = self.article.find_element_by_xpath("div/section[1]/div")
try:
span = div.find_element_by_xpath("span/span")
str_num = span.text
str_num = str_num.replace(',', '')
if str_num[-1] == 'm':
num = float(str_num[0:-1]) * 1000000
elif str_num[-1] == 'k':
num = float(str_num[0:-1]) * 1000
else:
num = int(str_num)
return str(num)
except:
a_list = div.find_elements_by_tag_name("a")
if len(a_list) > 1:
return str(len(a_list))
else:
if a_list and a_list[0].get_attribute('title'):
return str(1)
else:
return str(0)
# span = div.find_element_by_xpath("span[1]")
# if len(span.text.strip()) < 1:
# return str(1)
# else:
# return str(0)
def find_reply_num(self):
ul = self.article.find_element_by_xpath("div/ul")
lis = ul.find_elements_by_tag_name("li")
if len(lis) < 2:
return "0"
try:
li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
span = li.find_element_by_xpath("button/span[2]")
str_num = span.text.replace(",", "")
return str_num
except:
return str(len(lis) - 1)
def get_content(self):
content = dict()
content["article_id"] = self.find_article_id()
content["platform_id"] = self.find_platform_id()
content["article_url"] = self.find_article_url()
content["article_profileurl"] = self.find_article_profileurl()
content["article_nickname"] = self.find_article_nickname()
content["platform_name"] = self.find_platform_name()
content["article_date"] = self.find_article_date()
content["article_data"] = self.find_article_data()
content["article_form"] = 'body'
content["platform_form"] = 'post'
content["platform_title"] = content["article_id"]
reply_num = self.find_reply_num()
if int(reply_num) > 0:
content["article_order"] = int(reply_num)
like_num = self.find_like_num()
if int(float(like_num)) > 0:
content["reply_url"] = int(float(like_num))
return content
def find_platform_title(self):
pass
def find_article_title(self):
pass
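# Collects the comments under a post; read_all_reply keeps clicking the "load more" button, capped at 200 iterations.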
class InstaReplyCrawler:
def __init__(self, driver=None, article=None):
self.driver = driver
self.activity = article
self.reply_list = list()
def find_init(self):
self.reply_list.clear()
def set_driver(self, driver):
self.driver = driver
def set_article(self, article=None):
if article is None:
try:
self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
except Exception as e:
print_and_flush(e)
raise
else:
self.article = article
def has_more(self, ul):
try:
button = ul.find_element_by_css_selector("li>button")
return True
except Exception as e:
return False
def read_more_reply(self, ul):
try:
button = ul.find_element_by_css_selector("li>button")
enter_element(button)
except Exception as e:
print_and_flush(e)
def read_all_reply(self, ul):
i = 0
while i < 200 and self.has_more(ul):
self.read_more_reply(ul)
i += 1
# for i in range(0, 10):
# if self.has_more(ul):
# self.read_more_reply(ul)
# else:
# break
def get_reply_ul(self):
ul = self.article.find_element_by_xpath("div/ul")
return ul
def has_reply(self, ul):
try:
lis = ul.find_elements_by_css_selector("li>a")
if len(lis) > 0:
return True
except:
return False
return False
def crawl_all(self):
self.find_init()
self.set_article()
try:
ul = self.get_reply_ul()
if self.has_reply(ul):
self.read_all_reply(ul)
self.crawl_reply(ul)
except Exception as e:
print_and_flush(e)
def crawl_reply(self, ul):
article_data = self.find_article_data(ul)
article_id = self.find_article_id(ul)
if len(article_data) != len(article_id):
print_and_flush("article_data != article_id")
for i in range(0, len(article_id)):
content = dict()
content["article_data"] = article_data[i]
content["article_id"] = article_id[i]
content["article_nickname"] = article_id[i]
content["platform_name"] = "instagram"
content["platform_form"] = "post"
content["article_form"] = 'reply'
content["article_order"] = i
self.reply_list.append(content)
def get_content(self):
return self.reply_list
def find_article_id(self, ul):
id_list = list()
a_list = ul.find_elements_by_xpath("li/a")
for i in a_list:
id_list.append(i.text)
return id_list
def find_article_profileurl(self, ul):
pass
def find_article_nickname(self, ul):
return self.find_article_id(ul)
def find_article_data(self, ul):
data_list = list()
span_list = ul.find_elements_by_css_selector("li>span")
for i in span_list:
data_list.append(i.text)
return data_list
def find_article_url(self, ul):
pass
def find_platform_id(self, ul):
pass
def find_article_form(self, ul=None):
return 'reply'
def find_platform_name(self, ul=None):
return 'instagram'
def find_platform_form(self, ul=None):
return 'post'
def click_element(self, element):
ac = ActionChains(self.driver)
ac.move_to_element_with_offset(element, 0, 0).click().perform()
wait(2)
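# Moves from post to post via the right pagination arrow, tracking visited URLs and checking the date window.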
class InstaPageCrawler:
def __init__(self, driver=None, begin_date=None, end_date=None):
self.driver = driver
self.url_set = set()
self.begin_date = begin_date
self.end_date = end_date
self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
def set_driver(self, driver):
self.driver = driver
def find_article_url(self):
a = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article/div/section/a', 60)
return a.get_attribute("href")
def init(self):
self.url_set.clear()
def set_date(self, begin_date, end_date):
self.set_begin_date(begin_date)
self.set_end_date(end_date)
def set_end_date(self, end_date):
if type(end_date) == str:
self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
self.end_date = end_date
else:
self.end_date = datetime.datetime.today()
self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
self.end_date += datetime.timedelta(days=1)
def set_begin_date(self, begin_date):
if type(begin_date) == str:
self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
self.begin_date = begin_date
else:
self.begin_date = datetime.datetime.today()
self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)
def has_next(self):
try:
a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
return True
except:
return False
def move_next(self):
try:
a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
enter_element(a)
return True
except:
return False
def has_first_page(self):
try:
#a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
#a = find_element_by_css_selector(self.driver, "a[class='_8mlbc _vbtk2 _t5r8b']")
a = find_element_by_css_selector(self.driver, "div[class='_8fxp6'] a[class='_8mlbc _vbtk2 _t5r8b']")
enter_element(a)
return True
except:
return False
def crawling_ok(self, url):
self.url_set.add(url)
def is_earlier(self, time_date):
return time_date < self.begin_date
def is_late(self, time_date):
return time_date > self.end_date
def find_article_date(self):
el_time = find_element_by_xpath(self.driver, "/html/body/div/div/div/div/article/div/section/a/time", 60)
str_time = el_time.get_attribute("datetime")
m = self.re_date.search(str_time)
if m is None:
return "0000-00-00 00:00:00"
else:
return m.group(1) + " " + m.group(2)
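# Drives the Selenium crawl: opens each search URL, walks posts newest-first within the date window, and stores bodies and replies through SendtoDB.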
class InstaMainCrawler:
def __init__(self):
self.page_crawler = InstaPageCrawler()
self.body_crawler = InstaBodyCrawler()
self.reply_crawler = InstaReplyCrawler()
self.send_to_db = SendtoDB()
self.browser = Browser()
self.crawl_init = InstaInit()
self.driver = None
def set_driver(self, driver):
self.page_crawler.set_driver(driver)
self.body_crawler.set_driver(driver)
self.reply_crawler.set_driver(driver)
self.driver = driver
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all(self, backup_set=None):
self.page_crawler.init()
if backup_set:
self.page_crawler.url_set = backup_set.copy()
if not self.page_crawler.has_first_page():
return
while True:
str_date = self.page_crawler.find_article_date()
date_val = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
print_and_flush(str_date)
if self.page_crawler.find_article_url() in self.page_crawler.url_set:
if self.page_crawler.has_next():
self.page_crawler.move_next()
continue
else:
break
# if self.page_crawler.is_earlier(date_val.date()):
if self.page_crawler.is_late(date_val):
if self.page_crawler.has_next():
self.page_crawler.move_next()
continue
else:
break
# if self.page_crawler.is_late(date_val.date()):
if self.page_crawler.is_earlier(date_val):
break
try:
wait(3)
body_content = self.crawl_body()
self.crawl_reply(body_content)
self.page_crawler.url_set.add(body_content["article_url"])
print_and_flush("ok")
except Exception as e:
print_and_flush('fail')
print_and_flush(e)
if self.page_crawler.has_next():
self.page_crawler.move_next()
else:
break
def crawl_body(self):
self.body_crawler.set_driver(self.driver)
self.body_crawler.set_article()
content = self.body_crawler.get_content()
content["keyword_id"] = self.keyword_id
print_and_flush(content["article_url"])
self.send_to_db.delete_url(content['article_url'])
self.send_to_db.send_body(content)
return content
def crawl_reply(self, body_content):
self.reply_crawler.set_driver(self.driver)
self.reply_crawler.crawl_all()
content_list = self.reply_crawler.get_content()
if content_list:
for i in content_list:
i['article_url'] = body_content['article_url']
i['platform_id'] = body_content['platform_id']
self.send_to_db.send_reply(content_list)
def start(self):
self.crawler_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
self.init_browser(browser)
self.init_keyword_id(keyword_id)
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
def init_browser(self, browser):
self.set_driver(self.browser.get_new_driver(browser))
def init_keyword_id(self, keyword_id):
if type(keyword_id) != int:
self.keyword_id = int(keyword_id)
else:
self.keyword_id = keyword_id
self.crawl_init.get_keyword_parameters(keyword_id)
self.crawl_init.disconnect()
def init_db(self, db_num):
self.send_to_db.set_db(db_num)
def init_before_day(self, before_day):
self.crawl_init.set_before_day(before_day)
def init_until_page(self, until_page):
self.crawl_init.set_until_page(until_page)
def crawler_start(self):
real_time = True
while real_time:
print_and_flush("Crawling Start")
url_list = self.crawl_init.make_url()
i = 0
backup_set = set()
while i < len(url_list):
try:
print_and_flush(url_list[i] + "\n")
wait(3)
self.driver.get(url_list[i])
wait(5)
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
end_date=self.crawl_init.get_end_day())
if self.page_crawler.has_first_page():
self.crawl_all(backup_set)
i += 1
backup_set.clear()
except Exception as e:
print_and_flush(e)
backup_set = self.page_crawler.url_set.copy()
self.driver.quit()
self.set_driver(self.browser.new_browser())
wait(5)
real_time = self.crawl_init.is_realtime()
print_and_flush("Finished Crawling :)")
self.send_to_db.close()
#self.driver.quit()


@@ -1,426 +0,0 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
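# print() wrapper that flushes by default so output appears immediately in redirected logs.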
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
print(*objects, sep=sep, end=end, file=file, flush=flush)
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
class InstaInit(CrawlInit):
def __init__(self, before_day=0):
super().__init__(before_day)
self.urls = dict()
self.urls[9] = insta_tag_url
self.urls[10] = insta_url
def split_searches(self):
search = self.searches()
splited_list = search.split(',')
trimmed_list = list()
if self.platform() == 10:
for x in splited_list:
trimmed_list.append(x.strip())
else:
for x in splited_list:
trimmed_list.append(self.utf8(x))
return trimmed_list
def make_url(self):
urls = list()
for x in self.split_searches():
url = self.urls[self.platform()] + x
urls.append(url)
return urls
def get_begin_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
result += datetime.timedelta(days=self.before_day)
return result.date()
else:
return self.start_day()
def get_end_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
return result.date()
else:
return self.end_day()
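# requests-based tag listing: loads the tag page HTML, then pages onward by POSTing form data to the /query/ endpoint.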
class ListTag:
def __init__(self, url):
self.__r = None
self.__tag = ''
self.__url = ''
self.list_tag = []
self.end_cursor = None
self.has_next = False
self.cookies = {}
self.load_url(url)
def load_url(self, url):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('headers = ', end=' ')
printl(instaheaders.get_headers_for_list_html())
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
self.__url = url
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListTag End>")
return self.list_tag
def load_more(self):
form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, 12)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data = ' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.__set_cookies(self.__r.cookies)
self.__r.raise_for_status()
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListTag End>")
return self.list_tag
def __get_tag(self, url):
m = re.search(insta_tag_url + "([^/]*)", url)
if m:
return m.group(1)
else:
raise RuntimeError('Tag Error')
def get_cookies(self):
return self.cookies
def get_url(self):
return self.__url
def set_end_cursor(self, cursor):
self.end_cursor = cursor
def get_end_cursor(self):
return self.end_cursor
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
def get_list(self):
return self.list_tag
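# Same paging scheme as ListTag, but for a user profile listing.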
class ListUser:
def __init__(self, url):
self.__r = None
self.__user = ''
self.__url = ''
self.list_user = []
self.end_cursor = None
self.has_next = False
self.cookies = {}
self.load_url(url)
def load_url(self, url):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
self.__r.raise_for_status()
self.__url = url
self.__set_cookies(self.__r.cookies)
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
return self.list_user
def load_more(self):
form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, 24)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
printl("<ListUser Start>")
printl("<ListUser requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data = ' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.__r.raise_for_status()
self.__set_cookies(self.__r.cookies)
printl("<ListUser response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListUser End>")
self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
return self.list_user
def get_cookies(self):
return self.cookies
def get_url(self):
return self.__url
def set_end_cursor(self, cursor):
self.end_cursor = cursor
def get_end_cursor(self):
return self.end_cursor
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
def get_list(self):
return self.list_user
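# Loads a single post (body plus the first replies) and pages backwards through older replies while has_previous is set.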
class InstaContent:
def __init__(self, url, cookies, referer):
self.__r = None
self.__referer = ''
self.__code = ''
self.body = None
self.reply = []
self.start_cursor = None
self.has_previous = False
self.cookies = {}
self.load_url(url, cookies, referer)
def load_url(self, url, cookies, referer):
self.__set_cookies(cookies)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
self.__r.raise_for_status()
self.__referer = referer
self.__code = self.__get_code(url)
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
return self.body, self.reply
def get_body(self):
return self.body
def get_reply(self):
return self.reply
def load_reply_more(self):
form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, 20)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
printl("<ContentReply Start>")
printl("<ContentReply requests>")
printl('start_cursor = ' + str(self.start_cursor))
printl('form_data = ' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.__r.raise_for_status()
self.__set_cookies(self.__r.cookies)
self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
printl("<ContentReply response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('start_cursor = ' + str(self.start_cursor))
printl('has_previous = ', end='')
printl(self.has_previous)
printl("<ContentReply End>")
return self.reply
def get_cookies(self):
return self.cookies
def __get_code(self, url):
m = re.search(insta_body_url + "([^/]*)", url)
if m:
return m.group(1)
else:
raise RuntimeError('Tag Error')
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
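# requests-only main loop: walks tag/user listings, skips posts outside the date window, and sends each post and its replies to the DB.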
class InstaMainCrawler:
def __init__(self):
self.send_to_db = SendtoDB()
self.crawl_init = InstaInit()
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all(self, backup_set=None):
pass
def crawl_content(self, url, cookies, referer):
content = InstaContent(url, cookies, referer)
body = content.get_body()
replies = content.get_reply()
body['article_url'] = url
body['keyword_id'] = self.keyword_id
#printl(body['article_url'])
while content.has_previous:
replies = content.load_reply_more() + replies
wait(2)
for j in range(0, len(replies)):
replies[j]['article_url'] = body['article_url']
replies[j]['platform_id'] = body['platform_id']
replies[j]['article_order'] = j
self.send_to_db.delete_url(body['article_url'])
self.send_to_db.send_body(body)
if replies:
self.send_to_db.send_reply(replies)
printl('ok')
printl()
def start(self):
self.crawler_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
self.init_keyword_id(keyword_id)
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
def init_browser(self, browser):
pass
def init_keyword_id(self, keyword_id):
if type(keyword_id) != int:
self.keyword_id = int(keyword_id)
else:
self.keyword_id = keyword_id
self.crawl_init.get_keyword_parameters(keyword_id)
self.crawl_init.disconnect()
def init_db(self, db_num):
self.send_to_db.set_db(db_num)
def init_before_day(self, before_day):
self.crawl_init.set_before_day(before_day)
def init_until_page(self, until_page):
self.crawl_init.set_until_page(until_page)
def crawler_start(self):
real_time = True
while real_time:
print_and_flush("Crawling Start")
url_list = self.crawl_init.make_url()
i = 0
end_cursor = None
backup_set = set()
while i < len(url_list):
# first connect
try:
printl(url_list[i] + "\n")
if insta_tag_url in url_list[i]:
list_crawler = ListTag(url_list[i])
else:
list_crawler = ListUser(url_list[i])
wait(1)
insta_list = list_crawler.get_list()
is_load_more = list_crawler.has_next
for element in insta_list:
old_elements = 0
if element['date'].date() > self.crawl_init.get_end_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
continue
elif element['date'].date() < self.crawl_init.get_begin_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
old_elements += 1
if old_elements > 8:
is_load_more = False
break
else:
if not element['url'] in backup_set:
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
wait(1.5)
self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
backup_set.add(element['url'])
# ajax load
while is_load_more:
if end_cursor:
list_crawler.end_cursor = end_cursor
end_cursor = None
wait(1)
insta_list = list_crawler.load_more()
is_load_more = list_crawler.has_next
old_elements = 0
printl("list length = " + str(len(insta_list)))
for element in insta_list:
if element['date'].date() > self.crawl_init.get_end_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
continue
elif element['date'].date() < self.crawl_init.get_begin_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
old_elements += 1
if old_elements > 8:
is_load_more = False
break
else:
if not element['url'] in backup_set:
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
wait(1.5)
try:
self.crawl_content(element['url'], list_crawler.get_cookies(),
list_crawler.get_url())
except Exception as e:
printl(e)
backup_set.add(element['url'])
i += 1
except Exception as e:
printl(e)
end_cursor = list_crawler.end_cursor
printl('end_cursor=' + str(end_cursor))
if e.args:
wait(300)
real_time = self.crawl_init.is_realtime()
printl("Finished Crawling :)")
self.send_to_db.close()
#self.driver.quit()


@@ -1,603 +0,0 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
import logging
from base.baseclasses import SendtoDB
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import Browser
from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
print(*objects, sep=sep, end=end, file=file, flush=flush)
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
is_debuging = False
num_of_list_ajax = 24
num_of_reply_ajax = 100
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
logging.basicConfig(level=logging.INFO,
format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)
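# Selenium helpers for the browser-driven algorithm: click the "load more" link, scroll with PAGE_DOWN, and refocus the window.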
def click_insta_load_more(driver):
element = driver.find_element_by_css_selector("div._pupj3 > a")
enter_element(element)
def push_page_down(driver):
body = driver.find_element_by_tag_name('body')
body.send_keys(Keys.PAGE_DOWN)
def focus_driver(driver):
position = driver.get_window_position()
size = driver.get_window_size()
driver.maximize_window()
driver.set_window_size(size['width'], size["height"])
driver.set_window_position(position['x'], position['y'])
class InstaInit(CrawlInit):
def __init__(self, before_day=0):
super().__init__(before_day)
self.urls = dict()
self.urls[9] = insta_tag_url
self.urls[10] = insta_url
def split_searches(self):
search = self.searches()
splited_list = search.split(',')
trimmed_list = list()
if self.platform() == 10:
for x in splited_list:
trimmed_list.append(x.strip())
else:
for x in splited_list:
trimmed_list.append(self.utf8(x))
return trimmed_list
def make_url(self):
urls = list()
for x in self.split_searches():
url = self.urls[self.platform()] + x
urls.append(url)
return urls
def get_begin_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
result += datetime.timedelta(days=self.before_day)
return result.date()
else:
return self.start_day()
def get_end_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
return result.date()
else:
return self.end_day()
class ListTag:
def __init__(self, url):
self.__r = None
self.__tag = ''
self.__url = ''
self.list_tag = []
self.end_cursor = None
self.has_next = False
self.cookies = {}
self.load_url(url)
def load_url(self, url):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
if is_debuging:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('headers = ', end=' ')
printl(instaheaders.get_headers_for_list_html())
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
self.__url = url
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
if is_debuging:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListTag End>")
return self.list_tag
def load_more(self):
form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
if is_debuging:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data = ' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.__set_cookies(self.__r.cookies)
self.__r.raise_for_status()
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
if is_debuging:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListTag End>")
return self.list_tag
def __get_tag(self, url):
m = re.search(insta_tag_url + "([^/]*)", url)
if m:
return m.group(1)
else:
raise RuntimeError('Tag Error')
def get_cookies(self):
return self.cookies
def get_url(self):
return self.__url
def set_end_cursor(self, cursor):
self.end_cursor = cursor
def get_end_cursor(self):
return self.end_cursor
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
def get_list(self):
return self.list_tag
class ListUser:
def __init__(self, url):
self.__r = None
self.__user = ''
self.__url = ''
self.list_user = []
self.end_cursor = None
self.has_next = False
self.cookies = {}
self.load_url(url)
def load_url(self, url):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
self.__r.raise_for_status()
self.__url = url
self.__set_cookies(self.__r.cookies)
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
return self.list_user
def load_more(self):
form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
if is_debuging:
printl("<ListUser Start>")
printl("<ListUser requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data = ' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.__r.raise_for_status()
self.__set_cookies(self.__r.cookies)
if is_debuging:
printl("<ListUser response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListUser End>")
self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
return self.list_user
def get_cookies(self):
return self.cookies
def get_url(self):
return self.__url
def set_end_cursor(self, cursor):
self.end_cursor = cursor
def get_end_cursor(self):
return self.end_cursor
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
def get_list(self):
return self.list_user
class InstaContent:
def __init__(self, url, cookies, referer):
self.__r = None
self.__referer = ''
self.__code = ''
self.body = None
self.reply = []
self.start_cursor = None
self.has_previous = False
self.cookies = {}
self.load_url(url, cookies, referer)
def load_url(self, url, cookies, referer):
self.__set_cookies(cookies)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
self.__r.raise_for_status()
self.__referer = referer
self.__code = self.__get_code(url)
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
self.__set_cookies(self.__r.cookies)
return self.body, self.reply
def get_body(self):
return self.body
def get_reply(self):
return self.reply
def load_reply_more(self):
form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
if is_debuging:
printl("<ContentReply Start>")
printl("<ContentReply requests>")
printl('start_cursor = ' + str(self.start_cursor))
printl('form_data = ' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.__r.raise_for_status()
self.__set_cookies(self.__r.cookies)
self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
if is_debuging:
printl("<ContentReply response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('start_cursor = ' + str(self.start_cursor))
printl('has_previous = ', end='')
printl(self.has_previous)
printl("<ContentReply End>")
return self.reply
def get_cookies(self):
return self.cookies
def __get_code(self, url):
m = re.search(insta_body_url + "([^/]*)", url)
if m:
return m.group(1)
else:
raise RuntimeError('Tag Error')
def __set_cookies(self, cookies):
for k, v in cookies.items():
self.cookies[k] = v
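# Algorithm base class: crawl_contents gathers in-window posts into list_crawl, crawl_list then fetches each of them; subclasses supply crawl().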
class InstaAlgorithm:
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
self.send_to_db = send_to_db
self.crawl_init = crawl_init
self.browser = browser
self.driver = driver
self.keyword_id = keyword_id
self.reload_wait_second = reload_wait_second
self.num_of_load_content = num_of_load_content
self.page_down = page_down
self.list_crawl = []
def crawl_content(self, url, cookies, referer):
content = InstaContent(url, cookies, referer)
body = content.get_body()
replies = content.get_reply()
body['article_url'] = url
body['keyword_id'] = self.keyword_id
# printl(body['article_url'])
while content.has_previous:
replies = content.load_reply_more() + replies
wait(reply_wait_sec)
for j in range(0, len(replies)):
replies[j]['article_url'] = body['article_url']
replies[j]['platform_id'] = body['platform_id']
replies[j]['article_order'] = j
self.send_to_db.delete_url(body['article_url'])
self.send_to_db.send_body(body)
if replies:
self.send_to_db.send_reply(replies)
printl('ok')
printl()
def start_crawl(self):
self.crawl()
self.close()
def close(self):
if self.driver and not is_debuging:
self.driver.quit()
self.send_to_db.close()
printl("Finished Crawling :)")
def crawl(self):
raise NotImplementedError
def is_until_page(self):
if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
return True
else:
return False
def crawl_contents(self, contents_list, backup_set):
"""
:param contents_list:
:param backup_set:
:return: is_load_more
"""
old_elements = 0
for element in contents_list:
if element['date'].date() > self.crawl_init.get_end_day():
# printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
elif element['date'].date() < self.crawl_init.get_begin_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
old_elements += 1
if old_elements > 6:
return False
else:
if not element['url'] in backup_set:
# printl(element['url'])
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
# wait(1.5)
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
self.list_crawl.append(element)
backup_set.add(element['url'])
if self.is_until_page():
return False
if self.list_crawl:
printl("Number of Lists = {0}".format(len(self.list_crawl)))
return True
def crawl_list(self):
if self.list_crawl:
printl()
printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
printl("Total gathered contents = {0}".format(len(self.list_crawl)))
printl()
for element in self.list_crawl:
try:
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
wait(body_wait_sec)
self.crawl_content(element['url'], {}, element['url'])
except Exception as e:
printl(e)
logging.info(e)
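# Plain-requests variant: pages purely through load_more() AJAX calls, so any Selenium driver passed in is closed immediately.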
class InstaAlgorithmNormal(InstaAlgorithm):
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second, num_of_load_content, page_down)
if self.driver:
self.driver.quit()
def crawl(self):
real_time = True
while real_time:
printl("Crawling Start")
url_list = self.crawl_init.make_url()
i = 0
end_cursor = None
backup_set = set()
while i < len(url_list):
# first connect
try:
printl(url_list[i] + "\n")
if insta_tag_url in url_list[i]:
list_crawler = ListTag(url_list[i])
else:
list_crawler = ListUser(url_list[i])
wait(1)
insta_list = list_crawler.get_list()
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
# ajax load
while is_load_more:
if end_cursor:
list_crawler.end_cursor = end_cursor
end_cursor = None
wait(self.reload_wait_second)
insta_list = list_crawler.load_more()
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
self.crawl_list()
self.list_crawl.clear()
i += 1
except Exception as e:
logging.info(e)
end_cursor = list_crawler.end_cursor
printl('end_cursor=' + str(end_cursor))
if e.args:
wait(300)
real_time = self.crawl_init.is_realtime()
printl("Finished Crawling :)")
class InstaAlgorithmBrowser(InstaAlgorithm):
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second, num_of_load_content, page_down)
def url_load(self, url):
if insta_tag_url in url:
list_tag = ListTag(url)
insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
return list_tag, insta_list, end_cursor, has_next
else:
list_user = ListUser(url)
insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
return list_user, insta_list, end_cursor, has_next
def crawl(self):
real_time = True
while real_time:
url_list = self.crawl_init.make_url()
i = 0
end_cursor = None
backup_set = set()
while i < len(url_list):
# first connect
try:
wait(3)
printl(url_list[i] + "\n")
self.driver.get(url_list[i])
wait(5)
list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
list_crawler.set_end_cursor(end_cursor2)
list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()}
# ajax load
page_down = 0
while is_load_more:
if page_down == self.page_down:
page_down = 0
try:
focus_driver(self.driver)
click_insta_load_more(self.driver)
except:
push_page_down(self.driver)
page_down += 1
if end_cursor:
list_crawler.end_cursor = end_cursor
end_cursor = None
wait(self.reload_wait_second)
insta_list = list_crawler.load_more()
# printl("list length = " + str(len(insta_list)))
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
# printl("number of backup_set = {0}".format(len(backup_set)))
i += 1
self.crawl_list()
self.list_crawl.clear()
except Exception as e:
logging.info(e)
end_cursor = list_crawler.end_cursor
printl('end_cursor=' + str(end_cursor))
if e.args:
wait(300)
if self.driver:
self.driver.close()
wait(3)
self.driver = self.browser.new_browser()
real_time = self.crawl_init.is_realtime()
printl("Finished Crawling :)")
class InstaMainCrawler:
def __init__(self):
self.send_to_db = SendtoDB()
self.crawl_init = InstaInit()
self.browser = Browser()
self.driver = None
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all(self, backup_set=None):
pass
def start(self):
self.crawler_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
self.init_keyword_id(keyword_id)
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
self.init_browser(browser)
def set_driver(self, driver):
self.driver = driver
def init_browser(self, browser):
try:
self.set_driver(self.browser.get_new_driver(browser))
except Exception as e:
logging.info(e)
def init_keyword_id(self, keyword_id):
if type(keyword_id) != int:
self.keyword_id = int(keyword_id)
else:
self.keyword_id = keyword_id
self.crawl_init.get_keyword_parameters(keyword_id)
self.crawl_init.disconnect()
def init_db(self, db_num):
self.send_to_db.set_db(db_num)
def init_before_day(self, before_day):
self.crawl_init.set_before_day(before_day)
def init_until_page(self, until_page):
self.crawl_init.set_until_page(until_page)
def crawler_start(self):
if self.driver:
algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
else:
algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
algorithm.start_crawl()