Files
clients/WebBasedCrawler/insta/instacrawl_backup2.py
admin 3074db4fa0 insta crawler 수정
git-svn-id: svn://192.168.0.12/source@278 8346c931-da38-4b9b-9d4c-e48b93cbd075
2016-06-28 03:29:33 +00:00

427 lines
15 KiB
Python

#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Drop-in print() replacement that flushes the stream by default.

    Keeps crawler progress visible immediately when stdout is piped or
    block-buffered; all other keyword arguments mirror built-in print().
    """
    options = {'sep': sep, 'end': end, 'file': file, 'flush': flush}
    print(*objects, **options)
# Instagram endpoints used throughout this module.
insta_url = "https://www.instagram.com/"  # user profile pages
insta_tag_url = "https://www.instagram.com/explore/tags/"  # hashtag search pages
insta_query = "https://www.instagram.com/query/"  # AJAX pagination endpoint (POST)
insta_body_url = 'https://www.instagram.com/p/'  # single post pages
class InstaInit(CrawlInit):
    """Instagram-specific crawl configuration.

    Builds the per-keyword search URLs and the [begin, end] crawl-day
    window on top of the shared CrawlInit parameters.
    """

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> base URL (9 = hashtag search, 10 = user page)
        self.urls = {9: insta_tag_url, 10: insta_url}

    def split_searches(self):
        """Split the comma-separated keyword string into cleaned tokens.

        Platform 10 (user pages) only needs whitespace trimming; other
        platforms go through CrawlInit.utf8 (presumably percent/UTF-8
        encoding for the tag URL -- confirm against CrawlInit).
        """
        tokens = self.searches().split(',')
        if self.platform() == 10:
            return [token.strip() for token in tokens]
        return [self.utf8(token) for token in tokens]

    def make_url(self):
        """Return one full search URL per keyword for the configured platform."""
        base = self.urls[self.platform()]
        return [base + keyword for keyword in self.split_searches()]

    def get_begin_day(self):
        """First day (inclusive) of the crawl window.

        BUGFIX: this used to return a datetime, but callers compare the
        result against element['date'].date(); comparing a date with a
        datetime raises TypeError in Python 3, so return a date instead.
        In non-realtime mode the configured start day is returned as-is.
        """
        if self.is_realtime():
            return datetime.date.today() + datetime.timedelta(days=self.before_day)
        return self.start_day()

    def get_end_day(self):
        """Last day (inclusive) of the crawl window: today in realtime
        mode (as a date, see get_begin_day), else the configured end day."""
        if self.is_realtime():
            return datetime.date.today()
        return self.end_day()
class ListTag:
    """One Instagram hashtag search: loads the explore/tags HTML page,
    then pages through further results with the /query/ AJAX endpoint."""

    def __init__(self, url):
        self.__r = None          # last requests.Response
        self.__tag = ''          # tag name extracted from the URL
        self.__url = ''          # page URL, reused as AJAX Referer
        self.list_tag = []       # parsed post entries of the latest batch
        self.end_cursor = None   # pagination cursor for load_more()
        self.has_next = False    # True while more pages are available
        self.cookies = {}        # cookies accumulated across requests
        self.load_url(url)

    def load_url(self, url):
        """GET the tag page and parse the first batch of posts from its HTML."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        printl("<ListTag Start>")
        printl("<ListTag requests>")
        printl('headers = ', instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        self.list_tag, self.end_cursor, self.has_next = \
            instaparser.parse_list_tag_html(self.__r.content)
        printl("<ListTag response>")
        printl('self.__r.cookies=', self.__r.cookies, sep='')
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', self.has_next, sep='')
        printl("<ListTag End>")
        return self.list_tag

    def load_more(self):
        """POST the query endpoint for the next batch using end_cursor."""
        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, 12)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        printl("<ListTag Start>")
        printl("<ListTag requests>")
        printl('end_cursor = ' + str(self.end_cursor))
        printl('form_data' + form_data)  # assumes form_data is a str -- TODO confirm
        printl('headers = ', headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__set_cookies(self.__r.cookies)
        self.__r.raise_for_status()
        self.list_tag, self.end_cursor, self.has_next = \
            instaparser.parse_list_ajax(self.__r.content)
        printl("<ListTag response>")
        printl('self.__r.cookies=', self.__r.cookies, sep='')
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', self.has_next, sep='')
        printl("<ListTag End>")
        return self.list_tag

    def __get_tag(self, url):
        """Pull the tag name out of an explore/tags URL; raise if absent."""
        match = re.search(insta_tag_url + "([^/]*)", url)
        if not match:
            raise RuntimeError('Tag Error')
        return match.group(1)

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # merge, not replace: keep session cookies from earlier requests
        for name, value in cookies.items():
            self.cookies[name] = value

    def get_list(self):
        return self.list_tag
class ListUser:
    """One Instagram user page: loads the profile HTML, then pages
    through further posts with the /query/ AJAX endpoint."""

    def __init__(self, url):
        self.__r = None          # last requests.Response
        self.__user = ''         # user name extracted by the parser
        self.__url = ''          # page URL, reused as AJAX Referer
        self.list_user = []      # parsed post entries of the latest batch
        self.end_cursor = None   # pagination cursor for load_more()
        self.has_next = False    # True while more pages are available
        self.cookies = {}        # cookies accumulated across requests
        self.load_url(url)

    def load_url(self, url):
        """GET the profile page and parse the first batch of posts."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        self.list_user, self.end_cursor, self.has_next, self.__user = \
            instaparser.parse_list_user_html(self.__r.content)
        return self.list_user

    def load_more(self):
        """POST the query endpoint for the next batch using end_cursor."""
        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, 24)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        printl("<ListUser Start>")
        printl("<ListUser requests>")
        printl('end_cursor = ' + str(self.end_cursor))
        printl('form_data' + form_data)  # assumes form_data is a str -- TODO confirm
        printl('headers = ', headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        # NOTE(review): end_cursor/has_next below are logged *before* the new
        # response is parsed, so they still show the previous page's values.
        printl("<ListUser response>")
        printl('self.__r.cookies=', self.__r.cookies, sep='')
        printl('end_cursor = ' + str(self.end_cursor))
        printl('has_next = ', self.has_next, sep='')
        printl("<ListUser End>")
        self.list_user, self.end_cursor, self.has_next = \
            instaparser.parse_list_ajax(self.__r.content)
        return self.list_user

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # merge, not replace: keep session cookies from earlier requests
        for name, value in cookies.items():
            self.cookies[name] = value

    def get_list(self):
        return self.list_user
class InstaContent:
    """A single Instagram post: its body plus its comment thread, with
    backwards paging through older comments via the /query/ endpoint."""

    def __init__(self, url, cookies, referer):
        self.__r = None             # last requests.Response
        self.__referer = ''         # list page URL used as AJAX Referer
        self.__code = ''            # post shortcode extracted from the URL
        self.body = None            # parsed post body
        self.reply = []             # parsed comments of the latest batch
        self.start_cursor = None    # cursor for paging to older comments
        self.has_previous = False   # True while older comments remain
        self.cookies = {}           # cookies accumulated across requests
        self.load_url(url, cookies, referer)

    def load_url(self, url, cookies, referer):
        """GET the post page and parse the body plus the first comment batch."""
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = \
            instaparser.parse_body_html(self.__r.content)
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def load_reply_more(self):
        """POST the query endpoint for the previous (older) comment batch."""
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, 20)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        printl("<ContentReply Start>")
        printl("<ContentReply requests>")
        # BUGFIX: wrap in str() -- the cursor may be None/non-str, and plain
        # concatenation raised TypeError (the response-side log already did this).
        printl('start_cursor = ' + str(self.start_cursor))
        printl('form_data' + form_data)  # assumes form_data is a str -- TODO confirm
        printl('headers = ', end=' ')
        printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = \
            instaparser.parse_reply_ajax(self.__r.content)
        printl("<ContentReply response>")
        printl('self.__r.cookies=', end='')
        printl(self.__r.cookies)
        printl('start_cursor = ' + str(self.start_cursor))
        printl('has_previous = ', end='')
        printl(self.has_previous)
        printl("<ContentReply End>")
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        """Extract the post shortcode from a /p/<code>/ URL; raise if absent."""
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            # NOTE(review): message says 'Tag Error' but this is the post-code
            # path; kept as-is in case callers match on the text.
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        # merge, not replace: keep session cookies from earlier requests
        for k, v in cookies.items():
            self.cookies[k] = v
class InstaMainCrawler:
    """Top-level Instagram crawler: walks every search URL, pages through
    the result lists and stores each post (body + replies) via SendtoDB."""

    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        # kept for interface compatibility with the other crawlers
        pass

    def crawl_content(self, url, cookies, referer):
        """Crawl one post URL, collect its full reply thread and persist it."""
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        # older reply pages are prepended so replies stay in thread order
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(2)
        for order, reply in enumerate(replies):
            reply['article_url'] = body['article_url']
            reply['platform_id'] = body['platform_id']
            reply['article_order'] = order
        # delete-then-insert so re-crawling the same URL never duplicates rows
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Apply all runtime arguments in one call.

        browser is accepted for interface compatibility with the other
        crawlers; init_browser is currently a no-op here.
        """
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        pass

    def init_keyword_id(self, keyword_id):
        # keyword_id arrives as a string when passed on the command line
        self.keyword_id = keyword_id if isinstance(keyword_id, int) else int(keyword_id)
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def _crawl_elements(self, insta_list, list_crawler, backup_set, catch_errors):
        """Crawl every list entry that falls inside the begin/end day window.

        Returns False once more than 8 entries older than the window were
        seen in this batch (signal to stop paging), True otherwise.
        catch_errors=True swallows per-post failures (used on AJAX pages).

        BUGFIX: the old first-page loop reset the old-post counter inside
        the per-element loop, so its >8 early-stop could never trigger;
        the counter now lives outside the loop as in the AJAX path.
        """
        old_elements = 0
        for element in insta_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                # newer than the window: log and keep scanning
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
            elif element['date'].date() < self.crawl_init.get_begin_day():
                # older than the window: count, stop paging after 8 of them
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                old_elements += 1
                if old_elements > 8:
                    return False
            elif element['url'] not in backup_set:
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                wait(1.5)
                if catch_errors:
                    try:
                        self.crawl_content(element['url'], list_crawler.get_cookies(),
                                           list_crawler.get_url())
                    except Exception as e:
                        printl(e)
                else:
                    self.crawl_content(element['url'], list_crawler.get_cookies(),
                                       list_crawler.get_url())
                backup_set.add(element['url'])
        return True

    def crawler_start(self):
        """Main loop: crawl every search URL; in realtime mode repeat forever."""
        real_time = True
        while real_time:
            print_and_flush("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()  # post URLs already stored during this pass
            while i < len(url_list):
                list_crawler = None
                try:
                    printl(url_list[i] + "\n")
                    # first connect: plain HTML page (tag search or user page)
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    keep_going = self._crawl_elements(list_crawler.get_list(),
                                                      list_crawler, backup_set, False)
                    is_load_more = list_crawler.has_next and keep_going
                    # ajax load: page through the rest of the results
                    while is_load_more:
                        if end_cursor:
                            # resume from the cursor saved by a failed attempt
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(1)
                        insta_list = list_crawler.load_more()
                        printl("list length = " + str(len(insta_list)))
                        keep_going = self._crawl_elements(insta_list, list_crawler,
                                                          backup_set, True)
                        is_load_more = list_crawler.has_next and keep_going
                    i += 1
                except Exception as e:
                    # do not advance i: retry the same URL from the saved cursor
                    printl(e)
                    # BUGFIX: list_crawler is unbound if construction failed,
                    # and end_cursor may be None -- both crashed the handler.
                    if list_crawler is not None:
                        end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
        self.send_to_db.close()
        #self.driver.quit()