diff --git a/WebBasedCrawler/insta/instacrawl.py b/WebBasedCrawler/insta/instacrawl.py
index 986788b..e398252 100644
--- a/WebBasedCrawler/insta/instacrawl.py
+++ b/WebBasedCrawler/insta/instacrawl.py
@@ -722,8 +722,6 @@ class InstaContent:
         url = 'https://www.instagram.com/graphql/query/?query_id={}&shortcode={}&first={}&after={}'.format(
             self.query_id, self.__code, len(self.reply), self.start_cursor)
-        # url = self.__referer + "?max_id="+self.start_cursor
-        # self.log_load_reply_more_before(form_data, headers)
         self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies),
                                 proxies=self.proxies, timeout=requests_timeout, stream=True)
         content = requests_get(self.__r)
@@ -735,8 +733,6 @@ class InstaContent:
         self.reply += reply
         printl('{} - reply : {} (next : {})'.format(self.__referer, len(self.reply), self.start_cursor))
-        # self.log_load_reply_more_after()
-
         return self.reply
 
     def get_cookies(self):
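[Reviewer note] The surviving pagination path above now drives comment paging through the GraphQL endpoint with an explicit timeout and a streamed response. A minimal standalone sketch of that request pattern follows; QUERY_ID, SHORTCODE, and the header values are hypothetical placeholders, and the JSON parsing that the repo does in instaparser is left to the caller.

    # Sketch only: assumed placeholder ids; mirrors the rewritten request shape.
    import requests

    GRAPHQL_URL = 'https://www.instagram.com/graphql/query/'
    QUERY_ID = '17852405266163336'   # hypothetical; the repo reads this from self.query_id
    SHORTCODE = 'BXample123'         # hypothetical post shortcode (self.__code in the repo)

    def fetch_reply_page(session, after_cursor, first=20, timeout=30):
        # Explicit timeout plus stream=True so a stalled response cannot
        # hang the crawler indefinitely, as in the rewritten load_reply_more.
        params = {'query_id': QUERY_ID, 'shortcode': SHORTCODE,
                  'first': first, 'after': after_cursor}
        r = session.get(GRAPHQL_URL, params=params, timeout=timeout, stream=True)
        r.raise_for_status()
        return r.json()  # caller extracts the reply edges and the next cursor
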
diff --git a/WebBasedCrawler/insta/instacrawl_backup.py b/WebBasedCrawler/insta/instacrawl_backup.py
deleted file mode 100644
index ad0a3b7..0000000
--- a/WebBasedCrawler/insta/instacrawl_backup.py
+++ /dev/null
@@ -1,556 +0,0 @@
-#-*- coding: utf-8 -*-
-'''
-Created on 2015. 12. 8.
-
-@author: cococo
-'''
-import re
-import datetime
-
-from base.baseclasses import SendtoDB
-from base.baseclasses import print_and_flush
-from base.baseclasses import CrawlInit
-from base.baseclasses import wait
-from base.baseclasses import find_element_by_xpath
-from base.baseclasses import find_element_by_css_selector
-from base.baseclasses import enter_element
-from base.baseclasses import Browser
-from selenium.webdriver.common.action_chains import ActionChains
-
-insta_url = "https://www.instagram.com/"
-insta_tag_url = "https://www.instagram.com/explore/tags/"
-
-
-class InstaInit(CrawlInit):
-    def __init__(self, before_day=0):
-        super().__init__(before_day)
-        self.urls = dict()
-        self.urls[9] = insta_tag_url
-        self.urls[10] = insta_url
-
-    def split_searches(self):
-        search = self.searches()
-        splited_list = search.split(',')
-        trimmed_list = list()
-        if self.platform() == 10:
-            for x in splited_list:
-                trimmed_list.append(x.strip())
-        else:
-            for x in splited_list:
-                trimmed_list.append(self.utf8(x))
-        return trimmed_list
-
-    def make_url(self):
-        urls = list()
-        for x in self.split_searches():
-            url = self.urls[self.platform()] + x
-            urls.append(url)
-        return urls
-
-    def get_begin_day(self):
-        if self.is_realtime():
-            date_now = datetime.datetime.now()
-            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
-            result += datetime.timedelta(days=self.before_day)
-            return result
-        else:
-            return self.start_day()
-
-    def get_end_day(self):
-        if self.is_realtime():
-            date_now = datetime.datetime.now()
-            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
-            return result
-        else:
-            return self.end_day()
-
-
-class InstaBodyCrawler:
-    def __init__(self, driver=None):
-        self.driver = driver
-        self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
-
-    def set_driver(self, driver):
-        self.driver = driver
-
-    def set_article(self, article=None):
-        if article is None:
-            try:
-                self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
-            except Exception as e:
-                print_and_flush(e)
-                raise Exception
-        else:
-            self.article = article
-
-    def find_article_url(self):
-        a = self.article.find_element_by_xpath("div/section/a")
-        return a.get_attribute("href")
-
-    def find_article_profileurl(self):
-        img = self.article.find_element_by_xpath("header/a/img[@src]")
-        return img.get_attribute("src")
-
-    def find_article_nickname(self):
-        a = self.article.find_element_by_xpath("header/div/a")
-        return a.text
-
-    def find_article_date(self):
-        el_time = self.article.find_element_by_xpath("div/section/a/time")
-        str_time = el_time.get_attribute("datetime")
-        m = self.re_date.search(str_time)
-        if m is None:
-            return "0000-00-00 00:00:00"
-        else:
-            return m.group(1) + " " + m.group(2)
-
-    def find_article_data(self):
-        ul = self.article.find_element_by_xpath("div/ul")
-        try:
-            #li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
-            span = ul.find_element_by_css_selector("li h1>span")
-            return span.text
-        except:
-            return ""
-
-    def find_article_id(self):
-        return self.find_platform_id()
-
-    def find_platform_name(self):
-        return 'instagram'
-
-    def find_article_form(self):
-        return 'body'
-
-    def find_platform_id(self):
-        a = self.article.find_element_by_xpath("header/div/a")
-        if a:
-            href = a.get_attribute("href")
-            str_id = href.replace(insta_url, "").replace("/", "")
-            return str_id
-        else:
-            return None
-
-    def find_like_num(self):
-        div = self.article.find_element_by_xpath("div/section[1]/div")
-        try:
-            span = div.find_element_by_xpath("span/span")
-            str_num = span.text
-            str_num = str_num.replace(',', '')
-            if str_num[-1] == 'm':
-                num = float(str_num[0:-1]) * 1000000
-            elif str_num[-1] == 'k':
-                num = float(str_num[0:-1]) * 1000
-            else:
-                num = int(str_num)
-            return str(num)
-        except:
-            a_list = div.find_elements_by_tag_name("a")
-            if len(a_list) > 1:
-                return str(len(a_list))
-            else:
-                if a_list and a_list[0].get_attribute('title'):
-                    return str(1)
-                else:
-                    return str(0)
-            # span = div.find_element_by_xpath("span[1]")
-            # if len(span.text.strip()) < 1:
-            #     return str(1)
-            # else:
-            #     return str(0)
-
-    def find_reply_num(self):
-        ul = self.article.find_element_by_xpath("div/ul")
-        lis = ul.find_elements_by_tag_name("li")
-        if len(lis) < 2:
-            return "0"
-        try:
-            li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
-            span = li.find_element_by_xpath("button/span[2]")
-            str_num = span.text.replace(",", "")
-            return str_num
-        except:
-            return str(len(lis) - 1)
-
-    def get_content(self):
-        content = dict()
-        content["article_id"] = self.find_article_id()
-        content["platform_id"] = self.find_platform_id()
-        content["article_url"] = self.find_article_url()
-        content["article_profileurl"] = self.find_article_profileurl()
-        content["article_nickname"] = self.find_article_nickname()
-        content["platform_name"] = self.find_platform_name()
-        content["article_date"] = self.find_article_date()
-        content["article_data"] = self.find_article_data()
-        content["article_form"] = 'body'
-        content["platform_form"] = 'post'
-        content["platform_title"] = content["article_id"]
-        reply_num = self.find_reply_num()
-        if int(reply_num) > 0:
-            content["article_order"] = int(reply_num)
-        like_num = self.find_like_num()
-        if int(float(like_num)) > 0:
-            content["reply_url"] = int(float(like_num))
-        return content
-
-    def find_platform_title(self):
-        pass
-
-    def find_article_title(self):
-        pass
-
-
-class InstaReplyCrawler:
-    def __init__(self, driver=None, article=None):
-        self.driver = driver
-        self.activity = article
-        self.reply_list = list()
-
-    def find_init(self):
-        self.reply_list.clear()
-
-    def set_driver(self, driver):
-        self.driver = driver
-
-    def set_article(self, article=None):
-        if article is None:
-            try:
-                self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
-            except Exception as e:
-                print_and_flush(e)
-                raise Exception
-        else:
-            self.article = article
-
-    def has_more(self, ul):
-        try:
-            button = ul.find_element_by_css_selector("li>button")
-            return True
-        except Exception as e:
-            return False
-
-    def read_more_reply(self, ul):
-        try:
-            button = ul.find_element_by_css_selector("li>button")
-            enter_element(button)
-        except Exception as e:
-            print_and_flush(e)
-
-    def read_all_reply(self, ul):
-        i = 0
-        while i < 200 and self.has_more(ul):
-            self.read_more_reply(ul)
-            i += 1
-        # for i in range(0, 10):
-        #     if self.has_more(ul):
-        #         self.read_more_reply(ul)
-        #     else:
-        #         break
-
-    def get_reply_ul(self):
-        ul = self.article.find_element_by_xpath("div/ul")
-        return ul
-
-    def has_reply(self, ul):
-        try:
-            lis = ul.find_elements_by_css_selector("li>a")
-            if len(lis) > 0:
-                return True
-        except:
-            return False
-        return False
-
-    def crawl_all(self):
-        self.find_init()
-        self.set_article()
-        try:
-            ul = self.get_reply_ul()
-            if self.has_reply(ul):
-                self.read_all_reply(ul)
-                self.crawl_reply(ul)
-        except Exception as e:
-            print_and_flush(e)
-
-    def crawl_reply(self, ul):
-        article_data = self.find_article_data(ul)
-        article_id = self.find_article_id(ul)
-        if len(article_data) != len(article_id):
-            print_and_flush("article_data != article_id")
-        for i in range(0, len(article_id)):
-            content = dict()
-            content["article_data"] = article_data[i]
-            content["article_id"] = article_id[i]
-            content["article_nickname"] = article_id[i]
-            content["platform_name"] = "instagram"
-            content["platform_form"] = "post"
-            content["article_form"] = 'reply'
-            content["article_order"] = i
-            self.reply_list.append(content)
-
-    def get_content(self):
-        return self.reply_list
-
-    def find_article_id(self, ul):
-        id_list = list()
-        a_list = ul.find_elements_by_xpath("li/a")
-        for i in a_list:
-            id_list.append(i.text)
-        return id_list
-
-    def find_article_profileurl(self, ul):
-        pass
-
-    def find_article_nickname(self, ul):
-        return self.find_article_id(ul)
-
-    def find_article_data(self, ul):
-        data_list = list()
-        span_list = ul.find_elements_by_css_selector("li>span")
-        for i in span_list:
-            data_list.append(i.text)
-        return data_list
-
-    def find_article_url(self, ul):
-        pass
-
-    def find_platform_id(self, ul):
-        pass
-
-    def find_article_form(self, ul=None):
-        return 'reply'
-
-    def find_platform_name(self, ul=None):
-        return 'instagram'
-
-    def find_platform_form(self, ul=None):
-        return 'post'
-
-    def click_element(self, element):
-        ac = ActionChains(self.driver)
-        ac.move_to_element_with_offset(element, 0, 0).click().perform()
-        wait(2)
-
-
-class InstaPageCrawler:
-    def __init__(self, driver=None, begin_date=None, end_date=None):
-        self.driver = driver
-        self.url_set = set()
-        self.begin_date = begin_date
-        self.end_date = end_date
-        self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
-
-    def set_driver(self, driver):
-        self.driver = driver
-
-    def find_article_url(self):
-        a = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article/div/section/a', 60)
-        return a.get_attribute("href")
-
-    def init(self):
-        self.url_set.clear()
-
-    def set_date(self, begin_date, end_date):
-        self.set_begin_date(begin_date)
-        self.set_end_date(end_date)
-
-    def set_end_date(self, end_date):
-        if type(end_date) == str:
-            self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
-        elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
-            self.end_date = end_date
-        else:
-            self.end_date = datetime.datetime.today()
-        self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
-        self.end_date += datetime.timedelta(days=1)
-
-    def set_begin_date(self, begin_date):
-        if type(begin_date) == str:
-            self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
-        elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
-            self.begin_date = begin_date
-        else:
-            self.begin_date = datetime.datetime.today()
-        self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)
-
-    def has_next(self):
-        try:
-            a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow'", 30)
-            return True
-        except:
-            return False
-
-    def move_next(self):
-        try:
-            a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow'", 30)
-            enter_element(a)
-            return True
-        except:
-            return False
-
-    def has_first_page(self):
-        try:
-            #a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
-            #a = find_element_by_css_selector(self.driver, "a[class='_8mlbc _vbtk2 _t5r8b']")
-            a = find_element_by_css_selector(self.driver, "div[class='_8fxp6'] a[class='_8mlbc _vbtk2 _t5r8b']")
-            enter_element(a)
-            return True
-        except:
-            return False
-
-    def crawling_ok(self, url):
-        self.url_set.add(url)
-
-    def is_earlier(self, time_date):
-        return True if time_date < self.begin_date else False
-
-    def is_late(self, time_date):
-        return True if time_date > self.end_date else False
-
-    def find_article_date(self):
-        el_time = find_element_by_xpath(self.driver, "/html/body/div/div/div/div/article/div/section/a/time", 60)
-        str_time = el_time.get_attribute("datetime")
-        m = self.re_date.search(str_time)
-        if m is None:
-            return "0000-00-00 00:00:00"
-        else:
-            return m.group(1) + " " + m.group(2)
-
-
-class InstaMainCrawler:
-    def __init__(self):
-        self.page_crawler = InstaPageCrawler()
-        self.body_crawler = InstaBodyCrawler()
-        self.reply_crawler = InstaReplyCrawler()
-        self.send_to_db = SendtoDB()
-        self.browser = Browser()
-        self.crawl_init = InstaInit()
-        self.driver = None
-
-    def set_driver(self, driver):
-        self.page_crawler.set_driver(driver)
-        self.body_crawler.set_driver(driver)
-        self.reply_crawler.set_driver(driver)
-        self.driver = driver
-
-    def set_keyword_id(self, keyword_id):
-        self.keyword_id = keyword_id
-
-    def crawl_all(self, backup_set=None):
-        self.page_crawler.init()
-        if backup_set:
-            self.page_crawler.url_set = backup_set.copy()
-        if not self.page_crawler.has_first_page():
-            return
-        while True:
-            str_date = self.page_crawler.find_article_date()
-            date_val = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
-            print_and_flush(str_date)
-            if self.page_crawler.find_article_url() in self.page_crawler.url_set:
-                if self.page_crawler.has_next():
-                    self.page_crawler.move_next()
-                    continue
-                else:
-                    break
-            # if self.page_crawler.is_earlier(date_val.date()):
-            if self.page_crawler.is_late(date_val):
-                if self.page_crawler.has_next():
-                    self.page_crawler.move_next()
-                    continue
-                else:
-                    break
-            # if self.page_crawler.is_late(date_val.date()):
-            if self.page_crawler.is_earlier(date_val):
-                break
-            try:
-                wait(3)
-                body_content = self.crawl_body()
-                self.crawl_reply(body_content)
-                self.page_crawler.url_set.add(body_content["article_url"])
-                print_and_flush("ok")
-            except Exception as e:
-                print_and_flush('fail')
-                print_and_flush(e)
-            if self.page_crawler.has_next():
-                self.page_crawler.move_next()
-            else:
-                break
-
-    def crawl_body(self):
-        self.body_crawler.set_driver(self.driver)
-        self.body_crawler.set_article()
-        content = self.body_crawler.get_content()
-        content["keyword_id"] = self.keyword_id
-        print_and_flush(content["article_url"])
-        self.send_to_db.delete_url(content['article_url'])
-        self.send_to_db.send_body(content)
-        return content
-
-    def crawl_reply(self, body_content):
-        self.reply_crawler.set_driver(self.driver)
-        self.reply_crawler.crawl_all()
-        content_list = self.reply_crawler.get_content()
-        if content_list:
-            for i in content_list:
-                i['article_url'] = body_content['article_url']
-                i['platform_id'] = body_content['platform_id']
-            self.send_to_db.send_reply(content_list)
-
-    def start(self):
-        self.crawler_start()
-
-    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
-        self.init_browser(browser)
-        self.init_keyword_id(keyword_id)
-        self.init_db(db_num)
-        self.init_before_day(before_day)
-        self.init_until_page(until_page)
-
-    def init_browser(self, browser):
-        self.set_driver(self.browser.get_new_driver(browser))
-
-    def init_keyword_id(self, keyword_id):
-        if type(keyword_id) != int:
-            self.keyword_id = int(keyword_id)
-        else:
-            self.keyword_id = keyword_id
-        self.crawl_init.get_keyword_parameters(keyword_id)
-        self.crawl_init.disconnect()
-
-    def init_db(self, db_num):
-        self.send_to_db.set_db(db_num)
-
-    def init_before_day(self, before_day):
-        self.crawl_init.set_before_day(before_day)
-
-    def init_until_page(self, until_page):
-        self.crawl_init.set_until_page(until_page)
-
-    def crawler_start(self):
-        real_time = True
-        while real_time:
-            print_and_flush("Crawling Start")
-            url_list = self.crawl_init.make_url()
-            i = 0
-            backup_set = set()
-            while i < len(url_list):
-                try:
-                    print_and_flush(url_list[i] + "\n")
-                    wait(3)
-                    self.driver.get(url_list[i])
-                    wait(5)
-                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
-                                               end_date=self.crawl_init.get_end_day())
-                    if self.page_crawler.has_first_page():
-                        self.crawl_all(backup_set)
-                    i += 1
-                    backup_set.clear()
-                except Exception as e:
-                    print_and_flush(e)
-                    backup_set = self.page_crawler.url_set.copy()
-                    self.driver.quit()
-                    self.set_driver(self.browser.new_browser())
-                    wait(5)
            real_time = self.crawl_init.is_realtime()
-        print_and_flush("Finished Crawling :)")
-        self.send_to_db.close()
-        #self.driver.quit()
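[Reviewer note] The deleted Selenium crawler above bounded its walk with a begin/end window: set_begin_date truncates to midnight, and set_end_date truncates and then adds one day so the entire end date stays inside the window. A minimal sketch of that window arithmetic, with the function name being mine:

    import datetime

    def day_window(begin, end):
        # Truncate both bounds to midnight, as the deleted set_begin_date /
        # set_end_date did, then push the end bound forward one day so the
        # whole final day is included.
        begin = datetime.datetime(begin.year, begin.month, begin.day)
        end = datetime.datetime(end.year, end.month, end.day) + datetime.timedelta(days=1)
        return begin, end

    b, e = day_window(datetime.datetime(2016, 5, 1, 13, 0), datetime.datetime(2016, 5, 3, 9, 30))
    assert b == datetime.datetime(2016, 5, 1) and e == datetime.datetime(2016, 5, 4)
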
diff --git a/WebBasedCrawler/insta/instacrawl_backup2.py b/WebBasedCrawler/insta/instacrawl_backup2.py
deleted file mode 100644
index 88bd3e7..0000000
--- a/WebBasedCrawler/insta/instacrawl_backup2.py
+++ /dev/null
@@ -1,426 +0,0 @@
-#-*- coding: utf-8 -*-
-'''
-Created on 2015. 12. 8.
-
-@author: cococo
-'''
-import re
-import datetime
-import insta.instaparser as instaparser
-import insta.instaheaders as instaheaders
-import requests
-
-from base.baseclasses import SendtoDB
-from base.baseclasses import print_and_flush
-from base.baseclasses import CrawlInit
-from base.baseclasses import wait
-
-
-def printl(*objects, sep=' ', end='\n', file=None, flush=True):
-    print(*objects, sep=sep, end=end, file=file, flush=flush)
-
-insta_url = "https://www.instagram.com/"
-insta_tag_url = "https://www.instagram.com/explore/tags/"
-insta_query = "https://www.instagram.com/query/"
-insta_body_url = 'https://www.instagram.com/p/'
-
-
-class InstaInit(CrawlInit):
-    def __init__(self, before_day=0):
-        super().__init__(before_day)
-        self.urls = dict()
-        self.urls[9] = insta_tag_url
-        self.urls[10] = insta_url
-
-    def split_searches(self):
-        search = self.searches()
-        splited_list = search.split(',')
-        trimmed_list = list()
-        if self.platform() == 10:
-            for x in splited_list:
-                trimmed_list.append(x.strip())
-        else:
-            for x in splited_list:
-                trimmed_list.append(self.utf8(x))
-        return trimmed_list
-
-    def make_url(self):
-        urls = list()
-        for x in self.split_searches():
-            url = self.urls[self.platform()] + x
-            urls.append(url)
-        return urls
-
-    def get_begin_day(self):
-        if self.is_realtime():
-            date_now = datetime.datetime.now()
-            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
-            result += datetime.timedelta(days=self.before_day)
-            return result
-        else:
-            return self.start_day()
-
-    def get_end_day(self):
-        if self.is_realtime():
-            date_now = datetime.datetime.now()
-            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
-            return result
-        else:
-            return self.end_day()
-
-
-class ListTag:
-    def __init__(self, url):
-        self.__r = None
-        self.__tag = ''
-        self.__url = ''
-        self.list_tag = []
-        self.end_cursor = None
-        self.has_next = False
-        self.cookies = {}
-        self.load_url(url)
-
-    def load_url(self, url):
-        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
-        printl("")
-        printl("")
-        printl('headers = ', end=' ')
-        printl(instaheaders.get_headers_for_list_html())
-
-        self.__r.raise_for_status()
-        self.__tag = self.__get_tag(url)
-        self.__set_cookies(self.__r.cookies)
-        self.__url = url
-        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
-        printl("")
-        printl('self.__r.cookies=', end='')
-        printl(self.__r.cookies)
-        printl('end_cursor = ' + str(self.end_cursor))
-        printl('has_next = ', end='')
-        printl(self.has_next)
-        printl("")
-
-        return self.list_tag
-
-    def load_more(self):
-        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, 12)
-        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
-        printl("")
-        printl("")
-        printl('end_cursor = ' + str(self.end_cursor))
-        printl('form_data' + form_data)
-        printl('headers = ', end=' ')
-        printl(headers)
-        self.__r = requests.post(insta_query, headers=headers, data=form_data)
-        self.__set_cookies(self.__r.cookies)
-        self.__r.raise_for_status()
-        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
-        printl("")
-        printl('self.__r.cookies=', end='')
-        printl(self.__r.cookies)
-        printl('end_cursor = ' + str(self.end_cursor))
-        printl('has_next = ', end='')
-        printl(self.has_next)
-        printl("")
-        return self.list_tag
-
-    def __get_tag(self, url):
-        m = re.search(insta_tag_url + "([^/]*)", url)
-        if m:
-            return m.group(1)
-        else:
-            raise RuntimeError('Tag Error')
-
-    def get_cookies(self):
-        return self.cookies
-
-    def get_url(self):
-        return self.__url
-
-    def set_end_cursor(self, cursor):
-        self.end_cursor = cursor
-
-    def get_end_cursor(self):
-        return self.end_cursor
-
-    def __set_cookies(self, cookies):
-        for k, v in cookies.items():
-            self.cookies[k] = v
-
-    def get_list(self):
-        return self.list_tag
-
-
-class ListUser:
-    def __init__(self, url):
-        self.__r = None
-        self.__user = ''
-        self.__url = ''
-        self.list_user = []
-        self.end_cursor = None
-        self.has_next = False
-        self.cookies = {}
-        self.load_url(url)
-
-    def load_url(self, url):
-        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
-        self.__r.raise_for_status()
-        self.__url = url
-        self.__set_cookies(self.__r.cookies)
-        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
-        return self.list_user
-
-    def load_more(self):
-        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, 24)
-        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
-        printl("")
-        printl("")
-        printl('end_cursor = ' + str(self.end_cursor))
-        printl('form_data' + form_data)
-        printl('headers = ', end=' ')
-        printl(headers)
-        self.__r = requests.post(insta_query, headers=headers, data=form_data)
-        self.__r.raise_for_status()
-        self.__set_cookies(self.__r.cookies)
-        printl("")
-        printl('self.__r.cookies=', end='')
-        printl(self.__r.cookies)
-        printl('end_cursor = ' + str(self.end_cursor))
-        printl('has_next = ', end='')
-        printl(self.has_next)
-        printl("")
-
-        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
-        return self.list_user
-
-    def get_cookies(self):
-        return self.cookies
-
-    def get_url(self):
-        return self.__url
-
-    def set_end_cursor(self, cursor):
-        self.end_cursor = cursor
-
-    def get_end_cursor(self):
-        return self.end_cursor
-
-    def __set_cookies(self, cookies):
-        for k, v in cookies.items():
-            self.cookies[k] = v
-
-    def get_list(self):
-        return self.list_user
-
-
-class InstaContent:
-    def __init__(self, url, cookies, referer):
-        self.__r = None
-        self.__referer = ''
-        self.__code = ''
-        self.body = None
-        self.reply = []
-        self.start_cursor = None
-        self.has_previous = False
-        self.cookies = {}
-        self.load_url(url, cookies, referer)
-
-    def load_url(self, url, cookies, referer):
-        self.__set_cookies(cookies)
-        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
-        self.__r.raise_for_status()
-        self.__referer = referer
-        self.__code = self.__get_code(url)
-        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
-        return self.body, self.reply
-
-    def get_body(self):
-        return self.body
-
-    def get_reply(self):
-        return self.reply
-
-    def load_reply_more(self):
-        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, 20)
-        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
-        printl("")
-        printl("")
-        printl('start_cursor = ' + self.start_cursor)
-        printl('form_data' + form_data)
-        printl('headers = ', end=' ')
-        printl(headers)
-        self.__r = requests.post(insta_query, headers=headers, data=form_data)
-        self.__r.raise_for_status()
-        self.__set_cookies(self.__r.cookies)
-        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
-        printl("")
-        printl('self.__r.cookies=', end='')
-        printl(self.__r.cookies)
-        printl('start_cursor = ' + str(self.start_cursor))
-        printl('has_previous = ', end='')
-        printl(self.has_previous)
-        printl("")
-        return self.reply
-
-    def get_cookies(self):
-        return self.cookies
-
-    def __get_code(self, url):
-        m = re.search(insta_body_url + "([^/]*)", url)
-        if m:
-            return m.group(1)
-        else:
-            raise RuntimeError('Tag Error')
-
-    def __set_cookies(self, cookies):
-        for k, v in cookies.items():
-            self.cookies[k] = v
-
-
-class InstaMainCrawler:
-    def __init__(self):
-        self.send_to_db = SendtoDB()
-        self.crawl_init = InstaInit()
-
-    def set_keyword_id(self, keyword_id):
-        self.keyword_id = keyword_id
-
-    def crawl_all(self, backup_set=None):
-        pass
-
-    def crawl_content(self, url, cookies, referer):
-        content = InstaContent(url, cookies, referer)
-        body = content.get_body()
-        replies = content.get_reply()
-        body['article_url'] = url
-        body['keyword_id'] = self.keyword_id
-        #printl(body['article_url'])
-        while content.has_previous:
-            replies = content.load_reply_more() + replies
-            wait(2)
-        for j in range(0, len(replies)):
-            replies[j]['article_url'] = body['article_url']
-            replies[j]['platform_id'] = body['platform_id']
-            replies[j]['article_order'] = j
-        self.send_to_db.delete_url(body['article_url'])
-        self.send_to_db.send_body(body)
-        if replies:
-            self.send_to_db.send_reply(replies)
-        printl('ok')
-        printl()
-
-    def start(self):
-        self.crawler_start()
-
-    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
-        self.init_keyword_id(keyword_id)
-        self.init_db(db_num)
-        self.init_before_day(before_day)
-        self.init_until_page(until_page)
-
-    def init_browser(self, browser):
-        pass
-
-    def init_keyword_id(self, keyword_id):
-        if type(keyword_id) != int:
-            self.keyword_id = int(keyword_id)
-        else:
-            self.keyword_id = keyword_id
-        self.crawl_init.get_keyword_parameters(keyword_id)
-        self.crawl_init.disconnect()
-
-    def init_db(self, db_num):
-        self.send_to_db.set_db(db_num)
-
-    def init_before_day(self, before_day):
-        self.crawl_init.set_before_day(before_day)
-
-    def init_until_page(self, until_page):
-        self.crawl_init.set_until_page(until_page)
-
-    def crawler_start(self):
-        real_time = True
-
-        while real_time:
-            print_and_flush("Crawling Start")
-            url_list = self.crawl_init.make_url()
-            i = 0
-            end_cursor = None
-            backup_set = set()
-            while i < len(url_list):
-                # first connect
-                try:
-                    printl(url_list[i] + "\n")
-                    if insta_tag_url in url_list[i]:
-                        list_crawler = ListTag(url_list[i])
-                    else:
-                        list_crawler = ListUser(url_list[i])
-                    wait(1)
-                    insta_list = list_crawler.get_list()
-                    is_load_more = list_crawler.has_next
-                    for element in insta_list:
-                        old_elements = 0
-                        if element['date'].date() > self.crawl_init.get_end_day():
-                            printl(element['url'])
-                            printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-                            continue
-                        elif element['date'].date() < self.crawl_init.get_begin_day():
-                            printl(element['url'])
-                            printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-                            old_elements += 1
-                            if old_elements > 8:
-                                is_load_more = False
-                                break
-                        else:
-                            if not element['url'] in backup_set:
-                                printl(element['url'])
-                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-                                wait(1.5)
-                                self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
-                                backup_set.add(element['url'])
-
-                    # ajax load
-                    while is_load_more:
-                        if end_cursor:
-                            list_crawler.end_cursor = end_cursor
-                            end_cursor = None
-                        wait(1)
-                        insta_list = list_crawler.load_more()
-                        is_load_more = list_crawler.has_next
-                        old_elements = 0
-                        printl("list length = " + str(len(insta_list)))
-                        for element in insta_list:
-                            if element['date'].date() > self.crawl_init.get_end_day():
-                                printl(element['url'])
-                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-                                continue
-                            elif element['date'].date() < self.crawl_init.get_begin_day():
-                                printl(element['url'])
-                                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-
-                                old_elements += 1
-                                if old_elements > 8:
-                                    is_load_more = False
-                                    break
-                            else:
-                                if not element['url'] in backup_set:
-                                    printl(element['url'])
-                                    printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-                                    wait(1.5)
-                                    try:
-                                        self.crawl_content(element['url'], list_crawler.get_cookies(),
-                                                           list_crawler.get_url())
-                                    except Exception as e:
-                                        printl(e)
-                                    backup_set.add(element['url'])
-                    i += 1
-                except Exception as e:
-                    printl(e)
-                    end_cursor = list_crawler.end_cursor
-                    printl('end_cursor=' + end_cursor)
-                    if e.args:
-                        wait(300)
-            real_time = self.crawl_init.is_realtime()
-        printl("Finished Crawling :)")
-        self.send_to_db.close()
-        #self.driver.quit()
diff --git a/WebBasedCrawler/insta/instacrawl_backup3.py b/WebBasedCrawler/insta/instacrawl_backup3.py
deleted file mode 100644
index 7b114b0..0000000
--- a/WebBasedCrawler/insta/instacrawl_backup3.py
+++ /dev/null
@@ -1,603 +0,0 @@
-#-*- coding: utf-8 -*-
-'''
-Created on 2015. 12. 8.
-
-@author: cococo
-'''
-import re
-import datetime
-import insta.instaparser as instaparser
-import insta.instaheaders as instaheaders
-import requests
-import logging
-
-
-from base.baseclasses import SendtoDB
-from base.baseclasses import CrawlInit
-from base.baseclasses import wait
-from base.baseclasses import Browser
-from selenium.webdriver.common.keys import Keys
-from base.baseclasses import enter_element
-
-
-def printl(*objects, sep=' ', end='\n', file=None, flush=True):
-    print(*objects, sep=sep, end=end, file=file, flush=flush)
-
-insta_url = "https://www.instagram.com/"
-insta_tag_url = "https://www.instagram.com/explore/tags/"
-insta_query = "https://www.instagram.com/query/"
-insta_body_url = 'https://www.instagram.com/p/'
-
-is_debuging = False
-
-num_of_list_ajax = 24
-num_of_reply_ajax = 100
-list_wait_sec = 0.9
-body_wait_sec = 0.5
-reply_wait_sec = 0.8
-num_of_page_down = 20
-
-
-logging.basicConfig(level=logging.INFO,
-                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
-logging.getLogger('requests').setLevel(logging.WARNING)
-logging.getLogger('pymysql').setLevel(logging.WARNING)
-
-
-def click_insta_load_more(driver):
-    element = driver.find_element_by_css_selector("div._pupj3 > a")
-    enter_element(element)
-
-
-def push_page_down(driver):
-    body = driver.find_element_by_tag_name('body')
-    body.send_keys(Keys.PAGE_DOWN)
-
-
-def focus_driver(driver):
-    position = driver.get_window_position()
-    size = driver.get_window_size()
-    driver.maximize_window()
-    driver.set_window_size(size['width'], size["height"])
-    driver.set_window_position(position['x'], position['y'])
-
-
-class InstaInit(CrawlInit):
-    def __init__(self, before_day=0):
-        super().__init__(before_day)
-        self.urls = dict()
-        self.urls[9] = insta_tag_url
-        self.urls[10] = insta_url
-
-    def split_searches(self):
-        search = self.searches()
-        splited_list = search.split(',')
-        trimmed_list = list()
-        if self.platform() == 10:
-            for x in splited_list:
-                trimmed_list.append(x.strip())
-        else:
-            for x in splited_list:
-                trimmed_list.append(self.utf8(x))
-        return trimmed_list
-
-    def make_url(self):
-        urls = list()
-        for x in self.split_searches():
-            url = self.urls[self.platform()] + x
-            urls.append(url)
-        return urls
-
-    def get_begin_day(self):
-        if self.is_realtime():
-            date_now = datetime.datetime.now()
-            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
-            result += datetime.timedelta(days=self.before_day)
-            return result.date()
-        else:
-            return self.start_day()
-
-    def get_end_day(self):
-        if self.is_realtime():
-            date_now = datetime.datetime.now()
-            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
-            return result.date()
-        else:
-            return self.end_day()
-
-
-class ListTag:
-    def __init__(self, url):
-        self.__r = None
-        self.__tag = ''
-        self.__url = ''
-        self.list_tag = []
-        self.end_cursor = None
-        self.has_next = False
-        self.cookies = {}
-        self.load_url(url)
-
-    def load_url(self, url):
-        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
-        if is_debuging:
-            printl("")
-            printl("")
-            printl('headers = ', end=' ')
-            printl(instaheaders.get_headers_for_list_html())
-
-        self.__r.raise_for_status()
-        self.__tag = self.__get_tag(url)
-        self.__set_cookies(self.__r.cookies)
-        self.__url = url
-        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
-        if is_debuging:
-            printl("")
-            printl('self.__r.cookies=', end='')
-            printl(self.__r.cookies)
-            printl('end_cursor = ' + str(self.end_cursor))
-            printl('has_next = ', end='')
-            printl(self.has_next)
-            printl("")
-
-        return self.list_tag
-
-    def load_more(self):
-        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
-        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
-        if is_debuging:
-            printl("")
-            printl("")
-            printl('end_cursor = ' + str(self.end_cursor))
-            printl('form_data' + form_data)
-            printl('headers = ', end=' ')
-            printl(headers)
-        self.__r = requests.post(insta_query, headers=headers, data=form_data)
-        self.__set_cookies(self.__r.cookies)
-        self.__r.raise_for_status()
-        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
-        if is_debuging:
-            printl("")
-            printl('self.__r.cookies=', end='')
-            printl(self.__r.cookies)
-            printl('end_cursor = ' + str(self.end_cursor))
-            printl('has_next = ', end='')
-            printl(self.has_next)
-            printl("")
-        return self.list_tag
-
-    def __get_tag(self, url):
-        m = re.search(insta_tag_url + "([^/]*)", url)
-        if m:
-            return m.group(1)
-        else:
-            raise RuntimeError('Tag Error')
-
-    def get_cookies(self):
-        return self.cookies
-
-    def get_url(self):
-        return self.__url
-
-    def set_end_cursor(self, cursor):
-        self.end_cursor = cursor
-
-    def get_end_cursor(self):
-        return self.end_cursor
-
-    def __set_cookies(self, cookies):
-        for k, v in cookies.items():
-            self.cookies[k] = v
-
-    def get_list(self):
-        return self.list_tag
-
-
-class ListUser:
-    def __init__(self, url):
-        self.__r = None
-        self.__user = ''
-        self.__url = ''
-        self.list_user = []
-        self.end_cursor = None
-        self.has_next = False
-        self.cookies = {}
-        self.load_url(url)
-
-    def load_url(self, url):
-        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
-        self.__r.raise_for_status()
-        self.__url = url
-        self.__set_cookies(self.__r.cookies)
-        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
-        return self.list_user
-
-    def load_more(self):
-        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
-        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
-        if is_debuging:
-            printl("")
-            printl("")
-            printl('end_cursor = ' + str(self.end_cursor))
-            printl('form_data' + form_data)
-            printl('headers = ', end=' ')
-            printl(headers)
-        self.__r = requests.post(insta_query, headers=headers, data=form_data)
-        self.__r.raise_for_status()
-        self.__set_cookies(self.__r.cookies)
-        if is_debuging:
-            printl("")
-            printl('self.__r.cookies=', end='')
-            printl(self.__r.cookies)
-            printl('end_cursor = ' + str(self.end_cursor))
-            printl('has_next = ', end='')
-            printl(self.has_next)
-            printl("")
-
-        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
-        return self.list_user
-
-    def get_cookies(self):
-        return self.cookies
-
-    def get_url(self):
-        return self.__url
-
-    def set_end_cursor(self, cursor):
-        self.end_cursor = cursor
-
-    def get_end_cursor(self):
-        return self.end_cursor
-
-    def __set_cookies(self, cookies):
-        for k, v in cookies.items():
-            self.cookies[k] = v
-
-    def get_list(self):
-        return self.list_user
-
-
-class InstaContent:
-    def __init__(self, url, cookies, referer):
-        self.__r = None
-        self.__referer = ''
-        self.__code = ''
-        self.body = None
-        self.reply = []
-        self.start_cursor = None
-        self.has_previous = False
-        self.cookies = {}
-        self.load_url(url, cookies, referer)
-
-    def load_url(self, url, cookies, referer):
-        self.__set_cookies(cookies)
-        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
-        self.__r.raise_for_status()
-        self.__referer = referer
-        self.__code = self.__get_code(url)
-        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
-        self.__set_cookies(self.__r.cookies)
-        return self.body, self.reply
-
-    def get_body(self):
-        return self.body
-
-    def get_reply(self):
-        return self.reply
-
-    def load_reply_more(self):
-        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
-        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
-        if is_debuging:
-            printl("")
-            printl("")
-            printl('start_cursor = ' + self.start_cursor)
-            printl('form_data' + form_data)
-            printl('headers = ', end=' ')
-            printl(headers)
-        self.__r = requests.post(insta_query, headers=headers, data=form_data)
-        self.__r.raise_for_status()
-        self.__set_cookies(self.__r.cookies)
-        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
-        if is_debuging:
-            printl("")
-            printl('self.__r.cookies=', end='')
-            printl(self.__r.cookies)
-            printl('start_cursor = ' + str(self.start_cursor))
-            printl('has_previous = ', end='')
-            printl(self.has_previous)
-            printl("")
-        return self.reply
-
-    def get_cookies(self):
-        return self.cookies
-
-    def __get_code(self, url):
-        m = re.search(insta_body_url + "([^/]*)", url)
-        if m:
-            return m.group(1)
-        else:
-            raise RuntimeError('Tag Error')
-
-    def __set_cookies(self, cookies):
-        for k, v in cookies.items():
-            self.cookies[k] = v
-
-
-class InstaAlgorithm:
-    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
-                 reload_wait_second=2, num_of_load_content=12, page_down=50):
-        self.send_to_db = send_to_db
-        self.crawl_init = crawl_init
-        self.browser = browser
-        self.driver = driver
-        self.keyword_id = keyword_id
-        self.reload_wait_second = reload_wait_second
-        self.num_of_load_content = num_of_load_content
-        self.page_down = page_down
-        self.list_crawl = []
-
-    def crawl_content(self, url, cookies, referer):
-        content = InstaContent(url, cookies, referer)
-        body = content.get_body()
-        replies = content.get_reply()
-        body['article_url'] = url
-        body['keyword_id'] = self.keyword_id
-        # printl(body['article_url'])
-        while content.has_previous:
-            replies = content.load_reply_more() + replies
-            wait(reply_wait_sec)
-        for j in range(0, len(replies)):
-            replies[j]['article_url'] = body['article_url']
-            replies[j]['platform_id'] = body['platform_id']
-            replies[j]['article_order'] = j
-        self.send_to_db.delete_url(body['article_url'])
-        self.send_to_db.send_body(body)
-        if replies:
-            self.send_to_db.send_reply(replies)
-        printl('ok')
-        printl()
-
-    def start_crawl(self):
-        self.crawl()
-        self.close()
-
-    def close(self):
-        if self.driver and not is_debuging:
-            self.driver.quit()
-        self.send_to_db.close()
-        printl("Finished Crawling :)")
-
-    def crawl(self):
-        raise NotImplementedError
-
-    def is_until_page(self):
-        if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
-            return True
-        else:
-            return False
-
-    def crawl_contents(self, contents_list, backup_set):
-        """
-        :param contents_list:
-        :param backup_set:
-        :return: is_load_more
-        """
-        old_elements = 0
-        for element in contents_list:
-            if element['date'].date() > self.crawl_init.get_end_day():
-                # printl(element['url'])
-                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-
-            elif element['date'].date() < self.crawl_init.get_begin_day():
-                printl(element['url'])
-                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-                old_elements += 1
-                if old_elements > 6:
-                    return False
-            else:
-                if not element['url'] in backup_set:
-                    # printl(element['url'])
-                    # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-                    # wait(1.5)
-                    # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
-                    self.list_crawl.append(element)
-                    backup_set.add(element['url'])
-                    if self.is_until_page():
-                        return False
-        if self.list_crawl:
-            printl("Number of Lists = {0}".format(len(self.list_crawl)))
-        return True
-
-    def crawl_list(self):
-        if self.list_crawl:
-            printl()
-            printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
-            printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
-            printl("Total gathered contents = {0}".format(len(self.list_crawl)))
-            printl()
-        for element in self.list_crawl:
-            try:
-                printl(element['url'])
-                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
-                wait(body_wait_sec)
-                self.crawl_content(element['url'], {}, element['url'])
-            except Exception as e:
-                printl(e)
-                logging.info(e)
-
-
-class InstaAlgorithmNormal(InstaAlgorithm):
-    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
-                 reload_wait_second=2, num_of_load_content=12, page_down=50):
-        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
-                         reload_wait_second, num_of_load_content, page_down)
-        if self.driver:
-            self.driver.quit()
-
-    def crawl(self):
-        real_time = True
-        while real_time:
-            printl("Crawling Start")
-            url_list = self.crawl_init.make_url()
-            i = 0
-            end_cursor = None
-            backup_set = set()
-            while i < len(url_list):
-                # first connect
-                try:
-                    printl(url_list[i] + "\n")
-                    if insta_tag_url in url_list[i]:
-                        list_crawler = ListTag(url_list[i])
-                    else:
-                        list_crawler = ListUser(url_list[i])
-                    wait(1)
-                    insta_list = list_crawler.get_list()
-                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
-                    # ajax load
-                    while is_load_more:
-                        if end_cursor:
-                            list_crawler.end_cursor = end_cursor
-                            end_cursor = None
-                        wait(self.reload_wait_second)
-                        insta_list = list_crawler.load_more()
-                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
-                    self.crawl_list()
-                    self.list_crawl.clear()
-                    i += 1
-                except Exception as e:
-                    logging.info(e)
-                    end_cursor = list_crawler.end_cursor
-                    printl('end_cursor=' + end_cursor)
-                    if e.args:
-                        wait(300)
-            real_time = self.crawl_init.is_realtime()
-        printl("Finished Crawling :)")
-
-
-class InstaAlgorithmBrowser(InstaAlgorithm):
-    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
-                 reload_wait_second=2, num_of_load_content=12, page_down=50):
-        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
-                         reload_wait_second, num_of_load_content, page_down)
-
-    def url_load(self, url):
-        if insta_tag_url in url:
-            list_tag = ListTag(url)
-            insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
-            return list_tag, insta_list, end_cursor, has_next
-        else:
-            list_user = ListUser(url)
-            insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
-            return list_user, insta_list, end_cursor, has_next
-
-    def crawl(self):
-        real_time = True
-        while real_time:
-            url_list = self.crawl_init.make_url()
-            i = 0
-            end_cursor = None
-            backup_set = set()
-            while i < len(url_list):
-                # first connect
-                try:
-                    wait(3)
-                    printl(url_list[i] + "\n")
-                    self.driver.get(url_list[i])
-                    wait(5)
-                    list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
-                    is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
-                    list_crawler.set_end_cursor(end_cursor2)
-                    list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()}
-                    # ajax load
-                    page_down = 0
-                    while is_load_more:
-                        if page_down == self.page_down:
-                            page_down = 0
-                            try:
-                                focus_driver(self.driver)
-                                click_insta_load_more(self.driver)
-                            except:
-                                push_page_down(self.driver)
-                        page_down += 1
-                        if end_cursor:
-                            list_crawler.end_cursor = end_cursor
-                            end_cursor = None
-                        wait(self.reload_wait_second)
-                        insta_list = list_crawler.load_more()
-                        # printl("list length = " + str(len(insta_list)))
-                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
-                        # printl("number of backup_set = {0}".format(len(backup_set)))
-                    i += 1
-                    self.crawl_list()
-                    self.list_crawl.clear()
-                except Exception as e:
-                    logging.info(e)
-                    end_cursor = list_crawler.end_cursor
-                    printl('end_cursor=' + end_cursor)
-                    if e.args:
-                        wait(300)
-                        if self.driver:
-                            self.driver.close()
-                        wait(3)
-                        self.driver = self.browser.new_browser()
-            real_time = self.crawl_init.is_realtime()
-        printl("Finished Crawling :)")
-
-
-class InstaMainCrawler:
-    def __init__(self):
-        self.send_to_db = SendtoDB()
-        self.crawl_init = InstaInit()
-        self.browser = Browser()
-        self.driver = None
-
-    def set_keyword_id(self, keyword_id):
-        self.keyword_id = keyword_id
-
-    def crawl_all(self, backup_set=None):
-        pass
-
-    def start(self):
-        self.crawler_start()
-
-    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
-        self.init_keyword_id(keyword_id)
-        self.init_db(db_num)
-        self.init_before_day(before_day)
-        self.init_until_page(until_page)
-        self.init_browser(browser)
-
-    def set_driver(self, driver):
-        self.driver = driver
-
-    def init_browser(self, browser):
-        try:
-            self.set_driver(self.browser.get_new_driver(browser))
-        except Exception as e:
-            logging.info(e)
-
-    def init_keyword_id(self, keyword_id):
-        if type(keyword_id) != int:
-            self.keyword_id = int(keyword_id)
-        else:
-            self.keyword_id = keyword_id
-        self.crawl_init.get_keyword_parameters(keyword_id)
-        self.crawl_init.disconnect()
-
-    def init_db(self, db_num):
-        self.send_to_db.set_db(db_num)
-
-    def init_before_day(self, before_day):
-        self.crawl_init.set_before_day(before_day)
-
-    def init_until_page(self, until_page):
-        self.crawl_init.set_until_page(until_page)
-
-    def crawler_start(self):
-        if self.driver:
-            algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
-                                              self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
-        else:
-            algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
-                                             self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
-        algorithm.start_crawl()
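
[Reviewer note] The deleted InstaAlgorithm.crawl_contents encodes the pagination stop rule: skip items newer than the window, stop once more than six items in a batch fall before the window, and stop when the until_page cap on gathered items is reached. Distilled below as a pure function for reference; the function and parameter names are mine, the thresholds and logic come from the deleted code:

    def should_load_more(batch_dates, begin_day, end_day, gathered, until_page=None, old_limit=6):
        # batch_dates: datetime.date values for one AJAX page of results, in feed order.
        old = 0
        for d in batch_dates:
            if d > end_day:
                continue                 # too new: skip, but keep paging
            if d < begin_day:
                old += 1
                if old > old_limit:      # enough stale items: stop paging
                    return False
            else:
                gathered += 1            # in-window item would be collected
                if until_page is not None and gathered >= until_page:
                    return False         # until_page cap reached
        return True
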