From b873412ef299fc500f795f3ef1a8b1cc8805dbf1 Mon Sep 17 00:00:00 2001 From: admin Date: Tue, 30 May 2017 03:32:11 +0000 Subject: [PATCH] git-svn-id: svn://192.168.0.12/source@348 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- WebBasedCrawler/effect.ini | 8 +- WebBasedCrawler/effect/InstaUrlValidator.py | 86 ++++++++ WebBasedCrawler/effect/effectinstagram.py | 200 +++++++++++++++++- WebBasedCrawler/effect/resultsender.py | 11 + WebBasedCrawler/effectprocess.py | 22 +- WebBasedCrawler/facebook/facebookcrawl_new.py | 91 ++++++++ WebBasedCrawler/facebook/facebookcrawltemp.py | 197 +++++++++++++++++ WebBasedCrawler/insta/instacrawl.py | 1 - WebBasedCrawler/insta/instaparser.py | 30 +-- 9 files changed, 616 insertions(+), 30 deletions(-) create mode 100644 WebBasedCrawler/effect/InstaUrlValidator.py create mode 100644 WebBasedCrawler/facebook/facebookcrawl_new.py create mode 100644 WebBasedCrawler/facebook/facebookcrawltemp.py diff --git a/WebBasedCrawler/effect.ini b/WebBasedCrawler/effect.ini index 7500cb9..f073140 100644 --- a/WebBasedCrawler/effect.ini +++ b/WebBasedCrawler/effect.ini @@ -1,5 +1,11 @@ -[database] +[#database] user=root pass=1234 host=192.168.0.82 +name=bigbird + +[database] +user=admin +pass=con2214lac! +host=182.162.171.147 name=bigbird \ No newline at end of file diff --git a/WebBasedCrawler/effect/InstaUrlValidator.py b/WebBasedCrawler/effect/InstaUrlValidator.py new file mode 100644 index 0000000..87b0d62 --- /dev/null +++ b/WebBasedCrawler/effect/InstaUrlValidator.py @@ -0,0 +1,86 @@ +class InstaUrlValidator: + def __init__(self, input_url): + self.protocol = 'https' + self.host = 'www.instagram.com' + self.path1 = 'p' + + self.input_user_key = '' + self.input_url = input_url + + def preprocess_input_url(self): + if type(self.input_url) != str: + raise TypeError('input url error') + + self.preprocessed_input_url = self.input_url.strip() + + def check_protocol(self): + start_index = 0 + end_index = self.preprocessed_input_url.find(':') + if end_index == -1: + return start_index + + if self.preprocessed_input_url[end_index+1] != '/' or self.preprocessed_input_url[end_index+2] != '/': + raise ValueError('incorrect url format') + + return end_index + 3 + + def check_host(self, start_index): + end_index = self.preprocessed_input_url.find('/', start_index) + if end_index == -1: + raise ValueError('incorrect url format') + + input_host = self.preprocessed_input_url[start_index:end_index] + if input_host not in self.host: + raise ValueError('incorrect host') + + return end_index + 1 + + def check_path1(self, start_index): + end_index = self.preprocessed_input_url.find('/', start_index) + if end_index == -1: + raise ValueError('incorrect path') + + input_path1 = self.preprocessed_input_url[start_index:end_index] + if input_path1 != self.path1: + raise ValueError('incorrect path (/p/)') + + return end_index + 1 + + def check_path2(self, start_index): + end_index = self.preprocessed_input_url.find('/', start_index) + # if end_index == -1: + # raise ValueError('incorrect path') + # + # self.input_user_key = self.preprocessed_input_url[start_index:end_index] + + if end_index != -1: + self.input_user_key = self.preprocessed_input_url[start_index:end_index] + else: + self.input_user_key = self.preprocessed_input_url[start_index:] + + def make_instagram_url(self): + if len(self.input_user_key) <= 0: + raise ValueError('incorrect user key') + + url = self.protocol + '://' + self.host + '/' + self.path1 + '/' + self.input_user_key + '/' + return url + + def validate_url(self): + try: + self.preprocess_input_url() + start_index = self.check_protocol() + start_index = self.check_host(start_index) + start_index = self.check_path1(start_index) + self.check_path2(start_index) + + except Exception as e: + raise e + + def get_insta_url(self): + try: + self.validate_url() + url = self.make_instagram_url() + except Exception as e: + raise e + + return url \ No newline at end of file diff --git a/WebBasedCrawler/effect/effectinstagram.py b/WebBasedCrawler/effect/effectinstagram.py index b21f113..4249e7f 100644 --- a/WebBasedCrawler/effect/effectinstagram.py +++ b/WebBasedCrawler/effect/effectinstagram.py @@ -36,6 +36,20 @@ insta_tag_url = "https://www.instagram.com/explore/tags/" insta_query = "https://www.instagram.com/query/" insta_body_url = 'https://www.instagram.com/p/' +DATE = 0 +REPLY_DAY = 1 +REPLY_ACC = 2 +LIKE_DAY = 3 +LIKE_ACC = 4 +DAY = 5 +ACC = 6 +REPLY = 7 +LIKE = 8 + +BUZZ_KEY = [ + "date", "reply_day", "reply_acc", "like_day", "like_acc", "day", "acc" +] + def requests_get(req, timeout=requests_timeout): body = [] @@ -136,10 +150,25 @@ class InstaContent: class EffectInsta(object): - def __init__(self, event_num, event_code, url): + + def __init__(self, event_num, event_code, url, start_date): self.event_num = event_num self.event_code = event_code self.url = url + self.start_date = start_date.replace("-", "") + self.database = self.database_init() + + + def database_init(self): + try: + cg = get_settings() + except Exception as e: + raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error') + + database = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender() + database.connect() + + return database def start(self): #content = insta.instacrawl.InstaContent(self.url, {}, self.url) @@ -187,8 +216,12 @@ class EffectInsta(object): result['replycount'] = int(body.get('article_order'), 0) result['likecount'] = int(body.get('reply_url'), 0) result['interactioncount'] = self.get_replycount(body, replies) - result['replybuzz'] = self.get_reply_buzz(body, replies) + replybuzz = self.get_reply_buzz(body, replies) + likebuzzs = self.get_like_buzz(int(body.get('reply_url'), 0)) + totalbuzz = self.summary_reply_and_like(replybuzz, likebuzzs) + result['replybuzz'] = json.dumps(totalbuzz, sort_keys=True) result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0) + return result def get_replycount(self, body, replies): @@ -197,22 +230,171 @@ class EffectInsta(object): set_reply_id.add(i.get('article_id', '')) return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id) + # def get_reply_buzz(self, body, replies): + # start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date() + # end_date = datetime.datetime.now().date() + # date_dict = dict() + # while start_date <= end_date: + # date_dict[start_date.strftime('%Y%m%d')] = 0 + # start_date = start_date + datetime.timedelta(days=1) + # + # for reply in replies: + # str_reply_date = reply.get('article_date', '1990-01-01 00:00:00') + # reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d') + # print(reply_date) + # if reply_date in date_dict: + # date_dict[reply_date] = date_dict[reply_date] + 1 + # + # print(date_dict) + # + # json_array = [{'date': k, 'value': v} for k, v in date_dict.items()] + # + # return json.dumps(json_array, sort_keys=True) + def get_reply_buzz(self, body, replies): - start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date() - end_date = datetime.datetime.now().date() + start_date = datetime.datetime.strptime(self.start_date, '%Y%m%d').date() + today = datetime.datetime.now().date() + date_dict = dict() - while start_date <= end_date: + while start_date <= today: date_dict[start_date.strftime('%Y%m%d')] = 0 start_date = start_date + datetime.timedelta(days=1) for reply in replies: - str_reply_date = reply.get('article_date', '1990-01-01 00:00:00') - reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%m-%d-%Y') + str_reply_date = reply.get('article_date') + reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d') if reply_date in date_dict: date_dict[reply_date] = date_dict[reply_date] + 1 - json_array = [{'date': k, 'value': v} for k, v in date_dict.items()] + reply_buzzs = self.make_dummy_buzzs(self.start_date, datetime.datetime.today().strftime('%Y%m%d')) + reply_acc_count = 0 + for reply_buzz in reply_buzzs: + date = reply_buzz[BUZZ_KEY[DATE]] + reply_count = date_dict[date] + reply_acc_count += reply_count + reply_buzz[BUZZ_KEY[DAY]] = date_dict[date] + reply_buzz[BUZZ_KEY[ACC]] = reply_acc_count - return json.dumps(json_array, sort_keys=True) + # json_array = [{'date': k, 'value': v} for k, v in date_dict.items()] + # return json.dumps(json_array, sort_keys=True) + return reply_buzzs + def get_like_buzz(self, like_count): + start_date = datetime.datetime.strptime(self.start_date, '%Y%m%d').date() + today = datetime.datetime.today().strftime('%Y%m%d') + try: + buzzs = self.database.get_buzz(self.event_num) + if buzzs != None: + buzzs = json.loads(buzzs) + else: + buzzs = [] + buzzs = self.get_buzzs(buzzs, LIKE) + like_dummy_buzzs = self.make_dummy_buzzs(self.start_date, today) + like_buzzs = self.fill_buzzs_into_dummy(buzzs, like_dummy_buzzs) + like_buzzs = self.put_today_buzz(like_buzzs, like_count) + except Exception as e: + raise effect.effecterror.DBQueryError(str(e)) + + return like_buzzs + + def make_base_buzz_instance(self, values): + base_buzz_instance = dict() + base_buzz_instance[BUZZ_KEY[DATE]] = values[0] + base_buzz_instance[BUZZ_KEY[DAY]] = values[1] + base_buzz_instance[BUZZ_KEY[ACC]] = values[2] + + return base_buzz_instance + + def make_summary_buzz_instance(self, values): + summary_buzz_instance = dict() + summary_buzz_instance[BUZZ_KEY[DATE]] = values[0] + summary_buzz_instance[BUZZ_KEY[REPLY_DAY]] = values[1][BUZZ_KEY[DAY]] + summary_buzz_instance[BUZZ_KEY[REPLY_ACC]] = values[1][BUZZ_KEY[ACC]] + summary_buzz_instance[BUZZ_KEY[LIKE_DAY]] = values[2][BUZZ_KEY[DAY]] + summary_buzz_instance[BUZZ_KEY[LIKE_ACC]] = values[2][BUZZ_KEY[ACC]] + + return summary_buzz_instance + + def make_dummy_buzzs(self, start_date, end_date): + + startdate = datetime.datetime.strptime(start_date, '%Y%m%d') + enddate = datetime.datetime.strptime(end_date, '%Y%m%d') + + buzzs = [] + while startdate <= enddate: + buzz_instance = self.make_base_buzz_instance([startdate.strftime('%Y%m%d'), 0, 0]) + buzzs.append(buzz_instance) + + startdate += datetime.timedelta(days=1) + + return buzzs + + def put_today_buzz(self, buzzs, today_acc_buzz_count): + today = datetime.date.today().strftime('%Y%m%d') + today_buzz_count = today_acc_buzz_count - buzzs[-2][BUZZ_KEY[ACC]] + # if today_buzz_count < 0: + # today_buzz_count = 0 + + result_buzzs = buzzs.copy() + result_buzzs[-1][BUZZ_KEY[DAY]] = today_buzz_count if today_buzz_count >=0 else 0 + result_buzzs[-1][BUZZ_KEY[ACC]] = result_buzzs[-2][BUZZ_KEY[ACC]] + today_buzz_count + + return result_buzzs + + def fill_buzzs_into_dummy(self, buzzs, dummy): + buzzs_clone = buzzs.copy() + dummy_clone = dummy.copy() + + for dummy_buzz, real_buzz in zip(dummy_clone, buzzs_clone): + dummy_buzz[BUZZ_KEY[DATE]] = real_buzz[BUZZ_KEY[DATE]] + dummy_buzz[BUZZ_KEY[DAY]] = real_buzz[BUZZ_KEY[DAY]] + dummy_buzz[BUZZ_KEY[ACC]] = real_buzz[BUZZ_KEY[ACC]] + + for index, dummy_buzz in enumerate(dummy_clone): + previous_index = index - 1 + previous_acc_value = dummy_clone[previous_index][BUZZ_KEY[ACC]] + current_acc_value = dummy_buzz[BUZZ_KEY[ACC]] + + if previous_acc_value > 0 and current_acc_value == 0 and previous_index >= 0: + dummy_buzz[BUZZ_KEY[ACC]] = previous_acc_value + + return dummy_clone + + def get_buzzs(self, buzzs, buzz_type): + result_buzzs = [] + if buzz_type == LIKE: + for buzz in buzzs: + buzz_instance = self.make_base_buzz_instance([ + buzz[BUZZ_KEY[DATE]], + buzz[BUZZ_KEY[LIKE_DAY]], + buzz[BUZZ_KEY[LIKE_ACC]] + ]) + result_buzzs.append(buzz_instance) + + return result_buzzs + + def is_valid_data(self, reply_buzzs, like_buzzs): + reply_dates = self.get_date_list(reply_buzzs) + like_dates = self.get_date_list(like_buzzs) + + if reply_dates == like_dates: + return True + else: + return False + + def summary_reply_and_like(self, reply_buzzs, like_buzzs): + # if self.is_valid_data(reply_buzzs, like_buzzs) == False: + # raise IndexError("") + + summary_buzzs = [] + for reply_buzz, like_buzz in zip(reply_buzzs, like_buzzs): + date = reply_buzz[BUZZ_KEY[DATE]] + summary_buzz_instance = self.make_summary_buzz_instance([ + date, + reply_buzz, + like_buzz + ]) + summary_buzzs.append(summary_buzz_instance) + + return summary_buzzs \ No newline at end of file diff --git a/WebBasedCrawler/effect/resultsender.py b/WebBasedCrawler/effect/resultsender.py index 5478dc4..435d39d 100644 --- a/WebBasedCrawler/effect/resultsender.py +++ b/WebBasedCrawler/effect/resultsender.py @@ -37,6 +37,17 @@ class ResultSender: return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \ ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list))) + def get_buzz(self, event_num): + query = 'select replybuzz from stats_s1_effect where event_num = ' + str(event_num) + + if not self.conn.open: + self.connect() + with self.conn.cursor() as cursor: + cursor.execute(query) + buzz = cursor.fetchone() + + return buzz['replybuzz'] if buzz != None else buzz + def send(self, table_name, dictionary): query = self._make_query(table_name, dictionary) self._exec_query(query) diff --git a/WebBasedCrawler/effectprocess.py b/WebBasedCrawler/effectprocess.py index 9333fa1..6f7b65c 100644 --- a/WebBasedCrawler/effectprocess.py +++ b/WebBasedCrawler/effectprocess.py @@ -1,6 +1,7 @@ import effect.effectinstagram import effect.effecterror import effect.effectkakaostory +from effect.InstaUrlValidator import InstaUrlValidator from base.baseclasses import printl import sys import base.baseclasses @@ -33,9 +34,17 @@ def get_browser_info(platform_, file_name="browser.txt"): return options.get(platform_, options['default']) -def get_effect_process(platform_, event_num, url): +def get_effect_process(platform_, event_num, url, start_date): if platform_ == 'instagram': - return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url) + try: + insta_url_validator = InstaUrlValidator(url) + insta_url = insta_url_validator.get_insta_url() + except Exception as e: + printl("x!@#!@#!@#e010!@#check url") + exit(1) + + # return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url, start_date) + return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), insta_url, start_date) else: browser_info = get_browser_info(platform_) @@ -52,14 +61,19 @@ if __name__ == '__main__': sys.argv[1] instagram, kakaostory, facebook sys.argv[2] event_num sys.argv[3] url + sys.argv[4] start date """ - if len(sys.argv) != 4: + # if len(sys.argv) != 4: + # printl("x!@#!@#!@#e010!@#check argument") + # exit(1) + + if len(sys.argv) != 5: printl("x!@#!@#!@#e010!@#check argument") exit(1) try: - effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3]) + effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) effect_process.start() except effect.effecterror.EffectException as e: printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e)) diff --git a/WebBasedCrawler/facebook/facebookcrawl_new.py b/WebBasedCrawler/facebook/facebookcrawl_new.py new file mode 100644 index 0000000..1f77428 --- /dev/null +++ b/WebBasedCrawler/facebook/facebookcrawl_new.py @@ -0,0 +1,91 @@ +import time +from selenium.common.exceptions import WebDriverException +from base.baseclasses import find_element_by_css_selector +from base.baseclasses import find_elements_by_css_selector +from bs4 import BeautifulSoup + +try: + import lxml + parser_opt = 'lxml' +except ImportError: + parser_opt = 'html.parser' + +limit_reload = 5 + +list_tag_css_selector = "div#initial_browse_result" +list_page_css_selector = "div#pagelet_timeline_main_column" +list_group_css_selector = "div#pagelet_group_" +each_post_css_selector = "div._4-u2._4-u8" +wait_second_for_find_element = 30 + + +class ListBase(object): + def __init__(self, driver): + self.driver = driver + self.url_list = [] + self.list_css_selector = None + self.list_container_dom = None + self.current_post = None + + def set_url_elements(self): + elements = find_element_by_css_selector(self.driver, + self.list_css_selector + " " + each_post_css_selector, + wait_second_for_find_element) + self.url_list.extend(elements) + + def move_first(self): + self.url_list = self.current_post.pop(0) if self.url_list else None + + def move_next(self): + self.move_first() + + def check_list_and_load(self): + for _ in range(limit_reload): + num_of_list = len(self.url_list) + if num_of_list < 2: + self.load_more_list() + num_of_list = self.get_num_of_list() + if not num_of_list: + raise WebDriverException("There is no data or ajax error") + + def load_more_list(self): + position = self.driver.get_window_position() + size = self.driver.get_window_size() + self.driver.maximize_window() + self.driver.set_window_size(size['width'], size["height"]) + self.driver.set_window_position(position['x'], position['y']) + for _ in range(2): + self.driver.execute_script("window.scrollBy(0, -400)") + time.sleep(0.3) + for _ in range(4): + self.driver.execute_script("window.scrollBy(0, 800)") + time.sleep(0.3) + + def has_next(self): + raise NotImplementedError + + def get_url(self): + raise NotImplementedError + + def get_date(self): + raise NotImplementedError + + def remove_current_post(self): + css_selector = "div#" + self.current_post.id + self.driver.execute_script('document.querySelector("' + css_selector + '").remove()') + + def get_num_of_list(self): + raise NotImplementedError + + +class ListTag(ListBase): + def __init__(self, driver): + super().__init__(driver) + self.list_css_selector = list_tag_css_selector + + +class ListPage(ListBase): + def __init__(self, driver): + self.driver = driver + self.list_css_selector = list_page_css_selector + diff --git a/WebBasedCrawler/facebook/facebookcrawltemp.py b/WebBasedCrawler/facebook/facebookcrawltemp.py new file mode 100644 index 0000000..f014fb1 --- /dev/null +++ b/WebBasedCrawler/facebook/facebookcrawltemp.py @@ -0,0 +1,197 @@ +#-*- coding: utf-8 -*- + +import logging +import re +import json +import datetime +import time + + +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.common.action_chains import ActionChains +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium.common.exceptions import WebDriverException +from bs4 import BeautifulSoup + +from base.baseclasses import SendtoDB +from base.baseclasses import print_and_flush +from base.baseclasses import CrawlInit +from base.baseclasses import wait +from base.baseclasses import find_element_by_css_selector +from base.baseclasses import find_elements_by_css_selector +from base.baseclasses import find_elements_by_xpath +from base.baseclasses import enter_element +from base.baseclasses import Browser + +facebook_url = "http://bigbird.iptime.org/fbtest.php" +facebook_tag_url = "https://www.facebook.com/hashtag/" + +facebook_id = 'concepters22@gmail.com' +facebook_password = 'zjstpqxjtm' + + +class FacebookInit(CrawlInit): + def __init__(self, before_day=0): + super().__init__(before_day) + self.urls = dict() + self.urls[11] = facebook_tag_url + self.urls[12] = facebook_url + + def split_searches(self): + search = self.searches() + splited_list = search.split(',') + return [x.strip() if self.platform() == 12 else self.utf8(x) for x in splited_list] + # trimmed_list = list() + # if self.platform() == 12: + # for x in splited_list: + # trimmed_list.append(x.strip()) + # else: + # for x in splited_list: + # trimmed_list.append(self.utf8(x)) + # return trimmed_list + + def make_url(self): + return [self.urls[self.platform()] + ('profile.php?id=' + x if x.isnumeric() else x) + "?fref=ts" + for x in self.split_searches()] + # return [self.urls[self.platform()] + x for x in self.split_searches()] + # urls = list() + # for x in self.split_searches(): + # url = self.urls[self.platform()] + x + "?fref=ts" + # urls.append(url) + # return urls + + def get_begin_day(self): + if self.is_realtime(): + date_now = datetime.datetime.now() + result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) + result += datetime.timedelta(days=self.before_day) + return result + else: + return self.start_day() + + def get_end_day(self): + if self.is_realtime(): + date_now = datetime.datetime.now() + result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day) + return result + else: + return self.end_day() + + def is_hashtag(self): + return False if self.platform() == 12 else True + + +class FacebookMainCrawler: + def __init__(self): + self.crawl_init = FacebookInit() + self.browser = Browser() + self.driver = None + self.keyword_id = None + self.url = None + self.db_num = None + + def set_driver(self, driver): + self.driver = driver + + def set_keyword_id(self, keyword_id): + self.keyword_id = keyword_id + + def start(self): + self.crawl_start() + + def set_arguments(self, browser, keyword_id, db_num, before_day, until_page): + self.init_browser(browser) + self.init_keyword_id(keyword_id) + self.init_db(db_num) + self.init_before_day(before_day) + self.init_until_page(until_page) + + def init_browser(self, browser): + self.set_driver(self.browser.get_new_driver(browser)) + + def init_keyword_id(self, keyword_id): + self.keyword_id = int(keyword_id) if type(keyword_id) != int else keyword_id + self.crawl_init.get_keyword_parameters(keyword_id) + self.crawl_init.disconnect() + + def init_db(self, db_num): + self.db_num = db_num + + def init_before_day(self, before_day): + self.crawl_init.set_before_day(before_day) + + def init_until_page(self, until_page): + self.crawl_init.set_until_page(until_page) + + def set_main_window_handler(self, window_handler): + self.main_window_handler = window_handler + + def crawl_start(self): + real_time = True + while real_time: + print_and_flush("Crawler Start") + url_list = self.crawl_init.make_url() + i = 0 + backup_set = set() + while i < len(url_list): + try: + self.set_main_window_handler(self.driver.window_handles[0]) + print_and_flush(url_list[i] + "\n") + self.driver.get(url_list[i]) + wait(5) + self.facebook_login() + body = self.driver.find_element_by_tag_name('body') + self.click_element(body) + self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(), + end_date=self.crawl_init.get_end_day()) + self.crawl_all_current_url(backup_set) + i += 1 + backup_set.clear() + except Exception as e: + logging.info(e) + self.driver.quit() + self.set_driver(self.browser.new_browser()) + wait(5) + real_time = self.crawl_init.is_realtime() + print_and_flush("Finished Crawling :)") + self.driver.quit() + + def go_bigbird(self, driver): + driver.get(facebook_url) + + def click_facebook_login(self, driver): + element_a = find_element_by_css_selector(driver, "a[href]", 15) + enter_element(element_a) + + def login_facebook(self, driver, f_id, f_pw): + element_email = find_element_by_css_selector(driver, "input#email", 15) + element_password = find_element_by_css_selector(driver, "input#pass", 15) + element_button = find_element_by_css_selector(driver, "button#loginbutton", 15) + element_email.send_keys(f_id) + element_password.send_keys(f_pw) + enter_element(element_button) + + def facebook_login(self): + try: + element_email = find_element_by_css_selector(self.driver, '#email', 15) + element_pwd = find_element_by_css_selector(self.driver, '#pass', 15) + except: + return + email = 'concepters22@gmail.com' + password = 'zjstpqxjtm' + element_email.send_keys(email) + element_pwd.send_keys(password) + label = self.driver.find_element_by_css_selector('#loginbutton') + element_input = label.find_element_by_xpath('input') + element_input.send_keys(Keys.NULL) + element_input.send_keys(Keys.ENTER) + wait(5) + + def click_element(self, element): + ac = ActionChains(self.driver) + # ac.move_to_element_with_offset(element, 0, 0).click().perform() + ac.move_to_element(element).click().perform() + wait(4) + diff --git a/WebBasedCrawler/insta/instacrawl.py b/WebBasedCrawler/insta/instacrawl.py index d3deb44..29a5ddf 100644 --- a/WebBasedCrawler/insta/instacrawl.py +++ b/WebBasedCrawler/insta/instacrawl.py @@ -314,7 +314,6 @@ def crawl_content_process(qu, keyword_id, db_num): break ok = True while ok: - time.sleep(2) try: # get a instance of InstaContent by do_no_proxy func. # if element['url'] is invalid, content is None diff --git a/WebBasedCrawler/insta/instaparser.py b/WebBasedCrawler/insta/instaparser.py index 3bbe120..f3951bf 100644 --- a/WebBasedCrawler/insta/instaparser.py +++ b/WebBasedCrawler/insta/instaparser.py @@ -103,10 +103,10 @@ def parse_body_html(content): start_cursor = None has_previous = False if postpage: - media = postpage[0]["media"] + media = postpage[0]["graphql"]["shortcode_media"] body = { - "article_date": (old_date + datetime.timedelta(seconds=media["date"])).strftime("%Y-%m-%d %H:%M:%S"), - "article_data": media["caption"], + "article_date": (old_date + datetime.timedelta(seconds=media["taken_at_timestamp"])).strftime("%Y-%m-%d %H:%M:%S"), + "article_data": media["edge_media_to_caption"]["edges"][0]["node"]["text"], "article_id": media["owner"]["username"], "article_nickname": media["owner"]["username"], "platform_id": media["owner"]["username"], @@ -115,22 +115,22 @@ def parse_body_html(content): "platform_title": media["owner"]["username"], "article_form": "body", "article_profileurl": media["owner"]["profile_pic_url"], - "article_order": str(media["comments"]["count"]), - "article_hit": str(media.get('video_views', 0)), - "reply_url": str(media["likes"]["count"]) + "article_order": str(media["edge_media_to_comment"]["count"]), + "article_hit": str(0), + "reply_url": str(media["edge_media_preview_like"]["count"]) } - comments = postpage[0]["media"]["comments"] - has_previous = comments["page_info"]["has_previous_page"] - start_cursor = comments["page_info"]["start_cursor"] - nodes = comments["nodes"] + comments = postpage[0]["graphql"]["shortcode_media"]["edge_media_to_comment"] + has_previous = comments["page_info"]["has_next_page"] + start_cursor = comments["page_info"]["end_cursor"] + nodes = comments["edges"] for node in nodes: reply.append({ - "article_data": node["text"], + "article_data": node["node"]["text"], "article_date": - (old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"), - "article_id": node["user"]["username"], - "article_nickname": node["user"]["username"], - "article_profileurl": node["user"]["profile_pic_url"], + (old_date + datetime.timedelta(seconds=node["node"]["created_at"])).strftime("%Y-%m-%d %H:%M:%S"), + "article_id": node["node"]["owner"]["username"], + "article_nickname": node["node"]["owner"]["username"], + "article_profileurl": node["node"]["owner"]["profile_pic_url"], "platform_name": "instagram", "platform_form": "post", "article_form": "reply"