198 lines
7.1 KiB
Python
198 lines
7.1 KiB
Python
#-*- coding: utf-8 -*-
|
|
|
|
import logging
|
|
import re
|
|
import json
|
|
import datetime
|
|
import time
|
|
|
|
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.action_chains import ActionChains
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.common.exceptions import WebDriverException
|
|
from bs4 import BeautifulSoup
|
|
|
|
from base.baseclasses import SendtoDB
|
|
from base.baseclasses import print_and_flush
|
|
from base.baseclasses import CrawlInit
|
|
from base.baseclasses import wait
|
|
from base.baseclasses import find_element_by_css_selector
|
|
from base.baseclasses import find_elements_by_css_selector
|
|
from base.baseclasses import find_elements_by_xpath
|
|
from base.baseclasses import enter_element
|
|
from base.baseclasses import Browser
|
|
|
|
# Crawl targets: a relay/test page used for direct page crawls (platform 12)
# and the base URL for hashtag searches (platform 11).
facebook_url = "http://bigbird.iptime.org/fbtest.php"
facebook_tag_url = "https://www.facebook.com/hashtag/"

# SECURITY: hard-coded account credentials committed to source control.
# These should be moved to a config file or environment variables and the
# password rotated.
facebook_id = 'concepters22@gmail.com'
facebook_password = 'zjstpqxjtm'
|
|
|
|
|
|
class FacebookInit(CrawlInit):
    """Crawl configuration for Facebook: target URLs and the date window.

    Platform codes (read from the keyword record via the parent class):
        11 -> hashtag search (prefixed with ``facebook_tag_url``)
        12 -> direct page/profile crawl (prefixed with ``facebook_url``)
    """

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # Map platform code -> base URL prepended to every search term.
        self.urls = {11: facebook_tag_url, 12: facebook_url}

    def split_searches(self):
        """Split the comma-separated keyword string into individual terms.

        Platform-12 terms are used verbatim (whitespace stripped); hashtag
        terms go through the parent's ``utf8`` helper so non-ASCII tags are
        usable inside a URL.
        """
        terms = self.searches().split(',')
        if self.platform() == 12:
            return [term.strip() for term in terms]
        return [self.utf8(term) for term in terms]

    def make_url(self):
        """Build one crawl URL per search term.

        A purely numeric term is treated as a numeric Facebook profile id
        (``profile.php?id=<n>``); anything else is appended as a page name
        or hashtag.  NOTE(review): for numeric ids the result contains two
        '?' separators (``profile.php?id=N?fref=ts``) -- this matches the
        original behavior; confirm the endpoint tolerates it.
        """
        base = self.urls[self.platform()]
        urls = []
        for term in self.split_searches():
            path = 'profile.php?id=' + term if term.isnumeric() else term
            urls.append(base + path + "?fref=ts")
        return urls

    def get_begin_day(self):
        """Start of the crawl window.

        Realtime mode: today's midnight shifted by ``before_day`` days;
        otherwise the configured start day from the keyword record.
        """
        if self.is_realtime():
            return self._midnight_today() + datetime.timedelta(days=self.before_day)
        return self.start_day()

    def get_end_day(self):
        """End of the crawl window (today's midnight in realtime mode,
        otherwise the configured end day)."""
        if self.is_realtime():
            return self._midnight_today()
        return self.end_day()

    @staticmethod
    def _midnight_today():
        # Truncate "now" to 00:00 of the current day.
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day)

    def is_hashtag(self):
        """Return True for hashtag crawls (every platform except 12)."""
        return self.platform() != 12
|
|
|
|
|
|
class FacebookMainCrawler:
    """Drives the Facebook crawl: configures the keyword, logs in, visits
    each target URL and delegates per-page scraping.

    NOTE(review): ``crawl_start`` references ``self.page_crawler`` and
    ``self.crawl_all_current_url``, neither of which is defined in this
    class -- presumably supplied by a subclass or assigned externally.
    Confirm before running this class stand-alone.
    """

    def __init__(self):
        self.crawl_init = FacebookInit()    # keyword / date-window configuration
        self.browser = Browser()            # webdriver factory
        self.driver = None                  # active selenium driver
        self.keyword_id = None
        self.url = None
        self.db_num = None
        # Previously set only in set_main_window_handler(); initialized here
        # so the attribute always exists.
        self.main_window_handler = None

    def set_driver(self, driver):
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def start(self):
        """Public entry point; runs the crawl loop."""
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Configure everything needed before start()."""
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        # Normalise to int, then load this keyword's crawl parameters and
        # release the DB connection.
        self.keyword_id = keyword_id if isinstance(keyword_id, int) else int(keyword_id)
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.db_num = db_num

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def set_main_window_handler(self, window_handler):
        self.main_window_handler = window_handler

    def crawl_start(self):
        """Visit every keyword URL, logging in on each page load if needed.

        In realtime mode the outer loop repeats forever (``is_realtime()``
        stays true); otherwise the URL list is crawled once.  On any error
        the browser is recycled and the SAME url index is retried -- the
        index only advances after a successful crawl.
        """
        real_time = True
        while real_time:
            print_and_flush("Crawler Start")
            url_list = self.crawl_init.make_url()
            i = 0
            backup_set = set()  # de-dup store shared with crawl_all_current_url
            while i < len(url_list):
                try:
                    self.set_main_window_handler(self.driver.window_handles[0])
                    print_and_flush(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    self.facebook_login()
                    body = self.driver.find_element_by_tag_name('body')
                    self.click_element(body)
                    # NOTE(review): page_crawler / crawl_all_current_url are
                    # not defined in this class -- confirm they are provided
                    # by a subclass.
                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
                                               end_date=self.crawl_init.get_end_day())
                    self.crawl_all_current_url(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception:
                    # Log with traceback (the old logging.info(e) lost it),
                    # recycle the browser and retry the current URL.
                    logging.exception("crawl failed for url index %d", i)
                    self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.driver.quit()

    def go_bigbird(self, driver):
        """Navigate the given driver to the relay/test page."""
        driver.get(facebook_url)

    def click_facebook_login(self, driver):
        """Activate the first link on the page (the login link on the relay page)."""
        element_a = find_element_by_css_selector(driver, "a[href]", 15)
        enter_element(element_a)

    def login_facebook(self, driver, f_id, f_pw):
        """Fill the standard Facebook login form and submit it."""
        element_email = find_element_by_css_selector(driver, "input#email", 15)
        element_password = find_element_by_css_selector(driver, "input#pass", 15)
        element_button = find_element_by_css_selector(driver, "button#loginbutton", 15)
        element_email.send_keys(f_id)
        element_password.send_keys(f_pw)
        enter_element(element_button)

    def facebook_login(self):
        """Log in with the module-level credentials if the login form is
        present on the current page; silently return otherwise."""
        try:
            element_email = find_element_by_css_selector(self.driver, '#email', 15)
            element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
        except Exception:
            # Login form absent -> already logged in; nothing to do.
            # (Was a bare except, which also swallowed KeyboardInterrupt.)
            return
        # Use the shared module constants instead of re-hardcoding the
        # credentials (identical values).
        element_email.send_keys(facebook_id)
        element_pwd.send_keys(facebook_password)
        label = self.driver.find_element_by_css_selector('#loginbutton')
        element_input = label.find_element_by_xpath('input')
        element_input.send_keys(Keys.NULL)   # focus the input without typing
        element_input.send_keys(Keys.ENTER)  # submit the form
        wait(5)

    def click_element(self, element):
        """Move the mouse to *element* and click it, then pause briefly."""
        ac = ActionChains(self.driver)
        ac.move_to_element(element).click().perform()
        wait(4)
|
|
|