Files
clients/WebBasedCrawler/facebook/facebookcrawl_new.py

92 lines
2.9 KiB
Python

import time
from selenium.common.exceptions import WebDriverException
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import find_elements_by_css_selector
from bs4 import BeautifulSoup
# Pick the BeautifulSoup parser backend: use lxml when it can be imported,
# otherwise fall back to the parser bundled with the standard library.
try:
    import lxml
except ImportError:
    parser_opt = 'html.parser'
else:
    parser_opt = 'lxml'

# Upper bound on the number of load attempts made by
# ListBase.check_list_and_load.
limit_reload = 5

# CSS selectors for the list containers used by the List* classes below.
list_tag_css_selector = "div#initial_browse_result"
list_page_css_selector = "div#pagelet_timeline_main_column"
list_group_css_selector = "div#pagelet_group_"

# CSS selector matching one post inside a list container.
each_post_css_selector = "div._4-u2._4-u8"

# Wait value handed to the find_element(s)_by_css_selector helpers
# (named as seconds).
wait_second_for_find_element = 30
class ListBase(object):
    """Base class for walking the post elements of a listing container.

    Collected post elements are queued in ``url_list``; ``move_first`` /
    ``move_next`` pop the next one into ``current_post``.  Subclasses set
    ``list_css_selector`` and implement ``has_next``, ``get_url``,
    ``get_date`` and ``get_num_of_list``.
    """

    def __init__(self, driver):
        """Store the web driver and initialise an empty post queue.

        :param driver: browser driver object used for lookups and scripts.
        """
        self.driver = driver
        self.url_list = []
        self.list_css_selector = None
        self.list_container_dom = None
        self.current_post = None

    def set_url_elements(self):
        """Append the post elements found under the list container to the queue."""
        # The plural helper is required here: the result is fed to
        # ``list.extend``, which needs an iterable of elements.
        # NOTE(review): assumes find_elements_by_css_selector takes the same
        # (driver, selector, wait) arguments as the singular helper - confirm.
        elements = find_elements_by_css_selector(
            self.driver,
            self.list_css_selector + " " + each_post_css_selector,
            wait_second_for_find_element)
        if elements:
            self.url_list.extend(elements)

    def move_first(self):
        """Pop the head of the queue into ``current_post`` (None when empty)."""
        self.current_post = self.url_list.pop(0) if self.url_list else None

    def move_next(self):
        """Advance to the next queued post; same operation as ``move_first``."""
        self.move_first()

    def check_list_and_load(self):
        """Trigger further loading while fewer than two posts are queued.

        Makes at most ``limit_reload`` attempts.

        :raises WebDriverException: if no posts are available afterwards.
        """
        # Initialised up front so the final check is well defined even when
        # ``limit_reload`` is 0.
        num_of_list = len(self.url_list)
        for _ in range(limit_reload):
            num_of_list = len(self.url_list)
            if num_of_list >= 2:
                # Enough posts queued; further iterations would be no-ops.
                break
            self.load_more_list()
            num_of_list = self.get_num_of_list()
        if not num_of_list:
            raise WebDriverException("There is no data or ajax error")

    def load_more_list(self):
        """Resize the window and scroll the page up and down.

        Presumably this nudges the page into lazy-loading more posts.
        """
        position = self.driver.get_window_position()
        size = self.driver.get_window_size()
        # Maximise, then restore the previous geometry.
        self.driver.maximize_window()
        self.driver.set_window_size(size['width'], size["height"])
        self.driver.set_window_position(position['x'], position['y'])
        # Scroll up a little first, then further down than we went up.
        for _ in range(2):
            self.driver.execute_script("window.scrollBy(0, -400)")
            time.sleep(0.3)
        for _ in range(4):
            self.driver.execute_script("window.scrollBy(0, 800)")
            time.sleep(0.3)

    def has_next(self):
        """Return whether another post is available (subclass hook)."""
        raise NotImplementedError

    def get_url(self):
        """Return the URL of the current post (subclass hook)."""
        raise NotImplementedError

    def get_date(self):
        """Return the date of the current post (subclass hook)."""
        raise NotImplementedError

    def remove_current_post(self):
        """Remove the current post's node from the page via JavaScript."""
        # NOTE(review): if current_post is a Selenium WebElement, ``.id`` is
        # the driver-internal reference, not the DOM ``id`` attribute -
        # confirm this selector actually matches the element.
        css_selector = "div#" + self.current_post.id
        self.driver.execute_script('document.querySelector("' + css_selector + '").remove()')

    def get_num_of_list(self):
        """Return the number of posts currently available (subclass hook)."""
        raise NotImplementedError
class ListTag(ListBase):
    """ListBase variant whose container selector is ``list_tag_css_selector``."""

    def __init__(self, driver):
        """Run the base initialisation, then select the tag list container.

        :param driver: browser driver object passed through to ListBase.
        """
        super(ListTag, self).__init__(driver)
        # Override the None default set by the base initialiser.
        self.list_css_selector = list_tag_css_selector
class ListPage(ListBase):
    """ListBase variant whose container selector is ``list_page_css_selector``."""

    def __init__(self, driver):
        """Run the base initialisation, then select the page list container.

        :param driver: browser driver object passed through to ListBase.
        """
        # The base initialiser stores ``driver`` and creates ``url_list``,
        # ``list_container_dom`` and ``current_post``; the inherited methods
        # read those attributes, so it must run before anything else.
        super().__init__(driver)
        self.list_css_selector = list_page_css_selector