92 lines
2.9 KiB
Python
92 lines
2.9 KiB
Python
import time
|
|
from selenium.common.exceptions import WebDriverException
|
|
from base.baseclasses import find_element_by_css_selector
|
|
from base.baseclasses import find_elements_by_css_selector
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Pick the BeautifulSoup parser name: use 'lxml' when the package can be
# imported, otherwise fall back to the stdlib-backed 'html.parser'.
try:
    import lxml
except ImportError:
    parser_opt = 'html.parser'
else:
    parser_opt = 'lxml'
|
|
|
|
# --- Retry / wait limits -------------------------------------------------
# Number of iterations ListBase.check_list_and_load runs its reload loop.
limit_reload = 5
# Third argument handed to the find-element helper in set_url_elements
# (presumably a timeout in seconds -- confirm against base.baseclasses).
wait_second_for_find_element = 30

# --- CSS selectors -------------------------------------------------------
# Container selector used by ListTag.
list_tag_css_selector = "div#initial_browse_result"
# Container selector used by ListPage.
list_page_css_selector = "div#pagelet_timeline_main_column"
# Not referenced in the visible part of this module.
list_group_css_selector = "div#pagelet_group_"
# Appended to the container selector to match each individual post.
each_post_css_selector = "div._4-u2._4-u8"
|
|
|
|
|
|
class ListBase(object):
    """Base class for iterating over the posts of a list rendered in a browser.

    Subclasses set ``list_css_selector`` and implement ``has_next``,
    ``get_url``, ``get_date`` and ``get_num_of_list``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, driver):
        """Store the WebDriver and initialise empty list state.

        Args:
            driver: Selenium WebDriver instance used for all page access.
        """
        self.driver = driver
        # Queue of post elements collected so far (consumed from the front).
        self.url_list = []
        # CSS selector of the list container; set by subclasses.
        self.list_css_selector = None
        # Not used by the base class itself.
        self.list_container_dom = None
        # The post most recently taken from ``url_list`` (None when exhausted).
        self.current_post = None

    def set_url_elements(self):
        """Find all post elements under the list container and queue them."""
        # The plural helper is required here: the result is passed to
        # ``list.extend``, which needs an iterable of elements.
        # NOTE(review): assumes find_elements_by_css_selector takes the same
        # (driver, selector, wait) arguments as the singular helper -- confirm.
        elements = find_elements_by_css_selector(
            self.driver,
            self.list_css_selector + " " + each_post_css_selector,
            wait_second_for_find_element)
        # Tolerate a helper that returns None when nothing is found.
        self.url_list.extend(elements or [])

    def move_first(self):
        """Make the first queued post the current one.

        Pops the head of ``url_list`` into ``current_post``; when the queue
        is empty ``current_post`` becomes ``None``.
        """
        self.current_post = self.url_list.pop(0) if self.url_list else None

    def move_next(self):
        """Advance to the next queued post (same operation as ``move_first``)."""
        self.move_first()

    def check_list_and_load(self):
        """Trigger loading of more posts while fewer than two are queued.

        Tries at most ``limit_reload`` times. Returns as soon as
        ``url_list`` holds at least two items.

        Raises:
            WebDriverException: if, after the last attempt,
                ``get_num_of_list()`` still reports no items.
        """
        num_of_list = len(self.url_list)
        for _ in range(limit_reload):
            num_of_list = len(self.url_list)
            if num_of_list >= 2:
                # Enough posts are queued; no reload needed.
                return
            self.load_more_list()
            num_of_list = self.get_num_of_list()
        if not num_of_list:
            raise WebDriverException("There is no data or ajax error")

    def load_more_list(self):
        """Nudge the page into loading more items by resizing and scrolling."""
        # Remember the current geometry, maximise, then restore it.
        # NOTE(review): presumably done to force a re-layout -- confirm.
        position = self.driver.get_window_position()
        size = self.driver.get_window_size()
        self.driver.maximize_window()
        self.driver.set_window_size(size['width'], size["height"])
        self.driver.set_window_position(position['x'], position['y'])
        # Scroll up a little, then further down, pausing between steps.
        for _ in range(2):
            self.driver.execute_script("window.scrollBy(0, -400)")
            time.sleep(0.3)
        for _ in range(4):
            self.driver.execute_script("window.scrollBy(0, 800)")
            time.sleep(0.3)

    def has_next(self):
        """Abstract; must be implemented by subclasses."""
        raise NotImplementedError

    def get_url(self):
        """Abstract; must be implemented by subclasses."""
        raise NotImplementedError

    def get_date(self):
        """Abstract; must be implemented by subclasses."""
        raise NotImplementedError

    def remove_current_post(self):
        """Remove the current post's ``div`` from the page DOM."""
        # NOTE(review): on a Selenium WebElement ``.id`` is the driver's
        # internal element reference, not the DOM ``id`` attribute -- confirm
        # what type ``current_post`` holds.
        css_selector = "div#" + self.current_post.id
        # Pass the selector as a script argument instead of splicing it into
        # the JavaScript source, so quotes in the value cannot break the script.
        self.driver.execute_script(
            "document.querySelector(arguments[0]).remove()", css_selector)

    def get_num_of_list(self):
        """Abstract; must be implemented by subclasses.

        ``check_list_and_load`` treats a falsy return value as "no data".
        """
        raise NotImplementedError
|
|
|
|
|
|
class ListTag(ListBase):
    """List whose container is matched by ``list_tag_css_selector``."""

    def __init__(self, driver):
        """Initialise base state, then select the tag-list container.

        Args:
            driver: Selenium WebDriver instance passed on to ``ListBase``.
        """
        super().__init__(driver)
        self.list_css_selector = list_tag_css_selector
|
|
|
|
|
|
class ListPage(ListBase):
    """List whose container is matched by ``list_page_css_selector``."""

    def __init__(self, driver):
        """Initialise base state, then select the page-list container.

        Args:
            driver: Selenium WebDriver instance passed on to ``ListBase``.
        """
        # Run the base initialiser so url_list, current_post and
        # list_container_dom exist; the inherited methods read them.
        super().__init__(driver)
        self.list_css_selector = list_page_css_selector
|
|
|