# -*- coding: utf-8 -*-
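"""Selenium-based crawler for public Facebook posts.

FacebookInit builds search URLs from stored keywords (hashtag or page
search), FacebookPageCrawler walks the result feed, FacebookBodyCrawler
and FacebookReplyCrawler extract post bodies and comments, and
FacebookMainCrawler ties them together and ships rows to the database.
"""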

import logging
import re
import json
import datetime

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException

from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import find_elements_by_css_selector
from base.baseclasses import find_elements_by_xpath
from base.baseclasses import enter_element
from base.baseclasses import Browser

logging.basicConfig(
    level=logging.INFO,
    format='%(module)s(%(lineno)s):%(funcName)s:%(message)s'
)

facebook_url = "https://www.facebook.com/"
facebook_tag_url = "https://www.facebook.com/hashtag/"


class FacebookInit(CrawlInit):
    def __init__(self, before_day=0):
        super().__init__(before_day)
        # Platform codes: 11 = hashtag search, 12 = page search.
        self.urls = dict()
        self.urls[11] = facebook_tag_url
        self.urls[12] = facebook_url

    def split_searches(self):
        # Page-search keywords (platform 12) are used verbatim; hashtag
        # terms are run through CrawlInit.utf8 so they are URL-safe.
        split_list = self.searches().split(',')
        return [x.strip() if self.platform() == 12 else self.utf8(x) for x in split_list]

    def make_url(self):
        return [self.urls[self.platform()] + x + "?fref=ts" for x in self.split_searches()]

    def get_begin_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result
        else:
            return self.start_day()

    def get_end_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            return datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
        else:
            return self.end_day()

    def is_hashtag(self):
        return self.platform() != 12
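

# Hypothetical example: with platform 12 and a stored keyword string of
# "foo, bar", make_url() returns
# ["https://www.facebook.com/foo?fref=ts", "https://www.facebook.com/bar?fref=ts"].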


class FacebookBodyCrawler:
    def __init__(self, driver=None):
        self.driver = driver
        # "YYYY ... M ... D ... H:MM" with arbitrary non-digit separators,
        # so the pattern survives localized timestamp strings.
        self.re_date = re.compile(r"(\d{4})\D+(\d{1,2})\D+(\d{1,2})\D+(\d{1,2}):(\d{1,2})")
        self.re_id = re.compile(r"id=(\d+)")
        # Numeric profile id, or the profile slug after facebook.com/
        # (the lookahead skips *.php endpoints such as photo.php or permalink.php).
        self.re_ids = re.compile(r"id=(\d+)|facebook.com/(?!p[a-zA-Z_.-]+\.php)([\w._\-%]+)")

    def set_driver(self, driver):
        self.driver = driver

    def find_article_id(self):
        href = self.find_article_url()
        m = self.re_ids.search(href)
        return m.group(1) if m.group(2) is None else m.group(2)

    def find_article_nickname(self):
        try:
            element = self.driver.find_element_by_css_selector("div[class='fbPhotoContributorName']")
        except Exception:
            element = self.driver.find_element_by_css_selector("span.fwb>a")
        return element.text

    def find_article_data(self):
        try:
            element = self.driver.find_element_by_css_selector("span[class='hasCaption']")
        except Exception:
            try:
                element = self.driver.find_element_by_css_selector("div[class='_5pbx userContent']")
            except Exception:
                return ""
        return element.text

    def find_platform_id(self):
        pass

    def find_article_date(self):
        element = self.driver.find_element_by_css_selector("abbr[data-utime]")
        str_datetime = element.get_attribute("title")
        logging.debug(str_datetime)
        m = self.re_date.match(str_datetime)
        if m is None:
            return "0000-00-00 00:00:00"
        # "오후" means "PM" in the Korean-locale timestamp, so shift the hour
        # by 12 (e.g. "2018년 3월 5일 오후 7:30" -> "2018-3-5 19:30:00").
        if "오후" in str_datetime and m.group(4) != "12":
            hour = str(int(m.group(4)) + 12)
        else:
            hour = m.group(4)
        return "{}-{}-{} {}:{}:00".format(m.group(1), m.group(2), m.group(3), hour, m.group(5))

    def find_article_url(self):
        try:
            element = self.driver.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']")
        except Exception:
            element = self.driver.find_element_by_css_selector("span#fbPhotoPageTimestamp>a[class='_39g5']")
        return element.get_attribute('href')

    def find_article_title(self):
        return self.driver.title

    def find_platform_name(self):
        pass

    def find_like_users(self):
        try:
            element = self.driver.find_element_by_css_selector("a[data-testid='n_other_people_link']")
            enter_element(element)
            # Wait for the likers list to render.
            find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
        except Exception:
            return None
        try:
            # Keep clicking the "more" pager until it stops appearing.
            while True:
                a_element = WebDriverWait(self.driver, 20).until(
                    EC.visibility_of_element_located(
                        (By.CSS_SELECTOR, "a[class$='uiBoxLightblue uiMorePagerPrimary']")))
                enter_element(a_element)
                wait(1)
        except Exception as e:
            print(e)
        a_elements = self.driver.find_elements_by_css_selector("div[class='fsl fwb fcb']>a")
        like_users = list()
        for a in a_elements:
            like_user = dict()
            like_user['nickname'] = a.text
            m = self.re_ids.search(a.get_attribute('href'))
            like_user['id'] = m.group(2) if m.group(1) is None else m.group(1)
            like_users.append(like_user)
        cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
        enter_element(cancel)
        return {'data': like_users, 'count': len(like_users)}

    def find_share_users(self):
        try:
            element = self.driver.find_element_by_css_selector("a[class='UFIShareLink']")
            enter_element(element)
            page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 30)
        except Exception:
            return None
        page_scroller_children = page_scroller.find_elements_by_css_selector("*")
        # Scroll until the pager div empties out, i.e. no more shares to load.
        while len(page_scroller_children) > 1:
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            wait(2)
            page_scroller = WebDriverWait(self.driver, 20).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "#pagelet_scrolling_pager")))
            page_scroller_children = page_scroller.find_elements_by_css_selector("*")
        a_tags = self.driver.find_elements_by_css_selector("span[class='fwb']>a[class='profileLink']")
        share_users = list()
        for a in a_tags:
            share_user = dict()
            share_user['url'] = a.get_attribute('href')
            share_user['nickname'] = a.text
            str_id = share_user['url'][share_user['url'].rindex('/') + 1:]
            m = self.re_id.search(str_id)
            share_user['id'] = str_id if m is None else m.group(1)
            share_users.append(share_user)
        cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
        enter_element(cancel)
        return {'data': share_users, 'count': len(share_users)}

    def find_like_user_number(self):
        try:
            element = self.driver.find_element_by_css_selector("a[data-testid='n_other_people_link']")
        except Exception:
            return None
        m = re.search(r"(\d+)", element.text.replace(",", ""))
        return None if m is None else m.group(1)

    def find_share_user_number(self):
        try:
            element = self.driver.find_element_by_css_selector("a[class='UFIShareLink']")
        except Exception:
            return None
        m = re.search(r"(\d+)", element.text.replace(",", ""))
        return None if m is None else m.group(1)

    def find_reply_number(self):
        pass

    def find_article_profileurl(self):
        try:
            img = self.driver.find_element_by_css_selector('div._38vo>img')
        except Exception:
            img = self.driver.find_element_by_css_selector("img._s0._54ru")
        return img.get_attribute('src')

    def get_content(self):
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_url'] = self.find_article_url()
        content['article_data'] = self.find_article_data()
        content['article_date'] = self.find_article_date()
        content['article_title'] = self.find_article_title()
        content['article_nickname'] = self.find_article_nickname()
        content['article_form'] = 'body'
        content['platform_name'] = 'facebook'
        content['platform_form'] = 'post'
        content['platform_title'] = content['article_nickname']
        content['platform_id'] = content['article_id']
        content['article_profileurl'] = self.find_article_profileurl()
        like_user_num = self.find_like_user_number()
        share_user_num = self.find_share_user_number()
        if like_user_num:
            content['article_hit'] = like_user_num
        if share_user_num:
            content['reply_url'] = share_user_num
        likes = self.find_like_users()
        shares = self.find_share_users()
        data = list()
        if likes:
            data.append({"likes": likes})
        if shares:
            data.append({"shares": shares})
        if data:
            content["etc"] = json.dumps({"data": data}, indent=4, ensure_ascii=False)
        return content

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)


class FacebookReplyCrawler:
    def __init__(self, driver=None):
        self.driver = driver
        self.reply_list = list()
        self.order = 0
        self.div = None
        self.reload_count = 0
        self.re_date = re.compile(r"(\d{4})\D+(\d{1,2})\D+(\d{1,2})\D+(\d{1,2}):(\d{1,2})")
        self.re_id = re.compile(r"id=(\d+)")
        self.re_ids = re.compile(r"id=(\d+)|facebook.com/(?!p[a-zA-Z_.-]+\.php)([\w._\-%]+)")

    def find_init(self):
        self.reply_list.clear()
        self.order = 0
        self.reload_count = 0

    def set_driver(self, driver):
        self.driver = driver

    def read_all_reply(self):
        # Expand the "view more comments" pager if present, then open replies.
        try:
            a_element = WebDriverWait(self.driver, 15).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "a.UFIPagerLink")))
            enter_element(a_element)
        except Exception:
            pass
        self.read_all_child_reply()

    def read_all_child_reply(self):
        try:
            a_elements = self.driver.find_elements_by_css_selector("a.UFICommentLink")
            for a_element in a_elements:
                enter_element(a_element)
        except Exception:
            pass

    def set_div(self, div=None):
        if div is None:
            try:
                self.div = self.driver.find_element_by_xpath(
                    "//div[@data-reactroot and @class='UFIList']/div[not(@class)]")
            except Exception:
                self.div = None
        else:
            self.div = div

    def has_reply(self):
        """Call after set_div()."""
        if not self.div:
            return False
        children = self.div.find_elements_by_css_selector("*")
        return len(children) > 0

    def crawl_reply(self, div, article_parent=None):
        content = dict()
        content['article_id'] = self.find_article_id(div)
        content['article_nickname'] = self.find_article_nickname(div)
        content['article_data'] = self.find_article_data(div)
        content['article_date'] = self.find_article_date(div)
        content['article_profileurl'] = self.find_article_profileurl(div)
        content['article_order'] = self.order
        like_num = self.find_like_number(div)
        if like_num:
            content['article_hit'] = like_num
        if article_parent:
            content['article_parent'] = article_parent
        content.update({'article_form': 'reply', 'platform_name': 'facebook', 'platform_form': 'post'})
        self.order += 1
        self.reply_list.append(content)

    def crawl_all(self):
        self.read_all_reply()
        self.set_div()
        try:
            if self.has_reply():
                elements = self.div.find_elements_by_xpath("div")
                article_parent = None
                for div in elements:
                    if div.get_attribute('class').find("UFIReplyList") != -1:
                        # Child replies belong to the most recent top-level comment.
                        reply_div = div.find_elements_by_xpath('div[@role]')
                        for child in reply_div:
                            self.crawl_reply(child, article_parent)
                    elif div.get_attribute("role") == "article":
                        self.crawl_reply(div)
                        article_parent = self.reply_list[-1]['article_nickname']
                    else:
                        pass
        except Exception as e:
            logging.info(e)

    def get_content(self):
        return self.reply_list

    def find_article_id(self, div):
        element = div.find_element_by_css_selector("a[class$='UFICommentActorName']")
        m = self.re_ids.search(element.get_attribute('href'))
        if not m:
            return 'None'
        return m.group(1) if not m.group(2) else m.group(2)

    def find_article_parent(self, div):
        pass

    def find_article_date(self, div):
        element = div.find_element_by_css_selector("abbr.livetimestamp")
        str_datetime = element.get_attribute("title")
        m = self.re_date.match(str_datetime)
        if m is None:
            return "0000-00-00 00:00:00"
        # "오후" marks PM in the Korean-locale timestamp; shift the hour by 12.
        if "오후" in str_datetime and m.group(4) != "12":
            hour = str(int(m.group(4)) + 12)
        else:
            hour = m.group(4)
        return "{}-{}-{} {}:{}:00".format(m.group(1), m.group(2), m.group(3), hour, m.group(5))

    def find_article_data(self, div):
        element = div.find_element_by_css_selector("span.UFICommentBody")
        return element.text

    def find_article_profileurl(self, div):
        element = div.find_element_by_css_selector("img[class^='img UFIActorImage']")
        return element.get_attribute('src')

    def find_article_nickname(self, div):
        element = div.find_element_by_css_selector("a[class$='UFICommentActorName']")
        return element.text

    def find_like_number(self, div):
        try:
            element = div.find_element_by_css_selector('a[ajaxify]')
            m = re.search(r"(\d+)", element.text.replace(",", ""))
            return m.group(1) if m else None
        except Exception:
            return None


class FacebookPageCrawler:
    def __init__(self, driver=None):
        self.driver = driver
        self.url_set = set()
        self.index = 0
        self.limit = 500
        self.re_date = re.compile(r"(\d{4})\D+(\d{1,2})\D+(\d{1,2})\D+(\d{1,2}):(\d{1,2})")
        self.reload_count = 0
        self.is_hash = False
        self.main_handle = None
        self.begin_date = None
        self.end_date = None
        self.posts = None
        self.current_url = None

    def set_limit(self, limit=500):
        self.limit = limit

    def set_driver(self, driver):
        self.driver = driver

    def set_main_handle(self):
        self.main_handle = self.driver.window_handles[0]

    def find_article_date(self, div):
        try:
            element = div.find_element_by_css_selector("abbr.livetimestamp")
        except Exception:
            element = div.find_element_by_css_selector("abbr[title]")
        str_datetime = element.get_attribute("title")
        logging.debug(str_datetime)
        m = self.re_date.match(str_datetime)
        if m is None:
            # Sentinel date, treated as "earlier than any crawl window".
            return datetime.datetime(year=1999, month=1, day=1)
        hour = int(m.group(4))
        if "오후" in str_datetime and m.group(4) != "12":  # "오후" == PM
            hour += 12
        return datetime.datetime(
            year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
            hour=hour, minute=int(m.group(5))
        )

    def next_post_by_user(self):
        try:
            self.posts = self.find_posts_wait()
            if len(self.posts) < 1:
                print_and_flush("no posts")
                self.posts = None
                return None
        except Exception as e:
            print_and_flush("cannot find _5pcq post links")
            logging.info(e)
            self.posts = None
            return None
        while True:
            self.index += 1
            if self.index >= len(self.posts):
                self.posts = self.find_posts()
                if self.index >= len(self.posts):
                    if self.load_more_posts() is False:
                        self.posts = None
                        return None
                    self.posts = self.find_posts()
            if self.posts[self.index - 1].get_attribute("href") in self.url_set:
                continue
            time_date = self.find_article_date(self.posts[self.index - 1])
            logging.info("number of posts: " + str(len(self.posts)))
            print_and_flush(str(time_date))
            if type(time_date) == str:
                continue
            # The feed is assumed newest-first: a post older than begin_date
            # ends the walk, one newer than end_date is skipped.
            if self.is_earlier(time_date):
                self.posts = None
                return None
            if self.is_late(time_date):
                continue
            self.current_url = self.posts[self.index - 1].get_attribute('href')
            return self.posts[self.index - 1]

    def next_post_by_tag(self):
        try:
            self.posts = self.find_posts_wait()
            if len(self.posts) < 1:
                print_and_flush("no posts")
                self.posts = None
                return None
        except Exception as e:
            print_and_flush("cannot find _5pcq post links")
            logging.info(e)
            self.posts = None
            return None
        while True:
            self.index += 1
            # Hashtag feeds have no date bound, so stop at the post limit.
            if self.index > self.limit:
                self.posts = None
                return None
            if self.index >= len(self.posts):
                self.posts = self.find_posts()
                if self.index >= len(self.posts):
                    if self.load_more_posts() is False:
                        self.posts = None
                        return None
                    self.posts = self.find_posts()
            if self.posts[self.index - 1].get_attribute("href") in self.url_set:
                continue
            logging.info("number of posts: " + str(len(self.posts)))
            self.current_url = self.posts[self.index - 1].get_attribute('href')
            return self.posts[self.index - 1]

    def load_more_posts(self):
        previous_posts = self.find_posts()
        # Strategy 1: jump straight to the bottom of the page.
        for _ in range(4):
            print_and_flush("Try load more")
            body = self.driver.find_element_by_tag_name("body")
            body.send_keys(Keys.NULL)
            body.send_keys(Keys.END)
            wait(4)
            present_posts = self.find_posts()
            if len(previous_posts) != len(present_posts):
                wait(2)
                self.reload_count = 0
                return True
        # Strategy 2: nudge up, then page down repeatedly.
        for _ in range(4):
            print_and_flush("Try load more")
            body = self.driver.find_element_by_tag_name("body")
            for _ in range(2):
                body.send_keys(Keys.PAGE_UP)
                wait(0.1)
            for _ in range(15):
                body.send_keys(Keys.PAGE_DOWN)
                wait(0.1)
            wait(4)
            present_posts = self.find_posts()
            if len(previous_posts) != len(present_posts):
                wait(2)
                self.reload_count = 0
                return True
        # Strategy 3: scroll in fixed steps via JavaScript.
        for _ in range(10):
            print_and_flush("Try load more")
            self.driver.execute_script("window.scrollBy(0, 800)")
            wait(4)
            present_posts = self.find_posts()
            if len(previous_posts) != len(present_posts):
                wait(2)
                self.reload_count = 0
                return True
        # Strategy 4: maximize and restore the window to trigger a
        # re-render, then retry the previous index.
        if self.reload_count < 8:
            print_and_flush("index reload")
            self.reload_count += 1
            if self.index > 0:
                self.index -= 1
            position = self.driver.get_window_position()
            size = self.driver.get_window_size()
            self.driver.maximize_window()
            self.driver.set_window_size(size['width'], size["height"])
            self.driver.set_window_position(position['x'], position['y'])
            return True
        # Strategy 5: full page refresh, restarting the index.
        if self.reload_count < 15:
            print_and_flush("refresh")
            self.driver.refresh()
            wait(5)
            self.index = 0
            self.reload_count += 1
            return True
        return False

    def is_earlier(self, time_date):
        return time_date < self.begin_date

    def is_late(self, time_date):
        return time_date > self.end_date

    def set_date(self, begin_date, end_date):
        self.set_begin_date(begin_date)
        self.set_end_date(end_date)

    def set_end_date(self, end_date):
        if type(end_date) == str:
            self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
            self.end_date = end_date
        else:
            self.end_date = datetime.datetime.today()
        # Normalize to midnight, then push one day forward so the end date
        # itself is included in the crawl window.
        self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
        self.end_date += datetime.timedelta(days=1)

    def set_begin_date(self, begin_date):
        if type(begin_date) == str:
            self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
        elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
            self.begin_date = begin_date
        else:
            self.begin_date = datetime.datetime.today()
        self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month,
                                            day=self.begin_date.day)

    def crawling_ok(self):
        # Mark the current post as done so it is skipped on re-walks.
        self.url_set.add(self.current_url)

    def init(self):
        self.index = 0
        self.posts = None
        self.url_set.clear()

    def _extract_post_links(self, divs):
        posts = list()
        for div in divs:
            try:
                posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
            except Exception:
                pass
        return posts

    def find_posts(self):
        try:
            divs = self.driver.find_elements_by_xpath("//div[@class='_1dwg']")
        except Exception:
            return None
        return self._extract_post_links(divs)

    def find_posts_wait(self):
        try:
            divs = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']", 30)
        except Exception:
            return None
        return self._extract_post_links(divs)


class FacebookMainCrawler:
    def __init__(self):
        self.page_crawler = FacebookPageCrawler()
        self.body_crawler = FacebookBodyCrawler()
        self.reply_crawler = FacebookReplyCrawler()
        self.send_to_db = SendtoDB()
        self.crawl_init = FacebookInit()
        self.browser = Browser()
        self.driver = None
        self.keyword_id = None
        self.url = None
        self.main_window_handler = None

    def set_driver(self, driver):
        self.page_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all_current_url(self, backup_set=None):
        self.page_crawler.init()
        if backup_set:
            # Resume after a browser crash: skip URLs already crawled.
            self.page_crawler.url_set = backup_set.copy()
        while True:
            post = self.page_crawler.next_post_by_tag() if self.crawl_init.is_hashtag() \
                else self.page_crawler.next_post_by_user()
            if post is None:
                break
            try:
                self.click_new_tab(post)
                self.control_tab()
                self.switch_new_tab()
                wait(5)
                body = self.driver.find_element_by_tag_name('body')
                self.click_element(body)
                body_info = self.crawl_body()
                self.crawl_reply(body_info)
                self.page_crawler.crawling_ok()
                print_and_flush("ok")
                self.switch_main_tab()
            except WebDriverException as ee:
                # Let browser-level failures bubble up so crawl_start can
                # restart the driver.
                logging.info(ee)
                print_and_flush("fail")
                raise
            except Exception as e:
                print_and_flush("failed")
                logging.info(e)

    def crawl_body(self):
        self.body_crawler.set_driver(self.driver)
        content = self.body_crawler.get_content()
        content["keyword_id"] = self.keyword_id
        print_and_flush(content["article_url"])
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
        return {"article_url": content["article_url"], "platform_id": content["platform_id"]}

    def crawl_reply(self, body_info):
        self.reply_crawler.set_driver(self.driver)
        self.reply_crawler.set_div()
        if self.reply_crawler.has_reply():
            self.reply_crawler.crawl_all()
            contents = self.reply_crawler.get_content()
            for content in contents:
                content.update(body_info)
            self.send_to_db.send_reply(contents)

    def start(self):
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        self.keyword_id = int(keyword_id) if type(keyword_id) != int else keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)
        self.page_crawler.set_limit(self.crawl_init.until_page)

    def set_main_window_handler(self, window_handler):
        self.main_window_handler = window_handler

    def crawl_start(self):
        real_time = True
        while real_time:
            print_and_flush("Crawler Start")
            url_list = self.crawl_init.make_url()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    self.set_main_window_handler(self.driver.window_handles[0])
                    print_and_flush(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    self.facebook_login()
                    body = self.driver.find_element_by_tag_name('body')
                    self.click_element(body)
                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
                                               end_date=self.crawl_init.get_end_day())
                    self.crawl_all_current_url(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    # On a driver failure, keep the crawled-URL set and retry
                    # the same keyword with a fresh browser.
                    logging.info(e)
                    backup_set = self.page_crawler.url_set.copy()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.send_to_db.close()
        self.driver.quit()

    def facebook_login(self):
        # If no login form is present, we are already signed in.
        try:
            element_email = find_element_by_css_selector(self.driver, '#email', 15)
            element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
        except Exception:
            return
        email = 'concepters22@gmail.com'
        password = 'zjstpqxjtm'
        element_email.send_keys(email)
        element_pwd.send_keys(password)
        label = self.driver.find_element_by_css_selector('#loginbutton')
        element_input = label.find_element_by_xpath('input')
        element_input.send_keys(Keys.NULL)
        element_input.send_keys(Keys.ENTER)
        wait(5)

    def click_new_tab(self, element):
        # Ctrl+Enter opens the link in a background tab.
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.CONTROL + Keys.ENTER)
        wait(3)

    def switch_new_tab(self):
        self.driver.switch_to_window(self.driver.window_handles[1])

    def switch_main_tab(self):
        self.driver.close()
        self.driver.switch_to_window(self.main_window_handler)

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element(element).click().perform()
        wait(4)

    def control_tab(self):
        ac = ActionChains(self.driver)
        ac.key_down(Keys.CONTROL).key_down(Keys.TAB).perform()
        wait(2)
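

# A minimal usage sketch (hypothetical entry point). The browser name and
# argument values below are illustrative assumptions; keyword rows, DB
# settings, and driver construction all live in base.baseclasses.
if __name__ == "__main__":
    crawler = FacebookMainCrawler()
    # set_arguments(browser, keyword_id, db_num, before_day, until_page)
    crawler.set_arguments("chrome", 1, 0, -1, 500)
    crawler.start()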