Python crawler implemented with Selenium and BeautifulSoup4
git-svn-id: svn://192.168.0.12/source@241 8346c931-da38-4b9b-9d4c-e48b93cbd075
0  WebBasedCrawler/base/__init__.py  Normal file
377  WebBasedCrawler/base/baseclasses.py  Normal file
@@ -0,0 +1,377 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.

@author: cococo
'''
import sys
import time
import os
import psutil
import threading
from time import localtime, strftime

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def print_and_flush(string):
    print(string)
    sys.stdout.flush()


def fcntwait(n):
    time.sleep(n)


def wait(n):
    th = threading.Thread(target=fcntwait, args=(n,))
    th.start()
    th.join()


def insert_log(msg):
    pid = os.getpid()
    tm = strftime("%Y_%m_%d", localtime())
    filename = tm + "_" + str(pid) + ".log"
    total_msg = "[" + strftime("%Y_%m_%d %H:%M:%S", localtime()) + "] " + msg
    with open(filename, "a") as f:
        f.write(total_msg)
        f.flush()


def enter_element(element):
    element.send_keys(Keys.NULL)
    element.send_keys(Keys.ENTER)
    wait(2)


def find_element_by_css_selector(driver, tag, time=0):
    element = WebDriverWait(driver, time).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, tag))
    )
    return element


def find_elements_by_css_selector(driver, tag, time=0):
    elements = WebDriverWait(driver, time).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, tag))
    )
    return elements


def find_element_by_xpath(driver, tag, time=0):
    element = WebDriverWait(driver, time).until(
        EC.presence_of_element_located((By.XPATH, tag))
    )
    return element


def find_elements_by_xpath(driver, tag, time=0):
    elements = WebDriverWait(driver, time).until(
        EC.presence_of_all_elements_located((By.XPATH, tag))
    )
    return elements

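# --- Illustrative usage sketch (not part of the original module) ---
# The four wrappers above centralize Selenium's explicit waits: they block for
# up to `time` seconds until the element(s) are present in the DOM, instead of
# failing immediately the way a bare find_element_* call would.  The selector
# strings and timeout below are made up for the example.
def _example_explicit_wait(driver):
    title = find_element_by_css_selector(driver, "h1.page-title", 10)
    links = find_elements_by_css_selector(driver, "a[href]", 10)
    return title.text, len(links)

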
class Browser:
    def __init__(self, driver=None):
        self.driver = driver
        self.info = ""

    def get_new_driver(self, name):
        """
        windows system:
            name = chrome, ie, opera, firefox
            default driver_exec: chromedriver.exe, IEDriverServer.exe, operadriver.exe
        linux system:
            name = chrome, opera, firefox
            default driver_exec: chromedriver, operadriver
        """
        if sys.platform == "win32":
            if name == "chrome":
                return self.new_chrome_browser(driver_exec="chromedriver.exe")
            elif name == "ie":
                return self.new_ie_browser(driver_exec="IEDriverServer.exe")
            elif name == "opera":
                return self.new_opera_browser(driver_exec="operadriver.exe")
            elif name == "firefox":
                return self.new_firefox_browser()
            else:
                return None
        else:
            if name == "chrome":
                return self.new_chrome_browser(driver_exec="chromedriver")
            elif name == "opera":
                return self.new_opera_browser(driver_exec="operadriver")
            elif name == "firefox":
                return self.new_firefox_browser()
            else:
                return None

    def new_chrome_browser(self, driver_exec=None):
        self.info = "chrome"
        if driver_exec is not None:
            self.chrome_driver_path = driver_exec
            self.chrome_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.chrome_basename):
            port = self.port(self.chrome_basename)
            self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.CHROME)
        else:
            self.driver = webdriver.Chrome(self.chrome_driver_path)
        return self.driver

    def new_ie_browser(self, driver_exec=None):
        self.info = "ie"
        if driver_exec is not None:
            self.ie_driver_path = driver_exec
            self.ie_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.ie_basename):
            port = self.port(self.ie_basename)
            self.driver = webdriver.Remote("http://127.0.0.1:" + port, webdriver.DesiredCapabilities.INTERNETEXPLORER)
        else:
            self.driver = webdriver.Ie(self.ie_driver_path)
        return self.driver

    def new_firefox_browser(self):
        self.info = "firefox"
        self.driver = webdriver.Firefox()
        return self.driver

    def new_opera_browser(self, driver_exec=None):
        self.info = "opera"
        if driver_exec is not None:
            self.opera_driver_path = driver_exec
            self.opera_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.opera_basename):
            port = self.port(self.opera_basename)
            self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.OPERA)
        else:
            self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA, executable_path=self.opera_driver_path)
        return self.driver

    def get_driver(self):
        # Renamed from driver(): a method named driver would be shadowed by the
        # self.driver attribute set in __init__ and could never be called.
        return self.driver

    def is_server_executed(self, driver_basename):
        for ps in psutil.process_iter():
            if ps.name() == driver_basename:
                conns = ps.connections()
                for x in conns:
                    if x.status == "LISTEN":
                        return True
        return False

    def port(self, driver_basename):
        for ps in psutil.process_iter():
            if ps.name() == driver_basename:
                conns = ps.connections()
                for x in conns:
                    if x.status == "LISTEN":
                        return str(x.laddr[1])
        return str(9999)

    def new_browser(self):
        if self.info == "chrome":
            return self.new_chrome_browser()
        elif self.info == "ie":
            return self.new_ie_browser()
        elif self.info == "opera":
            return self.new_opera_browser()
        elif self.info == "firefox":
            return self.new_firefox_browser()
        else:
            return None

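# --- Illustrative usage sketch (not part of the original module) ---
# Browser.get_new_driver() picks a webdriver by name for the current platform
# and, via psutil, reuses an already-running driver server when one is found
# listening on a local port.  Assuming a chromedriver executable is reachable
# from the working directory:
def _example_new_driver():
    browser = Browser()
    driver = browser.get_new_driver("chrome")  # or "firefox", "opera", "ie" on Windows
    if driver is not None:
        driver.get("https://www.facebook.com/")
        wait(2)
        driver.quit()

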
class SendtoDB:
    pymysql = __import__('pymysql.cursors')

    def __init__(self, db_num=0):
        self.conn = self.pymysql.connect(host='bigbird.iptime.org',
                                         user='admin', passwd='admin123',
                                         db='concepters', charset='utf8',
                                         cursorclass=self.pymysql.cursors.DictCursor)
        self.db_num = db_num

    def set_db(self, db_num):
        self.db_num = str(db_num)

    def make_insert_query_backup(self, dictionary):
        query = "insert into data_" + str(self.db_num) + " ("
        for key in dictionary.keys():
            query += (key + ",")
        query = query[:len(query) - 1] + ")"
        query += " values("
        for key, value in dictionary.items():
            if type(value) == int:
                query += (str(value) + ",")
            else:
                query += self.conn.escape(value) + ","
        query = query[:len(query) - 1] + ")"
        return query

    def make_insert_query(self, dictionary):
        query = "insert into data_" + str(self.db_num) + " ("
        key_list = list()
        val_list = list()
        for key, val in dictionary.items():
            key_list.append(key)
            if type(val) == int:
                val_list.append(str(val))
            else:
                val_list.append(self.conn.escape(val))
        return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")"

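    # Worked example (illustrative values): with db_num = 3 and a crawled row
    #     {"article_id": "1234", "article_hit": 7}
    # make_insert_query() returns, after pymysql escapes the string values,
    #     insert into data_3 (article_id,article_hit) values ('1234',7)
    # Column order simply follows the iteration order of the dictionary.
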
    def send_body(self, body):
        if not body:
            return
        self.conn_check()
        with self.conn.cursor() as cursor:
            query = self.make_insert_query(body)
            try:
                cursor.execute(query)
                self.conn.commit()
            except Exception as e:
                pass
                # print(e)
                # sys.stdout.flush()
                # print(query)
                # sys.stdout.flush()

    def send_reply(self, reply):
        if not reply:
            return
        for i in reply:
            self.send_body(i)

    def conn_check(self):
        if not self.conn.open:
            self.conn = self.pymysql.connect(host='bigbird.iptime.org',
                                             user='admin', passwd='admin123',
                                             db='concepters', charset='utf8',
                                             cursorclass=self.pymysql.cursors.DictCursor)

    def close(self):
        self.conn.close()

    def delete_url(self, url):
        query = "delete from data_" + str(self.db_num) + " where article_url = " + self.conn.escape(str(url))
        self.conn_check()
        with self.conn.cursor() as cursor:
            try:
                cursor.execute(query)
                self.conn.commit()
            except Exception as e:
                print(e)
                sys.stdout.flush()
                print(query)
                sys.stdout.flush()

class CrawlInit:
    pymysql = __import__('pymysql.cursors')

    def __init__(self, before_day=0):
        self.conn = self.pymysql.connect(host='bigbird.iptime.org',
                                         user='admin', passwd='admin123',
                                         db='concepters', charset='utf8',
                                         cursorclass=self.pymysql.cursors.DictCursor)
        self.urls = dict()
        self.before_day = before_day

    def set_before_day(self, before_day):
        if type(before_day) == str:
            self.before_day = int(before_day)
        elif type(before_day) == int:
            self.before_day = before_day

    def set_until_page(self, until_page):
        if type(until_page) == str:
            self.until_page = int(until_page)
        elif type(until_page) == int:
            self.until_page = until_page

    def get_keyword_parameters(self, keyword_id):
        query = "select * from keyword where id = " + str(keyword_id)
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query)
                self.params = cursor.fetchone()
                return self.params
        except Exception as e:
            print(e)
            sys.stdout.flush()
            exit(1)
        return dict()

    def get_naver_cafe_list(self):
        query = "select url, clubid from navercafelist"
        # Compare the value returned by authorship(), not the bound method itself.
        if self.authorship() is None or len(self.authorship()) == 0 or self.authorship() == 0:
            pass
        else:
            query += (" where group_num = " + str(self.authorship()))
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query)
                list_result = cursor.fetchall()
                for i in list_result:
                    self.urls[i["url"]] = i["clubid"]
        except Exception as e:
            print(e)
            sys.stdout.flush()
            exit(1)
        return self.urls

    def start_day(self):
        return self.params["start"]

    def end_day(self):
        return self.params["end"]

    def keyword_id(self):
        return self.params["id"]

    def realtime(self):
        return self.params["realtime"]

    def searches(self):
        return self.params["searches"]

    def authorship(self):
        return self.params["authorship"]

    def platform(self):
        return self.params["platform"]

    def is_realtime(self):
        if str(self.realtime()) == '0':
            return False
        else:
            return True

    def euc_kr(self, keyword):
        byte_code = list(keyword.encode("euc_kr"))
        encoded_keyword = ""
        for i in byte_code:
            if i == 0x20:
                encoded_keyword += "+"
            else:
                encoded_keyword += str(hex(i)).replace("0x", "%").upper()
        return encoded_keyword

    def utf8(self, keyword):
        byte_code = list(keyword.encode("utf-8"))
        encoded_keyword = ""
        for i in byte_code:
            encoded_keyword += str(hex(i)).replace("0x", "%").upper()
        return encoded_keyword

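    # Worked example (illustrative keyword): euc_kr() and utf8() percent-encode a
    # search keyword byte-by-byte for use in query URLs.  For the keyword "테스트",
    # utf8() yields "%ED%85%8C%EC%8A%A4%ED%8A%B8"; euc_kr() additionally maps
    # spaces to "+" as form-encoded queries expect.
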
    def disconnect(self):
        self.conn.close()

    def date_to_str(self, arg_date):
        return arg_date.strftime("%Y-%m-%d")
0  WebBasedCrawler/facebook/__init__.py  Normal file
846  WebBasedCrawler/facebook/facebookcrawl.py  Normal file
@@ -0,0 +1,846 @@
#-*- coding: utf-8 -*-
import logging
import re
import json
import datetime


from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException


from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import find_elements_by_css_selector
from base.baseclasses import find_elements_by_xpath
from base.baseclasses import enter_element
from base.baseclasses import Browser


logging.basicConfig(
    level=logging.INFO,
    format='%(module)s(%(lineno)s):%(funcName)s:%(message)s'
)


facebook_url = "https://www.facebook.com/"
facebook_tag_url = "https://www.facebook.com/hashtag/"


class FacebookInit(CrawlInit):
    def __init__(self, before_day=0):
        super().__init__(before_day)
        self.urls = dict()
        self.urls[11] = facebook_tag_url
        self.urls[12] = facebook_url

    def split_searches(self):
        search = self.searches()
        splited_list = search.split(',')
        return [x.strip() if self.platform() == 12 else self.utf8(x) for x in splited_list]
        # trimmed_list = list()
        # if self.platform() == 12:
        #     for x in splited_list:
        #         trimmed_list.append(x.strip())
        # else:
        #     for x in splited_list:
        #         trimmed_list.append(self.utf8(x))
        # return trimmed_list

    def make_url(self):
        return [self.urls[self.platform()] + x + "?fref=ts" for x in self.split_searches()]
        # urls = list()
        # for x in self.split_searches():
        #     url = self.urls[self.platform()] + x + "?fref=ts"
        #     urls.append(url)
        # return urls

    def get_begin_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result
        else:
            return self.start_day()

    def get_end_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            return result
        else:
            return self.end_day()

    def is_hashtag(self):
        return False if self.platform() == 12 else True

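# --- Illustrative sketch (not part of the original module) ---
# FacebookInit.make_url() joins the configured base URL with every search term
# from the keyword row.  With platform() == 11 (hashtag search) and a made-up
# searches() value of "치킨", the generated URL would be the percent-encoded
# hashtag page:
#     https://www.facebook.com/hashtag/%EC%B9%98%ED%82%A8?fref=ts
# whereas platform() == 12 keeps the term as-is and targets a page profile.

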
class FacebookBodyCrawler:
    def __init__(self, driver=None):
        self.driver = driver
        self.re_date = re.compile(
            "([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})"
        )
        self.re_id = re.compile("id=([\\d]+)")
        # self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._-]+)\\??", re.UNICODE)
        self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._\\-%]+)")
        # (("id=([\\d]+)|facebook.com/([\\w._]+)\\?"))

    def set_driver(self, driver):
        self.driver = driver

    def find_article_id(self):
        href = self.find_article_url()
        m = self.re_ids.search(href)
        return m.group(1) if m.group(2) is None else m.group(2)

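    # Worked example (illustrative URLs): re_ids extracts the numeric profile id
    # when the permalink carries an "id=" query parameter, and otherwise the
    # vanity name that follows "facebook.com/" (php endpoints are excluded by
    # the negative lookahead):
    #     "https://www.facebook.com/profile.php?id=98765"   -> group(1) == "98765"
    #     "https://www.facebook.com/some.page/posts/1"      -> group(2) == "some.page"
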
    def find_article_nickname(self):
        try:
            element = self.driver.find_element_by_css_selector("div[class='fbPhotoContributorName']")
        except:
            element = self.driver.find_element_by_css_selector("span.fwb>a")
        return element.text

    def find_article_data(self):
        try:
            element = self.driver.find_element_by_css_selector("span[class='hasCaption']")
        except:
            try:
                element = self.driver.find_element_by_css_selector("div[class='_5pbx userContent']")
            except:
                return ""
        return element.text

    def find_platform_id(self):
        pass

    def find_article_date(self):
        element = self.driver.find_element_by_css_selector("abbr[data-utime]")
        str_datetime = element.get_attribute("title")
        logging.debug(str_datetime)
        m = self.re_date.match(str_datetime)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            # "오후" marks a PM timestamp, so the hour is shifted to a 24-hour clock.
            if str_datetime.find("오후") != -1 and m.group(4) != "12":
                return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + str(int(m.group(4)) + 12) + ":" + \
                    m.group(5) + ":00"
            else:
                return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + m.group(4) + ":" + m.group(5) + ":00"

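    # Worked example (illustrative timestamp): for an <abbr> title such as
    # "2015년 12월 8일 오후 3:24", re_date captures (2015, 12, 8, 3, 24); because
    # the string contains "오후" (PM) and the hour is not 12, find_article_date()
    # returns "2015-12-8 15:24:00".
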
    def find_article_url(self):
        try:
            element = self.driver.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']")
        except:
            element = self.driver.find_element_by_css_selector("span#fbPhotoPageTimestamp>a[class='_39g5']")
        return element.get_attribute('href')

    def find_article_title(self):
        return self.driver.title

    def find_platform_name(self):
        pass

    def find_like_users(self):
        try:
            element = self.driver.find_element_by_css_selector("a[data-testid='n_other_people_link']")
            enter_element(element)
            ul = find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
        except:
            return None
        try:
            while True:
                # a_element = find_element_by_css_selector(self.driver, "a[class$='uiBoxLightblue uiMorePagerPrimary']",
                #                                          30)
                a_element = WebDriverWait(self.driver, 20).\
                    until(EC.visibility_of_element_located((By.CSS_SELECTOR,
                                                            "a[class$='uiBoxLightblue uiMorePagerPrimary']")))
                enter_element(a_element)
                wait(1)
        except Exception as e:
            print(e)
        # ul = find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
        a_elements = self.driver.find_elements_by_css_selector("div[class='fsl fwb fcb']>a")
        like_users = list()
        for a in a_elements:
            like_user = dict()
            like_user['nickname'] = a.text
            m = self.re_ids.search(a.get_attribute('href'))
            like_user['id'] = m.group(2) if m.group(1) is None else m.group(1)
            like_users.append(like_user)
        cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
        enter_element(cancel)
        return {'data': like_users, 'count': len(like_users)}

    def find_share_users(self):
        try:
            element = self.driver.find_element_by_css_selector("a[class='UFIShareLink']")
            enter_element(element)
            # share_element = find_element_by_css_selector(self.driver, "#repost_view_dialog", 30)
            page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 30)
        except:
            return None
        page_scroller_children = page_scroller.find_elements_by_css_selector("*")
        while len(page_scroller_children) > 1:
            self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            wait(2)
            # page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 30)
            # The selector needs the leading "#"; without it the wait looks for a tag name.
            page_scroller = WebDriverWait(self.driver, 20).\
                until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#pagelet_scrolling_pager")))
            page_scroller_children = page_scroller.find_elements_by_css_selector("*")
        a_tags = self.driver.find_elements_by_css_selector("span[class='fwb']>a[class='profileLink']")
        share_users = list()
        for a in a_tags:
            share_user = dict()
            share_user['url'] = a.get_attribute('href')
            share_user['nickname'] = a.text
            str_id = share_user['url'][share_user['url'].rindex('/') + 1:]
            m = self.re_id.search(str_id)
            share_user['id'] = str_id if m is None else m.group(1)
            share_users.append(share_user)
        cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
        enter_element(cancel)
        return {'data': share_users, 'count': len(share_users)}

    def find_like_user_number(self):
        try:
            element = self.driver.find_element_by_css_selector("a[data-testid='n_other_people_link']")
        except:
            return None
        str_num = element.text
        m = re.search("(\\d+)", str_num.replace(",", ""))
        return None if m is None else m.group(1)

    def find_share_user_number(self):
        try:
            element = self.driver.find_element_by_css_selector("a[class='UFIShareLink']")
        except:
            return None
        str_num = element.text
        m = re.search("(\\d+)", str_num.replace(",", ""))
        return None if m is None else m.group(1)

    def find_reply_number(self):
        pass

    def find_article_profileurl(self):
        try:
            img = self.driver.find_element_by_css_selector('div._38vo>img')
        except:
            img = self.driver.find_element_by_css_selector("img._s0._54ru")
        return img.get_attribute('src')

    def get_content(self):
        content = dict()
        content['article_id'] = self.find_article_id()
        content['article_url'] = self.find_article_url()
        content['article_data'] = self.find_article_data()
        content['article_date'] = self.find_article_date()
        content['article_title'] = self.find_article_title()
        content['article_nickname'] = self.find_article_nickname()
        content['article_form'] = 'body'
        content['platform_name'] = 'facebook'
        content['platform_form'] = 'post'
        content['platform_title'] = content['article_nickname']
        content['platform_id'] = content['article_id']
        content['article_profileurl'] = self.find_article_profileurl()
        like_user_num = self.find_like_user_number()
        share_user_num = self.find_share_user_number()
        if like_user_num:
            content['article_hit'] = self.find_like_user_number()
        if share_user_num:
            content['reply_url'] = self.find_share_user_number()
        likes = self.find_like_users()
        # The original called find_like_users() twice; the shares list comes from find_share_users().
        shares = self.find_share_users()
        data = list()
        if likes:
            data.append({"likes": likes})
        if shares:
            data.append({"shares": shares})
        if data:
            content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode({"data": data})
        return content

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)


class FacebookReplyCrawler:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
self.reply_list = list()
|
||||
self.order = 0
|
||||
self.div = None
|
||||
self.re_date = re.compile(
|
||||
"([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})"
|
||||
)
|
||||
self.re_id = re.compile("id=([\\d]+)")
|
||||
# self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._-]+)\\??", re.UNICODE)
|
||||
self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._\\-%]+)")
|
||||
|
||||
def find_init(self):
|
||||
self.reply_list.clear()
|
||||
self.order = 0
|
||||
self.reload_count = 0
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
|
||||
def read_all_reply(self):
|
||||
try:
|
||||
a_element = WebDriverWait(self.driver, 15).\
|
||||
until(EC.visibility_of_element_located((By.CSS_SELECTOR,
|
||||
"a.UFIPagerLink")))
|
||||
enter_element(a_element)
|
||||
except:
|
||||
pass
|
||||
self.read_all_child_reply()
|
||||
|
||||
def read_all_child_reply(self):
|
||||
try:
|
||||
a_elements = self.driver.find_elements_by_css_selector("a.UFICommentLink")
|
||||
for a_element in a_elements:
|
||||
enter_element(a_element)
|
||||
except:
|
||||
pass
|
||||
|
||||
def set_div(self, div=None):
|
||||
if div is None:
|
||||
try:
|
||||
self.div = self.driver.find_element_by_xpath(
|
||||
"//div[@data-reactroot and @class='UFIList']/div[not(@class)]")
|
||||
# self.div = self.driver.find_element_by_css_selector("div[data-reactroot].UFIList>div:not([class])")
|
||||
except:
|
||||
self.div = None
|
||||
else:
|
||||
self.div = div
|
||||
|
||||
def has_reply(self):
|
||||
"""after set_div execute this"""
|
||||
if not self.div:
|
||||
return False
|
||||
else:
|
||||
children = self.div.find_elements_by_css_selector("*")
|
||||
return True if len(children) > 0 else False
|
||||
|
||||
def crawl_reply(self, div, article_parent=None):
|
||||
content = dict()
|
||||
content['article_id'] = self.find_article_id(div)
|
||||
content['article_nickname'] = self.find_article_nickname(div)
|
||||
content['article_data'] = self.find_article_data(div)
|
||||
content['article_date'] = self.find_article_date(div)
|
||||
content['article_profileurl'] = self.find_article_profileurl(div)
|
||||
content['article_order'] = self.order
|
||||
like_num = self.find_like_number(div)
|
||||
if like_num:
|
||||
content['article_hit'] = like_num
|
||||
if article_parent:
|
||||
content['article_parent'] = article_parent
|
||||
content.update({'article_form': 'reply', 'platform_name': 'facebook', 'platform_form': 'post'})
|
||||
self.order += 1
|
||||
self.reply_list.append(content)
|
||||
|
||||
def crawl_all(self):
|
||||
self.read_all_reply()
|
||||
self.set_div()
|
||||
try:
|
||||
if self.has_reply():
|
||||
elements = self.div.find_elements_by_xpath("div")
|
||||
article_parent = None
|
||||
for div in elements:
|
||||
if div.get_attribute('class').find("UFIReplyList") != -1:
|
||||
reply_div = div.find_elements_by_xpath('div[@role]')
|
||||
for child in reply_div:
|
||||
self.crawl_reply(child, article_parent)
|
||||
elif div.get_attribute("role") == "article":
|
||||
self.crawl_reply(div)
|
||||
article_parent = self.reply_list[len(self.reply_list) - 1]['article_nickname']
|
||||
else:
|
||||
pass
|
||||
except Exception as e:
|
||||
logging.info(e)
|
||||
|
||||
def get_content(self):
|
||||
return self.reply_list
|
||||
|
||||
def find_article_id(self, div):
|
||||
element = div.find_element_by_css_selector("a[class$='UFICommentActorName']")
|
||||
m = self.re_ids.search(element.get_attribute('href'))
|
||||
if not m:
|
||||
return 'None'
|
||||
return m.group(1) if not m.group(2) else m.group(2)
|
||||
|
||||
def find_article_parent(self, div):
|
||||
pass
|
||||
|
||||
def find_article_date(self, div):
|
||||
element = div.find_element_by_css_selector("abbr.livetimestamp")
|
||||
str_datetime = element.get_attribute("title")
|
||||
m = self.re_date.match(str_datetime)
|
||||
if m is None:
|
||||
return "0000-00-00 00:00:00"
|
||||
else:
|
||||
if str_datetime.find("오후") != -1 and m.group(4) != "12":
|
||||
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + str(int(m.group(4)) + 12) + ":" + \
|
||||
m.group(5) + ":00"
|
||||
else:
|
||||
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + m.group(4) + ":" + m.group(5) + ":00"
|
||||
|
||||
def find_article_data(self, div):
|
||||
element = div.find_element_by_css_selector("span.UFICommentBody")
|
||||
return element.text
|
||||
|
||||
def find_article_profileurl(self, div):
|
||||
element = div.find_element_by_css_selector("img[class^='img UFIActorImage']")
|
||||
return element.get_attribute('src')
|
||||
|
||||
def find_article_nickname(self, div):
|
||||
element = div.find_element_by_css_selector("a[class$='UFICommentActorName']")
|
||||
return element.text
|
||||
|
||||
def find_like_number(self, div):
|
||||
try:
|
||||
element = div.find_element_by_css_selector('a[ajaxify]')
|
||||
m = re.search("(\\d+)", element.text.replace(",", ""))
|
||||
return m.group(1) if m else None
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
class FacebookPageCrawler:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
self.url_set = set()
|
||||
self.index = 0
|
||||
self.limit = 500
|
||||
self.re_date = re.compile("([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})")
|
||||
self.reload_count = 0
|
||||
self.is_hash = False
|
||||
self.main_handle = None
|
||||
self.begin_date = None
|
||||
self.end_date = None
|
||||
self.posts = None
|
||||
self.current_url = None
|
||||
|
||||
def set_limit(self, limit=500):
|
||||
self.limit = limit
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
|
||||
def set_main_handle(self):
|
||||
self.main_handle = self.driver.window_handles[0]
|
||||
|
||||
def find_article_date(self, div):
|
||||
try:
|
||||
element = div.find_element_by_css_selector("abbr.livetimestamp")
|
||||
except:
|
||||
element = div.find_element_by_css_selector("abbr[title]")
|
||||
str_datetime = element.get_attribute("title")
|
||||
logging.debug(str_datetime)
|
||||
m = self.re_date.match(str_datetime)
|
||||
if m is None:
|
||||
return datetime.datetime(year=1999, month=1, day=1)
|
||||
else:
|
||||
if str_datetime.find("오후") != -1 and m.group(4) != "12":
|
||||
return datetime.datetime(
|
||||
year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
|
||||
hour=(int(m.group(4)) + 12), minute=int(m.group(5))
|
||||
)
|
||||
else:
|
||||
return datetime.datetime(
|
||||
year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
|
||||
hour=(int(m.group(4))), minute=int(m.group(5))
|
||||
)
|
||||
|
||||
def next_post_by_user(self):
|
||||
try:
|
||||
#self.posts = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts_wait()
|
||||
if len(self.posts) < 1:
|
||||
print_and_flush("not posts")
|
||||
self.posts = None
|
||||
return None
|
||||
except Exception as e:
|
||||
print_and_flush("cannot found _5pcq")
|
||||
logging.info(e)
|
||||
self.posts = None
|
||||
return None
|
||||
while True:
|
||||
self.index += 1
|
||||
if self.index >= len(self.posts):
|
||||
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts()
|
||||
if self.index >= len(self.posts):
|
||||
if self.load_more_posts() is False:
|
||||
self.posts = None
|
||||
return None
|
||||
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts()
|
||||
if self.posts[self.index - 1].get_attribute("href") in self.url_set:
|
||||
continue
|
||||
time_date = self.find_article_date(self.posts[self.index - 1])
|
||||
logging.info("number of posts: " + str(len(self.posts)))
|
||||
print_and_flush(str(time_date))
|
||||
if type(time_date) == str:
|
||||
continue
|
||||
if self.is_earlier(time_date):
|
||||
self.posts = None
|
||||
return None
|
||||
if self.is_late(time_date):
|
||||
continue
|
||||
self.current_url = self.posts[self.index - 1].get_attribute('href')
|
||||
return self.posts[self.index - 1]
|
||||
|
||||
def next_post_by_tag(self):
|
||||
try:
|
||||
# self.posts = find_elements_by_css_selector(self.driver, "a[class='_5pcq']")
|
||||
# self.posts = find_elements_by_xpath(self.driver, "//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts_wait()
|
||||
if len(self.posts) < 1:
|
||||
print_and_flush("not posts")
|
||||
self.posts = None
|
||||
return None
|
||||
except Exception as e:
|
||||
print_and_flush("cannot found _5pcq")
|
||||
logging.info(e)
|
||||
self.posts = None
|
||||
return None
|
||||
while True:
|
||||
self.index += 1
|
||||
if self.index > self.limit:
|
||||
self.posts = None
|
||||
return None
|
||||
if self.index >= len(self.posts):
|
||||
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts()
|
||||
if self.index >= len(self.posts):
|
||||
if self.load_more_posts() is False:
|
||||
self.posts = None
|
||||
return None
|
||||
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts()
|
||||
if self.posts[self.index - 1].get_attribute("href") in self.url_set:
|
||||
continue
|
||||
logging.info("number of posts: " + str(len(self.posts)))
|
||||
self.current_url = self.posts[self.index - 1].get_attribute('href')
|
||||
return self.posts[self.index - 1]
|
||||
|
||||
def load_more_posts(self):
|
||||
# previous_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# previous_posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
previous_posts = self.find_posts()
|
||||
for i in range(0, 4):
|
||||
print_and_flush("Try load more")
|
||||
body = self.driver.find_element_by_tag_name("body")
|
||||
body.send_keys(Keys.NULL)
|
||||
body.send_keys(Keys.END)
|
||||
wait(4)
|
||||
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
present_posts = self.find_posts()
|
||||
if len(previous_posts) != len(present_posts):
|
||||
wait(2)
|
||||
self.reload_count = 0
|
||||
return True
|
||||
for i in range(0, 4):
|
||||
print_and_flush("Try load more")
|
||||
body = self.driver.find_element_by_tag_name("body")
|
||||
for j in range(0, 2):
|
||||
body.send_keys(Keys.PAGE_UP)
|
||||
wait(0.1)
|
||||
for j in range(0, 15):
|
||||
body.send_keys(Keys.PAGE_DOWN)
|
||||
wait(0.1)
|
||||
wait(4)
|
||||
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
present_posts = self.find_posts()
|
||||
if len(previous_posts) != len(present_posts):
|
||||
wait(2)
|
||||
self.reload_count = 0
|
||||
return True
|
||||
for i in range(0, 10):
|
||||
print_and_flush("Try load more")
|
||||
self.driver.execute_script("window.scrollBy(0, 800)")
|
||||
wait(4)
|
||||
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
present_posts = self.find_posts()
|
||||
if len(previous_posts) != len(present_posts):
|
||||
wait(2)
|
||||
self.reload_count = 0
|
||||
return True
|
||||
if self.reload_count < 8:
|
||||
print_and_flush("index reload")
|
||||
self.reload_count += 1
|
||||
self.index -= 1 if self.index > 0 else 0
|
||||
position = self.driver.get_window_position()
|
||||
size = self.driver.get_window_size()
|
||||
self.driver.maximize_window()
|
||||
self.driver.set_window_size(size['width'], size["height"])
|
||||
self.driver.set_window_position(position['x'], position['y'])
|
||||
return True
|
||||
if self.reload_count < 15:
|
||||
print_and_flush("refresh")
|
||||
self.driver.refresh()
|
||||
wait(5)
|
||||
self.index = 0
|
||||
self.reload_count += 1
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_earlier(self, time_date):
|
||||
return True if time_date < self.begin_date else False
|
||||
|
||||
def is_late(self, time_date):
|
||||
return True if time_date > self.end_date else False
|
||||
|
||||
def set_date(self, begin_date, end_date):
|
||||
self.set_begin_date(begin_date)
|
||||
self.set_end_date(end_date)
|
||||
|
||||
def set_end_date(self, end_date):
|
||||
if type(end_date) == str:
|
||||
self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
|
||||
elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
|
||||
self.end_date = end_date
|
||||
else:
|
||||
self.end_date = datetime.datetime.today()
|
||||
self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
|
||||
self.end_date += datetime.timedelta(days=1)
|
||||
|
||||
def set_begin_date(self, begin_date):
|
||||
if type(begin_date) == str:
|
||||
self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
|
||||
elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
|
||||
self.begin_date = begin_date
|
||||
else:
|
||||
self.begin_date = datetime.datetime.today()
|
||||
self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month,
|
||||
day=self.begin_date.day)
|
||||
|
||||
def crawling_ok(self):
|
||||
self.url_set.add(self.current_url)
|
||||
|
||||
def init(self):
|
||||
self.index = 0
|
||||
self.posts = None
|
||||
self.url_set.clear()
|
||||
|
||||
def find_posts(self):
|
||||
try:
|
||||
divs = self.driver.find_elements_by_xpath("//div[@class='_1dwg']")
|
||||
except:
|
||||
return None
|
||||
# return [div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']") for div in divs]
|
||||
posts = list()
|
||||
for div in divs:
|
||||
try:
|
||||
posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
|
||||
except:
|
||||
pass
|
||||
return posts
|
||||
|
||||
def find_posts_wait(self):
|
||||
try:
|
||||
divs = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']", 30)
|
||||
except:
|
||||
return None
|
||||
# return [div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']") for div in divs]
|
||||
posts = list()
|
||||
for div in divs:
|
||||
try:
|
||||
posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
|
||||
except:
|
||||
pass
|
||||
return posts
|
||||
|
||||
|
||||
class FacebookMainCrawler:
|
||||
def __init__(self):
|
||||
self.page_crawler = FacebookPageCrawler()
|
||||
self.body_crawler = FacebookBodyCrawler()
|
||||
self.reply_crawler = FacebookReplyCrawler()
|
||||
self.send_to_db = SendtoDB()
|
||||
self.crawl_init = FacebookInit()
|
||||
self.browser = Browser()
|
||||
self.driver = None
|
||||
self.keyword_id = None
|
||||
self.url = None
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.page_crawler.set_driver(driver)
|
||||
self.body_crawler.set_driver(driver)
|
||||
self.reply_crawler.set_driver(driver)
|
||||
self.driver = driver
|
||||
|
||||
def set_keyword_id(self, keyword_id):
|
||||
self.keyword_id = keyword_id
|
||||
|
||||
def crawl_all_current_url(self, backup_set=None):
|
||||
self.page_crawler.init()
|
||||
if backup_set:
|
||||
self.page_crawler.url_set = backup_set.copy()
|
||||
while True:
|
||||
post = self.page_crawler.next_post_by_tag() if self.crawl_init.is_hashtag() \
|
||||
else self.page_crawler.next_post_by_user()
|
||||
if post is None:
|
||||
break
|
||||
try:
|
||||
self.click_new_tab(post)
|
||||
self.control_tab()
|
||||
self.switch_new_tab()
|
||||
wait(5)
|
||||
body = self.driver.find_element_by_tag_name('body')
|
||||
self.click_element(body)
|
||||
body_info = self.crawl_body()
|
||||
self.crawl_reply(body_info)
|
||||
self.page_crawler.crawling_ok()
|
||||
print_and_flush("ok")
|
||||
self.switch_main_tab()
|
||||
except WebDriverException as ee:
|
||||
logging.info(ee)
|
||||
print_and_flush("fail")
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
print_and_flush("failed")
|
||||
logging.info(e)
|
||||
|
||||
def crawl_body(self):
|
||||
self.body_crawler.set_driver(self.driver)
|
||||
content = self.body_crawler.get_content()
|
||||
content["keyword_id"] = self.keyword_id
|
||||
print_and_flush(content["article_url"])
|
||||
self.send_to_db.delete_url(content['article_url'])
|
||||
self.send_to_db.send_body(content)
|
||||
return {"article_url": content["article_url"], "platform_id": content["platform_id"]}
|
||||
|
||||
def crawl_reply(self, body_info):
|
||||
self.reply_crawler.set_driver(self.driver)
|
||||
self.reply_crawler.set_div()
|
||||
if self.reply_crawler.has_reply():
|
||||
self.reply_crawler.crawl_all()
|
||||
contents = self.reply_crawler.get_content()
|
||||
for content in contents:
|
||||
content.update(body_info)
|
||||
self.send_to_db.send_reply(contents)
|
||||
|
||||
def start(self):
|
||||
self.crawl_start()
|
||||
|
||||
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
|
||||
self.init_browser(browser)
|
||||
self.init_keyword_id(keyword_id)
|
||||
self.init_db(db_num)
|
||||
self.init_before_day(before_day)
|
||||
self.init_until_page(until_page)
|
||||
|
||||
def init_browser(self, browser):
|
||||
self.set_driver(self.browser.get_new_driver(browser))
|
||||
|
||||
def init_keyword_id(self, keyword_id):
|
||||
self.keyword_id = int(keyword_id) if type(keyword_id) != int else keyword_id
|
||||
self.crawl_init.get_keyword_parameters(keyword_id)
|
||||
self.crawl_init.disconnect()
|
||||
|
||||
def init_db(self, db_num):
|
||||
self.send_to_db.set_db(db_num)
|
||||
|
||||
def init_before_day(self, before_day):
|
||||
self.crawl_init.set_before_day(before_day)
|
||||
|
||||
def init_until_page(self, until_page):
|
||||
self.crawl_init.set_until_page(until_page)
|
||||
self.page_crawler.set_limit(self.crawl_init.until_page)
|
||||
|
||||
def set_main_window_handler(self, window_handler):
|
||||
self.main_window_handler = window_handler
|
||||
|
||||
def crawl_start(self):
|
||||
real_time = True
|
||||
while real_time:
|
||||
print_and_flush("Crawler Start")
|
||||
url_list = self.crawl_init.make_url()
|
||||
i = 0
|
||||
backup_set = set()
|
||||
while i < len(url_list):
|
||||
try:
|
||||
self.set_main_window_handler(self.driver.window_handles[0])
|
||||
print_and_flush(url_list[i] + "\n")
|
||||
self.driver.get(url_list[i])
|
||||
wait(5)
|
||||
self.facebook_login()
|
||||
body = self.driver.find_element_by_tag_name('body')
|
||||
self.click_element(body)
|
||||
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
|
||||
end_date=self.crawl_init.get_end_day())
|
||||
self.crawl_all_current_url(backup_set)
|
||||
i += 1
|
||||
backup_set.clear()
|
||||
except Exception as e:
|
||||
logging.info(e)
|
||||
backup_set = self.page_crawler.url_set.copy()
|
||||
self.set_driver(self.browser.new_browser())
|
||||
wait(5)
|
||||
real_time = self.crawl_init.is_realtime()
|
||||
print_and_flush("Finished Crawling :)")
|
||||
self.send_to_db.close()
|
||||
self.driver.quit()
|
||||
|
||||
def facebook_login(self):
|
||||
try:
|
||||
element_email = find_element_by_css_selector(self.driver, '#email', 15)
|
||||
element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
|
||||
except:
|
||||
return
|
||||
email = 'concepters22@gmail.com'
|
||||
password = 'zjstpqxjtm'
|
||||
element_email.send_keys(email)
|
||||
element_pwd.send_keys(password)
|
||||
label = self.driver.find_element_by_css_selector('#loginbutton')
|
||||
element_input = label.find_element_by_xpath('input')
|
||||
element_input.send_keys(Keys.NULL)
|
||||
element_input.send_keys(Keys.ENTER)
|
||||
wait(5)
|
||||
|
||||
def click_new_tab(self, element):
|
||||
#ac = ActionChains(self.driver)
|
||||
#ac.key_down(Keys.CONTROL).move_to_element(element).click().key_up(Keys.CONTROL).perform()
|
||||
element.send_keys(Keys.NULL)
|
||||
element.send_keys(Keys.CONTROL + Keys.ENTER)
|
||||
wait(3)
|
||||
|
||||
    def switch_new_tab(self):
        self.driver.switch_to_window(self.driver.window_handles[1])

    def switch_main_tab(self):
        self.driver.close()
        self.driver.switch_to_window(self.main_window_handler)

    def click_element(self, element):
        ac = ActionChains(self.driver)
        # ac.move_to_element_with_offset(element, 0, 0).click().perform()
        ac.move_to_element(element).click().perform()
        wait(4)

    def control_tab(self):
        ac = ActionChains(self.driver)
        ac.key_down(Keys.CONTROL).key_down(Keys.TAB).perform()
        wait(2)

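# --- Illustrative usage sketch (not part of the original module) ---
# A driver script would wire the crawler up roughly like this; the argument
# values (keyword id, table number, look-back days, page limit) are made up for
# the example.  get_begin_day() adds before_day to today's date, so a negative
# value reaches back from today.
def _example_run():
    crawler = FacebookMainCrawler()
    crawler.set_arguments(browser="chrome", keyword_id=1, db_num=0,
                          before_day=-7, until_page=100)
    crawler.start()
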
907  WebBasedCrawler/facebook/facebookcrawlbs.py  Normal file
@@ -0,0 +1,907 @@
#-*- coding: utf-8 -*-
import logging
import re
import json
import datetime
import time


from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup

from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import find_elements_by_css_selector
from base.baseclasses import find_elements_by_xpath
from base.baseclasses import enter_element
from base.baseclasses import Browser


logging.basicConfig(
    level=logging.INFO,
    format='%(module)s(%(lineno)s):%(funcName)s:%(message)s'
)

# parser_method = 'html.parser'
parser_method = 'lxml'

facebook_url = "https://www.facebook.com/"
facebook_tag_url = "https://www.facebook.com/hashtag/"

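# --- Illustrative sketch (not part of the original module) ---
# This variant hands driver.page_source to BeautifulSoup instead of querying the
# live DOM for every field.  parser_method selects the backend: 'lxml' (used
# here) requires the lxml package, while 'html.parser' is the slower but
# dependency-free fallback kept in the comment above.
def _example_make_soup(html):
    return BeautifulSoup(html, parser_method)

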
class FacebookInit(CrawlInit):
|
||||
def __init__(self, before_day=0):
|
||||
super().__init__(before_day)
|
||||
self.urls = dict()
|
||||
self.urls[11] = facebook_tag_url
|
||||
self.urls[12] = facebook_url
|
||||
|
||||
def split_searches(self):
|
||||
search = self.searches()
|
||||
splited_list = search.split(',')
|
||||
return [x.strip() if self.platform() == 12 else self.utf8(x) for x in splited_list]
|
||||
# trimmed_list = list()
|
||||
# if self.platform() == 12:
|
||||
# for x in splited_list:
|
||||
# trimmed_list.append(x.strip())
|
||||
# else:
|
||||
# for x in splited_list:
|
||||
# trimmed_list.append(self.utf8(x))
|
||||
# return trimmed_list
|
||||
|
||||
def make_url(self):
|
||||
return [self.urls[self.platform()] + x + "?fref=ts" for x in self.split_searches()]
|
||||
# urls = list()
|
||||
# for x in self.split_searches():
|
||||
# url = self.urls[self.platform()] + x + "?fref=ts"
|
||||
# urls.append(url)
|
||||
# return urls
|
||||
|
||||
def get_begin_day(self):
|
||||
if self.is_realtime():
|
||||
date_now = datetime.datetime.now()
|
||||
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
||||
result += datetime.timedelta(days=self.before_day)
|
||||
return result
|
||||
else:
|
||||
return self.start_day()
|
||||
|
||||
def get_end_day(self):
|
||||
if self.is_realtime():
|
||||
date_now = datetime.datetime.now()
|
||||
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
||||
return result
|
||||
else:
|
||||
return self.end_day()
|
||||
|
||||
def is_hashtag(self):
|
||||
return False if self.platform() == 12 else True
|
||||
|
||||
|
||||
class FacebookBodyCrawler:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
self.re_date = re.compile(
|
||||
"([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})"
|
||||
)
|
||||
self.re_id = re.compile("id=([\\d]+)")
|
||||
# self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._-]+)\\??", re.UNICODE)
|
||||
self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._\\-%]+)")
|
||||
#(("id=([\\d]+)|facebook.com/([\\w._]+)\\?"))
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
|
||||
def find_article_id(self, soup):
|
||||
element = soup.find('a', class_='_2yug')
|
||||
if element:
|
||||
href = element.get('href')
|
||||
else:
|
||||
href = self.find_article_url(soup)
|
||||
m = self.re_ids.search(href)
|
||||
return m.group(1) if m.group(2) is None else m.group(2)
|
||||
|
||||
def find_article_nickname(self, soup):
|
||||
nickname = soup.find('div', class_='fbPhotoContributorName')
|
||||
if not nickname or not nickname.get_text():
|
||||
span = soup.find('span', class_='fwb fcg')
|
||||
if span:
|
||||
nickname = span.a
|
||||
else:
|
||||
nickname = soup.find('a', 'profileLink')
|
||||
if not nickname:
|
||||
nickname = soup.find('a', class_='_2yug')
|
||||
return nickname.get_text() if nickname else ""
|
||||
|
||||
def find_article_data(self, soup):
|
||||
element = soup.find('span', class_='hasCaption')
|
||||
if not element:
|
||||
element = soup.find('div', class_='_5pbx userContent')
|
||||
if not element:
|
||||
element = soup.find('div', class_='_39k5')
|
||||
return element.get_text() if element else ""
|
||||
|
||||
def find_platform_id(self, soup):
|
||||
pass
|
||||
|
||||
def find_article_date(self, soup):
|
||||
element = soup.find('abbr', attrs={'data-utime': True})
|
||||
if element:
|
||||
str_datetime = element.get('title')
|
||||
logging.debug(str_datetime)
|
||||
m = self.re_date.match(str_datetime)
|
||||
if m is None:
|
||||
return "0000-00-00 00:00:00"
|
||||
else:
|
||||
if str_datetime.find("오후") != -1 and m.group(4) != "12":
|
||||
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + str(int(m.group(4)) + 12) + ":" + \
|
||||
m.group(5) + ":00"
|
||||
else:
|
||||
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + m.group(4) + ":" + m.group(5) + \
|
||||
":00"
|
||||
else:
|
||||
element = soup.find('a', '_39g5')
|
||||
str_datetime = element.get_text()
|
||||
return "0000-00-00 00:00:00"
|
||||
|
||||
def find_article_url(self, soup):
|
||||
element = soup.select_one("span.fsm.fwn.fcg > a._5pcq")
|
||||
if not element:
|
||||
element = soup.select_one("span#fbPhotoPageTimestamp > a._39g5")
|
||||
if not element:
|
||||
element = soup.find('a', '_39g5')
|
||||
return element.get('href') \
|
||||
if element.get('href').startswith('http') else facebook_url[:-1] + element.get('href')
|
||||
|
||||
def find_article_title(self, soup):
|
||||
title = soup.find('div', "_4lmk")
|
||||
return title.get_text() if title else soup.title.get_text()
|
||||
|
||||
def find_platform_name(self):
|
||||
pass
|
||||
|
||||
def find_like_users(self):
|
||||
try:
|
||||
element = self.driver.find_element_by_css_selector("a[data-testid='n_other_people_link']")
|
||||
enter_element(element)
|
||||
ul = find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
|
||||
except:
|
||||
return None
|
||||
try:
|
||||
while True:
|
||||
#a_element = find_element_by_css_selector(self.driver, "a[class$='uiBoxLightblue uiMorePagerPrimary']",
|
||||
# 30)
|
||||
a_element = WebDriverWait(self.driver, 20).\
|
||||
until(EC.visibility_of_element_located((By.CSS_SELECTOR,
|
||||
"a[class$='uiBoxLightblue uiMorePagerPrimary']")))
|
||||
enter_element(a_element)
|
||||
wait(1)
|
||||
except Exception as e:
|
||||
pass
|
||||
# print(e)
|
||||
#ul = find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
|
||||
soup = BeautifulSoup(self.driver.page_source, parser_method)
|
||||
a_elements = soup.select('div.fsl.fwb.fcb > a')
|
||||
like_users = list()
|
||||
for a in a_elements:
|
||||
like_user = dict()
|
||||
like_user['nickname'] = a.get_text()
|
||||
url = a.get('href')
|
||||
m = self.re_ids.search(url if url.startswith('http') else facebook_url[:-1] + url)
|
||||
like_user['id'] = m.group(2) if m.group(1) is None else m.group(1)
|
||||
like_users.append(like_user)
|
||||
cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
|
||||
enter_element(cancel)
|
||||
return {'data': like_users, 'count': len(like_users)}
|
||||
|
||||
def find_share_users(self):
|
||||
try:
|
||||
element = self.driver.find_element_by_css_selector("a[class='UFIShareLink']")
|
||||
enter_element(element)
|
||||
#share_element = find_element_by_css_selector(self.driver, "#repost_view_dialog", 30)
|
||||
page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 20)
|
||||
except:
|
||||
return None
|
||||
page_scroller_children = page_scroller.find_elements_by_css_selector("*")
|
||||
while len(page_scroller_children) > 1:
|
||||
self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
|
||||
wait(2)
|
||||
#page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 30)
|
||||
page_scroller = WebDriverWait(self.driver, 20).\
|
||||
until(EC.visibility_of_element_located((By.CSS_SELECTOR, "pagelet_scrolling_pager")))
|
||||
page_scroller_children = page_scroller.find_elements_by_css_selector("*")
|
||||
soup = BeautifulSoup(self.driver.page_source, parser_method)
|
||||
a_tags = soup.select('span.fwb > a.profileLink')
|
||||
share_users = list()
|
||||
for a in a_tags:
|
||||
share_user = dict()
|
||||
url = a.get('href')
|
||||
share_user['url'] = url if url.startswith('http') else facebook_url[:-1] + url
|
||||
share_user['nickname'] = a.get_text()
|
||||
str_id = share_user['url'][share_user['url'].rindex('/') + 1:]
|
||||
m = self.re_id.search(str_id)
|
||||
share_user['id'] = str_id if m is None else m.group(1)
|
||||
share_users.append(share_user)
|
||||
cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
|
||||
enter_element(cancel)
|
||||
return {'data': share_users, 'count': len(share_users)}
|
||||
|
||||
def find_like_user_number(self, soup):
|
||||
element = soup.find('a', attrs={'data-testid': 'n_other_people_link'})
|
||||
if not element:
|
||||
return None
|
||||
str_num = element.get_text()
|
||||
m = re.search("(\\d+)", str_num.replace(",", ""))
|
||||
return None if m is None else m.group(1)
|
||||
|
||||
def find_share_user_number(self, soup):
|
||||
element = soup.find('a', class_='UFIShareLink')
|
||||
if not element:
|
||||
return None
|
||||
str_num = element.get_text()
|
||||
m = re.search("(\\d+)", str_num.replace(",", ""))
|
||||
return None if m is None else m.group(1)
|
||||
|
||||
def find_reply_number(self, soup):
|
||||
pass
|
||||
|
||||
def find_article_profileurl(self, soup):
|
||||
img = soup.select_one('div._38vo > img')
|
||||
if not img:
|
||||
img = soup.select_one('img._s0._54ru')
|
||||
if not img:
|
||||
div = soup.find("div", class_='_2yuf')
|
||||
if div:
|
||||
src = div.get('style')
|
||||
return src[src.index("http"):].replace('\\', "").replace("\")", "")
|
||||
return img.get('src') if img else ""
|
||||
|
||||
def get_content(self):
|
||||
soup = BeautifulSoup(self.driver.page_source, parser_method)
|
||||
content = dict()
|
||||
# logging.info('start_get_content')
|
||||
content['article_id'] = self.find_article_id(soup)
|
||||
# logging.info('article_id')
|
||||
content['article_url'] = self.find_article_url(soup)
|
||||
# logging.info('article_url')
|
||||
content['article_data'] = self.find_article_data(soup)
|
||||
# logging.info('article_data')
|
||||
content['article_date'] = self.find_article_date(soup)
|
||||
# logging.info('article_date')
|
||||
content['article_title'] = self.find_article_title(soup)
|
||||
# logging.info('article_title')
|
||||
content['article_nickname'] = self.find_article_nickname(soup)
|
||||
# logging.info('article_nickname')
|
||||
content['article_form'] = 'body'
|
||||
content['platform_name'] = 'facebook'
|
||||
content['platform_form'] = 'post'
|
||||
content['platform_title'] = content['article_nickname']
|
||||
content['platform_id'] = content['article_id']
|
||||
content['article_profileurl'] = self.find_article_profileurl(soup)
|
||||
# logging.info('article_profileurl')
|
||||
like_user_num = self.find_like_user_number(soup)
|
||||
# logging.info('like_user_number')
|
||||
share_user_num = self.find_share_user_number(soup)
|
||||
# logging.info('share_user_number')
|
||||
if like_user_num:
|
||||
content['article_hit'] = like_user_num
|
||||
if share_user_num:
|
||||
content['reply_url'] = share_user_num
|
||||
likes = self.find_like_users()
|
||||
# logging.info('find_like_users')
|
||||
shares = self.find_share_users()  # share list comes from the share-user crawl, not the like crawl
|
||||
# logging.info('find_like_shares')
|
||||
data = list()
|
||||
if likes:
|
||||
data.append({"likes": likes})
|
||||
if shares:
|
||||
data.append({"shares": shares})
|
||||
if data:
|
||||
content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode({"data": data})
|
||||
return content
|
||||
|
||||
def click_element(self, element):
|
||||
ac = ActionChains(self.driver)
|
||||
ac.move_to_element_with_offset(element, 0, 0).click().perform()
|
||||
wait(2)
|
||||
|
||||
|
||||
class FacebookReplyCrawler:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
self.reply_list = list()
|
||||
self.order = 0
|
||||
self.div = None
|
||||
self.re_date = re.compile(
|
||||
"([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})"
|
||||
)
|
||||
self.re_id = re.compile("id=([\\d]+)")
|
||||
# self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._-]+)\\??", re.UNICODE)
|
||||
self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._\\-%]+)")
|
||||
|
||||
def find_init(self):
|
||||
self.reply_list.clear()
|
||||
self.order = 0
|
||||
self.reload_count = 0
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
|
||||
def read_all_reply(self):
|
||||
try:
|
||||
start_time = time.time()
|
||||
while True:
|
||||
a_element = WebDriverWait(self.driver, 10).\
    until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.UFIPagerLink")))
|
||||
a_element.send_keys(Keys.NULL)
|
||||
a_element.send_keys(Keys.ENTER)
|
||||
wait(1)
|
||||
if time.time() - start_time > 600.0:
|
||||
break
|
||||
except:
|
||||
pass
|
||||
self.read_all_child_reply()
|
||||
|
||||
def read_all_child_reply(self):
|
||||
try:
|
||||
a_elements = self.driver.find_elements_by_css_selector("a.UFICommentLink")
|
||||
for a_element in a_elements:
|
||||
a_element.send_keys(Keys.NULL)
|
||||
a_element.send_keys(Keys.ENTER)
|
||||
wait(1)
|
||||
except:
|
||||
pass
|
||||
|
||||
def set_div(self, div=None):
|
||||
if div is None:
|
||||
try:
|
||||
self.div = self.driver.find_element_by_xpath(
|
||||
"//div[@data-reactroot and @class='UFIList']/div[not(@class)]")
|
||||
# self.div = self.driver.find_element_by_css_selector("div[data-reactroot].UFIList>div:not([class])")
|
||||
except:
|
||||
self.div = None
|
||||
else:
|
||||
self.div = div
|
||||
|
||||
def has_reply(self):
|
||||
"""after set_div execute this"""
|
||||
if not self.div:
|
||||
return False
|
||||
else:
|
||||
children = self.div.find_elements_by_css_selector("*")
|
||||
return True if len(children) > 0 else False
|
||||
|
||||
def crawl_reply(self, div, article_parent=None):
|
||||
content = dict()
|
||||
content['article_id'] = self.find_article_id(div)
|
||||
content['article_nickname'] = self.find_article_nickname(div)
|
||||
content['article_data'] = self.find_article_data(div)
|
||||
content['article_date'] = self.find_article_date(div)
|
||||
content['article_profileurl'] = self.find_article_profileurl(div)
|
||||
content['article_order'] = self.order
|
||||
like_num = self.find_like_number(div)
|
||||
if like_num:
|
||||
content['article_hit'] = like_num
|
||||
if article_parent:
|
||||
content['article_parent'] = article_parent
|
||||
content.update({'article_form': 'reply', 'platform_name': 'facebook', 'platform_form': 'post'})
|
||||
self.order += 1
|
||||
self.reply_list.append(content)
|
||||
|
||||
def crawl_all(self):
|
||||
self.find_init()
|
||||
self.read_all_reply()
|
||||
self.set_div()
|
||||
try:
|
||||
if self.has_reply():
|
||||
soup = BeautifulSoup(self.driver.page_source, parser_method)
|
||||
parent = soup.find('div', attrs={"data-reactroot": True, "class": "UFIList"})
|
||||
child = parent.find('div', attrs={"class": False}, recursive=False)
|
||||
elements = child.find_all('div', recursive=False)
|
||||
article_parent = None
|
||||
for div in elements:
|
||||
if "UFIReplyList" in div.get('class'):
|
||||
reply_div = div.find_all('div', attrs={'role': True}, recursive=False)
|
||||
for child_reply in reply_div:
|
||||
self.crawl_reply(child_reply, article_parent)
|
||||
elif div.get('role') == "article":
|
||||
self.crawl_reply(div)
|
||||
article_parent = self.reply_list[len(self.reply_list) - 1]['article_nickname']
|
||||
else:
|
||||
pass
|
||||
except Exception as e:
|
||||
logging.info(e)
|
||||
|
||||
def get_content(self):
|
||||
return self.reply_list
|
||||
|
||||
def find_article_id(self, div):
|
||||
element = div.find("a", class_=re.compile('UFICommentActorName$'))
|
||||
url = element.get('href')
|
||||
m = self.re_ids.search(url if url.startswith('http') else facebook_url[:-1] + url)
|
||||
if not m:
|
||||
return 'None'
|
||||
return m.group(1) if not m.group(2) else m.group(2)
|
||||
|
||||
def find_article_parent(self, div):
|
||||
pass
|
||||
|
||||
def find_article_date(self, div):
|
||||
element = div.find("abbr", "livetimestamp")
|
||||
str_datetime = element.get("title")
|
||||
m = self.re_date.match(str_datetime)
|
||||
if m is None:
|
||||
return "0000-00-00 00:00:00"
|
||||
else:
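# note: "오후" marks PM in the Korean timestamp, so hours other than 12 get +12
# to convert to 24-hour time.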
|
||||
if str_datetime.find("오후") != -1 and m.group(4) != "12":
|
||||
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + str(int(m.group(4)) + 12) + ":" + \
|
||||
m.group(5) + ":00"
|
||||
else:
|
||||
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + m.group(4) + ":" + m.group(5) + ":00"
|
||||
|
||||
def find_article_data(self, div):
|
||||
element = div.find("span", "UFICommentBody")
|
||||
return element.get_text()
|
||||
|
||||
def find_article_profileurl(self, div):
|
||||
element = div.select_one('img.img.UFIActorImage')
|
||||
url = element.get('src')
|
||||
return url if url.startswith('http') else facebook_url[:-1] + url
|
||||
|
||||
def find_article_nickname(self, div):
|
||||
element = div.find("a", 'UFICommentActorName')
|
||||
return element.get_text()
|
||||
|
||||
def find_like_number(self, div):
|
||||
try:
|
||||
element = div.find('a', ajaxify=True)
|
||||
m = re.search("(\\d+)", element.get_text().replace(",", ""))
|
||||
return m.group(1) if m else None
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
class FacebookPageCrawler:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
self.url_set = set()
|
||||
self.index = 0
|
||||
self.limit = 500
|
||||
self.re_date = re.compile("([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})")
|
||||
self.reload_count = 0
|
||||
self.is_hash = False
|
||||
self.main_handle = None
|
||||
self.begin_date = None
|
||||
self.end_date = None
|
||||
self.posts = None
|
||||
self.current_url = None
|
||||
|
||||
def set_limit(self, limit=500):
|
||||
self.limit = limit
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
|
||||
def set_main_handle(self):
|
||||
self.main_handle = self.driver.window_handles[0]
|
||||
|
||||
def find_article_date(self, div):
|
||||
try:
|
||||
element = div.find_element_by_css_selector("abbr.livetimestamp")
|
||||
except:
|
||||
element = div.find_element_by_css_selector("abbr[title]")
|
||||
str_datetime = element.get_attribute("title")
|
||||
logging.debug(str_datetime)
|
||||
m = self.re_date.match(str_datetime)
|
||||
if m is None:
|
||||
return datetime.datetime(year=1999, month=1, day=1)
|
||||
else:
|
||||
if str_datetime.find("오후") != -1 and m.group(4) != "12":
|
||||
return datetime.datetime(
|
||||
year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
|
||||
hour=(int(m.group(4)) + 12), minute=int(m.group(5))
|
||||
)
|
||||
else:
|
||||
return datetime.datetime(
|
||||
year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
|
||||
hour=(int(m.group(4))), minute=int(m.group(5))
|
||||
)
|
||||
|
||||
def next_post_by_user(self):
|
||||
try:
|
||||
#self.posts = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts_wait()
|
||||
if len(self.posts) < 1:
|
||||
print_and_flush("not posts")
|
||||
self.posts = None
|
||||
return None
|
||||
except Exception as e:
|
||||
print_and_flush("cannot found _5pcq")
|
||||
logging.info(e)
|
||||
self.posts = None
|
||||
return None
|
||||
while True:
|
||||
self.index += 1
|
||||
if self.index >= len(self.posts):
|
||||
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts()
|
||||
if self.index >= len(self.posts):
|
||||
if self.load_more_posts() is False:
|
||||
self.posts = None
|
||||
return None
|
||||
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts()
|
||||
print(self.posts[self.index - 1].get_attribute("href"), flush=True)
|
||||
if self.posts[self.index - 1].get_attribute("href") in self.url_set:
|
||||
continue
|
||||
time_date = self.find_article_date(self.posts[self.index - 1])
|
||||
print("number of posts: ", self.index, '/', str(len(self.posts)), flush=True)
|
||||
print_and_flush(str(time_date))
|
||||
if type(time_date) == str:
|
||||
continue
|
||||
if self.is_earlier(time_date):
|
||||
self.posts = None
|
||||
return None
|
||||
if self.is_late(time_date):
|
||||
continue
|
||||
self.current_url = self.posts[self.index - 1].get_attribute('href')
|
||||
return self.posts[self.index - 1]
|
||||
|
||||
def next_post_by_tag(self):
|
||||
try:
|
||||
# self.posts = find_elements_by_css_selector(self.driver, "a[class='_5pcq']")
|
||||
# self.posts = find_elements_by_xpath(self.driver, "//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts_wait()
|
||||
if len(self.posts) < 1:
|
||||
print_and_flush("not posts")
|
||||
self.posts = None
|
||||
return None
|
||||
except Exception as e:
|
||||
print_and_flush("cannot found _5pcq")
|
||||
logging.info(e)
|
||||
self.posts = None
|
||||
return None
|
||||
while True:
|
||||
self.index += 1
|
||||
if self.index > self.limit:
|
||||
self.posts = None
|
||||
return None
|
||||
if self.index >= len(self.posts):
|
||||
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts()
|
||||
if self.index >= len(self.posts):
|
||||
if self.load_more_posts() is False:
|
||||
self.posts = None
|
||||
return None
|
||||
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
self.posts = self.find_posts()
|
||||
print(self.posts[self.index - 1].get_attribute("href"), flush=True)
|
||||
if self.posts[self.index - 1].get_attribute("href") in self.url_set:
|
||||
continue
|
||||
print("number of posts: ", self.index, '/', str(len(self.posts)), flush=True)
|
||||
self.current_url = self.posts[self.index - 1].get_attribute('href')
|
||||
return self.posts[self.index - 1]
|
||||
|
||||
def load_more_posts(self):
|
||||
# previous_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
# previous_posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
|
||||
previous_posts = self.find_posts()
|
||||
for i in range(0, 4):
|
||||
print_and_flush("Try load more")
|
||||
body = self.driver.find_element_by_tag_name("body")
|
||||
body.send_keys(Keys.NULL)
|
||||
body.send_keys(Keys.END)
|
||||
wait(4)
|
||||
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
present_posts = self.find_posts()
|
||||
if len(previous_posts) != len(present_posts):
|
||||
wait(2)
|
||||
self.reload_count = 0
|
||||
return True
|
||||
for i in range(0, 4):
|
||||
print_and_flush("Try load more")
|
||||
body = self.driver.find_element_by_tag_name("body")
|
||||
for j in range(0, 2):
|
||||
body.send_keys(Keys.PAGE_UP)
|
||||
wait(0.1)
|
||||
for j in range(0, 15):
|
||||
body.send_keys(Keys.PAGE_DOWN)
|
||||
wait(0.1)
|
||||
wait(4)
|
||||
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
present_posts = self.find_posts()
|
||||
if len(previous_posts) != len(present_posts):
|
||||
wait(2)
|
||||
self.reload_count = 0
|
||||
return True
|
||||
for i in range(0, 10):
|
||||
print_and_flush("Try load more")
|
||||
self.driver.execute_script("window.scrollBy(0, 800)")
|
||||
wait(4)
|
||||
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
|
||||
present_posts = self.find_posts()
|
||||
if len(previous_posts) != len(present_posts):
|
||||
wait(2)
|
||||
self.reload_count = 0
|
||||
return True
|
||||
if self.reload_count < 8:
|
||||
print_and_flush("index reload")
|
||||
self.reload_count += 1
|
||||
self.index -= 1 if self.index > 0 else 0
|
||||
position = self.driver.get_window_position()
|
||||
size = self.driver.get_window_size()
|
||||
self.driver.maximize_window()
|
||||
self.driver.set_window_size(size['width'], size["height"])
|
||||
self.driver.set_window_position(position['x'], position['y'])
|
||||
return True
|
||||
if self.reload_count < 15:
|
||||
print_and_flush("refresh")
|
||||
self.driver.refresh()
|
||||
wait(5)
|
||||
self.index = 0
|
||||
self.reload_count += 1
|
||||
return True
|
||||
return False
|
||||
|
||||
def is_earlier(self, time_date):
|
||||
return True if time_date < self.begin_date else False
|
||||
|
||||
def is_late(self, time_date):
|
||||
return True if time_date > self.end_date else False
|
||||
|
||||
def set_date(self, begin_date, end_date):
|
||||
self.set_begin_date(begin_date)
|
||||
self.set_end_date(end_date)
|
||||
|
||||
def set_end_date(self, end_date):
|
||||
if type(end_date) == str:
|
||||
self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
|
||||
elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
|
||||
self.end_date = end_date
|
||||
else:
|
||||
self.end_date = datetime.datetime.today()
|
||||
self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
|
||||
self.end_date += datetime.timedelta(days=1)
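# note: end_date is effectively exclusive; adding one day lets posts written on the
# requested end day still pass the is_late() check.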
|
||||
|
||||
def set_begin_date(self, begin_date):
|
||||
if type(begin_date) == str:
|
||||
self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
|
||||
elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
|
||||
self.begin_date = begin_date
|
||||
else:
|
||||
self.begin_date = datetime.datetime.today()
|
||||
self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month,
|
||||
day=self.begin_date.day)
|
||||
|
||||
def crawling_ok(self):
|
||||
self.url_set.add(self.current_url)
|
||||
|
||||
def init(self):
|
||||
self.index = 0
|
||||
self.posts = None
|
||||
self.url_set.clear()
|
||||
|
||||
def find_posts(self):
|
||||
try:
|
||||
divs = self.driver.find_elements_by_xpath("//div[@class='_1dwg']")
|
||||
except:
|
||||
return None
|
||||
# return [div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']") for div in divs]
|
||||
posts = list()
|
||||
for div in divs:
|
||||
try:
|
||||
posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
|
||||
except:
|
||||
pass
|
||||
return posts
|
||||
|
||||
def find_posts_wait(self):
|
||||
try:
|
||||
divs = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']", 30)
|
||||
except:
|
||||
return None
|
||||
# return [div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']") for div in divs]
|
||||
posts = list()
|
||||
for div in divs:
|
||||
try:
|
||||
posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
|
||||
except:
|
||||
pass
|
||||
return posts
|
||||
|
||||
|
||||
class FacebookMainCrawler:
|
||||
def __init__(self):
|
||||
self.page_crawler = FacebookPageCrawler()
|
||||
self.body_crawler = FacebookBodyCrawler()
|
||||
self.reply_crawler = FacebookReplyCrawler()
|
||||
self.send_to_db = SendtoDB()
|
||||
self.crawl_init = FacebookInit()
|
||||
self.browser = Browser()
|
||||
self.driver = None
|
||||
self.keyword_id = None
|
||||
self.url = None
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.page_crawler.set_driver(driver)
|
||||
self.body_crawler.set_driver(driver)
|
||||
self.reply_crawler.set_driver(driver)
|
||||
self.driver = driver
|
||||
|
||||
def set_keyword_id(self, keyword_id):
|
||||
self.keyword_id = keyword_id
|
||||
|
||||
def crawl_all_current_url(self, backup_set=None):
|
||||
self.page_crawler.init()
|
||||
if backup_set:
|
||||
self.page_crawler.url_set = backup_set.copy()
|
||||
while True:
|
||||
post = self.page_crawler.next_post_by_tag() if self.crawl_init.is_hashtag() \
|
||||
else self.page_crawler.next_post_by_user()
|
||||
if post is None:
|
||||
break
|
||||
self.click_new_tab(post)
|
||||
self.control_tab()
|
||||
self.switch_new_tab()
|
||||
wait(5)
|
||||
body = self.driver.find_element_by_tag_name('body')
|
||||
self.click_element(body)
|
||||
self.click_element(body)
|
||||
try:
|
||||
self.page_crawler.crawling_ok()
|
||||
body_info = self.crawl_body()
|
||||
print("body : ok", flush=True)
|
||||
self.crawl_reply(body_info)
|
||||
print("reply : ok", flush=True)
|
||||
except WebDriverException as ee:
|
||||
logging.info(ee)
|
||||
print_and_flush("fail")
|
||||
# raise WebDriverException
|
||||
except Exception as e:
|
||||
print_and_flush("fail")
|
||||
logging.info(e)
|
||||
finally:
|
||||
self.switch_main_tab()
|
||||
|
||||
def crawl_body(self):
|
||||
self.body_crawler.set_driver(self.driver)
|
||||
content = self.body_crawler.get_content()
|
||||
content["keyword_id"] = self.keyword_id
|
||||
# print_and_flush(content["article_url"])
|
||||
self.send_to_db.delete_url(content['article_url'])
|
||||
self.send_to_db.send_body(content)
|
||||
return {"article_url": content["article_url"], "platform_id": content["platform_id"]}
|
||||
|
||||
def crawl_reply(self, body_info):
|
||||
self.reply_crawler.set_driver(self.driver)
|
||||
self.reply_crawler.set_div()
|
||||
if self.reply_crawler.has_reply():
|
||||
self.reply_crawler.crawl_all()
|
||||
contents = self.reply_crawler.get_content()
|
||||
for content in contents:
|
||||
content.update(body_info)
|
||||
self.send_to_db.send_reply(contents)
|
||||
|
||||
def start(self):
|
||||
self.crawl_start()
|
||||
|
||||
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
|
||||
self.init_browser(browser)
|
||||
self.init_keyword_id(keyword_id)
|
||||
self.init_db(db_num)
|
||||
self.init_before_day(before_day)
|
||||
self.init_until_page(until_page)
|
||||
|
||||
def init_browser(self, browser):
|
||||
self.set_driver(self.browser.get_new_driver(browser))
|
||||
|
||||
def init_keyword_id(self, keyword_id):
|
||||
self.keyword_id = int(keyword_id) if type(keyword_id) != int else keyword_id
|
||||
self.crawl_init.get_keyword_parameters(keyword_id)
|
||||
self.crawl_init.disconnect()
|
||||
|
||||
def init_db(self, db_num):
|
||||
self.send_to_db.set_db(db_num)
|
||||
|
||||
def init_before_day(self, before_day):
|
||||
self.crawl_init.set_before_day(before_day)
|
||||
|
||||
def init_until_page(self, until_page):
|
||||
self.crawl_init.set_until_page(until_page)
|
||||
self.page_crawler.set_limit(self.crawl_init.until_page)
|
||||
|
||||
def set_main_window_handler(self, window_handler):
|
||||
self.main_window_handler = window_handler
|
||||
|
||||
def crawl_start(self):
|
||||
real_time = True
|
||||
while real_time:
|
||||
print_and_flush("Crawler Start")
|
||||
url_list = self.crawl_init.make_url()
|
||||
i = 0
|
||||
backup_set = set()
|
||||
while i < len(url_list):
|
||||
try:
|
||||
self.set_main_window_handler(self.driver.window_handles[0])
|
||||
print_and_flush(url_list[i] + "\n")
|
||||
self.driver.get(url_list[i])
|
||||
wait(5)
|
||||
self.facebook_login()
|
||||
body = self.driver.find_element_by_tag_name('body')
|
||||
self.click_element(body)
|
||||
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
|
||||
end_date=self.crawl_init.get_end_day())
|
||||
self.crawl_all_current_url(backup_set)
|
||||
i += 1
|
||||
backup_set.clear()
|
||||
except Exception as e:
|
||||
logging.info(e)
|
||||
backup_set = self.page_crawler.url_set.copy()
|
||||
self.driver.quit()
|
||||
self.set_driver(self.browser.new_browser())
|
||||
wait(5)
|
||||
real_time = self.crawl_init.is_realtime()
|
||||
print_and_flush("Finished Crawling :)")
|
||||
self.send_to_db.close()
|
||||
self.driver.quit()
|
||||
|
||||
def facebook_login(self):
|
||||
try:
|
||||
element_email = find_element_by_css_selector(self.driver, '#email', 15)
|
||||
element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
|
||||
except:
|
||||
return
|
||||
email = 'concepters22@gmail.com'
|
||||
password = 'zjstpqxjtm'
|
||||
element_email.send_keys(email)
|
||||
element_pwd.send_keys(password)
|
||||
label = self.driver.find_element_by_css_selector('#loginbutton')
|
||||
element_input = label.find_element_by_xpath('input')
|
||||
element_input.send_keys(Keys.NULL)
|
||||
element_input.send_keys(Keys.ENTER)
|
||||
wait(5)
|
||||
|
||||
def click_new_tab(self, element):
|
||||
#ac = ActionChains(self.driver)
|
||||
#ac.key_down(Keys.CONTROL).move_to_element(element).click().key_up(Keys.CONTROL).perform()
|
||||
element.send_keys(Keys.NULL)
|
||||
element.send_keys(Keys.CONTROL + Keys.ENTER)
|
||||
wait(3)
|
||||
|
||||
def switch_new_tab(self):
|
||||
self.driver.switch_to_window(self.driver.window_handles[1])
|
||||
|
||||
def switch_main_tab(self):
|
||||
while len(self.driver.window_handles) > 1:
|
||||
self.driver.switch_to_window(self.driver.window_handles[1])
|
||||
self.driver.close()
|
||||
self.driver.switch_to_window(self.main_window_handler)
|
||||
|
||||
def click_element(self, element):
|
||||
ac = ActionChains(self.driver)
|
||||
# ac.move_to_element_with_offset(element, 0, 0).click().perform()
|
||||
ac.move_to_element(element).click().perform()
|
||||
wait(4)
|
||||
|
||||
def control_tab(self):
|
||||
ac = ActionChains(self.driver)
|
||||
ac.key_down(Keys.CONTROL).key_down(Keys.TAB).perform()
|
||||
wait(2)
|
||||
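A minimal usage sketch for the Facebook crawler above, assuming the facebookcrawlbs module path used in webbasedcrawler.py; the browser choice and argument values are illustrative:

# hypothetical driver script
from facebook.facebookcrawlbs import FacebookMainCrawler

crawler = FacebookMainCrawler()
# arguments: browser, keyword_id, db_num, before_day, until_page (same order as set_arguments)
crawler.set_arguments("chrome", 1, 0, 0, 100)
crawler.start()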
0
WebBasedCrawler/insta/__init__.py
Normal file
541
WebBasedCrawler/insta/instacrawl.py
Normal file
@@ -0,0 +1,541 @@
|
||||
#-*- coding: utf-8 -*-
|
||||
'''
|
||||
Created on 2015. 12. 8.
|
||||
|
||||
@author: cococo
|
||||
'''
|
||||
import re
|
||||
import datetime
|
||||
|
||||
from base.baseclasses import SendtoDB
|
||||
from base.baseclasses import print_and_flush
|
||||
from base.baseclasses import CrawlInit
|
||||
from base.baseclasses import wait
|
||||
from base.baseclasses import find_element_by_xpath
|
||||
from base.baseclasses import find_element_by_css_selector
|
||||
from base.baseclasses import enter_element
|
||||
from base.baseclasses import Browser
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
|
||||
insta_url = "https://www.instagram.com/"
|
||||
insta_tag_url = "https://www.instagram.com/explore/tags/"
|
||||
|
||||
|
||||
class InstaInit(CrawlInit):
|
||||
def __init__(self, before_day=0):
|
||||
super().__init__(before_day)
|
||||
self.urls = dict()
|
||||
self.urls[9] = insta_tag_url
|
||||
self.urls[10] = insta_url
|
||||
|
||||
def split_searches(self):
|
||||
search = self.searches()
|
||||
splited_list = search.split(',')
|
||||
trimmed_list = list()
|
||||
if self.platform() == 10:
|
||||
for x in splited_list:
|
||||
trimmed_list.append(x.strip())
|
||||
else:
|
||||
for x in splited_list:
|
||||
trimmed_list.append(self.utf8(x))
|
||||
return trimmed_list
|
||||
|
||||
def make_url(self):
|
||||
urls = list()
|
||||
for x in self.split_searches():
|
||||
url = self.urls[self.platform()] + x
|
||||
urls.append(url)
|
||||
return urls
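# note (illustration, assumed inputs): with platform() == 9 and searches "<tag1>, <tag2>",
# make_url() yields ["https://www.instagram.com/explore/tags/<utf8-quoted tag1>",
#                    "https://www.instagram.com/explore/tags/<utf8-quoted tag2>"]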
|
||||
|
||||
def get_begin_day(self):
|
||||
if self.is_realtime():
|
||||
date_now = datetime.datetime.now()
|
||||
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
||||
result += datetime.timedelta(days=self.before_day)
|
||||
return result
|
||||
else:
|
||||
return self.start_day()
|
||||
|
||||
def get_end_day(self):
|
||||
if self.is_realtime():
|
||||
date_now = datetime.datetime.now()
|
||||
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
||||
return result
|
||||
else:
|
||||
return self.end_day()
|
||||
|
||||
|
||||
class InstaBodyCrawler:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
|
||||
def set_article(self, article=None):
|
||||
if article is None:
|
||||
try:
|
||||
self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
|
||||
except Exception as e:
|
||||
print_and_flush(e)
|
||||
raise Exception
|
||||
else:
|
||||
self.article = article
|
||||
|
||||
def find_article_url(self):
|
||||
a = self.article.find_element_by_xpath("div/section/a")
|
||||
return a.get_attribute("href")
|
||||
|
||||
def find_article_profileurl(self):
|
||||
img = self.article.find_element_by_xpath("header/a/img[@src]")
|
||||
return img.get_attribute("src")
|
||||
|
||||
def find_article_nickname(self):
|
||||
a = self.article.find_element_by_xpath("header/div/a")
|
||||
return a.text
|
||||
|
||||
def find_article_date(self):
|
||||
el_time = self.article.find_element_by_xpath("div/section/a/time")
|
||||
str_time = el_time.get_attribute("datetime")
|
||||
m = self.re_date.search(str_time)
|
||||
if m is None:
|
||||
return "0000-00-00 00:00:00"
|
||||
else:
|
||||
return m.group(1) + " " + m.group(2)
|
||||
|
||||
def find_article_data(self):
|
||||
ul = self.article.find_element_by_xpath("div/ul")
|
||||
try:
|
||||
li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
|
||||
span = li.find_element_by_xpath("h1/span")
|
||||
return span.text
|
||||
except:
|
||||
return ""
|
||||
|
||||
def find_article_id(self):
|
||||
return self.find_platform_id()
|
||||
|
||||
def find_platform_name(self):
|
||||
return 'instagram'
|
||||
|
||||
def find_article_form(self):
|
||||
return 'body'
|
||||
|
||||
def find_platform_id(self):
|
||||
a = self.article.find_element_by_xpath("header/div/a")
|
||||
if a:
|
||||
href = a.get_attribute("href")
|
||||
str_id = href.replace(insta_url, "").replace("/", "")
|
||||
return str_id
|
||||
else:
|
||||
return None
|
||||
|
||||
def find_like_num(self):
|
||||
div = self.article.find_element_by_xpath("div/section/div[@data-reactid]")
|
||||
try:
|
||||
span = div.find_element_by_css_selector("span[data-reactid$='.1']")
|
||||
str_num = span.text
|
||||
if str_num[-1] == 'm':
|
||||
num = float(str_num[0:-1]) * 1000000
|
||||
elif str_num[-1] == 'k':
|
||||
num = float(str_num[0:-1]) * 1000
|
||||
else:
|
||||
num = int(str_num)
|
||||
return str(num)
|
||||
except:
|
||||
a_list = div.find_elements_by_tag_name("a")
|
||||
if len(a_list) > 1:
|
||||
return str(len(a_list))
|
||||
else:
|
||||
span = div.find_element_by_xpath("span[1]")
|
||||
if len(span.text.strip()) < 1:
|
||||
return str(1)
|
||||
else:
|
||||
return str(0)
|
||||
|
||||
def find_reply_num(self):
|
||||
ul = self.article.find_element_by_xpath("div/ul")
|
||||
lis = ul.find_elements_by_tag_name("li")
|
||||
if len(lis) < 2:
|
||||
return "0"
|
||||
try:
|
||||
li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
|
||||
span = li.find_element_by_xpath("button/span[2]")
|
||||
str_num = span.text.replace(",", "")
|
||||
return str_num
|
||||
except:
|
||||
return str(len(lis) - 1)
|
||||
|
||||
def get_content(self):
|
||||
content = dict()
|
||||
content["article_id"] = self.find_article_id()
|
||||
content["platform_id"] = self.find_platform_id()
|
||||
content["article_url"] = self.find_article_url()
|
||||
content["article_profileurl"] = self.find_article_profileurl()
|
||||
content["article_nickname"] = self.find_article_nickname()
|
||||
content["platform_name"] = self.find_platform_name()
|
||||
content["article_date"] = self.find_article_date()
|
||||
content["article_data"] = self.find_article_data()
|
||||
content["article_form"] = 'body'
|
||||
content["platform_form"] = 'post'
|
||||
reply_num = self.find_reply_num()
|
||||
if int(reply_num) > 0:
|
||||
content["article_order"] = int(reply_num)
|
||||
like_num = self.find_like_num()
|
||||
if int(float(like_num)) > 0:
|
||||
content["article_hit"] = int(float(like_num))
|
||||
return content
|
||||
|
||||
def find_platform_title(self):
|
||||
pass
|
||||
|
||||
def find_article_title(self):
|
||||
pass
|
||||
|
||||
class InstaReplyCrawler:
|
||||
def __init__(self, driver=None, article=None):
|
||||
self.driver = driver
|
||||
self.activity = article
|
||||
self.reply_list = list()
|
||||
|
||||
def find_init(self):
|
||||
self.reply_list.clear()
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
|
||||
def set_article(self, article=None):
|
||||
if article is None:
|
||||
try:
|
||||
self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
|
||||
except Exception as e:
|
||||
print_and_flush(e)
|
||||
raise Exception
|
||||
else:
|
||||
self.article = article
|
||||
|
||||
def has_more(self, ul):
|
||||
try:
|
||||
li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
|
||||
return True
|
||||
except Exception as e:
|
||||
return False
|
||||
|
||||
def read_more_reply(self, ul):
|
||||
try:
|
||||
button = ul.find_element_by_css_selector("li[data-reactid$='.1']>button")
|
||||
enter_element(button)
|
||||
except Exception as e:
|
||||
print_and_flush(e)
|
||||
|
||||
def read_all_reply(self, ul):
|
||||
for i in range(0, 10):
|
||||
if self.has_more(ul):
|
||||
self.read_more_reply(ul)
|
||||
else:
|
||||
break
|
||||
|
||||
def get_reply_ul(self):
|
||||
ul = self.article.find_element_by_xpath("div/ul")
|
||||
return ul
|
||||
|
||||
def has_reply(self, ul):
|
||||
try:
|
||||
lis = ul.find_elements_by_css_selector("li[data-reactid*='comment']")
|
||||
if len(lis) > 0:
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
return False
|
||||
|
||||
def crawl_all(self):
|
||||
self.find_init()
|
||||
self.set_article()
|
||||
try:
|
||||
ul = self.get_reply_ul()
|
||||
if self.has_reply(ul):
|
||||
self.read_all_reply(ul)
|
||||
self.crawl_reply(ul)
|
||||
except Exception as e:
|
||||
print_and_flush(e)
|
||||
|
||||
def crawl_reply(self, ul):
|
||||
article_data = self.find_article_data(ul)
|
||||
article_id = self.find_article_id(ul)
|
||||
if len(article_data) != len(article_id):
|
||||
print_and_flush("article_data != article_id")
|
||||
for i in range(0, len(article_id)):
|
||||
content = dict()
|
||||
content["article_data"] = article_data[i]
|
||||
content["article_id"] = article_id[i]
|
||||
content["article_nickname"] = article_id[i]
|
||||
content["platform_name"] = "instagram"
|
||||
content["platform_form"] = "post"
|
||||
content["article_form"] = 'reply'
|
||||
content["article_order"] = i
|
||||
self.reply_list.append(content)
|
||||
|
||||
def get_content(self):
|
||||
return self.reply_list
|
||||
|
||||
def find_article_id(self, ul):
|
||||
id_list = list()
|
||||
a_list = ul.find_elements_by_xpath("li/a")
|
||||
for i in a_list:
|
||||
id_list.append(i.text)
|
||||
return id_list
|
||||
|
||||
def find_article_profileurl(self, ul):
|
||||
pass
|
||||
|
||||
def find_article_nickname(self, ul):
|
||||
return self.find_article_id(ul)
|
||||
|
||||
def find_article_data(self, ul):
|
||||
data_list = list()
|
||||
span_list = ul.find_elements_by_css_selector("li[data-reactid*='comment']>span")
|
||||
for i in span_list:
|
||||
data_list.append(i.text)
|
||||
return data_list
|
||||
|
||||
def find_article_url(self, ul):
|
||||
pass
|
||||
|
||||
def find_platform_id(self, ul):
|
||||
pass
|
||||
|
||||
def find_article_form(self, ul=None):
|
||||
return 'reply'
|
||||
|
||||
def find_platform_name(self, ul=None):
|
||||
return 'instagram'
|
||||
|
||||
def find_platform_form(self, ul=None):
|
||||
return 'post'
|
||||
|
||||
def click_element(self, element):
|
||||
ac = ActionChains(self.driver)
|
||||
ac.move_to_element_with_offset(element, 0, 0).click().perform()
|
||||
wait(2)
|
||||
|
||||
|
||||
class InstaPageCrawler:
|
||||
def __init__(self, driver=None, begin_date=None, end_date=None):
|
||||
self.driver = driver
|
||||
self.url_set = set()
|
||||
self.begin_date = begin_date
|
||||
self.end_date = end_date
|
||||
self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
|
||||
def find_article_url(self):
|
||||
a = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article/div/section/a', 60)
|
||||
return a.get_attribute("href")
|
||||
|
||||
def init(self):
|
||||
self.url_set.clear()
|
||||
|
||||
def set_date(self, begin_date, end_date):
|
||||
self.set_begin_date(begin_date)
|
||||
self.set_end_date(end_date)
|
||||
|
||||
def set_end_date(self, end_date):
|
||||
if type(end_date) == str:
|
||||
self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
|
||||
elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
|
||||
self.end_date = end_date
|
||||
else:
|
||||
self.end_date = datetime.datetime.today()
|
||||
self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
|
||||
self.end_date += datetime.timedelta(days=1)
|
||||
|
||||
def set_begin_date(self, begin_date):
|
||||
if type(begin_date) == str:
|
||||
self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
|
||||
elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
|
||||
self.begin_date = begin_date
|
||||
else:
|
||||
self.begin_date = datetime.datetime.today()
|
||||
self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)
|
||||
|
||||
def has_next(self):
|
||||
try:
|
||||
a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow'", 30)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def move_next(self):
|
||||
try:
|
||||
a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow'", 30)
|
||||
enter_element(a)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def has_first_page(self):
|
||||
try:
|
||||
a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
|
||||
enter_element(a)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def crawling_ok(self, url):
|
||||
self.url_set.add(url)
|
||||
|
||||
def is_earlier(self, time_date):
|
||||
return True if time_date < self.begin_date else False
|
||||
|
||||
def is_late(self, time_date):
|
||||
return True if time_date > self.end_date else False
|
||||
|
||||
def find_article_date(self):
|
||||
el_time = find_element_by_xpath(self.driver, "/html/body/div/div/div/div/article/div/section/a/time", 60)
|
||||
str_time = el_time.get_attribute("datetime")
|
||||
m = self.re_date.search(str_time)
|
||||
if m is None:
|
||||
return "0000-00-00 00:00:00"
|
||||
else:
|
||||
return m.group(1) + " " + m.group(2)
|
||||
|
||||
|
||||
class InstaMainCrawler:
|
||||
def __init__(self):
|
||||
self.page_crawler = InstaPageCrawler()
|
||||
self.body_crawler = InstaBodyCrawler()
|
||||
self.reply_crawler = InstaReplyCrawler()
|
||||
self.send_to_db = SendtoDB()
|
||||
self.browser = Browser()
|
||||
self.crawl_init = InstaInit()
|
||||
self.driver = None
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.page_crawler.set_driver(driver)
|
||||
self.body_crawler.set_driver(driver)
|
||||
self.reply_crawler.set_driver(driver)
|
||||
self.driver = driver
|
||||
|
||||
def set_keyword_id(self, keyword_id):
|
||||
self.keyword_id = keyword_id
|
||||
|
||||
def crawl_all(self, backup_set=None):
|
||||
self.page_crawler.init()
|
||||
if backup_set:
|
||||
self.page_crawler.url_set = backup_set.copy()
|
||||
if not self.page_crawler.has_first_page():
|
||||
return
|
||||
while True:
|
||||
str_date = self.page_crawler.find_article_date()
|
||||
date_val = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
|
||||
print_and_flush(str_date)
|
||||
if self.page_crawler.find_article_url() in self.page_crawler.url_set:
|
||||
if self.page_crawler.has_next():
|
||||
self.page_crawler.move_next()
|
||||
continue
|
||||
else:
|
||||
break
|
||||
#if self.page_crawler.is_earlier(date_val.date()):
|
||||
if self.page_crawler.is_late(date_val):
|
||||
if self.page_crawler.has_next():
|
||||
self.page_crawler.move_next()
|
||||
continue
|
||||
else:
|
||||
break
|
||||
#if self.page_crawler.is_late(date_val.date()):
|
||||
if self.page_crawler.is_earlier(date_val):
|
||||
break
|
||||
try:
|
||||
body_content = self.crawl_body()
|
||||
self.crawl_reply(body_content)
|
||||
self.page_crawler.url_set.add(body_content["article_url"])
|
||||
print_and_flush("ok")
|
||||
except Exception as e:
|
||||
print_and_flush('fail')
|
||||
print_and_flush(e)
|
||||
if self.page_crawler.has_next():
|
||||
self.page_crawler.move_next()
|
||||
else:
|
||||
break
|
||||
|
||||
def crawl_body(self):
|
||||
self.body_crawler.set_driver(self.driver)
|
||||
self.body_crawler.set_article()
|
||||
content = self.body_crawler.get_content()
|
||||
content["keyword_id"] = self.keyword_id
|
||||
print_and_flush(content["article_url"])
|
||||
self.send_to_db.delete_url(content['article_url'])
|
||||
self.send_to_db.send_body(content)
|
||||
return content
|
||||
|
||||
def crawl_reply(self, body_content):
|
||||
self.reply_crawler.set_driver(self.driver)
|
||||
self.reply_crawler.crawl_all()
|
||||
content_list = self.reply_crawler.get_content()
|
||||
if content_list:
|
||||
for i in content_list:
|
||||
i['article_url'] = body_content['article_url']
|
||||
i['platform_id'] = body_content['platform_id']
|
||||
self.send_to_db.send_reply(content_list)
|
||||
|
||||
def start(self):
|
||||
self.crawler_start()
|
||||
|
||||
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
|
||||
self.init_browser(browser)
|
||||
self.init_keyword_id(keyword_id)
|
||||
self.init_db(db_num)
|
||||
self.init_before_day(before_day)
|
||||
self.init_until_page(until_page)
|
||||
|
||||
def init_browser(self, browser):
|
||||
self.set_driver(self.browser.get_new_driver(browser))
|
||||
|
||||
def init_keyword_id(self, keyword_id):
|
||||
if type(keyword_id) != int:
|
||||
self.keyword_id = int(keyword_id)
|
||||
else:
|
||||
self.keyword_id = keyword_id
|
||||
self.crawl_init.get_keyword_parameters(keyword_id)
|
||||
self.crawl_init.disconnect()
|
||||
|
||||
def init_db(self, db_num):
|
||||
self.send_to_db.set_db(db_num)
|
||||
|
||||
def init_before_day(self, before_day):
|
||||
self.crawl_init.set_before_day(before_day)
|
||||
|
||||
def init_until_page(self, until_page):
|
||||
self.crawl_init.set_until_page(until_page)
|
||||
|
||||
def crawler_start(self):
|
||||
real_time = True
|
||||
while real_time:
|
||||
print_and_flush("Crawling Start")
|
||||
url_list = self.crawl_init.make_url()
|
||||
i = 0
|
||||
backup_set = set()
|
||||
while i < len(url_list):
|
||||
try:
|
||||
print_and_flush(url_list[i] + "\n")
|
||||
self.driver.get(url_list[i])
|
||||
wait(3)
|
||||
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
|
||||
end_date=self.crawl_init.get_end_day())
|
||||
if self.page_crawler.has_first_page():
|
||||
self.crawl_all(backup_set)
|
||||
i += 1
|
||||
backup_set.clear()
|
||||
except Exception as e:
|
||||
print_and_flush(e)
|
||||
backup_set = self.page_crawler.url_set.copy()
|
||||
self.driver.quit()
|
||||
self.set_driver(self.browser.new_browser())
|
||||
wait(5)
|
||||
real_time = self.crawl_init.is_realtime()
|
||||
print_and_flush("Finished Crawling :)")
|
||||
self.send_to_db.close()
|
||||
self.driver.quit()
|
||||
0
WebBasedCrawler/kakao/__init__.py
Normal file
@@ -1,28 +1,34 @@
|
||||
#-*- coding: utf-8 -*-
|
||||
__author__ = 'cococo'
|
||||
import sys
|
||||
import re
|
||||
import datetime
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
import sys
|
||||
import re
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
from navercrawl import wait
|
||||
from navercrawl import print_and_flush
|
||||
from navercrawl import SendtoDB
|
||||
from navercrawl import Browser
|
||||
from navercrawl import CrawlInit
|
||||
from selenium.common.exceptions import WebDriverException
|
||||
|
||||
|
||||
from base.baseclasses import wait
|
||||
from base.baseclasses import print_and_flush
|
||||
from base.baseclasses import SendtoDB
|
||||
from base.baseclasses import Browser
|
||||
from base.baseclasses import CrawlInit
|
||||
|
||||
__author__ = 'cococo'
|
||||
kakaostory_url = 'https://story.kakao.com/'
|
||||
kakaostory_channel_url = 'https://story.kakao.com/ch/'
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO,
|
||||
format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
|
||||
|
||||
|
||||
class KakaoBodyCrawler:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
@@ -174,10 +180,8 @@ class KakaoBodyCrawler:
|
||||
except WebDriverException:
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
print_and_flush(e)
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
feelings = dict()
|
||||
feelings['data'] = data
|
||||
feelings['count'] = len(data)
|
||||
@@ -232,10 +236,8 @@ class KakaoBodyCrawler:
|
||||
except WebDriverException:
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
print_and_flush(e)
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
feelings = dict()
|
||||
feelings['data'] = data
|
||||
feelings['count'] = len(data)
|
||||
@@ -346,10 +348,8 @@ class KakaoBodyCrawler:
|
||||
except WebDriverException:
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
print_and_flush(e)
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
shares = dict()
|
||||
shares['data'] = data
|
||||
shares['count'] = len(data)
|
||||
@@ -400,10 +400,8 @@ class KakaoBodyCrawler:
|
||||
except WebDriverException:
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
print_and_flush(e)
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
shares = dict()
|
||||
shares['data'] = data
|
||||
shares['count'] = len(data)
|
||||
@@ -553,10 +551,8 @@ class KakaoReplyCrawler_backup:
|
||||
except WebDriverException:
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
print_and_flush(e)
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
|
||||
def find_article_id(self, li):
|
||||
a = li.find_element_by_xpath("div[@class='pf']/a")
|
||||
@@ -753,10 +749,8 @@ class KakaoReplyCrawler:
|
||||
except WebDriverException:
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
print_and_flush(e)
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
|
||||
def find_article_id(self, ul):
|
||||
a_list = ul.find_elements_by_xpath("li/div[@class='pf']/a")
|
||||
@@ -1026,7 +1020,7 @@ class KakaoPageCrawler:
|
||||
|
||||
def load_more_activities(self):
|
||||
previous_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
|
||||
for i in range(0, 5):
|
||||
for i in range(0, 4):
|
||||
print_and_flush("Try load more")
|
||||
body = self.driver.find_element_by_tag_name("body")
|
||||
body.send_keys(Keys.NULL)
|
||||
@@ -1037,13 +1031,13 @@ class KakaoPageCrawler:
|
||||
wait(2)
|
||||
self.reload_count = 0
|
||||
return True
|
||||
for i in range(0, 5):
|
||||
for i in range(0, 4):
|
||||
print_and_flush("Try load more")
|
||||
body = self.driver.find_element_by_tag_name("body")
|
||||
for j in range(0, 3):
|
||||
for j in range(0, 2):
|
||||
body.send_keys(Keys.PAGE_UP)
|
||||
wait(0.1)
|
||||
for j in range(0, 50):
|
||||
for j in range(0, 15):
|
||||
body.send_keys(Keys.PAGE_DOWN)
|
||||
wait(0.1)
|
||||
wait(4)
|
||||
@@ -1061,10 +1055,10 @@ class KakaoPageCrawler:
|
||||
wait(2)
|
||||
self.reload_count = 0
|
||||
return True
|
||||
if self.reload_count < 10:
|
||||
if self.reload_count < 8:
|
||||
print_and_flush("index reload")
|
||||
self.reload_count += 1
|
||||
self.index //= 2
|
||||
self.index -= 1 if self.index > 0 else 0
|
||||
position = self.driver.get_window_position()
|
||||
size = self.driver.get_window_size()
|
||||
self.driver.maximize_window()
|
||||
@@ -1141,70 +1135,6 @@ class KakaoPageCrawler:
|
||||
return temp_date
|
||||
|
||||
|
||||
class KakaoMainCrawler:
|
||||
def __init__(self):
|
||||
self.page_crawler = KakaoPageCrawler()
|
||||
self.body_crawler = KakaoBodyCrawler()
|
||||
self.reply_crawler = KakaoReplyCrawler()
|
||||
self.send_to_db = SendtoDB()
|
||||
self.driver = None
|
||||
self.browser = None
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.page_crawler.set_driver(driver)
|
||||
self.body_crawler.set_driver(driver)
|
||||
self.reply_crawler.set_driver(driver)
|
||||
self.driver = driver
|
||||
|
||||
def set_keyword_id(self, keyword_id):
|
||||
self.keyword_id = keyword_id
|
||||
|
||||
def crawl_all_current_url(self, backup_set=None):
|
||||
self.page_crawler.init()
|
||||
if backup_set:
|
||||
self.page_crawler.activity_data_model_set = backup_set.copy()
|
||||
while True:
|
||||
activity = self.page_crawler.next_activity()
|
||||
if activity is None:
|
||||
break
|
||||
try:
|
||||
self.crawl_body(activity)
|
||||
self.crawl_reply(activity)
|
||||
self.page_crawler.crawling_ok()
|
||||
print_and_flush("ok")
|
||||
except WebDriverException as ee:
|
||||
print_and_flush(ee)
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
print_and_flush("fail")
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
print_and_flush("failed")
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
print_and_flush(e)
|
||||
|
||||
def crawl_body(self, activity):
|
||||
# print_and_flush("start body crawl")
|
||||
self.body_crawler.set_driver(self.driver)
|
||||
self.body_crawler.set_activity(activity)
|
||||
content = self.body_crawler.get_content()
|
||||
content["keyword_id"] = self.keyword_id
|
||||
print_and_flush(content["article_url"])
|
||||
self.send_to_db.delete_url(content['article_url'])
|
||||
self.send_to_db.send_body(content)
|
||||
|
||||
def crawl_reply(self, activity):
|
||||
# print_and_flush("start reply crawl")
|
||||
self.reply_crawler.set_driver(self.driver)
|
||||
self.reply_crawler.set_activity(activity)
|
||||
if self.reply_crawler.has_reply():
|
||||
self.reply_crawler.crawl_all()
|
||||
self.send_to_db.send_reply(self.reply_crawler.get_content())
|
||||
|
||||
|
||||
class KakaoInit(CrawlInit):
|
||||
def __init__(self, before_day=0):
|
||||
super().__init__(before_day)
|
||||
@@ -1250,6 +1180,128 @@ class KakaoInit(CrawlInit):
|
||||
return self.end_day()
|
||||
|
||||
|
||||
class KakaoMainCrawler:
|
||||
def __init__(self):
|
||||
self.page_crawler = KakaoPageCrawler()
|
||||
self.body_crawler = KakaoBodyCrawler()
|
||||
self.reply_crawler = KakaoReplyCrawler()
|
||||
self.send_to_db = SendtoDB()
|
||||
self.crawl_init = KakaoInit()
|
||||
self.browser = Browser()
|
||||
self.driver = None
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.page_crawler.set_driver(driver)
|
||||
self.body_crawler.set_driver(driver)
|
||||
self.reply_crawler.set_driver(driver)
|
||||
self.driver = driver
|
||||
|
||||
def set_keyword_id(self, keyword_id):
|
||||
self.keyword_id = keyword_id
|
||||
|
||||
def crawl_all_current_url(self, backup_set=None):
|
||||
self.page_crawler.init()
|
||||
if backup_set:
|
||||
self.page_crawler.activity_data_model_set = backup_set.copy()
|
||||
while True:
|
||||
activity = self.page_crawler.next_activity()
|
||||
if activity is None:
|
||||
break
|
||||
try:
|
||||
self.crawl_body(activity)
|
||||
self.crawl_reply(activity)
|
||||
self.page_crawler.crawling_ok()
|
||||
print_and_flush("ok")
|
||||
except WebDriverException as ee:
|
||||
logging.info(ee)
|
||||
# print_and_flush(e)
|
||||
print_and_flush("fail")
|
||||
raise WebDriverException
|
||||
except Exception as e:
|
||||
print_and_flush("failed")
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
|
||||
def crawl_body(self, activity):
|
||||
# print_and_flush("start body crawl")
|
||||
self.body_crawler.set_driver(self.driver)
|
||||
self.body_crawler.set_activity(activity)
|
||||
content = self.body_crawler.get_content()
|
||||
content["keyword_id"] = self.keyword_id
|
||||
print_and_flush(content["article_url"])
|
||||
self.send_to_db.delete_url(content['article_url'])
|
||||
self.send_to_db.send_body(content)
|
||||
|
||||
def crawl_reply(self, activity):
|
||||
# print_and_flush("start reply crawl")
|
||||
self.reply_crawler.set_driver(self.driver)
|
||||
self.reply_crawler.set_activity(activity)
|
||||
if self.reply_crawler.has_reply():
|
||||
self.reply_crawler.crawl_all()
|
||||
self.send_to_db.send_reply(self.reply_crawler.get_content())
|
||||
|
||||
def start(self):
|
||||
self.crawl_start()
|
||||
|
||||
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
|
||||
self.init_browser(browser)
|
||||
self.init_keyword_id(keyword_id)
|
||||
self.init_db(db_num)
|
||||
self.init_before_day(before_day)
|
||||
self.init_until_page(until_page)
|
||||
|
||||
def init_browser(self, browser):
|
||||
self.set_driver(self.browser.get_new_driver(browser))
|
||||
|
||||
def init_keyword_id(self, keyword_id):
|
||||
if type(keyword_id) != int:
|
||||
self.keyword_id = int(keyword_id)
|
||||
else:
|
||||
self.keyword_id = keyword_id
|
||||
self.crawl_init.get_keyword_parameters(keyword_id)
|
||||
self.crawl_init.disconnect()
|
||||
|
||||
def init_db(self, db_num):
|
||||
self.send_to_db.set_db(db_num)
|
||||
|
||||
def init_before_day(self, before_day):
|
||||
self.crawl_init.set_before_day(before_day)
|
||||
|
||||
def init_until_page(self, until_page):
|
||||
self.crawl_init.set_until_page(until_page)
|
||||
|
||||
def crawl_start(self):
|
||||
real_time = True
|
||||
while real_time:
|
||||
print_and_flush("Crawler Start")
|
||||
url_list = self.crawl_init.make_url()
|
||||
i = 0
|
||||
backup_set = set()
|
||||
while i < len(url_list):
|
||||
try:
|
||||
print_and_flush(url_list[i] + "\n")
|
||||
self.driver.get(url_list[i])
|
||||
wait(3)
|
||||
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
|
||||
end_date=self.crawl_init.get_end_day())
|
||||
self.crawl_all_current_url(backup_set)
|
||||
i += 1
|
||||
backup_set.clear()
|
||||
except Exception as e:
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
backup_set = self.page_crawler.activity_data_model_set.copy()
|
||||
self.driver.quit()
|
||||
self.set_driver(self.browser.new_browser())
|
||||
# kakao_main.driver.implicitly_wait(5)
|
||||
wait(5)
|
||||
real_time = self.crawl_init.is_realtime()
|
||||
print_and_flush("Finished Crawling :)")
|
||||
# kakao_main.driver.quit()
|
||||
self.send_to_db.close()
|
||||
self.driver.quit()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
"""
|
||||
argv:
|
||||
@@ -1293,10 +1345,8 @@ if __name__ == '__main__':
|
||||
i += 1
|
||||
backup_set.clear()
|
||||
except Exception as e:
|
||||
print_and_flush(e)
|
||||
exc_type, exc_obj, exc_tb = sys.exc_info()
|
||||
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
|
||||
print(exc_type, fname, exc_tb.tb_lineno)
|
||||
logging.info(e)
|
||||
# print_and_flush(e)
|
||||
backup_set = kakao_main.page_crawler.activity_data_model_set.copy()
|
||||
kakao_main.set_driver(browser.new_browser())
|
||||
# kakao_main.driver.implicitly_wait(5)
|
||||
0
WebBasedCrawler/naver/__init__.py
Normal file
1
WebBasedCrawler/naver/navercrawl.py
Normal file
File diff suppressed because one or more lines are too long
92
WebBasedCrawler/webbasedcrawler.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#-*- coding: utf-8 -*-
|
||||
'''
|
||||
Created on 2015. 12. 8.
|
||||
|
||||
@author: cococo
|
||||
'''
|
||||
import sys
|
||||
|
||||
from insta import instacrawl
|
||||
from kakao import kakaocrawl
|
||||
from naver import navercrawl
|
||||
from facebook import facebookcrawl
|
||||
from facebook import facebookcrawlbs
|
||||
|
||||
from base.baseclasses import print_and_flush
|
||||
|
||||
|
||||
class WebBasedCrawler:
|
||||
def __init__(self, browser=None, platform=None, keyword_id=None, db_num=None, before_day=None, until_page=None):
|
||||
self.set_arguments(browser, platform, keyword_id, db_num, before_day, until_page)
|
||||
|
||||
def set_arguments(self, browser, platform, keyword_id, db_num, before_day, until_page):
|
||||
if platform == "instagram":
|
||||
self.crawler = instacrawl.InstaMainCrawler()
|
||||
elif platform == "kakaochannel":
|
||||
self.crawler = kakaocrawl.KakaoMainCrawler()
|
||||
elif platform == "navercafe":
|
||||
self.crawler = navercrawl.NaverCafeMainAreaCrawler()
|
||||
elif platform == "facebook":
|
||||
self.crawler = facebookcrawlbs.FacebookMainCrawler()
|
||||
else:
|
||||
self.crawler = None
|
||||
raise Exception
|
||||
self.crawler.set_arguments(browser, keyword_id, db_num, before_day, until_page)
|
||||
|
||||
def start(self):
|
||||
self.crawler.start()
|
||||
|
||||
|
||||
browser_opt = ('chrome', "ie", "opera", "firefox")
|
||||
platform_opt = ('instagram', 'kakaochannel', 'navercafe', "facebook")
|
||||
|
||||
|
||||
def get_browser_info(platform_, file_name="browser.txt"):
|
||||
if sys.platform == 'win32':
|
||||
options = {'default': 'ie'}
|
||||
else:
|
||||
options = {'default': 'firefox'}
|
||||
try:
|
||||
with open(file_name, 'r') as f:
|
||||
for line in f:
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
elif len(line.strip()) < 1:
|
||||
continue
|
||||
else:
|
||||
platform, browser = line.split("=")
|
||||
platform = platform.strip()
|
||||
browser = browser.strip()
|
||||
if (platform not in options.keys() and platform not in platform_opt) or browser not in browser_opt:
|
||||
print_and_flush("check option: " + line)
|
||||
else:
|
||||
options[platform] = browser
|
||||
except FileNotFoundError:
|
||||
print_and_flush("browser.txt file is not exists")
|
||||
print_and_flush("use " + options['default'] + " browser")
|
||||
except Exception as e:
|
||||
print_and_flush(e)
|
||||
print_and_flush("Unknown error occurs")
|
||||
exit(1)
|
||||
return options.get(platform_, options['default'])
|
||||
|
||||
if __name__ == '__main__':
|
||||
"""
|
||||
sys.argv[0] webbasedcrawler.py
|
||||
sys.argv[1] instagram, kakaochannel, navercafe, facebook
|
||||
sys.argv[2] keyword_id
|
||||
sys.argv[3] data group
|
||||
sys.argv[4] start_day
|
||||
sys.argv[5] until_page
|
||||
"""
|
||||
|
||||
if len(sys.argv) == 6:
|
||||
print_and_flush("Python Crawling Executed")
|
||||
else:
|
||||
print_and_flush("Check Argumenets!")
|
||||
exit(1)
|
||||
crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2],
|
||||
sys.argv[3], sys.argv[4], sys.argv[5])
|
||||
crawler.start()
|
||||
print_and_flush("Finished Crawling :)")
|
||||
exit(0)
|
||||
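For reference, a typical invocation matching the argv layout documented above (placeholders stand for values supplied by the caller; the browser itself is chosen via browser.txt):

python webbasedcrawler.py facebook <keyword_id> <data_group> <start_day> <until_page>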