Python crawler implemented with selenium and beautifulsoup4

git-svn-id: svn://192.168.0.12/source@241 8346c931-da38-4b9b-9d4c-e48b93cbd075
admin
2016-01-19 06:52:00 +00:00
parent 73ede17add
commit 21b11500bd
14 changed files with 2926 additions and 114 deletions
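
A minimal usage sketch of the crawler entry point added in this commit. The module name and the argument values are illustrative assumptions (the original file names were stripped from this diff); FacebookMainCrawler, set_arguments() and start() are defined below, and running this needs a local webdriver plus the keyword/data_<n> MySQL tables:

from facebook_crawler import FacebookMainCrawler  # assumed module name

crawler = FacebookMainCrawler()
# browser name, keyword id, data_<n> table suffix, day offset added to today (negative = start in the past), page limit
crawler.set_arguments("chrome", 1, 0, -3, 100)
crawler.start()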

@@ -0,0 +1,377 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import sys
import time
import os
import psutil
import threading
from time import localtime, strftime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def print_and_flush(string):
print(string)
sys.stdout.flush()
def fcntwait(n):
time.sleep(n)
def wait(n):
th = threading.Thread(target=fcntwait, args=(n,))
th.start()
th.join()
def insert_log(msg):
pid = os.getpid()
tm = strftime("%Y_%m_%d", localtime())
filename = tm + "_" + str(pid) + ".log"
total_msg = "[" + strftime("%Y_%m_%d %H:%M:%S", localtime()) + "] " + msg
with open(filename, "a") as f:
f.write(total_msg + "\n")
f.flush()
def enter_element(element):
element.send_keys(Keys.NULL)
element.send_keys(Keys.ENTER)
wait(2)
def find_element_by_css_selector(driver, tag, time=0):
element = WebDriverWait(driver, time).until(
EC.presence_of_element_located((By.CSS_SELECTOR, tag))
)
return element
def find_elements_by_css_selector(driver, tag, time=0):
elements = WebDriverWait(driver, time).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, tag))
)
return elements
def find_element_by_xpath(driver, tag, time=0):
element = WebDriverWait(driver, time).until(
EC.presence_of_element_located((By.XPATH, tag))
)
return element
def find_elements_by_xpath(driver, tag, time=0):
elements = WebDriverWait(driver, time).until(
EC.presence_of_all_elements_located((By.XPATH, tag))
)
return elements
class Browser:
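# Creates Selenium webdrivers for Chrome/IE/Opera/Firefox. If a driver server process
# (chromedriver, IEDriverServer, operadriver) is already listening on a local port, it is
# reused through webdriver.Remote instead of spawning a new one (see is_server_executed/port).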
def __init__(self, driver=None):
self.driver = driver
self.info = ""
def get_new_driver(self, name):
"""
windows system:
name = chrome, ie, opera, firefox
default driver_exec: chromedriver.exe, IEDriverServer.exe, operadriver.exe
linux system:
name = chrome, opera, firefox
default driver_exec: chromedriver, operadriver
"""
if sys.platform == "win32":
if name == "chrome":
return self.new_chrome_browser(driver_exec="chromedriver.exe")
elif name == "ie":
return self.new_ie_browser(driver_exec="IEDriverServer.exe")
elif name == "opera":
return self.new_opera_browser(driver_exec="operadriver.exe")
elif name == "firefox":
return self.new_firefox_browser()
else:
return None
else:
if name == "chrome":
return self.new_chrome_browser(driver_exec="chromedriver")
elif name == "opera":
return self.new_opera_browser(driver_exec="operadriver")
elif name == "firefox":
return self.new_firefox_browser()
else:
return None
def new_chrome_browser(self, driver_exec=None):
self.info = "chrome"
if driver_exec is not None:
self.chrome_driver_path = driver_exec
self.chrome_basename = os.path.basename(driver_exec)
if self.is_server_executed(self.chrome_basename):
port = self.port(self.chrome_basename)
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.CHROME)
else:
self.driver = webdriver.Chrome(self.chrome_driver_path)
return self.driver
def new_ie_browser(self, driver_exec=None):
self.info = "ie"
if driver_exec is not None:
self.ie_driver_path = driver_exec
self.ie_basename = os.path.basename(driver_exec)
if self.is_server_executed(self.ie_basename):
port = self.port(self.ie_basename)
self.driver = webdriver.Remote("http://127.0.0.1:" + port, webdriver.DesiredCapabilities.INTERNETEXPLORER)
else:
self.driver = webdriver.Ie(self.ie_driver_path)
return self.driver
def new_firefox_browser(self):
self.info = "firefox"
self.driver = webdriver.Firefox()
return self.driver
def new_opera_browser(self, driver_exec=None):
self.info = "opera"
if driver_exec is not None:
self.opera_driver_path = driver_exec
self.opera_basename = os.path.basename(driver_exec)
if self.is_server_executed(self.opera_basename):
port = self.port(self.opera_basename)
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.OPERA)
else:
self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA, executable_path=self.opera_driver_path)
return self.driver
def get_driver(self):
return self.driver
def is_server_executed(self, driver_basename):
for ps in psutil.process_iter():
if ps.name() == driver_basename:
conns = ps.connections()
for x in conns:
if x.status == "LISTEN":
return True
return False
def port(self, driver_basename):
for ps in psutil.process_iter():
if ps.name() == driver_basename:
conns = ps.connections()
for x in conns:
if x.status == "LISTEN":
return str(x.laddr[1])
return str(9999)
def new_browser(self):
if self.info == "chrome":
return self.new_chrome_browser()
elif self.info == "ie":
return self.new_ie_browser()
elif self.info == "opera":
return self.new_opera_browser()
elif self.info == "firefox":
return self.new_firefox_browser()
else:
return None
class SendtoDB:
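# Persists crawled articles and replies into MySQL (pymysql) as rows of the data_<n> table
# selected by set_db(). __import__('pymysql.cursors') returns the top-level pymysql package
# with the cursors submodule loaded, so both connect() and cursors.DictCursor are reachable.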
pymysql = __import__('pymysql.cursors')
def __init__(self, db_num=0):
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
self.db_num = db_num
def set_db(self, db_num):
self.db_num = str(db_num)
def make_insert_query_backup(self, dictionary):
query = "insert into data_" + str(self.db_num) + " ("
for key in dictionary.keys():
query += (key + ",")
query = query[:len(query) - 1] + ")"
query += " values("
for key, value in dictionary.items():
if type(value) == int:
query += (str(value) + ",")
else:
query += self.conn.escape(value) + ","
query = query[:len(query) - 1] + ")"
return query
def make_insert_query(self, dictionary):
query = "insert into data_" + str(self.db_num) + " ("
key_list = list()
val_list = list()
for key, val in dictionary.items():
key_list.append(key)
if type(val) == int:
val_list.append(str(val))
else:
val_list.append(self.conn.escape(val))
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")"
def send_body(self, body):
if not body:
return
self.conn_check()
with self.conn.cursor() as cursor:
query = self.make_insert_query(body)
try:
cursor.execute(query)
self.conn.commit()
except Exception as e:
pass
# print(e)
# sys.stdout.flush()
# print(query)
# sys.stdout.flush()
def send_reply(self, reply):
if not reply:
return
for i in reply:
self.send_body(i)
def conn_check(self):
if not self.conn.open:
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
def close(self):
self.conn.close()
def delete_url(self, url):
query = "delete from data_" + str(self.db_num) + " where article_url = " + self.conn.escape(str(url))
self.conn_check()
with self.conn.cursor() as cursor:
try:
cursor.execute(query)
self.conn.commit()
except Exception as e:
print(e)
sys.stdout.flush()
print(query)
sys.stdout.flush()
class CrawlInit:
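# Loads crawl parameters (date range, platform, searches, authorship, ...) for a keyword id
# from the keyword table and exposes the helpers shared by the platform-specific crawlers.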
pymysql = __import__('pymysql.cursors')
def __init__(self, before_day=0):
self.conn = self.pymysql.connect(host ='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
self.urls = dict()
self.before_day = before_day
def set_before_day(self, before_day):
if type(before_day) == str:
self.before_day = int(before_day)
elif type(before_day) == int:
self.before_day = before_day
def set_until_page(self, until_page):
if type(until_page) == str:
self.until_page = int(until_page)
elif type(until_page) == int:
self.until_page = until_page
def get_keyword_parameters(self, keyword_id):
query = "select * from keyword where id = " + str(keyword_id)
try:
with self.conn.cursor() as cursor:
cursor.execute(query)
self.params = cursor.fetchone()
return self.params
except Exception as e:
print(e)
sys.stdout.flush()
exit(1)
return dict()
def get_naver_cafe_list(self):
query = "select url, clubid from navercafelist"
auth = self.authorship()
if not auth or str(auth) == "0":
pass
else:
query += (" where group_num = " + str(auth))
try:
with self.conn.cursor() as cursor:
cursor.execute(query)
list_result = cursor.fetchall()
for i in list_result:
self.urls[i["url"]] = i["clubid"]
except Exception as e:
print(e)
sys.stdout.flush()
exit(1)
return self.urls
def start_day(self):
return self.params["start"]
def end_day(self):
return self.params["end"]
def keyword_id(self):
return self.params["id"]
def realtime(self):
return self.params["realtime"]
def searches(self):
return self.params["searches"]
def authorship(self):
return self.params["authorship"]
def platform(self):
return self.params["platform"]
def is_realtime(self):
if str(self.realtime()) == '0':
return False
else:
return True
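# euc_kr()/utf8() percent-encode a keyword byte by byte for use in search URLs,
# e.g. utf8("검색") returns "%EA%B2%80%EC%83%89"; euc_kr() additionally turns spaces into "+".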
def euc_kr(self, keyword):
byte_code = list(keyword.encode("euc_kr"))
encoded_keyword = ""
for i in byte_code:
if i == 0x20:
encoded_keyword += "+"
else:
encoded_keyword += str(hex(i)).replace("0x", "%").upper()
return encoded_keyword
def utf8(self, keyword):
byte_code = list(keyword.encode("utf-8"))
encoded_keyword = ""
for i in byte_code:
encoded_keyword += str(hex(i)).replace("0x", "%").upper()
return encoded_keyword
def disconnect(self):
self.conn.close()
def date_to_str(self, arg_date):
return arg_date.strftime("%Y-%m-%d")

@@ -0,0 +1,846 @@
#-*- coding: utf-8 -*-
import logging
import re
import json
import datetime
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import find_elements_by_css_selector
from base.baseclasses import find_elements_by_xpath
from base.baseclasses import enter_element
from base.baseclasses import Browser
logging.basicConfig(
level=logging.INFO,
format='%(module)s(%(lineno)s):%(funcName)s:%(message)s'
)
facebook_url = "https://www.facebook.com/"
facebook_tag_url = "https://www.facebook.com/hashtag/"
class FacebookInit(CrawlInit):
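# Builds the start URLs: platform 11 crawls hashtag pages (keyword is percent-encoded),
# platform 12 crawls user/page timelines (keyword is used as the path segment as-is).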
def __init__(self, before_day=0):
super().__init__(before_day)
self.urls = dict()
self.urls[11] = facebook_tag_url
self.urls[12] = facebook_url
def split_searches(self):
search = self.searches()
splited_list = search.split(',')
return [x.strip() if self.platform() == 12 else self.utf8(x) for x in splited_list]
# trimmed_list = list()
# if self.platform() == 12:
# for x in splited_list:
# trimmed_list.append(x.strip())
# else:
# for x in splited_list:
# trimmed_list.append(self.utf8(x))
# return trimmed_list
def make_url(self):
return [self.urls[self.platform()] + x + "?fref=ts" for x in self.split_searches()]
# urls = list()
# for x in self.split_searches():
# url = self.urls[self.platform()] + x + "?fref=ts"
# urls.append(url)
# return urls
def get_begin_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
result += datetime.timedelta(days=self.before_day)
return result
else:
return self.start_day()
def get_end_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
return result
else:
return self.end_day()
def is_hashtag(self):
return False if self.platform() == 12 else True
class FacebookBodyCrawler:
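# Extracts the fields of a single opened post (id, url, text, date, nickname, profile image,
# like/share counts and users) from the live DOM via Selenium element lookups.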
def __init__(self, driver=None):
self.driver = driver
self.re_date = re.compile(
"([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})"
)
self.re_id = re.compile("id=([\\d]+)")
# self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._-]+)\\??", re.UNICODE)
self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._\\-%]+)")
#(("id=([\\d]+)|facebook.com/([\\w._]+)\\?"))
def set_driver(self, driver):
self.driver = driver
def find_article_id(self):
href = self.find_article_url()
m = self.re_ids.search(href)
return m.group(1) if m.group(2) is None else m.group(2)
def find_article_nickname(self):
try:
element = self.driver.find_element_by_css_selector("div[class='fbPhotoContributorName']")
except:
element = self.driver.find_element_by_css_selector("span.fwb>a")
return element.text
def find_article_data(self):
try:
element = self.driver.find_element_by_css_selector("span[class='hasCaption']")
except:
try:
element = self.driver.find_element_by_css_selector("div[class='_5pbx userContent']")
except:
return ""
return element.text
def find_platform_id(self):
pass
def find_article_date(self):
element = self.driver.find_element_by_css_selector("abbr[data-utime]")
str_datetime = element.get_attribute("title")
logging.debug(str_datetime)
m = self.re_date.match(str_datetime)
if m is None:
return "0000-00-00 00:00:00"
else:
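# "오후" means PM in Korean; add 12 hours to normalize the scraped timestamp to 24-hour time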
if str_datetime.find("오후") != -1 and m.group(4) != "12":
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + str(int(m.group(4)) + 12) + ":" + \
m.group(5) + ":00"
else:
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + m.group(4) + ":" + m.group(5) + ":00"
def find_article_url(self):
try:
element = self.driver.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']")
except:
element = self.driver.find_element_by_css_selector("span#fbPhotoPageTimestamp>a[class='_39g5']")
return element.get_attribute('href')
def find_article_title(self):
return self.driver.title
def find_platform_name(self):
pass
def find_like_users(self):
try:
element = self.driver.find_element_by_css_selector("a[data-testid='n_other_people_link']")
enter_element(element)
ul = find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
except:
return None
try:
while True:
#a_element = find_element_by_css_selector(self.driver, "a[class$='uiBoxLightblue uiMorePagerPrimary']",
# 30)
a_element = WebDriverWait(self.driver, 20).\
until(EC.visibility_of_element_located((By.CSS_SELECTOR,
"a[class$='uiBoxLightblue uiMorePagerPrimary']")))
enter_element(a_element)
wait(1)
except Exception as e:
print(e)
#ul = find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
a_elements = self.driver.find_elements_by_css_selector("div[class='fsl fwb fcb']>a")
like_users = list()
for a in a_elements:
like_user = dict()
like_user['nickname'] = a.text
m = self.re_ids.search(a.get_attribute('href'))
like_user['id'] = m.group(2) if m.group(1) is None else m.group(1)
like_users.append(like_user)
cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
enter_element(cancel)
return {'data': like_users, 'count': len(like_users)}
def find_share_users(self):
try:
element = self.driver.find_element_by_css_selector("a[class='UFIShareLink']")
enter_element(element)
#share_element = find_element_by_css_selector(self.driver, "#repost_view_dialog", 30)
page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 30)
except:
return None
page_scroller_children = page_scroller.find_elements_by_css_selector("*")
while len(page_scroller_children) > 1:
self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
wait(2)
#page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 30)
page_scroller = WebDriverWait(self.driver, 20).\
until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#pagelet_scrolling_pager")))
page_scroller_children = page_scroller.find_elements_by_css_selector("*")
a_tags = self.driver.find_elements_by_css_selector("span[class='fwb']>a[class='profileLink']")
share_users = list()
for a in a_tags:
share_user = dict()
share_user['url'] = a.get_attribute('href')
share_user['nickname'] = a.text
str_id = share_user['url'][share_user['url'].rindex('/') + 1:]
m = self.re_id.search(str_id)
share_user['id'] = str_id if m is None else m.group(1)
share_users.append(share_user)
cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
enter_element(cancel)
return {'data': share_users, 'count': len(share_users)}
def find_like_user_number(self):
try:
element = self.driver.find_element_by_css_selector("a[data-testid='n_other_people_link']")
except:
return None
str_num = element.text
m = re.search("(\\d+)", str_num.replace(",", ""))
return None if m is None else m.group(1)
def find_share_user_number(self):
try:
element = self.driver.find_element_by_css_selector("a[class='UFIShareLink']")
except:
return None
str_num = element.text
m = re.search("(\\d+)", str_num.replace(",", ""))
return None if m is None else m.group(1)
def find_reply_number(self):
pass
def find_article_profileurl(self):
try:
img = self.driver.find_element_by_css_selector('div._38vo>img')
except:
img = self.driver.find_element_by_css_selector("img._s0._54ru")
return img.get_attribute('src')
def get_content(self):
content = dict()
content['article_id'] = self.find_article_id()
content['article_url'] = self.find_article_url()
content['article_data'] = self.find_article_data()
content['article_date'] = self.find_article_date()
content['article_title'] = self.find_article_title()
content['article_nickname'] = self.find_article_nickname()
content['article_form'] = 'body'
content['platform_name'] = 'facebook'
content['platform_form'] = 'post'
content['platform_title'] = content['article_nickname']
content['platform_id'] = content['article_id']
content['article_profileurl'] = self.find_article_profileurl()
like_user_num = self.find_like_user_number()
share_user_num = self.find_share_user_number()
if like_user_num:
content['article_hit'] = like_user_num
if share_user_num:
content['reply_url'] = share_user_num
likes = self.find_like_users()
shares = self.find_share_users()
data = list()
if likes:
data.append({"likes": likes})
if shares:
data.append({"shares": shares})
if data:
content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode({"data": data})
return content
def click_element(self, element):
ac = ActionChains(self.driver)
ac.move_to_element_with_offset(element, 0, 0).click().perform()
wait(2)
class FacebookReplyCrawler:
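# Expands and reads the comment thread of the currently opened post; crawl_all() walks the
# UFIList container and collects top-level comments and their child replies in order.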
def __init__(self, driver=None):
self.driver = driver
self.reply_list = list()
self.order = 0
self.div = None
self.re_date = re.compile(
"([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})"
)
self.re_id = re.compile("id=([\\d]+)")
# self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._-]+)\\??", re.UNICODE)
self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._\\-%]+)")
def find_init(self):
self.reply_list.clear()
self.order = 0
self.reload_count = 0
def set_driver(self, driver):
self.driver = driver
def read_all_reply(self):
try:
a_element = WebDriverWait(self.driver, 15).\
until(EC.visibility_of_element_located((By.CSS_SELECTOR,
"a.UFIPagerLink")))
enter_element(a_element)
except:
pass
self.read_all_child_reply()
def read_all_child_reply(self):
try:
a_elements = self.driver.find_elements_by_css_selector("a.UFICommentLink")
for a_element in a_elements:
enter_element(a_element)
except:
pass
def set_div(self, div=None):
if div is None:
try:
self.div = self.driver.find_element_by_xpath(
"//div[@data-reactroot and @class='UFIList']/div[not(@class)]")
# self.div = self.driver.find_element_by_css_selector("div[data-reactroot].UFIList>div:not([class])")
except:
self.div = None
else:
self.div = div
def has_reply(self):
"""after set_div execute this"""
if not self.div:
return False
else:
children = self.div.find_elements_by_css_selector("*")
return True if len(children) > 0 else False
def crawl_reply(self, div, article_parent=None):
content = dict()
content['article_id'] = self.find_article_id(div)
content['article_nickname'] = self.find_article_nickname(div)
content['article_data'] = self.find_article_data(div)
content['article_date'] = self.find_article_date(div)
content['article_profileurl'] = self.find_article_profileurl(div)
content['article_order'] = self.order
like_num = self.find_like_number(div)
if like_num:
content['article_hit'] = like_num
if article_parent:
content['article_parent'] = article_parent
content.update({'article_form': 'reply', 'platform_name': 'facebook', 'platform_form': 'post'})
self.order += 1
self.reply_list.append(content)
def crawl_all(self):
self.read_all_reply()
self.set_div()
try:
if self.has_reply():
elements = self.div.find_elements_by_xpath("div")
article_parent = None
for div in elements:
if div.get_attribute('class').find("UFIReplyList") != -1:
reply_div = div.find_elements_by_xpath('div[@role]')
for child in reply_div:
self.crawl_reply(child, article_parent)
elif div.get_attribute("role") == "article":
self.crawl_reply(div)
article_parent = self.reply_list[len(self.reply_list) - 1]['article_nickname']
else:
pass
except Exception as e:
logging.info(e)
def get_content(self):
return self.reply_list
def find_article_id(self, div):
element = div.find_element_by_css_selector("a[class$='UFICommentActorName']")
m = self.re_ids.search(element.get_attribute('href'))
if not m:
return 'None'
return m.group(1) if not m.group(2) else m.group(2)
def find_article_parent(self, div):
pass
def find_article_date(self, div):
element = div.find_element_by_css_selector("abbr.livetimestamp")
str_datetime = element.get_attribute("title")
m = self.re_date.match(str_datetime)
if m is None:
return "0000-00-00 00:00:00"
else:
if str_datetime.find("오후") != -1 and m.group(4) != "12":
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + str(int(m.group(4)) + 12) + ":" + \
m.group(5) + ":00"
else:
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + m.group(4) + ":" + m.group(5) + ":00"
def find_article_data(self, div):
element = div.find_element_by_css_selector("span.UFICommentBody")
return element.text
def find_article_profileurl(self, div):
element = div.find_element_by_css_selector("img[class^='img UFIActorImage']")
return element.get_attribute('src')
def find_article_nickname(self, div):
element = div.find_element_by_css_selector("a[class$='UFICommentActorName']")
return element.text
def find_like_number(self, div):
try:
element = div.find_element_by_css_selector('a[ajaxify]')
m = re.search("(\\d+)", element.text.replace(",", ""))
return m.group(1) if m else None
except:
return None
class FacebookPageCrawler:
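# Iterates over the post links of a timeline/hashtag page within the configured date range,
# scrolling (load_more_posts) to trigger Facebook's infinite loading when more posts are needed.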
def __init__(self, driver=None):
self.driver = driver
self.url_set = set()
self.index = 0
self.limit = 500
self.re_date = re.compile("([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})")
self.reload_count = 0
self.is_hash = False
self.main_handle = None
self.begin_date = None
self.end_date = None
self.posts = None
self.current_url = None
def set_limit(self, limit=500):
self.limit = limit
def set_driver(self, driver):
self.driver = driver
def set_main_handle(self):
self.main_handle = self.driver.window_handles[0]
def find_article_date(self, div):
try:
element = div.find_element_by_css_selector("abbr.livetimestamp")
except:
element = div.find_element_by_css_selector("abbr[title]")
str_datetime = element.get_attribute("title")
logging.debug(str_datetime)
m = self.re_date.match(str_datetime)
if m is None:
return datetime.datetime(year=1999, month=1, day=1)
else:
if str_datetime.find("오후") != -1 and m.group(4) != "12":
return datetime.datetime(
year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
hour=(int(m.group(4)) + 12), minute=int(m.group(5))
)
else:
return datetime.datetime(
year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
hour=(int(m.group(4))), minute=int(m.group(5))
)
def next_post_by_user(self):
try:
#self.posts = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts_wait()
if len(self.posts) < 1:
print_and_flush("not posts")
self.posts = None
return None
except Exception as e:
print_and_flush("cannot found _5pcq")
logging.info(e)
self.posts = None
return None
while True:
self.index += 1
if self.index >= len(self.posts):
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
if self.index >= len(self.posts):
if self.load_more_posts() is False:
self.posts = None
return None
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
if self.posts[self.index - 1].get_attribute("href") in self.url_set:
continue
time_date = self.find_article_date(self.posts[self.index - 1])
logging.info("number of posts: " + str(len(self.posts)))
print_and_flush(str(time_date))
if type(time_date) == str:
continue
if self.is_earlier(time_date):
self.posts = None
return None
if self.is_late(time_date):
continue
self.current_url = self.posts[self.index - 1].get_attribute('href')
return self.posts[self.index - 1]
def next_post_by_tag(self):
try:
# self.posts = find_elements_by_css_selector(self.driver, "a[class='_5pcq']")
# self.posts = find_elements_by_xpath(self.driver, "//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts_wait()
if len(self.posts) < 1:
print_and_flush("not posts")
self.posts = None
return None
except Exception as e:
print_and_flush("cannot found _5pcq")
logging.info(e)
self.posts = None
return None
while True:
self.index += 1
if self.index > self.limit:
self.posts = None
return None
if self.index >= len(self.posts):
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
if self.index >= len(self.posts):
if self.load_more_posts() is False:
self.posts = None
return None
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
if self.posts[self.index - 1].get_attribute("href") in self.url_set:
continue
logging.info("number of posts: " + str(len(self.posts)))
self.current_url = self.posts[self.index - 1].get_attribute('href')
return self.posts[self.index - 1]
def load_more_posts(self):
# previous_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# previous_posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
previous_posts = self.find_posts()
for i in range(0, 4):
print_and_flush("Try load more")
body = self.driver.find_element_by_tag_name("body")
body.send_keys(Keys.NULL)
body.send_keys(Keys.END)
wait(4)
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
present_posts = self.find_posts()
if len(previous_posts) != len(present_posts):
wait(2)
self.reload_count = 0
return True
for i in range(0, 4):
print_and_flush("Try load more")
body = self.driver.find_element_by_tag_name("body")
for j in range(0, 2):
body.send_keys(Keys.PAGE_UP)
wait(0.1)
for j in range(0, 15):
body.send_keys(Keys.PAGE_DOWN)
wait(0.1)
wait(4)
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
present_posts = self.find_posts()
if len(previous_posts) != len(present_posts):
wait(2)
self.reload_count = 0
return True
for i in range(0, 10):
print_and_flush("Try load more")
self.driver.execute_script("window.scrollBy(0, 800)")
wait(4)
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
present_posts = self.find_posts()
if len(previous_posts) != len(present_posts):
wait(2)
self.reload_count = 0
return True
if self.reload_count < 8:
print_and_flush("index reload")
self.reload_count += 1
self.index -= 1 if self.index > 0 else 0
position = self.driver.get_window_position()
size = self.driver.get_window_size()
self.driver.maximize_window()
self.driver.set_window_size(size['width'], size["height"])
self.driver.set_window_position(position['x'], position['y'])
return True
if self.reload_count < 15:
print_and_flush("refresh")
self.driver.refresh()
wait(5)
self.index = 0
self.reload_count += 1
return True
return False
def is_earlier(self, time_date):
return True if time_date < self.begin_date else False
def is_late(self, time_date):
return True if time_date > self.end_date else False
def set_date(self, begin_date, end_date):
self.set_begin_date(begin_date)
self.set_end_date(end_date)
def set_end_date(self, end_date):
if type(end_date) == str:
self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
self.end_date = end_date
else:
self.end_date = datetime.datetime.today()
self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
self.end_date += datetime.timedelta(days=1)
def set_begin_date(self, begin_date):
if type(begin_date) == str:
self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
self.begin_date = begin_date
else:
self.begin_date = datetime.datetime.today()
self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month,
day=self.begin_date.day)
def crawling_ok(self):
self.url_set.add(self.current_url)
def init(self):
self.index = 0
self.posts = None
self.url_set.clear()
def find_posts(self):
try:
divs = self.driver.find_elements_by_xpath("//div[@class='_1dwg']")
except:
return None
# return [div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']") for div in divs]
posts = list()
for div in divs:
try:
posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
except:
pass
return posts
def find_posts_wait(self):
try:
divs = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']", 30)
except:
return None
# return [div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']") for div in divs]
posts = list()
for div in divs:
try:
posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
except:
pass
return posts
class FacebookMainCrawler:
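# Orchestrates the whole run: opens the browser, logs in, visits each search URL, opens every
# post in a new tab, and stores body and replies through SendtoDB.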
def __init__(self):
self.page_crawler = FacebookPageCrawler()
self.body_crawler = FacebookBodyCrawler()
self.reply_crawler = FacebookReplyCrawler()
self.send_to_db = SendtoDB()
self.crawl_init = FacebookInit()
self.browser = Browser()
self.driver = None
self.keyword_id = None
self.url = None
def set_driver(self, driver):
self.page_crawler.set_driver(driver)
self.body_crawler.set_driver(driver)
self.reply_crawler.set_driver(driver)
self.driver = driver
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all_current_url(self, backup_set=None):
self.page_crawler.init()
if backup_set:
self.page_crawler.url_set = backup_set.copy()
while True:
post = self.page_crawler.next_post_by_tag() if self.crawl_init.is_hashtag() \
else self.page_crawler.next_post_by_user()
if post is None:
break
try:
self.click_new_tab(post)
self.control_tab()
self.switch_new_tab()
wait(5)
body = self.driver.find_element_by_tag_name('body')
self.click_element(body)
body_info = self.crawl_body()
self.crawl_reply(body_info)
self.page_crawler.crawling_ok()
print_and_flush("ok")
self.switch_main_tab()
except WebDriverException as ee:
logging.info(ee)
print_and_flush("fail")
raise
except Exception as e:
print_and_flush("failed")
logging.info(e)
def crawl_body(self):
self.body_crawler.set_driver(self.driver)
content = self.body_crawler.get_content()
content["keyword_id"] = self.keyword_id
print_and_flush(content["article_url"])
self.send_to_db.delete_url(content['article_url'])
self.send_to_db.send_body(content)
return {"article_url": content["article_url"], "platform_id": content["platform_id"]}
def crawl_reply(self, body_info):
self.reply_crawler.set_driver(self.driver)
self.reply_crawler.set_div()
if self.reply_crawler.has_reply():
self.reply_crawler.crawl_all()
contents = self.reply_crawler.get_content()
for content in contents:
content.update(body_info)
self.send_to_db.send_reply(contents)
def start(self):
self.crawl_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
self.init_browser(browser)
self.init_keyword_id(keyword_id)
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
def init_browser(self, browser):
self.set_driver(self.browser.get_new_driver(browser))
def init_keyword_id(self, keyword_id):
self.keyword_id = int(keyword_id) if type(keyword_id) != int else keyword_id
self.crawl_init.get_keyword_parameters(keyword_id)
self.crawl_init.disconnect()
def init_db(self, db_num):
self.send_to_db.set_db(db_num)
def init_before_day(self, before_day):
self.crawl_init.set_before_day(before_day)
def init_until_page(self, until_page):
self.crawl_init.set_until_page(until_page)
self.page_crawler.set_limit(self.crawl_init.until_page)
def set_main_window_handler(self, window_handler):
self.main_window_handler = window_handler
def crawl_start(self):
real_time = True
while real_time:
print_and_flush("Crawler Start")
url_list = self.crawl_init.make_url()
i = 0
backup_set = set()
while i < len(url_list):
try:
self.set_main_window_handler(self.driver.window_handles[0])
print_and_flush(url_list[i] + "\n")
self.driver.get(url_list[i])
wait(5)
self.facebook_login()
body = self.driver.find_element_by_tag_name('body')
self.click_element(body)
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
end_date=self.crawl_init.get_end_day())
self.crawl_all_current_url(backup_set)
i += 1
backup_set.clear()
except Exception as e:
logging.info(e)
backup_set = self.page_crawler.url_set.copy()
self.set_driver(self.browser.new_browser())
wait(5)
real_time = self.crawl_init.is_realtime()
print_and_flush("Finished Crawling :)")
self.send_to_db.close()
self.driver.quit()
def facebook_login(self):
try:
element_email = find_element_by_css_selector(self.driver, '#email', 15)
element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
except:
return
email = 'concepters22@gmail.com'
password = 'zjstpqxjtm'
element_email.send_keys(email)
element_pwd.send_keys(password)
label = self.driver.find_element_by_css_selector('#loginbutton')
element_input = label.find_element_by_xpath('input')
element_input.send_keys(Keys.NULL)
element_input.send_keys(Keys.ENTER)
wait(5)
def click_new_tab(self, element):
#ac = ActionChains(self.driver)
#ac.key_down(Keys.CONTROL).move_to_element(element).click().key_up(Keys.CONTROL).perform()
element.send_keys(Keys.NULL)
element.send_keys(Keys.CONTROL + Keys.ENTER)
wait(3)
def switch_new_tab(self):
self.driver.switch_to_window(self.driver.window_handles[1])
def switch_main_tab(self):
self.driver.close()
self.driver.switch_to_window(self.main_window_handler)
def click_element(self, element):
ac = ActionChains(self.driver)
# ac.move_to_element_with_offset(element, 0, 0).click().perform()
ac.move_to_element(element).click().perform()
wait(4)
def control_tab(self):
ac = ActionChains(self.driver)
ac.key_down(Keys.CONTROL).key_down(Keys.TAB).perform()
wait(2)

@@ -0,0 +1,907 @@
#-*- coding: utf-8 -*-
import logging
import re
import json
import datetime
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import find_elements_by_css_selector
from base.baseclasses import find_elements_by_xpath
from base.baseclasses import enter_element
from base.baseclasses import Browser
logging.basicConfig(
level=logging.INFO,
format='%(module)s(%(lineno)s):%(funcName)s:%(message)s'
)
# parser_method = 'html.parser'
parser_method = 'lxml'
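# 'lxml' needs the lxml package installed; switch back to the built-in 'html.parser' above if it is not available.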
facebook_url = "https://www.facebook.com/"
facebook_tag_url = "https://www.facebook.com/hashtag/"
class FacebookInit(CrawlInit):
def __init__(self, before_day=0):
super().__init__(before_day)
self.urls = dict()
self.urls[11] = facebook_tag_url
self.urls[12] = facebook_url
def split_searches(self):
search = self.searches()
splited_list = search.split(',')
return [x.strip() if self.platform() == 12 else self.utf8(x) for x in splited_list]
# trimmed_list = list()
# if self.platform() == 12:
# for x in splited_list:
# trimmed_list.append(x.strip())
# else:
# for x in splited_list:
# trimmed_list.append(self.utf8(x))
# return trimmed_list
def make_url(self):
return [self.urls[self.platform()] + x + "?fref=ts" for x in self.split_searches()]
# urls = list()
# for x in self.split_searches():
# url = self.urls[self.platform()] + x + "?fref=ts"
# urls.append(url)
# return urls
def get_begin_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
result += datetime.timedelta(days=self.before_day)
return result
else:
return self.start_day()
def get_end_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
return result
else:
return self.end_day()
def is_hashtag(self):
return False if self.platform() == 12 else True
class FacebookBodyCrawler:
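# Variant of FacebookBodyCrawler above: most fields are read from a BeautifulSoup parse of
# driver.page_source instead of live WebElement lookups, avoiding repeated round trips to the driver.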
def __init__(self, driver=None):
self.driver = driver
self.re_date = re.compile(
"([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})"
)
self.re_id = re.compile("id=([\\d]+)")
# self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._-]+)\\??", re.UNICODE)
self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._\\-%]+)")
#(("id=([\\d]+)|facebook.com/([\\w._]+)\\?"))
def set_driver(self, driver):
self.driver = driver
def find_article_id(self, soup):
element = soup.find('a', class_='_2yug')
if element:
href = element.get('href')
else:
href = self.find_article_url(soup)
m = self.re_ids.search(href)
return m.group(1) if m.group(2) is None else m.group(2)
def find_article_nickname(self, soup):
nickname = soup.find('div', class_='fbPhotoContributorName')
if not nickname or not nickname.get_text():
span = soup.find('span', class_='fwb fcg')
if span:
nickname = span.a
else:
nickname = soup.find('a', 'profileLink')
if not nickname:
nickname = soup.find('a', class_='_2yug')
return nickname.get_text() if nickname else ""
def find_article_data(self, soup):
element = soup.find('span', class_='hasCaption')
if not element:
element = soup.find('div', class_='_5pbx userContent')
if not element:
element = soup.find('div', class_='_39k5')
return element.get_text() if element else ""
def find_platform_id(self, soup):
pass
def find_article_date(self, soup):
element = soup.find('abbr', attrs={'data-utime': True})
if element:
str_datetime = element.get('title')
logging.debug(str_datetime)
m = self.re_date.match(str_datetime)
if m is None:
return "0000-00-00 00:00:00"
else:
if str_datetime.find("오후") != -1 and m.group(4) != "12":
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + str(int(m.group(4)) + 12) + ":" + \
m.group(5) + ":00"
else:
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + m.group(4) + ":" + m.group(5) + \
":00"
else:
element = soup.find('a', '_39g5')
str_datetime = element.get_text()
return "0000-00-00 00:00:00"
def find_article_url(self, soup):
element = soup.select_one("span.fsm.fwn.fcg > a._5pcq")
if not element:
element = soup.select_one("span#fbPhotoPageTimestamp > a._39g5")
if not element:
element = soup.find('a', '_39g5')
return element.get('href') \
if element.get('href').startswith('http') else facebook_url[:-1] + element.get('href')
def find_article_title(self, soup):
title = soup.find('div', "_4lmk")
return title.get_text() if title else soup.title.get_text()
def find_platform_name(self):
pass
def find_like_users(self):
try:
element = self.driver.find_element_by_css_selector("a[data-testid='n_other_people_link']")
enter_element(element)
ul = find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
except:
return None
try:
while True:
#a_element = find_element_by_css_selector(self.driver, "a[class$='uiBoxLightblue uiMorePagerPrimary']",
# 30)
a_element = WebDriverWait(self.driver, 20).\
until(EC.visibility_of_element_located((By.CSS_SELECTOR,
"a[class$='uiBoxLightblue uiMorePagerPrimary']")))
enter_element(a_element)
wait(1)
except Exception as e:
pass
# print(e)
#ul = find_element_by_css_selector(self.driver, "ul[class^='uiList']", 30)
soup = BeautifulSoup(self.driver.page_source, parser_method)
a_elements = soup.select('div.fsl.fwb.fcb > a')
like_users = list()
for a in a_elements:
like_user = dict()
like_user['nickname'] = a.get_text()
url = a.get('href')
m = self.re_ids.search(url if url.startswith('http') else facebook_url[:-1] + url)
like_user['id'] = m.group(2) if m.group(1) is None else m.group(1)
like_users.append(like_user)
cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
enter_element(cancel)
return {'data': like_users, 'count': len(like_users)}
def find_share_users(self):
try:
element = self.driver.find_element_by_css_selector("a[class='UFIShareLink']")
enter_element(element)
#share_element = find_element_by_css_selector(self.driver, "#repost_view_dialog", 30)
page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 20)
except:
return None
page_scroller_children = page_scroller.find_elements_by_css_selector("*")
while len(page_scroller_children) > 1:
self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
wait(2)
#page_scroller = find_element_by_css_selector(self.driver, '#pagelet_scrolling_pager', 30)
page_scroller = WebDriverWait(self.driver, 20).\
until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#pagelet_scrolling_pager")))
page_scroller_children = page_scroller.find_elements_by_css_selector("*")
soup = BeautifulSoup(self.driver.page_source, parser_method)
a_tags = soup.select('span.fwb > a.profileLink')
share_users = list()
for a in a_tags:
share_user = dict()
url = a.get('href')
share_user['url'] = url if url.startswith('http') else facebook_url[:-1] + url
share_user['nickname'] = a.get_text()
str_id = share_user['url'][share_user['url'].rindex('/') + 1:]
m = self.re_id.search(str_id)
share_user['id'] = str_id if m is None else m.group(1)
share_users.append(share_user)
cancel = self.driver.find_element_by_css_selector("a[class*='layerCancel']")
enter_element(cancel)
return {'data': share_users, 'count': len(share_users)}
def find_like_user_number(self, soup):
element = soup.find('a', attrs={'data-testid': 'n_other_people_link'})
if not element:
return None
str_num = element.get_text()
m = re.search("(\\d+)", str_num.replace(",", ""))
return None if m is None else m.group(1)
def find_share_user_number(self, soup):
element = soup.find('a', class_='UFIShareLink')
if not element:
return None
str_num = element.get_text()
m = re.search("(\\d+)", str_num.replace(",", ""))
return None if m is None else m.group(1)
def find_reply_number(self, soup):
pass
def find_article_profileurl(self, soup):
img = soup.select_one('div._38vo > img')
if not img:
img = soup.select_one('img._s0._54ru')
if not img:
div = soup.find("div", class_='_2yuf')
if div:
src = div.get('style')
return src[src.index("http"):].replace('\\', "").replace("\")", "")
return img.get('src') if img else ""
def get_content(self):
soup = BeautifulSoup(self.driver.page_source, parser_method)
content = dict()
# logging.info('start_get_content')
content['article_id'] = self.find_article_id(soup)
# logging.info('article_id')
content['article_url'] = self.find_article_url(soup)
# logging.info('article_url')
content['article_data'] = self.find_article_data(soup)
# logging.info('article_data')
content['article_date'] = self.find_article_date(soup)
# logging.info('article_date')
content['article_title'] = self.find_article_title(soup)
# logging.info('article_title')
content['article_nickname'] = self.find_article_nickname(soup)
# logging.info('article_nickname')
content['article_form'] = 'body'
content['platform_name'] = 'facebook'
content['platform_form'] = 'post'
content['platform_title'] = content['article_nickname']
content['platform_id'] = content['article_id']
content['article_profileurl'] = self.find_article_profileurl(soup)
# logging.info('article_profileurl')
like_user_num = self.find_like_user_number(soup)
# logging.info('like_user_number')
share_user_num = self.find_share_user_number(soup)
# logging.info('share_user_number')
if like_user_num:
content['article_hit'] = like_user_num
if share_user_num:
content['reply_url'] = share_user_num
likes = self.find_like_users()
# logging.info('find_like_users')
shares = self.find_share_users()
# logging.info('find_like_shares')
data = list()
if likes:
data.append({"likes": likes})
if shares:
data.append({"shares": shares})
if data:
content["etc"] = json.JSONEncoder(indent=4, ensure_ascii=False).encode({"data": data})
return content
def click_element(self, element):
ac = ActionChains(self.driver)
ac.move_to_element_with_offset(element, 0, 0).click().perform()
wait(2)
class FacebookReplyCrawler:
def __init__(self, driver=None):
self.driver = driver
self.reply_list = list()
self.order = 0
self.div = None
self.re_date = re.compile(
"([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})"
)
self.re_id = re.compile("id=([\\d]+)")
# self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._-]+)\\??", re.UNICODE)
self.re_ids = re.compile("id=([\\d]+)|facebook.com/(?!p[a-zA-Z_.-]+\\.php)([\\w._\\-%]+)")
def find_init(self):
self.reply_list.clear()
self.order = 0
self.reload_count = 0
def set_driver(self, driver):
self.driver = driver
def read_all_reply(self):
try:
start_time = time.time()
while True:
a_element = WebDriverWait(self.driver, 10).\
until(EC.visibility_of_element_located((By.CSS_SELECTOR,
"a.UFIPagerLink")))
a_element.send_keys(Keys.NULL)
a_element.send_keys(Keys.ENTER)
wait(1)
if time.time() - start_time > 600.0:
break
except:
pass
self.read_all_child_reply()
def read_all_child_reply(self):
try:
a_elements = self.driver.find_elements_by_css_selector("a.UFICommentLink")
for a_element in a_elements:
a_element.send_keys(Keys.NULL)
a_element.send_keys(Keys.ENTER)
wait(1)
except:
pass
def set_div(self, div=None):
if div is None:
try:
self.div = self.driver.find_element_by_xpath(
"//div[@data-reactroot and @class='UFIList']/div[not(@class)]")
# self.div = self.driver.find_element_by_css_selector("div[data-reactroot].UFIList>div:not([class])")
except:
self.div = None
else:
self.div = div
def has_reply(self):
"""after set_div execute this"""
if not self.div:
return False
else:
children = self.div.find_elements_by_css_selector("*")
return True if len(children) > 0 else False
def crawl_reply(self, div, article_parent=None):
content = dict()
content['article_id'] = self.find_article_id(div)
content['article_nickname'] = self.find_article_nickname(div)
content['article_data'] = self.find_article_data(div)
content['article_date'] = self.find_article_date(div)
content['article_profileurl'] = self.find_article_profileurl(div)
content['article_order'] = self.order
like_num = self.find_like_number(div)
if like_num:
content['article_hit'] = like_num
if article_parent:
content['article_parent'] = article_parent
content.update({'article_form': 'reply', 'platform_name': 'facebook', 'platform_form': 'post'})
self.order += 1
self.reply_list.append(content)
def crawl_all(self):
self.find_init()
self.read_all_reply()
self.set_div()
try:
if self.has_reply():
soup = BeautifulSoup(self.driver.page_source, parser_method)
parent = soup.find('div', attrs={"data-reactroot": True, "class": "UFIList"})
child = parent.find('div', attrs={"class": False}, recursive=False)
elements = child.find_all('div', recursive=False)
article_parent = None
for div in elements:
if "UFIReplyList" in div.get('class'):
reply_div = div.find_all('div', attrs={'role': True}, recursive=False)
for child_reply in reply_div:
self.crawl_reply(child_reply, article_parent)
elif div.get('role') == "article":
self.crawl_reply(div)
article_parent = self.reply_list[len(self.reply_list) - 1]['article_nickname']
else:
pass
except Exception as e:
logging.info(e)
def get_content(self):
return self.reply_list
def find_article_id(self, div):
element = div.find("a", class_=re.compile('UFICommentActorName$'))
url = element.get('href')
m = self.re_ids.search(url if url.startswith('http') else facebook_url[:-1] + url)
if not m:
return 'None'
return m.group(1) if not m.group(2) else m.group(2)
def find_article_parent(self, div):
pass
def find_article_date(self, div):
element = div.find("abbr", "livetimestamp")
str_datetime = element.get("title")
m = self.re_date.match(str_datetime)
if m is None:
return "0000-00-00 00:00:00"
else:
if str_datetime.find("오후") != -1 and m.group(4) != "12":
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + str(int(m.group(4)) + 12) + ":" + \
m.group(5) + ":00"
else:
return m.group(1) + "-" + m.group(2) + "-" + m.group(3) + " " + m.group(4) + ":" + m.group(5) + ":00"
def find_article_data(self, div):
element = div.find("span", "UFICommentBody")
return element.get_text()
def find_article_profileurl(self, div):
element = div.select_one('img.img.UFIActorImage')
url = element.get('src')
return url if url.startswith('http') else facebook_url[:-1] + url
def find_article_nickname(self, div):
element = div.find("a", 'UFICommentActorName')
return element.get_text()
def find_like_number(self, div):
try:
element = div.find('a', ajaxify=True)
m = re.search("(\\d+)", element.get_text().replace(",", ""))
return m.group(1) if m else None
except:
return None
class FacebookPageCrawler:
def __init__(self, driver=None):
self.driver = driver
self.url_set = set()
self.index = 0
self.limit = 500
self.re_date = re.compile("([\\d]{4})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2})[^\\d]+([\\d]{1,2}):([\\d]{1,2})")
self.reload_count = 0
self.is_hash = False
self.main_handle = None
self.begin_date = None
self.end_date = None
self.posts = None
self.current_url = None
def set_limit(self, limit=500):
self.limit = limit
def set_driver(self, driver):
self.driver = driver
def set_main_handle(self):
self.main_handle = self.driver.window_handles[0]
def find_article_date(self, div):
try:
element = div.find_element_by_css_selector("abbr.livetimestamp")
except:
element = div.find_element_by_css_selector("abbr[title]")
str_datetime = element.get_attribute("title")
logging.debug(str_datetime)
m = self.re_date.match(str_datetime)
if m is None:
return datetime.datetime(year=1999, month=1, day=1)
else:
if str_datetime.find("오후") != -1 and m.group(4) != "12":
return datetime.datetime(
year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
hour=(int(m.group(4)) + 12), minute=int(m.group(5))
)
else:
return datetime.datetime(
year=int(m.group(1)), month=int(m.group(2)), day=int(m.group(3)),
hour=(int(m.group(4))), minute=int(m.group(5))
)
def next_post_by_user(self):
try:
#self.posts = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts_wait()
if len(self.posts) < 1:
print_and_flush("not posts")
self.posts = None
return None
except Exception as e:
print_and_flush("cannot found _5pcq")
logging.info(e)
self.posts = None
return None
while True:
self.index += 1
if self.index >= len(self.posts):
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
if self.index >= len(self.posts):
if self.load_more_posts() is False:
self.posts = None
return None
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
print(self.posts[self.index - 1].get_attribute("href"), flush=True)
if self.posts[self.index - 1].get_attribute("href") in self.url_set:
continue
time_date = self.find_article_date(self.posts[self.index - 1])
print("number of posts: ", self.index, '/', str(len(self.posts)), flush=True)
print_and_flush(str(time_date))
if type(time_date) == str:
continue
if self.is_earlier(time_date):
self.posts = None
return None
if self.is_late(time_date):
continue
self.current_url = self.posts[self.index - 1].get_attribute('href')
return self.posts[self.index - 1]
def next_post_by_tag(self):
try:
# self.posts = find_elements_by_css_selector(self.driver, "a[class='_5pcq']")
# self.posts = find_elements_by_xpath(self.driver, "//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts_wait()
if len(self.posts) < 1:
print_and_flush("not posts")
self.posts = None
return None
except Exception as e:
print_and_flush("cannot found _5pcq")
logging.info(e)
self.posts = None
return None
while True:
self.index += 1
if self.index > self.limit:
self.posts = None
return None
if self.index >= len(self.posts):
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
if self.index >= len(self.posts):
if self.load_more_posts() is False:
self.posts = None
return None
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
print(self.posts[self.index - 1].get_attribute("href"), flush=True)
if self.posts[self.index - 1].get_attribute("href") in self.url_set:
continue
print("number of posts: ", self.index, '/', str(len(self.posts)), flush=True)
self.current_url = self.posts[self.index - 1].get_attribute('href')
return self.posts[self.index - 1]
def load_more_posts(self):
# previous_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# previous_posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
previous_posts = self.find_posts()
for i in range(0, 4):
print_and_flush("Try load more")
body = self.driver.find_element_by_tag_name("body")
body.send_keys(Keys.NULL)
body.send_keys(Keys.END)
wait(4)
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
present_posts = self.find_posts()
if len(previous_posts) != len(present_posts):
wait(2)
self.reload_count = 0
return True
for i in range(0, 4):
print_and_flush("Try load more")
body = self.driver.find_element_by_tag_name("body")
for j in range(0, 2):
body.send_keys(Keys.PAGE_UP)
wait(0.1)
for j in range(0, 15):
body.send_keys(Keys.PAGE_DOWN)
wait(0.1)
wait(4)
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
present_posts = self.find_posts()
if len(previous_posts) != len(present_posts):
wait(2)
self.reload_count = 0
return True
for i in range(0, 10):
print_and_flush("Try load more")
self.driver.execute_script("window.scrollBy(0, 800)")
wait(4)
# present_posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
present_posts = self.find_posts()
if len(previous_posts) != len(present_posts):
wait(2)
self.reload_count = 0
return True
if self.reload_count < 8:
print_and_flush("index reload")
self.reload_count += 1
self.index -= 1 if self.index > 0 else 0
position = self.driver.get_window_position()
size = self.driver.get_window_size()
self.driver.maximize_window()
self.driver.set_window_size(size['width'], size["height"])
self.driver.set_window_position(position['x'], position['y'])
return True
if self.reload_count < 15:
print_and_flush("refresh")
self.driver.refresh()
wait(5)
self.index = 0
self.reload_count += 1
return True
return False
def is_earlier(self, time_date):
return time_date < self.begin_date
def is_late(self, time_date):
return time_date > self.end_date
def set_date(self, begin_date, end_date):
self.set_begin_date(begin_date)
self.set_end_date(end_date)
def set_end_date(self, end_date):
if type(end_date) == str:
self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
self.end_date = end_date
else:
self.end_date = datetime.datetime.today()
self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
self.end_date += datetime.timedelta(days=1)
def set_begin_date(self, begin_date):
if type(begin_date) == str:
self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
self.begin_date = begin_date
else:
self.begin_date = datetime.datetime.today()
self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month,
day=self.begin_date.day)
def crawling_ok(self):
self.url_set.add(self.current_url)
def init(self):
self.index = 0
self.posts = None
self.url_set.clear()
def find_posts(self):
try:
divs = self.driver.find_elements_by_xpath("//div[@class='_1dwg']")
except:
return None
# return [div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']") for div in divs]
posts = list()
for div in divs:
try:
posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
except:
pass
return posts
def find_posts_wait(self):
try:
divs = find_elements_by_xpath(self.driver, "//div[@class='_1dwg']", 30)
except:
return None
# return [div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']") for div in divs]
posts = list()
for div in divs:
try:
posts.append(div.find_element_by_css_selector("span[class='fsm fwn fcg']>a[class='_5pcq']"))
except:
pass
return posts
class FacebookMainCrawler:
def __init__(self):
self.page_crawler = FacebookPageCrawler()
self.body_crawler = FacebookBodyCrawler()
self.reply_crawler = FacebookReplyCrawler()
self.send_to_db = SendtoDB()
self.crawl_init = FacebookInit()
self.browser = Browser()
self.driver = None
self.keyword_id = None
self.url = None
def set_driver(self, driver):
self.page_crawler.set_driver(driver)
self.body_crawler.set_driver(driver)
self.reply_crawler.set_driver(driver)
self.driver = driver
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all_current_url(self, backup_set=None):
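# Walks every post link returned by the page crawler (by hashtag or by user,
# depending on the keyword settings), opens each permalink in a background tab,
# crawls the body and its replies there, and always switches back to the main tab,
# even when a single post fails. backup_set lets a restarted browser skip URLs
# that were already crawled before the previous driver died.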
self.page_crawler.init()
if backup_set:
self.page_crawler.url_set = backup_set.copy()
while True:
post = self.page_crawler.next_post_by_tag() if self.crawl_init.is_hashtag() \
else self.page_crawler.next_post_by_user()
if post is None:
break
self.click_new_tab(post)
self.control_tab()
self.switch_new_tab()
wait(5)
body = self.driver.find_element_by_tag_name('body')
self.click_element(body)
self.click_element(body)
try:
self.page_crawler.crawling_ok()
body_info = self.crawl_body()
print("body : ok", flush=True)
self.crawl_reply(body_info)
print("reply : ok", flush=True)
except WebDriverException as ee:
logging.info(ee)
print_and_flush("fail")
# raise WebDriverException
except Exception as e:
print_and_flush("fail")
logging.info(e)
finally:
self.switch_main_tab()
def crawl_body(self):
self.body_crawler.set_driver(self.driver)
content = self.body_crawler.get_content()
content["keyword_id"] = self.keyword_id
# print_and_flush(content["article_url"])
self.send_to_db.delete_url(content['article_url'])
self.send_to_db.send_body(content)
return {"article_url": content["article_url"], "platform_id": content["platform_id"]}
def crawl_reply(self, body_info):
self.reply_crawler.set_driver(self.driver)
self.reply_crawler.set_div()
if self.reply_crawler.has_reply():
self.reply_crawler.crawl_all()
contents = self.reply_crawler.get_content()
for content in contents:
content.update(body_info)
self.send_to_db.send_reply(contents)
def start(self):
self.crawl_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
self.init_browser(browser)
self.init_keyword_id(keyword_id)
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
def init_browser(self, browser):
self.set_driver(self.browser.get_new_driver(browser))
def init_keyword_id(self, keyword_id):
self.keyword_id = int(keyword_id) if type(keyword_id) != int else keyword_id
self.crawl_init.get_keyword_parameters(keyword_id)
self.crawl_init.disconnect()
def init_db(self, db_num):
self.send_to_db.set_db(db_num)
def init_before_day(self, before_day):
self.crawl_init.set_before_day(before_day)
def init_until_page(self, until_page):
self.crawl_init.set_until_page(until_page)
self.page_crawler.set_limit(self.crawl_init.until_page)
def set_main_window_handler(self, window_handler):
self.main_window_handler = window_handler
def crawl_start(self):
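# Outer crawl loop: build the search URLs from the keyword parameters, open each one,
# log in if Facebook asks for it, restrict posts to the configured date window and
# crawl them. Any exception restarts the browser and resumes the same URL using the
# backed-up set of already-crawled post URLs. The whole pass repeats while the keyword
# is flagged as realtime.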
real_time = True
while real_time:
print_and_flush("Crawler Start")
url_list = self.crawl_init.make_url()
i = 0
backup_set = set()
while i < len(url_list):
try:
self.set_main_window_handler(self.driver.window_handles[0])
print_and_flush(url_list[i] + "\n")
self.driver.get(url_list[i])
wait(5)
self.facebook_login()
body = self.driver.find_element_by_tag_name('body')
self.click_element(body)
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
end_date=self.crawl_init.get_end_day())
self.crawl_all_current_url(backup_set)
i += 1
backup_set.clear()
except Exception as e:
logging.info(e)
backup_set = self.page_crawler.url_set.copy()
self.driver.quit()
self.set_driver(self.browser.new_browser())
wait(5)
real_time = self.crawl_init.is_realtime()
print_and_flush("Finished Crawling :)")
self.send_to_db.close()
self.driver.quit()
def facebook_login(self):
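# Fills in the login form only when the #email / #pass fields are present;
# if they cannot be found within 15 seconds the session is assumed to be
# logged in already and the method returns silently.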
try:
element_email = find_element_by_css_selector(self.driver, '#email', 15)
element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
except:
return
email = 'concepters22@gmail.com'
password = 'zjstpqxjtm'
element_email.send_keys(email)
element_pwd.send_keys(password)
label = self.driver.find_element_by_css_selector('#loginbutton')
element_input = label.find_element_by_xpath('input')
element_input.send_keys(Keys.NULL)
element_input.send_keys(Keys.ENTER)
wait(5)
def click_new_tab(self, element):
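# Ctrl+Enter on the post link opens it in a background tab; control_tab() then
# moves browser focus to that tab, while switch_new_tab()/switch_main_tab()
# move the WebDriver context to the new tab and back again.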
#ac = ActionChains(self.driver)
#ac.key_down(Keys.CONTROL).move_to_element(element).click().key_up(Keys.CONTROL).perform()
element.send_keys(Keys.NULL)
element.send_keys(Keys.CONTROL + Keys.ENTER)
wait(3)
def switch_new_tab(self):
self.driver.switch_to_window(self.driver.window_handles[1])
def switch_main_tab(self):
while len(self.driver.window_handles) > 1:
self.driver.switch_to_window(self.driver.window_handles[1])
self.driver.close()
self.driver.switch_to_window(self.main_window_handler)
def click_element(self, element):
ac = ActionChains(self.driver)
# ac.move_to_element_with_offset(element, 0, 0).click().perform()
ac.move_to_element(element).click().perform()
wait(4)
def control_tab(self):
ac = ActionChains(self.driver)
ac.key_down(Keys.CONTROL).key_down(Keys.TAB).perform()
wait(2)

View File

View File

@@ -0,0 +1,541 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import re
import datetime
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_xpath
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
from base.baseclasses import Browser
from selenium.webdriver.common.action_chains import ActionChains
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
class InstaInit(CrawlInit):
def __init__(self, before_day=0):
super().__init__(before_day)
self.urls = dict()
self.urls[9] = insta_tag_url
self.urls[10] = insta_url
def split_searches(self):
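# The keyword string is a comma-separated list. Plain account searches
# (platform 10) are only stripped of whitespace, while hashtag searches are
# run through self.utf8 (inherited from CrawlInit) before being appended to
# the explore/tags URL.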
search = self.searches()
splited_list = search.split(',')
trimmed_list = list()
if self.platform() == 10:
for x in splited_list:
trimmed_list.append(x.strip())
else:
for x in splited_list:
trimmed_list.append(self.utf8(x))
return trimmed_list
def make_url(self):
urls = list()
for x in self.split_searches():
url = self.urls[self.platform()] + x
urls.append(url)
return urls
def get_begin_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
result += datetime.timedelta(days=self.before_day)
return result
else:
return self.start_day()
def get_end_day(self):
if self.is_realtime():
date_now = datetime.datetime.now()
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
return result
else:
return self.end_day()
class InstaBodyCrawler:
def __init__(self, driver=None):
self.driver = driver
self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
def set_driver(self, driver):
self.driver = driver
def set_article(self, article=None):
if article is None:
try:
self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
except Exception as e:
print_and_flush(e)
raise Exception
else:
self.article = article
def find_article_url(self):
a = self.article.find_element_by_xpath("div/section/a")
return a.get_attribute("href")
def find_article_profileurl(self):
img = self.article.find_element_by_xpath("header/a/img[@src]")
return img.get_attribute("src")
def find_article_nickname(self):
a = self.article.find_element_by_xpath("header/div/a")
return a.text
def find_article_date(self):
el_time = self.article.find_element_by_xpath("div/section/a/time")
str_time = el_time.get_attribute("datetime")
m = self.re_date.search(str_time)
if m is None:
return "0000-00-00 00:00:00"
else:
return m.group(1) + " " + m.group(2)
def find_article_data(self):
ul = self.article.find_element_by_xpath("div/ul")
try:
li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
span = li.find_element_by_xpath("h1/span")
return span.text
except:
return ""
def find_article_id(self):
return self.find_platform_id()
def find_platform_name(self):
return 'instagram'
def find_article_form(self):
return 'body'
def find_platform_id(self):
a = self.article.find_element_by_xpath("header/div/a")
if a:
href = a.get_attribute("href")
str_id = href.replace(insta_url, "").replace("/", "")
return str_id
else:
return None
def find_like_num(self):
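# Reads the like counter of the post: "1.2k" / "3m" style suffixes are expanded
# to absolute numbers. When no counter span exists it falls back to counting the
# linked names under the post, or to a 0/1 heuristic based on the first span.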
div = self.article.find_element_by_xpath("div/section/div[@data-reactid]")
try:
span = div.find_element_by_css_selector("span[data-reactid$='.1'")
str_num = span.text
if str_num[-1] == 'm':
num = float(str_num[0:-1]) * 1000000
elif str_num[-1] == 'k':
num = float(str_num[0:-1]) * 1000
else:
num = int(str_num)
return str(num)
except:
a_list = div.find_elements_by_tag_name("a")
if len(a_list) > 1:
return str(len(a_list))
else:
span = div.find_element_by_xpath("span[1]")
if len(span.text.strip()) < 1:
return str(1)
else:
return str(0)
def find_reply_num(self):
ul = self.article.find_element_by_xpath("div/ul")
lis = ul.find_elements_by_tag_name("li")
if len(lis) < 2:
return "0"
try:
li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
span = li.find_element_by_xpath("button/span[2]")
str_num = span.text.replace(",", "")
return str_num
except:
return str(len(lis) - 1)
def get_content(self):
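# Assembles the dict handed to SendtoDB.send_body(): the reply count is stored
# in "article_order" and the like count in "article_hit", and both are omitted
# when they are zero.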
content = dict()
content["article_id"] = self.find_article_id()
content["platform_id"] = self.find_platform_id()
content["article_url"] = self.find_article_url()
content["article_profileurl"] = self.find_article_profileurl()
content["article_nickname"] = self.find_article_nickname()
content["platform_name"] = self.find_platform_name()
content["article_date"] = self.find_article_date()
content["article_data"] = self.find_article_data()
content["article_form"] = 'body'
content["platform_form"] = 'post'
reply_num = self.find_reply_num()
if int(reply_num) > 0:
content["article_order"] = int(reply_num)
like_num = self.find_like_num()
if int(float(like_num)) > 0:
content["article_hit"] = int(float(like_num))
return content
def find_platform_title(self):
pass
def find_article_title(self):
pass
class InstaReplyCrawler:
def __init__(self, driver=None, article=None):
self.driver = driver
self.activity = article
self.reply_list = list()
def find_init(self):
self.reply_list.clear()
def set_driver(self, driver):
self.driver = driver
def set_article(self, article=None):
if article is None:
try:
self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
except Exception as e:
print_and_flush(e)
raise Exception
else:
self.article = article
def has_more(self, ul):
try:
li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
return True
except Exception as e:
return False
def read_more_reply(self, ul):
try:
button = ul.find_element_by_css_selector("li[data-reactid$='.1']>button")
enter_element(button)
except Exception as e:
print_and_flush(e)
def read_all_reply(self, ul):
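# Clicks the "load more comments" button (the li whose data-reactid ends in '.1')
# up to ten times so that the full comment list is rendered before it is scraped.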
for i in range(0, 10):
if self.has_more(ul):
self.read_more_reply(ul)
else:
break
def get_reply_ul(self):
ul = self.article.find_element_by_xpath("div/ul")
return ul
def has_reply(self, ul):
try:
lis = ul.find_elements_by_css_selector("li[data-reactid*='comment']")
if len(lis) > 0:
return True
except:
return False
return False
def crawl_all(self):
self.find_init()
self.set_article()
try:
ul = self.get_reply_ul()
if self.has_reply(ul):
self.read_all_reply(ul)
self.crawl_reply(ul)
except Exception as e:
print_and_flush(e)
def crawl_reply(self, ul):
article_data = self.find_article_data(ul)
article_id = self.find_article_id(ul)
if len(article_data) != len(article_id):
print_and_flush("article_data != article_id")
for i in range(0, len(article_id)):
content = dict()
content["article_data"] = article_data[i]
content["article_id"] = article_id[i]
content["article_nickname"] = article_id[i]
content["platform_name"] = "instagram"
content["platform_form"] = "post"
content["article_form"] = 'reply'
content["article_order"] = i
self.reply_list.append(content)
def get_content(self):
return self.reply_list
def find_article_id(self, ul):
id_list = list()
a_list = ul.find_elements_by_xpath("li/a")
for i in a_list:
id_list.append(i.text)
return id_list
def find_article_profileurl(self, ul):
pass
def find_article_nickname(self, ul):
return self.find_article_id(ul)
def find_article_data(self, ul):
data_list = list()
span_list = ul.find_elements_by_css_selector("li[data-reactid*='comment']>span")
for i in span_list:
data_list.append(i.text)
return data_list
def find_article_url(self, ul):
pass
def find_platform_id(self, ul):
pass
def find_article_form(self, ul=None):
return 'reply'
def find_platform_name(self, ul=None):
return 'instagram'
def find_platform_form(self, ul=None):
return 'post'
def click_element(self, element):
ac = ActionChains(self.driver)
ac.move_to_element_with_offset(element, 0, 0).click().perform()
wait(2)
class InstaPageCrawler:
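# Steps through a tag or profile page one post at a time: has_first_page() opens
# the first thumbnail of the "most recent" section, and has_next()/move_next()
# follow the right pagination arrow inside the post overlay. url_set remembers
# permalinks that were already crawled so a restarted browser can skip them.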
def __init__(self, driver=None, begin_date=None, end_date=None):
self.driver = driver
self.url_set = set()
self.begin_date = begin_date
self.end_date = end_date
self.re_date = re.compile("^([\\d]{4}-[\\d]{2}-[\\d]{2}).([\\d]{2}:[\\d]{2}:[\\d]{2})")
def set_driver(self, driver):
self.driver = driver
def find_article_url(self):
a = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article/div/section/a', 60)
return a.get_attribute("href")
def init(self):
self.url_set.clear()
def set_date(self, begin_date, end_date):
self.set_begin_date(begin_date)
self.set_end_date(end_date)
def set_end_date(self, end_date):
if type(end_date) == str:
self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
elif type(end_date) == datetime.datetime or type(end_date) == datetime.date:
self.end_date = end_date
else:
self.end_date = datetime.datetime.today()
self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
self.end_date += datetime.timedelta(days=1)
def set_begin_date(self, begin_date):
if type(begin_date) == str:
self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
elif type(begin_date) == datetime.datetime or type(begin_date) == datetime.date:
self.begin_date = begin_date
else:
self.begin_date = datetime.datetime.today()
self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)
def has_next(self):
try:
a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
return True
except:
return False
def move_next(self):
try:
a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
enter_element(a)
return True
except:
return False
def has_first_page(self):
try:
a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
enter_element(a)
return True
except:
return False
def crawling_ok(self, url):
self.url_set.add(url)
def is_earlier(self, time_date):
return time_date < self.begin_date
def is_late(self, time_date):
return time_date > self.end_date
def find_article_date(self):
el_time = find_element_by_xpath(self.driver, "/html/body/div/div/div/div/article/div/section/a/time", 60)
str_time = el_time.get_attribute("datetime")
m = self.re_date.search(str_time)
if m is None:
return "0000-00-00 00:00:00"
else:
return m.group(1) + " " + m.group(2)
class InstaMainCrawler:
def __init__(self):
self.page_crawler = InstaPageCrawler()
self.body_crawler = InstaBodyCrawler()
self.reply_crawler = InstaReplyCrawler()
self.send_to_db = SendtoDB()
self.browser = Browser()
self.crawl_init = InstaInit()
self.driver = None
def set_driver(self, driver):
self.page_crawler.set_driver(driver)
self.body_crawler.set_driver(driver)
self.reply_crawler.set_driver(driver)
self.driver = driver
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all(self, backup_set=None):
self.page_crawler.init()
if backup_set:
self.page_crawler.url_set = backup_set.copy()
if not self.page_crawler.has_first_page():
return
while True:
str_date = self.page_crawler.find_article_date()
date_val = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
print_and_flush(str_date)
if self.page_crawler.find_article_url() in self.page_crawler.url_set:
if self.page_crawler.has_next():
self.page_crawler.move_next()
continue
else:
break
#if self.page_crawler.is_earlier(date_val.date()):
if self.page_crawler.is_late(date_val):
if self.page_crawler.has_next():
self.page_crawler.move_next()
continue
else:
break
#if self.page_crawler.is_late(date_val.date()):
if self.page_crawler.is_earlier(date_val):
break
try:
body_content = self.crawl_body()
self.crawl_reply(body_content)
self.page_crawler.url_set.add(body_content["article_url"])
print_and_flush("ok")
except Exception as e:
print_and_flush('fail')
print_and_flush(e)
if self.page_crawler.has_next():
self.page_crawler.move_next()
else:
break
def crawl_body(self):
self.body_crawler.set_driver(self.driver)
self.body_crawler.set_article()
content = self.body_crawler.get_content()
content["keyword_id"] = self.keyword_id
print_and_flush(content["article_url"])
self.send_to_db.delete_url(content['article_url'])
self.send_to_db.send_body(content)
return content
def crawl_reply(self, body_content):
self.reply_crawler.set_driver(self.driver)
self.reply_crawler.crawl_all()
content_list = self.reply_crawler.get_content()
if content_list:
for i in content_list:
i['article_url'] = body_content['article_url']
i['platform_id'] = body_content['platform_id']
self.send_to_db.send_reply(content_list)
def start(self):
self.crawler_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
self.init_browser(browser)
self.init_keyword_id(keyword_id)
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
def init_browser(self, browser):
self.set_driver(self.browser.get_new_driver(browser))
def init_keyword_id(self, keyword_id):
if type(keyword_id) != int:
self.keyword_id = int(keyword_id)
else:
self.keyword_id = keyword_id
self.crawl_init.get_keyword_parameters(keyword_id)
self.crawl_init.disconnect()
def init_db(self, db_num):
self.send_to_db.set_db(db_num)
def init_before_day(self, before_day):
self.crawl_init.set_before_day(before_day)
def init_until_page(self, until_page):
self.crawl_init.set_until_page(until_page)
def crawler_start(self):
real_time = True
while real_time:
print_and_flush("Crawling Start")
url_list = self.crawl_init.make_url()
i = 0
backup_set = set()
while i < len(url_list):
try:
print_and_flush(url_list[i] + "\n")
self.driver.get(url_list[i])
wait(3)
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
end_date=self.crawl_init.get_end_day())
if self.page_crawler.has_first_page():
self.crawl_all(backup_set)
i += 1
backup_set.clear()
except Exception as e:
print_and_flush(e)
backup_set = self.page_crawler.url_set.copy()
self.driver.quit()
self.set_driver(self.browser.new_browser())
wait(5)
real_time = self.crawl_init.is_realtime()
print_and_flush("Finished Crawling :)")
self.send_to_db.close()
self.driver.quit()

View File

View File

@@ -1,28 +1,34 @@
#-*- coding: utf-8 -*-
__author__ = 'cococo'
import sys
import re
import datetime
import json
import time
import logging
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import sys
import re
import datetime
import json
import os
import time
from navercrawl import wait
from navercrawl import print_and_flush
from navercrawl import SendtoDB
from navercrawl import Browser
from navercrawl import CrawlInit
from selenium.common.exceptions import WebDriverException
from base.baseclasses import wait
from base.baseclasses import print_and_flush
from base.baseclasses import SendtoDB
from base.baseclasses import Browser
from base.baseclasses import CrawlInit
__author__ = 'cococo'
kakaostory_url = 'https://story.kakao.com/'
kakaostory_channel_url = 'https://story.kakao.com/ch/'
logging.basicConfig(level=logging.INFO,
format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
class KakaoBodyCrawler:
def __init__(self, driver=None):
self.driver = driver
@@ -174,10 +180,8 @@ class KakaoBodyCrawler:
except WebDriverException:
raise WebDriverException
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print_and_flush(e)
logging.info(e)
# print_and_flush(e)
feelings = dict()
feelings['data'] = data
feelings['count'] = len(data)
@@ -232,10 +236,8 @@ class KakaoBodyCrawler:
except WebDriverException:
raise WebDriverException
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print_and_flush(e)
logging.info(e)
# print_and_flush(e)
feelings = dict()
feelings['data'] = data
feelings['count'] = len(data)
@@ -346,10 +348,8 @@ class KakaoBodyCrawler:
except WebDriverException:
raise WebDriverException
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print_and_flush(e)
logging.info(e)
# print_and_flush(e)
shares = dict()
shares['data'] = data
shares['count'] = len(data)
@@ -400,10 +400,8 @@ class KakaoBodyCrawler:
except WebDriverException:
raise WebDriverException
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print_and_flush(e)
logging.info(e)
# print_and_flush(e)
shares = dict()
shares['data'] = data
shares['count'] = len(data)
@@ -553,10 +551,8 @@ class KakaoReplyCrawler_backup:
except WebDriverException:
raise WebDriverException
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print_and_flush(e)
logging.info(e)
# print_and_flush(e)
def find_article_id(self, li):
a = li.find_element_by_xpath("div[@class='pf']/a")
@@ -753,10 +749,8 @@ class KakaoReplyCrawler:
except WebDriverException:
raise WebDriverException
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print_and_flush(e)
logging.info(e)
# print_and_flush(e)
def find_article_id(self, ul):
a_list = ul.find_elements_by_xpath("li/div[@class='pf']/a")
@@ -1026,7 +1020,7 @@ class KakaoPageCrawler:
def load_more_activities(self):
previous_activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
for i in range(0, 5):
for i in range(0, 4):
print_and_flush("Try load more")
body = self.driver.find_element_by_tag_name("body")
body.send_keys(Keys.NULL)
@@ -1037,13 +1031,13 @@ class KakaoPageCrawler:
wait(2)
self.reload_count = 0
return True
for i in range(0, 5):
for i in range(0, 4):
print_and_flush("Try load more")
body = self.driver.find_element_by_tag_name("body")
for j in range(0, 3):
for j in range(0, 2):
body.send_keys(Keys.PAGE_UP)
wait(0.1)
for j in range(0, 50):
for j in range(0, 15):
body.send_keys(Keys.PAGE_DOWN)
wait(0.1)
wait(4)
@@ -1061,10 +1055,10 @@ class KakaoPageCrawler:
wait(2)
self.reload_count = 0
return True
if self.reload_count < 10:
if self.reload_count < 8:
print_and_flush("index reload")
self.reload_count += 1
self.index //= 2
self.index -= 1 if self.index > 0 else 0
position = self.driver.get_window_position()
size = self.driver.get_window_size()
self.driver.maximize_window()
@@ -1141,70 +1135,6 @@ class KakaoPageCrawler:
return temp_date
class KakaoMainCrawler:
def __init__(self):
self.page_crawler = KakaoPageCrawler()
self.body_crawler = KakaoBodyCrawler()
self.reply_crawler = KakaoReplyCrawler()
self.send_to_db = SendtoDB()
self.driver = None
self.browser = None
def set_driver(self, driver):
self.page_crawler.set_driver(driver)
self.body_crawler.set_driver(driver)
self.reply_crawler.set_driver(driver)
self.driver = driver
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all_current_url(self, backup_set=None):
self.page_crawler.init()
if backup_set:
self.page_crawler.activity_data_model_set = backup_set.copy()
while True:
activity = self.page_crawler.next_activity()
if activity is None:
break
try:
self.crawl_body(activity)
self.crawl_reply(activity)
self.page_crawler.crawling_ok()
print_and_flush("ok")
except WebDriverException as ee:
print_and_flush(ee)
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print_and_flush("fail")
raise WebDriverException
except Exception as e:
print_and_flush("failed")
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print_and_flush(e)
def crawl_body(self, activity):
# print_and_flush("start body crawl")
self.body_crawler.set_driver(self.driver)
self.body_crawler.set_activity(activity)
content = self.body_crawler.get_content()
content["keyword_id"] = self.keyword_id
print_and_flush(content["article_url"])
self.send_to_db.delete_url(content['article_url'])
self.send_to_db.send_body(content)
def crawl_reply(self, activity):
# print_and_flush("start reply crawl")
self.reply_crawler.set_driver(self.driver)
self.reply_crawler.set_activity(activity)
if self.reply_crawler.has_reply():
self.reply_crawler.crawl_all()
self.send_to_db.send_reply(self.reply_crawler.get_content())
class KakaoInit(CrawlInit):
def __init__(self, before_day=0):
super().__init__(before_day)
@@ -1250,6 +1180,128 @@ class KakaoInit(CrawlInit):
return self.end_day()
class KakaoMainCrawler:
def __init__(self):
self.page_crawler = KakaoPageCrawler()
self.body_crawler = KakaoBodyCrawler()
self.reply_crawler = KakaoReplyCrawler()
self.send_to_db = SendtoDB()
self.crawl_init = KakaoInit()
self.browser = Browser()
self.driver = None
def set_driver(self, driver):
self.page_crawler.set_driver(driver)
self.body_crawler.set_driver(driver)
self.reply_crawler.set_driver(driver)
self.driver = driver
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def crawl_all_current_url(self, backup_set=None):
self.page_crawler.init()
if backup_set:
self.page_crawler.activity_data_model_set = backup_set.copy()
while True:
activity = self.page_crawler.next_activity()
if activity is None:
break
try:
self.crawl_body(activity)
self.crawl_reply(activity)
self.page_crawler.crawling_ok()
print_and_flush("ok")
except WebDriverException as ee:
logging.info(ee)
# print_and_flush(e)
print_and_flush("fail")
raise WebDriverException
except Exception as e:
print_and_flush("failed")
logging.info(e)
# print_and_flush(e)
def crawl_body(self, activity):
# print_and_flush("start body crawl")
self.body_crawler.set_driver(self.driver)
self.body_crawler.set_activity(activity)
content = self.body_crawler.get_content()
content["keyword_id"] = self.keyword_id
print_and_flush(content["article_url"])
self.send_to_db.delete_url(content['article_url'])
self.send_to_db.send_body(content)
def crawl_reply(self, activity):
# print_and_flush("start reply crawl")
self.reply_crawler.set_driver(self.driver)
self.reply_crawler.set_activity(activity)
if self.reply_crawler.has_reply():
self.reply_crawler.crawl_all()
self.send_to_db.send_reply(self.reply_crawler.get_content())
def start(self):
self.crawl_start()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
self.init_browser(browser)
self.init_keyword_id(keyword_id)
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
def init_browser(self, browser):
self.set_driver(self.browser.get_new_driver(browser))
def init_keyword_id(self, keyword_id):
if type(keyword_id) != int:
self.keyword_id = int(keyword_id)
else:
self.keyword_id = keyword_id
self.crawl_init.get_keyword_parameters(keyword_id)
self.crawl_init.disconnect()
def init_db(self, db_num):
self.send_to_db.set_db(db_num)
def init_before_day(self, before_day):
self.crawl_init.set_before_day(before_day)
def init_until_page(self, until_page):
self.crawl_init.set_until_page(until_page)
def crawl_start(self):
real_time = True
while real_time:
print_and_flush("Crawler Start")
url_list = self.crawl_init.make_url()
i = 0
backup_set = set()
while i < len(url_list):
try:
print_and_flush(url_list[i] + "\n")
self.driver.get(url_list[i])
wait(3)
self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
end_date=self.crawl_init.get_end_day())
self.crawl_all_current_url(backup_set)
i += 1
backup_set.clear()
except Exception as e:
logging.info(e)
# print_and_flush(e)
backup_set = self.page_crawler.activity_data_model_set.copy()
self.driver.quit()
self.set_driver(self.browser.new_browser())
# kakao_main.driver.implicitly_wait(5)
wait(5)
real_time = self.crawl_init.is_realtime()
print_and_flush("Finished Crawling :)")
# kakao_main.driver.quit()
self.send_to_db.close()
self.driver.quit()
if __name__ == '__main__':
"""
argv:
@@ -1293,10 +1345,8 @@ if __name__ == '__main__':
i += 1
backup_set.clear()
except Exception as e:
print_and_flush(e)
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
logging.info(e)
# print_and_flush(e)
backup_set = kakao_main.page_crawler.activity_data_model_set.copy()
kakao_main.set_driver(browser.new_browser())
# kakao_main.driver.implicitly_wait(5)

View File

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,92 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import sys
from insta import instacrawl
from kakao import kakaocrawl
from naver import navercrawl
from facebook import facebookcrawl
from facebook import facebookcrawlbs
from base.baseclasses import print_and_flush
class WebBasedCrawler:
def __init__(self, browser=None, platform=None, keyword_id=None, db_num=None, before_day=None, until_page=None):
self.set_arguments(browser, platform, keyword_id, db_num, before_day, until_page)
def set_arguments(self, browser, platform, keyword_id, db_num, before_day, until_page):
if platform == "instagram":
self.crawler = instacrawl.InstaMainCrawler()
elif platform == "kakaochannel":
self.crawler = kakaocrawl.KakaoMainCrawler()
elif platform == "navercafe":
self.crawler = navercrawl.NaverCafeMainAreaCrawler()
elif platform == "facebook":
self.crawler = facebookcrawlbs.FacebookMainCrawler()
else:
self.crawler = None
raise Exception
self.crawler.set_arguments(browser, keyword_id, db_num, before_day, until_page)
def start(self):
self.crawler.start()
browser_opt = ('chrome', "ie", "opera", "firefox")
platform_opt = ('instagram', 'kakaochannel', 'navercafe', 'facebook')
def get_browser_info(platform_, file_name="browser.txt"):
if sys.platform == 'win32':
options = {'default': 'ie'}
else:
options = {'default': 'firefox'}
try:
with open(file_name, 'r') as f:
for line in f:
if line.startswith("#"):
continue
elif len(line.strip()) < 1:
continue
else:
platform, browser = line.split("=")
platform = platform.strip()
browser = browser.strip()
if (platform not in options.keys() and platform not in platform_opt) or browser not in browser_opt:
print_and_flush("check option: " + line)
else:
options[platform] = browser
except FileNotFoundError:
print_and_flush("browser.txt file is not exists")
print_and_flush("use " + options['default'] + " browser")
except Exception as e:
print_and_flush(e)
print_and_flush("Unknown error occurs")
exit(1)
return options.get(platform_, options['default'])
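# A minimal sketch of the browser.txt format parsed above (values are only an
# example): one "platform=browser" pair per line, "#" starts a comment and blank
# lines are skipped.
#
#   # which driver to use per platform
#   default=chrome
#   instagram=firefox
#   facebook=chrome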
if __name__ == '__main__':
"""
sys.argv[0] webbasedcrawler.py
sys.argv[1] instagram, kakaochannel, navercafe, facebook
sys.argv[2] keyword_id
sys.argv[3] data group
sys.argv[4] start_day
sys.argv[5] until_page
"""
if len(sys.argv) == 6:
print_and_flush("Python Crawling Executed")
else:
print_and_flush("Check Argumenets!")
exit(1)
crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2],
sys.argv[3], sys.argv[4], sys.argv[5])
crawler.start()
print_and_flush("Finished Crawling :)")
exit(0)