1 line
42 KiB
Python
1 line
42 KiB
Python
#-*- coding: utf-8 -*-
|
|
__author__ = 'cococo'
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.action_chains import ActionChains
|
|
import threading
|
|
from time import localtime, strftime
|
|
import time
|
|
import os
|
|
import sys
|
|
import datetime
|
|
import psutil
|
|
import re
|
|
|
|
def fcntwait(n):
    """Block the calling thread for *n* seconds (thread target for wait())."""
    time.sleep(n)
|
|
|
|
def wait(n):
    """Block the caller for *n* seconds.

    FIX: the original spawned a thread that ran time.sleep and then
    immediately joined it — semantically identical to sleeping in the
    calling thread, but paying thread-creation overhead on every call.
    """
    time.sleep(n)
|
|
|
|
def insert_log(msg):
    """Append *msg*, prefixed with a timestamp, to a per-day/per-process log.

    The log file name is ``YYYY_MM_DD_<pid>.log`` in the current directory.
    """
    day = strftime("%Y_%m_%d", localtime())
    filename = "{}_{}.log".format(day, os.getpid())
    stamped = "[" + strftime("%Y_%m_%d %H:%M:%S", localtime()) + "] " + msg
    with open(filename, "a") as log_file:
        log_file.write(stamped)
        log_file.flush()
|
|
|
|
def print_and_flush(string):
    """Print *string* and flush stdout so output is not lost to buffering."""
    print(string, flush=True)
|
|
|
|
class Asistance:
    """Helpers that pull the club id and the date range out of cafe
    search URLs (patterns match the query parameters produced by
    ``ArticleSearchList.nhn`` URLs)."""

    def __init__(self):
        # search.clubid=<digits>
        self.re_clubid = re.compile(r"search\.clubid=(\d+)")
        # search.searchdate=<start><end> — two ISO dates run together
        # with no separator, as built by the URL helpers in this file.
        self.re_date = re.compile(
            r"search\.searchdate=(\d{4}-\d{2}-\d{2})(\d{4}-\d{2}-\d{2})")

    def clubid(self, url):
        """Return the club id embedded in *url*, or '' when absent."""
        match = self.re_clubid.search(url)
        return match.group(1) if match else str()

    def date(self, url):
        """Return a human-readable 'Start: ..., End: ...' string for *url*."""
        match = self.re_date.search(url)
        if match is None:
            return str("Start: ALL, End: ALL")
        return str("Start: " + match.group(1) + ", End: " + match.group(2))
|
|
|
|
class Browser:
    """Factory around selenium WebDriver creation.

    When a driver server process (chromedriver, IEDriverServer, ...) is
    already running and listening on a port (detected via psutil), attach
    to it with ``webdriver.Remote`` instead of spawning a new one.
    """

    def __init__(self, driver=None):
        self.driver = driver  # current WebDriver instance, or None
        self.info = ""        # last browser kind created: chrome/ie/opera/firefox

    def get_new_driver(self, name):
        """
        windows system:
        name = chrome, ie, opera, firefox
        default driver_exec: chromedriver.exe, IEDriverServer.exe, operadriver.exe
        linux system:
        name = chrome, opera, firefox
        default driver_exec: chromedriver, operadriver
        """
        if sys.platform == "win32":
            factories = {
                "chrome": lambda: self.new_chrome_browser(driver_exec="chromedriver.exe"),
                "ie": lambda: self.new_ie_browser(driver_exec="IEDriverServer.exe"),
                "opera": lambda: self.new_opera_browser(driver_exec="operadriver.exe"),
                "firefox": self.new_firefox_browser,
            }
        else:
            factories = {
                "chrome": lambda: self.new_chrome_browser(driver_exec="chromedriver"),
                "opera": lambda: self.new_opera_browser(driver_exec="operadriver"),
                "firefox": self.new_firefox_browser,
            }
        maker = factories.get(name)
        return maker() if maker is not None else None

    def new_chrome_browser(self, driver_exec=None):
        """Create (or attach to a running) Chrome driver."""
        self.info = "chrome"
        if driver_exec is not None:
            self.chrome_driver_path = driver_exec
            self.chrome_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.chrome_basename):
            port = self.port(self.chrome_basename)
            self.driver = webdriver.Remote(
                command_executor="http://127.0.0.1:" + port,
                desired_capabilities=webdriver.DesiredCapabilities.CHROME)
        else:
            self.driver = webdriver.Chrome(self.chrome_driver_path)
        return self.driver

    def new_ie_browser(self, driver_exec=None):
        """Create (or attach to a running) Internet Explorer driver."""
        self.info = "ie"
        if driver_exec is not None:
            self.ie_driver_path = driver_exec
            self.ie_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.ie_basename):
            port = self.port(self.ie_basename)
            self.driver = webdriver.Remote(
                "http://127.0.0.1:" + port,
                webdriver.DesiredCapabilities.INTERNETEXPLORER)
        else:
            self.driver = webdriver.Ie(self.ie_driver_path)
        return self.driver

    def new_firefox_browser(self):
        """Create a Firefox driver (no separate server binary needed)."""
        self.info = "firefox"
        self.driver = webdriver.Firefox()
        return self.driver

    def new_opera_browser(self, driver_exec=None):
        """Create (or attach to a running) Opera driver."""
        self.info = "opera"
        if driver_exec is not None:
            self.opera_driver_path = driver_exec
            self.opera_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.opera_basename):
            port = self.port(self.opera_basename)
            self.driver = webdriver.Remote(
                command_executor="http://127.0.0.1:" + port,
                desired_capabilities=webdriver.DesiredCapabilities.OPERA)
        else:
            self.driver = webdriver.Opera(
                desired_capabilities=webdriver.DesiredCapabilities.OPERA,
                executable_path=self.opera_driver_path)
        return self.driver

    def driver(self):
        # NOTE(review): dead code — the instance attribute ``self.driver``
        # assigned in __init__ shadows this method on every instance, so it
        # is never callable. Kept to preserve the class interface.
        return self.driver

    def is_server_executed(self, driver_basename):
        """True when a process named *driver_basename* has a LISTEN socket."""
        for proc in psutil.process_iter():
            if proc.name() != driver_basename:
                continue
            for conn in proc.connections():
                if conn.status == "LISTEN":
                    return True
        return False

    def port(self, driver_basename):
        """Listening port of *driver_basename* as a string; '9999' if none."""
        for proc in psutil.process_iter():
            if proc.name() != driver_basename:
                continue
            for conn in proc.connections():
                if conn.status == "LISTEN":
                    return str(conn.laddr[1])
        return str(9999)

    def new_browser(self):
        """Re-create a driver of the kind recorded in ``self.info``."""
        factories = {
            "chrome": self.new_chrome_browser,
            "ie": self.new_ie_browser,
            "opera": self.new_opera_browser,
            "firefox": self.new_firefox_browser,
        }
        maker = factories.get(self.info)
        return maker() if maker is not None else None
|
|
|
|
class NaverCafeCrawler:
    """Top-level crawler: logs into Naver and drives the per-board crawler."""

    def __init__(self):
        self.driver = None  # selenium WebDriver, injected via set_driver()
        self.main_area_crawler = NaverCafeMainAreaCrawler()

    def set_driver(self, driver):
        self.driver = driver

    def suff(self, url):
        """Navigate to *url* and give the page time to render."""
        self.driver.get(url)
        wait(2)

    def screenshot(self, filename):
        self.driver.save_screenshot(filename)

    def html(self):
        """Return the current page source."""
        return self.driver.page_source

    def savepage(self, filename):
        """Dump the current page source to *filename* as UTF-8."""
        with open(filename, 'w', encoding='UTF8') as f:
            f.write(self.html())

    def naver_login(self, id, password):
        """Log into naver.com by filling the id/pw fields and pressing Enter."""
        self.suff('http://www.naver.com')
        wait(2)
        element = self.driver.find_element_by_id('id')
        element.send_keys(id)
        element = self.driver.find_element_by_id('pw')
        element.send_keys(password)
        element.send_keys(Keys.ENTER)
        wait(3)

    def cafe_search(self, keyword):
        """Type *keyword* into the cafe search box and submit it."""
        element = self.driver.find_element_by_id('topLayerQueryInput')
        element.send_keys(keyword)
        wait(1)
        element.send_keys(Keys.ENTER)
        wait(2)

    def get_url(self):
        """Return the browser's current URL.

        BUG FIX: selenium's ``current_url`` is a property, not a method;
        the original called it (``current_url()``) which raised
        ``TypeError: 'str' object is not callable``.
        """
        return self.driver.current_url

    def click_element(self, element):
        """Click *element* via ActionChains at its top-left corner."""
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

    def enter_element(self, element):
        """Focus *element* (Keys.NULL types nothing) and press Enter."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)

    def start(self):
        """Hand the driver to the main-area crawler and crawl the board."""
        self.main_area_crawler.set_driver(self.driver)
        self.main_area_crawler.crawl_all_cafe_main()

    def close(self):
        self.driver.close()

    def quit(self):
        self.driver.quit()
|
|
|
|
class NaverCafeMainAreaCrawler:
    """Walks every page of a cafe board: opens each article once, crawls the
    body and replies, and periodically nudges the browser to free memory."""

    def __init__(self):
        self.board_crawler = NaverCafeBoardCrawler()
        self.body_crawler = NaverCafeBodyCrawler()
        self.reply_crawler = NaverCafeReplyCrawler()
        self.send_to_db = SendtoDB()
        self.browser = None  # Browser wrapper; consulted in release_memory()

    def print(self, arg):
        """Print *arg* and flush so progress shows up under redirection."""
        print(arg)
        sys.stdout.flush()

    def set_driver(self, driver):
        """Share one WebDriver with all sub-crawlers."""
        self.board_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.driver = driver

    def copy_list(self, backup_list):
        """Seed the board crawler's seen-article list from a backup list."""
        for item in backup_list:
            self.board_crawler.content_num_list.append(item)

    def crawl_all_cafe_main(self, backup_list=None):
        """Crawl every article on every page of the current board.

        *backup_list*, when given, pre-marks article numbers as already
        visited (resume support).
        """
        self.board_crawler.clear_content_num_list()
        if backup_list:
            self.copy_list(backup_list)
        has_next_table = True
        while has_next_table:
            self.print("Page number : " + str(self.board_crawler.current_page_num_by_tag()))
            self.release_memory()
            while self.board_crawler.move_next_content():
                try:
                    self.crawl_body()
                    self.crawl_reply()
                    self.print("ok")
                except Exception as e:
                    # Best-effort: log the failure and keep crawling.
                    self.print("fail")
                    self.print(e)
                self.driver.back()
                wait(1)
            has_next_table = self.board_crawler.move_next_page()

    def crawl_body(self):
        """Crawl the open article's body and store it (replacing old rows).

        NOTE(review): reads ``self.keyword_id`` — set_keyword_id() must be
        called first or this raises AttributeError.
        """
        self.body_crawler.set_driver(self.driver)
        content = self.body_crawler.get_content()
        content['keyword_id'] = self.keyword_id
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
        self.print(content['article_url'])

    def crawl_reply(self):
        """Crawl the open article's comments, if it has any."""
        self.reply_crawler.set_driver(self.driver)
        if self.reply_crawler.find_comments_element():
            self.reply_crawler.set_article_url(self.body_crawler.find_article_url())
            self.reply_crawler.crawl_all()
            self.send_to_db.send_reply(self.reply_crawler.get_content())

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def release_memory_firefox(self):
        """Force Firefox GC via about:memory, then restore the current page.

        BUG FIX: the original used identity comparisons with literals
        (``index is -1`` and ``temp_page.strip() is "1"``); ``is`` on a
        runtime-computed string is unreliably False, so the page-1 branch
        was effectively never taken. Replaced with ``==``.
        """
        index = self.driver.current_url.find("%26search.page=")
        if index == -1:
            temp_url = self.driver.current_url
        else:
            temp_url = self.driver.current_url[:index]
        temp_page = self.board_crawler.current_page_num_by_tag()
        if temp_page.strip() == "1":
            url = temp_url
        else:
            url = temp_url + "%26search.page=" + temp_page.strip()
        self.print("Release Memory Process")
        self.driver.get("about:memory")
        wait(2)
        # about:memory page hooks: minimize, GC, and cycle-collect.
        self.driver.execute_script("doMMU()")
        wait(2)
        self.driver.execute_script("doGC()")
        wait(2)
        self.driver.execute_script("doCC()")
        wait(2)
        self.driver.get(url)
        wait(2)
        print_and_flush("reloaded")

    def release_memory_others(self):
        """Reload the current page to drop accumulated DOM state."""
        temp_url = self.driver.current_url
        self.print("Release Memory Process")
        self.driver.get(temp_url)
        wait(2)
        print_and_flush("reloaded")

    def release_memory(self):
        """Every 5th page (page % 5 == 1), release browser memory."""
        page = int(self.board_crawler.current_page_num_by_tag())
        if self.browser.info == "firefox":
            if page % 5 == 1:
                self.release_memory_firefox()
        else:
            if page != 1 and page % 5 == 1:
                self.release_memory_others()

    def click_element(self, element):
        """Click *element* via ActionChains (centered move, then click)."""
        ac = ActionChains(self.driver)
        ac.move_to_element(element).click().perform()
        wait(2)

    def enter_element(self, element):
        """Focus *element* and press Enter."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)
|
|
|
|
class NaverCafeBoardCrawler:
    """Navigates a cafe board inside the 'cafe_main' iframe: opens each
    article row exactly once and steps through the pagination bar."""

    def __init__(self, driver=None):
        self.driver = driver
        # Article numbers already visited this run; consulted before
        # opening a row so re-rendered pages are not re-crawled.
        self.content_num_list = list()
        # FIX: dropped the redundant function-local ``import re`` — the
        # module already imports re at top level.
        self.re_page = re.compile("search\\.page=([\\d]+)")

    def clear_content_num_list(self):
        self.content_num_list.clear()

    def current_url(self):
        """Current URL with the cafe_main iframe focused."""
        self.driver.switch_to_default_content()
        self.driver.switch_to_frame('cafe_main')
        return self.driver.current_url

    def current_page_num_by_url(self):
        """Page number parsed from the URL; falls back to the page bar."""
        m = self.re_page.search(self.current_url())
        if m is None:
            return self.current_page_num_by_tag()
        return m.group(1)

    def current_page_num_by_tag(self):
        """Page number read from the 'on' cell of the pagination bar.

        Returns '1' (as a string) whenever the bar is missing or unreadable.
        """
        try:
            self.driver.switch_to_default_content()
            self.driver.switch_to_frame('cafe_main')
            page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']")
        except Exception:
            return str(1)
        if page_navigate is None:
            return str(1)
        for td in page_navigate.find_elements_by_tag_name('td'):
            try:
                if td.get_attribute('class') == 'on':
                    return td.text
            except Exception:
                continue
        return str(1)

    def move_next_content(self):
        """Open the next not-yet-visited article row; True if one opened."""
        self.driver.switch_to_default_content()
        self.driver.switch_to_frame('cafe_main')
        rows = self.driver.find_elements_by_css_selector("tr[align='center']")
        for row in rows:
            try:
                content_num = row.find_element_by_css_selector("span[class='m-tcol-c list-count']")
                if len(str(content_num.text).strip()) < 1:
                    continue
                if content_num.text in self.content_num_list:
                    continue
                self.content_num_list.append(content_num.text)
                link = row.find_element_by_css_selector("a[class='m-tcol-c']")
                self.enter_element(link)
                return True
            except Exception:
                # Rows without a list-count span (notices, ads) are skipped.
                pass
        return False

    def move_next_page(self):
        """Click the page link after the one marked 'on'; True on success."""
        try:
            self.driver.switch_to_default_content()
            self.driver.switch_to_frame('cafe_main')
            page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']")
        except Exception:
            return False
        if page_navigate is None:
            return False
        is_next = False
        for td in page_navigate.find_elements_by_tag_name('td'):
            if is_next:
                a = td.find_element_by_tag_name("a")
                self.enter_element(a)
                return True
            try:
                if td.get_attribute('class') == 'on':
                    is_next = True
                    continue
            except Exception:
                continue
        return False

    def set_driver(self, driver):
        self.driver = driver

    def click_element(self, element):
        """Click *element* via ActionChains at its top-left corner."""
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

    def enter_element(self, element):
        """Focus *element* and press Enter."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)
|
|
|
|
class NaverCafeBodyCrawler:
    """Extracts article fields (title, date, author, body, ...) from the
    currently open cafe article page."""

    def __init__(self, driver=None):
        self.driver = driver
        self.init_re()

    def init_re(self):
        # FIX: dots escaped so only literal 'cafe.naver.com' hosts match.
        self.re_platform_id = re.compile(r'http://cafe\.naver\.com/([0-9A-Za-z_-]+)/.+')

    def set_driver(self, driver):
        self.driver = driver

    def find_init(self):
        """Focus the 'cafe_main' iframe that hosts the article markup."""
        self.driver.switch_to_default_content()
        self.driver.switch_to_frame('cafe_main')

    def find_article_title(self):
        self.find_init()
        article_title = self.driver.find_element_by_css_selector("span[class='b m-tcol-c']")
        return article_title.text

    def find_article_date(self):
        """Return the article date normalized to 'YYYY-MM-DD HH:MM:00'.

        BUG FIX: the original called ``article_date.strip()`` and discarded
        the result (str is immutable); the stripped value is now kept.
        """
        self.find_init()
        element = self.driver.find_element_by_css_selector("td[class='m-tcol-c date']")
        article_date = str(element.text).strip()
        # '2016.01.02. 13:45' -> '2016-01-02 13:45' -> append seconds.
        return article_date.replace('. ', ' ').replace('.', '-') + ":00"

    def find_article_data(self):
        """Return the article body text."""
        self.find_init()
        article_data = self.driver.find_element_by_css_selector("div[class='tbody m-tcol-c']")
        return article_data.text

    def _author_onclick_attr(self, index, minimum):
        """Return argument *index* of the author link's onclick handler.

        The handler's comma-separated arguments carry the author id and
        nickname; returns '' when fewer than *minimum* + 1 parts exist.
        """
        self.find_init()
        element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']")
        nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']")
        parts = nick_element.get_attribute('onclick').split(',')
        if len(parts) > minimum:
            return parts[index].strip().replace("'", "")
        return str()

    def find_article_nickname(self):
        """Author nickname (4th onclick argument)."""
        return self._author_onclick_attr(3, 4)

    def find_article_id(self):
        """Author id (2nd onclick argument)."""
        return self._author_onclick_attr(1, 2)

    def find_article_hit(self):
        """View count text."""
        self.find_init()
        element = self.driver.find_element_by_css_selector("span[class='b m-tcol-c reply _rosReadcount']")
        return element.text

    def find_platform_name(self):
        return 'naver'

    def find_platform_form(self):
        return 'cafe'

    def find_article_form(self):
        return 'body'

    def find_platform_title(self):
        """Cafe title, read from the top-level document (not the iframe)."""
        self.driver.switch_to_default_content()
        element = self.driver.find_element_by_css_selector("span[class='m-tcol-p']")
        return element.text

    def find_article_url(self):
        """Permalink of the article (the 'linkUrl' anchor's text)."""
        self.find_init()
        element = self.driver.find_element_by_css_selector("a[id='linkUrl']")
        return element.text

    def find_platform_id(self):
        """Cafe id segment of the article URL, or '' when it doesn't parse."""
        m = self.re_platform_id.search(str(self.find_article_url()))
        return m.group(1) if m else str()

    def print(self):
        """Debug dump of every extracted field."""
        print("article_id = " + self.find_article_id())
        print("article_nickname = " + self.find_article_nickname())
        print("article_title = " + self.find_article_title())
        print("article_date = " + self.find_article_date())
        print("article_hit = " + self.find_article_hit())
        print("article_url = " + self.find_article_url())
        print("platform_title = " + self.find_platform_title())
        print("article_data = " + self.find_article_data())

    def get_content(self):
        """Return all article fields as a dict ready for SendtoDB."""
        content = dict()
        content["article_id"] = self.find_article_id()
        content["article_nickname"] = self.find_article_nickname()
        content["article_title"] = self.find_article_title()
        content["article_date"] = self.find_article_date()
        content["article_hit"] = self.find_article_hit()
        content["article_url"] = self.find_article_url()
        content["article_data"] = self.find_article_data()
        content["article_form"] = self.find_article_form()
        content["platform_title"] = self.find_platform_title()
        content["platform_name"] = self.find_platform_name()
        content["platform_form"] = self.find_platform_form()
        content["platform_id"] = self.find_platform_id()
        return content
|
|
|
|
class NaverCafeReplyCrawler:
    """Crawls every comment (and nested comment-on-comment) of the open
    article, across all comment pages."""

    def __init__(self, driver=None):
        self.driver = driver
        # Nickname of the most recent top-level comment; used as the
        # parent for nested replies that don't carry their own target.
        self.article_parent = str()
        self.reply_list = list()
        self.init_re()

    def init_re(self):
        # FIX: dots escaped so only literal 'cafe.naver.com' hosts match.
        self.re_platform_id = re.compile(r'http://cafe\.naver\.com/([0-9A-Za-z_-]+)/.+')

    def set_driver(self, driver):
        self.driver = driver

    def find_comments_element(self):
        """Locate the comment list; False when the article has none."""
        self.find_init()
        try:
            self.reply_elements = self.driver.find_element_by_css_selector("ul[class='cmlist']")
            return self.reply_elements is not None
        except Exception:
            return False

    def find_init(self):
        """Reset per-article state and focus the cafe_main iframe."""
        self.count = 0
        self.reply_list.clear()
        self.driver.switch_to_default_content()
        self.driver.switch_to_frame('cafe_main')

    def set_article_url(self, article_url):
        self.article_url = article_url

    def crawl_all(self):
        """Crawl comments page by page until the pagination runs out."""
        has_next_comment_page = True
        while has_next_comment_page:
            self.crawl_current_page_reply()
            has_next_comment_page = self.move_next_comment_page()

    def move_next_comment_page(self):
        """Click the comment-page link after the current (strong) one.

        Returns True when a next page was opened, False otherwise.
        """
        try:
            element = self.driver.find_element_by_css_selector("div[id='cmt_paginate']")
            flag = False
            for child in element.find_elements_by_css_selector("*"):
                # The current page is a <strong>; the next <a> after it is
                # the link to the following comment page.
                if flag is True and child.tag_name == "a":
                    self.enter_element(child)
                    wait(1)
                    self.driver.switch_to_default_content()
                    self.driver.switch_to_frame('cafe_main')
                    return True
                if child.tag_name == "strong":
                    flag = True
        except Exception as e:
            print(e)
            sys.stdout.flush()
            return False
        return False

    def crawl_current_page_reply(self):
        """Dispatch each visible <li> to the right comment handler."""
        for li in self.reply_elements.find_elements_by_tag_name('li'):
            if li.get_attribute('class') == 'reply':
                self.crawl_reply_reply(li)
            elif len(li.get_attribute('class')) < 1:
                self.crawl_reply(li)
            else:
                pass

    def find_article_url(self, li=None):
        return self.article_url

    def find_article_date(self, li):
        """Return the comment date normalized to 'YYYY-MM-DD HH:MM:00'.

        BUG FIX: the original called ``article_date.strip()`` and discarded
        the result (str is immutable); the stripped value is now kept.
        """
        element = li.find_element_by_css_selector("span[class='date m-tcol-c filter-50']")
        article_date = str(element.text).strip()
        return article_date.replace('. ', ' ').replace('.', '-') + ":00"

    def find_article_data(self, li):
        """Comment body text."""
        element = li.find_element_by_css_selector("span[class='comm_body']")
        return element.text

    def find_article_parent(self, li):
        """Nickname the nested reply targets; falls back to the last
        top-level comment's author when the element is missing."""
        try:
            element = li.find_element_by_css_selector("a[class='m-tcol-c filter-50 nick']")
            return element.text
        except Exception:
            return self.article_parent

    def find_article_id(self, li):
        """Commenter id from the hidden 'writerid' input."""
        element = li.find_element_by_css_selector("input[name='writerid']")
        return element.get_attribute('value')

    def find_article_nickname(self, li):
        article_nickname = li.find_element_by_css_selector("td[class='p-nick']")
        return article_nickname.text

    def find_platform_id(self):
        """Cafe id segment of the article URL, or '' when it doesn't parse."""
        m = self.re_platform_id.search(str(self.find_article_url()))
        return m.group(1) if m else str()

    def _content_from_li(self, li):
        """Build the field dict shared by top-level and nested comments;
        assigns and advances the running comment order counter."""
        content = dict()
        content["article_id"] = self.find_article_id(li)
        content["article_nickname"] = self.find_article_nickname(li)
        content["article_date"] = self.find_article_date(li)
        content["article_data"] = self.find_article_data(li)
        content["article_order"] = self.count
        self.count += 1
        content["article_form"] = self.find_article_form()
        content["platform_name"] = self.find_platform_name()
        content["platform_form"] = self.find_platform_form()
        content["article_url"] = self.find_article_url()
        content["platform_id"] = self.find_platform_id()
        return content

    def crawl_reply(self, li):
        """Record a top-level comment; remember its author as the parent
        for any nested replies that follow."""
        self.article_parent = self.find_article_nickname(li)
        self.reply_list.append(self._content_from_li(li))

    def crawl_reply_reply(self, li):
        """Record a nested comment, attributed to its parent comment."""
        article_parent = self.find_article_parent(li)
        content = self._content_from_li(li)
        content["article_parent"] = article_parent
        self.reply_list.append(content)

    def find_platform_name(self):
        return 'naver'

    def find_platform_form(self):
        return 'cafe'

    def find_article_form(self):
        return 'reply'

    def get_content(self):
        """All comment dicts crawled for the current article."""
        return self.reply_list

    def click_element(self, element):
        """Click *element* via ActionChains at its top-left corner."""
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

    def enter_element(self, element):
        """Focus *element* and press Enter."""
        element.send_keys(Keys.NULL)
        element.send_keys(Keys.ENTER)
        wait(2)
|
|
|
|
class SendtoDB:
    """Persists crawled articles and replies into the ``data_<db_num>``
    MySQL table."""

    def __init__(self, db_num=0):
        # FIX: the original imported pymysql at class-definition time
        # (``pymysql = __import__('pymysql.cursors')`` in the class body),
        # which made the whole module unimportable without pymysql.
        # Importing per-instance defers the dependency to first use.
        self.pymysql = __import__('pymysql.cursors')
        self.conn = self._connect()
        self.db_num = db_num

    def _connect(self):
        """Open a new DictCursor connection.

        NOTE(review): credentials are hard-coded; move to config/env vars.
        """
        return self.pymysql.connect(host='bigbird.iptime.org',
                                    user='admin', passwd='admin123',
                                    db='concepters', charset='utf8',
                                    cursorclass=self.pymysql.cursors.DictCursor)

    def set_db(self, db_num):
        """Select the target table suffix (stored as a string)."""
        self.db_num = str(db_num)

    def make_insert_query_backup(self, dictionary):
        """Legacy string-building variant of make_insert_query; kept for
        reference, prefer make_insert_query."""
        query = "insert into data_" + str(self.db_num) + " ("
        for key in dictionary.keys():
            query += (key + ",")
        query = query[:len(query) - 1] + ")"
        query += " values("
        for key, value in dictionary.items():
            if type(value) == int:
                query += (str(value) + ",")
            else:
                query += self.conn.escape(value) + ","
        query = query[:len(query) - 1] + ")"
        return query

    def make_insert_query(self, dictionary):
        """Build an INSERT statement for data_<db_num>.

        Values are escaped via the connection; keys are trusted (they are
        generated internally by the crawlers, never user input).
        """
        keys = list()
        vals = list()
        for key, val in dictionary.items():
            keys.append(key)
            vals.append(str(val) if type(val) == int else self.conn.escape(val))
        return ("insert into data_" + str(self.db_num) + " (" +
                ",".join(keys) + ") values (" + ",".join(vals) + ")")

    def send_body(self, body):
        """Insert one article dict; logs the error and query on failure."""
        if not body:
            return
        self.conn_check()
        with self.conn.cursor() as cursor:
            query = self.make_insert_query(body)
            try:
                cursor.execute(query)
                self.conn.commit()
            except Exception as e:
                print(e)
                sys.stdout.flush()
                print(query)
                sys.stdout.flush()

    def send_reply(self, reply):
        """Insert every comment dict in *reply*."""
        if not reply:
            return
        for item in reply:
            self.send_body(item)

    def conn_check(self):
        """Re-open the connection if the server dropped it."""
        if not self.conn.open:
            self.conn = self._connect()

    def close(self):
        self.conn.close()

    def delete_url(self, url):
        """Delete previously stored rows for *url* (re-crawl replaces)."""
        query = ("delete from data_" + str(self.db_num) +
                 " where article_url = " + self.conn.escape(str(url)))
        self.conn_check()
        with self.conn.cursor() as cursor:
            try:
                cursor.execute(query)
                self.conn.commit()
            except Exception as e:
                print(e)
                sys.stdout.flush()
                print(query)
                sys.stdout.flush()
|
|
|
|
# class NaverCafeInit:
|
|
# pymysql = __import__('pymysql.cursors')
|
|
# url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid="
|
|
# url_second = "&search.searchdate="
|
|
# url_third = "&search.searchBy=0&search.query="
|
|
# url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0"
|
|
#
|
|
# def __init__(self, before_day=0):
|
|
# self.conn = self.pymysql.connect(host ='bigbird.iptime.org',
|
|
# user='admin', passwd='admin123',
|
|
# db='concepters', charset='utf8',
|
|
# cursorclass=self.pymysql.cursors.DictCursor)
|
|
# self.urls = dict()
|
|
# self.before_day = before_day
|
|
#
|
|
# def set_before_day(self, before_day):
|
|
# if type(before_day) == str:
|
|
# self.before_day = int(before_day)
|
|
# elif type(before_day) == int:
|
|
# self.before_day = before_day
|
|
#
|
|
# def set_until_page(self, until_page):
|
|
# if type(until_page) == str:
|
|
# self.before_day = int(until_page)
|
|
# elif type(until_page) == int:
|
|
# self.before_day = until_page
|
|
#
|
|
# def split_searches(self):
|
|
# search = self.searches()
|
|
# splited_list = search.split(',')
|
|
# trimmed_list = list()
|
|
# for x in splited_list:
|
|
# trimmed_list.append(self.euc_kr(x.strip()))
|
|
# return trimmed_list
|
|
#
|
|
# def get_keyword_parameters(self, keyword_id):
|
|
# query = "select * from keyword where id = " + str(keyword_id)
|
|
# try:
|
|
# with self.conn.cursor() as cursor:
|
|
# cursor.execute(query)
|
|
# self.params = cursor.fetchone()
|
|
# return self.params
|
|
# except Exception as e:
|
|
# print(e)
|
|
# sys.stdout.flush()
|
|
# exit(1)
|
|
# return dict()
|
|
#
|
|
# def get_naver_cafe_list(self):
|
|
# query = "select url, clubid from navercafelist"
|
|
# if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0:
|
|
# pass
|
|
# else:
|
|
# query += (" where group_num = " + str(self.authorship()))
|
|
# try:
|
|
# with self.conn.cursor() as cursor:
|
|
# cursor.execute(query)
|
|
# list_result = cursor.fetchall()
|
|
# for i in list_result:
|
|
# self.urls[i["url"]] = i["clubid"]
|
|
# except Exception as e:
|
|
# print(e)
|
|
# sys.stdout.flush()
|
|
# exit(1)
|
|
# return self.urls
|
|
#
|
|
# def start_day(self):
|
|
# return self.params["start"]
|
|
#
|
|
# def end_day(self):
|
|
# return self.params["end"]
|
|
#
|
|
# def keyword_id(self):
|
|
# return self.params["id"]
|
|
#
|
|
# def realtime(self):
|
|
# return self.params["realtime"]
|
|
#
|
|
# def searches(self):
|
|
# return self.params["searches"]
|
|
#
|
|
# def authorship(self):
|
|
# return self.params["authorship"]
|
|
#
|
|
# def platform(self):
|
|
# return self.params["platform"]
|
|
#
|
|
# def is_realtime(self):
|
|
# if str(self.realtime()) == '0':
|
|
# return False
|
|
# else:
|
|
# return True
|
|
#
|
|
# def euc_kr(self, keyword):
|
|
# byte_code = list(keyword.encode("euc_kr"))
|
|
# encoded_keyword = ""
|
|
# for i in byte_code:
|
|
# if i == 0x20:
|
|
# encoded_keyword += "+"
|
|
# else:
|
|
# encoded_keyword += str(hex(i)).replace("0x", "%").upper()
|
|
# return encoded_keyword
|
|
#
|
|
# def url_all_days(self):
|
|
# url_list = list()
|
|
# for key, val in self.urls.items():
|
|
# if self.is_realtime():
|
|
# today = datetime.date.today()
|
|
# url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val)
|
|
# else:
|
|
# url = self.make_url(self.start_day(), self.end_day(), val)
|
|
# for i in url:
|
|
# url_list.append(i)
|
|
# return url_list
|
|
#
|
|
# def url_day_by_day(self):
|
|
# one_day = datetime.timedelta(days=1)
|
|
# url_list = list()
|
|
# for key, val in self.urls.items():
|
|
# if self.is_realtime():
|
|
# end = datetime.date.today()
|
|
# start = end + datetime.timedelta(days=self.before_day)
|
|
# else:
|
|
# start = self.start_day()
|
|
# end = self.end_day()
|
|
# while start <= end:
|
|
# url = self.make_url(start, start, val)
|
|
# for i in url:
|
|
# url_list.append(i)
|
|
# start += one_day
|
|
# return url_list
|
|
#
|
|
# def make_url(self, start_day, end_day, clubid):
|
|
# urls = list()
|
|
# for x in self.split_searches():
|
|
# url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth
|
|
# urls.append(url)
|
|
# return urls
|
|
#
|
|
# def disconnect(self):
|
|
# self.conn.close()
|
|
#
|
|
# def date_to_str(self, arg_date):
|
|
# return arg_date.strftime("%Y-%m-%d")
|
|
|
|
class CrawlInit:
    """Loads crawl parameters (keyword row, cafe list) from MySQL and
    provides URL/keyword percent-encoding helpers."""

    def __init__(self, before_day=0):
        # FIX: pymysql imported lazily (per-instance) instead of at class
        # definition time, so the module loads without pymysql installed.
        # NOTE(review): credentials are hard-coded; move to config/env vars.
        self.pymysql = __import__('pymysql.cursors')
        self.conn = self.pymysql.connect(host='bigbird.iptime.org',
                                         user='admin', passwd='admin123',
                                         db='concepters', charset='utf8',
                                         cursorclass=self.pymysql.cursors.DictCursor)
        self.urls = dict()
        # Realtime mode crawls from today+before_day (a negative offset)
        # up to today.
        self.before_day = before_day

    def set_before_day(self, before_day):
        """Accept an int or a numeric string; other types are ignored."""
        if type(before_day) == str:
            self.before_day = int(before_day)
        elif type(before_day) == int:
            self.before_day = before_day

    def set_until_page(self, until_page):
        """Accept an int or a numeric string; other types are ignored."""
        if type(until_page) == str:
            self.until_page = int(until_page)
        elif type(until_page) == int:
            self.until_page = until_page

    def get_keyword_parameters(self, keyword_id):
        """Fetch and cache the keyword row; exits the process on failure.

        FIX: removed the unreachable ``return dict()`` that followed
        ``exit(1)`` in the original.
        """
        query = "select * from keyword where id = " + str(keyword_id)
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query)
                self.params = cursor.fetchone()
                return self.params
        except Exception as e:
            print(e)
            sys.stdout.flush()
            exit(1)

    def get_naver_cafe_list(self):
        """Return {cafe url: clubid}, optionally filtered by authorship.

        BUG FIX: the original's third guard was ``self.authorship == 0`` —
        comparing the bound method object itself to 0, which is always
        False; it now calls the accessor.
        """
        query = "select url, clubid from navercafelist"
        authorship = self.authorship()
        if authorship is None or len(authorship) == 0 or authorship == 0:
            pass
        else:
            query += (" where group_num = " + str(authorship))
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query)
                for row in cursor.fetchall():
                    self.urls[row["url"]] = row["clubid"]
        except Exception as e:
            print(e)
            sys.stdout.flush()
            exit(1)
        return self.urls

    def start_day(self):
        """Crawl range start from the cached keyword row."""
        return self.params["start"]

    def end_day(self):
        """Crawl range end from the cached keyword row."""
        return self.params["end"]

    def keyword_id(self):
        return self.params["id"]

    def realtime(self):
        return self.params["realtime"]

    def searches(self):
        return self.params["searches"]

    def authorship(self):
        return self.params["authorship"]

    def platform(self):
        return self.params["platform"]

    def is_realtime(self):
        """True unless the keyword row's realtime flag is '0'."""
        return str(self.realtime()) != '0'

    def euc_kr(self, keyword):
        """Percent-encode *keyword* as EUC-KR, with '+' for spaces.

        BUG FIX: bytes below 0x10 are now zero-padded to two hex digits
        ('%0A'); the original emitted a single digit ('%A'), producing
        invalid percent-encoding.
        """
        encoded_keyword = ""
        for byte in keyword.encode("euc_kr"):
            if byte == 0x20:
                encoded_keyword += "+"
            else:
                encoded_keyword += "%{:02X}".format(byte)
        return encoded_keyword

    def utf8(self, keyword):
        """Percent-encode *keyword* as UTF-8 (same zero-padding fix)."""
        return "".join("%{:02X}".format(byte) for byte in keyword.encode("utf-8"))

    def disconnect(self):
        self.conn.close()

    def date_to_str(self, arg_date):
        """Format a date/datetime as 'YYYY-MM-DD'."""
        return arg_date.strftime("%Y-%m-%d")
|
|
|
|
class NaverCafeInit(CrawlInit):
    """Builds Naver cafe article-search URLs from the loaded keyword
    parameters and the cafe list inherited from CrawlInit."""

    url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid="
    url_second = "&search.searchdate="
    url_third = "&search.searchBy=0&search.query="
    url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0"

    def __init__(self, before_day=0):
        super().__init__(before_day)

    def url_all_days(self):
        """Return one URL per (cafe, search term) spanning the whole range.

        Realtime keywords use [today + before_day, today]; otherwise the
        range configured on the keyword row is used.
        """
        collected = []
        for club in self.urls.values():
            if self.is_realtime():
                today = datetime.date.today()
                first_day = today + datetime.timedelta(days=self.before_day)
                last_day = today
            else:
                first_day = self.start_day()
                last_day = self.end_day()
            collected.extend(self.make_url(first_day, last_day, club))
        return collected

    def url_day_by_day(self):
        """Return one URL per (cafe, search term, single day) in the range."""
        step = datetime.timedelta(days=1)
        collected = []
        for club in self.urls.values():
            if self.is_realtime():
                last = datetime.date.today()
                # presumably before_day is negative (look back) -- TODO confirm
                cur = last + datetime.timedelta(days=self.before_day)
            else:
                cur = self.start_day()
                last = self.end_day()
            while cur <= last:
                collected.extend(self.make_url(cur, cur, club))
                cur += step
        return collected

    def split_searches(self):
        """Split the configured terms on commas, trim, and EUC-KR encode."""
        return [self.euc_kr(term.strip())
                for term in self.searches().split(',')]

    def make_url(self, start_day, end_day, clubid):
        """Assemble one search URL per encoded term for this club/date span.

        NOTE: Naver expects the two dates concatenated with no separator
        (e.g. ...searchdate=2015-07-302015-10-30).
        """
        prefix = (self.url_first + str(clubid)
                  + self.url_second + str(start_day) + str(end_day)
                  + self.url_third)
        return [prefix + term + self.url_forth
                for term in self.split_searches()]
|
|
|
|
if __name__ == '__main__':
    """
    argv:
    0 - navercrawl.py
    1 - keyword_id
    2 - data db num
    3 - before_day
    """
    # --- old manual-run snippets kept for reference ---
    # crawler = NaverCafeCrawler()
    # crawler.naver_login('kyounggoon', 'qorwjd123')
    # crawler.suff('http://cafe.naver.com/imsanbu')
    # crawler.cafe_search('성형')
    # crawler.start()
    # crawler.cafe_search_current_page_list()

    # crawler = NaverCafeCrawler()
    # crawler.naver_login('kyounggoon', 'qorwjd123')
    # crawler.suff('http://cafe.naver.com/imsanbu')
    # crawler.cafe_search('성형')
    # crawler.main_area_crawler.send_to_db.set_db("294")
    # crawler.main_area_crawler.set_keyword_id("111111")
    # crawler.start()

    # Require keyword_id, db num and before_day on the command line.
    if len(sys.argv) < 4:
        print("Fail to process execute")
        exit(1)
    else:
        print("Start Python Crawling")

    # initialization
    # NOTE(review): Naver credentials are hard-coded in source; move them
    # to a config file or environment variables.
    naver_id = "ehotnsdl1234"
    naver_password = "66556655*"

    # Load keyword parameters and the cafe list from the DB, then close
    # the connection -- everything below works from the cached values.
    naver_init = NaverCafeInit(int(sys.argv[3]))
    naver_init.get_keyword_parameters(sys.argv[1])
    naver_init.get_naver_cafe_list()
    naver_init.disconnect()
    naver_cafe = NaverCafeCrawler()
    browser = Browser()
    # arg: chrome, firefox, ie, opera
    naver_cafe.set_driver(browser.get_new_driver("chrome"))
    wait(5)
    naver_cafe.naver_login(naver_id, naver_password)
    naver_main_area_crawler = NaverCafeMainAreaCrawler()
    naver_main_area_crawler.set_driver(naver_cafe.driver)
    naver_main_area_crawler.set_keyword_id(sys.argv[1])
    naver_main_area_crawler.send_to_db.set_db(sys.argv[2])
    naver_main_area_crawler.browser = browser
    asis = Asistance()  # URL-parsing helper used only for log output

    # Crawl every generated search URL; keep looping while the keyword is
    # flagged realtime, otherwise run exactly once.
    realtime = True
    while realtime:
        print_and_flush("Crawler Start")
        url_list = naver_init.url_all_days()
        i = 0
        backup_list = list()
        while i < len(url_list):
            try:
                print_and_flush(url_list[i] + "\n")
                print_and_flush("clubid: " + asis.clubid(url_list[i]))
                print_and_flush(asis.date(url_list[i]) + "\n")
                naver_cafe.suff(url_list[i])
                naver_main_area_crawler.crawl_all_cafe_main(backup_list)
                i += 1  # advance only on success; a failed URL is retried
                backup_list.clear()
            except Exception as e:
                # Recovery path: remember which articles were already
                # crawled, restart the browser, log in again, and retry
                # the same URL (i is deliberately not incremented).
                print_and_flush(e)
                backup_list = list(naver_main_area_crawler.board_crawler.content_num_list)
                naver_cafe.set_driver(browser.new_browser())
                wait(5)
                naver_cafe.naver_login(naver_id, naver_password)
                naver_main_area_crawler.set_driver(naver_cafe.driver)
        realtime = naver_init.is_realtime()
        print_and_flush("Finished Crawling :)")

    # Shut down the browser and flush/close the DB writer.
    naver_cafe.quit()
    naver_main_area_crawler.send_to_db.close()
    print("Exit. Bye :)")
    exit(0)
|
|
|
|
|
|
#http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=11262350&search.searchdate=2015-07-302015-10-30&search.searchBy=0&search.query=%BC%BA%C7%FC&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=50&search.media=0 |