clients/WebBasedCrawler/navercrawl.py
admin cc8122e074 Add WebBasedCrawler
git-svn-id: svn://192.168.0.12/source@229 8346c931-da38-4b9b-9d4c-e48b93cbd075
2015-12-07 03:25:49 +00:00


#-*- coding: utf-8 -*-
__author__ = 'cococo'
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import threading
from time import localtime, strftime
import time
import os
import sys
import datetime
import psutil
import re
def fcntwait(n):
time.sleep(n)
# Blocking wait helper: sleeps on a worker thread and joins it, which for the
# caller behaves the same as time.sleep(n).
def wait(n):
th = threading.Thread(target=fcntwait, args=(n,))
th.start()
th.join()
def insert_log(msg):
pid = os.getpid()
tm = strftime("%Y_%m_%d", localtime())
filename = tm + "_" + str(pid) + ".log"
total_msg = "[" + strftime("%Y_%m_%d %H:%M:%S", localtime()) + "] " + msg
with open(filename, "a") as f:
f.write(total_msg + "\n")
f.flush()
def print_and_flush(string):
print(string)
sys.stdout.flush()
class Asistance:
def __init__(self):
self.re_clubid = re.compile("search\\.clubid=([\\d]+)")
self.re_date = re.compile("search\\.searchdate=([\\d]{4}-[\\d]{2}-[\\d]{2})([\\d]{4}-[\\d]{2}-[\\d]{2})")
def clubid(self, url):
m = self.re_clubid.search(url)
if m is None:
return str()
else:
return m.group(1)
def date(self, url):
m = self.re_date.search(url)
if m is None:
return str("Start: ALL, End: ALL")
else:
return str("Start: " + m.group(1) + ", End: " + m.group(2))
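# A minimal, not-invoked sketch of how Asistance is used in __main__ below: it
# only pulls the clubid and the searchdate range back out of a generated search
# URL for logging. The URL here is the sample from the comment at the bottom of
# this file.
def _demo_asistance():
    asis = Asistance()
    url = ("http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=11262350"
           "&search.searchdate=2015-07-302015-10-30&search.searchBy=0")
    print(asis.clubid(url))  # -> "11262350"
    print(asis.date(url))    # -> "Start: 2015-07-30, End: 2015-10-30"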
class Browser:
def __init__(self, driver=None):
self.driver = driver
self.info = ""
def get_new_driver(self, name):
"""
Windows:
name = chrome, ie, opera, firefox
default driver_exec: chromedriver.exe, IEDriverServer.exe, operadriver.exe
Linux (and other non-Windows platforms):
name = chrome, opera, firefox
default driver_exec: chromedriver, operadriver
"""
if sys.platform == "win32":
if name == "chrome":
return self.new_chrome_browser(driver_exec="chromedriver.exe")
elif name == "ie":
return self.new_ie_browser(driver_exec="IEDriverServer.exe")
elif name == "opera":
return self.new_opera_browser(driver_exec="operadriver.exe")
elif name == "firefox":
return self.new_firefox_browser()
else:
return None
else:
if name == "chrome":
return self.new_chrome_browser(driver_exec="chromedriver")
elif name == "opera":
return self.new_opera_browser(driver_exec="operadriver")
elif name == "firefox":
return self.new_firefox_browser()
else:
return None
def new_chrome_browser(self, driver_exec=None):
self.info = "chrome"
if driver_exec is not None:
self.chrome_driver_path = driver_exec
self.chrome_basename = os.path.basename(driver_exec)
if self.is_server_executed(self.chrome_basename):
port = self.port(self.chrome_basename)
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.CHROME)
else:
self.driver = webdriver.Chrome(self.chrome_driver_path)
return self.driver
def new_ie_browser(self, driver_exec=None):
self.info = "ie"
if driver_exec is not None:
self.ie_driver_path = driver_exec
self.ie_basename = os.path.basename(driver_exec)
if self.is_server_executed(self.ie_basename):
port = self.port(self.ie_basename)
self.driver = webdriver.Remote("http://127.0.0.1:" + port, webdriver.DesiredCapabilities.INTERNETEXPLORER)
else:
self.driver = webdriver.Ie(self.ie_driver_path)
return self.driver
def new_firefox_browser(self):
self.info = "firefox"
self.driver = webdriver.Firefox()
return self.driver
def new_opera_browser(self, driver_exec=None):
self.info = "opera"
if driver_exec is not None:
self.opera_driver_path = driver_exec
self.opera_basename = os.path.basename(driver_exec)
if self.is_server_executed(self.opera_basename):
port = self.port(self.opera_basename)
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.OPERA)
else:
self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA, executable_path=self.opera_driver_path)
return self.driver
def get_driver(self):
# renamed from driver(): the instance attribute self.driver assigned in
# __init__ shadows a method of the same name, so the old accessor could never
# be called.
return self.driver
def is_server_executed(self, driver_basename):
for ps in psutil.process_iter():
if ps.name() == driver_basename:
conns = ps.connections()
for x in conns:
if x.status == "LISTEN":
return True
return False
def port(self, driver_basename):
for ps in psutil.process_iter():
if ps.name() == driver_basename:
conns = ps.connections()
for x in conns:
if x.status == "LISTEN":
return str(x.laddr[1])
return str(9999)
def new_browser(self):
if self.info == "chrome":
return self.new_chrome_browser()
elif self.info == "ie":
return self.new_ie_browser()
elif self.info == "opera":
return self.new_opera_browser()
elif self.info == "firefox":
return self.new_firefox_browser()
else:
return None
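# A not-invoked sketch of how Browser is used in __main__ below. get_new_driver()
# starts a driver process (chromedriver is assumed to be in the working
# directory or on PATH), while new_browser() reuses the browser kind chosen
# earlier and, if that driver server is still listening, re-attaches to it via
# webdriver.Remote instead of launching a new one; this is the recovery path
# taken by the main loop after a failure.
def _demo_browser():
    browser = Browser()
    driver = browser.get_new_driver("chrome")   # or "firefox", "ie", "opera"
    driver.get("http://www.naver.com")
    # ... after a crash, the main loop re-acquires a driver like this:
    driver = browser.new_browser()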
class NaverCafeCrawler:
#driver = webdriver.PhantomJS('C:\\Users\\cococo\\AppData\\Roaming\\npm\\node_modules\\phantomjs\\lib\\phantom\\phantomjs.exe')
def __init__(self):
self.driver = None
# webdriver.Ie('C:\\Users\\cococo\\Downloads\\IEDriverServer_x64_2.48.0\\IEDriverServer.exe')
# self.driver = webdriver.Chrome('C:\\Users\\cococo\\Downloads\\chromedriver_win32\\chromedriver.exe')
# firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
# firefox_capabilities['marionette'] = True
# firefox_capabilities['binary'] = ('C:\\Users\\cococo\\Downloads\\wires-0.4.2-win\\wires-0.4.2-win.exe')
# webdriver.Chrome()
# self.driver = webdriver.Firefox()
# self.driver.set_window_size(1600, 900)
self.main_area_crawler = NaverCafeMainAreaCrawler()
def set_driver(self, driver):
self.driver = driver
def suff(self, url):
self.driver.get(url)
wait(2)
def screenshot(self,filename):
self.driver.save_screenshot(filename)
def html(self):
return self.driver.page_source
def savepage(self, filename):
with open(filename,'w',encoding='UTF8') as f:
f.write(self.html())
def naver_login(self, id, password):
self.suff('http://www.naver.com')
wait(2)
element = self.driver.find_element_by_id('id')
element.send_keys(id)
#element = driver.find_element_by_id('label_pw')
element = self.driver.find_element_by_id('pw')
element.send_keys(password)
element.send_keys(Keys.ENTER)
wait(3)
#element = self.driver.find_element_by_class_name('btn_login')
#self.click_element(element)
def cafe_search(self, keyword):
element = self.driver.find_element_by_id('topLayerQueryInput')
element.send_keys(keyword)
wait(1)
element.send_keys(Keys.ENTER)
#element.send_keys(Keys.RETURN)
wait(2)
def get_url(self):
return self.driver.current_url  # current_url is a property, not a method
def click_element(self, element):
ac = ActionChains(self.driver)
#ac.move_to_element(element).click().perform()
#element.send_keys(Keys.NULL)
ac.move_to_element_with_offset(element, 0, 0).click().perform()
wait(2)
def enter_element(self, element):
element.send_keys(Keys.NULL)
element.send_keys(Keys.ENTER)
wait(2)
def start(self):
self.main_area_crawler.set_driver(self.driver)
self.main_area_crawler.crawl_all_cafe_main()
def close(self):
self.driver.close()
def quit(self):
self.driver.quit()
class NaverCafeMainAreaCrawler:
def __init__(self):
self.board_crawler = NaverCafeBoardCrawler()
self.body_crawler = NaverCafeBodyCrawler()
self.reply_crawler = NaverCafeReplyCrawler()
self.send_to_db = SendtoDB()
self.browser = None
def print(self, arg):
print(arg)
sys.stdout.flush()
def set_driver(self, driver):
self.board_crawler.set_driver(driver)
self.body_crawler.set_driver(driver)
self.reply_crawler.set_driver(driver)
self.driver = driver
def copy_list(self, backup_list):
for i in backup_list:
self.board_crawler.content_num_list.append(i)
def crawl_all_cafe_main(self, backup_list=None):
self.board_crawler.clear_content_num_list()
if backup_list:
self.copy_list(backup_list)
has_next_table = True
while has_next_table:
self.print("Page number : " + str(self.board_crawler.current_page_num_by_tag()))
# if (int(self.board_crawler.current_page_num_by_url()) % 5) == 1:
self.release_memory()
while self.board_crawler.move_next_content():
try:
self.crawl_body()
self.crawl_reply()
self.print("ok")
except Exception as e:
self.print("fail")
self.print(e)
self.driver.back()
wait(1)
has_next_table = self.board_crawler.move_next_page()
def crawl_body(self):
self.body_crawler.set_driver(self.driver)
content = self.body_crawler.get_content()
content['keyword_id'] = self.keyword_id
self.send_to_db.delete_url(content['article_url'])
self.send_to_db.send_body(content)
self.print(content['article_url'])
def crawl_reply(self):
self.reply_crawler.set_driver(self.driver)
if self.reply_crawler.find_comments_element():
self.reply_crawler.set_article_url(self.body_crawler.find_article_url())
self.reply_crawler.crawl_all()
self.send_to_db.send_reply(self.reply_crawler.get_content())
def set_keyword_id(self, keyword_id):
self.keyword_id = keyword_id
def release_memory_firefox(self):
index = self.driver.current_url.find("%26search.page=")
if index == -1:
temp_url = self.driver.current_url
else:
temp_url = self.driver.current_url[:index]
temp_page = self.board_crawler.current_page_num_by_tag()
if temp_page.strip() == "1":
url = temp_url
else:
url = temp_url + "%26search.page=" + temp_page.strip()
self.print("Release Memory Process")
self.driver.get("about:memory")
wait(2)
self.driver.execute_script("doMMU()")
wait(2)
self.driver.execute_script("doGC()")
wait(2)
self.driver.execute_script("doCC()")
wait(2)
self.driver.get(url)
wait(2)
print_and_flush("reloaded")
def release_memory_others(self):
temp_url = self.driver.current_url
self.print("Release Memory Process")
self.driver.get(temp_url)
wait(2)
print_and_flush("reloaded")
def release_memory(self):
if self.browser.info == "firefox":
if (int(self.board_crawler.current_page_num_by_tag()) % 5) == 1:
self.release_memory_firefox()
else:
if (int(self.board_crawler.current_page_num_by_tag()) != 1) and ((int(self.board_crawler.current_page_num_by_tag()) % 5) == 1):
self.release_memory_others()
def click_element(self, element):
ac = ActionChains(self.driver)
#ac.move_to_element(element).click().perform()
#element.send_keys(Keys.NULL)
ac.move_to_element(element).click().perform()
wait(2)
def enter_element(self, element):
element.send_keys(Keys.NULL)
element.send_keys(Keys.ENTER)
wait(2)
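# A not-invoked sketch of how this class is wired up in __main__: it borrows the
# crawler's driver, tags every stored row with the keyword id, and writes into
# the data_<n> table selected on its SendtoDB instance (constructing it opens
# that database connection). crawl_all_cafe_main() then walks every page of the
# current search result, crawling body and replies for each article. The
# keyword id and table number mirror the commented-out example near the bottom
# of this file.
def _demo_main_area_crawler(naver_cafe_crawler, browser):
    crawler = NaverCafeMainAreaCrawler()
    crawler.set_driver(naver_cafe_crawler.driver)
    crawler.set_keyword_id("111111")
    crawler.send_to_db.set_db("294")
    crawler.browser = browser          # needed by release_memory()
    crawler.crawl_all_cafe_main()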
class NaverCafeBoardCrawler:
def __init__(self, driver=None):
self.driver = driver
self.content_num_list = list()
import re
self.re_page = re.compile("search\\.page=([\\d]+)")
def clear_content_num_list(self):
self.content_num_list.clear()
def current_url(self):
self.driver.switch_to_default_content()
self.driver.switch_to_frame('cafe_main')
return self.driver.current_url
def current_page_num_by_url(self):
url = self.current_url()
m = self.re_page.search(url)
if m is None:
return self.current_page_num_by_tag()
else:
return m.group(1)
def current_page_num_by_tag(self):
page_navigate = None
try:
self.driver.switch_to_default_content()
self.driver.switch_to_frame('cafe_main')
page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']")
except:
return str(1)
if page_navigate is None:
return str(1)
tds = page_navigate.find_elements_by_tag_name('td')
for td in tds:
try:
page_on = td.get_attribute('class')
if page_on == 'on':
return td.text
except:
continue
return str(1)
def move_next_content(self):
self.driver.switch_to_default_content()
self.driver.switch_to_frame('cafe_main')
trs = self.driver.find_elements_by_css_selector("tr[align='center']")
for tr in trs:
try:
content_num = tr.find_element_by_css_selector("span[class='m-tcol-c list-count']")
if len(str(content_num.text).strip()) < 1:
continue
if content_num.text in self.content_num_list:
continue
self.content_num_list.append(content_num.text)
sub = tr.find_element_by_css_selector("a[class='m-tcol-c']")
self.enter_element(sub)
return True
except:
pass
return False
def move_next_page(self):
page_navigate = None
try:
self.driver.switch_to_default_content()
self.driver.switch_to_frame('cafe_main')
page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']")
except:
return False
if page_navigate is None:
return False
tds = page_navigate.find_elements_by_tag_name('td')
is_next = False
for td in tds:
if is_next:
a = td.find_element_by_tag_name("a")
self.enter_element(a)
#self.enter_element(td)
return True
try:
page_on = td.get_attribute('class')
if page_on == 'on':
is_next = True
continue
except:
continue
return False
def set_driver(self, driver):
self.driver = driver
def click_element(self, element):
ac = ActionChains(self.driver)
#ac.move_to_element(element).click().perform()
#element.send_keys(Keys.NULL)
ac.move_to_element_with_offset(element, 0, 0).click().perform()
wait(2)
def enter_element(self, element):
element.send_keys(Keys.NULL)
element.send_keys(Keys.ENTER)
wait(2)
class NaverCafeBodyCrawler:
def __init__(self, driver=None):
self.driver = driver
self.init_re()
def init_re(self):
self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+')
def set_driver(self, driver):
self.driver = driver
def find_init(self):
self.driver.switch_to_default_content()
self.driver.switch_to_frame('cafe_main')
def find_article_title(self):
self.find_init()
article_title = self.driver.find_element_by_css_selector("span[class='b m-tcol-c']")
return article_title.text
def find_article_date(self):
self.find_init()
element = self.driver.find_element_by_css_selector("td[class='m-tcol-c date']")
article_date = str(element.text)
article_date = article_date.strip()
article_date = article_date.replace('. ',' ').replace('.','-') + ":00"
return article_date
def find_article_data(self):
self.find_init()
article_data = self.driver.find_element_by_css_selector("div[class='tbody m-tcol-c']")
return article_data.text
def find_article_nickname(self):
self.find_init()
element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']")
nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']")
onclick = nick_element.get_attribute('onclick')
onclick_attr_list = onclick.split(',')
if len(onclick_attr_list) > 4:
return onclick_attr_list[3].strip().replace("'", "")
else:
return str()
def find_article_id(self):
self.find_init()
element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']")
nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']")
onclick = nick_element.get_attribute('onclick')
onclick_attr_list = onclick.split(',')
if len(onclick_attr_list) > 2:
return onclick_attr_list[1].strip().replace("'", "")
else:
return str()
def find_article_hit(self):
self.find_init()
element = self.driver.find_element_by_css_selector("span[class='b m-tcol-c reply _rosReadcount']")
return element.text
def find_platform_name(self):
return 'naver'
def find_platform_form(self):
return 'cafe'
def find_article_form(self):
return 'body'
def find_platform_title(self):
self.driver.switch_to_default_content()
element = self.driver.find_element_by_css_selector("span[class='m-tcol-p']")
return element.text
def find_article_url(self):
self.find_init()
element = self.driver.find_element_by_css_selector("a[id='linkUrl']")
return element.text
def find_platform_id(self):
article_url = str(self.find_article_url())
m = self.re_platform_id.search(article_url)
try:
return m.group(1)
except:
return str()
def print(self):
print("article_id = " + self.find_article_id())
print("article_nickname = " + self.find_article_nickname())
print("article_title = " + self.find_article_title())
print("article_date = " + self.find_article_date())
print("article_hit = " + self.find_article_hit())
print("article_url = " + self.find_article_url())
print("platform_title = " + self.find_platform_title())
print("article_data = " + self.find_article_data())
def get_content(self):
content = dict()
content["article_id"] = self.find_article_id()
content["article_nickname"] = self.find_article_nickname()
content["article_title"] = self.find_article_title()
content["article_date"] = self.find_article_date()
content["article_hit"] = self.find_article_hit()
content["article_url"] = self.find_article_url()
content["article_data"] = self.find_article_data()
content["article_form"] = self.find_article_form()
content["platform_title"] = self.find_platform_title()
content["platform_name"] = self.find_platform_name()
content["platform_form"] = self.find_platform_form()
content["platform_id"] = self.find_platform_id()
return content
class NaverCafeReplyCrawler:
def __init__(self, driver=None):
self.driver = driver
self.article_parent = str()
self.reply_list = list()
self.init_re()
def init_re(self):
self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+')
def set_driver(self, driver):
self.driver = driver
def find_comments_element(self):
self.find_init()
try:
self.reply_elements = self.driver.find_element_by_css_selector("ul[class='cmlist']")
if self.reply_elements is None:
return False
return True
except:
return False
def find_init(self):
self.count = 0
self.reply_list.clear()
self.driver.switch_to_default_content()
self.driver.switch_to_frame('cafe_main')
def set_article_url(self, article_url):
self.article_url = article_url
def crawl_all(self):
has_next_comment_page = True
while has_next_comment_page:
self.crawl_current_page_reply()
has_next_comment_page = self.move_next_comment_page()
def move_next_comment_page(self):
element = None
try:
element = self.driver.find_element_by_css_selector("div[id='cmt_paginate']")
children = element.find_elements_by_css_selector("*")
flag = False
for child in children:
if flag is True and child.tag_name == "a":
self.enter_element(child)
wait(1)
self.driver.switch_to_default_content()
self.driver.switch_to_frame('cafe_main')
return True
if child.tag_name == "strong":
flag = True
except Exception as e:
print(e)
sys.stdout.flush()
return False
if element is None:
return False
return False
def crawl_current_page_reply(self):
lis = self.reply_elements.find_elements_by_tag_name('li')
for li in lis:
if li.get_attribute('class') == 'reply':
self.crawl_reply_reply(li)
elif not li.get_attribute('class'):  # an empty class attribute marks a top-level comment
self.crawl_reply(li)
else:
pass
def find_article_url(self, li=None):
return self.article_url
def find_article_date(self, li):
element = li.find_element_by_css_selector("span[class='date m-tcol-c filter-50']")
article_date = str(element.text)
article_date = article_date.strip()
article_date = article_date.replace('. ', ' ').replace('.', '-') + ":00"
return article_date
def find_article_data(self, li):
element = li.find_element_by_css_selector("span[class='comm_body']")
article_data = element.text
return article_data
def find_article_parent(self, li):
try:
element = li.find_element_by_css_selector("a[class='m-tcol-c filter-50 nick']")
article_parent = element.text
return article_parent
except:
return self.article_parent
def find_article_id(self, li):
element = li.find_element_by_css_selector("input[name='writerid']")
article_id = element.get_attribute('value')
return article_id
def find_article_nickname(self, li):
article_nickname = li.find_element_by_css_selector("td[class='p-nick']")
return article_nickname.text
def find_platform_id(self):
article_url = str(self.find_article_url())
m = self.re_platform_id.search(article_url)
try:
return m.group(1)
except:
return str()
def crawl_reply(self, li):
article_nickname = self.find_article_nickname(li)
self.article_parent = article_nickname
article_order = self.count
self.count += 1
content = dict()
content["article_id"] = self.find_article_id(li)
content["article_nickname"] = self.find_article_nickname(li)
content["article_date"] = self.find_article_date(li)
content["article_data"] = self.find_article_data(li)
content["article_order"] = article_order
content["article_form"] = self.find_article_form()
content["platform_name"] = self.find_platform_name()
content["platform_form"] = self.find_platform_form()
content["article_url"] = self.find_article_url()
content["platform_id"] = self.find_platform_id()
self.reply_list.append(content)
def crawl_reply_reply(self, li):
article_parent = self.find_article_parent(li)
article_order = self.count
self.count += 1
content = dict()
content["article_id"] = self.find_article_id(li)
content["article_nickname"] = self.find_article_nickname(li)
content["article_date"] = self.find_article_date(li)
content["article_data"] = self.find_article_data(li)
content["article_order"] = article_order
content["article_parent"] = article_parent
content["article_form"] = self.find_article_form()
content["platform_name"] = self.find_platform_name()
content["platform_form"] = self.find_platform_form()
content["article_url"] = self.find_article_url()
content["platform_id"] = self.find_platform_id()
self.reply_list.append(content)
def find_platform_name(self):
return 'naver'
def find_platform_form(self):
return 'cafe'
def find_article_form(self):
return 'reply'
def get_content(self):
return self.reply_list
def click_element(self, element):
ac = ActionChains(self.driver)
#ac.move_to_element(element).click().perform()
#element.send_keys(Keys.NULL)
ac.move_to_element_with_offset(element, 0, 0).click().perform()
wait(2)
def enter_element(self, element):
element.send_keys(Keys.NULL)
element.send_keys(Keys.ENTER)
wait(2)
class SendtoDB:
# __import__('pymysql.cursors') returns the top-level pymysql package with the
# cursors submodule already loaded, so both pymysql.connect and
# pymysql.cursors.DictCursor are reachable through this class attribute.
pymysql = __import__('pymysql.cursors')
def __init__(self, db_num=0):
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
self.db_num = db_num
def set_db(self, db_num):
self.db_num = str(db_num)
def make_insert_query_backup(self, dictionary):
query = "insert into data_" + str(self.db_num) + " ("
for key in dictionary.keys():
query += (key + ",")
query = query[:len(query) - 1] + ")"
query += " values("
for key, value in dictionary.items():
if type(value) == int:
query += (str(value) + ",")
else:
query += self.conn.escape(value) + ","
query = query[:len(query) - 1] + ")"
return query
def make_insert_query(self, dictionary):
query = "insert into data_" + str(self.db_num) + " ("
key_list = list()
val_list = list()
for key, val in dictionary.items():
key_list.append(key)
if type(val) == int:
val_list.append(str(val))
else:
val_list.append(self.conn.escape(val))
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")"
def send_body(self, body):
if not body:
return
self.conn_check()
with self.conn.cursor() as cursor:
query = self.make_insert_query(body)
try:
cursor.execute(query)
self.conn.commit()
except Exception as e:
print(e)
sys.stdout.flush()
print(query)
sys.stdout.flush()
def send_reply(self, reply):
if not reply:
return
for i in reply:
self.send_body(i)
def conn_check(self):
if not self.conn.open:
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
def close(self):
self.conn.close()
def delete_url(self, url):
query = "delete from data_" + str(self.db_num) + " where article_url = " + self.conn.escape(str(url))
self.conn_check()
with self.conn.cursor() as cursor:
try:
cursor.execute(query)
self.conn.commit()
except Exception as e:
print(e)
sys.stdout.flush()
print(query)
sys.stdout.flush()
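# A not-invoked sketch of the SendtoDB flow used by NaverCafeMainAreaCrawler:
# remove any previously stored row for the article URL, then insert the body
# dict built by NaverCafeBodyCrawler.get_content(). Constructing SendtoDB
# assumes the MySQL host configured in __init__ is reachable; the table number
# mirrors the commented-out example near the bottom of this file.
def _demo_send_to_db():
    db = SendtoDB()
    db.set_db("294")                      # rows go into table data_294
    body = {"article_url": "http://cafe.naver.com/example/1",
            "article_title": "example", "keyword_id": 111111}
    print(db.make_insert_query(body))     # "insert into data_294 (...) values (...)"
    db.delete_url(body["article_url"])    # re-crawl safely: drop the old row first
    db.send_body(body)
    db.close()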
# class NaverCafeInit:
# pymysql = __import__('pymysql.cursors')
# url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid="
# url_second = "&search.searchdate="
# url_third = "&search.searchBy=0&search.query="
# url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0"
#
# def __init__(self, before_day=0):
# self.conn = self.pymysql.connect(host ='bigbird.iptime.org',
# user='admin', passwd='admin123',
# db='concepters', charset='utf8',
# cursorclass=self.pymysql.cursors.DictCursor)
# self.urls = dict()
# self.before_day = before_day
#
# def set_before_day(self, before_day):
# if type(before_day) == str:
# self.before_day = int(before_day)
# elif type(before_day) == int:
# self.before_day = before_day
#
# def set_until_page(self, until_page):
# if type(until_page) == str:
# self.before_day = int(until_page)
# elif type(until_page) == int:
# self.before_day = until_page
#
# def split_searches(self):
# search = self.searches()
# splited_list = search.split(',')
# trimmed_list = list()
# for x in splited_list:
# trimmed_list.append(self.euc_kr(x.strip()))
# return trimmed_list
#
# def get_keyword_parameters(self, keyword_id):
# query = "select * from keyword where id = " + str(keyword_id)
# try:
# with self.conn.cursor() as cursor:
# cursor.execute(query)
# self.params = cursor.fetchone()
# return self.params
# except Exception as e:
# print(e)
# sys.stdout.flush()
# exit(1)
# return dict()
#
# def get_naver_cafe_list(self):
# query = "select url, clubid from navercafelist"
# if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0:
# pass
# else:
# query += (" where group_num = " + str(self.authorship()))
# try:
# with self.conn.cursor() as cursor:
# cursor.execute(query)
# list_result = cursor.fetchall()
# for i in list_result:
# self.urls[i["url"]] = i["clubid"]
# except Exception as e:
# print(e)
# sys.stdout.flush()
# exit(1)
# return self.urls
#
# def start_day(self):
# return self.params["start"]
#
# def end_day(self):
# return self.params["end"]
#
# def keyword_id(self):
# return self.params["id"]
#
# def realtime(self):
# return self.params["realtime"]
#
# def searches(self):
# return self.params["searches"]
#
# def authorship(self):
# return self.params["authorship"]
#
# def platform(self):
# return self.params["platform"]
#
# def is_realtime(self):
# if str(self.realtime()) == '0':
# return False
# else:
# return True
#
# def euc_kr(self, keyword):
# byte_code = list(keyword.encode("euc_kr"))
# encoded_keyword = ""
# for i in byte_code:
# if i == 0x20:
# encoded_keyword += "+"
# else:
# encoded_keyword += str(hex(i)).replace("0x", "%").upper()
# return encoded_keyword
#
# def url_all_days(self):
# url_list = list()
# for key, val in self.urls.items():
# if self.is_realtime():
# today = datetime.date.today()
# url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val)
# else:
# url = self.make_url(self.start_day(), self.end_day(), val)
# for i in url:
# url_list.append(i)
# return url_list
#
# def url_day_by_day(self):
# one_day = datetime.timedelta(days=1)
# url_list = list()
# for key, val in self.urls.items():
# if self.is_realtime():
# end = datetime.date.today()
# start = end + datetime.timedelta(days=self.before_day)
# else:
# start = self.start_day()
# end = self.end_day()
# while start <= end:
# url = self.make_url(start, start, val)
# for i in url:
# url_list.append(i)
# start += one_day
# return url_list
#
# def make_url(self, start_day, end_day, clubid):
# urls = list()
# for x in self.split_searches():
# url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth
# urls.append(url)
# return urls
#
# def disconnect(self):
# self.conn.close()
#
# def date_to_str(self, arg_date):
# return arg_date.strftime("%Y-%m-%d")
class CrawlInit:
pymysql = __import__('pymysql.cursors')
def __init__(self, before_day=0):
self.conn = self.pymysql.connect(host ='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
self.urls = dict()
self.before_day = before_day
def set_before_day(self, before_day):
if type(before_day) == str:
self.before_day = int(before_day)
elif type(before_day) == int:
self.before_day = before_day
def set_until_page(self, until_page):
if type(until_page) == str:
self.until_page = int(until_page)
elif type(until_page) == int:
self.until_page = until_page
def get_keyword_parameters(self, keyword_id):
query = "select * from keyword where id = " + str(keyword_id)
try:
with self.conn.cursor() as cursor:
cursor.execute(query)
self.params = cursor.fetchone()
return self.params
except Exception as e:
print(e)
sys.stdout.flush()
exit(1)
return dict()
def get_naver_cafe_list(self):
query = "select url, clubid from navercafelist"
if self.authorship() is None or len(self.authorship()) == 0 or self.authorship() == 0:
pass
else:
query += (" where group_num = " + str(self.authorship()))
try:
with self.conn.cursor() as cursor:
cursor.execute(query)
list_result = cursor.fetchall()
for i in list_result:
self.urls[i["url"]] = i["clubid"]
except Exception as e:
print(e)
sys.stdout.flush()
exit(1)
return self.urls
def start_day(self):
return self.params["start"]
def end_day(self):
return self.params["end"]
def keyword_id(self):
return self.params["id"]
def realtime(self):
return self.params["realtime"]
def searches(self):
return self.params["searches"]
def authorship(self):
return self.params["authorship"]
def platform(self):
return self.params["platform"]
def is_realtime(self):
if str(self.realtime()) == '0':
return False
else:
return True
def euc_kr(self, keyword):
byte_code = list(keyword.encode("euc_kr"))
encoded_keyword = ""
for i in byte_code:
if i == 0x20:
encoded_keyword += "+"
else:
encoded_keyword += str(hex(i)).replace("0x", "%").upper()
return encoded_keyword
def utf8(self, keyword):
byte_code = list(keyword.encode("utf-8"))
encoded_keyword = ""
for i in byte_code:
encoded_keyword += str(hex(i)).replace("0x", "%").upper()
return encoded_keyword
def disconnect(self):
self.conn.close()
def date_to_str(self, arg_date):
return arg_date.strftime("%Y-%m-%d")
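# A standalone sketch of the percent-encoding that CrawlInit.euc_kr performs,
# kept separate only so it can be tried without the database connection made in
# CrawlInit.__init__: Naver's cafe search expects the query percent-encoded as
# EUC-KR bytes. For the keyword in the sample URL at the bottom of this file,
# '성형' encodes to '%BC%BA%C7%FC'.
def _euc_kr_query(keyword):
    encoded = ""
    for byte in keyword.encode("euc_kr"):
        if byte == 0x20:
            encoded += "+"                       # spaces become '+'
        else:
            encoded += "%{:02X}".format(byte)    # e.g. 0xBC -> '%BC'
    return encoded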
class NaverCafeInit(CrawlInit):
url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid="
url_second = "&search.searchdate="
url_third = "&search.searchBy=0&search.query="
url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0"
def __init__(self, before_day=0):
super().__init__(before_day)
def url_all_days(self):
url_list = list()
for key, val in self.urls.items():
if self.is_realtime():
today = datetime.date.today()
url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val)
else:
url = self.make_url(self.start_day(), self.end_day(), val)
for i in url:
url_list.append(i)
return url_list
def url_day_by_day(self):
one_day = datetime.timedelta(days=1)
url_list = list()
for key, val in self.urls.items():
if self.is_realtime():
end = datetime.date.today()
start = end + datetime.timedelta(days=self.before_day)
else:
start = self.start_day()
end = self.end_day()
while start <= end:
url = self.make_url(start, start, val)
for i in url:
url_list.append(i)
start += one_day
return url_list
def split_searches(self):
search = self.searches()
splited_list = search.split(',')
trimmed_list = list()
for x in splited_list:
trimmed_list.append(self.euc_kr(x.strip()))
return trimmed_list
def make_url(self, start_day, end_day, clubid):
urls = list()
for x in self.split_searches():
url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth
urls.append(url)
return urls
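# A not-invoked sketch of the URL set-up performed in __main__: NaverCafeInit
# pulls the keyword row and the cafe list from the database (so constructing it
# assumes that server is reachable), then expands them into one search URL per
# (cafe, search term). before_day is added to today's date, so a value like -3
# (hypothetical here, as is the keyword id) means roughly the last three days
# when the keyword is realtime.
def _demo_naver_cafe_init():
    init = NaverCafeInit(before_day=-3)
    init.get_keyword_parameters(294)       # hypothetical keyword id
    init.get_naver_cafe_list()
    for url in init.url_all_days():        # or init.url_day_by_day() for one URL per day
        print(url)
    init.disconnect()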
if __name__ == '__main__':
"""
argv:
0 - navercrawl.py
1 - keyword_id
2 - data db num
3 - before_day
"""
# crawler = NaverCafeCrawler()
# crawler.naver_login('kyounggoon', 'qorwjd123')
# crawler.suff('http://cafe.naver.com/imsanbu')
# crawler.cafe_search('성형')
# crawler.start()
# crawler.cafe_search_current_page_list()
# crawler = NaverCafeCrawler()
# crawler.naver_login('kyounggoon', 'qorwjd123')
# crawler.suff('http://cafe.naver.com/imsanbu')
# crawler.cafe_search('성형')
# crawler.main_area_crawler.send_to_db.set_db("294")
# crawler.main_area_crawler.set_keyword_id("111111")
# crawler.start()
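# Example invocation (argument values are hypothetical):
#   python navercrawl.py <keyword_id> <data db num> <before_day>
#   python navercrawl.py 294 294 -3
# before_day is added to today's date in NaverCafeInit, so it is typically zero
# or negative (-3 crawls roughly the last three days when the keyword is realtime).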
if len(sys.argv) < 4:
print("Failed to start: expected arguments <keyword_id> <data db num> <before_day>")
exit(1)
else:
print("Start Python Crawling")
#initialization
naver_id = "ehotnsdl1234"
naver_password = "66556655*"
naver_init = NaverCafeInit(int(sys.argv[3]))
naver_init.get_keyword_parameters(sys.argv[1])
naver_init.get_naver_cafe_list()
naver_init.disconnect()
naver_cafe = NaverCafeCrawler()
browser = Browser()
# arg: chrome, firefox, ie, opera
naver_cafe.set_driver(browser.get_new_driver("chrome"))
wait(5)
naver_cafe.naver_login(naver_id, naver_password)
naver_main_area_crawler = NaverCafeMainAreaCrawler()
naver_main_area_crawler.set_driver(naver_cafe.driver)
naver_main_area_crawler.set_keyword_id(sys.argv[1])
naver_main_area_crawler.send_to_db.set_db(sys.argv[2])
naver_main_area_crawler.browser = browser
asis = Asistance()
realtime = True
while realtime:
print_and_flush("Crawler Start")
url_list = naver_init.url_all_days()
i = 0
backup_list = list()
while i < len(url_list):
try:
print_and_flush(url_list[i] + "\n")
print_and_flush("clubid: " + asis.clubid(url_list[i]))
print_and_flush(asis.date(url_list[i]) + "\n")
naver_cafe.suff(url_list[i])
naver_main_area_crawler.crawl_all_cafe_main(backup_list)
i += 1
backup_list.clear()
except Exception as e:
print_and_flush(e)
backup_list = list(naver_main_area_crawler.board_crawler.content_num_list)
naver_cafe.set_driver(browser.new_browser())
wait(5)
naver_cafe.naver_login(naver_id, naver_password)
naver_main_area_crawler.set_driver(naver_cafe.driver)
realtime = naver_init.is_realtime()
print_and_flush("Finished Crawling :)")
naver_cafe.quit()
naver_main_area_crawler.send_to_db.close()
print("Exit. Bye :)")
exit(0)
#http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=11262350&search.searchdate=2015-07-302015-10-30&search.searchBy=0&search.query=%BC%BA%C7%FC&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=50&search.media=0