449 lines
16 KiB
Python
449 lines
16 KiB
Python
#-*- coding: utf-8 -*-
|
|
'''
|
|
Created on 2015. 12. 8.
|
|
|
|
@author: cococo
|
|
'''
|
|
import sys
|
|
import time
|
|
import os
|
|
import psutil
|
|
import threading
|
|
import re
|
|
import pymysql
|
|
import random
|
|
|
|
from time import localtime, strftime
|
|
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.keys import Keys
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
|
# Module-wide debug switch: printd() only prints while this is true.
is_debug = False
|
|
|
|
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print *objects* exactly like the built-in ``print``, flushing by default.

    Every keyword argument is forwarded unchanged to ``print``; the only
    difference from calling ``print`` directly is that ``flush`` defaults
    to ``True``.
    """
    forwarded = {"sep": sep, "end": end, "file": file, "flush": flush}
    print(*objects, **forwarded)
|
|
|
|
|
|
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Debug variant of ``print``: emit *objects* only when ``is_debug`` is true.

    The keyword arguments are forwarded unchanged to the built-in ``print``.
    When the module-level ``is_debug`` flag is false the call does nothing.
    """
    if not is_debug:
        return
    print(*objects, sep=sep, end=end, file=file, flush=flush)
|
|
|
|
|
|
def print_and_flush(string):
    """Print *string* (plus a newline) to standard output and flush it at once."""
    out = sys.stdout
    print(string, file=out)
    out.flush()
|
|
|
|
|
|
def fcntwait(n):
    """Block the calling thread for *n* seconds."""
    seconds = n
    time.sleep(seconds)
|
|
|
|
|
|
def wait(n):
    """Pause for *n* seconds.

    The sleep itself runs in a helper thread (``fcntwait``); this function
    starts that thread and returns once it has finished.
    """
    sleeper = threading.Thread(target=fcntwait, args=(n,))
    sleeper.start()
    sleeper.join()
|
|
|
|
|
|
def insert_log(msg):
    """Append *msg*, prefixed with a timestamp, to this process's daily log file.

    The file lives in the current working directory and is named
    ``<YYYY_MM_DD>_<pid>.log``. Each entry is written as
    ``[<YYYY_MM_DD HH:MM:SS>] <msg>``; no newline is added, so callers
    supply their own line terminator inside *msg*.

    The file is written as UTF-8 so that non-ASCII text is logged the same
    way regardless of the platform's locale encoding.
    """
    # One clock snapshot, so the file name and the entry timestamp always
    # refer to the same day (two separate reads could straddle midnight).
    now = localtime()
    filename = strftime("%Y_%m_%d", now) + "_" + str(os.getpid()) + ".log"
    total_msg = "[" + strftime("%Y_%m_%d %H:%M:%S", now) + "] " + msg
    # Leaving the ``with`` block flushes and closes the file.
    with open(filename, "a", encoding="utf-8") as f:
        f.write(total_msg)
|
|
|
|
|
|
def enter_element(element):
    """Send a NULL key and then ENTER to *element*, then pause for two seconds."""
    for key in (Keys.NULL, Keys.ENTER):
        element.send_keys(key)
    wait(2)
|
|
|
|
|
|
def find_element_by_css_selector(driver, tag, time=0):
    """Return the element located by the CSS selector *tag*.

    A ``WebDriverWait`` with a timeout of *time* seconds waits until the
    element is present; whatever ``until`` returns is handed back.
    """
    waiter = WebDriverWait(driver, time)
    locator = (By.CSS_SELECTOR, tag)
    return waiter.until(EC.presence_of_element_located(locator))
|
|
|
|
|
|
def find_elements_by_css_selector(driver, tag, time=0):
    """Return the elements located by the CSS selector *tag*.

    A ``WebDriverWait`` with a timeout of *time* seconds waits until the
    elements are present; whatever ``until`` returns is handed back.
    """
    waiter = WebDriverWait(driver, time)
    locator = (By.CSS_SELECTOR, tag)
    return waiter.until(EC.presence_of_all_elements_located(locator))
|
|
|
|
|
|
def find_element_by_xpath(driver, tag, time=0):
    """Return the element located by the XPath expression *tag*.

    A ``WebDriverWait`` with a timeout of *time* seconds waits until the
    element is present; whatever ``until`` returns is handed back.
    """
    waiter = WebDriverWait(driver, time)
    locator = (By.XPATH, tag)
    return waiter.until(EC.presence_of_element_located(locator))
|
|
|
|
|
|
def find_elements_by_xpath(driver, tag, time=0):
    """Return the elements located by the XPath expression *tag*.

    A ``WebDriverWait`` with a timeout of *time* seconds waits until the
    elements are present; whatever ``until`` returns is handed back.
    """
    waiter = WebDriverWait(driver, time)
    locator = (By.XPATH, tag)
    return waiter.until(EC.presence_of_all_elements_located(locator))
|
|
|
|
|
|
class Browser:
    """Create and hold a Selenium WebDriver for Chrome, IE, Firefox or Opera.

    When a process named like the driver executable is already listening on
    a local port, the browser is attached to it through ``webdriver.Remote``
    at ``http://127.0.0.1:<port>``; otherwise a new driver is started from
    the executable path.
    """

    # Driver executable base names, without the Windows ".exe" suffix.
    _DRIVER_BASENAMES = {
        "chrome": "chromedriver",
        "ie": "IEDriverServer",
        "opera": "operadriver",
        "firefox": "geckodriver",
    }

    def __init__(self, driver=None):
        """Wrap an existing *driver* (or none yet)."""
        self.driver = driver
        # Name of the browser most recently requested; read by new_browser().
        self.info = ""

    def get_new_driver(self, name):
        """Create a driver for the browser called *name* using the default
        driver executable for the current platform.

        windows system:
            name = chrome, ie, opera, firefox
            default driver_exec: chromedriver.exe, IEDriverServer.exe,
                                 operadriver.exe, geckodriver.exe
        linux system:
            name = chrome, opera, firefox
            default driver_exec: chromedriver, operadriver, geckodriver

        Returns the new driver, or ``None`` when *name* is not supported on
        this platform.
        """
        if name not in self._DRIVER_BASENAMES:
            return None
        if name == "ie" and sys.platform != "win32":
            return None
        driver_exec = self._default_driver_exec(name)
        if name == "firefox":
            # The Firefox factory names its path parameter differently.
            return self.new_firefox_browser(executable_path=driver_exec)
        creators = {
            "chrome": self.new_chrome_browser,
            "ie": self.new_ie_browser,
            "opera": self.new_opera_browser,
        }
        return creators[name](driver_exec=driver_exec)

    def _default_driver_exec(self, kind):
        """Return the default driver executable name of *kind* for this platform."""
        base = self._DRIVER_BASENAMES[kind]
        return base + ".exe" if sys.platform == "win32" else base

    def _resolve_driver_exec(self, kind, driver_exec):
        """Pick the driver path: explicit argument, else remembered, else default."""
        if driver_exec is not None:
            return driver_exec
        remembered = getattr(self, kind + "_driver_path", None)
        if remembered is not None:
            return remembered
        return self._default_driver_exec(kind)

    def _listening_port(self, driver_basename):
        """Return the local port a process named *driver_basename* listens on.

        Returns ``None`` when no such listening process is found.
        """
        for ps in psutil.process_iter():
            try:
                if ps.name() != driver_basename:
                    continue
                for conn in ps.connections():
                    if conn.status == "LISTEN":
                        return conn.laddr[1]
            except psutil.Error:
                # The process exited or may not be inspected; skip it
                # instead of aborting the whole scan.
                continue
        return None

    def new_chrome_browser(self, driver_exec=None):
        """Create a Chrome driver and store it in ``self.driver``.

        *driver_exec* is the chromedriver path. When omitted, the path used
        by the previous call is reused, or the platform default name if
        there was no previous call.
        """
        self.info = "chrome"
        self.chrome_driver_path = self._resolve_driver_exec("chrome", driver_exec)
        self.chrome_basename = os.path.basename(self.chrome_driver_path)
        # A single scan both detects the running server and yields its port,
        # so the process cannot vanish between the check and the lookup.
        port = self._listening_port(self.chrome_basename)
        if port is not None:
            self.driver = webdriver.Remote(
                command_executor="http://127.0.0.1:" + str(port),
                desired_capabilities=webdriver.DesiredCapabilities.CHROME)
        else:
            self.driver = webdriver.Chrome(self.chrome_driver_path)
        return self.driver

    def new_ie_browser(self, driver_exec=None):
        """Create an Internet Explorer driver and store it in ``self.driver``.

        *driver_exec* is the IEDriverServer path; see ``new_chrome_browser``
        for how a missing path is resolved.
        """
        self.info = "ie"
        self.ie_driver_path = self._resolve_driver_exec("ie", driver_exec)
        self.ie_basename = os.path.basename(self.ie_driver_path)
        port = self._listening_port(self.ie_basename)
        if port is not None:
            self.driver = webdriver.Remote(
                command_executor="http://127.0.0.1:" + str(port),
                desired_capabilities=webdriver.DesiredCapabilities.INTERNETEXPLORER)
        else:
            self.driver = webdriver.Ie(self.ie_driver_path)
        return self.driver

    def new_firefox_browser(self, executable_path=None):
        """Create a Firefox driver and store it in ``self.driver``.

        *executable_path* is the geckodriver path; see
        ``new_chrome_browser`` for how a missing path is resolved.
        """
        self.info = "firefox"
        # Work on a copy so the shared capabilities template is never
        # modified through this instance.
        caps = DesiredCapabilities.FIREFOX.copy()
        self.firefox_driver_path = self._resolve_driver_exec("firefox", executable_path)
        self.firefox_basename = os.path.basename(self.firefox_driver_path)
        port = self._listening_port(self.firefox_basename)
        if port is not None:
            self.driver = webdriver.Remote(
                command_executor="http://127.0.0.1:" + str(port),
                desired_capabilities=caps)
        else:
            self.driver = webdriver.Firefox(
                desired_capabilities=caps,
                executable_path=self.firefox_driver_path)
        return self.driver

    def new_opera_browser(self, driver_exec=None):
        """Create an Opera driver and store it in ``self.driver``.

        *driver_exec* is the operadriver path; see ``new_chrome_browser``
        for how a missing path is resolved.
        """
        self.info = "opera"
        self.opera_driver_path = self._resolve_driver_exec("opera", driver_exec)
        self.opera_basename = os.path.basename(self.opera_driver_path)
        port = self._listening_port(self.opera_basename)
        if port is not None:
            self.driver = webdriver.Remote(
                command_executor="http://127.0.0.1:" + str(port),
                desired_capabilities=webdriver.DesiredCapabilities.OPERA)
        else:
            self.driver = webdriver.Opera(
                desired_capabilities=webdriver.DesiredCapabilities.OPERA,
                executable_path=self.opera_driver_path)
        return self.driver

    def driver(self):
        # NOTE: on instances this method is shadowed by the ``driver``
        # attribute assigned in __init__ and in the new_*_browser methods,
        # so ``browser.driver`` yields the driver object itself. The method
        # is kept only so that the class attribute continues to exist.
        return self.driver

    def is_server_executed(self, driver_basename):
        """Return True when a process named *driver_basename* is listening on a port."""
        return self._listening_port(driver_basename) is not None

    def port(self, driver_basename):
        """Return, as a string, the port a *driver_basename* process listens on.

        Falls back to ``"9999"`` when no listening process is found.
        """
        found = self._listening_port(driver_basename)
        return str(9999) if found is None else str(found)

    def new_browser(self):
        """Create another driver of the kind requested last (``self.info``).

        Returns ``None`` when no browser has been requested yet.
        """
        creators = {
            "chrome": self.new_chrome_browser,
            "ie": self.new_ie_browser,
            "opera": self.new_opera_browser,
            "firefox": self.new_firefox_browser,
        }
        creator = creators.get(self.info)
        if creator is None:
            return None
        return creator()
|
|
|
|
|
|
class SendtoDB:
    """Insert rows into, and delete rows from, the ``data_<db_num>`` MySQL table.

    Statements are built as SQL strings with values escaped by the
    connection. A statement that fails with a lost-connection error is
    retried once on a fresh connection.
    """

    pymysql = __import__('pymysql.cursors')
    # Runs of characters in U+1F300-U+1F64F and U+1F680-U+1F6FF; each run is
    # replaced by one space before a value is inserted.
    # NOTE(review): presumably needed because the 'utf8' connection charset
    # cannot carry these characters; other non-BMP characters are not
    # covered by this pattern - confirm whether that is intended.
    re_emoji = re.compile(u'[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF]+', re.UNICODE)

    # Error codes treated as "connection lost": MySQL client errors 2006
    # (server has gone away) and 2013 (lost connection during query).
    # NOTE(review): 10054 looks like the Winsock "connection reset" code -
    # confirm.
    _LOST_CONNECTION_CODES = (2013, 2006, 10054)

    def __init__(self, db_num=0, host='bigbird.iptime.org', user='admin',
                 passwd='admin123', db='concepters', charset='utf8'):
        """Open the database connection.

        *db_num* selects the target table ``data_<db_num>``. The remaining
        keyword arguments are the connection settings; their defaults are
        the values this class has always used.
        """
        # SECURITY NOTE(review): the default credentials are hard-coded in
        # source; prefer passing them in from configuration or environment.
        self._conn_params = {
            "host": host,
            "user": user,
            "passwd": passwd,
            "db": db,
            "charset": charset,
        }
        self.conn = self._connect()
        self.db_num = db_num

    def _connect(self):
        """Return a new connection that yields rows as dictionaries."""
        return self.pymysql.connect(cursorclass=self.pymysql.cursors.DictCursor,
                                    **self._conn_params)

    def _run(self, query):
        """Execute *query* on the current connection and commit."""
        with self.conn.cursor() as cursor:
            cursor.execute(query)
        self.conn.commit()

    def _run_with_retry(self, query):
        """Execute *query*; on a lost connection reconnect and run it once more.

        Other MySQL errors are printed and swallowed (best effort). Any
        other exception is printed together with the query and swallowed.
        An exception raised by the retry itself propagates to the caller.
        """
        try:
            self._run(query)
        except self.pymysql.err.MySQLError as e:
            # OperationalError is a MySQLError subclass, so this single
            # handler covers both cases.
            print(e)
            if e.args and e.args[0] in self._LOST_CONNECTION_CODES:
                print("connection lost. try to reconnection")
                self.conn = self._connect()
                self._run(query)
        except Exception as e:
            print(e, flush=True)
            print(query, flush=True)

    def set_db(self, db_num):
        """Select the target table ``data_<db_num>``."""
        self.db_num = str(db_num)

    def make_insert_query_backup(self, dictionary):
        """Build an INSERT statement from *dictionary* (column -> value).

        Integers are written as-is; every other value is escaped by the
        connection. Unlike ``make_insert_query`` the values are neither
        converted to ``str`` first nor stripped of emoji.
        """
        columns = ",".join(dictionary.keys())
        values = ",".join(
            str(value) if type(value) is int else self.conn.escape(value)
            for value in dictionary.values()
        )
        return ("insert into data_" + str(self.db_num)
                + " (" + columns + ") values(" + values + ")")

    def make_insert_query(self, dictionary):
        """Build an INSERT statement from *dictionary* (column -> value).

        Integers are written as-is. Every other value is converted with
        ``str``, has each ``re_emoji`` match replaced by a space, and is
        then escaped by the connection.
        """
        columns = []
        values = []
        for key, val in dictionary.items():
            columns.append(key)
            # ``type(...) is int`` on purpose: bool values must keep taking
            # the escaped-string path, as they always did.
            if type(val) is int:
                values.append(str(val))
            else:
                values.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
        return ("insert into data_" + str(self.db_num)
                + " (" + ",".join(columns) + ") values (" + ",".join(values) + ")")

    def send_body(self, body):
        """Insert the record *body* (a column -> value mapping); no-op when it is empty."""
        if not body:
            return
        self.conn_check()
        self._run_with_retry(self.make_insert_query(body))

    def send_reply(self, reply):
        """Insert every record of the iterable *reply*; no-op when it is empty."""
        if not reply:
            return
        for record in reply:
            self.send_body(record)

    def conn_check(self):
        """Reconnect when the current connection is no longer open."""
        if not self.conn.open:
            self.conn = self._connect()

    def close(self):
        """Close the database connection."""
        self.conn.close()

    def delete_url(self, url):
        """Delete the rows of ``data_<db_num>`` whose ``article_url`` equals *url*."""
        self.conn_check()
        query = ("delete from data_" + str(self.db_num)
                 + " where article_url = " + self.conn.escape(str(url)))
        self._run_with_retry(query)
|
|
|
|
|
|
class CrawlInit:
    """Load crawl settings from the ``keyword`` and ``navercafelist`` tables.

    ``get_keyword_parameters`` must be called before any of the accessor
    methods (``start_day``, ``authorship`` and so on), because they read
    the row it stores in ``self.params``.
    """

    pymysql = __import__('pymysql.cursors')

    def __init__(self, before_day=0, host='bigbird.iptime.org', user='admin',
                 passwd='admin123', db='concepters', charset='utf8'):
        """Open the database connection.

        *before_day* is stored for callers. The remaining keyword arguments
        are the connection settings; their defaults are the values this
        class has always used.
        """
        # SECURITY NOTE(review): the default credentials are hard-coded in
        # source; prefer passing them in from configuration or environment.
        self.conn = self.pymysql.connect(host=host,
                                         user=user, passwd=passwd,
                                         db=db, charset=charset,
                                         cursorclass=self.pymysql.cursors.DictCursor)
        self.urls = dict()
        self.before_day = before_day

    def set_before_day(self, before_day):
        """Store *before_day*; a ``str`` is converted with ``int()``.

        Values that are neither ``str`` nor ``int`` are ignored.
        """
        if type(before_day) is str:
            before_day = int(before_day)
        if type(before_day) is int:
            self.before_day = before_day

    def set_until_page(self, until_page):
        """Store *until_page*; a ``str`` is converted with ``int()``.

        Values that are neither ``str`` nor ``int`` are ignored.
        """
        if type(until_page) is str:
            until_page = int(until_page)
        if type(until_page) is int:
            self.until_page = until_page

    def get_keyword_parameters(self, keyword_id):
        """Fetch the ``keyword`` row with id *keyword_id* into ``self.params``.

        Returns the row as the cursor yields it. On any database error the
        error is printed and the process exits with status 1.
        """
        # The id is passed as a query parameter so the driver escapes it,
        # instead of being spliced into the SQL text.
        query = "select * from keyword where id = %s"
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query, (keyword_id,))
                self.params = cursor.fetchone()
            return self.params
        except Exception as e:
            print(e)
            sys.stdout.flush()
            sys.exit(1)

    def get_naver_cafe_list(self):
        """Return a ``url -> clubid`` mapping read from ``navercafelist``.

        When the keyword's ``authorship`` value is ``None``, empty or ``0``
        every row is read; otherwise only rows whose ``group_num`` equals
        it. The result is merged into, and returned as, ``self.urls``. On
        any database error the error is printed and the process exits with
        status 1.
        """
        query = "select url, clubid from navercafelist"
        args = None
        authorship = self.authorship()
        # Compare the value itself (not the bound method) and never call
        # len() on it, so an integer authorship works.
        unfiltered = authorship is None or authorship == 0 or authorship == ""
        if not unfiltered:
            query += " where group_num = %s"
            args = (authorship,)
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query, args)
                for row in cursor.fetchall():
                    self.urls[row["url"]] = row["clubid"]
        except Exception as e:
            print(e)
            sys.stdout.flush()
            sys.exit(1)
        return self.urls

    def start_day(self):
        """Return the ``start`` column of the loaded keyword row."""
        return self.params["start"]

    def end_day(self):
        """Return the ``end`` column of the loaded keyword row."""
        return self.params["end"]

    def keyword_id(self):
        """Return the ``id`` column of the loaded keyword row."""
        return self.params["id"]

    def realtime(self):
        """Return the ``realtime`` column of the loaded keyword row."""
        return self.params["realtime"]

    def searches(self):
        """Return the ``searches`` column of the loaded keyword row."""
        return self.params["searches"]

    def authorship(self):
        """Return the ``authorship`` column of the loaded keyword row."""
        return self.params["authorship"]

    def platform(self):
        """Return the ``platform`` column of the loaded keyword row."""
        return self.params["platform"]

    def is_realtime(self):
        """Return False when the ``realtime`` value prints as ``'0'``, else True."""
        return str(self.realtime()) != '0'

    def euc_kr(self, keyword):
        """Percent-encode every byte of *keyword* in EUC-KR; a space becomes ``+``.

        Each byte is written as ``%XX`` with two upper-case hex digits.
        """
        return "".join(
            "+" if byte == 0x20 else "%{:02X}".format(byte)
            for byte in keyword.encode("euc_kr")
        )

    def utf8(self, keyword):
        """Percent-encode every byte of *keyword* in UTF-8.

        Each byte, including a space, is written as ``%XX`` with two
        upper-case hex digits.
        """
        return "".join("%{:02X}".format(byte) for byte in keyword.encode("utf-8"))

    def disconnect(self):
        """Close the database connection."""
        self.conn.close()

    def date_to_str(self, arg_date):
        """Format *arg_date* as ``YYYY-MM-DD``."""
        return arg_date.strftime("%Y-%m-%d")
|