Files
clients/WebBasedCrawler/base/baseclasses.py

480 lines
17 KiB
Python

#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import sys
import time
import os
import psutil
import threading
import re
import pymysql
import random
import inspect
from time import localtime, strftime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def is_debugger_attached():
    """Report whether the PyDev debugger appears on the current call stack.

    Returns:
        True if any frame on the stack belongs to a file whose path ends with
        ``pydevd.py``; False otherwise, including when the stack cannot be
        inspected.
    """
    try:
        # Index 1 of each stack record is the frame's source file name.
        return any(frame[1].endswith("pydevd.py") for frame in inspect.stack())
    except Exception:
        # Detection is best-effort: a failure while inspecting the stack is
        # reported as "no debugger".  KeyboardInterrupt and SystemExit are
        # deliberately not swallowed here.
        return False


# Evaluated once at import time; printl() and printd() consult this flag.
is_debug = is_debugger_attached()
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """``print`` wrapper that prefixes the caller's location while debugging.

    When the module-level ``is_debug`` flag is false this simply forwards to
    ``print``.  Otherwise the output is prefixed with
    ``<file path>(<line number>) :`` taken from the calling frame; when the
    direct caller is a function named ``printd`` the frame one level further
    out is reported instead.

    Args:
        *objects: Values to print.
        sep, end, file, flush: Forwarded unchanged to ``print``.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    outer_frames = inspect.getouterframes(inspect.currentframe(), 2)
    # Index 3 of a frame record is the function name; skip the printd wrapper
    # so the reported location is the real call site.
    depth = 2 if outer_frames[1][3] == 'printd' else 1
    caller = outer_frames[depth]
    source_path = caller[1]
    source_line = caller[2]
    try:
        prefixed = ('{}({}) :'.format(source_path, source_line),) + objects
        print(*prefixed, sep=sep, end=end, file=file, flush=flush)
    except Exception as err:
        # Printing must never break the caller; show the failure instead.
        print(err)
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print through ``printl`` only when the module-level ``is_debug`` flag is set.

    Args:
        *objects: Values to print.
        sep, end, file, flush: Forwarded to ``printl`` (and from there to
            ``print``).
    """
    if is_debug:
        # Unpack the values and pass the options by keyword.  printl's
        # options are keyword-only, so positional forwarding would make it
        # print the tuple itself followed by sep/end/file/flush as values.
        printl(*objects, sep=sep, end=end, file=file, flush=flush)
def print_and_flush(string):
    """Print ``string`` to standard output and flush the stream right away."""
    print(string, flush=True)
def fcntwait(n):
    """Sleep for ``n`` seconds in the calling thread (thread target used by ``wait``)."""
    time.sleep(n)
def wait(n):
    """Pause for ``n`` seconds.

    The sleep itself runs in a worker thread executing ``fcntwait``; this
    function blocks until that thread has finished.
    """
    sleeper = threading.Thread(target=fcntwait, args=(n,))
    sleeper.start()
    sleeper.join()
def insert_log(msg):
    """Append a timestamped message to this process's daily log file.

    The file is named ``<YYYY_MM_DD>_<pid>.log`` and is opened relative to the
    current working directory.  The text written is ``[<timestamp>] <msg>``;
    no trailing newline is added, so callers supply their own.

    Args:
        msg: Text to append (must be a ``str``, it is concatenated as-is).
    """
    log_name = "_".join((strftime("%Y_%m_%d", localtime()), str(os.getpid()))) + ".log"
    stamp = strftime("%Y_%m_%d %H:%M:%S", localtime())
    entry = "[" + stamp + "] " + msg
    with open(log_name, "a") as log_file:
        log_file.write(entry)
        log_file.flush()
def enter_element(element):
    """Send ``Keys.NULL`` and then ``Keys.ENTER`` to ``element``, then wait 2 seconds.

    Args:
        element: Object exposing ``send_keys`` (presumably a Selenium
            WebElement; verify against callers).
    """
    for key in (Keys.NULL, Keys.ENTER):
        element.send_keys(key)
    wait(2)
def find_element_by_css_selector(driver, tag, time=0):
    """Wait for an element matching the CSS selector ``tag`` and return it.

    Args:
        driver: Object handed to ``WebDriverWait`` as the search context.
        tag: CSS selector string.
        time: Timeout handed to ``WebDriverWait``.  Note that this parameter
            shadows the ``time`` module inside the function.

    Returns:
        The value produced by ``EC.presence_of_element_located``.
    """
    locator = (By.CSS_SELECTOR, tag)
    waiter = WebDriverWait(driver, time)
    return waiter.until(EC.presence_of_element_located(locator))
def find_elements_by_css_selector(driver, tag, time=0):
    """Wait for elements matching the CSS selector ``tag`` and return them.

    Args:
        driver: Object handed to ``WebDriverWait`` as the search context.
        tag: CSS selector string.
        time: Timeout handed to ``WebDriverWait``.  Note that this parameter
            shadows the ``time`` module inside the function.

    Returns:
        The value produced by ``EC.presence_of_all_elements_located``.
    """
    locator = (By.CSS_SELECTOR, tag)
    waiter = WebDriverWait(driver, time)
    return waiter.until(EC.presence_of_all_elements_located(locator))
def find_element_by_xpath(driver, tag, time=0):
    """Wait for an element matching the XPath expression ``tag`` and return it.

    Args:
        driver: Object handed to ``WebDriverWait`` as the search context.
        tag: XPath expression string.
        time: Timeout handed to ``WebDriverWait``.  Note that this parameter
            shadows the ``time`` module inside the function.

    Returns:
        The value produced by ``EC.presence_of_element_located``.
    """
    locator = (By.XPATH, tag)
    waiter = WebDriverWait(driver, time)
    return waiter.until(EC.presence_of_element_located(locator))
def find_elements_by_xpath(driver, tag, time=0):
    """Wait for elements matching the XPath expression ``tag`` and return them.

    Args:
        driver: Object handed to ``WebDriverWait`` as the search context.
        tag: XPath expression string.
        time: Timeout handed to ``WebDriverWait``.  Note that this parameter
            shadows the ``time`` module inside the function.

    Returns:
        The value produced by ``EC.presence_of_all_elements_located``.
    """
    locator = (By.XPATH, tag)
    waiter = WebDriverWait(driver, time)
    return waiter.until(EC.presence_of_all_elements_located(locator))
class Browser:
    """Creates Selenium WebDriver instances and remembers the last one made.

    Attributes:
        driver: The most recently created (or injected) driver, or None.
        info: Name of the last browser requested ("chrome", "ie", "opera",
            "firefox"), or "" if none has been requested yet.
    """

    def __init__(self, driver=None):
        """Store an optional pre-built ``driver``; no browser is started."""
        self.driver = driver
        self.info = ""

    def get_new_driver(self, name):
        """Create a driver for the browser called ``name`` using default executables.

        windows system:
            name = chrome, ie, opera, firefox
            default driver_exec: chromedriver.exe, IEDriverServer.exe,
            operadriver.exe, geckodriver.exe
        linux system:
            name = chrome, opera, firefox
            default driver_exec: chromedriver, operadriver, geckodriver

        Returns:
            The new driver, or None when ``name`` is not supported on the
            current platform.
        """
        on_windows = sys.platform == "win32"
        suffix = ".exe" if on_windows else ""
        if name == "chrome":
            return self.new_chrome_browser(driver_exec="chromedriver" + suffix)
        if name == "ie" and on_windows:
            return self.new_ie_browser(driver_exec="IEDriverServer.exe")
        if name == "opera":
            return self.new_opera_browser(driver_exec="operadriver" + suffix)
        if name == "firefox":
            return self.new_firefox_browser(executable_path="geckodriver" + suffix)
        return None

    def new_chrome_browser(self, driver_exec=None):
        """Create a Chrome driver and store it in ``self.driver``.

        Args:
            driver_exec: Path of the chromedriver executable.  When None the
                path remembered from an earlier call is reused; an
                AttributeError results if there was no earlier call.

        Returns:
            The new driver.  If a process with the executable's base name is
            already listening, a Remote driver is attached to its port;
            otherwise a new chromedriver is launched.
        """
        self.info = "chrome"
        if driver_exec is not None:
            self.chrome_driver_path = driver_exec
            self.chrome_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.chrome_basename):
            port = self.port(self.chrome_basename)
            self.driver = webdriver.Remote(command_executor="http://127.0.0.1:" + port,
                                           desired_capabilities=webdriver.DesiredCapabilities.CHROME)
        else:
            self.driver = webdriver.Chrome(self.chrome_driver_path)
        return self.driver

    def new_ie_browser(self, driver_exec=None):
        """Create an Internet Explorer driver and store it in ``self.driver``.

        Args:
            driver_exec: Path of the IEDriverServer executable.  When None the
                path remembered from an earlier call is reused.

        Returns:
            The new driver (Remote if a matching server is already listening,
            otherwise a freshly launched one).
        """
        self.info = "ie"
        if driver_exec is not None:
            self.ie_driver_path = driver_exec
            self.ie_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.ie_basename):
            port = self.port(self.ie_basename)
            self.driver = webdriver.Remote("http://127.0.0.1:" + port,
                                           webdriver.DesiredCapabilities.INTERNETEXPLORER)
        else:
            self.driver = webdriver.Ie(self.ie_driver_path)
        return self.driver

    def new_firefox_browser(self, executable_path=None):
        """Create a Firefox driver and store it in ``self.driver``.

        Unlike the other browsers, an already running geckodriver is never
        reused: a new Firefox driver is always started.

        Args:
            executable_path: Path of the geckodriver executable.  When None
                the path remembered from an earlier call is reused.

        Returns:
            The new driver.
        """
        self.info = "firefox"
        caps = DesiredCapabilities.FIREFOX
        if executable_path is not None:
            self.firefox_driver_path = executable_path
            self.firefox_basename = os.path.basename(executable_path)
        self.driver = webdriver.Firefox(capabilities=caps,
                                        executable_path=self.firefox_driver_path)
        return self.driver

    def new_opera_browser(self, driver_exec=None):
        """Create an Opera driver and store it in ``self.driver``.

        Args:
            driver_exec: Path of the operadriver executable.  When None the
                path remembered from an earlier call is reused.

        Returns:
            The new driver (Remote if a matching server is already listening,
            otherwise a freshly launched one).
        """
        self.info = "opera"
        if driver_exec is not None:
            self.opera_driver_path = driver_exec
            self.opera_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.opera_basename):
            port = self.port(self.opera_basename)
            self.driver = webdriver.Remote(command_executor="http://127.0.0.1:" + port,
                                           desired_capabilities=webdriver.DesiredCapabilities.OPERA)
        else:
            self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA,
                                          executable_path=self.opera_driver_path)
        return self.driver

    def driver(self):
        """Return the stored driver.

        NOTE: ``__init__`` assigns an instance attribute with the same name,
        so on instances ``browser.driver`` is the driver object itself and
        this method is not reachable through attribute lookup.  It is kept
        only so the class namespace stays unchanged.
        """
        return self.driver

    def _listening_port(self, driver_basename):
        """Return the first LISTEN port of a process named ``driver_basename``, else None."""
        for proc in psutil.process_iter():
            try:
                if proc.name() != driver_basename:
                    continue
                for conn in proc.connections():
                    if conn.status == "LISTEN":
                        return conn.laddr[1]
            except psutil.Error:
                # The process exited or may not be inspected; it cannot be
                # the server we are looking for, so move on to the next one.
                continue
        return None

    def is_server_executed(self, driver_basename):
        """Return True if a process named ``driver_basename`` has a listening socket."""
        return self._listening_port(driver_basename) is not None

    def port(self, driver_basename):
        """Return, as a string, the port a ``driver_basename`` process listens on.

        Falls back to "9999" when no such listening process is found.
        """
        found = self._listening_port(driver_basename)
        if found is None:
            return str(9999)
        return str(found)

    def new_browser(self):
        """Create a new driver of the kind recorded in ``self.info``.

        The driver executable remembered by the matching ``new_*_browser``
        method is reused.

        Returns:
            The new driver, or None if no supported browser was requested
            before.
        """
        if self.info == "chrome":
            return self.new_chrome_browser()
        elif self.info == "ie":
            return self.new_ie_browser()
        elif self.info == "opera":
            return self.new_opera_browser()
        elif self.info == "firefox":
            return self.new_firefox_browser()
        else:
            return None
class SendtoDB:
    """Inserts and deletes rows in the ``data_<db_num>`` MySQL table.

    Attributes:
        conn: Open pymysql connection (re-created when it is found closed or
            when a lost-connection error is reported).
        db_num: Suffix of the target table name.
    """

    # __import__ with a dotted name returns the top-level package, so this is
    # the ``pymysql`` module with its ``cursors`` submodule loaded.
    pymysql = __import__('pymysql.cursors')
    # Runs of code points in U+1F300-U+1F64F and U+1F680-U+1F6FF; they are
    # replaced by a space before a value is inserted.
    re_emoji = re.compile(u'[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF]+', re.UNICODE)
    # Error codes treated as "lost connection to server"; they trigger one
    # reconnect-and-retry.
    _LOST_CONNECTION_CODES = (2013, 2006, 10054)

    def __init__(self, db_num=0):
        """Connect to the database and remember the table suffix ``db_num``."""
        self.conn = self._connect()
        self.db_num = db_num

    def _connect(self):
        """Open and return a new connection using the fixed server settings."""
        # NOTE(review): host and credentials are hard-coded here; consider
        # moving them to configuration.
        return self.pymysql.connect(host='bigbird.iptime.org',
                                    user='admin', passwd='admin123',
                                    db='concepters', charset='utf8',
                                    cursorclass=self.pymysql.cursors.DictCursor)

    def _execute_and_commit(self, query):
        """Run ``query`` on the current connection and commit it."""
        with self.conn.cursor() as cursor:
            cursor.execute(query)
        self.conn.commit()

    def _execute_with_retry(self, query):
        """Run ``query``; on a lost-connection error reconnect and retry once.

        Other MySQL errors are printed and swallowed.  Any non-MySQL error is
        printed together with the query and swallowed.  An error raised by the
        reconnect or by the retry itself propagates to the caller.
        """
        try:
            self._execute_and_commit(query)
        except self.pymysql.err.MySQLError as e:
            # OperationalError is a MySQLError subclass, so this single
            # handler covers both cases.
            print(e)
            if e.args[0] in self._LOST_CONNECTION_CODES:
                print("connection lost. try to reconnection")
                self.conn = self._connect()
                self._execute_and_commit(query)
        except Exception as e:
            print(e, flush=True)
            print(query, flush=True)

    def set_db(self, db_num):
        """Change the table suffix; the value is stored as a string."""
        self.db_num = str(db_num)

    def make_insert_query_backup(self, dictionary):
        """Build an INSERT statement for ``dictionary`` without emoji filtering.

        Args:
            dictionary: Mapping of column name to value.  Keys are inserted
                verbatim as column names; ``int`` values are written as-is
                and every other value goes through ``conn.escape``.

        Returns:
            The SQL text ``insert into data_<db_num> (<cols>) values(<vals>)``.
        """
        columns = ",".join(dictionary.keys())
        values = ",".join(
            str(value) if type(value) is int else self.conn.escape(value)
            for value in dictionary.values()
        )
        return "insert into data_" + str(self.db_num) + " (" + columns + ") values(" + values + ")"

    def make_insert_query(self, dictionary):
        """Build an INSERT statement for ``dictionary``.

        Args:
            dictionary: Mapping of column name to value.  Keys are inserted
                verbatim as column names.  ``int`` values are written as-is;
                every other value is converted with ``str``, has emoji runs
                replaced by a space and is then escaped.

        Returns:
            The SQL text ``insert into data_<db_num> (<cols>) values (<vals>)``.
        """
        columns = []
        values = []
        for key, val in dictionary.items():
            columns.append(key)
            if type(val) is int:
                values.append(str(val))
            else:
                # NOTE(review): ``str(None)`` stores the text 'None' rather
                # than SQL NULL -- confirm this is what callers expect.
                values.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
        return ("insert into data_" + str(self.db_num) + " ("
                + ",".join(columns) + ") values (" + ",".join(values) + ")")

    def send_body(self, body):
        """Insert one row described by the mapping ``body``; empty input is ignored."""
        if not body:
            return
        self.conn_check()
        query = self.make_insert_query(body)
        self._execute_with_retry(query)

    def send_reply(self, reply):
        """Insert every mapping in the iterable ``reply``; empty input is ignored."""
        if not reply:
            return
        for row in reply:
            self.send_body(row)

    def conn_check(self):
        """Reconnect if the current connection reports itself as closed."""
        if not self.conn.open:
            self.conn = self._connect()

    def close(self):
        """Close the database connection."""
        self.conn.close()

    def delete_url(self, url):
        """Delete every row of the current table whose ``article_url`` equals ``url``."""
        query = "delete from data_" + str(self.db_num) + " where article_url = " + self.conn.escape(str(url))
        self.conn_check()
        self._execute_with_retry(query)
class CrawlInit:
    """Loads crawl settings for one keyword from the database.

    Attributes:
        conn: Open pymysql connection.
        urls: Mapping of cafe URL to club id, filled by ``get_naver_cafe_list``.
        before_day: Value supplied by the caller (see ``set_before_day``).
        params: Row of the ``keyword`` table; only available after
            ``get_keyword_parameters`` has been called.
        until_page: Only available after ``set_until_page`` has been called.
    """

    # __import__ with a dotted name returns the top-level package, so this is
    # the ``pymysql`` module with its ``cursors`` submodule loaded.
    pymysql = __import__('pymysql.cursors')

    def __init__(self, before_day=0):
        """Connect to the database and store ``before_day``."""
        # NOTE(review): host and credentials are hard-coded here; consider
        # moving them to configuration.
        self.conn = self.pymysql.connect(host='bigbird.iptime.org',
                                         user='admin', passwd='admin123',
                                         db='concepters', charset='utf8',
                                         cursorclass=self.pymysql.cursors.DictCursor)
        self.urls = dict()
        self.before_day = before_day

    def set_before_day(self, before_day):
        """Store ``before_day`` as an int; values that are neither str nor int are ignored."""
        if isinstance(before_day, (str, int)):
            self.before_day = int(before_day)

    def set_until_page(self, until_page):
        """Store ``until_page`` as an int; values that are neither str nor int are ignored."""
        if isinstance(until_page, (str, int)):
            self.until_page = int(until_page)

    def get_keyword_parameters(self, keyword_id):
        """Fetch the ``keyword`` row with id ``keyword_id`` into ``self.params``.

        Returns:
            The fetched row (a dict from the DictCursor), or None when no row
            matches.

        Raises:
            SystemExit: With status 1 after printing the error, if the query
                fails.
        """
        # The id is passed as a query parameter so the driver escapes it.
        query = "select * from keyword where id = %s"
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query, (keyword_id,))
                self.params = cursor.fetchone()
            return self.params
        except Exception as e:
            print(e)
            sys.stdout.flush()
            sys.exit(1)

    @staticmethod
    def _has_group(group):
        """Return True when ``group`` is a usable group number (not None, 0 or empty)."""
        if group is None or group == 0:
            return False
        try:
            return len(group) != 0
        except TypeError:
            # Non-zero numbers have no length; they are valid group numbers.
            return True

    def get_naver_cafe_list(self):
        """Load cafe URLs and club ids from ``navercafelist`` into ``self.urls``.

        When the keyword's ``authorship`` value is set (not None, 0 or
        empty), only rows whose ``group_num`` equals it are loaded.

        Returns:
            ``self.urls``, a mapping of URL to club id.

        Raises:
            SystemExit: With status 1 after printing the error, if the query
                fails.
        """
        query = "select url, clubid from navercafelist"
        args = None
        group = self.authorship()
        if self._has_group(group):
            query += " where group_num = %s"
            args = (group,)
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query, args)
                for row in cursor.fetchall():
                    self.urls[row["url"]] = row["clubid"]
        except Exception as e:
            print(e)
            sys.stdout.flush()
            sys.exit(1)
        return self.urls

    def start_day(self):
        """Return the ``start`` column of the loaded keyword row."""
        return self.params["start"]

    def end_day(self):
        """Return the ``end`` column of the loaded keyword row."""
        return self.params["end"]

    def keyword_id(self):
        """Return the ``id`` column of the loaded keyword row."""
        return self.params["id"]

    def realtime(self):
        """Return the ``realtime`` column of the loaded keyword row."""
        return self.params["realtime"]

    def searches(self):
        """Return the ``searches`` column of the loaded keyword row."""
        return self.params["searches"]

    def authorship(self):
        """Return the ``authorship`` column of the loaded keyword row."""
        return self.params["authorship"]

    def platform(self):
        """Return the ``platform`` column of the loaded keyword row."""
        return self.params["platform"]

    def is_realtime(self):
        """Return False when the ``realtime`` value prints as '0', True otherwise."""
        return str(self.realtime()) != '0'

    def euc_kr(self, keyword):
        """Percent-encode every byte of ``keyword`` in EUC-KR.

        Spaces become ``+``; every other byte becomes ``%XX`` with two
        upper-case hex digits.
        """
        pieces = []
        for byte in keyword.encode("euc_kr"):
            # Two-digit formatting keeps bytes below 0x10 valid (%0A, not %A).
            pieces.append("+" if byte == 0x20 else "%{:02X}".format(byte))
        return "".join(pieces)

    def utf8(self, keyword):
        """Percent-encode every byte of ``keyword`` in UTF-8 as ``%XX``.

        Two upper-case hex digits are always emitted, including for bytes
        below 0x10.
        """
        return "".join("%{:02X}".format(byte) for byte in keyword.encode("utf-8"))

    def disconnect(self):
        """Close the database connection."""
        self.conn.close()

    def date_to_str(self, arg_date):
        """Format ``arg_date`` as ``YYYY-MM-DD`` using its ``strftime`` method."""
        return arg_date.strftime("%Y-%m-%d")