#-*- coding: utf-8 -*-
'''
Shared helpers for the crawler scripts: debug-aware printing, simple
logging, Selenium element lookup / browser start-up, and MySQL access.

Created on 2015. 12. 8.

@author: cococo
'''
import sys
import time
import os
import psutil
import threading
import re
import pymysql
import random
import inspect

from time import localtime, strftime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


# Connection settings shared by SendtoDB and CrawlInit.
# NOTE(review): credentials are hard-coded in source; they should be moved to
# configuration / environment before this file is shared any further.
_DB_CONNECT_KWARGS = {
    'host': 'bigbird.iptime.org',
    'user': 'admin',
    'passwd': 'admin123',
    'db': 'concepters',
    'charset': 'utf8',
}

# Error numbers after which a write is retried on a fresh connection.
# 2006 / 2013 are the MySQL client "server has gone away" / "lost connection"
# codes; 10054 is presumably the Windows socket reset code - TODO confirm.
_LOST_CONNECTION_CODES = (2013, 2006, 10054)


def _open_db_connection(driver):
    """Open a DictCursor connection using *driver* (the ``pymysql`` package)."""
    return driver.connect(cursorclass=driver.cursors.DictCursor,
                          **_DB_CONNECT_KWARGS)


def is_debugger_attached():
    """Return True when a frame of ``pydevd.py`` is on the current call stack."""
    for frame in inspect.stack():
        if frame[1].endswith("pydevd.py"):
            return True
    return False


# Evaluated once at import time; printl / printd consult this flag.
is_debug = is_debugger_attached()


def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print like ``print``; under the debugger, prefix ``file(line) :``.

    The prefix describes the caller of ``printl``. When the direct caller is
    ``printd`` the frame one level further out is reported instead, so the
    prefix always points at user code.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    call_frame = inspect.getouterframes(inspect.currentframe(), 2)
    # Index 1 is our caller; skip one more frame if that caller is printd.
    frame_no = 2 if call_frame[1][3] == 'printd' else 1
    file_path = call_frame[frame_no][1]
    line_no = call_frame[frame_no][2]
    try:
        objects = ('{}({}) :'.format(file_path, line_no),) + objects
        print(*objects, sep=sep, end=end, file=file, flush=flush)
    except Exception as e:
        # Debug output must never take the crawler down.
        print(e)


def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print via ``printl`` only while a debugger is attached."""
    if is_debug:
        # Unpack and pass by keyword: sep/end/file/flush are keyword-only in
        # printl, so passing them positionally would print them as objects.
        printl(*objects, sep=sep, end=end, file=file, flush=flush)


def print_and_flush(string):
    """Print *string* and flush stdout immediately."""
    print(string)
    sys.stdout.flush()


def fcntwait(n):
    """Sleep for *n* seconds."""
    time.sleep(n)


def wait(n):
    """Block the calling thread for *n* seconds.

    The previous implementation started a thread running ``fcntwait`` and
    joined it straight away, which is the same as sleeping in place.
    """
    fcntwait(n)


def insert_log(msg):
    """Append *msg*, prefixed with a timestamp, to ``<date>_<pid>.log``.

    The file is created in the current working directory. No newline is
    added; callers supply their own line terminator.
    """
    pid = os.getpid()
    filename = strftime("%Y_%m_%d", localtime()) + "_" + str(pid) + ".log"
    total_msg = "[" + strftime("%Y_%m_%d %H:%M:%S", localtime()) + "] " + msg
    with open(filename, "a") as f:
        f.write(total_msg)
        f.flush()


def enter_element(element):
    """Send NULL then ENTER to *element* and wait two seconds."""
    element.send_keys(Keys.NULL)
    element.send_keys(Keys.ENTER)
    wait(2)


def find_element_by_css_selector(driver, tag, time=0):
    """Wait up to *time* seconds for one element matching CSS selector *tag*.

    Raises whatever ``WebDriverWait.until`` raises when the wait expires.
    """
    return WebDriverWait(driver, time).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, tag))
    )


def find_elements_by_css_selector(driver, tag, time=0):
    """Wait up to *time* seconds for all elements matching CSS selector *tag*."""
    return WebDriverWait(driver, time).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, tag))
    )


def find_element_by_xpath(driver, tag, time=0):
    """Wait up to *time* seconds for one element matching XPath *tag*."""
    return WebDriverWait(driver, time).until(
        EC.presence_of_element_located((By.XPATH, tag))
    )


def find_elements_by_xpath(driver, tag, time=0):
    """Wait up to *time* seconds for all elements matching XPath *tag*."""
    return WebDriverWait(driver, time).until(
        EC.presence_of_all_elements_located((By.XPATH, tag))
    )


class Browser:
    """Creates Selenium drivers, reusing an already running driver server.

    ``info`` remembers the last browser kind that was requested so that
    ``new_browser`` can open another session of the same kind.
    """

    def __init__(self, driver=None):
        self.driver = driver
        self.info = ""

    def get_new_driver(self, name):
        """
        windows system:
            name = chrome, ie, opera, firefox
            default driver_exec: chromedriver.exe, IEDriverServer.exe,
                                 operadriver.exe, geckodriver.exe
        linux system:
            name = chrome, opera, firefox
            default driver_exec: chromedriver, operadriver, geckodriver

        Returns the new driver, or None when *name* is not supported on the
        current platform.
        """
        on_windows = sys.platform == "win32"
        exe = ".exe" if on_windows else ""
        if name == "chrome":
            return self.new_chrome_browser(driver_exec="chromedriver" + exe)
        if name == "ie" and on_windows:
            return self.new_ie_browser(driver_exec="IEDriverServer.exe")
        if name == "opera":
            return self.new_opera_browser(driver_exec="operadriver" + exe)
        if name == "firefox":
            return self.new_firefox_browser(executable_path="geckodriver" + exe)
        return None

    def new_chrome_browser(self, driver_exec=None):
        """Open a Chrome session and return its driver.

        *driver_exec* is the chromedriver path; when omitted, the path given
        on a previous call is reused (one must have been given before).
        If a chromedriver process is already listening, the session is
        attached to it instead of starting a new server.
        """
        self.info = "chrome"
        if driver_exec is not None:
            self.chrome_driver_path = driver_exec
            self.chrome_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.chrome_basename):
            port = self.port(self.chrome_basename)
            self.driver = webdriver.Remote(
                command_executor="http://127.0.0.1:" + port,
                desired_capabilities=webdriver.DesiredCapabilities.CHROME)
        else:
            self.driver = webdriver.Chrome(self.chrome_driver_path)
        return self.driver

    def new_ie_browser(self, driver_exec=None):
        """Open an Internet Explorer session and return its driver.

        Same reuse rules as ``new_chrome_browser``, for IEDriverServer.
        """
        self.info = "ie"
        if driver_exec is not None:
            self.ie_driver_path = driver_exec
            self.ie_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.ie_basename):
            port = self.port(self.ie_basename)
            self.driver = webdriver.Remote(
                "http://127.0.0.1:" + port,
                webdriver.DesiredCapabilities.INTERNETEXPLORER)
        else:
            self.driver = webdriver.Ie(self.ie_driver_path)
        return self.driver

    def new_firefox_browser(self, executable_path=None):
        """Open a Firefox session and return its driver.

        Unlike the other browsers, a running geckodriver is never reused;
        a new ``webdriver.Firefox`` is always started.
        """
        self.info = "firefox"
        caps = DesiredCapabilities.FIREFOX
        if executable_path is not None:
            self.firefox_driver_path = executable_path
            self.firefox_basename = os.path.basename(executable_path)
        self.driver = webdriver.Firefox(
            capabilities=caps, executable_path=self.firefox_driver_path)
        return self.driver

    def new_opera_browser(self, driver_exec=None):
        """Open an Opera session and return its driver.

        Same reuse rules as ``new_chrome_browser``, for operadriver.
        """
        self.info = "opera"
        if driver_exec is not None:
            self.opera_driver_path = driver_exec
            self.opera_basename = os.path.basename(driver_exec)
        if self.is_server_executed(self.opera_basename):
            port = self.port(self.opera_basename)
            self.driver = webdriver.Remote(
                command_executor="http://127.0.0.1:" + port,
                desired_capabilities=webdriver.DesiredCapabilities.OPERA)
        else:
            self.driver = webdriver.Opera(
                desired_capabilities=webdriver.DesiredCapabilities.OPERA,
                executable_path=self.opera_driver_path)
        return self.driver

    def driver(self):
        """Return the current driver.

        NOTE: ``__init__`` assigns an instance attribute that is also called
        ``driver``, which shadows this method on every normally constructed
        instance. Read the ``driver`` attribute directly; the method is kept
        only so the class namespace stays unchanged.
        """
        return self.driver

    def _listening_port(self, driver_basename):
        """Return the port a process named *driver_basename* listens on.

        Returns None when no such listening process is found. Processes that
        disappear during the scan, or that may not be inspected, are skipped.
        """
        for ps in psutil.process_iter():
            try:
                if ps.name() != driver_basename:
                    continue
                conns = ps.connections()
            except psutil.Error:
                # NoSuchProcess / AccessDenied: not a server we can reuse.
                continue
            for x in conns:
                if x.status == "LISTEN":
                    return x.laddr[1]
        return None

    def is_server_executed(self, driver_basename):
        """Return True if a process named *driver_basename* is listening."""
        return self._listening_port(driver_basename) is not None

    def port(self, driver_basename):
        """Return the listening port of *driver_basename* as a string.

        Falls back to ``"9999"`` when no listening process is found.
        """
        found = self._listening_port(driver_basename)
        return str(9999) if found is None else str(found)

    def new_browser(self):
        """Open another session of the kind recorded in ``info``.

        Returns None when no browser has been requested yet.
        """
        if self.info == "chrome":
            return self.new_chrome_browser()
        if self.info == "ie":
            return self.new_ie_browser()
        if self.info == "opera":
            return self.new_opera_browser()
        if self.info == "firefox":
            return self.new_firefox_browser()
        return None


class SendtoDB:
    """Writes crawled rows into the ``data_<db_num>`` MySQL table."""

    pymysql = __import__('pymysql.cursors')
    re_emoji = re.compile(u'[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF]+', re.UNICODE)

    def __init__(self, db_num=0):
        self.conn = _open_db_connection(self.pymysql)
        self.db_num = db_num

    def set_db(self, db_num):
        """Select the target table suffix (stored as a string)."""
        self.db_num = str(db_num)

    def make_insert_query_backup(self, dictionary):
        """Build an INSERT statement without emoji stripping.

        Values of exact type ``int`` are written bare; everything else goes
        through ``conn.escape``. Column names are used as given.
        """
        columns = ",".join(dictionary.keys())
        values = ",".join(
            # Exact type check on purpose: bool must keep going through
            # escape(), as it always has.
            str(value) if type(value) is int else self.conn.escape(value)
            for value in dictionary.values()
        )
        return ("insert into data_" + str(self.db_num) +
                " (" + columns + ") values(" + values + ")")

    def make_insert_query(self, dictionary):
        """Build an INSERT statement for *dictionary* (column -> value).

        Values of exact type ``int`` are written bare. Every other value is
        converted with ``str``, has emoji runs replaced by a space, and is
        escaped with ``conn.escape``. Column names are used as given.
        """
        key_list = list(dictionary.keys())
        val_list = [
            # Exact type check on purpose: bool is stringified and quoted,
            # exactly as before.
            str(val) if type(val) is int
            else self.conn.escape(self.re_emoji.sub(' ', str(val)))
            for val in dictionary.values()
        ]
        return ("insert into data_" + str(self.db_num) + " (" +
                ",".join(key_list) + ") values (" + ",".join(val_list) + ")")

    def _execute_and_commit(self, query):
        """Run *query* on the current connection and commit."""
        with self.conn.cursor() as cursor:
            cursor.execute(query)
        self.conn.commit()

    def _execute_with_retry(self, query):
        """Run *query*; on a lost connection reconnect and try once more.

        MySQL errors are printed. Errors with other codes are swallowed after
        printing; an error raised by the retry itself propagates to the
        caller. Any other exception is printed together with the query.
        """
        try:
            self._execute_and_commit(query)
        except self.pymysql.err.MySQLError as e:
            # OperationalError is a MySQLError, so one handler covers both.
            print(e)
            if e.args and e.args[0] in _LOST_CONNECTION_CODES:
                print("connection lost. try to reconnection")
                self.conn = _open_db_connection(self.pymysql)
                self._execute_and_commit(query)
        except Exception as e:
            print(e, flush=True)
            print(query, flush=True)

    def send_body(self, body):
        """Insert one row described by the dict *body*; no-op when empty."""
        if not body:
            return
        self.conn_check()
        self._execute_with_retry(self.make_insert_query(body))

    def send_reply(self, reply):
        """Insert every dict in the iterable *reply*; no-op when empty."""
        if not reply:
            return
        for i in reply:
            self.send_body(i)

    def conn_check(self):
        """Reconnect if the current connection is no longer open."""
        if not self.conn.open:
            self.conn = _open_db_connection(self.pymysql)

    def close(self):
        """Close the database connection."""
        self.conn.close()

    def delete_url(self, url):
        """Delete all rows whose ``article_url`` equals *url*."""
        self.conn_check()
        query = ("delete from data_" + str(self.db_num) +
                 " where article_url = " + self.conn.escape(str(url)))
        self._execute_with_retry(query)


class CrawlInit:
    """Loads crawl parameters for a keyword from the database.

    ``get_keyword_parameters`` must be called before any of the accessor
    methods, which read from the row it stores in ``params``.
    """

    pymysql = __import__('pymysql.cursors')

    def __init__(self, before_day=0):
        self.conn = _open_db_connection(self.pymysql)
        self.urls = dict()
        self.before_day = before_day

    def set_before_day(self, before_day):
        """Store *before_day* as an int; other types are ignored."""
        if isinstance(before_day, (str, int)):
            self.before_day = int(before_day)

    def set_until_page(self, until_page):
        """Store *until_page* as an int; other types are ignored."""
        if isinstance(until_page, (str, int)):
            self.until_page = int(until_page)

    @staticmethod
    def _sql_arg(value):
        """Return *value* unchanged if it is an int, else its ``str`` form."""
        return value if isinstance(value, int) else str(value)

    def get_keyword_parameters(self, keyword_id):
        """Fetch the ``keyword`` row with id *keyword_id* into ``params``.

        Returns the row (whatever ``fetchone`` gives). Any database error is
        printed and the process exits with status 1.
        """
        try:
            with self.conn.cursor() as cursor:
                # Parameterised so keyword_id cannot alter the statement.
                cursor.execute("select * from keyword where id = %s",
                               (self._sql_arg(keyword_id),))
                self.params = cursor.fetchone()
        except Exception as e:
            print(e)
            sys.stdout.flush()
            sys.exit(1)
        return self.params

    def get_naver_cafe_list(self):
        """Return a dict mapping cafe url -> clubid.

        When ``authorship()`` is None, 0 or empty, all rows are returned;
        otherwise only rows whose ``group_num`` equals it. Results accumulate
        in ``urls`` across calls. Any database error is printed and the
        process exits with status 1.
        """
        query = "select url, clubid from navercafelist"
        args = None
        group = self.authorship()
        no_group = (group is None or group == 0 or
                    (hasattr(group, "__len__") and len(group) == 0))
        if not no_group:
            query += " where group_num = %s"
            args = (self._sql_arg(group),)
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query, args)
                for row in cursor.fetchall():
                    self.urls[row["url"]] = row["clubid"]
        except Exception as e:
            print(e)
            sys.stdout.flush()
            sys.exit(1)
        return self.urls

    def start_day(self):
        """Return ``params["start"]``."""
        return self.params["start"]

    def end_day(self):
        """Return ``params["end"]``."""
        return self.params["end"]

    def keyword_id(self):
        """Return ``params["id"]``."""
        return self.params["id"]

    def realtime(self):
        """Return ``params["realtime"]``."""
        return self.params["realtime"]

    def searches(self):
        """Return ``params["searches"]``."""
        return self.params["searches"]

    def authorship(self):
        """Return ``params["authorship"]``."""
        return self.params["authorship"]

    def platform(self):
        """Return ``params["platform"]``."""
        return self.params["platform"]

    def is_realtime(self):
        """Return False only when ``realtime()`` stringifies to ``'0'``."""
        return str(self.realtime()) != '0'

    def euc_kr(self, keyword):
        """Percent-encode *keyword* as EUC-KR, writing spaces as ``+``.

        Every other byte is written as ``%XX`` with two uppercase hex digits.
        """
        return "".join(
            "+" if b == 0x20 else "%{:02X}".format(b)
            for b in keyword.encode("euc_kr")
        )

    def utf8(self, keyword):
        """Percent-encode every UTF-8 byte of *keyword* as ``%XX``."""
        return "".join("%{:02X}".format(b) for b in keyword.encode("utf-8"))

    def disconnect(self):
        """Close the database connection."""
        self.conn.close()

    def date_to_str(self, arg_date):
        """Format *arg_date* as ``YYYY-MM-DD``."""
        return arg_date.strftime("%Y-%m-%d")