From 4fa93a7cc47c4934a464e9600b8c264451f16475 Mon Sep 17 00:00:00 2001 From: admin Date: Wed, 29 Mar 2017 03:19:06 +0000 Subject: [PATCH] git-svn-id: svn://192.168.0.12/source@345 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- WebBasedCrawler/browser.txt | 4 ++-- WebBasedCrawler/insta/instacrawl.py | 11 +++++++---- WebBasedCrawler/insta/instaheaders.py | 1 - WebBasedCrawler/insta/instaparser.py | 10 +++++++--- WebBasedCrawler/naver/navercrawl.py | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/WebBasedCrawler/browser.txt b/WebBasedCrawler/browser.txt index 25b2489..55708bc 100644 --- a/WebBasedCrawler/browser.txt +++ b/WebBasedCrawler/browser.txt @@ -22,6 +22,6 @@ default=chrome kakaostory=chrome -#instagram=firefox -navercafe=firefox +instagram=chrome +navercafe=chrome #facebook=chrome diff --git a/WebBasedCrawler/insta/instacrawl.py b/WebBasedCrawler/insta/instacrawl.py index b6ba641..d3deb44 100644 --- a/WebBasedCrawler/insta/instacrawl.py +++ b/WebBasedCrawler/insta/instacrawl.py @@ -86,6 +86,7 @@ def requests_get(req, timeout=requests_timeout): if time.time() > (start + timeout): req.close() raise Exception("timeout") + return b''.join(body) @@ -313,6 +314,7 @@ def crawl_content_process(qu, keyword_id, db_num): break ok = True while ok: + time.sleep(2) try: # get a instance of InstaContent by do_no_proxy func. 
# if element['url'] is invalid, content is None @@ -339,6 +341,7 @@ def crawl_content_process(qu, keyword_id, db_num): send_to_db.send_body(body) if replies: send_to_db.send_reply(replies) + printl("proxies = ", content.proxies['http'][7:]) printl(element['url']) printl('ok') ok = False @@ -411,15 +414,15 @@ class ListTag: self.load_url(url, self.proxies) def load_url(self, url, proxies): - self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, - timeout=requests_timeout, stream=True) + self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True) content = requests_get(self.__r) + self.log_load_url_before() self.__r.raise_for_status() self.__tag = self.__get_tag(url) self.__set_cookies(self.__r.cookies) self.__url = url - # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content) + #self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content) self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content) self.__r.close() self.log_load_url_after() @@ -1033,7 +1036,7 @@ class InstaMainCrawler: def __init__(self): self.send_to_db = SendtoDB() self.crawl_init = InstaInit() - # self.browser = Browser() + #self.browser = Browser() self.browser = None self.driver = None diff --git a/WebBasedCrawler/insta/instaheaders.py b/WebBasedCrawler/insta/instaheaders.py index 456f72a..b678dcc 100644 --- a/WebBasedCrawler/insta/instaheaders.py +++ b/WebBasedCrawler/insta/instaheaders.py @@ -6,7 +6,6 @@ def get_headers_for_list_html(): " Chrome/50.0.2661.102 Safari/537.36" } - def get_headers_for_body_html(cookies): if cookies: request_headers = { diff --git a/WebBasedCrawler/insta/instaparser.py b/WebBasedCrawler/insta/instaparser.py index 4d3ea3e..3bbe120 100644 --- a/WebBasedCrawler/insta/instaparser.py +++ b/WebBasedCrawler/insta/instaparser.py @@ -4,6 +4,7 @@ 
import requests import datetime rx_json_html = re.compile('window\._sharedData\s*=\s*(.*);\s*') +#rx_json_html = re.compile('window\._sharedData\s*=\s*(.*);') old_date = datetime.datetime(1970, 1, 1, 9) @@ -17,7 +18,10 @@ def get_json_from_html(content): else: raise TypeError m = rx_json_html.search(s) + if m: + #return json.dumps(json.loads(m.group(1))) + #return json.loads(json.dumps(m.group(1))) return json.loads(m.group(1)) else: raise TypeError("Check requests.response") @@ -50,14 +54,14 @@ def parse_list_user_html(content): def parse_list_tag_html(content): json_data = get_json_from_html(content) - tagpage = json_data['entry_data']['TagPage'] + tagpage = json_data["entry_data"]["TagPage"] has_next = False end_cursor = None body_list = [] if tagpage: - print('start_cursor = ', end='', flush=True) - print(tagpage[0]["tag"]["media"]["page_info"]["start_cursor"], flush=True) + #print('start_cursor = ', end='', flush=True) + #print(tagpage[0]["tag"]["media"]["page_info"]["start_cursor"], flush=True) #start_cursor doesn't exsist end_cursor = tagpage[0]["tag"]["media"]["page_info"]["end_cursor"] has_next = tagpage[0]["tag"]["media"]["page_info"]["has_next_page"] nodes = tagpage[0]["tag"]["media"]["nodes"] diff --git a/WebBasedCrawler/naver/navercrawl.py b/WebBasedCrawler/naver/navercrawl.py index 2c25a59..45ffbaf 100644 --- a/WebBasedCrawler/naver/navercrawl.py +++ b/WebBasedCrawler/naver/navercrawl.py @@ -1 +1 @@ -#-*- coding: utf-8 -*- __author__ = 'cococo' from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.action_chains import ActionChains import sys import datetime import re from base.baseclasses import wait from base.baseclasses import print_and_flush from base.baseclasses import Browser from base.baseclasses import SendtoDB from base.baseclasses import enter_element class Asistance: def __init__(self): self.re_clubid = re.compile("search\\.clubid=([\\d]+)") self.re_date = 
re.compile("search\\.searchdate=([\\d]{4}-[\\d]{2}-[\\d]{2})([\\d]{4}-[\\d]{2}-[\\d]{2})") def clubid(self, url): m = self.re_clubid.search(url) if m is None: return str() else: return m.group(1) def date(self, url): m = self.re_date.search(url) if m is None: return str("Start: ALL, End: ALL") else: return str("Start: " + m.group(1) + ", End: " + m.group(2)) class NaverCafeCrawler: #driver = webdriver.PhantomJS('C:\\Users\\cococo\\AppData\\Roaming\\npm\\node_modules\\phantomjs\\lib\\phantom\\phantomjs.exe') def __init__(self): self.driver = None # webdriver.Ie('C:\\Users\\cococo\\Downloads\\IEDriverServer_x64_2.48.0\\IEDriverServer.exe') # self.driver = webdriver.Chrome('C:\\Users\\cococo\\Downloads\\chromedriver_win32\\chromedriver.exe') # firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX # firefox_capabilities['marionette'] = True # firefox_capabilities['binary'] = ('C:\\Users\\cococo\\Downloads\\wires-0.4.2-win\\wires-0.4.2-win.exe') # webdriver.Chrome() # self.driver = webdriver.Firefox() # self.driver.set_window_size(1600, 900) # self.main_area_crawler = NaverCafeMainAreaCrawler() def set_driver(self, driver): self.driver = driver def suff(self, url): self.driver.get(url) wait(2) def screenshot(self,filename): self.driver.save_screenshot(filename) def html(self): return self.driver.page_source def savepage(self, filename): with open(filename,'w',encoding='UTF8') as f: f.write(self.html()) def naver_login(self, id, password): self.suff('http://www.naver.com') wait(2) element = self.driver.find_element_by_id('id') element.send_keys(id) #element = driver.find_element_by_id('label_pw') element = self.driver.find_element_by_id('pw') element.send_keys(password) element.send_keys(Keys.ENTER) wait(3) #element = self.driver.find_element_by_class_name('btn_login') #self.click_element(element) def cafe_search(self, keyword): element = self.driver.find_element_by_id('topLayerQueryInput') element.send_keys(keyword) wait(1) element.send_keys(Keys.ENTER) 
#element.send_keys(Keys.RETURN) wait(2) def get_url(self): return self.driver.current_url() def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) def start(self): self.main_area_crawler.set_driver(self.driver) self.main_area_crawler.crawl_all_cafe_main() def close(self): self.driver.close() def quit(self): self.driver.quit() class NaverCafeBoardCrawler: def __init__(self, driver=None): self.driver = driver self.content_num_set = set() import re self.re_page = re.compile("search\\.page=([\\d]+)") def clear_content_num_set(self): self.content_num_set.clear() def current_url(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return self.driver.current_url def current_page_num_by_url(self): url = self.current_url() m = self.re_page.search(url) if m is None: return self.current_page_num_by_tag() else: return m.group(1) def current_page_num_by_tag(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return str(1) if page_navigate is None: return str(1) tds = page_navigate.find_elements_by_tag_name('td') for td in tds: try: page_on = td.get_attribute('class') if page_on == 'on': return td.text except: continue return str(1) def move_next_content(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') trs = self.driver.find_elements_by_css_selector("tr[align='center']") for tr in trs: try: content_num = tr.find_element_by_css_selector("span[class='m-tcol-c list-count']") if len(str(content_num.text).strip()) < 1: continue if content_num.text in self.content_num_set: continue self.content_num_set.add(content_num.text) sub = tr.find_element_by_css_selector("a[class='m-tcol-c']") enter_element(sub) 
return True except: pass return False def move_next_page(self): page_navigate = None try: self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') page_navigate = self.driver.find_element_by_css_selector("div[class='prev-next']") except: return False if page_navigate is None: return False tds = page_navigate.find_elements_by_tag_name('td') is_next = False for td in tds: if is_next: a = td.find_element_by_tag_name("a") enter_element(a) #self.enter_element(td) return True try: page_on = td.get_attribute('class') if page_on == 'on': is_next = True continue except: continue return False def set_driver(self, driver): self.driver = driver def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) class NaverCafeBodyCrawler: def __init__(self, driver=None): self.driver = driver self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_init(self): self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def find_article_title(self): self.find_init() article_title = self.driver.find_element_by_css_selector("span[class='b m-tcol-c']") return article_title.text def find_article_date(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c date']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. 
', ' ').replace('.', '-') + ":00" return article_date def find_article_data(self): self.find_init() article_data = self.driver.find_element_by_css_selector("div[class='tbody m-tcol-c']") return article_data.text def find_article_nickname(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 4: return onclick_attr_list[3].strip().replace("'", "") else: return str() def find_article_id(self): self.find_init() element = self.driver.find_element_by_css_selector("td[class='m-tcol-c b nick']") nick_element = element.find_element_by_css_selector("a[class='m-tcol-c b']") onclick = nick_element.get_attribute('onclick') onclick_attr_list = onclick.split(',') if len(onclick_attr_list) > 2: return onclick_attr_list[1].strip().replace("'", "") else: return str() def find_article_hit(self): self.find_init() element = self.driver.find_element_by_css_selector("span[class='b m-tcol-c reply _rosReadcount']") return element.text def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'body' def find_platform_title(self): self.driver.switch_to_default_content() element = self.driver.find_element_by_css_selector("span[class='m-tcol-p']") return element.text def find_article_url(self): self.find_init() element = self.driver.find_element_by_css_selector("a[id='linkUrl']") return element.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def print(self): print("article_id = " + self.find_article_id()) print("article_nickname = " + self.find_article_nickname()) print("article_title = " + self.find_article_title()) print("article_date = " + self.find_article_date()) 
print("article_hit = " + self.find_article_hit()) print("article_url = " + self.find_article_url()) print("platform_title = " + self.find_platform_title()) print("article_data = " + self.find_article_data()) def get_content(self): content = dict() content["article_id"] = self.find_article_id() content["article_nickname"] = self.find_article_nickname() content["article_title"] = self.find_article_title() content["article_date"] = self.find_article_date() content["article_hit"] = self.find_article_hit() content["article_url"] = self.find_article_url() content["article_data"] = self.find_article_data() content["article_form"] = self.find_article_form() content["platform_title"] = self.find_platform_title() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["platform_id"] = self.find_platform_id() return content class NaverCafeReplyCrawler: def __init__(self, driver=None): self.driver = driver self.article_parent = str() self.reply_list = list() self.init_re() def init_re(self): self.re_platform_id = re.compile('http://cafe.naver.com/([0-9A-Za-z_-]+)/.+') def set_driver(self, driver): self.driver = driver def find_comments_element(self): self.find_init() try: self.reply_elements = self.driver.find_element_by_css_selector("ul[class='cmlist']") if self.reply_elements is None: return False return True except: return False def find_init(self): self.count = 0 self.reply_list.clear() self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') def set_article_url(self, article_url): self.article_url = article_url def crawl_all(self): has_next_comment_page = True while has_next_comment_page: self.crawl_current_page_reply() has_next_comment_page = self.move_next_comment_page() def move_next_comment_page(self): element = None try: element = self.driver.find_element_by_css_selector("div[id='cmt_paginate']") children = element.find_elements_by_css_selector("*") flag = False for child in children: 
if flag is True and child.tag_name == "a": enter_element(child) wait(1) self.driver.switch_to_default_content() self.driver.switch_to_frame('cafe_main') return True if child.tag_name == "strong": flag = True except Exception as e: print(e) sys.stdout.flush() return False if element is None: return False return False def crawl_current_page_reply(self): lis = self.reply_elements.find_elements_by_tag_name('li') for li in lis: if li.get_attribute('class') == 'reply': self.crawl_reply_reply(li) elif len(li.get_attribute('class')) < 1: self.crawl_reply(li) else: pass def find_article_url(self, li=None): return self.article_url def find_article_date(self, li): element = li.find_element_by_css_selector("span[class='date m-tcol-c filter-50']") article_date = str(element.text) article_date.strip() article_date = article_date.replace('. ', ' ').replace('.', '-') + ":00" return article_date def find_article_data(self, li): element = li.find_element_by_css_selector("span[class='comm_body']") article_data = element.text return article_data def find_article_parent(self, li): try: element = li.find_element_by_css_selector("a[class='m-tcol-c filter-50 nick']") article_parent = element.text return article_parent except: return self.article_parent def find_article_id(self, li): element = li.find_element_by_css_selector("input[name='writerid']") article_id = element.get_attribute('value') return article_id def find_article_nickname(self, li): article_nickname = li.find_element_by_css_selector("td[class='p-nick']") return article_nickname.text def find_platform_id(self): article_url = str(self.find_article_url()) m = self.re_platform_id.search(article_url) try: return m.group(1) except: return str() def crawl_reply(self, li): article_nickname = self.find_article_nickname(li) self.article_parent = article_nickname article_order = self.count self.count += 1 content = dict() content["article_id"] = self.find_article_id(li) content["article_nickname"] = self.find_article_nickname(li) 
content["article_date"] = self.find_article_date(li) content["article_data"] = self.find_article_data(li) content["article_order"] = article_order content["article_form"] = self.find_article_form() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["article_url"] = self.find_article_url() content["platform_id"] = self.find_platform_id() self.reply_list.append(content) def crawl_reply_reply(self, li): article_parent = self.find_article_parent(li) article_order = self.count self.count += 1 content = dict() content["article_id"] = self.find_article_id(li) content["article_nickname"] = self.find_article_nickname(li) content["article_date"] = self.find_article_date(li) content["article_data"] = self.find_article_data(li) content["article_order"] = article_order content["article_parent"] = article_parent content["article_form"] = self.find_article_form() content["platform_name"] = self.find_platform_name() content["platform_form"] = self.find_platform_form() content["article_url"] = self.find_article_url() content["platform_id"] = self.find_platform_id() self.reply_list.append(content) def find_platform_name(self): return 'naver' def find_platform_form(self): return 'cafe' def find_article_form(self): return 'reply' def get_content(self): return self.reply_list def click_element(self, element): ac = ActionChains(self.driver) #ac.move_to_element(element).click().perform() #element.send_keys(Keys.NULL) ac.move_to_element_with_offset(element, 0, 0).click().perform() wait(2) # class NaverCafeInit: # pymysql = __import__('pymysql.cursors') # url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=" # url_second = "&search.searchdate=" # url_third = "&search.searchBy=0&search.query=" # url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0" # # def __init__(self, before_day=0): # self.conn = self.pymysql.connect(host 
='bigbird.iptime.org', # user='admin', passwd='admin123', # db='concepters', charset='utf8', # cursorclass=self.pymysql.cursors.DictCursor) # self.urls = dict() # self.before_day = before_day # # def set_before_day(self, before_day): # if type(before_day) == str: # self.before_day = int(before_day) # elif type(before_day) == int: # self.before_day = before_day # # def set_until_page(self, until_page): # if type(until_page) == str: # self.before_day = int(until_page) # elif type(until_page) == int: # self.before_day = until_page # # def split_searches(self): # search = self.searches() # splited_list = search.split(',') # trimmed_list = list() # for x in splited_list: # trimmed_list.append(self.euc_kr(x.strip())) # return trimmed_list # # def get_keyword_parameters(self, keyword_id): # query = "select * from keyword where id = " + str(keyword_id) # try: # with self.conn.cursor() as cursor: # cursor.execute(query) # self.params = cursor.fetchone() # return self.params # except Exception as e: # print(e) # sys.stdout.flush() # exit(1) # return dict() # # def get_naver_cafe_list(self): # query = "select url, clubid from navercafelist" # if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: # pass # else: # query += (" where group_num = " + str(self.authorship())) # try: # with self.conn.cursor() as cursor: # cursor.execute(query) # list_result = cursor.fetchall() # for i in list_result: # self.urls[i["url"]] = i["clubid"] # except Exception as e: # print(e) # sys.stdout.flush() # exit(1) # return self.urls # # def start_day(self): # return self.params["start"] # # def end_day(self): # return self.params["end"] # # def keyword_id(self): # return self.params["id"] # # def realtime(self): # return self.params["realtime"] # # def searches(self): # return self.params["searches"] # # def authorship(self): # return self.params["authorship"] # # def platform(self): # return self.params["platform"] # # def is_realtime(self): # if 
str(self.realtime()) == '0': # return False # else: # return True # # def euc_kr(self, keyword): # byte_code = list(keyword.encode("euc_kr")) # encoded_keyword = "" # for i in byte_code: # if i == 0x20: # encoded_keyword += "+" # else: # encoded_keyword += str(hex(i)).replace("0x", "%").upper() # return encoded_keyword # # def url_all_days(self): # url_list = list() # for key, val in self.urls.items(): # if self.is_realtime(): # today = datetime.date.today() # url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val) # else: # url = self.make_url(self.start_day(), self.end_day(), val) # for i in url: # url_list.append(i) # return url_list # # def url_day_by_day(self): # one_day = datetime.timedelta(days=1) # url_list = list() # for key, val in self.urls.items(): # if self.is_realtime(): # end = datetime.date.today() # start = end + datetime.timedelta(days=self.before_day) # else: # start = self.start_day() # end = self.end_day() # while start <= end: # url = self.make_url(start, start, val) # for i in url: # url_list.append(i) # start += one_day # return url_list # # def make_url(self, start_day, end_day, clubid): # urls = list() # for x in self.split_searches(): # url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth # urls.append(url) # return urls # # def disconnect(self): # self.conn.close() # # def date_to_str(self, arg_date): # return arg_date.strftime("%Y-%m-%d") class CrawlInit: pymysql = __import__('pymysql.cursors') def __init__(self, before_day=0): self.conn = self.pymysql.connect(host ='bigbird.iptime.org', user='admin', passwd='admin123', db='concepters', charset='utf8', cursorclass=self.pymysql.cursors.DictCursor) self.urls = dict() self.before_day = before_day def set_before_day(self, before_day): if type(before_day) == str: self.before_day = int(before_day) elif type(before_day) == int: self.before_day = before_day def set_until_page(self, until_page): if 
type(until_page) == str: self.until_page = int(until_page) elif type(until_page) == int: self.until_page = until_page def get_keyword_parameters(self, keyword_id): query = "select * from keyword where id = " + str(keyword_id) try: with self.conn.cursor() as cursor: cursor.execute(query) self.params = cursor.fetchone() return self.params except Exception as e: print(e) sys.stdout.flush() exit(1) return dict() def get_naver_cafe_list(self): query = "select url, clubid from navercafelist" if self.authorship() is None or len(self.authorship()) == 0 or self.authorship == 0: pass else: query += (" where group_num = " + str(self.authorship())) try: with self.conn.cursor() as cursor: cursor.execute(query) list_result = cursor.fetchall() for i in list_result: self.urls[i["url"]] = i["clubid"] except Exception as e: print(e) sys.stdout.flush() exit(1) return self.urls def start_day(self): return self.params["start"] def end_day(self): return self.params["end"] def keyword_id(self): return self.params["id"] def realtime(self): return self.params["realtime"] def searches(self): return self.params["searches"] def authorship(self): return self.params["authorship"] def platform(self): return self.params["platform"] def is_realtime(self): if str(self.realtime()) == '0': return False else: return True def euc_kr(self, keyword): byte_code = list(keyword.encode("euc_kr")) encoded_keyword = "" for i in byte_code: if i == 0x20: encoded_keyword += "+" else: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def utf8(self, keyword): byte_code = list(keyword.encode("utf-8")) encoded_keyword = "" for i in byte_code: encoded_keyword += str(hex(i)).replace("0x", "%").upper() return encoded_keyword def disconnect(self): self.conn.close() def date_to_str(self, arg_date): return arg_date.strftime("%Y-%m-%d") class NaverCafeInit(CrawlInit): url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=" url_second = "&search.searchdate=" url_third = 
"&search.searchBy=0&search.query=" url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0" def __init__(self, before_day=0): super().__init__(before_day) def url_all_days(self): url_list = list() for key, val in self.urls.items(): if self.is_realtime(): today = datetime.date.today() url = self.make_url(today + datetime.timedelta(days=self.before_day), today, val) else: url = self.make_url(self.start_day(), self.end_day(), val) for i in url: url_list.append(i) return url_list def url_day_by_day(self): one_day = datetime.timedelta(days=1) url_list = list() for key, val in self.urls.items(): if self.is_realtime(): end = datetime.date.today() start = end + datetime.timedelta(days=self.before_day) else: start = self.start_day() end = self.end_day() while start <= end: url = self.make_url(start, start, val) for i in url: url_list.append(i) start += one_day return url_list def split_searches(self): search = self.searches() splited_list = search.split(',') trimmed_list = list() for x in splited_list: trimmed_list.append(self.euc_kr(x.strip())) return trimmed_list def make_url(self, start_day, end_day, clubid): urls = list() for x in self.split_searches(): url = self.url_first + str(clubid) + self.url_second + str(start_day) + str(end_day) + self.url_third + x + self.url_forth urls.append(url) return urls class NaverCafeMainAreaCrawler: def __init__(self): self.board_crawler = NaverCafeBoardCrawler() self.body_crawler = NaverCafeBodyCrawler() self.reply_crawler = NaverCafeReplyCrawler() self.send_to_db = SendtoDB() self.crawl_init = NaverCafeInit() self.browser = Browser() self.naver_cafe = NaverCafeCrawler() def print(self, arg): print(arg) sys.stdout.flush() def set_driver(self, driver): self.board_crawler.set_driver(driver) self.body_crawler.set_driver(driver) self.reply_crawler.set_driver(driver) self.naver_cafe.set_driver(driver) self.driver = driver def copy_list(self, backup_set): for i in 
backup_set: self.board_crawler.content_num_set.add(i) def crawl_all_cafe_main(self, backup_list=None): self.board_crawler.clear_content_num_set() if backup_list: self.copy_list(backup_list) has_next_table = True while has_next_table: self.print("Page number : " + str(self.board_crawler.current_page_num_by_tag())) # if (int(self.board_crawler.current_page_num_by_url()) % 5) == 1: self.release_memory() while self.board_crawler.move_next_content(): try: self.crawl_body() self.crawl_reply() self.print("ok") except Exception as e: self.print("fail") self.print(e) self.driver.back() wait(1) has_next_table = self.board_crawler.move_next_page() def crawl_body(self): self.body_crawler.set_driver(self.driver) content = self.body_crawler.get_content() content['keyword_id'] = self.keyword_id self.send_to_db.delete_url(content['article_url']) self.send_to_db.send_body(content) self.print(content['article_url']) def crawl_reply(self): self.reply_crawler.set_driver(self.driver) if self.reply_crawler.find_comments_element(): self.reply_crawler.set_article_url(self.body_crawler.find_article_url()) self.reply_crawler.crawl_all() self.send_to_db.send_reply(self.reply_crawler.get_content()) def set_keyword_id(self, keyword_id): self.keyword_id = keyword_id def release_memory_firefox(self): index = self.driver.current_url.find("%26search.page=") if index == -1: temp_url = self.driver.current_url else: temp_url = self.driver.current_url[:index] temp_page = self.board_crawler.current_page_num_by_tag() if temp_page.strip() == "1": url = temp_url else: url = temp_url + "%26search.page=" + temp_page.strip() self.print("Release Memory Process") self.driver.get("about:memory") wait(2) self.driver.execute_script("doMMU()") wait(2) self.driver.execute_script("doGC()") wait(2) self.driver.execute_script("doCC()") wait(2) self.driver.get(url) wait(2) print_and_flush("reloaded") def release_memory_others(self): temp_url = self.driver.current_url self.print("Release Memory Process") 
# -*- coding: utf-8 -*-
"""Naver Cafe crawler.

Logs into Naver with Selenium, walks cafe search-result pages for configured
keywords, and stores article bodies and comment threads via SendtoDB.

Reconstructed from a corrupted patch that contained two fused copies of this
file; this is the newer copy, de-duplicated and reformatted.
"""
__author__ = 'cococo'

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import sys
import datetime
import re

from base.baseclasses import wait
from base.baseclasses import print_and_flush
from base.baseclasses import Browser
from base.baseclasses import SendtoDB
from base.baseclasses import enter_element


class Asistance:
    """Helpers that extract the club id and search-date range from a search URL."""

    def __init__(self):
        self.re_clubid = re.compile(r"search\.clubid=([\d]+)")
        # The two dates are concatenated with no separator in the URL
        # (e.g. "...searchdate=2015-07-302015-10-30"), hence back-to-back groups.
        self.re_date = re.compile(
            r"search\.searchdate=([\d]{4}-[\d]{2}-[\d]{2})([\d]{4}-[\d]{2}-[\d]{2})")

    def clubid(self, url):
        """Return the clubid found in *url*, or '' when absent."""
        m = self.re_clubid.search(url)
        if m is None:
            return str()
        else:
            return m.group(1)

    def date(self, url):
        """Return a human-readable 'Start: ..., End: ...' string for *url*."""
        m = self.re_date.search(url)
        if m is None:
            return str("Start: ALL, End: ALL")
        else:
            return str("Start: " + m.group(1) + ", End: " + m.group(2))


class NaverCafeCrawler:
    """Thin wrapper around a Selenium driver: login, navigation, page dumps."""

    def __init__(self):
        self.driver = None

    def set_driver(self, driver):
        self.driver = driver

    def suff(self, url):
        """Load *url* and give the page time to render."""
        self.driver.get(url)
        wait(2)

    def screenshot(self, filename):
        self.driver.save_screenshot(filename)

    def html(self):
        return self.driver.page_source

    def savepage(self, filename):
        with open(filename, 'w', encoding='UTF8') as f:
            f.write(self.html())

    def naver_login(self, id, password):
        """Fill the naver.com login form and submit with ENTER."""
        self.suff('http://www.naver.com')
        wait(2)
        element = self.driver.find_element_by_id('id')
        element.send_keys(id)
        element = self.driver.find_element_by_id('pw')
        element.send_keys(password)
        element.send_keys(Keys.ENTER)
        wait(3)

    def cafe_search(self, keyword):
        """Type *keyword* into the cafe search box and submit."""
        element = self.driver.find_element_by_id('topLayerQueryInput')
        element.send_keys(keyword)
        wait(1)
        element.send_keys(Keys.ENTER)
        wait(2)

    def get_url(self):
        # BUG FIX: current_url is a property, not a method; the original
        # `self.driver.current_url()` raised TypeError when called.
        return self.driver.current_url

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

    def start(self):
        # NOTE(review): self.main_area_crawler is never assigned in __init__
        # (the assignment was commented out upstream); calling start() as-is
        # raises AttributeError. Kept for interface compatibility.
        self.main_area_crawler.set_driver(self.driver)
        self.main_area_crawler.crawl_all_cafe_main()

    def close(self):
        self.driver.close()

    def quit(self):
        self.driver.quit()


class NaverCafeBoardCrawler:
    """Walks the article-list frame ('cafe_main'): pagination and row selection."""

    def __init__(self, driver=None):
        self.driver = driver
        # Article numbers already visited (dedup across retries/restarts).
        self.content_num_set = set()
        self.re_page = re.compile(r"search\.page=([\d]+)")

    def clear_content_num_set(self):
        self.content_num_set.clear()

    def current_url(self):
        self.driver.switch_to_default_content()
        self.driver.switch_to_frame('cafe_main')
        return self.driver.current_url

    def current_page_num_by_url(self):
        """Page number from the URL; falls back to the pagination widget."""
        url = self.current_url()
        m = self.re_page.search(url)
        if m is None:
            return self.current_page_num_by_tag()
        else:
            return m.group(1)

    def current_page_num_by_tag(self):
        """Page number from the 'prev-next' widget; '1' when it is absent."""
        page_navigate = None
        try:
            self.driver.switch_to_default_content()
            self.driver.switch_to_frame('cafe_main')
            page_navigate = self.driver.find_element_by_css_selector(
                "div[class='prev-next']")
        except Exception:
            return str(1)
        if page_navigate is None:
            return str(1)
        tds = page_navigate.find_elements_by_tag_name('td')
        for td in tds:
            try:
                page_on = td.get_attribute('class')
                if page_on == 'on':
                    return td.text
            except Exception:
                continue
        return str(1)

    def move_next_content(self):
        """Open the next unvisited article row; True when one was opened."""
        self.driver.switch_to_default_content()
        self.driver.switch_to_frame('cafe_main')
        trs = self.driver.find_elements_by_css_selector("tr[align='center']")
        for tr in trs:
            try:
                content_num = tr.find_element_by_css_selector(
                    "span[class='m-tcol-c list-count']")
                if len(str(content_num.text).strip()) < 1:
                    continue
                if content_num.text in self.content_num_set:
                    continue
                self.content_num_set.add(content_num.text)
                sub = tr.find_element_by_css_selector("a[class='m-tcol-c']")
                enter_element(sub)
                return True
            except Exception:
                pass
        return False

    def move_next_page(self):
        """Click the page link after the 'on' cell; True when navigation happened."""
        page_navigate = None
        try:
            self.driver.switch_to_default_content()
            self.driver.switch_to_frame('cafe_main')
            page_navigate = self.driver.find_element_by_css_selector(
                "div[class='prev-next']")
        except Exception:
            return False
        if page_navigate is None:
            return False
        tds = page_navigate.find_elements_by_tag_name('td')
        is_next = False
        for td in tds:
            if is_next:
                a = td.find_element_by_tag_name("a")
                enter_element(a)
                return True
            try:
                page_on = td.get_attribute('class')
                if page_on == 'on':
                    is_next = True
                    continue
            except Exception:
                continue
        return False

    def set_driver(self, driver):
        self.driver = driver

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)


class NaverCafeBodyCrawler:
    """Extracts the article body fields from the open article page."""

    def __init__(self, driver=None):
        self.driver = driver
        self.init_re()

    def init_re(self):
        self.re_platform_id = re.compile(
            'http://cafe.naver.com/([0-9A-Za-z_-]+)/.+')

    def set_driver(self, driver):
        self.driver = driver

    def find_init(self):
        # Every lookup must re-enter the 'cafe_main' iframe first.
        self.driver.switch_to_default_content()
        self.driver.switch_to_frame('cafe_main')

    def find_article_title(self):
        self.find_init()
        article_title = self.driver.find_element_by_css_selector(
            "span[class='b m-tcol-c']")
        return article_title.text

    def find_article_date(self):
        self.find_init()
        element = self.driver.find_element_by_css_selector(
            "td[class='m-tcol-c date']")
        # BUG FIX: the original called strip() and discarded the result.
        article_date = str(element.text).strip()
        # Normalize "YYYY. MM. DD. HH:MM" style into "YYYY-MM-DD HH:MM:00".
        article_date = article_date.replace('. ', ' ').replace('.', '-') + ":00"
        return article_date

    def find_article_data(self):
        self.find_init()
        article_data = self.driver.find_element_by_css_selector(
            "div[class='tbody m-tcol-c']")
        return article_data.text

    def find_article_nickname(self):
        """Nickname is the 4th argument of the nick link's onclick handler."""
        self.find_init()
        element = self.driver.find_element_by_css_selector(
            "td[class='m-tcol-c b nick']")
        nick_element = element.find_element_by_css_selector(
            "a[class='m-tcol-c b']")
        onclick = nick_element.get_attribute('onclick')
        onclick_attr_list = onclick.split(',')
        if len(onclick_attr_list) > 4:
            return onclick_attr_list[3].strip().replace("'", "")
        else:
            return str()

    def find_article_id(self):
        """Writer id is the 2nd argument of the nick link's onclick handler."""
        self.find_init()
        element = self.driver.find_element_by_css_selector(
            "td[class='m-tcol-c b nick']")
        nick_element = element.find_element_by_css_selector(
            "a[class='m-tcol-c b']")
        onclick = nick_element.get_attribute('onclick')
        onclick_attr_list = onclick.split(',')
        if len(onclick_attr_list) > 2:
            return onclick_attr_list[1].strip().replace("'", "")
        else:
            return str()

    def find_article_hit(self):
        self.find_init()
        element = self.driver.find_element_by_css_selector(
            "span[class='b m-tcol-c reply _rosReadcount']")
        return element.text

    def find_platform_name(self):
        return 'naver'

    def find_platform_form(self):
        return 'cafe'

    def find_article_form(self):
        return 'body'

    def find_platform_title(self):
        # Cafe title lives in the top-level document, not the iframe.
        self.driver.switch_to_default_content()
        element = self.driver.find_element_by_css_selector(
            "span[class='m-tcol-p']")
        return element.text

    def find_article_url(self):
        self.find_init()
        element = self.driver.find_element_by_css_selector("a[id='linkUrl']")
        return element.text

    def find_platform_id(self):
        article_url = str(self.find_article_url())
        m = self.re_platform_id.search(article_url)
        try:
            return m.group(1)
        except Exception:
            return str()

    def print(self):
        print("article_id = " + self.find_article_id())
        print("article_nickname = " + self.find_article_nickname())
        print("article_title = " + self.find_article_title())
        print("article_date = " + self.find_article_date())
        print("article_hit = " + self.find_article_hit())
        print("article_url = " + self.find_article_url())
        print("platform_title = " + self.find_platform_title())
        print("article_data = " + self.find_article_data())

    def get_content(self):
        """Collect every field of the open article into a flat dict."""
        content = dict()
        content["article_id"] = self.find_article_id()
        content["article_nickname"] = self.find_article_nickname()
        content["article_title"] = self.find_article_title()
        content["article_date"] = self.find_article_date()
        content["article_hit"] = self.find_article_hit()
        content["article_url"] = self.find_article_url()
        content["article_data"] = self.find_article_data()
        content["article_form"] = self.find_article_form()
        content["platform_title"] = self.find_platform_title()
        content["platform_name"] = self.find_platform_name()
        content["platform_form"] = self.find_platform_form()
        content["platform_id"] = self.find_platform_id()
        return content


class NaverCafeReplyCrawler:
    """Extracts the comment thread (including nested replies) of an article."""

    def __init__(self, driver=None):
        self.driver = driver
        # Nickname of the comment a nested reply belongs to.
        self.article_parent = str()
        self.reply_list = list()
        self.init_re()

    def init_re(self):
        self.re_platform_id = re.compile(
            'http://cafe.naver.com/([0-9A-Za-z_-]+)/.+')

    def set_driver(self, driver):
        self.driver = driver

    def find_comments_element(self):
        """Locate the comment list; False when the article has no comments."""
        self.find_init()
        try:
            self.reply_elements = self.driver.find_element_by_css_selector(
                "ul[class='cmlist']")
            if self.reply_elements is None:
                return False
            return True
        except Exception:
            return False

    def find_init(self):
        self.count = 0
        self.reply_list.clear()
        self.driver.switch_to_default_content()
        self.driver.switch_to_frame('cafe_main')

    def set_article_url(self, article_url):
        self.article_url = article_url

    def crawl_all(self):
        """Crawl every comment page of the open article."""
        has_next_comment_page = True
        while has_next_comment_page:
            self.crawl_current_page_reply()
            has_next_comment_page = self.move_next_comment_page()

    def move_next_comment_page(self):
        """Click the pager link after the current ('strong') page marker."""
        try:
            element = self.driver.find_element_by_css_selector(
                "div[id='cmt_paginate']")
            children = element.find_elements_by_css_selector("*")
            flag = False
            for child in children:
                if flag is True and child.tag_name == "a":
                    enter_element(child)
                    wait(1)
                    self.driver.switch_to_default_content()
                    self.driver.switch_to_frame('cafe_main')
                    return True
                if child.tag_name == "strong":
                    flag = True
        except Exception as e:
            print(e)
            sys.stdout.flush()
        return False

    def crawl_current_page_reply(self):
        lis = self.reply_elements.find_elements_by_tag_name('li')
        for li in lis:
            if li.get_attribute('class') == 'reply':
                self.crawl_reply_reply(li)       # nested reply-to-reply
            elif len(li.get_attribute('class')) < 1:
                self.crawl_reply(li)             # top-level comment
            else:
                pass

    def find_article_url(self, li=None):
        return self.article_url

    def find_article_date(self, li):
        element = li.find_element_by_css_selector(
            "span[class='date m-tcol-c filter-50']")
        # BUG FIX: the original called strip() and discarded the result.
        article_date = str(element.text).strip()
        article_date = article_date.replace('. ', ' ').replace('.', '-') + ":00"
        return article_date

    def find_article_data(self, li):
        element = li.find_element_by_css_selector("span[class='comm_body']")
        article_data = element.text
        return article_data

    def find_article_parent(self, li):
        try:
            element = li.find_element_by_css_selector(
                "a[class='m-tcol-c filter-50 nick']")
            article_parent = element.text
            return article_parent
        except Exception:
            # Fall back to the last top-level comment's nickname.
            return self.article_parent

    def find_article_id(self, li):
        element = li.find_element_by_css_selector("input[name='writerid']")
        article_id = element.get_attribute('value')
        return article_id

    def find_article_nickname(self, li):
        article_nickname = li.find_element_by_css_selector(
            "td[class='p-nick']")
        return article_nickname.text

    def find_platform_id(self):
        article_url = str(self.find_article_url())
        m = self.re_platform_id.search(article_url)
        try:
            return m.group(1)
        except Exception:
            return str()

    def crawl_reply(self, li):
        """Record one top-level comment and remember it as current parent."""
        article_nickname = self.find_article_nickname(li)
        self.article_parent = article_nickname
        article_order = self.count
        self.count += 1
        content = dict()
        content["article_id"] = self.find_article_id(li)
        content["article_nickname"] = self.find_article_nickname(li)
        content["article_date"] = self.find_article_date(li)
        content["article_data"] = self.find_article_data(li)
        content["article_order"] = article_order
        content["article_form"] = self.find_article_form()
        content["platform_name"] = self.find_platform_name()
        content["platform_form"] = self.find_platform_form()
        content["article_url"] = self.find_article_url()
        content["platform_id"] = self.find_platform_id()
        self.reply_list.append(content)

    def crawl_reply_reply(self, li):
        """Record one nested reply, attributed to its parent comment."""
        article_parent = self.find_article_parent(li)
        article_order = self.count
        self.count += 1
        content = dict()
        content["article_id"] = self.find_article_id(li)
        content["article_nickname"] = self.find_article_nickname(li)
        content["article_date"] = self.find_article_date(li)
        content["article_data"] = self.find_article_data(li)
        content["article_order"] = article_order
        content["article_parent"] = article_parent
        content["article_form"] = self.find_article_form()
        content["platform_name"] = self.find_platform_name()
        content["platform_form"] = self.find_platform_form()
        content["article_url"] = self.find_article_url()
        content["platform_id"] = self.find_platform_id()
        self.reply_list.append(content)

    def find_platform_name(self):
        return 'naver'

    def find_platform_form(self):
        return 'cafe'

    def find_article_form(self):
        return 'reply'

    def get_content(self):
        return self.reply_list

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)


class CrawlInit:
    """Loads keyword parameters and the cafe list from the MySQL config DB."""

    pymysql = __import__('pymysql.cursors')

    def __init__(self, before_day=0):
        # NOTE(security): hard-coded DB host/credentials — move to config/env.
        self.conn = self.pymysql.connect(host='bigbird.iptime.org',
                                         user='admin', passwd='admin123',
                                         db='concepters', charset='utf8',
                                         cursorclass=self.pymysql.cursors.DictCursor)
        self.urls = dict()
        self.before_day = before_day

    def set_before_day(self, before_day):
        if type(before_day) == str:
            self.before_day = int(before_day)
        elif type(before_day) == int:
            self.before_day = before_day

    def set_until_page(self, until_page):
        if type(until_page) == str:
            self.until_page = int(until_page)
        elif type(until_page) == int:
            self.until_page = until_page

    def get_keyword_parameters(self, keyword_id):
        """Fetch and cache the keyword row; exits the process on DB failure."""
        query = "select * from keyword where id = " + str(keyword_id)
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query)
                self.params = cursor.fetchone()
                return self.params
        except Exception as e:
            print(e)
            sys.stdout.flush()
            exit(1)
        return dict()

    def get_naver_cafe_list(self):
        """Fetch url->clubid pairs, optionally filtered by authorship group."""
        query = "select url, clubid from navercafelist"
        # BUG FIX: the original compared the bound method (`self.authorship == 0`,
        # always False); the intent is clearly the method's return value.
        if self.authorship() is None or len(self.authorship()) == 0 \
                or self.authorship() == 0:
            pass
        else:
            query += (" where group_num = " + str(self.authorship()))
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query)
                list_result = cursor.fetchall()
                for i in list_result:
                    self.urls[i["url"]] = i["clubid"]
        except Exception as e:
            print(e)
            sys.stdout.flush()
            exit(1)
        return self.urls

    def start_day(self):
        return self.params["start"]

    def end_day(self):
        return self.params["end"]

    def keyword_id(self):
        return self.params["id"]

    def realtime(self):
        return self.params["realtime"]

    def searches(self):
        return self.params["searches"]

    def authorship(self):
        return self.params["authorship"]

    def platform(self):
        return self.params["platform"]

    def is_realtime(self):
        if str(self.realtime()) == '0':
            return False
        else:
            return True

    def euc_kr(self, keyword):
        """Percent-encode *keyword* in EUC-KR (spaces become '+')."""
        byte_code = list(keyword.encode("euc_kr"))
        encoded_keyword = ""
        for i in byte_code:
            if i == 0x20:
                encoded_keyword += "+"
            else:
                encoded_keyword += str(hex(i)).replace("0x", "%").upper()
        return encoded_keyword

    def utf8(self, keyword):
        """Percent-encode *keyword* in UTF-8."""
        byte_code = list(keyword.encode("utf-8"))
        encoded_keyword = ""
        for i in byte_code:
            encoded_keyword += str(hex(i)).replace("0x", "%").upper()
        return encoded_keyword

    def disconnect(self):
        self.conn.close()

    def date_to_str(self, arg_date):
        return arg_date.strftime("%Y-%m-%d")


class NaverCafeInit(CrawlInit):
    """CrawlInit specialised for Naver Cafe search-URL construction."""

    url_first = "http://cafe.naver.com/ArticleSearchList.nhn?search.clubid="
    url_second = "&search.searchdate="
    url_third = "&search.searchBy=0&search.query="
    url_forth = "&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=15&search.media=0"

    def __init__(self, before_day=0):
        super().__init__(before_day)

    def url_all_days(self):
        """One search URL per (cafe, keyword) covering the whole date range."""
        url_list = list()
        for key, val in self.urls.items():
            if self.is_realtime():
                today = datetime.date.today()
                url = self.make_url(
                    today + datetime.timedelta(days=self.before_day), today, val)
            else:
                url = self.make_url(self.start_day(), self.end_day(), val)
            for i in url:
                url_list.append(i)
        return url_list

    def url_day_by_day(self):
        """One search URL per (cafe, keyword, single day) in the range."""
        one_day = datetime.timedelta(days=1)
        url_list = list()
        for key, val in self.urls.items():
            if self.is_realtime():
                end = datetime.date.today()
                start = end + datetime.timedelta(days=self.before_day)
            else:
                start = self.start_day()
                end = self.end_day()
            while start <= end:
                url = self.make_url(start, start, val)
                for i in url:
                    url_list.append(i)
                start += one_day
        return url_list

    def split_searches(self):
        """Split the comma-separated keyword list and EUC-KR-encode each item."""
        search = self.searches()
        splited_list = search.split(',')
        trimmed_list = list()
        for x in splited_list:
            trimmed_list.append(self.euc_kr(x.strip()))
        return trimmed_list

    def make_url(self, start_day, end_day, clubid):
        urls = list()
        for x in self.split_searches():
            url = self.url_first + str(clubid) + self.url_second \
                + str(start_day) + str(end_day) + self.url_third + x + self.url_forth
            urls.append(url)
        return urls


class NaverCafeMainAreaCrawler:
    """Orchestrates board/body/reply crawlers over every page of a search."""

    def __init__(self):
        self.board_crawler = NaverCafeBoardCrawler()
        self.body_crawler = NaverCafeBodyCrawler()
        self.reply_crawler = NaverCafeReplyCrawler()
        self.send_to_db = SendtoDB()
        self.crawl_init = NaverCafeInit()
        self.browser = Browser()
        self.naver_cafe = NaverCafeCrawler()

    def print(self, arg):
        print(arg)
        sys.stdout.flush()

    def set_driver(self, driver):
        """Share one driver across all sub-crawlers."""
        self.board_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.naver_cafe.set_driver(driver)
        self.driver = driver

    def copy_list(self, backup_set):
        """Re-seed the visited set after a browser restart."""
        for i in backup_set:
            self.board_crawler.content_num_set.add(i)

    def crawl_all_cafe_main(self, backup_list=None):
        """Crawl body+replies of every article on every page of the search."""
        self.board_crawler.clear_content_num_set()
        if backup_list:
            self.copy_list(backup_list)
        has_next_table = True
        while has_next_table:
            self.print("Page number : "
                       + str(self.board_crawler.current_page_num_by_tag()))
            self.release_memory()
            while self.board_crawler.move_next_content():
                try:
                    self.crawl_body()
                    self.crawl_reply()
                    self.print("ok")
                except Exception as e:
                    self.print("fail")
                    self.print(e)
                self.driver.back()
                wait(1)
            has_next_table = self.board_crawler.move_next_page()

    def crawl_body(self):
        self.body_crawler.set_driver(self.driver)
        content = self.body_crawler.get_content()
        content['keyword_id'] = self.keyword_id
        # Replace any previously stored copy of this article.
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
        self.print(content['article_url'])

    def crawl_reply(self):
        self.reply_crawler.set_driver(self.driver)
        if self.reply_crawler.find_comments_element():
            self.reply_crawler.set_article_url(
                self.body_crawler.find_article_url())
            self.reply_crawler.crawl_all()
            self.send_to_db.send_reply(self.reply_crawler.get_content())

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def release_memory_firefox(self):
        """Force Firefox GC via about:memory, then restore the current page."""
        index = self.driver.current_url.find("%26search.page=")
        if index == -1:
            temp_url = self.driver.current_url
        else:
            temp_url = self.driver.current_url[:index]
        temp_page = self.board_crawler.current_page_num_by_tag()
        if temp_page.strip() == "1":
            url = temp_url
        else:
            url = temp_url + "%26search.page=" + temp_page.strip()
        self.print("Release Memory Process")
        self.driver.get("about:memory")
        wait(2)
        self.driver.execute_script("doMMU()")
        wait(2)
        self.driver.execute_script("doGC()")
        wait(2)
        self.driver.execute_script("doCC()")
        wait(2)
        self.driver.get(url)
        wait(2)
        print_and_flush("reloaded")

    def release_memory_others(self):
        """Non-Firefox: a plain reload of the current URL."""
        temp_url = self.driver.current_url
        self.print("Release Memory Process")
        self.driver.get(temp_url)
        wait(2)
        print_and_flush("reloaded")

    def release_memory(self):
        # Every 5th page (page % 5 == 1) trigger a browser memory release.
        if self.browser.info == "firefox":
            if (int(self.board_crawler.current_page_num_by_tag()) % 5) == 1:
                self.release_memory_firefox()
        else:
            if (int(self.board_crawler.current_page_num_by_tag()) != 1) and \
                    ((int(self.board_crawler.current_page_num_by_tag()) % 5) == 1):
                self.release_memory_others()

    def click_element(self, element):
        ac = ActionChains(self.driver)
        ac.move_to_element(element).click().perform()
        wait(2)

    def start(self):
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        if type(keyword_id) != int:
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.get_naver_cafe_list()
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawl_start(self):
        """Main loop: login, crawl every URL, restart the browser on failure."""
        # NOTE(security): hard-coded login credentials — move to config/env.
        naver_id = "ehotnsdl1234"
        naver_password = "conc8600"
        asis = Asistance()
        self.naver_cafe.naver_login(naver_id, naver_password)
        wait(5)
        real_time = True
        while real_time:
            print_and_flush("Crawler Start")
            url_list = self.crawl_init.url_all_days()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    print_and_flush(url_list[i] + "\n")
                    print_and_flush("clubid: " + asis.clubid(url_list[i]))
                    print_and_flush(asis.date(url_list[i]) + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    self.crawl_all_cafe_main(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    # Save progress, restart the browser, log in again,
                    # and retry the same URL (i is not incremented).
                    print_and_flush(e)
                    backup_set = self.board_crawler.content_num_set.copy()
                    self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
                    self.naver_cafe.naver_login(naver_id, naver_password)
                    wait(3)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.send_to_db.close()
        self.driver.quit()


if __name__ == '__main__':
    # argv: 0 - navercrawl.py, 1 - keyword_id, 2 - data db num, 3 - before_day
    if len(sys.argv) < 4:
        print("Fail to process execute")
        exit(1)
    else:
        print("Start Python Crawling")
        # NOTE(security): hard-coded login credentials — move to config/env.
        naver_id = "ehotnsdl1234"
        naver_password = "conc8600"
        naver_init = NaverCafeInit(int(sys.argv[3]))
        naver_init.get_keyword_parameters(sys.argv[1])
        naver_init.get_naver_cafe_list()
        naver_init.disconnect()
        naver_cafe = NaverCafeCrawler()
        browser = Browser()  # arg: chrome, firefox, ie, opera
        naver_cafe.set_driver(browser.get_new_driver("chrome"))
        wait(5)
        naver_cafe.naver_login(naver_id, naver_password)
        naver_main_area_crawler = NaverCafeMainAreaCrawler()
        naver_main_area_crawler.set_driver(naver_cafe.driver)
        naver_main_area_crawler.set_keyword_id(sys.argv[1])
        naver_main_area_crawler.send_to_db.set_db(sys.argv[2])
        naver_main_area_crawler.browser = browser
        asis = Asistance()
        realtime = True
        while realtime:
            print_and_flush("Crawler Start")
            url_list = naver_init.url_all_days()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    print_and_flush(url_list[i] + "\n")
                    print_and_flush("clubid: " + asis.clubid(url_list[i]))
                    print_and_flush(asis.date(url_list[i]) + "\n")
                    naver_cafe.suff(url_list[i])
                    naver_main_area_crawler.crawl_all_cafe_main(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    # Save progress, restart the browser, log in again,
                    # and retry the same URL (i is not incremented).
                    print_and_flush(e)
                    backup_set = list(
                        naver_main_area_crawler.board_crawler.content_num_set)
                    naver_cafe.set_driver(browser.new_browser())
                    wait(5)
                    naver_cafe.naver_login(naver_id, naver_password)
                    naver_main_area_crawler.set_driver(naver_cafe.driver)
            realtime = naver_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        naver_cafe.quit()
        naver_main_area_crawler.send_to_db.close()
        print("Exit. Bye :)")
        exit(0)

# Example search URL:
# http://cafe.naver.com/ArticleSearchList.nhn?search.clubid=11262350&search.searchdate=2015-07-302015-10-30&search.searchBy=0&search.query=%BC%BA%C7%FC&search.includeAll=&search.exclude=&search.include=&search.exact=&search.sortBy=date&userDisplay=50&search.media=0