KakaoStory 업데이트 대응

UTF-8 4바이트 이모지(emoji)를 공백으로 치환 처리

git-svn-id: svn://192.168.0.12/source@254 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2016-02-23 08:42:47 +00:00
parent 9f2f38d360
commit a508dc8daa
3 changed files with 33 additions and 18 deletions

View File

@@ -9,6 +9,7 @@ import time
import os
import psutil
import threading
import re
from time import localtime, strftime
from selenium import webdriver
@@ -189,6 +190,7 @@ class Browser:
class SendtoDB:
pymysql = __import__('pymysql.cursors')
re_emoji = re.compile(u'[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF]+', re.UNICODE)
def __init__(self, db_num=0):
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
@@ -223,7 +225,7 @@ class SendtoDB:
if type(val) == int:
val_list.append(str(val))
else:
val_list.append(self.conn.escape(val))
val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")"
def send_body(self, body):

View File

@@ -545,12 +545,12 @@ class FacebookPageCrawler:
return None
while True:
self.index += 1
if self.index >= len(self.posts):
if self.index > len(self.posts):
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
if self.index >= len(self.posts):
if self.index > len(self.posts):
if self.load_more_posts() is False:
self.posts = None
return None
@@ -594,12 +594,12 @@ class FacebookPageCrawler:
if self.index > self.limit:
self.posts = None
return None
if self.index >= len(self.posts):
if self.index > len(self.posts):
# self.posts = self.driver.find_elements_by_css_selector("a[class='_5pcq']")
# self.posts = self.driver.find_elements_by_xpath("//span[@class='fsm fwn fcg']/a[@class='_5pcq'][1]")
# self.posts = self.driver.find_elements_by_xpath("//div[@class='_1dwg']//span[@class='fsm fwn fcg'][1]/a[@class='_5pcq'][1]")
self.posts = self.find_posts()
if self.index >= len(self.posts):
if self.index > len(self.posts):
if self.load_more_posts() is False:
self.posts = None
return None
@@ -666,7 +666,7 @@ class FacebookPageCrawler:
self.driver.set_window_size(size['width'], size["height"])
self.driver.set_window_position(position['x'], position['y'])
return True
if self.reload_count < 15:
if self.reload_count < 10:
print_and_flush("refresh")
self.driver.refresh()
wait(5)

View File

@@ -203,7 +203,8 @@ class KakaoBodyCrawler:
def find_feeling_users(self):
try:
a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]")
#a = self.activity.find_element_by_xpath("div/div[@class='comment ']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewLikes' and not(@style)]")
a = self.activity.find_element_by_xpath("div/div/div/a[@class='_btnViewLikes' and not(@style)]")
except:
return None
self.enter_element(a)
@@ -359,14 +360,15 @@ class KakaoBodyCrawler:
def find_share_users(self):
try:
a = self.activity.find_element_by_xpath("div/div[@class='comment']/div[@class='count_group _countContainer']/div[@class='inner']/a[@class='_btnViewShareList' and not(@style)]")
#a = self.activity.find_element_by_xpath("div/div[@class='comment ']/div[@class='count_group _countContainer']/a[@class='_btnViewShareList' and not(@style)]")
a = self.activity.find_element_by_xpath("div/div/div/a[@class='_btnViewStoryShareList' and not(@style)]")
except:
return None
self.enter_element(a)
# inner_layer = self.driver.find_element_by_css_selector("div[class='inner_story_layer _layerContainer']")
inner_layer = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[class='inner_story_layer _layerContainer']")))
str_share = inner_layer.find_element_by_css_selector("strong[class='tit_story']")
re_share = re.compile("\\(([\\d]+)\\)")
re_share = re.compile("([\\d]+)")
m = re_share.search(str_share.text)
if m is None:
share_num = 0
@@ -666,15 +668,25 @@ class KakaoReplyCrawler:
self.activity = activity
def has_more(self):
more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
try:
more = self.activity.find_element_by_css_selector("p[class='more _showMoreCommentContainer']")
except:
try:
more = self.activity.find_element_by_css_selector("p[class='more _showPrevCommentContainer']")
except:
return False
if more.get_attribute('style').find('block') != -1:
return True
else:
return False
def read_more_reply(self):
more = self.activity.find_element_by_css_selector("p[class='more _commentMoreBtnContainer']")
a = more.find_element_by_css_selector("a[class='_btnCommentMore']")
try:
more = self.activity.find_element_by_css_selector("p[class='more _showMoreCommentContainer']")
a = more.find_element_by_css_selector("a[class='_btnShowMoreComment']")
except:
more = self.activity.find_element_by_css_selector("p[class='more _showPrevCommentContainer']")
a = more.find_element_by_css_selector("a[class='_btnShowPrevComment']")
self.enter_element(a)
def read_all_reply(self):
@@ -685,12 +697,12 @@ class KakaoReplyCrawler:
raise WebDriverException
def get_reply_ul(self):
ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
ul = self.activity.find_element_by_xpath("div/div/div/ul[@class='list _listContainer']")
return ul
def has_reply(self):
try:
ul = self.activity.find_element_by_xpath("div/div[@class='comment']/div/ul")
ul = self.activity.find_element_by_xpath("div/div/div/ul[@class='list _listContainer']")
lis = ul.find_elements_by_tag_name("li")
if len(lis) > 0:
return True
@@ -969,9 +981,9 @@ class KakaoPageCrawler:
return None
while True:
self.index += 1
if self.index >= len(self.activities):
if self.index > len(self.activities):
self.activities = self.driver.find_elements_by_css_selector("div[class='section _activity']")
if self.index >= len(self.activities):
if self.index > len(self.activities):
if self.load_more_activities() is False:
self.activities = None
return None
@@ -982,7 +994,8 @@ class KakaoPageCrawler:
time_modified_date = self.find_article_modified_date(self.activities[self.index - 1])
if time_modified_date is not None:
time_date = time_modified_date
print_and_flush(str(time_date))
print("number of post:", self.index, flush=True)
print(str(time_date), flush=True)
if type(time_date) == str:
continue
if self.is_earlier(time_date):
@@ -1065,7 +1078,7 @@ class KakaoPageCrawler:
self.driver.set_window_size(size['width'], size["height"])
self.driver.set_window_position(position['x'], position['y'])
return True
if self.reload_count < 15:
if self.reload_count < 10:
print_and_flush("refresh")
self.driver.refresh()
wait(5)