effectCrawler 추가
git-svn-id: svn://192.168.0.12/source@311 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -91,6 +91,7 @@ def find_elements_by_xpath(driver, tag, time=0):
|
|||||||
)
|
)
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
|
|
||||||
class Browser:
|
class Browser:
|
||||||
def __init__(self, driver=None):
|
def __init__(self, driver=None):
|
||||||
self.driver = driver
|
self.driver = driver
|
||||||
|
|||||||
@@ -60,8 +60,6 @@ class BodyCrawler(object):
|
|||||||
self.soup = None
|
self.soup = None
|
||||||
self.section_activity = None
|
self.section_activity = None
|
||||||
self.set_soup_and_activity()
|
self.set_soup_and_activity()
|
||||||
if not self.section_activity:
|
|
||||||
raise NotFoundElementError("section _activity is not Found")
|
|
||||||
|
|
||||||
# calling point may differ
|
# calling point may differ
|
||||||
def set_soup_and_activity(self):
|
def set_soup_and_activity(self):
|
||||||
@@ -231,11 +229,20 @@ class BodyCrawler(object):
|
|||||||
article_id = self.find_article_id()
|
article_id = self.find_article_id()
|
||||||
return 'channel' if article_id.startswith('ch/') else 'story'
|
return 'channel' if article_id.startswith('ch/') else 'story'
|
||||||
|
|
||||||
|
def find_error(self):
|
||||||
|
error = self.soup.find('div', class_='info_error')
|
||||||
|
if error:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
def get(self):
|
def get(self):
|
||||||
"""
|
"""
|
||||||
you need to put 'keyword_id'
|
you need to put 'keyword_id'
|
||||||
:return: dict for crawled body content
|
:return: dict for crawled body content
|
||||||
"""
|
"""
|
||||||
|
if not self.section_activity:
|
||||||
|
raise NotFoundElementError("section _activity is not Found")
|
||||||
content = dict()
|
content = dict()
|
||||||
content['article_id'] = self.find_article_id()
|
content['article_id'] = self.find_article_id()
|
||||||
content['article_nickname'] = self.find_article_nickname()
|
content['article_nickname'] = self.find_article_nickname()
|
||||||
@@ -421,6 +428,16 @@ class EffectKakaostory(object):
|
|||||||
wait(3)
|
wait(3)
|
||||||
body_crawler = BodyCrawler(self.driver)
|
body_crawler = BodyCrawler(self.driver)
|
||||||
reply_crawler = ReplyCrawler(self.driver)
|
reply_crawler = ReplyCrawler(self.driver)
|
||||||
|
except Exception as e:
|
||||||
|
raise effect.effecterror.OutDatedCrawler(str(e))
|
||||||
|
|
||||||
|
try:
|
||||||
|
error = body_crawler.find_error()
|
||||||
|
except Exception as e:
|
||||||
|
raise effect.effecterror.OutDatedCrawler(str(e))
|
||||||
|
if error:
|
||||||
|
raise effect.effecterror.DeletedUrlError("The URL is Deleted")
|
||||||
|
try:
|
||||||
body = body_crawler.get()
|
body = body_crawler.get()
|
||||||
replies = reply_crawler.get()
|
replies = reply_crawler.get()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -34,8 +34,8 @@ class ResultSender:
|
|||||||
val_list.append(str(val))
|
val_list.append(str(val))
|
||||||
else:
|
else:
|
||||||
val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
|
val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
|
||||||
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" # + " on duplicate key update " + \
|
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \
|
||||||
# ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
|
','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
|
||||||
|
|
||||||
def send(self, table_name, dictionary):
|
def send(self, table_name, dictionary):
|
||||||
query = self._make_query(table_name, dictionary)
|
query = self._make_query(table_name, dictionary)
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -112,6 +112,7 @@ def parse_body_html(content):
|
|||||||
"article_form": "body",
|
"article_form": "body",
|
||||||
"article_profileurl": media["owner"]["profile_pic_url"],
|
"article_profileurl": media["owner"]["profile_pic_url"],
|
||||||
"article_order": str(media["comments"]["count"]),
|
"article_order": str(media["comments"]["count"]),
|
||||||
|
"article_hit": str(media.get('video_views', 0)),
|
||||||
"reply_url": str(media["likes"]["count"])
|
"reply_url": str(media["likes"]["count"])
|
||||||
}
|
}
|
||||||
comments = postpage[0]["media"]["comments"]
|
comments = postpage[0]["media"]["comments"]
|
||||||
|
|||||||
@@ -336,7 +336,7 @@ class ReplyCrawler(object):
|
|||||||
|
|
||||||
def set_soup_and_activity(self):
|
def set_soup_and_activity(self):
|
||||||
self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
|
self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
|
||||||
# There are many div.section _activity. But element we use is in div.cover_wrapper
|
# There are many div.section _activity. But a element we use is in div.cover_wrapper
|
||||||
cover_wrapper = self.soup.find('div', class_='cover_wrapper')
|
cover_wrapper = self.soup.find('div', class_='cover_wrapper')
|
||||||
self.section_activity = cover_wrapper.find('div', class_='section _activity')
|
self.section_activity = cover_wrapper.find('div', class_='section _activity')
|
||||||
self.ul = self.section_activity.find('ul', class_='list _listContainer')
|
self.ul = self.section_activity.find('ul', class_='list _listContainer')
|
||||||
@@ -345,7 +345,7 @@ class ReplyCrawler(object):
|
|||||||
previous_num_of_replies = 0
|
previous_num_of_replies = 0
|
||||||
while self.has_more():
|
while self.has_more():
|
||||||
self.click_load_more_reply_btn()
|
self.click_load_more_reply_btn()
|
||||||
# check number of replies before and after click_load_more_reply_btn()
|
# check the number of replies before and after click_load_more_reply_btn()
|
||||||
# If These were equal, the link or ajax failed
|
# If These were equal, the link or ajax failed
|
||||||
current_num_of_replies = self.get_num_of_replies()
|
current_num_of_replies = self.get_num_of_replies()
|
||||||
if previous_num_of_replies == current_num_of_replies:
|
if previous_num_of_replies == current_num_of_replies:
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user