effectCrawler 추가
git-svn-id: svn://192.168.0.12/source@311 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -91,6 +91,7 @@ def find_elements_by_xpath(driver, tag, time=0):
|
||||
)
|
||||
return elements
|
||||
|
||||
|
||||
class Browser:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
|
||||
@@ -60,8 +60,6 @@ class BodyCrawler(object):
|
||||
self.soup = None
|
||||
self.section_activity = None
|
||||
self.set_soup_and_activity()
|
||||
if not self.section_activity:
|
||||
raise NotFoundElementError("section _activity is not Found")
|
||||
|
||||
# calling point may differ
|
||||
def set_soup_and_activity(self):
|
||||
@@ -231,11 +229,20 @@ class BodyCrawler(object):
|
||||
article_id = self.find_article_id()
|
||||
return 'channel' if article_id.startswith('ch/') else 'story'
|
||||
|
||||
def find_error(self):
    """Report whether the parsed page contains an error notice.

    Looks up the first ``div`` with class ``info_error`` in ``self.soup``.

    :return: True when such an element is found, False otherwise
    """
    # bool() keeps the original truthiness check while dropping the
    # redundant if/else that only returned True or False.
    return bool(self.soup.find('div', class_='info_error'))
|
||||
|
||||
def get(self):
|
||||
"""
|
||||
you need to put 'keyword_id'
|
||||
:return: dict for crawled body content
|
||||
"""
|
||||
if not self.section_activity:
|
||||
raise NotFoundElementError("section _activity is not Found")
|
||||
content = dict()
|
||||
content['article_id'] = self.find_article_id()
|
||||
content['article_nickname'] = self.find_article_nickname()
|
||||
@@ -421,6 +428,16 @@ class EffectKakaostory(object):
|
||||
wait(3)
|
||||
body_crawler = BodyCrawler(self.driver)
|
||||
reply_crawler = ReplyCrawler(self.driver)
|
||||
except Exception as e:
|
||||
raise effect.effecterror.OutDatedCrawler(str(e))
|
||||
|
||||
try:
|
||||
error = body_crawler.find_error()
|
||||
except Exception as e:
|
||||
raise effect.effecterror.OutDatedCrawler(str(e))
|
||||
if error:
|
||||
raise effect.effecterror.DeletedUrlError("The URL is Deleted")
|
||||
try:
|
||||
body = body_crawler.get()
|
||||
replies = reply_crawler.get()
|
||||
except Exception as e:
|
||||
|
||||
@@ -34,8 +34,8 @@ class ResultSender:
|
||||
val_list.append(str(val))
|
||||
else:
|
||||
val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
|
||||
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" # + " on duplicate key update " + \
|
||||
# ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
|
||||
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \
|
||||
','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
|
||||
|
||||
def send(self, table_name, dictionary):
|
||||
query = self._make_query(table_name, dictionary)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -112,6 +112,7 @@ def parse_body_html(content):
|
||||
"article_form": "body",
|
||||
"article_profileurl": media["owner"]["profile_pic_url"],
|
||||
"article_order": str(media["comments"]["count"]),
|
||||
"article_hit": str(media.get('video_views', 0)),
|
||||
"reply_url": str(media["likes"]["count"])
|
||||
}
|
||||
comments = postpage[0]["media"]["comments"]
|
||||
|
||||
@@ -336,7 +336,7 @@ class ReplyCrawler(object):
|
||||
|
||||
def set_soup_and_activity(self):
|
||||
self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
|
||||
# There are many div.section _activity. But element we use is in div.cover_wrapper
|
||||
# There are many div.section _activity elements, but the one we use is inside div.cover_wrapper
|
||||
cover_wrapper = self.soup.find('div', class_='cover_wrapper')
|
||||
self.section_activity = cover_wrapper.find('div', class_='section _activity')
|
||||
self.ul = self.section_activity.find('ul', class_='list _listContainer')
|
||||
@@ -345,7 +345,7 @@ class ReplyCrawler(object):
|
||||
previous_num_of_replies = 0
|
||||
while self.has_more():
|
||||
self.click_load_more_reply_btn()
|
||||
# check number of replies before and after click_load_more_reply_btn()
|
||||
# check the number of replies before and after click_load_more_reply_btn()
|
||||
# If they are equal, the link or ajax request failed
|
||||
current_num_of_replies = self.get_num_of_replies()
|
||||
if previous_num_of_replies == current_num_of_replies:
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user