effectCrawler 추가

git-svn-id: svn://192.168.0.12/source@311 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2016-11-09 10:08:12 +00:00
parent e41b1413cc
commit 6412bdbefa
7 changed files with 1117 additions and 1032 deletions

View File

@@ -91,6 +91,7 @@ def find_elements_by_xpath(driver, tag, time=0):
) )
return elements return elements
class Browser: class Browser:
def __init__(self, driver=None): def __init__(self, driver=None):
self.driver = driver self.driver = driver

View File

@@ -60,8 +60,6 @@ class BodyCrawler(object):
self.soup = None self.soup = None
self.section_activity = None self.section_activity = None
self.set_soup_and_activity() self.set_soup_and_activity()
if not self.section_activity:
raise NotFoundElementError("section _activity is not Found")
# calling point may differ # calling point may differ
def set_soup_and_activity(self): def set_soup_and_activity(self):
@@ -231,11 +229,20 @@ class BodyCrawler(object):
article_id = self.find_article_id() article_id = self.find_article_id()
return 'channel' if article_id.startswith('ch/') else 'story' return 'channel' if article_id.startswith('ch/') else 'story'
def find_error(self):
error = self.soup.find('div', class_='info_error')
if error:
return True
else:
return False
def get(self): def get(self):
""" """
you need to put 'keyword_id' you need to put 'keyword_id'
:return: dict for crawled body content :return: dict for crawled body content
""" """
if not self.section_activity:
raise NotFoundElementError("section _activity is not Found")
content = dict() content = dict()
content['article_id'] = self.find_article_id() content['article_id'] = self.find_article_id()
content['article_nickname'] = self.find_article_nickname() content['article_nickname'] = self.find_article_nickname()
@@ -421,6 +428,16 @@ class EffectKakaostory(object):
wait(3) wait(3)
body_crawler = BodyCrawler(self.driver) body_crawler = BodyCrawler(self.driver)
reply_crawler = ReplyCrawler(self.driver) reply_crawler = ReplyCrawler(self.driver)
except Exception as e:
raise effect.effecterror.OutDatedCrawler(str(e))
try:
error = body_crawler.find_error()
except Exception as e:
raise effect.effecterror.OutDatedCrawler(str(e))
if error:
raise effect.effecterror.DeletedUrlError("The URL is Deleted")
try:
body = body_crawler.get() body = body_crawler.get()
replies = reply_crawler.get() replies = reply_crawler.get()
except Exception as e: except Exception as e:

View File

@@ -34,8 +34,8 @@ class ResultSender:
val_list.append(str(val)) val_list.append(str(val))
else: else:
val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val)))) val_list.append(self.conn.escape(self.re_emoji.sub(' ', str(val))))
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" # + " on duplicate key update " + \ return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \
# ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list))) ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
def send(self, table_name, dictionary): def send(self, table_name, dictionary):
query = self._make_query(table_name, dictionary) query = self._make_query(table_name, dictionary)

File diff suppressed because it is too large Load Diff

View File

@@ -112,6 +112,7 @@ def parse_body_html(content):
"article_form": "body", "article_form": "body",
"article_profileurl": media["owner"]["profile_pic_url"], "article_profileurl": media["owner"]["profile_pic_url"],
"article_order": str(media["comments"]["count"]), "article_order": str(media["comments"]["count"]),
"article_hit": str(media.get('video_views', 0)),
"reply_url": str(media["likes"]["count"]) "reply_url": str(media["likes"]["count"])
} }
comments = postpage[0]["media"]["comments"] comments = postpage[0]["media"]["comments"]

View File

@@ -336,7 +336,7 @@ class ReplyCrawler(object):
def set_soup_and_activity(self): def set_soup_and_activity(self):
self.soup = BeautifulSoup(self.driver.page_source, parser_opt) self.soup = BeautifulSoup(self.driver.page_source, parser_opt)
# There are many div.section _activity. But element we use is in div.cover_wrapper # There are many div.section _activity. But an element we use is in div.cover_wrapper
cover_wrapper = self.soup.find('div', class_='cover_wrapper') cover_wrapper = self.soup.find('div', class_='cover_wrapper')
self.section_activity = cover_wrapper.find('div', class_='section _activity') self.section_activity = cover_wrapper.find('div', class_='section _activity')
self.ul = self.section_activity.find('ul', class_='list _listContainer') self.ul = self.section_activity.find('ul', class_='list _listContainer')
@@ -345,7 +345,7 @@ class ReplyCrawler(object):
previous_num_of_replies = 0 previous_num_of_replies = 0
while self.has_more(): while self.has_more():
self.click_load_more_reply_btn() self.click_load_more_reply_btn()
# check number of replies before and after click_load_more_reply_btn() # check the number of replies before and after click_load_more_reply_btn()
# If these were equal, the link or ajax failed # If these were equal, the link or ajax failed
current_num_of_replies = self.get_num_of_replies() current_num_of_replies = self.get_num_of_replies()
if previous_num_of_replies == current_num_of_replies: if previous_num_of_replies == current_num_of_replies:

File diff suppressed because one or more lines are too long