From 3d829e55b565c2f2b804c5794053ca54be9e1907 Mon Sep 17 00:00:00 2001
From: mjjo
Date: Thu, 10 Aug 2017 11:32:08 +0900
Subject: [PATCH] =?UTF-8?q?runner=20exception=20=EC=B6=94=EA=B0=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 WebBasedCrawler/base/proxy2.py          |  2 +-
 WebBasedCrawler/twitter/twittercrawl.py | 87 +++++++++++++------------
 2 files changed, 47 insertions(+), 42 deletions(-)

diff --git a/WebBasedCrawler/base/proxy2.py b/WebBasedCrawler/base/proxy2.py
index 655d4bc..4cf2adc 100644
--- a/WebBasedCrawler/base/proxy2.py
+++ b/WebBasedCrawler/base/proxy2.py
@@ -135,7 +135,7 @@ class Proxy2Handler:
         else:
             proxies = proxy_crawler.crawl_proxies()
             self.insert_all(proxies)
-            self.unlock()
+            self.lock_leave()
             return self.get(platform, proc_id)
 
     def insert(self, ip, port):
diff --git a/WebBasedCrawler/twitter/twittercrawl.py b/WebBasedCrawler/twitter/twittercrawl.py
index 78fd2c6..f074c07 100644
--- a/WebBasedCrawler/twitter/twittercrawl.py
+++ b/WebBasedCrawler/twitter/twittercrawl.py
@@ -6,6 +6,7 @@ from twitter.twparser import TweetParser
 import base.proxy
 import base.proxy2 as proxy2
 import base.baseclasses
+import base.logger as logger
 import requests
 import bs4
 
@@ -122,57 +123,61 @@ class TwitterCrawler:
         }
 
     def runner_proc(self, proc_id, content_queue, result_queue, config):
-        print('[{}] {} to {} runner thread start'.format(proc_id, config.start_str, config.end_str))
+        try:
+            print('[{}] {} to {} runner thread start'.format(proc_id, config.start_str, config.end_str))
 
-        b_continue = True
-        min_tweet_id = None
-        max_tweet_id = None
-        max_position = ''
-        tweet_count = 0
+            b_continue = True
+            min_tweet_id = None
+            max_tweet_id = None
+            max_position = ''
+            tweet_count = 0
 
-        while b_continue:
-            url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position)
-            j = self.get_page_data(url, True, proc_id)
-            soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
-            tweet_tags = soup.select("div.tweet")
+            while b_continue:
+                url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position)
+                j = self.get_page_data(url, True, proc_id)
+                soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
+                tweet_tags = soup.select("div.tweet")
 
-            tweet_ids = []
-            for tw in tweet_tags:
-                tweet = TweetParser.parse(tw, config.keyword_id)
-                tweet_ids.append(tweet.tweet_id)
+                tweet_ids = []
+                for tw in tweet_tags:
+                    tweet = TweetParser.parse(tw, config.keyword_id)
+                    tweet_ids.append(tweet.tweet_id)
 
-                if tweet.is_reply is True:
-                    # print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20]))
-                    continue
+                    if tweet.is_reply is True:
+                        # print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20]))
+                        continue
 
-                if tweet.reply_cnt > 0:
-                    self.insert_content_pool(proc_id, content_queue, tweet, tweet)
-                    self.db_helper.insert_tweet(tweet, config.db_num)
+                    if tweet.reply_cnt > 0:
+                        self.insert_content_pool(proc_id, content_queue, tweet, tweet)
+                        self.db_helper.insert_tweet(tweet, config.db_num)
 
-                # print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
-                print('[{}] body {} ({}) [{}]'.format(proc_id, tweet.top_link, tweet.created_at, 'ok'))
+                    # print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
+                    print('[{}] body {} ({}) [{}]'.format(proc_id, tweet.top_link, tweet.created_at, 'ok'))
 
-            count = len(tweet_tags)
-            tweet_count += count
+                count = len(tweet_tags)
+                tweet_count += count
 
-            b_continue = count > 0
-            # b_continue = j['has_more_items']
-            if b_continue:
-                if min_tweet_id is None:
-                    min_tweet_id = tweet_ids[0]
-                max_tweet_id = tweet_ids[-1]
+                b_continue = count > 0
+                # b_continue = j['has_more_items']
+                if b_continue:
+                    if min_tweet_id is None:
+                        min_tweet_id = tweet_ids[0]
+                    max_tweet_id = tweet_ids[-1]
 
-                if 'min_position' in j:
-                    max_position = j['min_position']
-                else:
-                    max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id)
+                    if 'min_position' in j:
+                        max_position = j['min_position']
+                    else:
+                        max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id)
+
+            print('[{}] {} to {} runner thread finished {}'.format(proc_id, config.start_str, config.end_str, tweet_count))
+            result_queue.put({
+                'proc_id': proc_id,
+                'count': tweet_count,
+            })
+            # self.runner_processing[proc_id].value = False
+        except Exception as e:
+            logger.log(e, logger.LogLevel.ERROR)
 
-        print('[{}] {} to {} runner thread finished {}'.format(proc_id, config.start_str, config.end_str, tweet_count))
-        result_queue.put({
-            'proc_id': proc_id,
-            'count': tweet_count,
-        })
-        # self.runner_processing[proc_id].value = False
         return proc_id, tweet_count,
 
     @staticmethod