runner exception 추가

This commit is contained in:
mjjo
2017-08-10 11:32:08 +09:00
parent 19cd5bb070
commit 3d829e55b5
2 changed files with 47 additions and 42 deletions

View File

@@ -135,7 +135,7 @@ class Proxy2Handler:
else: else:
proxies = proxy_crawler.crawl_proxies() proxies = proxy_crawler.crawl_proxies()
self.insert_all(proxies) self.insert_all(proxies)
self.unlock() self.lock_leave()
return self.get(platform, proc_id) return self.get(platform, proc_id)
def insert(self, ip, port): def insert(self, ip, port):

View File

@@ -6,6 +6,7 @@ from twitter.twparser import TweetParser
import base.proxy import base.proxy
import base.proxy2 as proxy2 import base.proxy2 as proxy2
import base.baseclasses import base.baseclasses
import base.logger as logger
import requests import requests
import bs4 import bs4
@@ -122,57 +123,61 @@ class TwitterCrawler:
} }
def runner_proc(self, proc_id, content_queue, result_queue, config): def runner_proc(self, proc_id, content_queue, result_queue, config):
print('[{}] {} to {} runner thread start'.format(proc_id, config.start_str, config.end_str)) try:
print('[{}] {} to {} runner thread start'.format(proc_id, config.start_str, config.end_str))
b_continue = True b_continue = True
min_tweet_id = None min_tweet_id = None
max_tweet_id = None max_tweet_id = None
max_position = '' max_position = ''
tweet_count = 0 tweet_count = 0
while b_continue: while b_continue:
url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position) url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position)
j = self.get_page_data(url, True, proc_id) j = self.get_page_data(url, True, proc_id)
soup = bs4.BeautifulSoup(j['items_html'], 'lxml') soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
tweet_tags = soup.select("div.tweet") tweet_tags = soup.select("div.tweet")
tweet_ids = [] tweet_ids = []
for tw in tweet_tags: for tw in tweet_tags:
tweet = TweetParser.parse(tw, config.keyword_id) tweet = TweetParser.parse(tw, config.keyword_id)
tweet_ids.append(tweet.tweet_id) tweet_ids.append(tweet.tweet_id)
if tweet.is_reply is True: if tweet.is_reply is True:
# print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20])) # print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20]))
continue continue
if tweet.reply_cnt > 0: if tweet.reply_cnt > 0:
self.insert_content_pool(proc_id, content_queue, tweet, tweet) self.insert_content_pool(proc_id, content_queue, tweet, tweet)
self.db_helper.insert_tweet(tweet, config.db_num) self.db_helper.insert_tweet(tweet, config.db_num)
# print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20])) # print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
print('[{}] body {} ({}) [{}]'.format(proc_id, tweet.top_link, tweet.created_at, 'ok')) print('[{}] body {} ({}) [{}]'.format(proc_id, tweet.top_link, tweet.created_at, 'ok'))
count = len(tweet_tags) count = len(tweet_tags)
tweet_count += count tweet_count += count
b_continue = count > 0 b_continue = count > 0
# b_continue = j['has_more_items'] # b_continue = j['has_more_items']
if b_continue: if b_continue:
if min_tweet_id is None: if min_tweet_id is None:
min_tweet_id = tweet_ids[0] min_tweet_id = tweet_ids[0]
max_tweet_id = tweet_ids[-1] max_tweet_id = tweet_ids[-1]
if 'min_position' in j: if 'min_position' in j:
max_position = j['min_position'] max_position = j['min_position']
else: else:
max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id) max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id)
print('[{}] {} to {} runner thread finished {}'.format(proc_id, config.start_str, config.end_str, tweet_count))
result_queue.put({
'proc_id': proc_id,
'count': tweet_count,
})
# self.runner_processing[proc_id].value = False
except Exception as e:
logger.log(e, logger.LogLevel.ERROR)
print('[{}] {} to {} runner thread finished {}'.format(proc_id, config.start_str, config.end_str, tweet_count))
result_queue.put({
'proc_id': proc_id,
'count': tweet_count,
})
# self.runner_processing[proc_id].value = False
return proc_id, tweet_count, return proc_id, tweet_count,
@staticmethod @staticmethod