- requirements.txt 추가
- print 구문 주석
This commit is contained in:
3
WebBasedCrawler/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
requests
|
||||||
|
bs4
|
||||||
|
pytz
|
||||||
@@ -43,13 +43,6 @@ class TwitterConfig:
|
|||||||
self.state = params['state']
|
self.state = params['state']
|
||||||
self.platform = params['platform']
|
self.platform = params['platform']
|
||||||
|
|
||||||
# debug
|
|
||||||
self.platform = 14
|
|
||||||
# self.start_str = '2017-05-01'
|
|
||||||
# self.end_str = '2017-05-02'
|
|
||||||
# self.start = datetime.datetime.strptime(self.start_str, '%Y-%m-%d')
|
|
||||||
# self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')
|
|
||||||
|
|
||||||
def split(self):
|
def split(self):
|
||||||
split_list = []
|
split_list = []
|
||||||
new_end = self.end
|
new_end = self.end
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ class TwitterCrawler():
|
|||||||
tweet = TweetParser.parse(tw, config.keyword_id)
|
tweet = TweetParser.parse(tw, config.keyword_id)
|
||||||
|
|
||||||
if tweet.is_reply is True:
|
if tweet.is_reply is True:
|
||||||
print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20]))
|
# print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20]))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if tweet.reply_cnt > 0:
|
if tweet.reply_cnt > 0:
|
||||||
@@ -107,7 +107,7 @@ class TwitterCrawler():
|
|||||||
|
|
||||||
self.db_helper.insert_tweet(tweet, config.db_num)
|
self.db_helper.insert_tweet(tweet, config.db_num)
|
||||||
|
|
||||||
print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
|
# print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
|
||||||
|
|
||||||
count = len(tweet_tags)
|
count = len(tweet_tags)
|
||||||
if count == 0:
|
if count == 0:
|
||||||
@@ -125,7 +125,7 @@ class TwitterCrawler():
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def insert_content_pool(proc_id: int, qu, tweet: Tweet, tweet_top: Tweet):
|
def insert_content_pool(proc_id: int, qu, tweet: Tweet, tweet_top: Tweet):
|
||||||
print(' [{}] pool insert: {} ({})'.format(proc_id, tweet.text[:20] if tweet.text else '', tweet.tweet_link))
|
# print(' [{}] pool insert: {} ({})'.format(proc_id, tweet.text[:20] if tweet.text else '', tweet.tweet_link))
|
||||||
qu.put((tweet, tweet_top,))
|
qu.put((tweet, tweet_top,))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -153,7 +153,7 @@ class TwitterCrawler():
|
|||||||
if not parent_tw:
|
if not parent_tw:
|
||||||
break
|
break
|
||||||
|
|
||||||
print(' [{}] <<< parent : {} ({})'.format(proc_id, parent_tw.text[:20], parent_tw.tweet_link))
|
# print(' [{}] <<< parent : {} ({})'.format(proc_id, parent_tw.text[:20], parent_tw.tweet_link))
|
||||||
|
|
||||||
max_position = ''
|
max_position = ''
|
||||||
|
|
||||||
@@ -176,7 +176,7 @@ class TwitterCrawler():
|
|||||||
tweet_tags = container_tags.select('div.tweet')
|
tweet_tags = container_tags.select('div.tweet')
|
||||||
if len(tweet_tags) > 0:
|
if len(tweet_tags) > 0:
|
||||||
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
|
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
|
||||||
print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
|
# print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
|
||||||
self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
|
self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
|
||||||
self.db_helper.insert_tweet(tweet, self.default_config.db_num)
|
self.db_helper.insert_tweet(tweet, self.default_config.db_num)
|
||||||
tweet_count += 1
|
tweet_count += 1
|
||||||
@@ -185,7 +185,7 @@ class TwitterCrawler():
|
|||||||
if b_continue:
|
if b_continue:
|
||||||
max_position = j['min_position']
|
max_position = j['min_position']
|
||||||
|
|
||||||
result_queue.put(tweet_count)
|
result_queue.put((proc_id, tweet_count))
|
||||||
print('[{}] content thread finished'.format(proc_id))
|
print('[{}] content thread finished'.format(proc_id))
|
||||||
return proc_id, tweet_count,
|
return proc_id, tweet_count,
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user