diff --git a/WebBasedCrawler/base/proxy2.py b/WebBasedCrawler/base/proxy2.py index 2d3c33e..657bef3 100644 --- a/WebBasedCrawler/base/proxy2.py +++ b/WebBasedCrawler/base/proxy2.py @@ -161,7 +161,7 @@ class Proxy2Handler: if resp.ok: instance.set_block_at(platform, None) alive_cnt += 1 - print('proxy {}:{} alive'.format(instance.ip, instance.port)) + # print('proxy {}:{} alive'.format(instance.ip, instance.port)) else: instance.set_block_at(platform, datetime.datetime.now()) diff --git a/WebBasedCrawler/twitter/twittercrawl.py b/WebBasedCrawler/twitter/twittercrawl.py index 0e9d15f..d6eb311 100644 --- a/WebBasedCrawler/twitter/twittercrawl.py +++ b/WebBasedCrawler/twitter/twittercrawl.py @@ -227,7 +227,7 @@ class TwitterCrawler: for container_tags in reply_container_tags: tweet_tags = container_tags.select('div.tweet') if len(tweet_tags) > 0: - tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw) + tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, parent_tw, top_tw) # print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link)) print('[{}] reply {} [{}]'.format(proc_id, tweet.top_link, 'ok')) self.insert_content_pool(proc_id, content_queue, tweet, top_tw) diff --git a/WebBasedCrawler/twitter/twparser.py b/WebBasedCrawler/twitter/twparser.py index 91b9511..832afbd 100644 --- a/WebBasedCrawler/twitter/twparser.py +++ b/WebBasedCrawler/twitter/twparser.py @@ -5,10 +5,11 @@ import bs4 import datetime import pytz + class TweetParser: @staticmethod - def parse(tag, keyword_id, depth=0, top_tw: Tweet=None): + def parse(tag, keyword_id, depth=0, parent_tw: Tweet=None, top_tw: Tweet=None): tweet = Tweet() tweet.tweet_id = int(tag.attrs['data-tweet-id']) @@ -62,7 +63,7 @@ class TweetParser: tweet.platform_form = 'post' tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id tweet.article_form = 'body' if tweet.depth is 0 else 'reply' - # tweet.article_parent = None + tweet.article_parent = parent_tw.user_name if parent_tw else None tweet.article_id = tweet.user_id tweet.article_nickname = tweet.user_name # tweet.article_title = None