Files
clients/WebBasedCrawler/twitter/twparser.py
2017-07-18 11:12:43 +09:00

67 lines
2.6 KiB
Python

from twitter.tweet import Tweet
from twitter.twconfig import TwitterConfig
import bs4
import datetime
class TweetParser:
    """Build a ``Tweet`` record from one tweet element of a parsed HTML page."""

    # CSS selector template for the per-action counter element; the
    # placeholder is the action name ('reply', 'retweet' or 'favorite').
    _COUNT_SELECTOR = 'span.ProfileTweet-action--{} > span.ProfileTweet-actionCount'

    @staticmethod
    def _read_stat_count(tag, action):
        """Return the integer counter for *action* inside *tag*, or None.

        The value is read from the ``data-tweet-stat-count`` attribute of the
        first element matching the action's counter selector. ``None`` is
        returned when no such element exists, so the caller can leave the
        corresponding Tweet field untouched.
        """
        found = tag.select(TweetParser._COUNT_SELECTOR.format(action))
        if len(found) > 0:
            return int(found[0].attrs['data-tweet-stat-count'])
        return None

    @staticmethod
    def parse(tag, keyword_id):
        """Extract the fields of a single tweet from *tag*.

        Args:
            tag: element supporting ``select()`` (CSS selectors) that wraps
                one tweet; presumably a ``bs4.element.Tag`` - confirm
                against the caller.
            keyword_id: identifier stored unchanged on the result as
                ``tweet.keyword_id``.

        Returns:
            A ``Tweet`` populated with the raw tweet fields (user, text,
            timestamp, counters, link) and the derived ``platform_*`` /
            ``article_*`` fields.

        Raises:
            IndexError: if the full name, username, text or timestamp
                element is missing from *tag*.
            KeyError: if a selected element lacks the expected attribute.
            ValueError: if the timestamp title does not match the expected
                format or a counter attribute is not an integer.
        """
        tweet = Tweet()

        # Display name: join only the direct text children of the full-name
        # element, separated by single spaces; nested tags are skipped.
        nickname_tag = tag.select('strong.fullname')[0]
        tweet.user_name = ''
        for child in nickname_tag.children:
            if isinstance(child, bs4.element.NavigableString):
                if len(tweet.user_name) > 0:
                    tweet.user_name += ' '
                tweet.user_name += child

        # Drop the first character of the username text (presumably the
        # leading '@' - confirm against the page markup).
        tweet.user_id = tag.select('span.username')[0].text[1:]
        tweet.text = tag.select('p.tweet-text')[0].text

        # NOTE: %p and %b are locale-dependent in strptime, so this parse
        # relies on the process locale matching the language of the title.
        time_str = tag.select('a.tweet-timestamp')[0].attrs['title']
        tweet.created_at = datetime.datetime.strptime(time_str, '%I:%M %p - %d %b %Y')

        reply_tag = tag.select('div.ReplyingToContextBelowAuthor')
        tweet.is_reply = len(reply_tag) > 0

        # Each counter is read from its OWN action container. Previously all
        # three lookups used the reply selector, so retweet_cnt and
        # favorites_cnt silently received the reply count.
        # A counter is only assigned when its element is present.
        reply_cnt = TweetParser._read_stat_count(tag, 'reply')
        if reply_cnt is not None:
            tweet.reply_cnt = reply_cnt

        retweet_cnt = TweetParser._read_stat_count(tag, 'retweet')
        if retweet_cnt is not None:
            tweet.retweet_cnt = retweet_cnt

        favorite_cnt = TweetParser._read_stat_count(tag, 'favorite')
        if favorite_cnt is not None:
            tweet.favorites_cnt = favorite_cnt

        link_tag = tag.select('a.js-permalink')
        if len(link_tag) > 0:
            tweet.tweet_link = link_tag[0].attrs['href']

        # Derived fields. The following are intentionally left unset here:
        # article_parent, article_title, article_hit, article_order,
        # article_profile, reply_url, etc.
        tweet.platform_name = 'twitter'
        tweet.platform_form = 'post'
        tweet.platform_title = tweet.user_id
        tweet.article_form = 'reply' if tweet.is_reply else 'body'
        tweet.article_id = tweet.user_id
        tweet.article_nickname = tweet.user_name
        tweet.article_data = tweet.text
        # NOTE(review): when no permalink was found above, tweet.tweet_link
        # is whatever Tweet() initialised it to - verify Tweet provides a
        # usable default.
        tweet.article_url = TwitterConfig.protocol + '://' + TwitterConfig.top_url + tweet.tweet_link
        tweet.article_date = tweet.created_at
        tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id
        tweet.platform_id = tweet.user_id
        tweet.keyword_id = keyword_id

        return tweet