97 lines
3.9 KiB
Python
97 lines
3.9 KiB
Python
from twitter.tweet import Tweet
|
|
from twitter.twconfig import TwitterConfig
|
|
|
|
import bs4
|
|
import datetime
|
|
import pytz
|
|
|
|
class TweetParser:
    """Build ``Tweet`` records from tweet markup parsed with BeautifulSoup."""

    @staticmethod
    def parse(tag, keyword_id, depth=0, top_tw: "Tweet" = None):
        """Convert one tweet element into a populated ``Tweet``.

        Args:
            tag: BeautifulSoup element for a single tweet. It must carry a
                ``data-tweet-id`` attribute and contain ``strong.fullname``,
                ``span.username``, ``p.tweet-text`` and ``span._timestamp``
                descendants.
            keyword_id: Stored unchanged on ``tweet.keyword_id``.
            depth: Stored on ``tweet.depth`` and ``tweet.article_order``.
                A depth equal to 0 yields ``article_form == 'body'``, any
                other value yields ``'reply'``.
            top_tw: Optional tweet whose ``tweet_link`` and ``user_id`` are
                used for ``top_link``, ``platform_title`` and ``platform_id``.
                When falsy, the new tweet's own values are used instead.

        Returns:
            The populated ``Tweet``.

        Raises:
            IndexError: A required descendant node is missing.
            KeyError: A required attribute is missing on a located node.
            ValueError: An id, timestamp or counter is not an integer.
        """
        tweet = Tweet()

        tweet.tweet_id = int(tag.attrs['data-tweet-id'])

        tweet.user_name = TweetParser._direct_text(tag.select('strong.fullname')[0])
        # Drop the first character of the handle (presumably the leading '@').
        tweet.user_id = tag.select('span.username')[0].text[1:]
        tweet.text = tag.select('p.tweet-text')[0].text

        # The epoch value in `data-time` is used instead of the `title` text of
        # `a.tweet-timestamp`, because that text is localized (English and
        # Korean pages need different strptime formats).
        timestamp = int(tag.select('span._timestamp')[0].attrs['data-time'])
        tweet.created_at = TweetParser._to_seoul_time(timestamp)

        tweet.is_reply = len(tag.select('div.ReplyingToContextBelowAuthor')) > 0

        # A counter is only assigned when its markup exists; otherwise the
        # attribute keeps whatever value Tweet() gave it.
        for action, attr_name in (('reply', 'reply_cnt'),
                                  ('retweet', 'retweet_cnt'),
                                  ('favorite', 'favorites_cnt')):
            count = TweetParser._read_stat_count(tag, action)
            if count is not None:
                setattr(tweet, attr_name, count)

        base_url = TwitterConfig.protocol + '://' + TwitterConfig.top_url

        link_tag = tag.select('a.js-permalink')
        if link_tag:
            tweet.tweet_link = base_url + link_tag[0].attrs['href']
        tweet.top_link = top_tw.tweet_link if top_tw else tweet.tweet_link

        tweet.depth = depth

        # article_parent, article_title, article_hit, article_profile and etc
        # are intentionally not assigned here.
        tweet.platform_name = 'twitter'
        tweet.platform_form = 'post'
        tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
        # Compare by value: `is 0` only worked through CPython's small-int cache.
        tweet.article_form = 'body' if tweet.depth == 0 else 'reply'
        tweet.article_id = tweet.user_id
        tweet.article_nickname = tweet.user_name
        tweet.article_data = tweet.text
        tweet.article_url = tweet.top_link
        tweet.article_date = tweet.created_at
        tweet.article_order = tweet.depth
        tweet.article_profileurl = base_url + '/' + tweet.user_id
        tweet.platform_id = top_tw.user_id if top_tw else tweet.user_id
        tweet.keyword_id = keyword_id
        tweet.reply_url = tweet.tweet_link

        return tweet

    @staticmethod
    def _direct_text(node):
        """Join the text nodes that are direct children of ``node`` with spaces."""
        return ' '.join(
            str(child) for child in node.children
            if isinstance(child, bs4.element.NavigableString)
        )

    @staticmethod
    def _to_seoul_time(timestamp):
        """Convert a POSIX timestamp to an aware datetime in Asia/Seoul."""
        local_tz = pytz.timezone('Asia/Seoul')
        # Build an aware UTC datetime directly; utcfromtimestamp() is deprecated.
        utc_dt = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
        return local_tz.normalize(utc_dt.astimezone(local_tz))

    @staticmethod
    def _read_stat_count(tag, action):
        """Return the integer counter for ``action``, or None if its markup is absent."""
        count_tags = tag.select(
            'span.ProfileTweet-action--' + action + ' > span.ProfileTweet-actionCount'
        )
        if not count_tags:
            return None
        return int(count_tags[0].attrs['data-tweet-stat-count'])

    @staticmethod
    def get_lone_container(soup, parent_tw):
        """Collect the lone-tweet containers that follow ``parent_tw``.

        The ``div.ThreadedConversation--loneTweet`` elements are scanned from
        last to first. Scanning stops at the first container whose first
        ``li.stream-item`` has a ``data-item-id`` equal to
        ``parent_tw.tweet_id``; that container and everything before it are
        left out. Containers without such an ``li`` or attribute are kept.
        If no container matches, all of them are returned.

        Args:
            soup: Object exposing ``select`` (a BeautifulSoup document or tag).
            parent_tw: Object whose ``tweet_id`` marks where to stop.

        Returns:
            A one-shot iterator over the collected containers, in the same
            order ``soup.select`` produced them.
        """
        container_tags = []
        for tag in reversed(soup.select('div.ThreadedConversation--loneTweet')):
            items = tag.select('li.stream-item')
            if items and 'data-item-id' in items[0].attrs:
                if int(items[0].attrs['data-item-id']) == parent_tw.tweet_id:
                    break

            container_tags.append(tag)

        # Collected back-to-front, so reverse again to restore page order.
        return reversed(container_tags)
|