twitter timeline crawling
51  WebBasedCrawler/base/dbdata.py  Normal file
@@ -0,0 +1,51 @@
from pymysql.connections import Connection
import datetime
from numbers import Number


class DataDBRow:
    def __init__(self):
        self.platform_name = None
        self.platform_form = None
        self.platform_title = None
        self.article_form = None
        self.article_parent = None
        self.article_id = None
        self.article_nickname = None
        self.article_title = None
        self.article_data = None
        self.article_url = None
        self.article_hit = 0
        self.article_date = None
        self.article_order = 0
        self.article_profile = None
        self.article_profileurl = None
        self.platform_id = None
        self.keyword_id = -1
        self.reply_url = None
        self.etc = None

    def get_insert_query(self, conn, db_num):
        # Iterate a fresh base-class instance so only DataDBRow columns are
        # emitted; subclass extras (e.g. Tweet's user_id) are skipped.
        inst = DataDBRow()

        keys = ''
        values = ''
        for key, value_type in inst.__dict__.items():
            if key.startswith('__') or callable(value_type):
                continue

            if len(keys) > 0:
                keys += ', '
                values += ', '

            keys += key
            value = self.__dict__[key]
            if isinstance(value, Number):
                values += str(value)
            else:
                # pymysql's escape() quotes strings and serializes datetimes
                # and None; the original's utf-8 encode/decode round-trip on
                # str values was a no-op and has been dropped.
                values += conn.escape(value)

        query = 'insert into data_{} ({}) values ({})'.format(db_num, keys, values)
        return query
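For illustration, a minimal sketch of the query this builds. `FakeConn` is a hypothetical stand-in for the pymysql connection (only its `escape()` is exercised here); the crawler itself passes the live connection owned by TwitterDBHelper.

```python
from base.dbdata import DataDBRow

class FakeConn:
    # Hypothetical stand-in for pymysql's Connection, demo only.
    def escape(self, value):
        return 'NULL' if value is None else "'{}'".format(str(value).replace("'", "''"))

row = DataDBRow()
row.platform_name = 'twitter'
row.article_data = 'hello world'
print(row.get_insert_query(FakeConn(), 3))
# insert into data_3 (platform_name, platform_form, ..., keyword_id, reply_url, etc)
#   values ('twitter', NULL, ..., -1, NULL, NULL)
```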
41  WebBasedCrawler/twitter/twconfig.py  Normal file
@@ -0,0 +1,41 @@
import datetime


class TwitterConfig:
    protocol = 'https'
    top_url = 'twitter.com'
    search_url = '/i/search/timeline'

    def __init__(self):
        self.keyword_id = -1
        self.db_num = -1

        self.id = 0
        self.realtime = False
        self.keywords = []
        self.start_str = None
        self.start = None
        self.end_str = None
        self.end = None
        self.authorship = None
        self.state = None
        self.platform = None

    def set_param(self, keyword_id, db_num, params):
        self.keyword_id = int(keyword_id)
        self.db_num = int(db_num)

        self.id = int(params['id'])
        self.realtime = params['realtime'] == '1'

        self.keywords = []
        for keyword in params['searches'].split(','):
            self.keywords.append(keyword.strip())

        # params['start'] / params['end'] come back from the DB as dates;
        # widen them to midnight datetimes so they compare cleanly against
        # tweet timestamps.
        self.start = datetime.datetime.combine(params['start'], datetime.datetime.min.time())
        self.start_str = str(params['start'])
        self.end = datetime.datetime.combine(params['end'], datetime.datetime.min.time())
        self.end_str = str(params['end'])
        self.authorship = params['authorship']
        self.state = params['state']
        self.platform = params['platform']
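A hedged usage sketch: set_param expects the dict that TwitterDBHelper.get_param fetches from the keyword table. Every field value below is invented for illustration; only the shape is implied by the code above.

```python
import datetime
from twitter.twconfig import TwitterConfig

# Hypothetical keyword-table row, shaped like a DictCursor result.
params = {
    'id': 7, 'realtime': '0',
    'searches': 'world cup, olympics',
    'start': datetime.date(2018, 6, 1), 'end': datetime.date(2018, 6, 30),
    'authorship': None, 'state': 'ready', 'platform': 'twitter',
}

config = TwitterConfig()
config.set_param(keyword_id=7, db_num=3, params=params)
print(config.keywords)   # ['world cup', 'olympics']
print(config.start_str)  # 2018-06-01
```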
39  WebBasedCrawler/twitter/twdbhelper.py  Normal file
@@ -0,0 +1,39 @@
from twitter.tweet import Tweet
from base.dbdata import DataDBRow

# Was `pymysql = __import__('pymysql.cursors')` as a class attribute; a plain
# module-level import is equivalent and clearer.
import pymysql.cursors


class TwitterDBHelper:
    def __init__(self):
        self.conn = pymysql.connect(host='bigbird.iptime.org',
                                    user='admin', passwd='admin123',
                                    db='concepters', charset='utf8',
                                    cursorclass=pymysql.cursors.DictCursor)

    def __del__(self):
        self.conn.close()

    def get_param(self, keyword_id):
        # Parameterized query; the original built it by string concatenation,
        # which invites SQL injection.
        query = 'select * from keyword where id = %s'
        params = []
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query, (keyword_id,))
                params = cursor.fetchone()

        except Exception as e:
            print(e)
            exit(1)

        return params

    def insert_tweet(self, db_num: int, tweet: Tweet):
        query = tweet.get_insert_query(self.conn, db_num)

        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query)
                self.conn.commit()
        except Exception as e:
            print(e)
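Usage is straightforward, assuming the concepters database above is reachable. Because the connection uses DictCursor, get_param returns a plain dict keyed by column name:

```python
from twitter.twdbhelper import TwitterDBHelper

helper = TwitterDBHelper()
params = helper.get_param(7)  # e.g. {'id': 7, 'searches': '...', ...}
```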
20  WebBasedCrawler/twitter/tweet.py  Normal file
@@ -0,0 +1,20 @@
from base.dbdata import DataDBRow


class Tweet(DataDBRow):

    def __init__(self):
        # Was super(self.__class__, self).__init__(), which recurses
        # infinitely if Tweet is ever subclassed; the zero-argument form
        # is the correct Python 3 idiom.
        super().__init__()

        self.user_id = None
        self.user_name = None
        self.text = None
        self.created_at = None
        self.retweets = 0
        self.favorites = 0

        self.is_reply = False
        self.reply_cnt = 0
        self.retweet_cnt = 0
        self.favorite_cnt = 0
        self.tweet_link = None
WebBasedCrawler/twitter/twittercrawl.py
@@ -1,36 +1,107 @@
+from twitter.twconfig import TwitterConfig
+from twitter.twdbhelper import TwitterDBHelper
+from twitter.tweet import Tweet
+from twitter.twparser import TweetParser
-class TwitterMainCrawler:
+import base.proxy
+
+import requests
+import bs4
+import json
+from urllib import parse
+
+
+class TwitterCrawler:
     def __init__(self):
-        self.keyword_id = -1
-        pass
-
-    def init_keyword_id(self, keyword_id):
-        if type(keyword_id) != int:
-            self.keyword_id = int(keyword_id)
-        else:
-            self.keyword_id = keyword_id
-        # self.crawl_init.get_keyword_parameters(keyword_id)
-        # self.crawl_init.disconnect()
-        pass
-
-    def init_db(self, db_num):
-        # self.send_to_db.set_db(db_num)
-        pass
-
-    def init_before_day(self, before_day):
-        # self.crawl_init.set_before_day(before_day)
-        pass
-
-    def init_until_page(self, until_page):
-        # self.crawl_init.set_until_page(until_page)
-        pass
+        self.config = TwitterConfig()
+        self.db_helper = TwitterDBHelper()
+        self.proxies = None

     def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
-        self.init_keyword_id(keyword_id)
-        self.init_db(db_num)
-        self.init_before_day(before_day)
-        self.init_until_page(until_page)
-        # self.init_browser(browser)
+        params = self.db_helper.get_param(keyword_id)
+        self.config.set_param(keyword_id, db_num, params)
+
+    def get_url(self, query, max_position=None):
+        params = {
+            'f': 'tweets',
+            'q': '{} since:{} until:{}'.format(query, self.config.start_str, self.config.end_str),
+            'language': 'en'
+        }
+
+        if max_position is not None:
+            params['max_position'] = max_position
+
+        url_tuple = (TwitterConfig.protocol, TwitterConfig.top_url, TwitterConfig.search_url, '', parse.urlencode(params), '')
+        return parse.urlunparse(url_tuple)
+
+    def get_page(self, url):
+        headers = {
+            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
+            'Accept-Language': 'en-US'
+        }
+        if self.proxies is None:
+            self.proxies = base.proxy.get_proxy_for_requests()
+
+        resp = None
+        for cnt in range(5):
+            try:
+                resp = requests.get(url, headers=headers, proxies=self.proxies, timeout=5)
+            except Exception as e:
+                print('proxy {} is expired. ({})'.format(self.proxies, e))
+                self.proxies = base.proxy.get_proxy_for_requests()
+            else:
+                break
+
+        return resp
+
+    def insert_pool(self, tweet: Tweet):
+        pass

     def start(self):
-        pass
+        b_continue = True
+        min_tweet_id = None
+        max_tweet_id = None
+        max_position = None
+        tweet_count = 0
+
+        while b_continue:
+            if min_tweet_id is not None:
+                max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id)
+            url = self.get_url(self.config.keywords[0], max_position)
+            r = self.get_page(url)
+            if r is None:
+                break
+            j = json.loads(r.content.decode('utf-8'))
+
+            soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
+            tweet_tags = soup.select("div.tweet")
+
+            for tw in tweet_tags:
+                tweet = TweetParser.parse(tw, self.config.keyword_id)
+
+                if tweet.is_reply is True:
+                    continue
+
+                if tweet.created_at < self.config.start:
+                    b_continue = False
+                    break
+                elif tweet.created_at > self.config.end:
+                    continue
+
+                if tweet.reply_cnt > 0:
+                    self.insert_pool(tweet)
+
+                self.db_helper.insert_tweet(self.config.db_num, tweet)
+
+                print('{} {}>>{}: {}'.format(tweet.created_at, tweet.article_id, tweet.user_name, tweet.text))
+
+            count = len(tweet_tags)
+            if count == 0:
+                break
+
+            if min_tweet_id is None:
+                min_tweet_id = tweet_tags[0].attrs['data-item-id']
+            max_tweet_id = tweet_tags[-1].attrs['data-item-id']
+            tweet_count += count
+
+        print('runner finished {}'.format(tweet_count))
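A sketch of the request URL get_url assembles (keyword and dates invented). Pagination works by packing the first and last data-item-id seen so far into the legacy max_position token on subsequent requests:

```python
from urllib import parse

params = {'f': 'tweets',
          'q': 'world cup since:2018-06-01 until:2018-06-30',
          'language': 'en'}
url = parse.urlunparse(('https', 'twitter.com', '/i/search/timeline',
                        '', parse.urlencode(params), ''))
print(url)
# https://twitter.com/i/search/timeline?f=tweets&q=world+cup+since%3A2018-06-01+until%3A2018-06-30&language=en
# later pages append e.g. &max_position=TWEET-{last_id}-{first_id}
```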
66  WebBasedCrawler/twitter/twparser.py  Normal file
@@ -0,0 +1,66 @@
from twitter.tweet import Tweet
from twitter.twconfig import TwitterConfig

import bs4
import datetime


class TweetParser:

    @staticmethod
    def parse(tag, keyword_id):
        tweet = Tweet()

        nickname_tag = tag.select('strong.fullname')[0]
        tweet.user_name = ''
        for child in nickname_tag.children:
            if isinstance(child, bs4.element.NavigableString):
                if len(tweet.user_name) > 0:
                    tweet.user_name += ' '
                tweet.user_name += child
        tweet.user_id = tag.select('span.username')[0].text[1:]
        tweet.text = tag.select('p.tweet-text')[0].text

        time_str = tag.select('a.tweet-timestamp')[0].attrs['title']
        tweet.created_at = datetime.datetime.strptime(time_str, '%I:%M %p - %d %b %Y')

        reply_tag = tag.select('div.ReplyingToContextBelowAuthor')
        tweet.is_reply = len(reply_tag) > 0

        reply_cnt_tag = tag.select('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount')
        if len(reply_cnt_tag) > 0:
            tweet.reply_cnt = int(reply_cnt_tag[0].attrs['data-tweet-stat-count'])

        # Selector fixed: the original reused --reply for all three counters,
        # so retweet and favorite counts always mirrored the reply count.
        retweet_cnt_tag = tag.select('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount')
        if len(retweet_cnt_tag) > 0:
            tweet.retweet_cnt = int(retweet_cnt_tag[0].attrs['data-tweet-stat-count'])

        favorite_cnt_tag = tag.select('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount')
        if len(favorite_cnt_tag) > 0:
            # Attribute fixed: Tweet defines favorite_cnt, not favorites_cnt.
            tweet.favorite_cnt = int(favorite_cnt_tag[0].attrs['data-tweet-stat-count'])

        link_tag = tag.select('a.js-permalink')
        if len(link_tag) > 0:
            tweet.tweet_link = link_tag[0].attrs['href']

        tweet.platform_name = 'twitter'
        tweet.platform_form = 'post'
        tweet.platform_title = tweet.user_id
        tweet.article_form = 'reply' if tweet.is_reply else 'body'
        # tweet.article_parent = None
        tweet.article_id = tweet.user_id
        tweet.article_nickname = tweet.user_name
        # tweet.article_title = None
        tweet.article_data = tweet.text
        tweet.article_url = TwitterConfig.protocol + '://' + TwitterConfig.top_url + tweet.tweet_link
        # tweet.article_hit = 0
        tweet.article_date = tweet.created_at
        # tweet.article_order = 0
        # tweet.article_profile = tweet.user_name
        tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id
        tweet.platform_id = tweet.user_id
        tweet.keyword_id = keyword_id
        # tweet.reply_url = ''
        # tweet.etc = ''

        return tweet
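To see the parser end to end, here is an invented fragment in the shape of the old twitter.com timeline markup (the real markup carried many more attributes, plus the counter spans omitted here):

```python
import bs4
from twitter.twparser import TweetParser

html = '''<div class="tweet">
  <strong class="fullname">Jane Doe</strong>
  <span class="username">@janedoe</span>
  <p class="tweet-text">hello world</p>
  <a class="tweet-timestamp" title="3:15 PM - 12 Jun 2018"></a>
  <a class="js-permalink" href="/janedoe/status/123"></a>
</div>'''

tag = bs4.BeautifulSoup(html, 'lxml').select('div.tweet')[0]
tweet = TweetParser.parse(tag, keyword_id=7)
print(tweet.user_name, tweet.created_at, tweet.article_url)
# Jane Doe 2018-06-12 15:15:00 https://twitter.com/janedoe/status/123
```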
@@ -31,7 +31,7 @@ class WebBasedCrawler:
         elif platform == 'facebook':
             self.crawler = facebookcrawlbs.FacebookMainCrawler()
         elif platform == 'twitter':
-            self.crawler = twittercrawl.TwitterMainCrawler()
+            self.crawler = twittercrawl.TwitterCrawler()
         elif platform == 'youtube':
             self.crawler = youtubecrawl.YoutubeMainCrawler()
         else: