# Twitter legacy web-search crawler.
import json
from urllib import parse

import bs4
import requests

import base.proxy
from twitter.tweet import Tweet
from twitter.twconfig import TwitterConfig
from twitter.twdbhelper import TwitterDBHelper
from twitter.twparser import TweetParser


class TwitterCrawler:
    """Crawl tweets from Twitter's legacy web search endpoint.

    Pages through search results using the ``max_position`` cursor, parses
    each tweet out of the returned ``items_html`` fragment, and stores the
    results through :class:`TwitterDBHelper`.
    """

    def __init__(self):
        # NOTE(review): TwitterConfig/TwitterDBHelper are project types;
        # their construction side effects are not visible from this file.
        self.config = TwitterConfig()
        self.db_helper = TwitterDBHelper()
        # Proxy mapping for requests; fetched lazily and refreshed whenever
        # a request through the current proxy fails (see get_page).
        self.proxies = None

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Load the crawl parameters for ``keyword_id`` into the config.

        ``browser``, ``before_day`` and ``until_page`` are accepted for
        call-site compatibility but are currently unused here.
        """
        params = self.db_helper.get_param(keyword_id)
        self.config.set_param(keyword_id, db_num, params)

    def get_url(self, query, max_position=None):
        """Build the legacy search-timeline URL for ``query``.

        The date window comes from the config (``start_str``/``end_str``);
        ``max_position`` is the pagination cursor extracted from the
        previously fetched page, or None for the first page.
        """
        params = {
            'f': 'tweets',
            'q': '{} since:{} until:{}'.format(query, self.config.start_str, self.config.end_str),
            'language': 'en'
        }
        if max_position is not None:
            params['max_position'] = max_position

        # urlunparse expects (scheme, netloc, path, params, query, fragment).
        url_tuple = (TwitterConfig.protocol, TwitterConfig.top_url, TwitterConfig.search_url,
                     '', parse.urlencode(params), '')
        return parse.urlunparse(url_tuple)

    def get_page(self, url):
        """GET ``url`` through the current proxy, retrying up to 5 times.

        Each failed attempt discards the current proxy and fetches a fresh
        one. Returns the ``requests.Response`` of the first successful
        attempt, or None if every attempt failed.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
            'Accept-Language': 'en-US'
        }
        if self.proxies is None:
            self.proxies = base.proxy.get_proxy_for_requests()

        resp = None
        for _ in range(5):
            try:
                resp = requests.get(url, headers=headers, proxies=self.proxies, timeout=5)
            except requests.RequestException as e:
                # Narrowed from a bare Exception: requests reports all
                # transport errors (timeouts, connection and proxy failures)
                # via RequestException; anything else is a real bug and
                # should propagate.
                print('proxy {} is expired. ({})'.format(self.proxies, e))
                self.proxies = base.proxy.get_proxy_for_requests()
            else:
                break

        return resp

    def insert_pool(self, tweet: Tweet):
        """Hook called for tweets that have replies; intentionally a no-op.

        Kept so subclasses/callers can queue such tweets for a later
        reply-crawl pass.
        """
        pass

    def start(self):
        """Crawl every result page for the configured keyword.

        Walks backwards in time, page by page, until a tweet older than
        ``config.start`` appears or a page comes back empty.
        """
        b_continue = True
        # Cursor bounds for the legacy 'TWEET-{oldest}-{newest}' token.
        # Result pages arrive newest-first, so tags[0] on the first page is
        # the newest id of the whole session (fixed once), while tags[-1]
        # of each page is the oldest id seen so far (refreshed per page).
        # NOTE(review): the original named these min/max the wrong way
        # round; the emitted token is unchanged by the rename.
        newest_id = None
        oldest_id = None
        max_position = None
        tweet_count = 0

        while b_continue:
            if newest_id is not None:
                max_position = 'TWEET-{}-{}'.format(oldest_id, newest_id)
            url = self.get_url(self.config.keywords[0], max_position)
            r = self.get_page(url)
            if r is None:
                break
            j = json.loads(r.content.decode('utf-8'))

            soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
            tweet_tags = soup.select("div.tweet")

            for tw in tweet_tags:
                tweet = TweetParser.parse(tw, self.config.keyword_id)

                if tweet.is_reply is True:
                    continue

                # Pages are newest-first: once a tweet falls before the
                # window start, every remaining page is out of range too.
                if tweet.created_at < self.config.start:
                    b_continue = False
                    break
                elif tweet.created_at > self.config.end:
                    continue

                # Tweets that have replies are handed to the reply-crawl hook.
                if tweet.reply_cnt > 0:
                    self.insert_pool(tweet)

                self.db_helper.insert_tweet(self.config.db_num, tweet)

                print('{} {}>>{}: {}'.format(tweet.created_at, tweet.article_id, tweet.user_name, tweet.text))

            count = len(tweet_tags)
            if count == 0:
                break

            if newest_id is None:
                newest_id = tweet_tags[0].attrs['data-item-id']
            oldest_id = tweet_tags[-1].attrs['data-item-id']
            tweet_count += count

        print('runner finished {}'.format(tweet_count))
|