clients/WebBasedCrawler/twitter/twittercrawl.py

import json
from urllib import parse

import bs4
import requests

import base.proxy
from twitter.twconfig import TwitterConfig
from twitter.twdbhelper import TwitterDBHelper
from twitter.tweet import Tweet
from twitter.twparser import TweetParser


class TwitterCrawler:
    """Crawls tweets for a configured keyword via Twitter's web search pages."""

    def __init__(self):
        self.config = TwitterConfig()
        self.db_helper = TwitterDBHelper()
        self.proxies = None

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        # Load the keyword's search parameters from the DB into the config.
        # browser, before_day and until_page are accepted but not used here.
        params = self.db_helper.get_param(keyword_id)
        self.config.set_param(keyword_id, db_num, params)

    def get_url(self, query, max_position=None):
        # Build the search URL: the query is restricted to the configured date
        # window, and max_position (when given) is the pagination cursor.
        params = {
            'f': 'tweets',
            'q': '{} since:{} until:{}'.format(query, self.config.start_str, self.config.end_str),
            'language': 'en'
        }
        if max_position is not None:
            params['max_position'] = max_position
        url_tuple = (TwitterConfig.protocol, TwitterConfig.top_url, TwitterConfig.search_url,
                     '', parse.urlencode(params), '')
        return parse.urlunparse(url_tuple)
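
    # Note: judging from get_page()/start() below, the configured search URL is
    # expected to return JSON whose 'items_html' field holds rendered tweet
    # markup (the behaviour of Twitter's legacy /i/search/timeline endpoint);
    # the exact protocol, host and path come from TwitterConfig.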

    def get_page(self, url):
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
            'Accept-Language': 'en-US'
        }
        if self.proxies is None:
            self.proxies = base.proxy.get_proxy_for_requests()
        resp = None
        # Retry up to five times, switching to a fresh proxy whenever a request fails.
        for _ in range(5):
            try:
                resp = requests.get(url, headers=headers, proxies=self.proxies, timeout=5)
            except Exception as e:
                print('proxy {} is expired. ({})'.format(self.proxies, e))
                self.proxies = base.proxy.get_proxy_for_requests()
            else:
                break
        return resp

    def insert_pool(self, tweet: Tweet):
        # Placeholder: called from start() for tweets that have replies; no-op for now.
        pass

    def start(self):
        b_continue = True
        min_tweet_id = None   # data-item-id of the first tweet on the first page
        max_tweet_id = None   # data-item-id of the last tweet on the most recent page
        max_position = None
        tweet_count = 0
        while b_continue:
            if min_tweet_id is not None:
                # Pagination cursor for the next page of results.
                max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id)
            url = self.get_url(self.config.keywords[0], max_position)
            r = self.get_page(url)
            if r is None:
                break
            j = json.loads(r.content.decode('utf-8'))
            soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
            tweet_tags = soup.select("div.tweet")
            for tw in tweet_tags:
                tweet = TweetParser.parse(tw, self.config.keyword_id)
                if tweet.is_reply:
                    continue
                if tweet.created_at < self.config.start:
                    # Older than the configured window: stop crawling entirely.
                    b_continue = False
                    break
                elif tweet.created_at > self.config.end:
                    continue
                if tweet.reply_cnt > 0:
                    self.insert_pool(tweet)
                self.db_helper.insert_tweet(self.config.db_num, tweet)
                print('{} {}>>{}: {}'.format(tweet.created_at, tweet.article_id, tweet.user_name, tweet.text))
            count = len(tweet_tags)
            if count == 0:
                break
            if min_tweet_id is None:
                min_tweet_id = tweet_tags[0].attrs['data-item-id']
            max_tweet_id = tweet_tags[-1].attrs['data-item-id']
            tweet_count += count
        print('runner finished {}'.format(tweet_count))
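

# A minimal usage sketch: the argument values below are illustrative assumptions
# (valid keyword_id and db_num values depend on this project's database), and the
# crawler ignores the browser/before_day/until_page arguments anyway.
if __name__ == '__main__':
    crawler = TwitterCrawler()
    crawler.set_arguments(browser=None, keyword_id=1, db_num=0, before_day=7, until_page=20)
    crawler.start()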