# --- WebBasedCrawler/base/dbdata.py (new file) ---
from pymysql.connections import Connection
import datetime
from numbers import Number


class DataDBRow:
    """One row of the crawler's ``data_<n>`` table.

    Subclasses (e.g. Tweet) may add working attributes, but only the
    attributes declared here are treated as database columns — see
    get_insert_query(), which deliberately iterates a fresh DataDBRow.
    """

    def __init__(self):
        self.platform_name = None       # e.g. 'twitter'
        self.platform_form = None       # e.g. 'post'
        self.platform_title = None
        self.article_form = None        # 'body' or 'reply'
        self.article_parent = None
        self.article_id = None
        self.article_nickname = None
        self.article_title = None
        self.article_data = None        # main text content
        self.article_url = None
        self.article_hit = 0
        self.article_date = None
        self.article_order = 0
        self.article_profile = None
        self.article_profileurl = None
        self.platform_id = None
        self.keyword_id = -1
        self.reply_url = None
        self.etc = None

    def get_insert_query(self, conn, db_num):
        """Return an INSERT statement for table ``data_<db_num>``.

        conn   -- open pymysql Connection; used only for value escaping.
        db_num -- integer (or string) suffix of the target table.

        A fresh DataDBRow serves as the column template so that extra
        attributes added by subclasses are NOT emitted as columns.
        """
        columns = []
        values = []
        for key in DataDBRow().__dict__:
            columns.append(key)
            value = self.__dict__[key]
            if isinstance(value, Number):
                # Numeric values are inlined verbatim.
                # NOTE(review): bool is a Number, and str(True) is not valid
                # MySQL outside a boolean context — confirm no column ever
                # carries a bool.
                values.append(str(value))
            else:
                # pymysql escapes str, None (-> NULL), datetime, etc.
                # (The previous value.encode('utf8').decode('utf8') round
                # trip was a no-op and has been removed.)
                values.append(conn.escape(value))

        return 'insert into data_{} ({}) values ({})'.format(
            db_num, ', '.join(columns), ', '.join(values))


# --- WebBasedCrawler/twitter/twconfig.py (new file) ---
import datetime


class TwitterConfig:
    """Crawl parameters for one Twitter keyword job."""

    # Constants used when building search URLs.
    protocol = 'https'
    top_url = 'twitter.com'
    search_url = '/i/search/timeline'

    def __init__(self):
        self.keyword_id = -1
        self.db_num = -1

        self.id = 0
        self.realtime = False
        self.keywords = []       # search terms, split from params['searches']
        self.start_str = None    # string form of the start date
        self.start = None        # datetime at midnight of the start date
        self.end_str = None
        self.end = None
        self.authorship = None
        self.state = None
        self.platform = None

    def set_param(self, keyword_id, db_num, params):
        """Populate the config from a ``keyword`` DB row.

        params -- dict-like row; 'start'/'end' are presumably
        datetime.date values (datetime.combine requires a date) — TODO
        confirm against the DB schema.
        """
        self.keyword_id = int(keyword_id)
        self.db_num = int(db_num)

        self.id = int(params['id'])
        self.realtime = params['realtime'] == '1'

        # Comma-separated search terms, whitespace-trimmed.
        self.keywords = [kw.strip() for kw in params['searches'].split(',')]

        midnight = datetime.datetime.min.time()
        self.start = datetime.datetime.combine(params['start'], midnight)
        self.start_str = str(params['start'])
        self.end = datetime.datetime.combine(params['end'], midnight)
        self.end_str = str(params['end'])
        self.authorship = params['authorship']
        self.state = params['state']
        self.platform = params['platform']


# --- WebBasedCrawler/twitter/twdbhelper.py (new file) ---
import pymysql
import pymysql.cursors

from twitter.tweet import Tweet
from base.dbdata import DataDBRow


class TwitterDBHelper:
    """Thin database access layer for the Twitter crawler.

    (Replaces the previous ``pymysql = __import__('pymysql.cursors')``
    class-attribute hack with ordinary imports.)
    """

    def __init__(self):
        # FIXME(review): credentials are hard-coded in source; move them
        # to configuration or environment variables.
        self.conn = pymysql.connect(host='bigbird.iptime.org',
                                    user='admin', passwd='admin123',
                                    db='concepters', charset='utf8',
                                    cursorclass=pymysql.cursors.DictCursor)

    def __del__(self):
        # __init__ may have failed before self.conn was assigned.
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()

    def get_param(self, keyword_id):
        """Fetch the ``keyword`` row with the given id.

        Returns the row as a dict (DictCursor), or None if absent.
        Exits the process on any DB error, matching the original
        fail-fast behaviour.
        """
        params = []
        try:
            with self.conn.cursor() as cursor:
                # Parameterized query instead of string concatenation
                # (avoids SQL injection via keyword_id).
                cursor.execute("select * from keyword where id = %s",
                               (keyword_id,))
                params = cursor.fetchone()
        except Exception as e:
            print(e)
            exit(1)

        return params

    def insert_tweet(self, db_num: int, tweet: Tweet):
        """Insert one tweet into ``data_<db_num>``.

        Errors are logged and swallowed so a single bad row does not
        abort the crawl (deliberate best-effort behaviour).
        """
        query = tweet.get_insert_query(self.conn, db_num)
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(query)
            self.conn.commit()
        except Exception as e:
            print(e)
# --- WebBasedCrawler/twitter/tweet.py (new file) ---
from base.dbdata import DataDBRow


class Tweet(DataDBRow):
    """A parsed tweet; extends DataDBRow with Twitter-specific fields.

    Only the DataDBRow attributes are persisted; everything added here
    is working state for the parser/crawler.
    """

    def __init__(self):
        # super(self.__class__, ...) recurses infinitely if Tweet is
        # ever subclassed — use the zero-argument form.
        super().__init__()

        self.user_id = None        # screen name without the leading '@'
        self.user_name = None      # display name
        self.text = None
        self.created_at = None     # datetime from the timestamp tooltip
        self.retweets = 0
        self.favorites = 0

        self.is_reply = False
        self.reply_cnt = 0
        self.retweet_cnt = 0
        self.favorite_cnt = 0
        self.tweet_link = None     # relative permalink ('/user/status/<id>')


# --- WebBasedCrawler/twitter/twittercrawl.py (rewritten) ---
from twitter.twconfig import TwitterConfig
from twitter.twdbhelper import TwitterDBHelper
from twitter.tweet import Tweet
from twitter.twparser import TweetParser

import base.proxy

import requests
import bs4
import json
from urllib import parse


class TwitterCrawler:
    """Crawls twitter.com's legacy search-timeline endpoint page by page."""

    def __init__(self):
        self.config = TwitterConfig()
        self.db_helper = TwitterDBHelper()
        self.proxies = None  # lazily obtained from base.proxy

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Load the keyword row and configure the crawl.

        browser / before_day / until_page are accepted for interface
        compatibility with the other platform crawlers but are unused.
        """
        params = self.db_helper.get_param(keyword_id)
        self.config.set_param(keyword_id, db_num, params)

    def get_url(self, query, max_position=None):
        """Build the search-timeline URL for `query` within the config's
        date window; `max_position` is the pagination cursor, if any."""
        params = {
            'f': 'tweets',
            'q': '{} since:{} until:{}'.format(
                query, self.config.start_str, self.config.end_str),
            'language': 'en',
        }
        if max_position is not None:
            params['max_position'] = max_position

        # (was 'url_tupple' — typo fixed)
        url_parts = (TwitterConfig.protocol, TwitterConfig.top_url,
                     TwitterConfig.search_url, '', parse.urlencode(params), '')
        return parse.urlunparse(url_parts)

    def get_page(self, url):
        """GET `url` through a proxy, rotating the proxy on failure.

        Returns the requests.Response, or None if all 5 attempts failed.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
            'Accept-Language': 'en-US'
        }
        if self.proxies is None:
            self.proxies = base.proxy.get_proxy_for_requests()

        resp = None
        for _ in range(5):
            try:
                resp = requests.get(url, headers=headers,
                                    proxies=self.proxies, timeout=5)
            except Exception as e:
                print('proxy {} is expired. ({})'.format(self.proxies, e))
                self.proxies = base.proxy.get_proxy_for_requests()
            else:
                break

        return resp

    def insert_pool(self, tweet: Tweet):
        # Placeholder: tweets that have replies would be queued here for
        # a follow-up reply crawl. Intentionally a no-op for now.
        pass

    def start(self):
        """Page through search results, persisting tweets inside the
        configured [start, end] window.

        Stops when a page comes back empty, a request ultimately fails,
        or a tweet older than `start` appears (results are newest-first).
        """
        keep_going = True
        newest_id = None     # data-item-id of the very first tweet seen
        oldest_id = None     # data-item-id of the last tweet of the latest page
        max_position = None
        total = 0

        while keep_going:
            if newest_id is not None:
                # Legacy timeline pagination cursor: 'TWEET-<oldest>-<newest>'.
                max_position = 'TWEET-{}-{}'.format(oldest_id, newest_id)
            url = self.get_url(self.config.keywords[0], max_position)
            r = self.get_page(url)
            if r is None:
                break
            payload = json.loads(r.content.decode('utf-8'))

            soup = bs4.BeautifulSoup(payload['items_html'], 'lxml')
            tweet_tags = soup.select("div.tweet")

            for tag in tweet_tags:
                tweet = TweetParser.parse(tag, self.config.keyword_id)

                if tweet.is_reply:
                    continue

                if tweet.created_at < self.config.start:
                    # Older than the window — nothing further is relevant.
                    keep_going = False
                    break
                elif tweet.created_at > self.config.end:
                    continue

                if tweet.reply_cnt > 0:
                    self.insert_pool(tweet)

                self.db_helper.insert_tweet(self.config.db_num, tweet)

                print('{} {}>>{}: {}'.format(tweet.created_at, tweet.article_id,
                                             tweet.user_name, tweet.text))

            count = len(tweet_tags)
            if count == 0:
                break

            if newest_id is None:
                newest_id = tweet_tags[0].attrs['data-item-id']
            oldest_id = tweet_tags[-1].attrs['data-item-id']
            total += count

        print('runner finished {}'.format(total))


# --- WebBasedCrawler/twitter/twparser.py (new file) ---
from twitter.tweet import Tweet
from twitter.twconfig import TwitterConfig

import bs4
import datetime


class TweetParser:

    @staticmethod
    def parse(tag, keyword_id):
        """Parse one ``div.tweet`` element into a populated Tweet.

        keyword_id -- id of the keyword job, stored on the row.
        """
        tweet = Tweet()

        # Display name: keep only the text nodes (skips badge <span>s).
        nickname_tag = tag.select('strong.fullname')[0]
        name_parts = [child for child in nickname_tag.children
                      if isinstance(child, bs4.element.NavigableString)]
        tweet.user_name = ' '.join(name_parts)
        tweet.user_id = tag.select('span.username')[0].text[1:]  # strip '@'
        tweet.text = tag.select('p.tweet-text')[0].text

        time_str = tag.select('a.tweet-timestamp')[0].attrs['title']
        tweet.created_at = datetime.datetime.strptime(time_str, '%I:%M %p - %d %b %Y')

        tweet.is_reply = len(tag.select('div.ReplyingToContextBelowAuthor')) > 0

        # BUG FIX: the retweet and favorite counters previously reused the
        # 'ProfileTweet-action--reply' selector (copy-paste), so all three
        # counts were the reply count; and the favorite count was assigned
        # to a non-existent 'favorites_cnt' attribute and silently lost.
        tweet.reply_cnt = TweetParser._action_count(tag, 'reply')
        tweet.retweet_cnt = TweetParser._action_count(tag, 'retweet')
        tweet.favorite_cnt = TweetParser._action_count(tag, 'favorite')

        link_tag = tag.select('a.js-permalink')
        if len(link_tag) > 0:
            tweet.tweet_link = link_tag[0].attrs['href']

        # Map onto the generic DataDBRow columns.
        tweet.platform_name = 'twitter'
        tweet.platform_form = 'post'
        tweet.platform_title = tweet.user_id
        tweet.article_form = 'reply' if tweet.is_reply else 'body'
        tweet.article_id = tweet.user_id
        tweet.article_nickname = tweet.user_name
        tweet.article_data = tweet.text
        if tweet.tweet_link is not None:
            # Guard added: concatenating None raised TypeError when the
            # permalink tag was absent; article_url now stays None instead.
            tweet.article_url = (TwitterConfig.protocol + '://' +
                                 TwitterConfig.top_url + tweet.tweet_link)
        tweet.article_date = tweet.created_at
        tweet.article_profileurl = (TwitterConfig.protocol + '://' +
                                    TwitterConfig.top_url + '/' + tweet.user_id)
        tweet.platform_id = tweet.user_id
        tweet.keyword_id = keyword_id

        return tweet

    @staticmethod
    def _action_count(tag, action):
        """Return the integer count for a reply/retweet/favorite action
        badge, or 0 when the badge is absent."""
        cnt_tag = tag.select(
            'span.ProfileTweet-action--{} > span.ProfileTweet-actionCount'.format(action))
        if len(cnt_tag) > 0:
            return int(cnt_tag[0].attrs['data-tweet-stat-count'])
        return 0


# --- WebBasedCrawler/webbasedcrawler.py (modified, fragment) ---
# The platform dispatch in WebBasedCrawler.__init__ now instantiates the
# renamed crawler class (only this branch changed):
#     elif platform == 'twitter':
#         self.crawler = twittercrawl.TwitterCrawler()
# (previously: twittercrawl.TwitterMainCrawler())